# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
# Print every file available under the read-only competition input directory.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))
# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
# Load in our libraries
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import StackingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
%matplotlib inline
# File paths for the Kaggle Titanic competition data.
Train_Path = "/kaggle/input/titanic/train.csv"
Test_Path = "/kaggle/input/titanic/test.csv"
Gender_Submission_Path = "/kaggle/input/titanic/gender_submission.csv"
# Database          -> training data
# Test_Data         -> testing data
# Gender_Submission -> sample submission file (expected output format)
Database = pd.read_csv(Train_Path)
Test_Data = pd.read_csv(Test_Path)
Gender_Submission = pd.read_csv(Gender_Submission_Path)
First, analyze the training data.
# Preview the training data and count the missing values per column.
Database.head()
Database.isna().sum()
Here we can observe that in the training data the Age, Cabin, and Embarked columns contain missing values. Most of the values in the Cabin column are missing, so we will not use it as a feature.
# Column dtypes / non-null counts and summary statistics for the training data.
Database.info()
Database.describe()
Next, analyze the test data.
# Preview the test data and count the missing values per column.
Test_Data.head()
Test_Data.isnull().sum()
Here we can observe that in the test data the Age, Cabin, and Fare columns contain missing values.
# Column dtypes / non-null counts and summary statistics for the test data.
Test_Data.info()
Test_Data.describe()
# Survival rate by sex. Keyword x/y arguments: seaborn deprecated (and newer
# versions removed) positional data arguments for barplot.
sns.barplot(x=Database["Sex"], y=Database["Survived"])
# Fraction of survivors within each sex group.
Female_Survival_Rate = len(Database[(Database["Survived"] == 1) & (Database["Sex"] == "female")])/len(Database[Database["Sex"] == "female"])
Male_Survival_Rate = len(Database[(Database["Survived"] == 1) & (Database["Sex"] == "male")])/len(Database[Database["Sex"] == "male"])
print("Male Survival Rate is", Male_Survival_Rate)
print("Female Survival Rate is", Female_Survival_Rate)
From the graph and the survival rates we can observe that most of the females survived whereas most of the males died.
So, to use Sex as a feature, we will one-hot encode it.
# One-hot encode Sex into two 0/1 indicator columns, Male and Female,
# in both the training and the test frame.
for Data in [Database, Test_Data]:
    Data["Male"] = (Data["Sex"] == "male").astype(int)
    Data["Female"] = (Data["Sex"] == "female").astype(int)
Database.head()
# Survival rate by embarkation port (keyword x/y: positional data args are
# deprecated/removed in current seaborn).
sns.barplot(x=Database["Embarked"], y=Database["Survived"])
From the graph we can observe that there is a correlation between Embarked and the survival rate, so we will use Embarked.
# Distinct embarkation values in the training data (includes NaN for the missing entries).
Database["Embarked"].unique()
The training data's Embarked column contains missing values, so we fill them with the most frequently occurring value.
# Frequency of each embarkation port in the training data.
Database["Embarked"].value_counts()
# The most frequently occurring value is 'S', so use it to fill the gaps.
Database["Embarked"] = Database["Embarked"].fillna("S")
To use Embarked as a feature, we will one-hot encode it.
# One-hot encode the embarkation port into three 0/1 indicator columns
# (rows with any other/missing value get 0 in all three, as before).
for Data in [Database, Test_Data]:
    Data["Southampton"] = (Data["Embarked"] == "S").astype(int)
    Data["Cherbough"] = (Data["Embarked"] == "C").astype(int)
    Data["Queenstown"] = (Data["Embarked"] == "Q").astype(int)
Database.head()
Now we will extract the Title from the Name feature.
# Extract the honorific (e.g. "Mr", "Miss") that immediately precedes a '.' in Name.
# Raw string: '\.' in a non-raw literal is an invalid escape sequence
# (SyntaxWarning on modern Python); r'...' is the correct regex form.
for Data in [Database, Test_Data]:
    Data["Title"] = Data["Name"].str.extract(pat = r'([A-Za-z]+)\.')
Database.head()
plt.figure(figsize = (15, 5))
# Survival rate per extracted title (keyword x/y: positional data args are
# deprecated/removed in current seaborn).
sns.barplot(x=Database["Title"], y=Database["Survived"])
From the graph we can observe that there is some correlation between Title and the survival rate.
# Distinct titles and their frequencies in the training data.
Database["Title"].unique()
Database["Title"].value_counts()
Some titles have few observations, so we will merge them into "Rare". Then we will one-hot encode the titles.
# Collapse infrequent titles into "Rare", normalise the French variants
# (Mlle/Ms -> Miss, Mme -> Mrs), then one-hot encode the five final titles.
for Data in [Database, Test_Data]:
    Data["Title"] = Data["Title"].replace(["Lady", "Countess", "Capt", "Col", "Don", "Dr", "Major", "Rev", "Sir", "Jonkheer", "Dona"], "Rare")
    Data["Title"] = Data["Title"].replace(["Mlle", "Ms"], "Miss")
    Data["Title"] = Data["Title"].replace("Mme", "Mrs")
    for Title_Name in ["Mr", "Miss", "Mrs", "Master", "Rare"]:
        Data[Title_Name] = (Data["Title"] == Title_Name).astype(int)
Database.head()
# Survival rate by passenger class (keyword x/y: positional data args are
# deprecated/removed in current seaborn).
sns.barplot(x=Database["Pclass"], y=Database["Survived"])
From the graph we can observe that most wealthy (first-class) passengers survived whereas most third-class passengers died.
# Reverse the Pclass labels so a larger number means a wealthier class:
# 1 <-> 3, 2 stays 2. For values in {1, 2, 3}, `4 - x` is exactly the
# mapping {1:3, 2:2, 3:1} used on both frames.
Database["Pclass"] = 4 - Database["Pclass"]
Test_Data["Pclass"] = 4 - Test_Data["Pclass"]
Database.head()
# Overlaid age histograms for the two Survived groups.
# groupby iterates keys in sorted order, so the first plotted series is
# Survived == 0 -- matching the "Not Survived" legend entry below.
Database.groupby(["Survived"])["Age"].plot(kind = "hist", alpha = 0.5, figsize = (10, 5))
plt.title("Age Distribution")
plt.xlabel("Age")
plt.legend(["Not Survived", "Survived"])
plt.show()
From the graph we can observe that most children survived whereas many adults died.
# Mean age per (Sex, Pclass) group -- these group means motivate the
# imputation strategy used below.
Database[["Sex", "Age", "Pclass"]].groupby(["Sex", "Pclass"]).mean()
Database[(Database["Sex"] == "female") & (Database["Pclass"] == 1)]["Age"].mean()
# Mean age for "Master" (boys), and for "Miss" with / without parents aboard.
print(Database[Database["Title"] == "Master"]["Age"].mean())
print(Database[(Database["Title"] == "Miss") & (Database["Parch"] == 0)]["Age"].mean())
print(Database[(Database["Title"] == "Miss") & (Database["Parch"] != 0)]["Age"].mean())
The Age column contains missing values. We use Title, Parch, Sex, and Pclass to fill them: a passenger titled "Master" must be a child; a passenger titled "Miss" with Parch = 0 is assumed to be an adult, while "Miss" with Parch != 0 is assumed to be a child. For every other passenger we use the mean age of their Sex and Pclass group.
# Mark every missing Age with a 0 placeholder so it can be detected below
# (no real passenger age is exactly 0.0; infants are e.g. 0.42).
Database["Age"] = Database["Age"].fillna(0)
Test_Data["Age"] = Test_Data["Age"].fillna(0)
for Data in [Database, Test_Data]:
    # BUG FIX 1: the original recomputed each group mean inside the row loop
    # on a frame that already contained the 0 placeholders (and previously
    # imputed rows), skewing the means downward as the loop progressed.
    # Compute the means once, from rows whose age is actually known.
    # BUG FIX 2: two branches filtered with `Database[...]` masks while
    # iterating over Test_Data, combining boolean masks from two frames of
    # different lengths (misaligned boolean indexing). All masks now come
    # from the frame being filled.
    Known = Data[Data["Age"] != 0.0]
    Miss_Adult_Mean = Known[(Known["Title"] == "Miss") & (Known["Parch"] == 0)]["Age"].mean()
    Miss_Child_Mean = Known[(Known["Title"] == "Miss") & (Known["Parch"] != 0)]["Age"].mean()
    Master_Mean = Known[Known["Title"] == "Master"]["Age"].mean()
    for row in Data.index:
        if(Data.loc[row, "Age"] == 0.0):
            if(Data.loc[row, "Title"] == "Miss"):
                if(Data.loc[row, "Parch"] == 0):
                    # "Miss" travelling without parents/children: adult.
                    Data.loc[row, "Age"] = Miss_Adult_Mean
                else:
                    # "Miss" travelling with parents/children: child.
                    Data.loc[row, "Age"] = Miss_Child_Mean
            elif(Data.loc[row, "Title"] == "Master"):
                Data.loc[row, "Age"] = Master_Mean
            else:
                # Fall back on the mean age of the passenger's (Sex, Pclass) group.
                Sex = Data.loc[row, "Sex"]
                Pclass = Data.loc[row, "Pclass"]
                Data.loc[row, "Age"] = Known[(Known["Sex"] == Sex) & (Known["Pclass"] == Pclass)]["Age"].mean()
Now we label-encode Age into five bands.
# Split Age into 5 equal-width bands to find the band edges used below.
Database["Age_Band"] = pd.cut(Database["Age"], 5)
Database["Age_Band"].unique()
# Label-encode Age as an integer band 1-5 with the same right-closed edges
# (x <= 16.336 -> 1, ..., x > 64.084 -> 5) in both frames.
for Data in [Database, Test_Data]:
    Age_Edges = [-np.inf, 16.336, 32.252, 48.168, 64.084, np.inf]
    Data["Age"] = (pd.cut(Data["Age"], bins=Age_Edges, labels=False) + 1).astype(int)
Database.head()
# Graph of label-encoded Age vs survival rate (keyword x/y: positional data
# args are deprecated/removed in current seaborn).
sns.barplot(x=Database["Age"], y=Database["Survived"])
Database.head()
Here we create a feature that tells us whether a person is travelling alone or not.
# SibSp :- number of siblings/spouses aboard
# Parch :- number of parents/children aboard
Database["SibSp"].unique()
Database["Parch"].unique()
# Family size = relatives aboard plus the passenger themself.
for Data in [Database, Test_Data]:
    Data["Family"] = Data["SibSp"] + Data["Parch"] + 1
Database.head()
Database.head()
# BUG FIX: the original lambda returned 1 when Family != 1, i.e. it flagged
# passengers who were NOT alone, inverting the meaning of the column name.
# IsAlone is now 1 exactly for solo travellers (Family == 1) in both frames,
# so the flag matches its name.
Database["IsAlone"] = Database["Family"].apply(lambda x: 1 if(x == 1) else 0)
Test_Data["IsAlone"] = Test_Data["Family"].apply(lambda x: 1 if(x == 1) else 0)
Database.head()
# Keyword x/y: positional data args are deprecated/removed in current seaborn.
sns.barplot(x=Database["IsAlone"], y=Database["Survived"])
From the graph we can observe a clear difference in survival rate between the two IsAlone groups, so we will use IsAlone as a feature.
Next we label-encode Fare using quartile bands.
# Quartile-based fare bands; their edges are used below for label encoding.
Database["Fare_Band"] = pd.qcut(Database["Fare"],4)
Database["Fare_Band"].unique()
The test data contains a missing Fare value, so we fill it with the median fare.
# Fill the missing test-set Fare with the median fare. Assign the result back
# instead of `inplace=True` on a column selection: that form is chained
# assignment, deprecated under pandas Copy-on-Write. The original `.dropna()`
# before `.median()` was redundant -- median() already skips NaN.
Test_Data["Fare"] = Test_Data["Fare"].fillna(Test_Data["Fare"].median())
# Label-encode Fare into the four quartile buckets found above. The original
# per-row Python loop is replaced by an equivalent vectorized pd.cut with the
# same right-closed edges:
#   (-inf, 7.91] -> 0, (7.91, 14.454] -> 1, (14.454, 31] -> 2, (31, inf) -> 3
Fare_Edges = [-np.inf, 7.91, 14.454, 31.0, np.inf]
for Data in [Database, Test_Data]:
    Data["Fare"] = pd.cut(Data["Fare"], bins=Fare_Edges, labels=False)
Database["Fare"] = Database["Fare"].astype(int)
Test_Data["Fare"] = Test_Data["Fare"].astype(int)
Database.head()
Test_Data.head()
# Survival rate per fare band (keyword x/y: positional data args are
# deprecated/removed in current seaborn).
sns.barplot(x=Database["Fare"], y=Database["Survived"])
From the graph we can observe that the survival rate is higher for passengers who paid a higher fare, so we will use Fare as a feature.
# Drop the raw and intermediate columns, keeping only the engineered features.
Database.drop(columns = ["PassengerId", "Name", "Sex", "SibSp", "Parch", "Embarked", "Title", "Age_Band", "Family", "Cabin", "Ticket", "Fare_Band"], inplace = True)
# PassengerId is kept in Test_Data because the submission file needs it.
Test_Data.drop(columns = ["Name", "Sex", "SibSp", "Parch", "Embarked", "Title", "Family", "Cabin", "Ticket"], inplace = True)
Database.head()
Test_Data.head()
# Training feature matrix, training target, and the matching test features.
X = Database[["Pclass", "Age", "Fare", "Male", "Female", "Southampton", "Cherbough", "Queenstown", "Mr", "Miss", "Mrs", "Master", "Rare", "IsAlone"]]
y = Database["Survived"]
Test_X = Test_Data[["Pclass", "Age", "Fare", "Male", "Female", "Southampton", "Cherbough", "Queenstown", "Mr", "Miss", "Mrs", "Master", "Rare", "IsAlone"]]
Here we make predictions with all level-0 models, feed those predictions to the level-1 model, and use the level-1 model's prediction as our final answer.
# Level 0 (base) models: their out-of-fold predictions become the features
# of the level 1 model.
level0 = [
    ('lr', LogisticRegression()),
    ('knn', KNeighborsClassifier()),
    ('cart', DecisionTreeClassifier()),
    ('svm', SVC()),
    ('bayes', GaussianNB()),
]
# Level 1 (meta) model: combines the base-model predictions.
level1 = LogisticRegression()
# 5-fold CV produces the out-of-fold base predictions used to fit level1.
model = StackingClassifier(estimators=level0, final_estimator=level1, cv=5)
model.fit(X, y)
Final = model.predict(Test_X)
# Build the submission file in the format of gender_submission.csv.
Submission = pd.DataFrame({
    "PassengerId": Test_Data["PassengerId"],
    "Survived": Final,
})
Submission.to_csv('Submission.csv', index=False)