In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import seaborn as sns
In [3]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
In [4]:
train_df.head()
Out[4]:
| | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
In [5]:
test_df.head()
Out[5]:
| | PassengerId | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 892 | 3 | Kelly, Mr. James | male | 34.5 | 0 | 0 | 330911 | 7.8292 | NaN | Q |
| 1 | 893 | 3 | Wilkes, Mrs. James (Ellen Needs) | female | 47.0 | 1 | 0 | 363272 | 7.0000 | NaN | S |
| 2 | 894 | 2 | Myles, Mr. Thomas Francis | male | 62.0 | 0 | 0 | 240276 | 9.6875 | NaN | Q |
| 3 | 895 | 3 | Wirz, Mr. Albert | male | 27.0 | 0 | 0 | 315154 | 8.6625 | NaN | S |
| 4 | 896 | 3 | Hirvonen, Mrs. Alexander (Helga E Lindqvist) | female | 22.0 | 1 | 1 | 3101298 | 12.2875 | NaN | S |
In [6]:
train_df.shape
Out[6]:
(891, 12)
In [7]:
test_df.shape
Out[7]:
(418, 11)
In [8]:
train_df.isnull().sum()
Out[8]:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
In [9]:
test_df.isnull().sum()
Out[9]:
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64
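The raw counts above are easier to compare as percentages of each dataset. A small helper sketch (not run in this notebook; missing_pct is a hypothetical helper, not part of the original code):

def missing_pct(df):
    # Percentage of missing values per column, largest first
    pct = df.isnull().mean().mul(100).round(2)
    return pct[pct > 0].sort_values(ascending=False)

print(missing_pct(train_df))
print(missing_pct(test_df))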
In [10]:
train_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   PassengerId  891 non-null    int64
 1   Survived     891 non-null    int64
 2   Pclass       891 non-null    int64
 3   Name         891 non-null    object
 4   Sex          891 non-null    object
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64
 7   Parch        891 non-null    int64
 8   Ticket       891 non-null    object
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object
 11  Embarked     889 non-null    object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
In [11]:
train_df.describe()
Out[11]:
| | PassengerId | Survived | Pclass | Age | SibSp | Parch | Fare |
|---|---|---|---|---|---|---|---|
| count | 891.000000 | 891.000000 | 891.000000 | 714.000000 | 891.000000 | 891.000000 | 891.000000 |
| mean | 446.000000 | 0.383838 | 2.308642 | 29.699118 | 0.523008 | 0.381594 | 32.204208 |
| std | 257.353842 | 0.486592 | 0.836071 | 14.526497 | 1.102743 | 0.806057 | 49.693429 |
| min | 1.000000 | 0.000000 | 1.000000 | 0.420000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 223.500000 | 0.000000 | 2.000000 | 20.125000 | 0.000000 | 0.000000 | 7.910400 |
| 50% | 446.000000 | 0.000000 | 3.000000 | 28.000000 | 0.000000 | 0.000000 | 14.454200 |
| 75% | 668.500000 | 1.000000 | 3.000000 | 38.000000 | 1.000000 | 0.000000 | 31.000000 |
| max | 891.000000 | 1.000000 | 3.000000 | 80.000000 | 8.000000 | 6.000000 | 512.329200 |
In [12]:
test_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   PassengerId  418 non-null    int64
 1   Pclass       418 non-null    int64
 2   Name         418 non-null    object
 3   Sex          418 non-null    object
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64
 6   Parch        418 non-null    int64
 7   Ticket       418 non-null    object
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object
 10  Embarked     418 non-null    object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.1+ KB
In [13]:
test_df.describe()
Out[13]:
| | PassengerId | Pclass | Age | SibSp | Parch | Fare |
|---|---|---|---|---|---|---|
| count | 418.000000 | 418.000000 | 332.000000 | 418.000000 | 418.000000 | 417.000000 |
| mean | 1100.500000 | 2.265550 | 30.272590 | 0.447368 | 0.392344 | 35.627188 |
| std | 120.810458 | 0.841838 | 14.181209 | 0.896760 | 0.981429 | 55.907576 |
| min | 892.000000 | 1.000000 | 0.170000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 996.250000 | 1.000000 | 21.000000 | 0.000000 | 0.000000 | 7.895800 |
| 50% | 1100.500000 | 3.000000 | 27.000000 | 0.000000 | 0.000000 | 14.454200 |
| 75% | 1204.750000 | 3.000000 | 39.000000 | 1.000000 | 0.000000 | 31.500000 |
| max | 1309.000000 | 3.000000 | 76.000000 | 8.000000 | 9.000000 | 512.329200 |
In [14]:
print(train_df['Sex'].unique())
['male' 'female']
In [15]:
print(train_df['Embarked'].unique())
['S' 'C' 'Q' nan]
In [16]:
def data_preprocessing(df, is_train=True):
    # Drop identifier-like columns that are not used as features
    dropping_columns = ["Name", "Ticket", "Cabin", "PassengerId"]
    df = df.drop(columns=dropping_columns, errors="ignore")
    if "Age" in df.columns:
        # Impute missing ages with the median age
        df["Age"] = df["Age"].fillna(df["Age"].median())
    if "Embarked" in df.columns:
        # Impute missing ports with the most frequent one, then one-hot encode
        df["Embarked"] = df["Embarked"].fillna(df["Embarked"].mode()[0])
        df = pd.get_dummies(df, columns=["Embarked"], drop_first=True, dtype=int)
    if "Fare" in df.columns:
        # The test set has one missing fare; impute with the median
        df["Fare"] = df["Fare"].fillna(df["Fare"].median())
    if "Sex" in df.columns:
        # Encode Sex as a binary feature
        df["Sex"] = df["Sex"].map({"male": 1, "female": 0})
    if is_train:
        # Split off the target for training data
        X = df.drop(["Survived"], axis=1)
        y = df["Survived"]
        return X, y
    return df
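As an alternative, the same steps (median imputation for the numeric columns, most-frequent imputation plus one-hot encoding for Sex and Embarked) could be packaged in a scikit-learn ColumnTransformer, so that exactly the same transform fitted on the training data is applied to the test data. This is only a sketch, not what this notebook actually uses:

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

numeric_cols = ["Pclass", "Age", "SibSp", "Parch", "Fare"]
categorical_cols = ["Sex", "Embarked"]

# Numeric columns: fill missing values with the median.
# Categorical columns: fill with the most frequent value, then one-hot encode.
preprocess = ColumnTransformer([
    ("num", SimpleImputer(strategy="median"), numeric_cols),
    ("cat", Pipeline([
        ("impute", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]), categorical_cols),
])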
In [17]:
x_train, y_train = data_preprocessing(train_df, is_train=True)
In [18]:
print(x_train)
     Pclass  Sex   Age  SibSp  Parch     Fare  Embarked_Q  Embarked_S
0         3    1  22.0      1      0   7.2500           0           1
1         1    0  38.0      1      0  71.2833           0           0
2         3    0  26.0      0      0   7.9250           0           1
3         1    0  35.0      1      0  53.1000           0           1
4         3    1  35.0      0      0   8.0500           0           1
..      ...  ...   ...    ...    ...      ...         ...         ...
886       2    1  27.0      0      0  13.0000           0           1
887       1    0  19.0      0      0  30.0000           0           1
888       3    0  28.0      1      2  23.4500           0           1
889       1    1  26.0      0      0  30.0000           0           0
890       3    1  32.0      0      0   7.7500           1           0

[891 rows x 8 columns]
In [19]:
print(y_train)
0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64
In [20]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
sns.barplot(x="Sex", y="Survived", data=train_df)
train_df.groupby("Sex")["Survived"].mean() * 100
Out[20]:
Sex
female    74.203822
male      18.890815
Name: Survived, dtype: float64
Survival Rate by Gender
The bar chart above shows the survival rate by gender, and the difference is significant: the average survival rate for women was around 74%, whereas for men it was only about 19%. This means gender is strongly correlated with survival.
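Gender can also be crossed with passenger class to see how the two interact; a short follow-up sketch on the same train_df:

# Survival rate (%) by Sex and Pclass
print(train_df.groupby(["Sex", "Pclass"])["Survived"].mean().mul(100).round(1))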
In [21]:
sns.histplot(test_df["Age"], bins=30, kde=True)
Out[21]:
<Axes: xlabel='Age', ylabel='Count'>
The survival rate for passengers in the 18–35 age group was much higher than for other groups
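Note that the histogram above only shows the age distribution (of the test set, at that); to actually check survival by age band, the training data has to be used. A sketch, with arbitrarily chosen bin edges:

# Survival rate (%) per age band on the training set; bin edges are an arbitrary choice
age_bands = pd.cut(train_df["Age"], bins=[0, 12, 18, 35, 60, 80])
print(train_df.groupby(age_bands, observed=False)["Survived"].mean().mul(100).round(1))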
In [22]:
X_train, X_test, Y_train, Y_test = train_test_split(x_train, y_train, test_size=0.2, random_state=42)
model = LogisticRegression(max_iter=1000)
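The lbfgs solver tends to converge more easily when the features are on comparable scales (Fare and Age are much larger than the dummy columns). A sketch of adding a StandardScaler in front of the classifier; the model actually fitted below is the plain LogisticRegression:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Optional variant: scale features before logistic regression
scaled_model = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))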
In [23]:
model.fit(X_train,Y_train)
Out[23]:
LogisticRegression(max_iter=1000)
In [24]:
new_y_predict = model.predict(X_test)
In [25]:
new_y_predict
Out[25]:
array([0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1])
In [26]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
print("Accuracy:", accuracy_score(Y_test, new_y_predict))
print(confusion_matrix(Y_test, new_y_predict))
print(classification_report(Y_test, new_y_predict))
Accuracy: 0.8100558659217877
[[90 15]
 [19 55]]
              precision    recall  f1-score   support

           0       0.83      0.86      0.84       105
           1       0.79      0.74      0.76        74

    accuracy                           0.81       179
   macro avg       0.81      0.80      0.80       179
weighted avg       0.81      0.81      0.81       179
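The 81% accuracy comes from a single 80/20 split; k-fold cross-validation gives an estimate that depends less on one particular split. A sketch reusing the preprocessed x_train and y_train:

from sklearn.model_selection import cross_val_score

# 5-fold cross-validated accuracy on the full preprocessed training data
cv_scores = cross_val_score(LogisticRegression(max_iter=1000), x_train, y_train, cv=5)
print(cv_scores.mean(), cv_scores.std())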
In [27]:
processed_data = data_preprocessing(test_df, is_train=False)
predicted_data = model.predict(processed_data)
In [28]:
submission_file = pd.DataFrame({"PassengerId": test_df["PassengerId"], "Survived": predicted_data})
In [29]:
submission_file.head()
Out[29]:
| | PassengerId | Survived |
|---|---|---|
| 0 | 892 | 0 |
| 1 | 893 | 0 |
| 2 | 894 | 0 |
| 3 | 895 | 0 |
| 4 | 896 | 1 |
In [30]:
submission_file.to_csv("submission.csv", index=False)
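Before uploading, a quick sanity check that the saved file has the expected 418 rows and exactly the two required columns can catch formatting mistakes; a small sketch:

# Sanity-check the submission file: 418 rows, PassengerId and Survived columns only
check = pd.read_csv("submission.csv")
assert check.shape == (418, 2)
assert list(check.columns) == ["PassengerId", "Survived"]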