TITANIC SURVIVORS PREDICTION

In [1]:
##Step 1: import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, ConfusionMatrixDisplay
In [2]:
##Step 2: Load Titanic Dataset from OpenML
titanic = fetch_openml(name="titanic", version=1, as_frame=True)
df = titanic.frame
print(df.info())
print("Dataset Shape:", df.shape)
df.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   pclass     1309 non-null   float64 
 1   survived   1309 non-null   category
 2   name       1309 non-null   object  
 3   sex        1309 non-null   category
 4   age        1046 non-null   float64 
 5   sibsp      1309 non-null   float64 
 6   parch      1309 non-null   float64 
 7   ticket     1309 non-null   object  
 8   fare       1308 non-null   float64 
 9   cabin      295 non-null    object  
 10  embarked   1307 non-null   category
 11  boat       486 non-null    object  
 12  body       121 non-null    float64 
 13  home.dest  745 non-null    object  
dtypes: category(3), float64(6), object(5)
memory usage: 116.8+ KB
None
Dataset Shape: (1309, 14)
C:\Users\Sri\anaconda3\lib\site-packages\sklearn\datasets\_openml.py:1022: FutureWarning: The default value of `parser` will change from `'liac-arff'` to `'auto'` in 1.4. You can set `parser='auto'` to silence this warning. Therefore, an `ImportError` will be raised from 1.4 if the dataset is dense and pandas is not installed. Note that the pandas parser may return different data types. See the Notes Section in fetch_openml's API doc for details.
  warn(
Out[2]:
pclass survived name sex age sibsp parch ticket fare cabin embarked boat body home.dest
0 1.0 1 Allen, Miss. Elisabeth Walton female 29.0000 0.0 0.0 24160 211.3375 B5 S 2 NaN St Louis, MO
1 1.0 1 Allison, Master. Hudson Trevor male 0.9167 1.0 2.0 113781 151.5500 C22 C26 S 11 NaN Montreal, PQ / Chesterville, ON
2 1.0 0 Allison, Miss. Helen Loraine female 2.0000 1.0 2.0 113781 151.5500 C22 C26 S NaN NaN Montreal, PQ / Chesterville, ON
3 1.0 0 Allison, Mr. Hudson Joshua Creighton male 30.0000 1.0 2.0 113781 151.5500 C22 C26 S NaN 135.0 Montreal, PQ / Chesterville, ON
4 1.0 0 Allison, Mrs. Hudson J C (Bessie Waldo Daniels) female 25.0000 1.0 2.0 113781 151.5500 C22 C26 S NaN NaN Montreal, PQ / Chesterville, ON
In [3]:
##Step 3: EDA
print(df.info())
print(df.describe())

# Survival count
df['survived'].value_counts().plot(kind='bar')
plt.title("Survival Count")
plt.show()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   pclass     1309 non-null   float64 
 1   survived   1309 non-null   category
 2   name       1309 non-null   object  
 3   sex        1309 non-null   category
 4   age        1046 non-null   float64 
 5   sibsp      1309 non-null   float64 
 6   parch      1309 non-null   float64 
 7   ticket     1309 non-null   object  
 8   fare       1308 non-null   float64 
 9   cabin      295 non-null    object  
 10  embarked   1307 non-null   category
 11  boat       486 non-null    object  
 12  body       121 non-null    float64 
 13  home.dest  745 non-null    object  
dtypes: category(3), float64(6), object(5)
memory usage: 116.8+ KB
None
            pclass          age        sibsp        parch         fare  \
count  1309.000000  1046.000000  1309.000000  1309.000000  1308.000000   
mean      2.294882    29.881135     0.498854     0.385027    33.295479   
std       0.837836    14.413500     1.041658     0.865560    51.758668   
min       1.000000     0.166700     0.000000     0.000000     0.000000   
25%       2.000000    21.000000     0.000000     0.000000     7.895800   
50%       3.000000    28.000000     0.000000     0.000000    14.454200   
75%       3.000000    39.000000     1.000000     0.000000    31.275000   
max       3.000000    80.000000     8.000000     9.000000   512.329200   

             body  
count  121.000000  
mean   160.809917  
std     97.696922  
min      1.000000  
25%     72.000000  
50%    155.000000  
75%    256.000000  
max    328.000000  
In [4]:
### Step 4: Preprocessing
numeric_features = ['age', 'fare', 'sibsp', 'parch']
categorical_features = ['sex', 'embarked', 'pclass']

###Missing value imputation

numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])
In [5]:
## Step 5:Select Important Features
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import LabelEncoder

# Prepare data
X = df[['pclass','age','sibsp','parch','fare','body']]
y = df['survived']

# Fill missing values
X = X.fillna(X.mean())

# Apply SelectKBest for selecting numerical features
selector = SelectKBest(score_func=f_classif, k=4)
X_new = selector.fit_transform(X, y)

# Get selected feature names
selected = X.columns[selector.get_support()]

print("Top Selected Features:", list(selected))
Top Selected Features: ['pclass', 'age', 'parch', 'fare']
In [6]:
## Step 5:Select Important Features

"""We choose meaningful columns only."""


features = ['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']
X = df[features]
y = df['survived']
In [7]:
### Step 6: train test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)
In [8]:
### Step 7: Build classification Model
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression())
])
In [9]:
### Step 8: Train Model
model.fit(X_train, y_train)
Out[9]:
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['age', 'fare', 'sibsp',
                                                   'parch']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('encoder',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['sex', 'embarked',
                                                   'pclass'])])),
                ('classifier', LogisticRegression())])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
In [10]:
### Step 9: Test Model
y_pred = model.predict(X_test)
In [11]:
###Step 10: Model Evaluation
print(f"Accuracy %: {accuracy_score(y_test, y_pred):.2f}")

print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))
Accuracy %: 0.77

Classification Report:

              precision    recall  f1-score   support

           0       0.75      0.88      0.81       144
           1       0.82      0.64      0.72       118

    accuracy                           0.77       262
   macro avg       0.78      0.76      0.77       262
weighted avg       0.78      0.77      0.77       262

In [12]:
### Predict unknown sample
pclass_val = int(input("enter value for pclass (integer, e.g., 1, 2, 3): "))
sex_val_str = input("enter value for sex ('male' or 'female'): ")
age_val = float(input("enter value for age (float, e.g., 29.0): "))
sibsp_val = int(input("enter value for sibsp (integer): "))
parch_val = int(input("enter value for parch (integer): "))
fare_val = float(input("enter value for fare (float, e.g., 211.3375): "))
embarked_val_str = input("enter value for embarked ('S', 'C', or 'Q'): ")

# Create a DataFrame for the single input sample
# The column names must match the 'features' list used in cell mSxFqrJzv_Eu
# features = ['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']
input_df = pd.DataFrame([[pclass_val, sex_val_str, age_val, sibsp_val, parch_val, fare_val, embarked_val_str]],
                        columns=['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked'])

y_pred_single = model.predict(input_df)

print(f"Predicted survival for the given input: {y_pred_single[0]}")
enter value for pclass (integer, e.g., 1, 2, 3): 1
enter value for sex ('male' or 'female'): male
enter value for age (float, e.g., 29.0): 30
enter value for sibsp (integer): 15
enter value for parch (integer): 25
enter value for fare (float, e.g., 211.3375): 250
enter value for embarked ('S', 'C', or 'Q'): S
Predicted survival for the given input: 0
In [ ]: