## Step 1: Import Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, ConfusionMatrixDisplay
## Step 2: Load the Titanic Dataset from OpenML
titanic = fetch_openml(name="titanic", version=1, as_frame=True)
df = titanic.frame
df.info()  # info() prints its own summary and returns None, so no print() needed
print("Dataset Shape:", df.shape)
df.head()
## Step 3: Exploratory Data Analysis (EDA)
print(df.describe())
# Survival count
df['survived'].value_counts().plot(kind='bar')
plt.title("Survival Count")
plt.show()
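# Optional EDA sketch: survival proportion by sex (a minimal example using
# pd.crosstab; 'sex' and 'survived' are columns in the OpenML frame).
pd.crosstab(df['sex'], df['survived'], normalize='index').plot(kind='bar')
plt.title("Survival Proportion by Sex")
plt.ylabel("Proportion")
plt.show()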
## Step 4: Preprocessing
numeric_features = ['age', 'fare', 'sibsp', 'parch']
categorical_features = ['sex', 'embarked', 'pclass']
# Missing-value imputation, scaling, and encoding pipelines
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])
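# Sanity check (a minimal sketch, assuming scikit-learn >= 1.0 for
# get_feature_names_out): fit the preprocessor on its own to inspect
# the transformed column names it will emit inside the pipeline.
preprocessor.fit(df[numeric_features + categorical_features])
print(preprocessor.get_feature_names_out())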
## Step 5: Select Important Features
from sklearn.feature_selection import SelectKBest, f_classif
# First, an exploratory univariate ranking of the numeric columns.
# Note: 'body' (the body identification number) is deliberately excluded:
# it is only recorded for passengers who died, so it would leak the target.
X = df[['pclass', 'age', 'sibsp', 'parch', 'fare']]
y = df['survived']
# Fill missing values
X = X.fillna(X.mean())
# Rank the numeric features by ANOVA F-score and keep the top 4
selector = SelectKBest(score_func=f_classif, k=4)
X_new = selector.fit_transform(X, y)
# Get selected feature names
selected = X.columns[selector.get_support()]
print("Top Selected Features:", list(selected))
# Final feature set: keep the meaningful columns, including the categorical
# ones ('sex', 'embarked') that the numeric ranking above cannot score directly.
features = ['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']
X = df[features]
y = df['survived']
## Step 6: Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)  # stratify preserves the class balance
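# Sanity check: with stratify=y, the survival rate should match across splits.
print("Train class balance:\n", y_train.value_counts(normalize=True))
print("Test class balance:\n", y_test.value_counts(normalize=True))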
## Step 7: Build Classification Model
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))  # generous max_iter to ensure convergence
])
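# Optional: a quick 5-fold cross-validation sketch on the training set for a
# less split-dependent accuracy estimate (uses the estimator's default scorer).
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(model, X_train, y_train, cv=5)
print(f"CV accuracy: {cv_scores.mean():.2f} +/- {cv_scores.std():.2f}")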
## Step 8: Train Model
model.fit(X_train, y_train)
## Step 9: Test Model
y_pred = model.predict(X_test)
## Step 10: Model Evaluation
print(f"Accuracy %: {accuracy_score(y_test, y_pred):.2f}")
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))
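# Confusion matrix, using the helpers already imported in Step 1
# (ConfusionMatrixDisplay.from_predictions assumes scikit-learn >= 1.0).
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
ConfusionMatrixDisplay.from_predictions(y_test, y_pred)
plt.title("Confusion Matrix")
plt.show()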
## Step 11: Predict an Unknown Sample
pclass_val = int(input("Enter pclass (1, 2, or 3): "))
sex_val_str = input("Enter sex ('male' or 'female'): ")
age_val = float(input("Enter age (e.g., 29.0): "))
sibsp_val = int(input("Enter sibsp (number of siblings/spouses aboard): "))
parch_val = int(input("Enter parch (number of parents/children aboard): "))
fare_val = float(input("Enter fare (e.g., 211.3375): "))
embarked_val_str = input("Enter embarked ('S', 'C', or 'Q'): ")
# Create a single-row DataFrame for the input sample.
# The column names must match the 'features' list defined in Step 5.
input_df = pd.DataFrame(
    [[pclass_val, sex_val_str, age_val, sibsp_val, parch_val, fare_val, embarked_val_str]],
    columns=features)
y_pred_single = model.predict(input_df)
print(f"Predicted survival for the given input: {y_pred_single[0]}")