"Data visualization is the representation of data through the use of common graphics, such as charts, plots, infographics, and even animations, to translate the data/information into a visual context."
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import files
# Open Colab's upload dialog; the result is indexed by file name and holds
# the raw bytes of each uploaded file.
uploaded = files.upload()
import io
# Parse the uploaded bytes directly. One read is enough: a second
# pd.read_csv of the same file would only replace this identical frame.
my_data = pd.read_csv(io.BytesIO(uploaded['Titanic.csv']))
Below are the features provided in the Test dataset.
Passenger Id: an id given to each traveler on the boat. Pclass: the passenger class. It has three possible values: 1, 2, 3 (first, second and third class). The Name of the passenger. Sex. Age. SibSp: number of siblings and spouses traveling with the passenger. Parch: number of parents and children traveling with the passenger. The ticket number. The ticket Fare. The cabin number. The embarkation: this describes the three possible ports from which people boarded the Titanic. Three possible values: S, C, Q.
# Human-readable names for the outcome classes, keyed by class id.
# A key: value mapping must be a dict literal (braces); square brackets
# with colons do not parse.
# NOTE(review): other cells filter on Survived == 0 / Survived == 1, so the
# key 2 may have been meant as 0 -- confirm before using this for lookups.
class_label = {1: 'Survived', 2: 'Not Survived'}
# Column names, dtypes and non-null counts.
my_data.info()

# Preview of the first rows.
first_rows = my_data.head()
print(first_rows)

# Summary statistics of the numeric columns (rendered by the notebook).
my_data.describe()
Finding Unique values
# Distinct values of the Embarked column: Southampton, Cherbourg, and
# Queenstown, where the boarding has happened.
embarked_column = my_data["Embarked"]
embarked_column.unique()
HANDLING MISSING VALUES
# Central tendency of Age, ignoring missing entries.
m1 = my_data["Age"].median(skipna=True)
m2 = my_data["Age"].mean(skipna=True)
print("Median: {} and Mean: {} | Median age is 28 as compared to mean which is ~30".format(m1, m2))

# Missing-value report for Age: absolute count and share of all rows.
a = my_data["Age"].isnull().sum()  # count of missing values in Age
b = round(a / len(my_data), 4)  # fraction (0-1) of rows with Age missing
# Round after scaling so the percentage prints without float noise.
print("Count of missing Values : {} , The Proportion of this values with dataset is {}\n".format(a, round(b * 100, 2)))

# Same report for Fare.
a = my_data["Fare"].isnull().sum()  # count of missing values in Fare
b = round(a / len(my_data), 4)  # fraction (0-1) of rows with Fare missing
print("Count of missing Values : {} , The Proportion of Fare values with dataset is {}\n".format(a, round(b * 100, 2)))

###HANDLING MISSING VALUES
# NOTE: this binds a second name to the same DataFrame (no copy is made),
# so every change made through train_data is also visible through my_data.
train_data = my_data
# Assign the filled column back instead of calling fillna(inplace=True) on a
# selected column: the in-place form may act on a temporary object and leave
# the frame unchanged. The computed median replaces the hard-coded 28.
train_data["Age"] = train_data["Age"].fillna(m1)
train_data["Embarked"] = train_data["Embarked"].fillna("S")
#train_data.drop('Cabin', axis=1, inplace=True)
train_data.info()
Line charts are used to represent the relation between two variables X and Y, each plotted on its own axis. Here we will see some examples of line charts in Python:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Age of every passenger plotted against its row position.
y2 = my_data["Age"]
x2 = np.arange(y2.shape[0])
#x2 = my_data["Survived"]
plt.plot(x2, y2)
plt.show()

# Age against the Survived column.
sns.lineplot(x="Survived", y="Age", data=my_data)

# Mean age: all rows, then rows with Survived > 0, then rows with Survived == 0.
print(y2.mean())
survived_mean = my_data.query("Survived > 0")
print(survived_mean["Age"].mean())
not_survived_mean = my_data.query("Survived == 0")
print(not_survived_mean["Age"].mean())

# Age per embarkation value, one line per Survived value.
sns.lineplot(data=my_data, x="Embarked", y="Age", hue="Survived");
A bar chart or bar graph is a chart or graph that presents categorical data with rectangular bars with heights or lengths proportional to the values that they represent. The bars can be plotted vertically or horizontally.
# Fixed left-to-right order of the Embarked categories on the x axis.
category_order = ['S', 'C', 'Q']
sns.catplot(data=my_data, x='Embarked', kind='count', order=category_order)
plt.show()
""" Count plot: It gives you the count of the instances of variable under each category"""
# Age per Survived value, drawn as bars.
sns.barplot(x="Survived", y="Age", data=my_data)
plt.show()
""" Bar plots look similar to count plots, but instead of the count of observations in each category, they show the mean of a quantitative variable among observations in each category."""
The younger you are the more likely to survive?
import seaborn as sns
import matplotlib.pyplot as plt

sns.set_theme(style="whitegrid")

# Fill any still-missing ages with random integers drawn from about one
# standard deviation around the mean, then store Age as integers.
data = [train_data]
for dataset in data:
    mean = dataset["Age"].mean()
    std = dataset["Age"].std()
    is_null = dataset["Age"].isnull().sum()
    # randint works on integer bounds, so truncate the float limits explicitly.
    rand_age = np.random.randint(int(mean - std), int(mean + std), size=is_null)
    # Work on a copy of the column and write it back in a single assignment.
    age_slice = dataset["Age"].copy()
    age_slice[age_slice.isnull()] = rand_age
    dataset["Age"] = age_slice.astype(int)

survived = 'survived'
not_survived = 'not survived'
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(16, 8))
women = train_data[train_data['Sex'] == 'female']
men = train_data[train_data['Sex'] == 'male']

# histplot replaces the deprecated distplot(kde=False); both draw count
# histograms. alpha keeps the two overlaid histograms see-through.
ax = sns.histplot(women[women['Survived'] == 1].Age.dropna(), bins=18, label=survived, ax=axes[0], kde=False, color="green", alpha=0.4)
ax = sns.histplot(women[women['Survived'] == 0].Age.dropna(), bins=40, label=not_survived, ax=axes[0], kde=False, color="red", alpha=0.4)
ax.legend()
ax.set_title('Female')
ax = sns.histplot(men[men['Survived'] == 1].Age.dropna(), bins=18, label=survived, ax=axes[1], kde=False, color="green", alpha=0.4)
ax = sns.histplot(men[men['Survived'] == 0].Age.dropna(), bins=40, label=not_survived, ax=axes[1], kde=False, color="red", alpha=0.4)
ax.legend()
_ = ax.set_title('Male');
Influence of Passenger Class on Survival
# Survived per passenger class, drawn as bars.
sns.barplot(x='Pclass', y='Survived', data=train_data);

# Larger tick labels for the figures that follow.
plt.rc('xtick', labelsize=14)
plt.rc('ytick', labelsize=14)

# Histogram of Pclass, one series per Survived group (0 first, then 1).
plt.figure()
fig = train_data.groupby('Survived')['Pclass'].plot.hist(histtype='bar', alpha=0.8)
plt.legend(('Died', 'Survived'), fontsize=12)
plt.xlabel('Pclass', fontsize=18)
plt.show()

# mode() returns a Series. Passing that Series to fillna would align it on
# index labels and fill only the matching rows, so fill with its first value.
embarked_mode = train_data['Embarked'].mode()
data = [train_data]
for dataset in data:
    if not embarked_mode.empty:
        dataset['Embarked'] = dataset['Embarked'].fillna(embarked_mode.iloc[0])

# One facet row per Embarked value: Survived against Pclass, split by Sex.
# `height` is the current name of the former `size` argument.
FacetGrid = sns.FacetGrid(train_data, row='Embarked', height=4.5, aspect=1.6)
FacetGrid.map(sns.pointplot, 'Pclass', 'Survived', 'Sex', order=None, hue_order=None)
FacetGrid.add_legend();
Embarked Influence on Survival
sns.set(style="darkgrid")
# Passenger counts per Survived value, split by Embarked.
sns.countplot(x='Survived', data=train_data, hue="Embarked", palette="Set1");

# relatives = SibSp + Parch; travelled_alone is 'Yes' when that sum is 0
# and 'No' when it is greater than 0.
data = [train_data]
for dataset in data:
    dataset['relatives'] = dataset['SibSp'] + dataset['Parch']
    dataset.loc[dataset['relatives'] > 0, 'travelled_alone'] = 'No'
    dataset.loc[dataset['relatives'] == 0, 'travelled_alone'] = 'Yes'

# catplot(kind='point') is the current spelling of the removed factorplot,
# whose default plot kind was the point plot.
axes = sns.catplot(x='relatives', y='Survived',
                   data=train_data, kind='point', aspect=2.5);
A Pie Chart is a circular statistical plot that can display only one series of data. The area of the chart is the total percentage of the given data. The area of slices of the pie represents the percentage of the parts of the data. The slices of pie are called wedges. The area of the wedge is determined by the length of the arc of the wedge. The area of a wedge represents the relative percentage of that part with respect to whole data. Pie charts are commonly used in business presentations like sales, operations, survey results, resources, etc as they provide a quick summary.
#PIE CHART
import matplotlib.pyplot as plt
import seaborn as sns

# Wedge sizes and the label shown next to each wedge.
data = [650, 180, 61]
class_label = ["S", "C", "Q"]

# First three colours of Seaborn's pastel palette, one per wedge.
colors = sns.color_palette('pastel')[:3]

# Draw the pie with whole-number percentage annotations.
plt.pie(x=data, labels=class_label, colors=colors, autopct='%.0f%%')
plt.show()
Donut Charts or Doughnut Charts are a special kind of Pie chart with the difference that it has a Blank Circle at the center. The whole ring represents the data series taken into consideration. Each piece of this ring represents the proportion of the whole Data Series or percentage of total if the whole ring represents 100% of data. Donut Chart got its name from the Donuts which has a circle at its center.
# Donut chart: a pie with a white disc drawn over its centre.
# Wedge sizes and their labels.
data = [650, 180, 61]
class_label = [1, 2, 3]

# Changes the text colour globally, so the wedge labels below (and any text
# drawn afterwards) are red.
plt.rcParams['text.color'] = 'red'

# Draw the pie once, with labels and explicit wedge colours; an unlabelled
# pie underneath would be completely covered by this one.
plt.pie(data, labels=class_label, colors=['red', 'green', 'blue'])

# White circle at the centre turns the pie into a donut.
my_circle = plt.Circle((0, 0), 0.7, color='white')
p = plt.gcf()
p.gca().add_artist(my_circle)

# Show the graph
plt.show()
A scatter plot is a plot or mathematical diagram using Cartesian coordinates to display values for typically two variables for a set of data. Here each value in the data set is represented by a dot. It is used for understanding the relationship between the 2 variables.
# Plain scatter of Fare against Age with a grid.
my_data.plot.scatter(x='Age', y='Fare')
plt.grid()

sns.set_style("whitegrid")
# The same Age/Fare scatter on a FacetGrid: `hue` colours the points by the
# Survived column and `height` sets the height of the facet.
age_fare_grid = sns.FacetGrid(my_data, hue="Survived", height=6)
age_fare_grid.map(plt.scatter, 'Age', 'Fare').add_legend()
Pair Plot: A “pairs plot” is also known as a scatterplot matrix. In a scatterplot, one variable's value in a data row is matched with another variable's value in the same row. Pairs plots are just elaborations on this, showing all variables paired with all the other variables.
###PAIR PLOT
# Pairwise plots of the columns of my_data, coloured by Survived.
#sns.pairplot(data=my_data,kind='scatter')
sns.pairplot(data=my_data, hue='Survived')
The box plot was first introduced in 1969 by the mathematician John Tukey. A box plot gives a statistical summary of the features being plotted. The top line represents the max value, the top edge of the box is the third quartile, the middle line represents the median, and the bottom edge represents the first quartile. The bottom-most line represents the minimum value of the feature. The height of the box is called the interquartile range. The black dots on the plot represent the outlier values in the data.
### BOX PLOT
# One Age box per Survived value.
#sns.boxplot(x=my_data["Age"])
sns.boxplot(data=my_data, x='Survived', y='Age')
A heatmap is a two-dimensional graphical representation of data where the individual values that are contained in a matrix are represented as colours. The Seaborn package allows the creation of annotated heatmaps which can be tweaked using Matplotlib tools as per the creator's requirement.
from scipy import stats

# Point-biserial correlation between the Survived column and Age.
a, b = train_data["Survived"], train_data["Age"]
stats.pointbiserialr(x=a, y=b)
A histogram is a graph showing frequency distributions. It is a graph showing the number of observations within each given interval.
# Frequency distribution of the Fare column.
sns.histplot(x="Fare", data=my_data)
#sns.kdeplot(data=my_data, x="Fare")
A density plot is a type of data visualization tool. It is a variation of the histogram that uses ‘kernel smoothing’ while plotting the values. It is a continuous and smooth version of a histogram inferred from the data.
print("Density Plot of Age for Surviving Population and Deceased Population")
plt.figure(figsize=(15, 8))
# One Age density curve per outcome. fill=True shades the area under the
# curve (it replaces the deprecated shade=True), and .loc selects rows and
# column in one step instead of chained indexing.
sns.kdeplot(train_data.loc[train_data.Survived == 1, "Age"], color="darkturquoise", fill=True)
sns.kdeplot(train_data.loc[train_data.Survived == 0, "Age"], color="lightcoral", fill=True)
# Legend entries follow the drawing order above.
plt.legend(['Survived', 'Died'])
plt.title('Density Plot of Age for Surviving Population and Deceased Population')
plt.show()