In [41]:
print("hello world! welcome to data pre processing")
hello world

Data preprocessing is an important step in the data mining process. Even before any visual analysis, the data should be cleaned. Data preprocessing refers to cleaning, transforming, and integrating data to make it ready for analysis; the goal is to improve the quality of the data and make it more suitable for the specific data mining task.

In [42]:
import numpy as np
import pandas as pd


# second copy of the dataset for operation 2; '#NAME?' entries are read as NaN
df2 = pd.read_csv('diabetes_null.csv', na_values=['#NAME?'])

Handling Missing Values

Delete Row

In [43]:
df1 = pd.read_csv('diabetes_null.csv', na_values=['#NAME?'])
df1.isnull().sum().sort_values(ascending=False)
df_no_missing = df1.dropna(axis=0)
print(df_no_missing.head(5))
print('row count:',len(df1))
print('column count:',len(df1.columns))
#df1.info
    Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
3             1     89.0           66.0           23.0     94.0  28.1   
4             0    137.0            4.0           35.0    168.0  43.1   
6             3     78.0            5.0           32.0     88.0  31.0   
8             2    197.0            7.0           45.0    543.0   3.5   
13            1    189.0            6.0           23.0    846.0   3.1   

    DiabetesPedigreeFunction  Age  Outcome  
3                      0.167   21        0  
4                      2.288   33        1  
6                      0.248   26        1  
8                      0.158   53        1  
13                     0.398   59        1  
row count: 768
column count: 9
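
Note that len(df1) above reports the row count of the original frame, not of df_no_missing. A quick sketch to see how many rows dropna() actually removed (the counts depend on the data file, so none are shown here):

In [ ]:
# compare row counts before and after dropping rows with any missing value
print('rows before drop:', len(df1))
print('rows after drop :', len(df_no_missing))
print('rows removed    :', len(df1) - len(df_no_missing))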

Fill with mean

In [3]:
from sklearn.preprocessing import Imputer

# Load the dataset (CSV exported from Excel; '#NAME?' entries read as NaN)
df1 = pd.read_csv('diabetes_null.csv', na_values=['#NAME?'])

# Imputer replaces each NaN with the mean of its column
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)


imp.fit(df1)
df1 = pd.DataFrame(data=imp.transform(df1), columns=df1.columns)

#print
print(df1.head(5))
   Pregnancies  Glucose  BloodPressure  SkinThickness     Insulin   BMI  \
0          6.0    148.0           72.0      35.000000  105.659898  33.6   
1          1.0     85.0           66.0      29.000000  105.659898  26.6   
2          8.0    183.0           64.0      25.876155  105.659898  23.3   
3          1.0     89.0           66.0      23.000000   94.000000  28.1   
4          0.0    137.0            4.0      35.000000  168.000000  43.1   

   DiabetesPedigreeFunction   Age  Outcome  
0                     0.627   5.0      1.0  
1                     0.351  31.0      0.0  
2                     0.672  32.0      1.0  
3                     0.167  21.0      0.0  
4                     2.288  33.0      1.0  
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\deprecation.py:58: DeprecationWarning: Class Imputer is deprecated; Imputer was deprecated in version 0.20 and will be removed in 0.22. Import impute.SimpleImputer from sklearn instead.
  warnings.warn(msg, category=DeprecationWarning)
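
As the warning says, Imputer was deprecated in scikit-learn 0.20 in favour of impute.SimpleImputer. A minimal sketch of the same mean imputation with the newer class (it also accepts strategy='median' for the next section):

In [ ]:
from sklearn.impute import SimpleImputer

df1 = pd.read_csv('diabetes_null.csv', na_values=['#NAME?'])

# SimpleImputer expects np.nan (not the string 'NaN') and always imputes column-wise
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
df1 = pd.DataFrame(imp.fit_transform(df1), columns=df1.columns)
print(df1.head(5))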

Fill with median

In [4]:
from sklearn.preprocessing import Imputer

# Load the dataset (CSV exported from Excel; '#NAME?' entries read as NaN)
df1 = pd.read_csv('diabetes_null.csv', na_values=['#NAME?'])

# Imputer replaces each NaN with the median of its column
imp = Imputer(missing_values='NaN', strategy='median', axis=0)


imp.fit(df1)
df1 = pd.DataFrame(data=imp.transform(df1), columns=df1.columns)

#print
print(df1.head(5))
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0          6.0    148.0           72.0           35.0     71.0  33.6   
1          1.0     85.0           66.0           29.0     71.0  26.6   
2          8.0    183.0           64.0           27.0     71.0  23.3   
3          1.0     89.0           66.0           23.0     94.0  28.1   
4          0.0    137.0            4.0           35.0    168.0  43.1   

   DiabetesPedigreeFunction   Age  Outcome  
0                     0.627   5.0      1.0  
1                     0.351  31.0      0.0  
2                     0.672  32.0      1.0  
3                     0.167  21.0      0.0  
4                     2.288  33.0      1.0  
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\deprecation.py:58: DeprecationWarning: Class Imputer is deprecated; Imputer was deprecated in version 0.20 and will be removed in 0.22. Import impute.SimpleImputer from sklearn instead.
  warnings.warn(msg, category=DeprecationWarning)

Handling Noisy Data

Function to find outliers

In [5]:
def find_outliers_tukey(x):
    # Tukey's fences: flag values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] as outliers
    q1 = x.quantile(.25)
    q3 = x.quantile(.75)
    iqr = q3 - q1
    floor = q1 - 1.5*iqr
    ceiling = q3 + 1.5*iqr
    outlier_indices = list(x.index[(x < floor) | (x > ceiling)])
    outlier_values = list(x[outlier_indices])
    return outlier_indices, outlier_values

Outliers found in each column

In [6]:
glucose_indices, glucose_values = find_outliers_tukey(df1['Glucose'])
print("Outliers for Glucose")
print(np.sort(glucose_values))

print("Outliers for Pregnancies")
pr_indices, pr_values = find_outliers_tukey(df1['Pregnancies'])
print(np.sort(pr_values))

print("Outliers for BloodPressure")
bp_indices, bp_values = find_outliers_tukey(df1['BloodPressure'])
print(np.sort(bp_values))


print("Outliers for SkinThickness")
st_indices, st_values = find_outliers_tukey(df1['SkinThickness'])
print(np.sort(st_values))

print("Outliers for Insulin")
in_indices, in_values = find_outliers_tukey(df1['Insulin'])
print(np.sort(in_values))

print("Outliers for BMI")
bmi_indices, bmi_values = find_outliers_tukey(df1['BMI'])
print(np.sort(bmi_values))

print("Outliers for DiabetesPedigreeFunction")
dpf_indices, dpf_values = find_outliers_tukey(df1['DiabetesPedigreeFunction'])
print(np.sort(dpf_values))

print("Outliers for Age")
age_indices, age_values = find_outliers_tukey(df1['Age'])
print(np.sort(age_values))
Outliers for Glucose
[]
Outliers for Pregnancies
[14. 14. 15. 17.]
Outliers for BloodPressure
[122.]
Outliers for SkinThickness
[ 1.  1.  1.  1.  1.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.
  3.  3.  3.  3.  3.  3.  3.  3.  3.  3.  3.  3.  3.  3.  3.  3.  3.  3.
  3.  3.  3.  3.  3.  3.  3.  3.  3.  4.  4.  4.  4.  4.  4.  4.  4.  4.
  4.  4.  4.  4.  4.  4.  4.  5.  5.  5.  6. 48. 48. 48. 48. 49. 49. 49.
 51. 52. 52. 54. 54. 56. 63. 99.]
Outliers for Insulin
[  1.   1.   1.   1.   1.   1.   1.   2.   2.   2.   2.   3.   4.   4.
   5.   5.   5.   6.   6.   6.   7.   7.   7.   9.   9.   9.   9.  11.
  11.  11.  11.  11.  11.  12.  12.  12.  12.  12.  12.  12.  12.  13.
  13.  13.  13.  13.  13.  13.  13.  13.  14.  14.  14.  14.  14.  14.
  14.  14.  14.  14.  15.  15.  15.  15.  15.  15.  15.  15.  15.  15.
  15.  15.  15.  15.  16.  16.  16.  16.  16.  16.  16.  16.  17.  17.
  18.  18.  18.  18.  18.  18.  18.  18.  18.  18.  19.  19.  19.  19.
  21.  21.  21.  21.  21.  22.  22.  22.  23.  23.  23.  23.  24.  24.
  24.  25.  25.  25.  25.  27.  27.  27.  28.  29.  31.  32.  33.  34.
  36.  36.  36.  36.  37.  37.  37.  38.  41.  42.  42.  43.  44.  44.
  44.  44.  45.  45.  45.  46.  48.  48.  48.  48.  48.  49.  49.  49.
  49.  49.  51.  51.  52.  53.  53.  54.  54.  54.  54.  54.  55.  55.
  56.  56.  56.  56.  56.  84.  85.  85.  86.  87.  87.  88.  88.  88.
  88.  89.  91.  92.  92.  92.  94.  94.  94.  94.  94.  94.  94.  95.
  95.  96.  96.  99.  99. 112. 114. 114. 115. 115. 115. 115. 115. 115.
 116. 116. 119. 122. 122. 125. 125. 125. 125. 126. 126. 126. 127. 128.
 129. 132. 132. 135. 135. 135. 135. 135. 135. 142. 144. 144. 145. 145.
 145. 146. 148. 148. 152. 152. 155. 155. 155. 155. 156. 156. 156. 158.
 158. 159. 165. 165. 165. 165. 166. 167. 167. 168. 168. 168. 168. 171.
 175. 175. 175. 176. 176. 176. 178. 182. 182. 182. 183. 184. 185. 185.
 188. 191. 192. 192. 193. 194. 194. 194. 196. 215. 215. 215. 225. 225.
 228. 231. 231. 235. 237. 245. 249. 255. 258. 265. 265. 271. 272. 274.
 275. 277. 278. 284. 285. 285. 291. 293. 293. 318. 321. 325. 325. 325.
 326. 328. 335. 342. 375. 387. 392. 415. 465. 474. 478. 485. 495. 495.
 543. 545. 579. 744. 846.]
Outliers for BMI
[ 2.   2.1  2.4  2.4  2.8  2.8  3.   3.   3.   3.   3.   3.   3.   3.1
  3.1  3.1  3.1  3.1  3.1  3.1  3.1  3.1  3.2  3.3  3.4  3.4  3.4  3.4
  3.4  3.4  3.4  3.5  3.5  3.5  3.5  3.5  3.5  3.5  3.7  3.8  3.8  3.8
  3.8  3.8  3.8  3.8  3.8  3.8  3.9  3.9  3.9  3.9  3.9  4.   4.   4.1
  4.2  4.5  4.5  4.5  4.6  4.6  4.6  4.6  4.7  4.8  4.9  4.9  5.  52.3
 52.3 52.9 53.2 55.  57.3 59.4 67.1]
Outliers for DiabetesPedigreeFunction
[1.251 1.258 1.268 1.282 1.292 1.318 1.321 1.34  1.353 1.39  1.391 1.394
 1.4   1.441 1.461 1.476 1.57  1.6   1.698 1.699 1.72  1.731 1.76  1.781
 1.893 1.95  1.96  2.137 2.288 2.329 2.42 ]
Outliers for Age
[62. 62. 62. 62. 63. 63. 63. 63. 64. 65. 65. 65. 66. 66. 66. 66. 67. 67.
 67. 68. 69. 69. 72. 81.]
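
The per-column calls above can also be written as a single loop; a minimal sketch that checks every column except the binary Outcome label:

In [ ]:
# apply the Tukey fence check to each feature column and print any outliers found
for col in df1.columns.drop('Outcome'):
    indices, values = find_outliers_tukey(df1[col])
    print("Outliers for", col)
    print(np.sort(values))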

Deleting rows with BloodPressure outliers

In [7]:
# drop the rows flagged as BloodPressure outliers
df_del = df1.drop(bp_indices)
print(df_del.head(5))
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0          6.0    148.0           72.0           35.0     71.0  33.6   
1          1.0     85.0           66.0           29.0     71.0  26.6   
2          8.0    183.0           64.0           27.0     71.0  23.3   
3          1.0     89.0           66.0           23.0     94.0  28.1   
4          0.0    137.0            4.0           35.0    168.0  43.1   

   DiabetesPedigreeFunction   Age  Outcome  
0                     0.627   5.0      1.0  
1                     0.351  31.0      0.0  
2                     0.672  32.0      1.0  
3                     0.167  21.0      0.0  
4                     2.288  33.0      1.0  

Replace with min

In [8]:
# replace Insulin readings above 321 with the column minimum
min_in = np.min(df_del['Insulin'])
df_del['Insulin'] = np.where(df_del['Insulin'] > 321, min_in, df_del['Insulin'])
print(df_del.head(5))
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0          6.0    148.0           72.0           35.0     71.0  33.6   
1          1.0     85.0           66.0           29.0     71.0  26.6   
2          8.0    183.0           64.0           27.0     71.0  23.3   
3          1.0     89.0           66.0           23.0     94.0  28.1   
4          0.0    137.0            4.0           35.0    168.0  43.1   

   DiabetesPedigreeFunction   Age  Outcome  
0                     0.627   5.0      1.0  
1                     0.351  31.0      0.0  
2                     0.672  32.0      1.0  
3                     0.167  21.0      0.0  
4                     2.288  33.0      1.0  
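
Replacing the high Insulin readings with the column minimum is one option; another common choice is to cap them at the upper Tukey fence used by find_outliers_tukey above. A hedged sketch, kept in a separate variable so the frame above is unchanged:

In [ ]:
# cap Insulin values above Q3 + 1.5*IQR at the fence itself instead of the minimum
q1, q3 = df_del['Insulin'].quantile(.25), df_del['Insulin'].quantile(.75)
ceiling = q3 + 1.5 * (q3 - q1)
insulin_capped = np.where(df_del['Insulin'] > ceiling, ceiling, df_del['Insulin'])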

Normalization and Reduction

In [9]:
from sklearn.decomposition import PCA
pca = PCA(n_components = 2)
# fit PCA on the cleaned frame and project it onto the first two components
pca.fit(df_del)
df = pca.transform(df_del)

df_2d = pd.DataFrame(df, index=df_del.index, columns=['PC1', 'PC2'])

df_2d.head(5)
Out[9]:
         PC1        PC2
0  33.055249 -35.671449
1 -13.762445   6.138142
2  58.976048 -59.330959
3   4.351620  20.820122
4  89.453205  42.851607
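
PCA is sensitive to the scale of each column, so the "Normalization" part of this step is usually handled by standardising the features before projecting them. A minimal sketch, assuming StandardScaler:

In [ ]:
from sklearn.preprocessing import StandardScaler

# standardise each column to zero mean and unit variance, then project to 2 components
scaled = StandardScaler().fit_transform(df_del)
df_2d_scaled = pd.DataFrame(PCA(n_components=2).fit_transform(scaled),
                            index=df_del.index, columns=['PC1', 'PC2'])
df_2d_scaled.head(5)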

Visualization

Histogram

In [10]:
import matplotlib.pyplot as plt
df_del.hist()
plt.show()
[Figure: 3×3 grid of histograms, one per column of df_del]

Density Plot

In [11]:
df_del.plot(kind='density', subplots=True, layout=(3,3), sharex=False)
plt.show()

Box Plot

In [12]:
df_del.plot(kind='box', subplots=True, layout=(3,3), sharex=False, sharey=False)
plt.show()

Correlation Matrix

In [13]:
cor = df_del.corr()
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(cor, vmin=-1, vmax =1)
fig.colorbar(cax)
ticks=np.arange(0,9,1)
ax.set_xticks(ticks)
ax.set_yticks(ticks)
plt.show()
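
The tick positions above are just the column indices 0 to 8; labelling them with the column names makes the matrix easier to read. A small sketch of the same plot with labelled axes:

In [ ]:
# same correlation heat map, with the column names on both axes
fig, ax = plt.subplots()
cax = ax.matshow(df_del.corr(), vmin=-1, vmax=1)
fig.colorbar(cax)
ax.set_xticks(range(len(df_del.columns)))
ax.set_yticks(range(len(df_del.columns)))
ax.set_xticklabels(df_del.columns, rotation=90)
ax.set_yticklabels(df_del.columns)
plt.show()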

Scatter Plot

In [14]:
from pandas.plotting import scatter_matrix
scatter_matrix(df_del)
plt.show()
In [23]:
import numpy as np
import pandas as pd
from sklearn import datasets 
from sklearn.metrics import confusion_matrix 
from sklearn.model_selection import train_test_split

feature_col_names = ['Glucose', 'Pregnancies',  'Insulin', 'BMI', 'Age']
predicted_class_names = ['Outcome']

X = df1[feature_col_names].values     # predictor feature columns (5 features)
y = df1[predicted_class_names].values # predicted class column (1=true, 0=false)
split_test_size = 0.30

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split_test_size, random_state=42) 
In [44]:
# training a linear SVM classifier 
from sklearn.svm import SVC 
svm_model_linear = SVC(kernel='linear', C=1).fit(X_train, y_train)
svm_predictions = svm_model_linear.predict(X_test) 
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\validation.py:761: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)
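
The DataConversionWarning is raised because y_train is a column vector of shape (n_samples, 1); flattening it with ravel(), as the warning suggests, avoids it:

In [ ]:
# same linear SVM, with y flattened to a 1-D array as the warning recommends
svm_model_linear = SVC(kernel='linear', C=1).fit(X_train, y_train.ravel())
svm_predictions = svm_model_linear.predict(X_test)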
In [45]:
accuracy = svm_model_linear.score(X_test, y_test) 

# creating a confusion matrix 
cm = confusion_matrix(y_test, svm_predictions)
In [46]:
cm
Out[46]:
array([[128,  23],
       [ 39,  41]], dtype=int64)
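
The accuracy computed in In [45] is never printed; it can also be cross-checked against the confusion matrix, since accuracy is the trace divided by the total count, here (128 + 41) / 231 ≈ 0.73:

In [ ]:
# accuracy = correctly classified samples / all samples
print('accuracy (score):          ', accuracy)
print('accuracy (from the matrix):', cm.trace() / cm.sum())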
In [31]:
trainval = (1.0 * len(X_train)) / (1.0 * len(df1.index))
testval = (1.0 * len(X_test)) / (1.0 * len(df1.index))
print("{0:0.2f}% in training set".format(trainval * 100))
print("{0:0.2f}% in test set".format(testval * 100))
69.92% in training set
30.08% in test set