print("hello world! welcome to data pre processing")
hello world
Data preprocessing is an important step in the data mining process. Even before conducting a visual analysis, it is important to clean the data. Data preprocessing refers to cleaning, transforming, and integrating data to make it ready for analysis. The goal is to improve the quality of the data and make it more suitable for the specific data mining task.
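A minimal sketch of the flow worked through below — load the data, inspect missing values, then either drop or fill them (assuming, as in this notebook, a diabetes_null.csv file in which '#NAME?' marks missing entries):

import pandas as pd

# Load the raw data, treating the '#NAME?' marker as missing.
df = pd.read_csv('diabetes_null.csv', na_values=['#NAME?'])
# Count missing values per column before deciding how to handle them.
print(df.isnull().sum().sort_values(ascending=False))
# Two common options: drop incomplete rows, or fill with a column statistic.
df_dropped = df.dropna(axis=0)
df_filled = df.fillna(df.mean(numeric_only=True))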
import numpy as np
import pandas as pd
# Load the dataset; '#NAME?' is an Excel export artefact that marks missing values.
# df2 is kept as an untouched copy for a later operation; df1 is used below.
df2 = pd.read_csv('diabetes_null.csv', na_values=['#NAME?'])
df1 = pd.read_csv('diabetes_null.csv', na_values=['#NAME?'])
# Count missing values per column, most affected first.
df1.isnull().sum().sort_values(ascending=False)
# Option 1: drop every row that contains at least one missing value.
df_no_missing = df1.dropna(axis=0)
print(df_no_missing.head(5))
print('row count:', len(df1))
print('column count:', len(df1.columns))
#df1.info()
    Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  DiabetesPedigreeFunction  Age  Outcome
3             1     89.0           66.0           23.0     94.0  28.1                     0.167   21        0
4             0    137.0            4.0           35.0    168.0  43.1                     2.288   33        1
6             3     78.0            5.0           32.0     88.0  31.0                     0.248   26        1
8             2    197.0            7.0           45.0    543.0   3.5                     0.158   53        1
13            1    189.0            6.0           23.0    846.0   3.1                     0.398   59        1
row count: 768
column count: 9
from sklearn.impute import SimpleImputer
# Data exported from Excel; '#NAME?' marks missing values.
df1 = pd.read_csv('diabetes_null.csv', na_values=['#NAME?'])
# SimpleImputer replaces missing values with the column mean.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit(df1)
df1 = pd.DataFrame(data=imp.transform(df1), columns=df1.columns)
# Print the imputed data.
print(df1.head(5))
   Pregnancies  Glucose  BloodPressure  SkinThickness     Insulin   BMI  DiabetesPedigreeFunction   Age  Outcome
0          6.0    148.0           72.0      35.000000  105.659898  33.6                     0.627   5.0      1.0
1          1.0     85.0           66.0      29.000000  105.659898  26.6                     0.351  31.0      0.0
2          8.0    183.0           64.0      25.876155  105.659898  23.3                     0.672  32.0      1.0
3          1.0     89.0           66.0      23.000000   94.000000  28.1                     0.167  21.0      0.0
4          0.0    137.0            4.0      35.000000  168.000000  43.1                     2.288  33.0      1.0
from sklearn.impute import SimpleImputer
# Reload the raw data and this time fill missing values with the column median.
df1 = pd.read_csv('diabetes_null.csv', na_values=['#NAME?'])
imp = SimpleImputer(missing_values=np.nan, strategy='median')
imp.fit(df1)
df1 = pd.DataFrame(data=imp.transform(df1), columns=df1.columns)
# Print the imputed data.
print(df1.head(5))
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  DiabetesPedigreeFunction   Age  Outcome
0          6.0    148.0           72.0           35.0     71.0  33.6                     0.627   5.0      1.0
1          1.0     85.0           66.0           29.0     71.0  26.6                     0.351  31.0      0.0
2          8.0    183.0           64.0           27.0     71.0  23.3                     0.672  32.0      1.0
3          1.0     89.0           66.0           23.0     94.0  28.1                     0.167  21.0      0.0
4          0.0    137.0            4.0           35.0    168.0  43.1                     2.288  33.0      1.0
def find_outliers_tukey(x):
    # Tukey's rule: flag points outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
    q1 = x.quantile(.25)
    q3 = x.quantile(.75)
    iqr = q3 - q1
    floor = q1 - 1.5 * iqr
    ceiling = q3 + 1.5 * iqr
    outlier_indices = list(x.index[(x < floor) | (x > ceiling)])
    outlier_values = list(x[outlier_indices])
    return outlier_indices, outlier_values
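For comparison, outliers can also be flagged with a z-score rule; this is a sketch, not part of the original notebook, and it assumes roughly bell-shaped columns and a conventional cutoff of 3 standard deviations:

def find_outliers_zscore(x, threshold=3.0):
    # Flag points more than `threshold` standard deviations from the mean.
    z = (x - x.mean()) / x.std()
    outlier_indices = list(x.index[np.abs(z) > threshold])
    outlier_values = list(x[outlier_indices])
    return outlier_indices, outlier_values

# Usage mirrors the Tukey version, e.g. find_outliers_zscore(df1['Insulin'])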
glucose_indices, glucose_values = find_outliers_tukey(df1['Glucose'])
print("Outliers for Glucose")
print(np.sort(glucose_values))
print("Outliers for Pregnancies")
pr_indices, pr_values = find_outliers_tukey(df1['Pregnancies'])
print(np.sort(pr_values))
print("Outliers for BloodPressure")
bp_indices, bp_values = find_outliers_tukey(df1['BloodPressure'])
print(np.sort(bp_values))
print("Outliers for SkinThickness")
st_indices, st_values = find_outliers_tukey(df1['SkinThickness'])
print(np.sort(st_values))
print("Outliers for Insulin")
in_indices, in_values = find_outliers_tukey(df1['Insulin'])
print(np.sort(in_values))
print("Outliers for BMI")
bmi_indices, bmi_values = find_outliers_tukey(df1['BMI'])
print(np.sort(bmi_values))
print("Outliers for DiabetesPedigreeFunction")
dpf_indices, dpf_values = find_outliers_tukey(df1['DiabetesPedigreeFunction'])
print(np.sort(dpf_values))
print("Outliers for Age")
age_indices, age_values = find_outliers_tukey(df1['Age'])
print(np.sort(age_values))
Outliers for Glucose: []
Outliers for Pregnancies: [14. 14. 15. 17.]
Outliers for BloodPressure: [122.]
Outliers for SkinThickness: low readings from 1.0 to 6.0 and high readings from 48.0 to 99.0
Outliers for Insulin: a long list of sorted values running from 1.0 up to 846.0
Outliers for BMI: low values from 2.0 to 5.0 and high values from 52.3 to 67.1
Outliers for DiabetesPedigreeFunction: [1.251 1.258 1.268 1.282 1.292 1.318 1.321 1.34 1.353 1.39 1.391 1.394 1.4 1.441 1.461 1.476 1.57 1.6 1.698 1.699 1.72 1.731 1.76 1.781 1.893 1.95 1.96 2.137 2.288 2.329 2.42]
Outliers for Age: [62. 62. 62. 62. 63. 63. 63. 63. 64. 65. 65. 65. 66. 66. 66. 66. 67. 67. 67. 68. 69. 69. 72. 81.]
# Remove the rows flagged as BloodPressure outliers.
df_del = df1.drop(bp_indices)
print(df_del.head(5))
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  DiabetesPedigreeFunction   Age  Outcome
0          6.0    148.0           72.0           35.0     71.0  33.6                     0.627   5.0      1.0
1          1.0     85.0           66.0           29.0     71.0  26.6                     0.351  31.0      0.0
2          8.0    183.0           64.0           27.0     71.0  23.3                     0.672  32.0      1.0
3          1.0     89.0           66.0           23.0     94.0  28.1                     0.167  21.0      0.0
4          0.0    137.0            4.0           35.0    168.0  43.1                     2.288  33.0      1.0
# Replace extreme Insulin readings (above 321) with the column minimum.
min_in = np.min(df_del['Insulin'])
df_del['Insulin'] = np.where(df_del['Insulin'] > 321, min_in, df_del['Insulin'])
print(df_del.head(5))
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  DiabetesPedigreeFunction   Age  Outcome
0          6.0    148.0           72.0           35.0     71.0  33.6                     0.627   5.0      1.0
1          1.0     85.0           66.0           29.0     71.0  26.6                     0.351  31.0      0.0
2          8.0    183.0           64.0           27.0     71.0  23.3                     0.672  32.0      1.0
3          1.0     89.0           66.0           23.0     94.0  28.1                     0.167  21.0      0.0
4          0.0    137.0            4.0           35.0    168.0  43.1                     2.288  33.0      1.0
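An alternative to overwriting the extreme Insulin readings with the column minimum is to cap (winsorize) them at the Tukey fences; a small sketch using pandas' clip, recomputing the fences with the same IQR rule used in find_outliers_tukey:

q1, q3 = df_del['Insulin'].quantile(.25), df_del['Insulin'].quantile(.75)
iqr = q3 - q1
# Cap values outside the Tukey fences instead of replacing them with the minimum.
df_capped = df_del.copy()
df_capped['Insulin'] = df_capped['Insulin'].clip(lower=q1 - 1.5 * iqr, upper=q3 + 1.5 * iqr)
print(df_capped['Insulin'].describe())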
from sklearn.decomposition import PCA
pca = PCA(n_components = 2)
pca.fit(df_del)
PCA(copy=True, n_components=2, whiten=False)
df = pca.transform(df_del)
df_2d = pd.DataFrame(df)
df_2d.index = df_del.index
df_2d.columns = ['PC1', 'PC2']
df_2d.head(5)
|   | PC1        | PC2        |
|---|------------|------------|
| 0 | 33.055249  | -35.671449 |
| 1 | -13.762445 | 6.138142   |
| 2 | 58.976048  | -59.330959 |
| 3 | 4.351620   | 20.820122  |
| 4 | 89.453205  | 42.851607  |
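PCA is sensitive to scale, so columns measured in hundreds (Insulin, Glucose) can dominate columns near 1 (DiabetesPedigreeFunction). A sketch, not part of the original run, that standardizes first and checks how much variance the two components retain:

from sklearn.preprocessing import StandardScaler

# Standardize every column to zero mean and unit variance before projecting.
scaled = StandardScaler().fit_transform(df_del)
pca_scaled = PCA(n_components=2)
df_2d_scaled = pd.DataFrame(pca_scaled.fit_transform(scaled),
                            index=df_del.index, columns=['PC1', 'PC2'])
# Fraction of the total variance captured by the two components.
print(pca_scaled.explained_variance_ratio_)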
import matplotlib.pyplot as plt
df_del.hist()
plt.show()
[Figure: histograms of each of the nine columns of df_del]
df_del.plot(kind='density', subplots=True, layout=(3,3), sharex=False)
plt.show()
df_del.plot(kind='box', subplots=True, layout=(3,3), sharex=False, sharey=False)
plt.show()
# Correlation matrix of all columns, shown as a heatmap.
cor = df_del.corr()
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(cor, vmin=-1, vmax=1)
fig.colorbar(cax)
ticks=np.arange(0,9,1)
ax.set_xticks(ticks)
ax.set_yticks(ticks)
plt.show()
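The heatmap above only shows numeric tick positions; labelling the ticks with the column names makes the correlations easier to read (a small variation on the same plotting calls):

fig, ax = plt.subplots()
cax = ax.matshow(cor, vmin=-1, vmax=1)
fig.colorbar(cax)
ax.set_xticks(ticks)
ax.set_yticks(ticks)
# Use the actual column names as tick labels.
ax.set_xticklabels(df_del.columns, rotation=90)
ax.set_yticklabels(df_del.columns)
plt.show()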
from pandas.plotting import scatter_matrix
scatter_matrix(df_del)
plt.show()
# Classification utilities (numpy and pandas were already imported above).
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
feature_col_names = ['Glucose', 'Pregnancies', 'Insulin', 'BMI', 'Age']
predicted_class_names = ['Outcome']
X = df1[feature_col_names].values      # predictor feature columns (5 features x m rows)
y = df1[predicted_class_names].values  # predicted class (1 = diabetic, 0 = not) column (1 x m)
split_test_size = 0.30
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split_test_size, random_state=42)
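Because the two Outcome classes are imbalanced, a stratified split keeps the positive/negative ratio the same in both sets; a sketch using train_test_split's stratify argument (the run below uses the plain split):

# Preserve the class ratio in both the training and test sets.
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X, y, test_size=split_test_size, random_state=42, stratify=y)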
# Train a linear SVM classifier on the training split.
from sklearn.svm import SVC
svm_model_linear = SVC(kernel='linear', C=1).fit(X_train, y_train.ravel())
svm_predictions = svm_model_linear.predict(X_test)
accuracy = svm_model_linear.score(X_test, y_test)
# creating a confusion matrix
cm = confusion_matrix(y_test, svm_predictions)
cm
array([[128, 23], [ 39, 41]], dtype=int64)
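The confusion matrix shows that only 41 of the 80 true positives in the test set are caught, which accuracy alone hides; per-class precision, recall and F1 can be printed with classification_report (a follow-up sketch, not in the original run):

from sklearn.metrics import classification_report

# Per-class precision, recall and F1 for the same test-set predictions.
print(classification_report(y_test, svm_predictions))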
trainval = (1.0 * len(X_train)) / (1.0 * len(df1.index))
testval = (1.0 * len(X_test)) / (1.0 * len(df1.index))
print("{0:0.2f}% in training set".format(trainval * 100))
print("{0:0.2f}% in test set".format(testval * 100))
69.92% in training set
30.08% in test set