import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn import linear_model
import statsmodels.api as sm
from sklearn import metrics
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (classification_report, accuracy_score, f1_score,
                             roc_auc_score, roc_curve, auc)
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
# '?' marks missing values in the raw file (they occur only in Bare_Nuclei)
missing = ["?"]
df1 = pd.read_csv('https://raw.githubusercontent.com/Wittline/Machine_Learning/master/Logistic%20Regression/breast-cancer-wisconsin.data',
                  sep=',',
                  names=["id", "Clump_Thickness", "Uniformity_CellSize", "Uniformity_CellShape", 'Marginal_Adhesion', 'Single_Epithelial_CellSize', 'Bare_Nuclei', 'Bland_Chromatin', 'Normal_Nucleoli', 'Mitoses', 'Class'],
                  na_values=missing)
print(df1.shape)
print(df1.isnull().sum())
# Bare_Nuclei is an ordinal 1-10 score, so the median is a reasonable fill value;
# assigning back avoids the chained inplace-fill pattern that newer pandas warns about
mdn = df1['Bare_Nuclei'].median()
df1['Bare_Nuclei'] = df1['Bare_Nuclei'].fillna(mdn)
print(df1.isnull().sum())
# the id column carries no predictive information
df1.drop(['id'], axis=1, inplace=True)
# per-class subsets (2 = benign, 4 = malignant in the raw encoding)
benign = df1[df1['Class'] == 2]
malignant = df1[df1['Class'] == 4]
plt.figure(1, figsize=(18, 8))
bp = plt.boxplot([df1.Clump_Thickness, df1.Uniformity_CellSize, df1.Uniformity_CellShape, df1.Marginal_Adhesion, df1.Single_Epithelial_CellSize, df1.Bare_Nuclei, df1.Bland_Chromatin, df1.Normal_Nucleoli, df1.Mitoses],
                 vert=True, patch_artist=True,
                 flierprops={'alpha': 0.6, 'markersize': 6,
                             'markeredgecolor': '#555555', 'marker': 'd',
                             'markerfacecolor': '#555555'},
                 capprops={'color': '#555555', 'linewidth': 2},
                 boxprops={'color': '#555555', 'linewidth': 2},
                 whiskerprops={'color': '#555555', 'linewidth': 2},
                 medianprops={'color': '#555555', 'linewidth': 2},
                 meanprops={'color': '#555555', 'linewidth': 2})
plt.grid(True, alpha=0.6)
plt.title("Box Plot", fontsize=20)
plt.ylabel("Value", fontsize=20)  # the features are 1-10 scores, not frequencies
plt.xticks(ticks=[1, 2, 3, 4, 5, 6, 7, 8, 9], labels=["Clump_Thickness", "Uniformity_CellSize", "Uniformity_CellShape", 'Marginal_Adhesion', 'Single_Epithelial_CellSize', 'Bare_Nuclei', 'Bland_Chromatin', 'Normal_Nucleoli', 'Mitoses'], fontsize=10)
for box in bp['boxes']:
    box.set(facecolor='blue', alpha=0.6)
plt.show()
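# The box plot above shows that several features (Mitoses in particular) are heavily
# right-skewed. A minimal sketch of how the PowerTransformer imported above could be
# used to reduce that skew; this is an optional illustration and is not applied to
# the models below:
pt = PowerTransformer(method='yeo-johnson')
transformed = pd.DataFrame(pt.fit_transform(df1[['Mitoses']]), columns=['Mitoses'])
print("Mitoses skew before:", round(df1['Mitoses'].skew(), 2),
      "after:", round(transformed['Mitoses'].skew(), 2))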
output = 'Class'
cols = [f for f in df1.columns if df1.dtypes[f] != "object"]
cols.remove(output)
f = pd.melt(df1, id_vars=output, value_vars=cols)
g = sns.FacetGrid(f, hue=output, col="variable", col_wrap=4, height=4, sharex=False, sharey=False)
g.map(sns.histplot, "value").add_legend()  # distplot was removed from seaborn; histplot draws the same count histogram
plt.show()
plt.figure(figsize=(10, 8))
ax = sns.countplot(x='Class', data=df1, palette=['skyblue', 'red'])
total = len(df1['Class'])
# countplot orders the categories as sorted unique values: 2 (benign) first, 4 (malignant) second,
# so the patch index identifies the class directly
for idx, p in enumerate(ax.patches):
    name = 'Malignant' if idx == 1 else 'Benign'
    height = p.get_height()
    ax.text(p.get_x() + p.get_width() / 2.,
            height + 3,
            '{:1.2f} % {}'.format(round((height / total) * 100, 2), name),
            ha="center")
plt.show()
corr = df1.corr(method='pearson').round(2)
mask = np.zeros_like(corr, dtype=bool)  # np.bool was removed from numpy; the builtin bool works
mask[np.triu_indices_from(mask)] = True
f, ax = plt.subplots(figsize=(11, 11))
c_map = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(corr, mask=mask, cmap=c_map, vmin=-1, vmax=1, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, annot=True)
plt.tight_layout()
plt.show()
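# The strongest pairs can also be read off the matrix programmatically; a short
# sketch (the 0.85 cutoff is an arbitrary choice of this sketch) that motivates
# dropping one of the two Uniformity columns before modelling:
pairs = corr.where(~mask).stack()
print(pairs[pairs.abs() >= 0.85].sort_values(ascending=False))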
# re-encode the target: 1 = malignant (4), 0 = benign (2)
df1.Class = [1 if each == 4 else 0 for each in df1.Class]
y = df1.Class
# Uniformity_CellShape is dropped because it is nearly collinear with Uniformity_CellSize (see the heatmap)
x = df1.drop(['Class', 'Uniformity_CellShape'], axis=1)
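# Before the sklearn models, a quick sketch using the statsmodels import above to
# inspect coefficient estimates and p-values; the intercept term added here is a
# choice of this sketch, not part of the original pipeline:
logit_model = sm.Logit(y, sm.add_constant(x)).fit(disp=0)  # disp=0 silences the optimizer output
print(logit_model.summary())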
test_sizes = [0.1, 0.2, 0.3]
scaler = StandardScaler()
lg = linear_model.LogisticRegression(random_state=40, max_iter=100, solver='lbfgs')
for test_size in test_sizes:
    print("sklearn - LogisticRegression: " + str(test_size))
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size, random_state=40)
    # fit the scaler on the training partition only, so no test statistics leak in
    scaler.fit(x_train)
    x_train = scaler.transform(x_train)
    x_test = scaler.transform(x_test)
    lg.fit(x_train, y_train)
    y_pred = lg.predict(x_train)
    print("Train accuracy_score:", accuracy_score(y_train, y_pred))
    print("Train f1_score:", f1_score(y_train, y_pred))
    # note: roc_auc_score is computed on hard 0/1 predictions here; predict_proba scores would give a finer-grained AUC
    print("Train roc_auc_score:", roc_auc_score(y_train, y_pred))
    y_pred = lg.predict(x_test)
    print("Test accuracy_score:", accuracy_score(y_test, y_pred))
    print("Test f1_score:", f1_score(y_test, y_pred))
    print("Test roc_auc_score:", roc_auc_score(y_test, y_pred))
    print("")
clf = MLPClassifier(solver='lbfgs', max_iter=1000, random_state=40, activation='relu', alpha=0.001, hidden_layer_sizes=(10, 5, 2))
for test_size in test_sizes:
    print("Partition: ", test_size)
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size, random_state=40)
    scaler.fit(x_train)
    x_train = scaler.transform(x_train)
    x_test = scaler.transform(x_test)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_train)
    print("Train accuracy_score:", accuracy_score(y_train, y_pred))
    print("Train f1_score:", f1_score(y_train, y_pred))
    print("Train roc_auc_score:", roc_auc_score(y_train, y_pred))
    y_pred = clf.predict(x_test)
    print("Test accuracy_score:", accuracy_score(y_test, y_pred))
    print("Test f1_score:", f1_score(y_test, y_pred))
    print("Test roc_auc_score:", roc_auc_score(y_test, y_pred))
    cm = metrics.confusion_matrix(y_test, y_pred)
    print(cm)
    print("")
svclassifier = SVC(kernel='rbf')
for test_size in test_sizes:
    print("Partition: ", test_size)
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size, random_state=40)
    scaler.fit(x_train)
    x_train = scaler.transform(x_train)
    x_test = scaler.transform(x_test)
    svclassifier.fit(x_train, y_train)
    y_pred = svclassifier.predict(x_train)
    print("Train accuracy_score:", accuracy_score(y_train, y_pred))
    print("Train f1_score:", f1_score(y_train, y_pred))
    print("Train roc_auc_score:", roc_auc_score(y_train, y_pred))
    y_pred = svclassifier.predict(x_test)
    print("Test accuracy_score:", accuracy_score(y_test, y_pred))
    print("Test f1_score:", f1_score(y_test, y_pred))
    print("Test roc_auc_score:", roc_auc_score(y_test, y_pred))
    print("")
# grid search over k (left commented out; it prints the test error for each k and
# split, and motivates the choice of n_neighbors=7 below):
# for k in range(1, 50):
#     knn = KNeighborsClassifier(n_neighbors=k, n_jobs=-1)
#     for test_size in test_sizes:
#         x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size, random_state=40)
#         scaler.fit(x_train)
#         x_train = scaler.transform(x_train)
#         x_test = scaler.transform(x_test)
#         knn.fit(x_train, y_train)
#         pred_i = knn.predict(x_test)
#         print(str(k) + ',' + str(test_size) + ", " + str(np.mean(pred_i != y_test)))
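# A sketch of the elbow plot behind that choice of k, assuming a fixed 0.2 test
# split (the commented search above prints the same error rates as text):
errors = []
x_tr, x_te, y_tr, y_te = train_test_split(x, y, test_size=0.2, random_state=40)
scaler.fit(x_tr)
x_tr, x_te = scaler.transform(x_tr), scaler.transform(x_te)
for k in range(1, 50):
    knn_k = KNeighborsClassifier(n_neighbors=k, n_jobs=-1).fit(x_tr, y_tr)
    errors.append(np.mean(knn_k.predict(x_te) != y_te))
plt.figure(figsize=(10, 5))
plt.plot(range(1, 50), errors, marker='o')
plt.xlabel('k')
plt.ylabel('test error rate')
plt.show()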
knn = KNeighborsClassifier(n_neighbors=7)
for test_size in test_sizes:
    print("Partition: ", test_size)
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size, random_state=40)
    scaler.fit(x_train)
    x_train = scaler.transform(x_train)
    x_test = scaler.transform(x_test)
    knn.fit(x_train, y_train)
    y_pred = knn.predict(x_train)
    print("Train accuracy_score:", accuracy_score(y_train, y_pred))
    print("Train f1_score:", f1_score(y_train, y_pred))
    print("Train roc_auc_score:", roc_auc_score(y_train, y_pred))
    y_pred = knn.predict(x_test)
    print("Test accuracy_score:", accuracy_score(y_test, y_pred))
    print("Test f1_score:", f1_score(y_test, y_pred))
    print("Test roc_auc_score:", roc_auc_score(y_test, y_pred))
    cm = metrics.confusion_matrix(y_test, y_pred)
    print(cm)
    print("")
naive_bayes = GaussianNB()
for test_size in test_sizes:
    print("Partition: ", test_size)
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size, random_state=40)
    scaler.fit(x_train)
    x_train = scaler.transform(x_train)
    x_test = scaler.transform(x_test)
    naive_bayes.fit(x_train, y_train)
    y_pred = naive_bayes.predict(x_train)
    print("Train accuracy_score:", accuracy_score(y_train, y_pred))
    print("Train f1_score:", f1_score(y_train, y_pred))
    print("Train roc_auc_score:", roc_auc_score(y_train, y_pred))
    y_pred = naive_bayes.predict(x_test)
    print("Test accuracy_score:", accuracy_score(y_test, y_pred))
    print("Test f1_score:", f1_score(y_test, y_pred))
    print("Test roc_auc_score:", roc_auc_score(y_test, y_pred))
    print("")