import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn import linear_model
import statsmodels.api as sm
from sklearn import metrics
missing = ["?"]
df1 = pd.read_csv('https://raw.githubusercontent.com/Wittline/Machine_Learning/master/Logistic%20Regression/breast-cancer-wisconsin.data',
sep=',',
names=["id", "Clump_Thickness", "Uniformity_CellSize", "Uniformity_CellShape", 'Marginal_Adhesion', 'Single_Epithelial_CellSize', 'Bare_Nuclei', 'Bland_Chromatin', 'Normal_Nucleoli', 'Mitoses', 'Class'],
na_values = missing);
df1.head(13)
df1.isnull().sum()
# Only Bare_Nuclei contains missing values; impute them with the column median
mdn = df1['Bare_Nuclei'].median()
df1['Bare_Nuclei'] = df1['Bare_Nuclei'].fillna(mdn)
df1.isnull().sum()
# Drop the record identifier, which carries no predictive information
df1.drop(['id'], axis=1, inplace=True)
# Class encoding in the raw data: 2 = benign, 4 = malignant
benign = df1[df1['Class']==2]
malignant = df1[df1['Class']==4]
features = ["Clump_Thickness", "Uniformity_CellSize", "Uniformity_CellShape", "Marginal_Adhesion",
            "Single_Epithelial_CellSize", "Bare_Nuclei", "Bland_Chromatin", "Normal_Nucleoli", "Mitoses"]
plt.figure(1, figsize=(18, 8))
bp = plt.boxplot([df1[f] for f in features], vert=True, patch_artist=True,
                 flierprops={'alpha': 0.6, 'markersize': 6,
                             'markeredgecolor': '#555555', 'marker': 'd',
                             'markerfacecolor': '#555555'},
                 capprops={'color': '#555555', 'linewidth': 2},
                 boxprops={'color': '#555555', 'linewidth': 2},
                 whiskerprops={'color': '#555555', 'linewidth': 2},
                 medianprops={'color': '#555555', 'linewidth': 2},
                 meanprops={'color': '#555555', 'linewidth': 2})
plt.grid(True, alpha=0.6)
plt.title("Box Plot", fontsize=20)
plt.ylabel("Value", fontsize=20)
plt.xticks(ticks=range(1, 10), labels=features, fontsize=10)
# Color all nine boxes uniformly
for box in bp['boxes']:
    box.set(facecolor='blue', alpha=0.6)
plt.show()
# Overlay per-class histograms for each feature: benign in skyblue, malignant in red
f, axes = plt.subplots(3, 3, figsize=(20, 10))
for i, feature in enumerate(features):
    ax = axes[i // 3, i % 3]
    sns.histplot(benign[feature], color="skyblue", ax=ax)
    sns.histplot(malignant[feature], color="red", ax=ax)
    ax.set_xlim([1, 10])
plt.show()
The first nine variables are ordinal: each one takes integer values from 1 to 10, with the same distance between consecutive categories. These variables could be rescaled to another range, but they all behave the same way and already share a common scale; they will be the input variables.
The variable "Class" is categorical (binary) and identifies the class of each record.
So far the only variable removed from the dataset is "id", which is just a unique record identifier; other variables may be discarded later because of high correlation.
Although the input variables are ordinal, they will not be normalized, since they all lie in the same range of 1 to 10.
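A quick way to verify that claim is to print each input's minimum and maximum; a minimal sketch using the df1 frame built above (at this point Class still holds the raw 2/4 codes, so it is excluded):
# Confirm every input feature spans the same 1-10 scale
print(df1.drop(columns=['Class']).agg(['min', 'max']))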
plt.figure(figsize=(10, 8))
ax = sns.countplot(x='Class', data=df1, palette=['skyblue', 'red'])
total = len(df1['Class'])
# Bars follow the x-axis order: Class 2 (benign) first, Class 4 (malignant) second
for name, p in zip(['Benign', 'Malignant'], ax.patches):
    height = p.get_height()
    ax.text(p.get_x() + p.get_width() / 2.,
            height + 3,
            '{:1.2f} % {}'.format(round((height / total) * 100, 2), name),
            ha="center")
plt.show()
Checking the correlation matrix below, we can see that the variables Uniformity_CellSize and Uniformity_CellShape are highly correlated, and either of them could be dropped from the analysis. In this case we keep both, in order to observe whether that affects the final outcome.
corr = df1.corr(method='pearson').round(2)
# Mask the upper triangle so each pair appears only once in the heatmap
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
f, ax = plt.subplots(figsize=(10, 10))
c_map = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(corr, mask=mask, cmap=c_map, vmin=-1, vmax=1, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, annot=True)
plt.tight_layout()
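Rather than reading the pairs off the heatmap, the strongly correlated ones can be extracted programmatically; a small sketch, where the 0.9 cutoff is an arbitrary threshold chosen for illustration:
# Keep only the strict upper triangle so each pair appears once, then filter by |r|
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
strong = upper.stack()
print(strong[strong.abs() > 0.9])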
# Recode the target: malignant (4) -> 1, benign (2) -> 0
df1.Class = [1 if each == 4 else 0 for each in df1.Class]
y = df1.Class
X = df1.drop(['Class'], axis=1)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=40)
lg = linear_model.LogisticRegression(random_state=40, max_iter=100, solver='lbfgs')
lg.fit(x_train, y_train)
print("Train accuracy: {}".format(lg.score(x_train, y_train)))
# statsmodels Logit gives per-coefficient p-values; note that no intercept term is added here
logit_model = sm.Logit(y, X)
result = logit_model.fit()
print(result.summary2())
print("Test accuracy: {}".format(lg.score(x_test, y_test)))
score = lg.score(x_test, y_test)
predictions = lg.predict(x_test)
cm = metrics.confusion_matrix(y_test, predictions)
plt.figure(figsize=(9, 9))
sns.heatmap(cm, annot=True, fmt="d", linewidths=.5, square=True, cmap='Blues_r')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
all_sample_title = 'Accuracy Score: {0}'.format(score)
plt.title(all_sample_title, size=15)
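On a diagnosis task, accuracy alone hides the error that matters most, a malignant tumor predicted as benign; scikit-learn's classification report complements the confusion matrix with per-class precision and recall:
# Recall for the malignant class is the fraction of actual cancers the model catches
print(metrics.classification_report(y_test, predictions,
                                    target_names=['benign', 'malignant']))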
Removing the Uniformity_CellShape variable, which is highly correlated with Uniformity_CellSize and has a p-value greater than 0.05, does not improve the model.
y = df1.Class
X = df1.drop(['Uniformity_CellShape', 'Class'], axis=1)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=40)
lg = linear_model.LogisticRegression(random_state=40, max_iter=100, solver='lbfgs')
lg.fit(x_train, y_train)
print("Train accuracy: {}".format(lg.score(x_train, y_train)))
logit_model = sm.Logit(y, X)
result = logit_model.fit()
print(result.summary2())
print("Test accuracy: {}".format(lg.score(x_test, y_test)))
score = lg.score(x_test, y_test)
predictions = lg.predict(x_test)
cm = metrics.confusion_matrix(y_test, predictions)
plt.figure(figsize=(9, 9))
sns.heatmap(cm, annot=True, fmt="d", linewidths=.5, square=True, cmap='Blues_r')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
all_sample_title = 'Accuracy Score: {0}'.format(score)
plt.title(all_sample_title, size=15)
# 6-fold cross-validation on the full feature set (folds are contiguous blocks, since shuffle defaults to False)
y = df1.Class
X = df1.drop(['Class'], axis=1)
kf = KFold(n_splits=6)
lg = linear_model.LogisticRegression(random_state=40, max_iter=100, solver='lbfgs')
print(cross_val_score(lg, X, y, cv=kf, scoring='accuracy').mean())
With 6-fold cross-validation, the mean accuracy improved by almost 0.02 compared with the single train/test split.
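Since KFold(n_splits=6) leaves shuffle at its default of False, each fold is a contiguous block of rows, so the estimate can depend on how the file happens to be ordered. A sketch of the same evaluation with shuffled, class-stratified folds (random_state=40 below is just an arbitrary seed):
from sklearn.model_selection import StratifiedKFold

# Shuffle rows and keep the benign/malignant ratio roughly constant across folds
skf = StratifiedKFold(n_splits=6, shuffle=True, random_state=40)
print(cross_val_score(lg, X, y, cv=skf, scoring='accuracy').mean())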