import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn import linear_model
import statsmodels.api as sm
from sklearn import metrics
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (classification_report, confusion_matrix, accuracy_score,
                             f1_score, precision_score, recall_score, roc_auc_score,
                             roc_curve, auc)
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
missing = ["?"]
df1 = pd.read_csv('https://raw.githubusercontent.com/Wittline/Machine_Learning/master/Default%20of%20credit%20card%20clients/default%20of%20credit%20card%20clients.csv',
                  sep=',',
                  names=['ID', 'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE',
                         'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6',
                         'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6',
                         'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
                         'default_payment_next_month'],
                  na_values=missing)
df1.head(200)
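# Since "?" is mapped to NaN via na_values, a quick sanity check (optional) confirms
# whether any missing values actually came through:
print(df1.isna().sum().sum(), "missing values")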
df1.drop(['ID'], axis = 1, inplace = True)
Default = df1[df1['default_payment_next_month'] == 1]
NoDefault = df1[df1['default_payment_next_month'] == 0]
print("Default:", len(Default))
print("No default:", len(NoDefault))
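# The two classes are clearly imbalanced; the normalized counts make the ratio explicit
# (optional check):
print(df1['default_payment_next_month'].value_counts(normalize=True))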
df1.loc[df1['MARRIAGE'] == 0, 'MARRIAGE'] = 3
df1['MARRIAGE'].value_counts().head(10)
#(1 = married; 2 = single; 3 = others)
df1['SEX'].value_counts().head(10)
df1.loc[df1['EDUCATION'] == 0, 'EDUCATION'] = 4
df1.loc[df1['EDUCATION'] == 5, 'EDUCATION'] = 4
df1.loc[df1['EDUCATION'] == 6, 'EDUCATION'] = 4
df1['EDUCATION'].value_counts().head(10)
#(1 = graduate school; 2 = university; 3 = high school; 4 = others)
sns.set(style="darkgrid")
plt.figure(figsize=(8, 8))
plt.title('No default = 0 / Default = 1')
ax = sns.countplot(x="default_payment_next_month", data=df1)
plt.show()
output = 'default_payment_next_month'
cols = [ f for f in df1.columns if df1.dtypes[ f ] != "object"]
cols.remove(output)
print(cols)
f = pd.melt( df1, id_vars=output, value_vars=cols)
g = sns.FacetGrid( f, hue=output, col="variable", col_wrap=5, height=5, sharex=False, sharey=False )
g = g.map(sns.histplot, "value").add_legend()  # histplot replaces the deprecated distplot
plt.show()
plt.figure(1, figsize=(18, 8))
bp = plt.boxplot([df1[c] for c in cols], vert=True, patch_artist=True,
                 flierprops={'alpha': 0.6, 'markersize': 3,
                             'markeredgecolor': '#555555', 'marker': 'd',
                             'markerfacecolor': '#555555'},
                 capprops={'color': '#555555', 'linewidth': 1},
                 boxprops={'color': '#555555', 'linewidth': 1},
                 whiskerprops={'color': '#555555', 'linewidth': 1},
                 medianprops={'color': '#555555', 'linewidth': 1},
                 meanprops={'color': '#555555', 'linewidth': 1})
plt.grid(True, alpha=0.6)
plt.title("Box Plot", fontsize=10)
plt.ylabel("Value", fontsize=10)
plt.xticks(ticks=range(1, len(cols) + 1), labels=cols, fontsize=6)
for box in bp['boxes']:
    box.set(facecolor='blue', alpha=0.6)
plt.show();
corr = df1.corr(method='pearson').round(2)
mask = np.zeros_like(corr, dtype=bool)  # np.bool is removed in recent NumPy versions
mask[np.triu_indices_from(mask)] = True
f, ax = plt.subplots(figsize=(17, 17))
c_map = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(corr, mask=mask, cmap=c_map, vmin=-1, vmax=1, center=0,
square=True, linewidths=.5, cbar_kws={"shrink": .6}, annot=True)
plt.tight_layout()
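# The heatmap shows the BILL_AMT columns are strongly correlated with one another.
# A generic way to list the strongest pairs programmatically (a sketch; the 0.9
# threshold is an arbitrary choice):
pairs = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1)).stack()
print(pairs[pairs.abs() > 0.9].sort_values(ascending=False))
# This motivates dropping BILL_AMT2-BILL_AMT6 below and keeping only BILL_AMT1.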
# pp.ProfileReport(df1)  # optional: requires pandas_profiling imported as pp
df1.drop(['BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6'], axis=1, inplace=True)
# categorical_vars = ['SEX','EDUCATION','MARRIAGE', 'PAY_2','PAY_3','PAY_4','PAY_5','PAY_6']
# df1[categorical_vars] = df1[categorical_vars].astype(str)
# df1 = pd.get_dummies(df1,columns=categorical_vars,drop_first=True)
# df1.head()
# col_to_norm = ['LIMIT_BAL','AGE','BILL_AMT1','PAY_AMT1','PAY_AMT2','PAY_AMT3','PAY_AMT4','PAY_AMT5','PAY_AMT6']
# df1[col_to_norm] = df1[col_to_norm].apply(lambda x : (x-np.mean(x))/np.std(x))
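# Note: instead of normalizing the whole dataset up front (commented out above), scaling
# is done inside each split below (fit on the training fold, transform both folds),
# which avoids leaking test-set statistics into training.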
y = df1.default_payment_next_month
x = df1.drop(['default_payment_next_month'], axis = 1)
test_sizes = [0.1, 0.2, 0.3]
scaler = StandardScaler()
lg = linear_model.LogisticRegression(random_state=40, max_iter=500, solver='lbfgs')
for ts in test_sizes:
    print("sklearn - LogisticRegression: " + str(ts))
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=ts, random_state=40)
    scaler.fit(x_train)
    x_train = scaler.transform(x_train)
    x_test = scaler.transform(x_test)
    lg.fit(x_train, y_train)
    y_pred = lg.predict(x_train)
    print("Train accuracy_score:", accuracy_score(y_train, y_pred))
    print("Train f1_score:", f1_score(y_train, y_pred))
    print("Train roc_auc_score:", roc_auc_score(y_train, y_pred))
    y_pred = lg.predict(x_test)
    print("Test accuracy_score:", accuracy_score(y_test, y_pred))
    print("Test f1_score:", f1_score(y_test, y_pred))
    print("Test roc_auc_score:", roc_auc_score(y_test, y_pred))
    print("")
# Confusion matrix
lg = linear_model.LogisticRegression(random_state=40, max_iter=500, solver='lbfgs')
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=40)
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
print("Test accuracy: {} ".format(lg.fit(x_train, y_train).score(x_test, y_test)))
score = lg.score(x_test, y_test)
predictions = lg.predict(x_test)
cm = metrics.confusion_matrix(y_test, predictions)
cm
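# Optional: render the confusion matrix as a heatmap for readability (a sketch):
plt.figure(figsize=(5, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()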
classifier = MLPClassifier(hidden_layer_sizes=(120, 80, 40, 10), max_iter=2000, activation='relu', alpha=0.5, solver='sgd', random_state=40)
for ts in test_sizes:
    print("Partition:", ts)
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=ts, random_state=40)
    scaler.fit(x_train)
    x_train = scaler.transform(x_train)
    x_test = scaler.transform(x_test)
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_train)
    print("Train accuracy_score:", accuracy_score(y_train, y_pred))
    print("Train f1_score:", f1_score(y_train, y_pred))
    print("Train roc_auc_score:", roc_auc_score(y_train, y_pred))
    y_pred = classifier.predict(x_test)
    print("Test accuracy_score:", accuracy_score(y_test, y_pred))
    print("Test f1_score:", f1_score(y_test, y_pred))
    print("Test roc_auc_score:", roc_auc_score(y_test, y_pred))
    cm = metrics.confusion_matrix(y_test, y_pred)
    print(cm)
    print("")
svclassifier = SVC(kernel='rbf')
for ts in test_sizes:
    print("Partition:", ts)
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=ts, random_state=40)
    scaler.fit(x_train)
    x_train = scaler.transform(x_train)
    x_test = scaler.transform(x_test)
    svclassifier.fit(x_train, y_train)
    y_pred = svclassifier.predict(x_train)
    print("Train accuracy_score:", accuracy_score(y_train, y_pred))
    print("Train f1_score:", f1_score(y_train, y_pred))
    print("Train roc_auc_score:", roc_auc_score(y_train, y_pred))
    y_pred = svclassifier.predict(x_test)
    print("Test accuracy_score:", accuracy_score(y_test, y_pred))
    print("Test f1_score:", f1_score(y_test, y_pred))
    print("Test roc_auc_score:", roc_auc_score(y_test, y_pred))
    print("")
# Elbow search for k (commented out; prints the test error rate for each k and split):
# for k in range(1, 50):
#     knn = KNeighborsClassifier(n_neighbors=k, n_jobs=-1)
#     for ts in test_sizes:
#         x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=ts, random_state=40)
#         scaler.fit(x_train)
#         x_train = scaler.transform(x_train)
#         x_test = scaler.transform(x_test)
#         knn.fit(x_train, y_train)
#         pred_i = knn.predict(x_test)
#         print(str(k) + ',' + str(ts) + ", " + str(np.mean(pred_i != y_test)))
# plt.figure(figsize=(10, 6))
# plt.plot(range(1, 40), error_rate, color='blue', linestyle='dashed', marker='o',
#          markerfacecolor='red', markersize=10)
# plt.title('Error Rate vs. K Value')
# plt.xlabel('K')
# plt.ylabel('Error Rate')
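# n_neighbors=23 is presumably the value picked from the error-rate scan sketched above.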
knn = KNeighborsClassifier(n_neighbors=23)
for ts in test_sizes:
    print("Partition:", ts)
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=ts, random_state=40)
    scaler.fit(x_train)
    x_train = scaler.transform(x_train)
    x_test = scaler.transform(x_test)
    knn.fit(x_train, y_train)
    y_pred = knn.predict(x_train)
    print("Train accuracy_score:", accuracy_score(y_train, y_pred))
    print("Train f1_score:", f1_score(y_train, y_pred))
    print("Train roc_auc_score:", roc_auc_score(y_train, y_pred))
    y_pred = knn.predict(x_test)
    print("Test accuracy_score:", accuracy_score(y_test, y_pred))
    print("Test f1_score:", f1_score(y_test, y_pred))
    print("Test roc_auc_score:", roc_auc_score(y_test, y_pred))
    print("")
naive_bayes = GaussianNB()
for ts in test_sizes:
    print("Partition:", ts)
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=ts, random_state=40)
    scaler.fit(x_train)
    x_train = scaler.transform(x_train)
    x_test = scaler.transform(x_test)
    naive_bayes.fit(x_train, y_train)
    y_pred = naive_bayes.predict(x_train)
    print("Train accuracy_score:", accuracy_score(y_train, y_pred))
    print("Train f1_score:", f1_score(y_train, y_pred))
    print("Train roc_auc_score:", roc_auc_score(y_train, y_pred))
    y_pred = naive_bayes.predict(x_test)
    print("Test accuracy_score:", accuracy_score(y_test, y_pred))
    print("Test f1_score:", f1_score(y_test, y_pred))
    print("Test roc_auc_score:", roc_auc_score(y_test, y_pred))
    print("")