Many papers have used the iris dataset obtained from different sources, and some of these copies contain errors in their data. This has caused discrepancies between the machine learning models created by different authors; in addition, some authors do not state where they obtained the dataset. Creating a central repository for well-known datasets has been proposed, but although this has been attempted before, it was not successful. For now, it is recommended to take the values directly from Fisher’s paper.
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import math
# Read the copy of the iris data hosted in the author's GitHub repository.
# The column names are supplied explicitly through `names`.
df1 = pd.read_csv(
    'https://raw.githubusercontent.com/Wittline/Machine_Learning/master/Linear%20Regression/iris.data',
    sep=',',
    names=["sepallength", "sepalwidth", "petallength", "petalwidth", 'class'],
)
df1.head()

# Only the petal measurements are analysed below, so discard the two sepal
# columns (the first two columns of the frame).
df1 = df1.drop(columns=["sepallength", "sepalwidth"])

# One sub-frame per species, used later to colour the scatter plot.
setosa, versicolor, virginica = (
    df1[df1['class'] == species]
    for species in ('Iris-setosa', 'Iris-versicolor', 'Iris-virginica')
)

# Bare expressions: these only display something when run as the last
# statement of a notebook cell.
df1.head()
df1.describe()

# Predictor (petal length) and response (petal width) for the plots below.
x = df1["petallength"]
y = df1["petalwidth"]
# Overlay the distributions of both petal measurements in one figure; the
# partial transparency keeps overlapping bars readable.
plt.figure(figsize=(10, 7))
for series, series_label in ((x, 'Petal Length'), (y, 'Petal Width')):
    plt.hist(series, bins=20, alpha=0.6, label=series_label)
plt.title("Histogram", fontsize=20)
plt.ylabel("Frequency")
plt.grid(True, alpha=0.6)
plt.legend(loc='upper right')
plt.show()
# Box plot comparing the spread of petal length and petal width.
#
# The figure is created without an explicit number: asking for figure 1 would
# re-activate an already open figure 1 (e.g. the histogram, if it has not been
# closed) instead of starting a fresh canvas. This also matches how the other
# plots in this file create their figures.
plt.figure(figsize=(10, 7))
bp = plt.boxplot(
    [df1.petallength, df1.petalwidth],
    # Boxes are vertical by default, so no orientation argument is needed.
    # patch_artist=True makes the boxes fillable patches so that they can be
    # given a face colour below.
    patch_artist=True,
    flierprops={'alpha': 0.6, 'markersize': 6,
                'markeredgecolor': '#555555', 'marker': 'd',
                'markerfacecolor': "#555555"},
    capprops={'color': '#555555', 'linewidth': 2},
    boxprops={'color': '#555555', 'linewidth': 2},
    whiskerprops={'color': '#555555', 'linewidth': 2},
    medianprops={'color': '#555555', 'linewidth': 2},
    # NOTE(review): means are not drawn unless showmeans=True is passed, so
    # this styling currently has no visible effect.
    meanprops={'color': '#555555', 'linewidth': 2},
)
plt.grid(True, alpha=0.6)
plt.title("Box Plot", fontsize=20)
# The y axis shows the measured values themselves, not counts.
# NOTE(review): the unit (cm) follows the published description of the iris
# data; confirm it for this copy of the file.
plt.ylabel("Measurement (cm)", fontsize=20)
plt.xticks(ticks=[1, 2], labels=['petallength', 'petalwidth'], fontsize=20)

# Fill colours: first box blue, second box orange.
bp['boxes'][0].set(facecolor='blue', alpha=0.6)
bp['boxes'][1].set(facecolor="orange", alpha=0.6)
plt.show()
# Petal length against petal width, one colour per species.
plt.figure(figsize=(10, 7))
species_groups = (
    (setosa, "setosa", 'orange'),
    (versicolor, "versicolor", 'b'),
    (virginica, "virginica", 'g'),
)
for group, group_label, group_color in species_groups:
    plt.scatter(
        x=group.petallength,
        y=group.petalwidth,
        label=group_label,
        color=group_color,
    )
plt.title("Scatter", fontsize=20)
plt.xlabel("petallength", fontsize=20)
plt.ylabel("petalwidth", fontsize=20)
plt.grid(True, alpha=0.6)
plt.legend()
plt.show()
def Least_square(x, y):
    """Fit the line ``y = a + b*x`` by ordinary least squares.

    Parameters
    ----------
    x, y : array-like of numbers
        Paired observations (list, NumPy array or pandas Series). Values are
        paired by position, so a Series with a non-default index is handled
        correctly.

    Returns
    -------
    a : intercept of the fitted line.
    b : slope of the fitted line.
    u : sum of (x - mean(x)) * (y - mean(y)).
    d : sum of (x - mean(x)) ** 2.
    e : sum of (y - mean(y)) ** 2.

    ``u``, ``d`` and ``e`` are raw sums: they are NOT divided by the number
    of observations, so they are the covariance / variances only up to that
    common factor.

    Raises
    ------
    ValueError
        If ``x`` and ``y`` do not have the same shape.
    """
    xs = np.asarray(x, dtype=float)
    ys = np.asarray(y, dtype=float)
    if xs.shape != ys.shape:
        raise ValueError(
            "x and y must have the same shape, got {} and {}".format(
                xs.shape, ys.shape))

    mx = xs.mean()
    my = ys.mean()

    # Deviations from the mean, computed once and reused for all three sums.
    dx = xs - mx
    dy = ys - my
    u = (dx * dy).sum()
    d = (dx * dx).sum()
    e = (dy * dy).sum()

    # y = a + b*x
    b = u / d
    a = my - b * mx
    return a, b, u, d, e
# Fit petal width as a linear function of petal length.
x = df1.petallength
y = df1.petalwidth
a, b, covxy, vx, vy = Least_square(x, y)
yp = a + b * x

plt.figure(figsize=(10, 7))
plt.scatter(x, y, color="black", label="(PetalLength, PetalWidth)")
# Draw the fitted line between the smallest and largest x by evaluating the
# model at those two points. Pairing min(yp) with min(x) would only be right
# for a non-negative slope. The legend is built from the fitted coefficients
# so it cannot drift from the data.
line_x = [min(x), max(x)]
line_y = [a + b * line_x[0], a + b * line_x[1]]
plt.plot(line_x, line_y, color="blue",
         label="y = {:.5f} + {:.4f}x".format(a, b))
plt.legend()
plt.show()

# Pearson correlation coefficient. Least_square returns un-normalised sums,
# but the common factor cancels in this ratio.
pearson = covxy / math.sqrt(vx * vy)
pearson
# Coefficient of determination (square of the Pearson coefficient).
R2 = pearson ** 2
R2