
Resolve "Add comparison exercise"

Merged Felix Matthias Krumm requested to merge 72-add-comparison-exercise into main
%% Cell type:code id: tags:
``` python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
```
%% Cell type:code id: tags:
``` python
data = pd.read_csv("../data/Titanic/titanic.csv")
data.head()
```
%% Output
   PassengerId  Survived  Pclass                                               Name     Sex   Age  SibSp  Parch            Ticket     Fare Cabin Embarked
0            1         0       3                            Braund, Mr. Owen Harris    male  22.0      1      0         A/5 21171   7.2500   NaN        S
1            2         1       1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1      0          PC 17599  71.2833   C85        C
2            3         1       3                             Heikkinen, Miss. Laina  female  26.0      0      0  STON/O2. 3101282   7.9250   NaN        S
3            4         1       1       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1      0            113803  53.1000  C123        S
4            5         0       3                           Allen, Mr. William Henry    male  35.0      0      0            373450   8.0500   NaN        S
%% Cell type:code id: tags:
``` python
data["Sex"] = data["Sex"].map({"male":0, "female":1})
features = ["Pclass", "Sex", "Age", "SibSp", "Parch"]
target = ["Survived"]
```
%% Cell type:code id: tags:
``` python
data.isnull().sum()
```
%% Output
PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 177
SibSp 0
Parch 0
Ticket 0
Fare 0
Cabin 687
Embarked 2
dtype: int64
%% Cell type:code id: tags:
``` python
# "Age" has 177 missing values; fill them with the column mean
imp = SimpleImputer(strategy="mean")
data["Age"] = imp.fit_transform(data[["Age"]])
```
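%% Cell type:markdown id: tags:
Mean imputation is a reasonable default; a common alternative, sketched below, is the median, which is less sensitive to outlying ages. This cell is illustrative only: run after the mean imputation above, it has no effect, since no missing values remain.
%% Cell type:code id: tags:
``` python
# Alternative sketch: median imputation is more robust to outliers
imp_median = SimpleImputer(strategy="median")
data["Age"] = imp_median.fit_transform(data[["Age"]])
```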
%% Cell type:code id: tags:
``` python
# Hold out half of the data for testing
X_train, X_test, y_train, y_test = train_test_split(
    data[features], data[target], test_size=0.5, random_state=42
)
# Flatten the targets into 1-D arrays, as the classifiers expect
y_train = np.array(y_train).ravel()
y_test = np.array(y_test).ravel()
```
%% Cell type:code id: tags:
``` python
# One model of each type, all with default hyperparameters
nearest_neighbors = KNeighborsClassifier()
decision_tree = DecisionTreeClassifier()
random_forest = RandomForestClassifier()
naive_bayes = GaussianNB()
# K-Means is unsupervised; use two clusters to match the two classes
k_means = KMeans(n_clusters=2, init="k-means++")
logistic_reg = LogisticRegression()
support_vector = SVC()
```
%% Cell type:code id: tags:
``` python
nearest_neighbors.fit(X_train, y_train)
decision_tree.fit(X_train, y_train)
random_forest.fit(X_train, y_train)
naive_bayes.fit(X_train, y_train)
k_means.fit(X_train)  # unsupervised: K-Means does not use the labels
logistic_reg.fit(X_train, y_train)
support_vector.fit(X_train, y_train)
```
%% Output
SVC()
%% Cell type:code id: tags:
``` python
nearest_neighbors_pred = nearest_neighbors.predict(X_test)
decision_tree_pred = decision_tree.predict(X_test)
random_forest_pred = random_forest.predict(X_test)
naive_bayes_pred = naive_bayes.predict(X_test)
k_means_pred = k_means.predict(X_test)
logistic_reg_pred = logistic_reg.predict(X_test)
support_vector_pred = support_vector.predict(X_test)
```
%% Cell type:code id: tags:
``` python
nearest_neighbors_acc = accuracy_score(y_test, nearest_neighbors_pred)
decision_tree_acc = accuracy_score(y_test, decision_tree_pred)
random_forest_acc = accuracy_score(y_test, random_forest_pred)
naive_bayes_acc = accuracy_score(y_test, naive_bayes_pred)
k_means_acc = accuracy_score(y_test, k_means_pred)
logistic_reg_acc = accuracy_score(y_test, logistic_reg_pred)
support_vector_acc = accuracy_score(y_test, support_vector_pred)
```
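%% Cell type:markdown id: tags:
Because K-Means is unsupervised, its cluster indices are arbitrary: cluster 0 does not necessarily mean "did not survive", so the raw accuracy above can be misleadingly low. A minimal sketch of one common fix, mapping each cluster to the majority class among its training members (the helper `align_clusters` is our own, not part of scikit-learn):
%% Cell type:code id: tags:
``` python
def align_clusters(cluster_labels, true_labels):
    # Map each cluster index to the most frequent true label inside it
    return {cluster: np.bincount(true_labels[cluster_labels == cluster]).argmax()
            for cluster in np.unique(cluster_labels)}

# Learn the cluster-to-class mapping on the training data, then rescore
mapping = align_clusters(k_means.predict(X_train), y_train)
k_means_aligned_pred = np.array([mapping[c] for c in k_means_pred])
k_means_aligned_acc = accuracy_score(y_test, k_means_aligned_pred)
```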
%% Cell type:code id: tags:
``` python
x_axis = ["K-Nearest Neighbors", "Decision Tree", "Random Forest", "Naive Bayes",
          "K-Means", "Logistic Regression", "Support Vector Machine"]
heights = [nearest_neighbors_acc, decision_tree_acc, random_forest_acc, naive_bayes_acc,
           k_means_acc, logistic_reg_acc, support_vector_acc]

fig, ax = plt.subplots()
plt.title("Accuracy metric comparison")
plt.grid()
plt.bar(x=x_axis, height=heights)
fig.autofmt_xdate()  # rotate the long x-axis labels so they stay readable
plt.show()
```
%% Output

(Bar chart "Accuracy metric comparison": one accuracy bar per classifier.)
%% Cell type:markdown id: tags:
For this dataset, the Random Forest achieves the best accuracy. A single comparison like this is not necessarily conclusive, however: it rests on one train/test split, different algorithms excel at different problems, and K-Means, being unsupervised, is not directly comparable to the supervised classifiers in the first place.
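%% Cell type:markdown id: tags:
A more robust comparison would average each model's score over several splits instead of the single 50/50 split used above. A minimal sketch using scikit-learn's `cross_val_score` with 5-fold cross-validation (K-Means is omitted, since `cross_val_score` would evaluate it as a clusterer rather than a classifier):
%% Cell type:code id: tags:
``` python
from sklearn.model_selection import cross_val_score

X = data[features]
y = np.array(data[target]).ravel()

models = {
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Naive Bayes": GaussianNB(),
    "Logistic Regression": LogisticRegression(),
    "Support Vector Machine": SVC(),
}

# Average accuracy over 5 train/test splits for each model
for name, model in models.items():
    scores = cross_val_score(model, X, y, cv=5, scoring="accuracy")
    print(f"{name}: {scores.mean():.3f} (+/- {scores.std():.3f})")
```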