Skip to content
Snippets Groups Projects
Commit 27d18c1d authored by Niklas's avatar Niklas
Browse files

Finished comparison of classification methods notebook

parent 5c3d5e41
No related branches found
No related tags found
1 merge request!77Resolve "Add comparison exercise"
%% Cell type:code id: tags:
``` python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
```
%% Cell type:code id: tags:
``` python
# Load the Titanic passenger table and preview its first rows.
titanic_csv = "../data/Titanic/titanic.csv"
data = pd.read_csv(titanic_csv)
data.head()
```
%% Output
PassengerId Survived Pclass \
0 1 0 3
1 2 1 1
2 3 1 3
3 4 1 1
4 5 0 3
Name Sex Age SibSp \
0 Braund, Mr. Owen Harris male 22.0 1
1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1
2 Heikkinen, Miss. Laina female 26.0 0
3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1
4 Allen, Mr. William Henry male 35.0 0
Parch Ticket Fare Cabin Embarked
0 0 A/5 21171 7.2500 NaN S
1 0 PC 17599 71.2833 C85 C
2 0 STON/O2. 3101282 7.9250 NaN S
3 0 113803 53.1000 C123 S
4 0 373450 8.0500 NaN S
%% Cell type:code id: tags:
``` python
# Encode the textual Sex column numerically so the estimators can use it.
sex_codes = {"male": 0, "female": 1}
data["Sex"] = data["Sex"].map(sex_codes)

# Columns used as model inputs and the column to predict.
features = ["Pclass", "Sex", "Age", "SibSp", "Parch"]
target = ["Survived"]
```
%% Cell type:code id: tags:
``` python
# Count the missing values in every column.
data.isna().sum()
```
%% Output
PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 177
SibSp 0
Parch 0
Ticket 0
Fare 0
Cabin 687
Embarked 2
dtype: int64
%% Cell type:code id: tags:
``` python
# Replace missing ages with the mean age.
# NOTE(review): the imputer is fitted on the full dataset, i.e. before the
# train/test split performed later in the notebook, so the mean also reflects
# rows that end up in the test set.
imp = SimpleImputer(strategy="mean")
age_filled = imp.fit_transform(data[["Age"]])
data["Age"] = age_filled
```
%% Cell type:code id: tags:
``` python
# Split the data 50/50 into a training and a test set (fixed seed).
X = data[features]
y = data[target]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=42
)

# `target` is a one-element list, so the label frames have a single column;
# flatten them to 1-D arrays for the estimators and the metric.
y_train = y_train.to_numpy().ravel()
y_test = y_test.to_numpy().ravel()
```
%% Cell type:code id: tags:
``` python
# One instance per method to compare, all with default hyperparameters.
# The estimators that use random numbers get a fixed seed (the same one as the
# train/test split) so the reported accuracies are reproducible between runs.
nearest_neighbors = KNeighborsClassifier()
decision_tree = DecisionTreeClassifier(random_state=42)
random_forest = RandomForestClassifier(random_state=42)
naive_bayes = GaussianNB()
# KMeans is a clustering estimator rather than a classifier: it produces
# cluster indices (here 0..2), not survival labels.
k_means = KMeans(n_clusters=3, random_state=42)
logistic_reg = LogisticRegression()
support_vector = SVC()
```
%% Cell type:code id: tags:
``` python
# Fit every estimator on the training split, in the original order.
for model in (
    nearest_neighbors,
    decision_tree,
    random_forest,
    naive_bayes,
    k_means,
    logistic_reg,
):
    model.fit(X_train, y_train)

# Kept as the final expression so the cell still displays the fitted SVC.
support_vector.fit(X_train, y_train)
```
%% Output
SVC()
%% Cell type:code id: tags:
``` python
# Predict the survival label of every test passenger with each model.
nearest_neighbors_pred = nearest_neighbors.predict(X_test)
decision_tree_pred = decision_tree.predict(X_test)
random_forest_pred = random_forest.predict(X_test)
naive_bayes_pred = naive_bayes.predict(X_test)

# KMeans.predict returns cluster indices whose numbering is arbitrary, so they
# cannot be compared with the 0/1 survival labels directly. Translate each
# cluster into the survival class that is most frequent among the training
# samples assigned to it, then apply that mapping to the test predictions.
train_clusters = k_means.labels_
cluster_to_class = {
    cluster: np.bincount(y_train[train_clusters == cluster]).argmax()
    for cluster in np.unique(train_clusters)
}
k_means_pred = np.array(
    [cluster_to_class[cluster] for cluster in k_means.predict(X_test)]
)

logistic_reg_pred = logistic_reg.predict(X_test)
support_vector_pred = support_vector.predict(X_test)
```
%% Cell type:code id: tags:
``` python
# Accuracy of each model's predictions on the test labels, computed in one
# pass; the unpacking order matches the order of the prediction tuple.
(
    nearest_neighbors_acc,
    decision_tree_acc,
    random_forest_acc,
    naive_bayes_acc,
    k_means_acc,
    logistic_reg_acc,
    support_vector_acc,
) = (
    accuracy_score(y_test, predictions)
    for predictions in (
        nearest_neighbors_pred,
        decision_tree_pred,
        random_forest_pred,
        naive_bayes_pred,
        k_means_pred,
        logistic_reg_pred,
        support_vector_pred,
    )
)
```
%% Cell type:code id: tags:
``` python
# Bar labels and the matching accuracies, in the same model order.
x_axis = [
    "K-Nearest Neighbors",
    "Decision Tree",
    "Random Forest",
    "Naive Bayes",
    "K-Means",
    "Logistic Regression",
    "Support Vector Machine",
]
heights = [
    nearest_neighbors_acc,
    decision_tree_acc,
    random_forest_acc,
    naive_bayes_acc,
    k_means_acc,
    logistic_reg_acc,
    support_vector_acc,
]

# Draw one bar per model on a single axes.
fig, ax = plt.subplots()
ax.set_title("Accuracy metric comparison")
ax.grid()
ax.bar(x=x_axis, height=heights)
# Rotate the tick labels so the long model names do not overlap.
fig.autofmt_xdate()
plt.show()
```
%% Output
[0.7802690582959642, 0.7780269058295964, 0.8183856502242153, 0.7713004484304933, 0.484304932735426, 0.7959641255605381, 0.5986547085201793]
%% Cell type:markdown id: tags:
This comparison shows that, for this dataset, Random Forest gives the best result. However, a comparison like this is not necessarily ideal, since different algorithms excel at different problems.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment