
Resolve "Add comparison exercise"

Merged Felix Matthias Krumm requested to merge 72-add-comparison-exercise into main
%% Cell type:code id: tags:
``` python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
```
%% Cell type:code id: tags:
``` python
data = pd.read_csv("../data/Titanic/titanic.csv")
data.head()
```
%% Output
   PassengerId  Survived  Pclass                                               Name     Sex   Age  SibSp  Parch            Ticket     Fare Cabin Embarked
0            1         0       3                            Braund, Mr. Owen Harris    male  22.0      1      0         A/5 21171   7.2500   NaN        S
1            2         1       1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1      0          PC 17599  71.2833   C85        C
2            3         1       3                             Heikkinen, Miss. Laina  female  26.0      0      0  STON/O2. 3101282   7.9250   NaN        S
3            4         1       1       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1      0            113803  53.1000  C123        S
4            5         0       3                           Allen, Mr. William Henry    male  35.0      0      0            373450   8.0500   NaN        S
%% Cell type:code id: tags:
``` python
data["Sex"] = data["Sex"].map({"male":0, "female":1})
features = ["Pclass", "Sex", "Age", "SibSp", "Parch"]
target = ["Survived"]
```
%% Cell type:code id: tags:
``` python
data.isnull().sum()
```
%% Output
PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 177
SibSp 0
Parch 0
Ticket 0
Fare 0
Cabin 687
Embarked 2
dtype: int64
%% Cell type:code id: tags:
``` python
# "Age" has 177 missing values; fill them with the column mean
imp = SimpleImputer(strategy="mean")
data["Age"] = imp.fit_transform(data[["Age"]])
```
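%% Cell type:markdown id: tags:
Mean imputation is a reasonable default; a common alternative, sketched below, is the median, which is less sensitive to outlying ages. This cell is illustrative only: run after the mean imputation above, it has no effect, since no missing values remain.
%% Cell type:code id: tags:
``` python
# Alternative sketch: median imputation is more robust to outliers
imp_median = SimpleImputer(strategy="median")
data["Age"] = imp_median.fit_transform(data[["Age"]])
```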
%% Cell type:code id: tags:
``` python
# Hold out half of the data for testing
X_train, X_test, y_train, y_test = train_test_split(
    data[features], data[target], test_size=0.5, random_state=42
)
# Flatten the targets into 1-D arrays, as the classifiers expect
y_train = np.array(y_train).ravel()
y_test = np.array(y_test).ravel()
```
%% Cell type:code id: tags:
``` python
# One model of each type, all with default hyperparameters
nearest_neighbors = KNeighborsClassifier()
decision_tree = DecisionTreeClassifier()
random_forest = RandomForestClassifier()
naive_bayes = GaussianNB()
# K-Means is unsupervised; use two clusters to match the two classes
k_means = KMeans(n_clusters=2, init="k-means++")
logistic_reg = LogisticRegression()
support_vector = SVC()
```
%% Cell type:code id: tags:
``` python
nearest_neighbors.fit(X_train, y_train)
decision_tree.fit(X_train, y_train)
random_forest.fit(X_train, y_train)
naive_bayes.fit(X_train, y_train)
k_means.fit(X_train)  # unsupervised: K-Means does not use the labels
logistic_reg.fit(X_train, y_train)
support_vector.fit(X_train, y_train)
```
%% Output
SVC()
%% Cell type:code id: tags:
``` python
nearest_neighbors_pred = nearest_neighbors.predict(X_test)
decision_tree_pred = decision_tree.predict(X_test)
random_forest_pred = random_forest.predict(X_test)
naive_bayes_pred = naive_bayes.predict(X_test)
k_means_pred = k_means.predict(X_test)
logistic_reg_pred = logistic_reg.predict(X_test)
support_vector_pred = support_vector.predict(X_test)
```
%% Cell type:code id: tags:
``` python
nearest_neighbors_acc = accuracy_score(y_test, nearest_neighbors_pred)
decision_tree_acc = accuracy_score(y_test, decision_tree_pred)
random_forest_acc = accuracy_score(y_test, random_forest_pred)
naive_bayes_acc = accuracy_score(y_test, naive_bayes_pred)
k_means_acc = accuracy_score(y_test, k_means_pred)
logistic_reg_acc = accuracy_score(y_test, logistic_reg_pred)
support_vector_acc = accuracy_score(y_test, support_vector_pred)
```
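%% Cell type:markdown id: tags:
Because K-Means is unsupervised, its cluster indices are arbitrary: cluster 0 does not necessarily mean "did not survive", so the raw accuracy above can be misleadingly low. A minimal sketch of one common fix, mapping each cluster to the majority class among its training members (the helper `align_clusters` is our own, not part of scikit-learn):
%% Cell type:code id: tags:
``` python
def align_clusters(cluster_labels, true_labels):
    # Map each cluster index to the most frequent true label inside it
    return {cluster: np.bincount(true_labels[cluster_labels == cluster]).argmax()
            for cluster in np.unique(cluster_labels)}

# Learn the cluster-to-class mapping on the training data, then rescore
mapping = align_clusters(k_means.predict(X_train), y_train)
k_means_aligned_pred = np.array([mapping[c] for c in k_means_pred])
k_means_aligned_acc = accuracy_score(y_test, k_means_aligned_pred)
```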
%% Cell type:code id: tags:
``` python
x_axis = ["K-Nearest Neighbors", "Decision Tree", "Random Forest", "Naive Bayes",
          "K-Means", "Logistic Regression", "Support Vector Machine"]
heights = [nearest_neighbors_acc, decision_tree_acc, random_forest_acc, naive_bayes_acc,
           k_means_acc, logistic_reg_acc, support_vector_acc]

fig, ax = plt.subplots()
plt.title("Accuracy metric comparison")
plt.grid()
plt.bar(x=x_axis, height=heights)
fig.autofmt_xdate()  # rotate the long x-axis labels so they stay readable
plt.show()
```
%% Output

(Bar chart "Accuracy metric comparison": one accuracy bar per classifier.)
%% Cell type:markdown id: tags:
For this dataset, the Random Forest achieves the best accuracy. A single comparison like this is not necessarily conclusive, however: it rests on one train/test split, different algorithms excel at different problems, and K-Means, being unsupervised, is not directly comparable to the supervised classifiers in the first place.
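%% Cell type:markdown id: tags:
A more robust comparison would average each model's score over several splits instead of the single 50/50 split used above. A minimal sketch using scikit-learn's `cross_val_score` with 5-fold cross-validation (K-Means is omitted, since `cross_val_score` would evaluate it as a clusterer rather than a classifier):
%% Cell type:code id: tags:
``` python
from sklearn.model_selection import cross_val_score

X = data[features]
y = np.array(data[target]).ravel()

models = {
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Naive Bayes": GaussianNB(),
    "Logistic Regression": LogisticRegression(),
    "Support Vector Machine": SVC(),
}

# Average accuracy over 5 train/test splits for each model
for name, model in models.items():
    scores = cross_val_score(model, X, y, cv=5, scoring="accuracy")
    print(f"{name}: {scores.mean():.3f} (+/- {scores.std():.3f})")
```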