Commit fbf5e411 authored by Niklas

added a parameter to kmeans

parent 27d18c1d
Merge request !77: Resolve "Add comparison exercise"
%% Cell type:code id: tags:

``` python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
```
%% Cell type:code id: tags:

``` python
data = pd.read_csv("../data/Titanic/titanic.csv")
data.head()
```
%% Output

       PassengerId  Survived  Pclass  \
    0            1         0       3
    1            2         1       1
    2            3         1       3
    3            4         1       1
    4            5         0       3

                                                    Name     Sex   Age  SibSp  \
    0                            Braund, Mr. Owen Harris    male  22.0      1
    1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1
    2                             Heikkinen, Miss. Laina  female  26.0      0
    3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1
    4                           Allen, Mr. William Henry    male  35.0      0

       Parch            Ticket     Fare Cabin Embarked
    0      0         A/5 21171   7.2500   NaN        S
    1      0          PC 17599  71.2833   C85        C
    2      0  STON/O2. 3101282   7.9250   NaN        S
    3      0            113803  53.1000  C123        S
    4      0            373450   8.0500   NaN        S
%% Cell type:code id: tags:

``` python
data["Sex"] = data["Sex"].map({"male": 0, "female": 1})
features = ["Pclass", "Sex", "Age", "SibSp", "Parch"]
target = ["Survived"]
```

%% Cell type:code id: tags:

``` python
data.isnull().sum()
```
%% Output

    PassengerId      0
    Survived         0
    Pclass           0
    Name             0
    Sex              0
    Age            177
    SibSp            0
    Parch            0
    Ticket           0
    Fare             0
    Cabin          687
    Embarked         2
    dtype: int64
%% Cell type:code id: tags:

``` python
imp = SimpleImputer(strategy="mean")
data["Age"] = imp.fit_transform(data[["Age"]])
```
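%% Cell type:markdown id: tags:

One caveat: the imputer above is fit on the full dataset before the train/test split, so the test rows contribute to the imputed mean. A leakage-free sketch (the names `X_tr`/`X_te` are hypothetical and unused elsewhere in this notebook) splits first and fits the imputer on the training half only:

%% Cell type:code id: tags:

``` python
# Sketch: split first, then fit the imputer on the training half only,
# so the test half cannot influence the imputed mean.
X_tr, X_te = train_test_split(data[features], test_size=0.5, random_state=42)
imp_tr = SimpleImputer(strategy="mean").fit(X_tr)
X_tr = pd.DataFrame(imp_tr.transform(X_tr), columns=features, index=X_tr.index)
X_te = pd.DataFrame(imp_tr.transform(X_te), columns=features, index=X_te.index)
```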
%% Cell type:code id: tags:

``` python
X_train, X_test, y_train, y_test = train_test_split(
    data[features], data[target], test_size=0.5, random_state=42)
y_train = np.array(y_train).ravel()
y_test = np.array(y_test).ravel()
```
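%% Cell type:markdown id: tags:

With a 50/50 split, the class balance in the two halves is left to chance. A stratified variant (a sketch with hypothetical names, so it does not overwrite the variables above) keeps the survival rate identical in both halves:

%% Cell type:code id: tags:

``` python
# Sketch: stratifying on the target keeps the 0/1 class proportions
# identical in both halves of the split.
X_tr_s, X_te_s, y_tr_s, y_te_s = train_test_split(
    data[features], data["Survived"],
    test_size=0.5, random_state=42, stratify=data["Survived"])
```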
%% Cell type:code id: tags:

``` python
nearest_neighbors = KNeighborsClassifier()
decision_tree = DecisionTreeClassifier()
random_forest = RandomForestClassifier()
naive_bayes = GaussianNB()
# 'k-means++' is scikit-learn's default seeding strategy; naming it here just
# makes the initialisation explicit. Note that K-Means is a clustering
# algorithm, not a classifier: its labels are arbitrary cluster IDs.
k_means = KMeans(n_clusters=3, init='k-means++')
logistic_reg = LogisticRegression()
support_vector = SVC()
```
%% Cell type:code id: tags:

``` python
nearest_neighbors.fit(X_train, y_train)
decision_tree.fit(X_train, y_train)
random_forest.fit(X_train, y_train)
naive_bayes.fit(X_train, y_train)
k_means.fit(X_train, y_train)  # KMeans ignores y_train; it clusters X_train unsupervised
logistic_reg.fit(X_train, y_train)
support_vector.fit(X_train, y_train)
```

%% Output

    SVC()
%% Cell type:code id: tags:

``` python
nearest_neighbors_pred = nearest_neighbors.predict(X_test)
decision_tree_pred = decision_tree.predict(X_test)
random_forest_pred = random_forest.predict(X_test)
naive_bayes_pred = naive_bayes.predict(X_test)
k_means_pred = k_means.predict(X_test)  # cluster IDs (0-2), not Survived labels
logistic_reg_pred = logistic_reg.predict(X_test)
support_vector_pred = support_vector.predict(X_test)
```
%% Cell type:code id: tags:

``` python
nearest_neighbors_acc = accuracy_score(y_test, nearest_neighbors_pred)
decision_tree_acc = accuracy_score(y_test, decision_tree_pred)
random_forest_acc = accuracy_score(y_test, random_forest_pred)
naive_bayes_acc = accuracy_score(y_test, naive_bayes_pred)
k_means_acc = accuracy_score(y_test, k_means_pred)  # compares raw cluster IDs with classes
logistic_reg_acc = accuracy_score(y_test, logistic_reg_pred)
support_vector_acc = accuracy_score(y_test, support_vector_pred)
```
%% Cell type:code id: tags:

``` python
x_axis = ["K-Nearest Neighbors", "Decision Tree", "Random Forest", "Naive Bayes",
          "K-Means", "Logistic Regression", "Support Vector Machine"]
heights = [nearest_neighbors_acc, decision_tree_acc, random_forest_acc, naive_bayes_acc,
           k_means_acc, logistic_reg_acc, support_vector_acc]
fig, ax = plt.subplots()
plt.title("Accuracy metric comparison")
plt.grid()
plt.bar(x=x_axis, height=heights)
fig.autofmt_xdate()  # rotates the long category labels on the x axis
plt.show()
```
%% Output
[0.7802690582959642, 0.7780269058295964, 0.8183856502242153, 0.7713004484304933, 0.484304932735426, 0.7959641255605381, 0.5986547085201793]
%% Cell type:markdown id: tags:

This comparison shows that, for this dataset, Random Forest gives the best result. A comparison like this is not necessarily conclusive, however, since different algorithms excel at different problems. K-Means in particular is an unsupervised clustering algorithm: it never sees the labels during fitting, and its predicted cluster IDs are not aligned with the `Survived` classes, which is why its score above is close to chance.
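%% Cell type:markdown id: tags:

A minimal sketch of one way to score K-Means more fairly on this task: map each cluster to the majority `Survived` class among the training rows assigned to it, then translate predicted cluster IDs into class labels before computing accuracy. The name `cluster_to_class` is illustrative, not part of the notebook above.

%% Cell type:code id: tags:

``` python
# Assign each cluster the majority training class of its members,
# then score the label-mapped test predictions.
train_clusters = k_means.predict(X_train)
cluster_to_class = {
    c: np.bincount(y_train[train_clusters == c]).argmax()
    for c in np.unique(train_clusters)
}
mapped_pred = np.array([cluster_to_class[c] for c in k_means.predict(X_test)])
print(accuracy_score(y_test, mapped_pred))
```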