diff --git a/notebooks/source/algorithm_comparison_classification.ipynb b/notebooks/source/algorithm_comparison_classification.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..54fd1b63a41cf871a676fc604517118c73ddd929 --- /dev/null +++ b/notebooks/source/algorithm_comparison_classification.ipynb @@ -0,0 +1,283 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 220, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.neighbors import KNeighborsClassifier\n", + "from sklearn.tree import DecisionTreeClassifier\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.naive_bayes import GaussianNB\n", + "from sklearn.cluster import KMeans\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.svm import SVC\n", + "from sklearn.metrics import accuracy_score\n", + "from sklearn.impute import SimpleImputer" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "source": [ + "data = pd.read_csv(\"../data/Titanic/titanic.csv\")\n", + "\n", + "data.head()\n" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "execution_count": 221, + "outputs": [ + { + "data": { + "text/plain": " PassengerId Survived Pclass \\\n0 1 0 3 \n1 2 1 1 \n2 3 1 3 \n3 4 1 1 \n4 5 0 3 \n\n Name Sex Age SibSp \\\n0 Braund, Mr. Owen Harris male 22.0 1 \n1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n2 Heikkinen, Miss. Laina female 26.0 0 \n3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n4 Allen, Mr. William Henry male 35.0 0 \n\n Parch Ticket Fare Cabin Embarked \n0 0 A/5 21171 7.2500 NaN S \n1 0 PC 17599 71.2833 C85 C \n2 0 STON/O2. 
3101282 7.9250 NaN S \n3 0 113803 53.1000 C123 S \n4 0 373450 8.0500 NaN S ", + "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>PassengerId</th>\n <th>Survived</th>\n <th>Pclass</th>\n <th>Name</th>\n <th>Sex</th>\n <th>Age</th>\n <th>SibSp</th>\n <th>Parch</th>\n <th>Ticket</th>\n <th>Fare</th>\n <th>Cabin</th>\n <th>Embarked</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>1</td>\n <td>0</td>\n <td>3</td>\n <td>Braund, Mr. Owen Harris</td>\n <td>male</td>\n <td>22.0</td>\n <td>1</td>\n <td>0</td>\n <td>A/5 21171</td>\n <td>7.2500</td>\n <td>NaN</td>\n <td>S</td>\n </tr>\n <tr>\n <th>1</th>\n <td>2</td>\n <td>1</td>\n <td>1</td>\n <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n <td>female</td>\n <td>38.0</td>\n <td>1</td>\n <td>0</td>\n <td>PC 17599</td>\n <td>71.2833</td>\n <td>C85</td>\n <td>C</td>\n </tr>\n <tr>\n <th>2</th>\n <td>3</td>\n <td>1</td>\n <td>3</td>\n <td>Heikkinen, Miss. Laina</td>\n <td>female</td>\n <td>26.0</td>\n <td>0</td>\n <td>0</td>\n <td>STON/O2. 3101282</td>\n <td>7.9250</td>\n <td>NaN</td>\n <td>S</td>\n </tr>\n <tr>\n <th>3</th>\n <td>4</td>\n <td>1</td>\n <td>1</td>\n <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n <td>female</td>\n <td>35.0</td>\n <td>1</td>\n <td>0</td>\n <td>113803</td>\n <td>53.1000</td>\n <td>C123</td>\n <td>S</td>\n </tr>\n <tr>\n <th>4</th>\n <td>5</td>\n <td>0</td>\n <td>3</td>\n <td>Allen, Mr. 
William Henry</td>\n <td>male</td>\n <td>35.0</td>\n <td>0</td>\n <td>0</td>\n <td>373450</td>\n <td>8.0500</td>\n <td>NaN</td>\n <td>S</td>\n </tr>\n </tbody>\n</table>\n</div>" + }, + "execution_count": 221, + "metadata": {}, + "output_type": "execute_result" + } + ] + }, + { + "cell_type": "code", + "execution_count": 222, + "outputs": [], + "source": [ + "data[\"Sex\"] = data[\"Sex\"].map({\"male\":0, \"female\":1})\n", + "\n", + "features = [\"Pclass\", \"Sex\", \"Age\", \"SibSp\", \"Parch\"]\n", + "target = [\"Survived\"]" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "source": [ + "data.isnull().sum()" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "execution_count": 223, + "outputs": [ + { + "data": { + "text/plain": "PassengerId 0\nSurvived 0\nPclass 0\nName 0\nSex 0\nAge 177\nSibSp 0\nParch 0\nTicket 0\nFare 0\nCabin 687\nEmbarked 2\ndtype: int64" + }, + "execution_count": 223, + "metadata": {}, + "output_type": "execute_result" + } + ] + }, + { + "cell_type": "code", + "execution_count": 224, + "outputs": [], + "source": [ + "imp = SimpleImputer(strategy=\"mean\")\n", + "\n", + "data[\"Age\"] = imp.fit_transform(data[[\"Age\"]])" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(data[features], data[target], test_size=0.5, random_state=42)\n", + "\n", + "y_train = np.array(y_train).ravel()\n", + "y_test = np.array(y_test).ravel()" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "execution_count": 225, + "outputs": [] + }, + { + "cell_type": "code", + "execution_count": 226, + "outputs": [], + "source": [ + "nearest_neighbors = KNeighborsClassifier()\n", + "decision_tree = DecisionTreeClassifier()\n", + "random_forest = RandomForestClassifier()\n", + "naive_bayes = 
GaussianNB()\n", + "k_means = KMeans(n_clusters=3, init='k-means++')\n", + "logistic_reg = LogisticRegression()\n", + "support_vector = SVC()" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 227, + "outputs": [ + { + "data": { + "text/plain": "SVC()" + }, + "execution_count": 227, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nearest_neighbors.fit(X_train, y_train)\n", + "decision_tree.fit(X_train, y_train)\n", + "random_forest.fit(X_train, y_train)\n", + "naive_bayes.fit(X_train, y_train)\n", + "k_means.fit(X_train, y_train)\n", + "logistic_reg.fit(X_train, y_train)\n", + "support_vector.fit(X_train, y_train)" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 228, + "outputs": [], + "source": [ + "nearest_neighbors_pred = nearest_neighbors.predict(X_test)\n", + "decision_tree_pred = decision_tree.predict(X_test)\n", + "random_forest_pred = random_forest.predict(X_test)\n", + "naive_bayes_pred = naive_bayes.predict(X_test)\n", + "k_means_pred = k_means.predict(X_test)\n", + "logistic_reg_pred = logistic_reg.predict(X_test)\n", + "support_vector_pred = support_vector.predict(X_test)" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 229, + "outputs": [], + "source": [ + "nearest_neighbors_acc = accuracy_score(y_test, nearest_neighbors_pred)\n", + "decision_tree_acc = accuracy_score(y_test, decision_tree_pred)\n", + "random_forest_acc = accuracy_score(y_test, random_forest_pred)\n", + "naive_bayes_acc = accuracy_score(y_test, naive_bayes_pred)\n", + "k_means_acc = accuracy_score(y_test, k_means_pred)\n", + "logistic_reg_acc = accuracy_score(y_test, logistic_reg_pred)\n", + "support_vector_acc = accuracy_score(y_test, support_vector_pred)" + ], + "metadata": { + "collapsed": 
false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 230, + "outputs": [ + { + "data": { + "text/plain": "<Figure size 432x288 with 1 Axes>", + "image/png": "\n" + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "\n", + "x_axis = [\"K-Nearest Neighbors\",\"Decision Tree\", \"Random Forest\", \"Naive Bayes\", \"K-Means\",\"Logistic Regression\",\"Support Vector Machine\"]\n", + "heights = [nearest_neighbors_acc,decision_tree_acc,random_forest_acc,naive_bayes_acc,k_means_acc,logistic_reg_acc,support_vector_acc]\n", + "fig, ax = plt.subplots()\n", + "plt.title(\"Accuracy metric comparison\")\n", + "plt.grid()\n", + "plt.bar(x=x_axis,height=heights)\n", + "\n", + "fig.autofmt_xdate()\n", + "\n", + "plt.show()" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "This comparison shows that, for this dataset, using Random Forest gives us the best result. A comparison like this is, however, not necessarily ideal, since different algorithms excel at different problems. Note also that K-Means is an unsupervised clustering algorithm: its predicted cluster indices (0-2 here, because n_clusters=3) are arbitrary and do not correspond to the Survived labels, so its accuracy score is not directly comparable to the scores of the classifiers." + ], + "metadata": { + "collapsed": false + } + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file