# Hand-written K-Means.
# Glossary: cog = centre of gravity (centroid), cs = current centroids,
# k = number of clusters.


def _kmeans(points, k, tol=0.3, max_iter=100):
    """Cluster ``points`` into ``k`` groups with Lloyd's algorithm.

    The first ``k`` rows of ``points`` are used as the starting centroids.

    Parameters
    ----------
    points : array-like, one row per sample
    k : int
        Number of clusters; must satisfy ``1 <= k <= len(points)``.
    tol : float
        Stop once the summed absolute movement of all centroids between two
        consecutive iterations drops below this value.
    max_iter : int
        Hard cap on the number of iterations so the loop always terminates.

    Returns
    -------
    centers : ndarray, one row per cluster
        Final centroids.
    labels : ndarray of int, one entry per sample
        Index of the centroid each sample was assigned to in the last
        assignment step.

    Raises
    ------
    ValueError
        If ``k`` is not in ``1..len(points)``.
    """
    points = np.asarray(points, dtype=float)
    if not 1 <= k <= len(points):
        raise ValueError(f"k must be between 1 and {len(points)}, got {k}")

    centers = points[:k].copy()
    labels = np.zeros(len(points), dtype=int)
    for _ in range(max_iter):
        # Distance of every sample to every centroid via broadcasting:
        # result has one row per sample and one column per centroid.
        dist = np.linalg.norm(points[:, None, :] - centers[None, :, :], axis=2)
        # Assign each sample to its nearest centroid.
        labels = np.argmin(dist, axis=1)

        # Recompute each centroid as the mean of its members. A cluster that
        # lost all its members keeps its previous centroid; dividing by zero
        # here would yield NaN and the convergence test could never succeed.
        new_centers = centers.copy()
        for i in range(k):
            members = points[labels == i]
            if len(members):
                new_centers[i] = members.mean(axis=0)

        # Total centroid movement, measured against the centroids actually
        # used in this iteration (not against an arbitrary origin).
        shift = np.sum(np.abs(new_centers - centers))
        centers = new_centers
        if shift < tol:
            break
    return centers, labels


# Number(s) of clusters to evaluate.
ks = [2]

for k in ks:
    cs, z = _kmeans(data, k)
    # df[i] holds the samples that belong to cluster i.
    df = [data[z == i] for i in range(k)]
    # Final centroids as a list of 1-D arrays, one per cluster.
    current_cog = list(cs)
    cs = current_cog
# BIRCH clustering of the same samples, for comparison with K-Means.
# With n_clusters=None, Birch skips its final global clustering step and
# returns one label per subcluster, so the number of labels is not known in
# advance and can exceed two.
brc = Birch(n_clusters=None)
brc.fit(data)
data_predicted = brc.predict(data)

# Plot every cluster that was found. Selecting only labels 0 and 1 would
# silently drop all samples assigned to any further subcluster.
palette = ['green', 'pink']  # same colours as the K-Means plots for the first two
for label in np.unique(data_predicted):
    members = data[data_predicted == label]
    # Beyond the fixed palette, let matplotlib pick the next cycle colour.
    color = palette[label] if label < len(palette) else None
    plt.scatter(members[:, 0], members[:, 1], color=color, label=f'cluster {label}')
plt.title('BIRCH clusters')
plt.legend()
plt.show()