{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# AA-UTE 2024\n", "\n", "## Aprendizaje No Supervisado" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import sklearn\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", "\n", "# Para fijar tamaños en figuras\n", "plt.rc('font', size=14)\n", "plt.rc('axes', labelsize=14, titlesize=14)\n", "plt.rc('legend', fontsize=14)\n", "plt.rc('xtick', labelsize=10)\n", "plt.rc('ytick', labelsize=10)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Correr la siguiente celda con funciones auxiliares" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "### utils\n", "def plot_data(X):\n", " plt.plot(X[:, 0], X[:, 1], 'k.', markersize=2)\n", "\n", "def plot_centroids(centroids, weights=None, circle_color='w', cross_color='k'):\n", " if weights is not None:\n", " centroids = centroids[weights > weights.max() / 10]\n", " plt.scatter(centroids[:, 0], centroids[:, 1],\n", " marker='o', s=35, linewidths=8,\n", " color=circle_color, zorder=10, alpha=0.9)\n", " plt.scatter(centroids[:, 0], centroids[:, 1],\n", " marker='x', s=2, linewidths=12,\n", " color=cross_color, zorder=11, alpha=1)\n", "\n", "def plot_decision_boundaries(clusterer, X, resolution=1000, show_centroids=True,\n", " show_centers=False, show_xlabels=True, show_ylabels=True):\n", " mins = X.min(axis=0) - 0.1\n", " maxs = X.max(axis=0) + 0.1\n", " xx, yy = np.meshgrid(np.linspace(mins[0], maxs[0], resolution),\n", " np.linspace(mins[1], maxs[1], resolution))\n", " Z = clusterer.predict(np.c_[xx.ravel(), yy.ravel()])\n", " Z = Z.reshape(xx.shape)\n", "\n", " plt.contourf(Z, extent=(mins[0], maxs[0], mins[1], maxs[1]),\n", " cmap=\"Pastel2\")\n", " plt.contour(Z, extent=(mins[0], maxs[0], mins[1], maxs[1]),\n", " linewidths=1, colors='k')\n", " plot_data(X)\n", " if show_centroids:\n", " plot_centroids(clusterer.cluster_centers_)\n", "\n", 
" if show_centers:\n", " plot_centroids(clusterer.means_)\n", "\n", " if show_xlabels:\n", " plt.xlabel(\"$x_1$\")\n", " else:\n", " plt.tick_params(labelbottom=False)\n", " if show_ylabels:\n", " plt.ylabel(\"$x_2$\", rotation=0)\n", " else:\n", " plt.tick_params(labelleft=False)\n", "\n", "\n", "def plot_clusterer_comparison(clusterer1, clusterer2, X, title1=None,\n", " title2=None):\n", " clusterer1.fit(X)\n", " clusterer2.fit(X)\n", "\n", " plt.figure(figsize=(10, 3.2))\n", "\n", " plt.subplot(121)\n", " plot_decision_boundaries(clusterer1, X)\n", " if title1:\n", " plt.title(title1)\n", "\n", " plt.subplot(122)\n", " plot_decision_boundaries(clusterer2, X, show_ylabels=False)\n", " if title2:\n", " plt.title(title2)\n", "\n", "from matplotlib.colors import LogNorm\n", "\n", "def plot_gaussian_mixture(clusterer, X, resolution=1000, show_ylabels=True):\n", " mins = X.min(axis=0) - 0.1\n", " maxs = X.max(axis=0) + 0.1\n", " xx, yy = np.meshgrid(np.linspace(mins[0], maxs[0], resolution),\n", " np.linspace(mins[1], maxs[1], resolution))\n", " Z = -clusterer.score_samples(np.c_[xx.ravel(), yy.ravel()])\n", " Z = Z.reshape(xx.shape)\n", "\n", " plt.contourf(xx, yy, Z,\n", " norm=LogNorm(vmin=1.0, vmax=30.0),\n", " levels=np.logspace(0, 2, 12))\n", " plt.contour(xx, yy, Z,\n", " norm=LogNorm(vmin=1.0, vmax=30.0),\n", " levels=np.logspace(0, 2, 12),\n", " linewidths=1, colors='k')\n", "\n", " Z = clusterer.predict(np.c_[xx.ravel(), yy.ravel()])\n", " Z = Z.reshape(xx.shape)\n", " plt.contour(xx, yy, Z,\n", " linewidths=2, colors='r', linestyles='dashed')\n", " \n", " plt.plot(X[:, 0], X[:, 1], 'k.', markersize=2)\n", " plot_centroids(clusterer.means_, clusterer.weights_)\n", "\n", " plt.xlabel(\"$x_1$\")\n", " if show_ylabels:\n", " plt.ylabel(\"$x_2$\", rotation=0)\n", " else:\n", " plt.tick_params(labelleft=False)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## K-Means" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 
Funcionamiento" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Creación de datos sintéticos con cinco agrupamientos diferentes" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.cluster import KMeans\n", "from sklearn.datasets import make_blobs\n", "\n", "# Definir centros de clusters y cuánto ocupan\n", "blob_centers = np.array([[ 0.2, 2.3], [-1.5 , 2.3], [-2.8, 1.8],\n", " [-2.8, 2.8], [-2.8, 1.3]])\n", "blob_std = np.array([0.4, 0.3, 0.1, 0.1, 0.1])\n", "X, y = make_blobs(n_samples=2000, centers=blob_centers, cluster_std=blob_std,\n", " random_state=42)\n", "\n", "# Graficar\n", "plt.figure(figsize=(8, 4))\n", "plt.scatter(X[:, 0], X[:, 1], s=1)\n", "plt.xlabel(\"$x_1$\")\n", "plt.ylabel(\"$x_2$\", rotation=0)\n", "plt.gca().set_axisbelow(True)\n", "plt.grid()\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Entrenar un algoritmo KMeans y ver las regiones de decisión" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "k = 5\n", "kmeans = KMeans(n_clusters=k, n_init=10, random_state=40)\n", "y_pred = kmeans.fit_predict(X)\n", "\n", "plt.figure(figsize=(8, 4))\n", "plot_decision_boundaries(kmeans, X)\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Evolución de las iteraciones:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Se fija el estado para reproducir las iteraciones anteriores\n", "\n", "kmeans_iter1 = KMeans(n_clusters=5, init=\"random\", n_init=1, max_iter=1,\n", " random_state=9)\n", "kmeans_iter2 = KMeans(n_clusters=5, init=\"random\", n_init=1, max_iter=2,\n", " random_state=9)\n", "kmeans_iter3 = KMeans(n_clusters=5, init=\"random\", n_init=1, max_iter=3,\n", " random_state=9)\n", "kmeans_iter1.fit(X)\n", "kmeans_iter2.fit(X)\n", "kmeans_iter3.fit(X)\n", "\n", "plt.figure(figsize=(10, 8))\n", "\n", "plt.subplot(321)\n", 
"plot_data(X)\n", "plot_centroids(kmeans_iter1.cluster_centers_, circle_color='r', cross_color='w')\n", "plt.ylabel(\"$x_2$\", rotation=0)\n", "plt.tick_params(labelbottom=False)\n", "plt.title(\"Inicialización/actualización de centroides\")\n", "\n", "plt.subplot(322)\n", "plot_decision_boundaries(kmeans_iter1, X, show_xlabels=False,\n", " show_ylabels=False)\n", "plt.title(\"Etiquetado de instancias\\n(según centroides de la izq.)\")\n", "\n", "plt.subplot(323)\n", "plot_decision_boundaries(kmeans_iter1, X, show_centroids=False,\n", " show_xlabels=False)\n", "plot_centroids(kmeans_iter2.cluster_centers_)\n", "\n", "plt.subplot(324)\n", "plot_decision_boundaries(kmeans_iter2, X, show_xlabels=False,\n", " show_ylabels=False)\n", "\n", "plt.subplot(325)\n", "plot_decision_boundaries(kmeans_iter2, X, show_centroids=False)\n", "plot_centroids(kmeans_iter3.cluster_centers_)\n", "\n", "plt.subplot(326)\n", "plot_decision_boundaries(kmeans_iter3, X, show_ylabels=False)\n", "\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Encontrar el número de clusters" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "En general la cantidad de clusters determinados por los datos no está a la vista, y para altas dimensiones es aún más dificil de visualizar.\n", "\n", "Dentro de los métodos para elegir la cantidad de clusters se pueden encontrar los siguientes dos enfoques:\n", "\n", "- Estimación por _inercia_\n", "- Estimación por _coeficiente de Silhouette_" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "kmeans_per_k = [KMeans(n_clusters=k, n_init=10, random_state=42).fit(X)\n", " for k in range(1, 10)]\n", "inertias = [model.inertia_ for model in kmeans_per_k]\n", "\n", "plt.figure(figsize=(8, 3.5))\n", "plt.plot(range(1, 10), inertias, \"bo-\")\n", "plt.xlabel(\"$k$\")\n", "plt.ylabel(\"Inercia\")\n", "plt.annotate(\"\", xy=(4, inertias[3]), xytext=(4.45, 650),\n", " 
arrowprops=dict(facecolor='black', shrink=0.1))\n", "plt.text(4.5, 650, \"Codo\", horizontalalignment=\"center\")\n", "plt.axis([1, 8.5, 0, 1300])\n", "plt.grid()\n", "plt.show()\n", "\n", "plot_decision_boundaries(kmeans_per_k[4 - 1], X)\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.metrics import silhouette_score\n", "\n", "silhouette_scores = [silhouette_score(X, model.labels_)\n", " for model in kmeans_per_k[1:]]\n", "\n", "plt.figure(figsize=(8, 3))\n", "plt.plot(range(2, 10), silhouette_scores, \"bo-\")\n", "plt.xlabel(\"$k$\")\n", "plt.ylabel(\"Silhouette score\")\n", "plt.axis([1.8, 8.5, 0.55, 0.7])\n", "plt.grid()\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Otra opción es utilizar el _diagrama de Silhouette_. Compara el valor del score medio del modelo con los scores individuales de cada instancia. Esto lo hace representando cada cluster con cierta altura vertical (cuanto más alto, más instancias asignadas) y tomando el tamaño horizontal como el score de cada instancia (ordenados de mayor a menor).\n", "\n", "Esta representación muestra cómo se distribuyen los coeficientes en cada cluster, resaltando cuáles pueden ser más significativos. Un buen número de clusters $k$ es aquel para el cual los coeficientes de cada cluster se distribuyen por encima del score medio." 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.metrics import silhouette_samples\n", "from matplotlib.ticker import FixedLocator, FixedFormatter\n", "\n", "plt.figure(figsize=(11, 9))\n", "\n", "for k in (3, 4, 5, 6):\n", " plt.subplot(2, 2, k - 2)\n", " \n", " y_pred = kmeans_per_k[k - 1].labels_\n", " silhouette_coefficients = silhouette_samples(X, y_pred)\n", "\n", " padding = len(X) // 30\n", " pos = padding\n", " ticks = []\n", " for i in range(k):\n", " coeffs = silhouette_coefficients[y_pred == i]\n", " coeffs.sort()\n", "\n", " color = plt.cm.Spectral(i / k)\n", " plt.fill_betweenx(np.arange(pos, pos + len(coeffs)), 0, coeffs,\n", " facecolor=color, edgecolor=color, alpha=0.7)\n", " ticks.append(pos + len(coeffs) // 2)\n", " pos += len(coeffs) + padding\n", "\n", " plt.gca().yaxis.set_major_locator(FixedLocator(ticks))\n", " plt.gca().yaxis.set_major_formatter(FixedFormatter(range(k)))\n", " if k in (3, 5):\n", " plt.ylabel(\"Cluster\")\n", " \n", " if k in (5, 6):\n", " plt.gca().set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])\n", " plt.xlabel(\"Coeficiente Silhouette\")\n", " else:\n", " plt.tick_params(labelbottom=False)\n", "\n", " plt.axvline(x=silhouette_scores[k - 2], color=\"red\", linestyle=\"--\")\n", " plt.title(f\"$k={k}$\")\n", "\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Variabilidad y limitaciones" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Inicialización" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "kmeans_rnd_init1 = KMeans(n_clusters=5, init=\"random\", n_init=1, random_state=9)\n", "kmeans_rnd_init2 = KMeans(n_clusters=5, init=\"random\", n_init=1, random_state=100)\n", "\n", "plot_clusterer_comparison(kmeans_rnd_init1, kmeans_rnd_init2, X,\n", " \"Solución 1\",\n", " \"Solución 2\\n(otra inicialización de centroides)\")\n", "\n", "plt.show()" ] }, { "cell_type": 
"markdown", "metadata": {}, "source": [ "**Pregunta:** aquí los centroides se inicializan de manera aleatoria. ¿Qué métodos se pueden usar para evitar soluciones sub-óptimas?" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "_Respuesta:_" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Normalización de datos" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Datos no estandarizados\n", "X_rescale = X.copy()\n", "X_rescale[:,1] = X_rescale[:,1] / 4\n", "\n", "k = 5\n", "\n", "kmeans_rescale = KMeans(n_clusters=k, n_init=10, random_state=40)\n", "y_pred = kmeans_rescale.fit_predict(X_rescale)\n", "\n", "plot_decision_boundaries(kmeans_rescale, X_rescale)\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Pregunta:** ¿por qué es importante estandarizar los datos?\n", "\n", "_Respuesta_" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Datos **no isotrópicos**" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Clusters no isotrópicos\n", "\n", "X1, y1 = make_blobs(n_samples=1000, centers=((4, -4), (0, 0)), random_state=42)\n", "X1 = X1.dot(np.array([[0.374, 0.95], [0.732, 0.598]]))\n", "X2, y2 = make_blobs(n_samples=250, centers=1, random_state=42)\n", "X2 = X2 + [6, -8]\n", "X_aniso = np.r_[X1, X2]\n", "y = np.r_[y1, y2]\n", "\n", "kmeans_1 = KMeans(n_clusters=3,\n", " init=np.array([[-1.5, 2.5], [0.5, 0], [4, 0]]),\n", " n_init=1, random_state=42)\n", "kmeans_2 = KMeans(n_clusters=3, n_init=10, random_state=42)\n", "kmeans_1.fit(X_aniso)\n", "kmeans_2.fit(X_aniso)\n", "\n", "plt.figure(figsize=(10, 3.2))\n", "\n", "plt.subplot(121)\n", "plot_decision_boundaries(kmeans_1, X_aniso)\n", "plt.title(f\"Inertia = {kmeans_1.inertia_:.1f}\")\n", "\n", "plt.subplot(122)\n", "plot_decision_boundaries(kmeans_2, X_aniso, show_ylabels=False)\n", "plt.title(f\"Inertia = {kmeans_2.inertia_:.1f}\")\n", "\n", "plt.show()" 
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Discutir cuál de ambos resultados es mejor en cuanto a inercia y en cuanto al patrón de los datos. ¿Cuál es el que \"prefiere\" KMeans y por qué?\n", "\n", "_Respuesta_" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Modelo de Mezcla de Gaussianas" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.mixture import GaussianMixture\n", "\n", "gm = GaussianMixture(n_components=3, n_init=10, random_state=42)\n", "gm.fit(X_aniso)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Ver regiones del clustering para GMMs" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plt.figure(figsize=(8, 4))\n", "plot_decision_boundaries(gm,X_aniso, show_centroids=False, show_centers=True)\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Verificar que GMM también se adapta a los datos isotrópicos $X$ usados en KMeans" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Detección de Anomalías con GMM" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "GMM estima densidades de probabilidad (de distribución Gaussiana), donde las instancias tienen asignadas probabilidades de ser generadas por cada uno de los clusters (o gaussianas).\n", "\n", "En particular, se puede visualizar la función de densidad de probabilidad:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plt.figure(figsize=(8, 4))\n", "\n", "plot_gaussian_mixture(gm, X_aniso)\n", "\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Ver los valores medios y matrices de covarianza para cada cluster" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "gm.means_" ] }, { 
"cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "gm.covariances_" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Al disponer de la función de densidad de probabilidad, se puede inferir la (log-)verosimilitud para cada muestra:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "densities = gm.score_samples(X_aniso)\n", "densities" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "A partir de esto se puede considerar que, si de todos los datos en general se tiene que el 4% son datos anómalos, se puede encontrar un umbral que indique que los valores menos probables de dicho porcentaje los detecte como anomalías:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "densities = gm.score_samples(X_aniso)\n", "density_threshold = np.percentile(densities, 4)\n", "anomalies = X_aniso[densities < density_threshold]\n", "\n", "plt.title('Histograma de log-verosimilitud')\n", "plt.hist(densities, bins=100)\n", "plt.vlines(density_threshold,0,100,'k',linestyles='dashed', label='Umbral')\n", "plt.legend()\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plt.figure(figsize=(8, 4))\n", "\n", "plot_gaussian_mixture(gm, X_aniso)\n", "plt.scatter(anomalies[:, 0], anomalies[:, 1], color='r', marker='*')\n", "plt.ylim(top=5.1)\n", "\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Ejercicio:**\n", "\n", "Indicar si los puntos [3,-0.5], [1,-1], [-2,2] son anómalos o no" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Opcional - Clustering en Electrodomésticos" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Usando una representación en bajas dimensiones de los datos PLAID (representados en sus diagramas V-I), 
utilizar un algoritmo de clustering y encontrar el número óptimo de clusters.\n", "\n", "Visualizar resultados en el espacio de baja dimensión." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "telefonicaAD", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.13" } }, "nbformat": 4, "nbformat_minor": 2 }