{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Proporciones altas de fondos en test - usando HOG" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Imports" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "# Encadenar iterables\n", "from itertools import chain\n", "\n", "# Proporciona una barra de progreso rápida\n", "from tqdm import tqdm\n", "\n", "# Interfaz para hacer gráficos y visualizaciones\n", "import matplotlib.pyplot as plt\n", "\n", "# Computación científica\n", "import numpy as np\n", "\n", "# Manipulación de datos\n", "import pandas as pd\n", "\n", "# Extraer parches (pequeños subconjuntos de imágenes) de imágenes\n", "from sklearn.feature_extraction.image import PatchExtractor\n", "\n", "# data: conjunto de datos de muestra y funciones de carga\n", "# color: convertir imágenes entre espacios de color\n", "# feature: funciones para identificar y extraer características de imágenes\n", "from skimage import data, color, feature\n", "\n", "# Cambiar el tamaño de una imagen\n", "from skimage.transform import resize, rescale\n", "\n", "# Descarga y carga en memoria un conjunto de datos de imágenes de caras de personas famosas\n", "from sklearn.datasets import fetch_lfw_people\n", "\n", "# Modelos\n", "from sklearn.linear_model import LogisticRegression\n", "\n", "# Train test split\n", "from sklearn.model_selection import train_test_split\n", "\n", "# Matriz de confusión\n", "from sklearn.metrics import confusion_matrix\n", "\n", "# La curva ROC\n", "from sklearn.metrics import roc_curve\n", "\n", "# Classification report\n", "from sklearn.metrics import classification_report\n", "\n", "# Métricas varias\n", "from sklearn.metrics import accuracy_score, balanced_accuracy_score, precision_score, recall_score, f1_score, roc_auc_score" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Funciones auxiliares" ] }, { "cell_type": "code", "execution_count": 20, 
"metadata": {}, "outputs": [], "source": [ "# Función para extraer porciones de una imagen\n", "def extract_patches(img, N, scale=1.0, patch_size=(62,47), random_state=0):\n", " # Calcula el tamaño del parche extraído basado en el factor de escala dado\n", " H = img.shape[0]\n", " W = img.shape[1]\n", " H_patch = min(H , int(scale * patch_size[0]))\n", " W_patch = min(W , int(scale * patch_size[1]))\n", " extracted_patch_size = (H_patch, W_patch)\n", "\n", " # Inicializa un objeto PatchExtractor con el tamaño de parche calculado,\n", " # el número máximo de parches, y una semilla de estado aleatorio\n", " extractor = PatchExtractor(patch_size=extracted_patch_size, max_patches=N, random_state=random_state)\n", "\n", " # Extrae parches de la imagen dada\n", " # img[np.newaxis] se utiliza la entrada de PatchExtractor es un conjunto de imágenes\n", " patches = extractor.transform(img[np.newaxis])\n", "\n", " # Si el factor de escala no es 1, redimensiona cada parche extraído\n", " # al tamaño del parche original\n", " if scale != 1:\n", " patches = np.array([resize(patch, patch_size) for patch in patches])\n", "\n", " # Devuelve la lista de parches extraídos (y posiblemente redimensionados)\n", " return patches" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "def non_max_suppression(indices, Ni, Nj, overlapThresh):\n", " # Si no hay rectángulos, regresar una lista vacía\n", " if len(indices) == 0:\n", " return []\n", "\n", " # Si las cajas son enteros, convertir a flotantes\n", " if indices.dtype.kind == \"i\":\n", " indices = indices.astype(\"float\")\n", "\n", " # Inicializar la lista de índices seleccionados\n", " pick = []\n", "\n", " # Tomar las coordenadas de los cuadros\n", " x1 = np.array([indices[i,0] for i in range(indices.shape[0])])\n", " y1 = np.array([indices[i,1] for i in range(indices.shape[0])])\n", " x2 = np.array([indices[i,0]+Ni for i in range(indices.shape[0])])\n", " y2 = np.array([indices[i,1]+Nj for 
i in range(indices.shape[0])])\n", "\n", " # Calcula el área de los cuadros y ordena los cuadros\n", " area = (x2 - x1 + 1) * (y2 - y1 + 1)\n", " idxs = np.argsort(y2)\n", "\n", " # Mientras todavía hay índices en la lista de índices\n", " while len(idxs) > 0:\n", " # Toma el último índice de la lista y agrega el índice a la lista de seleccionados\n", " last = len(idxs) - 1\n", " i = idxs[last]\n", " pick.append(i)\n", "\n", " # Encontrar las coordenadas (x, y) más grandes para el inicio de la caja y las coordenadas (x, y) más pequeñas para el final de la caja\n", " xx1 = np.maximum(x1[i], x1[idxs[:last]])\n", " yy1 = np.maximum(y1[i], y1[idxs[:last]])\n", " xx2 = np.minimum(x2[i], x2[idxs[:last]])\n", " yy2 = np.minimum(y2[i], y2[idxs[:last]])\n", "\n", " # Calcula el ancho y alto de la caja\n", " w = np.maximum(0, xx2 - xx1 + 1)\n", " h = np.maximum(0, yy2 - yy1 + 1)\n", "\n", " # Calcula la proporción de superposición\n", " overlap = (w * h) / area[idxs[:last]]\n", "\n", " # Elimina todos los índices del índice de lista que tienen una proporción de superposición mayor que el umbral proporcionado\n", " idxs = np.delete(idxs, np.concatenate(([last], np.where(overlap > overlapThresh)[0])))\n", "\n", " # Devuelve solo las cajas seleccionadas\n", " return indices[pick].astype(\"int\")" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# Define una función para realizar una ventana deslizante (sliding window) sobre una imagen.\n", "def sliding_window(img,\n", " patch_size=(62,47), # Define el tamaño del parche (patch) basado en el primer parche positivo por defecto\n", " istep=2, # Paso de desplazamiento en la dirección i (verticalmente)\n", " jstep=2, # Paso de desplazamiento en la dirección j (horizontalmente)\n", " scale=1.0): # Factor de escala para ajustar el tamaño del parche\n", "\n", " # Calcula las dimensiones Ni y Nj del parche ajustadas por el factor de escala.\n", " Ni, Nj = (int(scale * s) for s in 
patch_size)\n", "\n", " # Itera a lo largo de la imagen en la dirección i\n", " for i in range(0, img.shape[0] - Ni, istep):\n", " # Itera a lo largo de la imagen en la dirección j\n", " for j in range(0, img.shape[1] - Nj, jstep):\n", "\n", " # Extrae el parche de la imagen usando las coordenadas actuales i, j.\n", " patch = img[i:i + Ni, j:j + Nj]\n", "\n", " # Si el factor de escala es diferente de 1, redimensiona el parche al tamaño original del parche.\n", " if scale != 1:\n", " patch = resize(patch, patch_size)\n", "\n", " # Usa yield para devolver las coordenadas actuales y el parche.\n", " # Esto convierte la función en un generador.\n", " yield (i, j), patch" ] }, { "cell_type": "code", "execution_count": 109, "metadata": {}, "outputs": [], "source": [ "# Función que devuelve el número de detecciones brutas y procesadas para diversas escalas\n", "# Esta función asume conocidos model, size y los parámetros de las HOG\n", "def detections_by_scale(test_image, test_scales, step, thresholds=[0.5]):\n", " raw_detections = []\n", " detections = []\n", "\n", " for scale in tqdm(test_scales):\n", " raw_detections_scale = []\n", " detections_scale = []\n", "\n", " # Ventana deslizante\n", " indices, patches = zip(*sliding_window(test_image, scale=scale, istep=step, jstep=step))\n", "\n", " # Calcula las características HOG para cada parche y las almacena en un array.\n", " patches_hog = np.array([feature.hog(patch,\n", " orientations=orientations,\n", " pixels_per_cell=pixels_per_cell,\n", " cells_per_block=cells_per_block) for patch in patches])\n", " # Predicción\n", " for thr in thresholds:\n", " labels = (model.predict_proba(patches_hog)[:,1]>=thr).astype(int)\n", " raw_detections_scale.append(labels.sum())\n", " Ni, Nj = (int(scale*s) for s in size)\n", " indices = np.array(indices)\n", " detecciones = indices[labels == 1]\n", " detecciones = non_max_suppression(np.array(detecciones),Ni,Nj, 0.3)\n", " detections_scale.append(len(detecciones))\n", " \n", " # 
Actualizamos las listas\n", " raw_detections.append(raw_detections_scale)\n", " detections.append(detections_scale)\n", " \n", " return np.array(raw_detections), np.array(detections)" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [], "source": [ "# True Positive Rate\n", "def tpr_scorer(clf, X, y):\n", " y_pred = clf.predict(X)\n", " cm = confusion_matrix(y, y_pred)\n", " tpr = cm[1,1]/(cm[1,1]+cm[1,0])\n", " return tpr\n", "\n", "# False Positive Rate\n", "def fpr_scorer(clf, X, y):\n", " y_pred = clf.predict(X)\n", " cm = confusion_matrix(y, y_pred)\n", " fpr = cm[0,1]/(cm[0,0]+cm[0,1])\n", " return fpr\n", "\n", "# True Negative Rate\n", "def tnr_scorer(clf, X, y):\n", " y_pred = clf.predict(X)\n", " cm = confusion_matrix(y, y_pred)\n", " tnr = cm[0,0]/(cm[0,0]+cm[0,1])\n", " return tnr\n", "\n", "# False Negative Rate\n", "def fnr_scorer(clf, X, y):\n", " y_pred = clf.predict(X)\n", " cm = confusion_matrix(y, y_pred)\n", " fnr = cm[1,0]/(cm[1,0]+cm[1,1])\n", " return fnr\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Dataset de rostros (LFW)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(13233, 62, 47)" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Cargamos el dataset\n", "faces = fetch_lfw_people()\n", "positive_patches = faces.images\n", "positive_patches.shape" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "# Dividimos en train y test\n", "positive_patches_train, positive_patches_test = train_test_split(\n", " positive_patches,\n", " test_size=0.1,\n", " random_state=42\n", ")" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Shape train: (11909, 62, 47)\n", "Shape test: (1324, 62, 47)\n" ] } ], "source": [ "print('Shape train: ',positive_patches_train.shape)\n", 
"print('Shape test: ',positive_patches_test.shape)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Dataset de fondos" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "41\n" ] } ], "source": [ "# Tomamos algunas imágenes de sklearn\n", "imgs = ['camera',\n", " 'text',\n", " 'coins',\n", " 'moon',\n", " 'page',\n", " 'clock',\n", " 'immunohistochemistry',\n", " 'chelsea',\n", " 'coffee',\n", " 'hubble_deep_field'\n", " ]\n", "\n", "backgrounds = []\n", "for name in imgs:\n", " img = getattr(data, name)()\n", " if len(img.shape) == 3 and img.shape[2] == 3: # Chequeamos si la imagen es RGB\n", " img = color.rgb2gray(img)\n", " backgrounds.append(img)\n", "\n", "# Imagenes caseras adicionales\n", "for i in range(31):\n", " filename = str(i)+'.jpg'\n", " img = plt.imread(filename)\n", " img = color.rgb2gray(img)\n", " backgrounds.append(img)\n", "\n", "print(len(backgrounds))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Definimos el modelo, los datos y las HOG" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "LogisticRegression(C=1, max_iter=1000)_R_1_S_[0.5, 1, 2, 4, 8]_PTrain_10_PTest_100_O_3_C_(12, 12)_B_(3, 3)\n" ] } ], "source": [ "# Modelo\n", "model = LogisticRegression(penalty='l2',C=1, max_iter=1000)\n", "\n", "# Resolución de los rostros\n", "resolution = 1\n", "\n", "# Fondos\n", "scales = [0.5,1,2,4,8]\n", "proportion_train = 10\n", "proportion_test = 100\n", "num_patches_train = int((proportion_train * len(positive_patches_train))/(len(scales) * len(backgrounds)))\n", "num_patches_test = int((proportion_test * len(positive_patches_test))/(len(scales) * len(backgrounds)))\n", "\n", "# HOG\n", "orientations = 3\n", "pixels_per_cell = (12, 12)\n", "cells_per_block = (3, 3)\n", "\n", "# Nombre del experimento\n", "model_name = str(model)\n", 
"experiment_name = model_name\n", "experiment_name += '_R_' + str(resolution)\n", "experiment_name += '_S_' + str(scales)\n", "experiment_name += '_PTrain_' + str(proportion_train)\n", "experiment_name += '_PTest_' + str(proportion_test)\n", "experiment_name += '_O_' + str(orientations)\n", "experiment_name += '_C_' + str(pixels_per_cell)\n", "experiment_name += '_B_' + str(cells_per_block)\n", "\n", "print(experiment_name)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 11909/11909 [00:03<00:00, 3439.57it/s]\n", "100%|██████████| 1324/1324 [00:00<00:00, 3437.03it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Tamaño de los rostros: (62, 47)\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "# Tamaño de las imágenes de rostros\n", "\n", "# Train\n", "positive_patches_train = np.array(\n", " [rescale(positive_patches_train[i], resolution)\n", " for i in tqdm(range(len(positive_patches_train)))]\n", " )\n", "\n", "# Test\n", "positive_patches_test = np.array(\n", " [rescale(positive_patches_test[i], resolution)\n", " for i in tqdm(range(len(positive_patches_test)))]\n", " )\n", "\n", "size = positive_patches_train[0].shape\n", "print('Tamaño de los rostros: ',size)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Procesando imágenes train: 100%|██████████| 41/41 [03:30<00:00, 5.14s/it]\n", "Procesando imágenes test: 100%|██████████| 41/41 [03:36<00:00, 5.28s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Shape train: (112961, 62, 47)\n", "Shape test: (125356, 62, 47)\n" ] } ], "source": [ "# Extraemos las imágenes de fondo\n", "\n", "# Train\n", "negative_patches_train = np.vstack(\n", " [extract_patches(im, num_patches_train, scale, random_state=42)\n", " for im in tqdm(backgrounds, desc='Procesando 
imágenes train')\n", " for scale in scales]\n", " )\n", "\n", "# Test\n", "negative_patches_test = np.vstack(\n", " [extract_patches(im, num_patches_test, scale, random_state=0)\n", " for im in tqdm(backgrounds, desc='Procesando imágenes test')\n", " for scale in scales]\n", " )\n", "\n", "print('Shape train: ',negative_patches_train.shape)\n", "print('Shape test: ',negative_patches_test.shape)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [] }, { "name": "stderr", "output_type": "stream", "text": [ "124870it [01:22, 1507.93it/s]\n", "126680it [01:50, 1150.90it/s]\n" ] } ], "source": [ "# Armamos la matriz de features y el vector de etiquetas\n", "\n", "# Train\n", "X_train = np.array(\n", " [feature.hog(image=im,\n", " orientations=orientations,\n", " pixels_per_cell=pixels_per_cell,\n", " cells_per_block=cells_per_block)\n", " for im in tqdm(chain(positive_patches_train, negative_patches_train))]\n", " )\n", "y_train = np.zeros(X_train.shape[0])\n", "y_train[:positive_patches_train.shape[0]] = 1\n", "\n", "# Test\n", "X_test = np.array(\n", " [feature.hog(image=im,\n", " orientations=orientations,\n", " pixels_per_cell=pixels_per_cell,\n", " cells_per_block=cells_per_block)\n", " for im in tqdm(chain(positive_patches_test, negative_patches_test))]\n", " )\n", "y_test = np.zeros(X_test.shape[0])\n", "y_test[:positive_patches_test.shape[0]] = 1" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Shape X_train: (124870, 81)\n", "Shape y_train: (124870,)\n", "Shape X_test: (126680, 81)\n", "Shape y_test: (126680,)\n" ] } ], "source": [ "print('Shape X_train: ', X_train.shape)\n", "print('Shape y_train: ', y_train.shape)\n", "print('Shape X_test: ', X_test.shape)\n", "print('Shape y_test: ', y_test.shape)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Entrenamiento y evaluación 
del modelo" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
LogisticRegression(C=1, max_iter=1000)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LogisticRegression(C=1, max_iter=1000)