{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "1b2382ea",
   "metadata": {},
   "source": [
    "# fastText, gensim (python)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e53bf6fb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# %pip installs into the environment of the running kernel.\n",
    "# Pinned to the version this notebook was originally executed with.\n",
    "%pip install gensim==4.2.0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4d6045e1",
   "metadata": {},
   "outputs": [],
   "source": [
    "import gensim.downloader as gensim_down"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b76b91b5",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "dict_keys(['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis'])"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "gensim_down.info()['models'].keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d1a5dc7d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Requires the `gensim_down` import cell above to have been run first.\n",
    "gensim_down.info()['models']['fasttext-wiki-news-subwords-300']"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "154d9ee2",
   "metadata": {},
   "source": [
    "# fastText (gensim)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cc02b9ac",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Aliased as `gensim_fasttext` so that the later `import fasttext`\n",
    "# (the standalone fastText package) does not rebind the same name.\n",
    "import gensim.models.fasttext as gensim_fasttext\n",
    "\n",
    "# NOTE(review): the original author flagged this call as crashing;\n",
    "# the cause is not recorded in this notebook - confirm before relying on it.\n",
    "gensim_fasttext.load_facebook_model('./cc.es.300.bin')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9ec33c50",
   "metadata": {},
   "source": [
    "# fastText (python module)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "69bf9583",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pinned to the version this notebook was originally executed with.\n",
    "%pip install fasttext==0.9.2"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "afbff4ac",
   "metadata": {},
   "source": [
    "Link: https://fasttext.cc/docs/en/python-module.html"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1fb6f188",
   "metadata": {},
   "source": [
    "##### Cargar modelo entrenado:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9f9641d1",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.\n"
     ]
    }
   ],
   "source": [
    "import fasttext\n",
    "\n",
    "model = fasttext.load_model(\"./cc.es.300.bin\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f3c04dde",
   "metadata": {},
   "source": [
    "##### Obtener vector de palabra"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9bb7d0bc",
   "metadata": {},
   "outputs": [],
   "source": [
    "model['maravilloso']"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "90fc9893",
   "metadata": {},
   "source": [
    "##### Obtener subwords (char n-grams)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6a870f5c",
   "metadata": {},
   "outputs": [],
   "source": [
    "model.get_subwords('maravilloso')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1bb797d2",
   "metadata": {},
   "source": [
    "##### Palabras más cercanas:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4fd1c2db",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(0.8583537340164185, 'fabuloso'),\n",
       " (0.8245900869369507, 'hermoso'),\n",
       " (0.8058003187179565, 'fantástico'),\n",
       " (0.8048393130302429, 'magnífico'),\n",
       " (0.7916648387908936, 'grandioso'),\n",
       " (0.7713317275047302, 'magnifico'),\n",
       " (0.7648442983627319, 'maravillo'),\n",
       " (0.7598780989646912, 'bello'),\n",
       " (0.753715455532074, 'maraviloso'),\n",
       " (0.7490848898887634, 'bellísimo')]"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.get_nearest_neighbors('maravilloso')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8ee56c44",
   "metadata": {},
   "source": [
    "##### Word Analogies:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "01c03ba2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# get_analogies(wordA, wordB, wordC) needs three words and ranks the\n",
    "# neighbours of vec(wordA) - vec(wordB) + vec(wordC).\n",
    "# Here: rey - hombre + mujer.\n",
    "model.get_analogies('rey', 'hombre', 'mujer')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}