diff --git a/NLP/src/smallBERTa_Pretraining.ipynb b/NLP/src/smallBERTa_Pretraining.ipynb new file mode 100644 index 00000000..313149e6 --- /dev/null +++ b/NLP/src/smallBERTa_Pretraining.ipynb @@ -0,0 +1,5908 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "smallBERTa_Pretraining.ipynb", + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "accelerator": "GPU" + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "V4OynugZvMG2" + }, + "source": [ + "# Pre-training SmallBERTa - A tiny model to train on a tiny dataset\n", + "(Using HuggingFace Transformers)
\n", + "Admittedly, while language modeling is associated with terabytes of data, not all of us have the processing power or the resources to train huge models on that much data.\n", + "In this example, we are going to train a relatively small neural net on a small dataset (which still happens to have over 3M rows).\n", + "
\n", + "\n", + "The ***main purpose*** of this blog is not to achieve state-of-the-art performance on LM tasks, but to show how the recent run_language_modeling.py script can be used to train a Transformer model from scratch.\n", + "\n", + "This notebook can be extended to specialized use cases where general-purpose pre-trained models fail to perform well, such as medical datasets, scientific literature, and legal documents.\n", + "\n", + "Input:\n", + " 1. To the Tokenizer:
\n", + " LM data in a directory containing all samples in separate *.txt files.\n", + " \n", + " 2. To the Model:
\n", + " LM data split into:
\n", + " 1. train.txt
\n", + " 2. eval.txt\n", + " \n", + "Output:
\n", + " Trained Model weights(that can be used elsewhere) and Tensorboard logs" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5sHQ_tWig474" + }, + "source": [ + "## Install Dependencies" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "hPxoElNugaMu", + "outputId": "705e0776-70b3-4d51-a50c-b67e5f639997", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + } + }, + "source": [ + "#tokenizer working version --- 0.5.0\n", + "#transformer working version --- 2.5.0\n", + "!pip install transformers\n", + "!pip install tokenizers\n", + "!pip install tensorboard==2.1.0" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Collecting transformers\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/04/58/3d789b98923da6485f376be1e04d59ad7003a63bdb2b04b5eea7e02857e5/transformers-2.5.0-py3-none-any.whl (481kB)\n", + "\r\u001b[K |▊ | 10kB 21.9MB/s eta 0:00:01\r\u001b[K |█▍ | 20kB 29.1MB/s eta 0:00:01\r\u001b[K |██ | 30kB 24.6MB/s eta 0:00:01\r\u001b[K |██▊ | 40kB 19.1MB/s eta 0:00:01\r\u001b[K |███▍ | 51kB 15.6MB/s eta 0:00:01\r\u001b[K |████ | 61kB 15.4MB/s eta 0:00:01\r\u001b[K |████▊ | 71kB 13.4MB/s eta 0:00:01\r\u001b[K |█████▍ | 81kB 12.9MB/s eta 0:00:01\r\u001b[K |██████▏ | 92kB 12.7MB/s eta 0:00:01\r\u001b[K |██████▉ | 102kB 12.9MB/s eta 0:00:01\r\u001b[K |███████▌ | 112kB 12.9MB/s eta 0:00:01\r\u001b[K |████████▏ | 122kB 12.9MB/s eta 0:00:01\r\u001b[K |████████▉ | 133kB 12.9MB/s eta 0:00:01\r\u001b[K |█████████▌ | 143kB 12.9MB/s eta 0:00:01\r\u001b[K |██████████▏ | 153kB 12.9MB/s eta 0:00:01\r\u001b[K |██████████▉ | 163kB 12.9MB/s eta 0:00:01\r\u001b[K |███████████▋ | 174kB 12.9MB/s eta 0:00:01\r\u001b[K |████████████▎ | 184kB 12.9MB/s eta 0:00:01\r\u001b[K |█████████████ | 194kB 12.9MB/s eta 0:00:01\r\u001b[K |█████████████▋ | 204kB 12.9MB/s eta 0:00:01\r\u001b[K |██████████████▎ | 215kB 12.9MB/s eta 0:00:01\r\u001b[K |███████████████ | 225kB 12.9MB/s eta 0:00:01\r\u001b[K |███████████████▋ | 235kB 12.9MB/s eta 0:00:01\r\u001b[K |████████████████▎ | 245kB 12.9MB/s eta 0:00:01\r\u001b[K |█████████████████ | 256kB 12.9MB/s eta 0:00:01\r\u001b[K |█████████████████▊ | 266kB 12.9MB/s eta 0:00:01\r\u001b[K |██████████████████▍ | 276kB 12.9MB/s eta 0:00:01\r\u001b[K |███████████████████ | 286kB 12.9MB/s eta 0:00:01\r\u001b[K |███████████████████▊ | 296kB 12.9MB/s eta 0:00:01\r\u001b[K |████████████████████▍ | 307kB 12.9MB/s eta 0:00:01\r\u001b[K |█████████████████████ | 317kB 12.9MB/s eta 0:00:01\r\u001b[K |█████████████████████▊ | 327kB 12.9MB/s eta 0:00:01\r\u001b[K |██████████████████████▌ | 337kB 12.9MB/s eta 0:00:01\r\u001b[K |███████████████████████▏ | 348kB 12.9MB/s eta 0:00:01\r\u001b[K |███████████████████████▉ | 358kB 12.9MB/s eta 0:00:01\r\u001b[K |████████████████████████▌ | 368kB 12.9MB/s eta 0:00:01\r\u001b[K |█████████████████████████▏ | 378kB 12.9MB/s eta 0:00:01\r\u001b[K |█████████████████████████▉ | 389kB 12.9MB/s eta 0:00:01\r\u001b[K |██████████████████████████▌ | 399kB 12.9MB/s eta 0:00:01\r\u001b[K |███████████████████████████▏ | 409kB 12.9MB/s eta 0:00:01\r\u001b[K |████████████████████████████ | 419kB 12.9MB/s eta 0:00:01\r\u001b[K |████████████████████████████▋ | 430kB 12.9MB/s eta 0:00:01\r\u001b[K |█████████████████████████████▎ | 440kB 12.9MB/s eta 0:00:01\r\u001b[K |██████████████████████████████ | 450kB 12.9MB/s eta 0:00:01\r\u001b[K |██████████████████████████████▋ | 460kB 12.9MB/s eta 0:00:01\r\u001b[K |███████████████████████████████▎| 471kB 12.9MB/s eta 
0:00:01\r\u001b[K |████████████████████████████████| 481kB 12.9MB/s eta 0:00:01\r\u001b[K |████████████████████████████████| 491kB 12.9MB/s \n", + "\u001b[?25hCollecting tokenizers==0.5.0\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/7e/1d/ea7e2c628942e686595736f73678348272120d026b7acd54fe43e5211bb1/tokenizers-0.5.0-cp36-cp36m-manylinux1_x86_64.whl (3.8MB)\n", + "\u001b[K |████████████████████████████████| 3.8MB 51.0MB/s \n", + "\u001b[?25hCollecting sacremoses\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/a6/b4/7a41d630547a4afd58143597d5a49e07bfd4c42914d8335b2a5657efc14b/sacremoses-0.0.38.tar.gz (860kB)\n", + "\u001b[K |████████████████████████████████| 870kB 51.0MB/s \n", + "\u001b[?25hRequirement already satisfied: requests in /usr/local/lib/python3.6/dist-packages (from transformers) (2.21.0)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.6/dist-packages (from transformers) (3.0.12)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from transformers) (1.17.5)\n", + "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.6/dist-packages (from transformers) (4.28.1)\n", + "Requirement already satisfied: boto3 in /usr/local/lib/python3.6/dist-packages (from transformers) (1.11.15)\n", + "Collecting sentencepiece\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/74/f4/2d5214cbf13d06e7cb2c20d84115ca25b53ea76fa1f0ade0e3c9749de214/sentencepiece-0.1.85-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)\n", + "\u001b[K |████████████████████████████████| 1.0MB 50.1MB/s \n", + "\u001b[?25hRequirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.6/dist-packages (from transformers) (2019.12.20)\n", + "Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (1.12.0)\n", + "Requirement already satisfied: click in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (7.0)\n", + "Requirement already satisfied: joblib in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (0.14.1)\n", + "Requirement already satisfied: urllib3<1.25,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (1.24.3)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (2019.11.28)\n", + "Requirement already satisfied: idna<2.9,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (2.8)\n", + "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (3.0.4)\n", + "Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /usr/local/lib/python3.6/dist-packages (from boto3->transformers) (0.9.4)\n", + "Requirement already satisfied: s3transfer<0.4.0,>=0.3.0 in /usr/local/lib/python3.6/dist-packages (from boto3->transformers) (0.3.3)\n", + "Requirement already satisfied: botocore<1.15.0,>=1.14.15 in /usr/local/lib/python3.6/dist-packages (from boto3->transformers) (1.14.15)\n", + "Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /usr/local/lib/python3.6/dist-packages (from botocore<1.15.0,>=1.14.15->boto3->transformers) (2.6.1)\n", + "Requirement already satisfied: docutils<0.16,>=0.10 in /usr/local/lib/python3.6/dist-packages (from botocore<1.15.0,>=1.14.15->boto3->transformers) (0.15.2)\n", + "Building wheels for collected packages: sacremoses\n", + " Building wheel for sacremoses 
(setup.py) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for sacremoses: filename=sacremoses-0.0.38-cp36-none-any.whl size=884628 sha256=bfd64cc598a7e475f655abf031d4190a57d3ca64431f51d59dfb570f216a77f8\n", + " Stored in directory: /root/.cache/pip/wheels/6d/ec/1a/21b8912e35e02741306f35f66c785f3afe94de754a0eaf1422\n", + "Successfully built sacremoses\n", + "Installing collected packages: tokenizers, sacremoses, sentencepiece, transformers\n", + "Successfully installed sacremoses-0.0.38 sentencepiece-0.1.85 tokenizers-0.5.0 transformers-2.5.0\n", + "Requirement already satisfied: tokenizers in /usr/local/lib/python3.6/dist-packages (0.5.0)\n", + "Collecting tensorboard==2.1.0\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/40/23/53ffe290341cd0855d595b0a2e7485932f473798af173bbe3a584b99bb06/tensorboard-2.1.0-py3-none-any.whl (3.8MB)\n", + "\u001b[K |████████████████████████████████| 3.8MB 27.7MB/s \n", + "\u001b[?25hRequirement already satisfied: grpcio>=1.24.3 in /usr/local/lib/python3.6/dist-packages (from tensorboard==2.1.0) (1.27.1)\n", + "Requirement already satisfied: numpy>=1.12.0 in /usr/local/lib/python3.6/dist-packages (from tensorboard==2.1.0) (1.17.5)\n", + "Requirement already satisfied: google-auth-oauthlib<0.5,>=0.4.1 in /usr/local/lib/python3.6/dist-packages (from tensorboard==2.1.0) (0.4.1)\n", + "Requirement already satisfied: protobuf>=3.6.0 in /usr/local/lib/python3.6/dist-packages (from tensorboard==2.1.0) (3.10.0)\n", + "Requirement already satisfied: wheel>=0.26; python_version >= \"3\" in /usr/local/lib/python3.6/dist-packages (from tensorboard==2.1.0) (0.34.2)\n", + "Requirement already satisfied: absl-py>=0.4 in /usr/local/lib/python3.6/dist-packages (from tensorboard==2.1.0) (0.9.0)\n", + "Requirement already satisfied: six>=1.10.0 in /usr/local/lib/python3.6/dist-packages (from tensorboard==2.1.0) (1.12.0)\n", + "Requirement already satisfied: setuptools>=41.0.0 in /usr/local/lib/python3.6/dist-packages (from tensorboard==2.1.0) (45.1.0)\n", + "Requirement already satisfied: google-auth<2,>=1.6.3 in /usr/local/lib/python3.6/dist-packages (from tensorboard==2.1.0) (1.7.2)\n", + "Requirement already satisfied: requests<3,>=2.21.0 in /usr/local/lib/python3.6/dist-packages (from tensorboard==2.1.0) (2.21.0)\n", + "Requirement already satisfied: werkzeug>=0.11.15 in /usr/local/lib/python3.6/dist-packages (from tensorboard==2.1.0) (1.0.0)\n", + "Requirement already satisfied: markdown>=2.6.8 in /usr/local/lib/python3.6/dist-packages (from tensorboard==2.1.0) (3.2.1)\n", + "Requirement already satisfied: requests-oauthlib>=0.7.0 in /usr/local/lib/python3.6/dist-packages (from google-auth-oauthlib<0.5,>=0.4.1->tensorboard==2.1.0) (1.3.0)\n", + "Requirement already satisfied: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.6/dist-packages (from google-auth<2,>=1.6.3->tensorboard==2.1.0) (0.2.8)\n", + "Requirement already satisfied: cachetools<3.2,>=2.0.0 in /usr/local/lib/python3.6/dist-packages (from google-auth<2,>=1.6.3->tensorboard==2.1.0) (3.1.1)\n", + "Requirement already satisfied: rsa<4.1,>=3.1.4 in /usr/local/lib/python3.6/dist-packages (from google-auth<2,>=1.6.3->tensorboard==2.1.0) (4.0)\n", + "Requirement already satisfied: idna<2.9,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests<3,>=2.21.0->tensorboard==2.1.0) (2.8)\n", + "Requirement already satisfied: urllib3<1.25,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests<3,>=2.21.0->tensorboard==2.1.0) (1.24.3)\n", + "Requirement already satisfied: 
certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests<3,>=2.21.0->tensorboard==2.1.0) (2019.11.28)\n", + "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests<3,>=2.21.0->tensorboard==2.1.0) (3.0.4)\n", + "Requirement already satisfied: oauthlib>=3.0.0 in /usr/local/lib/python3.6/dist-packages (from requests-oauthlib>=0.7.0->google-auth-oauthlib<0.5,>=0.4.1->tensorboard==2.1.0) (3.1.0)\n", + "Requirement already satisfied: pyasn1<0.5.0,>=0.4.6 in /usr/local/lib/python3.6/dist-packages (from pyasn1-modules>=0.2.1->google-auth<2,>=1.6.3->tensorboard==2.1.0) (0.4.8)\n", + "\u001b[31mERROR: tensorflow 1.15.0 has requirement tensorboard<1.16.0,>=1.15.0, but you'll have tensorboard 2.1.0 which is incompatible.\u001b[0m\n", + "Installing collected packages: tensorboard\n", + " Found existing installation: tensorboard 1.15.0\n", + " Uninstalling tensorboard-1.15.0:\n", + " Successfully uninstalled tensorboard-1.15.0\n", + "Successfully installed tensorboard-2.1.0\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "cBcbCQoEg9cT" + }, + "source": [ + "## Fetch Data\n", + "We will be using a tiny dataset(The Examiner - SpamClickBait News) of around 3M rows from kaggle to train our model. The dataset also contains output labels which will be dropped and only the text shall be used. For convenience we are using the Kaggle API to direcltly download the data from Kaggle to save our time and efforts." + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "AtFnApKwiGUb", + "outputId": "99c4c4e6-147a-46ae-91da-d89a148a6c0c", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 169 + } + }, + "source": [ + "import os\n", + "import getpass\n", + "\n", + "#For a kaggle username & key, just go to your kaggle account and generate key\n", + "#The JSON file so downloaded contains both of them\n", + "if(\"examine-the-examiner.zip\" not in os.listdir()):\n", + " print(\"Copy these two values from the JSON file so generated\")\n", + " os.environ['KAGGLE_USERNAME'] = getpass.getpass(prompt='Kaggle username: ')\n", + " os.environ['KAGGLE_KEY'] = getpass.getpass(prompt='Kaggle key: ')\n", + " !kaggle datasets download -d therohk/examine-the-examiner\n", + " !unzip /content/examine-the-examiner.zip" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Copy these two values from the JSON file so generated\n", + "Kaggle username: ··········\n", + "Kaggle key: ··········\n", + "Downloading examine-the-examiner.zip to /content\n", + " 86% 123M/142M [00:00<00:00, 132MB/s]\n", + "100% 142M/142M [00:00<00:00, 163MB/s]\n", + "Archive: /content/examine-the-examiner.zip\n", + " inflating: examiner-date-text.csv \n", + " inflating: examiner-date-tokens.csv \n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "IQ7hj9kuhBIj" + }, + "source": [ + "## Load and Preprocess data" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "HOG-fl1cGhJ4" + }, + "source": [ + "import regex as re\n", + "def basicPreprocess(text):\n", + " try:\n", + " processed_text = text.lower()\n", + " processed_text = re.sub(r'\\W +', ' ', processed_text)\n", + " except Exception as e:\n", + " print(\"Exception:\",e,\",on text:\", text)\n", + " return None\n", + " return processed_text" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "Fn68O17MsqYp" + }, + "source": [ + "import 
pandas as pd\n", + "from tqdm import tqdm" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "iUtf-gZ_hWEE" + }, + "source": [ + "## Read and Prune the data\n", + "For our purpose we are going to read a subset (~200,000 samples) to train, just to see results quickly. Feel free to increase (or remove) this limitation. " + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "bj7Bo6hMiySr", + "outputId": "0886be29-864c-4e12-b4b0-28c4e23f88f1", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 253 + } + }, + "source": [ + "data = pd.read_csv(\"/content/examiner-date-text.csv\")\n", + "print(data)" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + " publish_date headline_text\n", + "0 20100101 100 Most Anticipated books releasing in 2010\n", + "1 20100101 10 best films of 2009 - What's on your list?\n", + "2 20100101 10 days of free admission at Lan Su Chinese Ga...\n", + "3 20100101 10 PlayStation games to watch out for in 2010\n", + "4 20100101 10 resolutions for a Happy New Year for you an...\n", + "... ... ...\n", + "3089776 20151231 Which is better investment, Lego bricks or gol...\n", + "3089777 20151231 Wild score three unanswered goals to defeat th...\n", + "3089778 20151231 With NASA and Russia on the sidelines, Europe ...\n", + "3089779 20151231 Wolf Pack battling opponents, officials on the...\n", + "3089780 20151231 Writespace hosts all genre open mic night\n", + "\n", + "[3089781 rows x 2 columns]\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "1JaLDYtAnZIP" + }, + "source": [ + "data = data.sample(frac=1).sample(frac=1)\n", + "data = data[:200000]" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "qYYUOhiXhHP8" + }, + "source": [ + "### Before Preprocessing" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "8STrareTIxox", + "outputId": "3553e801-b2b4-463a-c7c2-03e1d1b6c500", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 253 + } + }, + "source": [ + "print(data)" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + " publish_date headline_text\n", + "618246 20100816 Triangle UFO low and silent over rural Deansbo...\n", + "1794117 20120420 Kevin Hart and 'Think Like a Man' co-stars lea...\n", + "3053438 20150920 Uma Thurman custody battle finally settled wit...\n", + "180273 20100313 Legislator confident of Health Care bill\n", + "938083 20101228 McDonald's ad in Spanish, provoking sparks\n", + "... ... 
...\n", + "1737672 20120319 Washington Post: Obama has been lying to Ameri...\n", + "1780904 20120413 California retiree collects $227k Mega Million...\n", + "1614310 20120105 This Weekend at Miami Science Museum Laser Show\n", + "1565925 20111205 December 12th is National Poinsettia Day\n", + "1358212 20110731 Spartans' Cousins gives stirring, thought-prov...\n", + "\n", + "[200000 rows x 2 columns]\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "Z8md5U5tGx1J" + }, + "source": [ + "data[\"headline_text\"] = data[\"headline_text\"].apply(basicPreprocess).dropna() # ignore exceptions for empty/NaN values" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DCzsk_sVhLsi" + }, + "source": [ + "### After Preprocessing" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "wV8ysU3cI1a-", + "outputId": "b3962d3a-6d6a-468b-9534-d9d1dd6f457b", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 253 + } + }, + "source": [ + "print(data)" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + " publish_date headline_text\n", + "618246 20100816 triangle ufo low and silent over rural deansbo...\n", + "1794117 20120420 kevin hart and 'think like a man co-stars lear...\n", + "3053438 20150920 uma thurman custody battle finally settled wit...\n", + "180273 20100313 legislator confident of health care bill\n", + "938083 20101228 mcdonald's ad in spanish provoking sparks\n", + "... ... ...\n", + "1737672 20120319 washington post obama has been lying to americ...\n", + "1780904 20120413 california retiree collects $227k mega million...\n", + "1614310 20120105 this weekend at miami science museum laser show\n", + "1565925 20111205 december 12th is national poinsettia day\n", + "1358212 20110731 spartans cousins gives stirring thought-provok...\n", + "\n", + "[200000 rows x 2 columns]\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dbp40Xkrhs8l" + }, + "source": [ + "Removing newline characters just in case the input text has them. This is because the LineByLineTextDataset class that we are going to use later assumes that samples are separated by newlines." + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "9dBFTDQnjXnE" + }, + "source": [ + "data = data[\"headline_text\"]\n", + "data = data.str.replace(\"\\n\",\" \") # use str.replace so newlines inside headlines are replaced, not whole values" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "gI1Tp54IiVBj" + }, + "source": [ + "## Train a custom tokenizer\n", + "I have used a ByteLevelBPETokenizer just to prevent `<unk>` tokens entirely.\n", + "Furthermore, the function used to train the tokenizer assumes that each sample is stored in a separate text file." + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "rs-wK-N1EACp" + }, + "source": [ + "txt_files_dir = \"/tmp/text_split\"\n", + "!mkdir {txt_files_dir}" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "QIvCE_svi7sQ" + }, + "source": [ + "Split LM data into individual files. These files are stored in /tmp/text_split and are used to train the tokenizer **only**."
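A side note on the file-splitting step that follows: it exists because `tokenizer.train()` in the `tokenizers` 0.5.0 release pinned above consumes file paths. As a rough sketch only, assuming a newer `tokenizers` release (0.10+, which added `train_from_iterator`; the 0.5.0 version used in this notebook does not have it), the same tokenizer could be trained straight from the in-memory pandas Series, skipping the per-sample files entirely:

```python
# Sketch under the assumption of tokenizers >= 0.10 (train_from_iterator is not in 0.5.0).
from tokenizers import ByteLevelBPETokenizer

alt_tokenizer = ByteLevelBPETokenizer()
alt_tokenizer.train_from_iterator(
    (str(headline) for headline in data),  # `data` is the preprocessed headline Series from above
    vocab_size=5000,
    min_frequency=5,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
)
```

The rest of the notebook sticks to the file-based approach, which works the same way on the 0.5.0 release installed above.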
+ ] + }, + { + "cell_type": "code", + "metadata": { + "id": "_2oI92Z0tyAp", + "outputId": "022fc930-6312-4e83-eb4f-24b68e0b0394", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + } + }, + "source": [ + "i=0\n", + "for row in tqdm(data.to_list()):\n", + " file_name = os.path.join(txt_files_dir, str(i)+'.txt')\n", + " try:\n", + " f = open(file_name, 'w')\n", + " f.write(row)\n", + " f.close()\n", + " except Exception as e: # catch exceptions (e.g. empty rows)\n", + " print(row, e)\n", + " i+=1" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "100%|██████████| 200000/200000 [00:09<00:00, 20693.63it/s]\n" + ], + "name": "stderr" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "3r6RuiCBXIJy" + }, + "source": [ + "from pathlib import Path\n", + "from tokenizers import ByteLevelBPETokenizer\n", + "from tokenizers.processors import BertProcessing\n", + "\n", + "\n", + "paths = [str(x) for x in Path(txt_files_dir).glob(\"**/*.txt\")]\n", + "\n", + "# Initialize a tokenizer\n", + "tokenizer = ByteLevelBPETokenizer()\n", + "\n", + "vocab_size=5000\n", + "# Customize training\n", + "tokenizer.train(files=paths, vocab_size=vocab_size, min_frequency=5, special_tokens=[\n", + " \"<s>\",\n", + " \"<pad>\",\n", + " \"</s>\",\n", + " \"<unk>\",\n", + " \"<mask>\",\n", + "])" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "0bv78Z2UjIci" + }, + "source": [ + "lm_data_dir = \"/tmp/lm_data\"\n", + "!mkdir {lm_data_dir}" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "sI5kEwUojOQo" + }, + "source": [ + "## Split into Validation and Train set\n", + "We split the data into train and validation sets.
These two files are used to train and evaluate our model" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "2nWv7Yuki66k" + }, + "source": [ + "train_split = 0.9\n", + "train_data_size = int(len(data)*train_split)\n", + "\n", + "with open(os.path.join(lm_data_dir,'train.txt') , 'w') as f:\n", + " for item in data[:train_data_size].tolist():\n", + " f.write(\"%s\\n\" % item)\n", + "\n", + "with open(os.path.join(lm_data_dir,'eval.txt') , 'w') as f:\n", + " for item in data[train_data_size:].tolist():\n", + " f.write(\"%s\\n\" % item)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "UKaVWBiVTtEO" + }, + "source": [ + "!mkdir /content/models\n", + "!mkdir /content/models/smallBERTa" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "noQfBUkhJmFC", + "outputId": "9deb334f-d4ec-45e3-df14-1c048a4890ba", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 50 + } + }, + "source": [ + "tokenizer.save(\"/content/models/smallBERTa\", \"smallBERTa\")" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['/content/models/smallBERTa/smallBERTa-vocab.json',\n", + " '/content/models/smallBERTa/smallBERTa-merges.txt']" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 17 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "odSTiCM--4_p" + }, + "source": [ + "!mv /content/models/smallBERTa/smallBERTa-vocab.json /content/models/smallBERTa/vocab.json\n", + "!mv /content/models/smallBERTa/smallBERTa-merges.txt /content/models/smallBERTa/merges.txt" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "naEJbZDjFnNo" + }, + "source": [ + "train_path = os.path.join(lm_data_dir,\"train.txt\")\n", + "eval_path = os.path.join(lm_data_dir,\"eval.txt\")" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "P91yVQkXj9rc" + }, + "source": [ + "## Set Model Configuration\n", + "For our purpose, we are training a very small model for demo purposes" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "XS4q1YtxZ2GW" + }, + "source": [ + "import json\n", + "config = {\n", + " \"attention_probs_dropout_prob\": 0.1,\n", + " \"hidden_act\": \"gelu\",\n", + " \"hidden_dropout_prob\": 0.3,\n", + " \"hidden_size\": 128,\n", + " \"initializer_range\": 0.02,\n", + " \"num_attention_heads\": 1,\n", + " \"num_hidden_layers\": 1,\n", + " \"vocab_size\": vocab_size,\n", + " \"intermediate_size\": 256,\n", + " \"max_position_embeddings\": 256\n", + "}\n", + "with open(\"/content/models/smallBERTa/config.json\", 'w') as fp:\n", + " json.dump(config, fp)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "CbVBgrDbmVJ2", + "outputId": "160bd4f1-ae4b-474e-bb4f-19a8907d05e3", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 135 + } + }, + "source": [ + "#%cd /content\n", + "!git clone https://github.com/huggingface/transformers.git" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Cloning into 'transformers'...\n", + "remote: Enumerating objects: 24, done.\u001b[K\n", + "remote: Counting objects: 100% (24/24), done.\u001b[K\n", + "remote: Compressing objects: 100% (23/23), done.\u001b[K\n", + "remote: Total 19858 (delta 5), reused 6 (delta 0), pack-reused 19834\u001b[K\n", + "Receiving 
objects: 100% (19858/19858), 11.95 MiB | 4.05 MiB/s, done.\n", + "Resolving deltas: 100% (14423/14423), done.\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "EZMJ0zMxDIyc" + }, + "source": [ + "## Run training using the run_language_modeling.py examples script" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "4kvkxHIk2Vgn", + "outputId": "7dbd97f4-e05b-4158-86a5-083818c57082", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 304 + } + }, + "source": [ + "!nvidia-smi #just to confirm that you are on a GPU, if not go to Runtime->Change Runtime" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Fri Feb 21 12:17:21 2020 \n", + "+-----------------------------------------------------------------------------+\n", + "| NVIDIA-SMI 440.48.02 Driver Version: 418.67 CUDA Version: 10.1 |\n", + "|-------------------------------+----------------------+----------------------+\n", + "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", + "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", + "|===============================+======================+======================|\n", + "| 0 Tesla P4 Off | 00000000:00:04.0 Off | 0 |\n", + "| N/A 41C P8 7W / 75W | 0MiB / 7611MiB | 0% Default |\n", + "+-------------------------------+----------------------+----------------------+\n", + " \n", + "+-----------------------------------------------------------------------------+\n", + "| Processes: GPU Memory |\n", + "| GPU PID Type Process name Usage |\n", + "|=============================================================================|\n", + "| No running processes found |\n", + "+-----------------------------------------------------------------------------+\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "Hk2MUnKFV58z" + }, + "source": [ + "#Setting environment variables\n", + "os.environ[\"train_path\"] = train_path\n", + "os.environ[\"eval_path\"] = eval_path\n", + "os.environ[\"CUDA_LAUNCH_BLOCKING\"]='1' #Makes for easier debugging (just in case)\n", + "weights_dir = \"/content/models/smallBERTa/weights\"\n", + "!mkdir {weights_dir}" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "6UJ_BSAlmccq" + }, + "source": [ + "cmd = '''python /content/transformers/examples/run_language_modeling.py --output_dir {0} \\\n", + " --model_type roberta \\\n", + " --mlm \\\n", + " --train_data_file {1} \\\n", + " --eval_data_file {2} \\\n", + " --config_name /content/models/smallBERTa \\\n", + " --tokenizer_name /content/models/smallBERTa \\\n", + " --do_train \\\n", + " --line_by_line \\\n", + " --overwrite_output_dir \\\n", + " --do_eval \\\n", + " --block_size 256 \\\n", + " --learning_rate 1e-4 \\\n", + " --num_train_epochs 5 \\\n", + " --save_total_limit 2 \\\n", + " --save_steps 2000 \\\n", + " --logging_steps 500 \\\n", + " --per_gpu_eval_batch_size 32 \\\n", + " --per_gpu_train_batch_size 32 \\\n", + " --evaluate_during_training \\\n", + " --seed 42 \\\n", + " '''.format(weights_dir, train_path, eval_path)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "jqhJzq03Fc15", + "outputId": "3a02319a-1040-457b-baf8-f5e4ed3c1e0e", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + } + }, + "source": [ + "!{cmd}" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + 
"\u001b[1;30;43mStreaming output truncated to the last 5000 lines.\u001b[0m\n", + "Evaluating: 96% 598/625 [00:04<00:00, 124.17it/s]\u001b[A\u001b[A\n", + "\n", + "Evaluating: 98% 611/625 [00:04<00:00, 124.94it/s]\u001b[A\u001b[A\n", + "\n", + "Evaluating: 100% 625/625 [00:05<00:00, 126.93it/s]\u001b[A\u001b[A\n", + "\n", + "\u001b[A\u001b[A02/21/2020 12:30:10 - INFO - __main__ - ***** Eval results *****\n", + "02/21/2020 12:30:10 - INFO - __main__ - perplexity = tensor(873.4072)\n", + "\n", + "Iteration: 11% 628/5625 [00:31<44:27, 1.87it/s]\u001b[A\n", + "Iteration: 11% 632/5625 [00:31<31:46, 2.62it/s]\u001b[A\n", + "Iteration: 11% 636/5625 [00:31<22:55, 3.63it/s]\u001b[A\n", + "Iteration: 11% 640/5625 [00:31<16:43, 4.97it/s]\u001b[A\n", + "Iteration: 11% 644/5625 [00:31<12:21, 6.72it/s]\u001b[A\n", + "Iteration: 12% 648/5625 [00:31<09:18, 8.91it/s]\u001b[A\n", + "Iteration: 12% 652/5625 [00:31<07:10, 11.54it/s]\u001b[A\n", + "Iteration: 12% 656/5625 [00:31<05:41, 14.56it/s]\u001b[A\n", + "Iteration: 12% 660/5625 [00:31<04:41, 17.63it/s]\u001b[A\n", + "Iteration: 12% 664/5625 [00:32<03:56, 20.97it/s]\u001b[A\n", + "Iteration: 12% 668/5625 [00:32<03:24, 24.25it/s]\u001b[A\n", + "Iteration: 12% 672/5625 [00:32<03:05, 26.70it/s]\u001b[A\n", + "Iteration: 12% 676/5625 [00:32<02:52, 28.77it/s]\u001b[A\n", + "Iteration: 12% 680/5625 [00:32<02:40, 30.77it/s]\u001b[A\n", + "Iteration: 12% 684/5625 [00:32<02:32, 32.44it/s]\u001b[A\n", + "Iteration: 12% 688/5625 [00:32<02:26, 33.77it/s]\u001b[A\n", + "Iteration: 12% 692/5625 [00:32<02:21, 34.88it/s]\u001b[A\n", + "Iteration: 12% 696/5625 [00:32<02:20, 35.01it/s]\u001b[A\n", + "Iteration: 12% 700/5625 [00:33<02:18, 35.67it/s]\u001b[A\n", + "Iteration: 13% 704/5625 [00:33<02:16, 36.10it/s]\u001b[A\n", + "Iteration: 13% 708/5625 [00:33<02:16, 36.05it/s]\u001b[A\n", + "Iteration: 13% 712/5625 [00:33<02:15, 36.21it/s]\u001b[A\n", + "Iteration: 13% 716/5625 [00:33<02:14, 36.62it/s]\u001b[A\n", + "Iteration: 13% 720/5625 [00:33<02:14, 36.59it/s]\u001b[A\n", + "Iteration: 13% 724/5625 [00:33<02:13, 36.62it/s]\u001b[A\n", + "Iteration: 13% 728/5625 [00:33<02:14, 36.52it/s]\u001b[A\n", + "Iteration: 13% 732/5625 [00:33<02:16, 35.84it/s]\u001b[A\n", + "Iteration: 13% 736/5625 [00:34<02:18, 35.26it/s]\u001b[A\n", + "Iteration: 13% 740/5625 [00:34<02:17, 35.49it/s]\u001b[A\n", + "Iteration: 13% 744/5625 [00:34<02:16, 35.69it/s]\u001b[A\n", + "Iteration: 13% 748/5625 [00:34<02:15, 35.91it/s]\u001b[A\n", + "Iteration: 13% 752/5625 [00:34<02:14, 36.30it/s]\u001b[A\n", + "Iteration: 13% 756/5625 [00:34<02:14, 36.32it/s]\u001b[A\n", + "Iteration: 14% 760/5625 [00:34<02:11, 36.88it/s]\u001b[A\n", + "Iteration: 14% 764/5625 [00:34<02:12, 36.70it/s]\u001b[A\n", + "Iteration: 14% 768/5625 [00:34<02:13, 36.30it/s]\u001b[A\n", + "Iteration: 14% 772/5625 [00:35<02:12, 36.57it/s]\u001b[A\n", + "Iteration: 14% 776/5625 [00:35<02:13, 36.41it/s]\u001b[A\n", + "Iteration: 14% 780/5625 [00:35<02:11, 36.85it/s]\u001b[A\n", + "Iteration: 14% 784/5625 [00:35<02:11, 36.80it/s]\u001b[A\n", + "Iteration: 14% 788/5625 [00:35<02:11, 36.88it/s]\u001b[A\n", + "Iteration: 14% 792/5625 [00:35<02:10, 37.05it/s]\u001b[A\n", + "Iteration: 14% 796/5625 [00:35<02:10, 36.89it/s]\u001b[A\n", + "Iteration: 14% 800/5625 [00:35<02:08, 37.52it/s]\u001b[A\n", + "Iteration: 14% 804/5625 [00:35<02:09, 37.30it/s]\u001b[A\n", + "Iteration: 14% 808/5625 [00:35<02:11, 36.55it/s]\u001b[A\n", + "Iteration: 14% 812/5625 [00:36<02:11, 36.55it/s]\u001b[A\n", + "Iteration: 15% 816/5625 [00:36<02:15, 
35.37it/s]\u001b[A\n", + "Iteration: 15% 820/5625 [00:36<02:15, 35.55it/s]\u001b[A\n", + "Iteration: 15% 824/5625 [00:36<02:15, 35.38it/s]\u001b[A\n", + "Iteration: 15% 828/5625 [00:36<02:16, 35.25it/s]\u001b[A\n", + "Iteration: 15% 832/5625 [00:36<02:13, 35.92it/s]\u001b[A\n", + "Iteration: 15% 836/5625 [00:36<02:13, 35.95it/s]\u001b[A\n", + "Iteration: 15% 840/5625 [00:36<02:11, 36.50it/s]\u001b[A\n", + "Iteration: 15% 844/5625 [00:36<02:13, 35.94it/s]\u001b[A\n", + "Iteration: 15% 848/5625 [00:37<02:11, 36.22it/s]\u001b[A\n", + "Iteration: 15% 852/5625 [00:37<02:11, 36.36it/s]\u001b[A\n", + "Iteration: 15% 856/5625 [00:37<02:11, 36.16it/s]\u001b[A\n", + "Iteration: 15% 860/5625 [00:37<02:10, 36.45it/s]\u001b[A\n", + "Iteration: 15% 864/5625 [00:37<02:11, 36.28it/s]\u001b[A\n", + "Iteration: 15% 868/5625 [00:37<02:10, 36.35it/s]\u001b[A\n", + "Iteration: 16% 872/5625 [00:37<02:08, 36.87it/s]\u001b[A\n", + "Iteration: 16% 876/5625 [00:37<02:08, 36.85it/s]\u001b[A\n", + "Iteration: 16% 880/5625 [00:37<02:11, 36.18it/s]\u001b[A\n", + "Iteration: 16% 884/5625 [00:38<02:09, 36.48it/s]\u001b[A\n", + "Iteration: 16% 888/5625 [00:38<02:11, 36.00it/s]\u001b[A\n", + "Iteration: 16% 892/5625 [00:38<02:10, 36.25it/s]\u001b[A\n", + "Iteration: 16% 896/5625 [00:38<02:08, 36.74it/s]\u001b[A\n", + "Iteration: 16% 900/5625 [00:38<02:07, 37.16it/s]\u001b[A\n", + "Iteration: 16% 904/5625 [00:38<02:08, 36.83it/s]\u001b[A\n", + "Iteration: 16% 908/5625 [00:38<02:06, 37.18it/s]\u001b[A\n", + "Iteration: 16% 912/5625 [00:38<02:10, 36.04it/s]\u001b[A\n", + "Iteration: 16% 916/5625 [00:38<02:14, 34.93it/s]\u001b[A\n", + "Iteration: 16% 920/5625 [00:39<02:13, 35.19it/s]\u001b[A\n", + "Iteration: 16% 924/5625 [00:39<02:12, 35.48it/s]\u001b[A\n", + "Iteration: 16% 928/5625 [00:39<02:12, 35.33it/s]\u001b[A\n", + "Iteration: 17% 932/5625 [00:39<02:11, 35.67it/s]\u001b[A\n", + "Iteration: 17% 936/5625 [00:39<02:10, 36.05it/s]\u001b[A\n", + "Iteration: 17% 940/5625 [00:39<02:10, 35.85it/s]\u001b[A\n", + "Iteration: 17% 944/5625 [00:39<02:09, 36.24it/s]\u001b[A\n", + "Iteration: 17% 948/5625 [00:39<02:07, 36.64it/s]\u001b[A\n", + "Iteration: 17% 952/5625 [00:39<02:06, 36.95it/s]\u001b[A\n", + "Iteration: 17% 956/5625 [00:40<02:08, 36.32it/s]\u001b[A\n", + "Iteration: 17% 960/5625 [00:40<02:07, 36.52it/s]\u001b[A\n", + "Iteration: 17% 964/5625 [00:40<02:07, 36.66it/s]\u001b[A\n", + "Iteration: 17% 968/5625 [00:40<02:07, 36.46it/s]\u001b[A\n", + "Iteration: 17% 972/5625 [00:40<02:07, 36.61it/s]\u001b[A\n", + "Iteration: 17% 976/5625 [00:40<02:07, 36.45it/s]\u001b[A\n", + "Iteration: 17% 980/5625 [00:40<02:06, 36.71it/s]\u001b[A\n", + "Iteration: 17% 984/5625 [00:40<02:06, 36.71it/s]\u001b[A\n", + "Iteration: 18% 988/5625 [00:40<02:05, 36.98it/s]\u001b[A\n", + "Iteration: 18% 992/5625 [00:41<02:06, 36.68it/s]\u001b[A\n", + "Iteration: 18% 996/5625 [00:41<02:08, 36.11it/s]\u001b[A\n", + "Iteration: 18% 1000/5625 [00:41<02:07, 36.21it/s]\u001b[A\n", + "Iteration: 18% 1004/5625 [00:41<02:10, 35.32it/s]\u001b[A\n", + "Iteration: 18% 1008/5625 [00:41<02:09, 35.54it/s]\u001b[A\n", + "Iteration: 18% 1012/5625 [00:41<02:08, 35.81it/s]\u001b[A\n", + "Iteration: 18% 1016/5625 [00:41<02:09, 35.66it/s]\u001b[A\n", + "Iteration: 18% 1020/5625 [00:41<02:07, 36.16it/s]\u001b[A\n", + "Iteration: 18% 1024/5625 [00:41<02:04, 36.81it/s]\u001b[A\n", + "Iteration: 18% 1028/5625 [00:42<02:06, 36.45it/s]\u001b[A\n", + "Iteration: 18% 1032/5625 [00:42<02:05, 36.71it/s]\u001b[A\n", + "Iteration: 18% 1036/5625 [00:42<02:06, 36.38it/s]\u001b[A\n", + 
"Iteration: 18% 1040/5625 [00:42<02:04, 36.89it/s]\u001b[A\n", + "Iteration: 19% 1044/5625 [00:42<02:02, 37.27it/s]\u001b[A\n", + "Iteration: 19% 1048/5625 [00:42<02:03, 37.02it/s]\u001b[A\n", + "Iteration: 19% 1052/5625 [00:42<02:03, 37.00it/s]\u001b[A\n", + "Iteration: 19% 1056/5625 [00:42<02:03, 36.96it/s]\u001b[A\n", + "Iteration: 19% 1060/5625 [00:42<02:03, 36.92it/s]\u001b[A\n", + "Iteration: 19% 1064/5625 [00:43<02:02, 37.09it/s]\u001b[A\n", + "Iteration: 19% 1068/5625 [00:43<02:04, 36.64it/s]\u001b[A\n", + "Iteration: 19% 1072/5625 [00:43<02:03, 36.96it/s]\u001b[A\n", + "Iteration: 19% 1076/5625 [00:43<02:05, 36.30it/s]\u001b[A\n", + "Iteration: 19% 1080/5625 [00:43<02:10, 34.87it/s]\u001b[A\n", + "Iteration: 19% 1084/5625 [00:43<02:06, 35.80it/s]\u001b[A\n", + "Iteration: 19% 1088/5625 [00:43<02:04, 36.37it/s]\u001b[A\n", + "Iteration: 19% 1092/5625 [00:43<02:03, 36.57it/s]\u001b[A\n", + "Iteration: 19% 1096/5625 [00:43<02:04, 36.45it/s]\u001b[A\n", + "Iteration: 20% 1100/5625 [00:44<02:04, 36.40it/s]\u001b[A\n", + "Iteration: 20% 1104/5625 [00:44<02:05, 36.03it/s]\u001b[A\n", + "Iteration: 20% 1108/5625 [00:44<02:05, 35.93it/s]\u001b[A\n", + "Iteration: 20% 1112/5625 [00:44<02:05, 35.87it/s]\u001b[A\n", + "Iteration: 20% 1116/5625 [00:44<02:06, 35.75it/s]\u001b[A\n", + "Iteration: 20% 1120/5625 [00:44<02:05, 35.82it/s]\u001b[A\n", + "Iteration: 20% 1124/5625 [00:44<02:06, 35.64it/s]\u001b[A02/21/2020 12:30:24 - INFO - __main__ - Creating features from dataset file at /tmp/lm_data/eval.txt\n", + "02/21/2020 12:30:25 - INFO - __main__ - ***** Running evaluation *****\n", + "02/21/2020 12:30:25 - INFO - __main__ - Num examples = 20000\n", + "02/21/2020 12:30:25 - INFO - __main__ - Batch size = 32\n", + "\n", + "\n", + "Evaluating: 0% 0/625 [00:00 and Privacy Policy\n", + ", and TensorBoard.dev's Terms of Service\n", + ".\n", + "\n", + "This notice will not be shown again while you are logged into the uploader.\n", + "To log out, run `tensorboard dev auth revoke`.\n", + "\n", + "Continue? (yes/NO) yes\n", + "\n", + "Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=373649185512-8v619h5kft38l4456nm2dj4ubeqsrvh6.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email&state=kgAdxJj3xxL6gDgTUoUWbPVrkXeIzl&prompt=consent&access_type=offline\n", + "Enter the authorization code: 4/wwHWmLi7O1avExJ9mp5Ka_Bbo3lSCOsRUHS1r2a5lqOiyIAllUK6KpY\n", + "\n", + "Upload started and will continue reading any new data as it's added\n", + "to the logdir. 
To stop uploading, press Ctrl-C.\n", + "View your TensorBoard live at: https://tensorboard.dev/experiment/wKOIBs5zRgCb0MY8KGi7Sg/\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/usr/local/lib/python3.6/dist-packages/tensorboard/uploader/uploader_main.py\", line 426, in execute\n", + " uploader.start_uploading()\n", + " File \"/usr/local/lib/python3.6/dist-packages/tensorboard/uploader/uploader.py\", line 111, in start_uploading\n", + " self._upload_once()\n", + " File \"/usr/local/lib/python3.6/dist-packages/tensorboard/uploader/uploader.py\", line 116, in _upload_once\n", + " self._rate_limiter.tick()\n", + " File \"/usr/local/lib/python3.6/dist-packages/tensorboard/uploader/util.py\", line 41, in tick\n", + " self._time.sleep(wait_secs)\n", + "KeyboardInterrupt\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/usr/local/bin/tensorboard\", line 8, in \n", + " sys.exit(run_main())\n", + " File \"/usr/local/lib/python3.6/dist-packages/tensorboard/main.py\", line 66, in run_main\n", + " app.run(tensorboard.main, flags_parser=tensorboard.configure)\n", + " File \"/usr/local/lib/python3.6/dist-packages/absl/app.py\", line 299, in run\n", + " _run_main(main, args)\n", + " File \"/usr/local/lib/python3.6/dist-packages/absl/app.py\", line 250, in _run_main\n", + " sys.exit(main(argv))\n", + " File \"/usr/local/lib/python3.6/dist-packages/tensorboard/program.py\", line 268, in main\n", + " return runner(self.flags) or 0\n", + " File \"/usr/local/lib/python3.6/dist-packages/tensorboard/uploader/uploader_main.py\", line 579, in run\n", + " return _run(flags)\n", + " File \"/usr/local/lib/python3.6/dist-packages/tensorboard/uploader/uploader_main.py\", line 259, in _run\n", + " intent.execute(server_info, channel)\n", + " File \"/usr/local/lib/python3.6/dist-packages/tensorboard/uploader/uploader_main.py\", line 431, in execute\n", + " print()\n", + "KeyboardInterrupt\n", + "^C\n" + ], + "name": "stdout" + } + ] + } + ] +} \ No newline at end of file diff --git a/NLP/src/train_esperbert.ipynb b/NLP/src/train_esperbert.ipynb new file mode 100644 index 00000000..d12e954c --- /dev/null +++ b/NLP/src/train_esperbert.ipynb @@ -0,0 +1,1472 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "01_how-to-train.ipynb", + "provenance": [], + "toc_visible": true, + "machine_shape": "hm" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "accelerator": "GPU", + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "a58a66392b644b1384661e850c077a6c": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "state": { + "_view_name": "HBoxView", + "_dom_classes": [], + "_model_name": "HBoxModel", + "_view_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_view_count": null, + "_view_module_version": "1.5.0", + "box_style": "", + "layout": "IPY_MODEL_a491e8caa0a048beb3b5259f14eb233f", + "_model_module": "@jupyter-widgets/controls", + "children": [ + "IPY_MODEL_837c9ddc3d594e088891874560c646b8", + "IPY_MODEL_dbf50873d62c4ba39321faefbed0cca5" + ] + }, + "model_module_version": "1.5.0" + }, + "a491e8caa0a048beb3b5259f14eb233f": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "state": { + "_view_name": "LayoutView", + "grid_template_rows": null, + "right": null, + "justify_content": null, + "_view_module": "@jupyter-widgets/base", + "overflow": null, + 
"_model_module_version": "1.2.0", + "_view_count": null, + "flex_flow": null, + "width": null, + "min_width": null, + "border": null, + "align_items": null, + "bottom": null, + "_model_module": "@jupyter-widgets/base", + "top": null, + "grid_column": null, + "overflow_y": null, + "overflow_x": null, + "grid_auto_flow": null, + "grid_area": null, + "grid_template_columns": null, + "flex": null, + "_model_name": "LayoutModel", + "justify_items": null, + "grid_row": null, + "max_height": null, + "align_content": null, + "visibility": null, + "align_self": null, + "height": null, + "min_height": null, + "padding": null, + "grid_auto_rows": null, + "grid_gap": null, + "max_width": null, + "order": null, + "_view_module_version": "1.2.0", + "grid_template_areas": null, + "object_position": null, + "object_fit": null, + "grid_auto_columns": null, + "margin": null, + "display": null, + "left": null + }, + "model_module_version": "1.2.0" + }, + "837c9ddc3d594e088891874560c646b8": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "state": { + "_view_name": "ProgressView", + "style": "IPY_MODEL_40bf955ba0284e84b198da6be8654219", + "_dom_classes": [], + "description": "Epoch: 100%", + "_model_name": "FloatProgressModel", + "bar_style": "success", + "max": 1, + "_view_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "value": 1, + "_view_count": null, + "_view_module_version": "1.5.0", + "orientation": "horizontal", + "min": 0, + "description_tooltip": null, + "_model_module": "@jupyter-widgets/controls", + "layout": "IPY_MODEL_fe20a8dae6e84628b5076d02183090f5" + }, + "model_module_version": "1.5.0" + }, + "dbf50873d62c4ba39321faefbed0cca5": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "state": { + "_view_name": "HTMLView", + "style": "IPY_MODEL_93b3f9eae3cb4e3e859cf456e3547c6d", + "_dom_classes": [], + "description": "", + "_model_name": "HTMLModel", + "placeholder": "​", + "_view_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "value": " 1/1 [2:46:46<00:00, 10006.17s/it]", + "_view_count": null, + "_view_module_version": "1.5.0", + "description_tooltip": null, + "_model_module": "@jupyter-widgets/controls", + "layout": "IPY_MODEL_6feb10aeb43147e6aba028d065947ae8" + }, + "model_module_version": "1.5.0" + }, + "40bf955ba0284e84b198da6be8654219": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "state": { + "_view_name": "StyleView", + "_model_name": "ProgressStyleModel", + "description_width": "initial", + "_view_module": "@jupyter-widgets/base", + "_model_module_version": "1.5.0", + "_view_count": null, + "_view_module_version": "1.2.0", + "bar_color": null, + "_model_module": "@jupyter-widgets/controls" + }, + "model_module_version": "1.5.0" + }, + "fe20a8dae6e84628b5076d02183090f5": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "state": { + "_view_name": "LayoutView", + "grid_template_rows": null, + "right": null, + "justify_content": null, + "_view_module": "@jupyter-widgets/base", + "overflow": null, + "_model_module_version": "1.2.0", + "_view_count": null, + "flex_flow": null, + "width": null, + "min_width": null, + "border": null, + "align_items": null, + "bottom": null, + "_model_module": "@jupyter-widgets/base", + "top": null, + "grid_column": null, + "overflow_y": null, + "overflow_x": null, + "grid_auto_flow": null, + "grid_area": null, + "grid_template_columns": null, + "flex": null, + 
"_model_name": "LayoutModel", + "justify_items": null, + "grid_row": null, + "max_height": null, + "align_content": null, + "visibility": null, + "align_self": null, + "height": null, + "min_height": null, + "padding": null, + "grid_auto_rows": null, + "grid_gap": null, + "max_width": null, + "order": null, + "_view_module_version": "1.2.0", + "grid_template_areas": null, + "object_position": null, + "object_fit": null, + "grid_auto_columns": null, + "margin": null, + "display": null, + "left": null + }, + "model_module_version": "1.2.0" + }, + "93b3f9eae3cb4e3e859cf456e3547c6d": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "state": { + "_view_name": "StyleView", + "_model_name": "DescriptionStyleModel", + "description_width": "", + "_view_module": "@jupyter-widgets/base", + "_model_module_version": "1.5.0", + "_view_count": null, + "_view_module_version": "1.2.0", + "_model_module": "@jupyter-widgets/controls" + }, + "model_module_version": "1.5.0" + }, + "6feb10aeb43147e6aba028d065947ae8": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "state": { + "_view_name": "LayoutView", + "grid_template_rows": null, + "right": null, + "justify_content": null, + "_view_module": "@jupyter-widgets/base", + "overflow": null, + "_model_module_version": "1.2.0", + "_view_count": null, + "flex_flow": null, + "width": null, + "min_width": null, + "border": null, + "align_items": null, + "bottom": null, + "_model_module": "@jupyter-widgets/base", + "top": null, + "grid_column": null, + "overflow_y": null, + "overflow_x": null, + "grid_auto_flow": null, + "grid_area": null, + "grid_template_columns": null, + "flex": null, + "_model_name": "LayoutModel", + "justify_items": null, + "grid_row": null, + "max_height": null, + "align_content": null, + "visibility": null, + "align_self": null, + "height": null, + "min_height": null, + "padding": null, + "grid_auto_rows": null, + "grid_gap": null, + "max_width": null, + "order": null, + "_view_module_version": "1.2.0", + "grid_template_areas": null, + "object_position": null, + "object_fit": null, + "grid_auto_columns": null, + "margin": null, + "display": null, + "left": null + }, + "model_module_version": "1.2.0" + }, + "0989d41a4da24e9ebff377e02127642c": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "state": { + "_view_name": "HBoxView", + "_dom_classes": [], + "_model_name": "HBoxModel", + "_view_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_view_count": null, + "_view_module_version": "1.5.0", + "box_style": "", + "layout": "IPY_MODEL_42c6061ef7e44f179db5a6e3551c0f17", + "_model_module": "@jupyter-widgets/controls", + "children": [ + "IPY_MODEL_d295dd80550447d88da0f04ce36a22ff", + "IPY_MODEL_04e7e6d291da49d5816dc98a2904e95c" + ] + }, + "model_module_version": "1.5.0" + }, + "42c6061ef7e44f179db5a6e3551c0f17": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "state": { + "_view_name": "LayoutView", + "grid_template_rows": null, + "right": null, + "justify_content": null, + "_view_module": "@jupyter-widgets/base", + "overflow": null, + "_model_module_version": "1.2.0", + "_view_count": null, + "flex_flow": null, + "width": null, + "min_width": null, + "border": null, + "align_items": null, + "bottom": null, + "_model_module": "@jupyter-widgets/base", + "top": null, + "grid_column": null, + "overflow_y": null, + "overflow_x": null, + "grid_auto_flow": null, + "grid_area": null, + 
"grid_template_columns": null, + "flex": null, + "_model_name": "LayoutModel", + "justify_items": null, + "grid_row": null, + "max_height": null, + "align_content": null, + "visibility": null, + "align_self": null, + "height": null, + "min_height": null, + "padding": null, + "grid_auto_rows": null, + "grid_gap": null, + "max_width": null, + "order": null, + "_view_module_version": "1.2.0", + "grid_template_areas": null, + "object_position": null, + "object_fit": null, + "grid_auto_columns": null, + "margin": null, + "display": null, + "left": null + }, + "model_module_version": "1.2.0" + }, + "d295dd80550447d88da0f04ce36a22ff": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "state": { + "_view_name": "ProgressView", + "style": "IPY_MODEL_e7d8c3a4fecd40778e32966b29ea65a1", + "_dom_classes": [], + "description": "Iteration: 100%", + "_model_name": "FloatProgressModel", + "bar_style": "success", + "max": 15228, + "_view_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "value": 15228, + "_view_count": null, + "_view_module_version": "1.5.0", + "orientation": "horizontal", + "min": 0, + "description_tooltip": null, + "_model_module": "@jupyter-widgets/controls", + "layout": "IPY_MODEL_016d7c8318f742c1943464b08232a510" + }, + "model_module_version": "1.5.0" + }, + "04e7e6d291da49d5816dc98a2904e95c": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "state": { + "_view_name": "HTMLView", + "style": "IPY_MODEL_8388e9da9da4492c98c19235ca5fc1b5", + "_dom_classes": [], + "description": "", + "_model_name": "HTMLModel", + "placeholder": "​", + "_view_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "value": " 15228/15228 [2:46:46<00:00, 1.52it/s]", + "_view_count": null, + "_view_module_version": "1.5.0", + "description_tooltip": null, + "_model_module": "@jupyter-widgets/controls", + "layout": "IPY_MODEL_39c23c6a972b419eb2eeeebafeaedc22" + }, + "model_module_version": "1.5.0" + }, + "e7d8c3a4fecd40778e32966b29ea65a1": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "state": { + "_view_name": "StyleView", + "_model_name": "ProgressStyleModel", + "description_width": "initial", + "_view_module": "@jupyter-widgets/base", + "_model_module_version": "1.5.0", + "_view_count": null, + "_view_module_version": "1.2.0", + "bar_color": null, + "_model_module": "@jupyter-widgets/controls" + }, + "model_module_version": "1.5.0" + }, + "016d7c8318f742c1943464b08232a510": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "state": { + "_view_name": "LayoutView", + "grid_template_rows": null, + "right": null, + "justify_content": null, + "_view_module": "@jupyter-widgets/base", + "overflow": null, + "_model_module_version": "1.2.0", + "_view_count": null, + "flex_flow": null, + "width": null, + "min_width": null, + "border": null, + "align_items": null, + "bottom": null, + "_model_module": "@jupyter-widgets/base", + "top": null, + "grid_column": null, + "overflow_y": null, + "overflow_x": null, + "grid_auto_flow": null, + "grid_area": null, + "grid_template_columns": null, + "flex": null, + "_model_name": "LayoutModel", + "justify_items": null, + "grid_row": null, + "max_height": null, + "align_content": null, + "visibility": null, + "align_self": null, + "height": null, + "min_height": null, + "padding": null, + "grid_auto_rows": null, + "grid_gap": null, + "max_width": null, + "order": null, + "_view_module_version": 
"1.2.0", + "grid_template_areas": null, + "object_position": null, + "object_fit": null, + "grid_auto_columns": null, + "margin": null, + "display": null, + "left": null + }, + "model_module_version": "1.2.0" + }, + "8388e9da9da4492c98c19235ca5fc1b5": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "state": { + "_view_name": "StyleView", + "_model_name": "DescriptionStyleModel", + "description_width": "", + "_view_module": "@jupyter-widgets/base", + "_model_module_version": "1.5.0", + "_view_count": null, + "_view_module_version": "1.2.0", + "_model_module": "@jupyter-widgets/controls" + }, + "model_module_version": "1.5.0" + }, + "39c23c6a972b419eb2eeeebafeaedc22": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "state": { + "_view_name": "LayoutView", + "grid_template_rows": null, + "right": null, + "justify_content": null, + "_view_module": "@jupyter-widgets/base", + "overflow": null, + "_model_module_version": "1.2.0", + "_view_count": null, + "flex_flow": null, + "width": null, + "min_width": null, + "border": null, + "align_items": null, + "bottom": null, + "_model_module": "@jupyter-widgets/base", + "top": null, + "grid_column": null, + "overflow_y": null, + "overflow_x": null, + "grid_auto_flow": null, + "grid_area": null, + "grid_template_columns": null, + "flex": null, + "_model_name": "LayoutModel", + "justify_items": null, + "grid_row": null, + "max_height": null, + "align_content": null, + "visibility": null, + "align_self": null, + "height": null, + "min_height": null, + "padding": null, + "grid_auto_rows": null, + "grid_gap": null, + "max_width": null, + "order": null, + "_view_module_version": "1.2.0", + "grid_template_areas": null, + "object_position": null, + "object_fit": null, + "grid_auto_columns": null, + "margin": null, + "display": null, + "left": null + }, + "model_module_version": "1.2.0" + } + } + } + }, + "cells": [ + { + "cell_type": "code", + "metadata": { + "id": "e67Ut53QYEdU", + "cellView": "form", + "outputId": "437871b8-b8ac-4eaf-c2e1-61d801c5e6b2", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 100 + } + }, + "source": [ + "#@title\n", + "%%html\n", + "
\n", + " Notebook written in collaboration with Aditya Malte.\n", + "
\n", + " The Notebook is on GitHub, so contributions are more than welcome.\n", + "
\n", + "
\n", + "
\n", + " Aditya wrote another notebook with a slightly different use case and methodology, please check it out.\n", + "
\n", + " \n", + " https://gist.github.com/aditya-malte/2d4f896f471be9c38eb4d723a710768b\n", + " \n", + "
\n" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
\n", + " Notebook written in collaboration with Aditya Malte.\n", + "
\n", + " The Notebook is on GitHub, so contributions are more than welcome.\n", + "
\n", + "
\n", + "
\n", + " Aditya wrote another notebook with a slightly different use case and methodology, please check it out.\n", + "
\n", + " \n", + " https://gist.github.com/aditya-malte/2d4f896f471be9c38eb4d723a710768b\n", + " \n", + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": { + "tags": [] + } + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "M1oqh0F6W3ad" + }, + "source": [ + "# How to train a new language model from scratch using Transformers and Tokenizers\n", + "\n", + "### Notebook edition (link to blogpost [link](https://huggingface.co/blog/how-to-train)). Last update May 15, 2020\n", + "\n", + "\n", + "Over the past few months, we made several improvements to our [`transformers`](https://github.com/huggingface/transformers) and [`tokenizers`](https://github.com/huggingface/tokenizers) libraries, with the goal of making it easier than ever to **train a new language model from scratch**.\n", + "\n", + "In this post we’ll demo how to train a “small” model (84 M parameters = 6 layers, 768 hidden size, 12 attention heads) – that’s the same number of layers & heads as DistilBERT – on **Esperanto**. We’ll then fine-tune the model on a downstream task of part-of-speech tagging.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "oK7PPVm2XBgr" + }, + "source": [ + "## 1. Find a dataset\n", + "\n", + "First, let us find a corpus of text in Esperanto. Here we’ll use the Esperanto portion of the [OSCAR corpus](https://traces1.inria.fr/oscar/) from INRIA.\n", + "OSCAR is a huge multilingual corpus obtained by language classification and filtering of [Common Crawl](https://commoncrawl.org/) dumps of the Web.\n", + "\n", + "\n", + "\n", + "The Esperanto portion of the dataset is only 299M, so we’ll concatenate with the Esperanto sub-corpus of the [Leipzig Corpora Collection](https://wortschatz.uni-leipzig.de/en/download), which is comprised of text from diverse sources like news, literature, and wikipedia.\n", + "\n", + "The final training corpus has a size of 3 GB, which is still small – for your model, you will get better results the more data you can get to pretrain on.\n", + "\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "HOk4iZ9YZvec" + }, + "source": [ + "# in this notebook we'll only get one of the files (the Oscar one) for the sake of simplicity and performance\n", + "!wget -c https://cdn-datasets.huggingface.co/EsperBERTo/data/oscar.eo.txt" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "G-kkz81OY6xH" + }, + "source": [ + "## 2. Train a tokenizer\n", + "\n", + "We choose to train a byte-level Byte-pair encoding tokenizer (the same as GPT-2), with the same special tokens as RoBERTa. 
Let’s arbitrarily pick its size to be 52,000.\n", + "\n", + "We recommend training a byte-level BPE (rather than, let’s say, a WordPiece tokenizer like BERT) because it will start building its vocabulary from an alphabet of single bytes, so all words will be decomposable into tokens (no more `<unk>` tokens!).\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "5duRggBRZKvP" + }, + "source": [ + "# We won't need TensorFlow here\n", + "!pip uninstall -y tensorflow\n", + "# Install `transformers` from master\n", + "!pip install git+https://github.com/huggingface/transformers\n", + "!pip list | grep -E 'transformers|tokenizers'\n", + "# transformers version at notebook update --- 2.11.0\n", + "# tokenizers version at notebook update --- 0.8.0rc1" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "IMnymRDLe0hi", + "outputId": "4d26476f-e6b5-475a-a0c1-41b6fcdc041a", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 52 + } + }, + "source": [ + "%%time\n", + "from pathlib import Path\n", + "\n", + "from tokenizers import ByteLevelBPETokenizer\n", + "\n", + "paths = [str(x) for x in Path(\".\").glob(\"**/*.txt\")]\n", + "\n", + "# Initialize a tokenizer\n", + "tokenizer = ByteLevelBPETokenizer()\n", + "\n", + "# Customize training\n", + "tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2, special_tokens=[\n", + " \"<s>\",\n", + " \"<pad>\",\n", + " \"</s>\",\n", + " \"<unk>\",\n", + " \"<mask>\",\n", + "])" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "CPU times: user 4min, sys: 3min 7s, total: 7min 7s\n", + "Wall time: 2min 25s\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6Ei7bqpRf1LH" + }, + "source": [ + "Now let's save files to disk" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "EIS-irI0f32P", + "outputId": "e86c4a24-eb65-4f0a-aa58-ed1931a05ac9", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + } + }, + "source": [ + "!mkdir EsperBERTo\n", + "tokenizer.save_model(\"EsperBERTo\")" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['EsperBERTo/vocab.json', 'EsperBERTo/merges.txt']" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 4 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "lOOfYSuQhSqT" + }, + "source": [ + "🔥🔥 Wow, that was fast! ⚡️🔥\n", + "\n", + "We now have both a `vocab.json`, which is a list of the most frequent tokens ranked by frequency, and a `merges.txt` list of merges.\n", + "\n", + "```json\n", + "{\n", + "\t\"<s>\": 0,\n", + "\t\"<pad>\": 1,\n", + "\t\"</s>\": 2,\n", + "\t\"<unk>\": 3,\n", + "\t\"<mask>\": 4,\n", + "\t\"!\": 5,\n", + "\t\"\\\"\": 6,\n", + "\t\"#\": 7,\n", + "\t\"$\": 8,\n", + "\t\"%\": 9,\n", + "\t\"&\": 10,\n", + "\t\"'\": 11,\n", + "\t\"(\": 12,\n", + "\t\")\": 13,\n", + "\t# ...\n", + "}\n", + "\n", + "# merges.txt\n", + "l a\n", + "Ġ k\n", + "o n\n", + "Ġ la\n", + "t a\n", + "Ġ e\n", + "Ġ d\n", + "Ġ p\n", + "# ...\n", + "```\n", + "\n", + "What is great is that our tokenizer is optimized for Esperanto. Compared to a generic tokenizer trained for English, more native words are represented by a single, unsplit token. Diacritics, i.e. accented characters used in Esperanto – `ĉ`, `ĝ`, `ĥ`, `ĵ`, `ŝ`, and `ŭ` – are encoded natively. We also represent sequences in a more efficient manner. 
Here on this corpus, the average length of encoded sequences is ~30% smaller than when using the pretrained GPT-2 tokenizer.\n", + "\n", + "Here’s how you can use it in `tokenizers`, including handling the RoBERTa special tokens – of course, you’ll also be able to use it directly from `transformers`.\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "tKVWB8WShT-z" + }, + "source": [ + "from tokenizers.implementations import ByteLevelBPETokenizer\n", + "from tokenizers.processors import BertProcessing\n", + "\n", + "\n", + "tokenizer = ByteLevelBPETokenizer(\n", + " \"./EsperBERTo/vocab.json\",\n", + " \"./EsperBERTo/merges.txt\",\n", + ")" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "hO5M3vrAhcuj" + }, + "source": [ + "tokenizer._tokenizer.post_processor = BertProcessing(\n", + " (\"</s>\", tokenizer.token_to_id(\"</s>\")),\n", + " (\"<s>\", tokenizer.token_to_id(\"<s>\")),\n", + ")\n", + "tokenizer.enable_truncation(max_length=512)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "E3Ye27nchfzq", + "outputId": "b9812ed2-1ecd-4e1b-d9bd-7de581955e70", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + } + }, + "source": [ + "tokenizer.encode(\"Mi estas Julien.\")" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "Encoding(num_tokens=7, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 10 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "X8ya5_7rhjKS", + "outputId": "e9e08ded-1081-4823-dd81-9d6be1255385", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + } + }, + "source": [ + "tokenizer.encode(\"Mi estas Julien.\").tokens" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['<s>', 'Mi', 'Ġestas', 'ĠJuli', 'en', '.', '</s>']" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 11 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WQpUC_CDhnWW" + }, + "source": [ + "## 3. Train a language model from scratch\n", + "\n", + "**Update:** This section follows along the [`run_language_modeling.py`](https://github.com/huggingface/transformers/blob/master/examples/legacy/run_language_modeling.py) script, using our new [`Trainer`](https://github.com/huggingface/transformers/blob/master/src/transformers/trainer.py) directly. Feel free to pick the approach you like best.\n", + "\n", + "> We’ll train a RoBERTa-like model, which is a BERT-like model with a couple of changes (check the [documentation](https://huggingface.co/transformers/model_doc/roberta.html) for more details).\n", + "\n", + "As the model is BERT-like, we’ll train it on a task of *Masked language modeling*, i.e. predicting how to fill arbitrary tokens that we randomly mask in the dataset. 
This is taken care of by the example script.\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "kD140sFjh0LQ", + "outputId": "0bab1f9e-bf7a-4f13-82d3-07fe5866ce78", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 318 + } + }, + "source": [ + "# Check that we have a GPU\n", + "!nvidia-smi" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Fri May 15 21:17:12 2020 \n", + "+-----------------------------------------------------------------------------+\n", + "| NVIDIA-SMI 440.82 Driver Version: 418.67 CUDA Version: 10.1 |\n", + "|-------------------------------+----------------------+----------------------+\n", + "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", + "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", + "|===============================+======================+======================|\n", + "| 0 Tesla P100-PCIE... Off | 00000000:00:04.0 Off | 0 |\n", + "| N/A 38C P0 26W / 250W | 0MiB / 16280MiB | 0% Default |\n", + "+-------------------------------+----------------------+----------------------+\n", + " \n", + "+-----------------------------------------------------------------------------+\n", + "| Processes: GPU Memory |\n", + "| GPU PID Type Process name Usage |\n", + "|=============================================================================|\n", + "| No running processes found |\n", + "+-----------------------------------------------------------------------------+\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "VNZZs-r6iKAV", + "outputId": "c8404d6c-7662-4240-c8da-ee89edfaf51b", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + } + }, + "source": [ + "# Check that PyTorch sees it\n", + "import torch\n", + "torch.cuda.is_available()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "True" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 6 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "u0qQzgrBi1OX" + }, + "source": [ + "### We'll define the following config for the model" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "LTXXutqeDzPi" + }, + "source": [ + "from transformers import RobertaConfig\n", + "\n", + "config = RobertaConfig(\n", + " vocab_size=52_000,\n", + " max_position_embeddings=514,\n", + " num_attention_heads=12,\n", + " num_hidden_layers=6,\n", + " type_vocab_size=1,\n", + ")" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "yAwQ82JiE5pi" + }, + "source": [ + "Now let's re-create our tokenizer in transformers" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "4keFBUjQFOD1" + }, + "source": [ + "from transformers import RobertaTokenizerFast\n", + "\n", + "tokenizer = RobertaTokenizerFast.from_pretrained(\"./EsperBERTo\", max_len=512)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6yNCw-3hFv9h" + }, + "source": [ + "Finally let's initialize our model.\n", + "\n", + "**Important:**\n", + "\n", + "As we are training from scratch, we only initialize from a config, not from an existing pretrained model or checkpoint." 
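, + "\n", + "As a rough sanity check (a back-of-the-envelope estimate added here, not part of the original recipe), the parameter count printed a couple of cells below can be read off this config: token embeddings (52,000 × 768) plus position embeddings (514 × 768) and six Transformer blocks of roughly 7.1M weights each account for almost all of the ~84M parameters, with biases, LayerNorms and the LM head making up the rest.\n", + "\n", + "```python\n", + "# Back-of-the-envelope estimate (weight matrices only; ignores biases, LayerNorms and the LM head)\n", + "hidden, layers, vocab, max_pos = 768, 6, 52_000, 514\n", + "embeddings = vocab * hidden + max_pos * hidden                # ~40.3M\n", + "per_layer = 4 * hidden * hidden + 2 * hidden * (4 * hidden)   # attention + feed-forward, ~7.1M per layer\n", + "print(embeddings + layers * per_layer)                        # 82798080, close to the ~84M reported below\n", + "```"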
+ ] + }, + { + "cell_type": "code", + "metadata": { + "id": "BzMqR-dzF4Ro" + }, + "source": [ + "from transformers import RobertaForMaskedLM\n", + "\n", + "model = RobertaForMaskedLM(config=config)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "jU6JhBSTKiaM", + "outputId": "35879a60-2915-4894-f702-2d649cfa398a", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + } + }, + "source": [ + "model.num_parameters()\n", + "# => 84 million parameters" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "84095008" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 10 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jBtUHRMliOLM" + }, + "source": [ + "### Now let's build our training Dataset\n", + "\n", + "We'll build our dataset by applying our tokenizer to our text file.\n", + "\n", + "Here, as we only have one text file, we don't even need to customize our `Dataset`. We'll just use the `LineByLineTextDataset` out-of-the-box." + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "GlvP_A-THEEl", + "outputId": "e0510a33-7937-4a04-fa1c-d4e20b758bb2", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 52 + } + }, + "source": [ + "%%time\n", + "from transformers import LineByLineTextDataset\n", + "\n", + "dataset = LineByLineTextDataset(\n", + " tokenizer=tokenizer,\n", + " file_path=\"./oscar.eo.txt\",\n", + " block_size=128,\n", + ")" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "CPU times: user 4min 54s, sys: 2.98 s, total: 4min 57s\n", + "Wall time: 1min 37s\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hDLs73HcIHk5" + }, + "source": [ + "Like in the [`run_language_modeling.py`](https://github.com/huggingface/transformers/blob/master/examples/language-modeling/run_language_modeling.py) script, we need to define a data_collator.\n", + "\n", + "This is just a small helper that will help us batch different samples of the dataset together into an object that PyTorch knows how to perform backprop on."
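, + "\n", + "Concretely, with `mlm_probability=0.15` the collator picks ~15% of the token positions in each batch and, following the BERT recipe, replaces 80% of them with `<mask>`, 10% with a random token and leaves 10% unchanged; labels at all non-picked positions are set to -100 so they are ignored by the loss. The snippet below is only an illustration of that idea, not the library implementation (it skips the random-token branch, special-token protection and padding):\n", + "\n", + "```python\n", + "import torch\n", + "\n", + "input_ids = torch.tensor([0, 345, 1203, 18, 2])  # <s> ... </s> (made-up token ids)\n", + "labels = input_ids.clone()\n", + "\n", + "picked = torch.bernoulli(torch.full(labels.shape, 0.15)).bool()  # ~15% of positions\n", + "labels[~picked] = -100                                           # loss is computed only on picked positions\n", + "\n", + "masked = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & picked\n", + "input_ids[masked] = 4                                            # 4 = id of <mask> in our vocab\n", + "```"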
+ ] + }, + { + "cell_type": "code", + "metadata": { + "id": "zTgWPa9Dipk2" + }, + "source": [ + "from transformers import DataCollatorForLanguageModeling\n", + "\n", + "data_collator = DataCollatorForLanguageModeling(\n", + " tokenizer=tokenizer, mlm=True, mlm_probability=0.15\n", + ")" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ri2BIQKqjfHm" + }, + "source": [ + "### Finally, we are all set to initialize our Trainer" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "YpvnFFmZJD-N" + }, + "source": [ + "from transformers import Trainer, TrainingArguments\n", + "\n", + "training_args = TrainingArguments(\n", + " output_dir=\"./EsperBERTo\",\n", + " overwrite_output_dir=True,\n", + " num_train_epochs=1,\n", + " per_gpu_train_batch_size=64,\n", + " save_steps=10_000,\n", + " save_total_limit=2,\n", + " prediction_loss_only=True,\n", + ")\n", + "\n", + "trainer = Trainer(\n", + " model=model,\n", + " args=training_args,\n", + " data_collator=data_collator,\n", + " train_dataset=dataset,\n", + ")" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "o6sASa36Nf-N" + }, + "source": [ + "### Start training" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "VmaHZXzmkNtJ", + "outputId": "a19880cb-bcc6-4885-bf24-c2c6d0f56d1e", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 738, + "referenced_widgets": [ + "a58a66392b644b1384661e850c077a6c", + "a491e8caa0a048beb3b5259f14eb233f", + "837c9ddc3d594e088891874560c646b8", + "dbf50873d62c4ba39321faefbed0cca5", + "40bf955ba0284e84b198da6be8654219", + "fe20a8dae6e84628b5076d02183090f5", + "93b3f9eae3cb4e3e859cf456e3547c6d", + "6feb10aeb43147e6aba028d065947ae8", + "0989d41a4da24e9ebff377e02127642c", + "42c6061ef7e44f179db5a6e3551c0f17", + "d295dd80550447d88da0f04ce36a22ff", + "04e7e6d291da49d5816dc98a2904e95c", + "e7d8c3a4fecd40778e32966b29ea65a1", + "016d7c8318f742c1943464b08232a510", + "8388e9da9da4492c98c19235ca5fc1b5", + "39c23c6a972b419eb2eeeebafeaedc22" + ] + } + }, + "source": [ + "%%time\n", + "trainer.train()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "display_data", + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "a58a66392b644b1384661e850c077a6c", + "version_minor": 0, + "version_major": 2 + }, + "text/plain": [ + "HBox(children=(FloatProgress(value=0.0, description='Epoch', max=1.0, style=ProgressStyle(description_width='i…" + ] + }, + "metadata": { + "tags": [] + } + }, + { + "output_type": "display_data", + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "0989d41a4da24e9ebff377e02127642c", + "version_minor": 0, + "version_major": 2 + }, + "text/plain": [ + "HBox(children=(FloatProgress(value=0.0, description='Iteration', max=15228.0, style=ProgressStyle(description_…" + ] + }, + "metadata": { + "tags": [] + } + }, + { + "output_type": "stream", + "text": [ + "{\"loss\": 7.152712148666382, \"learning_rate\": 4.8358287365379566e-05, \"epoch\": 0.03283425269240872, \"step\": 500}\n", + "{\"loss\": 6.928811420440674, \"learning_rate\": 4.671657473075913e-05, \"epoch\": 0.06566850538481744, \"step\": 1000}\n", + "{\"loss\": 6.789419063568115, \"learning_rate\": 4.5074862096138694e-05, \"epoch\": 0.09850275807722617, \"step\": 1500}\n", + "{\"loss\": 6.688932447433472, \"learning_rate\": 4.343314946151826e-05, \"epoch\": 0.1313370107696349, \"step\": 2000}\n", + "{\"loss\": 6.595982004165649, \"learning_rate\": 
4.179143682689782e-05, \"epoch\": 0.1641712634620436, \"step\": 2500}\n", + "{\"loss\": 6.545944199562073, \"learning_rate\": 4.0149724192277385e-05, \"epoch\": 0.19700551615445233, \"step\": 3000}\n", + "{\"loss\": 6.4864857263565066, \"learning_rate\": 3.850801155765695e-05, \"epoch\": 0.22983976884686105, \"step\": 3500}\n", + "{\"loss\": 6.412427802085876, \"learning_rate\": 3.686629892303651e-05, \"epoch\": 0.2626740215392698, \"step\": 4000}\n", + "{\"loss\": 6.363630670547486, \"learning_rate\": 3.522458628841608e-05, \"epoch\": 0.29550827423167847, \"step\": 4500}\n", + "{\"loss\": 6.273832890510559, \"learning_rate\": 3.358287365379564e-05, \"epoch\": 0.3283425269240872, \"step\": 5000}\n", + "{\"loss\": 6.197585330963134, \"learning_rate\": 3.1941161019175205e-05, \"epoch\": 0.3611767796164959, \"step\": 5500}\n", + "{\"loss\": 6.097779376983643, \"learning_rate\": 3.029944838455477e-05, \"epoch\": 0.39401103230890466, \"step\": 6000}\n", + "{\"loss\": 5.985456382751464, \"learning_rate\": 2.8657735749934332e-05, \"epoch\": 0.42684528500131336, \"step\": 6500}\n", + "{\"loss\": 5.8448616371154785, \"learning_rate\": 2.70160231153139e-05, \"epoch\": 0.4596795376937221, \"step\": 7000}\n", + "{\"loss\": 5.692522863388062, \"learning_rate\": 2.5374310480693457e-05, \"epoch\": 0.4925137903861308, \"step\": 7500}\n", + "{\"loss\": 5.562082152366639, \"learning_rate\": 2.3732597846073024e-05, \"epoch\": 0.5253480430785396, \"step\": 8000}\n", + "{\"loss\": 5.457240365982056, \"learning_rate\": 2.2090885211452588e-05, \"epoch\": 0.5581822957709482, \"step\": 8500}\n", + "{\"loss\": 5.376953645706177, \"learning_rate\": 2.0449172576832152e-05, \"epoch\": 0.5910165484633569, \"step\": 9000}\n", + "{\"loss\": 5.298609251022339, \"learning_rate\": 1.8807459942211716e-05, \"epoch\": 0.6238508011557657, \"step\": 9500}\n", + "{\"loss\": 5.225468152046203, \"learning_rate\": 1.716574730759128e-05, \"epoch\": 0.6566850538481744, \"step\": 10000}\n", + "{\"loss\": 5.174519973754883, \"learning_rate\": 1.5524034672970843e-05, \"epoch\": 0.6895193065405831, \"step\": 10500}\n", + "{\"loss\": 5.113943946838379, \"learning_rate\": 1.3882322038350407e-05, \"epoch\": 0.7223535592329918, \"step\": 11000}\n", + "{\"loss\": 5.08140989112854, \"learning_rate\": 1.2240609403729971e-05, \"epoch\": 0.7551878119254006, \"step\": 11500}\n", + "{\"loss\": 5.072491912841797, \"learning_rate\": 1.0598896769109535e-05, \"epoch\": 0.7880220646178093, \"step\": 12000}\n", + "{\"loss\": 5.012459496498108, \"learning_rate\": 8.957184134489099e-06, \"epoch\": 0.820856317310218, \"step\": 12500}\n", + "{\"loss\": 4.999591351509094, \"learning_rate\": 7.315471499868663e-06, \"epoch\": 0.8536905700026267, \"step\": 13000}\n", + "{\"loss\": 4.994838352203369, \"learning_rate\": 5.673758865248227e-06, \"epoch\": 0.8865248226950354, \"step\": 13500}\n", + "{\"loss\": 4.955870885848999, \"learning_rate\": 4.032046230627791e-06, \"epoch\": 0.9193590753874442, \"step\": 14000}\n", + "{\"loss\": 4.941655583381653, \"learning_rate\": 2.390333596007355e-06, \"epoch\": 0.9521933280798529, \"step\": 14500}\n", + "{\"loss\": 4.931783639907837, \"learning_rate\": 7.486209613869189e-07, \"epoch\": 0.9850275807722616, \"step\": 15000}\n", + "\n", + "\n", + "CPU times: user 1h 43min 36s, sys: 1h 3min 28s, total: 2h 47min 4s\n", + "Wall time: 2h 46min 46s\n" + ], + "name": "stdout" + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "TrainOutput(global_step=15228, training_loss=5.762423221226405)" + ] + }, + 
"metadata": { + "tags": [] + }, + "execution_count": 18 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_ZkooHz1-_2h" + }, + "source": [ + "#### 🎉 Save final model (+ tokenizer + config) to disk" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "QDNgPls7_l13" + }, + "source": [ + "trainer.save_model(\"./EsperBERTo\")" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "d0caceCy_p1-" + }, + "source": [ + "## 4. Check that the LM actually trained" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "iIQJ8ND_AEhl" + }, + "source": [ + "Aside from looking at the training and eval losses going down, the easiest way to check whether our language model is learning anything interesting is via the `FillMaskPipeline`.\n", + "\n", + "Pipelines are simple wrappers around tokenizers and models, and the 'fill-mask' one will let you input a sequence containing a masked token (here, ``) and return a list of the most probable filled sequences, with their probabilities.\n", + "\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "ltXgXyCbAJLY" + }, + "source": [ + "from transformers import pipeline\n", + "\n", + "fill_mask = pipeline(\n", + " \"fill-mask\",\n", + " model=\"./EsperBERTo\",\n", + " tokenizer=\"./EsperBERTo\"\n", + ")" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "UIvgZ3S6AO0z", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 283 + }, + "outputId": "5f3d2f00-abdc-44a9-9c1b-75e3ec328576" + }, + "source": [ + "# The sun .\n", + "# =>\n", + "\n", + "fill_mask(\"La suno .\")" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "[{'score': 0.02119220793247223,\n", + " 'sequence': ' La suno estas.',\n", + " 'token': 316},\n", + " {'score': 0.012403824366629124,\n", + " 'sequence': ' La suno situas.',\n", + " 'token': 2340},\n", + " {'score': 0.011061107739806175,\n", + " 'sequence': ' La suno estis.',\n", + " 'token': 394},\n", + " {'score': 0.008284995332360268,\n", + " 'sequence': ' La suno de.',\n", + " 'token': 274},\n", + " {'score': 0.006471084896475077,\n", + " 'sequence': ' La suno akvo.',\n", + " 'token': 1833}]" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 36 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "i0qCyyhNAWZi" + }, + "source": [ + "Ok, simple syntax/grammar works. 
Let’s try a slightly more interesting prompt:\n", + "\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "YZ9HSQxAAbme", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 283 + }, + "outputId": "aabfeedc-b1d0-4837-b01d-cd42726a5a3d" + }, + "source": [ + "fill_mask(\"Jen la komenco de bela <mask>.\")\n", + "\n", + "# This is the beginning of a beautiful <mask>.\n", + "# =>" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "[{'score': 0.01814725436270237,\n", + " 'sequence': '<s> Jen la komenco de bela urbo.</s>',\n", + " 'token': 871},\n", + " {'score': 0.015888698399066925,\n", + " 'sequence': '<s> Jen la komenco de bela vivo.</s>',\n", + " 'token': 1160},\n", + " {'score': 0.015662025660276413,\n", + " 'sequence': '<s> Jen la komenco de bela tempo.</s>',\n", + " 'token': 1021},\n", + " {'score': 0.015555007383227348,\n", + " 'sequence': '<s> Jen la komenco de bela mondo.</s>',\n", + " 'token': 945},\n", + " {'score': 0.01412549614906311,\n", + " 'sequence': '<s> Jen la komenco de bela tago.</s>',\n", + " 'token': 1633}]" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 37 + } + ] + } + ] +} \ No newline at end of file