\n",
+ " sys.exit(run_main())\n",
+ " File \"/usr/local/lib/python3.6/dist-packages/tensorboard/main.py\", line 66, in run_main\n",
+ " app.run(tensorboard.main, flags_parser=tensorboard.configure)\n",
+ " File \"/usr/local/lib/python3.6/dist-packages/absl/app.py\", line 299, in run\n",
+ " _run_main(main, args)\n",
+ " File \"/usr/local/lib/python3.6/dist-packages/absl/app.py\", line 250, in _run_main\n",
+ " sys.exit(main(argv))\n",
+ " File \"/usr/local/lib/python3.6/dist-packages/tensorboard/program.py\", line 268, in main\n",
+ " return runner(self.flags) or 0\n",
+ " File \"/usr/local/lib/python3.6/dist-packages/tensorboard/uploader/uploader_main.py\", line 579, in run\n",
+ " return _run(flags)\n",
+ " File \"/usr/local/lib/python3.6/dist-packages/tensorboard/uploader/uploader_main.py\", line 259, in _run\n",
+ " intent.execute(server_info, channel)\n",
+ " File \"/usr/local/lib/python3.6/dist-packages/tensorboard/uploader/uploader_main.py\", line 431, in execute\n",
+ " print()\n",
+ "KeyboardInterrupt\n",
+ "^C\n"
+ ],
+ "name": "stdout"
+ }
+ ]
+ }
+ ]
+}
\ No newline at end of file
diff --git a/NLP/src/train_esperbert.ipynb b/NLP/src/train_esperbert.ipynb
new file mode 100644
index 00000000..d12e954c
--- /dev/null
+++ b/NLP/src/train_esperbert.ipynb
@@ -0,0 +1,1472 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+ "colab": {
+ "name": "01_how-to-train.ipynb",
+ "provenance": [],
+ "toc_visible": true,
+ "machine_shape": "hm"
+ },
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3"
+ },
+ "accelerator": "GPU",
+ "widgets": {
+ "application/vnd.jupyter.widget-state+json": {
+ "a58a66392b644b1384661e850c077a6c": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_name": "HBoxModel",
+ "state": {
+ "_view_name": "HBoxView",
+ "_dom_classes": [],
+ "_model_name": "HBoxModel",
+ "_view_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_view_count": null,
+ "_view_module_version": "1.5.0",
+ "box_style": "",
+ "layout": "IPY_MODEL_a491e8caa0a048beb3b5259f14eb233f",
+ "_model_module": "@jupyter-widgets/controls",
+ "children": [
+ "IPY_MODEL_837c9ddc3d594e088891874560c646b8",
+ "IPY_MODEL_dbf50873d62c4ba39321faefbed0cca5"
+ ]
+ },
+ "model_module_version": "1.5.0"
+ },
+ "a491e8caa0a048beb3b5259f14eb233f": {
+ "model_module": "@jupyter-widgets/base",
+ "model_name": "LayoutModel",
+ "state": {
+ "_view_name": "LayoutView",
+ "grid_template_rows": null,
+ "right": null,
+ "justify_content": null,
+ "_view_module": "@jupyter-widgets/base",
+ "overflow": null,
+ "_model_module_version": "1.2.0",
+ "_view_count": null,
+ "flex_flow": null,
+ "width": null,
+ "min_width": null,
+ "border": null,
+ "align_items": null,
+ "bottom": null,
+ "_model_module": "@jupyter-widgets/base",
+ "top": null,
+ "grid_column": null,
+ "overflow_y": null,
+ "overflow_x": null,
+ "grid_auto_flow": null,
+ "grid_area": null,
+ "grid_template_columns": null,
+ "flex": null,
+ "_model_name": "LayoutModel",
+ "justify_items": null,
+ "grid_row": null,
+ "max_height": null,
+ "align_content": null,
+ "visibility": null,
+ "align_self": null,
+ "height": null,
+ "min_height": null,
+ "padding": null,
+ "grid_auto_rows": null,
+ "grid_gap": null,
+ "max_width": null,
+ "order": null,
+ "_view_module_version": "1.2.0",
+ "grid_template_areas": null,
+ "object_position": null,
+ "object_fit": null,
+ "grid_auto_columns": null,
+ "margin": null,
+ "display": null,
+ "left": null
+ },
+ "model_module_version": "1.2.0"
+ },
+ "837c9ddc3d594e088891874560c646b8": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_name": "FloatProgressModel",
+ "state": {
+ "_view_name": "ProgressView",
+ "style": "IPY_MODEL_40bf955ba0284e84b198da6be8654219",
+ "_dom_classes": [],
+ "description": "Epoch: 100%",
+ "_model_name": "FloatProgressModel",
+ "bar_style": "success",
+ "max": 1,
+ "_view_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "value": 1,
+ "_view_count": null,
+ "_view_module_version": "1.5.0",
+ "orientation": "horizontal",
+ "min": 0,
+ "description_tooltip": null,
+ "_model_module": "@jupyter-widgets/controls",
+ "layout": "IPY_MODEL_fe20a8dae6e84628b5076d02183090f5"
+ },
+ "model_module_version": "1.5.0"
+ },
+ "dbf50873d62c4ba39321faefbed0cca5": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_name": "HTMLModel",
+ "state": {
+ "_view_name": "HTMLView",
+ "style": "IPY_MODEL_93b3f9eae3cb4e3e859cf456e3547c6d",
+ "_dom_classes": [],
+ "description": "",
+ "_model_name": "HTMLModel",
+ "placeholder": "",
+ "_view_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "value": " 1/1 [2:46:46<00:00, 10006.17s/it]",
+ "_view_count": null,
+ "_view_module_version": "1.5.0",
+ "description_tooltip": null,
+ "_model_module": "@jupyter-widgets/controls",
+ "layout": "IPY_MODEL_6feb10aeb43147e6aba028d065947ae8"
+ },
+ "model_module_version": "1.5.0"
+ },
+ "40bf955ba0284e84b198da6be8654219": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_name": "ProgressStyleModel",
+ "state": {
+ "_view_name": "StyleView",
+ "_model_name": "ProgressStyleModel",
+ "description_width": "initial",
+ "_view_module": "@jupyter-widgets/base",
+ "_model_module_version": "1.5.0",
+ "_view_count": null,
+ "_view_module_version": "1.2.0",
+ "bar_color": null,
+ "_model_module": "@jupyter-widgets/controls"
+ },
+ "model_module_version": "1.5.0"
+ },
+ "fe20a8dae6e84628b5076d02183090f5": {
+ "model_module": "@jupyter-widgets/base",
+ "model_name": "LayoutModel",
+ "state": {
+ "_view_name": "LayoutView",
+ "grid_template_rows": null,
+ "right": null,
+ "justify_content": null,
+ "_view_module": "@jupyter-widgets/base",
+ "overflow": null,
+ "_model_module_version": "1.2.0",
+ "_view_count": null,
+ "flex_flow": null,
+ "width": null,
+ "min_width": null,
+ "border": null,
+ "align_items": null,
+ "bottom": null,
+ "_model_module": "@jupyter-widgets/base",
+ "top": null,
+ "grid_column": null,
+ "overflow_y": null,
+ "overflow_x": null,
+ "grid_auto_flow": null,
+ "grid_area": null,
+ "grid_template_columns": null,
+ "flex": null,
+ "_model_name": "LayoutModel",
+ "justify_items": null,
+ "grid_row": null,
+ "max_height": null,
+ "align_content": null,
+ "visibility": null,
+ "align_self": null,
+ "height": null,
+ "min_height": null,
+ "padding": null,
+ "grid_auto_rows": null,
+ "grid_gap": null,
+ "max_width": null,
+ "order": null,
+ "_view_module_version": "1.2.0",
+ "grid_template_areas": null,
+ "object_position": null,
+ "object_fit": null,
+ "grid_auto_columns": null,
+ "margin": null,
+ "display": null,
+ "left": null
+ },
+ "model_module_version": "1.2.0"
+ },
+ "93b3f9eae3cb4e3e859cf456e3547c6d": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_name": "DescriptionStyleModel",
+ "state": {
+ "_view_name": "StyleView",
+ "_model_name": "DescriptionStyleModel",
+ "description_width": "",
+ "_view_module": "@jupyter-widgets/base",
+ "_model_module_version": "1.5.0",
+ "_view_count": null,
+ "_view_module_version": "1.2.0",
+ "_model_module": "@jupyter-widgets/controls"
+ },
+ "model_module_version": "1.5.0"
+ },
+ "6feb10aeb43147e6aba028d065947ae8": {
+ "model_module": "@jupyter-widgets/base",
+ "model_name": "LayoutModel",
+ "state": {
+ "_view_name": "LayoutView",
+ "grid_template_rows": null,
+ "right": null,
+ "justify_content": null,
+ "_view_module": "@jupyter-widgets/base",
+ "overflow": null,
+ "_model_module_version": "1.2.0",
+ "_view_count": null,
+ "flex_flow": null,
+ "width": null,
+ "min_width": null,
+ "border": null,
+ "align_items": null,
+ "bottom": null,
+ "_model_module": "@jupyter-widgets/base",
+ "top": null,
+ "grid_column": null,
+ "overflow_y": null,
+ "overflow_x": null,
+ "grid_auto_flow": null,
+ "grid_area": null,
+ "grid_template_columns": null,
+ "flex": null,
+ "_model_name": "LayoutModel",
+ "justify_items": null,
+ "grid_row": null,
+ "max_height": null,
+ "align_content": null,
+ "visibility": null,
+ "align_self": null,
+ "height": null,
+ "min_height": null,
+ "padding": null,
+ "grid_auto_rows": null,
+ "grid_gap": null,
+ "max_width": null,
+ "order": null,
+ "_view_module_version": "1.2.0",
+ "grid_template_areas": null,
+ "object_position": null,
+ "object_fit": null,
+ "grid_auto_columns": null,
+ "margin": null,
+ "display": null,
+ "left": null
+ },
+ "model_module_version": "1.2.0"
+ },
+ "0989d41a4da24e9ebff377e02127642c": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_name": "HBoxModel",
+ "state": {
+ "_view_name": "HBoxView",
+ "_dom_classes": [],
+ "_model_name": "HBoxModel",
+ "_view_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_view_count": null,
+ "_view_module_version": "1.5.0",
+ "box_style": "",
+ "layout": "IPY_MODEL_42c6061ef7e44f179db5a6e3551c0f17",
+ "_model_module": "@jupyter-widgets/controls",
+ "children": [
+ "IPY_MODEL_d295dd80550447d88da0f04ce36a22ff",
+ "IPY_MODEL_04e7e6d291da49d5816dc98a2904e95c"
+ ]
+ },
+ "model_module_version": "1.5.0"
+ },
+ "42c6061ef7e44f179db5a6e3551c0f17": {
+ "model_module": "@jupyter-widgets/base",
+ "model_name": "LayoutModel",
+ "state": {
+ "_view_name": "LayoutView",
+ "grid_template_rows": null,
+ "right": null,
+ "justify_content": null,
+ "_view_module": "@jupyter-widgets/base",
+ "overflow": null,
+ "_model_module_version": "1.2.0",
+ "_view_count": null,
+ "flex_flow": null,
+ "width": null,
+ "min_width": null,
+ "border": null,
+ "align_items": null,
+ "bottom": null,
+ "_model_module": "@jupyter-widgets/base",
+ "top": null,
+ "grid_column": null,
+ "overflow_y": null,
+ "overflow_x": null,
+ "grid_auto_flow": null,
+ "grid_area": null,
+ "grid_template_columns": null,
+ "flex": null,
+ "_model_name": "LayoutModel",
+ "justify_items": null,
+ "grid_row": null,
+ "max_height": null,
+ "align_content": null,
+ "visibility": null,
+ "align_self": null,
+ "height": null,
+ "min_height": null,
+ "padding": null,
+ "grid_auto_rows": null,
+ "grid_gap": null,
+ "max_width": null,
+ "order": null,
+ "_view_module_version": "1.2.0",
+ "grid_template_areas": null,
+ "object_position": null,
+ "object_fit": null,
+ "grid_auto_columns": null,
+ "margin": null,
+ "display": null,
+ "left": null
+ },
+ "model_module_version": "1.2.0"
+ },
+ "d295dd80550447d88da0f04ce36a22ff": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_name": "FloatProgressModel",
+ "state": {
+ "_view_name": "ProgressView",
+ "style": "IPY_MODEL_e7d8c3a4fecd40778e32966b29ea65a1",
+ "_dom_classes": [],
+ "description": "Iteration: 100%",
+ "_model_name": "FloatProgressModel",
+ "bar_style": "success",
+ "max": 15228,
+ "_view_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "value": 15228,
+ "_view_count": null,
+ "_view_module_version": "1.5.0",
+ "orientation": "horizontal",
+ "min": 0,
+ "description_tooltip": null,
+ "_model_module": "@jupyter-widgets/controls",
+ "layout": "IPY_MODEL_016d7c8318f742c1943464b08232a510"
+ },
+ "model_module_version": "1.5.0"
+ },
+ "04e7e6d291da49d5816dc98a2904e95c": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_name": "HTMLModel",
+ "state": {
+ "_view_name": "HTMLView",
+ "style": "IPY_MODEL_8388e9da9da4492c98c19235ca5fc1b5",
+ "_dom_classes": [],
+ "description": "",
+ "_model_name": "HTMLModel",
+ "placeholder": "",
+ "_view_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "value": " 15228/15228 [2:46:46<00:00, 1.52it/s]",
+ "_view_count": null,
+ "_view_module_version": "1.5.0",
+ "description_tooltip": null,
+ "_model_module": "@jupyter-widgets/controls",
+ "layout": "IPY_MODEL_39c23c6a972b419eb2eeeebafeaedc22"
+ },
+ "model_module_version": "1.5.0"
+ },
+ "e7d8c3a4fecd40778e32966b29ea65a1": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_name": "ProgressStyleModel",
+ "state": {
+ "_view_name": "StyleView",
+ "_model_name": "ProgressStyleModel",
+ "description_width": "initial",
+ "_view_module": "@jupyter-widgets/base",
+ "_model_module_version": "1.5.0",
+ "_view_count": null,
+ "_view_module_version": "1.2.0",
+ "bar_color": null,
+ "_model_module": "@jupyter-widgets/controls"
+ },
+ "model_module_version": "1.5.0"
+ },
+ "016d7c8318f742c1943464b08232a510": {
+ "model_module": "@jupyter-widgets/base",
+ "model_name": "LayoutModel",
+ "state": {
+ "_view_name": "LayoutView",
+ "grid_template_rows": null,
+ "right": null,
+ "justify_content": null,
+ "_view_module": "@jupyter-widgets/base",
+ "overflow": null,
+ "_model_module_version": "1.2.0",
+ "_view_count": null,
+ "flex_flow": null,
+ "width": null,
+ "min_width": null,
+ "border": null,
+ "align_items": null,
+ "bottom": null,
+ "_model_module": "@jupyter-widgets/base",
+ "top": null,
+ "grid_column": null,
+ "overflow_y": null,
+ "overflow_x": null,
+ "grid_auto_flow": null,
+ "grid_area": null,
+ "grid_template_columns": null,
+ "flex": null,
+ "_model_name": "LayoutModel",
+ "justify_items": null,
+ "grid_row": null,
+ "max_height": null,
+ "align_content": null,
+ "visibility": null,
+ "align_self": null,
+ "height": null,
+ "min_height": null,
+ "padding": null,
+ "grid_auto_rows": null,
+ "grid_gap": null,
+ "max_width": null,
+ "order": null,
+ "_view_module_version": "1.2.0",
+ "grid_template_areas": null,
+ "object_position": null,
+ "object_fit": null,
+ "grid_auto_columns": null,
+ "margin": null,
+ "display": null,
+ "left": null
+ },
+ "model_module_version": "1.2.0"
+ },
+ "8388e9da9da4492c98c19235ca5fc1b5": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_name": "DescriptionStyleModel",
+ "state": {
+ "_view_name": "StyleView",
+ "_model_name": "DescriptionStyleModel",
+ "description_width": "",
+ "_view_module": "@jupyter-widgets/base",
+ "_model_module_version": "1.5.0",
+ "_view_count": null,
+ "_view_module_version": "1.2.0",
+ "_model_module": "@jupyter-widgets/controls"
+ },
+ "model_module_version": "1.5.0"
+ },
+ "39c23c6a972b419eb2eeeebafeaedc22": {
+ "model_module": "@jupyter-widgets/base",
+ "model_name": "LayoutModel",
+ "state": {
+ "_view_name": "LayoutView",
+ "grid_template_rows": null,
+ "right": null,
+ "justify_content": null,
+ "_view_module": "@jupyter-widgets/base",
+ "overflow": null,
+ "_model_module_version": "1.2.0",
+ "_view_count": null,
+ "flex_flow": null,
+ "width": null,
+ "min_width": null,
+ "border": null,
+ "align_items": null,
+ "bottom": null,
+ "_model_module": "@jupyter-widgets/base",
+ "top": null,
+ "grid_column": null,
+ "overflow_y": null,
+ "overflow_x": null,
+ "grid_auto_flow": null,
+ "grid_area": null,
+ "grid_template_columns": null,
+ "flex": null,
+ "_model_name": "LayoutModel",
+ "justify_items": null,
+ "grid_row": null,
+ "max_height": null,
+ "align_content": null,
+ "visibility": null,
+ "align_self": null,
+ "height": null,
+ "min_height": null,
+ "padding": null,
+ "grid_auto_rows": null,
+ "grid_gap": null,
+ "max_width": null,
+ "order": null,
+ "_view_module_version": "1.2.0",
+ "grid_template_areas": null,
+ "object_position": null,
+ "object_fit": null,
+ "grid_auto_columns": null,
+ "margin": null,
+ "display": null,
+ "left": null
+ },
+ "model_module_version": "1.2.0"
+ }
+ }
+ }
+ },
+ "cells": [
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "e67Ut53QYEdU",
+ "cellView": "form",
+ "outputId": "437871b8-b8ac-4eaf-c2e1-61d801c5e6b2",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 100
+ }
+ },
+ "source": [
+ "#@title\n",
+ "%%html\n",
+ "\n",
+ " Notebook written in collaboration with
Aditya Malte.\n",
+ "
\n",
+ " The Notebook is on GitHub, so contributions are more than welcome.\n",
+ "
\n",
+ "
\n",
+ "\n"
+ ],
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/html": [
+ "\n",
+ " Notebook written in collaboration with
Aditya Malte.\n",
+ "
\n",
+ " The Notebook is on GitHub, so contributions are more than welcome.\n",
+ "
\n",
+ "
\n",
+ ""
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "tags": []
+ }
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "M1oqh0F6W3ad"
+ },
+ "source": [
+ "# How to train a new language model from scratch using Transformers and Tokenizers\n",
+ "\n",
+ "### Notebook edition (link to blogpost [link](https://huggingface.co/blog/how-to-train)). Last update May 15, 2020\n",
+ "\n",
+ "\n",
+ "Over the past few months, we made several improvements to our [`transformers`](https://github.com/huggingface/transformers) and [`tokenizers`](https://github.com/huggingface/tokenizers) libraries, with the goal of making it easier than ever to **train a new language model from scratch**.\n",
+ "\n",
+ "In this post we’ll demo how to train a “small” model (84 M parameters = 6 layers, 768 hidden size, 12 attention heads) – that’s the same number of layers & heads as DistilBERT – on **Esperanto**. We’ll then fine-tune the model on a downstream task of part-of-speech tagging.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "oK7PPVm2XBgr"
+ },
+ "source": [
+ "## 1. Find a dataset\n",
+ "\n",
+ "First, let us find a corpus of text in Esperanto. Here we’ll use the Esperanto portion of the [OSCAR corpus](https://traces1.inria.fr/oscar/) from INRIA.\n",
+ "OSCAR is a huge multilingual corpus obtained by language classification and filtering of [Common Crawl](https://commoncrawl.org/) dumps of the Web.\n",
+ "\n",
+ "
\n",
+ "\n",
+ "The Esperanto portion of the dataset is only 299M, so we’ll concatenate with the Esperanto sub-corpus of the [Leipzig Corpora Collection](https://wortschatz.uni-leipzig.de/en/download), which is comprised of text from diverse sources like news, literature, and wikipedia.\n",
+ "\n",
+ "The final training corpus has a size of 3 GB, which is still small – for your model, you will get better results the more data you can get to pretrain on.\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "HOk4iZ9YZvec"
+ },
+ "source": [
+ "# in this notebook we'll only get one of the files (the Oscar one) for the sake of simplicity and performance\n",
+ "!wget -c https://cdn-datasets.huggingface.co/EsperBERTo/data/oscar.eo.txt"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "G-kkz81OY6xH"
+ },
+ "source": [
+ "## 2. Train a tokenizer\n",
+ "\n",
+ "We choose to train a byte-level Byte-pair encoding tokenizer (the same as GPT-2), with the same special tokens as RoBERTa. Let’s arbitrarily pick its size to be 52,000.\n",
+ "\n",
+ "We recommend training a byte-level BPE (rather than let’s say, a WordPiece tokenizer like BERT) because it will start building its vocabulary from an alphabet of single bytes, so all words will be decomposable into tokens (no more `` tokens!).\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "5duRggBRZKvP"
+ },
+ "source": [
+ "# We won't need TensorFlow here\n",
+ "!pip uninstall -y tensorflow\n",
+ "# Install `transformers` from master\n",
+ "!pip install git+https://github.com/huggingface/transformers\n",
+ "!pip list | grep -E 'transformers|tokenizers'\n",
+ "# transformers version at notebook update --- 2.11.0\n",
+ "# tokenizers version at notebook update --- 0.8.0rc1"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "IMnymRDLe0hi",
+ "outputId": "4d26476f-e6b5-475a-a0c1-41b6fcdc041a",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 52
+ }
+ },
+ "source": [
+ "%%time\n",
+ "from pathlib import Path\n",
+ "\n",
+ "from tokenizers import ByteLevelBPETokenizer\n",
+ "\n",
+ "paths = [str(x) for x in Path(\".\").glob(\"**/*.txt\")]\n",
+ "\n",
+ "# Initialize a tokenizer\n",
+ "tokenizer = ByteLevelBPETokenizer()\n",
+ "\n",
+ "# Customize training\n",
+ "tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2, special_tokens=[\n",
+ " \"\",\n",
+ " \"\",\n",
+ " \"\",\n",
+ " \"\",\n",
+ " \"\",\n",
+ "])"
+ ],
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "text": [
+ "CPU times: user 4min, sys: 3min 7s, total: 7min 7s\n",
+ "Wall time: 2min 25s\n"
+ ],
+ "name": "stdout"
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "6Ei7bqpRf1LH"
+ },
+ "source": [
+ "Now let's save files to disk"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "EIS-irI0f32P",
+ "outputId": "e86c4a24-eb65-4f0a-aa58-ed1931a05ac9",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 34
+ }
+ },
+ "source": [
+ "!mkdir EsperBERTo\n",
+ "tokenizer.save_model(\"EsperBERTo\")"
+ ],
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "['EsperBERTo/vocab.json', 'EsperBERTo/merges.txt']"
+ ]
+ },
+ "metadata": {
+ "tags": []
+ },
+ "execution_count": 4
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "lOOfYSuQhSqT"
+ },
+ "source": [
+ "🔥🔥 Wow, that was fast! ⚡️🔥\n",
+ "\n",
+ "We now have both a `vocab.json`, which is a list of the most frequent tokens ranked by frequency, and a `merges.txt` list of merges.\n",
+ "\n",
+ "```json\n",
+ "{\n",
+ "\t\"\": 0,\n",
+ "\t\"\": 1,\n",
+ "\t\"\": 2,\n",
+ "\t\"\": 3,\n",
+ "\t\"\": 4,\n",
+ "\t\"!\": 5,\n",
+ "\t\"\\\"\": 6,\n",
+ "\t\"#\": 7,\n",
+ "\t\"$\": 8,\n",
+ "\t\"%\": 9,\n",
+ "\t\"&\": 10,\n",
+ "\t\"'\": 11,\n",
+ "\t\"(\": 12,\n",
+ "\t\")\": 13,\n",
+ "\t# ...\n",
+ "}\n",
+ "\n",
+ "# merges.txt\n",
+ "l a\n",
+ "Ġ k\n",
+ "o n\n",
+ "Ġ la\n",
+ "t a\n",
+ "Ġ e\n",
+ "Ġ d\n",
+ "Ġ p\n",
+ "# ...\n",
+ "```\n",
+ "\n",
+ "What is great is that our tokenizer is optimized for Esperanto. Compared to a generic tokenizer trained for English, more native words are represented by a single, unsplit token. Diacritics, i.e. accented characters used in Esperanto – `ĉ`, `ĝ`, `ĥ`, `ĵ`, `ŝ`, and `ŭ` – are encoded natively. We also represent sequences in a more efficient manner. Here on this corpus, the average length of encoded sequences is ~30% smaller as when using the pretrained GPT-2 tokenizer.\n",
+ "\n",
+ "Here’s how you can use it in `tokenizers`, including handling the RoBERTa special tokens – of course, you’ll also be able to use it directly from `transformers`.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "tKVWB8WShT-z"
+ },
+ "source": [
+ "from tokenizers.implementations import ByteLevelBPETokenizer\n",
+ "from tokenizers.processors import BertProcessing\n",
+ "\n",
+ "\n",
+ "tokenizer = ByteLevelBPETokenizer(\n",
+ " \"./EsperBERTo/vocab.json\",\n",
+ " \"./EsperBERTo/merges.txt\",\n",
+ ")"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "hO5M3vrAhcuj"
+ },
+ "source": [
+ "tokenizer._tokenizer.post_processor = BertProcessing(\n",
+ " (\"\", tokenizer.token_to_id(\"\")),\n",
+ " (\"\", tokenizer.token_to_id(\"\")),\n",
+ ")\n",
+ "tokenizer.enable_truncation(max_length=512)"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "E3Ye27nchfzq",
+ "outputId": "b9812ed2-1ecd-4e1b-d9bd-7de581955e70",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 34
+ }
+ },
+ "source": [
+ "tokenizer.encode(\"Mi estas Julien.\")"
+ ],
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "Encoding(num_tokens=7, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])"
+ ]
+ },
+ "metadata": {
+ "tags": []
+ },
+ "execution_count": 10
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "X8ya5_7rhjKS",
+ "outputId": "e9e08ded-1081-4823-dd81-9d6be1255385",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 34
+ }
+ },
+ "source": [
+ "tokenizer.encode(\"Mi estas Julien.\").tokens"
+ ],
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "['', 'Mi', 'Ġestas', 'ĠJuli', 'en', '.', '']"
+ ]
+ },
+ "metadata": {
+ "tags": []
+ },
+ "execution_count": 11
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "WQpUC_CDhnWW"
+ },
+ "source": [
+ "## 3. Train a language model from scratch\n",
+ "\n",
+ "**Update:** This section follows along the [`run_language_modeling.py`](https://github.com/huggingface/transformers/blob/master/examples/legacy/run_language_modeling.py) script, using our new [`Trainer`](https://github.com/huggingface/transformers/blob/master/src/transformers/trainer.py) directly. Feel free to pick the approach you like best.\n",
+ "\n",
+ "> We’ll train a RoBERTa-like model, which is a BERT-like with a couple of changes (check the [documentation](https://huggingface.co/transformers/model_doc/roberta.html) for more details).\n",
+ "\n",
+ "As the model is BERT-like, we’ll train it on a task of *Masked language modeling*, i.e. the predict how to fill arbitrary tokens that we randomly mask in the dataset. This is taken care of by the example script.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "kD140sFjh0LQ",
+ "outputId": "0bab1f9e-bf7a-4f13-82d3-07fe5866ce78",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 318
+ }
+ },
+ "source": [
+ "# Check that we have a GPU\n",
+ "!nvidia-smi"
+ ],
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "text": [
+ "Fri May 15 21:17:12 2020 \n",
+ "+-----------------------------------------------------------------------------+\n",
+ "| NVIDIA-SMI 440.82 Driver Version: 418.67 CUDA Version: 10.1 |\n",
+ "|-------------------------------+----------------------+----------------------+\n",
+ "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
+ "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n",
+ "|===============================+======================+======================|\n",
+ "| 0 Tesla P100-PCIE... Off | 00000000:00:04.0 Off | 0 |\n",
+ "| N/A 38C P0 26W / 250W | 0MiB / 16280MiB | 0% Default |\n",
+ "+-------------------------------+----------------------+----------------------+\n",
+ " \n",
+ "+-----------------------------------------------------------------------------+\n",
+ "| Processes: GPU Memory |\n",
+ "| GPU PID Type Process name Usage |\n",
+ "|=============================================================================|\n",
+ "| No running processes found |\n",
+ "+-----------------------------------------------------------------------------+\n"
+ ],
+ "name": "stdout"
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "VNZZs-r6iKAV",
+ "outputId": "c8404d6c-7662-4240-c8da-ee89edfaf51b",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 34
+ }
+ },
+ "source": [
+ "# Check that PyTorch sees it\n",
+ "import torch\n",
+ "torch.cuda.is_available()"
+ ],
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "True"
+ ]
+ },
+ "metadata": {
+ "tags": []
+ },
+ "execution_count": 6
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "u0qQzgrBi1OX"
+ },
+ "source": [
+ "### We'll define the following config for the model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "LTXXutqeDzPi"
+ },
+ "source": [
+ "from transformers import RobertaConfig\n",
+ "\n",
+ "config = RobertaConfig(\n",
+ " vocab_size=52_000,\n",
+ " max_position_embeddings=514,\n",
+ " num_attention_heads=12,\n",
+ " num_hidden_layers=6,\n",
+ " type_vocab_size=1,\n",
+ ")"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "yAwQ82JiE5pi"
+ },
+ "source": [
+ "Now let's re-create our tokenizer in transformers"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "4keFBUjQFOD1"
+ },
+ "source": [
+ "from transformers import RobertaTokenizerFast\n",
+ "\n",
+ "tokenizer = RobertaTokenizerFast.from_pretrained(\"./EsperBERTo\", max_len=512)"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "6yNCw-3hFv9h"
+ },
+ "source": [
+ "Finally let's initialize our model.\n",
+ "\n",
+ "**Important:**\n",
+ "\n",
+ "As we are training from scratch, we only initialize from a config, not from an existing pretrained model or checkpoint."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "BzMqR-dzF4Ro"
+ },
+ "source": [
+ "from transformers import RobertaForMaskedLM\n",
+ "\n",
+ "model = RobertaForMaskedLM(config=config)"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "jU6JhBSTKiaM",
+ "outputId": "35879a60-2915-4894-f702-2d649cfa398a",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 34
+ }
+ },
+ "source": [
+ "model.num_parameters()\n",
+ "# => 84 million parameters"
+ ],
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "84095008"
+ ]
+ },
+ "metadata": {
+ "tags": []
+ },
+ "execution_count": 10
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "jBtUHRMliOLM"
+ },
+ "source": [
+ "### Now let's build our training Dataset\n",
+ "\n",
+ "We'll build our dataset by applying our tokenizer to our text file.\n",
+ "\n",
+ "Here, as we only have one text file, we don't even need to customize our `Dataset`. We'll just use the `LineByLineDataset` out-of-the-box."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "GlvP_A-THEEl",
+ "outputId": "e0510a33-7937-4a04-fa1c-d4e20b758bb2",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 52
+ }
+ },
+ "source": [
+ "%%time\n",
+ "from transformers import LineByLineTextDataset\n",
+ "\n",
+ "dataset = LineByLineTextDataset(\n",
+ " tokenizer=tokenizer,\n",
+ " file_path=\"./oscar.eo.txt\",\n",
+ " block_size=128,\n",
+ ")"
+ ],
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "text": [
+ "CPU times: user 4min 54s, sys: 2.98 s, total: 4min 57s\n",
+ "Wall time: 1min 37s\n"
+ ],
+ "name": "stdout"
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "hDLs73HcIHk5"
+ },
+ "source": [
+ "Like in the [`run_language_modeling.py`](https://github.com/huggingface/transformers/blob/master/examples/language-modeling/run_language_modeling.py) script, we need to define a data_collator.\n",
+ "\n",
+ "This is just a small helper that will help us batch different samples of the dataset together into an object that PyTorch knows how to perform backprop on."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "zTgWPa9Dipk2"
+ },
+ "source": [
+ "from transformers import DataCollatorForLanguageModeling\n",
+ "\n",
+ "data_collator = DataCollatorForLanguageModeling(\n",
+ " tokenizer=tokenizer, mlm=True, mlm_probability=0.15\n",
+ ")"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "ri2BIQKqjfHm"
+ },
+ "source": [
+ "### Finally, we are all set to initialize our Trainer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "YpvnFFmZJD-N"
+ },
+ "source": [
+ "from transformers import Trainer, TrainingArguments\n",
+ "\n",
+ "training_args = TrainingArguments(\n",
+ " output_dir=\"./EsperBERTo\",\n",
+ " overwrite_output_dir=True,\n",
+ " num_train_epochs=1,\n",
+ " per_gpu_train_batch_size=64,\n",
+ " save_steps=10_000,\n",
+ " save_total_limit=2,\n",
+ " prediction_loss_only=True,\n",
+ ")\n",
+ "\n",
+ "trainer = Trainer(\n",
+ " model=model,\n",
+ " args=training_args,\n",
+ " data_collator=data_collator,\n",
+ " train_dataset=dataset,\n",
+ ")"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "o6sASa36Nf-N"
+ },
+ "source": [
+ "### Start training"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "VmaHZXzmkNtJ",
+ "outputId": "a19880cb-bcc6-4885-bf24-c2c6d0f56d1e",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 738,
+ "referenced_widgets": [
+ "a58a66392b644b1384661e850c077a6c",
+ "a491e8caa0a048beb3b5259f14eb233f",
+ "837c9ddc3d594e088891874560c646b8",
+ "dbf50873d62c4ba39321faefbed0cca5",
+ "40bf955ba0284e84b198da6be8654219",
+ "fe20a8dae6e84628b5076d02183090f5",
+ "93b3f9eae3cb4e3e859cf456e3547c6d",
+ "6feb10aeb43147e6aba028d065947ae8",
+ "0989d41a4da24e9ebff377e02127642c",
+ "42c6061ef7e44f179db5a6e3551c0f17",
+ "d295dd80550447d88da0f04ce36a22ff",
+ "04e7e6d291da49d5816dc98a2904e95c",
+ "e7d8c3a4fecd40778e32966b29ea65a1",
+ "016d7c8318f742c1943464b08232a510",
+ "8388e9da9da4492c98c19235ca5fc1b5",
+ "39c23c6a972b419eb2eeeebafeaedc22"
+ ]
+ }
+ },
+ "source": [
+ "%%time\n",
+ "trainer.train()"
+ ],
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "display_data",
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "a58a66392b644b1384661e850c077a6c",
+ "version_minor": 0,
+ "version_major": 2
+ },
+ "text/plain": [
+ "HBox(children=(FloatProgress(value=0.0, description='Epoch', max=1.0, style=ProgressStyle(description_width='i…"
+ ]
+ },
+ "metadata": {
+ "tags": []
+ }
+ },
+ {
+ "output_type": "display_data",
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "0989d41a4da24e9ebff377e02127642c",
+ "version_minor": 0,
+ "version_major": 2
+ },
+ "text/plain": [
+ "HBox(children=(FloatProgress(value=0.0, description='Iteration', max=15228.0, style=ProgressStyle(description_…"
+ ]
+ },
+ "metadata": {
+ "tags": []
+ }
+ },
+ {
+ "output_type": "stream",
+ "text": [
+ "{\"loss\": 7.152712148666382, \"learning_rate\": 4.8358287365379566e-05, \"epoch\": 0.03283425269240872, \"step\": 500}\n",
+ "{\"loss\": 6.928811420440674, \"learning_rate\": 4.671657473075913e-05, \"epoch\": 0.06566850538481744, \"step\": 1000}\n",
+ "{\"loss\": 6.789419063568115, \"learning_rate\": 4.5074862096138694e-05, \"epoch\": 0.09850275807722617, \"step\": 1500}\n",
+ "{\"loss\": 6.688932447433472, \"learning_rate\": 4.343314946151826e-05, \"epoch\": 0.1313370107696349, \"step\": 2000}\n",
+ "{\"loss\": 6.595982004165649, \"learning_rate\": 4.179143682689782e-05, \"epoch\": 0.1641712634620436, \"step\": 2500}\n",
+ "{\"loss\": 6.545944199562073, \"learning_rate\": 4.0149724192277385e-05, \"epoch\": 0.19700551615445233, \"step\": 3000}\n",
+ "{\"loss\": 6.4864857263565066, \"learning_rate\": 3.850801155765695e-05, \"epoch\": 0.22983976884686105, \"step\": 3500}\n",
+ "{\"loss\": 6.412427802085876, \"learning_rate\": 3.686629892303651e-05, \"epoch\": 0.2626740215392698, \"step\": 4000}\n",
+ "{\"loss\": 6.363630670547486, \"learning_rate\": 3.522458628841608e-05, \"epoch\": 0.29550827423167847, \"step\": 4500}\n",
+ "{\"loss\": 6.273832890510559, \"learning_rate\": 3.358287365379564e-05, \"epoch\": 0.3283425269240872, \"step\": 5000}\n",
+ "{\"loss\": 6.197585330963134, \"learning_rate\": 3.1941161019175205e-05, \"epoch\": 0.3611767796164959, \"step\": 5500}\n",
+ "{\"loss\": 6.097779376983643, \"learning_rate\": 3.029944838455477e-05, \"epoch\": 0.39401103230890466, \"step\": 6000}\n",
+ "{\"loss\": 5.985456382751464, \"learning_rate\": 2.8657735749934332e-05, \"epoch\": 0.42684528500131336, \"step\": 6500}\n",
+ "{\"loss\": 5.8448616371154785, \"learning_rate\": 2.70160231153139e-05, \"epoch\": 0.4596795376937221, \"step\": 7000}\n",
+ "{\"loss\": 5.692522863388062, \"learning_rate\": 2.5374310480693457e-05, \"epoch\": 0.4925137903861308, \"step\": 7500}\n",
+ "{\"loss\": 5.562082152366639, \"learning_rate\": 2.3732597846073024e-05, \"epoch\": 0.5253480430785396, \"step\": 8000}\n",
+ "{\"loss\": 5.457240365982056, \"learning_rate\": 2.2090885211452588e-05, \"epoch\": 0.5581822957709482, \"step\": 8500}\n",
+ "{\"loss\": 5.376953645706177, \"learning_rate\": 2.0449172576832152e-05, \"epoch\": 0.5910165484633569, \"step\": 9000}\n",
+ "{\"loss\": 5.298609251022339, \"learning_rate\": 1.8807459942211716e-05, \"epoch\": 0.6238508011557657, \"step\": 9500}\n",
+ "{\"loss\": 5.225468152046203, \"learning_rate\": 1.716574730759128e-05, \"epoch\": 0.6566850538481744, \"step\": 10000}\n",
+ "{\"loss\": 5.174519973754883, \"learning_rate\": 1.5524034672970843e-05, \"epoch\": 0.6895193065405831, \"step\": 10500}\n",
+ "{\"loss\": 5.113943946838379, \"learning_rate\": 1.3882322038350407e-05, \"epoch\": 0.7223535592329918, \"step\": 11000}\n",
+ "{\"loss\": 5.08140989112854, \"learning_rate\": 1.2240609403729971e-05, \"epoch\": 0.7551878119254006, \"step\": 11500}\n",
+ "{\"loss\": 5.072491912841797, \"learning_rate\": 1.0598896769109535e-05, \"epoch\": 0.7880220646178093, \"step\": 12000}\n",
+ "{\"loss\": 5.012459496498108, \"learning_rate\": 8.957184134489099e-06, \"epoch\": 0.820856317310218, \"step\": 12500}\n",
+ "{\"loss\": 4.999591351509094, \"learning_rate\": 7.315471499868663e-06, \"epoch\": 0.8536905700026267, \"step\": 13000}\n",
+ "{\"loss\": 4.994838352203369, \"learning_rate\": 5.673758865248227e-06, \"epoch\": 0.8865248226950354, \"step\": 13500}\n",
+ "{\"loss\": 4.955870885848999, \"learning_rate\": 4.032046230627791e-06, \"epoch\": 0.9193590753874442, \"step\": 14000}\n",
+ "{\"loss\": 4.941655583381653, \"learning_rate\": 2.390333596007355e-06, \"epoch\": 0.9521933280798529, \"step\": 14500}\n",
+ "{\"loss\": 4.931783639907837, \"learning_rate\": 7.486209613869189e-07, \"epoch\": 0.9850275807722616, \"step\": 15000}\n",
+ "\n",
+ "\n",
+ "CPU times: user 1h 43min 36s, sys: 1h 3min 28s, total: 2h 47min 4s\n",
+ "Wall time: 2h 46min 46s\n"
+ ],
+ "name": "stdout"
+ },
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "TrainOutput(global_step=15228, training_loss=5.762423221226405)"
+ ]
+ },
+ "metadata": {
+ "tags": []
+ },
+ "execution_count": 18
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "_ZkooHz1-_2h"
+ },
+ "source": [
+ "#### 🎉 Save final model (+ tokenizer + config) to disk"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "QDNgPls7_l13"
+ },
+ "source": [
+ "trainer.save_model(\"./EsperBERTo\")"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "d0caceCy_p1-"
+ },
+ "source": [
+ "## 4. Check that the LM actually trained"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "iIQJ8ND_AEhl"
+ },
+ "source": [
+ "Aside from looking at the training and eval losses going down, the easiest way to check whether our language model is learning anything interesting is via the `FillMaskPipeline`.\n",
+ "\n",
+ "Pipelines are simple wrappers around tokenizers and models, and the 'fill-mask' one will let you input a sequence containing a masked token (here, ``) and return a list of the most probable filled sequences, with their probabilities.\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "ltXgXyCbAJLY"
+ },
+ "source": [
+ "from transformers import pipeline\n",
+ "\n",
+ "fill_mask = pipeline(\n",
+ " \"fill-mask\",\n",
+ " model=\"./EsperBERTo\",\n",
+ " tokenizer=\"./EsperBERTo\"\n",
+ ")"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "UIvgZ3S6AO0z",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 283
+ },
+ "outputId": "5f3d2f00-abdc-44a9-9c1b-75e3ec328576"
+ },
+ "source": [
+ "# The sun .\n",
+ "# =>\n",
+ "\n",
+ "fill_mask(\"La suno .\")"
+ ],
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "[{'score': 0.02119220793247223,\n",
+ " 'sequence': ' La suno estas.',\n",
+ " 'token': 316},\n",
+ " {'score': 0.012403824366629124,\n",
+ " 'sequence': ' La suno situas.',\n",
+ " 'token': 2340},\n",
+ " {'score': 0.011061107739806175,\n",
+ " 'sequence': ' La suno estis.',\n",
+ " 'token': 394},\n",
+ " {'score': 0.008284995332360268,\n",
+ " 'sequence': ' La suno de.',\n",
+ " 'token': 274},\n",
+ " {'score': 0.006471084896475077,\n",
+ " 'sequence': ' La suno akvo.',\n",
+ " 'token': 1833}]"
+ ]
+ },
+ "metadata": {
+ "tags": []
+ },
+ "execution_count": 36
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "i0qCyyhNAWZi"
+ },
+ "source": [
+ "Ok, simple syntax/grammar works. Let’s try a slightly more interesting prompt:\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "YZ9HSQxAAbme",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 283
+ },
+ "outputId": "aabfeedc-b1d0-4837-b01d-cd42726a5a3d"
+ },
+ "source": [
+ "fill_mask(\"Jen la komenco de bela .\")\n",
+ "\n",
+ "# This is the beginning of a beautiful .\n",
+ "# =>"
+ ],
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "[{'score': 0.01814725436270237,\n",
+ " 'sequence': ' Jen la komenco de bela urbo.',\n",
+ " 'token': 871},\n",
+ " {'score': 0.015888698399066925,\n",
+ " 'sequence': ' Jen la komenco de bela vivo.',\n",
+ " 'token': 1160},\n",
+ " {'score': 0.015662025660276413,\n",
+ " 'sequence': ' Jen la komenco de bela tempo.',\n",
+ " 'token': 1021},\n",
+ " {'score': 0.015555007383227348,\n",
+ " 'sequence': ' Jen la komenco de bela mondo.',\n",
+ " 'token': 945},\n",
+ " {'score': 0.01412549614906311,\n",
+ " 'sequence': ' Jen la komenco de bela tago.',\n",
+ " 'token': 1633}]"
+ ]
+ },
+ "metadata": {
+ "tags": []
+ },
+ "execution_count": 37
+ }
+ ]
+ }
+ ]
+}
\ No newline at end of file