From 569a21577fa83596c3a027c057c909488cde2006 Mon Sep 17 00:00:00 2001 From: Max Woolf Date: Sat, 17 Apr 2021 19:03:48 -0700 Subject: [PATCH 1/6] bump versions --- requirements.txt | 4 ++-- setup.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/requirements.txt b/requirements.txt index da7d5f9..93f2c86 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -transformers>=4.5.0 +transformers>=4.5.1 fire>=0.3.0 -pytorch-lightning>=1.2.3 +pytorch-lightning>=1.2.7 torch>=1.6.0 \ No newline at end of file diff --git a/setup.py b/setup.py index b250dd2..70e6f6e 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ setup( name="aitextgen", packages=["aitextgen"], # this must be the same as the name above - version="0.4.1", + version="0.5.0", description="A robust Python tool for text-based AI training and generation using GPT-2.", long_description=open("README.md", "r", encoding="utf-8").read(), long_description_content_type="text/markdown", @@ -17,9 +17,9 @@ python_requires=">=3.6", include_package_data=True, install_requires=[ - "transformers>=4.5.0", + "transformers>=4.5.1", "fire>=0.3.0", - "pytorch-lightning>=1.2.3", + "pytorch-lightning>=1.2.7", "torch>=1.6.0", ], ) From e2f5d5d3a093fab71645afb7c4ffa3e7d6624715 Mon Sep 17 00:00:00 2001 From: Max Woolf Date: Sun, 18 Apr 2021 10:15:11 -0700 Subject: [PATCH 2/6] Add GPT Neo to README --- README.md | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index b80a62e..e870d08 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,10 @@ # aitextgen -A robust Python tool for text-based AI training and generation using [OpenAI's](https://openai.com) [GPT-2](https://openai.com/blog/better-language-models/) architecture. +A robust Python tool for text-based AI training and generation using [OpenAI's](https://openai.com) [GPT-2](https://openai.com/blog/better-language-models/) and [EleutherAI's](https://www.eleuther.ai) [GPT Neo/GPT-3](https://github.com/EleutherAI/gpt-neo) architecture. aitextgen is a Python package that leverages [PyTorch](https://pytorch.org), [Hugging Face Transformers](https://github.com/huggingface/transformers) and [pytorch-lightning](https://github.com/PyTorchLightning/pytorch-lightning) with specific optimizations for text generation using GPT-2, plus _many_ added features. It is the successor to [textgenrnn](https://github.com/minimaxir/textgenrnn) and [gpt-2-simple](https://github.com/minimaxir/gpt-2-simple), taking the best of both packages: -- Finetunes on a pretrained 124M/355M/774M GPT-2 model from OpenAI...or create your own GPT-2 model + tokenizer and train from scratch! +- Finetunes on a pretrained 124M/355M/774M GPT-2 model from OpenAI or a 1325M/355M GPT Neo model from EleutherAI...or create your own GPT-2/GPT Neo model + tokenizer and train from scratch! - Generates text faster than gpt-2-simple and with better memory efficiency! - With Transformers, aitextgen preserves compatibility with the base package, allowing you to use the model for other NLP tasks, download custom GPT-2 models from the HuggingFace model repository, and upload your own models! Also, it uses the included `generate()` function to allow a massive amount of control over the generated text. - With pytorch-lightning, aitextgen trains models not just on CPUs and GPUs, but also _multiple_ GPUs and (eventually) TPUs! It also includes a pretty training progress bar, with the ability to add optional loggers. 
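As a quick illustration of the GPT Neo support described above, loading one of EleutherAI's pretrained checkpoints is a one-liner — a minimal sketch, assuming the `EleutherAI/gpt-neo-125M` model name that appears later in this series' load-model docs:

```python
from aitextgen import aitextgen

# Download and load EleutherAI's pretrained 125M GPT Neo checkpoint
# instead of the default 124M GPT-2 model.
ai = aitextgen(model="EleutherAI/gpt-neo-125M")

ai.generate(n=3, prompt="I believe in unicorns because", max_length=100)
```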
@@ -16,7 +16,7 @@ You can read more about aitextgen [in the documentation](https://docs.aitextgen. You can play with aitextgen _for free_ with powerful GPUs using these Colaboratory Notebooks! -- [Finetune OpenAI's 124M GPT-2 model on your own dataset (GPU)](https://colab.research.google.com/drive/15qBZx5y9rdaQSyWpsreMDnTiZ5IlN0zD?usp=sharing) +- [Finetune OpenAI's 124M GPT-2 model (or GPT Neo) on your own dataset (GPU)](https://colab.research.google.com/drive/15qBZx5y9rdaQSyWpsreMDnTiZ5IlN0zD?usp=sharing) - [Train a GPT-2 model + tokenizer from scratch (GPU)](https://colab.research.google.com/drive/144MdX5aLqrQ3-YW-po81CQMrD6kpgpYh?usp=sharing) You can also play with custom [Reddit](notebooks/reddit_demo.ipynb) and [Hacker News](notebooks/hacker_news_demo.ipynb) demo models on your own PC. @@ -90,10 +90,9 @@ ai.train(data, batch_size=8, num_steps=50000, generate_every=5000, save_every=50 ai.generate(10, prompt="ROMEO:") # With your trained model, you can reload the model at any time by -# providing the pytorch_model.bin model weights, the config, and the tokenizer. -ai2 = aitextgen(model="trained_model/pytorch_model.bin", - tokenizer_file="aitextgen.tokenizer.json", - config="trained_model/config.json") +# providing the folder containing the pytorch_model.bin model weights + the config, and providing the tokenizer. +ai2 = aitextgen(model_folder="trained_model", + tokenizer_file="aitextgen.tokenizer.json") ai2.generate(10, prompt="ROMEO:") ``` @@ -106,7 +105,7 @@ Want to run aitextgen and finetune GPT-2? Use the Colab notebooks in the Demos s ## Upcoming Features -The current release (v0.4.X) of aitextgen **is considered to be a beta**, targeting the most common use cases. The Notebooks and examples written so far are tested to work, but more fleshing out of the docs/use cases will be done over the next few months in addition to fixing the known issues noted above. +The current release (v0.5.X) of aitextgen **is considered to be a beta**, targeting the most common use cases. The Notebooks and examples written so far are tested to work, but more fleshing out of the docs/use cases will be done over the next few months in addition to fixing the known issues noted above. The next versions of aitextgen (and one of the reasons I made this package in the first place) will have native support for _schema-based generation_. (See [this repo](https://github.com/minimaxir/gpt-2-keyword-generation) for a rough proof-of-concept.) From 7c7888a7df0bf629875091177e77fa58a610c57a Mon Sep 17 00:00:00 2001 From: Max Woolf Date: Sun, 18 Apr 2021 10:46:56 -0700 Subject: [PATCH 3/6] add `trained_folder` note --- README.md | 2 +- notebooks/training_hello_world.ipynb | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index e870d08..a26d681 100644 --- a/README.md +++ b/README.md @@ -82,7 +82,7 @@ ai = aitextgen(tokenizer_file=tokenizer_file, config=config) # which automatically processes the dataset with the appropriate size. data = TokenDataset(file_name, tokenizer_file=tokenizer_file, block_size=64) -# Train the model! It will save pytorch_model.bin periodically and after completion. +# Train the model! It will save pytorch_model.bin periodically and after completion to the `trained_model` folder. # On a 2020 8-core iMac, this took ~25 minutes to run. 
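# The arguments below come from the notebook in this repo: batch_size * num_steps
# (8 * 50,000) is roughly one pass (1 epoch) through the tokenized data, while
# generate_every and save_every print sample text and checkpoint the model
# every 5,000 steps.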
ai.train(data, batch_size=8, num_steps=50000, generate_every=5000, save_every=5000) diff --git a/notebooks/training_hello_world.ipynb b/notebooks/training_hello_world.ipynb index 05f2f18..cb64fea 100644 --- a/notebooks/training_hello_world.ipynb +++ b/notebooks/training_hello_world.ipynb @@ -131,7 +131,7 @@ }, { "source": [ - "Train the model! It will save pytorch_model.bin periodically and after completion. On a 2020 8-core iMac, this took ~25 minutes to run.\n", + "Train the model! It will save pytorch_model.bin periodically and after completion to the `trained_model` folder. On a 2020 8-core iMac, this took ~25 minutes to run.\n", "\n", "The configuration below processes 400,000 subsets of tokens (8 * 50000), which is about just one pass through all the data (1 epoch). Ideally you'll want multiple passes through the data and a training loss less than `2.0` for coherent output; when training a model from scratch, that's more difficult, but with long enough training you can get there!" ], @@ -317,9 +317,8 @@ "metadata": {}, "outputs": [], "source": [ - "ai2 = aitextgen(model=\"trained_model/pytorch_model.bin\",\n", - " tokenizer_file=\"aitextgen.tokenizer.json\",\n", - " config=\"trained_model/config.json\")" + "ai2 = aitextgen(model_folder=\"trained_model\",\n", + " tokenizer_file=\"aitextgen.tokenizer.json\")" ] }, { From ad6e3b20317dbc7dab57c519b2c92cc8dd279f71 Mon Sep 17 00:00:00 2001 From: Max Woolf Date: Sun, 18 Apr 2021 12:14:15 -0700 Subject: [PATCH 4/6] update docs for v0.5.0 --- README.md | 6 ++-- docs/dataset.md | 33 +++++++++++++------ docs/generate-performance.md | 6 ++-- docs/generate.md | 18 ++++++---- docs/index.md | 12 ++++--- docs/load-model.md | 49 +++++++++++++++++++--------- docs/loggers.md | 4 +-- docs/save-model.md | 10 +++--- docs/tutorials/colab.md | 6 ++-- docs/tutorials/generate_1_5b.md | 10 +++--- docs/tutorials/hello-world.md | 28 +++++++++------- docs/tutorials/model-from-scratch.md | 24 +++++++------- mkdocs.yml | 31 +++++++++++------- 13 files changed, 143 insertions(+), 94 deletions(-) diff --git a/README.md b/README.md index a26d681..7e644df 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ A robust Python tool for text-based AI training and generation using [OpenAI's]( aitextgen is a Python package that leverages [PyTorch](https://pytorch.org), [Hugging Face Transformers](https://github.com/huggingface/transformers) and [pytorch-lightning](https://github.com/PyTorchLightning/pytorch-lightning) with specific optimizations for text generation using GPT-2, plus _many_ added features. It is the successor to [textgenrnn](https://github.com/minimaxir/textgenrnn) and [gpt-2-simple](https://github.com/minimaxir/gpt-2-simple), taking the best of both packages: -- Finetunes on a pretrained 124M/355M/774M GPT-2 model from OpenAI or a 1325M/355M GPT Neo model from EleutherAI...or create your own GPT-2/GPT Neo model + tokenizer and train from scratch! +- Finetunes on a pretrained 124M/355M/774M GPT-2 model from OpenAI or a 125M/350M GPT Neo model from EleutherAI...or create your own GPT-2/GPT Neo model + tokenizer and train from scratch! - Generates text faster than gpt-2-simple and with better memory efficiency! - With Transformers, aitextgen preserves compatibility with the base package, allowing you to use the model for other NLP tasks, download custom GPT-2 models from the HuggingFace model repository, and upload your own models! Also, it uses the included `generate()` function to allow a massive amount of control over the generated text. 
- With pytorch-lightning, aitextgen trains models not just on CPUs and GPUs, but also _multiple_ GPUs and (eventually) TPUs! It also includes a pretty training progress bar, with the ability to add optional loggers. @@ -35,7 +35,7 @@ Here's how you can quickly test out aitextgen on your own computer, even if you For generating text from a pretrained GPT-2 model: -```python +```py3 from aitextgen import aitextgen # Without any parameters, aitextgen() will download, cache, and load the 124M GPT-2 "small" model @@ -56,7 +56,7 @@ aitextgen generate --prompt "I believe in unicorns because" --to_file False Want to train your own mini GPT-2 model on your own computer? You can follow along [in this Jupyter Notebook](/notebooks/training_hello_world.ipynb) or, download this [text file of Shakespeare's plays](https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt), cd to that directory in a Terminal, open up a `python3` console and go: -```python +```py3 from aitextgen.TokenDataset import TokenDataset from aitextgen.tokenizers import train_tokenizer from aitextgen.utils import GPT2ConfigCPU diff --git a/docs/dataset.md b/docs/dataset.md index b5d8162..7fcb4fa 100644 --- a/docs/dataset.md +++ b/docs/dataset.md @@ -1,19 +1,20 @@ # TokenDataset -aitextgen has a special class, `TokenDataset`, used for managing tokenized datasets to be fed into model training. (this is in contrast with other GPT-2 finetuning approaches, which tokenizes at training time although you can still do that if you want) +aitextgen has a special class, `TokenDataset`, used for managing tokenized datasets to be fed into model training. (this is in contrast with other GPT-2 finetuning approaches, which tokenizes at training time although you can still do that by passing a `file_path` and other relevant parameters to `ai.train()`.) This has a few nice bonuses, including: - Tokenize a dataset on a local machine ahead of time and compress it, saving time/bandwidth transporting data to a remote machine - Supports both reading a dataset line-by-line (including single-column CSVs), or bulk texts. +- Debug and log the loaded texts. - Merge datasets together without using external libraries - Cross-train on multiple datasets to "blend" them together. ## Creating a TokenDataset For GPT-2 Finetuning -The easiest way to create a TokenDataset is to provide a target file. If no `vocab_file` and `merges_file` are provided, it will use the default GPT-2 tokenizer. +The easiest way to create a TokenDataset is to provide a target file. If no `tokenizer_file` is provided, it will use the default GPT-2 tokenizer. -```python +```py3 from aitextgen.TokenDataset import TokenDataset data = TokenDataset("shakespeare.txt") @@ -21,34 +22,46 @@ data = TokenDataset("shakespeare.txt") If you pass a single-column CSV and specify `line_by_line=True`, the TokenDataset will parse it row-by-row, and is the recommended way to handle multiline texts. -```python +```py3 data = TokenDataset("politics.csv", line_by_line=True) ``` You can also manually pass a list of texts to `texts` instead if you've processed them elsewhere. -```python +```py3 data = TokenDataset(texts = ["Lorem", "Ipsum", "Dolor"]) ``` +## Block Size + +`block_size` is another parameter that can be passed when creating a TokenDataset, more useful for custom models. This should match the context window (e.g. the `n_positions` or `max_position_embeddings` config parameters). By default, it will choose `1024`: the GPT-2 context window. 
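For a custom model, that means the dataset and the config should agree. A minimal sketch, reusing the 64-token `GPT2ConfigCPU()` setup and the `aitextgen.tokenizer.json` file from the hello-world example:

```py3
from aitextgen.TokenDataset import TokenDataset
from aitextgen.utils import GPT2ConfigCPU

# GPT2ConfigCPU uses a 64-token context window, so block_size is set to 64
# rather than the 1024 default used by the full-size GPT-2 config.
config = GPT2ConfigCPU()
data = TokenDataset("input.txt",
                    tokenizer_file="aitextgen.tokenizer.json",
                    block_size=64)
```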
+ +When implicitly loading a dataset via `ai.train()`, the `block_size` will be set to what is supported by the corresponding model `config`. + +## Debugging a TokenDataset + +When loading a dataset, a progress bar will appear showing how many texts are loaded and + +If you want to see what exactly is input to the model during training, you can access a slice via `data[0]`. + ## Saving/Loading a TokenDataset When creating a TokenDataset, you can automatically save it as a compressed gzipped numpy array when completed. -```python +```py3 data = TokenDataset("shakespeare.txt", save_cache=True) ``` Or save it after you've loaded it with the `save()` function. -```python +```py3 data = TokenDataset("shakespeare.txt") data.save() ``` By default, it will save to `dataset_cache.tar.gz`. You can then reload that into another Python session by specifying the cache. -```python +```py3 data = TokenDataset("dataset_cache.tar.gz", from_cache=True) ``` @@ -58,7 +71,7 @@ data = TokenDataset("dataset_cache.tar.gz", from_cache=True) ## Using TokenDatasets with a Custom GPT-2 Model -The default TokenDataset has a `block_size` of `1024`, which corresponds to the _context window of the default GPT-2 model_. If you're using a custom model w/ a different maximum. Additionally, you must explicitly provide the vocab and merges files to rebuild the tokenizer, as the tokenizer will be different than the normal GPT-2 one. +The default TokenDataset has a `block_size` of `1024`, which corresponds to the _context window of the default GPT-2 model_. If you're using a custom model w/ a different maximum. Additionally, you must explicitly provide the tokenizer file to rebuild the tokenizer, as the tokenizer will be different than the normal GPT-2 one. See the [Model From Scratch](tutorials/model-from-scratch.md) docs for more info. @@ -72,7 +85,7 @@ Merging processed TokenDatasets can be done with the `merge_datasets()` function !!! note "About Merging" The current implementation merges by subset count, so equalization may not be perfect, but it will not significantly impact training. -```python +```py3 from aitextgen.TokenDataset import TokenDataset, merge_datasets data1 = TokenDataset("politics1000.csv", line_by_line=True) # 1000 samples diff --git a/docs/generate-performance.md b/docs/generate-performance.md index a9a69a5..e312502 100644 --- a/docs/generate-performance.md +++ b/docs/generate-performance.md @@ -10,7 +10,7 @@ PyTorch has the ability to quantize models on the CPU. Currently, it will only q To quantize a model after it's loaded, just run: -```python +```py3 ai.quantize() ``` @@ -22,13 +22,13 @@ Certain GPUs, notably the cheap T4 and the expensive V100, support the ability t Assuming you are using a compatable GPU and already have [apex](https://github.com/NVIDIA/apex) installed, you can convert a model to the "half" FP16 mode with this: -```python +```py3 ai.to_fp16() ``` If you want to convert the model _before_ loading it into GPU memory (which may help avoid memory leaks), you can instantiate the model like this: -```python +```py3 ai.to_fp16(to_gpu=True, to_fp16=True) ``` diff --git a/docs/generate.md b/docs/generate.md index e2578e6..6b024e0 100644 --- a/docs/generate.md +++ b/docs/generate.md @@ -7,14 +7,18 @@ Thanks to the base Transformers package, aitextgen has more options for generati See [this article](https://huggingface.co/blog/how-to-generate) by Huggingface engineer Patrick von Platen for how sampling and these parameters are used in practice. - `n`: Number of texts generated. 
-- `max_length`: Maximum length of the generated text (default: 200; for GPT-2, the maximum is 1024.) -- `prompt`: Prompt that starts the generated text and is included in the generate text. (used to be `prefix` in previous tools) +- `max_length`: Maximum length of the generated text (default: 200; for GPT-2, the maximum is 1024; for GPT Neo, the maximum is 2048) +- `prompt`: Prompt that starts the generated text and is included in the generated text. - `temperature`: Controls the "craziness" of the text (default: 0.7) - `top_k`: If nonzero, limits the sampled tokens to the top _k_ values. (default: 0) - `top_p`: If nonzero, limits the sampled tokens to the cumulative probability Some lesser-known-but-still-useful-parameters that are unique to Transformers: + +!!! warning "Performance" + Enabling these parameters may slow down generation. + - `num_beams`: If greater than 1, executes beam search for cleaner text. - `repetition_penalty`: If greater than 1.0, penalizes repetition in a text to avoid infinite loops. - `length_penalty`: If greater than 1.0, penalizes text proportional to the length @@ -30,17 +34,17 @@ Given a `aitextgen` object with a loaded model + tokenizer named `ai`: want to generate on the GPU, make sure you call `ai.to_gpu()` beforehand, or load the model into the GPU using `ai = aitextgen(to_gpu=True)` -- `ai.generate()`: Generates and prints text to console. If `prompt` is used, the `prompt` is bolded. (a la [Talk to Transformer](https://talktotransformer.com)) +- `ai.generate()`: Generates and prints text to console. If `prompt` is used, the `prompt` is **bolded**. - `ai.generate_one()`: A helper function which generates a single text and returns as a string (good for APIs) - `ai.generate_samples()`: Generates multiple samples at specified temperatures: great for debugging. -- `ai.generate_to_file()`: Generates a bulk amount of texts to file. (this accepts a `batch_size` parameter which is useful if using on a GPU) +- `ai.generate_to_file()`: Generates a bulk amount of texts to file. (this accepts a `batch_size` parameter which is useful if using on a GPU, as it can generate texts in parallel with no performance loss) -!!! note "Cleanup" - By default, the `cleanup` parameter is set to True, which automatically removes texts that are blatantly malformed (e.g. only 2 characters long). Therefore, there may be less than `n` results returned. You can disabled this behavior by setting `cleanup=False`. +!!! note "lstrip and nonempty_output" + By default, the `lstrip` and `nonempty_output` parameters to `generate` are set to `True`, which alters the behavior of the generated text in a way that is most likely preferable. `lstrip`: Removes all whitespace at the beginning of the generated space. `nonempty_output`: If the output is empty (possible on shortform content), skip it if generating multiple texts, or try again if it's a single text. If `min_length` is specified, the same behavior occurs for texts below the minimum length after processing. ## Seed -aitextgen has a new `seed` parameter for generation. Using any generate function with a `seed` parameter (must be an integer) and all other models/parameters the same, and the generated text will be identical. This allows for reproducible generations. +aitextgen has a new `seed` parameter for generation. Using any generate function with a `seed` parameter (must be an integer) and all other models/parameters the same, and the generated text will be identical. 
This allows for reproducible generations in case someone accuses you of faking the AI output. For `generate_to_file()`, the 8-digit number at the end of the file name will be the seed used to generate the file, making reprodicibility easy. diff --git a/docs/index.md b/docs/index.md index d16f6b9..cb22116 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,12 +1,14 @@ # aitextgen -A robust tool for advanced AI text generation via [GPT-2](https://openai.com/blog/better-language-models/). +_Last Updated: April 18th, 2021 (aitextgen v0.5.0)_ -aitextgen is a Python package that leverages [PyTorch](https://pytorch.org), [Huggingface Transformers](https://github.com/huggingface/transformers) and [pytorch-lightning](https://github.com/PyTorchLightning/pytorch-lightning) with specific optimizations for text generation using GPT-2, plus _many_ added features. It is the successor to [textgenrnn](https://github.com/minimaxir/textgenrnn) and [gpt-2-simple](https://github.com/minimaxir/gpt-2-simple), taking the best of both packages: +A robust Python tool for text-based AI training and generation using [OpenAI's](https://openai.com) [GPT-2](https://openai.com/blog/better-language-models/) and [EleutherAI's](https://www.eleuther.ai) [GPT Neo/GPT-3](https://github.com/EleutherAI/gpt-neo) architecture. -- Finetunes on a pretrained 124M GPT-2 model from OpenAI...or create your own GPT-2 model + tokenizer and train from scratch! -- Generates text faster than gpt-2-simple and with better memory efficiency! (even [from the 1.5B GPT-2 model](tutorials/generate_1_5b/)!) -- With Transformers, aitextgen preserves compatibility with the base package, allowing you to use the model for other NLP tasks, download custom GPT-2 models from the Huggingface model repository, and upload your own models! Also, it uses the included `generate()` function to allow a massive amount of control over the generated text. +aitextgen is a Python package that leverages [PyTorch](https://pytorch.org), [Hugging Face Transformers](https://github.com/huggingface/transformers) and [pytorch-lightning](https://github.com/PyTorchLightning/pytorch-lightning) with specific optimizations for text generation using GPT-2, plus _many_ added features. It is the successor to [textgenrnn](https://github.com/minimaxir/textgenrnn) and [gpt-2-simple](https://github.com/minimaxir/gpt-2-simple), taking the best of both packages: + +- Finetunes on a pretrained 124M/355M/774M GPT-2 model from OpenAI or a 125M/355M GPT Neo model from EleutherAI...or create your own GPT-2/GPT Neo model + tokenizer and train from scratch! +- Generates text faster than gpt-2-simple and with better memory efficiency! +- With Transformers, aitextgen preserves compatibility with the base package, allowing you to use the model for other NLP tasks, download custom GPT-2 models from the HuggingFace model repository, and upload your own models! Also, it uses the included `generate()` function to allow a massive amount of control over the generated text. - With pytorch-lightning, aitextgen trains models not just on CPUs and GPUs, but also _multiple_ GPUs and (eventually) TPUs! It also includes a pretty training progress bar, with the ability to add optional loggers. 
- The input dataset is its own object, allowing you to not only easily encode megabytes of data in seconds, cache, and compress it on a local computer before transporting to a remote server, but you are able to _merge_ datasets without biasing the resulting dataset, or _cross-train_ on multiple datasets to create blended output. diff --git a/docs/load-model.md b/docs/load-model.md index cfdd845..66a1bab 100644 --- a/docs/load-model.md +++ b/docs/load-model.md @@ -8,45 +8,62 @@ There are several ways to load models. ## Loading an aitextgen model -The closer to the default 124M GPT-2 model, the fewer files you need! +For the base case, loading the default 124M GPT-2 model via Huggingface: -For the base case, loading the default model via Huggingface: - -```python +```py3 ai = aitextgen() ``` The downloaded model will be downloaded to `cache_dir`: `/aitextgen` by default. -If you've finetuned a 124M GPT-2 model using aitextgen, you can pass the generated `pytorch_model.bin` to aitextgen: +If you're loading a custom model for a different GPT-2/GPT-Neo architecture _from scratch_ but with the normal GPT-2 tokenizer, you can pass only a config. -```python -ai = aitextgen(model="pytorch_model.bin") +```py3 +from aitextgen.utils import GPT2ConfigCPU +config = GPT2ConfigCPU() +ai = aitextgen(config=config) ``` -If you're loading a finetuned model of a different GPT-2 architecture, you'll must also pass the generated `config.json` to aitextgen: +While training/finetuning a model, two files will be created: the `pytorch_model.bin` which contains the weights for the model, and a `config.json` illustrating the architecture for the model. Both of these files are needed to reload the model. + +If you've finetuned a model using aitextgen (the default model), you can pass the **folder name** containing the generated `pytorch_model.bin` and `config.json` to aitextgen (e.g. `trained_model`, which is where trained models will be saved by default). + + +!!! note "Same Directory" + If both files are in the current directory, you can pass `model_folder="."`. -```python -ai = aitextgen(model="pytorch_model.bin", config=config) +```py3 +ai = aitextgen(model_folder="trained_model") ``` -If you want to download an alternative GPT-2 model from Huggingface's repository of models, pass that model name to `model`. +These examples assume you are using the default GPT-2 tokenizer. If you have a _custom tokenizer_, you'll need to pass that along with loading the model. + +```py3 +ai3 = aitextgen(model_folder="trained_model", + tokenizer_file="aitextgen.tokenizer.json") +``` -```python +If you want to download an alternative GPT-2 model from Hugging Face's repository of models, pass that model name to `model`. + +```py3 ai = aitextgen(model="minimaxir/hacker-news") ``` The model and associated config + tokenizer will be downloaded into `cache_dir`. -## Loading TensorFlow-based GPT-2 models +This can also be used to download the [pretrained GPT Neo models](https://huggingface.co/EleutherAI) from EleutherAI. -aitextgen lets you download the models from Google's servers that OpenAI had uploaded back when GPT-2 was first released in 2019. These models are then converted to a PyTorch format. +```py3 +ai = aitextgen(model="EleutherAI/gpt-neo-125M") +``` + +## Loading TensorFlow-based GPT-2 models -It's counterintuitive, but it's _substantially_ faster than downloading from Huggingface's servers, especially if you are running your code on Google Cloud Platform (e.g. 
Colab notebooks) +aitextgen lets you download the models from Microsoft's servers that OpenAI had uploaded back when GPT-2 was first released in 2019. These models are then converted to a PyTorch format. To use this workflow, pass the corresponding model number to `tf_gpt2`: -```python +```py3 ai = aitextgen(tf_gpt2="124M") ``` diff --git a/docs/loggers.md b/docs/loggers.md index 442655e..0df3e6a 100644 --- a/docs/loggers.md +++ b/docs/loggers.md @@ -6,7 +6,7 @@ You can create loggers with popular tools such as [TensorBoard](https://www.tens For example, if you want to create a TensorBoard logger, you can create it: -```python +```py3 from pytorch_lightning import loggers tb_logger = loggers.TensorBoardLogger('logs/') @@ -14,6 +14,6 @@ tb_logger = loggers.TensorBoardLogger('logs/') Then pass it to the `loggers` parameter for `ai.train()`. -```python +```py3 ai.train(train_data=data, loggers=tb_logger) ``` diff --git a/docs/save-model.md b/docs/save-model.md index 70c5bd1..9bd719d 100644 --- a/docs/save-model.md +++ b/docs/save-model.md @@ -2,7 +2,7 @@ There are are multiple ways to save models. -Whenever a model is saved, two files are generated: `pytorch_model.bin` which contains the model weights, and `config.json` which is needed to load the model if it is not the base 124M GPT-2. +Whenever a model is saved, two files are generated: `pytorch_model.bin` which contains the model weights, and `config.json` which is needed to load the model. Assuming we have an aitextgen model `ai`: @@ -10,7 +10,7 @@ Assuming we have an aitextgen model `ai`: The aitextgen model can be saved at any time using `save`. -```python +```py3 ai.save() ``` @@ -24,7 +24,7 @@ If you are using Google Colaboratory, you can mount your personal Google Drive t First mount your Google Drive using `mount_gdrive()`: -```python +```py3 from aitextgen.colab import mount_gdrive, copy_file_to_gdrive mount_gdrive() ``` @@ -33,7 +33,7 @@ You'll be asked for an auth code; input it and press enter, and a `My Drive` fol You can drag and drop the model files into the Google Drive, or use `copy_file_to_gdrive` to copy them programmatically. -```python +```py3 copy_file_to_gdrive("pytorch_model.bin") copy_file_to_gdrive("config.json") ``` @@ -48,7 +48,7 @@ Concerned about timeouts in Google Colab? aitextgen has a feature that will copy As long as your drive is mounted as above, pass `save_gdrive = True` to the `train()` function: -```python +```py3 ai.train(save_gdrive=True) ``` diff --git a/docs/tutorials/colab.md b/docs/tutorials/colab.md index cda92a3..c6cce54 100644 --- a/docs/tutorials/colab.md +++ b/docs/tutorials/colab.md @@ -11,11 +11,11 @@ The Colab Notebooks also contain utilities to make it easier to export the model A Notebook for finetuning OpenAI's model on a GPU. This is the most common use case. -!!! note "124M Only" - Currently you can only finetune the 124M OpenAI GPT-2 model. +!!! note + Currently you can only finetune the 124M/355M/774M OpenAI GPT-2 models, with the latter two forcing `gradient_checkpointing=True` to ensure it does not cause the Colab GPU to go OOM. ## Training Your Own GPT-2 Model [Colab Notebook](https://colab.research.google.com/drive/144MdX5aLqrQ3-YW-po81CQMrD6kpgpYh?usp=sharing) -A Notebook for creating your own GPT-2 model with your own tokenizer. See the Model From Scratch on the advantages and disadvantages of this approach. +A Notebook for creating your own GPT-2 model with your own tokenizer. See the Model From Scratch section on the advantages and disadvantages of this approach. 
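For readers who want the gist of the finetuning notebook without opening Colab, a rough sketch of the workflow it walks through (the dataset file name, batch size, and step counts here are placeholders, not values taken from the notebook):

```py3
from aitextgen import aitextgen
from aitextgen.TokenDataset import TokenDataset

# Download OpenAI's 124M GPT-2 checkpoint (converted to PyTorch) and move it to the GPU.
ai = aitextgen(tf_gpt2="124M", to_gpu=True)

# Tokenize a plain-text dataset with the default GPT-2 tokenizer and finetune on it.
data = TokenDataset("input.txt")
ai.train(data, batch_size=1, num_steps=2000, save_every=1000)
```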
diff --git a/docs/tutorials/generate_1_5b.md b/docs/tutorials/generate_1_5b.md index f7b3d80..e5be959 100644 --- a/docs/tutorials/generate_1_5b.md +++ b/docs/tutorials/generate_1_5b.md @@ -45,16 +45,16 @@ Now go back to the Launcher and create a Python 3 Notebook (or upload the one he !!! warning "CUDA" You may want to ensure the Notebook sees the CUDA installation, which appears to be somewhat random. This can be verified by running `import torch` in a cell, then `torch.cuda.is_available()`. - + In a cell, load aitextgen: -```python +```py3 from aitextgen import aitextgen ``` In another cell, input and run: -```python +```py3 ai = aitextgen(tf_gpt2="1558M", to_gpu=True, to_fp16=True) ``` @@ -71,7 +71,7 @@ Now we can generate texts! The T4, for GPT-2 1.5B in FP16 mode, can generate abo Create a cell and add: -```python +```py3 ai.generate_to_file(n=300, batch_size=30) ``` @@ -83,7 +83,7 @@ And it will generate the texts to a file! When completed, you can double-click t More importantly, all parameters to `generate` are valid, allowing massive flexibility! -```python +```py3 ai.generate_to_file(n=150, batch_size=15, max_length=1024, top_p=0.9, temperature=1.2, prompt="President Donald Trump has magically transformed into a unicorn.") ``` diff --git a/docs/tutorials/hello-world.md b/docs/tutorials/hello-world.md index 2e2510d..2310121 100644 --- a/docs/tutorials/hello-world.md +++ b/docs/tutorials/hello-world.md @@ -4,7 +4,7 @@ Here's how you can quickly test out aitextgen on your own computer, even if you For generating text from a pretrained GPT-2 model: -```python +```py3 from aitextgen import aitextgen # Without any parameters, aitextgen() will download, cache, and load the 124M GPT-2 "small" model @@ -25,7 +25,7 @@ aitextgen generate --prompt "I believe in unicorns because" --to_file False Want to train your own mini GPT-2 model on your own computer? Download this [text file of Shakespeare plays](https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt), cd to that directory in a Teriminal, open up a `python3` console and go: -```python +```py3 from aitextgen.TokenDataset import TokenDataset from aitextgen.tokenizers import train_tokenizer from aitextgen.utils import GPT2ConfigCPU @@ -35,27 +35,33 @@ from aitextgen import aitextgen file_name = "input.txt" # Train a custom BPE Tokenizer on the downloaded text -# This will save two files: aitextgen-vocab.json and aitextgen-merges.txt, -# which are needed to rebuild the tokenizer. +# This will save one file: `aitextgen.tokenizer.json`, which contains the +# information needed to rebuild the tokenizer. train_tokenizer(file_name) -vocab_file = "aitextgen-vocab.json" -merges_file = "aitextgen-merges.txt" +tokenizer_file = "aitextgen.tokenizer.json" # GPT2ConfigCPU is a mini variant of GPT-2 optimized for CPU-training # e.g. the # of input tokens here is 64 vs. 1024 for base GPT-2. config = GPT2ConfigCPU() # Instantiate aitextgen using the created tokenizer and config -ai = aitextgen(vocab_file=vocab_file, merges_file=merges_file, config=config) +ai = aitextgen(tokenizer_file=tokenizer_file, config=config) # You can build datasets for training by creating TokenDatasets, # which automatically processes the dataset with the appropriate size. -data = TokenDataset(file_name, vocab_file=vocab_file, merges_file=merges_file, block_size=64) +data = TokenDataset(file_name, tokenizer_file=tokenizer_file, block_size=64) -# Train the model! It will save pytorch_model.bin periodically and after completion. 
-# On a 2016 MacBook Pro, this took ~25 minutes to run. -ai.train(data, batch_size=16, num_steps=5000) +# Train the model! It will save pytorch_model.bin periodically and after completion to the `trained_model` folder. +# On a 2020 8-core iMac, this took ~25 minutes to run. +ai.train(data, batch_size=8, num_steps=50000, generate_every=5000, save_every=5000) # Generate text from it! ai.generate(10, prompt="ROMEO:") + +# With your trained model, you can reload the model at any time by +# providing the folder containing the pytorch_model.bin model weights + the config, and providing the tokenizer. +ai2 = aitextgen(model_folder="trained_model", + tokenizer_file="aitextgen.tokenizer.json") + +ai2.generate(10, prompt="ROMEO:") ``` diff --git a/docs/tutorials/model-from-scratch.md b/docs/tutorials/model-from-scratch.md index 4a60739..b8ef248 100644 --- a/docs/tutorials/model-from-scratch.md +++ b/docs/tutorials/model-from-scratch.md @@ -8,7 +8,7 @@ If that is _not_ your use case, you may get a better generation quality _and_ sp - Non-English Text - Heavily Encoded Text -It still will require a _massive_ amount of training time (several hours, even on a TPU), but will be more flexible. +It still will require a _massive_ amount of training time (several hours) but will be more flexible. ## Building a Custom Tokenizer. @@ -16,20 +16,20 @@ The `train_tokenizer()` function from `aitextgen.tokenizers` trains the model on !!! note "Vocabulary Size" - The default vocabulary size for `train_tokenizer()` is 5,000 tokens. Although this is much lower than GPT-2's 50k vocab size, the smaller the vocab size, the easier it is to train the model (since it's more likely for the model to make a correct "guess"), and the model file size will be _much_ smaller. + The default vocabulary size for `train_tokenizer()` is 1,000 tokens. Although this is much lower than GPT-2's 50k vocab size, the smaller the vocab size, the easier it is to train the model (since it's more likely for the model to make a correct "guess"), and the model file size will be _much_ smaller. -```python +```py3 from aitextgen.tokenizers import train_tokenizer train_tokenizer(file_name) ``` -This creates two files: `aitextgen-vocab.json` and `aitextgen-merges.txt`, which are needed to rebuild the tokenizer. +This creates one file, `aitextgen.tokenizer.json`, which is needed to rebuild the tokenizer. # Building a Custom Dataset You can build a TokenDataset based off your custom Tokenizer, to be fed into the model. -```python +```py3 data = TokenDataset(file_name, vocab_file=vocab_file, merges_file=merges_file, block_size=32) ``` @@ -39,7 +39,7 @@ Whenever you load a default 124M GPT-2 model, it uses a `GPT2Config()` under the The `build_gpt2_config()` function from `aitextgen.utils` gives you more control. -```python +```py3 config = build_gpt2_config(vocab_size=5000, max_length=32, dropout=0.0, n_embd=256, n_layer=8, n_head=8) ``` @@ -60,20 +60,20 @@ A few notes on the inputs: You can instantiate an empty GPT-2 according to your custom config, and construct a custom tokenizer according to your vocab and merges file: -```python -ai = aitextgen(vocab_file=vocab_file, merges_file=merges_file, config=config) +```py3 +ai = aitextgen(tokenizer_file=tokenizer_file, config=config) ``` Training is done as normal. 
-```python +```py3 ai.train(data, batch_size=16, num_steps=5000) ``` ## Reloading the Custom Model -You'll always need to provide the vocab_file, merges_file, and config (a config file is saved when the model is saved; you can either build it at runtime as above, or use the `config.json`) +You'll always need to provide the tokenizer_file and the folder containing the `pytorch_model.bin` and `config.json`. -```python -ai = aitextgen(model="pytorch_model.bin", vocab_file=vocab_file, merges_file=merges_file, config=config) +```py3 +ai = aitextgen(model_folder="trained_model", tokenizer_file="aitextgen.tokenizer.json") ``` diff --git a/mkdocs.yml b/mkdocs.yml index c1d697c..3e5aac3 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -1,5 +1,5 @@ site_name: aitextgen -site_description: A robust Python tool for text-based AI training and generation using GPT-2. +site_description: A robust Python tool for text-based AI training and generation using GPT-2 and GPT Neo. site_author: Max Woolf (@minimaxir) nav: @@ -8,15 +8,15 @@ nav: - Loading a Model: load-model.md - Saving a Model: save-model.md - TokenDataset: dataset.md - - Training an aitextgen Model: + - Training a Model: - Colaboratory Notebooks: tutorials/colab.md # - Improving Training Performance: train-performance.md - Training a GPT-2 Model From Scratch: tutorials/model-from-scratch.md - Loggers: loggers.md - - Generating from an aitextgen Model: + - Generating from a Model: - Generating Text: generate.md - - Improving Generation Performance: generate-performance.md - - Generating From GPT-2 1.5B: tutorials/generate_1_5b.md + # - Improving Generation Performance: generate-performance.md + # - Generating From GPT-2 1.5B: tutorials/generate_1_5b.md - Importing from gpt-2-simple: gpt-2-simple.md - Helpful Notes: helpful-notes.md - Ethics: ethics.md @@ -26,9 +26,17 @@ nav: theme: name: "material" palette: - scheme: "slate" - primary: "light blue" - accent: "light blue" + - media: "(prefers-color-scheme: light)" + scheme: default + toggle: + icon: material/toggle-switch-off-outline + name: Switch to dark mode + - media: "(prefers-color-scheme: dark)" + scheme: slate + primary: black + toggle: + icon: material/toggle-switch + name: Switch to light mode font: text: "Source Sans Pro" code: "Fira Code" @@ -43,7 +51,7 @@ repo_name: minimaxir/aitextgen repo_url: https://github.com/minimaxir/aitextgen edit_uri: "" -copyright: "Copyright © 2019 - 2020 Max Woolf" +copyright: "Copyright © 2019 - 2021 Max Woolf" extra: social: @@ -53,9 +61,8 @@ extra: link: "https://twitter.com/minimaxir" markdown_extensions: - - codehilite: - guess_lang: false - - mkautodoc + - pymdownx.highlight + - pymdownx.superfences - admonition - toc: permalink: true From e1740efb265ba766f1695e9fbec874cc60f8158b Mon Sep 17 00:00:00 2001 From: Max Woolf Date: Sun, 18 Apr 2021 14:27:15 -0700 Subject: [PATCH 5/6] remove redundant cleanup param --- aitextgen/aitextgen.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/aitextgen/aitextgen.py b/aitextgen/aitextgen.py index 719e8f6..b94dfca 100644 --- a/aitextgen/aitextgen.py +++ b/aitextgen/aitextgen.py @@ -472,7 +472,6 @@ def generate_to_file( destination_path: str = None, sample_delim: str = "=" * 20 + "\n", seed: int = None, - cleanup: bool = True, **kwargs, ) -> None: """ @@ -516,15 +515,6 @@ def generate_to_file( for _ in range(n // batch_size): gen_texts = self.generate(n=batch_size, return_as_list=True, **kwargs) - # Remove empty texts and strip out extra newlines/extra spaces - if cleanup: - texts_to_clean = gen_texts - 
gen_texts = [] - for text in texts_to_clean: - clean_text = text.strip().strip("\n") - if clean_text and len(clean_text) >= 2: - gen_texts.append(clean_text) - for gen_text in gen_texts: f.write("{}\n{}".format(gen_text, sample_delim)) pbar.update(batch_size) From 4a1c6dcd660ef857576b71bf0d2ba4672e4f61c6 Mon Sep 17 00:00:00 2001 From: Max Woolf Date: Sun, 18 Apr 2021 18:02:07 -0700 Subject: [PATCH 6/6] Add assert for old behavior --- aitextgen/aitextgen.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/aitextgen/aitextgen.py b/aitextgen/aitextgen.py index b94dfca..ffd7376 100644 --- a/aitextgen/aitextgen.py +++ b/aitextgen/aitextgen.py @@ -16,6 +16,7 @@ from transformers import ( AutoConfig, AutoModelForCausalLM, + AutoTokenizer, GPT2Config, GPT2LMHeadModel, GPT2TokenizerFast, @@ -97,6 +98,12 @@ def __init__( **kwargs, ) -> None: + if model: + assert not os.path.isfile(model), ( + "As of aitextgen 0.5.0, you must " + + "use `model_folder` to load an existing model." + ) + if not verbose: for module in [ "transformers.file_utils", @@ -189,7 +196,7 @@ def __init__( ) if model and "gpt2" not in model: logger.info(f"Using the tokenizer for {model}.") - self.tokenizer = GPT2TokenizerFast.from_pretrained( + self.tokenizer = AutoTokenizer.from_pretrained( model, cache_dir=cache_dir, )
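A usage sketch of the loading behavior these last two patches enforce — the folder-based path from the docs above, with the old single-file `model=` path now caught by the new assert:

```py3
from aitextgen import aitextgen

# aitextgen(model="trained_model/pytorch_model.bin")  # pre-0.5.0 style; now trips the assert above

# Pass the folder containing pytorch_model.bin + config.json instead,
# along with the custom tokenizer file if one was trained.
ai = aitextgen(model_folder="trained_model",
               tokenizer_file="aitextgen.tokenizer.json")
ai.generate(10, prompt="ROMEO:")
```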