From 569a21577fa83596c3a027c057c909488cde2006 Mon Sep 17 00:00:00 2001 From: Max Woolf Date: Sat, 17 Apr 2021 19:03:48 -0700 Subject: [PATCH 1/6] bump versions --- requirements.txt | 4 ++-- setup.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/requirements.txt b/requirements.txt index da7d5f9..93f2c86 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -transformers>=4.5.0 +transformers>=4.5.1 fire>=0.3.0 -pytorch-lightning>=1.2.3 +pytorch-lightning>=1.2.7 torch>=1.6.0 \ No newline at end of file diff --git a/setup.py b/setup.py index b250dd2..70e6f6e 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ setup( name="aitextgen", packages=["aitextgen"], # this must be the same as the name above - version="0.4.1", + version="0.5.0", description="A robust Python tool for text-based AI training and generation using GPT-2.", long_description=open("README.md", "r", encoding="utf-8").read(), long_description_content_type="text/markdown", @@ -17,9 +17,9 @@ python_requires=">=3.6", include_package_data=True, install_requires=[ - "transformers>=4.5.0", + "transformers>=4.5.1", "fire>=0.3.0", - "pytorch-lightning>=1.2.3", + "pytorch-lightning>=1.2.7", "torch>=1.6.0", ], ) From e2f5d5d3a093fab71645afb7c4ffa3e7d6624715 Mon Sep 17 00:00:00 2001 From: Max Woolf Date: Sun, 18 Apr 2021 10:15:11 -0700 Subject: [PATCH 2/6] Add GPT Neo to README --- README.md | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index b80a62e..e870d08 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,10 @@ # aitextgen -A robust Python tool for text-based AI training and generation using [OpenAI's](https://openai.com) [GPT-2](https://openai.com/blog/better-language-models/) architecture. +A robust Python tool for text-based AI training and generation using [OpenAI's](https://openai.com) [GPT-2](https://openai.com/blog/better-language-models/) and [EleutherAI's](https://www.eleuther.ai) [GPT Neo/GPT-3](https://github.com/EleutherAI/gpt-neo) architecture. aitextgen is a Python package that leverages [PyTorch](https://pytorch.org), [Hugging Face Transformers](https://github.com/huggingface/transformers) and [pytorch-lightning](https://github.com/PyTorchLightning/pytorch-lightning) with specific optimizations for text generation using GPT-2, plus _many_ added features. It is the successor to [textgenrnn](https://github.com/minimaxir/textgenrnn) and [gpt-2-simple](https://github.com/minimaxir/gpt-2-simple), taking the best of both packages: -- Finetunes on a pretrained 124M/355M/774M GPT-2 model from OpenAI...or create your own GPT-2 model + tokenizer and train from scratch! +- Finetunes on a pretrained 124M/355M/774M GPT-2 model from OpenAI or a 1325M/355M GPT Neo model from EleutherAI...or create your own GPT-2/GPT Neo model + tokenizer and train from scratch! - Generates text faster than gpt-2-simple and with better memory efficiency! - With Transformers, aitextgen preserves compatibility with the base package, allowing you to use the model for other NLP tasks, download custom GPT-2 models from the HuggingFace model repository, and upload your own models! Also, it uses the included `generate()` function to allow a massive amount of control over the generated text. - With pytorch-lightning, aitextgen trains models not just on CPUs and GPUs, but also _multiple_ GPUs and (eventually) TPUs! It also includes a pretty training progress bar, with the ability to add optional loggers. 
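As a quick illustration of the GPT Neo support described above, loading one of EleutherAI's pretrained checkpoints is a one-liner — a minimal sketch, assuming the `EleutherAI/gpt-neo-125M` model name that appears later in this series' load-model docs:

```python
from aitextgen import aitextgen

# Download and load EleutherAI's pretrained 125M GPT Neo checkpoint
# instead of the default 124M GPT-2 model.
ai = aitextgen(model="EleutherAI/gpt-neo-125M")

ai.generate(n=3, prompt="I believe in unicorns because", max_length=100)
```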
@@ -16,7 +16,7 @@ You can read more about aitextgen [in the documentation](https://docs.aitextgen. You can play with aitextgen _for free_ with powerful GPUs using these Colaboratory Notebooks! -- [Finetune OpenAI's 124M GPT-2 model on your own dataset (GPU)](https://colab.research.google.com/drive/15qBZx5y9rdaQSyWpsreMDnTiZ5IlN0zD?usp=sharing) +- [Finetune OpenAI's 124M GPT-2 model (or GPT Neo) on your own dataset (GPU)](https://colab.research.google.com/drive/15qBZx5y9rdaQSyWpsreMDnTiZ5IlN0zD?usp=sharing) - [Train a GPT-2 model + tokenizer from scratch (GPU)](https://colab.research.google.com/drive/144MdX5aLqrQ3-YW-po81CQMrD6kpgpYh?usp=sharing) You can also play with custom [Reddit](notebooks/reddit_demo.ipynb) and [Hacker News](notebooks/hacker_news_demo.ipynb) demo models on your own PC. @@ -90,10 +90,9 @@ ai.train(data, batch_size=8, num_steps=50000, generate_every=5000, save_every=50 ai.generate(10, prompt="ROMEO:") # With your trained model, you can reload the model at any time by -# providing the pytorch_model.bin model weights, the config, and the tokenizer. -ai2 = aitextgen(model="trained_model/pytorch_model.bin", - tokenizer_file="aitextgen.tokenizer.json", - config="trained_model/config.json") +# providing the folder containing the pytorch_model.bin model weights + the config, and providing the tokenizer. +ai2 = aitextgen(model_folder="trained_model", + tokenizer_file="aitextgen.tokenizer.json") ai2.generate(10, prompt="ROMEO:") ``` @@ -106,7 +105,7 @@ Want to run aitextgen and finetune GPT-2? Use the Colab notebooks in the Demos s ## Upcoming Features -The current release (v0.4.X) of aitextgen **is considered to be a beta**, targeting the most common use cases. The Notebooks and examples written so far are tested to work, but more fleshing out of the docs/use cases will be done over the next few months in addition to fixing the known issues noted above. +The current release (v0.5.X) of aitextgen **is considered to be a beta**, targeting the most common use cases. The Notebooks and examples written so far are tested to work, but more fleshing out of the docs/use cases will be done over the next few months in addition to fixing the known issues noted above. The next versions of aitextgen (and one of the reasons I made this package in the first place) will have native support for _schema-based generation_. (See [this repo](https://github.com/minimaxir/gpt-2-keyword-generation) for a rough proof-of-concept.) From 7c7888a7df0bf629875091177e77fa58a610c57a Mon Sep 17 00:00:00 2001 From: Max Woolf Date: Sun, 18 Apr 2021 10:46:56 -0700 Subject: [PATCH 3/6] add `trained_folder` note --- README.md | 2 +- notebooks/training_hello_world.ipynb | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index e870d08..a26d681 100644 --- a/README.md +++ b/README.md @@ -82,7 +82,7 @@ ai = aitextgen(tokenizer_file=tokenizer_file, config=config) # which automatically processes the dataset with the appropriate size. data = TokenDataset(file_name, tokenizer_file=tokenizer_file, block_size=64) -# Train the model! It will save pytorch_model.bin periodically and after completion. +# Train the model! It will save pytorch_model.bin periodically and after completion to the `trained_model` folder. # On a 2020 8-core iMac, this took ~25 minutes to run. 
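# The arguments below come from the notebook in this repo: batch_size * num_steps
# (8 * 50,000) is roughly one pass (1 epoch) through the tokenized data, while
# generate_every and save_every print sample text and checkpoint the model
# every 5,000 steps.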
ai.train(data, batch_size=8, num_steps=50000, generate_every=5000, save_every=5000) diff --git a/notebooks/training_hello_world.ipynb b/notebooks/training_hello_world.ipynb index 05f2f18..cb64fea 100644 --- a/notebooks/training_hello_world.ipynb +++ b/notebooks/training_hello_world.ipynb @@ -131,7 +131,7 @@ }, { "source": [ - "Train the model! It will save pytorch_model.bin periodically and after completion. On a 2020 8-core iMac, this took ~25 minutes to run.\n", + "Train the model! It will save pytorch_model.bin periodically and after completion to the `trained_model` folder. On a 2020 8-core iMac, this took ~25 minutes to run.\n", "\n", "The configuration below processes 400,000 subsets of tokens (8 * 50000), which is about just one pass through all the data (1 epoch). Ideally you'll want multiple passes through the data and a training loss less than `2.0` for coherent output; when training a model from scratch, that's more difficult, but with long enough training you can get there!" ], @@ -317,9 +317,8 @@ "metadata": {}, "outputs": [], "source": [ - "ai2 = aitextgen(model=\"trained_model/pytorch_model.bin\",\n", - " tokenizer_file=\"aitextgen.tokenizer.json\",\n", - " config=\"trained_model/config.json\")" + "ai2 = aitextgen(model_folder=\"trained_model\",\n", + " tokenizer_file=\"aitextgen.tokenizer.json\")" ] }, { From ad6e3b20317dbc7dab57c519b2c92cc8dd279f71 Mon Sep 17 00:00:00 2001 From: Max Woolf Date: Sun, 18 Apr 2021 12:14:15 -0700 Subject: [PATCH 4/6] update docs for v0.5.0 --- README.md | 6 ++-- docs/dataset.md | 33 +++++++++++++------ docs/generate-performance.md | 6 ++-- docs/generate.md | 18 ++++++---- docs/index.md | 12 ++++--- docs/load-model.md | 49 +++++++++++++++++++--------- docs/loggers.md | 4 +-- docs/save-model.md | 10 +++--- docs/tutorials/colab.md | 6 ++-- docs/tutorials/generate_1_5b.md | 10 +++--- docs/tutorials/hello-world.md | 28 +++++++++------- docs/tutorials/model-from-scratch.md | 24 +++++++------- mkdocs.yml | 31 +++++++++++------- 13 files changed, 143 insertions(+), 94 deletions(-) diff --git a/README.md b/README.md index a26d681..7e644df 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ A robust Python tool for text-based AI training and generation using [OpenAI's]( aitextgen is a Python package that leverages [PyTorch](https://pytorch.org), [Hugging Face Transformers](https://github.com/huggingface/transformers) and [pytorch-lightning](https://github.com/PyTorchLightning/pytorch-lightning) with specific optimizations for text generation using GPT-2, plus _many_ added features. It is the successor to [textgenrnn](https://github.com/minimaxir/textgenrnn) and [gpt-2-simple](https://github.com/minimaxir/gpt-2-simple), taking the best of both packages: -- Finetunes on a pretrained 124M/355M/774M GPT-2 model from OpenAI or a 1325M/355M GPT Neo model from EleutherAI...or create your own GPT-2/GPT Neo model + tokenizer and train from scratch! +- Finetunes on a pretrained 124M/355M/774M GPT-2 model from OpenAI or a 125M/350M GPT Neo model from EleutherAI...or create your own GPT-2/GPT Neo model + tokenizer and train from scratch! - Generates text faster than gpt-2-simple and with better memory efficiency! - With Transformers, aitextgen preserves compatibility with the base package, allowing you to use the model for other NLP tasks, download custom GPT-2 models from the HuggingFace model repository, and upload your own models! Also, it uses the included `generate()` function to allow a massive amount of control over the generated text. 
- With pytorch-lightning, aitextgen trains models not just on CPUs and GPUs, but also _multiple_ GPUs and (eventually) TPUs! It also includes a pretty training progress bar, with the ability to add optional loggers. @@ -35,7 +35,7 @@ Here's how you can quickly test out aitextgen on your own computer, even if you For generating text from a pretrained GPT-2 model: -```python +```py3 from aitextgen import aitextgen # Without any parameters, aitextgen() will download, cache, and load the 124M GPT-2 "small" model @@ -56,7 +56,7 @@ aitextgen generate --prompt "I believe in unicorns because" --to_file False Want to train your own mini GPT-2 model on your own computer? You can follow along [in this Jupyter Notebook](/notebooks/training_hello_world.ipynb) or, download this [text file of Shakespeare's plays](https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt), cd to that directory in a Terminal, open up a `python3` console and go: -```python +```py3 from aitextgen.TokenDataset import TokenDataset from aitextgen.tokenizers import train_tokenizer from aitextgen.utils import GPT2ConfigCPU diff --git a/docs/dataset.md b/docs/dataset.md index b5d8162..7fcb4fa 100644 --- a/docs/dataset.md +++ b/docs/dataset.md @@ -1,19 +1,20 @@ # TokenDataset -aitextgen has a special class, `TokenDataset`, used for managing tokenized datasets to be fed into model training. (this is in contrast with other GPT-2 finetuning approaches, which tokenizes at training time although you can still do that if you want) +aitextgen has a special class, `TokenDataset`, used for managing tokenized datasets to be fed into model training. (this is in contrast with other GPT-2 finetuning approaches, which tokenizes at training time although you can still do that by passing a `file_path` and other relevant parameters to `ai.train()`.) This has a few nice bonuses, including: - Tokenize a dataset on a local machine ahead of time and compress it, saving time/bandwidth transporting data to a remote machine - Supports both reading a dataset line-by-line (including single-column CSVs), or bulk texts. +- Debug and log the loaded texts. - Merge datasets together without using external libraries - Cross-train on multiple datasets to "blend" them together. ## Creating a TokenDataset For GPT-2 Finetuning -The easiest way to create a TokenDataset is to provide a target file. If no `vocab_file` and `merges_file` are provided, it will use the default GPT-2 tokenizer. +The easiest way to create a TokenDataset is to provide a target file. If no `tokenizer_file` is provided, it will use the default GPT-2 tokenizer. -```python +```py3 from aitextgen.TokenDataset import TokenDataset data = TokenDataset("shakespeare.txt") @@ -21,34 +22,46 @@ data = TokenDataset("shakespeare.txt") If you pass a single-column CSV and specify `line_by_line=True`, the TokenDataset will parse it row-by-row, and is the recommended way to handle multiline texts. -```python +```py3 data = TokenDataset("politics.csv", line_by_line=True) ``` You can also manually pass a list of texts to `texts` instead if you've processed them elsewhere. -```python +```py3 data = TokenDataset(texts = ["Lorem", "Ipsum", "Dolor"]) ``` +## Block Size + +`block_size` is another parameter that can be passed when creating a TokenDataset, more useful for custom models. This should match the context window (e.g. the `n_positions` or `max_position_embeddings` config parameters). By default, it will choose `1024`: the GPT-2 context window. 
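For a custom model, that means the dataset and the config should agree. A minimal sketch, reusing the 64-token `GPT2ConfigCPU()` setup and the `aitextgen.tokenizer.json` file from the hello-world example:

```py3
from aitextgen.TokenDataset import TokenDataset
from aitextgen.utils import GPT2ConfigCPU

# GPT2ConfigCPU uses a 64-token context window, so block_size is set to 64
# rather than the 1024 default used by the full-size GPT-2 config.
config = GPT2ConfigCPU()
data = TokenDataset("input.txt",
                    tokenizer_file="aitextgen.tokenizer.json",
                    block_size=64)
```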
+ +When implicitly loading a dataset via `ai.train()`, the `block_size` will be set to what is supported by the corresponding model `config`. + +## Debugging a TokenDataset + +When loading a dataset, a progress bar will appear showing how many texts are loaded and + +If you want to see what exactly is input to the model during training, you can access a slice via `data[0]`. + ## Saving/Loading a TokenDataset When creating a TokenDataset, you can automatically save it as a compressed gzipped numpy array when completed. -```python +```py3 data = TokenDataset("shakespeare.txt", save_cache=True) ``` Or save it after you've loaded it with the `save()` function. -```python +```py3 data = TokenDataset("shakespeare.txt") data.save() ``` By default, it will save to `dataset_cache.tar.gz`. You can then reload that into another Python session by specifying the cache. -```python +```py3 data = TokenDataset("dataset_cache.tar.gz", from_cache=True) ``` @@ -58,7 +71,7 @@ data = TokenDataset("dataset_cache.tar.gz", from_cache=True) ## Using TokenDatasets with a Custom GPT-2 Model -The default TokenDataset has a `block_size` of `1024`, which corresponds to the _context window of the default GPT-2 model_. If you're using a custom model w/ a different maximum. Additionally, you must explicitly provide the vocab and merges files to rebuild the tokenizer, as the tokenizer will be different than the normal GPT-2 one. +The default TokenDataset has a `block_size` of `1024`, which corresponds to the _context window of the default GPT-2 model_. If you're using a custom model w/ a different maximum. Additionally, you must explicitly provide the tokenizer file to rebuild the tokenizer, as the tokenizer will be different than the normal GPT-2 one. See the [Model From Scratch](tutorials/model-from-scratch.md) docs for more info. @@ -72,7 +85,7 @@ Merging processed TokenDatasets can be done with the `merge_datasets()` function !!! note "About Merging" The current implementation merges by subset count, so equalization may not be perfect, but it will not significantly impact training. -```python +```py3 from aitextgen.TokenDataset import TokenDataset, merge_datasets data1 = TokenDataset("politics1000.csv", line_by_line=True) # 1000 samples diff --git a/docs/generate-performance.md b/docs/generate-performance.md index a9a69a5..e312502 100644 --- a/docs/generate-performance.md +++ b/docs/generate-performance.md @@ -10,7 +10,7 @@ PyTorch has the ability to quantize models on the CPU. Currently, it will only q To quantize a model after it's loaded, just run: -```python +```py3 ai.quantize() ``` @@ -22,13 +22,13 @@ Certain GPUs, notably the cheap T4 and the expensive V100, support the ability t Assuming you are using a compatable GPU and already have [apex](https://github.com/NVIDIA/apex) installed, you can convert a model to the "half" FP16 mode with this: -```python +```py3 ai.to_fp16() ``` If you want to convert the model _before_ loading it into GPU memory (which may help avoid memory leaks), you can instantiate the model like this: -```python +```py3 ai.to_fp16(to_gpu=True, to_fp16=True) ``` diff --git a/docs/generate.md b/docs/generate.md index e2578e6..6b024e0 100644 --- a/docs/generate.md +++ b/docs/generate.md @@ -7,14 +7,18 @@ Thanks to the base Transformers package, aitextgen has more options for generati See [this article](https://huggingface.co/blog/how-to-generate) by Huggingface engineer Patrick von Platen for how sampling and these parameters are used in practice. - `n`: Number of texts generated. 
-- `max_length`: Maximum length of the generated text (default: 200; for GPT-2, the maximum is 1024.) -- `prompt`: Prompt that starts the generated text and is included in the generate text. (used to be `prefix` in previous tools) +- `max_length`: Maximum length of the generated text (default: 200; for GPT-2, the maximum is 1024; for GPT Neo, the maximum is 2048) +- `prompt`: Prompt that starts the generated text and is included in the generated text. - `temperature`: Controls the "craziness" of the text (default: 0.7) - `top_k`: If nonzero, limits the sampled tokens to the top _k_ values. (default: 0) - `top_p`: If nonzero, limits the sampled tokens to the cumulative probability Some lesser-known-but-still-useful-parameters that are unique to Transformers: + +!!! warning "Performance" + Enabling these parameters may slow down generation. + - `num_beams`: If greater than 1, executes beam search for cleaner text. - `repetition_penalty`: If greater than 1.0, penalizes repetition in a text to avoid infinite loops. - `length_penalty`: If greater than 1.0, penalizes text proportional to the length @@ -30,17 +34,17 @@ Given a `aitextgen` object with a loaded model + tokenizer named `ai`: want to generate on the GPU, make sure you call `ai.to_gpu()` beforehand, or load the model into the GPU using `ai = aitextgen(to_gpu=True)` -- `ai.generate()`: Generates and prints text to console. If `prompt` is used, the `prompt` is bolded. (a la [Talk to Transformer](https://talktotransformer.com)) +- `ai.generate()`: Generates and prints text to console. If `prompt` is used, the `prompt` is **bolded**. - `ai.generate_one()`: A helper function which generates a single text and returns as a string (good for APIs) - `ai.generate_samples()`: Generates multiple samples at specified temperatures: great for debugging. -- `ai.generate_to_file()`: Generates a bulk amount of texts to file. (this accepts a `batch_size` parameter which is useful if using on a GPU) +- `ai.generate_to_file()`: Generates a bulk amount of texts to file. (this accepts a `batch_size` parameter which is useful if using on a GPU, as it can generate texts in parallel with no performance loss) -!!! note "Cleanup" - By default, the `cleanup` parameter is set to True, which automatically removes texts that are blatantly malformed (e.g. only 2 characters long). Therefore, there may be less than `n` results returned. You can disabled this behavior by setting `cleanup=False`. +!!! note "lstrip and nonempty_output" + By default, the `lstrip` and `nonempty_output` parameters to `generate` are set to `True`, which alters the behavior of the generated text in a way that is most likely preferable. `lstrip`: Removes all whitespace at the beginning of the generated space. `nonempty_output`: If the output is empty (possible on shortform content), skip it if generating multiple texts, or try again if it's a single text. If `min_length` is specified, the same behavior occurs for texts below the minimum length after processing. ## Seed -aitextgen has a new `seed` parameter for generation. Using any generate function with a `seed` parameter (must be an integer) and all other models/parameters the same, and the generated text will be identical. This allows for reproducible generations. +aitextgen has a new `seed` parameter for generation. Using any generate function with a `seed` parameter (must be an integer) and all other models/parameters the same, and the generated text will be identical. 
This allows for reproducible generations in case someone accuses you of faking the AI output. For `generate_to_file()`, the 8-digit number at the end of the file name will be the seed used to generate the file, making reprodicibility easy. diff --git a/docs/index.md b/docs/index.md index d16f6b9..cb22116 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,12 +1,14 @@ # aitextgen -A robust tool for advanced AI text generation via [GPT-2](https://openai.com/blog/better-language-models/). +_Last Updated: April 18th, 2021 (aitextgen v0.5.0)_ -aitextgen is a Python package that leverages [PyTorch](https://pytorch.org), [Huggingface Transformers](https://github.com/huggingface/transformers) and [pytorch-lightning](https://github.com/PyTorchLightning/pytorch-lightning) with specific optimizations for text generation using GPT-2, plus _many_ added features. It is the successor to [textgenrnn](https://github.com/minimaxir/textgenrnn) and [gpt-2-simple](https://github.com/minimaxir/gpt-2-simple), taking the best of both packages: +A robust Python tool for text-based AI training and generation using [OpenAI's](https://openai.com) [GPT-2](https://openai.com/blog/better-language-models/) and [EleutherAI's](https://www.eleuther.ai) [GPT Neo/GPT-3](https://github.com/EleutherAI/gpt-neo) architecture. -- Finetunes on a pretrained 124M GPT-2 model from OpenAI...or create your own GPT-2 model + tokenizer and train from scratch! -- Generates text faster than gpt-2-simple and with better memory efficiency! (even [from the 1.5B GPT-2 model](tutorials/generate_1_5b/)!) -- With Transformers, aitextgen preserves compatibility with the base package, allowing you to use the model for other NLP tasks, download custom GPT-2 models from the Huggingface model repository, and upload your own models! Also, it uses the included `generate()` function to allow a massive amount of control over the generated text. +aitextgen is a Python package that leverages [PyTorch](https://pytorch.org), [Hugging Face Transformers](https://github.com/huggingface/transformers) and [pytorch-lightning](https://github.com/PyTorchLightning/pytorch-lightning) with specific optimizations for text generation using GPT-2, plus _many_ added features. It is the successor to [textgenrnn](https://github.com/minimaxir/textgenrnn) and [gpt-2-simple](https://github.com/minimaxir/gpt-2-simple), taking the best of both packages: + +- Finetunes on a pretrained 124M/355M/774M GPT-2 model from OpenAI or a 125M/355M GPT Neo model from EleutherAI...or create your own GPT-2/GPT Neo model + tokenizer and train from scratch! +- Generates text faster than gpt-2-simple and with better memory efficiency! +- With Transformers, aitextgen preserves compatibility with the base package, allowing you to use the model for other NLP tasks, download custom GPT-2 models from the HuggingFace model repository, and upload your own models! Also, it uses the included `generate()` function to allow a massive amount of control over the generated text. - With pytorch-lightning, aitextgen trains models not just on CPUs and GPUs, but also _multiple_ GPUs and (eventually) TPUs! It also includes a pretty training progress bar, with the ability to add optional loggers. 
- The input dataset is its own object, allowing you to not only easily encode megabytes of data in seconds, cache, and compress it on a local computer before transporting to a remote server, but you are able to _merge_ datasets without biasing the resulting dataset, or _cross-train_ on multiple datasets to create blended output. diff --git a/docs/load-model.md b/docs/load-model.md index cfdd845..66a1bab 100644 --- a/docs/load-model.md +++ b/docs/load-model.md @@ -8,45 +8,62 @@ There are several ways to load models. ## Loading an aitextgen model -The closer to the default 124M GPT-2 model, the fewer files you need! +For the base case, loading the default 124M GPT-2 model via Huggingface: -For the base case, loading the default model via Huggingface: - -```python +```py3 ai = aitextgen() ``` The downloaded model will be downloaded to `cache_dir`: `/aitextgen` by default. -If you've finetuned a 124M GPT-2 model using aitextgen, you can pass the generated `pytorch_model.bin` to aitextgen: +If you're loading a custom model for a different GPT-2/GPT-Neo architecture _from scratch_ but with the normal GPT-2 tokenizer, you can pass only a config. -```python -ai = aitextgen(model="pytorch_model.bin") +```py3 +from aitextgen.utils import GPT2ConfigCPU +config = GPT2ConfigCPU() +ai = aitextgen(config=config) ``` -If you're loading a finetuned model of a different GPT-2 architecture, you'll must also pass the generated `config.json` to aitextgen: +While training/finetuning a model, two files will be created: the `pytorch_model.bin` which contains the weights for the model, and a `config.json` illustrating the architecture for the model. Both of these files are needed to reload the model. + +If you've finetuned a model using aitextgen (the default model), you can pass the **folder name** containing the generated `pytorch_model.bin` and `config.json` to aitextgen (e.g. `trained_model`, which is where trained models will be saved by default). + + +!!! note "Same Directory" + If both files are in the current directory, you can pass `model_folder="."`. -```python -ai = aitextgen(model="pytorch_model.bin", config=config) +```py3 +ai = aitextgen(model_folder="trained_model") ``` -If you want to download an alternative GPT-2 model from Huggingface's repository of models, pass that model name to `model`. +These examples assume you are using the default GPT-2 tokenizer. If you have a _custom tokenizer_, you'll need to pass that along with loading the model. + +```py3 +ai3 = aitextgen(model_folder="trained_model", + tokenizer_file="aitextgen.tokenizer.json") +``` -```python +If you want to download an alternative GPT-2 model from Hugging Face's repository of models, pass that model name to `model`. + +```py3 ai = aitextgen(model="minimaxir/hacker-news") ``` The model and associated config + tokenizer will be downloaded into `cache_dir`. -## Loading TensorFlow-based GPT-2 models +This can also be used to download the [pretrained GPT Neo models](https://huggingface.co/EleutherAI) from EleutherAI. -aitextgen lets you download the models from Google's servers that OpenAI had uploaded back when GPT-2 was first released in 2019. These models are then converted to a PyTorch format. +```py3 +ai = aitextgen(model="EleutherAI/gpt-neo-125M") +``` + +## Loading TensorFlow-based GPT-2 models -It's counterintuitive, but it's _substantially_ faster than downloading from Huggingface's servers, especially if you are running your code on Google Cloud Platform (e.g. 
Colab notebooks) +aitextgen lets you download the models from Microsoft's servers that OpenAI had uploaded back when GPT-2 was first released in 2019. These models are then converted to a PyTorch format. To use this workflow, pass the corresponding model number to `tf_gpt2`: -```python +```py3 ai = aitextgen(tf_gpt2="124M") ``` diff --git a/docs/loggers.md b/docs/loggers.md index 442655e..0df3e6a 100644 --- a/docs/loggers.md +++ b/docs/loggers.md @@ -6,7 +6,7 @@ You can create loggers with popular tools such as [TensorBoard](https://www.tens For example, if you want to create a TensorBoard logger, you can create it: -```python +```py3 from pytorch_lightning import loggers tb_logger = loggers.TensorBoardLogger('logs/') @@ -14,6 +14,6 @@ tb_logger = loggers.TensorBoardLogger('logs/') Then pass it to the `loggers` parameter for `ai.train()`. -```python +```py3 ai.train(train_data=data, loggers=tb_logger) ``` diff --git a/docs/save-model.md b/docs/save-model.md index 70c5bd1..9bd719d 100644 --- a/docs/save-model.md +++ b/docs/save-model.md @@ -2,7 +2,7 @@ There are are multiple ways to save models. -Whenever a model is saved, two files are generated: `pytorch_model.bin` which contains the model weights, and `config.json` which is needed to load the model if it is not the base 124M GPT-2. +Whenever a model is saved, two files are generated: `pytorch_model.bin` which contains the model weights, and `config.json` which is needed to load the model. Assuming we have an aitextgen model `ai`: @@ -10,7 +10,7 @@ Assuming we have an aitextgen model `ai`: The aitextgen model can be saved at any time using `save`. -```python +```py3 ai.save() ``` @@ -24,7 +24,7 @@ If you are using Google Colaboratory, you can mount your personal Google Drive t First mount your Google Drive using `mount_gdrive()`: -```python +```py3 from aitextgen.colab import mount_gdrive, copy_file_to_gdrive mount_gdrive() ``` @@ -33,7 +33,7 @@ You'll be asked for an auth code; input it and press enter, and a `My Drive` fol You can drag and drop the model files into the Google Drive, or use `copy_file_to_gdrive` to copy them programmatically. -```python +```py3 copy_file_to_gdrive("pytorch_model.bin") copy_file_to_gdrive("config.json") ``` @@ -48,7 +48,7 @@ Concerned about timeouts in Google Colab? aitextgen has a feature that will copy As long as your drive is mounted as above, pass `save_gdrive = True` to the `train()` function: -```python +```py3 ai.train(save_gdrive=True) ``` diff --git a/docs/tutorials/colab.md b/docs/tutorials/colab.md index cda92a3..c6cce54 100644 --- a/docs/tutorials/colab.md +++ b/docs/tutorials/colab.md @@ -11,11 +11,11 @@ The Colab Notebooks also contain utilities to make it easier to export the model A Notebook for finetuning OpenAI's model on a GPU. This is the most common use case. -!!! note "124M Only" - Currently you can only finetune the 124M OpenAI GPT-2 model. +!!! note + Currently you can only finetune the 124M/355M/774M OpenAI GPT-2 models, with the latter two forcing `gradient_checkpointing=True` to ensure it does not cause the Colab GPU to go OOM. ## Training Your Own GPT-2 Model [Colab Notebook](https://colab.research.google.com/drive/144MdX5aLqrQ3-YW-po81CQMrD6kpgpYh?usp=sharing) -A Notebook for creating your own GPT-2 model with your own tokenizer. See the Model From Scratch on the advantages and disadvantages of this approach. +A Notebook for creating your own GPT-2 model with your own tokenizer. See the Model From Scratch section on the advantages and disadvantages of this approach. 
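For readers who want the gist of the finetuning notebook without opening Colab, a rough sketch of the workflow it walks through (the dataset file name, batch size, and step counts here are placeholders, not values taken from the notebook):

```py3
from aitextgen import aitextgen
from aitextgen.TokenDataset import TokenDataset

# Download OpenAI's 124M GPT-2 checkpoint (converted to PyTorch) and move it to the GPU.
ai = aitextgen(tf_gpt2="124M", to_gpu=True)

# Tokenize a plain-text dataset with the default GPT-2 tokenizer and finetune on it.
data = TokenDataset("input.txt")
ai.train(data, batch_size=1, num_steps=2000, save_every=1000)
```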
diff --git a/docs/tutorials/generate_1_5b.md b/docs/tutorials/generate_1_5b.md index f7b3d80..e5be959 100644 --- a/docs/tutorials/generate_1_5b.md +++ b/docs/tutorials/generate_1_5b.md @@ -45,16 +45,16 @@ Now go back to the Launcher and create a Python 3 Notebook (or upload the one he !!! warning "CUDA" You may want to ensure the Notebook sees the CUDA installation, which appears to be somewhat random. This can be verified by running `import torch` in a cell, then `torch.cuda.is_available()`. - + In a cell, load aitextgen: -```python +```py3 from aitextgen import aitextgen ``` In another cell, input and run: -```python +```py3 ai = aitextgen(tf_gpt2="1558M", to_gpu=True, to_fp16=True) ``` @@ -71,7 +71,7 @@ Now we can generate texts! The T4, for GPT-2 1.5B in FP16 mode, can generate abo Create a cell and add: -```python +```py3 ai.generate_to_file(n=300, batch_size=30) ``` @@ -83,7 +83,7 @@ And it will generate the texts to a file! When completed, you can double-click t More importantly, all parameters to `generate` are valid, allowing massive flexibility! -```python +```py3 ai.generate_to_file(n=150, batch_size=15, max_length=1024, top_p=0.9, temperature=1.2, prompt="President Donald Trump has magically transformed into a unicorn.") ``` diff --git a/docs/tutorials/hello-world.md b/docs/tutorials/hello-world.md index 2e2510d..2310121 100644 --- a/docs/tutorials/hello-world.md +++ b/docs/tutorials/hello-world.md @@ -4,7 +4,7 @@ Here's how you can quickly test out aitextgen on your own computer, even if you For generating text from a pretrained GPT-2 model: -```python +```py3 from aitextgen import aitextgen # Without any parameters, aitextgen() will download, cache, and load the 124M GPT-2 "small" model @@ -25,7 +25,7 @@ aitextgen generate --prompt "I believe in unicorns because" --to_file False Want to train your own mini GPT-2 model on your own computer? Download this [text file of Shakespeare plays](https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt), cd to that directory in a Teriminal, open up a `python3` console and go: -```python +```py3 from aitextgen.TokenDataset import TokenDataset from aitextgen.tokenizers import train_tokenizer from aitextgen.utils import GPT2ConfigCPU @@ -35,27 +35,33 @@ from aitextgen import aitextgen file_name = "input.txt" # Train a custom BPE Tokenizer on the downloaded text -# This will save two files: aitextgen-vocab.json and aitextgen-merges.txt, -# which are needed to rebuild the tokenizer. +# This will save one file: `aitextgen.tokenizer.json`, which contains the +# information needed to rebuild the tokenizer. train_tokenizer(file_name) -vocab_file = "aitextgen-vocab.json" -merges_file = "aitextgen-merges.txt" +tokenizer_file = "aitextgen.tokenizer.json" # GPT2ConfigCPU is a mini variant of GPT-2 optimized for CPU-training # e.g. the # of input tokens here is 64 vs. 1024 for base GPT-2. config = GPT2ConfigCPU() # Instantiate aitextgen using the created tokenizer and config -ai = aitextgen(vocab_file=vocab_file, merges_file=merges_file, config=config) +ai = aitextgen(tokenizer_file=tokenizer_file, config=config) # You can build datasets for training by creating TokenDatasets, # which automatically processes the dataset with the appropriate size. -data = TokenDataset(file_name, vocab_file=vocab_file, merges_file=merges_file, block_size=64) +data = TokenDataset(file_name, tokenizer_file=tokenizer_file, block_size=64) -# Train the model! It will save pytorch_model.bin periodically and after completion. 
-# On a 2016 MacBook Pro, this took ~25 minutes to run. -ai.train(data, batch_size=16, num_steps=5000) +# Train the model! It will save pytorch_model.bin periodically and after completion to the `trained_model` folder. +# On a 2020 8-core iMac, this took ~25 minutes to run. +ai.train(data, batch_size=8, num_steps=50000, generate_every=5000, save_every=5000) # Generate text from it! ai.generate(10, prompt="ROMEO:") + +# With your trained model, you can reload the model at any time by +# providing the folder containing the pytorch_model.bin model weights + the config, and providing the tokenizer. +ai2 = aitextgen(model_folder="trained_model", + tokenizer_file="aitextgen.tokenizer.json") + +ai2.generate(10, prompt="ROMEO:") ``` diff --git a/docs/tutorials/model-from-scratch.md b/docs/tutorials/model-from-scratch.md index 4a60739..b8ef248 100644 --- a/docs/tutorials/model-from-scratch.md +++ b/docs/tutorials/model-from-scratch.md @@ -8,7 +8,7 @@ If that is _not_ your use case, you may get a better generation quality _and_ sp - Non-English Text - Heavily Encoded Text -It still will require a _massive_ amount of training time (several hours, even on a TPU), but will be more flexible. +It still will require a _massive_ amount of training time (several hours) but will be more flexible. ## Building a Custom Tokenizer. @@ -16,20 +16,20 @@ The `train_tokenizer()` function from `aitextgen.tokenizers` trains the model on !!! note "Vocabulary Size" - The default vocabulary size for `train_tokenizer()` is 5,000 tokens. Although this is much lower than GPT-2's 50k vocab size, the smaller the vocab size, the easier it is to train the model (since it's more likely for the model to make a correct "guess"), and the model file size will be _much_ smaller. + The default vocabulary size for `train_tokenizer()` is 1,000 tokens. Although this is much lower than GPT-2's 50k vocab size, the smaller the vocab size, the easier it is to train the model (since it's more likely for the model to make a correct "guess"), and the model file size will be _much_ smaller. -```python +```py3 from aitextgen.tokenizers import train_tokenizer train_tokenizer(file_name) ``` -This creates two files: `aitextgen-vocab.json` and `aitextgen-merges.txt`, which are needed to rebuild the tokenizer. +This creates one file, `aitextgen.tokenizer.json`, which is needed to rebuild the tokenizer. # Building a Custom Dataset You can build a TokenDataset based off your custom Tokenizer, to be fed into the model. -```python +```py3 data = TokenDataset(file_name, vocab_file=vocab_file, merges_file=merges_file, block_size=32) ``` @@ -39,7 +39,7 @@ Whenever you load a default 124M GPT-2 model, it uses a `GPT2Config()` under the The `build_gpt2_config()` function from `aitextgen.utils` gives you more control. -```python +```py3 config = build_gpt2_config(vocab_size=5000, max_length=32, dropout=0.0, n_embd=256, n_layer=8, n_head=8) ``` @@ -60,20 +60,20 @@ A few notes on the inputs: You can instantiate an empty GPT-2 according to your custom config, and construct a custom tokenizer according to your vocab and merges file: -```python -ai = aitextgen(vocab_file=vocab_file, merges_file=merges_file, config=config) +```py3 +ai = aitextgen(tokenizer_file=tokenizer_file, config=config) ``` Training is done as normal. 
-```python +```py3 ai.train(data, batch_size=16, num_steps=5000) ``` ## Reloading the Custom Model -You'll always need to provide the vocab_file, merges_file, and config (a config file is saved when the model is saved; you can either build it at runtime as above, or use the `config.json`) +You'll always need to provide the tokenizer_file and the folder containing the `pytorch_model.bin` and `config.json`. -```python -ai = aitextgen(model="pytorch_model.bin", vocab_file=vocab_file, merges_file=merges_file, config=config) +```py3 +ai = aitextgen(model_folder="trained_model", tokenizer_file="aitextgen.tokenizer.json") ``` diff --git a/mkdocs.yml b/mkdocs.yml index c1d697c..3e5aac3 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -1,5 +1,5 @@ site_name: aitextgen -site_description: A robust Python tool for text-based AI training and generation using GPT-2. +site_description: A robust Python tool for text-based AI training and generation using GPT-2 and GPT Neo. site_author: Max Woolf (@minimaxir) nav: @@ -8,15 +8,15 @@ nav: - Loading a Model: load-model.md - Saving a Model: save-model.md - TokenDataset: dataset.md - - Training an aitextgen Model: + - Training a Model: - Colaboratory Notebooks: tutorials/colab.md # - Improving Training Performance: train-performance.md - Training a GPT-2 Model From Scratch: tutorials/model-from-scratch.md - Loggers: loggers.md - - Generating from an aitextgen Model: + - Generating from a Model: - Generating Text: generate.md - - Improving Generation Performance: generate-performance.md - - Generating From GPT-2 1.5B: tutorials/generate_1_5b.md + # - Improving Generation Performance: generate-performance.md + # - Generating From GPT-2 1.5B: tutorials/generate_1_5b.md - Importing from gpt-2-simple: gpt-2-simple.md - Helpful Notes: helpful-notes.md - Ethics: ethics.md @@ -26,9 +26,17 @@ nav: theme: name: "material" palette: - scheme: "slate" - primary: "light blue" - accent: "light blue" + - media: "(prefers-color-scheme: light)" + scheme: default + toggle: + icon: material/toggle-switch-off-outline + name: Switch to dark mode + - media: "(prefers-color-scheme: dark)" + scheme: slate + primary: black + toggle: + icon: material/toggle-switch + name: Switch to light mode font: text: "Source Sans Pro" code: "Fira Code" @@ -43,7 +51,7 @@ repo_name: minimaxir/aitextgen repo_url: https://github.com/minimaxir/aitextgen edit_uri: "" -copyright: "Copyright © 2019 - 2020 Max Woolf" +copyright: "Copyright © 2019 - 2021 Max Woolf" extra: social: @@ -53,9 +61,8 @@ extra: link: "https://twitter.com/minimaxir" markdown_extensions: - - codehilite: - guess_lang: false - - mkautodoc + - pymdownx.highlight + - pymdownx.superfences - admonition - toc: permalink: true From e1740efb265ba766f1695e9fbec874cc60f8158b Mon Sep 17 00:00:00 2001 From: Max Woolf Date: Sun, 18 Apr 2021 14:27:15 -0700 Subject: [PATCH 5/6] remove redundant cleanup param --- aitextgen/aitextgen.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/aitextgen/aitextgen.py b/aitextgen/aitextgen.py index 719e8f6..b94dfca 100644 --- a/aitextgen/aitextgen.py +++ b/aitextgen/aitextgen.py @@ -472,7 +472,6 @@ def generate_to_file( destination_path: str = None, sample_delim: str = "=" * 20 + "\n", seed: int = None, - cleanup: bool = True, **kwargs, ) -> None: """ @@ -516,15 +515,6 @@ def generate_to_file( for _ in range(n // batch_size): gen_texts = self.generate(n=batch_size, return_as_list=True, **kwargs) - # Remove empty texts and strip out extra newlines/extra spaces - if cleanup: - texts_to_clean = gen_texts - 
gen_texts = [] - for text in texts_to_clean: - clean_text = text.strip().strip("\n") - if clean_text and len(clean_text) >= 2: - gen_texts.append(clean_text) - for gen_text in gen_texts: f.write("{}\n{}".format(gen_text, sample_delim)) pbar.update(batch_size) From 4a1c6dcd660ef857576b71bf0d2ba4672e4f61c6 Mon Sep 17 00:00:00 2001 From: Max Woolf Date: Sun, 18 Apr 2021 18:02:07 -0700 Subject: [PATCH 6/6] Add assert for old behavior --- aitextgen/aitextgen.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/aitextgen/aitextgen.py b/aitextgen/aitextgen.py index b94dfca..ffd7376 100644 --- a/aitextgen/aitextgen.py +++ b/aitextgen/aitextgen.py @@ -16,6 +16,7 @@ from transformers import ( AutoConfig, AutoModelForCausalLM, + AutoTokenizer, GPT2Config, GPT2LMHeadModel, GPT2TokenizerFast, @@ -97,6 +98,12 @@ def __init__( **kwargs, ) -> None: + if model: + assert not os.path.isfile(model), ( + "As of aitextgen 0.5.0, you must " + + "use `model_folder` to load an existing model." + ) + if not verbose: for module in [ "transformers.file_utils", @@ -189,7 +196,7 @@ def __init__( ) if model and "gpt2" not in model: logger.info(f"Using the tokenizer for {model}.") - self.tokenizer = GPT2TokenizerFast.from_pretrained( + self.tokenizer = AutoTokenizer.from_pretrained( model, cache_dir=cache_dir, )
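A usage sketch of the loading behavior these last two patches enforce — the folder-based path from the docs above, with the old single-file `model=` path now caught by the new assert:

```py3
from aitextgen import aitextgen

# aitextgen(model="trained_model/pytorch_model.bin")  # pre-0.5.0 style; now trips the assert above

# Pass the folder containing pytorch_model.bin + config.json instead,
# along with the custom tokenizer file if one was trained.
ai = aitextgen(model_folder="trained_model",
               tokenizer_file="aitextgen.tokenizer.json")
ai.generate(10, prompt="ROMEO:")
```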