
Commit

code cleanup
AkiRusProd committed Jul 25, 2024
1 parent ac6a795 commit c5fe87b
Showing 22 changed files with 206 additions and 203 deletions.
3 changes: 3 additions & 0 deletions README.md
@@ -675,6 +675,9 @@ Training process Example | Interpolation between images Example
>Example №5
*a beautiful portrait painting of a cyberpunk city by simon stalenhag and pascal blanche and alphonse mucha, in style of colorful comic. symmetry, hyper detailed. octane render. trending on artstation*

Code:
*[Model example](examples/gpt.py)*

</details>

<details>
1 change: 0 additions & 1 deletion data_loader.py
@@ -1,5 +1,4 @@
import os
import tarfile
import zipfile
from pathlib import Path

37 changes: 19 additions & 18 deletions examples/gpt.py
@@ -1,18 +1,19 @@
import math
from pathlib import Path
from typing import Optional

import matplotlib.pyplot as plt
import numpy as np
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import TemplateProcessing
from tqdm import tqdm

import neunet
import neunet.nn as nn
from datasets import load_dataset
from datasets import load_dataset # type: ignore
from neunet import Tensor
from neunet.optim import Adam

import matplotlib.pyplot as plt

class MultiHeadAttention(nn.Module):
def __init__(self, d_model, n_heads, dropout=0.1):
@@ -22,7 +23,8 @@ def __init__(self, d_model, n_heads, dropout=0.1):
self.scale = math.sqrt(d_model)
self.dropout = nn.Dropout(dropout)

assert d_model % n_heads == 0
if d_model % n_heads != 0:
raise ValueError("d_model must be divisible by n_heads")

self.depth = d_model // n_heads

@@ -33,7 +35,7 @@ def __init__(self, d_model, n_heads, dropout=0.1):
self.fc = nn.Linear(d_model, d_model)


def forward(self, q: Tensor, k: Tensor, v: Tensor, mask: Tensor=None):
def forward(self, q: Tensor, k: Tensor, v: Tensor, mask: Optional[Tensor]=None):
batch_size = q.shape[0]
q = self.wq(q).contiguous().reshape(batch_size, -1, self.n_heads, self.depth).transpose(0, 2, 1, 3)
k = self.wk(k).contiguous().reshape(batch_size, -1, self.n_heads, self.depth).transpose(0, 2, 1, 3)
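
Side note on the two cleanups in this hunk: `assert` statements are stripped under `python -O`, so input validation moves to an explicit `ValueError`, and the default-`None` mask now carries an explicit `Optional[Tensor]` annotation. Below is a minimal, framework-free sketch of the same pattern; the class, names and shapes are illustrative and not the repository's `MultiHeadAttention`:

```python
from typing import Optional

import numpy as np


class HeadSplitter:
    """Toy stand-in for the d_model/n_heads bookkeeping above."""

    def __init__(self, d_model: int, n_heads: int):
        # Explicit exception instead of `assert`: survives `python -O`
        # and gives the caller a precise error message.
        if d_model % n_heads != 0:
            raise ValueError("d_model must be divisible by n_heads")
        self.n_heads = n_heads
        self.depth = d_model // n_heads

    def __call__(self, x: np.ndarray, mask: Optional[np.ndarray] = None) -> np.ndarray:
        # (batch, seq, d_model) -> (batch, n_heads, seq, depth)
        batch_size, seq_len, _ = x.shape
        out = x.reshape(batch_size, seq_len, self.n_heads, self.depth).transpose(0, 2, 1, 3)
        if mask is not None:
            # Zero out masked positions (illustrative only).
            out = np.where(mask[:, None, :, None], out, 0.0)
        return out


split = HeadSplitter(d_model=512, n_heads=8)
print(split(np.zeros((2, 10, 512))).shape)  # (2, 8, 10, 64)
```
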
@@ -191,21 +193,20 @@ def forward(self, x) -> tuple[Tensor, Tensor]:

for split, split_dataset in data.items():

with open(f"./datasets/sd-prompts/sd-prompts-{split}.txt", 'w', encoding='utf-8') as f:
with Path(f"./datasets/sd-prompts/sd-prompts-{split}.txt").open('w', encoding='utf-8') as f:
for item in split_dataset:
f.write(item['Prompt'] + '\n')


FILE_PATHS = [DATASET_PATH / "sd-prompts-train.txt", DATASET_PATH / "sd-prompts-test.txt"]
FILE_PATHS = [str(path) for path in FILE_PATHS]


# [Train and load Tokenizer]

if not (SAVE_PATH / "vocab").exists():
tokenizer = ByteLevelBPETokenizer()

tokenizer.train(files=FILE_PATHS, vocab_size=15000, min_frequency=1, special_tokens=[
tokenizer.train(files=[str(path) for path in FILE_PATHS], vocab_size=15000, min_frequency=1, special_tokens=[
PAD_TOKEN,
SOS_TOKEN,
EOS_TOKEN,
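
The dataset-dump and tokenizer-training changes in this hunk follow one convention: keep paths as `Path` objects, open them through `Path.open`, and convert to `str` only at the boundary of an API that expects plain strings (here `ByteLevelBPETokenizer.train`). A small sketch with made-up file names:

```python
from pathlib import Path

DATASET_PATH = Path("./datasets/example")  # hypothetical location
DATASET_PATH.mkdir(parents=True, exist_ok=True)

file_paths = [DATASET_PATH / "train.txt", DATASET_PATH / "test.txt"]

# Write through Path.open instead of the open() builtin.
for path in file_paths:
    with path.open("w", encoding="utf-8") as f:
        f.write("a placeholder prompt\n")

# Convert to str only where a string-based API needs it, e.g.
# tokenizer.train(files=[str(p) for p in file_paths], ...)
print([str(p) for p in file_paths])
```
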
@@ -231,7 +232,7 @@ class DataPreprocessor():
def __init__(self, tokenizer: ByteLevelBPETokenizer):
self.tokenizer = tokenizer

self.tokenizer._tokenizer.post_processor = TemplateProcessing(
self.tokenizer._tokenizer.post_processor = TemplateProcessing( # noqa: SLF001
single=f"{SOS_TOKEN} $A {EOS_TOKEN}",
special_tokens=[
(f"{SOS_TOKEN}", tokenizer.token_to_id(f"{SOS_TOKEN}")),
@@ -242,13 +243,13 @@ def __init__(self, tokenizer: ByteLevelBPETokenizer):
# self.tokenizer.enable_truncation(max_length=151)
self.tokenizer.enable_padding(pad_token = PAD_TOKEN)

def tokenize(self, paths: list[str], batch_size: int, lines_limit: int = None) -> np.ndarray:
def tokenize(self, paths: list[str], batch_size: int, lines_limit: Optional[int] = None) -> list[np.ndarray]:
examples = []

for src_file in paths:
print(f"Processing {src_file}")
src_file = Path(src_file)
lines = src_file.read_text(encoding="utf-8").splitlines()
path_src_file = Path(src_file)
lines = path_src_file.read_text(encoding="utf-8").splitlines()

if lines_limit:
lines = lines[:lines_limit]
@@ -259,13 +260,13 @@ def tokenize(self, paths: list[str], batch_size: int, lines_limit: int = None) -

return examples

def __call__(self, paths: list[str], batch_size: int, lines_limit: int = None) -> np.ndarray:
def __call__(self, paths: list[str], batch_size: int, lines_limit: Optional[int] = None) -> list[np.ndarray]:
return self.tokenize(paths, batch_size, lines_limit)

data_post_processor = DataPreprocessor(tokenizer)

train_data = data_post_processor([FILE_PATHS[0]], batch_size = BATCH_SIZE, lines_limit=20000)
val_data = data_post_processor([FILE_PATHS[1]], batch_size = BATCH_SIZE, lines_limit=2000)
train_data = data_post_processor([str(FILE_PATHS[0])], batch_size = BATCH_SIZE, lines_limit=20000)
val_data = data_post_processor([str(FILE_PATHS[1])], batch_size = BATCH_SIZE, lines_limit=2000)
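
The `DataPreprocessor.tokenize` signature now spells the default-`None` limit as `Optional[int]`, returns `list[np.ndarray]` to match the actual list of padded batches, and no longer rebinds the loop variable `src_file` to a `Path`. A stripped-down sketch of that shape, with placeholder tokenization standing in for the real tokenizer:

```python
from pathlib import Path
from typing import Optional

import numpy as np


def tokenize_files(paths: list[str], batch_size: int,
                   lines_limit: Optional[int] = None) -> list[np.ndarray]:
    batches: list[np.ndarray] = []
    for src_file in paths:
        path_src_file = Path(src_file)  # new name instead of rebinding src_file
        lines = path_src_file.read_text(encoding="utf-8").splitlines()
        if lines_limit:
            lines = lines[:lines_limit]
        # Placeholder "tokenization": one integer id per character.
        ids = [[ord(ch) for ch in line] for line in lines]
        for i in range(0, len(ids), batch_size):
            chunk = ids[i:i + batch_size]
            width = max((len(row) for row in chunk), default=0)
            padded = [row + [0] * (width - len(row)) for row in chunk]
            batches.append(np.asarray(padded, dtype=np.int64))
    return batches
```
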



@@ -300,7 +301,7 @@ def __call__(self, paths: list[str], batch_size: int, lines_limit: int = None) -

# [train, eval, predict methods definition]

def train_step(dataset: np.ndarray, epoch: int, epochs: int) -> float:
def train_step(dataset: list[np.ndarray], epoch: int, epochs: int) -> float:
loss_history = []
model.train()

@@ -333,7 +334,7 @@ def train_step(dataset: np.ndarray, epoch: int, epochs: int) -> float:

return epoch_loss

def eval(dataset: np.ndarray) -> float:
def eval(dataset: list[np.ndarray]) -> float:
loss_history = []
model.eval()

@@ -361,7 +362,7 @@ def eval(dataset: np.ndarray) -> float:
return epoch_loss


def train(train_data: np.ndarray, val_data: np.ndarray, epochs: int, save_every_epochs: int, save_path: str = None, validation_check: bool = False):
def train(train_data: list[np.ndarray], val_data: list[np.ndarray], epochs: int, save_every_epochs: int, save_path: Optional[str] = None, validation_check: bool = False):
best_val_loss = float('inf')

train_loss_history = []
@@ -395,7 +396,7 @@ def train(train_data: np.ndarray, val_data: np.ndarray, epochs: int, save_every_
def predict(sentence: str = "", max_length: int = 50, temperature: float = 0.7) -> tuple[str, Tensor]:
model.eval()

tokens: list = [SOS_INDEX] + tokenizer.encode(sentence, add_special_tokens=False).ids
tokens: list = [SOS_INDEX, *tokenizer.encode(sentence, add_special_tokens=False).ids]

for _ in range(max_length):
inputs = np.asarray(tokens).reshape(1, -1)
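
The `tokens` line above swaps list concatenation for unpacking; both build the same list, the unpacked form just avoids the throwaway one-element list on the left of `+`. Toy values (not the repository's tokenizer output):

```python
SOS_INDEX = 1                 # hypothetical id
encoded_ids = [17, 254, 9]    # hypothetical tokenizer output

tokens = [SOS_INDEX, *encoded_ids]   # new style
same = [SOS_INDEX] + encoded_ids     # old style
print(tokens == same)                # True
```
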
65 changes: 33 additions & 32 deletions examples/seq2seq.py
@@ -1,5 +1,6 @@
import math
from pathlib import Path
from typing import Optional

import matplotlib.pyplot as plt
import numpy as np
@@ -9,7 +10,7 @@

import neunet
import neunet.nn as nn
from datasets import load_dataset
from datasets import load_dataset # type: ignore
from neunet import Tensor
from neunet.optim import Adam

@@ -27,7 +28,8 @@ def __init__(self, d_model, n_heads, dropout=0.1):
self.scale = math.sqrt(d_model)
self.dropout = nn.Dropout(dropout)

assert d_model % n_heads == 0
if d_model % n_heads != 0:
raise ValueError("d_model must be divisible by n_heads")

self.depth = d_model // n_heads

@@ -37,7 +39,7 @@ def __init__(self, d_model, n_heads, dropout=0.1):

self.fc = nn.Linear(d_model, d_model)

def forward(self, q: Tensor, k: Tensor, v: Tensor, mask: Tensor=None):
def forward(self, q: Tensor, k: Tensor, v: Tensor, mask: Optional[Tensor]=None):
batch_size = q.shape[0]
q = self.wq(q).contiguous().reshape(batch_size, -1, self.n_heads, self.depth).transpose(0, 2, 1, 3)
k = self.wk(k).contiguous().reshape(batch_size, -1, self.n_heads, self.depth).transpose(0, 2, 1, 3)
@@ -246,7 +248,7 @@ def forward(self, src: np.ndarray, tgt: np.ndarray) -> tuple[Tensor, Tensor]:
PAD_TOKEN = '<pad>' # noqa: S105
SOS_TOKEN = '<sos>' # noqa: S105
EOS_TOKEN = '<eos>' # noqa: S105
# UNK_TOKEN = '<unk>' # noqa: S105
# UNK_TOKEN = '<unk>'

DATASET_PATH = Path("./datasets/multi30k/")
SAVE_PATH = Path("./saved models/seq2seq/")
@@ -255,23 +257,22 @@ def forward(self, src: np.ndarray, tgt: np.ndarray) -> tuple[Tensor, Tensor]:
data = load_dataset("bentrevett/multi30k", cache_dir="datasets/multi30k")

for split, split_dataset in data.items():
with open(f"./datasets/multi30k/{split}.en", 'w', encoding='utf-8') as f:
with Path(f"./datasets/multi30k/{split}.en").open('w', encoding='utf-8') as f:
for item in split_dataset:
f.write(item['en'] + '\n')

with open(f"./datasets/multi30k/{split}.de", 'w', encoding='utf-8') as f:
with Path(f"./datasets/multi30k/{split}.de").open('w', encoding='utf-8') as f:
for item in split_dataset:
f.write(item['de'] + '\n')

FILE_PATHS = [DATASET_PATH / "train.en", DATASET_PATH / "train.de", DATASET_PATH / "val.en", DATASET_PATH / "val.de", DATASET_PATH / "test.en", DATASET_PATH / "test.de"]
FILE_PATHS = [str(path) for path in FILE_PATHS]


# [Train and load Tokenizer]
if not (SAVE_PATH / "vocab").exists():
tokenizer = ByteLevelBPETokenizer()

tokenizer.train(files=FILE_PATHS, vocab_size=15000, min_frequency=1, special_tokens=[
tokenizer.train(files=[str(path) for path in FILE_PATHS], vocab_size=15000, min_frequency=1, special_tokens=[
PAD_TOKEN,
SOS_TOKEN,
EOS_TOKEN,
@@ -298,7 +299,7 @@ class DataPreprocessor():
def __init__(self, tokenizer: ByteLevelBPETokenizer):
self.tokenizer = tokenizer

self.tokenizer._tokenizer.post_processor = TemplateProcessing(
self.tokenizer._tokenizer.post_processor = TemplateProcessing( # noqa: SLF001
single=f"{SOS_TOKEN} $A {EOS_TOKEN}",
special_tokens=[
(f"{SOS_TOKEN}", tokenizer.token_to_id(f"{SOS_TOKEN}")),
@@ -309,13 +310,13 @@ def __init__(self, tokenizer: ByteLevelBPETokenizer):
# self.tokenizer.enable_truncation(max_length=128)
self.tokenizer.enable_padding(pad_token = PAD_TOKEN)

def tokenize(self, paths: list[str], batch_size: int, lines_limit: int = None) -> np.ndarray:
def tokenize(self, paths: list[str], batch_size: int, lines_limit: Optional[int] = None) -> list[np.ndarray]:
examples = []

for src_file in paths:
print(f"Processing {src_file}")
src_file = Path(src_file)
lines = src_file.read_text(encoding="utf-8").splitlines()
path_src_file = Path(src_file)
lines = path_src_file.read_text(encoding="utf-8").splitlines()

if lines_limit:
lines = lines[:lines_limit]
@@ -326,20 +327,20 @@ def tokenize(self, paths: list[str], batch_size: int, lines_limit: int = None) -

return examples

def __call__(self, paths: list[str], batch_size: int, lines_limit: int = None) -> np.ndarray:
def __call__(self, paths: list[str], batch_size: int, lines_limit: Optional[int] = None) -> list[np.ndarray]:
return self.tokenize(paths, batch_size, lines_limit)


data_post_processor = DataPreprocessor(tokenizer)

train_src = data_post_processor([DATASET_PATH / "train.en"], batch_size = BATCH_SIZE)
train_tgt = data_post_processor([DATASET_PATH / "train.de"], batch_size = BATCH_SIZE)
train_src = data_post_processor([str(DATASET_PATH / "train.en")], batch_size = BATCH_SIZE)
train_tgt = data_post_processor([str(DATASET_PATH / "train.de")], batch_size = BATCH_SIZE)

val_src = data_post_processor([DATASET_PATH / "val.en"], batch_size = BATCH_SIZE)
val_tgt = data_post_processor([DATASET_PATH / "val.de"], batch_size = BATCH_SIZE)
val_src = data_post_processor([str(DATASET_PATH / "val.en")], batch_size = BATCH_SIZE)
val_tgt = data_post_processor([str(DATASET_PATH / "val.de")], batch_size = BATCH_SIZE)

test_src = data_post_processor([DATASET_PATH / "test.en"], batch_size = BATCH_SIZE)
test_tgt = data_post_processor([DATASET_PATH / "test.de"], batch_size = BATCH_SIZE)
test_src = data_post_processor([str(DATASET_PATH / "test.en")], batch_size = BATCH_SIZE)
test_tgt = data_post_processor([str(DATASET_PATH / "test.de")], batch_size = BATCH_SIZE)


train_data = train_src, train_tgt
@@ -386,11 +387,11 @@ def __call__(self, paths: list[str], batch_size: int, lines_limit: int = None) -

# [train, eval, predict methods definition]

def train_step(source: np.ndarray, target: np.ndarray, epoch: int, epochs: int) -> float:
def train_step(source: list[np.ndarray], target: list[np.ndarray], epoch: int, epochs: int) -> float:
loss_history = []
model.train()

tqdm_range = tqdm(enumerate(zip(source, target)), total = len(source))
tqdm_range = tqdm(enumerate(zip(source, target, strict=False)), total = len(source))
for batch_num, (source_batch, target_batch) in tqdm_range:

output, _ = model.forward(source_batch, target_batch[:,:-1])
@@ -419,11 +420,11 @@ def train_step(source: np.ndarray, target: np.ndarray, epoch: int, epochs: int)

return epoch_loss
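
`strict=` on `zip` is a Python 3.10+ parameter; `strict=False` keeps the old behaviour of silently stopping at the shorter of the two batch lists, while `strict=True` would raise if source and target ever fell out of sync. A toy illustration:

```python
source_batches = ["s1", "s2", "s3"]
target_batches = ["t1", "t2"]

# strict=False: silently truncates to the shorter input (old zip behaviour).
print(list(zip(source_batches, target_batches, strict=False)))

# strict=True: a length mismatch becomes an explicit error.
try:
    list(zip(source_batches, target_batches, strict=True))
except ValueError as err:
    print(err)  # zip() argument 2 is shorter than argument 1
```
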

def eval(source: np.ndarray, target: np.ndarray) -> float:
def eval(source: list[np.ndarray], target: list[np.ndarray]) -> float:
loss_history = []
model.eval()

tqdm_range = tqdm(enumerate(zip(source, target)), total = len(source))
tqdm_range = tqdm(enumerate(zip(source, target, strict=False)), total = len(source))
for batch_num, (source_batch, target_batch) in tqdm_range:

output, _ = model.forward(source_batch, target_batch[:,:-1])
@@ -447,7 +448,7 @@ def eval(source: np.ndarray, target: np.ndarray) -> float:
return epoch_loss


def train(train_data: np.ndarray, val_data: np.ndarray, epochs: int, save_every_epochs: int, save_path: str = None, validation_check: bool = False):
def train(train_data: tuple[list[np.ndarray], list[np.ndarray]], val_data: tuple[list[np.ndarray], list[np.ndarray]], epochs: int, save_every_epochs: int, save_path: Optional[str] = None, validation_check: bool = False):
best_val_loss = float('inf')

train_loss_history = []
@@ -547,23 +548,22 @@ def plot_loss_history(train_loss_history, val_loss_history):



test_data = []
raw_test_data: list[dict[str, str]] = []

with open(DATASET_PATH / "test.en", 'r') as f:
en_file = [l.strip() for l in open(DATASET_PATH / "test.en", 'r', encoding='utf-8')]
de_file = [l.strip() for l in open(DATASET_PATH / "test.de", 'r', encoding='utf-8')]
en_file = [l.strip() for l in Path(DATASET_PATH / "test.en").open('r', encoding='utf-8')]
de_file = [l.strip() for l in Path(DATASET_PATH / "test.de").open('r', encoding='utf-8')]

for i in range(len(en_file)):
if en_file[i] == '' or de_file[i] == '':
continue
en_seq, de_seq = en_file[i], de_file[i]

test_data.append({'en': en_seq, 'de': de_seq})
raw_test_data.append({'en': en_seq, 'de': de_seq})

sentences_num = 10

random_indices = np.random.randint(0, len(test_data), sentences_num)
sentences_selection = [test_data[i] for i in random_indices]
random_indices = np.random.randint(0, len(raw_test_data), sentences_num)
sentences_selection = [raw_test_data[i] for i in random_indices]
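
For the test-set pairing above, a compact equivalent would read both sides through `Path.open`, zip them strictly so a length mismatch fails loudly, and drop empty pairs. This is only a sketch under the dataset layout this script assumes:

```python
from pathlib import Path

DATASET_PATH = Path("./datasets/multi30k")

with (DATASET_PATH / "test.en").open(encoding="utf-8") as en_f, \
     (DATASET_PATH / "test.de").open(encoding="utf-8") as de_f:
    pairs = [
        {"en": en.strip(), "de": de.strip()}
        for en, de in zip(en_f, de_f, strict=True)
        if en.strip() and de.strip()
    ]
print(f"{len(pairs)} parallel test sentences")
```
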

# [Translate sentences from validation set]
for i, example in enumerate(sentences_selection):
Expand All @@ -575,7 +575,8 @@ def plot_loss_history(train_loss_history, val_loss_history):


def plot_attention(sentence: str, translation: str, attention: Tensor, heads_num: int = 8, rows_num: int = 2, cols_num: int = 4):
assert rows_num * cols_num == heads_num
if rows_num * cols_num != heads_num:
raise ValueError("heads_num must be equal to rows_num * cols_num")
attention = attention.detach().cpu().numpy().squeeze()

sentence = tokenizer.encode(sentence, add_special_tokens=False).tokens