This repository was archived by the owner on Jul 13, 2025. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgenerate_tokenizer.py
More file actions
62 lines (55 loc) · 1.76 KB
/
generate_tokenizer.py
File metadata and controls
62 lines (55 loc) · 1.76 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
from pathlib import Path
from argparse import ArgumentParser, Namespace
from miditok import TokenizerConfig, REMI, MIDILike
def parse_arguments() -> Namespace:
parser = ArgumentParser(description="Tokenizer Generation")
parser.add_argument(
"--data_folder",
type=str,
default="Pop1K7/midi_analyzed",
help="folder of dataset"
)
parser.add_argument(
"--tokenizer_name",
type=str,
default="remi",
choices=["remi", "remiplus", "midilike"],
)
return parser.parse_args()
if __name__ == "__main__":
args = parse_arguments()
match args.tokenizer_name:
case "remi":
config = TokenizerConfig(
num_velocities=16,
use_chords=True,
use_programs=False,
use_tempos=True,
)
tokenizer = REMI(config)
case "remiplus":
config = TokenizerConfig(
num_velocities=16,
use_chords=True,
use_programs=False,
use_tempos=True,
one_token_stream_for_programs = True,
use_time_signatures = True,
)
tokenizer = REMI(config)
case "midilike":
config = TokenizerConfig(
num_velocities=16,
use_chords=True,
use_programs=False,
use_tempos=True,
max_duration=(4, 480, 120)
)
tokenizer = MIDILike(config)
case _:
raise ValueError("Invalid tokenizer name")
tokenizer.train(
vocab_size=30000,
files_paths=list(Path(args.data_folder).glob("**/*.mid")),
)
tokenizer.save(Path("tokenizers", f"{args.tokenizer_name}.json"))