process_data.py
# coding=utf-8
# Copyright 2022 Statistics and Machine Learning Research Group at HKUST. All rights reserved.
# code taken from commit: ea000838156e3be251699ad6a3c8b1339c76e987
# https://github.com/IntelLabs/academic-budget-bert
# Copyright 2021 Intel Corporation. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
import pathlib
import subprocess

from data.BookscorpusTextFormatting import BookscorpusTextFormatting
from data.WikicorpusTextFormatting import WikicorpusTextFormatting
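# Note: these formatter classes come from the repo's local ``data`` package; each
# is assumed to expose a merge() method that walks its input directory and writes
# one article per line to the output file (see the merge step at the bottom).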
logging.basicConfig(
    format="%(asctime)s - %(levelname)s: %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
# Import check only: the extractor itself is invoked via the CLI command below.
try:
    from wikiextractor import WikiExtractor  # noqa: F401
except ModuleNotFoundError:
    logger.error("wikiextractor is not installed; please install it to use this script")
    raise SystemExit(1)
WIKI_EXT_CMD = "python -m wikiextractor.WikiExtractor"
FORMATTERS = {"wiki": WikicorpusTextFormatting, "bookcorpus": BookscorpusTextFormatting}
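# FORMATTERS maps the --type flag to the class that performs the final merge;
# WIKI_EXT_CMD shells out to wikiextractor, whose -b flag (passed below as
# "-b 100M") caps the size of each extracted output file.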
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-f", type=str, required=True, help="Path to Wikipedia XML dump or bookcorpus directory"
    )
    parser.add_argument("-o", type=str, required=True, help="Output directory")
    parser.add_argument(
        "--type",
        type=str,
        required=True,
        choices=FORMATTERS.keys(),
        help="Dataset type [wiki, bookcorpus]",
    )
    parser.add_argument(
        "--n_processes", type=int, default=16, help="Number of concurrent processes"
    )
    args = parser.parse_args()
    merged_file = pathlib.Path(args.o, f"{args.type}_one_article_per_line.txt")
    fmt = FORMATTERS.get(args.type)

    if args.type == "wiki":
        # Extract article text from the raw XML dump before merging.
        data_path = pathlib.Path(args.o, args.type)
        data_path.mkdir(parents=True, exist_ok=True)
        logger.info("Extracting articles using wikiextractor ...")
        EXTRACT_CMD = f"{WIKI_EXT_CMD} {args.f} -b 100M --processes {args.n_processes} -o {data_path.absolute()}"
        subprocess.run(EXTRACT_CMD, shell=True, check=True)
        logger.info("Done.\n")
    elif args.type == "bookcorpus":
        data_path = pathlib.Path(args.f)

    # The merge step runs for both dataset types: the formatter collects every
    # text file under data_path into a single one-article-per-line output file.
    logger.info(f"Loading {args.type} files and combining into 1 file ...")
    data_formatter = fmt(str(data_path), str(merged_file), recursive=True)
    data_formatter.merge()
    logger.info("Done.")