Commit 593feb3

prepare all artifacts from prepare files

Signed-off-by: dafnapension <[email protected]>
1 parent 84bcd45 · commit 593feb3

5 files changed: +269 additions, -183 deletions

.github/workflows/catalog_consistency.yml

Lines changed: 1 addition & 1 deletion
@@ -36,4 +36,4 @@ jobs:
       - run: uv pip install --system -e ".[tests]"
 
       - name: Run Tests
-        run: python utils/prepare_all_artifacts.py
+        run: python utils/check_catalog_consistency.py

prepare/tasks/qa/tasks.py

Lines changed: 6 additions & 5 deletions
@@ -13,11 +13,6 @@
     Text,
 )
 
-add_link_to_catalog(
-    artifact_linked_to="tasks.qa.extractive",
-    name="tasks.qa.with_context.extractive",
-    overwrite=True,
-)
 add_to_catalog(
     Task(
         __description__="""This is the Question Answering Task with provided context , where the answer must be extracted verbatim from the context.
@@ -40,6 +35,12 @@
     overwrite=True,
 )
 
+add_link_to_catalog(
+    artifact_linked_to="tasks.qa.extractive",
+    name="tasks.qa.with_context.extractive",
+    overwrite=True,
+)
+
 add_to_catalog(
     Task(
         __description__="""""",
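The functional change in this file is purely one of ordering: the alias entry is now registered only after the artifact it links to has been written to the catalog. A minimal sketch of that pattern, assuming the helpers are importable from unitxt.catalog and unitxt.blocks as in other prepare files (the Task fields below are illustrative placeholders, not the real tasks.qa.extractive definition):

from unitxt.blocks import Task
from unitxt.catalog import add_link_to_catalog, add_to_catalog

# First write the actual entry to the catalog ...
add_to_catalog(
    Task(
        input_fields={"context": str, "question": str},  # placeholder fields
        reference_fields={"answers": list},               # placeholder fields
        prediction_type=str,
        metrics=["metrics.squad"],                        # placeholder metric
    ),
    "tasks.qa.extractive",
    overwrite=True,
)

# ... and only then register the alias that points at it, as the moved block does.
add_link_to_catalog(
    artifact_linked_to="tasks.qa.extractive",
    name="tasks.qa.with_context.extractive",
    overwrite=True,
)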

src/unitxt/artifact.py

Lines changed: 0 additions & 19 deletions
@@ -2,7 +2,6 @@
 import inspect
 import json
 import os
-import pkgutil
 import re
 import sys
 import sysconfig
@@ -845,24 +844,6 @@ def maybe_recover_artifact(obj):
     return obj
 
 
-def register_all_artifacts(path):
-    for loader, module_name, _is_pkg in pkgutil.walk_packages(path):
-        logger.info(__name__)
-        if module_name == __name__:
-            continue
-        logger.info(f"Loading {module_name}")
-        # Import the module
-        module = loader.find_module(module_name).load_module(module_name)
-
-        # Iterate over every object in the module
-        for _name, obj in inspect.getmembers(module):
-            # Make sure the object is a class
-            if inspect.isclass(obj):
-                # Make sure the class is a subclass of Artifact (but not Artifact itself)
-                if issubclass(obj, Artifact) and obj is not Artifact:
-                    logger.info(obj)
-
-
 def get_artifacts_data_classification(artifact: str) -> Optional[List[str]]:
     """Loads given artifact's data classification policy from an environment variable.
 
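The deleted register_all_artifacts helper (together with its pkgutil import) relied on loader.find_module().load_module(), an API deprecated in favor of importlib; the new checker below instead imports each prepare file explicitly with importlib.util. If package-wide scanning for Artifact subclasses were ever wanted again, a rough modern sketch (not part of this commit; names and parameters are illustrative) could look like:

import importlib
import inspect
import pkgutil


def iter_subclasses_in_package(package_path, package_prefix, base_class):
    # Walk every module under package_path (e.g. some_package.__path__ with
    # prefix "some_package."), import it by name, and yield subclasses of
    # base_class other than base_class itself.
    for module_info in pkgutil.walk_packages(package_path, prefix=package_prefix):
        module = importlib.import_module(module_info.name)
        for _name, obj in inspect.getmembers(module, inspect.isclass):
            if issubclass(obj, base_class) and obj is not base_class:
                yield obj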

utils/check_catalog_consistency.py

Lines changed: 208 additions & 0 deletions
@@ -0,0 +1,208 @@
+import filecmp
+import glob
+import importlib.util
+import os
+import shutil
+from collections import defaultdict
+from pathlib import Path
+
+from unitxt import get_logger
+from unitxt.settings_utils import get_constants, get_settings
+
+logger = get_logger()
+constants = get_constants()
+settings = get_settings()
+
+
+def import_module_from_file(file_path):
+    # Get the module name (file name without extension)
+    module_name = os.path.splitext(os.path.basename(file_path))[0]
+    # Create a module specification
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    # Create a new module based on the specification
+    module = importlib.util.module_from_spec(spec)
+    # Load the module
+    logger.info(
+        f"allow unverified code in {file_path} : {settings.allow_unverified_code}"
+    )
+    spec.loader.exec_module(module)
+    return module
+
+
+# flake8: noqa: C901
+def main():
+    catalog_dir = constants.catalog_dir
+    catalog_back_dir = catalog_dir + "_back"
+
+    os.environ["UNITXT_USE_ONLY_LOCAL_CATALOGS"] = "True"
+    os.environ["UNITXT_TEST_CARD_DISABLE"] = "True"
+    os.environ["UNITXT_TEST_METRIC_DISABLE"] = "True"
+    os.environ["UNITXT_ALLOW_UNVERIFIED_CODE"] = "True"
+    os.environ["UNITXT_SKIP_ARTIFACTS_PREPARE_AND_VERIFY"] = "True"
+    logger.info("*" * 100)
+    logger.info("*" * 100)
+    logger.info(
+        "Copying all files from 'src/unitxt/catalog' to a backup 'src/unitxt/catalog_back'"
+    )
+    shutil.rmtree(catalog_back_dir, ignore_errors=True)
+    shutil.copytree(catalog_dir, catalog_back_dir)
+
+    logger.critical("Starting to reprepare the catalog...")
+    prepare_dir = os.path.join(Path(catalog_dir).parent.parent.parent, "prepare")
+    prepare_files = sorted(glob.glob(f"{prepare_dir}/**/*.py", recursive=True))
+    failing_prepare_files = []
+    prepare_files_generating_entries_not_in_the_catalog = []
+    prepare_files_generating_entries_of_different_content_from_what_is_in_the_catalog = []
+    catalog_files_generated_thus_far = defaultdict(
+        list
+    )  # from catalog_file to list of its generators
+    current_catalog_files = glob.glob(f"{catalog_dir}/**/*.json", recursive=True)
+    initial_time = os.path.getmtime(catalog_dir)
+    for current_catalog_file in current_catalog_files:
+        if os.path.getmtime(current_catalog_file) > initial_time:
+            initial_time = os.path.getmtime(current_catalog_file)
+    # initial_time is the most recent modification time of any catalog file
+    next_border_time = initial_time
+    for i, prepare_file in enumerate(prepare_files):
+        logger.info("*" * 100)
+        logger.info(f"* {i}/{len(prepare_files)}: {prepare_file}")
+        logger.info("*")
+        border_time = next_border_time
+        try:
+            import_module_from_file(prepare_file)
+            current_catalog_files = glob.glob(
+                f"{catalog_dir}/**/*.json", recursive=True
+            )
+            new_times = []  # modification times of catalog files changed by prepare_file
+            for current_catalog_file in current_catalog_files:
+                if (
+                    os.path.getmtime(current_catalog_file) > border_time
+                ):  # current_catalog_file was just generated by prepare_file
+                    new_times.append(os.path.getmtime(current_catalog_file))
+                    catalog_files_generated_thus_far[current_catalog_file].append(
+                        prepare_file
+                    )
+                    if not os.path.exists(
+                        current_catalog_file.replace(catalog_dir, catalog_back_dir)
+                    ):
+                        # prepare_file generates a catalog file that is not a member of branch's original catalog
+                        prepare_files_generating_entries_not_in_the_catalog.append(
+                            prepare_file
+                        )
+                        # return branch's catalog to its original state:
+                        os.remove(current_catalog_file)
+                    elif not filecmp.cmp(
+                        current_catalog_file,
+                        current_catalog_file.replace(catalog_dir, catalog_back_dir),
+                        shallow=False,
+                    ):
+                        # prepare_file generates a catalog file that is different from the existing branch's catalog file of same name
+                        prepare_files_generating_entries_of_different_content_from_what_is_in_the_catalog.append(
+                            prepare_file
+                        )
+                        # restore current_catalog_file from backup catalog.
+                        shutil.copy(
+                            current_catalog_file.replace(catalog_dir, catalog_back_dir),
+                            current_catalog_file,
+                        )
+                        # modification time of current_catalog_file is now - the time of copying
+                        new_times.append(os.path.getmtime(current_catalog_file))
+
+            if new_times:
+                # several prepare files are all commented out, waiting for a fix
+                next_border_time = max(new_times)
+
+        except Exception as e:
+            logger.info(f"Failed to run prepare file: {prepare_file}")
+            failing_prepare_files.append((prepare_file, e))
+
+    # report errors discovered thus far
+    if failing_prepare_files:
+        logger.critical(
+            f"Execution of the following {len(failing_prepare_files)} prepare files failed for the following respective causes:"
+        )
+        for prepare_file, e in failing_prepare_files:
+            logger.critical(
+                f"prepare file: '{prepare_file}' failed, throwing exception: '{e}'"
+            )
+
+    if prepare_files_generating_entries_not_in_the_catalog:
+        prepare_files_generating_entries_not_in_the_catalog = sorted(
+            set(prepare_files_generating_entries_not_in_the_catalog)
+        )
+        logger.critical(
+            f"The following {len(prepare_files_generating_entries_not_in_the_catalog)} prepare files generated catalog files that are not included in the catalog. To fix: add the products of these prepare files to the catalog."
+        )
+        for prepare_file in prepare_files_generating_entries_not_in_the_catalog:
+            logger.critical(f"{prepare_file}")
+
+    if prepare_files_generating_entries_of_different_content_from_what_is_in_the_catalog:
+        prepare_files_generating_entries_of_different_content_from_what_is_in_the_catalog = sorted(
+            set(
+                prepare_files_generating_entries_of_different_content_from_what_is_in_the_catalog
+            )
+        )
+        logger.critical(
+            f"The following {len(prepare_files_generating_entries_of_different_content_from_what_is_in_the_catalog)} prepare files generated catalog files of different contents from what is included in the (original branch's) catalog. To fix: update the branch's catalog files by the products of these prepare files."
+        )
+        for prepare_file in prepare_files_generating_entries_of_different_content_from_what_is_in_the_catalog:
+            logger.critical(f"{prepare_file}")
+
+    # see if the branch's catalog contains any file that none of the branch's prepare file generates:
+    catalog_files_not_generated_by_any_prepare_file = []
+    current_catalog_files = glob.glob(f"{catalog_dir}/**/*.json", recursive=True)
+    for current_catalog_file in current_catalog_files:
+        if (
+            os.path.getmtime(current_catalog_file) > initial_time
+        ):  # current_catalog_file was touched by a prepare file
+            continue
+        catalog_files_not_generated_by_any_prepare_file.append(current_catalog_file)
+
+    if catalog_files_not_generated_by_any_prepare_file:
+        logger.critical(
+            f"The following {len(catalog_files_not_generated_by_any_prepare_file)} branch's catalog files are not generated by any of the branch's prepare files. To fix: remove them from the branch's catalog."
+        )
+        for catalog_file in catalog_files_not_generated_by_any_prepare_file:
+            logger.critical(f"{catalog_file}")
+
+    catalog_files_generated_by_two_or_more_prepare_files = [
+        catalog_file
+        for catalog_file in catalog_files_generated_thus_far
+        if len(catalog_files_generated_thus_far[catalog_file]) > 1
+    ]
+    if catalog_files_generated_by_two_or_more_prepare_files:
+        logger.critical(
+            f"Each of the following {len(catalog_files_generated_by_two_or_more_prepare_files)} catalog files were generated by two or more prepare files. To fix: remove repeating 'add_to_catalog'-s from branch's prepare files."
+        )
+        for catalog_file in catalog_files_generated_by_two_or_more_prepare_files:
+            logger.critical(
+                f"{catalog_file} is generated by: {catalog_files_generated_thus_far[catalog_file]}"
+            )
+
+    # finally, restore branch's catalog, including modification times
+    shutil.rmtree(catalog_dir, ignore_errors=True)
+    shutil.copytree(catalog_back_dir, catalog_dir)
+    shutil.rmtree(catalog_back_dir, ignore_errors=True)
+
+    if failing_prepare_files:
+        raise RuntimeError(
+            "Checking consistency of branch's catalog against the total production of the branch's prepare files, we run each prepare file in turn, given the branch's catalog (which is needed as input by many of the prepare files). Some of the prepare files failed running. See details in the logs."
+        )
+
+    if (
+        catalog_files_not_generated_by_any_prepare_file
+        or prepare_files_generating_entries_not_in_the_catalog
+        or prepare_files_generating_entries_of_different_content_from_what_is_in_the_catalog
+        or catalog_files_generated_by_two_or_more_prepare_files
+    ):
+        raise RuntimeError(
+            "Branch's catalog is different from the total production of branch's prepare files. See details in the logs."
+        )
+
+    logger.critical(
+        "Done. Catalog is consistent with the total production of the prepare files."
+    )
+
+
+if __name__ == "__main__":
+    main()
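The core of the script above is the timestamp-and-compare loop: any catalog JSON whose modification time moved past the previous border was written by the prepare file that just ran, and filecmp then decides whether it matches the backed-up copy. A stripped-down sketch of just that detection step, with hypothetical directory constants (the real script takes them from unitxt's constants):

import filecmp
import glob
import os

# Hypothetical locations, for illustration only.
CATALOG_DIR = "src/unitxt/catalog"
CATALOG_BACK_DIR = "src/unitxt/catalog_back"


def classify_files_written_after(border_time):
    # Split catalog files touched after border_time into entries that are new
    # (no counterpart in the backup) and entries whose content changed.
    new_entries, changed_entries = [], []
    for path in glob.glob(f"{CATALOG_DIR}/**/*.json", recursive=True):
        if os.path.getmtime(path) <= border_time:
            continue  # untouched since the border, so written by an earlier step
        backup = path.replace(CATALOG_DIR, CATALOG_BACK_DIR)
        if not os.path.exists(backup):
            new_entries.append(path)  # generated entry missing from the committed catalog
        elif not filecmp.cmp(path, backup, shallow=False):
            changed_entries.append(path)  # generated entry differs from the committed one
    return new_entries, changed_entries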
