|
| 1 | +import filecmp |
| 2 | +import glob |
| 3 | +import importlib.util |
| 4 | +import os |
| 5 | +import shutil |
| 6 | +from collections import defaultdict |
| 7 | +from pathlib import Path |
| 8 | + |
| 9 | +from unitxt import get_logger |
| 10 | +from unitxt.settings_utils import get_constants, get_settings |
| 11 | + |
# Shared logger used for all progress and error reporting in this script.
logger = get_logger()
# Project constants; main() reads catalog_dir from here.
constants = get_constants()
# Runtime settings; allow_unverified_code is logged before each prepare file is executed.
settings = get_settings()
| 15 | + |
| 16 | + |
def import_module_from_file(file_path):
    """Execute the Python source at ``file_path`` as a module and return it.

    The module is named after the file (base name without its extension).
    It is not registered in ``sys.modules``.

    Args:
        file_path: path of the source file to execute.

    Returns:
        The module object, after its top-level code has run.

    Raises:
        ImportError: if no import spec (or loader) can be derived for
            ``file_path``, e.g. because of an unrecognised file extension.
        Exception: whatever the executed file itself raises.
    """
    # Module name is the file name without its extension.
    module_name = os.path.splitext(os.path.basename(file_path))[0]
    spec = importlib.util.spec_from_file_location(module_name, file_path)
    # spec_from_file_location returns None when no loader matches the path;
    # fail with a clear message instead of an AttributeError further down.
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot create an import spec for '{file_path}'")
    module = importlib.util.module_from_spec(spec)
    logger.info(
        f"allow unverified code in {file_path} : {settings.allow_unverified_code}"
    )
    # Runs the file's top-level code inside the freshly created module.
    spec.loader.exec_module(module)
    return module
| 30 | + |
| 31 | + |
| 32 | +# flake8: noqa: C901 |
def _catalog_json_files(catalog_dir):
    """Return every ``*.json`` file found, at any depth, under ``catalog_dir``."""
    return glob.glob(f"{catalog_dir}/**/*.json", recursive=True)


def _backup_counterpart(catalog_file, catalog_dir, catalog_back_dir):
    """Map a file under ``catalog_dir`` to the same relative path in the backup."""
    # count=1: glob results start with catalog_dir, so only that leading prefix
    # must be swapped; later occurrences of the same text belong to the path.
    return catalog_file.replace(catalog_dir, catalog_back_dir, 1)


def _reconcile_catalog_with_backup(
    prepare_file,
    border_time,
    catalog_dir,
    catalog_back_dir,
    generators,
    not_in_catalog,
    different_content,
):
    """Attribute catalog files newer than ``border_time`` to ``prepare_file``.

    Every such file is compared with the backup catalog and the live catalog is
    put back to its original state (new files removed, changed files restored).

    Args:
        prepare_file: the prepare file that has just been executed.
        border_time: files modified after this time are considered its output.
        catalog_dir: the live catalog directory.
        catalog_back_dir: the untouched backup of the catalog.
        generators: mapping catalog file -> list of prepare files; updated in place.
        not_in_catalog: list extended with ``prepare_file`` once per generated
            file that has no counterpart in the backup.
        different_content: list extended with ``prepare_file`` once per generated
            file whose content differs from the backup.

    Returns:
        Modification times of all files touched here (generated or restored).
    """
    new_times = []
    for catalog_file in _catalog_json_files(catalog_dir):
        modified_at = os.path.getmtime(catalog_file)
        if modified_at <= border_time:
            continue  # not written by prepare_file
        new_times.append(modified_at)
        generators[catalog_file].append(prepare_file)
        backup_file = _backup_counterpart(catalog_file, catalog_dir, catalog_back_dir)
        if not os.path.exists(backup_file):
            # prepare_file generates a catalog file that is not a member of
            # the branch's original catalog; remove it to restore that state.
            not_in_catalog.append(prepare_file)
            os.remove(catalog_file)
        elif not filecmp.cmp(catalog_file, backup_file, shallow=False):
            # prepare_file generates a catalog file that differs from the
            # branch's catalog file of the same name; restore it from backup.
            different_content.append(prepare_file)
            shutil.copy(backup_file, catalog_file)
            # modification time of catalog_file is now the time of copying
            new_times.append(os.path.getmtime(catalog_file))
    return new_times


def _log_critical_lines(header, lines):
    """Log ``header`` and then each of ``lines`` at CRITICAL level."""
    logger.critical(header)
    for line in lines:
        logger.critical(f"{line}")


def main():
    """Check that the catalog equals the total output of the prepare files.

    The catalog is backed up, every prepare file is executed in turn against
    the live catalog, and files written by each one (detected through their
    modification times) are compared with the backup. Discrepancies are logged.
    The catalog is always restored from the backup before returning or raising.

    Raises:
        RuntimeError: if any prepare file failed to run, or if the catalog
            differs from what the prepare files produce.
    """
    catalog_dir = constants.catalog_dir
    catalog_back_dir = catalog_dir + "_back"

    os.environ["UNITXT_USE_ONLY_LOCAL_CATALOGS"] = "True"
    os.environ["UNITXT_TEST_CARD_DISABLE"] = "True"
    os.environ["UNITXT_TEST_METRIC_DISABLE"] = "True"
    os.environ["UNITXT_ALLOW_UNVERIFIED_CODE"] = "True"
    os.environ["UNITXT_SKIP_ARTIFACTS_PREPARE_AND_VERIFY"] = "True"
    logger.info("*" * 100)
    logger.info("*" * 100)
    logger.info(
        "Copying all files from 'src/unitxt/catalog' to a backup 'src/unitxt/catalog_back'"
    )
    shutil.rmtree(catalog_back_dir, ignore_errors=True)
    shutil.copytree(catalog_dir, catalog_back_dir)

    failing_prepare_files = []
    not_in_catalog = []
    different_content = []
    not_generated = []
    generated_by_many = []
    # From here on the live catalog gets modified; the finally clause guarantees
    # it is restored even when something unexpected goes wrong.
    try:
        logger.critical("Starting to reprepare the catalog...")
        prepare_dir = os.path.join(Path(catalog_dir).parent.parent.parent, "prepare")
        prepare_files = sorted(glob.glob(f"{prepare_dir}/**/*.py", recursive=True))
        generators = defaultdict(list)  # from catalog_file to list of its generators

        # initial_time is the most recent modification time of the catalog
        # directory or of any catalog file, before any prepare file has run.
        initial_time = max(
            os.path.getmtime(path)
            for path in [catalog_dir, *_catalog_json_files(catalog_dir)]
        )
        next_border_time = initial_time
        for i, prepare_file in enumerate(prepare_files):
            logger.info("*" * 100)
            logger.info(f"* {i}/{len(prepare_files)}: {prepare_file}")
            logger.info("*")
            border_time = next_border_time
            try:
                try:
                    import_module_from_file(prepare_file)
                finally:
                    # Reconcile even when the prepare file raised part-way:
                    # otherwise the files it already wrote would be blamed on
                    # the next prepare file and stay in the catalog meanwhile.
                    new_times = _reconcile_catalog_with_backup(
                        prepare_file,
                        border_time,
                        catalog_dir,
                        catalog_back_dir,
                        generators,
                        not_in_catalog,
                        different_content,
                    )
                    # a prepare file may generate nothing (e.g. when its body
                    # is commented out); the border then stays where it was
                    if new_times:
                        next_border_time = max(new_times)
            except Exception as e:
                logger.info(f"Failed to run prepare file: {prepare_file}")
                failing_prepare_files.append((prepare_file, e))

        # report errors discovered thus far
        if failing_prepare_files:
            _log_critical_lines(
                f"Execution of the following {len(failing_prepare_files)} prepare files failed for the following respective causes:",
                [
                    f"prepare file: '{prepare_file}' failed, throwing exception: '{e}'"
                    for prepare_file, e in failing_prepare_files
                ],
            )

        if not_in_catalog:
            not_in_catalog = sorted(set(not_in_catalog))
            _log_critical_lines(
                f"The following {len(not_in_catalog)} prepare files generated catalog files that are not included in the catalog. To fix: add the products of these prepare files to the catalog.",
                not_in_catalog,
            )

        if different_content:
            different_content = sorted(set(different_content))
            _log_critical_lines(
                f"The following {len(different_content)} prepare files generated catalog files of different contents from what is included in the (original branch's) catalog. To fix: update the branch's catalog files by the products of these prepare files.",
                different_content,
            )

        # see if the branch's catalog contains any file that none of the
        # branch's prepare files generates: such a file was never touched, so
        # its modification time is not later than initial_time.
        not_generated = [
            catalog_file
            for catalog_file in _catalog_json_files(catalog_dir)
            if os.path.getmtime(catalog_file) <= initial_time
        ]
        if not_generated:
            _log_critical_lines(
                f"The following {len(not_generated)} branch's catalog files are not generated by any of the branch's prepare files. To fix: remove them from the branch's catalog.",
                not_generated,
            )

        generated_by_many = [
            catalog_file
            for catalog_file, its_generators in generators.items()
            if len(its_generators) > 1
        ]
        if generated_by_many:
            _log_critical_lines(
                f"Each of the following {len(generated_by_many)} catalog files were generated by two or more prepare files. To fix: remove repeating 'add_to_catalog'-s from branch's prepare files.",
                [
                    f"{catalog_file} is generated by: {generators[catalog_file]}"
                    for catalog_file in generated_by_many
                ],
            )
    finally:
        # finally, restore branch's catalog, including modification times
        shutil.rmtree(catalog_dir, ignore_errors=True)
        shutil.copytree(catalog_back_dir, catalog_dir)
        shutil.rmtree(catalog_back_dir, ignore_errors=True)

    if failing_prepare_files:
        raise RuntimeError(
            "Checking consistency of branch's catalog against the total production of the branch's prepare files, we run each prepare file in turn, given the branch's catalog (which is needed as input by many of the prepare files). Some of the prepare files failed running. See details in the logs."
        )

    if not_generated or not_in_catalog or different_content or generated_by_many:
        raise RuntimeError(
            "Branch's catalog is different from the total production of branch's prepare files. See details in the logs."
        )

    logger.critical(
        "Done. Catalog is consistent with the total production of the prepare files."
    )
| 205 | + |
| 206 | + |
# Run the catalog consistency check only when this file is executed as a script.
if __name__ == "__main__":
    main()
0 commit comments