diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ced5d5021ca9..c2e3679c7858 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -20,7 +20,7 @@ repos: rev: 25.1.0 hooks: - id: black - files: ^(misc/codegen/.*|misc/scripts/models-as-data/bulk_generate_mad)\.py$ + files: ^(misc/codegen/.*|misc/scripts/models-as-data/.*)\.py$ - repo: local hooks: diff --git a/misc/scripts/models-as-data/bulk_generate_mad.py b/misc/scripts/models-as-data/bulk_generate_mad.py index 91286c52526a..c9218048f1c4 100755 --- a/misc/scripts/models-as-data/bulk_generate_mad.py +++ b/misc/scripts/models-as-data/bulk_generate_mad.py @@ -236,13 +236,12 @@ def generate_models(config, args, project: Project, database_dir: str) -> None: language = config["language"] generator = mad.Generator(language) - # Note: The argument parser converts with-sinks to with_sinks, etc. generator.generateSinks = should_generate_sinks(project) generator.generateSources = should_generate_sources(project) generator.generateSummaries = should_generate_summaries(project) - generator.setenvironment(database=database_dir, folder=name) generator.threads = args.codeql_threads generator.ram = args.codeql_ram + generator.setenvironment(database=database_dir, folder=name) generator.run() @@ -348,7 +347,7 @@ def download_dca_databases( """ print("\n=== Finding projects ===") project_map = {project["name"]: project for project in projects} - analyzed_databases = {} + analyzed_databases = {n: None for n in project_map} for experiment_name in experiment_names: response = get_json_from_github( f"https://raw.githubusercontent.com/github/codeql-dca-main/data/{experiment_name}/reports/downloads.json", @@ -361,17 +360,24 @@ def download_dca_databases( artifact_name = analyzed_database["artifact_name"] pretty_name = pretty_name_from_artifact_name(artifact_name) - if not pretty_name in project_map: + if not pretty_name in analyzed_databases: print(f"Skipping {pretty_name} as it is not in the list of projects") continue - if pretty_name in analyzed_databases: + if analyzed_databases[pretty_name] is not None: print( f"Skipping previous database {analyzed_databases[pretty_name]['artifact_name']} for {pretty_name}" ) analyzed_databases[pretty_name] = analyzed_database + not_found = [name for name, db in analyzed_databases.items() if db is None] + if not_found: + print( + f"ERROR: The following projects were not found in the DCA experiments: {', '.join(not_found)}" + ) + sys.exit(1) + def download_and_decompress(analyzed_database: dict) -> str: artifact_name = analyzed_database["artifact_name"] repository = analyzed_database["repository"] @@ -525,7 +531,7 @@ def main(config, args) -> None: parser.add_argument( "--codeql-ram", type=int, - help="What `--ram` value to pass to `codeql` while generating models (by default the flag is not passed)", + help="What `--ram` value to pass to `codeql` while generating models (by default 2048 MB per thread)", default=None, ) parser.add_argument( diff --git a/misc/scripts/models-as-data/convert_extensions.py b/misc/scripts/models-as-data/convert_extensions.py index 28a7b7349bc0..01a10dae5ffd 100644 --- a/misc/scripts/models-as-data/convert_extensions.py +++ b/misc/scripts/models-as-data/convert_extensions.py @@ -7,65 +7,86 @@ import sys import tempfile + def quote_if_needed(v): # string columns if type(v) is str: - return "\"" + v + "\"" + return '"' + v + '"' # bool column return str(v) + def parseData(data): - rows = [{ }, { }] + rows = [{}, {}] for row in data: d = map(quote_if_needed, row) provenance = row[-1] targetRows = rows[1] if provenance.endswith("generated") else rows[0] - helpers.insert_update(targetRows, row[0], " - [" + ', '.join(d) + ']\n') + helpers.insert_update(targetRows, row[0], " - [" + ", ".join(d) + "]\n") return rows + class Converter: def __init__(self, language, dbDir): self.language = language self.dbDir = dbDir - self.codeQlRoot = subprocess.check_output(["git", "rev-parse", "--show-toplevel"]).decode("utf-8").strip() + self.codeQlRoot = ( + subprocess.check_output(["git", "rev-parse", "--show-toplevel"]) + .decode("utf-8") + .strip() + ) self.extDir = os.path.join(self.codeQlRoot, f"{self.language}/ql/lib/ext/") self.dirname = "modelconverter" self.modelFileExtension = ".model.yml" self.workDir = tempfile.mkdtemp() - def runQuery(self, query): - print('########## Querying: ', query) - queryFile = os.path.join(self.codeQlRoot, f"{self.language}/ql/src/utils/{self.dirname}", query) + print("########## Querying: ", query) + queryFile = os.path.join( + self.codeQlRoot, f"{self.language}/ql/src/utils/{self.dirname}", query + ) resultBqrs = os.path.join(self.workDir, "out.bqrs") - helpers.run_cmd(['codeql', 'query', 'run', queryFile, '--database', self.dbDir, '--output', resultBqrs], "Failed to generate " + query) + helpers.run_cmd( + [ + "codeql", + "query", + "run", + queryFile, + "--database", + self.dbDir, + "--output", + resultBqrs, + ], + "Failed to generate " + query, + ) return helpers.readData(self.workDir, resultBqrs) - def asAddsTo(self, rows, predicate): - extensions = [{ }, { }] + extensions = [{}, {}] for i in range(2): for key in rows[i]: - extensions[i][key] = helpers.addsToTemplate.format(f"codeql/{self.language}-all", predicate, rows[i][key]) - - return extensions + extensions[i][key] = helpers.addsToTemplate.format( + f"codeql/{self.language}-all", predicate, rows[i][key] + ) + return extensions def getAddsTo(self, query, predicate): data = self.runQuery(query) rows = parseData(data) return self.asAddsTo(rows, predicate) - def makeContent(self): summaries = self.getAddsTo("ExtractSummaries.ql", helpers.summaryModelPredicate) sources = self.getAddsTo("ExtractSources.ql", helpers.sourceModelPredicate) sinks = self.getAddsTo("ExtractSinks.ql", helpers.sinkModelPredicate) neutrals = self.getAddsTo("ExtractNeutrals.ql", helpers.neutralModelPredicate) - return [helpers.merge(sources[0], sinks[0], summaries[0], neutrals[0]), helpers.merge(sources[1], sinks[1], summaries[1], neutrals[1])] - + return [ + helpers.merge(sources[0], sinks[0], summaries[0], neutrals[0]), + helpers.merge(sources[1], sinks[1], summaries[1], neutrals[1]), + ] def save(self, extensions): # Create directory if it doesn't exist @@ -77,9 +98,11 @@ def save(self, extensions): for entry in extensions[0]: with open(self.extDir + "/" + entry + self.modelFileExtension, "w") as f: f.write(extensionTemplate.format(extensions[0][entry])) - + for entry in extensions[1]: - with open(self.extDir + "/generated/" + entry + self.modelFileExtension, "w") as f: + with open( + self.extDir + "/generated/" + entry + self.modelFileExtension, "w" + ) as f: f.write(extensionTemplate.format(extensions[1][entry])) def run(self): diff --git a/misc/scripts/models-as-data/generate_mad.py b/misc/scripts/models-as-data/generate_mad.py index 818721ed43b6..2f2b74cf3f7d 100755 --- a/misc/scripts/models-as-data/generate_mad.py +++ b/misc/scripts/models-as-data/generate_mad.py @@ -7,153 +7,187 @@ import sys import tempfile import re +import argparse + def quote_if_needed(row): if row != "true" and row != "false": - return "\"" + row + "\"" + return '"' + row + '"' # subtypes column return row[0].upper() + row[1:] + def parseData(data): - rows = { } + rows = {} for row in data: - d = row[0].split(';') + d = row[0].split(";") namespace = d[0] d = map(quote_if_needed, d) - helpers.insert_update(rows, namespace, " - [" + ', '.join(d) + ']\n') + helpers.insert_update(rows, namespace, " - [" + ", ".join(d) + "]\n") return rows -def printHelp(): - print(f"""Usage: -python3 generate_mad.py [DIR] --language LANGUAGE [--with-sinks] [--with-sources] [--with-summaries] [--with-neutrals] [--with-typebased-summaries] [--dry-run] - +description = """\ This generates summary, source, sink and neutral models for the code in the database. -The files will be placed in `LANGUAGE/ql/lib/ext/generated/DIR` - -Which models are generated is controlled by the flags: - --with-sinks - --with-sources - --with-summaries - --with-neutrals - --with-typebased-summaries (Experimental) -If none of these flags are specified, all models are generated except for the type based models. - - --dry-run: Only run the queries, but don't write to file. +The files will be placed in `LANGUAGE/ql/lib/ext/generated/DIR`""" +epilog = """\ Example invocations: $ python3 generate_mad.py /tmp/dbs/my_library_db $ python3 generate_mad.py /tmp/dbs/my_library_db --with-sinks $ python3 generate_mad.py /tmp/dbs/my_library_db --with-sinks my_directory +Requirements: `codeql` should appear on your path.""" -Requirements: `codeql` should appear on your path. - """) class Generator: - def __init__(self, language): + generateSinks = False + generateSources = False + generateSummaries = False + generateNeutrals = False + generateTypeBasedSummaries = False + dryRun = False + dirname = "modelgenerator" + ram = None + threads = 0 + folder = "" + + def __init__(self, language=None): self.language = language - self.generateSinks = False - self.generateSources = False - self.generateSummaries = False - self.generateNeutrals = False - self.generateTypeBasedSummaries = False - self.dryRun = False - self.dirname = "modelgenerator" - self.ram = 2**15 - self.threads = 8 - - - def setenvironment(self, database, folder): - self.codeQlRoot = subprocess.check_output(["git", "rev-parse", "--show-toplevel"]).decode("utf-8").strip() - self.database = database + + def setenvironment(self, database=None, folder=None): + self.codeQlRoot = ( + subprocess.check_output(["git", "rev-parse", "--show-toplevel"]) + .decode("utf-8") + .strip() + ) + self.database = database or self.database + self.folder = folder or self.folder self.generatedFrameworks = os.path.join( - self.codeQlRoot, f"{self.language}/ql/lib/ext/generated/{folder}") + self.codeQlRoot, f"{self.language}/ql/lib/ext/generated/{self.folder}" + ) self.workDir = tempfile.mkdtemp() + if self.ram is None: + threads = self.threads if self.threads > 0 else os.cpu_count() + self.ram = 2048 * threads os.makedirs(self.generatedFrameworks, exist_ok=True) - @staticmethod def make(): - # Create a generator instance based on command line arguments. - if any(s == "--help" for s in sys.argv): - printHelp() - sys.exit(0) - - if "--language" in sys.argv: - language = sys.argv[sys.argv.index("--language") + 1] - sys.argv.remove("--language") - sys.argv.remove(language) - else: - printHelp() - sys.exit(0) - - generator = Generator(language=language) - - if "--with-sinks" in sys.argv: - sys.argv.remove("--with-sinks") + p = argparse.ArgumentParser( + description=description, + formatter_class=argparse.RawTextHelpFormatter, + epilog=epilog, + ) + p.add_argument("database", help="Path to the CodeQL database") + p.add_argument( + "folder", + nargs="?", + default="", + help="Optional folder to place the generated files in", + ) + p.add_argument( + "--language", + required=True, + help="The language for which to generate models", + ) + p.add_argument( + "--with-sinks", + action="store_true", + help="Generate sink models", + dest="generateSinks", + ) + p.add_argument( + "--with-sources", + action="store_true", + help="Generate source models", + dest="generateSources", + ) + p.add_argument( + "--with-summaries", + action="store_true", + help="Generate summary models", + dest="generateSummaries", + ) + p.add_argument( + "--with-neutrals", + action="store_true", + help="Generate neutral models", + dest="generateNeutrals", + ) + p.add_argument( + "--with-typebased-summaries", + action="store_true", + help="Generate type-based summary models (experimental)", + dest="generateTypeBasedSummaries", + ) + p.add_argument( + "--dry-run", + action="store_true", + help="Do not write the generated files, just print them to stdout", + dest="dryRun", + ) + p.add_argument( + "--threads", + type=int, + default=Generator.threads, + help="Number of threads to use for CodeQL queries (default %(default)s). `0` means use all available threads.", + ) + p.add_argument( + "--ram", + type=int, + help="Amount of RAM to use for CodeQL queries in MB. Default is to use 2048 MB per thread.", + ) + generator = p.parse_args(namespace=Generator()) + + if ( + not generator.generateSinks + and not generator.generateSources + and not generator.generateSummaries + and not generator.generateNeutrals + and not generator.generateTypeBasedSummaries + ): generator.generateSinks = True - - if "--with-sources" in sys.argv: - sys.argv.remove("--with-sources") generator.generateSources = True - - if "--with-summaries" in sys.argv: - sys.argv.remove("--with-summaries") generator.generateSummaries = True - - if "--with-neutrals" in sys.argv: - sys.argv.remove("--with-neutrals") generator.generateNeutrals = True - if "--with-typebased-summaries" in sys.argv: - sys.argv.remove("--with-typebased-summaries") - generator.generateTypeBasedSummaries = True - - if "--dry-run" in sys.argv: - sys.argv.remove("--dry-run") - generator.dryRun = True - - if (not generator.generateSinks and - not generator.generateSources and - not generator.generateSummaries and - not generator.generateNeutrals and - not generator.generateTypeBasedSummaries): - generator.generateSinks = generator.generateSources = generator.generateSummaries = generator.generateNeutrals = True - - n = len(sys.argv) - if n < 2: - printHelp() - sys.exit(1) - elif n == 2: - generator.setenvironment(sys.argv[1], "") - else: - generator.setenvironment(sys.argv[1], sys.argv[2]) - + generator.setenvironment() return generator - def runQuery(self, query): print("########## Querying " + query + "...") - queryFile = os.path.join(self.codeQlRoot, f"{self.language}/ql/src/utils/{self.dirname}", query) + queryFile = os.path.join( + self.codeQlRoot, f"{self.language}/ql/src/utils/{self.dirname}", query + ) resultBqrs = os.path.join(self.workDir, "out.bqrs") - cmd = ['codeql', 'query', 'run', queryFile, '--database', self.database, '--output', resultBqrs] - if self.threads is not None: - cmd += ["--threads", str(self.threads)] - if self.ram is not None: - cmd += ["--ram", str(self.ram)] + cmd = [ + "codeql", + "query", + "run", + queryFile, + "--database", + self.database, + "--output", + resultBqrs, + "--threads", + str(self.threads), + "--ram", + str(self.ram), + ] helpers.run_cmd(cmd, "Failed to generate " + query) return helpers.readData(self.workDir, resultBqrs) - def asAddsTo(self, rows, predicate): - extensions = { } + extensions = {} for key in rows: - extensions[key] = helpers.addsToTemplate.format(f"codeql/{self.language}-all", predicate, rows[key]) + extensions[key] = helpers.addsToTemplate.format( + f"codeql/{self.language}-all", predicate, rows[key] + ) return extensions def getAddsTo(self, query, predicate): @@ -164,27 +198,37 @@ def getAddsTo(self, query, predicate): def makeContent(self): summaryAddsTo = {} if self.generateSummaries: - summaryAddsTo = self.getAddsTo("CaptureSummaryModels.ql", helpers.summaryModelPredicate) + summaryAddsTo = self.getAddsTo( + "CaptureSummaryModels.ql", helpers.summaryModelPredicate + ) sinkAddsTo = {} if self.generateSinks: - sinkAddsTo = self.getAddsTo("CaptureSinkModels.ql", helpers.sinkModelPredicate) + sinkAddsTo = self.getAddsTo( + "CaptureSinkModels.ql", helpers.sinkModelPredicate + ) sourceAddsTo = {} if self.generateSources: - sourceAddsTo = self.getAddsTo("CaptureSourceModels.ql", helpers.sourceModelPredicate) + sourceAddsTo = self.getAddsTo( + "CaptureSourceModels.ql", helpers.sourceModelPredicate + ) neutralAddsTo = {} if self.generateNeutrals: - neutralAddsTo = self.getAddsTo("CaptureNeutralModels.ql", helpers.neutralModelPredicate) + neutralAddsTo = self.getAddsTo( + "CaptureNeutralModels.ql", helpers.neutralModelPredicate + ) return helpers.merge(summaryAddsTo, sinkAddsTo, sourceAddsTo, neutralAddsTo) def makeTypeBasedContent(self): if self.generateTypeBasedSummaries: - typeBasedSummaryAddsTo = self.getAddsTo("CaptureTypeBasedSummaryModels.ql", helpers.summaryModelPredicate) + typeBasedSummaryAddsTo = self.getAddsTo( + "CaptureTypeBasedSummaryModels.ql", helpers.summaryModelPredicate + ) else: - typeBasedSummaryAddsTo = { } + typeBasedSummaryAddsTo = {} return typeBasedSummaryAddsTo @@ -195,13 +239,14 @@ def save(self, extensions, extension): {0}""" for entry in extensions: # Replace problematic characters with dashes, and collapse multiple dashes. - sanitizedEntry = re.sub(r'-+', '-', entry.replace('/', '-').replace(':', '-')) + sanitizedEntry = re.sub( + r"-+", "-", entry.replace("/", "-").replace(":", "-") + ) target = os.path.join(self.generatedFrameworks, sanitizedEntry + extension) with open(target, "w") as f: f.write(extensionTemplate.format(extensions[entry])) print("Models as data extensions written to " + target) - def run(self): content = self.makeContent() typeBasedContent = self.makeTypeBasedContent() @@ -210,14 +255,17 @@ def run(self): print("Models as data extensions generated, but not written to file.") sys.exit(0) - if (self.generateSinks or - self.generateSources or - self.generateSummaries or - self.generateNeutrals): + if ( + self.generateSinks + or self.generateSources + or self.generateSummaries + or self.generateNeutrals + ): self.save(content, ".model.yml") if self.generateTypeBasedSummaries: self.save(typeBasedContent, ".typebased.model.yml") -if __name__ == '__main__': + +if __name__ == "__main__": Generator.make().run() diff --git a/misc/scripts/models-as-data/helpers.py b/misc/scripts/models-as-data/helpers.py index 49cccb35cb62..f165caf62dcc 100644 --- a/misc/scripts/models-as-data/helpers.py +++ b/misc/scripts/models-as-data/helpers.py @@ -14,37 +14,53 @@ data: {2}""" + def remove_dir(dirName): if os.path.isdir(dirName): shutil.rmtree(dirName) print("Removed directory:", dirName) + def run_cmd(cmd, msg="Failed to run command"): - print('Running ' + ' '.join(cmd)) + print("Running " + " ".join(cmd)) if subprocess.check_call(cmd): print(msg) exit(1) + def readData(workDir, bqrsFile): generatedJson = os.path.join(workDir, "out.json") - print('Decoding BQRS to JSON.') - run_cmd(['codeql', 'bqrs', 'decode', bqrsFile, '--output', generatedJson, '--format=json'], "Failed to decode BQRS.") + print("Decoding BQRS to JSON.") + run_cmd( + [ + "codeql", + "bqrs", + "decode", + bqrsFile, + "--output", + generatedJson, + "--format=json", + ], + "Failed to decode BQRS.", + ) with open(generatedJson) as f: results = json.load(f) try: - return results['#select']['tuples'] + return results["#select"]["tuples"] except KeyError: - print('Unexpected JSON output - no tuples found') + print("Unexpected JSON output - no tuples found") exit(1) + def insert_update(rows, key, value): if key in rows: rows[key] += value else: rows[key] = value + def merge(*dicts): merged = {} for d in dicts: