diff --git a/README.rst b/README.rst index fab7cb30..b8fba953 100644 --- a/README.rst +++ b/README.rst @@ -118,6 +118,14 @@ Expressions into separate steps in the manner of cwl-expression-refactor. cwl-normalizer directory/path/to/save/outputs path_to_my_workflow.cwl [more_workflows.cwl] +Generate for Workflow Parameters from a CWL document +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +``cwl-inputs-schema-gen`` generates a JSON Schema for workflow input parameters from a CWL document. + +.. code:: bash + + cwl-inputs-schema-gen path_to_my_workflow.cwl Using the CWL Parsers ~~~~~~~~~~~~~~~~~~~~~ diff --git a/cwl_utils/inputs_schema_gen.py b/cwl_utils/inputs_schema_gen.py new file mode 100644 index 00000000..16b87694 --- /dev/null +++ b/cwl_utils/inputs_schema_gen.py @@ -0,0 +1,621 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: Apache-2.0 +# Copyright 2024 Hirotaka Suetake +# Copyright 2024 Alexis Lucattini + +"""Generate JSON Schema from CWL inputs object.""" +import argparse +import logging +import sys +import json +from copy import deepcopy +from pathlib import Path +from typing import Any, List, Union, Dict, Optional +from urllib.parse import urlparse +import requests + +# Get typeguard from extensions if we're running in python3.8 +if sys.version_info[:2] < (3, 10): + from typing_extensions import TypeGuard # Not in 3.8 typing module +else: + from typing import TypeGuard + +from cwl_utils.loghandler import _logger as _cwlutilslogger +from cwl_utils.parser import ( + load_document_by_uri, + InputArraySchemaTypes, + InputEnumSchemaTypes, + InputRecordSchemaTypes, + File, + Directory, + WorkflowInputParameter, + InputRecordSchema, + InputEnumSchema, + InputArraySchema, + Workflow, + CommandLineTool, +) +from cwl_utils.utils import ( + sanitise_schema_field, + is_uri, + to_pascal_case, + get_value_from_uri, + is_local_uri, +) + +_logger = logging.getLogger("cwl-inputs-schema-gen") # pylint: disable=invalid-name +defaultStreamHandler = 
logging.StreamHandler()  # pylint: disable=invalid-name
_logger.addHandler(defaultStreamHandler)
_logger.setLevel(logging.INFO)
_cwlutilslogger.setLevel(100)

# Globals

# Maps CWL primitive type names onto their JSON Schema equivalents.
PRIMITIVE_TYPES_MAPPING = {
    "boolean": "boolean",
    "string": "string",
    "int": "integer",
    "float": "number",
    "long": "number",
    "double": "number",
    "null": "null",
}

# Bundled JSON Schema skeleton every generated schema starts from.
JSON_TEMPLATE_PATH = (
    Path(__file__)
    .parent.joinpath("./templates/workflow_input_json_schema_template.json")
    .absolute()
    .resolve()
)

# Type hint covering every CWL input type we know how to translate.
InputType = Union[
    InputArraySchema, InputEnumSchema, InputRecordSchema, str, File, Directory
]


class JSONSchemaProperty:
    """Generate a JSON schema property from a CWL input parameter."""

    def __init__(
        self,
        name: str,
        type_: Union[InputType, List[InputType], str, Any],
        description: Optional[str] = "",
        required: Optional[bool] = False,
    ):
        """
        Initialise the JSONSchemaProperty object.

        :param name: The input's name (already stripped of its URI prefix).
        :param type_: The CWL type, or list of types, of the input.
        :param description: Human-readable description (the CWL ``doc`` text).
        :param required: Whether the input must be supplied by the user.
        """
        self.name: str = name
        self.type_: Union[InputType, List[InputType], str, Any] = type_
        self.description = description
        self.required = required
        # Resolve the JSON Schema fragment eagerly so that ``self.required``
        # is final once construction completes (a nullable union clears it).
        self.type_dict = self.generate_type_dict()

    def generate_type_dict(self) -> Dict[str, Any]:
        """Generate the type dict for a property from a CWL input parameter type."""
        # A type union containing "null" means the property is optional:
        # drop the null member and record that the input is not required.
        if isinstance(self.type_, List) and "null" in self.type_:
            self.required = False
            self.type_ = [
                type_item for type_item in self.type_ if type_item != "null"
            ]

            # Squeeze a single remaining member out of the list.
            if len(self.type_) == 1:
                self.type_ = self.type_[0]

        if isinstance(self.type_, List):
            # Multiple alternatives remain: express them with ``oneOf``.
            type_dict = self.generate_type_dict_from_type_list(self.type_)
        else:
            # A single concrete type.
            type_dict = self.generate_type_dict_from_type(self.type_)

        # Add in the description to the type dict
        type_dict.update({"description": self.description})

        return type_dict

    def generate_type_dict_from_type(self, type_item: Any) -> Dict[str, Any]:
        """
        Generate the type dict for a property from a CWL input parameter type.

        Called once per member of ``type_``; when there are multiple members,
        each resulting dict is collected into a ``oneOf`` list.
        """
        # Primitive types have a 1-1 mapping between a CWL Input Parameter
        # type and a JSON schema type.
        if isinstance(type_item, str):
            if type_item in PRIMITIVE_TYPES_MAPPING:
                return {"type": PRIMITIVE_TYPES_MAPPING[type_item]}
            if type_item == "stdin":
                # stdin is syntactic sugar for a File input.
                return {"$ref": "#/definitions/File"}
            if type_item in ("File", "Directory", "Any"):
                return {"$ref": f"#/definitions/{type_item}"}
            if is_uri(type_item):
                # URI reference to a named (record) schema defined elsewhere.
                return {
                    "$ref": f"#/definitions/{to_pascal_case(get_value_from_uri(type_item))}"
                }
            raise ValueError(f"Unknown type: {type_item}")
        if isinstance(type_item, InputArraySchemaTypes):
            return {
                "type": "array",
                "items": self.generate_type_dict_from_type(type_item.items),
            }
        if isinstance(type_item, InputEnumSchemaTypes):
            # Enum symbols are URIs; strip them down to their bare names.
            return {
                "type": "string",
                "enum": [get_value_from_uri(symbol) for symbol in type_item.symbols],
            }
        if isinstance(type_item, InputRecordSchemaTypes):
            if type_item.fields is None:
                return {"type": "object"}
            if not isinstance(type_item.fields, List):
                _cwlutilslogger.error(
                    "Expected fields of InputRecordSchemaType to be a list"
                )
                raise TypeError
            return {
                "type": "object",
                "properties": {
                    get_value_from_uri(prop.name): self.generate_type_dict_from_type(
                        prop.type_
                    )
                    for prop in type_item.fields
                },
            }
        if isinstance(type_item, Dict):
            # Nested import: {'$import': '../relative/path/to/schema'}
            if "$import" in type_item:
                return {
                    "$ref": f"#/definitions/{to_pascal_case(get_value_from_uri(type_item['$import']))}"
                }
            raise ValueError(f"Unknown type: {type_item}")
        if isinstance(type_item, List):
            # Nested union of types.
            return {
                "oneOf": [
                    self.generate_type_dict_from_type(member) for member in type_item
                ]
            }
        raise ValueError(f"Unknown type: {type_item}")

    def generate_type_dict_from_type_list(
        self, type_: List[InputType]
    ) -> Dict[str, Any]:
        """Given a list of types, generate a JSON schema property dict wrapped in oneOf list."""
        return {
            "oneOf": [
                self.generate_type_dict_from_type(type_item) for type_item in type_
            ]
        }

    def to_dict(self) -> Dict[str, Any]:
        """Return as a dictionary."""
        return {self.name: self.type_dict}


def get_is_required_from_input_parameter(
    input_parameter: WorkflowInputParameter,
) -> bool:
    """Given an input parameter, return if it is required."""
    # A trailing '?' is CWL shorthand for an optional (nullable) type.
    if isinstance(input_parameter.type_, str) and input_parameter.type_.endswith("?"):
        return False
    # Inputs with defaults never have to be supplied by the user.
    if input_parameter.default is not None:
        return False
    # A type union containing "null" is optional as well.
    if isinstance(input_parameter.type_, List) and "null" in input_parameter.type_:
        return False
    # Record schemas may carry the optional shorthand on their own type.
    # (The redundant ``is not None`` guard was dropped: ``isinstance``
    # already guarantees the value is not None.)
    if isinstance(input_parameter.type_, InputRecordSchemaTypes):
        if isinstance(input_parameter.type_.type_, str) and (
            input_parameter.type_.type_.endswith("?")
        ):
            return False
    return True


def generate_json_schema_property_from_input_parameter(
    input_parameter: WorkflowInputParameter,
) -> JSONSchemaProperty:
    """
    Given an input parameter, generate a JSON schema property.

    :param input_parameter: The workflow input to convert.
    :return: The corresponding JSONSchemaProperty.
    """
    # Get the input name and documentation for description
    input_name = get_value_from_uri(str(input_parameter.id))
    doc = input_parameter.doc
    required = get_is_required_from_input_parameter(input_parameter)

    return JSONSchemaProperty(
        name=input_name,
        type_=input_parameter.type_,
        description=doc if doc is not None else "",
        required=required,
    )
def generate_definition_from_schema(schema: InputRecordSchema) -> Dict[str, Any]:
    """
    Given a schema, generate a JSON schema definition.

    :param schema: An InputRecordSchema pulled from the CWL document.
    :return: A JSON Schema ``definitions`` entry keyed by the schema's
             PascalCase name, or {} when the schema declares no fields.
    """
    # Sanitise each field of the schema
    sanitised_fields: Dict[str, Any] = {}

    if schema.fields is None:
        return {}

    for field in schema.fields:
        sanitised_fields[get_value_from_uri(field.name)] = sanitise_schema_field(
            {"type": field.type_}
        )

    # Generate JSON properties
    property_list = []

    for prop_name, prop_obj in sanitised_fields.items():
        # Assume required until a "null" union member proves otherwise.
        required = True

        # If the property object is a string, then it's a reference to another schema
        if isinstance(prop_obj, str):
            raise TypeError("Property Object should be a dictionary")

        if isinstance(prop_obj.get("type", []), List):
            if "null" in prop_obj.get("type", []):
                required = False
            prop_obj["type"] = [
                type_item
                for type_item in prop_obj.get("type", [])
                if type_item != "null"
            ]

            # Squeeze single-member unions down to a scalar type.
            if len(prop_obj["type"]) == 1:
                prop_obj["type"] = prop_obj["type"][0]

        # Generate JSONSchema Property
        prop = JSONSchemaProperty(
            name=prop_name,
            type_=prop_obj.get("type"),
            description=prop_obj.get("doc", ""),
            required=required,
        )
        property_list.append(prop)

    return {
        to_pascal_case(get_value_from_uri(str(schema.name))): {
            "type": "object",
            "properties": {prop.name: prop.type_dict for prop in property_list},
            "required": [prop.name for prop in property_list if prop.required],
        }
    }


def cwl_to_jsonschema(cwl_obj: Union[Workflow, CommandLineTool]) -> Any:
    """
    Convert a loaded CWL document into a JSON Schema for its inputs.

    :param cwl_obj: A CWL Workflow or CommandLineTool object.
    :return: A JSON Schema (dict) describing the workflow input parameters.

    Example:
        cwl_obj = load_document_by_uri(uri)
        jsonschema = cwl_to_jsonschema(cwl_obj)
    """
    # Initialise the schema from the workflow input json schema template
    with open(JSON_TEMPLATE_PATH, "r") as template_h:
        input_json_schema = json.load(template_h)

    # Get the complex schema keys
    def is_complex_record_schema_key(idx_iter: str) -> bool:
        # FIX: was annotated ``-> TypeGuard[bool]``.  TypeGuard narrows the
        # *argument* type (here ``str``), so a plain ``bool`` is correct.
        if cwl_obj.loadingOptions.idx is None:
            return False

        if cwl_obj.loadingOptions.idx.get(idx_iter) is None:
            return False

        if not isinstance(cwl_obj.loadingOptions.idx.get(idx_iter), tuple):
            return False

        # Get index as a tuple
        input_schema_type, _ = cwl_obj.loadingOptions.idx.get(idx_iter, (None, None))

        return isinstance(input_schema_type, InputRecordSchemaTypes)

    complex_schema_keys: List[str] = [
        idx_iter
        for idx_iter in cwl_obj.loadingOptions.idx.keys()
        if is_complex_record_schema_key(idx_iter)
    ]

    # Complex schema values
    def get_complex_schema_values(idx_iter: str) -> InputRecordSchema:
        if not isinstance(cwl_obj.loadingOptions.idx.get(idx_iter), tuple):
            raise TypeError(f"Expected tuple from idx loading options key {idx_iter}")

        # Collect input record schema
        input_record_schema, _ = cwl_obj.loadingOptions.idx.get(idx_iter, (None, None))

        if not isinstance(input_record_schema, InputRecordSchemaTypes):
            raise TypeError(
                f"Expected InputRecordSchemaTypes from idx loading options key {idx_iter}"
            )

        return input_record_schema

    complex_schema_values: List[InputRecordSchema] = [
        get_complex_schema_values(idx_iter) for idx_iter in complex_schema_keys
    ]

    # Load in all $imports to be referred by complex input types
    workflow_schema_definitions_list = [
        generate_definition_from_schema(schema_iter)
        for schema_iter in complex_schema_values
    ]

    if cwl_obj.requirements is not None:
        try:
            schema_def_requirement = next(
                requirement_iter
                for requirement_iter in cwl_obj.requirements
                if requirement_iter.class_ == "SchemaDefRequirement"
            )

            workflow_schema_definitions_list.extend(
                generate_definition_from_schema(schema_def_iter)
                for schema_def_iter in schema_def_requirement.types
            )

        except StopIteration:
            # No SchemaDefRequirement present; nothing extra to add.
            pass

    # Convert schema definitions to dict
    workflow_schema_definitions_dict: Dict[str, Any] = {}
    for schema_definition in workflow_schema_definitions_list:
        workflow_schema_definitions_dict.update(schema_definition)

    # Generate JSON Schema Properties
    properties = [
        generate_json_schema_property_from_input_parameter(input_param_obj)
        for input_param_obj in cwl_obj.inputs
    ]

    # Generate JSON schema; optional inputs are wrapped so ``null`` validates.
    input_json_schema.update(
        {
            "type": "object",
            "properties": {
                prop.name: (
                    {"oneOf": [{"type": "null"}, prop.type_dict]}
                    if prop.required is False
                    else prop.type_dict
                )
                for prop in properties
            },
            "required": [prop.name for prop in properties if prop.required],
        }
    )

    # Update definitions from schema
    input_json_schema["definitions"].update(workflow_schema_definitions_dict)

    # Slim down the schema as required
    input_json_schema = slim_definitions(input_json_schema)

    return input_json_schema
complex_schema_values_iter + ), + complex_schema_values, + ) + ) + + if cwl_obj.requirements is not None: + try: + schema_def_requirement = next( + filter( + lambda requirement_iter: requirement_iter.class_ + == "SchemaDefRequirement", + cwl_obj.requirements, + ) + ) + + workflow_schema_definitions_list.extend( + list( + map( + lambda schema_def_iter: generate_definition_from_schema( + schema_def_iter + ), + schema_def_requirement.types, + ) + ) + ) + + except StopIteration: + pass + + # Convert schema definitions to dict + workflow_schema_definitions_dict = {} + for schema_definition in workflow_schema_definitions_list: + workflow_schema_definitions_dict.update(schema_definition) + + # Generate JSON Schema Properties + properties = list( + map( + lambda workflow_parameter_input_obj: generate_json_schema_property_from_input_parameter( + workflow_parameter_input_obj + ), + cwl_obj.inputs, + ) + ) + + # Generate JSON schema + input_json_schema.update( + { + "type": "object", + "properties": { + prop.name: ( + {"oneOf": [{"type": "null"}, prop.type_dict]} + if prop.required is False + else prop.type_dict + ) + for prop in properties + }, + "required": [prop.name for prop in properties if prop.required], + } + ) + + # Update definitions from schema + input_json_schema["definitions"].update(workflow_schema_definitions_dict) + + # Slim down the schema as required + input_json_schema = slim_definitions(input_json_schema) + + # Add "additionalProperties": false to top of schema + # input_json_schema["additionalProperties"] = False + + return input_json_schema + + +# Traverse the properties and return all definitions that are used +def _recursive_search( + json_data: Dict[str, Any], + target_key: str, +) -> List[Any]: + """Given a target key return all instances of a key in a json object.""" + result = [] + + if isinstance(json_data, dict): + for key, value in json_data.items(): + if key == target_key: + result.append(value) + else: + result.extend(_recursive_search(value, 
target_key)) + elif isinstance(json_data, list): + for item in json_data: + result.extend(_recursive_search(item, target_key)) + + return result + + +# Get all the property dependencies +def _get_all_ref_attributes(json_object: Dict[str, Any]) -> List[Any]: + """Given a json object, return all the reference attributes.""" + return _recursive_search(json_object, "$ref") + + +def get_property_dependencies( + property_dict: Dict[str, Any], + input_json_schema: Dict[str, Any], + existing_property_dependencies: Optional[List[Any]] = None, +) -> List[str]: + """Recursively collect all dependencies for a property.""" + # Initialise return list + if existing_property_dependencies is None: + existing_property_dependencies = [] + + # All reference attributes + for reference_attribute in _get_all_ref_attributes(property_dict): + # Get the value from the reference attribute + reference_value = get_value_from_uri(reference_attribute) + # If the reference value is not in the existing property dependencies, add it + if reference_value not in existing_property_dependencies: + existing_property_dependencies.append(reference_value) + # Get the property dependencies of the reference value + existing_property_dependencies.extend( + get_property_dependencies( + input_json_schema["definitions"][reference_value], + input_json_schema, + existing_property_dependencies, + ) + ) + + return existing_property_dependencies + + +def slim_definitions(input_json_schema: Dict[str, Any]) -> Dict[str, Any]: + """ + Slim down the schema to only the definitions that are used by the properties. + + Traverse the properties and return all definitions that are used. + Remove all other definitions. 
+ """ + # Copy schema + input_json_schema = deepcopy(input_json_schema) + + # Get required definitions + required_definitions = get_property_dependencies( + input_json_schema.get("properties", {}), input_json_schema + ) + + for definition_key in list(input_json_schema["definitions"].keys()): + if definition_key not in required_definitions: + del input_json_schema["definitions"][definition_key] + + return input_json_schema + + +def arg_parser() -> argparse.ArgumentParser: + """Build the argument parser.""" + parser = argparse.ArgumentParser(description="Generate JSON Schema from a CWL URI.") + parser.add_argument("cwl_url", help="URL or Path to the CWL document") + parser.add_argument( + "-o", + "--output", + type=argparse.FileType("w"), + default=sys.stdout, + help="Output file. Default is stdout.", + ) + return parser + + +def parse_args(args: List[str]) -> argparse.Namespace: + """Parse the command line arguments.""" + return arg_parser().parse_args(args) + + +def main() -> None: + """Console entry point.""" + sys.exit(run(parse_args(sys.argv[1:]))) + + +def get_cwl_url(url: str) -> str: + """ + Conform to uri format. + + If no scheme, then assert is a local file path and exists + if scheme is file, then assert is a local file path and exists + If scheme is not file, then assert is a valid Web URL + Return either the url or the local path as a uri. 
+ """ + if not is_uri(url): + if not Path(url).exists(): + logging.error("The CWL URL is invalid.") + raise FileNotFoundError + return Path(url).as_uri() + elif is_local_uri(url): + if not Path(urlparse(url).path).exists(): + logging.error("The CWL URL is invalid.") + raise FileNotFoundError + return url + else: + # urlparse(url).scheme not in ['file']: + response = requests.get(url, timeout=20) + if response.status_code != 200: + logging.error("The CWL URL is invalid.") + raise FileNotFoundError + return url + + +def run(args: argparse.Namespace) -> int: + """Run the main program.""" + # Check the cwl_url is valid + cwl_url = get_cwl_url(args.cwl_url) + + # Check the output file is writable + if args.output.name != "": + if not Path(args.output.name).parent.is_dir(): + logging.error( + "The output file is not writable, the output parent directory does not exist" + ) + return 1 + + _logger.info("Loading the CWL document") + cwl_obj = load_document_by_uri(cwl_url) + + try: + jsonschema = cwl_to_jsonschema(cwl_obj) + except Exception as e: + _logger.exception( + "Failed to generate JSON Schema from CWL inputs object. 
Error: %s", e + ) + return 1 + args.output.write(json.dumps(jsonschema, indent=2) + "\n") + + return 0 + + +if __name__ == "__main__": + main() diff --git a/cwl_utils/parser/__init__.py b/cwl_utils/parser/__init__.py index 36d977b1..e493091a 100644 --- a/cwl_utils/parser/__init__.py +++ b/cwl_utils/parser/__init__.py @@ -21,30 +21,12 @@ InputParameter = Union[ cwl_v1_0.InputParameter, cwl_v1_1.InputParameter, cwl_v1_2.InputParameter ] -"""Type union for a CWL v1.x InputParameter object.""" -InputArraySchema = Union[ - cwl_v1_0.InputArraySchema, - cwl_v1_1.InputArraySchema, - cwl_v1_2.InputArraySchema, -] -"""Type union for a CWL v1.x InputArraySchema object.""" -InputEnumSchema = Union[ - cwl_v1_0.InputEnumSchema, - cwl_v1_1.InputEnumSchema, - cwl_v1_2.InputEnumSchema, -] """Type union for a CWL v1.x InputEnumSchema object.""" InputRecordField = Union[ cwl_v1_0.InputRecordField, cwl_v1_1.InputRecordField, cwl_v1_2.InputRecordField, ] -"""Type union for a CWL v1.x InputRecordField object.""" -InputRecordSchema = Union[ - cwl_v1_0.InputRecordSchema, - cwl_v1_1.InputRecordSchema, - cwl_v1_2.InputRecordSchema, -] """Type union for a CWL v1.x InputRecordSchema object.""" OutputParameter = Union[ cwl_v1_0.OutputParameter, cwl_v1_1.OutputParameter, cwl_v1_2.OutputParameter @@ -110,6 +92,11 @@ CommandLineTool = Union[ cwl_v1_0.CommandLineTool, cwl_v1_1.CommandLineTool, cwl_v1_2.CommandLineTool ] +CommandLineToolTypes = ( + cwl_v1_0.CommandLineTool, + cwl_v1_1.CommandLineTool, + cwl_v1_2.CommandLineTool, +) """Type union for a CWL v1.x CommandLineTool object.""" CommandLineBinding = Union[ cwl_v1_0.CommandLineBinding, @@ -179,12 +166,36 @@ ) """Type union for a CWL v1.x SoftwareRequirement object.""" ArraySchema = Union[cwl_v1_0.ArraySchema, cwl_v1_1.ArraySchema, cwl_v1_2.ArraySchema] +InputArraySchema = Union[ + cwl_v1_0.InputArraySchema, cwl_v1_1.InputArraySchema, cwl_v1_2.InputArraySchema +] +InputArraySchemaTypes = ( + cwl_v1_0.InputArraySchema, + 
cwl_v1_1.InputArraySchema, + cwl_v1_2.InputArraySchema, +) """Type Union for a CWL v1.x ArraySchema object.""" EnumSchema = Union[cwl_v1_0.EnumSchema, cwl_v1_1.EnumSchema, cwl_v1_2.EnumSchema] +InputEnumSchema = Union[ + cwl_v1_0.InputEnumSchema, cwl_v1_1.InputEnumSchema, cwl_v1_2.InputEnumSchema +] +InputEnumSchemaTypes = ( + cwl_v1_0.InputEnumSchema, + cwl_v1_1.InputEnumSchema, + cwl_v1_2.InputEnumSchema, +) """Type Union for a CWL v1.x EnumSchema object.""" RecordSchema = Union[ cwl_v1_0.RecordSchema, cwl_v1_1.RecordSchema, cwl_v1_2.RecordSchema ] +InputRecordSchema = Union[ + cwl_v1_0.InputRecordSchema, cwl_v1_1.InputRecordSchema, cwl_v1_2.InputRecordSchema +] +InputRecordSchemaTypes = ( + cwl_v1_0.InputRecordSchema, + cwl_v1_1.InputRecordSchema, + cwl_v1_2.InputRecordSchema, +) """Type Union for a CWL v1.x RecordSchema object.""" File = Union[cwl_v1_0.File, cwl_v1_1.File, cwl_v1_2.File] """Type Union for a CWL v1.x File object.""" diff --git a/cwl_utils/templates/WorkflowInputJsonSchemaTemplate.md b/cwl_utils/templates/WorkflowInputJsonSchemaTemplate.md new file mode 100644 index 00000000..74e7db0b --- /dev/null +++ b/cwl_utils/templates/WorkflowInputJsonSchemaTemplate.md @@ -0,0 +1,1275 @@ +# Generating the Workflow Input JSON Schema Template + + +* [Generating the Workflow Input JSON Schema Template](#generating-the-workflow-input-json-schema-template) + * [Part 1 - Clone the CWL-TS-Auto directory](#part-1---clone-the-cwl-ts-auto-directory) + * [Part 2 - Install the typescript-json-schema package](#part-2---install-the-typescript-json-schema-package) + * [Part 3 - Generate the Workflow Input JSON Schema Template](#part-3---generate-the-workflow-input-json-schema-template) + * [Part 4 - Refine the template with the following python script](#part-4---refine-the-template-with-the-following-python-script) + * [Part 5 - Run schema generation against all tests in the cwl v1.2 directory](#part-5---run-schema-generation-against-all-tests-in-the-cwl-v12-directory) + 
+ +## Part 1 - Clone the CWL-TS-Auto directory + +``` +git clone https://github.com/common-workflow-lab/cwl-ts-auto + +cd cwl-ts-auto +``` + +## Part 2 - Install the typescript-json-schema package + +``` +npm install typescript-json-schema@^0.62.0 +``` + +## Part 3 - Generate the Workflow Input JSON Schema Template + +```bash +npx typescript-json-schema \ + --required \ + --noExtraProps \ + tsconfig.json \ + WorkflowInputParameter > workflow_input_json_schema_template.primary.json +``` + +## Part 4 - Refine the template with the following python script + +
+ +Click to expand! + +```python +#!/usr/bin/env python3 + +import json +from copy import deepcopy +from itertools import chain +from pathlib import Path +from typing import Dict, List, Any +import sys + +CLEANED_FILE_DEFINITION = { + "File": { + "additionalProperties": False, + "description": "Represents a file (or group of files when `secondaryFiles` is provided) that\nwill be accessible by tools using standard POSIX file system call API such as\nopen(2) and read(2).\n\nFiles are represented as objects with `class` of `File`. File objects have\na number of properties that provide metadata about the file.\n\nThe `location` property of a File is a URI that uniquely identifies the\nfile. Implementations must support the `file://` URI scheme and may support\nother schemes such as `http://` and `https://`. The value of `location` may also be a\nrelative reference, in which case it must be resolved relative to the URI\nof the document it appears in. Alternately to `location`, implementations\nmust also accept the `path` property on File, which must be a filesystem\npath available on the same host as the CWL runner (for inputs) or the\nruntime environment of a command line tool execution (for command line tool\noutputs).\n\nIf no `location` or `path` is specified, a file object must specify\n`contents` with the UTF-8 text content of the file. This is a \"file\nliteral\". File literals do not correspond to external resources, but are\ncreated on disk with `contents` with when needed for executing a tool.\nWhere appropriate, expressions can return file literals to define new files\non a runtime. The maximum size of `contents` is 64 kilobytes.\n\nThe `basename` property defines the filename on disk where the file is\nstaged. This may differ from the resource name. 
If not provided,\n`basename` must be computed from the last path part of `location` and made\navailable to expressions.\n\nThe `secondaryFiles` property is a list of File or Directory objects that\nmust be staged in the same directory as the primary file. It is an error\nfor file names to be duplicated in `secondaryFiles`.\n\nThe `size` property is the size in bytes of the File. It must be computed\nfrom the resource and made available to expressions. The `checksum` field\ncontains a cryptographic hash of the file content for use it verifying file\ncontents. Implementations may, at user option, enable or disable\ncomputation of the `checksum` field for performance or other reasons.\nHowever, the ability to compute output checksums is required to pass the\nCWL conformance test suite.\n\nWhen executing a CommandLineTool, the files and secondary files may be\nstaged to an arbitrary directory, but must use the value of `basename` for\nthe filename. The `path` property must be file path in the context of the\ntool execution runtime (local to the compute node, or within the executing\ncontainer). All computed properties should be available to expressions.\nFile literals also must be staged and `path` must be set.\n\nWhen collecting CommandLineTool outputs, `glob` matching returns file paths\n(with the `path` property) and the derived properties. This can all be\nmodified by `outputEval`. Alternately, if the file `cwl.output.json` is\npresent in the output, `outputBinding` is ignored.\n\nFile objects in the output must provide either a `location` URI or a `path`\nproperty in the context of the tool execution runtime (local to the compute\nnode, or within the executing container).\n\nWhen evaluating an ExpressionTool, file objects must be referenced via\n`location` (the expression tool does not have access to files on disk so\n`path` is meaningless) or as file literals. It is legal to return a file\nobject with an existing `location` but a different `basename`. 
The\n`loadContents` field of ExpressionTool inputs behaves the same as on\nCommandLineTool inputs, however it is not meaningful on the outputs.\n\nAn ExpressionTool may forward file references from input to output by using\nthe same value for `location`.", + "properties": { + "basename": { + "description": "The base name of the file, that is, the name of the file without any\nleading directory path. The base name must not contain a slash `/`.\n\nIf not provided, the implementation must set this field based on the\n`location` field by taking the final path component after parsing\n`location` as an IRI. If `basename` is provided, it is not required to\nmatch the value from `location`.\n\nWhen this file is made available to a CommandLineTool, it must be named\nwith `basename`, i.e. the final component of the `path` field must match\n`basename`.", + "type": "string" + }, + "checksum": { + "description": "Optional hash code for validating file integrity. Currently, must be in the form\n\"sha1$ + hexadecimal string\" using the SHA-1 algorithm.", + "type": "string" + }, + "class": { + "const": "File", + "description": "Must be `File` to indicate this object describes a file.", + "type": "string" + }, + "contents": { + "description": "File contents literal.\n\nIf neither `location` nor `path` is provided, `contents` must be\nnon-null. The implementation must assign a unique identifier for the\n`location` field. When the file is staged as input to CommandLineTool,\nthe value of `contents` must be written to a file.\n\nIf `contents` is set as a result of a Javascript expression,\nan `entry` in `InitialWorkDirRequirement`, or read in from\n`cwl.output.json`, there is no specified upper limit on the\nsize of `contents`. 
Implementations may have practical limits\non the size of `contents` based on memory and storage\navailable to the workflow runner or other factors.\n\nIf the `loadContents` field of an `InputParameter` or\n`OutputParameter` is true, and the input or output File object\n`location` is valid, the file must be a UTF-8 text file 64 KiB\nor smaller, and the implementation must read the entire\ncontents of the file and place it in the `contents` field. If\nthe size of the file is greater than 64 KiB, the\nimplementation must raise a fatal error.", + "type": "string" + }, + "dirname": { + "description": "The name of the directory containing file, that is, the path leading up\nto the final slash in the path such that `dirname + '/' + basename ==\npath`.\n\nThe implementation must set this field based on the value of `path`\nprior to evaluating parameter references or expressions in a\nCommandLineTool document. This field must not be used in any other\ncontext.", + "type": "string" + }, + "format": { + "description": "The format of the file: this must be an IRI of a concept node that\nrepresents the file format, preferably defined within an ontology.\nIf no ontology is available, file formats may be tested by exact match.\n\nReasoning about format compatibility must be done by checking that an\ninput file format is the same, `owl:equivalentClass` or\n`rdfs:subClassOf` the format required by the input parameter.\n`owl:equivalentClass` is transitive with `rdfs:subClassOf`, e.g. if\n` owl:equivalentClass ` and ` owl:subclassOf ` then infer\n` owl:subclassOf `.\n\nFile format ontologies may be provided in the \"$schemas\" metadata at the\nroot of the document. If no ontologies are specified in `$schemas`, the\nruntime may perform exact file format matches.", + "type": "string" + }, + "location": { + "description": "An IRI that identifies the file resource. This may be a relative\nreference, in which case it must be resolved using the base IRI of the\ndocument. 
The location may refer to a local or remote resource; the\nimplementation must use the IRI to retrieve file content. If an\nimplementation is unable to retrieve the file content stored at a\nremote resource (due to unsupported protocol, access denied, or other\nissue) it must signal an error.\n\nIf the `location` field is not provided, the `contents` field must be\nprovided. The implementation must assign a unique identifier for\nthe `location` field.\n\nIf the `path` field is provided but the `location` field is not, an\nimplementation may assign the value of the `path` field to `location`,\nthen follow the rules above.", + "type": "string" + }, + "nameext": { + "description": "The basename extension such that `nameroot + nameext == basename`, and\n`nameext` is empty or begins with a period and contains at most one\nperiod. Leading periods on the basename are ignored; a basename of\n`.cshrc` will have an empty `nameext`.\n\nThe implementation must set this field automatically based on the value\nof `basename` prior to evaluating parameter references or expressions.", + "type": "string" + }, + "nameroot": { + "description": "The basename root such that `nameroot + nameext == basename`, and\n`nameext` is empty or begins with a period and contains at most one\nperiod. For the purposes of path splitting leading periods on the\nbasename are ignored; a basename of `.cshrc` will have a nameroot of\n`.cshrc`.\n\nThe implementation must set this field automatically based on the value\nof `basename` prior to evaluating parameter references or expressions.", + "type": "string" + }, + "path": { + "description": "The local host path where the File is available when a CommandLineTool is\nexecuted. This field must be set by the implementation. The final\npath component must match the value of `basename`. This field\nmust not be used in any other context. 
The command line tool being\nexecuted must be able to access the file at `path` using the POSIX\n`open(2)` syscall.\n\nAs a special case, if the `path` field is provided but the `location`\nfield is not, an implementation may assign the value of the `path`\nfield to `location`, and remove the `path` field.\n\nIf the `path` contains [POSIX shell metacharacters](http://pubs.opengroup.org/onlinepubs/9699919799/utilities/V3_chap02.html#tag_18_02)\n(`|`,`&`, `;`, `<`, `>`, `(`,`)`, `$`,`` ` ``, `\\`, `\"`, `'`,\n``, ``, and ``) or characters\n[not allowed](http://www.iana.org/assignments/idna-tables-6.3.0/idna-tables-6.3.0.xhtml)\nfor [Internationalized Domain Names for Applications](https://tools.ietf.org/html/rfc6452)\nthen implementations may terminate the process with a\n`permanentFailure`.", + "type": "string" + }, + "secondaryFiles": { + "description": "A list of additional files or directories that are associated with the\nprimary file and must be transferred alongside the primary file.\nExamples include indexes of the primary file, or external references\nwhich must be included when loading primary document. A file object\nlisted in `secondaryFiles` may itself include `secondaryFiles` for\nwhich the same rules apply.", + "items": { + "anyOf": [ + { + "$ref": "#/definitions/File" + }, + { + "$ref": "#/definitions/Directory" + } + ] + }, + "type": "array" + }, + "size": { + "description": "Optional file size (in bytes)", + "type": "number" + } + }, + "required": [ + "class" + ], + "type": "object" + } +} + +CLEANED_DIRECTORY_DEFINITION = { + "Directory": { + "additionalProperties": False, + "description": "Represents a directory to present to a command line tool.\n\nDirectories are represented as objects with `class` of `Directory`. Directory objects have\na number of properties that provide metadata about the directory.\n\nThe `location` property of a Directory is a URI that uniquely identifies\nthe directory. 
Implementations must support the file:// URI scheme and may\nsupport other schemes such as http://. Alternately to `location`,\nimplementations must also accept the `path` property on Directory, which\nmust be a filesystem path available on the same host as the CWL runner (for\ninputs) or the runtime environment of a command line tool execution (for\ncommand line tool outputs).\n\nA Directory object may have a `listing` field. This is a list of File and\nDirectory objects that are contained in the Directory. For each entry in\n`listing`, the `basename` property defines the name of the File or\nSubdirectory when staged to disk. If `listing` is not provided, the\nimplementation must have some way of fetching the Directory listing at\nruntime based on the `location` field.\n\nIf a Directory does not have `location`, it is a Directory literal. A\nDirectory literal must provide `listing`. Directory literals must be\ncreated on disk at runtime as needed.\n\nThe resources in a Directory literal do not need to have any implied\nrelationship in their `location`. For example, a Directory listing may\ncontain two files located on different hosts. 
It is the responsibility of\nthe runtime to ensure that those files are staged to disk appropriately.\nSecondary files associated with files in `listing` must also be staged to\nthe same Directory.\n\nWhen executing a CommandLineTool, Directories must be recursively staged\nfirst and have local values of `path` assigned.\n\nDirectory objects in CommandLineTool output must provide either a\n`location` URI or a `path` property in the context of the tool execution\nruntime (local to the compute node, or within the executing container).\n\nAn ExpressionTool may forward file references from input to output by using\nthe same value for `location`.\n\nName conflicts (the same `basename` appearing multiple times in `listing`\nor in any entry in `secondaryFiles` in the listing) is a fatal error.", + "properties": { + "basename": { + "description": "The base name of the directory, that is, the name of the file without any\nleading directory path. The base name must not contain a slash `/`.\n\nIf not provided, the implementation must set this field based on the\n`location` field by taking the final path component after parsing\n`location` as an IRI. If `basename` is provided, it is not required to\nmatch the value from `location`.\n\nWhen this file is made available to a CommandLineTool, it must be named\nwith `basename`, i.e. the final component of the `path` field must match\n`basename`.", + "type": "string" + }, + "class": { + "const": "Directory", + "description": "Must be `Directory` to indicate this object describes a Directory.", + "type": "string" + }, + "listing": { + "description": "List of files or subdirectories contained in this directory. The name\nof each file or subdirectory is determined by the `basename` field of\neach `File` or `Directory` object. It is an error if a `File` shares a\n`basename` with any other entry in `listing`. 
If two or more\n`Directory` object share the same `basename`, this must be treated as\nequivalent to a single subdirectory with the listings recursively\nmerged.", + "items": { + "anyOf": [ + { + "$ref": "#/definitions/File" + }, + { + "$ref": "#/definitions/Directory" + } + ] + }, + "type": "array" + }, + "location": { + "description": "An IRI that identifies the directory resource. This may be a relative\nreference, in which case it must be resolved using the base IRI of the\ndocument. The location may refer to a local or remote resource. If\nthe `listing` field is not set, the implementation must use the\nlocation IRI to retrieve directory listing. If an implementation is\nunable to retrieve the directory listing stored at a remote resource (due to\nunsupported protocol, access denied, or other issue) it must signal an\nerror.\n\nIf the `location` field is not provided, the `listing` field must be\nprovided. The implementation must assign a unique identifier for\nthe `location` field.\n\nIf the `path` field is provided but the `location` field is not, an\nimplementation may assign the value of the `path` field to `location`,\nthen follow the rules above.", + "type": "string" + }, + "path": { + "description": "The local path where the Directory is made available prior to executing a\nCommandLineTool. This must be set by the implementation. This field\nmust not be used in any other context. 
The command line tool being\nexecuted must be able to access the directory at `path` using the POSIX\n`opendir(2)` syscall.\n\nIf the `path` contains [POSIX shell metacharacters](http://pubs.opengroup.org/onlinepubs/9699919799/utilities/V3_chap02.html#tag_18_02)\n(`|`,`&`, `;`, `<`, `>`, `(`,`)`, `$`,`` ` ``, `\\`, `\"`, `'`,\n``, ``, and ``) or characters\n[not allowed](http://www.iana.org/assignments/idna-tables-6.3.0/idna-tables-6.3.0.xhtml)\nfor [Internationalized Domain Names for Applications](https://tools.ietf.org/html/rfc6452)\nthen implementations may terminate the process with a\n`permanentFailure`.", + "type": "string" + } + }, + "required": [ + "class" + ], + "type": "object" + } +} + +CLEANED_ANY_DEFINITION = { + "Any": { + "description": "A placeholder for any type of CWL object.", + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "integer" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + }, + { + "$ref": "#/definitions/File" + }, + { + "$ref": "#/definitions/Directory" + } + ], + "properties": dict( + map( + lambda iter_: (iter_, True), + list( + set( + list(CLEANED_DIRECTORY_DEFINITION["Directory"]["properties"].keys()) + + list(CLEANED_FILE_DEFINITION["File"]["properties"].keys()) + ) + ) + ) + ) + } +} + +DEFINTIIONS_TO_REMOVE = ["DefaultFetcher", "Dictionary", "Dictionary", "Fetcher", "T"] + + + +def remove_loading_options_and_extension_fields_from_schema(schema_dict: Any) -> Dict: + """ + Remove loadingOptions from schema recursively + :param schema_dict: + :return: + """ + + new_schema_dict = {} + + if isinstance(schema_dict, Dict): + for key, value in deepcopy(schema_dict).items(): + if isinstance(value, Dict): + if "loadingOptions" in value: + del value["loadingOptions"] + if "LoadingOptions" in value: + del value["LoadingOptions"] + if "extensionFields" in value: + del value["extensionFields"] + new_schema_dict[key] = 
remove_loading_options_and_extension_fields_from_schema(value) + elif isinstance(value, List): + if "loadingOptions" in value: + _ = value.pop(value.index("loadingOptions")) + if "LoadingOptions" in value: + _ = value.pop(value.index("LoadingOptions")) + if "extensionFields" in value: + _ = value.pop(value.index("extensionFields")) + new_schema_dict[key] = remove_loading_options_and_extension_fields_from_schema(value) + else: + new_schema_dict[key] = value + elif isinstance(schema_dict, List): + new_schema_dict = list( + map(lambda value_iter: remove_loading_options_and_extension_fields_from_schema(value_iter), schema_dict)) + else: + # Item is a list of number + new_schema_dict = schema_dict + + return new_schema_dict + + +def read_schema_in_from_file(file_path: Path) -> Dict: + """ + Read in the auto-generated schema from the file + :param file_path: + :return: + """ + if not file_path.exists(): + raise FileNotFoundError(f"File {file_path} does not exist") + + with open(file_path, "r") as file_h: + return json.load(file_h) + + +def assert_definitions_key(schema_dict: Dict): + """ + Ensure that the definitions key is part of the schema dictionary and is itself is a dictionary + :param schema_dict: + :return: + """ + if "definitions" not in schema_dict.keys() and not isinstance(schema_dict["definitions"], Dict): + raise ValueError("Schema does not contain a 'definitions' key or 'definitions' is not a dictionary") + + +def add_import_and_include_to_schema(schema_dict) -> Dict: + """ + Under the definitions section, add in the $import and $include definitions + Copied from https://github.com/common-workflow-language/cwl-v1.2/blob/76bdf9b55e2378432e0e6380ccedebb4a94ce483/json-schema/cwl.yaml#L57-L72 + + { + "CWLImportManual": { + "description": \"\"\" + Represents an '$import' directive that should point toward another compatible CWL file to import + where specified. 
+ The contents of the imported file should be relevant contextually where it is being imported + \"\"\", + "$comment": \"\"\" + The schema validation of the CWL will not itself perform the '$import' to resolve and validate its contents. + Therefore, the complete schema will not be validated entirely, and could still be partially malformed. + To ensure proper and exhaustive validation of a CWL definition with this schema, all '$import' directives + should be resolved and extended beforehand. + \"\"\", + "type": "object", + "properties": { + "$import": { + "type": "string" + } + }, + "required": [ + "$import" + ], + "additionalProperties": false + } + } + + Ditto for $include directive + + { + "CWLIncludeManual": { + "description": " + Represents an '$include' directive that should point toward another compatible CWL file to import + where specified. + The contents of the imported file should be relevant contextually where it is being imported + ", + "$comment": " + The schema validation of the CWL will not itself perform the '$include' to resolve and validate its contents. + Therefore, the complete schema will not be validated entirely, and could still be partially malformed. + To ensure proper and exhaustive validation of a CWL definition with this schema, all '$include' directives + should be resolved and extended beforehand. + ", + "type": "object", + "properties": { + "$include": { + "type": "string" + } + }, + "required": [ + "$include" + ], + "additionalProperties": false + } + } + + + :param schema_dict: + :return: + """ + + # Always do a deepcopy on the input + schema_dict = deepcopy(schema_dict) + + # Confirm definitions key + assert_definitions_key(schema_dict) + + # Add in the $import and $include to the definitions + schema_dict["definitions"].update( + { + "CWLImportManual": { + "description": "" + "Represents an '$import' directive that should point toward another compatible " + "CWL file to import where specified. 
The contents of the imported file should be " + "relevant contextually where it is being imported", + "$comment": "" + "The schema validation of the CWL will not itself perform the '$import' to resolve and " + "validate its contents. Therefore, the complete schema will not be validated entirely, " + "and could still be partially malformed. " + "To ensure proper and exhaustive validation of a CWL definition with this schema, " + "all '$import' directives should be resolved and extended beforehand", + "type": "object", + "properties": { + "$import": { + "type": "string" + } + }, + "required": [ + "$import" + ], + "additionalProperties": False + }, + "CWLIncludeManual": { + "description": "" + "Represents an '$include' directive that should point toward another compatible " + "CWL file to import where specified. The contents of the imported file should be " + "relevant contextually where it is being imported", + "$comment": "" + "The schema validation of the CWL will not itself perform the '$include' to resolve and " + "validate its contents. Therefore, the complete schema will not be validated entirely, " + "and could still be partially malformed. 
" + "To ensure proper and exhaustive validation of a CWL definition with this schema, " + "all '$include' directives should be resolved and extended beforehand", + "type": "object", + "properties": { + "$include": { + "type": "string" + } + }, + "required": [ + "$include" + ], + "additionalProperties": False + } + } + ) + + return schema_dict + + +def fix_inline_javascript_requirement(schema_dict: Dict) -> Dict: + """ + Fix the InlineJavascriptRequirement.expressionLib array to allow for $include + + FROM + + { + "InlineJavascriptRequirement": { + "additionalProperties": false, + "description": "Auto-generated class implementation for https://w3id.org/cwl/cwl#InlineJavascriptRequirement\n\nIndicates that the workflow platform must support inline Javascript expressions.\nIf this requirement is not present, the workflow platform must not perform expression\ninterpolation.", + "properties": { + "class": { + "const": "InlineJavascriptRequirement", + "description": "Always 'InlineJavascriptRequirement'", + "type": "string" + }, + "expressionLib": { + "description": "Additional code fragments that will also be inserted\nbefore executing the expression code. 
Allows for function definitions that may\nbe called from CWL expressions.", + "items": { + "type": "string" + }, + "type": "array" + }, + "extensionFields": { + "$ref": "#/definitions/Dictionary" + }, + "loadingOptions": { + "$ref": "#/definitions/LoadingOptions" + } + }, + "required": [ + "class" + ], + "type": "object" + } + } + + TO + + { + "InlineJavascriptRequirement": { + "additionalProperties": false, + "description": "Auto-generated class implementation for https://w3id.org/cwl/cwl#InlineJavascriptRequirement\n\nIndicates that the workflow platform must support inline Javascript expressions.\nIf this requirement is not present, the workflow platform must not perform expression\ninterpolation.", + "properties": { + "class": { + "const": "InlineJavascriptRequirement", + "description": "Always 'InlineJavascriptRequirement'", + "type": "string" + }, + "expressionLib": { + "description": "Additional code fragments that will also be inserted\nbefore executing the expression code. Allows for function definitions that may\nbe called from CWL expressions.", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "$ref": "#/definitions/CWLIncludeManual" + } + ] + }, + "type": "array" + }, + "extensionFields": { + "$ref": "#/definitions/Dictionary" + } + }, + "required": [ + "class" + ], + "type": "object" + } + } + + """ + + # Always do a deepcopy on the input + schema_dict = deepcopy(schema_dict) + + # Confirm definitions key + assert_definitions_key(schema_dict) + + # Assert InlineJavascriptRequirement exists in definitions + if "InlineJavascriptRequirement" not in schema_dict["definitions"]: + raise ValueError("Schema does not contain an 'InlineJavascriptRequirement' key in 'definitions'") + + # Confirm that the InlineJavascriptRequirement has a properties key and the properties key is a dictionary + if ( + "properties" not in schema_dict["definitions"]["InlineJavascriptRequirement"] or + not 
isinstance(schema_dict["definitions"]["InlineJavascriptRequirement"]["properties"], Dict) + ): + raise ValueError( + "Schema does not contain a 'properties' key in 'InlineJavascriptRequirement.definitions' " + "or 'properties' is not a dictionary" + ) + + # Confirm that properties has an expressionLib key + if "expressionLib" not in schema_dict["definitions"]["InlineJavascriptRequirement"]["properties"]: + raise ValueError("Schema does not contain an 'expressionLib' key in 'InlineJavascriptRequirement.properties'") + + # Confirm that expressionLib is of type array and has an items key + if ( + "type" not in schema_dict["definitions"]["InlineJavascriptRequirement"]["properties"]["expressionLib"] + or + not schema_dict["definitions"]["InlineJavascriptRequirement"]["properties"]["expressionLib"][ + "type"] == "array" + or + "items" not in schema_dict["definitions"]["InlineJavascriptRequirement"]["properties"]["expressionLib"] + ): + raise ValueError( + "Schema does not contain an 'expressionLib' key in 'InlineJavascriptRequirement.properties' " + "of type array with an 'items' key" + ) + + # Allow for $include in the expressionLib array by updating the the expressionLib items to be a anyOf array + schema_dict["definitions"]["InlineJavascriptRequirement"]["properties"]["expressionLib"]["items"] = { + "anyOf": [ + { + "type": "string" + }, + { + "$ref": "#/definitions/CWLIncludeManual" + } + ] + } + + return schema_dict + + +def fix_schema_def_requirement(schema_dict: Dict) -> Dict: + """ + Allow SchemaDefRequirement.types array to be $import type + + FROM + + { + "SchemaDefRequirement": { + "additionalProperties": false, + "description": "" + "Auto-generated class implementation for https://w3id.org/cwl/cwl#SchemaDefRequirement" + "This field consists of an array of type definitions which must be used when" + "interpreting the `inputs` and `outputs` fields. 
When a `type` field" + "contains a IRI, the implementation must check if the type is defined in" + "`schemaDefs` and use that definition. If the type is not found in" + "`schemaDefs`, it is an error. The entries in `schemaDefs` must be" + "processed in the order listed such that later schema definitions may refer" + "to earlier schema definitions." + "- **Type definitions are allowed for `enum` and `record` types only.**" + "- Type definitions may be shared by defining them in a file and then" + " `$include`-ing them in the `types` field.\n- A file can contain a list of type definitions", + "properties": { + "class": { + "const": "SchemaDefRequirement", + "description": "Always 'SchemaDefRequirement'", + "type": "string" + }, + "extensionFields": { + "$ref": "#/definitions/Dictionary" + }, + "loadingOptions": { + "$ref": "#/definitions/LoadingOptions" + }, + "types": { + "description": "The list of type definitions.", + "items": { + "anyOf": [ + { + "$ref": "#/definitions/CommandInputArraySchema" + }, + { + "$ref": "#/definitions/CommandInputRecordSchema" + }, + { + "$ref": "#/definitions/CommandInputEnumSchema" + } + ] + }, + "type": "array" + } + }, + "required": [ + "class", + "types" + ], + "type": "object" + } + } + + TO + + { + "SchemaDefRequirement": { + "additionalProperties": false, + "description": "" + "Auto-generated class implementation for https://w3id.org/cwl/cwl#SchemaDefRequirement" + "This field consists of an array of type definitions which must be used when" + "interpreting the `inputs` and `outputs` fields. When a `type` field" + "contains a IRI, the implementation must check if the type is defined in" + "`schemaDefs` and use that definition. If the type is not found in" + "`schemaDefs`, it is an error. The entries in `schemaDefs` must be" + "processed in the order listed such that later schema definitions may refer" + "to earlier schema definitions." 
+ "- **Type definitions are allowed for `enum` and `record` types only.**" + "- Type definitions may be shared by defining them in a file and then" + " `$include`-ing them in the `types` field.\n- A file can contain a list of type definitions", + "properties": { + "class": { + "const": "SchemaDefRequirement", + "description": "Always 'SchemaDefRequirement'", + "type": "string" + }, + "extensionFields": { + "$ref": "#/definitions/Dictionary" + }, + "loadingOptions": { + "$ref": "#/definitions/LoadingOptions" + }, + "types": { + "description": "The list of type definitions.", + "items": { + "anyOf": [ + { + "$ref": "#/definitions/CommandInputArraySchema" + }, + { + "$ref": "#/definitions/CommandInputRecordSchema" + }, + { + "$ref": "#/definitions/CommandInputEnumSchema" + }, + { + "$ref": "#/definitions/CWLImportManual" + } + ] + }, + "type": "array" + } + }, + "required": [ + "class", + "types" + ], + "type": "object" + } + } + + :param schema_dict: + :return: + """ + + # Always do a deepcopy on the input + schema_dict = deepcopy(schema_dict) + + # Confirm definitions key + assert_definitions_key(schema_dict) + + # Assert SchemaDefRequirement exists in definitions + if "SchemaDefRequirement" not in schema_dict["definitions"]: + raise ValueError("Schema does not contain an 'SchemaDefRequirement' key in 'definitions'") + + # Confirm that the SchemaDefRequirement has a properties key and the properties key is a dictionary + if ( + "properties" not in schema_dict["definitions"]["SchemaDefRequirement"] or + not isinstance(schema_dict["definitions"]["SchemaDefRequirement"]["properties"], Dict) + ): + raise ValueError( + "Schema does not contain a 'properties' key in 'SchemaDefRequirement.definitions' " + "or 'properties' is not a dictionary" + ) + + # Confirm that properties has a types key + if "types" not in schema_dict["definitions"]["SchemaDefRequirement"]["properties"]: + raise ValueError("Schema does not contain an 'types' key in 'SchemaDefRequirement.properties'") 
+ + # Confirm that types is of type array and has an items key + if ( + "type" not in schema_dict["definitions"]["SchemaDefRequirement"]["properties"]["types"] + or + not schema_dict["definitions"]["SchemaDefRequirement"]["properties"]["types"]["type"] == "array" + or + "items" not in schema_dict["definitions"]["SchemaDefRequirement"]["properties"]["types"] + ): + raise ValueError( + "Schema does not contain an 'types' key in 'SchemaDefRequirement.properties' " + "of type array with an 'items' key" + ) + + # Confirm that the types items has an anyOf key and the anyOf key is an array + if ( + "anyOf" not in schema_dict["definitions"]["SchemaDefRequirement"]["properties"]["types"]["items"] + or + not isinstance(schema_dict["definitions"]["SchemaDefRequirement"]["properties"]["types"]["items"]["anyOf"], + List) + ): + raise ValueError( + "Schema does not contain an 'anyOf' key in 'SchemaDefRequirement.properties.types.items' " + "or 'anyOf' is not a list" + ) + + # Allow for $import in the types array by updating the types items to be a anyOf array + schema_dict["definitions"]["SchemaDefRequirement"]["properties"]["types"]["items"]["anyOf"].append( + { + "$ref": "#/definitions/CWLImportManual" + } + ) + + return schema_dict + + +def add_cwl_metadata_to_schema(schema_dict: Dict) -> Dict: + """ + Add in the CWL metadata to the schema + Derived from https://github.com/common-workflow-language/cwl-v1.2/blob/76bdf9b55e2378432e0e6380ccedebb4a94ce483/json-schema/cwl.yaml#L2231-L2241 + :param schema_dict: + :return: + """ + + # Always do a deepcopy on the input + schema_dict = deepcopy(schema_dict) + + # Assert defintions + assert_definitions_key(schema_dict) + + # Add in the CWL metadata to the definitions + schema_dict["definitions"].update( + { + "CWLDocumentMetadata": { + "description": "Metadata for a CWL document", + "type": "object", + "properties": { + "$namespaces": { + "description": "The namespaces used in the document", + "type": "object", + "patternProperties": { 
+ "^[_a-zA-Z][a-zA-Z0-9_-]*$": { + "type": "string" + } + } + }, + "$schemas": { + "description": "The schemas used in the document", + "type": "array", + "items": { + "type": "string" + } + } + }, + "patternProperties": { + "^s:.*$": { + "type": "object" + }, + # Or the full version + "^https://schema.org/.*$": { + "type": "object" + } + }, + "additionalProperties": False, + "required": [] + } + } + ) + return schema_dict + + +def write_schema_out_to_file(schema_dict: Dict, file_path: Path): + """ + Write out the schema to the file + :param schema_dict: + :param file_path: + :return: + """ + with open(file_path, "w") as file_h: + json.dump(schema_dict, file_h, indent=4) + + +def rename_all_keys_with_trailing_underscore(schema_dict: Any) -> Dict: + """ + Keys such as class_, type_ etc. are renames from TypeScript. We need to rename them in the JSON schema back + to their original names to generate a valid CWL JSON schema + :param schema_dict: + :return: + """ + + new_schema_dict = {} + + if isinstance(schema_dict, Dict): + for key, value in deepcopy(schema_dict).items(): + key = key.rstrip("_") + if isinstance(value, Dict): + new_schema_dict[key] = rename_all_keys_with_trailing_underscore(value) + elif isinstance(value, List): + new_schema_dict[key] = rename_all_keys_with_trailing_underscore(value) + else: + new_schema_dict[key] = value + elif isinstance(schema_dict, List): + new_schema_dict = list( + map(lambda value_iter: rename_all_keys_with_trailing_underscore(value_iter), schema_dict)) + else: + # Item is a value + new_schema_dict = schema_dict.rstrip("_") + + return new_schema_dict + + +def add_cwl_file(schema_dict: Dict) -> Dict: + """ + Large updates to the actual file body + + Can come in two forms, File and Graph. 
+ + In File form, can be of type Workflow, ExpressionTool or CommandLineTool, + In Graph form, we have the $graph property which then has elements of type CWLFile + + Both can have the metadata objects such as $namespaces and $schemas + + We initialise both objects. + + Then state that the file can be a file or a graph + + :param schema_dict: + :return: + """ + # Always deep copy the input + schema_dict = deepcopy(schema_dict) + + # Assert $ref key + if "$ref" not in schema_dict: + raise ValueError("Schema does not contain a '$ref' key") + + # Assert $ref value is "#/definitions/Workflow" + if schema_dict["$ref"] != "#/definitions/Workflow": + raise ValueError("Schema does not contain a '$ref' value of '#/definitions/Workflow'") + + # Update the schema to use 'if-else' for CommandlineTool and Expression + schema_dict.update( + { + "$ref": "#/definitions/CWLFile", + } + ) + + schema_dict["definitions"].update( + { + # First create the yaml option + # Which is either a workflow, commandline tool or expression tool + "CWLFile": { + "type": "object", + "additionalProperties": False, + "allOf": [ + { + "oneOf": [ + { + "$ref": "#/definitions/Workflow" + }, + { + "$ref": "#/definitions/CommandLineTool" + }, + { + "$ref": "#/definitions/ExpressionTool" + } + ] + }, + { + "oneOf": [ + { + "$ref": "#/definitions/CWLDocumentMetadata" + } + ] + } + ] + } + } + ) + + return schema_dict + + +def add_cwl_graph(schema_dict: Dict) -> Dict: + """ + Large updates to the actual file body + + Can come in two forms, File and Graph. + + In File form, can be of type Workflow, ExpressionTool or CommandLineTool, + In Graph form, we have the $graph property which then has elements of type CWLFile + + Both can have the metadata objects such as $namespaces and $schemas + + We initialise both objects. 
+ + Then state that the file can be a file or a graph + + :param schema_dict: + :return: + """ + # Always deep copy the input + schema_dict = deepcopy(schema_dict) + + # Assert $ref key + if "$ref" not in schema_dict: + raise ValueError("Schema does not contain a '$ref' key") + + # Update the schema + schema_dict.update( + { + "$ref": "#/definitions/CWLGraphWithMetadata", + } + ) + + # Update definitions + schema_dict["definitions"].update( + { + # Now create the graph option + "CWLGraph": { + "type": "object", + "properties": { + "$graph": { + "type": "array", + "items": { + "$ref": "#/definitions/CWLFile" + } + }, + # Copy from Workflow + "cwlVersion": schema_dict["definitions"]["Workflow"]["properties"]["cwlVersion"] + }, + "required": [ + "$graph" + ] + }, + "CWLGraphWithMetadata": { + "type": "object", + "additionalProperties": False, + "allOf": [ + { + "$ref": "#/definitions/CWLGraph" + }, + { + "$ref": "#/definitions/CWLDocumentMetadata" + } + ] + } + } + ) + + return schema_dict + + +def fix_descriptions(schema_dict: Dict) -> Dict: + """ + Fix the descriptions for all definitions by removing the 'Auto-generated' class implementation ... 
+ Means that users will see helpful descriptions in the schema + :param schema_dict: + :return: + """ + # Always deep copy the input + schema_dict = deepcopy(schema_dict) + + # Assert definitions + assert_definitions_key(schema_dict) + + # Iterate over all definitions and remove the 'Auto-generated' class implementation + for schema_def_name, schema_def_dict in schema_dict.get("definitions", {}).items(): + if "description" not in schema_def_dict: + continue + schema_dict["definitions"][schema_def_name]["description"] = ( + schema_def_dict.get("description", "").split("\n\n", 1)[-1] + ) + + # Update top level description + schema_dict["description"] = schema_dict.get("description", "").split("\n\n", 1)[-1] + + return schema_dict + + +def fix_additional_properties(schema_dict: Dict, top_definition: str, sub_definition_keys: List) -> Dict: + """ + Fix the additionalProperties issues demonstrated in https://stoic-agnesi-d0ac4a.netlify.app/37 + :param schema_dict: + :return: + """ + # Always copy the input + schema_dict = deepcopy(schema_dict) + + # Part 1, drop additionalProperties: false from Workflow, CommandLineTool and ExpressionTool definitions + for definition_key in sub_definition_keys: + _ = schema_dict["definitions"][definition_key].pop("additionalProperties", None) + + # Part 2 + # For CWLFileorGraph definition, add in the collective set of properties keys defined under + # Workflow, CommandLineTool, ExpressionTool, $graph and CWLMetadata + # And for each property key set the value to true - + property_keys = [] + for definition_key in sub_definition_keys: + if "properties" not in schema_dict["definitions"][definition_key]: + continue + property_keys.append(list(schema_dict["definitions"][definition_key]["properties"].keys())) + property_keys = list(set(chain(*property_keys))) + + schema_dict["definitions"][top_definition]["properties"] = dict( + map( + lambda property_key_iter: (property_key_iter, True), + property_keys + ) + ) + + # Part 2a, copy over 
patternProperties + pattern_property_objects = {} + for definition_key in sub_definition_keys: + if "patternProperties" not in schema_dict["definitions"][definition_key]: + continue + pattern_property_objects.update( + schema_dict["definitions"][definition_key]["patternProperties"] + ) + + schema_dict["definitions"][top_definition]["patternProperties"] = pattern_property_objects + + # Make additionalProperties false to this top CWLDocumentMetadata + schema_dict["definitions"][top_definition]["additionalProperties"] = False + + return schema_dict + + +def fix_hints(schema_dict, definition_key): + """ + Hints property should be the same as requirements for the given key + :param schema_dict: + :param definition_key: + :return: + """ + + # Always do a deepcopy on the input + schema_dict = deepcopy(schema_dict) + + # Assert definitions key + assert_definitions_key(schema_dict) + + # Confirm definitions key exists + if definition_key not in schema_dict["definitions"]: + raise ValueError(f"Schema does not contain an '{definition_key}' key in 'definitions'") + + # Confirm that the definition_key has a properties key and the properties key is a dictionary + if ( + "properties" not in schema_dict["definitions"][definition_key] or + not isinstance(schema_dict["definitions"][definition_key]["properties"], Dict) + ): + raise ValueError( + f"Schema does not contain a 'properties' key in '{definition_key}.definitions' " + "or 'properties' is not a dictionary" + ) + + # Confirm that properties has a requirements key + if "requirements" not in schema_dict["definitions"][definition_key]["properties"]: + raise ValueError(f"Schema does not contain an 'requirements' key in '{definition_key}.properties'") + + # Copy requirements to hints + schema_dict["definitions"][definition_key]["properties"]["hints"] = \ + schema_dict["definitions"][definition_key]["properties"]["requirements"] + + return schema_dict + + +def add_file_and_directory_to_schema(schema_dict: Dict) -> Dict: + """ + Add 
file and directory defintions to schema + :param schema_dict: + :return: + """ + # Always do a deepcopy on the input + schema_dict = deepcopy(schema_dict) + + # Assert definitions key + assert_definitions_key(schema_dict) + + # Add in the File and Directory definitions to the schema + schema_dict['definitions'].update( + CLEANED_FILE_DEFINITION + ) + schema_dict['definitions'].update( + CLEANED_DIRECTORY_DEFINITION + ) + schema_dict['definitions'].update( + CLEANED_ANY_DEFINITION + ) + + return schema_dict + + +def drop_properties_and_required(schema_dict: Dict) -> Dict: + """ + Drop the properties and required keys + :param schema_dict: + :return: + """ + # Always do a deepcopy on the input + schema_dict = deepcopy(schema_dict) + + # Doesn't matter if they're not present + _ = schema_dict.pop("properties", None) + _ = schema_dict.pop("required", None) + + # Also drop additionalProperties + _ = schema_dict.pop("additionalProperties", None) + + return schema_dict + + +def main(): + # Step 1 - read in existing schema + schema_dict = read_schema_in_from_file(Path(sys.argv[1])) + + # Remove loading options from schema + schema_dict = remove_loading_options_and_extension_fields_from_schema(schema_dict) + + # Rename all keys with trailing underscore + schema_dict = rename_all_keys_with_trailing_underscore(schema_dict) + + # Drop the following definitions + # DefaultFetcher + # Dictionary + # Dictionary + # Fetcher + # T + for definition in DEFINTIIONS_TO_REMOVE: + _ = schema_dict["definitions"].pop(definition) + + # Add the following definitions + # File, Directory, + schema_dict = add_file_and_directory_to_schema(schema_dict) + + # Drop existing properties and required list (this will be populated by the script) + schema_dict = drop_properties_and_required(schema_dict) + + # Write out the new schema + write_schema_out_to_file(schema_dict, Path(sys.argv[2])) + + +if __name__ == "__main__": + main() + +``` +

Run the python script above

```
python3 refine_workflow_input_json_schema_template.py \
  workflow_input_json_schema_template.primary.json \
  workflow_input_json_schema_template.json
```


## Part 5 - Run schema generation against all tests in the cwl v1.2 directory

> Note there is a set of six URLs and inputs that we test the schema generation against in the tests directory

+ +Click to expand! + +```python +#!/usr/bin/env python3 +import json +from json import JSONDecodeError +from pathlib import Path +from subprocess import run +from tempfile import TemporaryDirectory +from jsonschema import validate +from jsonschema.exceptions import SchemaError, ValidationError +from ruamel.yaml import YAML + +from cwl_utils.loghandler import _logger as _cwlutilslogger + +FAILED_TESTS = [ + "filesarray_secondaryfiles2", # Conditional logic too complex for json schema + "wf_step_access_undeclared_param", # Inputs are valid, workflow expected to fail at tool level + "input_records_file_entry_with_format_and_bad_regular_input_file_format", # Don't validate formats + "secondary_files_missing", # Inputs are valid, workflow expected to fail at tool level + "input_records_file_entry_with_format_and_bad_entry_file_format", # Don't validate formats + "input_records_file_entry_with_format_and_bad_entry_array_file_format", # Don't validate formats + "timelimit_basic", # Inputs are valid, workflow expected to fail at tool level + "timelimit_invalid", # Inputs are valid, workflow expected to fail at tool level + "timelimit_from_expression", # Inputs are valid, workflow expected to fail at tool level + "timelimit_basic_wf", # Inputs are valid, workflow expected to fail at tool level + "timelimit_from_expression_wf", # Inputs are valid, workflow expected to fail at tool level + "networkaccess_disabled", # Inputs are valid, workflow expected to fail at tool level + "glob_outside_outputs_fails", # Inputs are valid, workflow expected to fail at tool level + "illegal_symlink", # Inputs are valid, workflow expected to fail at tool level + "params_broken_null", # Inputs are valid, workflow expected to fail at tool level + "length_for_non_array", # Inputs are valid, workflow expected to fail at tool level + "capture_files", # Inputs are valid, workflow expected to fail at tool level + "capture_dirs", # Inputs are valid, workflow expected to fail at tool level +] + +# 
Clone cwl1.2 repo into temp dir +with TemporaryDirectory() as temp_dir: + + run(['git', 'clone', 'https://github.com/common-workflow-language/cwl-v1.2'], cwd=temp_dir) + + tests_dir = Path(temp_dir) / 'cwl-v1.2' / 'tests' + + # Open conformance test yaml file + yaml = YAML() + + with open(Path(temp_dir) / 'cwl-v1.2' / "conformance_tests.yaml") as tests_yaml_h: + tests_list = yaml.load(tests_yaml_h) + + failed_runs = [] + + for test_item in tests_list: + id_ = test_item.get('id') + + # Check tool key exists + if test_item.get('tool', None) is None: + _cwlutilslogger.info(f"Skipping conformance test {id_}, no tool key") + continue + + tool = Path(test_item.get('tool').split("#", 1)[0]) + should_fail = test_item.get('should_fail', False) + + if "#" in test_item.get("tool") and not "#main" in test_item.get("tool"): + _cwlutilslogger.info(f"Skipping conformance test {id_}, we cannot load non main graphs") + continue + + if id_ in FAILED_TESTS: + _cwlutilslogger.info(f"Skipping conformance test {id_}") + continue + + _cwlutilslogger.info(f"Running conformance test {id_}") + + _cwlutilslogger.info(f"Generating schema for {tool.name}") + schema_gen_proc = run( + [ + "python3", str(Path(__file__).parent / "cwl_utils" / "inputs_schema_gen.py"), Path(temp_dir) / 'cwl-v1.2' / tool + ], + capture_output=True + ) + + if not schema_gen_proc.returncode == 0: + _cwlutilslogger.error(schema_gen_proc.stderr.decode()) + raise ChildProcessError + + schema_gen_stdout = schema_gen_proc.stdout.decode() + + if test_item.get('job', None) is None: + continue + + job = Path(test_item.get('job')) + + try: + input_schema_dict = json.loads(schema_gen_stdout) + except JSONDecodeError: + raise JSONDecodeError + + # Collect job + with open(Path(temp_dir) / 'cwl-v1.2' / job) as job_h: + job_dict = yaml.load(job_h) + + _cwlutilslogger.info(f"Testing {job.name} against schema generated for input {tool.name}") + try: + validate(job_dict, input_schema_dict) + except (ValidationError, SchemaError) as 
err: + if not should_fail: + _cwlutilslogger.error(f"Failed schema validation with {err}") + failed_runs.append(id_) + else: + if should_fail: + _cwlutilslogger.error(f"Expected to fail but passed") + failed_runs.append(id_) + + if len(failed_runs) > 0: + _cwlutilslogger.error("The following tests failed") + for failed_run in failed_runs: + _cwlutilslogger.error(failed_run) + +``` + +
\ No newline at end of file diff --git a/cwl_utils/templates/workflow_input_json_schema_template.json b/cwl_utils/templates/workflow_input_json_schema_template.json new file mode 100644 index 00000000..b07fd03f --- /dev/null +++ b/cwl_utils/templates/workflow_input_json_schema_template.json @@ -0,0 +1,471 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "definitions": { + "InputArraySchema": { + "additionalProperties": false, + "description": "Auto-generated class implementation for https://w3id.org/cwl/cwl#InputArraySchema", + "properties": { + "doc": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "string" + } + ], + "description": "A documentation string for this object, or an array of strings which should be concatenated." + }, + "items": { + "anyOf": [ + { + "$ref": "#/definitions/InputRecordSchema" + }, + { + "$ref": "#/definitions/InputEnumSchema" + }, + { + "$ref": "#/definitions/InputArraySchema" + }, + { + "items": { + "anyOf": [ + { + "$ref": "#/definitions/InputRecordSchema" + }, + { + "$ref": "#/definitions/InputEnumSchema" + }, + { + "$ref": "#/definitions/InputArraySchema" + }, + { + "type": "string" + } + ] + }, + "type": "array" + }, + { + "type": "string" + } + ], + "description": "Defines the type of the array elements." + }, + "label": { + "description": "A short, human-readable label of this object.", + "type": "string" + }, + "name": { + "description": "The identifier for this type", + "type": "string" + }, + "type": { + "const": "array", + "description": "Must be `array`", + "type": "string" + } + }, + "required": [ + "items", + "type" + ], + "type": "object" + }, + "InputBinding": { + "additionalProperties": false, + "description": "Auto-generated class implementation for https://w3id.org/cwl/cwl#InputBinding", + "properties": { + "loadContents": { + "description": "Use of `loadContents` in `InputBinding` is deprecated.\nPreserved for v1.0 backwards compatibility. 
Will be removed in\nCWL v2.0. Use `InputParameter.loadContents` instead.", + "type": "boolean" + } + }, + "required": [], + "type": "object" + }, + "InputEnumSchema": { + "additionalProperties": false, + "description": "Auto-generated class implementation for https://w3id.org/cwl/cwl#InputEnumSchema", + "properties": { + "doc": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "string" + } + ], + "description": "A documentation string for this object, or an array of strings which should be concatenated." + }, + "label": { + "description": "A short, human-readable label of this object.", + "type": "string" + }, + "name": { + "description": "The identifier for this type", + "type": "string" + }, + "symbols": { + "description": "Defines the set of valid symbols.", + "items": { + "type": "string" + }, + "type": "array" + }, + "type": { + "const": "enum", + "description": "Must be `enum`", + "type": "string" + } + }, + "required": [ + "symbols", + "type" + ], + "type": "object" + }, + "InputRecordField": { + "additionalProperties": false, + "description": "Auto-generated class implementation for https://w3id.org/cwl/cwl#InputRecordField", + "properties": { + "doc": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "string" + } + ], + "description": "A documentation string for this object, or an array of strings which should be concatenated." + }, + "format": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "string" + } + ], + "description": "Only valid when `type: File` or is an array of `items: File`.\n\nThis must be one or more IRIs of concept nodes\nthat represents file formats which are allowed as input to this\nparameter, preferably defined within an ontology. If no ontology is\navailable, file formats may be tested by exact match." 
+ }, + "label": { + "description": "A short, human-readable label of this object.", + "type": "string" + }, + "loadContents": { + "description": "Only valid when `type: File` or is an array of `items: File`.\n\nIf true, the file (or each file in the array) must be a UTF-8\ntext file 64 KiB or smaller, and the implementation must read\nthe entire contents of the file (or file array) and place it\nin the `contents` field of the File object for use by\nexpressions. If the size of the file is greater than 64 KiB,\nthe implementation must raise a fatal error.", + "type": "boolean" + }, + "loadListing": { + "description": "Only valid when `type: Directory` or is an array of `items: Directory`.\n\nSpecify the desired behavior for loading the `listing` field of\na Directory object for use by expressions.\n\nThe order of precedence for loadListing is:\n\n 1. `loadListing` on an individual parameter\n 2. Inherited from `LoadListingRequirement`\n 3. By default: `no_listing`", + "enum": [ + "deep_listing", + "no_listing", + "shallow_listing" + ], + "type": "string" + }, + "name": { + "description": "The name of the field", + "type": "string" + }, + "secondaryFiles": { + "anyOf": [ + { + "$ref": "#/definitions/SecondaryFileSchema" + }, + { + "items": { + "$ref": "#/definitions/SecondaryFileSchema" + }, + "type": "array" + } + ], + "description": "Only valid when `type: File` or is an array of `items: File`.\n\nProvides a pattern or expression specifying files or\ndirectories that should be included alongside the primary\nfile. Secondary files may be required or optional. When not\nexplicitly specified, secondary files specified for `inputs`\nare required and `outputs` are optional. An implementation\nmust include matching Files and Directories in the\n`secondaryFiles` property of the primary file. These Files\nand Directories must be transferred and staged alongside the\nprimary file. 
An implementation may fail workflow execution\nif a required secondary file does not exist.\n\nIf the value is an expression, the value of `self` in the expression\nmust be the primary input or output File object to which this binding\napplies. The `basename`, `nameroot` and `nameext` fields must be\npresent in `self`. For `CommandLineTool` outputs the `path` field must\nalso be present. The expression must return a filename string relative\nto the path to the primary File, a File or Directory object with either\n`path` or `location` and `basename` fields set, or an array consisting\nof strings or File or Directory objects. It is legal to reference an\nunchanged File or Directory object taken from input as a secondaryFile.\nThe expression may return \"null\" in which case there is no secondaryFile\nfrom that expression.\n\nTo work on non-filename-preserving storage systems, portable tool\ndescriptions should avoid constructing new values from `location`, but\nshould construct relative references using `basename` or `nameroot`\ninstead.\n\nIf a value in `secondaryFiles` is a string that is not an expression,\nit specifies that the following pattern should be applied to the path\nof the primary file to yield a filename relative to the primary File:\n\n 1. If string ends with `?` character, remove the last `?` and mark\n the resulting secondary file as optional.\n 2. If string begins with one or more caret `^` characters, for each\n caret, remove the last file extension from the path (the last\n period `.` and all following characters). If there are no file\n extensions, the path is unchanged.\n 3. Append the remainder of the string to the end of the file path." + }, + "streamable": { + "description": "Only valid when `type: File` or is an array of `items: File`.\n\nA value of `true` indicates that the file is read or written\nsequentially without seeking. An implementation may use this flag to\nindicate whether it is valid to stream file contents using a named\npipe. 
Default: `false`.", + "type": "boolean" + }, + "type": { + "anyOf": [ + { + "$ref": "#/definitions/InputRecordSchema" + }, + { + "$ref": "#/definitions/InputEnumSchema" + }, + { + "$ref": "#/definitions/InputArraySchema" + }, + { + "items": { + "anyOf": [ + { + "$ref": "#/definitions/InputRecordSchema" + }, + { + "$ref": "#/definitions/InputEnumSchema" + }, + { + "$ref": "#/definitions/InputArraySchema" + }, + { + "type": "string" + } + ] + }, + "type": "array" + }, + { + "type": "string" + } + ], + "description": "The field type" + } + }, + "required": [ + "name", + "type" + ], + "type": "object" + }, + "InputRecordSchema": { + "additionalProperties": false, + "description": "Auto-generated class implementation for https://w3id.org/cwl/cwl#InputRecordSchema", + "properties": { + "doc": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "string" + } + ], + "description": "A documentation string for this object, or an array of strings which should be concatenated." 
+ }, + "fields": { + "description": "Defines the fields of the record.", + "items": { + "$ref": "#/definitions/InputRecordField" + }, + "type": "array" + }, + "label": { + "description": "A short, human-readable label of this object.", + "type": "string" + }, + "name": { + "description": "The identifier for this type", + "type": "string" + }, + "type": { + "const": "record", + "description": "Must be `record`", + "type": "string" + } + }, + "required": [ + "type" + ], + "type": "object" + }, + "SecondaryFileSchema": { + "additionalProperties": false, + "description": "Auto-generated class implementation for https://w3id.org/cwl/cwl#SecondaryFileSchema\n\nSecondary files are specified using the following micro-DSL for secondary files:\n\n* If the value is a string, it is transformed to an object with two fields\n `pattern` and `required`\n* By default, the value of `required` is `null`\n (this indicates default behavior, which may be based on the context)\n* If the value ends with a question mark `?` the question mark is\n stripped off and the value of the field `required` is set to `False`\n* The remaining value is assigned to the field `pattern`\n\nFor implementation details and examples, please see\n[this section](SchemaSalad.html#Domain_Specific_Language_for_secondary_files)\nin the Schema Salad specification.", + "properties": { + "pattern": { + "description": "Provides a pattern or expression specifying files or directories that\nshould be included alongside the primary file.\n\nIf the value is an expression, the value of `self` in the\nexpression must be the primary input or output File object to\nwhich this binding applies. The `basename`, `nameroot` and\n`nameext` fields must be present in `self`. For\n`CommandLineTool` inputs the `location` field must also be\npresent. For `CommandLineTool` outputs the `path` field must\nalso be present. 
If secondary files were included on an input\nFile object as part of the Process invocation, they must also\nbe present in `secondaryFiles` on `self`.\n\nThe expression must return either: a filename string relative\nto the path to the primary File, a File or Directory object\n(`class: File` or `class: Directory`) with either `location`\n(for inputs) or `path` (for outputs) and `basename` fields\nset, or an array consisting of strings or File or Directory\nobjects as previously described.\n\nIt is legal to use `location` from a File or Directory object\npassed in as input, including `location` from secondary files\non `self`. If an expression returns a File object with the\nsame `location` but a different `basename` as a secondary file\nthat was passed in, the expression result takes precedence.\nSetting the basename with an expression this way affects the\n`path` where the secondary file will be staged to in the\nCommandLineTool.\n\nThe expression may return \"null\" in which case there is no\nsecondary file from that expression.\n\nTo work on non-filename-preserving storage systems, portable\ntool descriptions should treat `location` as an\n[opaque identifier](#opaque-strings) and avoid constructing new\nvalues from `location`, but should construct relative references\nusing `basename` or `nameroot` instead, or propagate `location`\nfrom defined inputs.\n\nIf a value in `secondaryFiles` is a string that is not an expression,\nit specifies that the following pattern should be applied to the path\nof the primary file to yield a filename relative to the primary File:\n\n 1. If string ends with `?` character, remove the last `?` and mark\n the resulting secondary file as optional.\n 2. If string begins with one or more caret `^` characters, for each\n caret, remove the last file extension from the path (the last\n period `.` and all following characters). If there are no file\n extensions, the path is unchanged.\n 3. 
Append the remainder of the string to the end of the file path.", + "type": "string" + }, + "required": { + "description": "An implementation must not fail workflow execution if `required` is\nset to `false` and the expected secondary file does not exist.\nDefault value for `required` field is `true` for secondary files on\ninput and `false` for secondary files on output.", + "type": [ + "string", + "boolean" + ] + } + }, + "required": [ + "pattern" + ], + "type": "object" + }, + "File": { + "additionalProperties": false, + "description": "Represents a file (or group of files when `secondaryFiles` is provided) that\nwill be accessible by tools using standard POSIX file system call API such as\nopen(2) and read(2).\n\nFiles are represented as objects with `class` of `File`. File objects have\na number of properties that provide metadata about the file.\n\nThe `location` property of a File is a URI that uniquely identifies the\nfile. Implementations must support the `file://` URI scheme and may support\nother schemes such as `http://` and `https://`. The value of `location` may also be a\nrelative reference, in which case it must be resolved relative to the URI\nof the document it appears in. Alternately to `location`, implementations\nmust also accept the `path` property on File, which must be a filesystem\npath available on the same host as the CWL runner (for inputs) or the\nruntime environment of a command line tool execution (for command line tool\noutputs).\n\nIf no `location` or `path` is specified, a file object must specify\n`contents` with the UTF-8 text content of the file. This is a \"file\nliteral\". File literals do not correspond to external resources, but are\ncreated on disk with `contents` with when needed for executing a tool.\nWhere appropriate, expressions can return file literals to define new files\non a runtime. The maximum size of `contents` is 64 kilobytes.\n\nThe `basename` property defines the filename on disk where the file is\nstaged. 
This may differ from the resource name. If not provided,\n`basename` must be computed from the last path part of `location` and made\navailable to expressions.\n\nThe `secondaryFiles` property is a list of File or Directory objects that\nmust be staged in the same directory as the primary file. It is an error\nfor file names to be duplicated in `secondaryFiles`.\n\nThe `size` property is the size in bytes of the File. It must be computed\nfrom the resource and made available to expressions. The `checksum` field\ncontains a cryptographic hash of the file content for use it verifying file\ncontents. Implementations may, at user option, enable or disable\ncomputation of the `checksum` field for performance or other reasons.\nHowever, the ability to compute output checksums is required to pass the\nCWL conformance test suite.\n\nWhen executing a CommandLineTool, the files and secondary files may be\nstaged to an arbitrary directory, but must use the value of `basename` for\nthe filename. The `path` property must be file path in the context of the\ntool execution runtime (local to the compute node, or within the executing\ncontainer). All computed properties should be available to expressions.\nFile literals also must be staged and `path` must be set.\n\nWhen collecting CommandLineTool outputs, `glob` matching returns file paths\n(with the `path` property) and the derived properties. This can all be\nmodified by `outputEval`. Alternately, if the file `cwl.output.json` is\npresent in the output, `outputBinding` is ignored.\n\nFile objects in the output must provide either a `location` URI or a `path`\nproperty in the context of the tool execution runtime (local to the compute\nnode, or within the executing container).\n\nWhen evaluating an ExpressionTool, file objects must be referenced via\n`location` (the expression tool does not have access to files on disk so\n`path` is meaningless) or as file literals. 
It is legal to return a file\nobject with an existing `location` but a different `basename`. The\n`loadContents` field of ExpressionTool inputs behaves the same as on\nCommandLineTool inputs, however it is not meaningful on the outputs.\n\nAn ExpressionTool may forward file references from input to output by using\nthe same value for `location`.", + "properties": { + "basename": { + "description": "The base name of the file, that is, the name of the file without any\nleading directory path. The base name must not contain a slash `/`.\n\nIf not provided, the implementation must set this field based on the\n`location` field by taking the final path component after parsing\n`location` as an IRI. If `basename` is provided, it is not required to\nmatch the value from `location`.\n\nWhen this file is made available to a CommandLineTool, it must be named\nwith `basename`, i.e. the final component of the `path` field must match\n`basename`.", + "type": "string" + }, + "checksum": { + "description": "Optional hash code for validating file integrity. Currently, must be in the form\n\"sha1$ + hexadecimal string\" using the SHA-1 algorithm.", + "type": "string" + }, + "class": { + "const": "File", + "description": "Must be `File` to indicate this object describes a file.", + "type": "string" + }, + "contents": { + "description": "File contents literal.\n\nIf neither `location` nor `path` is provided, `contents` must be\nnon-null. The implementation must assign a unique identifier for the\n`location` field. When the file is staged as input to CommandLineTool,\nthe value of `contents` must be written to a file.\n\nIf `contents` is set as a result of a Javascript expression,\nan `entry` in `InitialWorkDirRequirement`, or read in from\n`cwl.output.json`, there is no specified upper limit on the\nsize of `contents`. 
Implementations may have practical limits\non the size of `contents` based on memory and storage\navailable to the workflow runner or other factors.\n\nIf the `loadContents` field of an `InputParameter` or\n`OutputParameter` is true, and the input or output File object\n`location` is valid, the file must be a UTF-8 text file 64 KiB\nor smaller, and the implementation must read the entire\ncontents of the file and place it in the `contents` field. If\nthe size of the file is greater than 64 KiB, the\nimplementation must raise a fatal error.", + "type": "string" + }, + "dirname": { + "description": "The name of the directory containing file, that is, the path leading up\nto the final slash in the path such that `dirname + '/' + basename ==\npath`.\n\nThe implementation must set this field based on the value of `path`\nprior to evaluating parameter references or expressions in a\nCommandLineTool document. This field must not be used in any other\ncontext.", + "type": "string" + }, + "format": { + "description": "The format of the file: this must be an IRI of a concept node that\nrepresents the file format, preferably defined within an ontology.\nIf no ontology is available, file formats may be tested by exact match.\n\nReasoning about format compatibility must be done by checking that an\ninput file format is the same, `owl:equivalentClass` or\n`rdfs:subClassOf` the format required by the input parameter.\n`owl:equivalentClass` is transitive with `rdfs:subClassOf`, e.g. if\n` owl:equivalentClass ` and ` owl:subclassOf
` then infer\n` owl:subclassOf `.\n\nFile format ontologies may be provided in the \"$schemas\" metadata at the\nroot of the document. If no ontologies are specified in `$schemas`, the\nruntime may perform exact file format matches.", + "type": "string" + }, + "location": { + "description": "An IRI that identifies the file resource. This may be a relative\nreference, in which case it must be resolved using the base IRI of the\ndocument. The location may refer to a local or remote resource; the\nimplementation must use the IRI to retrieve file content. If an\nimplementation is unable to retrieve the file content stored at a\nremote resource (due to unsupported protocol, access denied, or other\nissue) it must signal an error.\n\nIf the `location` field is not provided, the `contents` field must be\nprovided. The implementation must assign a unique identifier for\nthe `location` field.\n\nIf the `path` field is provided but the `location` field is not, an\nimplementation may assign the value of the `path` field to `location`,\nthen follow the rules above.", + "type": "string" + }, + "nameext": { + "description": "The basename extension such that `nameroot + nameext == basename`, and\n`nameext` is empty or begins with a period and contains at most one\nperiod. Leading periods on the basename are ignored; a basename of\n`.cshrc` will have an empty `nameext`.\n\nThe implementation must set this field automatically based on the value\nof `basename` prior to evaluating parameter references or expressions.", + "type": "string" + }, + "nameroot": { + "description": "The basename root such that `nameroot + nameext == basename`, and\n`nameext` is empty or begins with a period and contains at most one\nperiod. 
For the purposes of path splitting leading periods on the\nbasename are ignored; a basename of `.cshrc` will have a nameroot of\n`.cshrc`.\n\nThe implementation must set this field automatically based on the value\nof `basename` prior to evaluating parameter references or expressions.", + "type": "string" + }, + "path": { + "description": "The local host path where the File is available when a CommandLineTool is\nexecuted. This field must be set by the implementation. The final\npath component must match the value of `basename`. This field\nmust not be used in any other context. The command line tool being\nexecuted must be able to access the file at `path` using the POSIX\n`open(2)` syscall.\n\nAs a special case, if the `path` field is provided but the `location`\nfield is not, an implementation may assign the value of the `path`\nfield to `location`, and remove the `path` field.\n\nIf the `path` contains [POSIX shell metacharacters](http://pubs.opengroup.org/onlinepubs/9699919799/utilities/V3_chap02.html#tag_18_02)\n(`|`,`&`, `;`, `<`, `>`, `(`,`)`, `$`,`` ` ``, `\\`, `\"`, `'`,\n``, ``, and ``) or characters\n[not allowed](http://www.iana.org/assignments/idna-tables-6.3.0/idna-tables-6.3.0.xhtml)\nfor [Internationalized Domain Names for Applications](https://tools.ietf.org/html/rfc6452)\nthen implementations may terminate the process with a\n`permanentFailure`.", + "type": "string" + }, + "secondaryFiles": { + "description": "A list of additional files or directories that are associated with the\nprimary file and must be transferred alongside the primary file.\nExamples include indexes of the primary file, or external references\nwhich must be included when loading primary document. 
A file object\nlisted in `secondaryFiles` may itself include `secondaryFiles` for\nwhich the same rules apply.", + "items": { + "anyOf": [ + { + "$ref": "#/definitions/File" + }, + { + "$ref": "#/definitions/Directory" + } + ] + }, + "type": "array" + }, + "size": { + "description": "Optional file size (in bytes)", + "type": "number" + } + }, + "required": [ + "class" + ], + "type": "object" + }, + "Directory": { + "additionalProperties": false, + "description": "Represents a directory to present to a command line tool.\n\nDirectories are represented as objects with `class` of `Directory`. Directory objects have\na number of properties that provide metadata about the directory.\n\nThe `location` property of a Directory is a URI that uniquely identifies\nthe directory. Implementations must support the file:// URI scheme and may\nsupport other schemes such as http://. Alternately to `location`,\nimplementations must also accept the `path` property on Directory, which\nmust be a filesystem path available on the same host as the CWL runner (for\ninputs) or the runtime environment of a command line tool execution (for\ncommand line tool outputs).\n\nA Directory object may have a `listing` field. This is a list of File and\nDirectory objects that are contained in the Directory. For each entry in\n`listing`, the `basename` property defines the name of the File or\nSubdirectory when staged to disk. If `listing` is not provided, the\nimplementation must have some way of fetching the Directory listing at\nruntime based on the `location` field.\n\nIf a Directory does not have `location`, it is a Directory literal. A\nDirectory literal must provide `listing`. Directory literals must be\ncreated on disk at runtime as needed.\n\nThe resources in a Directory literal do not need to have any implied\nrelationship in their `location`. For example, a Directory listing may\ncontain two files located on different hosts. 
It is the responsibility of\nthe runtime to ensure that those files are staged to disk appropriately.\nSecondary files associated with files in `listing` must also be staged to\nthe same Directory.\n\nWhen executing a CommandLineTool, Directories must be recursively staged\nfirst and have local values of `path` assigned.\n\nDirectory objects in CommandLineTool output must provide either a\n`location` URI or a `path` property in the context of the tool execution\nruntime (local to the compute node, or within the executing container).\n\nAn ExpressionTool may forward file references from input to output by using\nthe same value for `location`.\n\nName conflicts (the same `basename` appearing multiple times in `listing`\nor in any entry in `secondaryFiles` in the listing) is a fatal error.", + "properties": { + "basename": { + "description": "The base name of the directory, that is, the name of the file without any\nleading directory path. The base name must not contain a slash `/`.\n\nIf not provided, the implementation must set this field based on the\n`location` field by taking the final path component after parsing\n`location` as an IRI. If `basename` is provided, it is not required to\nmatch the value from `location`.\n\nWhen this file is made available to a CommandLineTool, it must be named\nwith `basename`, i.e. the final component of the `path` field must match\n`basename`.", + "type": "string" + }, + "class": { + "const": "Directory", + "description": "Must be `Directory` to indicate this object describes a Directory.", + "type": "string" + }, + "listing": { + "description": "List of files or subdirectories contained in this directory. The name\nof each file or subdirectory is determined by the `basename` field of\neach `File` or `Directory` object. It is an error if a `File` shares a\n`basename` with any other entry in `listing`. 
If two or more\n`Directory` object share the same `basename`, this must be treated as\nequivalent to a single subdirectory with the listings recursively\nmerged.", + "items": { + "anyOf": [ + { + "$ref": "#/definitions/File" + }, + { + "$ref": "#/definitions/Directory" + } + ] + }, + "type": "array" + }, + "location": { + "description": "An IRI that identifies the directory resource. This may be a relative\nreference, in which case it must be resolved using the base IRI of the\ndocument. The location may refer to a local or remote resource. If\nthe `listing` field is not set, the implementation must use the\nlocation IRI to retrieve directory listing. If an implementation is\nunable to retrieve the directory listing stored at a remote resource (due to\nunsupported protocol, access denied, or other issue) it must signal an\nerror.\n\nIf the `location` field is not provided, the `listing` field must be\nprovided. The implementation must assign a unique identifier for\nthe `location` field.\n\nIf the `path` field is provided but the `location` field is not, an\nimplementation may assign the value of the `path` field to `location`,\nthen follow the rules above.", + "type": "string" + }, + "path": { + "description": "The local path where the Directory is made available prior to executing a\nCommandLineTool. This must be set by the implementation. This field\nmust not be used in any other context. 
The command line tool being\nexecuted must be able to access the directory at `path` using the POSIX\n`opendir(2)` syscall.\n\nIf the `path` contains [POSIX shell metacharacters](http://pubs.opengroup.org/onlinepubs/9699919799/utilities/V3_chap02.html#tag_18_02)\n(`|`,`&`, `;`, `<`, `>`, `(`,`)`, `$`,`` ` ``, `\\`, `\"`, `'`,\n``, ``, and ``) or characters\n[not allowed](http://www.iana.org/assignments/idna-tables-6.3.0/idna-tables-6.3.0.xhtml)\nfor [Internationalized Domain Names for Applications](https://tools.ietf.org/html/rfc6452)\nthen implementations may terminate the process with a\n`permanentFailure`.", + "type": "string" + } + }, + "required": [ + "class" + ], + "type": "object" + }, + "Any": { + "description": "A placeholder for any type of CWL object.", + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "integer" + }, + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array" + }, + { + "type": "object" + }, + { + "$ref": "#/definitions/File" + }, + { + "$ref": "#/definitions/Directory" + } + ], + "properties": { + "format": true, + "size": true, + "listing": true, + "checksum": true, + "nameroot": true, + "contents": true, + "location": true, + "path": true, + "dirname": true, + "nameext": true, + "secondaryFiles": true, + "class": true, + "basename": true + } + } + }, + "description": "Auto-generated class implementation for https://w3id.org/cwl/cwl#WorkflowInputParameter", + "type": "object" +} \ No newline at end of file diff --git a/cwl_utils/utils.py b/cwl_utils/utils.py index 3c884f61..5e494aa9 100644 --- a/cwl_utils/utils.py +++ b/cwl_utils/utils.py @@ -6,11 +6,12 @@ import sys import urllib.error import urllib.parse +from urllib.parse import urlparse import urllib.request from collections.abc import MutableMapping, MutableSequence from copy import deepcopy from io import StringIO -from typing import Any, Optional, Union +from typing import Any, Optional, Union, Dict, List from ruamel.yaml.main import YAML from 
def to_pascal_case(name: str) -> str:
    """
    Convert a kebab-case or snake_case string to PascalCase.

    e.g. fastq-list-row -> FastqListRow
         fastq_list_row -> FastqListRow

    :param name: kebab-case or snake_case identifier.
    :return: the PascalCase equivalent.
    """
    return "".join(
        word.capitalize() for word in name.replace("_", "-").split("-")
    )


def sanitise_schema_field(
    schema_field_item: Union[Dict[str, Any], str]
) -> Union[Dict[str, Any], str]:
    """
    Normalise a CWL schema field so it can be converted to a JSON property.

    Resolves the CWL shorthand notations into explicit schema objects:

    * ``{'type': 'Directory?'}``  -> ``{'type': ['null', 'Directory']}``
    * ``{'type': 'string[]'}``    -> ``{'type': InputArraySchema(type_='array', items='string')}``
    * ``{'type': 'File[]?'}``     -> ``{'type': ['null', InputArraySchema(type_='array', items='File')]}``
    * ``{'type': {'type': 'enum', 'symbols': [...]}}``
                                  -> ``{'type': InputEnumSchema(type_='enum', symbols=[...])}``
    * ``{'type': {'type': 'array', 'items': {...}}}``
                                  -> ``{'type': InputArraySchema(type_='array', items={...})}``
    * ``{'type': {'$import': ...}}`` is left untouched for downstream resolution.

    :param schema_field_item: a raw schema field (mapping) or a plain type string.
    :return: the sanitised field; plain type strings are returned unchanged.
    :raises ValueError: if a mapping-valued type is neither enum, array nor $import.
    """
    # A bare string is already a resolved primitive type name
    # (happens when 'type' was a list of primitive types).
    if isinstance(schema_field_item, str):
        return schema_field_item

    # Work on a copy so the caller's object is never mutated.
    schema_field_item = deepcopy(schema_field_item)
    required = True

    # Record schemas are already fully resolved objects; pass them through.
    if isinstance(schema_field_item, InputRecordSchemaTypes):
        return schema_field_item

    if isinstance(schema_field_item.get("type"), list):
        type_list = schema_field_item.get("type", [])
        if "null" in type_list:
            # 'null' in a type union marks the field as optional.
            required = False
            type_list = [
                type_item for type_item in type_list if type_item != "null"
            ]
        if len(type_list) == 1:
            # Squeeze a single-entry union down to its only member.
            schema_field_item["type"] = type_list[0]
        else:
            # Sanitise each union member recursively.
            schema_field_item["type"] = [
                sanitise_schema_field(field_subtype) for field_subtype in type_list
            ]

    if isinstance(schema_field_item.get("type"), str):
        type_str: str = schema_field_item.get("type", "")
        if type_str.endswith("?"):
            # Trailing '?' marks an optional field.  Strip only the suffix;
            # str.replace would also drop any interior '?' characters.
            required = False
            type_str = type_str[:-1]
        if type_str.endswith("[]"):
            # 'Type[]' shorthand -> explicit array schema.
            schema_field_item["type"] = InputArraySchemaV1_2(
                type_="array", items=type_str[: -len("[]")]
            )
        else:
            schema_field_item["type"] = type_str

    if isinstance(schema_field_item.get("type"), dict):
        type_dict = schema_field_item.get("type", {})
        if type_dict.get("type", "") == "enum":
            schema_field_item["type"] = InputEnumSchemaV1_2(
                type_="enum",
                symbols=type_dict.get("symbols", ""),
            )
        elif type_dict.get("type", "") == "array":
            schema_field_item["type"] = InputArraySchemaV1_2(
                type_="array", items=type_dict.get("items", "")
            )
        elif "$import" in type_dict:
            # Leave $import references as-is; they are resolved elsewhere.
            pass
        else:
            raise ValueError(f"Unknown type: {schema_field_item.get('type')}")

    if not required:
        # Optional fields are expressed as a union with 'null' in front.
        if isinstance(schema_field_item.get("type"), list):
            schema_field_item["type"] = ["null"] + schema_field_item.get("type", [])
        else:
            schema_field_item["type"] = ["null", schema_field_item.get("type", "")]

    return schema_field_item


def is_uri(uri: str) -> bool:
    """
    Return True if *uri* has a URI scheme (e.g. 'file://', 'https://').

    :param uri: the candidate string.
    :return: True when urlparse finds a non-empty scheme.
    """
    return urlparse(uri).scheme != ""


def is_local_uri(uri: str) -> bool:
    """
    Return True if *uri* is a URI with the 'file' scheme.

    :param uri: the candidate string.
    :return: True only for file:// URIs.
    """
    return is_uri(uri) and urlparse(uri).scheme == "file"


def get_value_from_uri(uri: str) -> str:
    """
    Return the last path component of a URI's fragment.

    file://path/to/imported/record#my_workflow_name/record_name
    returns 'record_name'.

    :param uri: the URI to strip.
    :return: the final fragment component ('' when there is no fragment).
    """
    return urlparse(uri).fragment.rsplit("/", 1)[-1]
autoprogram:: cwl_utils.inputs_schema_gen:arg_parser() + :prog: cwl-inputs-schema-gen Indices and tables ================== diff --git a/mypy-requirements.txt b/mypy-requirements.txt index fb0cbcce..d9e20ec0 100644 --- a/mypy-requirements.txt +++ b/mypy-requirements.txt @@ -1,4 +1,5 @@ mypy==1.13.0 typing_extensions types-requests +types-jsonschema types-setuptools>=57.4.0 diff --git a/pyproject.toml b/pyproject.toml index 4519fb4e..bba79a6e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -53,6 +53,7 @@ cwl-docker-extract = "cwl_utils.docker_extract:main" cwl-expression-refactor = "cwl_utils.expression_refactor:main" cwl-graph-split = "cwl_utils.graph_split:main" cwl-normalizer = "cwl_utils.normalizer:main" +cwl-inputs-schema-gen = "cwl_utils.inputs_schema_gen:main" [tool.aliases] test = "pytest" @@ -82,6 +83,9 @@ include-package-data = true "cwl_utils.testdata.types" = "testdata/types" "cwl_utils.testdata.workflows" = "testdata/workflows" +[tool.setuptools.package-data] +"cwl_utils" = ["templates/*.json"] + [tool.setuptools.dynamic] version = {attr = "cwl_utils.__meta__.__version__"} dependencies = {file = ["requirements.txt"]} diff --git a/requirements.txt b/requirements.txt index d24b7145..091020a4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,4 @@ rdflib requests schema-salad >= 8.8.20241206093842, < 9 ruamel.yaml >= 0.17.6, < 0.19 +typing_extensions;python_version<'3.10' diff --git a/test-requirements.txt b/test-requirements.txt index 3e4dd88d..508e7953 100644 --- a/test-requirements.txt +++ b/test-requirements.txt @@ -3,3 +3,4 @@ pytest-cov pytest-xdist cwlformat pytest-mock >= 1.10.0 +jsonschema >= 4.21.1 diff --git a/tests/test_inputs_schema_gen.py b/tests/test_inputs_schema_gen.py new file mode 100644 index 00000000..cf71483a --- /dev/null +++ b/tests/test_inputs_schema_gen.py @@ -0,0 +1,101 @@ +# SPDX-License-Identifier: Apache-2.0 +"""Tests for cwl-inputs-schema-gen.""" +from pathlib import Path +from typing import Dict +import 
import pytest
import requests
from jsonschema.exceptions import ValidationError, SchemaError
from jsonschema.validators import validate
from ruamel.yaml import YAML

from cwl_utils.inputs_schema_gen import cwl_to_jsonschema
from cwl_utils.parser import load_document_by_uri
from cwl_utils.loghandler import _logger as _cwlutilslogger

TEST_ROOT_URL = (
    "https://raw.githubusercontent.com/common-workflow-language/cwl-v1.2/main/tests"
)

# Never let a hung download stall the whole test suite.
REQUEST_TIMEOUT_SECONDS = 60

TEST_PARAMS = [
    # Packed Case
    {
        "tool_url": f"{TEST_ROOT_URL}/revsort-packed.cwl",
        "input_url": f"{TEST_ROOT_URL}/revsort-job.json",
    },
    # The number of parameters is a little large, and the definition itself is a straightforward case.
    {
        "tool_url": f"{TEST_ROOT_URL}/bwa-mem-tool.cwl",
        "input_url": f"{TEST_ROOT_URL}/bwa-mem-job.json",
    },
    # The case where CommandInputParameter is shortened (e.g., param: string)
    {
        "tool_url": f"{TEST_ROOT_URL}/env-tool1.cwl",
        "input_url": f"{TEST_ROOT_URL}/env-job.json",
    },
    # Dir
    {
        "tool_url": f"{TEST_ROOT_URL}/dir.cwl",
        "input_url": f"{TEST_ROOT_URL}/dir-job.yml",
    },
    # SecondaryFiles
    {
        "tool_url": f"{TEST_ROOT_URL}/secondaryfiles/rename-inputs.cwl",
        "input_url": f"{TEST_ROOT_URL}/secondaryfiles/rename-inputs.yml",
    },
    # Stage array
    {
        "tool_url": f"{TEST_ROOT_URL}/stage-array.cwl",
        "input_url": f"{TEST_ROOT_URL}/stage-array-job.json",
    },
]


def _fetch_input_object(input_url: str) -> Any:
    """Download a job/input document and parse it as YAML (JSON is a YAML subset)."""
    response = requests.get(input_url, timeout=REQUEST_TIMEOUT_SECONDS)
    # Fail loudly on a bad URL instead of YAML-parsing an HTML error page.
    response.raise_for_status()
    return YAML().load(response.text)


@pytest.mark.parametrize("test_param", TEST_PARAMS)
def test_cwl_inputs_to_jsonschema(test_param: Dict[str, str]) -> None:
    """Each generated schema must accept the reference job file for its tool."""
    tool_url = test_param["tool_url"]
    input_url = test_param["input_url"]

    cwl_obj = load_document_by_uri(tool_url)

    _cwlutilslogger.info(f"Generating schema for {Path(tool_url).name}")
    json_schema = cwl_to_jsonschema(cwl_obj)

    _cwlutilslogger.info(
        f"Testing {Path(input_url).name} against schema generated for input {Path(tool_url).name}"
    )

    input_obj = _fetch_input_object(input_url)

    try:
        validate(input_obj, json_schema)
    except (ValidationError, SchemaError) as err:
        _cwlutilslogger.error(
            f"Validation failed for {Path(input_url).name} "
            f"against schema generated for input {Path(tool_url).name}"
        )
        raise SchemaError(f"{Path(input_url).name} failed schema validation") from err


def test_cwl_inputs_to_jsonschema_fails() -> None:
    """Compare tool schema of param 1 against input schema of param 2."""
    tool_url = TEST_PARAMS[0]["tool_url"]
    input_url = TEST_PARAMS[3]["input_url"]

    cwl_obj = load_document_by_uri(tool_url)

    _cwlutilslogger.info(f"Generating schema for {Path(tool_url).name}")
    json_schema = cwl_to_jsonschema(cwl_obj)

    _cwlutilslogger.info(
        f"Testing {Path(input_url).name} against schema generated for input {Path(tool_url).name}"
    )

    input_obj = _fetch_input_object(input_url)

    # The dir.cwl job does not match revsort-packed.cwl's inputs, so we expect this to fail.
    with pytest.raises(ValidationError):
        validate(input_obj, json_schema)