diff --git a/config/schemas/DATEXII_MDM_Parkdaten.sch b/config/schemas/DATEXII_MDM_Parkdaten.sch index 20d1249..52beaa2 100644 --- a/config/schemas/DATEXII_MDM_Parkdaten.sch +++ b/config/schemas/DATEXII_MDM_Parkdaten.sch @@ -11,7 +11,7 @@ Override checks - + Sum of all given totalParkingXXXCapacityOverride of a parkingFacility should be greater than 0 for # Sum of all totalParkingCapacityShortTermOverride () and totalParkingCapacityLongTermOverride () should not exceed totalParkingCapacityOverride () for # diff --git a/config/templates/dataset_template.html b/config/templates/dataset_template.html index d8e0b5f..fe21ae4 100644 --- a/config/templates/dataset_template.html +++ b/config/templates/dataset_template.html @@ -48,8 +48,8 @@ MFDZ - DPC - {{ ld.name }} - - + + {% include "includes/header.html" %}
@@ -63,36 +63,36 @@

{{ ld.name }}

{% include "includes/license_badge.html" %} + {% include "includes/validation_badge.html" %} + - -

{{ ld.description }}

-

Erstellt durch {{ld.creator.name or 'Unbekannt' }}

-
    {% for keyword in ld.keywords %} +

    Erstellt durch {{ld.creator.name or 'Unbekannt' }}

    +
      {% for keyword in ld.keywords %}
    • {{ keyword }}
    • {% endfor %}
    -

    Download-Optionen

    -
    -
    +

    Nutzungsbedingungen

    -
    Lizenz: {{ ld.license or 'Unbekannt'}}
    - Quellenvermerk: {{ ld.creditText or 'Keine Angabe' }} -
    +
    Lizenz: {{ ld.license or 'Unbekannt'}}
    + Quellenvermerk: {{ ld.creditText or 'Keine Angabe' }} +
{% include "includes/footer.html" %} - - \ No newline at end of file + + diff --git a/config/templates/includes/size_badge.html b/config/templates/includes/size_badge.html new file mode 100644 index 0000000..70dbb07 --- /dev/null +++ b/config/templates/includes/size_badge.html @@ -0,0 +1,6 @@ +
+
+ Größe + {{size}} +
+
\ No newline at end of file diff --git a/config/templates/includes/validation_badge.html b/config/templates/includes/validation_badge.html new file mode 100644 index 0000000..8053a08 --- /dev/null +++ b/config/templates/includes/validation_badge.html @@ -0,0 +1,10 @@ +
+
+ Validierung + {% if validation_ok %} + erfolgreich + {% else %} + Fehler + {% endif %} +
+
\ No newline at end of file diff --git a/config/templates/validation_results_template.html b/config/templates/validation_results_template.html new file mode 100644 index 0000000..9258c23 --- /dev/null +++ b/config/templates/validation_results_template.html @@ -0,0 +1,78 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + MFDZ - DPC - Validierungsergebnisse {{ ld.name }} + + + {% include "includes/header.html" %} + +
+
+
+
+
+

Validierungsergebnisse Datensatz {{ ld.name }}

+
+ + +
+ + + + {% for error in errors %} + + {% endfor %} + +
ValidatorBeschreibung
{{error.domain_name}}{{error.message}}
+
+
+
+
+
+ {% include "includes/footer.html" %} + + \ No newline at end of file diff --git a/dodo.py b/dodo.py index 4501c3c..d24a659 100644 --- a/dodo.py +++ b/dodo.py @@ -1,5 +1,5 @@ import glob -from dpc.tasks import render_index_page, download_links, dpc_files, validate_xml, validate_xml_via_schematron, render_index, datasets_with_schematron_validation +import dpc.tasks as tasks DOIT_CONFIG = { 'action_string_formatting': 'new' @@ -19,20 +19,6 @@ def task_copy_static_files(): 'actions': ['mkdir -p {0} && cp -rf config/static/* {0}'.format(DPC_CONFIG['out_dir'])], } -def task_render_landing_page(): - '''render index.html page for datasets''' - for (dataset_name, dpc_file) in dpc_files(DPC_CONFIG['dataset_definitions_dir']+'**/dpc.json'): - dst_file = DPC_CONFIG['out_dir'] + dataset_name + '/index.html' - ld_file = DPC_CONFIG['out_dir'] + dataset_name + '/ld.json' - datapackage_file = DPC_CONFIG['out_dir'] + dataset_name + '/datapackage.json' - - yield { - 'name': dataset_name, - 'file_dep': [dpc_file, DPC_CONFIG['dataset_template'], DPC_CONFIG['datapackage_template']], - 'targets': [dst_file, ld_file, datapackage_file], - 'actions': [(render_index_page, [dpc_file, dst_file, ld_file, datapackage_file, dataset_name, DPC_CONFIG])], - } - def task_render_index(): '''render index.html and sitemap.txt for datasets''' SITEMAP = DPC_CONFIG['out_dir'] + 'sitemap.txt' @@ -40,12 +26,12 @@ def task_render_index(): return { 'targets': [SITEMAP, INDEX], - 'actions': [(render_index, [DPC_CONFIG['dataset_definitions_dir'], DPC_CONFIG['host'], INDEX, SITEMAP])], + 'actions': [(tasks.render_index, [DPC_CONFIG['dataset_definitions_dir'], DPC_CONFIG['host'], INDEX, SITEMAP])], } def task_download_mdm_dataset(): - for (dataset_name, download_url, cert, file_type) in download_links(DPC_CONFIG['dataset_definitions_dir']+'**/dpc.json'): + for (dataset_name, download_url, cert, file_type) in tasks.download_links(DPC_CONFIG['dataset_definitions_dir']+'**/dpc.json'): dst_file = DPC_CONFIG['out_dir'] + dataset_name + '/body.' + file_type action = 'curl -fsS -z {targets} -o {targets} --create-dirs -R --compressed ' @@ -58,29 +44,69 @@ def task_download_mdm_dataset(): } def task_validate_xml(): - for (dataset_name, download_url, cert, file_type) in download_links(DPC_CONFIG['dataset_definitions_dir']+'**/dpc.json'): + for (dataset_name, download_url, cert, file_type) in tasks.download_links(DPC_CONFIG['dataset_definitions_dir']+'**/dpc.json'): if file_type != 'xml': continue body_file = DPC_CONFIG['out_dir'] + dataset_name + '/body.xml' - dst_file = DPC_CONFIG['out_dir'] + dataset_name + '/validation_results.txt' + dst_file = DPC_CONFIG['out_dir'] + dataset_name + '/validation_results.schema.jsonl' yield { 'name': dataset_name, 'file_dep': [body_file], 'targets': [dst_file], - 'actions': [(validate_xml, (body_file, dst_file))] + 'actions': [(tasks.validate_xml, (body_file, dst_file))] } def task_validate_xml_using_schematron(): - for (dataset_name, schematron_file) in datasets_with_schematron_validation(DPC_CONFIG['dataset_definitions_dir']+'**/dpc.json'): + for (dataset_name, schematron_file) in tasks.datasets_with_schematron_validation(DPC_CONFIG['dataset_definitions_dir']+'**/dpc.json'): body_file = DPC_CONFIG['out_dir'] + dataset_name + '/body.xml' - dst_file = DPC_CONFIG['out_dir'] + dataset_name + '/validation_results_schematron.txt' + dst_file = DPC_CONFIG['out_dir'] + dataset_name + '/validation_results.schematron.jsonl' yield { 'name': dataset_name, 'file_dep': [body_file, schematron_file], 'targets': [dst_file], - 'actions': [(validate_xml_via_schematron, (body_file, schematron_file, dst_file))] + 'actions': [(tasks.validate_xml_via_schematron, (body_file, schematron_file, dst_file))] + } + +def task_merge_validation_results(): + for (dataset_name, validation_files) in tasks.datasets_out_files(DPC_CONFIG['dataset_definitions_dir']+'**/dpc.json', DPC_CONFIG['out_dir'], 'validation_results.*.jsonl'): + dst_file = DPC_CONFIG['out_dir'] + dataset_name + '/validation_results.jsonl' + + yield { + 'name': dataset_name, + 'file_dep': validation_files, + 'targets': [dst_file], + 'actions': [(tasks.merge_validation_results, (validation_files, dst_file))] + } + +def task_render_validation_results(): + for (dataset_name, dpc_file) in tasks.dpc_files(DPC_CONFIG['dataset_definitions_dir']+'**/dpc.json'): + validation_results_file = DPC_CONFIG['out_dir'] + dataset_name + '/validation_results.jsonl' + dst_file = DPC_CONFIG['out_dir'] + dataset_name + '/validation_results.html' + + yield { + 'name': dataset_name, + 'file_dep': [validation_results_file], + 'targets': [dst_file], + 'actions': [(tasks.render_validation_results, (validation_results_file, dpc_file, dst_file))] + } + +def task_render_landing_page(): + '''render index.html page for datasets''' + for (dataset_name, dpc_file) in tasks.dpc_files(DPC_CONFIG['dataset_definitions_dir']+'**/dpc.json'): + dst_file = DPC_CONFIG['out_dir'] + dataset_name + '/index.html' + ld_file = DPC_CONFIG['out_dir'] + dataset_name + '/ld.json' + validation_results_file = DPC_CONFIG['out_dir'] + dataset_name + '/validation_results.jsonl' + + datapackage_file = DPC_CONFIG['out_dir'] + dataset_name + '/datapackage.json' + + yield { + 'name': dataset_name, + 'file_dep': [dpc_file, validation_results_file, DPC_CONFIG['dataset_template'], DPC_CONFIG['datapackage_template']], + 'targets': [dst_file, ld_file, datapackage_file], + 'actions': [(tasks.render_landing_page, [dpc_file, dst_file, ld_file, datapackage_file, dataset_name, validation_results_file, DPC_CONFIG])], } + \ No newline at end of file diff --git a/dpc/schemavalidator.py b/dpc/schemavalidator.py index 9ada6d5..8da3a84 100644 --- a/dpc/schemavalidator.py +++ b/dpc/schemavalidator.py @@ -27,15 +27,17 @@ """ -def validate_XML(tree): +def validate_XML(tree, namespaces = None): """Validate an XML file represented as tree. Follow all schemaLocations. :param tree: :type tree: ElementTree """ schema_tree = etree.XML(SCHEMA_TEMPLATE) - # Find all unique instances of 'xsi:schemaLocation=" ..."' - schema_locations = set(tree.xpath("//*/@xsi:schemaLocation", namespaces={'xsi': XSI})) + + # Find all unique instances of 'xsi:schemaLocation=" ..."' + schema_locations = namespaces if namespaces else set(tree.xpath("//*/@xsi:schemaLocation", namespaces={'xsi': XSI})) + for schema_location in schema_locations: # Split namespaces and schema locations ; use strip to remove leading # and trailing whitespace. @@ -46,7 +48,7 @@ def validate_XML(tree): xs_import.attrib['namespace'] = namespace xs_import.attrib['schemaLocation'] = location schema_tree.append(xs_import) - # Contstruct the schema + # Construct the schema schema = etree.XMLSchema(schema_tree) # Validate! schema.validate(tree) diff --git a/dpc/tasks.py b/dpc/tasks.py index af9e4f3..9e5dbc8 100644 --- a/dpc/tasks.py +++ b/dpc/tasks.py @@ -5,6 +5,7 @@ from jinja2 import Environment, FileSystemLoader, select_autoescape from .schemavalidator import validate_XML from lxml.isoschematron import Schematron +import jsonlines env = Environment( loader=FileSystemLoader('config/templates'), @@ -42,6 +43,11 @@ def datasets_with_schematron_validation(pattern): if data.get('fileType')=='xml' and data.get('schematron'): yield (dataset_name, data['schematron']) +def datasets_out_files(pattern, outdir, out_file_pattern): + for (dataset_name, dpc_file) in dpc_files(pattern): + LIST = glob.glob(outdir + dataset_name + "/" + out_file_pattern) + yield (dataset_name, LIST) + def _format_for(encodingFormat): # FIXME this is temporary and should be replaced, e.g. by using datapackage's format metainfo if encodingFormat == "text/csv": @@ -84,14 +90,17 @@ def render_index(basedir, dpc_host, index_file, sitemap_file): fh.write(rendered_template) # TODO include validation result () -def render_index_page(dpc_file, dst_file, ld_file, datapackage_file, dataset_name, DPC_CONFIG): +def render_landing_page(dpc_file, dst_file, ld_file, datapackage_file, dataset_name, validation_results_file, DPC_CONFIG): '''Render the dataset index page using metadata from supplied dpc file''' # Open template file template = env.get_template('dataset_template.html') ld = _load_linked_data(dpc_file) + + size = os.path.getsize(validation_results_file) + validation_ok = True if os.path.getsize(validation_results_file) == 0 else False # render template with ld supplied as params - rendered_template = template.render(ld = ld) + rendered_template = template.render(ld = ld, validation_ok = validation_ok, size = size) datapackage_template = env.get_template('datapackage.json') dataset = _enhanced_linked_data(ld, dataset_name, DPC_CONFIG['host']) @@ -109,10 +118,19 @@ def render_index_page(dpc_file, dst_file, ld_file, datapackage_file, dataset_nam def validate_xml(xml_file, dst_file): doc = etree.parse(xml_file) - schema = validate_XML(doc) - with open(dst_file, 'w') as fh: + + with jsonlines.open(dst_file, mode='w') as writer: + schema = validate_XML(doc) for error in schema.error_log: - fh.write(f'{error}\n') + error_dict = { + 'line': error.line, + 'column': error.column, + 'level': error.level_name, + 'message': error.message, + 'domain_name': error.domain_name, + 'type_name': error.type_name + } + writer.write(error_dict) def validate_xml_via_schematron(xml_file, schema_file, dst_file): schematron = Schematron(file = schema_file, @@ -122,13 +140,42 @@ def validate_xml_via_schematron(xml_file, schema_file, dst_file): ns = {'svrl': 'http://purl.oclc.org/dsdl/svrl'} - with open(dst_file, 'w') as fh: + with jsonlines.open(dst_file, mode='w') as writer: for error in schematron.error_log: msg_xml = etree.fromstring(error.message) message = msg_xml.find('svrl:text', ns).text if msg_xml.find('svrl:text', ns) is not None else None - fh.write(u'%s:%d:%d:%s:%s:%s: %s\n' % ( - error.filename, error.line, error.column, error.level_name, - error.domain_name, error.type_name, message)) - + + error_dict = { + 'line': error.line, + 'column': error.column, + 'level': error.level_name, + 'message': message, + 'domain_name': error.domain_name, + 'type_name': error.type_name + } + writer.write(error_dict) + +def merge_validation_results(validation_files, dst_file): + with open(dst_file, 'w') as fh: + for in_file in validation_files: + print('render '+in_file) + # read row by row, collect only n samples of same category and count rest, write them out as result file + with open(in_file, 'r') as rh: + for line in rh: + # TODO here we should do some counting... + fh.write(line) + +def render_validation_results(results_file, dpc_file, dst_file): + template = env.get_template('validation_results_template.html') + + ld = _load_linked_data(dpc_file) + + with open(results_file, 'r', encoding='utf-8') as rh: + data = jsonlines.Reader(rh).iter(type=dict, skip_invalid=True) + rendered_template = template.render(errors = data, ld = ld) + + with open(dst_file, 'w') as fh: + fh.write(rendered_template) + \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 2336c3d..86d8d49 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ doit==0.33.1 Jinja2==2.11.2 +jsonlines==2.0.0 lxml==4.6.2 schedule==0.6.0