Skip to content

Commit

Permalink
add schema caching
Browse files — browse the repository at this point in the history
hbruch committed Jul 9, 2024
1 parent f65a58f commit 7f3d2d9
Showing 4 changed files with 23 additions and 7 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -3,4 +3,5 @@ out/
__pycache__
todos.md
.doit.db
tmp/
tmp/
.schema_cache/
2 changes: 1 addition & 1 deletion dodo.py
Original file line number Diff line number Diff line change
@@ -58,7 +58,7 @@ def task_validate_xml():
'name': dataset_name,
'file_dep': [body_file],
'targets': [dst_file],
'actions': [(tasks.validate_xml, (body_file, fallback_schema, dst_file))]
'actions': [(tasks.validate_xml, (body_file, fallback_schema, dst_file, '.schema_cache'))]
}

def task_validate_xml_using_schematron():
24 changes: 19 additions & 5 deletions dpc/tasks.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
import glob
import json
import os
import os.path
from lxml import etree
from jinja2 import Environment, FileSystemLoader, select_autoescape
from .schemavalidator import validate_XML
from lxml.isoschematron import Schematron
import jsonlines
import logging
import urllib.request
import tempfile
import hashlib

env = Environment(
loader=FileSystemLoader('config/templates'),
@@ -135,20 +139,30 @@ def _as_error_dict(error):
'type_name': error.type_name
}

def validate_xml(xml_file, fallback_schema, dst_file):
def _download_and_cache(fallback_schema, schema_cache_dir):
os.makedirs(schema_cache_dir, exist_ok=True)
file_name = hashlib.md5(fallback_schema.encode()).hexdigest()+'.xsd'
cached_file_path = os.path.join(schema_cache_dir, file_name)
urllib.request.urlretrieve(fallback_schema, cached_file_path)
return cached_file_path

def validate_xml(xml_file, fallback_schema, dst_file, schema_cache_dir='.schema_cache'):
    """Validate ``xml_file`` and write its validation errors to ``dst_file``.

    The document is first validated with ``validate_XML(doc)``. If that
    yields exactly one error containing "No matching global declaration"
    and a ``fallback_schema`` is given, the fallback schema is fetched into
    ``schema_cache_dir`` and the document is validated again against the
    local copy. Each entry of the final error log is written to
    ``dst_file`` as one JSON line.

    Args:
        xml_file: Path of the XML document to validate.
        fallback_schema: URL of a schema to fall back to, or a falsy value
            to disable the fallback.
        dst_file: Path of the JSON-lines file receiving the errors.
        schema_cache_dir: Directory for downloaded fallback schemas.
            Defaults to ``'.schema_cache'`` so three-argument callers keep
            working.

    Returns:
        None. Failures are logged rather than raised (best effort).
    """
    try:
        doc = etree.parse(xml_file)

        with jsonlines.open(dst_file, mode='w') as writer:
            schema = validate_XML(doc)

            needs_fallback = (
                fallback_schema
                and len(schema.error_log) == 1
                and "No matching global declaration" in schema.error_log[0].message
            )
            if needs_fallback:
                xmlns = etree.QName(doc.getroot().tag).namespace
                cached_fallback_schema = _download_and_cache(fallback_schema, schema_cache_dir)
                # NOTE(review): the second argument looks like a list of
                # "<namespace> <schema location>" pairs - confirm against
                # schemavalidator.validate_XML.
                schema = validate_XML(doc, [u"%s %s" % (xmlns, cached_fallback_schema)])

            # Only the final result is reported: the fallback run's errors
            # when the fallback was used, otherwise the initial run's.
            for error in schema.error_log:
                writer.write(_as_error_dict(error))
    except Exception:
        # Log the offending file (not the function object) with a traceback;
        # `Exception` lets KeyboardInterrupt/SystemExit propagate.
        logging.exception("Error parsing %s", xml_file)

1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -3,3 +3,4 @@ Jinja2==2.11.2
jsonlines==2.0.0
lxml==4.6.2
schedule==0.6.0
markupsafe==2.0.1

0 comments on commit 7f3d2d9

Please sign in to comment.