import os
from lxml import etree
from lxml import isoschematron
from .utils import NAMESPACES
METS_XSD_PATH = "resources/mets.xsd"
# Right now there are two different schematron files for validating
# Archivematica-generated METS files vs Archivematica-generated METS pointer
# files. These could be consolidated to one.
AM_SCT_PATH = "resources/archivematica_mets_schematron.xml"
AM_PNTR_SCT_PATH = "resources/archivematica_mets_pointer_file_schematron.xml"
def _get_file_path(path):
if not os.path.isfile(path):
path_2 = os.path.join(os.path.dirname(os.path.abspath(__file__)), path)
if not os.path.isfile(path_2):
raise ValueError(f"There is no (schema) file at either {path} or {path_2}")
return path_2
return path
[docs]def get_schematron(sct_path):
"""Return an lxml ``isoschematron.Schematron()`` instance using the
schematron file at ``sct_path``.
"""
sct_path = _get_file_path(sct_path)
parser = etree.XMLParser(remove_blank_text=True)
sct_doc = etree.parse(sct_path, parser=parser)
return isoschematron.Schematron(sct_doc, store_report=True)
[docs]def validate(mets_doc, xmlschema=METS_XSD_PATH, schematron=AM_SCT_PATH):
"""Validate a METS file using both an XMLSchema (.xsd) schema and a
schematron schema, the latter of which typically places additional
constraints on what a METS file can look like.
"""
is_xsd_valid, xsd_error_log = xsd_validate(mets_doc, xmlschema=xmlschema)
is_sct_valid, sct_report = schematron_validate(mets_doc, schematron=schematron)
valid = is_xsd_valid and is_sct_valid
report = {
"is_xsd_valid": is_xsd_valid,
"is_sct_valid": is_sct_valid,
"xsd_error_log": xsd_error_log,
"sct_report": sct_report,
}
report["report"] = report_string(report)
return valid, report
[docs]def get_xmlschema(xmlschema, mets_doc):
"""Return a ``class::lxml.etree.XMLSchema`` instance given the path to the
XMLSchema (.xsd) file in ``xmlschema`` and the
``class::lxml.etree._ElementTree`` instance ``mets_doc`` representing the
METS file being parsed. The complication here is that the METS file to be
validated via the .xsd file may reference additional schemata via
``xsi:schemaLocation`` attributes. We have to find all of these and import
them from within the returned XMLSchema.
For the solution that this is based on, see:
http://code.activestate.com/recipes/578503-validate-xml-with-schemalocation/
For other descriptions of the problem, see:
- https://groups.google.com/forum/#!topic/archivematica/UBS1ay-g_tE
- https://stackoverflow.com/questions/26712645/xml-type-definition-is-absent
- https://stackoverflow.com/questions/2979824/in-document-schema-declarations-and-lxml
"""
xsd_path = _get_file_path(xmlschema)
xmlschema = etree.parse(xsd_path)
schema_locations = set(
mets_doc.xpath("//*/@xsi:schemaLocation", namespaces=NAMESPACES)
)
for schema_location in schema_locations:
namespaces_locations = schema_location.strip().split()
for namespace, location in zip(*[iter(namespaces_locations)] * 2):
if namespace == NAMESPACES["mets"]:
continue
xs_import = etree.Element("{http://www.w3.org/2001/XMLSchema}import")
xs_import.attrib["namespace"] = namespace
xs_import.attrib["schemaLocation"] = location
xmlschema.getroot().insert(0, xs_import)
return etree.XMLSchema(xmlschema)
[docs]def xsd_validate(mets_doc, xmlschema=METS_XSD_PATH):
xmlschema = get_xmlschema(xmlschema, mets_doc)
is_valid = xmlschema.validate(mets_doc)
error_log = xmlschema.error_log
return is_valid, error_log
[docs]def schematron_validate(mets_doc, schematron=AM_SCT_PATH):
"""Validate a METS file using a schematron schema. Return a boolean
indicating validity and a report as an ``lxml.ElementTree`` instance.
"""
if isinstance(schematron, str):
schematron = get_schematron(schematron)
is_valid = schematron.validate(mets_doc)
report = schematron.validation_report
return is_valid, report
[docs]def sct_report_string(report):
"""Return a human-readable string representation of the error report
returned by lxml's schematron validator.
"""
ret = []
namespaces = {"svrl": "http://purl.oclc.org/dsdl/svrl"}
for index, failed_assert_el in enumerate(
report.findall("svrl:failed-assert", namespaces=namespaces)
):
ret.append(
"{}. {}".format(
index + 1,
failed_assert_el.find("svrl:text", namespaces=namespaces).text,
)
)
ret.append(" test: {}".format(failed_assert_el.attrib["test"]))
ret.append(" location: {}".format(failed_assert_el.attrib["location"]))
ret.append("\n")
return "\n".join(ret)
[docs]def xsd_error_log_string(xsd_error_log):
"""Return a human-readable string representation of the error log
returned by lxml's XMLSchema validator.
"""
ret = []
for error in xsd_error_log:
ret.append(
"ERROR ON LINE {}: {}".format(error.line, error.message.encode("utf-8"))
)
return "\n".join(ret)
[docs]def report_string(report):
"""Return a human-readable string representation of all of the validation
errors.
"""
return (
"Schematron Error(s):\n"
+ sct_report_string(report["sct_report"])
+ "\n\nXMLSchema (xsd) Error(s):\n"
+ xsd_error_log_string(report["xsd_error_log"])
)