335 lines
12 KiB
Python
335 lines
12 KiB
Python
|
"""The ``lxml.isoschematron`` package implements ISO Schematron support on top
|
||
|
of the pure-xslt 'skeleton' implementation.
|
||
|
"""
|
||
|
|
||
|
import sys
|
||
|
import os.path
|
||
|
from lxml import etree as _etree # due to validator __init__ signature
|
||
|
|
||
|
|
||
|
# some compat stuff, borrowed from lxml.html
|
||
|
try:
|
||
|
unicode
|
||
|
except NameError:
|
||
|
# Python 3
|
||
|
unicode = str
|
||
|
try:
|
||
|
basestring
|
||
|
except NameError:
|
||
|
# Python 3
|
||
|
basestring = str
|
||
|
|
||
|
|
||
|
__all__ = ['extract_xsd', 'extract_rng', 'iso_dsdl_include',
|
||
|
'iso_abstract_expand', 'iso_svrl_for_xslt1',
|
||
|
'svrl_validation_errors', 'schematron_schema_valid',
|
||
|
'stylesheet_params', 'Schematron']
|
||
|
|
||
|
|
||
|
# some namespaces
|
||
|
#FIXME: Maybe lxml should provide a dedicated place for common namespace
|
||
|
#FIXME: definitions?
|
||
|
XML_SCHEMA_NS = "http://www.w3.org/2001/XMLSchema"
|
||
|
RELAXNG_NS = "http://relaxng.org/ns/structure/1.0"
|
||
|
SCHEMATRON_NS = "http://purl.oclc.org/dsdl/schematron"
|
||
|
SVRL_NS = "http://purl.oclc.org/dsdl/svrl"
|
||
|
|
||
|
|
||
|
# some helpers
|
||
|
_schematron_root = '{%s}schema' % SCHEMATRON_NS
|
||
|
_xml_schema_root = '{%s}schema' % XML_SCHEMA_NS
|
||
|
_resources_dir = os.path.join(os.path.dirname(__file__), 'resources')
|
||
|
|
||
|
|
||
|
# the iso-schematron skeleton implementation steps aka xsl transformations
|
||
|
extract_xsd = _etree.XSLT(_etree.parse(
|
||
|
os.path.join(_resources_dir, 'xsl', 'XSD2Schtrn.xsl')))
|
||
|
extract_rng = _etree.XSLT(_etree.parse(
|
||
|
os.path.join(_resources_dir, 'xsl', 'RNG2Schtrn.xsl')))
|
||
|
iso_dsdl_include = _etree.XSLT(_etree.parse(
|
||
|
os.path.join(_resources_dir, 'xsl', 'iso-schematron-xslt1',
|
||
|
'iso_dsdl_include.xsl')))
|
||
|
iso_abstract_expand = _etree.XSLT(_etree.parse(
|
||
|
os.path.join(_resources_dir, 'xsl', 'iso-schematron-xslt1',
|
||
|
'iso_abstract_expand.xsl')))
|
||
|
iso_svrl_for_xslt1 = _etree.XSLT(_etree.parse(
|
||
|
os.path.join(_resources_dir,
|
||
|
'xsl', 'iso-schematron-xslt1', 'iso_svrl_for_xslt1.xsl')))
|
||
|
|
||
|
|
||
|
# svrl result accessors
|
||
|
svrl_validation_errors = _etree.XPath(
|
||
|
'//svrl:failed-assert', namespaces={'svrl': SVRL_NS})
|
||
|
|
||
|
|
||
|
# RelaxNG validator for schematron schemas
|
||
|
schematron_schema_valid = _etree.RelaxNG(
|
||
|
file=os.path.join(_resources_dir, 'rng', 'iso-schematron.rng'))
|
||
|
|
||
|
|
||
|
def stylesheet_params(**kwargs):
|
||
|
"""Convert keyword args to a dictionary of stylesheet parameters.
|
||
|
XSL stylesheet parameters must be XPath expressions, i.e.:
|
||
|
|
||
|
* string expressions, like "'5'"
|
||
|
* simple (number) expressions, like "5"
|
||
|
* valid XPath expressions, like "/a/b/text()"
|
||
|
|
||
|
This function converts native Python keyword arguments to stylesheet
|
||
|
parameters following these rules:
|
||
|
If an arg is a string wrap it with XSLT.strparam().
|
||
|
If an arg is an XPath object use its path string.
|
||
|
If arg is None raise TypeError.
|
||
|
Else convert arg to string.
|
||
|
"""
|
||
|
result = {}
|
||
|
for key, val in kwargs.items():
|
||
|
if isinstance(val, basestring):
|
||
|
val = _etree.XSLT.strparam(val)
|
||
|
elif val is None:
|
||
|
raise TypeError('None not allowed as a stylesheet parameter')
|
||
|
elif not isinstance(val, _etree.XPath):
|
||
|
val = unicode(val)
|
||
|
result[key] = val
|
||
|
return result
|
||
|
|
||
|
|
||
|
# helper function for use in Schematron __init__
|
||
|
def _stylesheet_param_dict(paramsDict, kwargsDict):
|
||
|
"""Return a copy of paramsDict, updated with kwargsDict entries, wrapped as
|
||
|
stylesheet arguments.
|
||
|
kwargsDict entries with a value of None are ignored.
|
||
|
"""
|
||
|
# beware of changing mutable default arg
|
||
|
paramsDict = dict(paramsDict)
|
||
|
for k, v in kwargsDict.items():
|
||
|
if v is not None: # None values do not override
|
||
|
paramsDict[k] = v
|
||
|
paramsDict = stylesheet_params(**paramsDict)
|
||
|
return paramsDict
|
||
|
|
||
|
|
||
|
class Schematron(_etree._Validator):
|
||
|
"""An ISO Schematron validator.
|
||
|
|
||
|
Pass a root Element or an ElementTree to turn it into a validator.
|
||
|
Alternatively, pass a filename as keyword argument 'file' to parse from
|
||
|
the file system.
|
||
|
|
||
|
Schematron is a less well known, but very powerful schema language.
|
||
|
The main idea is to use the capabilities of XPath to put restrictions on
|
||
|
the structure and the content of XML documents.
|
||
|
|
||
|
The standard behaviour is to fail on ``failed-assert`` findings only
|
||
|
(``ASSERTS_ONLY``). To change this, you can either pass a report filter
|
||
|
function to the ``error_finder`` parameter (e.g. ``ASSERTS_AND_REPORTS``
|
||
|
or a custom ``XPath`` object), or subclass isoschematron.Schematron for
|
||
|
complete control of the validation process.
|
||
|
|
||
|
Built on the Schematron language 'reference' skeleton pure-xslt
|
||
|
implementation, the validator is created as an XSLT 1.0 stylesheet using
|
||
|
these steps:
|
||
|
|
||
|
0) (Extract from XML Schema or RelaxNG schema)
|
||
|
1) Process inclusions
|
||
|
2) Process abstract patterns
|
||
|
3) Compile the schematron schema to XSLT
|
||
|
|
||
|
The ``include`` and ``expand`` keyword arguments can be used to switch off
|
||
|
steps 1) and 2).
|
||
|
To set parameters for steps 1), 2) and 3) hand parameter dictionaries to the
|
||
|
keyword arguments ``include_params``, ``expand_params`` or
|
||
|
``compile_params``.
|
||
|
For convenience, the compile-step parameter ``phase`` is also exposed as a
|
||
|
keyword argument ``phase``. This takes precedence if the parameter is also
|
||
|
given in the parameter dictionary.
|
||
|
|
||
|
If ``store_schematron`` is set to True, the (included-and-expanded)
|
||
|
schematron document tree is stored and available through the ``schematron``
|
||
|
property.
|
||
|
If ``store_xslt`` is set to True, the validation XSLT document tree will be
|
||
|
stored and can be retrieved through the ``validator_xslt`` property.
|
||
|
With ``store_report`` set to True (default: False), the resulting validation
|
||
|
report document gets stored and can be accessed as the ``validation_report``
|
||
|
property.
|
||
|
|
||
|
Here is a usage example::
|
||
|
|
||
|
>>> from lxml import etree
|
||
|
>>> from lxml.isoschematron import Schematron
|
||
|
|
||
|
>>> schematron = Schematron(etree.XML('''
|
||
|
... <schema xmlns="http://purl.oclc.org/dsdl/schematron" >
|
||
|
... <pattern id="id_only_attribute">
|
||
|
... <title>id is the only permitted attribute name</title>
|
||
|
... <rule context="*">
|
||
|
... <report test="@*[not(name()='id')]">Attribute
|
||
|
... <name path="@*[not(name()='id')]"/> is forbidden<name/>
|
||
|
... </report>
|
||
|
... </rule>
|
||
|
... </pattern>
|
||
|
... </schema>'''),
|
||
|
... error_finder=Schematron.ASSERTS_AND_REPORTS)
|
||
|
|
||
|
>>> xml = etree.XML('''
|
||
|
... <AAA name="aaa">
|
||
|
... <BBB id="bbb"/>
|
||
|
... <CCC color="ccc"/>
|
||
|
... </AAA>
|
||
|
... ''')
|
||
|
|
||
|
>>> schematron.validate(xml)
|
||
|
False
|
||
|
|
||
|
>>> xml = etree.XML('''
|
||
|
... <AAA id="aaa">
|
||
|
... <BBB id="bbb"/>
|
||
|
... <CCC/>
|
||
|
... </AAA>
|
||
|
... ''')
|
||
|
|
||
|
>>> schematron.validate(xml)
|
||
|
True
|
||
|
"""
|
||
|
|
||
|
# libxml2 error categorization for validation errors
|
||
|
_domain = _etree.ErrorDomains.SCHEMATRONV
|
||
|
_level = _etree.ErrorLevels.ERROR
|
||
|
_error_type = _etree.ErrorTypes.SCHEMATRONV_ASSERT
|
||
|
|
||
|
# convenience definitions for common behaviours
|
||
|
ASSERTS_ONLY = svrl_validation_errors # Default
|
||
|
ASSERTS_AND_REPORTS = _etree.XPath(
|
||
|
'//svrl:failed-assert | //svrl:successful-report',
|
||
|
namespaces={'svrl': SVRL_NS})
|
||
|
|
||
|
def _extract(self, element):
|
||
|
"""Extract embedded schematron schema from non-schematron host schema.
|
||
|
This method will only be called by __init__ if the given schema document
|
||
|
is not a schematron schema by itself.
|
||
|
Must return a schematron schema document tree or None.
|
||
|
"""
|
||
|
schematron = None
|
||
|
if element.tag == _xml_schema_root:
|
||
|
schematron = self._extract_xsd(element)
|
||
|
elif element.nsmap[element.prefix] == RELAXNG_NS:
|
||
|
# RelaxNG does not have a single unique root element
|
||
|
schematron = self._extract_rng(element)
|
||
|
return schematron
|
||
|
|
||
|
# customization points
|
||
|
# etree.XSLT objects that provide the extract, include, expand, compile
|
||
|
# steps
|
||
|
_extract_xsd = extract_xsd
|
||
|
_extract_rng = extract_rng
|
||
|
_include = iso_dsdl_include
|
||
|
_expand = iso_abstract_expand
|
||
|
_compile = iso_svrl_for_xslt1
|
||
|
|
||
|
# etree.xpath object that determines input document validity when applied to
|
||
|
# the svrl result report; must return a list of result elements (empty if
|
||
|
# valid)
|
||
|
_validation_errors = ASSERTS_ONLY
|
||
|
|
||
|
def __init__(self, etree=None, file=None, include=True, expand=True,
|
||
|
include_params={}, expand_params={}, compile_params={},
|
||
|
store_schematron=False, store_xslt=False, store_report=False,
|
||
|
phase=None, error_finder=ASSERTS_ONLY):
|
||
|
super(Schematron, self).__init__()
|
||
|
|
||
|
self._store_report = store_report
|
||
|
self._schematron = None
|
||
|
self._validator_xslt = None
|
||
|
self._validation_report = None
|
||
|
if error_finder is not self.ASSERTS_ONLY:
|
||
|
self._validation_errors = error_finder
|
||
|
|
||
|
# parse schema document, may be a schematron schema or an XML Schema or
|
||
|
# a RelaxNG schema with embedded schematron rules
|
||
|
root = None
|
||
|
try:
|
||
|
if etree is not None:
|
||
|
if _etree.iselement(etree):
|
||
|
root = etree
|
||
|
else:
|
||
|
root = etree.getroot()
|
||
|
elif file is not None:
|
||
|
root = _etree.parse(file).getroot()
|
||
|
except Exception:
|
||
|
raise _etree.SchematronParseError(
|
||
|
"No tree or file given: %s" % sys.exc_info()[1])
|
||
|
if root is None:
|
||
|
raise ValueError("Empty tree")
|
||
|
if root.tag == _schematron_root:
|
||
|
schematron = root
|
||
|
else:
|
||
|
schematron = self._extract(root)
|
||
|
if schematron is None:
|
||
|
raise _etree.SchematronParseError(
|
||
|
"Document is not a schematron schema or schematron-extractable")
|
||
|
# perform the iso-schematron skeleton implementation steps to get a
|
||
|
# validating xslt
|
||
|
if include:
|
||
|
schematron = self._include(schematron, **include_params)
|
||
|
if expand:
|
||
|
schematron = self._expand(schematron, **expand_params)
|
||
|
if not schematron_schema_valid(schematron):
|
||
|
raise _etree.SchematronParseError(
|
||
|
"invalid schematron schema: %s" %
|
||
|
schematron_schema_valid.error_log)
|
||
|
if store_schematron:
|
||
|
self._schematron = schematron
|
||
|
# add new compile keyword args here if exposing them
|
||
|
compile_kwargs = {'phase': phase}
|
||
|
compile_params = _stylesheet_param_dict(compile_params, compile_kwargs)
|
||
|
validator_xslt = self._compile(schematron, **compile_params)
|
||
|
if store_xslt:
|
||
|
self._validator_xslt = validator_xslt
|
||
|
self._validator = _etree.XSLT(validator_xslt)
|
||
|
|
||
|
def __call__(self, etree):
|
||
|
"""Validate doc using Schematron.
|
||
|
|
||
|
Returns true if document is valid, false if not.
|
||
|
"""
|
||
|
self._clear_error_log()
|
||
|
result = self._validator(etree)
|
||
|
if self._store_report:
|
||
|
self._validation_report = result
|
||
|
errors = self._validation_errors(result)
|
||
|
if errors:
|
||
|
if _etree.iselement(etree):
|
||
|
fname = etree.getroottree().docinfo.URL or '<file>'
|
||
|
else:
|
||
|
fname = etree.docinfo.URL or '<file>'
|
||
|
for error in errors:
|
||
|
# Does svrl report the line number, anywhere? Don't think so.
|
||
|
self._append_log_message(
|
||
|
domain=self._domain, type=self._error_type,
|
||
|
level=self._level, line=0,
|
||
|
message=_etree.tostring(error, encoding='unicode'),
|
||
|
filename=fname)
|
||
|
return False
|
||
|
return True
|
||
|
|
||
|
@property
|
||
|
def schematron(self):
|
||
|
"""ISO-schematron schema document (None if object has been initialized
|
||
|
with store_schematron=False).
|
||
|
"""
|
||
|
return self._schematron
|
||
|
|
||
|
@property
|
||
|
def validator_xslt(self):
|
||
|
"""ISO-schematron skeleton implementation XSLT validator document (None
|
||
|
if object has been initialized with store_xslt=False).
|
||
|
"""
|
||
|
return self._validator_xslt
|
||
|
|
||
|
@property
|
||
|
def validation_report(self):
|
||
|
"""ISO-schematron validation result report (None if result-storing has
|
||
|
been turned off).
|
||
|
"""
|
||
|
return self._validation_report
|