Skip to content

Commit 0856ac8

Browse files
authored
fix: SPARQL XML result parsing (#2044)
Fixed the following problems with the SPARQL XML result parsing: - The `parse` methods of both the `lxml` and `xml` modules do not work well with `TextIO` objects: the `xml` module works with `TextIO` objects if the XML encoding is `utf-8`, but not if it is `utf-16`, and with `lxml`, parsing fails for both `utf-8` and `utf-16`. To fix this I changed the XML result parser to first convert `TextIO` to `bytes` and then feed the `bytes` to `parse()` using `BytesIO`. - The parser was operating on all elements inside `results` and `result` elements, even if those elements were not `result` and `binding` elements respectively. This was causing problems with `lxml`, as `lxml` also returns comments when iterating over elements. To fix this I added a check for the element tags so that only the correct elements are considered. Other changes: - Added type hints to `rdflib.plugins.sparql.results.xmlresults`. - Run with `lxml` on some permutations in the test matrix. - Removed `rdflib.compat.etree`, as this was not very helpful for the SPARQL XML Result parser and it was not used elsewhere. - Added an `lxml` environment to tox which installs `lxml` and `lxml-stubs`. - Expanded SPARQL result testing by adding some additional parameters. Related issues: - Fixes #2035 - Fixes #1847
1 parent 54018fc commit 0856ac8

File tree

6 files changed

+136
-46
lines changed

6 files changed

+136
-46
lines changed

.github/workflows/validate.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ jobs:
3333
- python-version: "3.9"
3434
os: ubuntu-latest
3535
TOX_EXTRA_COMMAND: "- black --check --diff ./rdflib"
36+
TOXENV_SUFFIX: "-lxml"
3637
- python-version: "3.10"
3738
os: ubuntu-latest
3839
TOX_EXTRA_COMMAND: "flake8 --exit-zero rdflib"

CHANGELOG.md

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,31 @@ and will be removed for release.
100100
<!-- -->
101101
<!-- -->
102102

103+
104+
<!-- -->
105+
<!-- -->
106+
<!-- CHANGE BARRIER: START PR #2044 -->
107+
<!-- -->
108+
<!-- -->
109+
110+
- Fixed some issues with SPARQL XML result parsing that caused problems with
111+
[`lxml`](https://lxml.de/). Closed [issue #2035](https://github.com/RDFLib/rdflib/issues/2035),
112+
[issue #1847](https://github.com/RDFLib/rdflib/issues/1847).
113+
[PR #2044](https://github.com/RDFLib/rdflib/pull/2044).
114+
- Result parsing from
115+
[`TextIO`](https://docs.python.org/3/library/typing.html#typing.TextIO)
116+
streams now works correctly with `lxml` installed and with XML documents that
117+
are not `utf-8` encoded.
118+
- Elements inside `<results>` that are not `<result>` are now ignored.
119+
- Elements inside `<result>` that are not `<binding>` are now ignored.
120+
- Also added type hints to `rdflib.plugins.sparql.results.xmlresults`.
121+
122+
<!-- -->
123+
<!-- -->
124+
<!-- CHANGE BARRIER: END -->
125+
<!-- -->
126+
<!-- -->
127+
103128
<!-- -->
104129
<!-- -->
105130
<!-- CHANGE BARRIER: START -->

rdflib/compat.py

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6,15 +6,7 @@
66
import codecs
77
import re
88
import warnings
9-
from typing import TYPE_CHECKING, Match
10-
11-
if TYPE_CHECKING:
12-
import xml.etree.ElementTree as etree
13-
else:
14-
try:
15-
from lxml import etree
16-
except ImportError:
17-
import xml.etree.ElementTree as etree
9+
from typing import Match
1810

1911

2012
def cast_bytes(s, enc="utf-8"):

rdflib/plugins/sparql/results/xmlresults.py

Lines changed: 83 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,34 @@
11
import logging
2-
from typing import IO, Optional
2+
import xml.etree.ElementTree as xml_etree # noqa: N813
3+
from io import BytesIO
4+
from typing import (
5+
IO,
6+
TYPE_CHECKING,
7+
Any,
8+
BinaryIO,
9+
Dict,
10+
Optional,
11+
Sequence,
12+
TextIO,
13+
Tuple,
14+
Union,
15+
cast,
16+
)
317
from xml.dom import XML_NAMESPACE
418
from xml.sax.saxutils import XMLGenerator
519
from xml.sax.xmlreader import AttributesNSImpl
620

721
from rdflib import BNode, Literal, URIRef, Variable
8-
from rdflib.compat import etree
922
from rdflib.query import Result, ResultException, ResultParser, ResultSerializer
23+
from rdflib.term import Identifier
24+
25+
try:
26+
# https://adamj.eu/tech/2021/12/29/python-type-hints-optional-imports/
27+
import lxml.etree as lxml_etree
28+
29+
FOUND_LXML = True
30+
except ImportError:
31+
FOUND_LXML = False
1032

1133
SPARQL_XML_NAMESPACE = "http://www.w3.org/2005/sparql-results#"
1234
RESULTS_NS_ET = "{%s}" % SPARQL_XML_NAMESPACE
@@ -27,19 +49,32 @@
2749

2850
class XMLResultParser(ResultParser):
2951
# TODO FIXME: content_type should be a keyword only arg.
30-
def parse(self, source, content_type: Optional[str] = None): # type: ignore[override]
52+
def parse(self, source: IO, content_type: Optional[str] = None): # type: ignore[override]
3153
return XMLResult(source)
3254

3355

3456
class XMLResult(Result):
35-
def __init__(self, source, content_type: Optional[str] = None):
36-
37-
try:
38-
# try use as if etree is from lxml, and if not use it as normal.
39-
parser = etree.XMLParser(huge_tree=True) # type: ignore[call-arg]
40-
tree = etree.parse(source, parser)
41-
except TypeError:
42-
tree = etree.parse(source)
57+
def __init__(self, source: IO, content_type: Optional[str] = None):
58+
parser_encoding: Optional[str] = None
59+
if hasattr(source, "encoding"):
60+
if TYPE_CHECKING:
61+
assert isinstance(source, TextIO)
62+
parser_encoding = "utf-8"
63+
source_str = source.read()
64+
source = BytesIO(source_str.encode(parser_encoding))
65+
else:
66+
if TYPE_CHECKING:
67+
assert isinstance(source, BinaryIO)
68+
69+
if FOUND_LXML:
70+
lxml_parser = lxml_etree.XMLParser(huge_tree=True, encoding=parser_encoding)
71+
tree = cast(
72+
xml_etree.ElementTree,
73+
lxml_etree.parse(source, parser=lxml_parser),
74+
)
75+
else:
76+
xml_parser = xml_etree.XMLParser(encoding=parser_encoding)
77+
tree = xml_etree.parse(source, parser=xml_parser)
4378

4479
boolean = tree.find(RESULTS_NS_ET + "boolean")
4580
results = tree.find(RESULTS_NS_ET + "results")
@@ -56,8 +91,18 @@ def __init__(self, source, content_type: Optional[str] = None):
5691
if type_ == "SELECT":
5792
self.bindings = []
5893
for result in results: # type: ignore[union-attr]
94+
if result.tag != f"{RESULTS_NS_ET}result":
95+
# This is here because with lxml this also gets comments,
96+
# not just elements. Also this should not operate on non
97+
# "result" elements.
98+
continue
5999
r = {}
60100
for binding in result:
101+
if binding.tag != f"{RESULTS_NS_ET}binding":
102+
# This is here because with lxml this also gets
103+
# comments, not just elements. Also this should not
104+
# operate on non "binding" elements.
105+
continue
61106
# type error: error: Argument 1 to "Variable" has incompatible type "Union[str, None, Any]"; expected "str"
62107
# NOTE on type error: Element.get() can return None, and
63108
# this will invariably fail if passed into Variable
@@ -80,7 +125,7 @@ def __init__(self, source, content_type: Optional[str] = None):
80125
self.askAnswer = boolean.text.lower().strip() == "true" # type: ignore[union-attr]
81126

82127

83-
def parseTerm(element):
128+
def parseTerm(element: xml_etree.Element) -> Union[URIRef, Literal, BNode]:
84129
"""rdflib object (Literal, URIRef, BNode) for the given
85130
elementtree element"""
86131
tag, text = element.tag, element.text
@@ -90,15 +135,17 @@ def parseTerm(element):
90135
datatype = None
91136
lang = None
92137
if element.get("datatype", None):
93-
datatype = URIRef(element.get("datatype"))
138+
# type error: Argument 1 to "URIRef" has incompatible type "Optional[str]"; expected "str"
139+
datatype = URIRef(element.get("datatype")) # type: ignore[arg-type]
94140
elif element.get("{%s}lang" % XML_NAMESPACE, None):
95141
lang = element.get("{%s}lang" % XML_NAMESPACE)
96142

97143
ret = Literal(text, datatype=datatype, lang=lang)
98144

99145
return ret
100146
elif tag == RESULTS_NS_ET + "uri":
101-
return URIRef(text)
147+
# type error: Argument 1 to "URIRef" has incompatible type "Optional[str]"; expected "str"
148+
return URIRef(text) # type: ignore[arg-type]
102149
elif tag == RESULTS_NS_ET + "bnode":
103150
return BNode(text)
104151
else:
@@ -109,14 +156,14 @@ class XMLResultSerializer(ResultSerializer):
109156
def __init__(self, result):
110157
ResultSerializer.__init__(self, result)
111158

112-
def serialize(self, stream: IO, encoding: str = "utf-8", **kwargs):
113-
159+
def serialize(self, stream: IO, encoding: str = "utf-8", **kwargs: Any) -> None:
114160
writer = SPARQLXMLWriter(stream, encoding)
115161
if self.result.type == "ASK":
116162
writer.write_header([])
117163
writer.write_ask(self.result.askAnswer)
118164
else:
119-
writer.write_header(self.result.vars)
165+
# type error: Argument 1 to "write_header" of "SPARQLXMLWriter" has incompatible type "Optional[List[Variable]]"; expected "Sequence[Variable]"
166+
writer.write_header(self.result.vars) # type: ignore[arg-type]
120167
writer.write_results_header()
121168
for b in self.result.bindings:
122169
writer.write_start_result()
@@ -134,7 +181,7 @@ class SPARQLXMLWriter:
134181
Python saxutils-based SPARQL XML Writer
135182
"""
136183

137-
def __init__(self, output, encoding="utf-8"):
184+
def __init__(self, output: IO, encoding: str = "utf-8"):
138185
writer = XMLGenerator(output, encoding)
139186
writer.startDocument()
140187
writer.startPrefixMapping("", SPARQL_XML_NAMESPACE)
@@ -147,7 +194,7 @@ def __init__(self, output, encoding="utf-8"):
147194
self._encoding = encoding
148195
self._results = False
149196

150-
def write_header(self, allvarsL):
197+
def write_header(self, allvarsL: Sequence[Variable]) -> None:
151198
self.writer.startElementNS(
152199
(SPARQL_XML_NAMESPACE, "head"), "head", AttributesNSImpl({}, {})
153200
)
@@ -161,48 +208,52 @@ def write_header(self, allvarsL):
161208
self.writer.startElementNS(
162209
(SPARQL_XML_NAMESPACE, "variable"),
163210
"variable",
164-
AttributesNSImpl(attr_vals, attr_qnames),
211+
# type error: Argument 1 to "AttributesNSImpl" has incompatible type "Dict[Tuple[None, str], str]"; expected "Mapping[Tuple[str, str], str]"
212+
# type error: Argument 2 to "AttributesNSImpl" has incompatible type "Dict[Tuple[None, str], str]"; expected "Mapping[Tuple[str, str], str]" [arg-type]
213+
AttributesNSImpl(attr_vals, attr_qnames), # type: ignore[arg-type]
165214
)
166215
self.writer.endElementNS((SPARQL_XML_NAMESPACE, "variable"), "variable")
167216
self.writer.endElementNS((SPARQL_XML_NAMESPACE, "head"), "head")
168217

169-
def write_ask(self, val):
218+
def write_ask(self, val: bool) -> None:
170219
self.writer.startElementNS(
171220
(SPARQL_XML_NAMESPACE, "boolean"), "boolean", AttributesNSImpl({}, {})
172221
)
173222
self.writer.characters(str(val).lower())
174223
self.writer.endElementNS((SPARQL_XML_NAMESPACE, "boolean"), "boolean")
175224

176-
def write_results_header(self):
225+
def write_results_header(self) -> None:
177226
self.writer.startElementNS(
178227
(SPARQL_XML_NAMESPACE, "results"), "results", AttributesNSImpl({}, {})
179228
)
180229
self._results = True
181230

182-
def write_start_result(self):
231+
def write_start_result(self) -> None:
183232
self.writer.startElementNS(
184233
(SPARQL_XML_NAMESPACE, "result"), "result", AttributesNSImpl({}, {})
185234
)
186235
self._resultStarted = True
187236

188-
def write_end_result(self):
237+
def write_end_result(self) -> None:
189238
assert self._resultStarted
190239
self.writer.endElementNS((SPARQL_XML_NAMESPACE, "result"), "result")
191240
self._resultStarted = False
192241

193-
def write_binding(self, name, val):
242+
def write_binding(self, name: Variable, val: Identifier):
194243
assert self._resultStarted
195244

196-
attr_vals = {
245+
attr_vals: Dict[Tuple[Optional[str], str], str] = {
197246
(None, "name"): str(name),
198247
}
199-
attr_qnames = {
248+
attr_qnames: Dict[Tuple[Optional[str], str], str] = {
200249
(None, "name"): "name",
201250
}
202251
self.writer.startElementNS(
203252
(SPARQL_XML_NAMESPACE, "binding"),
204253
"binding",
205-
AttributesNSImpl(attr_vals, attr_qnames),
254+
# type error: Argument 1 to "AttributesNSImpl" has incompatible type "Dict[Tuple[None, str], str]"; expected "Mapping[Tuple[str, str], str]"
255+
# type error: Argument 2 to "AttributesNSImpl" has incompatible type "Dict[Tuple[None, str], str]"; expected "Mapping[Tuple[str, str], str]"
256+
AttributesNSImpl(attr_vals, attr_qnames), # type: ignore[arg-type]
206257
)
207258

208259
if isinstance(val, URIRef):
@@ -230,7 +281,9 @@ def write_binding(self, name, val):
230281
self.writer.startElementNS(
231282
(SPARQL_XML_NAMESPACE, "literal"),
232283
"literal",
233-
AttributesNSImpl(attr_vals, attr_qnames),
284+
# type error: Argument 1 to "AttributesNSImpl" has incompatible type "Dict[Tuple[Optional[str], str], str]"; expected "Mapping[Tuple[str, str], str]"
285+
# type error: Argument 2 to "AttributesNSImpl" has incompatible type "Dict[Tuple[Optional[str], str], str]"; expected "Mapping[Tuple[str, str], str]"
286+
AttributesNSImpl(attr_vals, attr_qnames), # type: ignore[arg-type]
234287
)
235288
self.writer.characters(val)
236289
self.writer.endElementNS((SPARQL_XML_NAMESPACE, "literal"), "literal")
@@ -240,7 +293,7 @@ def write_binding(self, name, val):
240293

241294
self.writer.endElementNS((SPARQL_XML_NAMESPACE, "binding"), "binding")
242295

243-
def close(self):
296+
def close(self) -> None:
244297
if self._results:
245298
self.writer.endElementNS((SPARQL_XML_NAMESPACE, "results"), "results")
246299
self.writer.endElementNS((SPARQL_XML_NAMESPACE, "sparql"), "sparql")

test/test_sparql/test_result.py

Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -207,7 +207,7 @@ def make(cls, *result_format: ResultFormat) -> "ResultFormats":
207207
ResultFormatTrait.HAS_PARSER,
208208
ResultFormatTrait.HAS_SERIALIZER,
209209
},
210-
{"utf-8"},
210+
{"utf-8", "utf-16"},
211211
),
212212
ResultFormat(
213213
"tsv",
@@ -239,7 +239,7 @@ class DestRef:
239239

240240
@contextmanager
241241
def make_dest(
242-
tmp_path: Path, type: Optional[DestinationType]
242+
tmp_path: Path, type: Optional[DestinationType], encoding: str
243243
) -> Iterator[Optional[DestRef]]:
244244
if type is None:
245245
yield None
@@ -251,7 +251,8 @@ def make_dest(
251251
with path.open("wb") as bfh:
252252
yield DestRef(bfh, path)
253253
elif type is DestinationType.TEXT_IO:
254-
with path.open("w") as fh:
254+
assert encoding is not None
255+
with path.open("w", encoding=encoding) as fh:
255256
yield DestRef(fh, path)
256257
else:
257258
raise ValueError(f"unsupported type {type}")
@@ -299,6 +300,10 @@ def make_select_result_serialize_parse_tests() -> Iterator[ParameterSet]:
299300
raises=FileNotFoundError,
300301
reason="string path handling does not work on windows",
301302
)
303+
xfails[("xml", DestinationType.STR_PATH, "utf-16")] = pytest.mark.xfail(
304+
raises=FileNotFoundError,
305+
reason="string path handling does not work on windows",
306+
)
302307
formats = [
303308
format
304309
for format in result_formats.values()
@@ -332,7 +337,7 @@ def test_select_result_serialize_parse(
332337
specific format results in an equivalent result object.
333338
"""
334339
format, destination_type, encoding = args
335-
with make_dest(tmp_path, destination_type) as dest_ref:
340+
with make_dest(tmp_path, destination_type, encoding) as dest_ref:
336341
destination = None if dest_ref is None else dest_ref.param
337342
serialize_result = select_result.serialize(
338343
destination=destination,
@@ -345,7 +350,8 @@ def test_select_result_serialize_parse(
345350
serialized_data = serialize_result.decode(encoding)
346351
else:
347352
assert serialize_result is None
348-
serialized_data = dest_ref.path.read_bytes().decode(encoding)
353+
dest_bytes = dest_ref.path.read_bytes()
354+
serialized_data = dest_bytes.decode(encoding)
349355

350356
logging.debug("serialized_data = %s", serialized_data)
351357
check_serialized(format.name, select_result, serialized_data)
@@ -363,7 +369,7 @@ def serialize_select(select_result: Result, format: str, encoding: str) -> bytes
363369
encoding
364370
)
365371
else:
366-
result = select_result.serialize(format=format)
372+
result = select_result.serialize(format=format, encoding=encoding)
367373
assert result is not None
368374
return result
369375

@@ -377,8 +383,17 @@ def make_select_result_parse_serialized_tests() -> Iterator[ParameterSet]:
377383
and ResultType.SELECT in format.supported_types
378384
]
379385
source_types = set(SourceType)
386+
xfails[("csv", SourceType.BINARY_IO, "utf-16")] = pytest.mark.xfail(
387+
raises=UnicodeDecodeError,
388+
)
389+
xfails[("json", SourceType.BINARY_IO, "utf-16")] = pytest.mark.xfail(
390+
raises=UnicodeDecodeError,
391+
)
392+
xfails[("tsv", SourceType.BINARY_IO, "utf-16")] = pytest.mark.xfail(
393+
raises=UnicodeDecodeError,
394+
)
380395
for format, destination_type in itertools.product(formats, source_types):
381-
for encoding in {"utf-8"}:
396+
for encoding in format.encodings:
382397
xfail = xfails.get((format.name, destination_type, encoding))
383398
marks = (xfail,) if xfail is not None else ()
384399
yield pytest.param(

0 commit comments

Comments
 (0)