Skip to content

Commit

Permalink
fix: issues with string destination handling in `{Graph,Result}.seria…
Browse files Browse the repository at this point in the history
…lize`

Change `{Graph,Result}.serialize` to only handle string destinations as URIs if
the schema is `file` and to treat it as operating system paths in all
other cases.

This is for the following reasons:
- `Result.serialize` was treating URI paths as OS paths which only works
  for some cases, for special charachters and percentage encoding it
  does not work.
- Many valid Unix and Windows paths parse using `urlparse` and have no netloc, e.g. `C:\some\path`
  and `some:/path`, however they should be treated as paths and not as URIs.
- `Graph` and `Result` should behave consistently.

Some caveats in this change:
- non-file URIs will now be treated as OS paths which may result in
  slightly confusing error messages, such as
  `FileNotFoundError: [Errno 2] No such file or directory: 'http://example.com/'`
  if http://example.com/ is passed.
- some valid file URIs (e.g. `file:/path/to/file` from https://datatracker.ietf.org/doc/html/rfc8089)
  are also valid Posix paths but will be treated as file-URIs instead of
  Posix paths. For `Graph.serialize` this ambiguity can be avoided by
  using `pathlib.Path`, but for `Result.serialize` there is currently no
  way to avoid it, though I will work on RDFLib#1834
  soon and in that provide a way to avoid the ambiguity there also.
  • Loading branch information
aucampia committed Jul 31, 2022
1 parent 8d58fd0 commit 4e90415
Show file tree
Hide file tree
Showing 8 changed files with 555 additions and 254 deletions.
22 changes: 22 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,28 @@ and will be removed for release.
<!-- -->
<!-- -->


<!-- -->
<!-- -->
<!-- CHANGE BARRIER: START #2068 -->
<!-- -->
<!-- -->

- Improve file-URI and path handling in `Graph.serialize` and `Result.serialize` to
address problems with windows path handling in `Result.serialize` and to make
the behavior between `Graph.serialize` and `Result.serialie` more consistent.
Closed [issue #2067](https://github.com/RDFLib/rdflib/issues/2067).
[PR #2068](https://github.com/RDFLib/rdflib/pull/2068).
- String values for the `destination` argument will now only be treated as
file URIs if `urllib.parse.urlparse` returns their schema as `file`.
- Simplified file writing to avoid a temporary file.

<!-- -->
<!-- -->
<!-- CHANGE BARRIER: END #2068 -->
<!-- -->
<!-- -->

<!-- -->
<!-- -->
<!-- CHANGE BARRIER: START -->
Expand Down
29 changes: 13 additions & 16 deletions rdflib/graph.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
import logging
import os
import pathlib
import random
import shutil
import tempfile
from io import BytesIO
from typing import (
IO,
Expand Down Expand Up @@ -1201,20 +1198,20 @@ def serialize(
serializer.serialize(stream, base=base, encoding=encoding, **args)
else:
if isinstance(destination, pathlib.PurePath):
location = str(destination)
os_path = str(destination)
else:
location = cast(str, destination)
scheme, netloc, path, params, _query, fragment = urlparse(location)
if netloc != "":
raise ValueError(
f"destination {destination} is not a local file reference"
)
fd, name = tempfile.mkstemp()
stream = os.fdopen(fd, "wb")
serializer.serialize(stream, base=base, encoding=encoding, **args)
stream.close()
dest = url2pathname(path) if scheme == "file" else location
shutil.move(name, dest)
scheme, netloc, path, params, _query, fragment = urlparse(location)
if scheme == "file":
if netloc != "":
raise ValueError(
f"the file URI {location!r} has an authority component which is not supported"
)
os_path = url2pathname(path)
else:
os_path = location
with open(os_path, "wb") as stream:
serializer.serialize(stream, encoding=encoding, **args)
return self

def print(self, format="turtle", encoding="utf-8", out=None):
Expand Down Expand Up @@ -1276,7 +1273,7 @@ def parse(
... </rdf:Description>
... </rdf:RDF>
... '''
>>> import tempfile
>>> import os, tempfile
>>> fd, file_name = tempfile.mkstemp()
>>> f = os.fdopen(fd, "w")
>>> dummy = f.write(my_data) # Returns num bytes written
Expand Down
24 changes: 11 additions & 13 deletions rdflib/query.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
import itertools
import os
import shutil
import tempfile
import types
import warnings
from io import BytesIO
from typing import IO, TYPE_CHECKING, List, Optional, Union, cast
from urllib.parse import urlparse
from urllib.request import url2pathname

__all__ = [
"Processor",
Expand Down Expand Up @@ -267,16 +265,16 @@ def serialize(
else:
location = cast(str, destination)
scheme, netloc, path, params, query, fragment = urlparse(location)
if netloc != "":
print(
"WARNING: not saving as location" + "is not a local file reference"
)
return None
fd, name = tempfile.mkstemp()
stream = os.fdopen(fd, "wb")
serializer.serialize(stream, encoding=encoding, **args)
stream.close()
shutil.move(name, path)
if scheme == "file":
if netloc != "":
raise ValueError(
f"the file URI {location!r} has an authority component which is not supported"
)
os_path = url2pathname(path)
else:
os_path = location
with open(os_path, "wb") as stream:
serializer.serialize(stream, encoding=encoding, **args)
return None

def __len__(self):
Expand Down
Loading

0 comments on commit 4e90415

Please sign in to comment.