Validate xref position explicitly (#980)

pdfminer · Jul 9, 2024 · 3a789a4 · 3a789a4
1 parent 8386822
commit 3a789a4
Show file tree

Hide file tree

Showing 3 changed files with 21 additions and 10 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -11,8 +11,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
 ### Fixed
 
-- `TypeError` when PDF object reference cannot be parsed as int ([#972](https://github.com/pdfminer/pdfminer.six/pull/972))])
-- `TypeError` when PDF literal cannot be converted to str ([#978](https://github.com/pdfminer/pdfminer.six/pull/978))
+- `TypeError` when corrupt PDF object reference cannot be parsed as int ([#972](https://github.com/pdfminer/pdfminer.six/pull/972))
+- `TypeError` when corrupt PDF literal cannot be converted to str ([#978](https://github.com/pdfminer/pdfminer.six/pull/978))
+- `ValueError` when corrupt PDF specifies a negative xref location ([#980](https://github.com/pdfminer/pdfminer.six/pull/980))
 
 ### Removed
 

diff --git a/pdfminer/pdfdocument.py b/pdfminer/pdfdocument.py
@@ -950,19 +950,28 @@ def get_dest(self, name: Union[str, bytes]) -> Any:
     def find_xref(self, parser: PDFParser) -> int:
         """Internal function used to locate the first XRef."""
         # search the last xref table by scanning the file backwards.
-        prev = None
+        prev = b""
         for line in parser.revreadlines():
             line = line.strip()
             log.debug("find_xref: %r", line)
+
             if line == b"startxref":
-                break
+                log.debug("xref found: pos=%r", prev)
+
+                if not prev.isdigit():
+                    raise PDFNoValidXRef(f"Invalid xref position: {prev!r}")
+
+                start = int(prev)
+
+                if not start >= 0:
+                    raise PDFNoValidXRef(f"Invalid negative xref position: {start}")
+
+                return start
+
             if line:
                 prev = line
-        else:
-            raise PDFNoValidXRef("Unexpected EOF")
-        log.debug("xref found: pos=%r", prev)
-        assert prev is not None
-        return int(prev)
+
+        raise PDFNoValidXRef("Unexpected EOF")
 
     # read xref table
     def read_xref_from(

diff --git a/pdfminer/psparser.py b/pdfminer/psparser.py
@@ -1,4 +1,5 @@
 #!/usr/bin/env python3
+import io
 
 # -*- coding: utf-8 -*-
 
@@ -260,7 +261,7 @@ def revreadlines(self) -> Iterator[bytes]:
 
         This is used to locate the trailers at the end of a file.
         """
-        self.fp.seek(0, 2)
+        self.fp.seek(0, io.SEEK_END)
         pos = self.fp.tell()
         buf = b""
         while 0 < pos: