Skip to content

Commit

Permalink
Fix extraction of some cjk characters (#593)
Browse files Browse the repository at this point in the history
Fixes #566 

* try to fix the issue where some Chinese characters cannot be extracted
correctly (#566).

* format code to pass flake8 check.

* fix typo and refer to issue 593.

Co-authored-by: huan_cheng <huan_cheng@bestsign.cn>
Co-authored-by: Pieter Marsman <pietermarsman@gmail.com>
  • Loading branch information
3 people authored Aug 26, 2021
1 parent d821fed commit 234c466
Show file tree
Hide file tree
Showing 7 changed files with 23 additions and 8 deletions.
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -21,5 +21,7 @@ tests/*.txt
# python venv management tools
Pipfile
Pipfile.lock
.noseids
.vscode/
pyproject.toml
poetry.lock
poetry.lock
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
- Fix issue of ValueError and KeyError raised in PDFDocument and PDFParser ([#573](https://github.com/pdfminer/pdfminer.six/pull/574))
- Fix issue of TypeError: cannot unpack non-iterable PDFObjRef object, when unpacking the value of 'DW2' ([#529](https://github.com/pdfminer/pdfminer.six/pull/529))
- `PermissionError` when creating temporary filepaths on windows when running tests ([#469](https://github.com/pdfminer/pdfminer.six/issues/469))
- Fix issue where some Chinese characters cannot be extracted correctly ([#593](https://github.com/pdfminer/pdfminer.six/pull/593))
- Detecting trailer correctly when surrounded with needless whitespace ([#535](https://github.com/pdfminer/pdfminer.six/pull/535))
- Fix `.paint_path` logic for handling single line segments and extracting point-on-curve positions of Bézier path commands ([#530](https://github.com/pdfminer/pdfminer.six/pull/530))

Expand Down
8 changes: 4 additions & 4 deletions pdfminer/cmapdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -338,7 +338,7 @@ def do_keyword(self, pos, token):
if token is self.KEYWORD_ENDCIDRANGE:
objs = [obj for (__, obj) in self.popall()]
for (s, e, cid) in choplist(3, objs):
if (not isinstance(s, str) or not isinstance(e, str) or
if (not isinstance(s, bytes) or not isinstance(e, bytes) or
not isinstance(cid, int) or len(s) != len(e)):
continue
sprefix = s[:-4]
Expand All @@ -352,7 +352,7 @@ def do_keyword(self, pos, token):
vlen = len(svar)
for i in range(e1-s1+1):
x = sprefix+struct.pack('>L', s1+i)[-vlen:]
self.cmap.add_code2cid(x, cid+i)
self.cmap.add_cid2unichr(cid+i, x)
return

if token is self.KEYWORD_BEGINCIDCHAR:
Expand All @@ -361,8 +361,8 @@ def do_keyword(self, pos, token):
if token is self.KEYWORD_ENDCIDCHAR:
objs = [obj for (__, obj) in self.popall()]
for (cid, code) in choplist(2, objs):
if isinstance(code, str) and isinstance(cid, str):
self.cmap.add_code2cid(code, nunpack(cid))
if isinstance(code, bytes) and isinstance(cid, int):
self.cmap.add_cid2unichr(cid, code)
return

if token is self.KEYWORD_BEGINBFRANGE:
Expand Down
4 changes: 2 additions & 2 deletions pdfminer/pdffont.py
Original file line number Diff line number Diff line change
Expand Up @@ -403,9 +403,9 @@ def __init__(self, name, fp):
return

def create_unicode_map(self):
if 'cmap' not in self.tables:
if b'cmap' not in self.tables:
raise TrueTypeFont.CMapNotFound
(base_offset, length) = self.tables['cmap']
(base_offset, length) = self.tables[b'cmap']
fp = self.fp
fp.seek(base_offset)
(version, nsubtables) = struct.unpack('>HH', fp.read(4))
Expand Down
Binary file added samples/contrib/issue_566_test_1.pdf
Binary file not shown.
Binary file added samples/contrib/issue_566_test_2.pdf
Binary file not shown.
14 changes: 13 additions & 1 deletion tests/test_highlevel_extracttext.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,9 @@ def run_with_file(sample_path):
"simple2.pdf": "\f",
"simple3.pdf": "Hello\n\nHello\n\n\n\n\n\n\n\n\n\n\n"
"World\n\nWorld\n\n\f",
"simple4.pdf": "Text1\nText2\nText3\n\n\f"
"simple4.pdf": "Text1\nText2\nText3\n\n\f",
"contrib/issue_566_test_1.pdf": "ISSUE Date:2019-4-25 Buyer:黎荣",
"contrib/issue_566_test_2.pdf": "甲方:中国饮料有限公司(盖章)",
}


Expand Down Expand Up @@ -80,6 +82,16 @@ def test_simple4_with_file(self):
s = run_with_file(test_file)
self.assertEqual(s, test_strings[test_file])

def test_issue_566_cmap_bytes(self):
test_file = "contrib/issue_566_test_1.pdf"
s = run_with_file(test_file)
self.assertEqual(s.strip(), test_strings[test_file])

def test_issue_566_cid_range(self):
test_file = "contrib/issue_566_test_2.pdf"
s = run_with_file(test_file)
self.assertEqual(s.strip(), test_strings[test_file])


class TestExtractPages(unittest.TestCase):
def _get_test_file_path(self):
Expand Down

0 comments on commit 234c466

Please sign in to comment.