Fix extraction of some CJK characters (#593)

Fixes #566 * Try to fix the issue where some Chinese characters cannot be extracted correctly (#566). * Format code to pass the flake8 check. * Fix typo and refer to #593. Co-authored-by: huan_cheng <huan_cheng@bestsign.cn> Co-authored-by: Pieter Marsman <pietermarsman@gmail.com>
pdfminer · Aug 26, 2021 · 234c466 · 234c466
1 parent d821fed
commit 234c466
Show file tree

Hide file tree

Showing 7 changed files with 23 additions and 8 deletions.
diff --git a/.gitignore b/.gitignore
@@ -21,5 +21,7 @@ tests/*.txt
 # python venv management tools
 Pipfile
 Pipfile.lock
+.noseids
+.vscode/
 pyproject.toml
-poetry.lock
+poetry.lock
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -12,6 +12,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 - Fix issue of ValueError and KeyError rasied in PDFdocument and PDFparser ([#573](https://github.com/pdfminer/pdfminer.six/pull/574))
 - Fix issue of TypeError: cannot unpack non-iterable PDFObjRef object, when unpacking the value of 'DW2' ([#529](https://github.com/pdfminer/pdfminer.six/pull/529))
 - `PermissionError` when creating temporary filepaths on windows when running tests ([#469](https://github.com/pdfminer/pdfminer.six/issues/469))
+- Fix issue of some Chinese characters can not be extracted correctly ([#593](https://github.com/pdfminer/pdfminer.six/pull/593))
 - Detecting trailer correctly when surrounded with needless whitespace ([#535](https://github.com/pdfminer/pdfminer.six/pull/535))
 - Fix `.paint_path` logic for handling single line segments and extracting point-on-curve positions of Beziér path commands ([#530](https://github.com/pdfminer/pdfminer.six/pull/530))
 

diff --git a/pdfminer/cmapdb.py b/pdfminer/cmapdb.py
@@ -338,7 +338,7 @@ def do_keyword(self, pos, token):
         if token is self.KEYWORD_ENDCIDRANGE:
             objs = [obj for (__, obj) in self.popall()]
             for (s, e, cid) in choplist(3, objs):
-                if (not isinstance(s, str) or not isinstance(e, str) or
+                if (not isinstance(s, bytes) or not isinstance(e, bytes) or
                    not isinstance(cid, int) or len(s) != len(e)):
                     continue
                 sprefix = s[:-4]
@@ -352,7 +352,7 @@ def do_keyword(self, pos, token):
                 vlen = len(svar)
                 for i in range(e1-s1+1):
                     x = sprefix+struct.pack('>L', s1+i)[-vlen:]
-                    self.cmap.add_code2cid(x, cid+i)
+                    self.cmap.add_cid2unichr(cid+i, x)
             return
 
         if token is self.KEYWORD_BEGINCIDCHAR:
@@ -361,8 +361,8 @@ def do_keyword(self, pos, token):
         if token is self.KEYWORD_ENDCIDCHAR:
             objs = [obj for (__, obj) in self.popall()]
             for (cid, code) in choplist(2, objs):
-                if isinstance(code, str) and isinstance(cid, str):
-                    self.cmap.add_code2cid(code, nunpack(cid))
+                if isinstance(code, bytes) and isinstance(cid, int):
+                    self.cmap.add_cid2unichr(cid, code)
             return
 
         if token is self.KEYWORD_BEGINBFRANGE:

diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py
@@ -403,9 +403,9 @@ def __init__(self, name, fp):
         return
 
     def create_unicode_map(self):
-        if 'cmap' not in self.tables:
+        if b'cmap' not in self.tables:
             raise TrueTypeFont.CMapNotFound
-        (base_offset, length) = self.tables['cmap']
+        (base_offset, length) = self.tables[b'cmap']
         fp = self.fp
         fp.seek(base_offset)
         (version, nsubtables) = struct.unpack('>HH', fp.read(4))

diff --git a/samples/contrib/issue_566_test_1.pdf b/samples/contrib/issue_566_test_1.pdf
diff --git a/samples/contrib/issue_566_test_2.pdf b/samples/contrib/issue_566_test_2.pdf
diff --git a/tests/test_highlevel_extracttext.py b/tests/test_highlevel_extracttext.py
@@ -30,7 +30,9 @@ def run_with_file(sample_path):
     "simple2.pdf": "\f",
     "simple3.pdf": "Hello\n\nHello\nあ\nい\nう\nえ\nお\nあ\nい\nう\nえ\nお\n"
                    "World\n\nWorld\n\n\f",
-    "simple4.pdf": "Text1\nText2\nText3\n\n\f"
+    "simple4.pdf": "Text1\nText2\nText3\n\n\f",
+    "contrib/issue_566_test_1.pdf": "ISSUE Date：2019-4-25 Buyer：黎荣",
+    "contrib/issue_566_test_2.pdf": "甲方：中国饮料有限公司（盖章）",
 }
 
 
@@ -80,6 +82,16 @@ def test_simple4_with_file(self):
         s = run_with_file(test_file)
         self.assertEqual(s, test_strings[test_file])
 
+    def test_issue_566_cmap_bytes(self):
+        test_file = "contrib/issue_566_test_1.pdf"
+        s = run_with_file(test_file)
+        self.assertEqual(s.strip(), test_strings[test_file])
+
+    def test_issue_566_cid_range(self):
+        test_file = "contrib/issue_566_test_2.pdf"
+        s = run_with_file(test_file)
+        self.assertEqual(s.strip(), test_strings[test_file])
+
 
 class TestExtractPages(unittest.TestCase):
     def _get_test_file_path(self):