Skip to content

Commit

Permalink
Fix GitHub issue pdfminer#566.
Browse files (browse the repository at this point in the history)
  • Loading branch information
huan_cheng committed Feb 26, 2021
1 parent 2f7eddb commit eb0d675
Show file tree
Hide file tree
Showing 4 changed files with 34 additions and 7 deletions.
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,6 @@ tests/*.txt
.idea/
.tox/
Pipfile
Pipfile.lock
Pipfile.lock
.noseids
.vscode/
8 changes: 4 additions & 4 deletions pdfminer/cmapdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -338,7 +338,7 @@ def do_keyword(self, pos, token):
if token is self.KEYWORD_ENDCIDRANGE:
objs = [obj for (__, obj) in self.popall()]
for (s, e, cid) in choplist(3, objs):
if (not isinstance(s, str) or not isinstance(e, str) or
if (not isinstance(s, bytes) or not isinstance(e, bytes) or
not isinstance(cid, int) or len(s) != len(e)):
continue
sprefix = s[:-4]
Expand All @@ -352,7 +352,7 @@ def do_keyword(self, pos, token):
vlen = len(svar)
for i in range(e1-s1+1):
x = sprefix+struct.pack('>L', s1+i)[-vlen:]
self.cmap.add_code2cid(x, cid+i)
self.cmap.add_cid2unichr(cid+i, x)
return

if token is self.KEYWORD_BEGINCIDCHAR:
Expand All @@ -361,8 +361,8 @@ def do_keyword(self, pos, token):
if token is self.KEYWORD_ENDCIDCHAR:
objs = [obj for (__, obj) in self.popall()]
for (cid, code) in choplist(2, objs):
if isinstance(code, str) and isinstance(cid, str):
self.cmap.add_code2cid(code, nunpack(cid))
if isinstance(code, bytes) and isinstance(cid, bytes):
self.cmap.add_cid2unichr(cid, code)
return

if token is self.KEYWORD_BEGINBFRANGE:
Expand Down
4 changes: 2 additions & 2 deletions pdfminer/pdffont.py
Original file line number Diff line number Diff line change
Expand Up @@ -403,9 +403,9 @@ def __init__(self, name, fp):
return

def create_unicode_map(self):
if 'cmap' not in self.tables:
if b'cmap' not in self.tables:
raise TrueTypeFont.CMapNotFound
(base_offset, length) = self.tables['cmap']
(base_offset, length) = self.tables[b'cmap']
fp = self.fp
fp.seek(base_offset)
(version, nsubtables) = struct.unpack('>HH', fp.read(4))
Expand Down
25 changes: 25 additions & 0 deletions tests/test_hcheng.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
"""Ad-hoc smoke script: extract the text of a single PDF and print it.

The script runs at import time. It prints ``sys.path``, switches pdfminer's
``STRICT`` setting on, runs ``high_level.extract_text`` on ``pdf_file`` and
prints the extracted text followed by a completion marker.
"""
import sys

from pdfminer import high_level
from pdfminer import settings

print(sys.path)

# Assign through the module object. ``from pdfminer.settings import STRICT``
# followed by ``STRICT = True`` would only rebind a global of this script and
# leave ``pdfminer.settings.STRICT`` untouched, so the flag would never be
# visible to code that looks it up on the ``settings`` module.
settings.STRICT = True

# Hard-coded sample document; point this at a local PDF before running.
pdf_file = '/tmp/aa.pdf'

text = high_level.extract_text(
    pdf_file,
    password='',
    maxpages=0,
    caching=True,
    codec='utf-8',
    laparams=None,
)
print(text)

print("-------> Done!")

0 comments on commit eb0d675

Please sign in to comment.