Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Pdfstream as cmap #283

Merged
merged 15 commits into from
Oct 12, 2019
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions pdfminer/cmapdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,15 @@ def decode(self, code):
return ()


class IdentityCMapByte(IdentityCMap):
    """IdentityCMap subclass whose decode() yields one value per byte."""

    def decode(self, code):
        """Unpack ``code`` into a tuple of unsigned byte values.

        :param code: byte string to decode.
        :return: one integer per byte of ``code``; an empty tuple when
            ``code`` is empty.
        """
        length = len(code)
        if not length:
            return ()
        # '>%dB' expands to e.g. '>3B': ``length`` unsigned bytes.
        byte_format = '>%dB' % length
        return struct.unpack(byte_format, code)

## UnicodeMap
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are a lot of these unnecessary code comments in cmapdb.py. Since you are editing this file anyway, could you remove those? And likewise for pdffont.py?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, can do that.

##
class UnicodeMap(CMapBase):
Expand Down Expand Up @@ -252,6 +261,10 @@ def get_cmap(klass, name):
return IdentityCMap(WMode=0)
elif name == 'Identity-V':
return IdentityCMap(WMode=1)
elif name == 'OneByteIdentityH':
return IdentityCMapByte(WMode=0)
elif name == 'OneByteIdentityV':
return IdentityCMapByte(WMode=1)
try:
return klass._cmap_cache[name]
except KeyError:
Expand Down
54 changes: 41 additions & 13 deletions pdfminer/pdffont.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
from .encodingdb import name2unicode
from .fontmetrics import FONT_METRICS
from .pdftypes import PDFException
from .pdftypes import PDFStream
from .pdftypes import resolve1
from .pdftypes import dict_value
from .pdftypes import int_value
from .pdftypes import list_value
Expand Down Expand Up @@ -140,7 +142,13 @@ def do_keyword(self, pos, token):


NIBBLES = ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.', 'e', 'e-', None, '-')

IDENTITY_ENCODER = {'Identity-H':'Identity-H',
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You mentioned that DLIdent-* is not in the PDF reference manual. Did you find other pdf documentation that mentions this? If so, you could add a comment that refers to it such that we do not remove DLIdent-* by accident.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I haven't seen any PDF that uses DLIdent-*, but I have included it because it is harmless. I will also add a code comment so that DLIdent-* is not removed by accident.

'Identity-V':'Identity-V',
'DLIdent-H':'Identity-H',
'DLIdent-V':'Identity-V',
'OneByteIdentityH':'OneByteIdentityH',
'OneByteIdentityV':'OneByteIdentityV',
}

## CFFFont
## (Format specified in Adobe Technical Note: #5176
Expand Down Expand Up @@ -661,18 +669,8 @@ def __init__(self, rsrcmgr, spec, strict=settings.STRICT):
self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {}))
self.cidcoding = '%s-%s' % (resolve1(self.cidsysteminfo.get('Registry', b'unknown')).decode("latin1"),
resolve1(self.cidsysteminfo.get('Ordering', b'unknown')).decode("latin1"))
try:
name = literal_name(spec['Encoding'])
except KeyError:
if strict:
raise PDFFontError('Encoding is unspecified')
name = 'unknown'
try:
self.cmap = CMapDB.get_cmap(name)
except CMapDB.CMapNotFound as e:
if strict:
raise PDFFontError(e)
self.cmap = CMap()
self.cmap = self.get_cmap_from_spec(spec, strict)

try:
descriptor = dict_value(spec['FontDescriptor'])
except KeyError:
Expand Down Expand Up @@ -719,6 +717,36 @@ def __init__(self, rsrcmgr, spec, strict=settings.STRICT):
PDFFont.__init__(self, descriptor, widths, default_width=default_width)
return

def get_cmap_from_spec(self, spec, strict):
    """Return the CMap selected by the font specification's Encoding.

    For certain PDFs the encoding name is not given directly as the value
    of spec['Encoding'] but as the 'CMapName' attribute of a stream stored
    under spec['Encoding'].
    The horizontal/vertical identity encodings are mentioned under
    different names such as 'DLIdent-H/V', 'OneByteIdentityH/V' and
    'Identity-H/V'; IDENTITY_ENCODER maps each of them to the name that
    is handed to CMapDB.get_cmap.  Every other name is handed to
    CMapDB.get_cmap unchanged, so non-identity CMaps are still looked up.

    :param spec: font specification dictionary.
    :param strict: when true, raise PDFFontError instead of falling back.
    :return: the CMap returned by CMapDB.get_cmap, or an empty CMap()
        when the lookup fails and ``strict`` is false.
    :raises PDFFontError: in strict mode, when the encoding or its
        CMapName is missing or CMapDB cannot find the CMap.
    """
    try:
        spec_encoding = spec['Encoding']
        if hasattr(spec_encoding, 'name'):
            # The encoding is given directly as a name literal.
            cmap_name = literal_name(spec_encoding)
        else:
            # Otherwise the name is read from the object's 'CMapName' entry.
            cmap_name = literal_name(spec_encoding['CMapName'])
    except KeyError:
        if strict:
            raise PDFFontError('Encoding is unspecified')
        cmap_name = 'unknown'
    if isinstance(cmap_name, PDFStream):
        if 'CMapName' in cmap_name:
            # literal_name (rather than ``.name``) is used for consistency
            # with the branch above.
            cmap_name = literal_name(cmap_name.get('CMapName'))
        else:
            if strict:
                raise PDFFontError('CMapName unspecified for encoding')
            cmap_name = 'unknown'
    # Translate identity aliases; leave every other name untouched.
    cmap_name = IDENTITY_ENCODER.get(cmap_name, cmap_name)
    try:
        return CMapDB.get_cmap(cmap_name)
    except CMapDB.CMapNotFound as e:
        if strict:
            raise PDFFontError(e)
        return CMap()

def __repr__(self):
return '<PDFCIDFont: basefont=%r, cidcoding=%r>' % (self.basefont, self.cidcoding)

Expand Down
Binary file added samples/sampleOneByteIdentityEncode.pdf
Binary file not shown.
111 changes: 111 additions & 0 deletions tests/test_pdfencoding.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
#!/usr/bin/env python

# -*- coding: utf-8 -*-

import nose, logging, os
from pdfminer.cmapdb import IdentityCMap, CMap, IdentityCMapByte
from pdfminer.pdffont import PDFCIDFont
from pdfminer.pdftypes import PDFStream
from pdfminer.psparser import PSLiteral

class TestPDFEncoding():
    """Check which cmap class PDFCIDFont picks for a given 'Encoding'.

    Each test builds a minimal font spec whose 'Encoding' is either a
    PSLiteral name or a PDFStream carrying a 'CMapName' attribute, then
    asserts the type of the resulting ``font.cmap``.
    """

    def test_cmapname_onebyteidentityV(self):
        stream = PDFStream({'CMapName': PSLiteral('OneByteIdentityV')}, '')
        spec = {'Encoding': stream}
        font = PDFCIDFont(None, spec)
        assert isinstance(font.cmap, IdentityCMapByte)

    def test_cmapname_onebyteidentityH(self):
        stream = PDFStream({'CMapName': PSLiteral('OneByteIdentityH')}, '')
        spec = {'Encoding': stream}
        font = PDFCIDFont(None, spec)
        assert isinstance(font.cmap, IdentityCMapByte)

    def test_cmapname_V(self):
        stream = PDFStream({'CMapName': PSLiteral('V')}, '')
        spec = {'Encoding': stream}
        font = PDFCIDFont(None, spec)
        assert isinstance(font.cmap, CMap)

    def test_cmapname_H(self):
        stream = PDFStream({'CMapName': PSLiteral('H')}, '')
        spec = {'Encoding': stream}
        font = PDFCIDFont(None, spec)
        assert isinstance(font.cmap, CMap)

    def test_encoding_identityH(self):
        spec = {'Encoding': PSLiteral('Identity-H')}
        font = PDFCIDFont(None, spec)
        assert isinstance(font.cmap, IdentityCMap)

    def test_encoding_identityV(self):
        spec = {'Encoding': PSLiteral('Identity-V')}
        font = PDFCIDFont(None, spec)
        assert isinstance(font.cmap, IdentityCMap)

    def test_encoding_identityH_as_PSLiteral_stream(self):
        stream = PDFStream({'CMapName': PSLiteral('Identity-H')}, '')
        spec = {'Encoding': stream}
        font = PDFCIDFont(None, spec)
        assert isinstance(font.cmap, IdentityCMap)

    def test_encoding_identityV_as_PSLiteral_stream(self):
        stream = PDFStream({'CMapName': PSLiteral('Identity-V')}, '')
        spec = {'Encoding': stream}
        font = PDFCIDFont(None, spec)
        assert isinstance(font.cmap, IdentityCMap)

    def test_encoding_identityH_as_stream(self):
        stream = PDFStream({'CMapName': 'Identity-H'}, '')
        spec = {'Encoding': stream}
        font = PDFCIDFont(None, spec)
        assert isinstance(font.cmap, IdentityCMap)

    def test_encoding_identityV_as_stream(self):
        stream = PDFStream({'CMapName': 'Identity-V'}, '')
        spec = {'Encoding': stream}
        font = PDFCIDFont(None, spec)
        assert isinstance(font.cmap, IdentityCMap)

    def test_encoding_DLIdentH(self):
        spec = {'Encoding': PSLiteral('DLIdent-H')}
        font = PDFCIDFont(None, spec)
        assert isinstance(font.cmap, IdentityCMap)

    def test_encoding_DLIdentV(self):
        spec = {'Encoding': PSLiteral('DLIdent-V')}
        font = PDFCIDFont(None, spec)
        assert isinstance(font.cmap, IdentityCMap)

    def test_encoding_DLIdentH_as_PSLiteral_stream(self):
        stream = PDFStream({'CMapName': PSLiteral('DLIdent-H')}, '')
        spec = {'Encoding': stream}
        font = PDFCIDFont(None, spec)
        assert isinstance(font.cmap, IdentityCMap)

    # Named ...DLIdentV...: with the same name as the DLIdent-H test above,
    # this definition would replace it and the DLIdent-H case would never run.
    def test_encoding_DLIdentV_as_PSLiteral_stream(self):
        stream = PDFStream({'CMapName': PSLiteral('DLIdent-V')}, '')
        spec = {'Encoding': stream}
        font = PDFCIDFont(None, spec)
        assert isinstance(font.cmap, IdentityCMap)

    def test_encoding_DLIdentH_as_stream(self):
        stream = PDFStream({'CMapName': 'DLIdent-H'}, '')
        spec = {'Encoding': stream}
        font = PDFCIDFont(None, spec)
        assert isinstance(font.cmap, IdentityCMap)

    def test_encoding_DLIdentV_as_stream(self):
        stream = PDFStream({'CMapName': 'DLIdent-V'}, '')
        spec = {'Encoding': stream}
        font = PDFCIDFont(None, spec)
        assert isinstance(font.cmap, IdentityCMap)

    def test_font_without_spec(self):
        # No 'Encoding' entry at all: the font falls back to a plain CMap.
        font = PDFCIDFont(None, {})
        assert isinstance(font.cmap, CMap)


if __name__ == '__main__':
nose.runmodule()
1 change: 1 addition & 0 deletions tests/test_tools_pdf2txt.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ def test_1(self):
run('../samples/','simple1')
run('../samples/','simple2')
run('../samples/','simple3')
run('../samples/','sampleOneByteIdentityEncode')

def test_2(self):
run('../samples/nonfree/','dmca')
Expand Down