Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Pdfstream as cmap #283

Merged
merged 15 commits into from
Oct 12, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 15 additions & 26 deletions pdfminer/cmapdb.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@


""" Adobe character mapping (CMap) support.

CMaps provide the mapping between character codes and Unicode
Expand Down Expand Up @@ -40,8 +38,6 @@ class CMapError(Exception):
pass


## CMapBase
##
class CMapBase(object):

debug = 0
Expand All @@ -67,8 +63,6 @@ def use_cmap(self, cmap):
return


## CMap
##
class CMap(CMapBase):

def __init__(self, **kwargs):
Expand Down Expand Up @@ -119,8 +113,6 @@ def dump(self, out=sys.stdout, code2cid=None, code=None):
return


## IdentityCMap
##
class IdentityCMap(CMapBase):

def decode(self, code):
Expand All @@ -131,8 +123,16 @@ def decode(self, code):
return ()


## UnicodeMap
##
class IdentityCMapByte(IdentityCMap):
    """Identity CMap variant whose ``decode`` reads one byte per code."""

    def decode(self, code):
        """Unpack ``code`` into a tuple of unsigned byte values.

        An empty ``code`` yields an empty tuple.
        """
        length = len(code)
        if not length:
            return ()
        # '>%dB' expands to e.g. '>3B': ``length`` unsigned bytes.
        return struct.unpack('>%dB' % length, code)


class UnicodeMap(CMapBase):

def __init__(self, **kwargs):
Expand All @@ -153,8 +153,6 @@ def dump(self, out=sys.stdout):
return


## FileCMap
##
class FileCMap(CMap):

def add_code2cid(self, code, cid):
Expand All @@ -173,8 +171,6 @@ def add_code2cid(self, code, cid):
return


## FileUnicodeMap
##
class FileUnicodeMap(UnicodeMap):

def add_cid2unichr(self, cid, code):
Expand All @@ -192,8 +188,6 @@ def add_cid2unichr(self, cid, code):
return


## PyCMap
##
class PyCMap(CMap):

def __init__(self, name, module):
Expand All @@ -204,8 +198,6 @@ def __init__(self, name, module):
return


## PyUnicodeMap
##
class PyUnicodeMap(UnicodeMap):

def __init__(self, name, module, vertical):
Expand All @@ -218,8 +210,6 @@ def __init__(self, name, module, vertical):
return


## CMapDB
##
class CMapDB(object):

_cmap_cache = {}
Expand Down Expand Up @@ -252,6 +242,10 @@ def get_cmap(klass, name):
return IdentityCMap(WMode=0)
elif name == 'Identity-V':
return IdentityCMap(WMode=1)
elif name == 'OneByteIdentityH':
return IdentityCMapByte(WMode=0)
elif name == 'OneByteIdentityV':
return IdentityCMapByte(WMode=1)
try:
return klass._cmap_cache[name]
except KeyError:
Expand All @@ -271,8 +265,6 @@ def get_unicode_map(klass, name, vertical=False):
return umaps[vertical]


## CMapParser
##
class CMapParser(PSStackParser):

def __init__(self, cmap, fp):
Expand Down Expand Up @@ -360,7 +352,6 @@ def do_keyword(self, pos, token):
s1 = nunpack(svar)
e1 = nunpack(evar)
vlen = len(svar)
#assert s1 <= e1, str((s1, e1))
for i in range(e1-s1+1):
x = sprefix+struct.pack('>L', s1+i)[-vlen:]
self.cmap.add_code2cid(x, cid+i)
Expand All @@ -387,7 +378,6 @@ def do_keyword(self, pos, token):
continue
s1 = nunpack(s)
e1 = nunpack(e)
#assert s1 <= e1, str((s1, e1))
if isinstance(code, list):
for i in range(e1-s1+1):
self.cmap.add_cid2unichr(s1+i, code[i])
Expand Down Expand Up @@ -422,17 +412,16 @@ def do_keyword(self, pos, token):
return


# test
def main(argv):
    """Parse each CMap file named on the command line and dump its map.

    :param argv: full argument vector; ``argv[0]`` is skipped and every
        remaining entry is opened as a binary file.
    :return: None, so ``sys.exit(main(...))`` reports success.
    """
    for fname in argv[1:]:
        # The context manager closes the file even when parsing raises,
        # which the manual open()/close() pair did not guarantee.
        with open(fname, 'rb') as fp:
            cmap = FileUnicodeMap()
            CMapParser(cmap, fp).run()
        cmap.dump()
    return


if __name__ == '__main__':
    sys.exit(main(sys.argv))
89 changes: 45 additions & 44 deletions pdfminer/pdffont.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
from .encodingdb import name2unicode
from .fontmetrics import FONT_METRICS
from .pdftypes import PDFException
from .pdftypes import PDFStream
from .pdftypes import resolve1
from .pdftypes import dict_value
from .pdftypes import int_value
from .pdftypes import list_value
Expand All @@ -33,7 +35,6 @@

log = logging.getLogger(__name__)


def get_widths(seq):
widths = {}
r = []
Expand All @@ -52,10 +53,6 @@ def get_widths(seq):
widths[i] = w
r = []
return widths
#assert get_widths([1]) == {}
#assert get_widths([1,2,3]) == {1:3, 2:3}
#assert get_widths([1,[2,3],6,[7,8]]) == {1:2,2:3, 6:7,7:8}


def get_widths2(seq):
widths = {}
Expand All @@ -75,22 +72,15 @@ def get_widths2(seq):
widths[i] = (w, (vx, vy))
r = []
return widths
#assert get_widths2([1]) == {}
#assert get_widths2([1,2,3,4,5]) == {1:(3, (4,5)), 2:(3, (4,5))}
#assert get_widths2([1,[2,3,4,5],6,[7,8,9]]) == {1:(2, (3,4)), 6:(7, (8,9))}


## FontMetricsDB
##
class FontMetricsDB(object):
    """Lookup facade over the ``FONT_METRICS`` table."""

    @classmethod
    def get_metrics(cls, fontname):
        """Return the ``FONT_METRICS`` entry stored under ``fontname``.

        A missing ``fontname`` propagates the ``KeyError`` from the lookup.
        """
        metrics = FONT_METRICS[fontname]
        return metrics


## Type1FontHeaderParser
##
class Type1FontHeaderParser(PSStackParser):

KEYWORD_BEGIN = KWD(b'begin')
Expand Down Expand Up @@ -141,11 +131,16 @@ def do_keyword(self, pos, token):

NIBBLES = ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.', 'e', 'e-', None, '-')

# Note: DLIdent-* isn't found in the PDF Reference, but it has been kept
# because it is harmless and might be a valid type. (Derived from a bug report/PR.)
IDENTITY_ENCODER = {'Identity-H':'Identity-H',
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You mentioned that DLIdent-* is not in the PDF reference manual. Did you find other pdf documentation that mentions this? If so, you could add a comment that refers to it such that we do not remove DLIdent-* by accident.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I haven't seen any pdf with DLIdent-* but have included it due to its harmless nature. Will also include code comment to avoid DLIdent-* being removed by accident.

'Identity-V':'Identity-V',
'DLIdent-H':'Identity-H',
'DLIdent-V':'Identity-V',
'OneByteIdentityH':'OneByteIdentityH',
'OneByteIdentityV':'OneByteIdentityV',
}

## CFFFont
## (Format specified in Adobe Technical Note: #5176
## "The Compact Font Format Specification")
##
def getdict(data):
d = {}
fp = BytesIO(data)
Expand Down Expand Up @@ -273,6 +268,7 @@ class CFFFont(object):
'Light', 'Medium', 'Regular', 'Roman', 'Semibold',
)


class INDEX(object):

def __init__(self, fp):
Expand Down Expand Up @@ -373,9 +369,6 @@ def __init__(self, name, fp):
assert False, str(('Unhandled', format))
else:
raise ValueError('unsupported charset format: %r' % format)
#print self.code2gid
#print self.name2gid
#assert 0
return

def getstr(self, sid):
Expand All @@ -384,8 +377,6 @@ def getstr(self, sid):
return self.string_index[sid-len(self.STANDARD_STRINGS)]


## TrueTypeFont
##
class TrueTypeFont(object):

class CMapNotFound(Exception):
Expand Down Expand Up @@ -471,8 +462,6 @@ def create_unicode_map(self):
return unicode_map


## Fonts
##
class PDFFontError(PDFException):
pass

Expand All @@ -484,7 +473,6 @@ class PDFUnicodeNotDefined(PDFFontError):
LITERAL_TYPE1C = LIT('Type1C')


# PDFFont
class PDFFont(object):

def __init__(self, descriptor, widths, default_width=None):
Expand Down Expand Up @@ -549,7 +537,6 @@ def string_width(self, s):
return sum(self.char_width(cid) for cid in self.decode(s))


# PDFSimpleFont
class PDFSimpleFont(PDFFont):

def __init__(self, descriptor, widths, spec):
Expand Down Expand Up @@ -586,7 +573,6 @@ def to_unichr(self, cid):
raise PDFUnicodeNotDefined(None, cid)


# PDFType1Font
class PDFType1Font(PDFSimpleFont):

def __init__(self, rsrcmgr, spec):
Expand Down Expand Up @@ -618,14 +604,12 @@ def __repr__(self):
return '<PDFType1Font: basefont=%r>' % self.basefont


# PDFTrueTypeFont
class PDFTrueTypeFont(PDFType1Font):

def __repr__(self):
return '<PDFTrueTypeFont: basefont=%r>' % self.basefont


# PDFType3Font
class PDFType3Font(PDFSimpleFont):

def __init__(self, rsrcmgr, spec):
Expand All @@ -648,7 +632,6 @@ def __repr__(self):
return '<PDFType3Font>'


# PDFCIDFont
class PDFCIDFont(PDFFont):

def __init__(self, rsrcmgr, spec, strict=settings.STRICT):
Expand All @@ -661,18 +644,8 @@ def __init__(self, rsrcmgr, spec, strict=settings.STRICT):
self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {}))
self.cidcoding = '%s-%s' % (resolve1(self.cidsysteminfo.get('Registry', b'unknown')).decode("latin1"),
resolve1(self.cidsysteminfo.get('Ordering', b'unknown')).decode("latin1"))
try:
name = literal_name(spec['Encoding'])
except KeyError:
if strict:
raise PDFFontError('Encoding is unspecified')
name = 'unknown'
try:
self.cmap = CMapDB.get_cmap(name)
except CMapDB.CMapNotFound as e:
if strict:
raise PDFFontError(e)
self.cmap = CMap()
self.cmap = self.get_cmap_from_spec(spec, strict)

try:
descriptor = dict_value(spec['FontDescriptor'])
except KeyError:
Expand Down Expand Up @@ -719,6 +692,36 @@ def __init__(self, rsrcmgr, spec, strict=settings.STRICT):
PDFFont.__init__(self, descriptor, widths, default_width=default_width)
return

def get_cmap_from_spec(self, spec, strict):
    """Return the CMap named by the font specification's ``Encoding``.

    For certain PDFs the encoding name is not given directly by
    ``spec['Encoding']`` but by the ``CMapName`` attribute of that
    entry.  The horizontal/vertical identity modes also appear under
    several names ('DLIdent-H/V', 'OneByteIdentityH/V', 'Identity-H/V');
    ``IDENTITY_ENCODER`` maps those aliases onto one canonical name.

    :param spec: font specification; must support ``spec['Encoding']``.
    :param strict: when true, raise instead of falling back to defaults.
    :return: the CMap that ``CMapDB`` holds for the resolved name, or an
        empty ``CMap`` when the name cannot be resolved and ``strict``
        is false.
    :raises PDFFontError: in strict mode, when the encoding or its
        ``CMapName`` is missing, or ``CMapDB`` does not know the name.
    """
    cmap_name = 'unknown'
    try:
        spec_encoding = spec['Encoding']
        if hasattr(spec_encoding, 'name'):
            cmap_name = literal_name(spec_encoding)
        else:
            cmap_name = literal_name(spec_encoding['CMapName'])
    except KeyError:
        if strict:
            raise PDFFontError('Encoding is unspecified')

    # isinstance (rather than an exact type comparison) so that PDFStream
    # subclasses are treated as streams as well.
    if isinstance(cmap_name, PDFStream):
        if 'CMapName' in cmap_name:
            cmap_name = cmap_name.get('CMapName').name
        else:
            if strict:
                raise PDFFontError('CMapName unspecified for encoding')
            cmap_name = 'unknown'

    # Canonicalise the identity aliases, but pass every other name through
    # unchanged so that CMaps outside the identity family are still looked
    # up in CMapDB instead of being silently replaced by an empty CMap.
    cmap_name = IDENTITY_ENCODER.get(cmap_name, cmap_name)
    try:
        return CMapDB.get_cmap(cmap_name)
    except CMapDB.CMapNotFound as e:
        if strict:
            raise PDFFontError(e)
        return CMap()

def __repr__(self):
return '<PDFCIDFont: basefont=%r, cidcoding=%r>' % (self.basefont, self.cidcoding)

Expand All @@ -743,16 +746,14 @@ def to_unichr(self, cid):
except KeyError:
raise PDFUnicodeNotDefined(self.cidcoding, cid)


# main
def main(argv):
    """Load each file named on the command line as a CFF font and print it.

    :param argv: full argument vector; ``argv[0]`` is skipped and every
        remaining entry is opened as a binary file.
    :return: None, so ``sys.exit(main(...))`` reports success.
    """
    for fname in argv[1:]:
        # The context manager closes the file even when font parsing
        # raises, which the manual open()/close() pair did not guarantee.
        with open(fname, 'rb') as fp:
            font = CFFFont(fname, fp)
            print(font)
    return


if __name__ == '__main__':
    sys.exit(main(sys.argv))
Binary file added samples/sampleOneByteIdentityEncode.pdf
Binary file not shown.
Loading