Skip to content

Commit

Permalink
Merge pull request #283 from fakabbir/pdfstream-as-cmap
Browse files Browse the repository at this point in the history
Pdfstream as cmap
  • Loading branch information
tataganesh authored Oct 12, 2019
2 parents 42e2c81 + 7c03d96 commit f53fbd9
Show file tree
Hide file tree
Showing 5 changed files with 172 additions and 70 deletions.
41 changes: 15 additions & 26 deletions pdfminer/cmapdb.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@


""" Adobe character mapping (CMap) support.
CMaps provide the mapping between character codes and Unicode
Expand Down Expand Up @@ -40,8 +38,6 @@ class CMapError(Exception):
pass


## CMapBase
##
class CMapBase(object):

debug = 0
Expand All @@ -67,8 +63,6 @@ def use_cmap(self, cmap):
return


## CMap
##
class CMap(CMapBase):

def __init__(self, **kwargs):
Expand Down Expand Up @@ -119,8 +113,6 @@ def dump(self, out=sys.stdout, code2cid=None, code=None):
return


## IdentityCMap
##
class IdentityCMap(CMapBase):

def decode(self, code):
Expand All @@ -131,8 +123,16 @@ def decode(self, code):
return ()


## UnicodeMap
##
class IdentityCMapByte(IdentityCMap):
    """Identity CMap variant that decodes a code one byte at a time."""

    def decode(self, code):
        """Return a tuple holding one unsigned integer per byte of `code`.

        An empty `code` yields an empty tuple.
        """
        size = len(code)
        if size == 0:
            return ()
        # Big-endian format with `size` unsigned bytes, e.g. '>3B'.
        fmt = '>%dB' % size
        return struct.unpack(fmt, code)


class UnicodeMap(CMapBase):

def __init__(self, **kwargs):
Expand All @@ -153,8 +153,6 @@ def dump(self, out=sys.stdout):
return


## FileCMap
##
class FileCMap(CMap):

def add_code2cid(self, code, cid):
Expand All @@ -173,8 +171,6 @@ def add_code2cid(self, code, cid):
return


## FileUnicodeMap
##
class FileUnicodeMap(UnicodeMap):

def add_cid2unichr(self, cid, code):
Expand All @@ -192,8 +188,6 @@ def add_cid2unichr(self, cid, code):
return


## PyCMap
##
class PyCMap(CMap):

def __init__(self, name, module):
Expand All @@ -204,8 +198,6 @@ def __init__(self, name, module):
return


## PyUnicodeMap
##
class PyUnicodeMap(UnicodeMap):

def __init__(self, name, module, vertical):
Expand All @@ -218,8 +210,6 @@ def __init__(self, name, module, vertical):
return


## CMapDB
##
class CMapDB(object):

_cmap_cache = {}
Expand Down Expand Up @@ -252,6 +242,10 @@ def get_cmap(klass, name):
return IdentityCMap(WMode=0)
elif name == 'Identity-V':
return IdentityCMap(WMode=1)
elif name == 'OneByteIdentityH':
return IdentityCMapByte(WMode=0)
elif name == 'OneByteIdentityV':
return IdentityCMapByte(WMode=1)
try:
return klass._cmap_cache[name]
except KeyError:
Expand All @@ -271,8 +265,6 @@ def get_unicode_map(klass, name, vertical=False):
return umaps[vertical]


## CMapParser
##
class CMapParser(PSStackParser):

def __init__(self, cmap, fp):
Expand Down Expand Up @@ -360,7 +352,6 @@ def do_keyword(self, pos, token):
s1 = nunpack(svar)
e1 = nunpack(evar)
vlen = len(svar)
#assert s1 <= e1, str((s1, e1))
for i in range(e1-s1+1):
x = sprefix+struct.pack('>L', s1+i)[-vlen:]
self.cmap.add_code2cid(x, cid+i)
Expand All @@ -387,7 +378,6 @@ def do_keyword(self, pos, token):
continue
s1 = nunpack(s)
e1 = nunpack(e)
#assert s1 <= e1, str((s1, e1))
if isinstance(code, list):
for i in range(e1-s1+1):
self.cmap.add_cid2unichr(s1+i, code[i])
Expand Down Expand Up @@ -422,17 +412,16 @@ def do_keyword(self, pos, token):
return


# test
def main(argv):
    """Parse each CMap file named on the command line and dump its mapping.

    argv: full argument vector; argv[0] (the program name) is skipped.
    Returns None, so ``sys.exit(main(...))`` exits with status 0.
    """
    for fname in argv[1:]:
        cmap = FileUnicodeMap()
        # The context manager closes the file even if parsing raises.
        with open(fname, 'rb') as fp:
            CMapParser(cmap, fp).run()
        cmap.dump()


if __name__ == '__main__':
sys.exit(main(sys.argv))
89 changes: 45 additions & 44 deletions pdfminer/pdffont.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
from .encodingdb import name2unicode
from .fontmetrics import FONT_METRICS
from .pdftypes import PDFException
from .pdftypes import PDFStream
from .pdftypes import resolve1
from .pdftypes import dict_value
from .pdftypes import int_value
from .pdftypes import list_value
Expand All @@ -33,7 +35,6 @@

log = logging.getLogger(__name__)


def get_widths(seq):
widths = {}
r = []
Expand All @@ -52,10 +53,6 @@ def get_widths(seq):
widths[i] = w
r = []
return widths
#assert get_widths([1]) == {}
#assert get_widths([1,2,3]) == {1:3, 2:3}
#assert get_widths([1,[2,3],6,[7,8]]) == {1:2,2:3, 6:7,7:8}


def get_widths2(seq):
widths = {}
Expand All @@ -75,22 +72,15 @@ def get_widths2(seq):
widths[i] = (w, (vx, vy))
r = []
return widths
#assert get_widths2([1]) == {}
#assert get_widths2([1,2,3,4,5]) == {1:(3, (4,5)), 2:(3, (4,5))}
#assert get_widths2([1,[2,3,4,5],6,[7,8,9]]) == {1:(2, (3,4)), 6:(7, (8,9))}


## FontMetricsDB
##
class FontMetricsDB(object):
    """Accessor for the entries of the FONT_METRICS table."""

    @classmethod
    def get_metrics(klass, fontname):
        """Return the FONT_METRICS entry stored under `fontname`.

        A name that is not present propagates the lookup error raised by
        FONT_METRICS (presumably KeyError; the table is defined elsewhere).
        """
        return FONT_METRICS[fontname]


## Type1FontHeaderParser
##
class Type1FontHeaderParser(PSStackParser):

KEYWORD_BEGIN = KWD(b'begin')
Expand Down Expand Up @@ -141,11 +131,16 @@ def do_keyword(self, pos, token):

NIBBLES = ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.', 'e', 'e-', None, '-')

# Note: DLIdent-* is not found in the PDF Reference, but it is kept because
# it is harmless and is possibly a variant name (inferred from a bug report/PR).
IDENTITY_ENCODER = {'Identity-H':'Identity-H',
'Identity-V':'Identity-V',
'DLIdent-H':'Identity-H',
'DLIdent-V':'Identity-V',
'OneByteIdentityH':'OneByteIdentityH',
'OneByteIdentityV':'OneByteIdentityV',
}

## CFFFont
## (Format specified in Adobe Technical Note: #5176
## "The Compact Font Format Specification")
##
def getdict(data):
d = {}
fp = BytesIO(data)
Expand Down Expand Up @@ -273,6 +268,7 @@ class CFFFont(object):
'Light', 'Medium', 'Regular', 'Roman', 'Semibold',
)


class INDEX(object):

def __init__(self, fp):
Expand Down Expand Up @@ -373,9 +369,6 @@ def __init__(self, name, fp):
assert False, str(('Unhandled', format))
else:
raise ValueError('unsupported charset format: %r' % format)
#print self.code2gid
#print self.name2gid
#assert 0
return

def getstr(self, sid):
Expand All @@ -384,8 +377,6 @@ def getstr(self, sid):
return self.string_index[sid-len(self.STANDARD_STRINGS)]


## TrueTypeFont
##
class TrueTypeFont(object):

class CMapNotFound(Exception):
Expand Down Expand Up @@ -471,8 +462,6 @@ def create_unicode_map(self):
return unicode_map


## Fonts
##
class PDFFontError(PDFException):
    """Raised for font-related problems, e.g. an unspecified Encoding."""
    pass

Expand All @@ -484,7 +473,6 @@ class PDFUnicodeNotDefined(PDFFontError):
LITERAL_TYPE1C = LIT('Type1C')


# PDFFont
class PDFFont(object):

def __init__(self, descriptor, widths, default_width=None):
Expand Down Expand Up @@ -549,7 +537,6 @@ def string_width(self, s):
return sum(self.char_width(cid) for cid in self.decode(s))


# PDFSimpleFont
class PDFSimpleFont(PDFFont):

def __init__(self, descriptor, widths, spec):
Expand Down Expand Up @@ -586,7 +573,6 @@ def to_unichr(self, cid):
raise PDFUnicodeNotDefined(None, cid)


# PDFType1Font
class PDFType1Font(PDFSimpleFont):

def __init__(self, rsrcmgr, spec):
Expand Down Expand Up @@ -618,14 +604,12 @@ def __repr__(self):
return '<PDFType1Font: basefont=%r>' % self.basefont


# PDFTrueTypeFont
class PDFTrueTypeFont(PDFType1Font):
    """PDFType1Font subclass that reports itself as a TrueType font."""

    def __repr__(self):
        """Return a debug string showing this font's basefont."""
        template = '<PDFTrueTypeFont: basefont=%r>'
        return template % self.basefont


# PDFType3Font
class PDFType3Font(PDFSimpleFont):

def __init__(self, rsrcmgr, spec):
Expand All @@ -648,7 +632,6 @@ def __repr__(self):
return '<PDFType3Font>'


# PDFCIDFont
class PDFCIDFont(PDFFont):

def __init__(self, rsrcmgr, spec, strict=settings.STRICT):
Expand All @@ -661,18 +644,8 @@ def __init__(self, rsrcmgr, spec, strict=settings.STRICT):
self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {}))
self.cidcoding = '%s-%s' % (resolve1(self.cidsysteminfo.get('Registry', b'unknown')).decode("latin1"),
resolve1(self.cidsysteminfo.get('Ordering', b'unknown')).decode("latin1"))
try:
name = literal_name(spec['Encoding'])
except KeyError:
if strict:
raise PDFFontError('Encoding is unspecified')
name = 'unknown'
try:
self.cmap = CMapDB.get_cmap(name)
except CMapDB.CMapNotFound as e:
if strict:
raise PDFFontError(e)
self.cmap = CMap()
self.cmap = self.get_cmap_from_spec(spec, strict)

try:
descriptor = dict_value(spec['FontDescriptor'])
except KeyError:
Expand Down Expand Up @@ -719,6 +692,36 @@ def __init__(self, rsrcmgr, spec, strict=settings.STRICT):
PDFFont.__init__(self, descriptor, widths, default_width=default_width)
return

def get_cmap_from_spec(self, spec, strict):
    """Return the CMap selected by the Encoding entry of a font spec.

    In some PDFs the encoding name is not given directly as the value of
    Encoding but as the CMapName attribute of spec['Encoding'].
    The horizontal/vertical identity encodings also appear under several
    names, such as 'DLIdent-H/V', 'OneByteIdentityH/V' and 'Identity-H/V';
    these are normalized through IDENTITY_ENCODER.

    spec: the font dictionary.
    strict: when true, raise PDFFontError if the encoding is unspecified
        or the named CMap cannot be found; otherwise fall back to an
        empty CMap.
    """
    try:
        spec_encoding = spec['Encoding']
        if hasattr(spec_encoding, 'name'):
            cmap_name = literal_name(spec_encoding)
        else:
            cmap_name = literal_name(spec_encoding['CMapName'])
    except KeyError:
        if strict:
            raise PDFFontError('Encoding is unspecified')
        cmap_name = 'unknown'
    if isinstance(cmap_name, PDFStream):
        if 'CMapName' in cmap_name:
            cmap_name = cmap_name.get('CMapName').name
        else:
            if strict:
                raise PDFFontError('CMapName unspecified for encoding')
            cmap_name = 'unknown'
    # Map identity aliases to their canonical names.  Any other name is
    # looked up as-is, so a named (non-identity) CMap is loaded instead of
    # being silently replaced by an empty CMap.
    cmap_name = IDENTITY_ENCODER.get(cmap_name, cmap_name)
    try:
        return CMapDB.get_cmap(cmap_name)
    except CMapDB.CMapNotFound as e:
        if strict:
            raise PDFFontError(e)
        return CMap()

def __repr__(self):
return '<PDFCIDFont: basefont=%r, cidcoding=%r>' % (self.basefont, self.cidcoding)

Expand All @@ -743,16 +746,14 @@ def to_unichr(self, cid):
except KeyError:
raise PDFUnicodeNotDefined(self.cidcoding, cid)


# main
def main(argv):
    """Parse each file named on the command line as a CFF font and print it.

    argv: full argument vector; argv[0] (the program name) is skipped.
    Returns None, so ``sys.exit(main(...))`` exits with status 0.
    """
    for fname in argv[1:]:
        # The context manager closes the file even if font parsing raises.
        with open(fname, 'rb') as fp:
            font = CFFFont(fname, fp)
            print(font)


if __name__ == '__main__':
sys.exit(main(sys.argv))
Binary file added samples/sampleOneByteIdentityEncode.pdf
Binary file not shown.
Loading

0 comments on commit f53fbd9

Please sign in to comment.