Merge pull request #283 from fakabbir/pdfstream-as-cmap

Pdfstream as cmap
pdfminer · Oct 12, 2019 · f53fbd9 · f53fbd9
2 parents 42e2c81 + 7c03d96
commit f53fbd9
Show file tree

Hide file tree

Showing 5 changed files with 172 additions and 70 deletions.
diff --git a/pdfminer/cmapdb.py b/pdfminer/cmapdb.py
@@ -1,5 +1,3 @@
-
-
 """ Adobe character mapping (CMap) support.
 
 CMaps provide the mapping between character codes and Unicode
@@ -40,8 +38,6 @@ class CMapError(Exception):
     pass
 
 
-##  CMapBase
-##
 class CMapBase(object):
 
     debug = 0
@@ -67,8 +63,6 @@ def use_cmap(self, cmap):
         return
 
 
-##  CMap
-##
 class CMap(CMapBase):
 
     def __init__(self, **kwargs):
@@ -119,8 +113,6 @@ def dump(self, out=sys.stdout, code2cid=None, code=None):
         return
 
 
-##  IdentityCMap
-##
 class IdentityCMap(CMapBase):
 
     def decode(self, code):
@@ -131,8 +123,16 @@ def decode(self, code):
             return ()
 
 
-##  UnicodeMap
-##
+class IdentityCMapByte(IdentityCMap):
+    """Identity CMap that reads each byte of a string as one code."""
+
+    def decode(self, code):
+        """Return a tuple holding the unsigned value of every byte."""
+        size = len(code)
+        fmt = '>%dB' % size
+        return struct.unpack(fmt, code) if size else ()
+
+
 class UnicodeMap(CMapBase):
 
     def __init__(self, **kwargs):
@@ -153,8 +153,6 @@ def dump(self, out=sys.stdout):
         return
 
 
-##  FileCMap
-##
 class FileCMap(CMap):
 
     def add_code2cid(self, code, cid):
@@ -173,8 +171,6 @@ def add_code2cid(self, code, cid):
         return
 
 
-##  FileUnicodeMap
-##
 class FileUnicodeMap(UnicodeMap):
 
     def add_cid2unichr(self, cid, code):
@@ -192,8 +188,6 @@ def add_cid2unichr(self, cid, code):
         return
 
 
-##  PyCMap
-##
 class PyCMap(CMap):
 
     def __init__(self, name, module):
@@ -204,8 +198,6 @@ def __init__(self, name, module):
         return
 
 
-##  PyUnicodeMap
-##
 class PyUnicodeMap(UnicodeMap):
 
     def __init__(self, name, module, vertical):
@@ -218,8 +210,6 @@ def __init__(self, name, module, vertical):
         return
 
 
-##  CMapDB
-##
 class CMapDB(object):
 
     _cmap_cache = {}
@@ -252,6 +242,10 @@ def get_cmap(klass, name):
             return IdentityCMap(WMode=0)
         elif name == 'Identity-V':
             return IdentityCMap(WMode=1)
+        elif name == 'OneByteIdentityH':
+            return IdentityCMapByte(WMode=0)
+        elif name == 'OneByteIdentityV':
+            return IdentityCMapByte(WMode=1)
         try:
             return klass._cmap_cache[name]
         except KeyError:
@@ -271,8 +265,6 @@ def get_unicode_map(klass, name, vertical=False):
         return umaps[vertical]
 
 
-##  CMapParser
-##
 class CMapParser(PSStackParser):
 
     def __init__(self, cmap, fp):
@@ -360,7 +352,6 @@ def do_keyword(self, pos, token):
                 s1 = nunpack(svar)
                 e1 = nunpack(evar)
                 vlen = len(svar)
-                #assert s1 <= e1, str((s1, e1))
                 for i in range(e1-s1+1):
                     x = sprefix+struct.pack('>L', s1+i)[-vlen:]
                     self.cmap.add_code2cid(x, cid+i)
@@ -387,7 +378,6 @@ def do_keyword(self, pos, token):
                         continue
                 s1 = nunpack(s)
                 e1 = nunpack(e)
-                #assert s1 <= e1, str((s1, e1))
                 if isinstance(code, list):
                     for i in range(e1-s1+1):
                         self.cmap.add_cid2unichr(s1+i, code[i])
@@ -422,17 +412,16 @@ def do_keyword(self, pos, token):
         return
 
 
-# test
 def main(argv):
     args = argv[1:]
     for fname in args:
         fp = open(fname, 'rb')
         cmap = FileUnicodeMap()
-        #cmap = FileCMap()
         CMapParser(cmap, fp).run()
         fp.close()
         cmap.dump()
     return
 
+
 if __name__ == '__main__':
     sys.exit(main(sys.argv))
diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py
@@ -14,6 +14,8 @@
 from .encodingdb import name2unicode
 from .fontmetrics import FONT_METRICS
 from .pdftypes import PDFException
+from .pdftypes import PDFStream
+from .pdftypes import resolve1
 from .pdftypes import dict_value
 from .pdftypes import int_value
 from .pdftypes import list_value
@@ -33,7 +35,6 @@
 
 log = logging.getLogger(__name__)
 
-
 def get_widths(seq):
     widths = {}
     r = []
@@ -52,10 +53,6 @@ def get_widths(seq):
                     widths[i] = w
                 r = []
     return widths
-#assert get_widths([1]) == {}
-#assert get_widths([1,2,3]) == {1:3, 2:3}
-#assert get_widths([1,[2,3],6,[7,8]]) == {1:2,2:3, 6:7,7:8}
-
 
 def get_widths2(seq):
     widths = {}
@@ -75,22 +72,15 @@ def get_widths2(seq):
                     widths[i] = (w, (vx, vy))
                 r = []
     return widths
-#assert get_widths2([1]) == {}
-#assert get_widths2([1,2,3,4,5]) == {1:(3, (4,5)), 2:(3, (4,5))}
-#assert get_widths2([1,[2,3,4,5],6,[7,8,9]]) == {1:(2, (3,4)), 6:(7, (8,9))}
 
 
-##  FontMetricsDB
-##
 class FontMetricsDB(object):
 
     @classmethod
     def get_metrics(klass, fontname):
         return FONT_METRICS[fontname]
 
 
-##  Type1FontHeaderParser
-##
 class Type1FontHeaderParser(PSStackParser):
 
     KEYWORD_BEGIN = KWD(b'begin')
@@ -141,11 +131,16 @@ def do_keyword(self, pos, token):
 
 NIBBLES = ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.', 'e', 'e-', None, '-')
 
+# Note: DLIdent-* isn't found in the PDF Reference but has been kept because
+# it is harmless and may possibly be a type. (Induced from a bug report/PR.)
+IDENTITY_ENCODER = {'Identity-H':'Identity-H',
+                    'Identity-V':'Identity-V',
+                    'DLIdent-H':'Identity-H',
+                    'DLIdent-V':'Identity-V',
+                    'OneByteIdentityH':'OneByteIdentityH',
+                    'OneByteIdentityV':'OneByteIdentityV',
+                    }
 
-##  CFFFont
-##  (Format specified in Adobe Technical Note: #5176
-##   "The Compact Font Format Specification")
-##
 def getdict(data):
     d = {}
     fp = BytesIO(data)
@@ -273,6 +268,7 @@ class CFFFont(object):
       'Light', 'Medium', 'Regular', 'Roman', 'Semibold',
     )
 
+
     class INDEX(object):
 
         def __init__(self, fp):
@@ -373,9 +369,6 @@ def __init__(self, name, fp):
             assert False, str(('Unhandled', format))
         else:
             raise ValueError('unsupported charset format: %r' % format)
-        #print self.code2gid
-        #print self.name2gid
-        #assert 0
         return
 
     def getstr(self, sid):
@@ -384,8 +377,6 @@ def getstr(self, sid):
         return self.string_index[sid-len(self.STANDARD_STRINGS)]
 
 
-##  TrueTypeFont
-##
 class TrueTypeFont(object):
 
     class CMapNotFound(Exception):
@@ -471,8 +462,6 @@ def create_unicode_map(self):
         return unicode_map
 
 
-##  Fonts
-##
 class PDFFontError(PDFException):
     pass
 
@@ -484,7 +473,6 @@ class PDFUnicodeNotDefined(PDFFontError):
 LITERAL_TYPE1C = LIT('Type1C')
 
 
-# PDFFont
 class PDFFont(object):
 
     def __init__(self, descriptor, widths, default_width=None):
@@ -549,7 +537,6 @@ def string_width(self, s):
         return sum(self.char_width(cid) for cid in self.decode(s))
 
 
-# PDFSimpleFont
 class PDFSimpleFont(PDFFont):
 
     def __init__(self, descriptor, widths, spec):
@@ -586,7 +573,6 @@ def to_unichr(self, cid):
             raise PDFUnicodeNotDefined(None, cid)
 
 
-# PDFType1Font
 class PDFType1Font(PDFSimpleFont):
 
     def __init__(self, rsrcmgr, spec):
@@ -618,14 +604,12 @@ def __repr__(self):
         return '<PDFType1Font: basefont=%r>' % self.basefont
 
 
-# PDFTrueTypeFont
 class PDFTrueTypeFont(PDFType1Font):
 
     def __repr__(self):
         return '<PDFTrueTypeFont: basefont=%r>' % self.basefont
 
 
-# PDFType3Font
 class PDFType3Font(PDFSimpleFont):
 
     def __init__(self, rsrcmgr, spec):
@@ -648,7 +632,6 @@ def __repr__(self):
         return '<PDFType3Font>'
 
 
-# PDFCIDFont
 class PDFCIDFont(PDFFont):
 
     def __init__(self, rsrcmgr, spec, strict=settings.STRICT):
@@ -661,18 +644,8 @@ def __init__(self, rsrcmgr, spec, strict=settings.STRICT):
         self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {}))
         self.cidcoding = '%s-%s' % (resolve1(self.cidsysteminfo.get('Registry', b'unknown')).decode("latin1"),
                                     resolve1(self.cidsysteminfo.get('Ordering', b'unknown')).decode("latin1"))
-        try:
-            name = literal_name(spec['Encoding'])
-        except KeyError:
-            if strict:
-                raise PDFFontError('Encoding is unspecified')
-            name = 'unknown'
-        try:
-            self.cmap = CMapDB.get_cmap(name)
-        except CMapDB.CMapNotFound as e:
-            if strict:
-                raise PDFFontError(e)
-            self.cmap = CMap()
+        self.cmap = self.get_cmap_from_spec(spec, strict)
+
         try:
             descriptor = dict_value(spec['FontDescriptor'])
         except KeyError:
@@ -719,6 +692,36 @@ def __init__(self, rsrcmgr, spec, strict=settings.STRICT):
         PDFFont.__init__(self, descriptor, widths, default_width=default_width)
         return
 
+    def get_cmap_from_spec(self, spec, strict):
+        """Return the CMap selected by the font spec's ``Encoding`` entry.
+
+        ``Encoding`` is either a name or an object carrying ``CMapName``.
+        Identity aliases ('DLIdent-H/V', 'OneByteIdentityH/V', ...) are
+        translated via IDENTITY_ENCODER; any other name is looked up in
+        CMapDB.  In strict mode problems raise PDFFontError, else CMap().
+        """
+        cmap_name = 'unknown'  # used when Encoding is absent (non-strict)
+        try:
+            enc = spec['Encoding']
+            named = enc if hasattr(enc, 'name') else enc['CMapName']
+            cmap_name = literal_name(named)
+        except KeyError:
+            if strict:
+                raise PDFFontError('Encoding is unspecified')
+        if isinstance(cmap_name, PDFStream):
+            if 'CMapName' in cmap_name:
+                cmap_name = cmap_name.get('CMapName').name
+            elif strict:
+                raise PDFFontError('CMapName unspecified for encoding')
+            else:
+                cmap_name = 'unknown'
+        try:
+            return CMapDB.get_cmap(IDENTITY_ENCODER.get(cmap_name, cmap_name))
+        except CMapDB.CMapNotFound as e:
+            if strict:
+                raise PDFFontError(e)
+            return CMap()  # non-strict fallback: empty CMap
+
     def __repr__(self):
         return '<PDFCIDFont: basefont=%r, cidcoding=%r>' % (self.basefont, self.cidcoding)
 
@@ -743,16 +746,14 @@ def to_unichr(self, cid):
         except KeyError:
             raise PDFUnicodeNotDefined(self.cidcoding, cid)
 
-
-# main
 def main(argv):
     for fname in argv[1:]:
         fp = open(fname, 'rb')
-        #font = TrueTypeFont(fname, fp)
         font = CFFFont(fname, fp)
         print (font)
         fp.close()
     return
 
+
 if __name__ == '__main__':
     sys.exit(main(sys.argv))
diff --git a/samples/sampleOneByteIdentityEncode.pdf b/samples/sampleOneByteIdentityEncode.pdf