pdfminer · tataganesh · Oct 12, 2019 · Feb 25, 2019 · Jul 12, 2019 · Jul 12, 2019
diff --git a/pdfminer/cmapdb.py b/pdfminer/cmapdb.py
@@ -131,6 +131,15 @@ def decode(self, code):
             return ()
 
 
+class IdentityCMapByte(IdentityCMap):
+    """Identity CMap whose decode() yields one integer per input byte."""
+
+    def decode(self, code):
+        size = len(code)
+        if not size:
+            return ()
+        return struct.unpack('>%dB' % size, code)
+
 ##  UnicodeMap
 ##
 class UnicodeMap(CMapBase):
@@ -252,6 +261,10 @@ def get_cmap(klass, name):
             return IdentityCMap(WMode=0)
         elif name == 'Identity-V':
             return IdentityCMap(WMode=1)
+        elif name == 'OneByteIdentityH':
+            return IdentityCMapByte(WMode=0)
+        elif name == 'OneByteIdentityV':
+            return IdentityCMapByte(WMode=1)
         try:
             return klass._cmap_cache[name]
         except KeyError:

diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py
@@ -14,6 +14,8 @@
 from .encodingdb import name2unicode
 from .fontmetrics import FONT_METRICS
 from .pdftypes import PDFException
+from .pdftypes import PDFStream
+from .pdftypes import resolve1
 from .pdftypes import dict_value
 from .pdftypes import int_value
 from .pdftypes import list_value
@@ -140,7 +142,13 @@ def do_keyword(self, pos, token):
 
 
 NIBBLES = ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.', 'e', 'e-', None, '-')
-
+# CMap names seen in font specs, mapped to the identity CMap name to load.
+IDENTITY_ENCODER = {
+    'Identity-H': 'Identity-H', 'Identity-V': 'Identity-V',
+    'DLIdent-H': 'Identity-H', 'DLIdent-V': 'Identity-V',
+    'OneByteIdentityH': 'OneByteIdentityH',
+    'OneByteIdentityV': 'OneByteIdentityV',
+}
 
 ##  CFFFont
 ##  (Format specified in Adobe Technical Note: #5176
@@ -661,18 +669,8 @@ def __init__(self, rsrcmgr, spec, strict=settings.STRICT):
         self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {}))
         self.cidcoding = '%s-%s' % (resolve1(self.cidsysteminfo.get('Registry', b'unknown')).decode("latin1"),
                                     resolve1(self.cidsysteminfo.get('Ordering', b'unknown')).decode("latin1"))
-        try:
-            name = literal_name(spec['Encoding'])
-        except KeyError:
-            if strict:
-                raise PDFFontError('Encoding is unspecified')
-            name = 'unknown'
-        try:
-            self.cmap = CMapDB.get_cmap(name)
-        except CMapDB.CMapNotFound as e:
-            if strict:
-                raise PDFFontError(e)
-            self.cmap = CMap()
+        self.cmap = self.get_cmap_from_spec(spec, strict)
+
         try:
             descriptor = dict_value(spec['FontDescriptor'])
         except KeyError:
@@ -719,6 +717,36 @@ def __init__(self, rsrcmgr, spec, strict=settings.STRICT):
         PDFFont.__init__(self, descriptor, widths, default_width=default_width)
         return
 
+    def get_cmap_from_spec(self, spec, strict):
+        """Return the CMap named by the font spec's Encoding entry.
+
+        Encoding is either a name or a stream carrying a CMapName entry.
+        Identity aliases ('DLIdent-H/V', 'OneByteIdentityH/V') are mapped
+        via IDENTITY_ENCODER; every other name is looked up in CMapDB.
+        A missing/unknown CMap raises PDFFontError if strict, else CMap().
+        """
+        try:
+            spec_encoding = spec['Encoding']
+            if not hasattr(spec_encoding, 'name'):
+                spec_encoding = spec_encoding['CMapName']
+            cmap_name = literal_name(spec_encoding)
+        except KeyError:
+            if strict:
+                raise PDFFontError('Encoding is unspecified')
+            cmap_name = 'unknown'
+        if isinstance(cmap_name, PDFStream):
+            if strict and 'CMapName' not in cmap_name:
+                raise PDFFontError('CMapName unspecified for encoding')
+            entry = cmap_name.get('CMapName')
+            cmap_name = entry.name if entry is not None else 'unknown'
+        try:
+            # Non-identity names must still be resolved through CMapDB.
+            return CMapDB.get_cmap(IDENTITY_ENCODER.get(cmap_name, cmap_name))
+        except CMapDB.CMapNotFound as e:
+            if strict:
+                raise PDFFontError(e)
+            return CMap()
+
     def __repr__(self):
         return '<PDFCIDFont: basefont=%r, cidcoding=%r>' % (self.basefont, self.cidcoding)
 

diff --git a/samples/sampleOneByteIdentityEncode.pdf b/samples/sampleOneByteIdentityEncode.pdf
diff --git a/tests/test_pdfencoding.py b/tests/test_pdfencoding.py
@@ -0,0 +1,111 @@
+#!/usr/bin/env python
+
+# -*- coding: utf-8 -*-
+
+import nose, logging, os
+from pdfminer.cmapdb import IdentityCMap, CMap, IdentityCMapByte
+from pdfminer.pdffont import PDFCIDFont
+from pdfminer.pdftypes import PDFStream
+from pdfminer.psparser import PSLiteral
+
+class TestPDFEncoding():
+    """Check which CMap class PDFCIDFont selects for each Encoding form."""
+    def test_cmapname_onebyteidentityV(self):
+        stream = PDFStream({'CMapName': PSLiteral('OneByteIdentityV')}, '')
+        spec = {'Encoding': stream}
+        font = PDFCIDFont(None, spec)
+        assert isinstance(font.cmap, IdentityCMapByte)
+
+    def test_cmapname_onebyteidentityH(self):
+        stream = PDFStream({'CMapName': PSLiteral('OneByteIdentityH')}, '')
+        spec = {'Encoding': stream}
+        font = PDFCIDFont(None, spec)
+        assert isinstance(font.cmap, IdentityCMapByte)
+
+    def test_cmapname_V(self):
+        stream = PDFStream({'CMapName': PSLiteral('V')}, '')
+        spec = {'Encoding': stream}
+        font = PDFCIDFont(None, spec)
+        assert isinstance(font.cmap, CMap)
+
+    def test_cmapname_H(self):
+        stream = PDFStream({'CMapName': PSLiteral('H')}, '')
+        spec = {'Encoding': stream}
+        font = PDFCIDFont(None, spec)
+        assert isinstance(font.cmap, CMap)
+
+    def test_encoding_identityH(self):
+        spec = {'Encoding': PSLiteral('Identity-H')}
+        font = PDFCIDFont(None, spec)
+        assert isinstance(font.cmap, IdentityCMap)
+
+    def test_encoding_identityV(self):
+        spec = {'Encoding': PSLiteral('Identity-V')}
+        font = PDFCIDFont(None, spec)
+        assert isinstance(font.cmap, IdentityCMap)
+
+    def test_encoding_identityH_as_PSLiteral_stream(self):
+        stream = PDFStream({'CMapName': PSLiteral('Identity-H')}, '')
+        spec = {'Encoding': stream}
+        font = PDFCIDFont(None, spec)
+        assert isinstance(font.cmap, IdentityCMap)
+
+    def test_encoding_identityV_as_PSLiteral_stream(self):
+        stream = PDFStream({'CMapName': PSLiteral('Identity-V')}, '')
+        spec = {'Encoding': stream}
+        font = PDFCIDFont(None, spec)
+        assert isinstance(font.cmap, IdentityCMap)
+
+    def test_encoding_identityH_as_stream(self):
+        stream = PDFStream({'CMapName': 'Identity-H'}, '')
+        spec = {'Encoding': stream}
+        font = PDFCIDFont(None, spec)
+        assert isinstance(font.cmap, IdentityCMap)
+
+    def test_encoding_identityV_as_stream(self):
+        stream = PDFStream({'CMapName': 'Identity-V'}, '')
+        spec = {'Encoding': stream}
+        font = PDFCIDFont(None, spec)
+        assert isinstance(font.cmap, IdentityCMap)
+
+    def test_encoding_DLIdentH(self):
+        spec = {'Encoding': PSLiteral('DLIdent-H')}
+        font = PDFCIDFont(None, spec)
+        assert isinstance(font.cmap, IdentityCMap)
+
+    def test_encoding_DLIdentV(self):
+        spec = {'Encoding': PSLiteral('DLIdent-V')}
+        font = PDFCIDFont(None, spec)
+        assert isinstance(font.cmap, IdentityCMap)
+
+    def test_encoding_DLIdentH_as_PSLiteral_stream(self):
+        stream = PDFStream({'CMapName': PSLiteral('DLIdent-H')}, '')
+        spec = {'Encoding': stream}
+        font = PDFCIDFont(None, spec)
+        assert isinstance(font.cmap, IdentityCMap)
+
+    def test_encoding_DLIdentV_as_PSLiteral_stream(self):
+        stream = PDFStream({'CMapName': PSLiteral('DLIdent-V')}, '')
+        spec = {'Encoding': stream}
+        font = PDFCIDFont(None, spec)
+        assert isinstance(font.cmap, IdentityCMap)
+
+    def test_encoding_DLIdentH_as_stream(self):
+        stream = PDFStream({'CMapName': 'DLIdent-H'}, '')
+        spec = {'Encoding': stream}
+        font = PDFCIDFont(None, spec)
+        assert isinstance(font.cmap, IdentityCMap)
+
+    def test_encoding_DLIdentV_as_stream(self):
+        stream = PDFStream({'CMapName': 'DLIdent-V'}, '')
+        spec = {'Encoding': stream}
+        font = PDFCIDFont(None, spec)
+        assert isinstance(font.cmap, IdentityCMap)
+
+    def test_font_without_spec(self):
+        font = PDFCIDFont(None, {})
+        assert isinstance(font.cmap, CMap)
+
+
+if __name__ == '__main__':
+    nose.runmodule()  # executed directly: hand this module's tests to nose
diff --git a/tests/test_tools_pdf2txt.py b/tests/test_tools_pdf2txt.py
@@ -24,6 +24,7 @@ def test_1(self):
         run('../samples/','simple1')
         run('../samples/','simple2')
         run('../samples/','simple3')
+        run('../samples/','sampleOneByteIdentityEncode')
 
     def test_2(self):
         run('../samples/nonfree/','dmca')