-
Notifications
You must be signed in to change notification settings - Fork 928
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Pdfstream as cmap #283
Pdfstream as cmap #283
Changes from 12 commits
8ab2e28
c022358
8e4a82a
cc40af3
fa40043
b4c261b
f1a4dce
5a0d8db
5b21098
fe38695
3125d36
3f0f05d
3d549ea
abd685f
7c03d96
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -14,6 +14,8 @@ | |
from .encodingdb import name2unicode | ||
from .fontmetrics import FONT_METRICS | ||
from .pdftypes import PDFException | ||
from .pdftypes import PDFStream | ||
from .pdftypes import resolve1 | ||
from .pdftypes import dict_value | ||
from .pdftypes import int_value | ||
from .pdftypes import list_value | ||
|
@@ -140,7 +142,13 @@ def do_keyword(self, pos, token): | |
|
||
|
||
NIBBLES = ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.', 'e', 'e-', None, '-') | ||
|
||
IDENTITY_ENCODER = {'Identity-H':'Identity-H', | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. You mentioned that DLIdent-* is not in the PDF reference manual. Did you find other PDF documentation that mentions this? If so, you could add a comment that refers to it so that we do not remove DLIdent-* by accident. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I haven't seen any PDF with DLIdent-*, but have included it due to its harmless nature. I will also include a code comment to avoid DLIdent-* being removed by accident. |
||
'Identity-V':'Identity-V', | ||
'DLIdent-H':'Identity-H', | ||
'DLIdent-V':'Identity-V', | ||
'OneByteIdentityH':'OneByteIdentityH', | ||
'OneByteIdentityV':'OneByteIdentityV', | ||
} | ||
|
||
## CFFFont | ||
## (Format specified in Adobe Technical Note: #5176 | ||
|
@@ -661,18 +669,8 @@ def __init__(self, rsrcmgr, spec, strict=settings.STRICT): | |
self.cidsysteminfo = dict_value(spec.get('CIDSystemInfo', {})) | ||
self.cidcoding = '%s-%s' % (resolve1(self.cidsysteminfo.get('Registry', b'unknown')).decode("latin1"), | ||
resolve1(self.cidsysteminfo.get('Ordering', b'unknown')).decode("latin1")) | ||
try: | ||
name = literal_name(spec['Encoding']) | ||
except KeyError: | ||
if strict: | ||
raise PDFFontError('Encoding is unspecified') | ||
name = 'unknown' | ||
try: | ||
self.cmap = CMapDB.get_cmap(name) | ||
except CMapDB.CMapNotFound as e: | ||
if strict: | ||
raise PDFFontError(e) | ||
self.cmap = CMap() | ||
self.cmap = self.get_cmap_from_spec(spec, strict) | ||
|
||
try: | ||
descriptor = dict_value(spec['FontDescriptor']) | ||
except KeyError: | ||
|
@@ -719,6 +717,36 @@ def __init__(self, rsrcmgr, spec, strict=settings.STRICT): | |
PDFFont.__init__(self, descriptor, widths, default_width=default_width) | ||
return | ||
|
||
def get_cmap_from_spec(self, spec, strict):
    """
    Return the CMap selected by the font spec's 'Encoding' entry.

    For certain PDFs, the encoding type isn't given directly as the value
    of spec['Encoding'] but as the 'CMapName' attribute of the object
    stored under spec['Encoding'].
    The horizontal/vertical identity modes are mentioned under different
    names such as 'DLIdent-H/V', 'OneByteIdentityH/V' and 'Identity-H/V';
    these aliases are normalised through IDENTITY_ENCODER before lookup.

    :param spec: font specification dictionary.
    :param strict: if true, raise PDFFontError instead of falling back
        when the encoding is missing or the CMap cannot be found.
    :return: the CMap returned by CMapDB.get_cmap, or a default-constructed
        CMap() when the lookup fails and strict is false.
    :raises PDFFontError: in strict mode, when 'Encoding'/'CMapName' is
        missing or CMapDB does not know the requested CMap.
    """
    try:
        spec_encoding = spec['Encoding']
        if hasattr(spec_encoding, 'name'):
            cmap_name = literal_name(spec_encoding)
        else:
            cmap_name = literal_name(spec_encoding['CMapName'])
    except KeyError:
        if strict:
            raise PDFFontError('Encoding is unspecified')
        cmap_name = 'unknown'
    # isinstance (rather than an exact type comparison) also accepts
    # subclasses of PDFStream.
    if isinstance(cmap_name, PDFStream):
        if 'CMapName' in cmap_name:
            cmap_name = cmap_name.get('CMapName').name
        else:
            if strict:
                raise PDFFontError('CMapName unspecified for encoding')
            cmap_name = 'unknown'
    # Map identity aliases to their canonical name; every other name is
    # passed through unchanged so that CMapDB can still resolve it.
    # Returning CMap() for all non-identity names would discard them.
    cmap_name = IDENTITY_ENCODER.get(cmap_name, cmap_name)
    try:
        return CMapDB.get_cmap(cmap_name)
    except CMapDB.CMapNotFound as e:
        if strict:
            raise PDFFontError(e)
        return CMap()
|
||
def __repr__(self): | ||
return '<PDFCIDFont: basefont=%r, cidcoding=%r>' % (self.basefont, self.cidcoding) | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,111 @@ | ||
#!/usr/bin/env python | ||
|
||
# -*- coding: utf-8 -*- | ||
|
||
import nose, logging, os | ||
from pdfminer.cmapdb import IdentityCMap, CMap, IdentityCMapByte | ||
from pdfminer.pdffont import PDFCIDFont | ||
from pdfminer.pdftypes import PDFStream | ||
from pdfminer.psparser import PSLiteral | ||
|
||
class TestPDFEncoding():
    """
    Checks which CMap class PDFCIDFont selects for a given 'Encoding' entry.

    Each test builds a minimal font spec, constructs a PDFCIDFont with no
    resource manager, and asserts the type of the resulting ``font.cmap``.
    The encoding is supplied either directly as a PSLiteral or indirectly
    as the 'CMapName' of a PDFStream (as a PSLiteral or a plain string).
    """

    # --- CMapName supplied through a stream: one-byte identity names ---

    def test_cmapname_onebyteidentityV(self):
        stream = PDFStream({'CMapName': PSLiteral('OneByteIdentityV')}, '')
        spec = {'Encoding': stream}
        font = PDFCIDFont(None, spec)
        assert isinstance(font.cmap, IdentityCMapByte)

    def test_cmapname_onebyteidentityH(self):
        stream = PDFStream({'CMapName': PSLiteral('OneByteIdentityH')}, '')
        spec = {'Encoding': stream}
        font = PDFCIDFont(None, spec)
        assert isinstance(font.cmap, IdentityCMapByte)

    # --- CMapName supplied through a stream: non-identity names ---

    def test_cmapname_V(self):
        stream = PDFStream({'CMapName': PSLiteral('V')}, '')
        spec = {'Encoding': stream}
        font = PDFCIDFont(None, spec)
        assert isinstance(font.cmap, CMap)

    def test_cmapname_H(self):
        stream = PDFStream({'CMapName': PSLiteral('H')}, '')
        spec = {'Encoding': stream}
        font = PDFCIDFont(None, spec)
        assert isinstance(font.cmap, CMap)

    # --- Identity-H/V ---

    def test_encoding_identityH(self):
        spec = {'Encoding': PSLiteral('Identity-H')}
        font = PDFCIDFont(None, spec)
        assert isinstance(font.cmap, IdentityCMap)

    def test_encoding_identityV(self):
        spec = {'Encoding': PSLiteral('Identity-V')}
        font = PDFCIDFont(None, spec)
        assert isinstance(font.cmap, IdentityCMap)

    def test_encoding_identityH_as_PSLiteral_stream(self):
        stream = PDFStream({'CMapName': PSLiteral('Identity-H')}, '')
        spec = {'Encoding': stream}
        font = PDFCIDFont(None, spec)
        assert isinstance(font.cmap, IdentityCMap)

    def test_encoding_identityV_as_PSLiteral_stream(self):
        stream = PDFStream({'CMapName': PSLiteral('Identity-V')}, '')
        spec = {'Encoding': stream}
        font = PDFCIDFont(None, spec)
        assert isinstance(font.cmap, IdentityCMap)

    def test_encoding_identityH_as_stream(self):
        stream = PDFStream({'CMapName': 'Identity-H'}, '')
        spec = {'Encoding': stream}
        font = PDFCIDFont(None, spec)
        assert isinstance(font.cmap, IdentityCMap)

    def test_encoding_identityV_as_stream(self):
        stream = PDFStream({'CMapName': 'Identity-V'}, '')
        spec = {'Encoding': stream}
        font = PDFCIDFont(None, spec)
        assert isinstance(font.cmap, IdentityCMap)

    # --- DLIdent-H/V (aliases of Identity-H/V) ---

    def test_encoding_DLIdentH(self):
        spec = {'Encoding': PSLiteral('DLIdent-H')}
        font = PDFCIDFont(None, spec)
        assert isinstance(font.cmap, IdentityCMap)

    def test_encoding_DLIdentV(self):
        spec = {'Encoding': PSLiteral('DLIdent-V')}
        font = PDFCIDFont(None, spec)
        assert isinstance(font.cmap, IdentityCMap)

    def test_encoding_DLIdentH_as_PSLiteral_stream(self):
        stream = PDFStream({'CMapName': PSLiteral('DLIdent-H')}, '')
        spec = {'Encoding': stream}
        font = PDFCIDFont(None, spec)
        assert isinstance(font.cmap, IdentityCMap)

    # Previously this test reused the DLIdentH name above, so it replaced
    # that test in the class namespace and the DLIdent-H case never ran.
    def test_encoding_DLIdentV_as_PSLiteral_stream(self):
        stream = PDFStream({'CMapName': PSLiteral('DLIdent-V')}, '')
        spec = {'Encoding': stream}
        font = PDFCIDFont(None, spec)
        assert isinstance(font.cmap, IdentityCMap)

    def test_encoding_DLIdentH_as_stream(self):
        stream = PDFStream({'CMapName': 'DLIdent-H'}, '')
        spec = {'Encoding': stream}
        font = PDFCIDFont(None, spec)
        assert isinstance(font.cmap, IdentityCMap)

    def test_encoding_DLIdentV_as_stream(self):
        stream = PDFStream({'CMapName': 'DLIdent-V'}, '')
        spec = {'Encoding': stream}
        font = PDFCIDFont(None, spec)
        assert isinstance(font.cmap, IdentityCMap)

    # --- No 'Encoding' entry at all ---

    def test_font_without_spec(self):
        font = PDFCIDFont(None, {})
        assert isinstance(font.cmap, CMap)
|
||
|
||
if __name__ == '__main__': | ||
nose.runmodule() |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There are a lot of these unnecessary code comments in cmapdb.py. Since you are editing this file anyway, could you remove those? And likewise for pdffont.py?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sure, Can do that.