Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix font name by removing subset tag #357

Merged
merged 5 commits into from
Jan 16, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

### Fixed
- Interpret two's complement integer as unsigned integer ([#352](https://github.com/pdfminer/pdfminer.six/pull/352))
- Fix font name in HTML output so that it is recognized by browsers ([#357](https://github.com/pdfminer/pdfminer.six/pull/357))
- Compute correct font height by removing scaling with font bounding box height ([#348](https://github.com/pdfminer/pdfminer.six/pull/348))
- KeyError when extracting embedded files and a Unicode file specification is missing ([#338](https://github.com/pdfminer/pdfminer.six/pull/338))

Expand Down
15 changes: 9 additions & 6 deletions pdfminer/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,7 +292,7 @@ def write_footer(self):
return

def write_text(self, text):
self.write(enc(text, None))
self.write(enc(text))
return

def place_rect(self, color, borderwidth, x, y, w, h):
Expand All @@ -317,7 +317,7 @@ def place_image(self, item, borderwidth, x, y, w, h):
name = self.imagewriter.export_image(item)
s = '<img src="%s" border="%d" style="position:absolute; ' \
'left:%dpx; top:%dpx;" width="%d" height="%d" />\n' % \
(enc(name, None), borderwidth, x * self.scale,
(enc(name), borderwidth, x * self.scale,
(self._yoffset - y) * self.scale, w * self.scale,
h * self.scale)
self.write(s)
Expand Down Expand Up @@ -358,8 +358,11 @@ def put_text(self, text, fontname, fontsize):
if font != self._font:
if self._font is not None:
self.write('</span>')
# Remove subset tag from fontname, see PDF Reference 5.5.3
fontname_without_subset_tag = fontname.split('+')[-1]
self.write('<span style="font-family: %s; font-size:%dpx">' %
(enc(fontname), fontsize * self.scale * self.fontscale))
(fontname_without_subset_tag,
pietermarsman marked this conversation as resolved.
Show resolved Hide resolved
fontsize * self.scale * self.fontscale))
self._font = font
self.write_text(text)
return
Expand Down Expand Up @@ -479,7 +482,7 @@ def write_footer(self):
def write_text(self, text):
if self.stripcontrol:
text = self.CONTROL.sub('', text)
self.write(enc(text, None))
self.write(enc(text))
return

def receive_layout(self, ltpage):
Expand Down Expand Up @@ -544,7 +547,7 @@ def render(item):
elif isinstance(item, LTChar):
s = '<text font="%s" bbox="%s" colourspace="%s" ' \
'ncolour="%s" size="%.3f">' % \
(enc(item.fontname, None), bbox2str(item.bbox),
(enc(item.fontname), bbox2str(item.bbox),
item.ncs.name, item.graphicstate.ncolor, item.size)
self.write(s)
self.write_text(item.get_text())
Expand All @@ -555,7 +558,7 @@ def render(item):
if self.imagewriter is not None:
name = self.imagewriter.export_image(item)
self.write('<image src="%s" width="%d" height="%d" />\n' %
(enc(name, None), item.width, item.height))
(enc(name), item.width, item.height))
else:
self.write('<image width="%d" height="%d" />\n' %
(item.width, item.height))
Expand Down
2 changes: 1 addition & 1 deletion pdfminer/pdfdevice.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ def render_string(self, textstate, seq, ncs, graphicstate):
except PDFUnicodeNotDefined:
print(chars)
pass
self.outfp.write(utils.enc(text, self.codec))
self.outfp.write(utils.enc(text))
return

def begin_page(self, page, ctm):
Expand Down
10 changes: 4 additions & 6 deletions pdfminer/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
Miscellaneous Routines.
"""
import struct
from html import escape

import chardet # For str encoding detection

# from sys import maxint as INF doesn't work anymore under Python3, but PDF
Expand Down Expand Up @@ -250,15 +252,11 @@ def decode_text(s):
return ''.join(PDFDocEncoding[c] for c in s)


def enc(x, codec='ascii'):
def enc(x):
"""Encodes a string for SGML/XML/HTML"""
if isinstance(x, bytes):
return ''
x = x.replace('&', '&amp;').replace('>', '&gt;').replace('<', '&lt;') \
.replace('"', '&quot;')
if codec:
x = x.encode(codec, 'xmlcharrefreplace')
return x
return escape(x)


def bbox2str(bbox):
Expand Down