Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Word fonts #1

Merged
merged 12 commits into from
Apr 28, 2017
73 changes: 68 additions & 5 deletions pdfplumber/utils.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,22 @@
from pdfminer.utils import PDFDocEncoding
from decimal import Decimal, ROUND_HALF_UP
import numbers
from collections import Counter
from operator import itemgetter
import itertools
import six

## Defaults for the x_tolerance / y_tolerance parameters of extract_words.
DEFAULT_X_TOLERANCE = 3
DEFAULT_Y_TOLERANCE = 3

## Raise an error if the individual characters' font heights within a word
## vary by more than this (only enforced when match_fontsize is enabled).
DEFAULT_FONT_HEIGHT_TOLERANCE = 0.5

class WordFontError(RuntimeError):
    """Raised when the characters of one word disagree on font name or height."""

    def __init__(self, *args, **kwargs):
        # Two-argument super() keeps this usable on both Python 2 and 3.
        super(WordFontError, self).__init__(*args, **kwargs)

def cluster_list(xs, tolerance=0):
tolerance = decimalize(tolerance)
if tolerance == 0: return [ [x] for x in sorted(xs) ]
Expand Down Expand Up @@ -126,6 +135,51 @@ def objects_to_bbox(objects):
max(map(itemgetter("bottom"), objects)),
)

def get_font_from_chars(chars, match_fontname):
    """Return the font name used by the characters of a word.

    Args:
        chars: sequence of char dicts, each with "fontname" and "text" keys.
        match_fontname: when true, a word whose characters use more than one
            font is treated as an error.

    Returns:
        "" when ``chars`` is empty, the single font name when all characters
        agree, otherwise (only when ``match_fontname`` is false) the distinct
        font names joined with ", " in order of first appearance.

    Raises:
        WordFontError: if several fonts are found and ``match_fontname`` is
            true.
    """
    # Collect distinct names in first-seen order so the joined result is
    # deterministic; iterating a set of strings is not.
    fontnames = []
    seen = set()
    for char in chars:
        fontname = char["fontname"]
        if fontname not in seen:
            seen.add(fontname)
            fontnames.append(fontname)

    if not fontnames:
        return ""
    if len(fontnames) == 1:
        return fontnames[0]

    if match_fontname:
        # Build real lists: on Python 3 a bare map() would be rendered as
        # "<map object at 0x...>" in the message instead of its contents.
        charfonttext = [char["fontname"] for char in chars]
        charlisttext = [char["text"] for char in chars]
        raise WordFontError("Multiple fonts '%s' found in word %s \nPerhaps word tolerance is set too low?" % (charfonttext, charlisttext))

    return ", ".join(str(font) for font in fontnames)

def get_font_height_from_chars(chars, match_fontsize, font_height_tolerance):
    """Return the midpoint of the tallest and shortest character heights.

    Args:
        chars: sequence of char dicts, each with "height" and "text" keys.
        match_fontsize: when true, enforce ``font_height_tolerance``.
        font_height_tolerance: largest allowed difference between the
            tallest and shortest character in the word.

    Returns:
        ``(max_height + min_height) / 2`` over all characters.

    Raises:
        WordFontError: if ``match_fontsize`` is true and the height spread
            exceeds ``font_height_tolerance``.
        ValueError: if ``chars`` is empty (raised by ``max``).
    """
    # Materialise the heights once. On Python 3 map() is a one-shot iterator:
    # max() would exhaust it and the following min() would always fail.
    heights = [char["height"] for char in chars]
    max_font_height = max(heights)
    min_font_height = min(heights)
    font_height_range = max_font_height - min_font_height

    if match_fontsize and font_height_range > font_height_tolerance:
        charlisttext = [char["text"] for char in chars]
        raise WordFontError("Font size variation of '%s' exceeds tolerance of %s in word %s with heights %s\nPerhaps word tolerance is set too low?" % (font_height_range, font_height_tolerance, charlisttext, heights))

    return (max_font_height + min_font_height) / 2

def objects_to_bbox_with_font(objects, match_fontname, match_fontsize, font_height_tolerance):
    """Return the bounding box of ``objects`` plus optional font details.

    The result is ``(x0, top, x1, bottom, fontname, fontsize)``. ``fontname``
    is looked up via get_font_from_chars only when ``match_fontname`` is
    true, and ``fontsize`` via get_font_height_from_chars only when
    ``match_fontsize`` is true; otherwise the respective slot is None.
    """
    # Font checks run before the bbox is computed, so a font mismatch is
    # reported ahead of any error from the coordinate lookups.
    fontname = (
        get_font_from_chars(objects, match_fontname) if match_fontname else None
    )
    fontsize = (
        get_font_height_from_chars(objects, match_fontsize, font_height_tolerance)
        if match_fontsize
        else None
    )

    left = min(obj["x0"] for obj in objects)
    upper = min(obj["top"] for obj in objects)
    right = max(obj["x1"] for obj in objects)
    lower = max(obj["bottom"] for obj in objects)

    return (left, upper, right, lower, fontname, fontsize)

## Pull a single object's bounding box out as an (x0, top, x1, bottom) tuple.
obj_to_bbox = itemgetter("x0", "top", "x1", "bottom")

def bbox_to_rect(bbox):
Expand All @@ -139,21 +193,30 @@ def bbox_to_rect(bbox):
def extract_words(chars,
x_tolerance=DEFAULT_X_TOLERANCE,
y_tolerance=DEFAULT_Y_TOLERANCE,
keep_blank_chars=False
keep_blank_chars=False,
match_fontname=True,
match_fontsize=True,
font_height_tolerance=DEFAULT_FONT_HEIGHT_TOLERANCE
):

x_tolerance = decimalize(x_tolerance)
y_tolerance = decimalize(y_tolerance)

def process_word_chars(chars):
x0, top, x1, bottom = objects_to_bbox(chars)
return {
x0, top, x1, bottom, fontname, fontsize= objects_to_bbox_with_font(chars, match_fontname, match_fontsize, font_height_tolerance)
result = {
"x0": x0,
"x1": x1,
"top": top,
"bottom": bottom,
"text": "".join(map(itemgetter("text"), chars))
"text": "".join(map(itemgetter("text"), chars)),
}
if match_fontname:
result["fontname"]=fontname
if match_fontsize:
result["fontsize"]=fontsize

return result


def get_line_words(chars, tolerance=DEFAULT_X_TOLERANCE):
Expand Down
Binary file added tests/pdfs/Acorn_127_201604.pdf
Binary file not shown.
73 changes: 73 additions & 0 deletions tests/test_fonts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
#!/usr/bin/env python
import unittest
import sys
import os
import logging

import pdfplumber
from pdfplumber.utils import extract_words, WordFontError

# Suppress all log records of severity ERROR and below (process-wide).
logging.disable(logging.ERROR)
# Absolute directory of this test file; fixture paths are joined onto it.
HERE = os.path.abspath(os.path.dirname(__file__))

class Test(unittest.TestCase):
    """Checks for the font-name / font-height matching done by extract_words."""

    def setUp(self):
        # This file has inconsistently named fonts and font sizes.
        # Some words begin with characters roughly 3 (units?) bigger in size.
        # Courtesy of Ryan Ross (Prince Edward Island Guardian)
        # http://www.gov.pe.ca/publicdisclosure/pdSummary.php
        fixture = os.path.join(HERE, "pdfs/Acorn_127_201604.pdf")
        self.pdf_chars = pdfplumber.from_path(fixture).pages[0].chars

        # Tolerances passed explicitly to extract_words by the tests below.
        # NOTE(review): the original remark here said the default font height
        # tolerance is 1, but pdfplumber/utils.py in this change defines
        # DEFAULT_FONT_HEIGHT_TOLERANCE = 0.5 -- confirm which is intended.
        self.default_x_tolerance = 3
        self.default_y_tolerance = 3
        self.default_font_height_tolerance = 1

    def _extract(self, match_fontname, match_fontsize, font_height_tolerance=None):
        # Run extract_words on the fixture chars with the shared x/y
        # tolerances; the height tolerance falls back to the test default.
        if font_height_tolerance is None:
            font_height_tolerance = self.default_font_height_tolerance
        return extract_words(
            self.pdf_chars,
            y_tolerance=self.default_y_tolerance,
            x_tolerance=self.default_x_tolerance,
            font_height_tolerance=font_height_tolerance,
            match_fontsize=match_fontsize,
            match_fontname=match_fontname
        )

    def test_fontname(self):
        # With both checks off, extraction must succeed ...
        self._extract(match_fontname=False, match_fontsize=False)

        # ... but requiring a single font name per word must fail.
        with self.assertRaises(WordFontError):
            self._extract(match_fontname=True, match_fontsize=False)

    def test_font_height_tolerance(self):
        # A tolerance of 3.5 is wide enough for this document ...
        self._extract(
            match_fontname=False,
            match_fontsize=True,
            font_height_tolerance=3.5
        )

        # ... while a tolerance of 2 is not.
        with self.assertRaises(WordFontError):
            self._extract(
                match_fontname=False,
                match_fontsize=True,
                font_height_tolerance=2
            )

    def test_fontsize(self):
        # The test-default height tolerance is too strict for this document.
        with self.assertRaises(WordFontError):
            self._extract(match_fontname=False, match_fontsize=True)