Skip to content

Commit

Permalink
add paragraph parser
Browse files: browse the repository at this point in the history
  • Loading branch information
meldonization committed Mar 11, 2020
1 parent 6865523 commit 9a6e33b
Show file tree
Hide file tree
Showing 14 changed files with 312 additions and 136 deletions.
12 changes: 9 additions & 3 deletions depdf/__init__.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,15 @@
from depdf.api import convert_pdf_to_html, convert_pdf_to_html_by_page
from depdf.api import *
from depdf.config import Config
from depdf.pdf import DePDF
from depdf.page import DePage
from depdf.version import __version__

# Public names exported by ``from depdf import *``.
# Every name is listed exactly once; the pre-rename entry
# 'convert_pdf_to_html_by_page' is dropped because the api module now exposes
# that function as 'convert_page_to_html'.
__all__ = [
    'Config',
    'DePDF',
    'DePage',
    'convert_pdf_to_html',
    'convert_page_to_html',
    'extract_page_tables',
    'extract_page_paragraphs',
]
32 changes: 16 additions & 16 deletions depdf/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from depdf.error import PDFTypeError
from depdf.log import logger_init
from depdf.pdf import DePDF
from depdf.page import DePage

log = logger_init(__name__)

Expand All @@ -30,47 +31,46 @@ def wrapper(pdf_file_path, *args, **kwargs):


@api_load_pdf
def convert_pdf_to_html(pdf_file, **kwargs):
    """
    Return the HTML of a whole pdf document.

    :param pdf_file: callers pass the pdf file absolute path; the body reads
        ``pdf_file.html``, so the ``api_load_pdf`` decorator presumably hands
        this function a loaded pdf object instead -- TODO confirm
    :param kwargs: config keyword arguments
    :return: the ``html`` attribute of ``pdf_file``
    """
    return pdf_file.html


@api_load_pdf
def convert_page_to_html(pdf_file, pid, **kwargs):
    """
    Return the HTML of a single page.

    :param pdf_file: callers pass the pdf file absolute path; the body reads
        ``pdf_file.pages``, so the ``api_load_pdf`` decorator presumably hands
        this function a loaded pdf object instead -- TODO confirm
    :param pid: page number start from 1
    :param kwargs: config keyword arguments
    :return: the ``html`` attribute of the selected page
    :raises IndexError: if ``pid`` is smaller than 1
    """
    if pid < 1:
        # pid - 1 would turn into a negative index and silently select a
        # page counted from the end of the document.
        raise IndexError('page number starts from 1, got {}'.format(pid))
    page = pdf_file.pages[pid - 1]
    return page.html


@api_load_pdf
def extract_page_tables(pdf_file, pid, **kwargs):
    """
    Return the tables of a single page.

    :param pdf_file: callers pass the pdf file absolute path; the body reads
        ``pdf_file.pages``, so the ``api_load_pdf`` decorator presumably hands
        this function a loaded pdf object instead -- TODO confirm
    :param pid: page number start from 1
    :param kwargs: config keyword arguments
    :return: the ``tables`` attribute of the selected page
    :raises IndexError: if ``pid`` is smaller than 1
    """
    if pid < 1:
        # pid - 1 would turn into a negative index and silently select a
        # page counted from the end of the document.
        raise IndexError('page number starts from 1, got {}'.format(pid))
    page = pdf_file.pages[pid - 1]
    return page.tables


@api_load_pdf
def extract_page_paragraphs(pdf_file, pid, **kwargs):
    """
    Return the paragraphs of a single page.

    :param pdf_file: callers pass the pdf file absolute path; the body reads
        ``pdf_file.pages``, so the ``api_load_pdf`` decorator presumably hands
        this function a loaded pdf object instead -- TODO confirm
    :param pid: page number start from 1
    :param kwargs: config keyword arguments
    :return: the ``paragraphs`` attribute of the selected page
    :raises IndexError: if ``pid`` is smaller than 1
    """
    if pid < 1:
        # pid - 1 would turn into a negative index and silently select a
        # page counted from the end of the document.
        raise IndexError('page number starts from 1, got {}'.format(pid))
    page = pdf_file.pages[pid - 1]
    return page.paragraphs
16 changes: 14 additions & 2 deletions depdf/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@ class Box(object):
bottom = Decimal(0)
_bbox = (x0, top, x1, bottom)

def __repr__(self):
    """Return a debug representation showing ``self.bbox`` as a tuple."""
    return '<depdf.Box: {}>'.format(tuple(self.bbox))

@property
def width(self):
    """Horizontal extent of the box, computed as ``x1 - x0``."""
    return self.x1 - self.x0
Expand All @@ -27,7 +30,7 @@ def bbox(self):
def bbox(self, value):
    """
    Set the box coordinates from a bbox value.

    ``None`` is ignored, leaving the current coordinates untouched. Any other
    value is passed through ``normalize_bbox`` and unpacked into ``x0``,
    ``top``, ``x1`` and ``bottom``; the normalized value is also stored in
    ``_bbox``.

    :param value: bbox accepted by ``normalize_bbox``, or None
    """
    if value is None:
        return
    bbox = self.normalize_bbox(value)
    self.x0, self.top, self.x1, self.bottom = bbox
    self._bbox = bbox

@staticmethod
Expand All @@ -36,7 +39,9 @@ def normalize_bbox(bbox):
raise BoxValueError(bbox)
if isinstance(bbox, str):
raise BoxValueError(bbox)
bbox = (Decimal(i) for i in bbox)
if len(bbox) != 4:
raise BoxValueError(bbox)
bbox = [Decimal(i) for i in bbox]
return bbox


Expand Down Expand Up @@ -86,3 +91,10 @@ def refresh(self):
def reset(self):
    """Do nothing; placeholder hook with no behavior of its own."""
    pass


class InnerWrapper(Base):
    """Base for components that keep a list of inner objects in ``_inner_objects``."""

    # Class-level default shared by all instances: rebind ``_inner_objects``
    # on the instance instead of mutating this list in place.
    _inner_objects = []

    @property
    def inner_objects(self):
        """
        Return the inner objects in a dict-friendly form.

        An object exposing ``to_dict`` is replaced by that attribute's value;
        any other object is returned unchanged.
        """
        # getattr with a default reads ``to_dict`` once; hasattr followed by
        # attribute access would evaluate a ``to_dict`` property twice.
        return [getattr(obj, 'to_dict', obj) for obj in self._inner_objects]
29 changes: 17 additions & 12 deletions depdf/components/paragraph.py
Original file line number Diff line number Diff line change
@@ -1,31 +1,36 @@
from depdf.base import Base, Box
from depdf.base import Box, InnerWrapper
from depdf.config import check_config
from depdf.log import logger_init
from depdf.utils import calc_bbox, construct_style

log = logger_init(__name__)


class Paragraph(InnerWrapper, Box):
    """A paragraph rendered as a ``<p>`` element, built from text or from inner objects."""

    object_type = 'paragraph'

    @check_config
    def __init__(self, bbox=None, text='', pid=1, para_idx=1, config=None, inner_objects=None, style=None):
        """
        :param bbox: paragraph bbox; when None and inner objects are given,
            it is computed with ``calc_bbox(inner_objects)``
        :param text: paragraph text; when non-empty, inner objects are ignored
        :param pid: page number used in the element id and class
        :param para_idx: paragraph index used in the element id
        :param config: object providing ``paragraph_class``
        :param inner_objects: objects whose ``html`` attributes form the body
            when ``text`` is empty
        :param style: value forwarded to ``construct_style``
        """
        para_id = 'page-{pid}-paragraph-{para_id}'.format(pid=pid, para_id=para_idx)
        para_class = '{para_class} page-{pid}'.format(para_class=getattr(config, 'paragraph_class'), pid=pid)
        style = construct_style(style=style)
        html = '<p id="{para_id}" class="{para_class}"{style}>'.format(
            para_id=para_id, para_class=para_class, style=style
        )
        self.pid = pid
        self.para_id = para_idx
        self.bbox = bbox
        if text:
            self.text = text
            html += str(text)
        else:
            if inner_objects is None:
                # Neither text nor inner objects: produce an empty paragraph
                # instead of failing while iterating None.
                inner_objects = []
            elif bbox is None:
                self.bbox = calc_bbox(inner_objects)
            self._inner_objects = inner_objects
            # Objects without an ``html`` attribute contribute nothing.
            html += ''.join(getattr(obj, 'html', '') for obj in inner_objects)
        html += '</p>'
        self.html = html

    def __repr__(self):
        """Return a debug representation with the page number and paragraph index."""
        return '<depdf.Paragraph: ({}, {})>'.format(self.pid, self.para_id)
8 changes: 5 additions & 3 deletions depdf/components/span.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from depdf.base import Base, Box
from depdf.config import check_config
from depdf.log import logger_init
from depdf.utils import construct_style

log = logger_init(__name__)

Expand All @@ -9,10 +10,11 @@ class Span(Base, Box):
object_type = 'span'

@check_config
def __init__(self, bbox=None, span_text='', config=None, style=None):
    """
    Build a span and its ``<span>`` html.

    :param bbox: span bbox
    :param span_text: text stored in ``text`` and placed inside the element
    :param config: object providing ``span_class``
    :param style: value forwarded to ``construct_style``
    """
    self.bbox = bbox
    self.text = span_text
    span_class = getattr(config, 'span_class')
    style = construct_style(style=style)
    # The style fragment goes after the closing quote of ``class`` so it is
    # emitted as its own attribute rather than becoming part of the class
    # value. Assumes construct_style returns '' or a complete, space-prefixed
    # style attribute -- TODO confirm against depdf.utils.
    self.html = '<span class="{span_class}"{style}>{span_text}</span>'.format(
        span_class=span_class, span_text=span_text, style=style
    )
45 changes: 16 additions & 29 deletions depdf/components/table.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,24 @@
from depdf.base import Base, Box
from depdf.base import Base, Box, InnerWrapper
from depdf.config import check_config
from depdf.log import logger_init
from depdf.utils import calc_bbox

log = logger_init(__name__)


class Cell(InnerWrapper, Box):
    """A table cell holding either plain text or a list of inner objects."""

    object_type = 'cell'

    def __init__(self, bbox=None, text='', inner_objects=None):
        """
        :param bbox: cell bbox
        :param text: cell text; when non-empty it is used as the cell html and
            inner objects are ignored
        :param inner_objects: objects whose ``html`` attributes are appended to
            the cell html when ``text`` is empty
        """
        self.bbox = bbox
        if text:
            self.text = text
            self.html = text
        else:
            if inner_objects is None:
                # Neither text nor inner objects: keep the cell empty instead
                # of failing while iterating None.
                inner_objects = []
            self._inner_objects = inner_objects
            for obj in inner_objects:
                # Relies on an ``html`` attribute already being readable on
                # the instance (presumably provided by a base class -- TODO
                # confirm); objects without ``html`` contribute nothing.
                self.html += getattr(obj, 'html', '')


class Table(Base, Box):
object_type = 'table'
Expand All @@ -33,24 +29,10 @@ def __init__(self, rows, pid=1, tid=1, config=None, bbox=None):
self.tid = tid
self.rows = rows
self.config = config
self.bbox = bbox if bbox else self.calc_table_bbox_by_rows(rows)
self.bbox = bbox if bbox else calc_bbox(rows)

@staticmethod
def calc_table_bbox_by_rows(rows):
x0_list, top_list, x1_list, bottom_list = [], [], [], []
for row in rows:
for cell in row:
x0_list.append(cell.x0)
top_list.append(cell.top)
x1_list.append(cell.x1)
bottom_list.append(cell.bottom)
bbox = (
min(x0_list),
min(top_list),
max(x1_list),
max(bottom_list),
)
return bbox
def __repr__(self):
    """Return a debug representation with the page number and table id."""
    return '<depdf.Table: ({}, {})>'.format(self.pid, self.tid)

@property
def to_dict(self):
Expand All @@ -63,16 +45,21 @@ def to_dict(self):
]
return table_dict

@property
def html(self):
    """
    Return the stored ``_html`` when it is non-empty, else the ``to_html`` value.

    If reading ``to_html`` raises AttributeError (including when the attribute
    does not exist), the stored ``_html`` is returned instead.
    """
    if self._html:
        return self._html
    try:
        # Read ``to_html`` exactly once: a hasattr() probe would already
        # evaluate the property, so probing and then reading it builds the
        # html twice.
        return self.to_html
    except AttributeError:
        # Same fallback a failed hasattr() probe would give.
        return self._html

@property
def to_html(self):
    """
    Render the table to html with ``convert_table_to_html``.

    Reads ``table_class``, ``table_cell_merge_tolerance`` and
    ``skip_empty_table`` from ``self.config`` and passes them on together with
    ``self.to_dict``, ``self.pid`` and ``self.tid``. The result is returned
    without being stored on the instance.
    """
    config = self.config
    return convert_table_to_html(
        self.to_dict,
        pid=self.pid,
        tid=self.tid,
        tc_mt=config.table_cell_merge_tolerance,
        table_class=config.table_class,
        skip_et=config.skip_empty_table,
    )


def gen_column_cell_sizes(t):
Expand Down
7 changes: 4 additions & 3 deletions depdf/components/text.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
from depdf.base import Base
from depdf.base import Base, Box


class Text(Base, Box):
    """A plain text component whose html is the text itself."""

    object_type = 'text'

    def __init__(self, bbox=None, text=''):
        """
        :param bbox: text bbox, or None to leave the box coordinates unset
        :param text: text stored in both ``text`` and ``html``
        """
        # The default must be None, not '': the bbox setter normalizes every
        # non-None value and normalize_bbox rejects strings with
        # BoxValueError, so a '' default made ``Text(text=...)`` always fail.
        self.bbox = bbox
        self.text = text
        self.html = text
5 changes: 5 additions & 0 deletions depdf/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ class Config(Base):
table_class = DEFAULT_TABLE_CLASS
pdf_class = DEFAULT_PDF_CLASS
image_class = DEFAULT_IMAGE_CLASS
page_class = DEFAULT_PAGE_CLASS

def __init__(self, **kwargs):
# set log level automatically if debug mode enabled
Expand All @@ -71,10 +72,14 @@ def __init__(self, **kwargs):

# add configuration parameters
self.update(**kwargs)
self._kwargs = kwargs

# set logging level by log_level parameter
logging.getLogger('depdf').setLevel(self.log_level)

def __repr__(self):
    """Return a debug representation showing the keyword arguments kept in ``_kwargs``."""
    return '<depdf.Config: {}>'.format(self._kwargs)

def update(self, **kwargs):
for key, value in kwargs.items():
if hasattr(self, key):
Expand Down
Loading

0 comments on commit 9a6e33b

Please sign in to comment.