Skip to content

Commit

Permalink
add tables and images extraction methods
Browse files Browse the repository at this point in the history
  • Loading branch information
meldonization committed Mar 10, 2020
1 parent 2138046 commit 6865523
Show file tree
Hide file tree
Showing 9 changed files with 481 additions and 53 deletions.
4 changes: 3 additions & 1 deletion depdf/components/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,17 @@
from depdf.components.text import Text
from depdf.components.span import Span
from depdf.components.table import Table, Cell
from depdf.components.image import Image

component_list = [
Paragraph,
Table,
Span,
Text,
Cell,
Image,
]

__all__ = [
'Paragraph', 'Table', 'Span', 'Text', 'Cell',
'Paragraph', 'Table', 'Span', 'Text', 'Cell', 'Image',
]
22 changes: 22 additions & 0 deletions depdf/components/image.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
from depdf.base import Base, Box
from depdf.config import check_config
from depdf.log import logger_init

log = logger_init(__name__)


class Image(Base, Box):
    """HTML ``<img>`` component for one image extracted from a PDF page.

    The constructor stores the bounding box and the scan flag, and renders
    the element's markup once into ``self.html``.
    """

    object_type = 'image'

    @check_config
    def __init__(self, bbox=None, src='', pid=1, img_idx=1, scan=False, config=None):
        """Render the ``<img>`` markup for one image.

        Args:
            bbox: Indexable box; the rendered width is ``bbox[2] - bbox[0]``.
                When ``None`` the ``width`` attribute is left out instead of
                failing on the subscript.
            src: Value written to the ``src`` attribute.
            pid: Page number, used in the element id and in the CSS class.
            img_idx: Image index, used in the element id.
            scan: Stored unchanged on ``self.scan``.
            config: Object exposing an ``image_class`` attribute.
                NOTE(review): presumably supplied or defaulted by
                ``check_config`` -- confirm against that decorator.
        """
        # Local import so the block does not depend on a file-level import.
        from html import escape

        self.bbox = bbox
        self.scan = scan

        img_id = 'page-{pid}-image-{img_idx}'.format(pid=pid, img_idx=img_idx)
        img_class = '{img_class} page-{pid}'.format(
            img_class=getattr(config, 'image_class'), pid=pid
        )

        # Every attribute value is quoted: an unquoted value ends at the first
        # space or '>', which truncates file paths that contain either.
        attributes = [
            'id="{}"'.format(img_id),
            'class="{}"'.format(img_class),
            'src="{}"'.format(escape(str(src), quote=True)),
        ]
        if bbox is not None:
            attributes.append('width="{}"'.format(bbox[2] - bbox[0]))

        # The explicit closing tag is kept so the emitted markup differs from
        # the previous output only in attribute quoting.
        self.html = '<img {}></img>'.format(' '.join(attributes))
4 changes: 0 additions & 4 deletions depdf/components/paragraph.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,3 @@ def __init__(self, bbox=None, text='', pid=1, para_idx=1, config=None, inner_obj
@property
def inner_object(self):
    """Return the stored inner objects in order.

    An item that has a ``to_dict`` attribute is replaced by that attribute's
    value; any other item is passed through unchanged.
    """
    converted = []
    for item in self._inner_object:
        if hasattr(item, 'to_dict'):
            converted.append(item.to_dict)
        else:
            converted.append(item)
    return converted


def extract_pdf_paragraph_by_page(page):
pass
8 changes: 1 addition & 7 deletions depdf/components/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,16 +129,10 @@ def convert_table_to_html(table_dict, pid=1, tid=1, tc_mt=5, table_class='pdf-ta
html_table_string += ' rowspan="{}"'.format(row_span)
if col_span > 1:
html_table_string += ' colspan="{}"'.format(col_span)
html_table_string += ' style="font-size: {font_size}px;">{tc_text}</td>'.format(
font_size=tc['fs'], tc_text=tc['html']
)
html_table_string += '>{tc_text}</td>'.format(tc_text=tc['html'])
none_text_table = False if tc['html'] else none_text_table
html_table_string += '</tr>'
html_table_string += '</table>'
if skip_et and none_text_table:
return empty_table_html
return html_table_string


def extract_pdf_table_by_page(page):
pass
23 changes: 16 additions & 7 deletions depdf/config.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import uuid
from functools import wraps

from depdf.base import Base
Expand All @@ -13,18 +12,22 @@ class Config(Base):
# pdf
logo_flag = DEFAULT_LOGO_FLAG
header_footer_flag = DEFAULT_HEADER_FOOTER_FLAG
temp_dir_prefix = DEFAULT_TEMP_DIR_PREFIX
unique_prefix = None # 该参数会根据 pdf 的文件名自动更新

# page
table_flag = DEFAULT_TABLE_FLAG
paragraph_flag = DEFAULT_PARAGRAPH_FLAG
img_flag = DEFAULT_IMG_FLAG
image_flag = DEFAULT_IMAGE_FLAG
resolution = DEFAULT_RESOLUTION
main_frame_tolerance = DEFAULT_MAIN_FRAME_TOLERANCE
main_frame_tolerance = None # 该参数可通过页面内容自动分析
x_tolerance = None # 该参数可通过页面内容自动分析
y_tolerance = None # 该参数可通过页面内容自动分析
page_num_top_fraction = DEFAULT_PAGE_NUM_TOP_FRACTION
page_num_left_fraction = DEFAULT_PAGE_NUM_LEFT_FRACTION
page_num_right_fraction = DEFAULT_PAGE_NUM_RIGHT_FRACTION
dotted_line_flag = True
curved_line_flag = False

# chars
char_overlap_size = DEFAULT_CHAR_OVERLAP_SIZE
Expand All @@ -35,10 +38,15 @@ class Config(Base):
# table
snap_flag = DEFAULT_SNAP_FLAG
add_line_flag = DEFAULT_ADD_LINE_FLAG
double_line_tolerance = DEFAULT_DOUBLE_LINE_TOLERANCE
min_double_line_tolerance = DEFAULT_MIN_DOUBLE_LINE_TOLERANCE # used in page class
max_double_line_tolerance = DEFAULT_MAX_DOUBLE_LINE_TOLERANCE # used in page class
vertical_double_line_tolerance = DEFAULT_VERTICAL_DOUBLE_LINE_TOLERANCE # used in page class
table_cell_merge_tolerance = DEFAULT_TABLE_CELL_MERGE_TOLERANCE
skip_empty_table = DEFAULT_SKIP_EMPTY_TABLE

# image
min_image_size = DEFAULT_MIN_IMAGE_SIZE

# head & tail
default_head_tail_page_offset_percent = DEFAULT_HEAD_TAIL_PAGE_OFFSET_PERCENT

Expand All @@ -52,13 +60,14 @@ class Config(Base):
paragraph_class = DEFAULT_PARAGRAPH_CLASS
table_class = DEFAULT_TABLE_CLASS
pdf_class = DEFAULT_PDF_CLASS
image_class = DEFAULT_IMAGE_CLASS

def __init__(self, **kwargs):
# add unique prefix to dePDF instance
self.unique_prefix = uuid.uuid4()

# set log level automatically if debug mode enabled
if kwargs.get('debug_flag'):
self.log_level = logging.DEBUG
if kwargs.get('verbose_flag'):
self.log_level = logging.INFO

# add configuration parameters
self.update(**kwargs)
Expand Down
Loading

0 comments on commit 6865523

Please sign in to comment.