Add table and image extraction methods

meldonization · Mar 10, 2020 · 6865523 · 6865523
1 parent 2138046
commit 6865523
Show file tree

Hide file tree

Showing 9 changed files with 481 additions and 53 deletions.
diff --git a/depdf/components/__init__.py b/depdf/components/__init__.py
@@ -2,15 +2,17 @@
 from depdf.components.text import Text
 from depdf.components.span import Span
 from depdf.components.table import Table, Cell
+from depdf.components.image import Image
 
 component_list = [
     Paragraph,
     Table,
     Span,
     Text,
     Cell,
+    Image,
 ]
 
 __all__ = [
-    'Paragraph', 'Table', 'Span', 'Text', 'Cell',
+    'Paragraph', 'Table', 'Span', 'Text', 'Cell', 'Image',
 ]
diff --git a/depdf/components/image.py b/depdf/components/image.py
@@ -0,0 +1,22 @@
+from depdf.base import Base, Box
+from depdf.config import check_config
+from depdf.log import logger_init
+
+log = logger_init(__name__)
+
+
+class Image(Base, Box):
+    """Image component whose ``html`` attribute holds an ``<img>`` element.
+
+    Args:
+        bbox: Indexable bounding box; ``bbox[2] - bbox[0]`` is emitted as the
+            image width (presumably x1 - x0 -- confirm against ``Box``).
+            When ``None`` the ``width`` attribute is omitted.
+        src: Value of the ``src`` attribute.
+        pid: Page number used in the element id and CSS class.
+        img_idx: Image index used in the element id.
+        scan: Stored unchanged on the instance as ``self.scan``.
+        config: Object providing ``image_class``; presumably filled in by
+            ``check_config`` when omitted -- confirm.
+    """
+    object_type = 'image'
+
+    @check_config
+    def __init__(self, bbox=None, src='', pid=1, img_idx=1, scan=False, config=None):
+        # Function-scope import keeps this block self-contained.
+        from html import escape
+
+        self.bbox = bbox
+        self.scan = scan
+        img_id = 'page-{pid}-image-{img_idx}'.format(pid=pid, img_idx=img_idx)
+        img_class = '{img_class} page-{pid}'.format(img_class=config.image_class, pid=pid)
+        # Attribute values are quoted, and src is escaped, so that paths
+        # containing spaces, quotes or '>' cannot break the tag.
+        attributes = [
+            'id="{}"'.format(img_id),
+            'class="{}"'.format(img_class),
+            'src="{}"'.format(escape(str(src), quote=True)),
+        ]
+        # bbox defaults to None; only derive a width when a box was supplied.
+        if bbox is not None:
+            attributes.append('width="{}"'.format(bbox[2] - bbox[0]))
+        # The explicit closing tag is kept so the emitted markup keeps the
+        # same shape as before.
+        self.html = '<img {}></img>'.format(' '.join(attributes))
diff --git a/depdf/components/paragraph.py b/depdf/components/paragraph.py
@@ -29,7 +29,3 @@ def __init__(self, bbox=None, text='', pid=1, para_idx=1, config=None, inner_obj
     @property
     def inner_object(self):
         """List the inner objects, using ``to_dict`` where an item exposes it.

         ``to_dict`` is read as an attribute, not called -- presumably it is a
         property on the component classes; confirm.
         """
         return [getattr(item, 'to_dict', item) for item in self._inner_object]
-
-
-def extract_pdf_paragraph_by_page(page):
-    pass
diff --git a/depdf/components/table.py b/depdf/components/table.py
@@ -129,16 +129,10 @@ def convert_table_to_html(table_dict, pid=1, tid=1, tc_mt=5, table_class='pdf-ta
                 html_table_string += ' rowspan="{}"'.format(row_span)
             if col_span > 1:
                 html_table_string += ' colspan="{}"'.format(col_span)
-            html_table_string += ' style="font-size: {font_size}px;">{tc_text}</td>'.format(
-                font_size=tc['fs'], tc_text=tc['html']
-            )
+            html_table_string += '>{tc_text}</td>'.format(tc_text=tc['html'])
             none_text_table = False if tc['html'] else none_text_table
         html_table_string += '</tr>'
     html_table_string += '</table>'
     if skip_et and none_text_table:
         return empty_table_html
     return html_table_string
-
-
-def extract_pdf_table_by_page(page):
-    pass
diff --git a/depdf/config.py b/depdf/config.py
@@ -1,4 +1,3 @@
-import uuid
 from functools import wraps
 
 from depdf.base import Base
@@ -13,18 +12,22 @@ class Config(Base):
     # pdf
     logo_flag = DEFAULT_LOGO_FLAG
     header_footer_flag = DEFAULT_HEADER_FOOTER_FLAG
+    temp_dir_prefix = DEFAULT_TEMP_DIR_PREFIX
+    unique_prefix = None  # 该参数会根据 pdf 的文件名自动更新
 
     # page
     table_flag = DEFAULT_TABLE_FLAG
     paragraph_flag = DEFAULT_PARAGRAPH_FLAG
-    img_flag = DEFAULT_IMG_FLAG
+    image_flag = DEFAULT_IMAGE_FLAG
     resolution = DEFAULT_RESOLUTION
-    main_frame_tolerance = DEFAULT_MAIN_FRAME_TOLERANCE
+    main_frame_tolerance = None  # 该参数可通过页面内容自动分析
     x_tolerance = None  # 该参数可通过页面内容自动分析
     y_tolerance = None  # 该参数可通过页面内容自动分析
     page_num_top_fraction = DEFAULT_PAGE_NUM_TOP_FRACTION
     page_num_left_fraction = DEFAULT_PAGE_NUM_LEFT_FRACTION
     page_num_right_fraction = DEFAULT_PAGE_NUM_RIGHT_FRACTION
+    dotted_line_flag = True
+    curved_line_flag = False
 
     # chars
     char_overlap_size = DEFAULT_CHAR_OVERLAP_SIZE
@@ -35,10 +38,15 @@ class Config(Base):
     # table
     snap_flag = DEFAULT_SNAP_FLAG
     add_line_flag = DEFAULT_ADD_LINE_FLAG
-    double_line_tolerance = DEFAULT_DOUBLE_LINE_TOLERANCE
+    min_double_line_tolerance = DEFAULT_MIN_DOUBLE_LINE_TOLERANCE  # used in page class
+    max_double_line_tolerance = DEFAULT_MAX_DOUBLE_LINE_TOLERANCE  # used in page class
+    vertical_double_line_tolerance = DEFAULT_VERTICAL_DOUBLE_LINE_TOLERANCE  # used in page class
     table_cell_merge_tolerance = DEFAULT_TABLE_CELL_MERGE_TOLERANCE
     skip_empty_table = DEFAULT_SKIP_EMPTY_TABLE
 
+    # image
+    min_image_size = DEFAULT_MIN_IMAGE_SIZE
+
     # head & tail
     default_head_tail_page_offset_percent = DEFAULT_HEAD_TAIL_PAGE_OFFSET_PERCENT
 
@@ -52,13 +60,14 @@ class Config(Base):
     paragraph_class = DEFAULT_PARAGRAPH_CLASS
     table_class = DEFAULT_TABLE_CLASS
     pdf_class = DEFAULT_PDF_CLASS
+    image_class = DEFAULT_IMAGE_CLASS
 
     def __init__(self, **kwargs):
-        # add unique prefix to dePDF instance
-        self.unique_prefix = uuid.uuid4()
-
+        # set log level automatically if debug mode enabled
         if kwargs.get('debug_flag'):
             self.log_level = logging.DEBUG
+        if kwargs.get('verbose_flag'):
+            self.log_level = logging.INFO
 
         # add configuration parameters
         self.update(**kwargs)