welcome to the first release

meldonization · Mar 12, 2020 · dfd5b5b · dfd5b5b
1 parent 9a6e33b
commit dfd5b5b
Show file tree

Hide file tree

Showing 21 changed files with 328 additions and 35 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,4 +1,5 @@
 .DS_Store
 .idea
 __pycache__
-*.py[cod]
+*.py[cod]
+temp_depdf/
diff --git a/README.md b/README.md
@@ -1,5 +1,118 @@
-# depdf
+# DePDF
 
-An ultimate pdf file disintegration tool. Yet able to extract pages embedded with tables and paragraphs into structured markup language.
+An ultimate PDF file disintegration tool. DePDF is designed to extract the tables and paragraphs embedded in PDF pages into a structured markup language (e.g. HTML). You can also use it to convert a single page or an entire PDF to HTML.
 
-Built on [`pdfplumber`](https://github.com/jsvine/pdfplumber)
+Built on top of [`pdfplumber`](https://github.com/jsvine/pdfplumber)
+
+# Table of Contents
+[toc]
+
+
+# Installation
+`pip install depdf`
+
+# Example
+```python
+from depdf import DePDF
+from depdf import DePage
+from depdf.config import Config
+
+# general
+with DePDF.load('test/test_general.pdf') as pdf:
+    pdf_html = pdf.to_html
+    print(pdf_html)
+
+# with dedicated configurations
+c = Config(
+    debug_flag=True,
+    verbose_flag=True,
+    add_line_flag=True
+)
+pdf = DePDF.load('test/test_general.pdf', config=c)
+page_index = 23  # start from zero
+page = pdf.pages[page_index]
+page_soup = page.soup
+print(page_soup.text)
+```
+
+
+# APIs
+| **functions** | usage |
+|:---:|---|
+| `extract_page_paragraphs` | extract paragraphs from specific page |
+| `extract_page_tables` | extract tables from specific page |
+| `convert_pdf_to_html` | convert the entire pdf to html | 
+| `convert_page_to_html` | convert specific page to html | 
+
+
+# In-Depth
+
+## In-page elements
+* Paragraph
+    + Text
+    + Span
+* Table
+    + Cell
+* Image
+
+## Common properties
+| **property & method** | explanation |
+|:---:|---|
+| `html` | converted html string |
+| `soup` | converted beautiful soup |
+| `bbox` | bounding box region | 
+| `save_html` | write the converted html to a local file |
+
+## DePDF HTML structure
+```html
+<div class="{pdf_class}">
+    %for <!--page-{pid}-->
+        <div id="page-{}" class="{}">
+            %for {html_elements} endfor%
+        </div>
+    endfor%
+</div>
+```
+
+## DePage HTML element structure
+
+### Paragraph
+```html
+<p>
+    {paragraph-content}
+    <span> {span-content} </span>
+    ... 
+</p>
+```
+
+### Table
+```html
+<table>
+    <tr>
+        <td> {cell_0_0} </td>
+        <td> {cell_0_1} </td>
+        ...
+    </tr>
+    <tr colspan=2>
+        <td> {cell_1_0} </td>
+        ...
+    </tr>
+    ...
+</table>
+```
+
+### Image
+```html
+<img src="temp_depdf/$prefix.png"></img>
+```
+# Appendix
+
+## DePage element denotations
+> Useful element properties within page
+
+![page element](annotations.jpg)
+
+## todo
+
+* [ ] add support for multiple-column pdf page
+* [ ] better table structure recognition
+* [x] recognize embedded objects inside page elements
diff --git a/annotations.jpg b/annotations.jpg
diff --git a/depdf/api.py b/depdf/api.py
@@ -21,7 +21,7 @@ def wrapper(pdf_file_path, *args, **kwargs):
         elif isinstance(pdf_file_path, PDF):
             pdf = DePDF(pdf_file_path, config=config, **kwargs)
         elif isinstance(pdf_file_path, str):
-            pdf = DePDF.open(pdf_file_path, config=config, **kwargs)
+            pdf = DePDF.load(pdf_file_path, config=config, **kwargs)
         else:
             raise PDFTypeError
         res = api_func(pdf, pid) if pid > 0 else api_func(pdf)

diff --git a/depdf/base.py b/depdf/base.py
@@ -61,6 +61,10 @@ def html(self, html_value):
     def soup(self):
         return convert_html_to_soup(self._html)
 
+    def write_to(self, file_name):
+        """Write this object's HTML string to ``file_name``.
+
+        The file is created (or overwritten) and always encoded as UTF-8, so
+        non-ASCII text is written the same way regardless of the platform's
+        default locale encoding.
+
+        :param file_name: path of the file to write
+        """
+        with open(file_name, "w", encoding="utf-8") as file:
+            file.write(self.html)
+
     @property
     def to_dict(self):
         return {

diff --git a/depdf/components/image.py b/depdf/components/image.py
@@ -15,7 +15,7 @@ def __init__(self, bbox=None, src='', pid=1, img_idx=1, scan=False, config=None)
         width = bbox[2] - bbox[0]
         img_id = 'page-{pid}-image-{img_idx}'.format(pid=pid, img_idx=img_idx)
         img_class = '{img_class} page-{pid}'.format(img_class=getattr(config, 'image_class'), pid=pid)
-        html = '<img id="{img_id}" class="{img_class}" src={src} width={width}>'.format(
+        html = '<img id="{img_id}" class="{img_class}" src="{src}" width="{width}">'.format(
             img_id=img_id, img_class=img_class, src=src, width=width
         )
         html += '</img>'

diff --git a/depdf/components/paragraph.py b/depdf/components/paragraph.py
@@ -10,15 +10,17 @@ class Paragraph(InnerWrapper, Box):
     object_type = 'paragraph'
 
     @check_config
-    def __init__(self, bbox=None, text='', pid=1, para_idx=1, config=None, inner_objects=None, style=None):
+    def __init__(self, bbox=None, text='', pid=1, para_idx=1, config=None, inner_objects=None, style=None, align=None):
         para_id = 'page-{pid}-paragraph-{para_id}'.format(pid=pid, para_id=para_idx)
         para_class = '{para_class} page-{pid}'.format(para_class=getattr(config, 'paragraph_class'), pid=pid)
-        style = construct_style(style=style)
-        html = '<p id="{para_id}" class="{para_class}"{style}>'.format(
-            para_id=para_id, para_class=para_class, style=style
+        style_text = construct_style(style=style)
+        align_text = ' align="{}"'.format(align) if align else ''
+        html = '<p id="{para_id}" class="{para_class}"{align_text}{style_text}>'.format(
+            para_id=para_id, para_class=para_class, style_text=style_text, align_text=align_text
         )
         self.pid = pid
         self.para_id = para_idx
+        self.config = config
         self.bbox = bbox
         if text:
             self.text = text
@@ -34,3 +36,7 @@ def __init__(self, bbox=None, text='', pid=1, para_idx=1, config=None, inner_obj
 
     def __repr__(self):
         return '<depdf.Paragraph: ({}, {})>'.format(self.pid, self.para_id)
+
+    def save_html(self):
+        """Write this paragraph's HTML to a file named from the config's unique prefix, the page id and the paragraph id."""
+        name_parts = (self.config.unique_prefix, self.pid, self.para_id)
+        return super().write_to('{}_page_{}_paragraph_{}.html'.format(*name_parts))
diff --git a/depdf/components/span.py b/depdf/components/span.py
@@ -14,7 +14,7 @@ def __init__(self, bbox=None, span_text='', config=None, style=None):
         self.bbox = bbox
         self.text = span_text
         span_class = getattr(config, 'span_class')
-        style = construct_style(style=style)
-        self.html = '<span class="{span_class}{style}">{span_text}</span>'.format(
-            span_class=span_class, span_text=span_text, style=style
+        style_text = construct_style(style=style)
+        self.html = '<span class="{span_class}"{style_text}>{span_text}</span>'.format(
+            span_class=span_class, span_text=span_text, style_text=style_text
         )
diff --git a/depdf/components/table.py b/depdf/components/table.py
@@ -45,6 +45,10 @@ def to_dict(self):
         ]
         return table_dict
 
+    def save_html(self):
+        """Write this table's HTML to a file named from the config's unique prefix, the page id and the table id."""
+        name_parts = (self.config.unique_prefix, self.pid, self.tid)
+        return super().write_to('{}_page_{}_table_{}.html'.format(*name_parts))
+
     @property
     def html(self):
         if not self._html and hasattr(self, 'to_html'):

diff --git a/depdf/config.py b/depdf/config.py
@@ -1,4 +1,5 @@
 from functools import wraps
+import os
 
 from depdf.base import Base
 from depdf.error import ConfigTypeError
@@ -43,6 +44,9 @@ class Config(Base):
     vertical_double_line_tolerance = DEFAULT_VERTICAL_DOUBLE_LINE_TOLERANCE  # used in page class
     table_cell_merge_tolerance = DEFAULT_TABLE_CELL_MERGE_TOLERANCE
     skip_empty_table = DEFAULT_SKIP_EMPTY_TABLE
+    add_vertical_lines_flag = DEFAULT_ADD_VERTICAL_LINES_FLAG
+    add_horizontal_lines_flag = DEFAULT_ADD_HORIZONTAL_LINES_FLAG
+    add_horizontal_line_tolerance = DEFAULT_ADD_HORIZONTAL_LINE_TOLERANCE
 
     # image
     min_image_size = DEFAULT_MIN_IMAGE_SIZE
@@ -74,6 +78,10 @@ def __init__(self, **kwargs):
         self.update(**kwargs)
         self._kwargs = kwargs
 
+        # create temporary folder
+        if not os.path.isdir(self.temp_dir_prefix):
+            os.mkdir(self.temp_dir_prefix)
+
         # set logging level by log_level parameter
         logging.getLogger('depdf').setLevel(self.log_level)
 

diff --git a/depdf/page.py b/depdf/page.py
@@ -1,12 +1,11 @@
-import os
 from statistics import mean, median
 import uuid
 
 from pdfplumber.page import Page
 
 from depdf.base import Base
-from depdf.components import Image, Paragraph, Text, Span
-from depdf.config import check_config, check_config_type
+from depdf.components import Paragraph, Text, Span
+from depdf.config import check_config_type
 from depdf.error import PageTypeError
 from depdf.page_tools import *
 
@@ -164,6 +163,10 @@ def images(self):
     def images_raw(self):
         return self._images_raw
 
+    def save_html(self):
+        """Write this page's HTML to a file named from the page's prefix and page id."""
+        name_parts = (self.prefix, self.pid)
+        return super().write_to('{}_page_{}.html'.format(*name_parts))
+
     @property
     def html(self):
         if not self._html and hasattr(self, 'to_html'):
@@ -173,7 +176,9 @@ def html(self):
     @property
     def to_html(self):
         page_class = getattr(self.config, 'page_class')
-        html = '<div id="page-{}" class="{}">'.format(self.pid, page_class)
+        html = '<div id="page-{}" class="{}" new_para_start="{}" new_para_end="{}">'.format(
+            self.pid, page_class, self.new_para_start_flag, self.new_para_end_flag
+        )
         for obj in self.objects:
             html += getattr(obj, 'html', '')
         html += '</div>'
@@ -254,7 +259,9 @@ def analyze_main_frame(self):
 
     def extract_phrases(self):
         phrases = [
-            i for i in self.page.extract_words(x_tolerance=self.x_tolerance, y_tolerance=self.y_tolerance)
+            i for i in self.page.extract_words(x_tolerance=self.x_tolerance,
+                                               y_tolerance=self.y_tolerance,
+                                               keep_blank_chars=True)
             if 'top' in i and i['top'] >= self.frame_top and 'bottom' in i and i['bottom'] <= self.frame_bottom
         ]
         self.phrases = phrases
@@ -314,6 +321,19 @@ def analyze_lines(self):
         h_lines.extend(h_curves)
         v_lines.extend(v_curves)
 
+        # add extra vertical lines
+        add_vlf = getattr(self.config, 'add_vertical_lines_flag')
+        if add_vlf:
+            v_lines_add = add_vertical_lines(v_lines, h_lines, rect_edges_raw, self.page, self.ave_cs)
+            v_lines.extend(v_lines_add)
+
+        # add horizontal lines at the top and bottom
+        add_hlf = getattr(self.config, 'add_horizontal_lines_flag')
+        vlts_tolerance = getattr(self.config, 'add_horizontal_line_tolerance')
+        if add_hlf:
+            h_lines_add = add_horizontal_lines(v_lines, h_lines, vlts_tolerance=vlts_tolerance)
+            h_lines.extend(h_lines_add)
+
         # set the page's lists of horizontal and vertical edges
         self.h_edges = [{'top': i['top'], 'x0': i['x0'], 'x1': i['x1']} for i in h_lines]
         self.v_edges = [{'x': i['x0'], 'top': i['top'], 'bottom': i['bottom']} for i in v_lines]
@@ -389,7 +409,7 @@ def extract_images(self):
         for image in images_raw:
             try:
                 image_area = self.page.within_bbox(image['bbox'])
-                image_words.extend(image_area.extract_words(x_tolerance=self.ave_cs * 3 / 2))
+                image_words.extend(image_area.extract_words(x_tolerance=self.ave_cs * 3 / 2, keep_blank_chars=True))
             except:
                 pass
         self._image_phrases = image_words
@@ -411,7 +431,7 @@ def extract_paragraph(self):
         para_idx, paragraphs, paragraph_objects = 1, [], []
         ave_ts = ave_cs = self.ave_cs
         ave_lh, page_width = self.ave_lh, self.width
-        div_flag = center_flag = False
+        div_flag = center_flag = right_flag = False
         para_style = {}
         for i in self.phrases:
             if i in self.same_tmp or i in self._image_phrases or \
@@ -444,18 +464,21 @@ def extract_paragraph(self):
                         if abs(left - ll) <= 1 and p_right >= lr - ave_ts * 3 / 2:
                             new_para_flag = False  # 如果该行的左边距特别小且上一行的右边距相对较小，则认为是同一个段落
                     if new_para_flag:
-                        if abs(page_width - right - left) <= ave_ts / 2:
+                        if abs(page_width - right - left) <= ave_ts * 2:
                             if abs(lr - right) >= 4 * ave_ts:  # 段前有四个 char_size 大小的空白
                                 center_flag = True
                         if left > ll + ave_ts * 4:
                             div_flag = True
+                            if right >= lr - ave_ts:
+                                right_flag = True
                 elif abs(left - p_right) >= ave_ts * 2:  # 同一行需要判定该段落是否为文本框组合
                     if abs(top - p_top) <= ave_ts / 2:
                         new_line_flag = new_para_flag = False
 
             if new_para_flag and paragraph_objects:
+                align = para_style.pop('align') if 'align' in para_style else None
                 paragraphs.append(Paragraph(
-                    pid=self.pid, para_idx=para_idx, config=self.config,
+                    pid=self.pid, para_idx=para_idx, config=self.config, align=align,
                     inner_objects=paragraph_objects, style=para_style
                 ))
                 para_style = {}
@@ -469,6 +492,8 @@ def extract_paragraph(self):
                     para_style.update({'align': 'center'})
                 elif div_flag:
                     para_style.update({'margin-left': '{0}px'.format((left - ll))})
+                    if right_flag:
+                        para_style.update({'align': 'right'})
 
             if new_line_flag:
                 paragraph_objects.append(Text(bbox=bbox, text=text))
@@ -486,6 +511,7 @@ def extract_paragraph(self):
             if center_flag:
                 para_style.update({'align': 'center'})
             elif div_flag:
+                para_style.update({'align': 'left'})
                 para_style.update({'margin-left': '{0}px'.format((left - ll))})
             paragraphs.append(Paragraph(
                 pid=self.pid, para_idx=para_idx, config=self.config,