Skip to content

Commit

Permalink
welcome to the first release
Browse files Browse the repository at this point in the history
  • Loading branch information
meldonization committed Mar 12, 2020
1 parent 9a6e33b commit dfd5b5b
Show file tree
Hide file tree
Showing 21 changed files with 328 additions and 35 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
.DS_Store
.idea
__pycache__
*.py[cod]
*.py[cod]
temp_depdf/
119 changes: 116 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,118 @@
# depdf
# DePDF

An ultimate pdf file disintegration tool. Yet able to extract pages embedded with tables and paragraphs into structured markup language.
An ultimate PDF file disintegration tool. DePDF is designed to extract tables and paragraphs from PDF pages into a structured markup language (e.g. HTML). You can also use it to convert a single page or an entire PDF to HTML.

Built on [`pdfplumber`](https://github.com/jsvine/pdfplumber)
Built on top of [`pdfplumber`](https://github.com/jsvine/pdfplumber)

# Table of Contents
[toc]


# Installation
`pip install depdf`

# Example
```python
from depdf import DePDF
from depdf import DePage
from depdf.config import Config

# general
with DePDF.load('test/test_general.pdf') as pdf:
    pdf_html = pdf.to_html
    print(pdf_html)

# with dedicated configurations
c = Config(
debug_flag=True,
verbose_flag=True,
add_line_flag=True
)
pdf = DePDF.load('test/test_general.pdf', config=c)
page_index = 23  # page indices are zero-based
page = pdf.pages[page_index]
page_soup = page.soup
print(page_soup.text)
```


# APIs
| **functions** | usage |
|:---:|---|
| `extract_page_paragraphs` | extract paragraphs from a specific page |
| `extract_page_tables` | extract tables from a specific page |
| `convert_pdf_to_html` | convert the entire pdf to html |
| `convert_page_to_html` | convert a specific page to html |


# In-Depth

## In-page elements
* Paragraph
    + Text
    + Span
* Table
    + Cell
* Image

## Common properties
| **property & method** | explanation |
|:---:|---|
| `html` | converted html string |
| `soup` | converted BeautifulSoup object |
| `bbox` | bounding box region |
| `save_html` | write the html to a local file |

## DePDF HTML structure
```html
<div class="{pdf_class}">
%for <!--page-{pid}-->
<div id="page-{}" class="{}">
%for {html_elements} endfor%
</div>
endfor%
</div>
```

## DePage HTML element structure

### Paragraph
```html
<p>
{paragraph-content}
<span> {span-content} </span>
...
</p>
```

### Table
```html
<table>
<tr>
<td> {cell_0_0} </td>
<td> {cell_0_1} </td>
...
</tr>
<tr>
<td colspan="2"> {cell_1_0} </td>
...
</tr>
...
</table>
```

### Image
```html
<img src="temp_depdf/$prefix.png"></img>
```
# Appendix

## DePage element denotations
> Useful element properties within page
![page element](annotations.jpg)

## todo

* [ ] add support for multi-column pdf pages
* [ ] better table structure recognition
* [x] recognize embedded objects inside page elements
Binary file added annotations.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2 changes: 1 addition & 1 deletion depdf/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def wrapper(pdf_file_path, *args, **kwargs):
elif isinstance(pdf_file_path, PDF):
pdf = DePDF(pdf_file_path, config=config, **kwargs)
elif isinstance(pdf_file_path, str):
pdf = DePDF.open(pdf_file_path, config=config, **kwargs)
pdf = DePDF.load(pdf_file_path, config=config, **kwargs)
else:
raise PDFTypeError
res = api_func(pdf, pid) if pid > 0 else api_func(pdf)
Expand Down
4 changes: 4 additions & 0 deletions depdf/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,10 @@ def html(self, html_value):
def soup(self):
return convert_html_to_soup(self._html)

def write_to(self, file_name):
    """Write this object's HTML string to a local file.

    The file is written as UTF-8 so that non-ASCII text in the HTML does not
    depend on the platform's default encoding.

    :param file_name: path of the file to create or overwrite.
    """
    with open(file_name, "w", encoding="utf-8") as file:
        file.write(self.html)

@property
def to_dict(self):
return {
Expand Down
2 changes: 1 addition & 1 deletion depdf/components/image.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def __init__(self, bbox=None, src='', pid=1, img_idx=1, scan=False, config=None)
width = bbox[2] - bbox[0]
img_id = 'page-{pid}-image-{img_idx}'.format(pid=pid, img_idx=img_idx)
img_class = '{img_class} page-{pid}'.format(img_class=getattr(config, 'image_class'), pid=pid)
html = '<img id="{img_id}" class="{img_class}" src={src} width={width}>'.format(
html = '<img id="{img_id}" class="{img_class}" src="{src}" width="{width}">'.format(
img_id=img_id, img_class=img_class, src=src, width=width
)
html += '</img>'
Expand Down
14 changes: 10 additions & 4 deletions depdf/components/paragraph.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,17 @@ class Paragraph(InnerWrapper, Box):
object_type = 'paragraph'

@check_config
def __init__(self, bbox=None, text='', pid=1, para_idx=1, config=None, inner_objects=None, style=None):
def __init__(self, bbox=None, text='', pid=1, para_idx=1, config=None, inner_objects=None, style=None, align=None):
para_id = 'page-{pid}-paragraph-{para_id}'.format(pid=pid, para_id=para_idx)
para_class = '{para_class} page-{pid}'.format(para_class=getattr(config, 'paragraph_class'), pid=pid)
style = construct_style(style=style)
html = '<p id="{para_id}" class="{para_class}"{style}>'.format(
para_id=para_id, para_class=para_class, style=style
style_text = construct_style(style=style)
align_text = ' align="{}"'.format(align) if align else ''
html = '<p id="{para_id}" class="{para_class}"{align_text}{style_text}>'.format(
para_id=para_id, para_class=para_class, style_text=style_text, align_text=align_text
)
self.pid = pid
self.para_id = para_idx
self.config = config
self.bbox = bbox
if text:
self.text = text
Expand All @@ -34,3 +36,7 @@ def __init__(self, bbox=None, text='', pid=1, para_idx=1, config=None, inner_obj

def __repr__(self):
return '<depdf.Paragraph: ({}, {})>'.format(self.pid, self.para_id)

def save_html(self):
    """Write this paragraph's HTML to a file named after its prefix, page id and paragraph id.

    The file name is ``<unique_prefix>_page_<pid>_paragraph_<para_id>.html``;
    the result of the parent ``write_to`` call is returned unchanged.
    """
    name_template = '{}_page_{}_paragraph_{}.html'
    prefix = self.config.unique_prefix
    target = name_template.format(prefix, self.pid, self.para_id)
    return super().write_to(target)
6 changes: 3 additions & 3 deletions depdf/components/span.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def __init__(self, bbox=None, span_text='', config=None, style=None):
self.bbox = bbox
self.text = span_text
span_class = getattr(config, 'span_class')
style = construct_style(style=style)
self.html = '<span class="{span_class}{style}">{span_text}</span>'.format(
span_class=span_class, span_text=span_text, style=style
style_text = construct_style(style=style)
self.html = '<span class="{span_class}"{style_text}>{span_text}</span>'.format(
span_class=span_class, span_text=span_text, style_text=style_text
)
4 changes: 4 additions & 0 deletions depdf/components/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,10 @@ def to_dict(self):
]
return table_dict

def save_html(self):
    """Write this table's HTML to a file named after its prefix, page id and table id.

    The file name is ``<unique_prefix>_page_<pid>_table_<tid>.html``;
    the result of the parent ``write_to`` call is returned unchanged.
    """
    name_template = '{}_page_{}_table_{}.html'
    prefix = self.config.unique_prefix
    target = name_template.format(prefix, self.pid, self.tid)
    return super().write_to(target)

@property
def html(self):
if not self._html and hasattr(self, 'to_html'):
Expand Down
8 changes: 8 additions & 0 deletions depdf/config.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from functools import wraps
import os

from depdf.base import Base
from depdf.error import ConfigTypeError
Expand Down Expand Up @@ -43,6 +44,9 @@ class Config(Base):
vertical_double_line_tolerance = DEFAULT_VERTICAL_DOUBLE_LINE_TOLERANCE # used in page class
table_cell_merge_tolerance = DEFAULT_TABLE_CELL_MERGE_TOLERANCE
skip_empty_table = DEFAULT_SKIP_EMPTY_TABLE
add_vertical_lines_flag = DEFAULT_ADD_VERTICAL_LINES_FLAG
add_horizontal_lines_flag = DEFAULT_ADD_HORIZONTAL_LINES_FLAG
add_horizontal_line_tolerance = DEFAULT_ADD_HORIZONTAL_LINE_TOLERANCE

# image
min_image_size = DEFAULT_MIN_IMAGE_SIZE
Expand Down Expand Up @@ -74,6 +78,10 @@ def __init__(self, **kwargs):
self.update(**kwargs)
self._kwargs = kwargs

# create temporary folder
if not os.path.isdir(self.temp_dir_prefix):
os.mkdir(self.temp_dir_prefix)

# set logging level by log_level parameter
logging.getLogger('depdf').setLevel(self.log_level)

Expand Down
44 changes: 35 additions & 9 deletions depdf/page.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
import os
from statistics import mean, median
import uuid

from pdfplumber.page import Page

from depdf.base import Base
from depdf.components import Image, Paragraph, Text, Span
from depdf.config import check_config, check_config_type
from depdf.components import Paragraph, Text, Span
from depdf.config import check_config_type
from depdf.error import PageTypeError
from depdf.page_tools import *

Expand Down Expand Up @@ -164,6 +163,10 @@ def images(self):
def images_raw(self):
return self._images_raw

def save_html(self):
    """Write this page's HTML to a file named after its prefix and page id.

    The file name is ``<prefix>_page_<pid>.html``; the result of the parent
    ``write_to`` call is returned unchanged.
    """
    name_template = '{}_page_{}.html'
    target = name_template.format(self.prefix, self.pid)
    return super().write_to(target)

@property
def html(self):
if not self._html and hasattr(self, 'to_html'):
Expand All @@ -173,7 +176,9 @@ def html(self):
@property
def to_html(self):
page_class = getattr(self.config, 'page_class')
html = '<div id="page-{}" class="{}">'.format(self.pid, page_class)
html = '<div id="page-{}" class="{}" new_para_start="{}" new_para_end="{}">'.format(
self.pid, page_class, self.new_para_start_flag, self.new_para_end_flag
)
for obj in self.objects:
html += getattr(obj, 'html', '')
html += '</div>'
Expand Down Expand Up @@ -254,7 +259,9 @@ def analyze_main_frame(self):

def extract_phrases(self):
phrases = [
i for i in self.page.extract_words(x_tolerance=self.x_tolerance, y_tolerance=self.y_tolerance)
i for i in self.page.extract_words(x_tolerance=self.x_tolerance,
y_tolerance=self.y_tolerance,
keep_blank_chars=True)
if 'top' in i and i['top'] >= self.frame_top and 'bottom' in i and i['bottom'] <= self.frame_bottom
]
self.phrases = phrases
Expand Down Expand Up @@ -314,6 +321,19 @@ def analyze_lines(self):
h_lines.extend(h_curves)
v_lines.extend(v_curves)

# add inferred vertical lines
add_vlf = getattr(self.config, 'add_vertical_lines_flag')
if add_vlf:
v_lines_add = add_vertical_lines(v_lines, h_lines, rect_edges_raw, self.page, self.ave_cs)
v_lines.extend(v_lines_add)

# add horizontal lines at the top and bottom
add_hlf = getattr(self.config, 'add_horizontal_lines_flag')
vlts_tolerance = getattr(self.config, 'add_horizontal_line_tolerance')
if add_hlf:
h_lines_add = add_horizontal_lines(v_lines, h_lines, vlts_tolerance=vlts_tolerance)
h_lines.extend(h_lines_add)

# set the page's horizontal and vertical edge lists
self.h_edges = [{'top': i['top'], 'x0': i['x0'], 'x1': i['x1']} for i in h_lines]
self.v_edges = [{'x': i['x0'], 'top': i['top'], 'bottom': i['bottom']} for i in v_lines]
Expand Down Expand Up @@ -389,7 +409,7 @@ def extract_images(self):
for image in images_raw:
try:
image_area = self.page.within_bbox(image['bbox'])
image_words.extend(image_area.extract_words(x_tolerance=self.ave_cs * 3 / 2))
image_words.extend(image_area.extract_words(x_tolerance=self.ave_cs * 3 / 2, keep_blank_chars=True))
except:
pass
self._image_phrases = image_words
Expand All @@ -411,7 +431,7 @@ def extract_paragraph(self):
para_idx, paragraphs, paragraph_objects = 1, [], []
ave_ts = ave_cs = self.ave_cs
ave_lh, page_width = self.ave_lh, self.width
div_flag = center_flag = False
div_flag = center_flag = right_flag = False
para_style = {}
for i in self.phrases:
if i in self.same_tmp or i in self._image_phrases or \
Expand Down Expand Up @@ -444,18 +464,21 @@ def extract_paragraph(self):
if abs(left - ll) <= 1 and p_right >= lr - ave_ts * 3 / 2:
new_para_flag = False # 如果该行的左边距特别小且上一行的右边距相对较小,则认为是同一个段落
if new_para_flag:
if abs(page_width - right - left) <= ave_ts / 2:
if abs(page_width - right - left) <= ave_ts * 2:
if abs(lr - right) >= 4 * ave_ts: # 段前有四个 char_size 大小的空白
center_flag = True
if left > ll + ave_ts * 4:
div_flag = True
if right >= lr - ave_ts:
right_flag = True
elif abs(left - p_right) >= ave_ts * 2: # 同一行需要判定该段落是否为文本框组合
if abs(top - p_top) <= ave_ts / 2:
new_line_flag = new_para_flag = False

if new_para_flag and paragraph_objects:
align = para_style.pop('align') if 'align' in para_style else None
paragraphs.append(Paragraph(
pid=self.pid, para_idx=para_idx, config=self.config,
pid=self.pid, para_idx=para_idx, config=self.config, align=align,
inner_objects=paragraph_objects, style=para_style
))
para_style = {}
Expand All @@ -469,6 +492,8 @@ def extract_paragraph(self):
para_style.update({'align': 'center'})
elif div_flag:
para_style.update({'margin-left': '{0}px'.format((left - ll))})
if right_flag:
para_style.update({'align': 'right'})

if new_line_flag:
paragraph_objects.append(Text(bbox=bbox, text=text))
Expand All @@ -486,6 +511,7 @@ def extract_paragraph(self):
if center_flag:
para_style.update({'align': 'center'})
elif div_flag:
para_style.update({'align': 'left'})
para_style.update({'margin-left': '{0}px'.format((left - ll))})
paragraphs.append(Paragraph(
pid=self.pid, para_idx=para_idx, config=self.config,
Expand Down
Loading

0 comments on commit dfd5b5b

Please sign in to comment.