Skip to content

Commit

Permalink
Merge branch 'master' into refactor-loader
Browse files Browse the repository at this point in the history
  • Loading branch information
bosd authored Mar 6, 2023
2 parents 19d9f1d + e3a5a2d commit 9597a9e
Show file tree
Hide file tree
Showing 15 changed files with 221 additions and 33 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ jobs:
runs-on: ${{ matrix.os }}
strategy:
matrix:
python-version: [3.7, 3.8, 3.9, 3.10.x]
python-version: [3.7, 3.8, 3.9, 3.10.x, 3.11]
os: [ubuntu-latest]

steps:
Expand Down
35 changes: 33 additions & 2 deletions TUTORIAL.md
Original file line number Diff line number Diff line change
Expand Up @@ -96,8 +96,9 @@ Optional properties:

- `type` (if present must be one of: `int`, `float`, `date`) - results
in parsing every matched value to the specified type
- `group` (if present must be `sum`) - results in grouping all matched
values using specified method
- `group` (if present must be one of: `sum`, `min`, `max`, `first`,
`last`) - specifies grouping function (defines what value to return in
case of multiple matches)

Example for `regex`:

Expand Down Expand Up @@ -136,6 +137,10 @@ This parser allows parsing selected invoice section as a set of lines
sharing some pattern. Those can be e.g. invoice items (good or services)
or VAT rates.

Some companies may use multiple formats for their line-based data. In
such cases multiple sets of parsing regexes can be added to the `rules`.
Results from multiple `rules` get merged into a single array.

It replaces the `lines` plugin and should be preferred over it. It can
be reused in multiple `fields`.

Expand All @@ -148,6 +153,17 @@ Example for `fields`:
end: \s+Total
line: (?P<description>.+)\s+(?P<discount>\d+.\d+)\s+(?P<price>\d+\d+)

fields:
lines:
parser: lines
rules:
- start: Item\s+Discount\s+Price$
end: \s+Total
line: (?P<description>.+)\s+(?P<discount>\d+.\d+)\s+(?P<price>\d+\d+)
- start: Item\s+Price$
end: \s+Total
line: (?P<description>.+)\s+(?P<price>\d+\d+)

### Legacy regexes

For non-text fields, the name of the field is important:
Expand Down Expand Up @@ -301,6 +317,21 @@ options and their defaults are:
different fields, you can supply a list here. The extraction will
fail if not all fields are matched.

### Priority

When multiple templates match a single invoice, the one with the
highest priority is used. The default `priority` value (assigned if
missing) is 5.

This property needs to be specified only when designing some generic or
very specific templates.

Suggested values:

- 0-4: accounting/invoice software specific template
- 5: company specific template
- 6-10: company department/unit specific template

### Example of template using most options

issuer: Free Mobile
Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
name = invoice2data
author = Manuel Riel
description = Python parser to extract data from pdf invoice
version = 0.4.2
version = 0.4.3
url = https://github.com/invoice-x/invoice2data
keywords =
pdf
Expand Down
5 changes: 2 additions & 3 deletions src/invoice2data/extract/invoice_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ def prepare_input(self, extracted_str: str) -> str:

# Remove accents
if self.options["remove_accents"]:
optimized_str = unicodedata.normalize('NFKD', optimized_str).encode('ascii', 'ignore').decode('ascii')
optimized_str = re.sub('[\u0300-\u0362]', '', unicodedata.normalize('NFKD', optimized_str))

# Convert to lower case
if self.options["lowercase"]:
Expand Down Expand Up @@ -201,8 +201,7 @@ def extract(self, optimized_str: str, invoice_file: str, input_module: str) -> O
# Do NOT overwrite optimized_str. We're inside a loop and it will affect all other fields!
optimized_str_area = input_module.to_text(invoice_file, v['area']).decode("utf-8")
# Log the text
logger.debug("START pdftotext area result ===========================")
logger.debug(optimized_str_area)
logger.debug("START pdftotext area result ===========================\n%s", optimized_str_area)
logger.debug("END pdftotext area result =============================")
optimized_str_for_parser = optimized_str_area
else:
Expand Down
2 changes: 0 additions & 2 deletions src/invoice2data/extract/parsers/__interface__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
# SPDX-License-Identifier: MIT

"""
Interface for fields parsers.
Expand Down
22 changes: 20 additions & 2 deletions src/invoice2data/extract/parsers/lines.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,10 +115,10 @@ def parse_block(template, field, settings, content):
return lines


def parse(template, field, _settings, content):
def parse_by_rule(template, field, rule, content):
# First apply default options.
settings = DEFAULT_OPTIONS.copy()
settings.update(_settings)
settings.update(rule)

# Validate settings
assert "start" in settings, "Lines start regex missing"
Expand Down Expand Up @@ -152,6 +152,24 @@ def parse(template, field, _settings, content):
return lines


def parse(template, field, settings, content):
    """Parse line-based data for ``field`` using one or more rule sets.

    Two configuration layouts are accepted:

    * ``settings`` contains a ``rules`` key: its value is iterated and each
      item is treated as an independent set of line-parsing rules.
    * otherwise (original syntax): the rule keys stored directly in the
      field's own settings are collected into a single rule set.

    Every rule set is handed to ``parse_by_rule`` and the results are merged,
    in order, into one list. A rule set for which ``parse_by_rule`` returns
    ``None`` contributes nothing.

    Returns the merged list (empty if no rule set produced any lines).
    """
    if "rules" not in settings:
        # Original syntax: only the known rule keys are carried over; any
        # other key of the field settings is left out of the rule set.
        legacy_rule = {}
        for key, value in settings.items():
            if key in ('start', 'end', 'line', 'first_line', 'last_line', 'skip_line', 'types'):
                legacy_rule[key] = value
        rule_sets = [legacy_rule]
    else:
        # One field can carry several sets of line-parsing rules.
        rule_sets = settings['rules']

    merged = []
    for rule_set in rule_sets:
        parsed = parse_by_rule(template, field, rule_set, content)
        if parsed is None:
            continue
        merged += parsed

    return merged


def parse_current_row(match, current_row):
# Parse the current row data
for field, value in match.groupdict().items():
Expand Down
8 changes: 8 additions & 0 deletions src/invoice2data/extract/parsers/regex.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,14 @@ def parse(template, field, settings, content, legacy=False):
if "group" in settings:
if settings["group"] == "sum":
result = sum(result)
elif settings["group"] == "min":
result = min(result)
elif settings["group"] == "max":
result = max(result)
elif settings["group"] == "first":
result = result[0]
elif settings["group"] == "last":
result = result[-1]
else:
logger.warning("Unsupported grouping method: " + settings["group"])
return None
Expand Down
43 changes: 43 additions & 0 deletions src/invoice2data/extract/templates/pl/pl.insert.subiekt-gt.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# SPDX-License-Identifier: MIT
# Generic template for Polish invoices; judging by the file name it targets
# documents generated by InsERT Subiekt GT rather than one specific issuer.
#
# Keywords used to decide whether this template applies to an invoice.
# NOTE(review): presumably every keyword must be found in the text - confirm.
# The parentheses in 'Wystawił\(a\)' / 'Odebrał\(a\)' are escaped, which
# suggests keywords are evaluated as regular expressions - confirm.
keywords:
- 'Miejsce wystawienia:'
- 'Data wystawienia:'
- 'Sprzedawca:'
- 'Nabywca:'
- 'według stawki VAT'
- 'Razem do zapłaty:'
- 'Wystawił\(a\)'
- 'Odebrał\(a\)'
- 'Podpis osoby upoważnionej'
fields:
# Captures the text on the line after "Sprzedawca:", up to the first run of
# three or more whitespace characters.
issuer:
parser: regex
regex: Sprzedawca:.*\n(.*?)\s{3,}
# Ten digits following "NIP:", parsed as int. "NIP:" can match more than once
# in a document; `group: first` keeps only the first match.
vatin:
parser: regex
regex: NIP:\s+(\d{10})
type: int
group: first
# Two alternative date layouts (dd.mm.yyyy and yyyy-mm-dd); they correspond to
# the two entries in options.date_formats below.
date:
parser: regex
regex:
- Data wystawienia:\n.*(\d{2}\.\d{2}\.\d{4})
- Data wystawienia:\n.*(\d{4}-\d{2}-\d{2})
type: date
# Text between "Faktura VAT" and "oryginał".
invoice_number:
parser: regex
regex: Faktura VAT\s+(.*?)\s+oryginał
# Digits (whitespace allowed between them) followed by a comma and exactly two
# digits; the comma is declared as the decimal separator in options.
amount:
parser: regex
regex: Razem do zapłaty:\s+([\d\s]+,[\d][\d])
type: float
# Two digits followed by six groups of four digits, each optionally preceded
# by one whitespace character (26 digits in total).
# NOTE(review): presumably the bank account number (NRB) - confirm.
nrb:
parser: regex
regex: PLN:\s+([0-9]{2}(?:\s?[0-9]{4}){6})
options:
currency: PLN
date_formats:
- '%d.%m.%Y'
- '%Y-%m-%d'
decimal_separator: ','
# Lower than the default priority of 5, so a template left at the default
# wins when both match the same invoice.
priority: 3
33 changes: 33 additions & 0 deletions src/invoice2data/extract/templates/pl/pl.insert.subiekt-nexo.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# SPDX-License-Identifier: MIT
# Generic template for Polish invoices; judging by the file name and the
# keyword below it targets documents generated by InsERT Subiekt nexo.
#
# Single keyword used to decide whether this template applies to an invoice.
keywords:
- 'InsERT nexo'
fields:
# Captures the text on the line after the one containing "Sprzedawca", up to
# the first run of three or more whitespace characters.
issuer:
parser: regex
regex: Sprzedawca.*\n(.*?)\s{3,}
# Ten digits following "NIP:", parsed as int. "NIP:" can match more than once
# in a document; `group: first` keeps only the first match.
vatin:
parser: regex
regex: NIP:\s+(\d{10})
type: int
group: first
# Date in dd-mm-yyyy layout, matching options.date_formats below.
date:
parser: regex
regex: Data wystawienia\s+(\d{2}-\d{2}-\d{4})
type: date
# Rest of the line after "Faktura VAT sprzedaży"; `group: first` keeps only
# the first match if the phrase occurs several times.
invoice_number:
parser: regex
regex: Faktura VAT sprzedaży\s+(.*)
group: first
# Digits (whitespace allowed between them) followed by a comma and exactly two
# digits; the comma is declared as the decimal separator in options.
amount:
parser: regex
regex: Razem do zapłaty:\s+([\d\s]+,[\d][\d])
type: float
# Two digits followed by six groups of four digits, each optionally preceded
# by one whitespace character (26 digits in total), after a "PL" prefix.
# NOTE(review): presumably the bank account number (NRB) - confirm.
nrb:
parser: regex
regex: PL\s+([0-9]{2}(?:\s?[0-9]{4}){6})
options:
currency: PLN
date_formats:
- '%d-%m-%Y'
decimal_separator: ','
# Lower than the default priority of 5, so a template left at the default
# wins when both match the same invoice.
priority: 3
4 changes: 2 additions & 2 deletions src/invoice2data/input/pdftotext.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,9 @@ def to_text(path: str, area_details: dict = None):
If pdftotext library is not found
"""
import subprocess
from distutils import spawn # py2 compat
import shutil

if spawn.find_executable("pdftotext"): # shutil.which('pdftotext'):
if shutil.which('pdftotext'):
cmd = ["pdftotext", "-layout", "-enc", "UTF-8"]
if area_details is not None:
# An area was specified
Expand Down
36 changes: 30 additions & 6 deletions src/invoice2data/input/tesseract.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-

from distutils import spawn
import shutil
import tempfile
import mimetypes

Expand All @@ -13,13 +13,16 @@
logger = logging.getLogger(__name__)


def to_text(path):
def to_text(path: str, area_details: dict = None):
"""Wraps Tesseract OCR with auto language model.
Parameters
----------
path : str
path of electronic invoice in PDF, JPG or PNG format
area_details : dictionary
of the format {x: int, y: int, r: int, W: int, H: int}
used when extracting an area of the pdf rather than the whole document
Returns
-------
Expand All @@ -29,9 +32,9 @@ def to_text(path):
"""

# Check for dependencies. Needs Tesseract and Imagemagick installed.
if not spawn.find_executable("tesseract"):
if not shutil.which("tesseract"):
raise EnvironmentError("tesseract not installed.")
if not spawn.find_executable("convert"):
if not shutil.which("convert"):
raise EnvironmentError("imagemagick not installed.")

language = get_languages()
Expand Down Expand Up @@ -105,9 +108,30 @@ def to_text(path):
"-layout",
"-enc",
"UTF-8",
TMP_FOLDER + filename + ".pdf",
"-",
]
if area_details is not None:
# An area was specified
# Validate the required keys were provided
assert 'f' in area_details, 'Area r details missing'
assert 'l' in area_details, 'Area r details missing'
assert 'r' in area_details, 'Area r details missing'
assert 'x' in area_details, 'Area x details missing'
assert 'y' in area_details, 'Area y details missing'
assert 'W' in area_details, 'Area W details missing'
assert 'H' in area_details, 'Area H details missing'
# Convert all of the values to strings
for key in area_details.keys():
area_details[key] = str(area_details[key])
pdftotext_cmd += [
'-f', area_details['f'],
'-l', area_details['l'],
'-r', area_details['r'],
'-x', area_details['x'],
'-y', area_details['y'],
'-W', area_details['W'],
'-H', area_details['H'],
]
pdftotext_cmd += [TMP_FOLDER + filename + ".pdf", "-"]

logger.debug("Calling pdfttext with, %s", pdftotext_cmd)
p3 = Popen(pdftotext_cmd, stdin=p2.stdout, stdout=PIPE)
Expand Down
25 changes: 11 additions & 14 deletions src/invoice2data/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,10 +79,6 @@ def extract_data(invoicefile, templates=None, input_module=None):
'currency': 'INR', 'desc': 'Invoice IBZY2087 from OYO'}
"""
if templates is None:
templates = read_templates()

# print(templates[0])

if input_module is None:
if invoicefile.lower().endswith('.txt'):
Expand All @@ -98,17 +94,18 @@ def extract_data(invoicefile, templates=None, input_module=None):
logger.debug("START pdftotext result ===========================\n" + extracted_str)
logger.debug("END pdftotext result =============================")

for t in templates:
optimized_str = t.prepare_input(extracted_str)

if t.matches_input(optimized_str):
logger.info("Using %s template", t["template_name"])
# Call extract with entire text and the invoicefile path
# The path is used if an area is called as a field option
return t.extract(optimized_str, invoicefile, input_module)
if templates is None:
templates = read_templates()
templates = filter(lambda t: t.matches_input(t.prepare_input(extracted_str)), templates)
templates = sorted(templates, key=lambda k: k['priority'], reverse=True)
if not templates:
logger.error("No template for %s", invoicefile)
return False

logger.error("No template for %s", invoicefile)
return False
t = templates[0]
logger.info("Using %s template", t["template_name"])
optimized_str = t.prepare_input(extracted_str)
return t.extract(optimized_str, invoicefile, input_module)


def create_parser():
Expand Down
7 changes: 7 additions & 0 deletions tests/custom/lines-multiple-patterns.json
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,13 @@
{ "pos": 6, "name": "Penguin" },
{ "pos": 7, "name": "Ostrich" }
],
"dimensions": [
{ "pos": 1, "angle": 30, "length": 30 },
{ "pos": 2, "angle": 45, "length": 40 },
{ "pos": 3, "angle": 90, "length": 60 },
{ "pos": 4, "length": 80, "angle": 135 },
{ "pos": 5, "length": 100, "angle": 180 }
],
"currency": "EUR",
"desc": "Invoice from Lines Tests"
}
Expand Down
13 changes: 13 additions & 0 deletions tests/custom/lines-multiple-patterns.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ Total: 50.00 EUR

Lines with multiple patterns


Lines start

Group: Mammals
Expand All @@ -21,3 +22,15 @@ Subgroup: Flightless
7. Ostrich

Lines end


No Angle [°] Length [cm]
1 30 30
2 45 40
3 90 60
Count: 3

No Length [cm] Angle [°]
4 80 135
5 100 180
Count: 2
Loading

0 comments on commit 9597a9e

Please sign in to comment.