Merge branch 'master' into refactor-loader

invoice-x · Mar 6, 2023 · 9597a9e · 9597a9e
2 parents 19d9f1d + e3a5a2d
commit 9597a9e
Show file tree

Hide file tree

Showing 15 changed files with 221 additions and 33 deletions.
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -8,7 +8,7 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        python-version: [3.7, 3.8, 3.9, 3.10.x]
+        python-version: [3.7, 3.8, 3.9, 3.10.x, 3.11]
         os: [ubuntu-latest]
 
     steps:

diff --git a/TUTORIAL.md b/TUTORIAL.md
@@ -96,8 +96,9 @@ Optional properties:
 
 - `type` (if present must be one of: `int`, `float`, `date`) - results
   in parsing every matched value to a specified type
-- `group` (if present must be `sum`) - results in grouping all matched
-  values using specified method
+- `group` (if present must be one of: `sum`, `min`, `max`, `first`,
+  `last`) - specifies the grouping function (defines which value to
+  return in case of multiple matches)
 
 Example for `regex`:
 
@@ -136,6 +137,10 @@ This parser allows parsing selected invoice section as a set of lines
 sharing some pattern. Those can be e.g. invoice items (good or services)
 or VAT rates.
 
+Some companies may use multiple formats for their line-based data. In
+such cases, multiple sets of parsing regexes can be listed under `rules`.
+The results from all `rules` are merged into a single array.
+
 It replaces `lines` plugin and should be preferred over it. It allows
 reusing in multiple `fields`.
 
@@ -148,6 +153,17 @@ Example for `fields`:
         end: \s+Total
         line: (?P<description>.+)\s+(?P<discount>\d+.\d+)\s+(?P<price>\d+\d+)
 
+    fields:
+      lines:
+        parser: lines
+        rules:
+          - start: Item\s+Discount\s+Price$
+            end: \s+Total
+            line: (?P<description>.+)\s+(?P<discount>\d+.\d+)\s+(?P<price>\d+\d+)
+          - start: Item\s+Price$
+            end: \s+Total
+            line: (?P<description>.+)\s+(?P<price>\d+\d+)
+
 ### Legacy regexes
 
 For non-text fields, the name of the field is important:
@@ -301,6 +317,21 @@ options and their defaults are:
   different fields, you can supply a list here. The extraction will
   fail if not all fields are matched.
 
+### Priority
+
+If multiple templates match a single invoice, the one with the highest
+priority is used. The default `priority` value (assigned if missing)
+is 5.
+
+This property needs to be specified only when designing some generic or
+very specific templates.
+
+Suggested values:
+
+- 0-4: accounting/invoice software specific template
+- 5: company specific template
+- 6-10: company department/unit specific template
+
 ### Example of template using most options
 
     issuer: Free Mobile

diff --git a/setup.cfg b/setup.cfg
@@ -2,7 +2,7 @@
 name = invoice2data
 author = Manuel Riel
 description = Python parser to extract data from pdf invoice
-version = 0.4.2
+version = 0.4.3
 url = https://github.com/invoice-x/invoice2data
 keywords =
   pdf

diff --git a/src/invoice2data/extract/invoice_template.py b/src/invoice2data/extract/invoice_template.py
@@ -82,7 +82,7 @@ def prepare_input(self, extracted_str: str) -> str:
 
         # Remove accents
         if self.options["remove_accents"]:
-            optimized_str = unicodedata.normalize('NFKD', optimized_str).encode('ascii', 'ignore').decode('ascii')
+            optimized_str = re.sub('[\u0300-\u0362]', '', unicodedata.normalize('NFKD', optimized_str))
 
         # Convert to lower case
         if self.options["lowercase"]:
@@ -201,8 +201,7 @@ def extract(self, optimized_str: str, invoice_file: str, input_module: str) -> O
                     # Do NOT overwrite optimized_str. We're inside a loop and it will affect all other fields!
                     optimized_str_area = input_module.to_text(invoice_file, v['area']).decode("utf-8")
                     # Log the text
-                    logger.debug("START pdftotext area result ===========================")
-                    logger.debug(optimized_str_area)
+                    logger.debug("START pdftotext area result ===========================\n%s", optimized_str_area)
                     logger.debug("END pdftotext area result =============================")
                     optimized_str_for_parser = optimized_str_area
                 else:

diff --git a/src/invoice2data/extract/parsers/__interface__.py b/src/invoice2data/extract/parsers/__interface__.py
@@ -1,5 +1,3 @@
-# SPDX-License-Identifier: MIT
-
 """
 Interface for fields parsers.
 

diff --git a/src/invoice2data/extract/parsers/lines.py b/src/invoice2data/extract/parsers/lines.py
@@ -115,10 +115,10 @@ def parse_block(template, field, settings, content):
     return lines
 
 
-def parse(template, field, _settings, content):
+def parse_by_rule(template, field, rule, content):
     # First apply default options.
     settings = DEFAULT_OPTIONS.copy()
-    settings.update(_settings)
+    settings.update(rule)
 
     # Validate settings
     assert "start" in settings, "Lines start regex missing"
@@ -152,6 +152,24 @@ def parse(template, field, _settings, content):
     return lines
 
 
+def parse(template, field, settings, content):
+    if "rules" in settings:
+        # One field can have multiple sets of line-parsing rules
+        rules = settings['rules']
+    else:
+        # Original syntax stored line-parsing rules in top field YAML object
+        keys = ('start', 'end', 'line', 'first_line', 'last_line', 'skip_line', 'types')
+        rules = [{k: v for k, v in settings.items() if k in keys}]
+
+    lines = []
+    for rule in rules:
+        new_lines = parse_by_rule(template, field, rule, content)
+        if new_lines is not None:
+            lines += new_lines
+
+    return lines
+
+
 def parse_current_row(match, current_row):
     # Parse the current row data
     for field, value in match.groupdict().items():

diff --git a/src/invoice2data/extract/parsers/regex.py b/src/invoice2data/extract/parsers/regex.py
@@ -49,6 +49,14 @@ def parse(template, field, settings, content, legacy=False):
     if "group" in settings:
         if settings["group"] == "sum":
             result = sum(result)
+        elif settings["group"] == "min":
+            result = min(result)
+        elif settings["group"] == "max":
+            result = max(result)
+        elif settings["group"] == "first":
+            result = result[0]
+        elif settings["group"] == "last":
+            result = result[-1]
         else:
             logger.warning("Unsupported grouping method: " + settings["group"])
             return None

diff --git a/src/invoice2data/extract/templates/pl/pl.insert.subiekt-gt.yml b/src/invoice2data/extract/templates/pl/pl.insert.subiekt-gt.yml
@@ -0,0 +1,43 @@
+# SPDX-License-Identifier: MIT
+keywords:
+  - 'Miejsce wystawienia:'
+  - 'Data wystawienia:'
+  - 'Sprzedawca:'
+  - 'Nabywca:'
+  - 'według stawki VAT'
+  - 'Razem do zapłaty:'
+  - 'Wystawił\(a\)'
+  - 'Odebrał\(a\)'
+  - 'Podpis osoby upoważnionej'
+fields:
+  issuer:
+    parser: regex
+    regex: Sprzedawca:.*\n(.*?)\s{3,}
+  vatin:
+    parser: regex
+    regex: NIP:\s+(\d{10})
+    type: int
+    group: first
+  date:
+    parser: regex
+    regex:
+      - Data wystawienia:\n.*(\d{2}\.\d{2}\.\d{4})
+      - Data wystawienia:\n.*(\d{4}-\d{2}-\d{2})
+    type: date
+  invoice_number:
+    parser: regex
+    regex: Faktura VAT\s+(.*?)\s+oryginał
+  amount:
+    parser: regex
+    regex: Razem do zapłaty:\s+([\d\s]+,[\d][\d])
+    type: float
+  nrb:
+    parser: regex
+    regex: PLN:\s+([0-9]{2}(?:\s?[0-9]{4}){6})
+options:
+  currency: PLN
+  date_formats:
+    - '%d.%m.%Y'
+    - '%Y-%m-%d'
+  decimal_separator: ','
+priority: 3
diff --git a/src/invoice2data/extract/templates/pl/pl.insert.subiekt-nexo.yml b/src/invoice2data/extract/templates/pl/pl.insert.subiekt-nexo.yml
@@ -0,0 +1,33 @@
+# SPDX-License-Identifier: MIT
+keywords:
+  - 'InsERT nexo'
+fields:
+  issuer:
+    parser: regex
+    regex: Sprzedawca.*\n(.*?)\s{3,}
+  vatin:
+    parser: regex
+    regex: NIP:\s+(\d{10})
+    type: int
+    group: first
+  date:
+    parser: regex
+    regex: Data wystawienia\s+(\d{2}-\d{2}-\d{4})
+    type: date
+  invoice_number:
+    parser: regex
+    regex: Faktura VAT sprzedaży\s+(.*)
+    group: first
+  amount:
+    parser: regex
+    regex: Razem do zapłaty:\s+([\d\s]+,[\d][\d])
+    type: float
+  nrb:
+    parser: regex
+    regex: PL\s+([0-9]{2}(?:\s?[0-9]{4}){6})
+options:
+  currency: PLN
+  date_formats:
+    - '%d-%m-%Y'
+  decimal_separator: ','
+priority: 3
diff --git a/src/invoice2data/input/pdftotext.py b/src/invoice2data/input/pdftotext.py
@@ -21,9 +21,9 @@ def to_text(path: str, area_details: dict = None):
         If pdftotext library is not found
     """
     import subprocess
-    from distutils import spawn  # py2 compat
+    import shutil
 
-    if spawn.find_executable("pdftotext"):  # shutil.which('pdftotext'):
+    if shutil.which('pdftotext'):
         cmd = ["pdftotext", "-layout", "-enc", "UTF-8"]
         if area_details is not None:
             # An area was specified

diff --git a/src/invoice2data/input/tesseract.py b/src/invoice2data/input/tesseract.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-from distutils import spawn
+import shutil
 import tempfile
 import mimetypes
 
@@ -13,13 +13,16 @@
 logger = logging.getLogger(__name__)
 
 
-def to_text(path):
+def to_text(path: str, area_details: dict = None):
     """Wraps Tesseract OCR with auto language model.
 
     Parameters
     ----------
     path : str
         path of electronic invoice in PDF, JPG or PNG format
+    area_details : dictionary
+        of the format {f: int, l: int, r: int, x: int, y: int, W: int, H: int}
+        used when extracting an area of the pdf rather than the whole document
 
     Returns
     -------
@@ -29,9 +32,9 @@ def to_text(path):
     """
 
     # Check for dependencies. Needs Tesseract and Imagemagick installed.
-    if not spawn.find_executable("tesseract"):
+    if not shutil.which("tesseract"):
         raise EnvironmentError("tesseract not installed.")
-    if not spawn.find_executable("convert"):
+    if not shutil.which("convert"):
         raise EnvironmentError("imagemagick not installed.")
 
     language = get_languages()
@@ -105,9 +108,30 @@ def to_text(path):
         "-layout",
         "-enc",
         "UTF-8",
-        TMP_FOLDER + filename + ".pdf",
-        "-",
     ]
+    if area_details is not None:
+        # An area was specified
+        # Validate the required keys were provided
+        assert 'f' in area_details, 'Area r details missing'
+        assert 'l' in area_details, 'Area r details missing'
+        assert 'r' in area_details, 'Area r details missing'
+        assert 'x' in area_details, 'Area x details missing'
+        assert 'y' in area_details, 'Area y details missing'
+        assert 'W' in area_details, 'Area W details missing'
+        assert 'H' in area_details, 'Area H details missing'
+        # Convert all of the values to strings
+        for key in area_details.keys():
+            area_details[key] = str(area_details[key])
+        pdftotext_cmd += [
+            '-f', area_details['f'],
+            '-l', area_details['l'],
+            '-r', area_details['r'],
+            '-x', area_details['x'],
+            '-y', area_details['y'],
+            '-W', area_details['W'],
+            '-H', area_details['H'],
+        ]
+    pdftotext_cmd += [TMP_FOLDER + filename + ".pdf", "-"]
 
     logger.debug("Calling pdfttext with, %s", pdftotext_cmd)
     p3 = Popen(pdftotext_cmd, stdin=p2.stdout, stdout=PIPE)

diff --git a/src/invoice2data/main.py b/src/invoice2data/main.py
@@ -79,10 +79,6 @@ def extract_data(invoicefile, templates=None, input_module=None):
      'currency': 'INR', 'desc': 'Invoice IBZY2087 from OYO'}
 
     """
-    if templates is None:
-        templates = read_templates()
-
-    # print(templates[0])
 
     if input_module is None:
         if invoicefile.lower().endswith('.txt'):
@@ -98,17 +94,18 @@ def extract_data(invoicefile, templates=None, input_module=None):
     logger.debug("START pdftotext result ===========================\n" + extracted_str)
     logger.debug("END pdftotext result =============================")
 
-    for t in templates:
-        optimized_str = t.prepare_input(extracted_str)
-
-        if t.matches_input(optimized_str):
-            logger.info("Using %s template", t["template_name"])
-            # Call extract with entire text and the invoicefile path
-            # The path is used if an area is called as a field option
-            return t.extract(optimized_str, invoicefile, input_module)
+    if templates is None:
+        templates = read_templates()
+    templates = filter(lambda t: t.matches_input(t.prepare_input(extracted_str)), templates)
+    templates = sorted(templates, key=lambda k: k['priority'], reverse=True)
+    if not templates:
+        logger.error("No template for %s", invoicefile)
+        return False
 
-    logger.error("No template for %s", invoicefile)
-    return False
+    t = templates[0]
+    logger.info("Using %s template", t["template_name"])
+    optimized_str = t.prepare_input(extracted_str)
+    return t.extract(optimized_str, invoicefile, input_module)
 
 
 def create_parser():

diff --git a/tests/custom/lines-multiple-patterns.json b/tests/custom/lines-multiple-patterns.json
@@ -17,6 +17,13 @@
             { "pos": 6, "name": "Penguin" },
             { "pos": 7, "name": "Ostrich" }
         ],
+        "dimensions": [
+            { "pos": 1, "angle": 30, "length": 30 },
+            { "pos": 2, "angle": 45, "length": 40 },
+            { "pos": 3, "angle": 90, "length": 60 },
+            { "pos": 4, "length": 80, "angle": 135 },
+            { "pos": 5, "length": 100, "angle": 180 }
+        ],
         "currency": "EUR",
         "desc": "Invoice from Lines Tests"
     }

diff --git a/tests/custom/lines-multiple-patterns.txt b/tests/custom/lines-multiple-patterns.txt
@@ -5,6 +5,7 @@ Total: 50.00 EUR
 
 Lines with multiple patterns
 
+
 Lines start
 
 Group: Mammals
@@ -21,3 +22,15 @@ Subgroup: Flightless
 7. Ostrich
 
 Lines end
+
+
+No  Angle [°]  Length [cm]
+1   30         30
+2   45         40
+3   90         60
+Count: 3
+
+No  Length [cm]  Angle [°]
+4   80           135
+5   100          180
+Count: 2