fix: allow for caps in file name suffixes #206

@schinkelg ran afoul of this one, and I took the opportunity to add a test to catch this sort of thing next time.
the-paperless-project · Mar 28, 2017 · fa4924d · fa4924d
1 parent 5b88ebf
commit fa4924d
Show file tree

Hide file tree

Showing 5 changed files with 102 additions and 3 deletions.
diff --git a/docs/changelog.rst b/docs/changelog.rst
@@ -1,6 +1,10 @@
 Changelog
 #########
 
+* 0.4.1
+  * Fix for `#206`_ wherein the pluggable parser didn't recognise files with
+    all-caps suffixes like ``.PDF``
+
 * 0.4.0
   * Introducing reminders.  See `#199`_ for more information, but the short
     explanation is that you can now attach simple notes & times to documents
@@ -211,3 +215,4 @@ Changelog
 .. _#179: https://github.com/danielquinn/paperless/pull/179
 .. _#199: https://github.com/danielquinn/paperless/issues/199
 .. _#200: https://github.com/danielquinn/paperless/issues/200
+.. _#206: https://github.com/danielquinn/paperless/issues/206
diff --git a/src/documents/consumer.py b/src/documents/consumer.py
@@ -102,7 +102,7 @@ def consume(self):
             parser_class = self._get_parser_class(doc)
             if not parser_class:
                 self.log(
-                    "info", "No parsers could be found for {}".format(doc))
+                    "error", "No parsers could be found for {}".format(doc))
                 self._ignore.append(doc)
                 continue
 
@@ -160,6 +160,16 @@ def _get_parser_class(self, doc):
             if result:
                 options.append(result)
 
+        self.log(
+            "info",
+            "Parsers available: {}".format(
+                ", ".join([str(o["parser"].__name__) for o in options])
+            )
+        )
+
+        if not options:
+            return None
+
         # Return the parser with the highest weight.
         return sorted(
             options, key=lambda _: _["weight"], reverse=True)[0]["parser"]

diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py
@@ -1,8 +1,56 @@
 from django.test import TestCase
+from unittest import mock
 
+from ..consumer import Consumer
 from ..models import FileInfo
 
 
+class TestConsumer(TestCase):
+
+    class DummyParser(object):
+        pass
+
+    def test__get_parser_class_1_parser(self):
+        self.assertEqual(
+            self._get_consumer()._get_parser_class("doc.pdf"),
+            self.DummyParser
+        )
+
+    @mock.patch("documents.consumer.os.makedirs")
+    @mock.patch("documents.consumer.os.path.exists", return_value=True)
+    @mock.patch("documents.consumer.document_consumer_declaration.send")
+    def test__get_parser_class_n_parsers(self, m, *args):
+
+        class DummyParser1(object):
+            pass
+
+        class DummyParser2(object):
+            pass
+
+        m.return_value = (
+            (None, lambda _: {"weight": 0, "parser": DummyParser1}),
+            (None, lambda _: {"weight": 1, "parser": DummyParser2}),
+        )
+
+        self.assertEqual(Consumer()._get_parser_class("doc.pdf"), DummyParser2)
+
+    @mock.patch("documents.consumer.os.makedirs")
+    @mock.patch("documents.consumer.os.path.exists", return_value=True)
+    @mock.patch("documents.consumer.document_consumer_declaration.send")
+    def test__get_parser_class_0_parsers(self, m, *args):
+        m.return_value = ((None, lambda _: None),)
+        self.assertIsNone(Consumer()._get_parser_class("doc.pdf"))
+
+    @mock.patch("documents.consumer.os.makedirs")
+    @mock.patch("documents.consumer.os.path.exists", return_value=True)
+    @mock.patch("documents.consumer.document_consumer_declaration.send")
+    def _get_consumer(self, m, *args):
+        m.return_value = (
+            (None, lambda _: {"weight": 0, "parser": self.DummyParser}),
+        )
+        return Consumer()
+
+
 class TestAttributes(TestCase):
 
     TAGS = ("tag1", "tag2", "tag3")

diff --git a/src/paperless_tesseract/signals.py b/src/paperless_tesseract/signals.py
@@ -5,7 +5,7 @@
 
 class ConsumerDeclaration(object):
 
-    MATCHING_FILES = re.compile("^.*\.(pdf|jpg|gif|png|tiff|pnm|bmp)$")
+    MATCHING_FILES = re.compile("^.*\.(pdf|jpg|gif|png|tiff?|pnm|bmp)$")
 
     @classmethod
     def handle(cls, sender, **kwargs):
@@ -14,7 +14,7 @@ def handle(cls, sender, **kwargs):
     @classmethod
     def test(cls, doc):
 
-        if cls.MATCHING_FILES.match(doc):
+        if cls.MATCHING_FILES.match(doc.lower()):
             return {
                 "parser": RasterisedDocumentParser,
                 "weight": 0

diff --git a/src/paperless_tesseract/tests/test_signals.py b/src/paperless_tesseract/tests/test_signals.py
@@ -0,0 +1,36 @@
+from django.test import TestCase
+
+from ..signals import ConsumerDeclaration
+
+
+class SignalsTestCase(TestCase):
+
+    def test_test_handles_various_file_names_true(self):
+
+        prefixes = (
+            "doc", "My Document", "Μυ Γρεεκ Δοψθμεντ", "Doc -with - tags",
+            "A document with a . in it", "Doc with -- in it"
+        )
+        suffixes = (
+            "pdf", "jpg", "gif", "png", "tiff", "tif", "pnm", "bmp",
+            "PDF", "JPG", "GIF", "PNG", "TIFF", "TIF", "PNM", "BMP",
+            "pDf", "jPg", "gIf", "pNg", "tIff", "tIf", "pNm", "bMp",
+        )
+
+        for prefix in prefixes:
+            for suffix in suffixes:
+                name = "{}.{}".format(prefix, suffix)
+                self.assertTrue(ConsumerDeclaration.test(name))
+
+    def test_test_handles_various_file_names_false(self):
+
+        prefixes = ("doc",)
+        suffixes = ("txt", "markdown", "",)
+
+        for prefix in prefixes:
+            for suffix in suffixes:
+                name = "{}.{}".format(prefix, suffix)
+                self.assertFalse(ConsumerDeclaration.test(name))
+
+        self.assertFalse(ConsumerDeclaration.test(""))
+        self.assertFalse(ConsumerDeclaration.test("doc"))