From dd2b2505b2aef4cc9146cf0871769ebf415bba3a Mon Sep 17 00:00:00 2001
From: Jonathan Sick <jonathan@jsick.codes>
Date: Fri, 23 Apr 2021 17:00:01 -0400
Subject: [PATCH 1/2] Refactor suffix code

Set the suffix at the latest possible time. This makes way for a user to
specify a format that, in turn, can provide default suffix information.
---
 src/docxplain/cli.py       |  2 +-
 src/docxplain/converter.py | 13 ++++++++++---
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/src/docxplain/cli.py b/src/docxplain/cli.py
index 2b721c3..84b6f58 100644
--- a/src/docxplain/cli.py
+++ b/src/docxplain/cli.py
@@ -21,7 +21,7 @@ def create_parser() -> argparse.ArgumentParser:
     parser = argparse.ArgumentParser(description="Convert docx to plain text.")
     parser.add_argument("source")
     parser.add_argument(
-        "--suffix", default=".txt", help="File suffix for plain text file."
+        "--suffix", default=None, help="File suffix for plain text file."
     )
     parser.add_argument(
         "--header",
diff --git a/src/docxplain/converter.py b/src/docxplain/converter.py
index 92b66c6..fe83680 100644
--- a/src/docxplain/converter.py
+++ b/src/docxplain/converter.py
@@ -10,7 +10,9 @@
 
 
 def convert_file(
-    filename: str, suffix: str = ".txt", header: Optional[str] = None
+    filename: str,
+    suffix: Optional[str] = None,
+    header: Optional[str] = None,
 ) -> bool:
     """Convert the docx file to plaintext.
 
@@ -18,7 +20,7 @@ def convert_file(
     ----------
     filename : `str`
         Path of the docx file.
-    suffix : `str`
+    suffix : `str`, optional
         Suffix for the output plain text file, including ``"."`` prefix.
         Default is ``".txt"``, but a suffix like ``".extracted.txt"``
         could be useful.
@@ -34,7 +36,12 @@ def convert_file(
     if not docx_path.is_file():
         raise RuntimeError(f"Source file {docx_path} does not exist.")
 
-    plain_path = docx_path.with_suffix(suffix)
+    if suffix is None:
+        file_suffix = ".txt"
+    else:
+        file_suffix = suffix
+
+    plain_path = docx_path.with_suffix(file_suffix)
     if plain_path.is_file():
         exists = True
         initial_hash = get_hash(plain_path)

From 44814231512d3e08238058ab8f72f5970be3c9d5 Mon Sep 17 00:00:00 2001
From: Jonathan Sick <jonathan@jsick.codes>
Date: Fri, 23 Apr 2021 18:11:23 -0400
Subject: [PATCH 2/2] Add support for a --format CLI arg

The formats module contains a central listing of supported Pandoc
formats and the default extension for each format.

The CLI enforces that the selected value of --format is one of those
formats.
---
 src/docxplain/cli.py       | 15 +++++++++++++--
 src/docxplain/converter.py | 14 ++++++++++----
 src/docxplain/formats.py   | 26 ++++++++++++++++++++++++++
 tests/converter_test.py    | 22 ++++++++++++++++------
 4 files changed, 65 insertions(+), 12 deletions(-)
 create mode 100644 src/docxplain/formats.py

diff --git a/src/docxplain/cli.py b/src/docxplain/cli.py
index 84b6f58..3d78664 100644
--- a/src/docxplain/cli.py
+++ b/src/docxplain/cli.py
@@ -4,13 +4,17 @@
 import sys
 
 from docxplain.converter import convert_file
+from docxplain.formats import get_format, supported_formats
 
 
 def main() -> None:
     """Command-line entrypoint."""
     parser = create_parser()
     args = parser.parse_args()
-    changed = convert_file(args.source, suffix=args.suffix, header=args.header)
+    fmt = get_format(args.format)
+    changed = convert_file(
+        args.source, output_format=fmt, suffix=args.suffix, header=args.header
+    )
     if changed:
         sys.exit(1)
     else:
@@ -21,7 +25,14 @@ def create_parser() -> argparse.ArgumentParser:
     parser = argparse.ArgumentParser(description="Convert docx to plain text.")
     parser.add_argument("source")
     parser.add_argument(
-        "--suffix", default=None, help="File suffix for plain text file."
+        "--format",
+        default="plain",
+        choices=[f.name for f in supported_formats],
+    )
+    parser.add_argument(
+        "--suffix",
+        default=None,
+        help="Custom file suffix for plain text file.",
     )
     parser.add_argument(
         "--header",
diff --git a/src/docxplain/converter.py b/src/docxplain/converter.py
index fe83680..e1e7ec1 100644
--- a/src/docxplain/converter.py
+++ b/src/docxplain/converter.py
@@ -2,15 +2,19 @@
 
 import hashlib
 from pathlib import Path
-from typing import Optional
+from typing import TYPE_CHECKING, Optional
 
 import pypandoc
 
+if TYPE_CHECKING:
+    from docxplain.formats import PandocFormat
+
 __all__ = ["convert_file", "get_hash"]
 
 
 def convert_file(
     filename: str,
+    output_format: PandocFormat,
     suffix: Optional[str] = None,
     header: Optional[str] = None,
 ) -> bool:
@@ -20,10 +24,12 @@ def convert_file(
     ----------
     filename : `str`
         Path of the docx file.
+    output_format : `docxplain.formats.PandocFormat`
+        The output format for the converted plain text file.
     suffix : `str`, optional
-        Suffix for the output plain text file, including ``"."`` prefix.
-        Default is ``".txt"``, but a suffix like ``".extracted.txt"``
-        could be useful.
+        Custom suffix for the output plain text file, including ``"."`` prefix.
+        Default is based on the output format, but a custom suffix like
+        ``".extracted.txt"`` can be useful.
     header : `str`, optional
         Content that is added to the top of the plain text file.
 
diff --git a/src/docxplain/formats.py b/src/docxplain/formats.py
new file mode 100644
index 0000000..6a50985
--- /dev/null
+++ b/src/docxplain/formats.py
@@ -0,0 +1,26 @@
+"""Information about supported formats."""
+
+from dataclasses import dataclass
+
+__all__ = ["PandocFormat", "supported_formats", "get_format"]
+
+
+@dataclass
+class PandocFormat:
+    """A plain text format supported by pandoc."""
+
+    name: str
+    """Pandoc's name for the format."""
+
+    suffix: str
+    """The default suffix for the format."""
+
+
+supported_formats = (PandocFormat(name="plain", suffix=".txt"),)
+
+
+def get_format(name: str) -> PandocFormat:
+    for f in supported_formats:
+        if f.name == name:
+            return f
+    raise ValueError(f"Format '{name}' is unknown.")
diff --git a/tests/converter_test.py b/tests/converter_test.py
index 4876fc3..28d8f9a 100644
--- a/tests/converter_test.py
+++ b/tests/converter_test.py
@@ -4,6 +4,7 @@
 from pathlib import Path
 
 from docxplain.converter import convert_file, trim_trailing_whitespace
+from docxplain.formats import get_format
 
 
 def test_unchanged(tmp_path: Path) -> None:
@@ -12,7 +13,7 @@ def test_unchanged(tmp_path: Path) -> None:
     work_dir = tmp_path / "unchanged"
     shutil.copytree(repo_data, work_dir)
     docxpath = work_dir.joinpath("test_doc.docx")
-    assert convert_file(str(docxpath)) is False
+    assert convert_file(str(docxpath), get_format("plain")) is False
 
 
 def test_changed(tmp_path: Path) -> None:
@@ -21,7 +22,7 @@ def test_changed(tmp_path: Path) -> None:
     work_dir = tmp_path / "changed"
     shutil.copytree(repo_data, work_dir)
     docxpath = work_dir.joinpath("test_doc.docx")
-    assert convert_file(str(docxpath)) is True
+    assert convert_file(str(docxpath), get_format("plain")) is True
 
 
 def test_new(tmp_path: Path) -> None:
@@ -30,7 +31,7 @@ def test_new(tmp_path: Path) -> None:
     work_dir = tmp_path / "new"
     shutil.copytree(repo_data, work_dir)
     docxpath = work_dir.joinpath("test_doc.docx")
-    assert convert_file(str(docxpath)) is True
+    assert convert_file(str(docxpath), get_format("plain")) is True
 
 
 def test_suffix(tmp_path: Path) -> None:
@@ -39,7 +40,12 @@ def test_suffix(tmp_path: Path) -> None:
     work_dir = tmp_path / "suffix"
     shutil.copytree(repo_data, work_dir)
     docxpath = work_dir.joinpath("test_doc.docx")
-    assert convert_file(str(docxpath), suffix=".extracted.txt") is True
+    assert (
+        convert_file(
+            str(docxpath), get_format("plain"), suffix=".extracted.txt"
+        )
+        is True
+    )
     plain_path = work_dir.joinpath("test_doc.extracted.txt")
     assert plain_path.is_file()
 
@@ -51,7 +57,9 @@ def test_header(tmp_path: Path) -> None:
     shutil.copytree(repo_data, work_dir)
     docxpath = work_dir.joinpath("test_doc.docx")
     header = "This file is autogenerated."
-    assert convert_file(str(docxpath), header=header) is True
+    assert (
+        convert_file(str(docxpath), get_format("plain"), header=header) is True
+    )
     plain_path = docxpath.with_suffix(".txt")
     assert plain_path.is_file()
     content = plain_path.read_text().splitlines()
@@ -67,7 +75,9 @@ def test_header_templating(tmp_path: Path) -> None:
     shutil.copytree(repo_data, work_dir)
     docxpath = work_dir.joinpath("test_doc.docx")
     header = "This file is autogenerated from {docx}."
-    assert convert_file(str(docxpath), header=header) is True
+    assert (
+        convert_file(str(docxpath), get_format("plain"), header=header) is True
+    )
     plain_path = docxpath.with_suffix(".txt")
     assert plain_path.is_file()
     content = plain_path.read_text().splitlines()