From dd2b2505b2aef4cc9146cf0871769ebf415bba3a Mon Sep 17 00:00:00 2001 From: Jonathan Sick Date: Fri, 23 Apr 2021 17:00:01 -0400 Subject: [PATCH 1/2] Refactor suffix code Set the suffix at the latest time possible. This makes way for a user to specify a format that, in turn, can provide default suffix information. --- src/docxplain/cli.py | 2 +- src/docxplain/converter.py | 13 ++++++++++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/src/docxplain/cli.py b/src/docxplain/cli.py index 2b721c3..84b6f58 100644 --- a/src/docxplain/cli.py +++ b/src/docxplain/cli.py @@ -21,7 +21,7 @@ def create_parser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser(description="Convert docx to plain text.") parser.add_argument("source") parser.add_argument( - "--suffix", default=".txt", help="File suffix for plain text file." + "--suffix", default=None, help="File suffix for plain text file." ) parser.add_argument( "--header", diff --git a/src/docxplain/converter.py b/src/docxplain/converter.py index 92b66c6..fe83680 100644 --- a/src/docxplain/converter.py +++ b/src/docxplain/converter.py @@ -10,7 +10,9 @@ def convert_file( - filename: str, suffix: str = ".txt", header: Optional[str] = None + filename: str, + suffix: Optional[str] = None, + header: Optional[str] = None, ) -> bool: """Convert the docx file to plaintext. @@ -18,7 +20,7 @@ def convert_file( ---------- filename : `str` Path of the docx file. - suffix : `str` + suffix : `str`, optional Suffix for the output plain text file, including ``"."`` prefix. Default is ``".txt"``, but a suffix like ``".extracted.txt"`` could be useful. 
@@ -34,7 +36,12 @@ def convert_file( if not docx_path.is_file(): raise RuntimeError(f"Source file {docx_path} does not exist.") - plain_path = docx_path.with_suffix(suffix) + if suffix is None: + file_suffix = ".txt" + else: + file_suffix = suffix + + plain_path = docx_path.with_suffix(file_suffix) if plain_path.is_file(): exists = True initial_hash = get_hash(plain_path) From 44814231512d3e08238058ab8f72f5970be3c9d5 Mon Sep 17 00:00:00 2001 From: Jonathan Sick Date: Fri, 23 Apr 2021 18:11:23 -0400 Subject: [PATCH 2/2] Add support for a --format CLI arg The formats module contains a central listing of supported Pandoc formats and the default extension for each format. The CLI enforces that the selected value of --format is one of those formats. --- src/docxplain/cli.py | 15 +++++++++++++-- src/docxplain/converter.py | 14 ++++++++++---- src/docxplain/formats.py | 26 ++++++++++++++++++++++++++ tests/converter_test.py | 22 ++++++++++++++++------ 4 files changed, 65 insertions(+), 12 deletions(-) create mode 100644 src/docxplain/formats.py diff --git a/src/docxplain/cli.py b/src/docxplain/cli.py index 84b6f58..3d78664 100644 --- a/src/docxplain/cli.py +++ b/src/docxplain/cli.py @@ -4,13 +4,17 @@ import sys from docxplain.converter import convert_file +from docxplain.formats import get_format, supported_formats def main() -> None: """Command-line entrypoint.""" parser = create_parser() args = parser.parse_args() - changed = convert_file(args.source, suffix=args.suffix, header=args.header) + fmt = get_format(args.format) + changed = convert_file( + args.source, output_format=fmt, suffix=args.suffix, header=args.header + ) if changed: sys.exit(1) else: @@ -21,7 +25,14 @@ def create_parser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser(description="Convert docx to plain text.") parser.add_argument("source") parser.add_argument( - "--suffix", default=None, help="File suffix for plain text file." 
+ "--format", + default="plain", + choices=[f.name for f in supported_formats], + ) + parser.add_argument( + "--suffix", + default=None, + help="Custom file suffix for plain text file.", ) parser.add_argument( "--header", diff --git a/src/docxplain/converter.py b/src/docxplain/converter.py index fe83680..e1e7ec1 100644 --- a/src/docxplain/converter.py +++ b/src/docxplain/converter.py @@ -2,15 +2,19 @@ import hashlib from pathlib import Path -from typing import Optional +from typing import TYPE_CHECKING, Optional import pypandoc +if TYPE_CHECKING: + from docxplain.formats import PandocFormat + __all__ = ["convert_file", "get_hash"] def convert_file( filename: str, + output_format: PandocFormat, suffix: Optional[str] = None, header: Optional[str] = None, ) -> bool: @@ -20,10 +24,12 @@ def convert_file( ---------- filename : `str` Path of the docx file. + output_format : `docxplain.formats.PandocFormat` + The output format for the converted plain text file. suffix : `str`, optional - Suffix for the output plain text file, including ``"."`` prefix. - Default is ``".txt"``, but a suffix like ``".extracted.txt"`` - could be useful. + Custom suffix for the output plain text file, including ``"."`` prefix. + Default is based on the output format, but a custom suffix like + ``".extracted.txt"`` can be useful. header : `str`, optional Content that is added to the top of the plain text file. 
diff --git a/src/docxplain/formats.py b/src/docxplain/formats.py new file mode 100644 index 0000000..6a50985 --- /dev/null +++ b/src/docxplain/formats.py @@ -0,0 +1,26 @@ +"""Information about supported formats.""" + +from dataclasses import dataclass + +__all__ = ["PandocFormat", "supported_formats", "get_format"] + + +@dataclass +class PandocFormat: + """A plain text format supported by pandoc.""" + + name: str + """Pandoc's name for the format.""" + + suffix: str + """The default suffix for the format.""" + + +supported_formats = (PandocFormat(name="plain", suffix=".txt"),) + + +def get_format(name: str) -> PandocFormat: + for f in supported_formats: + if f.name == name: + return f + raise ValueError(f"Format '{name}' is unknown.") diff --git a/tests/converter_test.py b/tests/converter_test.py index 4876fc3..28d8f9a 100644 --- a/tests/converter_test.py +++ b/tests/converter_test.py @@ -4,6 +4,7 @@ from pathlib import Path from docxplain.converter import convert_file, trim_trailing_whitespace +from docxplain.formats import get_format def test_unchanged(tmp_path: Path) -> None: @@ -12,7 +13,7 @@ def test_unchanged(tmp_path: Path) -> None: work_dir = tmp_path / "unchanged" shutil.copytree(repo_data, work_dir) docxpath = work_dir.joinpath("test_doc.docx") - assert convert_file(str(docxpath)) is False + assert convert_file(str(docxpath), get_format("plain")) is False def test_changed(tmp_path: Path) -> None: @@ -21,7 +22,7 @@ def test_changed(tmp_path: Path) -> None: work_dir = tmp_path / "changed" shutil.copytree(repo_data, work_dir) docxpath = work_dir.joinpath("test_doc.docx") - assert convert_file(str(docxpath)) is True + assert convert_file(str(docxpath), get_format("plain")) is True def test_new(tmp_path: Path) -> None: @@ -30,7 +31,7 @@ def test_new(tmp_path: Path) -> None: work_dir = tmp_path / "new" shutil.copytree(repo_data, work_dir) docxpath = work_dir.joinpath("test_doc.docx") - assert convert_file(str(docxpath)) is True + assert 
convert_file(str(docxpath), get_format("plain")) is True def test_suffix(tmp_path: Path) -> None: @@ -39,7 +40,12 @@ def test_suffix(tmp_path: Path) -> None: work_dir = tmp_path / "suffix" shutil.copytree(repo_data, work_dir) docxpath = work_dir.joinpath("test_doc.docx") - assert convert_file(str(docxpath), suffix=".extracted.txt") is True + assert ( + convert_file( + str(docxpath), get_format("plain"), suffix=".extracted.txt" + ) + is True + ) plain_path = work_dir.joinpath("test_doc.extracted.txt") assert plain_path.is_file() @@ -51,7 +57,9 @@ def test_header(tmp_path: Path) -> None: shutil.copytree(repo_data, work_dir) docxpath = work_dir.joinpath("test_doc.docx") header = "This file is autogenerated." - assert convert_file(str(docxpath), header=header) is True + assert ( + convert_file(str(docxpath), get_format("plain"), header=header) is True + ) plain_path = docxpath.with_suffix(".txt") assert plain_path.is_file() content = plain_path.read_text().splitlines() @@ -67,7 +75,9 @@ def test_header_templating(tmp_path: Path) -> None: shutil.copytree(repo_data, work_dir) docxpath = work_dir.joinpath("test_doc.docx") header = "This file is autogenerated from {docx}." - assert convert_file(str(docxpath), header=header) is True + assert ( + convert_file(str(docxpath), get_format("plain"), header=header) is True + ) plain_path = docxpath.with_suffix(".txt") assert plain_path.is_file() content = plain_path.read_text().splitlines()