Remove duplicate token matcher Python code (#205)

cucumber · Jan 4, 2024 · 26b0f70 · 26b0f70
1 parent 99041f5
commit 26b0f70
Show file tree

Hide file tree

Showing 5 changed files with 18 additions and 87 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,9 @@ This project adheres to [Semantic Versioning](http://semver.org).
 This document is formatted according to the principles of [Keep A CHANGELOG](http://keepachangelog.com).
 
 ## [Unreleased]
+### Changed
+- [Python] Removed duplicate code in markdown token matcher ([#205](https://github.com/cucumber/gherkin/pull/205))
+
 ### Added
 - (i18n) Added Vietnamese translation of "Rule"
 

diff --git a/python/.gitignore b/python/.gitignore
@@ -5,6 +5,7 @@ acceptance/
 *.pyc
 dist/
 build/
+venv/
 .tox/
 .coverage
 .coverage.*
diff --git a/python/MANIFEST b/python/MANIFEST
@@ -18,6 +18,7 @@ gherkin/parser.py
 gherkin/token.py
 gherkin/token_formatter_builder.py
 gherkin/token_matcher.py
+gherkin/token_matcher_markdown.py
 gherkin/token_scanner.py
 gherkin/pickles/__init__.py
 gherkin/pickles/compiler.py

diff --git a/python/gherkin/token_matcher.py b/python/gherkin/token_matcher.py
@@ -112,11 +112,15 @@ def match_DocStringSeparator(self, token):
             # close
             return self._match_DocStringSeparator(token, self._active_doc_string_separator, False)
 
+    @staticmethod
+    def _default_docstring_content_type():
+        return None
+
     def _match_DocStringSeparator(self, token, separator, is_open):
         if not token.line.startswith(separator):
             return False
 
-        content_type = None
+        content_type = self._default_docstring_content_type()
         if is_open:
             content_type = token.line.get_rest_trimmed(len(separator))
             self._active_doc_string_separator = separator

diff --git a/python/gherkin/token_matcher_markdown.py b/python/gherkin/token_matcher_markdown.py
@@ -1,24 +1,16 @@
 import re
-from collections import defaultdict
-from .dialect import Dialect
-from .errors import NoSuchLanguageException
+from .token_matcher import TokenMatcher
 
 KEYWORD_PREFIX_BULLET = '^(\\s*[*+-]\\s*)'
 KEYWORD_PREFIX_HEADER = '^(#{1,6}\\s)'
 
-class GherkinInMarkdownTokenMatcher(object):
-    LANGUAGE_RE = re.compile(r"^\s*#\s*language\s*:\s*([a-zA-Z\-_]+)\s*$")
+class GherkinInMarkdownTokenMatcher(TokenMatcher):
 
     def __init__(self, dialect_name='en'):
-        self._default_dialect_name = dialect_name
-        self._change_dialect(dialect_name)
-        self.reset()
+        super(GherkinInMarkdownTokenMatcher, self).__init__(dialect_name)
 
     def reset(self):
-        if self.dialect_name != self._default_dialect_name:
-            self._change_dialect(self._default_dialect_name)
-        self._indent_to_remove = 0
-        self._active_doc_string_separator = None
+        super(GherkinInMarkdownTokenMatcher, self).reset()
         self.matched_feature_line=False
 
     def match_FeatureLine(self, token):
@@ -52,7 +44,7 @@ def match_ExamplesLine(self, token):
         return self._match_title_line(KEYWORD_PREFIX_HEADER, self.dialect.examples_keywords, ':', token, 'ExamplesLine')
 
     def match_TableRow(self, token):
-        # Gherkin tables must be indented 2-5 spaces in order to be distinguidedn from non-Gherkin tables
+        # Gherkin tables must be indented 2-5 spaces to be distinguishable from non-Gherkin tables
 
         if re.match('^\\s\\s\\s?\\s?\\s?\\|',token.line.get_line_text(0)):
             table_cells = token.line.table_cells
@@ -119,7 +111,6 @@ def match_Language(self, token):
         return False
 
     def match_TagLine(self, token):
-
         tags = []
         matching_tags = re.finditer('`(@[^`]+)`', token.line.get_line_text())
         idx=0
@@ -144,88 +135,19 @@ def match_DocStringSeparator(self, token):
             # close
             return self._match_DocStringSeparator(token, self._active_doc_string_separator, False)
 
-    def _match_DocStringSeparator(self, token, separator, is_open):
-        if not token.line.startswith(separator):
-            return False
-
-        content_type = ''
-        if is_open:
-            content_type = token.line.get_rest_trimmed(len(separator))
-            self._active_doc_string_separator = separator
-            self._indent_to_remove = token.line.indent
-        else:
-            self._active_doc_string_separator = None
-            self._indent_to_remove = 0
-
-        # TODO: Use the separator as keyword. That's needed for pretty printing.
-        self._set_token_matched(token, 'DocStringSeparator', content_type, separator)
-        return True
-
-    def match_Other(self, token):
-        # take the entire line, except removing DocString indents
-        text = token.line.get_line_text(self._indent_to_remove)
-        self._set_token_matched(token, 'Other', self._unescaped_docstring(text), indent=0)
-        return True
-
-    def match_EOF(self, token):
-        if not token.eof():
-            return False
-
-        self._set_token_matched(token, 'EOF')
-        return True
+    @staticmethod
+    def _default_docstring_content_type():
+        return ''
 
     def _match_title_line(self, prefix, keywords, keywordSuffix, token, token_type):
 
         keywords_or_list="|".join(map(lambda x: re.escape(x), keywords))
         match = re.search(u'{}({}){}(.*)'.format(prefix, keywords_or_list, keywordSuffix), token.line.get_line_text())
         indent = token.line.indent
-        result = False
 
         if(match):
             matchedKeyword = match.group(2)
             indent += len(match.group(1))
             self._set_token_matched(token, token_type, match.group(3).strip(), matchedKeyword, indent=indent)
             return True
         return False
-
-    def _set_token_matched(self, token, matched_type, text=None,
-                           keyword=None, keyword_type=None, indent=None, items=None):
-        if items is None:
-            items = []
-        token.matched_type = matched_type
-        # text == '' should not result in None
-        token.matched_text = text.rstrip('\r\n') if text is not None else None
-        token.matched_keyword = keyword
-        token.matched_keyword_type = keyword_type
-        if indent is not None:
-            token.matched_indent = indent
-        else:
-            token.matched_indent = token.line.indent if token.line else 0
-        token.matched_items = items
-        token.location['column'] = token.matched_indent + 1
-        token.matched_gherkin_dialect = self.dialect_name
-
-    def _change_dialect(self, dialect_name, location=None):
-        dialect = Dialect.for_name(dialect_name)
-        if not dialect:
-            raise NoSuchLanguageException(dialect_name, location)
-
-        self.dialect_name = dialect_name
-        self.dialect = dialect
-        self.keyword_types = defaultdict(list)
-        for keyword in self.dialect.given_keywords:
-            self.keyword_types[keyword].append('Context')
-        for keyword in self.dialect.when_keywords:
-            self.keyword_types[keyword].append('Action')
-        for keyword in self.dialect.then_keywords:
-            self.keyword_types[keyword].append('Outcome')
-        for keyword in self.dialect.and_keywords + self.dialect.but_keywords:
-            self.keyword_types[keyword].append('Conjunction')
-
-    def _unescaped_docstring(self, text):
-        if self._active_doc_string_separator == '"""':
-            return text.replace('\\"\\"\\"', '"""')
-        elif self._active_doc_string_separator == '```':
-            return text.replace('\\`\\`\\`', '```')
-        else:
-            return text