Skip to content

Commit

Permalink
Performance: Allow large sets of JSON docs without StackOverflows
Browse files Browse the repository at this point in the history
  • Loading branch information
bblommers committed May 10, 2024
1 parent ca7faeb commit ebf4901
Show file tree
Hide file tree
Showing 4 changed files with 58 additions and 59 deletions.
13 changes: 5 additions & 8 deletions py_partiql_parser/_internal/from_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,15 +88,12 @@ def get_source_data(self, documents: Dict[str, str]) -> Any:
from_query
].endswith("]")

source_data = JsonParser().parse(documents[from_query])
source_data = list(JsonParser().parse(documents[from_query]))

if doc_is_list:
return {"_1": source_data}
return {"_1": source_data[0]}
elif from_alias:
if isinstance(source_data, list):
return [CaseInsensitiveDict({from_alias: doc}) for doc in source_data]
else:
return CaseInsensitiveDict({from_alias: source_data})
return [CaseInsensitiveDict({from_alias: doc}) for doc in source_data]
else:
return source_data

Expand Down Expand Up @@ -135,10 +132,10 @@ def _get_nested_source_data(self, documents: Dict[str, Any]) -> Any:
doc_is_list = source_data[new_key].startswith("[") and source_data[
new_key
].endswith("]")
source_data = JsonParser().parse(source_data[new_key])
source_data = list(JsonParser().parse(source_data[new_key])) # type: ignore
if root_doc and doc_is_list:
# AWS behaviour when the root-document is a list
source_data = {"_1": source_data}
source_data = {"_1": source_data[0]} # type: ignore
elif key_so_far == entire_key:
if isinstance(source_data, list): # type: ignore[unreachable]
source_data = [{alias: doc} for doc in source_data] # type: ignore[unreachable]
Expand Down
2 changes: 1 addition & 1 deletion py_partiql_parser/_internal/insert_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def parse(self, query: str) -> Tuple[str, Dict[str, Any]]:
if section == "SECTION_VALUE":
assert current_phrase.upper() in ["VALUE"]
tokenizer.skip_white_space()
attr = JsonParser().parse(tokenizer.give_remaining())
attr = next(JsonParser().parse(tokenizer.give_remaining()))
for key, value in attr.items():
attr[key] = serializer.serialize(value)
if section == "TABLE_NAME":
Expand Down
47 changes: 20 additions & 27 deletions py_partiql_parser/_internal/json_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,17 +22,25 @@ def parse(
) -> Any:
if not (original.startswith("{") or original.startswith("[")):
# Doesn't look like JSON - let's return as a variable
return original if original.isnumeric() else Variable(original)
yield original if original.isnumeric() else Variable(original)
tokenizer = tokenizer or ClauseTokenizer(original)
while tokenizer.current() is not None:
result = self._parse(original, tokenizer, only_parse_initial)
if result is not None:
yield result

def _parse(self, original: str, tokenizer: ClauseTokenizer, only_parse_initial: bool = False) -> Any:
section: Optional[str] = None # DICT_KEY | KEY_TO_VALUE | DICT_VAL | OBJECT_END
dict_key = ""
current_phrase = ""
result = CaseInsensitiveDict()
tokenizer = tokenizer or ClauseTokenizer(original)
level = 0
while True:
c = tokenizer.next()
if not c:
break
return None
elif c == "[" and (not section or section == "KEY_TO_VALUE"):
level += 1
# Start of a list
if not section:
return self._parse_list(original, tokenizer)
Expand All @@ -41,6 +49,8 @@ def parse(
section = None
current_phrase = ""
elif c in ["{", ","] and (not section or section == "OBJECT_END"):
if c == "{":
level += 1
# Start of a key
section = "DICT_KEY"
tokenizer.skip_until(ACCEPTED_QUOTES)
Expand All @@ -52,9 +62,10 @@ def parse(
section = "KEY_TO_VALUE"
current_phrase = ""
elif c in ["{"] and section == "KEY_TO_VALUE":
level += 1
# Start of a value with a new dictionary
tokenizer.revert() # Ensure we start the new parser with the initial {
result[dict_key] = self.parse(original, tokenizer)
result[dict_key] = self._parse(original, tokenizer)
section = None
current_phrase = ""
elif c in ACCEPTED_QUOTES and section == "KEY_TO_VALUE":
Expand All @@ -67,6 +78,7 @@ def parse(
section = None
current_phrase = ""
elif c in ["}"] and section in ["VAR_VALUE", "INT_VALUE"]:
level -= 1
# End of a variable/number
if section == "INT_VALUE":
result[dict_key] = int(current_phrase)
Expand All @@ -90,29 +102,11 @@ def parse(
tokenizer.revert()
section = None
current_phrase = ""
elif section in ["OBJECT_END"]:
next_documents = self.parse(original, tokenizer)
if next_documents == {}:
return result
elif isinstance(next_documents, list):
return [result] + next_documents
else:
return [result, next_documents]
elif c == "}" and section is None:
section = "OBJECT_END"
# We know whether we are at the end of an object at this point
# But we don't know whether this is:
# - end of the root object
# - end of a nested object
# - inbetween multiple objects (separated by new-line)
tokenizer.skip_white_space()
if tokenizer.current() == "{":
# we're inbetween multiple objects - continue parsing
tokenizer.revert()
pass
level -= 1
if level == 0:
return result
else:
# we're at the end of the root object - next char is probably None. Break and return to the user
# we're at the end of a nested object - next char is probably }. Break and let the parent processor takeover
break
elif c in [" ", NEW_LINE] and section not in ["DICT_KEY", "DICT_VAL"]:
pass
Expand All @@ -126,7 +120,6 @@ def parse(
section = "VAR_VALUE"
if section in ["DICT_KEY", "DICT_VAL", "INT_VALUE", "VAR_VALUE"]:
current_phrase += c

return result

def _parse_list(self, original: str, tokenizer: ClauseTokenizer) -> Any:
Expand All @@ -139,7 +132,7 @@ def _parse_list(self, original: str, tokenizer: ClauseTokenizer) -> Any:
break
if c == "{":
tokenizer.revert() # Ensure we start the new parser with the initial {
result.append(self.parse(original, tokenizer, only_parse_initial=True))
result.append(self._parse(original, tokenizer, only_parse_initial=True))
if tokenizer.current() == "]":
break
tokenizer.skip_until([","])
Expand Down
55 changes: 32 additions & 23 deletions tests/test_json_parser.py
Original file line number Diff line number Diff line change
@@ -1,83 +1,84 @@
import json
import pytest
from typing import Any
from uuid import uuid4
from py_partiql_parser._internal.json_parser import JsonParser, Variable


def test_static_value() -> None:
    # parse() is a generator now, so pull the first (only) result with next().
    # Non-JSON input is returned as a Variable (or as-is when numeric).
    assert next(JsonParser().parse("a")) == Variable("a")


def test_dict() -> None:
    # Single dicts parse identically whether quoted with ", ', or via json.dumps.
    assert next(JsonParser().parse(json.dumps({"a": "b"}))) == {"a": "b"}
    assert next(JsonParser().parse("{'a': 'b'}")) == {"a": "b"}
    assert next(JsonParser().parse('{"a": "b"}')) == {"a": "b"}


def test_dict_with_spaces_in_keys_and_values() -> None:
    # Whitespace inside quoted keys/values must be preserved verbatim.
    assert next(JsonParser().parse(json.dumps({"a sth": "b sth"}))) == {"a sth": "b sth"}


def test_dict_with_multiple_entries() -> None:
    # Comma-separated key/value pairs all end up in one parsed document.
    assert next(JsonParser().parse(json.dumps({"a": "b", "c": "d"}))) == {"a": "b", "c": "d"}


def test_dict_with_nested_entries() -> None:
    # Nested objects round-trip unchanged through the parser.
    original = {"a": {"b1": {"b1.1": "b1.2"}}, "c": "d"}
    assert next(JsonParser().parse(json.dumps(original))) == original


def test_dict_with_list() -> None:
    # A list-valued entry is parsed as a Python list inside the document.
    assert next(JsonParser().parse(json.dumps({"a": ["b1", "b2"], "c": "d"}))) == {
        "a": ["b1", "b2"],
        "c": "d",
    }


def test_list() -> None:
    # A root-level list is yielded as a single document containing the list.
    assert next(JsonParser().parse(json.dumps(["a", "b", "asdfasdf"]))) == [
        "a",
        "b",
        "asdfasdf",
    ]


def test_list_with_only_numbers() -> None:
    # Numeric list members are converted to ints, not kept as strings.
    assert next(JsonParser().parse(json.dumps([1, 1234, 12341234]))) == [1, 1234, 12341234]


def test_list_with_numbers_and_strings() -> None:
    # Mixed-type lists keep each member's original type.
    assert next(JsonParser().parse(json.dumps(["x", 1324, "y"]))) == ["x", 1324, "y"]


def test_list_with_variables() -> None:
    # Unquoted list members are not valid JSON values; they become Variables.
    assert next(JsonParser().parse("[v.a, v.b]")) == [Variable("v.a"), Variable("v.b")]


def test_dict_with_key_containing_a_special_char() -> None:
    # A ':' inside a quoted key must not be mistaken for the key/value separator.
    assert next(JsonParser().parse(json.dumps({"a:a": "b"}))) == {"a:a": "b"}


def test_dict_with_value_containing_a_special_char() -> None:
    # A ':' inside a quoted value must not be mistaken for the key/value separator.
    assert next(JsonParser().parse(json.dumps({"a": "b:b"}))) == {"a": "b:b"}


@pytest.mark.parametrize(
    "original",
    [
        [{"a": "legit", "b": 1}, {"a": 400, "b": 2}],
        {"a": "legit", "b": {"nr": 25}},
    ],
)
def test_dict_containing_a_number(original: Any) -> None:
    # Numbers round-trip as ints, whether at the top level or nested.
    # NOTE: `original` is a list or dict fixture, hence the `Any` annotation.
    assert next(JsonParser().parse(json.dumps(original))) == original


def test_dict_containing_a_variable() -> None:
    # An unquoted value (qwer) becomes a Variable; quoted digits stay strings.
    original = "[{'a':'legit', 'b':1}, {'a':qwer, 'b':'2'}]"
    assert next(JsonParser().parse(original)) == [
        {"a": "legit", "b": 1},
        {"a": Variable("qwer"), "b": "2"},
    ]


def test_unusual_quotes() -> None:
original = "[{’a’:1, ’b’:true}, {’a’:2, ’b’:null}, {’a’:3}]"
assert JsonParser().parse(original) == [
assert next(JsonParser().parse(original)) == [
{"a": 1, "b": True},
{"a": 2, "b": Variable(None)},
{"a": 3},
Expand All @@ -96,7 +97,7 @@ def test_parse_multiple_objects() -> None:
}
"""
assert JsonParser().parse(multi_object_string) == [
assert list(JsonParser().parse(multi_object_string)) == [
{"a1": "v1", "a1": "v2"},
{"a2": "w1", "a2": "w2"},
{"a3": "z"},
Expand All @@ -112,15 +113,23 @@ def test_parse_multiple_objects() -> None:
],
)
def test_list_and_string_are_siblings(source: Any) -> None: # type: ignore[misc]
assert JsonParser().parse(json.dumps(source)) == source
assert next(JsonParser().parse(json.dumps(source))) == source


def test_bool_parser() -> None:
    # JSON booleans are parsed into Python bools.
    assert next(JsonParser().parse(json.dumps({"sth": False}))) == {"sth": False}


def test_multiline_bool_parser() -> None:
    # Two newline-separated root documents are yielded as two results.
    obj1 = {"sth": False}
    obj2 = {"k1": "v1"}
    combined = json.dumps(obj1) + "\n" + json.dumps(obj2)
    assert list(JsonParser().parse(combined)) == [obj1, obj2]


@pytest.mark.parametrize("nr_of_docs", [1, 25, 2500])
def test_large_object(nr_of_docs: int) -> None:
    # Regression test: the old recursive parser raised RecursionError /
    # StackOverflow on large sets of concatenated documents. The generator
    # implementation must handle 2500 back-to-back docs without recursing.
    data = "".join(
        [json.dumps({"pk": f"pk{i}", "data": str(uuid4())}) for i in range(nr_of_docs)]
    )

    res = list(JsonParser().parse(data))
    assert len(res) == nr_of_docs

0 comments on commit ebf4901

Please sign in to comment.