Skip to content

Commit

Permalink
Handle denormalized properties everywhere* (#5635)
Browse files Browse the repository at this point in the history
* WIP: port process_math to support materialized columns

* Add skipped test showing trend breakdowns dont use materialized columns

* Simplify testing and test&fix math property aggregation w/ materialized columns

* Add (failing) test for filtering with materialized action props

* Add test around materialized property filtering

* Refactor entity.math materialization impl

* Make trends breakdowns work with materialized columns

* Simplify process_math further

* Handle denormalized properties in format_action_filter for step.properties

Note the following files all called this method:
ee/clickhouse/views/events.py
ee/clickhouse/views/actions.py
ee/clickhouse/queries/trends/util.py
ee/clickhouse/queries/trends/lifecycle.py
ee/clickhouse/queries/trends/breakdown.py
ee/clickhouse/queries/funnels/base.py
ee/clickhouse/queries/sessions/util.py
ee/clickhouse/queries/clickhouse_stickiness.py
ee/clickhouse/queries/clickhouse_retention.py
ee/clickhouse/models/cohort.py

I verified all of them are OK since they query events table directly
with the passed filter

* Handle materialized $current_url in action step filtering

* Remove now unneeded clause

* Update test helper

* Allow denormalized props for filtering with breakdowns

* Allow denormalized props for filtering with lifecycle

* Allow denormalized props for some views

* Fix entity math yet again

* Query materialized columns in insights > sessions

* Handle breakdown edge case

* Allow denormalized props for more views

* PR feedback

* reformat
  • Loading branch information
macobo authored Aug 19, 2021
1 parent 1402faf commit 0f58482
Show file tree
Hide file tree
Showing 19 changed files with 168 additions and 58 deletions.
23 changes: 11 additions & 12 deletions ee/clickhouse/models/action.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ def format_action_filter(
team_id=action.team.pk if filter_by_team else None,
prepend=f"action_props_{action.pk}_{step.pk}",
table_name=table_name,
allow_denormalized_props=True,
)
conditions.append(prop_query.replace("AND", "", 1))
params = {**params, **prop_params}
Expand All @@ -61,28 +62,26 @@ def format_action_filter(
def filter_event(
step: ActionStep, prepend: str = "event", index: int = 0, table_name: str = ""
) -> Tuple[List[str], Dict]:
from ee.clickhouse.models.property import get_property_string_expr

params = {"{}_{}".format(prepend, index): step.event}
conditions = []

if table_name != "":
table_name += "."

if step.url:
value_expr, _ = get_property_string_expr("events", "$current_url", "'$current_url'", f"{table_name}properties")
prop_name = f"{prepend}_prop_val_{index}"
if step.url_matching == ActionStep.EXACT:
conditions.append(
f"JSONExtractString({table_name}properties, '$current_url') = %({prepend}_prop_val_{index})s"
)
params.update({f"{prepend}_prop_val_{index}": step.url})
conditions.append(f"{value_expr} = %({prop_name})s")
params.update({prop_name: step.url})
elif step.url_matching == ActionStep.REGEX:
conditions.append(
f"match(JSONExtractString({table_name}properties, '$current_url'), %({prepend}_prop_val_{index})s)"
)
params.update({f"{prepend}_prop_val_{index}": step.url})
conditions.append(f"match({value_expr}, %({prop_name})s)")
params.update({prop_name: step.url})
else:
conditions.append(
f"JSONExtractString({table_name}properties, '$current_url') LIKE %({prepend}_prop_val_{index})s"
)
params.update({f"{prepend}_prop_val_{index}": f"%{step.url}%"})
conditions.append(f"{value_expr} LIKE %({prop_name})s")
params.update({prop_name: f"%{step.url}%"})

conditions.append(f"event = %({prepend}_{index})s")

Expand Down
6 changes: 5 additions & 1 deletion ee/clickhouse/models/property.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,11 @@ def property_table(property: Property) -> TableWithProperties:


def get_property_string_expr(
table: TableWithProperties, property_name: PropertyName, var: str, prop_var: str, allow_denormalized_props: bool
table: TableWithProperties,
property_name: PropertyName,
var: str,
prop_var: str,
allow_denormalized_props: bool = True,
) -> Tuple[str, bool]:
materialized_columns = get_materialized_columns(table) if allow_denormalized_props else {}

Expand Down
4 changes: 1 addition & 3 deletions ee/clickhouse/queries/breakdown_props.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,9 +82,7 @@ def get_breakdown_event_prop_values(

entity_params, entity_format_params = populate_entity_params(entity)

value_expression, _ = get_property_string_expr(
"events", cast(str, filter.breakdown), "%(key)s", "properties", allow_denormalized_props=True
)
value_expression, _ = get_property_string_expr("events", cast(str, filter.breakdown), "%(key)s", "properties")

elements_query = TOP_ELEMENTS_ARRAY_OF_KEY_SQL.format(
value_expression=value_expression,
Expand Down
10 changes: 3 additions & 7 deletions ee/clickhouse/queries/clickhouse_paths.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,18 +16,14 @@ class ClickhousePaths(Paths):
def _determine_path_type(self, requested_type=None):
# Default
event: Optional[str] = "$pageview"
path_type, _ = get_property_string_expr(
"events", "$current_url", "'$current_url'", "properties", allow_denormalized_props=True
)
path_type, _ = get_property_string_expr("events", "$current_url", "'$current_url'", "properties")
start_comparator = "path_type"

# determine requested type
if requested_type:
if requested_type == SCREEN_EVENT:
event = SCREEN_EVENT
path_type, _ = get_property_string_expr(
"events", "$screen_name", "'$screen_name'", "properties", allow_denormalized_props=True
)
path_type, _ = get_property_string_expr("events", "$screen_name", "'$screen_name'", "properties")
elif requested_type == AUTOCAPTURE_EVENT:
event = AUTOCAPTURE_EVENT
path_type = "concat('<', {tag_regex}, '> ', {text_regex})".format(
Expand All @@ -44,7 +40,7 @@ def get_query(self, filter: PathFilter, team: Team) -> Tuple[str, Dict]:
event, path_type, start_comparator = self._determine_path_type(filter.path_type if filter else None)

prop_filters, prop_filter_params = parse_prop_clauses(
filter.properties, team.pk, filter_test_accounts=filter.filter_test_accounts
filter.properties, team.pk, filter_test_accounts=filter.filter_test_accounts, allow_denormalized_props=True
)

# Step 0. Event culling subexpression for step 1.
Expand Down
6 changes: 3 additions & 3 deletions ee/clickhouse/queries/clickhouse_retention.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ class ClickhouseRetention(Retention):
def _execute_sql(self, filter: RetentionFilter, team: Team,) -> Dict[Tuple[int, int], Dict[str, Any]]:
period = filter.period
prop_filters, prop_filter_params = parse_prop_clauses(
filter.properties, team.pk, filter_test_accounts=filter.filter_test_accounts
filter.properties, team.pk, filter_test_accounts=filter.filter_test_accounts, allow_denormalized_props=True
)
target_entity = filter.target_entity
returning_entity = filter.returning_entity
Expand Down Expand Up @@ -152,7 +152,7 @@ def _retrieve_people(self, filter: RetentionFilter, team: Team):
is_first_time_retention = filter.retention_type == RETENTION_FIRST_TIME
trunc_func = get_trunc_func_ch(period)
prop_filters, prop_filter_params = parse_prop_clauses(
filter.properties, team.pk, filter_test_accounts=filter.filter_test_accounts
filter.properties, team.pk, filter_test_accounts=filter.filter_test_accounts, allow_denormalized_props=True
)

returning_entity = filter.returning_entity if filter.selected_interval > 0 else filter.target_entity
Expand Down Expand Up @@ -209,7 +209,7 @@ def _retrieve_people_in_period(self, filter: RetentionFilter, team: Team):
period = filter.period
is_first_time_retention = filter.retention_type == RETENTION_FIRST_TIME
trunc_func = get_trunc_func_ch(period)
prop_filters, prop_filter_params = parse_prop_clauses(filter.properties, team.pk)
prop_filters, prop_filter_params = parse_prop_clauses(filter.properties, team.pk, allow_denormalized_props=True)

target_query, target_params = self._get_condition(filter.target_entity, table="e")
target_query_formatted = "AND {target_query}".format(target_query=target_query)
Expand Down
10 changes: 8 additions & 2 deletions ee/clickhouse/queries/clickhouse_stickiness.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,10 @@ def stickiness(self, entity: Entity, filter: StickinessFilter, team_id: int) ->

parsed_date_from, parsed_date_to, _ = parse_timestamps(filter=filter, team_id=team_id)
prop_filters, prop_filter_params = parse_prop_clauses(
filter.properties + entity.properties, team_id, filter_test_accounts=filter.filter_test_accounts
filter.properties + entity.properties,
team_id,
filter_test_accounts=filter.filter_test_accounts,
allow_denormalized_props=True,
)
trunc_func = get_trunc_func_ch(filter.interval)

Expand Down Expand Up @@ -93,7 +96,10 @@ def _format_entity_filter(entity: Entity) -> Tuple[str, Dict]:
def _process_content_sql(target_entity: Entity, filter: StickinessFilter, team: Team) -> Tuple[str, Dict[str, Any]]:
parsed_date_from, parsed_date_to, _ = parse_timestamps(filter=filter, team_id=team.pk)
prop_filters, prop_filter_params = parse_prop_clauses(
filter.properties + target_entity.properties, team.pk, filter_test_accounts=filter.filter_test_accounts
filter.properties + target_entity.properties,
team.pk,
filter_test_accounts=filter.filter_test_accounts,
allow_denormalized_props=True,
)
entity_sql, entity_params = _format_entity_filter(entity=target_entity)
trunc_func = get_trunc_func_ch(filter.interval)
Expand Down
4 changes: 1 addition & 3 deletions ee/clickhouse/queries/column_optimizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,7 @@ def materialized_event_columns_to_query(self) -> List[ColumnName]:

@cached_property
def should_query_event_properties_column(self) -> bool:
# :TODO: Once issue 5463 is solved (supporting materialized columns everywhere), uncomment this
# return len(self.materialized_event_columns_to_query) != len(self._used_properties_with_type("event"))
return True
return len(self.materialized_event_columns_to_query) != len(self._used_properties_with_type("event"))

@cached_property
def should_query_elements_chain_column(self) -> bool:
Expand Down
2 changes: 1 addition & 1 deletion ee/clickhouse/queries/sessions/average.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def calculate_avg(self, filter: Filter, team: Team):
parsed_date_from, parsed_date_to, _ = parse_timestamps(filter, team.pk)

filters, params = parse_prop_clauses(
filter.properties, team.pk, filter_test_accounts=filter.filter_test_accounts
filter.properties, team.pk, filter_test_accounts=filter.filter_test_accounts, allow_denormalized_props=True
)

trunc_func = get_trunc_func_ch(filter.interval)
Expand Down
2 changes: 1 addition & 1 deletion ee/clickhouse/queries/sessions/distribution.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ def calculate_dist(self, filter: Filter, team: Team):
parsed_date_from, parsed_date_to, _ = parse_timestamps(filter, team.pk)

filters, params = parse_prop_clauses(
filter.properties, team.pk, filter_test_accounts=filter.filter_test_accounts
filter.properties, team.pk, filter_test_accounts=filter.filter_test_accounts, allow_denormalized_props=True
)

entity_conditions, entity_params = entity_query_conditions(filter, team)
Expand Down
3 changes: 1 addition & 2 deletions ee/clickhouse/queries/test/test_event_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,7 @@ def test_basic_event_filter(self):
)

correct = """
SELECT e.timestamp as timestamp,
e.properties as properties
SELECT e.timestamp as timestamp
FROM events e
WHERE team_id = %(team_id)s
AND event = %(event)s
Expand Down
89 changes: 86 additions & 3 deletions ee/clickhouse/queries/test/test_trends.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
from uuid import uuid4

from django.conf import settings
from django.utils import timezone
from freezegun import freeze_time

from ee.clickhouse.materialized_columns import materialize
from ee.clickhouse.models.event import create_event
from ee.clickhouse.models.person import create_person, create_person_distinct_id
from ee.clickhouse.models.person import create_person_distinct_id
from ee.clickhouse.queries.trends.clickhouse_trends import ClickhouseTrends
from ee.clickhouse.util import ClickhouseTestMixin
from posthog.constants import TRENDS_BAR_VALUE
Expand All @@ -15,7 +15,6 @@
from posthog.models.filters import Filter
from posthog.models.person import Person
from posthog.queries.test.test_trends import trend_test_factory
from posthog.settings import SHELL_PLUS_PRINT_SQL


def _create_action(**kwargs):
Expand Down Expand Up @@ -409,6 +408,7 @@ def test_breakdown_user_props_with_filter(self):
),
self.team,
)

self.assertEqual(len(response), 1)
self.assertEqual(response[0]["breakdown_value"], "test@gmail.com")

Expand Down Expand Up @@ -714,3 +714,86 @@ def test_breakdown_single_cohort(self):
)

self.assertEqual(res[0]["count"], 1)

def test_materialized_property_filtering(self):
materialize("events", "$some_property")

with self.capture_select_queries() as sqls:
self.test_property_filtering()

for sql in sqls:
self.assertNotIn("JSONExtract", sql)
self.assertNotIn("properties", sql)

def test_materialized_per_entity_filtering(self):
materialize("events", "$some_property")

with self.capture_select_queries() as sqls:
self.test_per_entity_filtering()

for sql in sqls:
self.assertNotIn("JSONExtract", sql)
self.assertNotIn("properties", sql)

def test_materialized_breakdown_queries_right_columns(self):
materialize("events", "$some_property")

with self.capture_select_queries() as sqls:
self.test_breakdown_filtering_limit()

for sql in sqls:
self.assertNotIn("JSONExtract", sql)
self.assertNotIn("properties", sql)

def test_materialized_math_queries_right_columns(self):
materialize("events", "some_number")

with self.capture_select_queries() as sqls:
self.test_sum_filtering()

for sql in sqls:
self.assertNotIn("JSONExtract", sql)
self.assertNotIn("properties", sql)

def test_materialized_filtering_with_action_props(self):
materialize("events", "key")
materialize("events", "$current_url")

_create_event(
event="sign up",
distinct_id="person1",
team=self.team,
properties={"key": "val", "$current_url": "/some/page"},
)
_create_event(
event="sign up",
distinct_id="person2",
team=self.team,
properties={"key": "val", "$current_url": "/some/page"},
)
_create_event(
event="sign up",
distinct_id="person3",
team=self.team,
properties={"key": "val", "$current_url": "/another/page"},
)

action = Action.objects.create(name="sign up", team=self.team)
ActionStep.objects.create(
action=action,
event="sign up",
url="/some/page",
properties=[{"key": "key", "type": "event", "value": ["val"], "operator": "exact"}],
)

with self.capture_select_queries() as sqls:
response = ClickhouseTrends().run(
Filter(data={"date_from": "-14d", "actions": [{"id": action.pk, "type": "actions", "order": 0}],}),
self.team,
)

self.assertEqual(response[0]["count"], 2)

for sql in sqls:
self.assertNotIn("JSONExtract", sql)
self.assertNotIn("properties", sql)
22 changes: 14 additions & 8 deletions ee/clickhouse/queries/trends/breakdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from ee.clickhouse.client import sync_execute
from ee.clickhouse.models.action import format_action_filter
from ee.clickhouse.models.property import parse_prop_clauses
from ee.clickhouse.models.property import get_property_string_expr, parse_prop_clauses
from ee.clickhouse.queries.breakdown_props import (
ALL_USERS_COHORT_ID,
format_breakdown_cohort_join_query,
Expand Down Expand Up @@ -41,7 +41,11 @@ def _format_breakdown_query(self, entity: Entity, filter: Filter, team_id: int)

props_to_filter = [*filter.properties, *entity.properties]
prop_filters, prop_filter_params = parse_prop_clauses(
props_to_filter, team_id, table_name="e", filter_test_accounts=filter.filter_test_accounts
props_to_filter,
team_id,
table_name="e",
filter_test_accounts=filter.filter_test_accounts,
allow_denormalized_props=True,
)
aggregate_operation, _, math_params = process_math(entity)

Expand Down Expand Up @@ -183,14 +187,16 @@ def _breakdown_prop_params(
values_arr = get_breakdown_event_prop_values(
filter, entity, aggregate_operation, team_id, extra_params=math_params
)
params = {
"values": values_arr,
}

# :TRICKY: We only support string breakdown for event/person properties
assert isinstance(filter.breakdown, str)
breakdown_value, _ = get_property_string_expr("events", filter.breakdown, "%(key)s", "properties")

return (
params,
{"values": values_arr, "key": filter.breakdown},
BREAKDOWN_PROP_JOIN_SQL,
{},
"trim(BOTH '\"' FROM JSONExtractRaw(properties, %(key)s))",
{"breakdown_value_expr": breakdown_value},
breakdown_value,
)

def _parse_single_aggregate_result(
Expand Down
4 changes: 2 additions & 2 deletions ee/clickhouse/queries/trends/lifecycle.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def _format_lifecycle_query(self, entity: Entity, filter: Filter, team_id: int)

props_to_filter = [*filter.properties, *entity.properties]
prop_filters, prop_filter_params = parse_prop_clauses(
props_to_filter, team_id, filter_test_accounts=filter.filter_test_accounts
props_to_filter, team_id, filter_test_accounts=filter.filter_test_accounts, allow_denormalized_props=True
)

_, _, date_params = parse_timestamps(filter=filter, team_id=team_id)
Expand Down Expand Up @@ -141,7 +141,7 @@ def get_people(

props_to_filter = [*filter.properties, *entity.properties]
prop_filters, prop_filter_params = parse_prop_clauses(
props_to_filter, team_id, filter_test_accounts=filter.filter_test_accounts
props_to_filter, team_id, filter_test_accounts=filter.filter_test_accounts, allow_denormalized_props=True
)

result = sync_execute(
Expand Down
Loading

0 comments on commit 0f58482

Please sign in to comment.