Implement SkipScan to speed up SELECT DISTINCT
This patch implements a skip-scan, an optimization for SELECT DISTINCT ON.
Usually for SELECT DISTINCT ON Postgres will plan either a UNIQUE over a
sorted path, or some form of aggregate. In either case it needs to scan the
entire table, even when there are only a few unique values.

A skip-scan optimizes this case when we have an ordered index. Instead of
scanning the entire table and deduplicating afterwards, the scan remembers the
last value returned and searches the index for the next value after that one.
This means that for a table with k keys, of which u are distinct, a skip-scan
runs in time u * log(k), as opposed to scanning then deduplicating, which
takes time k. We can write the number of unique values u as a function of k by
dividing by the average number of repeats r, i.e. u = k/r. This means a
skip-scan will be faster whenever each key is repeated more than a logarithmic
number of times: if r > log(k) then u * log(k) = (k/r) * log(k) < (k/log(k)) * log(k) = k.
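The complexity argument above can be illustrated with a small standalone sketch. This is plain Python over a sorted list, not the patch's C code: each "skip" is one binary search, so the whole scan costs roughly u searches of log(k) comparisons each.

```python
import bisect

def distinct_skip_scan(sorted_keys):
    """Return the distinct values of a sorted list by repeatedly
    binary-searching for the position just past the last value
    returned -- u searches of cost O(log k) each."""
    distinct = []
    pos = 0
    while pos < len(sorted_keys):
        value = sorted_keys[pos]
        distinct.append(value)
        # jump past every repeat of `value` in one O(log k) search
        pos = bisect.bisect_right(sorted_keys, value, pos)
    return distinct

# k = 1,000,000 keys but only u = 4 distinct values:
keys = sorted([1, 2, 3, 4] * 250_000)
print(distinct_skip_scan(keys))  # [1, 2, 3, 4]
```

With r = 250,000 repeats per key, r is far larger than log(k) ≈ 20, so the four binary searches beat a full scan-and-deduplicate by orders of magnitude.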

Co-authored-by: Joshua Lockerman <josh@timescale.com>
svenklemm and JLockerman committed Apr 8, 2021
1 parent 639aef7 commit bddcbba
Showing 29 changed files with 13,405 additions and 5 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/alpine-32bit-build-and-test.yaml
@@ -22,11 +22,11 @@ jobs:
build_type: [ Debug ]
include:
- pg: 11.11
ignores: append-11 chunk_adaptive-11 continuous_aggs_bgw_drop_chunks remote_txn transparent_decompression-11 continuous_aggs_insert continuous_aggs_multi continuous_aggs_concurrent_refresh
ignores: append-11 chunk_adaptive-11 continuous_aggs_bgw_drop_chunks remote_txn transparent_decompression-11 continuous_aggs_insert continuous_aggs_multi continuous_aggs_concurrent_refresh plan_skip_scan-11
- pg: 12.6
ignores: append-12 chunk_adaptive-12 continuous_aggs_bgw_drop_chunks remote_txn transparent_decompression-12
ignores: append-12 chunk_adaptive-12 continuous_aggs_bgw_drop_chunks remote_txn transparent_decompression-12 plan_skip_scan-12
- pg: 13.2
ignores: append-13 chunk_adaptive-13 remote_txn transparent_decompression-13 vacuum_parallel
ignores: append-13 chunk_adaptive-13 remote_txn transparent_decompression-13 vacuum_parallel plan_skip_scan-13

steps:

3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -6,6 +6,9 @@ accidentally triggering the load of a previous DB version.**

## Unreleased

**Major Features**
* #3000 SkipScan to speed up SELECT DISTINCT

**Bugfixes**
* #2989 Refactor and harden size and stats functions
* #3058 Reduce memory usage for distributed inserts
7 changes: 7 additions & 0 deletions src/chunk_append/chunk_append.c
@@ -34,6 +34,13 @@ static CustomPathMethods chunk_append_path_methods = {
.PlanCustomPath = ts_chunk_append_plan_create,
};

bool
ts_is_chunk_append_path(Path *path)
{
return IsA(path, CustomPath) &&
castNode(CustomPath, path)->methods == &chunk_append_path_methods;
}

static bool
has_joins(FromExpr *jointree)
{
1 change: 1 addition & 0 deletions src/chunk_append/chunk_append.h
@@ -27,5 +27,6 @@ extern Path *ts_chunk_append_path_create(PlannerInfo *root, RelOptInfo *rel, Hyp
extern bool ts_ordered_append_should_optimize(PlannerInfo *root, RelOptInfo *rel, Hypertable *ht,
List *join_conditions, int *order_attno,
bool *reverse);
extern TSDLLEXPORT bool ts_is_chunk_append_path(Path *path);

#endif /* TIMESCALEDB_CHUNK_APPEND_H */
1 change: 1 addition & 0 deletions src/compat.h
@@ -371,6 +371,7 @@ get_vacuum_options(const VacuumStmt *stmt)
#define list_delete_cell_compat(l, lc, prev) list_delete_cell((l), (lc))
#define list_make5(x1, x2, x3, x4, x5) lappend(list_make4(x1, x2, x3, x4), x5)
#define list_make5_oid(x1, x2, x3, x4, x5) lappend_oid(list_make4_oid(x1, x2, x3, x4), x5)
#define list_make5_int(x1, x2, x3, x4, x5) lappend_int(list_make4_int(x1, x2, x3, x4), x5)
#define for_each_cell_compat(cell, list, initcell) for_each_cell (cell, list, initcell)
#endif

8 changes: 8 additions & 0 deletions src/constraint_aware_append.c
@@ -570,6 +570,14 @@ ts_constraint_aware_append_possible(Path *path)
}
return false;
}

bool
ts_is_constraint_aware_append_path(Path *path)
{
return IsA(path, CustomPath) &&
castNode(CustomPath, path)->methods == &constraint_aware_append_path_methods;
}

void
_constraint_aware_append_init(void)
{
3 changes: 2 additions & 1 deletion src/constraint_aware_append.h
@@ -24,7 +24,8 @@ typedef struct ConstraintAwareAppendState
typedef struct Hypertable Hypertable;

extern bool ts_constraint_aware_append_possible(Path *path);
extern Path *ts_constraint_aware_append_path_create(PlannerInfo *root, Path *subpath);
extern TSDLLEXPORT Path *ts_constraint_aware_append_path_create(PlannerInfo *root, Path *subpath);
extern TSDLLEXPORT bool ts_is_constraint_aware_append_path(Path *path);
extern void _constraint_aware_append_init(void);

#endif /* TIMESCALEDB_CONSTRAINT_AWARE_APPEND_H */
12 changes: 12 additions & 0 deletions src/guc.c
@@ -53,6 +53,7 @@ bool ts_guc_enable_cagg_reorder_groupby = true;
TSDLLEXPORT bool ts_guc_enable_transparent_decompression = true;
bool ts_guc_enable_per_data_node_queries = true;
bool ts_guc_enable_async_append = true;
TSDLLEXPORT bool ts_guc_enable_skip_scan = true;
int ts_guc_max_open_chunks_per_insert = 10;
int ts_guc_max_cached_chunks_per_hypertable = 10;
int ts_guc_telemetry_level = TELEMETRY_DEFAULT;
@@ -197,6 +198,17 @@ _guc_init(void)
NULL,
NULL);

DefineCustomBoolVariable("timescaledb.enable_skipscan",
"Enable SkipScan",
"Enable SkipScan for DISTINCT queries",
&ts_guc_enable_skip_scan,
true,
PGC_USERSET,
0,
NULL,
NULL,
NULL);

DefineCustomBoolVariable("timescaledb.enable_cagg_reorder_groupby",
"Enable group by reordering",
"Enable group by clause reordering for continuous aggregates",
1 change: 1 addition & 0 deletions src/guc.h
@@ -23,6 +23,7 @@ extern bool ts_guc_enable_cagg_reorder_groupby;
extern TSDLLEXPORT bool ts_guc_enable_transparent_decompression;
extern TSDLLEXPORT bool ts_guc_enable_per_data_node_queries;
extern TSDLLEXPORT bool ts_guc_enable_async_append;
extern TSDLLEXPORT bool ts_guc_enable_skip_scan;
extern bool ts_guc_restoring;
extern int ts_guc_max_open_chunks_per_insert;
extern int ts_guc_max_cached_chunks_per_hypertable;
2 changes: 1 addition & 1 deletion src/import/planner.h
@@ -54,7 +54,7 @@ extern TSDLLEXPORT PathKey *
ts_make_pathkey_from_sortinfo(PlannerInfo *root, Expr *expr, Relids nullable_relids, Oid opfamily,
Oid opcintype, Oid collation, bool reverse_sort, bool nulls_first,
Index sortref, Relids rel, bool create_it);
extern List *ts_build_path_tlist(PlannerInfo *root, Path *path);
extern TSDLLEXPORT List *ts_build_path_tlist(PlannerInfo *root, Path *path);

extern void ts_ExecSetTupleBound(int64 tuples_needed, PlanState *child_node);

2 changes: 2 additions & 0 deletions tsl/src/init.c
@@ -36,6 +36,7 @@
#include "hypertable.h"
#include "license_guc.h"
#include "nodes/decompress_chunk/planner.h"
#include "nodes/skip_scan/skip_scan.h"
#include "nodes/gapfill/gapfill.h"
#include "partialize_finalize.h"
#include "planner.h"
@@ -200,6 +201,7 @@ ts_module_init(PG_FUNCTION_ARGS)

_continuous_aggs_cache_inval_init();
_decompress_chunk_init();
_skip_scan_init();
_remote_connection_cache_init();
_remote_dist_txn_init();
_tsl_process_utility_init();
1 change: 1 addition & 0 deletions tsl/src/nodes/CMakeLists.txt
@@ -1,3 +1,4 @@
add_subdirectory(compress_dml)
add_subdirectory(decompress_chunk)
add_subdirectory(gapfill)
add_subdirectory(skip_scan)
5 changes: 5 additions & 0 deletions tsl/src/nodes/skip_scan/CMakeLists.txt
@@ -0,0 +1,5 @@
set(SOURCES
${CMAKE_CURRENT_SOURCE_DIR}/planner.c
${CMAKE_CURRENT_SOURCE_DIR}/exec.c
)
target_sources(${TSL_LIBRARY_NAME} PRIVATE ${SOURCES})
87 changes: 87 additions & 0 deletions tsl/src/nodes/skip_scan/README.md
@@ -0,0 +1,87 @@
# SkipScan #

This module implements a skip-scan, an optimization for `SELECT DISTINCT ON`.
Usually for `SELECT DISTINCT ON` Postgres will plan either a `UNIQUE` over a
sorted path, or some form of aggregate. In either case it needs to scan the
entire table, even when there are only a few unique values.

A skip-scan optimizes this case when we have an ordered index. Instead of
scanning the entire table and deduplicating afterwards, the scan remembers the
last value returned and searches the index for the next value after that one.
This means that for a table with `k` keys, of which `u` are distinct, a
skip-scan runs in time `u * log(k)`, as opposed to scanning then
deduplicating, which takes time `k`. We can write the number of unique values
`u` as a function of `k` by dividing by the average number of repeats `r`,
i.e. `u = k/r`. This means a skip-scan will be faster whenever each key is
repeated more than a logarithmic number of times: if `r > log(k)` then
`u * log(k) = (k/r) * log(k) < (k/log(k)) * log(k) = k`.


## Implementation ##

We plan our skip-scan with a plan tree like the following:

```SQL
Custom Scan (SkipScan) on table
-> Index Scan using table_key_idx on table
Index Cond: (key > NULL)
```

After each iteration through the `SkipScan` we replace the `key > NULL` with
a `key > [next value we are returning]` and restart the underlying `IndexScan`.
There are some subtleties around `NULL` handling; see the source file for more
detail.
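That loop can be modeled with a toy sketch. This is plain Python, not the node's C implementation: the hypothetical `index_scan(table, lower_bound)` stands in for restarting the underlying `IndexScan` with the rewritten `key > [last value]` qual (`None` plays the role of the initial `key > NULL` placeholder).

```python
def index_scan(table, lower_bound):
    """Stand-in for the underlying IndexScan: yield rows whose key
    satisfies `key > lower_bound`, in key order. In the real node each
    restart is an O(log k) index search, not this linear filter."""
    for key, payload in table:
        if lower_bound is None or key > lower_bound:
            yield key, payload

def skip_scan(table):
    """Model of the SkipScan node: remember the last key returned and
    restart the index scan with `key > last` after each row."""
    last = None
    while True:
        row = next(index_scan(table, last), None)
        if row is None:
            return
        yield row
        last = row[0]

index = sorted([("dev1", 1), ("dev1", 2), ("dev2", 3), ("dev3", 4), ("dev3", 5)])
print(list(skip_scan(index)))  # [('dev1', 1), ('dev2', 3), ('dev3', 4)]
```

Each yielded row is the first row of a new distinct key, which is exactly the `DISTINCT ON (key)` semantics the node implements.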


## Planning Heuristics ##

To plan our SkipScan we look for a compatible plan, for instance

```SQL
Unique
-> Index Scan
```

or

```SQL
Unique
-> Merge Append
-> Index Scan
...
```

Given such a plan, we know the index is sorted in an order with the distinct
key(s) first, so we can add quals to the `IndexScan` representing the previous
key returned, and thus skip over the repeated values. The `Unique` node tells
us which columns are relevant.

We use this to create plans that look like

```SQL
Unique
-> Custom Scan (SkipScan) on skip_scan
-> Index Scan using skip_scan_dev_name_idx on skip_scan
```

or

```SQL
Unique
-> Merge Append
Sort Key: _hyper_2_1_chunk.dev_name
-> Custom Scan (SkipScan) on _hyper_2_1_chunk
-> Index Scan using _hyper_2_1_chunk_idx on _hyper_2_1_chunk
-> Custom Scan (SkipScan) on _hyper_2_2_chunk
-> Index Scan using _hyper_2_2_chunk_idx on _hyper_2_2_chunk
```

respectively.
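The rewrite these heuristics describe can be sketched as pattern-matching on plan shapes. The nested-tuple plan representation and the function names here are purely illustrative, not the planner's actual API:

```python
# Hypothetical, simplified plan trees: (node_type, *children).
def add_skip_scans(plan):
    """Rewrite Unique -> IndexScan and Unique -> MergeAppend -> IndexScan
    shapes by wrapping each IndexScan in a SkipScan node."""
    node, *children = plan
    if node == "Unique":
        return ("Unique", *[wrap(child) for child in children])
    return plan  # no compatible shape found; leave the plan alone

def wrap(plan):
    node, *children = plan
    if node == "IndexScan":
        return ("SkipScan", plan)
    if node == "MergeAppend":
        # recurse so every per-chunk IndexScan gets its own SkipScan
        return ("MergeAppend", *[wrap(child) for child in children])
    return plan

before = ("Unique", ("MergeAppend", ("IndexScan",), ("IndexScan",)))
print(add_skip_scans(before))
# ('Unique', ('MergeAppend', ('SkipScan', ('IndexScan',)), ('SkipScan', ('IndexScan',))))
```

Any plan that does not match one of the two shapes passes through unchanged, mirroring how the planner only applies the optimization when a compatible `Unique` path is found.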

## Postgres-Native Skip Scan ##

Upstream Postgres is also working on a skip-scan implementation; see e.g.
https://commitfest.postgresql.org/32/1741/
As of when this document was first written, it had not yet been merged. Their
strategy involves integrating this functionality into the btree search code,
and it will be available in PG15 at the earliest. The two implementations
should not interfere with each other.
