From 8a4bdc42dfbe8bb73c671889b0b44bfbbdbeafd3 Mon Sep 17 00:00:00 2001 From: wypb Date: Tue, 4 Jun 2024 15:18:19 +0800 Subject: [PATCH] [native] Add support for converting VARBINARY filters to Velox filters --- .../main/types/PrestoToVeloxConnector.cpp | 4 ++ .../AbstractTestNativeGeneralQueries.java | 44 +++++++++++++ .../nativeworker/AbstractTestWriter.java | 61 ++++++++++--------- 3 files changed, 81 insertions(+), 28 deletions(-) diff --git a/presto-native-execution/presto_cpp/main/types/PrestoToVeloxConnector.cpp b/presto-native-execution/presto_cpp/main/types/PrestoToVeloxConnector.cpp index 99e36c20de74..900beb05b474 100644 --- a/presto-native-execution/presto_cpp/main/types/PrestoToVeloxConnector.cpp +++ b/presto-native-execution/presto_cpp/main/types/PrestoToVeloxConnector.cpp @@ -235,6 +235,9 @@ std::string toString( const VeloxExprConverter& exprConverter, const TypePtr& type) { auto value = exprConverter.getConstantValue(type, *block); + if (type->isVarbinary()) { + return value.value(); + } return value.value(); } @@ -652,6 +655,7 @@ std::unique_ptr toFilter( case TypeKind::DOUBLE: return doubleRangeToFilter(range, nullAllowed, exprConverter, type); case TypeKind::VARCHAR: + case TypeKind::VARBINARY: return varcharRangeToFilter(range, nullAllowed, exprConverter, type); case TypeKind::BOOLEAN: return boolRangeToFilter(range, nullAllowed, exprConverter, type); diff --git a/presto-native-execution/src/test/java/com/facebook/presto/nativeworker/AbstractTestNativeGeneralQueries.java b/presto-native-execution/src/test/java/com/facebook/presto/nativeworker/AbstractTestNativeGeneralQueries.java index bd4f8afe34ae..9925486c4a84 100644 --- a/presto-native-execution/src/test/java/com/facebook/presto/nativeworker/AbstractTestNativeGeneralQueries.java +++ b/presto-native-execution/src/test/java/com/facebook/presto/nativeworker/AbstractTestNativeGeneralQueries.java @@ -1563,6 +1563,50 @@ public void testKeyBasedSamplingInlined() assertQuerySucceeds(session, "select 
count(1) from orders join lineitem using(orderkey)"); } + @Test + public void testColumnFilter() + { + String tmpTableName = generateRandomTableName(); + assertUpdate(format("CREATE TABLE %s " + + "AS " + + "SELECT c_boolean, c_bigint, c_double, c_timestamp, c_varchar, c_varbinary " + + "FROM ( " + + " VALUES " + + " (null, null, null, null, null, null), " + + " (true, BIGINT '1', DOUBLE '2.2', TIMESTAMP '2012-08-08 01:00', CAST('abc1' AS VARCHAR), to_ieee754_64(1))," + + " (false, BIGINT '0', DOUBLE '1.2', TIMESTAMP '2012-08-08 00:00', CAST('abc2' AS VARCHAR), to_ieee754_64(2))," + + " (true, BIGINT '2', DOUBLE '3.3', TIMESTAMP '2012-09-09 01:00', CAST('cba1' AS VARCHAR), to_ieee754_64(3)), " + + " (false, BIGINT '1', DOUBLE '2.3', TIMESTAMP '2012-09-09 00:00', CAST('cba2' AS VARCHAR), to_ieee754_64(4)) " + + ") AS x (c_boolean, c_bigint, c_double, c_timestamp, c_varchar, c_varbinary)", tmpTableName), 5); + + // NOTE: The query below does not list the c_timestamp field because Velox uses the America/Los_Angeles + // time zone when reading and writing TIMESTAMP type data in DWRF/ORC format (see https://github.com/facebookincubator/velox/issues/8127), + // while Presto Java uses the America/Bahia_Banderas time zone when reading TIMESTAMP during the test (see com.facebook.presto.hive.HiveQueryRunner), + // Therefore, the data read by the two engines will be inconsistent. 
+ + // BOOLEAN column filter + assertQuery(format("SELECT c_boolean, c_bigint, c_double, c_varchar, c_varbinary FROM %s WHERE c_boolean", tmpTableName)); + + // BIGINT column filter + assertQuery(format("SELECT c_boolean, c_bigint, c_double, c_varchar, c_varbinary FROM %s WHERE c_bigint = 0", tmpTableName)); + + // DOUBLE column filter + assertQuery(format("SELECT c_boolean, c_bigint, c_double, c_varchar, c_varbinary FROM %s WHERE c_double = 1.2", tmpTableName)); + + // VARCHAR column filter + assertQuery(format("SELECT c_boolean, c_bigint, c_double, c_varchar, c_varbinary FROM %s WHERE c_varchar = CAST('cba2' AS VARCHAR)", tmpTableName)); + + // TIMESTAMP column filter + assertQuery(format("SELECT * FROM %s WHERE c_TIMESTAMP = TIMESTAMP '2012-09-09 00:00'", tmpTableName), "VALUES(false, BIGINT '1', DOUBLE '2.3', TIMESTAMP '2012-09-09 00:00', CAST('cba2' AS VARCHAR), to_ieee754_64(4))"); + + // NOTE: Presto Java's DWRF format does not support pushing down VARBINARY type filters to TableScan, so we need to disable filter pushdown. 
+ Session session = Session.builder(getSession()) + .setCatalogSessionProperty("hive", "pushdown_filter_enabled", "false") + .build(); + // VARBINARY column filter + assertQuery(session, format("SELECT c_boolean, c_bigint, c_double, c_varchar, c_varbinary FROM %s WHERE c_varbinary = to_ieee754_64(1)", tmpTableName)); + } + private void assertQueryResultCount(String sql, int expectedResultCount) { assertEquals(getQueryRunner().execute(sql).getRowCount(), expectedResultCount); diff --git a/presto-native-execution/src/test/java/com/facebook/presto/nativeworker/AbstractTestWriter.java b/presto-native-execution/src/test/java/com/facebook/presto/nativeworker/AbstractTestWriter.java index d815594a5bec..11c924d59b3a 100644 --- a/presto-native-execution/src/test/java/com/facebook/presto/nativeworker/AbstractTestWriter.java +++ b/presto-native-execution/src/test/java/com/facebook/presto/nativeworker/AbstractTestWriter.java @@ -255,26 +255,24 @@ public void testCollectColumnStatisticsOnCreateTable() { Session session = buildSessionForTableWrite(); String tmpTableName = generateRandomTableName(); - // TODO: add varbinary test support once velox supports varbinary in value node - // https://github.com/prestodb/presto/blob/master/presto-native-execution/presto_cpp/main/types/PrestoToVeloxQueryPlan.cpp#L915 assertUpdate(session, format("" + "CREATE TABLE %s " + "WITH ( " + " partitioned_by = ARRAY['p_varchar'] " + ") " + "AS " + - "SELECT c_boolean, c_bigint, c_double, c_timestamp, c_varchar, c_array, p_varchar " + + "SELECT c_boolean, c_bigint, c_double, c_timestamp, c_varchar, c_varbinary, c_array, p_varchar " + "FROM ( " + " VALUES " + - " (null, null, null, null, null, null, 'p1'), " + - " (null, null, null, null, null, null, 'p1'), " + - " (true, BIGINT '1', DOUBLE '2.2', TIMESTAMP '2012-08-08 01:00', CAST('abc1' AS VARCHAR), sequence(0, 10), 'p1')," + - " (false, BIGINT '0', DOUBLE '1.2', TIMESTAMP '2012-08-08 00:00', CAST('abc2' AS VARCHAR), sequence(10, 20), 'p1')," + - " 
(null, null, null, null, null, null, 'p2'), " + - " (null, null, null, null, null, null, 'p2'), " + - " (true, BIGINT '2', DOUBLE '3.3', TIMESTAMP '2012-09-09 01:00', CAST('cba1' AS VARCHAR), sequence(20, 25), 'p2'), " + - " (false, BIGINT '1', DOUBLE '2.3', TIMESTAMP '2012-09-09 00:00', CAST('cba2' AS VARCHAR), sequence(30, 35), 'p2') " + - ") AS x (c_boolean, c_bigint, c_double, c_timestamp, c_varchar, c_array, p_varchar)", tmpTableName), 8); + " (null, null, null, null, null, null, null, 'p1'), " + + " (null, null, null, null, null, null, null, 'p1'), " + + " (true, BIGINT '1', DOUBLE '2.2', TIMESTAMP '2012-08-08 01:00', CAST('abc1' AS VARCHAR), to_ieee754_64(1), sequence(0, 10), 'p1')," + + " (false, BIGINT '0', DOUBLE '1.2', TIMESTAMP '2012-08-08 00:00', CAST('abc2' AS VARCHAR), to_ieee754_64(2), sequence(10, 20), 'p1')," + + " (null, null, null, null, null, null, null, 'p2'), " + + " (null, null, null, null, null, null, null, 'p2'), " + + " (true, BIGINT '2', DOUBLE '3.3', TIMESTAMP '2012-09-09 01:00', CAST('cba1' AS VARCHAR), to_ieee754_64(3), sequence(20, 25), 'p2'), " + + " (false, BIGINT '1', DOUBLE '2.3', TIMESTAMP '2012-09-09 00:00', CAST('cba2' AS VARCHAR), to_ieee754_64(4), sequence(30, 35), 'p2') " + + ") AS x (c_boolean, c_bigint, c_double, c_timestamp, c_varchar, c_varbinary, c_array, p_varchar)", tmpTableName), 8); assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar = 'p1')", tmpTableName), "SELECT * FROM (VALUES " + @@ -283,9 +281,10 @@ public void testCollectColumnStatisticsOnCreateTable() "('c_double', null, 2.0E0, 0.5E0, null, '1.2', '2.2', null), " + "('c_timestamp', null, 2.0E0, 0.5E0, null, null, null, null), " + "('c_varchar', 16.0E0, 2.0E0, 0.5E0, null, null, null, null), " + // 8.0 + "('c_varbinary', 24.0, null, 0.5E0, null, null, null, null), " + "('c_array', 184.0E0, null, 0.5, null, null, null, null), " + // 176 "('p_varchar', 8.0E0, 1.0E0, 0.0E0, null, null, null, null), " + - "(null, null, null, null, 4.0E0, null, 
null, null)) AS x (c_boolean, c_bigint, c_double, c_timestamp, c_varchar, c_array, p_varchar, h_varchar)"); + "(null, null, null, null, 4.0E0, null, null, null)) AS x (column_name, data_size, distinct_values_count, nulls_fraction, row_count, low_value, high_value, histogram)"); assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar = 'p2')", tmpTableName), "SELECT * FROM (VALUES " + "('c_boolean', null, 2.0E0, 0.5E0, null, null, null, null), " + @@ -293,9 +292,10 @@ public void testCollectColumnStatisticsOnCreateTable() "('c_double', null, 2.0E0, 0.5E0, null, '2.3', '3.3', null), " + "('c_timestamp', null, 2.0E0, 0.5E0, null, null, null, null), " + "('c_varchar', 16.0E0, 2.0E0, 0.5E0, null, null, null, null), " + // 8 + "('c_varbinary', 24.0, null, 0.5E0, null, null, null, null), " + "('c_array', 104.0E0, null, 0.5, null, null, null, null), " + // 96 "('p_varchar', 8.0E0, 1.0E0, 0.0E0, null, null, null, null), " + - "(null, null, null, null, 4.0E0, null, null, null)) AS x (c_boolean, c_bigint, c_double, c_timestamp, c_varchar, c_array, p_varchar, h_varchar)"); + "(null, null, null, null, 4.0E0, null, null, null)) AS x (column_name, data_size, distinct_values_count, nulls_fraction, row_count, low_value, high_value, histogram)"); // non existing partition assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar = 'p3')", tmpTableName), @@ -305,9 +305,10 @@ public void testCollectColumnStatisticsOnCreateTable() "('c_double', null, 0E0, 0E0, null, null, null, null), " + "('c_timestamp', null, 0E0, 0E0, null, null, null, null), " + "('c_varchar', 0E0, 0E0, 0E0, null, null, null, null), " + + "('c_varbinary', null, 0E0, 0E0, null, null, null, null), " + "('c_array', null, 0E0, 0E0, null, null, null, null), " + "('p_varchar', 0E0, 0E0, 0E0, null, null, null, null), " + - "(null, null, null, null, 0E0, null, null, null)) AS x (c_boolean, c_bigint, c_double, c_timestamp, c_varchar, c_array, p_varchar, h_varchar)"); + "(null, null, null, null, 0E0, 
null, null, null)) AS x (column_name, data_size, distinct_values_count, nulls_fraction, row_count, low_value, high_value, histogram)"); dropTableIfExists(tmpTableName); } @@ -324,6 +325,7 @@ public void testCollectColumnStatisticsOnInsert() " c_double DOUBLE, " + " c_timestamp TIMESTAMP, " + " c_varchar VARCHAR, " + + " c_varbinary VARBINARY, " + " c_array ARRAY(BIGINT), " + " p_varchar VARCHAR " + ") " + @@ -333,18 +335,18 @@ public void testCollectColumnStatisticsOnInsert() assertUpdate(format("" + "INSERT INTO %s " + - "SELECT c_boolean, c_bigint, c_double, c_timestamp, c_varchar, c_array, p_varchar " + + "SELECT c_boolean, c_bigint, c_double, c_timestamp, c_varchar, c_varbinary, c_array, p_varchar " + "FROM ( " + " VALUES " + - " (null, null, null, null, null, null, 'p1'), " + - " (null, null, null, null, null, null, 'p1'), " + - " (true, BIGINT '1', DOUBLE '2.2', TIMESTAMP '2012-08-08 01:00', CAST('abc1' AS VARCHAR), sequence(0, 10), 'p1')," + - " (false, BIGINT '0', DOUBLE '1.2', TIMESTAMP '2012-08-08 00:00', CAST('abc2' AS VARCHAR), sequence(10, 20), 'p1')," + - " (null, null, null, null, null, null, 'p2'), " + - " (null, null, null, null, null, null, 'p2'), " + - " (true, BIGINT '2', DOUBLE '3.3', TIMESTAMP '2012-09-09 01:00', CAST('cba1' AS VARCHAR), sequence(20, 25), 'p2'), " + - " (false, BIGINT '1', DOUBLE '2.3', TIMESTAMP '2012-09-09 00:00', CAST('cba2' AS VARCHAR), sequence(30, 35), 'p2') " + - ") AS x (c_boolean, c_bigint, c_double, c_timestamp, c_varchar, c_array, p_varchar)", tmpTableName), 8); + " (null, null, null, null, null, null, null, 'p1'), " + + " (null, null, null, null, null, null, null, 'p1'), " + + " (true, BIGINT '1', DOUBLE '2.2', TIMESTAMP '2012-08-08 01:00', CAST('abc1' AS VARCHAR), to_ieee754_64(1), sequence(0, 10), 'p1')," + + " (false, BIGINT '0', DOUBLE '1.2', TIMESTAMP '2012-08-08 00:00', CAST('abc2' AS VARCHAR), to_ieee754_64(2), sequence(10, 20), 'p1')," + + " (null, null, null, null, null, null, null, 'p2'), " + + " (null, 
null, null, null, null, null, null, 'p2'), " + + " (true, BIGINT '2', DOUBLE '3.3', TIMESTAMP '2012-09-09 01:00', CAST('cba1' AS VARCHAR), to_ieee754_64(3), sequence(20, 25), 'p2'), " + + " (false, BIGINT '1', DOUBLE '2.3', TIMESTAMP '2012-09-09 00:00', CAST('cba2' AS VARCHAR), to_ieee754_64(4), sequence(30, 35), 'p2') " + + ") AS x (c_boolean, c_bigint, c_double, c_timestamp, c_varchar, c_varbinary, c_array, p_varchar)", tmpTableName), 8); assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar = 'p1')", tmpTableName), "SELECT * FROM (VALUES " + @@ -353,9 +355,10 @@ public void testCollectColumnStatisticsOnInsert() "('c_double', null, 2.0E0, 0.5E0, null, '1.2', '2.2', null), " + "('c_timestamp', null, 2.0E0, 0.5E0, null, null, null, null), " + "('c_varchar', 16.0E0, 2.0E0, 0.5E0, null, null, null, null), " + // 8 + "('c_varbinary', 24.0, null, 0.5E0, null, null, null, null), " + "('c_array', 184.0E0, null, 0.5E0, null, null, null, null), " + // 176 "('p_varchar', 8.0E0, 1.0E0, 0.0E0, null, null, null, null), " + - "(null, null, null, null, 4.0E0, null, null, null)) AS x (c_boolean, c_bigint, c_double, c_timestamp, c_varchar, c_array, p_varchar, p_varchar)"); + "(null, null, null, null, 4.0E0, null, null, null)) AS x (column_name, data_size, distinct_values_count, nulls_fraction, row_count, low_value, high_value, histogram)"); assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar = 'p2')", tmpTableName), "SELECT * FROM (VALUES " + "('c_boolean', null, 2.0E0, 0.5E0, null, null, null, null), " + @@ -363,9 +366,10 @@ public void testCollectColumnStatisticsOnInsert() "('c_double', null, 2.0E0, 0.5E0, null, '2.3', '3.3', null), " + "('c_timestamp', null, 2.0E0, 0.5E0, null, null, null, null), " + "('c_varchar', 16.0E0, 2.0E0, 0.5E0, null, null, null, null), " + // 8 + "('c_varbinary', 24.0, null, 0.5E0, null, null, null, null), " + "('c_array', 104.0E0, null, 0.5, null, null, null, null), " + // 96 "('p_varchar', 8.0E0, 1.0E0, 0.0E0, null, 
null, null, null), " + - "(null, null, null, null, 4.0E0, null, null, null)) AS x (c_boolean, c_bigint, c_double, c_timestamp, c_varchar, c_array, p_varchar, p_varchar)"); + "(null, null, null, null, 4.0E0, null, null, null)) AS x (column_name, data_size, distinct_values_count, nulls_fraction, row_count, low_value, high_value, histogram)"); // non existing partition assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar = 'p3')", tmpTableName), @@ -375,9 +379,10 @@ public void testCollectColumnStatisticsOnInsert() "('c_double', null, 0E0, 0E0, null, null, null, null), " + "('c_timestamp', null, 0E0, 0E0, null, null, null, null), " + "('c_varchar', 0E0, 0E0, 0E0, null, null, null, null), " + + "('c_varbinary', null, 0E0, 0E0, null, null, null, null), " + "('c_array', null, 0E0, 0E0, null, null, null, null), " + "('p_varchar', 0E0, 0E0, 0E0, null, null, null, null), " + - "(null, null, null, null, 0E0, null, null, null)) AS x (c_boolean, c_bigint, c_double, c_timestamp, c_varchar, c_array, p_varchar, p_varchar)"); + "(null, null, null, null, 0E0, null, null, null)) AS x (column_name, data_size, distinct_values_count, nulls_fraction, row_count, low_value, high_value, histogram)"); dropTableIfExists(tmpTableName); }