Skip to content

Commit

Permalink
[native] Add support for converting VARBINARY filters to Velox filters
Browse files Browse the repository at this point in the history
  • Loading branch information
wypb authored and aditi-pandit committed Jun 10, 2024
1 parent 0504500 commit 8a4bdc4
Show file tree
Hide file tree
Showing 3 changed files with 81 additions and 28 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,9 @@ std::string toString(
const VeloxExprConverter& exprConverter,
const TypePtr& type) {
auto value = exprConverter.getConstantValue(type, *block);
if (type->isVarbinary()) {
return value.value<TypeKind::VARBINARY>();
}
return value.value<std::string>();
}

Expand Down Expand Up @@ -652,6 +655,7 @@ std::unique_ptr<common::Filter> toFilter(
case TypeKind::DOUBLE:
return doubleRangeToFilter(range, nullAllowed, exprConverter, type);
case TypeKind::VARCHAR:
case TypeKind::VARBINARY:
return varcharRangeToFilter(range, nullAllowed, exprConverter, type);
case TypeKind::BOOLEAN:
return boolRangeToFilter(range, nullAllowed, exprConverter, type);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1563,6 +1563,50 @@ public void testKeyBasedSamplingInlined()
assertQuerySucceeds(session, "select count(1) from orders join lineitem using(orderkey)");
}

/**
 * Creates a temporary table with one column per filterable type and runs a
 * single-column equality/boolean filter against each of them
 * (BOOLEAN, BIGINT, DOUBLE, VARCHAR, TIMESTAMP and VARBINARY).
 *
 * The temporary table is dropped in a finally block so that it does not leak
 * when one of the assertions fails.
 */
@Test
public void testColumnFilter()
{
    String tmpTableName = generateRandomTableName();
    try {
        assertUpdate(format("CREATE TABLE %s " +
                "AS " +
                "SELECT c_boolean, c_bigint, c_double, c_timestamp, c_varchar, c_varbinary " +
                "FROM ( " +
                " VALUES " +
                " (null, null, null, null, null, null), " +
                " (true, BIGINT '1', DOUBLE '2.2', TIMESTAMP '2012-08-08 01:00', CAST('abc1' AS VARCHAR), to_ieee754_64(1))," +
                " (false, BIGINT '0', DOUBLE '1.2', TIMESTAMP '2012-08-08 00:00', CAST('abc2' AS VARCHAR), to_ieee754_64(2))," +
                " (true, BIGINT '2', DOUBLE '3.3', TIMESTAMP '2012-09-09 01:00', CAST('cba1' AS VARCHAR), to_ieee754_64(3)), " +
                " (false, BIGINT '1', DOUBLE '2.3', TIMESTAMP '2012-09-09 00:00', CAST('cba2' AS VARCHAR), to_ieee754_64(4)) " +
                ") AS x (c_boolean, c_bigint, c_double, c_timestamp, c_varchar, c_varbinary)", tmpTableName), 5);

        // NOTE: Most of the queries below do not list the c_timestamp field because Velox uses the America/Los_Angeles
        // time zone when reading and writing TIMESTAMP type data in DWRF/ORC format (see https://github.com/facebookincubator/velox/issues/8127),
        // while Presto Java uses the America/Bahia_Banderas time zone when reading TIMESTAMP during the test (see com.facebook.presto.hive.HiveQueryRunner).
        // Therefore, the data read by the two engines will be inconsistent.

        // BOOLEAN column filter
        assertQuery(format("SELECT c_boolean, c_bigint, c_double, c_varchar, c_varbinary FROM %s WHERE c_boolean", tmpTableName));

        // BIGINT column filter
        assertQuery(format("SELECT c_boolean, c_bigint, c_double, c_varchar, c_varbinary FROM %s WHERE c_bigint = 0", tmpTableName));

        // DOUBLE column filter
        assertQuery(format("SELECT c_boolean, c_bigint, c_double, c_varchar, c_varbinary FROM %s WHERE c_double = 1.2", tmpTableName));

        // VARCHAR column filter
        assertQuery(format("SELECT c_boolean, c_bigint, c_double, c_varchar, c_varbinary FROM %s WHERE c_varchar = CAST('cba2' AS VARCHAR)", tmpTableName));

        // TIMESTAMP column filter
        // The expected rows are spelled out explicitly here, presumably to avoid the cross-engine
        // TIMESTAMP comparison described in the note above.
        assertQuery(format("SELECT * FROM %s WHERE c_TIMESTAMP = TIMESTAMP '2012-09-09 00:00'", tmpTableName), "VALUES(false, BIGINT '1', DOUBLE '2.3', TIMESTAMP '2012-09-09 00:00', CAST('cba2' AS VARCHAR), to_ieee754_64(4))");

        // NOTE: Presto Java's DWRF format does not support pushing down VARBINARY type filters to TableScan, so we need to disable filter pushdown.
        Session session = Session.builder(getSession())
                .setCatalogSessionProperty("hive", "pushdown_filter_enabled", "false")
                .build();
        // VARBINARY column filter
        assertQuery(session, format("SELECT c_boolean, c_bigint, c_double, c_varchar, c_varbinary FROM %s WHERE c_varbinary = to_ieee754_64(1)", tmpTableName));
    }
    finally {
        // Always remove the temporary table, even when an assertion above fails.
        // IF EXISTS keeps the cleanup harmless when the CREATE TABLE itself failed.
        getQueryRunner().execute(format("DROP TABLE IF EXISTS %s", tmpTableName));
    }
}

private void assertQueryResultCount(String sql, int expectedResultCount)
{
assertEquals(getQueryRunner().execute(sql).getRowCount(), expectedResultCount);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -255,26 +255,24 @@ public void testCollectColumnStatisticsOnCreateTable()
{
Session session = buildSessionForTableWrite();
String tmpTableName = generateRandomTableName();
// TODO: add varbinary test support once velox supports varbinary in value node
// https://github.com/prestodb/presto/blob/master/presto-native-execution/presto_cpp/main/types/PrestoToVeloxQueryPlan.cpp#L915
assertUpdate(session, format("" +
"CREATE TABLE %s " +
"WITH ( " +
" partitioned_by = ARRAY['p_varchar'] " +
") " +
"AS " +
"SELECT c_boolean, c_bigint, c_double, c_timestamp, c_varchar, c_array, p_varchar " +
"SELECT c_boolean, c_bigint, c_double, c_timestamp, c_varchar, c_varbinary, c_array, p_varchar " +
"FROM ( " +
" VALUES " +
" (null, null, null, null, null, null, 'p1'), " +
" (null, null, null, null, null, null, 'p1'), " +
" (true, BIGINT '1', DOUBLE '2.2', TIMESTAMP '2012-08-08 01:00', CAST('abc1' AS VARCHAR), sequence(0, 10), 'p1')," +
" (false, BIGINT '0', DOUBLE '1.2', TIMESTAMP '2012-08-08 00:00', CAST('abc2' AS VARCHAR), sequence(10, 20), 'p1')," +
" (null, null, null, null, null, null, 'p2'), " +
" (null, null, null, null, null, null, 'p2'), " +
" (true, BIGINT '2', DOUBLE '3.3', TIMESTAMP '2012-09-09 01:00', CAST('cba1' AS VARCHAR), sequence(20, 25), 'p2'), " +
" (false, BIGINT '1', DOUBLE '2.3', TIMESTAMP '2012-09-09 00:00', CAST('cba2' AS VARCHAR), sequence(30, 35), 'p2') " +
") AS x (c_boolean, c_bigint, c_double, c_timestamp, c_varchar, c_array, p_varchar)", tmpTableName), 8);
" (null, null, null, null, null, null, null, 'p1'), " +
" (null, null, null, null, null, null, null, 'p1'), " +
" (true, BIGINT '1', DOUBLE '2.2', TIMESTAMP '2012-08-08 01:00', CAST('abc1' AS VARCHAR), to_ieee754_64(1), sequence(0, 10), 'p1')," +
" (false, BIGINT '0', DOUBLE '1.2', TIMESTAMP '2012-08-08 00:00', CAST('abc2' AS VARCHAR), to_ieee754_64(2), sequence(10, 20), 'p1')," +
" (null, null, null, null, null, null, null, 'p2'), " +
" (null, null, null, null, null, null, null, 'p2'), " +
" (true, BIGINT '2', DOUBLE '3.3', TIMESTAMP '2012-09-09 01:00', CAST('cba1' AS VARCHAR), to_ieee754_64(3), sequence(20, 25), 'p2'), " +
" (false, BIGINT '1', DOUBLE '2.3', TIMESTAMP '2012-09-09 00:00', CAST('cba2' AS VARCHAR), to_ieee754_64(4), sequence(30, 35), 'p2') " +
") AS x (c_boolean, c_bigint, c_double, c_timestamp, c_varchar, c_varbinary, c_array, p_varchar)", tmpTableName), 8);

assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar = 'p1')", tmpTableName),
"SELECT * FROM (VALUES " +
Expand All @@ -283,19 +281,21 @@ public void testCollectColumnStatisticsOnCreateTable()
"('c_double', null, 2.0E0, 0.5E0, null, '1.2', '2.2', null), " +
"('c_timestamp', null, 2.0E0, 0.5E0, null, null, null, null), " +
"('c_varchar', 16.0E0, 2.0E0, 0.5E0, null, null, null, null), " + // 8.0
"('c_varbinary', 24.0, null, 0.5E0, null, null, null, null), " +
"('c_array', 184.0E0, null, 0.5, null, null, null, null), " + // 176
"('p_varchar', 8.0E0, 1.0E0, 0.0E0, null, null, null, null), " +
"(null, null, null, null, 4.0E0, null, null, null)) AS x (c_boolean, c_bigint, c_double, c_timestamp, c_varchar, c_array, p_varchar, h_varchar)");
"(null, null, null, null, 4.0E0, null, null, null)) AS x (column_name, data_size, distinct_values_count, nulls_fraction, row_count, low_value, high_value, histogram)");
assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar = 'p2')", tmpTableName),
"SELECT * FROM (VALUES " +
"('c_boolean', null, 2.0E0, 0.5E0, null, null, null, null), " +
"('c_bigint', null, 2.0E0, 0.5E0, null, '1', '2', null), " +
"('c_double', null, 2.0E0, 0.5E0, null, '2.3', '3.3', null), " +
"('c_timestamp', null, 2.0E0, 0.5E0, null, null, null, null), " +
"('c_varchar', 16.0E0, 2.0E0, 0.5E0, null, null, null, null), " + // 8
"('c_varbinary', 24.0, null, 0.5E0, null, null, null, null), " +
"('c_array', 104.0E0, null, 0.5, null, null, null, null), " + // 96
"('p_varchar', 8.0E0, 1.0E0, 0.0E0, null, null, null, null), " +
"(null, null, null, null, 4.0E0, null, null, null)) AS x (c_boolean, c_bigint, c_double, c_timestamp, c_varchar, c_array, p_varchar, h_varchar)");
"(null, null, null, null, 4.0E0, null, null, null)) AS x (column_name, data_size, distinct_values_count, nulls_fraction, row_count, low_value, high_value, histogram)");

// non existing partition
assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar = 'p3')", tmpTableName),
Expand All @@ -305,9 +305,10 @@ public void testCollectColumnStatisticsOnCreateTable()
"('c_double', null, 0E0, 0E0, null, null, null, null), " +
"('c_timestamp', null, 0E0, 0E0, null, null, null, null), " +
"('c_varchar', 0E0, 0E0, 0E0, null, null, null, null), " +
"('c_varbinary', null, 0E0, 0E0, null, null, null, null), " +
"('c_array', null, 0E0, 0E0, null, null, null, null), " +
"('p_varchar', 0E0, 0E0, 0E0, null, null, null, null), " +
"(null, null, null, null, 0E0, null, null, null)) AS x (c_boolean, c_bigint, c_double, c_timestamp, c_varchar, c_array, p_varchar, h_varchar)");
"(null, null, null, null, 0E0, null, null, null)) AS x (column_name, data_size, distinct_values_count, nulls_fraction, row_count, low_value, high_value, histogram)");

dropTableIfExists(tmpTableName);
}
Expand All @@ -324,6 +325,7 @@ public void testCollectColumnStatisticsOnInsert()
" c_double DOUBLE, " +
" c_timestamp TIMESTAMP, " +
" c_varchar VARCHAR, " +
" c_varbinary VARBINARY, " +
" c_array ARRAY(BIGINT), " +
" p_varchar VARCHAR " +
") " +
Expand All @@ -333,18 +335,18 @@ public void testCollectColumnStatisticsOnInsert()

assertUpdate(format("" +
"INSERT INTO %s " +
"SELECT c_boolean, c_bigint, c_double, c_timestamp, c_varchar, c_array, p_varchar " +
"SELECT c_boolean, c_bigint, c_double, c_timestamp, c_varchar, c_varbinary, c_array, p_varchar " +
"FROM ( " +
" VALUES " +
" (null, null, null, null, null, null, 'p1'), " +
" (null, null, null, null, null, null, 'p1'), " +
" (true, BIGINT '1', DOUBLE '2.2', TIMESTAMP '2012-08-08 01:00', CAST('abc1' AS VARCHAR), sequence(0, 10), 'p1')," +
" (false, BIGINT '0', DOUBLE '1.2', TIMESTAMP '2012-08-08 00:00', CAST('abc2' AS VARCHAR), sequence(10, 20), 'p1')," +
" (null, null, null, null, null, null, 'p2'), " +
" (null, null, null, null, null, null, 'p2'), " +
" (true, BIGINT '2', DOUBLE '3.3', TIMESTAMP '2012-09-09 01:00', CAST('cba1' AS VARCHAR), sequence(20, 25), 'p2'), " +
" (false, BIGINT '1', DOUBLE '2.3', TIMESTAMP '2012-09-09 00:00', CAST('cba2' AS VARCHAR), sequence(30, 35), 'p2') " +
") AS x (c_boolean, c_bigint, c_double, c_timestamp, c_varchar, c_array, p_varchar)", tmpTableName), 8);
" (null, null, null, null, null, null, null, 'p1'), " +
" (null, null, null, null, null, null, null, 'p1'), " +
" (true, BIGINT '1', DOUBLE '2.2', TIMESTAMP '2012-08-08 01:00', CAST('abc1' AS VARCHAR), to_ieee754_64(1), sequence(0, 10), 'p1')," +
" (false, BIGINT '0', DOUBLE '1.2', TIMESTAMP '2012-08-08 00:00', CAST('abc2' AS VARCHAR), to_ieee754_64(2), sequence(10, 20), 'p1')," +
" (null, null, null, null, null, null, null, 'p2'), " +
" (null, null, null, null, null, null, null, 'p2'), " +
" (true, BIGINT '2', DOUBLE '3.3', TIMESTAMP '2012-09-09 01:00', CAST('cba1' AS VARCHAR), to_ieee754_64(3), sequence(20, 25), 'p2'), " +
" (false, BIGINT '1', DOUBLE '2.3', TIMESTAMP '2012-09-09 00:00', CAST('cba2' AS VARCHAR), to_ieee754_64(4), sequence(30, 35), 'p2') " +
") AS x (c_boolean, c_bigint, c_double, c_timestamp, c_varchar, c_varbinary, c_array, p_varchar)", tmpTableName), 8);

assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar = 'p1')", tmpTableName),
"SELECT * FROM (VALUES " +
Expand All @@ -353,19 +355,21 @@ public void testCollectColumnStatisticsOnInsert()
"('c_double', null, 2.0E0, 0.5E0, null, '1.2', '2.2', null), " +
"('c_timestamp', null, 2.0E0, 0.5E0, null, null, null, null), " +
"('c_varchar', 16.0E0, 2.0E0, 0.5E0, null, null, null, null), " + // 8
"('c_varbinary', 24.0, null, 0.5E0, null, null, null, null), " +
"('c_array', 184.0E0, null, 0.5E0, null, null, null, null), " + // 176
"('p_varchar', 8.0E0, 1.0E0, 0.0E0, null, null, null, null), " +
"(null, null, null, null, 4.0E0, null, null, null)) AS x (c_boolean, c_bigint, c_double, c_timestamp, c_varchar, c_array, p_varchar, p_varchar)");
"(null, null, null, null, 4.0E0, null, null, null)) AS x (column_name, data_size, distinct_values_count, nulls_fraction, row_count, low_value, high_value, histogram)");
assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar = 'p2')", tmpTableName),
"SELECT * FROM (VALUES " +
"('c_boolean', null, 2.0E0, 0.5E0, null, null, null, null), " +
"('c_bigint', null, 2.0E0, 0.5E0, null, '1', '2', null), " +
"('c_double', null, 2.0E0, 0.5E0, null, '2.3', '3.3', null), " +
"('c_timestamp', null, 2.0E0, 0.5E0, null, null, null, null), " +
"('c_varchar', 16.0E0, 2.0E0, 0.5E0, null, null, null, null), " + // 8
"('c_varbinary', 24.0, null, 0.5E0, null, null, null, null), " +
"('c_array', 104.0E0, null, 0.5, null, null, null, null), " + // 96
"('p_varchar', 8.0E0, 1.0E0, 0.0E0, null, null, null, null), " +
"(null, null, null, null, 4.0E0, null, null, null)) AS x (c_boolean, c_bigint, c_double, c_timestamp, c_varchar, c_array, p_varchar, p_varchar)");
"(null, null, null, null, 4.0E0, null, null, null)) AS x (column_name, data_size, distinct_values_count, nulls_fraction, row_count, low_value, high_value, histogram)");

// non existing partition
assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar = 'p3')", tmpTableName),
Expand All @@ -375,9 +379,10 @@ public void testCollectColumnStatisticsOnInsert()
"('c_double', null, 0E0, 0E0, null, null, null, null), " +
"('c_timestamp', null, 0E0, 0E0, null, null, null, null), " +
"('c_varchar', 0E0, 0E0, 0E0, null, null, null, null), " +
"('c_varbinary', null, 0E0, 0E0, null, null, null, null), " +
"('c_array', null, 0E0, 0E0, null, null, null, null), " +
"('p_varchar', 0E0, 0E0, 0E0, null, null, null, null), " +
"(null, null, null, null, 0E0, null, null, null)) AS x (c_boolean, c_bigint, c_double, c_timestamp, c_varchar, c_array, p_varchar, p_varchar)");
"(null, null, null, null, 0E0, null, null, null)) AS x (column_name, data_size, distinct_values_count, nulls_fraction, row_count, low_value, high_value, histogram)");

dropTableIfExists(tmpTableName);
}
Expand Down

0 comments on commit 8a4bdc4

Please sign in to comment.