From 82c729b096b1d75647466809fe029212aeb13db4 Mon Sep 17 00:00:00 2001 From: Jia-Xuan Liu Date: Wed, 18 Sep 2024 22:49:07 +0800 Subject: [PATCH 1/4] implement the testing framework for string type --- .../init_data.slt.part} | 51 +-- .../test_files/string/large_string.slt | 50 +++ .../sqllogictest/test_files/string/string.slt | 50 +++ .../string_literal.slt} | 17 +- .../test_files/string/string_query.slt.part | 326 ++++++++++++++++ .../test_files/string/string_view.slt | 356 ++++++++++++++++++ 6 files changed, 802 insertions(+), 48 deletions(-) rename datafusion/sqllogictest/test_files/{string_functions/substr/substr_runner.slt => string/init_data.slt.part} (57%) create mode 100644 datafusion/sqllogictest/test_files/string/large_string.slt create mode 100644 datafusion/sqllogictest/test_files/string/string.slt rename datafusion/sqllogictest/test_files/{string_functions/substr/substr_literal.slt.part => string/string_literal.slt} (87%) create mode 100644 datafusion/sqllogictest/test_files/string/string_query.slt.part create mode 100644 datafusion/sqllogictest/test_files/string/string_view.slt diff --git a/datafusion/sqllogictest/test_files/string_functions/substr/substr_runner.slt b/datafusion/sqllogictest/test_files/string/init_data.slt.part similarity index 57% rename from datafusion/sqllogictest/test_files/string_functions/substr/substr_runner.slt rename to datafusion/sqllogictest/test_files/string/init_data.slt.part index d61cfa2b743a..d99401f10d20 100644 --- a/datafusion/sqllogictest/test_files/string_functions/substr/substr_runner.slt +++ b/datafusion/sqllogictest/test_files/string/init_data.slt.part @@ -15,10 +15,12 @@ # specific language governing permissions and limitations # under the License. -# -------------------------------------- -# Test `substr()` with literal arguments -# -------------------------------------- -include ./substr_literal.slt.part +statement ok +create table test_source as values + ('Andrew', 'X', 'datafusion📊🔥', '🔥'), + ('Xiangpeng', 'Xiangpeng', 'datafusion数据融合', 'datafusion数据融合'), + ('Raphael', 'R', 'datafusionДатаФусион', 'аФус'), + (NULL, 'R', NULL, '🔥'); # -------------------------------------- # Setup test tables with different physical string types (Utf8/Utf8View/LargeUtf8) @@ -29,44 +31,3 @@ create table test_substr_base ( col1 VARCHAR ) as values ('foo'), ('hello🌏世界'), ('💩'), ('ThisIsAVeryLongASCIIString'), (''), (NULL); -# -# Run1: Utf8 -# -statement ok -create table test_substr as -select arrow_cast(col1, 'Utf8') as c1 from test_substr_base; - -include ./substr_table.slt.part - -statement ok -drop table test_substr; - -# -# Run2: Utf8View -# -statement ok -create table test_substr as -select arrow_cast(col1, 'Utf8View') as c1 from test_substr_base; - -include ./substr_table.slt.part - -statement ok -drop table test_substr; - -# -# Run3: LargeUtf8 -# -statement ok -create table test_substr as -select arrow_cast(col1, 'LargeUtf8') as c1 from test_substr_base; - -include ./substr_table.slt.part - -statement ok -drop table test_substr; - -# -------------------------------------- -# Cleanup -# -------------------------------------- -statement ok -drop table test_substr_base; \ No newline at end of file diff --git a/datafusion/sqllogictest/test_files/string/large_string.slt b/datafusion/sqllogictest/test_files/string/large_string.slt new file mode 100644 index 000000000000..d90f2bffe0f5 --- /dev/null +++ b/datafusion/sqllogictest/test_files/string/large_string.slt @@ -0,0 +1,50 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +include ./init_data.slt.part + +# -------------------------------------- +# Setup test tables with different physical string types +# and repeat tests in `string_query.slt.part` +# -------------------------------------- +statement ok +create table test_basic_operator as +select + arrow_cast(column1, 'LargeUtf8') as ascii_1, + arrow_cast(column2, 'LargeUtf8') as ascii_2, + arrow_cast(column3, 'LargeUtf8') as unicode_1, + arrow_cast(column4, 'LargeUtf8') as unicode_2 +from test_source; + +statement ok +create table test_substr as +select arrow_cast(col1, 'LargeUtf8') as c1 from test_substr_base; + +# +# common test for string-like functions and operators +# +include ./string_query.slt.part + +# +# Clean up +# + +statement ok +drop table test_basic_operator; + +statement ok +drop table test_substr_base; diff --git a/datafusion/sqllogictest/test_files/string/string.slt b/datafusion/sqllogictest/test_files/string/string.slt new file mode 100644 index 000000000000..435795309f52 --- /dev/null +++ b/datafusion/sqllogictest/test_files/string/string.slt @@ -0,0 +1,50 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +include ./init_data.slt.part + +# -------------------------------------- +# Setup test tables with different physical string types +# and repeat tests in `string_query.slt.part` +# -------------------------------------- +statement ok +create table test_basic_operator as +select + arrow_cast(column1, 'Utf8') as ascii_1, + arrow_cast(column2, 'Utf8') as ascii_2, + arrow_cast(column3, 'Utf8') as unicode_1, + arrow_cast(column4, 'Utf8') as unicode_2 +from test_source; + +statement ok +create table test_substr as +select arrow_cast(col1, 'Utf8') as c1 from test_substr_base; + +# +# common test for string-like functions and operators +# +include ./string_query.slt.part + +# +# Clean up +# + +statement ok +drop table test_basic_operator; + +statement ok +drop table test_substr; diff --git a/datafusion/sqllogictest/test_files/string_functions/substr/substr_literal.slt.part b/datafusion/sqllogictest/test_files/string/string_literal.slt similarity index 87% rename from datafusion/sqllogictest/test_files/string_functions/substr/substr_literal.slt.part rename to datafusion/sqllogictest/test_files/string/string_literal.slt index dac59016627f..37dc1a8ce84d 100644 --- a/datafusion/sqllogictest/test_files/string_functions/substr/substr_literal.slt.part +++ b/datafusion/sqllogictest/test_files/string/string_literal.slt @@ -56,8 +56,8 @@ SELECT substr('alphabet', 3, 20) phabet query TT -select - substr(arrow_cast('alphabet', 'LargeUtf8'), 3, 20), +select + substr(arrow_cast('alphabet', 'LargeUtf8'), 3, 20), substr(arrow_cast('alphabet', 'Utf8View'), 3, 20); ---- phabet phabet @@ -99,7 +99,7 @@ SELECT # Nulls query TTTTTTTTTT -SELECT +SELECT substr('alphabet', NULL), substr(NULL, 1), substr(NULL, NULL), @@ -134,3 +134,14 @@ select substr(arrow_cast('foo', 'Utf8View'), 1, -1); statement error Execution error: negative substring length not allowed select substr('', 1, -1); + +# StringView scalar to StringView scalar + +query BBBB +select + arrow_cast('NULL', 'Utf8View') = arrow_cast('Andrew', 'Utf8View'), + arrow_cast('NULL', 'Utf8View') <> arrow_cast('Andrew', 'Utf8View'), + arrow_cast('Andrew', 'Utf8View') = arrow_cast('Andrew', 'Utf8View'), + arrow_cast('Xiangpeng', 'Utf8View') <> arrow_cast('Andrew', 'Utf8View'); +---- +false true true true diff --git a/datafusion/sqllogictest/test_files/string/string_query.slt.part b/datafusion/sqllogictest/test_files/string/string_query.slt.part new file mode 100644 index 000000000000..4f737954000b --- /dev/null +++ b/datafusion/sqllogictest/test_files/string/string_query.slt.part @@ -0,0 +1,326 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# This file is intended to be run with tables already defined +# with standard values, but different types in string columns +# (String, StringView, etc.) + +# select +query TTTT +SELECT ascii_1, ascii_2, unicode_1, unicode_2 FROM test_basic_operator +---- +Andrew X datafusion📊🔥 🔥 +Xiangpeng Xiangpeng datafusion数据融合 datafusion数据融合 +Raphael R datafusionДатаФусион аФус +NULL R NULL 🔥 + +# -------------------------------------- +# column comparison as filters +# -------------------------------------- + +query TT +select ascii_1, ascii_2 from test_basic_operator where ascii_1 = ascii_2 +---- +Xiangpeng Xiangpeng + +query TT +select ascii_1, ascii_2 from test_basic_operator where ascii_1 <> ascii_2 +---- +Andrew X +Raphael R + +query TT +select unicode_1, unicode_2 from test_basic_operator where unicode_1 = unicode_2 +---- +datafusion数据融合 datafusion数据融合 + +query TT +select unicode_1, unicode_2 from test_basic_operator where unicode_1 <> unicode_2 +---- +datafusion📊🔥 🔥 +datafusionДатаФусион аФус + +query TT +select ascii_1, unicode_1 from test_basic_operator where ascii_1 = unicode_1 +---- + +query TT +select ascii_1, unicode_1 from test_basic_operator where ascii_1 <> unicode_1 +---- +Andrew datafusion📊🔥 +Xiangpeng datafusion数据融合 +Raphael datafusionДатаФусион + +# -------------------------------------- +# column comparison +# -------------------------------------- +query TTTTBBBBBB +select + ascii_1, ascii_2, unicode_1, unicode_2, + ascii_1 = ascii_2, + ascii_1 <> ascii_2, + unicode_1 = unicode_2, + unicode_1 <> unicode_2, + ascii_1 = unicode_1, + ascii_1 <> unicode_1 +from test_basic_operator; +---- +Andrew X datafusion📊🔥 🔥 false true false true false true +Xiangpeng Xiangpeng datafusion数据融合 datafusion数据融合 true false true false false true +Raphael R datafusionДатаФусион аФус false true false true false true +NULL R NULL 🔥 NULL NULL NULL NULL NULL NULL + +# -------------------------------------- +# column to StringView scalar comparison +# -------------------------------------- +query TTBBBB +select + ascii_1, unicode_1, + ascii_1 = arrow_cast('Andrew', 'Utf8View'), + ascii_1 <> arrow_cast('Andrew', 'Utf8View'), + unicode_1 = arrow_cast('datafusion数据融合', 'Utf8View'), + unicode_1 <> arrow_cast('datafusion数据融合', 'Utf8View') +from test_basic_operator; +---- +Andrew datafusion📊🔥 true false false true +Xiangpeng datafusion数据融合 false true true false +Raphael datafusionДатаФусион false true false true +NULL NULL NULL NULL NULL NULL + +# -------------------------------------- +# column to String scalar +# -------------------------------------- +query TTBBBB +select + ascii_1, unicode_1, + ascii_1 = arrow_cast('Andrew', 'Utf8'), + ascii_1 <> arrow_cast('Andrew', 'Utf8'), + unicode_1 = arrow_cast('datafusion数据融合', 'Utf8'), + unicode_1 <> arrow_cast('datafusion数据融合', 'Utf8') +from test_basic_operator; +---- +Andrew datafusion📊🔥 true false false true +Xiangpeng datafusion数据融合 false true true false +Raphael datafusionДатаФусион false true false true +NULL NULL NULL NULL NULL NULL + +# -------------------------------------- +# column to LargeString scalar +# -------------------------------------- +query TTBBBB +select + ascii_1, unicode_1, + ascii_1 = arrow_cast('Andrew', 'LargeUtf8'), + ascii_1 <> arrow_cast('Andrew', 'LargeUtf8'), + unicode_1 = arrow_cast('datafusion数据融合', 'LargeUtf8'), + unicode_1 <> arrow_cast('datafusion数据融合', 'LargeUtf8') +from test_basic_operator; +---- +Andrew datafusion📊🔥 true false false true +Xiangpeng datafusion数据融合 false true true false +Raphael datafusionДатаФусион false true false true +NULL NULL NULL NULL NULL NULL + +# -------------------------------------- +# substr function +# -------------------------------------- + +query TTTTTTTTTTTTTT +select + substr(c1, 1), + substr(c1, 3), + substr(c1, 100), + substr(c1, -1), + substr(c1, 0, 0), + substr(c1, -1, 2), + substr(c1, -2, 10), + substr(c1, -100, 200), + substr(c1, -10, 10), + substr(c1, -100, 10), + substr(c1, 1, 100), + substr(c1, 5, 3), + substr(c1, 100, 200), + substr(c1, 8, 0) +from test_substr; +---- +foo o (empty) foo (empty) (empty) foo foo (empty) (empty) foo (empty) (empty) (empty) +hello🌏世界 llo🌏世界 (empty) hello🌏世界 (empty) (empty) hello🌏世 hello🌏世界 (empty) (empty) hello🌏世界 o🌏世 (empty) (empty) +💩 (empty) (empty) 💩 (empty) (empty) 💩 💩 (empty) (empty) 💩 (empty) (empty) (empty) +ThisIsAVeryLongASCIIString isIsAVeryLongASCIIString (empty) ThisIsAVeryLongASCIIString (empty) (empty) ThisIsA ThisIsAVeryLongASCIIString (empty) (empty) ThisIsAVeryLongASCIIString IsA (empty) (empty) +(empty) (empty) (empty) (empty) (empty) (empty) (empty) (empty) (empty) (empty) (empty) (empty) (empty) (empty) +NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL + +# -------------------------------------- +# test distinct aggregate +# -------------------------------------- +query II +SELECT + COUNT(DISTINCT ascii_1), + COUNT(DISTINCT unicode_1) +FROM + test_basic_operator +---- +3 3 + +query II +SELECT + COUNT(DISTINCT ascii_1), + COUNT(DISTINCT unicode_1) +FROM + test_basic_operator +GROUP BY ascii_2; +---- +1 1 +1 1 +1 1 + +query II +SELECT + COUNT(DISTINCT ascii_1), + COUNT(DISTINCT unicode_1) +FROM + test_basic_operator +GROUP BY unicode_2; +---- +1 1 +1 1 +1 1 + +# -------------------------------------- +# STARTS_WITH function +# -------------------------------------- + +query BBBB +SELECT + STARTS_WITH(ascii_1, ascii_2), + STARTS_WITH(unicode_1, unicode_2), + STARTS_WITH(ascii_1, unicode_2), + STARTS_WITH(unicode_1, ascii_2) +FROM test_basic_operator +---- +false false false false +true true false false +true false false false +NULL NULL NULL NULL + +query BBBB +SELECT + STARTS_WITH(ascii_1, 'And'), + STARTS_WITH(ascii_2, 'And'), + STARTS_WITH(unicode_1, 'data'), + STARTS_WITH(unicode_2, 'data') +FROM test_basic_operator +---- +true false true false +false false true true +false false true false +NULL false NULL false + +# -------------------------------------- +# Test TRANSLATE +# -------------------------------------- + +query T +SELECT + TRANSLATE(ascii_1, 'foo', 'bar') as c +FROM test_basic_operator; +---- +Andrew +Xiangpeng +Raphael +NULL + +query T +SELECT + TRANSLATE(unicode_1, 'foo', 'bar') as c +FROM test_basic_operator; +---- +databusirn📊🔥 +databusirn数据融合 +databusirnДатаФусион +NULL + +# -------------------------------------- +# Test REGEXP_REPLACE +# -------------------------------------- + +# Should run REGEXP_REPLACE with Scalar value for string +query T +SELECT + REGEXP_REPLACE(ascii_1, 'e', 'f') AS k +FROM test_basic_operator; +---- +Andrfw +Xiangpfng +Raphafl +NULL + +# Should run REGEXP_REPLACE with Scalar value for string with flag +query T +SELECT + REGEXP_REPLACE(ascii_1, 'e', 'f', 'i') AS k +FROM test_basic_operator; +---- +Andrfw +Xiangpfng +Raphafl +NULL + +# Should run REGEXP_REPLACE with ScalarArray value for string +query T +SELECT + REGEXP_REPLACE(ascii_1, lower(ascii_1), 'bar') AS k +FROM test_basic_operator; +---- +Andrew +Xiangpeng +Raphael +NULL + +# Should run REGEXP_REPLACE with ScalarArray value for string with flag +query T +SELECT + REGEXP_REPLACE(ascii_1, lower(ascii_1), 'bar', 'g') AS k +FROM test_basic_operator; +---- +Andrew +Xiangpeng +Raphael +NULL + +# -------------------------------------- +# Test Initcap +# -------------------------------------- +statement ok +CREATE TABLE test_lowercase AS SELECT + lower(ascii_1) as ascii_1_lower, + lower(unicode_1) as unicode_1_lower +FROM test_basic_operator; + +query TT +SELECT + INITCAP(ascii_1_lower) as c1, + INITCAP(unicode_1_lower) as c2 +FROM test_lowercase; +---- +Andrew Datafusion📊🔥 +Xiangpeng Datafusion数据融合 +Raphael Datafusionдатафусион +NULL NULL + +statement ok +drop table test_lowercase; diff --git a/datafusion/sqllogictest/test_files/string/string_view.slt b/datafusion/sqllogictest/test_files/string/string_view.slt new file mode 100644 index 000000000000..ca08f647d246 --- /dev/null +++ b/datafusion/sqllogictest/test_files/string/string_view.slt @@ -0,0 +1,356 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +include ./init_data.slt.part + +# -------------------------------------- +# Setup test tables with different physical string types +# and repeat tests in `string_query.slt.part` +# -------------------------------------- +statement ok +create table test_basic_operator as +select + arrow_cast(column1, 'Utf8View') as ascii_1, + arrow_cast(column2, 'Utf8View') as ascii_2, + arrow_cast(column3, 'Utf8View') as unicode_1, + arrow_cast(column4, 'Utf8View') as unicode_2 +from test_source; + +statement ok +create table test_substr as +select arrow_cast(col1, 'Utf8View') as c1 from test_substr_base; + +statement ok +drop table test_source + +# +# common test for string-like functions and operators +# +include ./string_query.slt.part + +# +# Clean up +# +statement ok +drop table test_basic_operator; + +statement ok +drop table test_substr_base; + + +# -------------------------------------- +# String_view specific tests +# -------------------------------------- +statement ok +create table test_source as values + ('Andrew', 'X'), + ('Xiangpeng', 'Xiangpeng'), + ('Raphael', 'R'), + (NULL, 'R'); + +# Table with the different combination of column types +statement ok +create table test as +SELECT + arrow_cast(column1, 'Utf8') as column1_utf8, + arrow_cast(column2, 'Utf8') as column2_utf8, + arrow_cast(column1, 'LargeUtf8') as column1_large_utf8, + arrow_cast(column2, 'LargeUtf8') as column2_large_utf8, + arrow_cast(column1, 'Utf8View') as column1_utf8view, + arrow_cast(column2, 'Utf8View') as column2_utf8view, + arrow_cast(column1, 'Dictionary(Int32, Utf8)') as column1_dict, + arrow_cast(column2, 'Dictionary(Int32, Utf8)') as column2_dict +FROM test_source; + +statement ok +drop table test_source + +######## +## StringView to Other Types column +######## + +# test StringViewArray with Utf8 columns +query TTBBBB +select + column1_utf8, column2_utf8, + column1_utf8view = column2_utf8, + column2_utf8 = column1_utf8view, + column1_utf8view <> column2_utf8, + column2_utf8 <> column1_utf8view +from test; +---- +Andrew X false false true true +Xiangpeng Xiangpeng true true false false +Raphael R false false true true +NULL R NULL NULL NULL NULL + +# test StringViewArray with LargeUtf8 columns +query TTBBBB +select + column1_utf8, column2_utf8, + column1_utf8view = column2_large_utf8, + column2_large_utf8 = column1_utf8view, + column1_utf8view <> column2_large_utf8, + column2_large_utf8 <> column1_utf8view +from test; +---- +Andrew X false false true true +Xiangpeng Xiangpeng true true false false +Raphael R false false true true +NULL R NULL NULL NULL NULL + +######## +## StringView to Dictionary +######## + +# test StringViewArray with Dictionary columns +query TTBBBB +select + column1_utf8, column2_utf8, + column1_utf8view = column2_dict, + column2_dict = column1_utf8view, + column1_utf8view <> column2_dict, + column2_dict <> column1_utf8view +from test; +---- +Andrew X false false true true +Xiangpeng Xiangpeng true true false false +Raphael R false false true true +NULL R NULL NULL NULL NULL + +# StringView column to Dict scalar +query TTBBBB +select + column1_utf8, column2_utf8, + column1_utf8view = arrow_cast('Andrew', 'Dictionary(Int32, Utf8)'), + arrow_cast('Andrew', 'Dictionary(Int32, Utf8)') = column1_utf8view, + column1_utf8view <> arrow_cast('Andrew', 'Dictionary(Int32, Utf8)'), + arrow_cast('Andrew', 'Dictionary(Int32, Utf8)') <> column1_utf8view +from test; +---- +Andrew X true true false false +Xiangpeng Xiangpeng false false true true +Raphael R false false true true +NULL R NULL NULL NULL NULL + +# Dict column to StringView scalar +query TTBBBB +select + column1_utf8, column2_utf8, + column1_dict = arrow_cast('Andrew', 'Utf8View'), + arrow_cast('Andrew', 'Utf8View') = column1_dict, + column1_dict <> arrow_cast('Andrew', 'Utf8View'), + arrow_cast('Andrew', 'Utf8View') <> column1_dict +from test; +---- +Andrew X true true false false +Xiangpeng Xiangpeng false false true true +Raphael R false false true true +NULL R NULL NULL NULL NULL + +######## +## Coercion Rules +######## + +statement ok +set datafusion.explain.logical_plan_only = true; + + +# Filter should have a StringView literal and no column cast +query TT +explain SELECT column1_utf8 from test where column1_utf8view = 'Andrew'; +---- +logical_plan +01)Projection: test.column1_utf8 +02)--Filter: test.column1_utf8view = Utf8View("Andrew") +03)----TableScan: test projection=[column1_utf8, column1_utf8view] + +# reverse order should be the same +query TT +explain SELECT column1_utf8 from test where 'Andrew' = column1_utf8view; +---- +logical_plan +01)Projection: test.column1_utf8 +02)--Filter: test.column1_utf8view = Utf8View("Andrew") +03)----TableScan: test projection=[column1_utf8, column1_utf8view] + +query TT +explain SELECT column1_utf8 from test where column1_utf8 = arrow_cast('Andrew', 'Utf8View'); +---- +logical_plan +01)Filter: test.column1_utf8 = Utf8("Andrew") +02)--TableScan: test projection=[column1_utf8] + +query TT +explain SELECT column1_utf8 from test where arrow_cast('Andrew', 'Utf8View') = column1_utf8; +---- +logical_plan +01)Filter: test.column1_utf8 = Utf8("Andrew") +02)--TableScan: test projection=[column1_utf8] + +query TT +explain SELECT column1_utf8 from test where column1_utf8view = arrow_cast('Andrew', 'Dictionary(Int32, Utf8)'); +---- +logical_plan +01)Projection: test.column1_utf8 +02)--Filter: test.column1_utf8view = Utf8View("Andrew") +03)----TableScan: test projection=[column1_utf8, column1_utf8view] + +query TT +explain SELECT column1_utf8 from test where arrow_cast('Andrew', 'Dictionary(Int32, Utf8)') = column1_utf8view; +---- +logical_plan +01)Projection: test.column1_utf8 +02)--Filter: test.column1_utf8view = Utf8View("Andrew") +03)----TableScan: test projection=[column1_utf8, column1_utf8view] + +# compare string / stringview +# Should cast string -> stringview (which is cheap), not stringview -> string (which is not) +query TT +explain SELECT column1_utf8 from test where column1_utf8view = column2_utf8; +---- +logical_plan +01)Projection: test.column1_utf8 +02)--Filter: test.column1_utf8view = CAST(test.column2_utf8 AS Utf8View) +03)----TableScan: test projection=[column1_utf8, column2_utf8, column1_utf8view] + +query TT +explain SELECT column1_utf8 from test where column2_utf8 = column1_utf8view; +---- +logical_plan +01)Projection: test.column1_utf8 +02)--Filter: CAST(test.column2_utf8 AS Utf8View) = test.column1_utf8view +03)----TableScan: test projection=[column1_utf8, column2_utf8, column1_utf8view] + +query TT +EXPLAIN SELECT + COUNT(DISTINCT column1_utf8), + COUNT(DISTINCT column1_utf8view), + COUNT(DISTINCT column1_dict) +FROM test; +---- +logical_plan +01)Aggregate: groupBy=[[]], aggr=[[count(DISTINCT test.column1_utf8), count(DISTINCT test.column1_utf8view), count(DISTINCT test.column1_dict)]] +02)--TableScan: test projection=[column1_utf8, column1_utf8view, column1_dict] + + +### `STARTS_WITH` + +# Test STARTS_WITH with utf8view against utf8view, utf8, and largeutf8 +# (should be no casts) +query TT +EXPLAIN SELECT + STARTS_WITH(column1_utf8view, column2_utf8view) as c1, + STARTS_WITH(column1_utf8view, column2_utf8) as c2, + STARTS_WITH(column1_utf8view, column2_large_utf8) as c3 +FROM test; +---- +logical_plan +01)Projection: starts_with(test.column1_utf8view, test.column2_utf8view) AS c1, starts_with(test.column1_utf8view, CAST(test.column2_utf8 AS Utf8View)) AS c2, starts_with(test.column1_utf8view, CAST(test.column2_large_utf8 AS Utf8View)) AS c3 +02)--TableScan: test projection=[column2_utf8, column2_large_utf8, column1_utf8view, column2_utf8view] + +query BBB +SELECT + STARTS_WITH(column1_utf8view, column2_utf8view) as c1, + STARTS_WITH(column1_utf8view, column2_utf8) as c2, + STARTS_WITH(column1_utf8view, column2_large_utf8) as c3 +FROM test; +---- +false false false +true true true +true true true +NULL NULL NULL + +# Test STARTS_WITH with utf8 against utf8view, utf8, and largeutf8 +# Should work, but will have to cast to common types +# should cast utf8 -> utf8view and largeutf8 -> utf8view +query TT +EXPLAIN SELECT + STARTS_WITH(column1_utf8, column2_utf8view) as c1, + STARTS_WITH(column1_utf8, column2_utf8) as c3, + STARTS_WITH(column1_utf8, column2_large_utf8) as c4 +FROM test; +---- +logical_plan +01)Projection: starts_with(__common_expr_1, test.column2_utf8view) AS c1, starts_with(test.column1_utf8, test.column2_utf8) AS c3, starts_with(__common_expr_1, CAST(test.column2_large_utf8 AS Utf8View)) AS c4 +02)--Projection: CAST(test.column1_utf8 AS Utf8View) AS __common_expr_1, test.column1_utf8, test.column2_utf8, test.column2_large_utf8, test.column2_utf8view +03)----TableScan: test projection=[column1_utf8, column2_utf8, column2_large_utf8, column2_utf8view] + +query BBB + SELECT + STARTS_WITH(column1_utf8, column2_utf8view) as c1, + STARTS_WITH(column1_utf8, column2_utf8) as c3, + STARTS_WITH(column1_utf8, column2_large_utf8) as c4 +FROM test; +---- +false false false +true true true +true true true +NULL NULL NULL + + +# Test STARTS_WITH with utf8view against literals +# In this case, the literals should be cast to utf8view. The columns +# should not be cast to utf8. +query TT +EXPLAIN SELECT + STARTS_WITH(column1_utf8view, 'äöüß') as c1, + STARTS_WITH(column1_utf8view, '') as c2, + STARTS_WITH(column1_utf8view, NULL) as c3, + STARTS_WITH(NULL, column1_utf8view) as c4 +FROM test; +---- +logical_plan +01)Projection: starts_with(test.column1_utf8view, Utf8View("äöüß")) AS c1, starts_with(test.column1_utf8view, Utf8View("")) AS c2, starts_with(test.column1_utf8view, Utf8View(NULL)) AS c3, starts_with(Utf8View(NULL), test.column1_utf8view) AS c4 +02)--TableScan: test projection=[column1_utf8view] + +query TT +EXPLAIN SELECT + INITCAP(column1_utf8view) as c +FROM test; +---- +logical_plan +01)Projection: initcap(test.column1_utf8view) AS c +02)--TableScan: test projection=[column1_utf8view] + + +# Create a table with lowercase strings +statement ok +CREATE TABLE test_lowercase AS SELECT + lower(column1_utf8) as column1_utf8_lower, + lower(column1_large_utf8) as column1_large_utf8_lower, + lower(column1_utf8view) as column1_utf8view_lower +FROM test; + +# Test INITCAP with utf8view, utf8, and largeutf8 +# Should not cast anything +query TT +EXPLAIN SELECT + INITCAP(column1_utf8view_lower) as c1, + INITCAP(column1_utf8_lower) as c2, + INITCAP(column1_large_utf8_lower) as c3 +FROM test_lowercase; +---- +logical_plan +01)Projection: initcap(test_lowercase.column1_utf8view_lower) AS c1, initcap(test_lowercase.column1_utf8_lower) AS c2, initcap(test_lowercase.column1_large_utf8_lower) AS c3 +02)--TableScan: test_lowercase projection=[column1_utf8_lower, column1_large_utf8_lower, column1_utf8view_lower] + +statement ok +drop table test_lowercase + +statement ok +drop table test From f522980259baa1cd51dbc2136a85fce083762a46 Mon Sep 17 00:00:00 2001 From: Jia-Xuan Liu Date: Wed, 18 Sep 2024 22:49:54 +0800 Subject: [PATCH 2/4] move the test to new framework --- .../substr/substr_table.slt.part | 45 -- .../sqllogictest/test_files/string_view.slt | 547 ------------------ 2 files changed, 592 deletions(-) delete mode 100644 datafusion/sqllogictest/test_files/string_functions/substr/substr_table.slt.part diff --git a/datafusion/sqllogictest/test_files/string_functions/substr/substr_table.slt.part b/datafusion/sqllogictest/test_files/string_functions/substr/substr_table.slt.part deleted file mode 100644 index 0d8e62af84e0..000000000000 --- a/datafusion/sqllogictest/test_files/string_functions/substr/substr_table.slt.part +++ /dev/null @@ -1,45 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# This file is intended to be run with tables already defined -# with standard values, but different types in string columns -# (String, StringView, etc.) - -query TTTTTTTTTTTTTT -select - substr(c1, 1), - substr(c1, 3), - substr(c1, 100), - substr(c1, -1), - substr(c1, 0, 0), - substr(c1, -1, 2), - substr(c1, -2, 10), - substr(c1, -100, 200), - substr(c1, -10, 10), - substr(c1, -100, 10), - substr(c1, 1, 100), - substr(c1, 5, 3), - substr(c1, 100, 200), - substr(c1, 8, 0) -from test_substr; ----- -foo o (empty) foo (empty) (empty) foo foo (empty) (empty) foo (empty) (empty) (empty) -hello🌏世界 llo🌏世界 (empty) hello🌏世界 (empty) (empty) hello🌏世 hello🌏世界 (empty) (empty) hello🌏世界 o🌏世 (empty) (empty) -💩 (empty) (empty) 💩 (empty) (empty) 💩 💩 (empty) (empty) 💩 (empty) (empty) (empty) -ThisIsAVeryLongASCIIString isIsAVeryLongASCIIString (empty) ThisIsAVeryLongASCIIString (empty) (empty) ThisIsA ThisIsAVeryLongASCIIString (empty) (empty) ThisIsAVeryLongASCIIString IsA (empty) (empty) -(empty) (empty) (empty) (empty) (empty) (empty) (empty) (empty) (empty) (empty) (empty) (empty) (empty) (empty) -NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL \ No newline at end of file diff --git a/datafusion/sqllogictest/test_files/string_view.slt b/datafusion/sqllogictest/test_files/string_view.slt index 7df43bb7eddb..0dd1eec5abca 100644 --- a/datafusion/sqllogictest/test_files/string_view.slt +++ b/datafusion/sqllogictest/test_files/string_view.slt @@ -43,556 +43,9 @@ FROM test_source; statement ok drop table test_source -######## -## StringView to StringView -######## - -# StringView scalar to StringView scalar - -query BBBB -select - arrow_cast('NULL', 'Utf8View') = arrow_cast('Andrew', 'Utf8View'), - arrow_cast('NULL', 'Utf8View') <> arrow_cast('Andrew', 'Utf8View'), - arrow_cast('Andrew', 'Utf8View') = arrow_cast('Andrew', 'Utf8View'), - arrow_cast('Xiangpeng', 'Utf8View') <> arrow_cast('Andrew', 'Utf8View'); ----- -false true true true - - -# StringView column to StringView column comparison as filters - -query TT -select column1_utf8, column2_utf8 from test where column1_utf8view = column2_utf8view; ----- -Xiangpeng Xiangpeng - -query TT -select column1_utf8, column2_utf8 from test where column1_utf8view <> column2_utf8view; ----- -Andrew X -Raphael R - -# StringView column to StringView column -query TTBB -select - column1_utf8, column2_utf8, - column1_utf8view = column2_utf8view, - column1_utf8view <> column2_utf8view -from test; ----- -Andrew X false true -Xiangpeng Xiangpeng true false -Raphael R false true -NULL R NULL NULL - -# StringView column to StringView scalar comparison -query TTBBBB -select - column1_utf8, column2_utf8, - column1_utf8view = arrow_cast('Andrew', 'Utf8View'), - arrow_cast('Andrew', 'Utf8View') = column1_utf8view, - column1_utf8view <> arrow_cast('Andrew', 'Utf8View'), - arrow_cast('Andrew', 'Utf8View') <> column1_utf8view -from test; ----- -Andrew X true true false false -Xiangpeng Xiangpeng false false true true -Raphael R false false true true -NULL R NULL NULL NULL NULL - -######## -## StringView to String -######## - -# test StringViewArray with Utf8 columns -query TTBBBB -select - column1_utf8, column2_utf8, - column1_utf8view = column2_utf8, - column2_utf8 = column1_utf8view, - column1_utf8view <> column2_utf8, - column2_utf8 <> column1_utf8view -from test; ----- -Andrew X false false true true -Xiangpeng Xiangpeng true true false false -Raphael R false false true true -NULL R NULL NULL NULL NULL - -# test StringViewArray with LargeUtf8 columns -query TTBBBB -select - column1_utf8, column2_utf8, - column1_utf8view = column2_large_utf8, - column2_large_utf8 = column1_utf8view, - column1_utf8view <> column2_large_utf8, - column2_large_utf8 <> column1_utf8view -from test; ----- -Andrew X false false true true -Xiangpeng Xiangpeng true true false false -Raphael R false false true true -NULL R NULL NULL NULL NULL - - -# StringView column to String scalar -query TTBBBB -select - column1_utf8, column2_utf8, - column1_utf8view = arrow_cast('Andrew', 'Utf8'), - arrow_cast('Andrew', 'Utf8') = column1_utf8view, - column1_utf8view <> arrow_cast('Andrew', 'Utf8'), - arrow_cast('Andrew', 'Utf8') <> column1_utf8view -from test; ----- -Andrew X true true false false -Xiangpeng Xiangpeng false false true true -Raphael R false false true true -NULL R NULL NULL NULL NULL - -# StringView column to LargeString scalar -query TTBBBB -select - column1_utf8, column2_utf8, - column1_utf8view = arrow_cast('Andrew', 'LargeUtf8'), - arrow_cast('Andrew', 'LargeUtf8') = column1_utf8view, - column1_utf8view <> arrow_cast('Andrew', 'LargeUtf8'), - arrow_cast('Andrew', 'LargeUtf8') <> column1_utf8view -from test; ----- -Andrew X true true false false -Xiangpeng Xiangpeng false false true true -Raphael R false false true true -NULL R NULL NULL NULL NULL - -# String column to StringView scalar -query TTBBBB -select - column1_utf8, column2_utf8, - column1_utf8 = arrow_cast('Andrew', 'Utf8View'), - arrow_cast('Andrew', 'Utf8View') = column1_utf8, - column1_utf8 <> arrow_cast('Andrew', 'Utf8View'), - arrow_cast('Andrew', 'Utf8View') <> column1_utf8 -from test; ----- -Andrew X true true false false -Xiangpeng Xiangpeng false false true true -Raphael R false false true true -NULL R NULL NULL NULL NULL - -# LargeString column to StringView scalar -query TTBBBB -select - column1_utf8, column2_utf8, - column1_large_utf8 = arrow_cast('Andrew', 'Utf8View'), - arrow_cast('Andrew', 'Utf8View') = column1_large_utf8, - column1_large_utf8 <> arrow_cast('Andrew', 'Utf8View'), - arrow_cast('Andrew', 'Utf8View') <> column1_large_utf8 -from test; ----- -Andrew X true true false false -Xiangpeng Xiangpeng false false true true -Raphael R false false true true -NULL R NULL NULL NULL NULL - -######## -## StringView to Dictionary -######## - -# test StringViewArray with Dictionary columns -query TTBBBB -select - column1_utf8, column2_utf8, - column1_utf8view = column2_dict, - column2_dict = column1_utf8view, - column1_utf8view <> column2_dict, - column2_dict <> column1_utf8view -from test; ----- -Andrew X false false true true -Xiangpeng Xiangpeng true true false false -Raphael R false false true true -NULL R NULL NULL NULL NULL - -# StringView column to Dict scalar -query TTBBBB -select - column1_utf8, column2_utf8, - column1_utf8view = arrow_cast('Andrew', 'Dictionary(Int32, Utf8)'), - arrow_cast('Andrew', 'Dictionary(Int32, Utf8)') = column1_utf8view, - column1_utf8view <> arrow_cast('Andrew', 'Dictionary(Int32, Utf8)'), - arrow_cast('Andrew', 'Dictionary(Int32, Utf8)') <> column1_utf8view -from test; ----- -Andrew X true true false false -Xiangpeng Xiangpeng false false true true -Raphael R false false true true -NULL R NULL NULL NULL NULL - -# Dict column to StringView scalar -query TTBBBB -select - column1_utf8, column2_utf8, - column1_dict = arrow_cast('Andrew', 'Utf8View'), - arrow_cast('Andrew', 'Utf8View') = column1_dict, - column1_dict <> arrow_cast('Andrew', 'Utf8View'), - arrow_cast('Andrew', 'Utf8View') <> column1_dict -from test; ----- -Andrew X true true false false -Xiangpeng Xiangpeng false false true true -Raphael R false false true true -NULL R NULL NULL NULL NULL - - -######## -## Coercion Rules -######## - - statement ok set datafusion.explain.logical_plan_only = true; - -# Filter should have a StringView literal and no column cast -query TT -explain SELECT column1_utf8 from test where column1_utf8view = 'Andrew'; ----- -logical_plan -01)Projection: test.column1_utf8 -02)--Filter: test.column1_utf8view = Utf8View("Andrew") -03)----TableScan: test projection=[column1_utf8, column1_utf8view] - -# reverse order should be the same -query TT -explain SELECT column1_utf8 from test where 'Andrew' = column1_utf8view; ----- -logical_plan -01)Projection: test.column1_utf8 -02)--Filter: test.column1_utf8view = Utf8View("Andrew") -03)----TableScan: test projection=[column1_utf8, column1_utf8view] - -query TT -explain SELECT column1_utf8 from test where column1_utf8 = arrow_cast('Andrew', 'Utf8View'); ----- -logical_plan -01)Filter: test.column1_utf8 = Utf8("Andrew") -02)--TableScan: test projection=[column1_utf8] - -query TT -explain SELECT column1_utf8 from test where arrow_cast('Andrew', 'Utf8View') = column1_utf8; ----- -logical_plan -01)Filter: test.column1_utf8 = Utf8("Andrew") -02)--TableScan: test projection=[column1_utf8] - -query TT -explain SELECT column1_utf8 from test where column1_utf8view = arrow_cast('Andrew', 'Dictionary(Int32, Utf8)'); ----- -logical_plan -01)Projection: test.column1_utf8 -02)--Filter: test.column1_utf8view = Utf8View("Andrew") -03)----TableScan: test projection=[column1_utf8, column1_utf8view] - -query TT -explain SELECT column1_utf8 from test where arrow_cast('Andrew', 'Dictionary(Int32, Utf8)') = column1_utf8view; ----- -logical_plan -01)Projection: test.column1_utf8 -02)--Filter: test.column1_utf8view = Utf8View("Andrew") -03)----TableScan: test projection=[column1_utf8, column1_utf8view] - -# compare string / stringview -# Should cast string -> stringview (which is cheap), not stringview -> string (which is not) -query TT -explain SELECT column1_utf8 from test where column1_utf8view = column2_utf8; ----- -logical_plan -01)Projection: test.column1_utf8 -02)--Filter: test.column1_utf8view = CAST(test.column2_utf8 AS Utf8View) -03)----TableScan: test projection=[column1_utf8, column2_utf8, column1_utf8view] - -query TT -explain SELECT column1_utf8 from test where column2_utf8 = column1_utf8view; ----- -logical_plan -01)Projection: test.column1_utf8 -02)--Filter: CAST(test.column2_utf8 AS Utf8View) = test.column1_utf8view -03)----TableScan: test projection=[column1_utf8, column2_utf8, column1_utf8view] - -## Test distinct aggregates -query III -SELECT - COUNT(DISTINCT column1_utf8), - COUNT(DISTINCT column1_utf8view), - COUNT(DISTINCT column1_dict) -FROM test; ----- -3 3 3 - -query III -SELECT - COUNT(DISTINCT column1_utf8), - COUNT(DISTINCT column1_utf8view), - COUNT(DISTINCT column1_dict) -FROM test -GROUP BY column2_utf8view; ----- -1 1 1 -1 1 1 -1 1 1 - - -query TT -EXPLAIN SELECT - COUNT(DISTINCT column1_utf8), - COUNT(DISTINCT column1_utf8view), - COUNT(DISTINCT column1_dict) -FROM test; ----- -logical_plan -01)Aggregate: groupBy=[[]], aggr=[[count(DISTINCT test.column1_utf8), count(DISTINCT test.column1_utf8view), count(DISTINCT test.column1_dict)]] -02)--TableScan: test projection=[column1_utf8, column1_utf8view, column1_dict] - -### `STARTS_WITH` - -# Test STARTS_WITH with utf8view against utf8view, utf8, and largeutf8 -# (should be no casts) -query TT -EXPLAIN SELECT - STARTS_WITH(column1_utf8view, column2_utf8view) as c1, - STARTS_WITH(column1_utf8view, column2_utf8) as c2, - STARTS_WITH(column1_utf8view, column2_large_utf8) as c3 -FROM test; ----- -logical_plan -01)Projection: starts_with(test.column1_utf8view, test.column2_utf8view) AS c1, starts_with(test.column1_utf8view, CAST(test.column2_utf8 AS Utf8View)) AS c2, starts_with(test.column1_utf8view, CAST(test.column2_large_utf8 AS Utf8View)) AS c3 -02)--TableScan: test projection=[column2_utf8, column2_large_utf8, column1_utf8view, column2_utf8view] - -query BBB -SELECT - STARTS_WITH(column1_utf8view, column2_utf8view) as c1, - STARTS_WITH(column1_utf8view, column2_utf8) as c2, - STARTS_WITH(column1_utf8view, column2_large_utf8) as c3 -FROM test; ----- -false false false -true true true -true true true -NULL NULL NULL - -# Test STARTS_WITH with utf8 against utf8view, utf8, and largeutf8 -# Should work, but will have to cast to common types -# should cast utf8 -> utf8view and largeutf8 -> utf8view -query TT -EXPLAIN SELECT - STARTS_WITH(column1_utf8, column2_utf8view) as c1, - STARTS_WITH(column1_utf8, column2_utf8) as c3, - STARTS_WITH(column1_utf8, column2_large_utf8) as c4 -FROM test; ----- -logical_plan -01)Projection: starts_with(__common_expr_1, test.column2_utf8view) AS c1, starts_with(test.column1_utf8, test.column2_utf8) AS c3, starts_with(__common_expr_1, CAST(test.column2_large_utf8 AS Utf8View)) AS c4 -02)--Projection: CAST(test.column1_utf8 AS Utf8View) AS __common_expr_1, test.column1_utf8, test.column2_utf8, test.column2_large_utf8, test.column2_utf8view -03)----TableScan: test projection=[column1_utf8, column2_utf8, column2_large_utf8, column2_utf8view] - -query BBB - SELECT - STARTS_WITH(column1_utf8, column2_utf8view) as c1, - STARTS_WITH(column1_utf8, column2_utf8) as c3, - STARTS_WITH(column1_utf8, column2_large_utf8) as c4 -FROM test; ----- -false false false -true true true -true true true -NULL NULL NULL - - -# Test STARTS_WITH with utf8view against literals -# In this case, the literals should be cast to utf8view. The columns -# should not be cast to utf8. -query TT -EXPLAIN SELECT - STARTS_WITH(column1_utf8view, 'äöüß') as c1, - STARTS_WITH(column1_utf8view, '') as c2, - STARTS_WITH(column1_utf8view, NULL) as c3, - STARTS_WITH(NULL, column1_utf8view) as c4 -FROM test; ----- -logical_plan -01)Projection: starts_with(test.column1_utf8view, Utf8View("äöüß")) AS c1, starts_with(test.column1_utf8view, Utf8View("")) AS c2, starts_with(test.column1_utf8view, Utf8View(NULL)) AS c3, starts_with(Utf8View(NULL), test.column1_utf8view) AS c4 -02)--TableScan: test projection=[column1_utf8view] - -### Test TRANSLATE - -# Should run TRANSLATE using utf8view column successfully -query T -SELECT - TRANSLATE(column1_utf8view, 'foo', 'bar') as c -FROM test; ----- -Andrew -Xiangpeng -Raphael -NULL - -# Should run TRANSLATE using utf8 column successfully -query T -SELECT - TRANSLATE(column1_utf8, 'foo', 'bar') as c -FROM test; ----- -Andrew -Xiangpeng -Raphael -NULL - -# Should run TRANSLATE using large_utf8 column successfully -query T -SELECT - TRANSLATE(column1_large_utf8, 'foo', 'bar') as c -FROM test; ----- -Andrew -Xiangpeng -Raphael -NULL - -### Test REGEXP_REPLACE - -# Should run REGEXP_REPLACE with Scalar value for utf8view -query T -SELECT - REGEXP_REPLACE(column1_utf8view, 'e', 'f') AS k -FROM test; ----- -Andrfw -Xiangpfng -Raphafl -NULL - -# Should run REGEXP_REPLACE with Scalar value for utf8view with flag -query T -SELECT - REGEXP_REPLACE(column1_utf8view, 'e', 'f', 'i') AS k -FROM test; ----- -Andrfw -Xiangpfng -Raphafl -NULL - -# Should run REGEXP_REPLACE with Scalar value for utf8 -query T -SELECT - REGEXP_REPLACE(column1_utf8, 'e', 'f') AS k -FROM test; ----- -Andrfw -Xiangpfng -Raphafl -NULL - -# Should run REGEXP_REPLACE with Scalar value for utf8 with flag -query T -SELECT - REGEXP_REPLACE(column1_utf8, 'e', 'f', 'i') AS k -FROM test; ----- -Andrfw -Xiangpfng -Raphafl -NULL - -# Should run REGEXP_REPLACE with ScalarArray value for utf8view -query T -SELECT - REGEXP_REPLACE(column1_utf8view, lower(column1_utf8view), 'bar') AS k -FROM test; ----- -Andrew -Xiangpeng -Raphael -NULL - -# Should run REGEXP_REPLACE with ScalarArray value for utf8view with flag -query T -SELECT - REGEXP_REPLACE(column1_utf8view, lower(column1_utf8view), 'bar', 'g') AS k -FROM test; ----- -Andrew -Xiangpeng -Raphael -NULL - -# Should run REGEXP_REPLACE with ScalarArray value for utf8 -query T -SELECT - REGEXP_REPLACE(column1_utf8, lower(column1_utf8), 'bar') AS k -FROM test; ----- -Andrew -Xiangpeng -Raphael -NULL - -# Should run REGEXP_REPLACE with ScalarArray value for utf8 with flag -query T -SELECT - REGEXP_REPLACE(column1_utf8, lower(column1_utf8), 'bar', 'g') AS k -FROM test; ----- -Andrew -Xiangpeng -Raphael -NULL - -### Initcap - -query TT -EXPLAIN SELECT - INITCAP(column1_utf8view) as c -FROM test; ----- -logical_plan -01)Projection: initcap(test.column1_utf8view) AS c -02)--TableScan: test projection=[column1_utf8view] - -# Create a table with lowercase strings -statement ok -CREATE TABLE test_lowercase AS SELECT - lower(column1_utf8) as column1_utf8_lower, - lower(column1_large_utf8) as column1_large_utf8_lower, - lower(column1_utf8view) as column1_utf8view_lower -FROM test; - -# Test INITCAP with utf8view, utf8, and largeutf8 -# Should not cast anything -query TT -EXPLAIN SELECT - INITCAP(column1_utf8view_lower) as c1, - INITCAP(column1_utf8_lower) as c2, - INITCAP(column1_large_utf8_lower) as c3 -FROM test_lowercase; ----- -logical_plan -01)Projection: initcap(test_lowercase.column1_utf8view_lower) AS c1, initcap(test_lowercase.column1_utf8_lower) AS c2, initcap(test_lowercase.column1_large_utf8_lower) AS c3 -02)--TableScan: test_lowercase projection=[column1_utf8_lower, column1_large_utf8_lower, column1_utf8view_lower] - -query TTT -SELECT - INITCAP(column1_utf8view_lower) as c1, - INITCAP(column1_utf8_lower) as c2, - INITCAP(column1_large_utf8_lower) as c3 -FROM test_lowercase; ----- -Andrew Andrew Andrew -Xiangpeng Xiangpeng Xiangpeng -Raphael Raphael Raphael -NULL NULL NULL - # Ensure string functions use native StringView implementation # and do not fall back to Utf8 or LargeUtf8 # Should see no casts to Utf8 in the plans below From b0723076f96cd0bc71c4a4b8cf641da48d477085 Mon Sep 17 00:00:00 2001 From: Jia-Xuan Liu Date: Sat, 21 Sep 2024 02:13:19 +0800 Subject: [PATCH 3/4] add readme for string test --- .../sqllogictest/test_files/string/README.md | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 datafusion/sqllogictest/test_files/string/README.md diff --git a/datafusion/sqllogictest/test_files/string/README.md b/datafusion/sqllogictest/test_files/string/README.md new file mode 100644 index 000000000000..8c7a435b13d0 --- /dev/null +++ b/datafusion/sqllogictest/test_files/string/README.md @@ -0,0 +1,24 @@ +# String Test Files + +This directory contains test files for the `string` test suite. +To ensure consistent behavior across different string types, we should run the same tests with the same inputs on all string types. +There is a framework in place to execute the same tests across different string types. + +See [#12415](https://github.com/apache/datafusion/issues/12415) for more background. + +## Directory Structure +``` +string/ + - init_data.slt.part // generate the testing data + - string_query.slt.part // the sharing tests for all string type + - string.slt // the entrypoint for string type + - large_string.slt // the entrypoint for large_string type + - string_view.slt // the entrypoint for string_view type and the string_view specific tests + - string_literal.slt // the tests for string literal +``` + +## Pattern for Test Entry Point Files + +Any entry point file should include `init_data.slt.part` and `string_query.slt.part`. + +Planning-related tests (e.g., EXPLAIN ...) should be placed in their own entry point file (e.g., `string_view.slt`) as they are only used to assert planning behavior specific to that type. From 4fefd064f0729718e65f9bd82cc78ab390fdad10 Mon Sep 17 00:00:00 2001 From: Jia-Xuan Liu Date: Sat, 21 Sep 2024 02:16:56 +0800 Subject: [PATCH 4/4] add license and invoke prettier --- .../sqllogictest/test_files/string/README.md | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/datafusion/sqllogictest/test_files/string/README.md b/datafusion/sqllogictest/test_files/string/README.md index 8c7a435b13d0..8693ef16f9d7 100644 --- a/datafusion/sqllogictest/test_files/string/README.md +++ b/datafusion/sqllogictest/test_files/string/README.md @@ -1,3 +1,22 @@ + + # String Test Files This directory contains test files for the `string` test suite. @@ -7,6 +26,7 @@ There is a framework in place to execute the same tests across different string See [#12415](https://github.com/apache/datafusion/issues/12415) for more background. ## Directory Structure + ``` string/ - init_data.slt.part // generate the testing data