-
Notifications
You must be signed in to change notification settings - Fork 132
Fix to enable index scan with LIKE operator for non-const nodes #4589
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: BABEL_6_X_DEV
Are you sure you want to change the base?
Changes from all commits
088e87f
21d6b84
868da18
f8cdd51
86997fe
3357524
ffdb176
ac796d5
75140af
cd87af9
69b694c
71e35f2
7917299
7f34556
926c335
2b17d45
2f55c79
fdf6fca
afeb5b3
9822783
5197005
293001e
aec3662
710ceec
4a8fd1d
c6d91e0
2021734
ba425e9
fea7f92
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -8,16 +8,20 @@ | |
| #include "utils/syscache.h" | ||
| #include "utils/memutils.h" | ||
| #include "utils/builtins.h" | ||
| #include "catalog/pg_proc.h" | ||
| #include "catalog/pg_type.h" | ||
| #include "catalog/pg_collation.h" | ||
| #include "catalog/namespace.h" | ||
| #include "tsearch/ts_locale.h" | ||
| #include "optimizer/clauses.h" | ||
| #include "optimizer/optimizer.h" | ||
| #include "parser/parser.h" | ||
| #include "parser/parse_coerce.h" | ||
| #include "parser/parse_type.h" | ||
| #include "parser/parse_oper.h" | ||
| #include "nodes/makefuncs.h" | ||
| #include "nodes/nodes.h" | ||
| #include "nodes/pathnodes.h" | ||
| #ifdef USE_ICU | ||
| #include <unicode/utrans.h> | ||
| #include "utils/removeaccent.map" | ||
|
|
@@ -76,6 +80,87 @@ extern int pattern_fixed_prefix_wrapper(Const *patt, | |
| static Node *transform_likenode_for_AI(OpExpr *op); | ||
| static Node *convert_node_to_funcexpr_for_like(Node *node, Oid inputcollid); | ||
|
|
||
| /* | ||
| * Check whether a like_escape call appears anywhere in the expression. | ||
| * Uses expression_tree_walker for safe traversal of all node types. | ||
| */ | ||
| static bool | ||
| contains_like_escape(Node *node, void *context) | ||
| { | ||
| if (node == NULL) | ||
| return false; | ||
| if (IsA(node, FuncExpr) && | ||
| strcmp(get_func_name(((FuncExpr *) node)->funcid), | ||
| "like_escape") == 0) | ||
| return true; | ||
| return expression_tree_walker(node, contains_like_escape, context); | ||
| } | ||
|
|
||
| /* | ||
| * Check whether the expression contains any non-immutable function | ||
| * outside our allowlist of safe-to-fold stable functions. | ||
| * Returns true if an unsafe function is found. | ||
| * | ||
| * Allowlisted functions are STABLE only because plpgsql/C cannot be | ||
| * declared IMMUTABLE, but are deterministic for constant inputs: | ||
| * babelfish_conv_helper_* - CONVERT / TRY_CONVERT | ||
| * babelfish_try_cast_to_* - TRY_CAST | ||
| * babelfish_concat_wrapper* - string + operator | ||
| * concat - CONCAT() | ||
| */ | ||
| static bool | ||
| has_unsafe_stable_func(Node *node, void *context) | ||
| { | ||
| if (node == NULL) | ||
| return false; | ||
| if (IsA(node, FuncExpr)) | ||
| { | ||
| FuncExpr *func = (FuncExpr *) node; | ||
|
|
||
| if (func_volatile(func->funcid) != PROVOLATILE_IMMUTABLE) | ||
| { | ||
| char *name = get_func_name(func->funcid); | ||
| bool safe = (name != NULL && | ||
| (strncmp(name, "babelfish_conv_helper_", 22) == 0 || | ||
| strncmp(name, "babelfish_try_cast_to_", 22) == 0 || | ||
| strncmp(name, "babelfish_concat_wrapper", 24) == 0 || | ||
| strcmp(name, "concat") == 0)); | ||
|
|
||
| if (name) | ||
| pfree(name); | ||
| if (!safe) | ||
| return true; | ||
| } | ||
| } | ||
| return expression_tree_walker(node, has_unsafe_stable_func, context); | ||
| } | ||
|
|
||
| /* | ||
| * Fold a LIKE pattern to Const, restricted to expressions whose | ||
| * non-immutable functions are all known-safe stable functions. | ||
| * The original LIKE is always kept as a recheck, so correctness | ||
| * is guaranteed. | ||
| */ | ||
| static Node * | ||
| fold_like_pattern_to_const(Node *expr) | ||
| { | ||
| PlannerGlobal glob; | ||
| PlannerInfo root; | ||
|
|
||
| if (has_unsafe_stable_func(expr, NULL)) | ||
| return expr; | ||
|
|
||
| memset(&glob, 0, sizeof(PlannerGlobal)); | ||
| glob.type = T_PlannerGlobal; | ||
| glob.boundParams = NULL; | ||
|
|
||
| memset(&root, 0, sizeof(PlannerInfo)); | ||
| root.type = T_PlannerInfo; | ||
| root.glob = &glob; | ||
|
|
||
| return estimate_expression_value(&root, expr); | ||
| } | ||
|
|
||
| /* pattern prefix status for pattern_fixed_prefix_wrapper | ||
| * Pattern_Prefix_None: no prefix found, this means the first character is a wildcard character | ||
| * Pattern_Prefix_Exact: the pattern doesn't include any wildcard character | ||
|
|
@@ -327,8 +412,9 @@ create_collate_expr(Node *arg, Oid collid) | |
| * function to remove the accents and optimize. | ||
| * If the node is OpExpr and the collation is cs_as, then simply use optimization: | ||
| * | ||
| * Case 1: if the pattern is a constant stirng | ||
| * col LIKE PATTERN -> col = PATTERN | ||
| * Case 1: if the pattern is a constant string | ||
| * col LIKE PATTERN -> col = PATTERN AND col LIKE PATTERN | ||
| * col NOT LIKE PATTERN -> col <> PATTERN OR col NOT LIKE PATTERN | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. With this OR clause, planner may loose an ability to choose index scan. Hence in most of the cases where we have correctness issue will be fixed with this but perf might get degraded. But I think tht should be okay |
||
| * Case 2: if the pattern have a constant prefix | ||
| * col LIKE PATTERN -> | ||
| * col LIKE PATTERN BETWEEN prefix AND prefix||E'\uFFFF' | ||
|
|
@@ -386,9 +472,8 @@ optimise_likenode(Node *node, OpExpr *op, like_ilike_info_t like_entry, coll_inf | |
| op->opno = like_entry.ilike_oid; | ||
| op->opfuncid = like_entry.ilike_opfuncid; | ||
| } | ||
|
|
||
| op->inputcollid = tsql_get_oid_from_collidx(collidx_of_cs_as); | ||
|
|
||
| op->inputcollid = tsql_get_oid_from_collidx(collidx_of_cs_as); | ||
|
|
||
| /* Remove CollateExpr as the op->inputcollid has already been set */ | ||
| if (IsA(rightop, CollateExpr)) | ||
|
|
@@ -401,6 +486,27 @@ optimise_likenode(Node *node, OpExpr *op, like_ilike_info_t like_entry, coll_inf | |
| linitial(op->args) = leftop = (Node*)((CollateExpr*) leftop)->arg; | ||
| } | ||
|
|
||
| /* | ||
| * Try to reduce rightop to a Const for prefix extraction. | ||
| * | ||
| * First use eval_const_expressions (folds immutable functions like | ||
| * string concatenation and type casts). If still not a Const, use | ||
| * fold_like_pattern_to_const which selectively folds only | ||
| * allowlisted stable functions (CONVERT, TRY_CONVERT, TRY_CAST, | ||
| * string +) via estimate_expression_value. | ||
| * | ||
| * Skip for like_escape: PG's like_escape converts escape sequences | ||
| * to backslash format, which doesn't match T-SQL bracket patterns. | ||
| * Folding it produces wrong prefix bounds. | ||
| */ | ||
| if (!contains_like_escape(rightop, NULL)) | ||
| { | ||
| rightop = eval_const_expressions(NULL, rightop); | ||
| if (!IsA(rightop, Const)) | ||
| rightop = fold_like_pattern_to_const(rightop); | ||
| lsecond(op->args) = rightop; | ||
| } | ||
|
|
||
| /* | ||
| * This is needed to process CI_AI for Const nodes | ||
| * Because after we call coerce_to_target_type for type conversion in transform_likenode_for_AI, | ||
|
|
@@ -466,8 +572,10 @@ optimise_likenode(Node *node, OpExpr *op, like_ilike_info_t like_entry, coll_inf | |
| { | ||
| RelabelType *relabel = (RelabelType *) leftop; | ||
| leftop = copyObject((Node*) relabel->arg); | ||
| prefix->consttype = rtypeId = ltypeId = exprType(leftop); | ||
| ltypeId = exprType(leftop); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What is the necessity to make this change? Is there any test case which fails?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes. Without this, BABEL_SCHEMATA-vu-verify crashes: rtypeId is set once at the top from original rightop. After evaluate_expr changes rightop to a Const, rtypeId becomes stale. Previously this code was never reached because rightop was never a Const for such queries. Now it is, and types mismatch. |
||
| } | ||
| /* Reconcile types — LIKE coerces to TEXT but we need original column type */ | ||
| prefix->consttype = rtypeId = ltypeId; | ||
|
|
||
| /* | ||
| * We need to do this because the dump considers rightop as Const with COLLATE being added | ||
|
|
@@ -506,7 +614,7 @@ optimise_likenode(Node *node, OpExpr *op, like_ilike_info_t like_entry, coll_inf | |
| InvalidOid, | ||
| coll_info_of_inputcollid.oid, | ||
| oprfuncid(optup))); | ||
| ret = make_and_qual(ret, node); | ||
| ret = like_entry.is_not_match ? make_or_qual(ret, node) : make_and_qual(ret, node); | ||
| ReleaseSysCache(optup); | ||
| } | ||
| else | ||
|
|
@@ -1053,6 +1161,42 @@ transform_likenode(Node *node, bool is_constraint) | |
|
|
||
| get_remove_accents_internal_oid(); | ||
|
|
||
| /* | ||
| * View definitions (after dump/restore) lose original column collation | ||
| * due to ::text casts, causing wrong LIKE/ILIKE operator selection. | ||
| * Skip if an explicit COLLATE clause is present. | ||
| */ | ||
|
|
||
| if (OidIsValid(like_entry.like_oid) && list_length(op->args) >= 2) | ||
| { | ||
| Node *left_arg = linitial(op->args); | ||
| Node *right_arg = lsecond(op->args); | ||
| if (!IsA(left_arg, CollateExpr) && !IsA(right_arg, CollateExpr) && !contain_var_clause(right_arg)) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please explain this if condition
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These conditions check if it's safe to consider the column's collation for optimization:
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It seems like we are putting some hack in our code, maybe in some cases that is unavoidable but in most of the conditions I see that if not done then this crashes. I am not comfortable with such explaination. Is there anyway to have a generic solution?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, agreed. I've implemented a better approach now which removes the need for all the manual guards that solely arose from use of evaluate_expr. Replaced it with PG's built-in eval_const_expressions + estimate_expression_value - these handle safety checks internally. Any unhandled expression simply skips optimization with correct results. The conditions in this block are unrelated to index optimization. They are for collation recovery needed because views lose column collation during storage due to ::text casts. They ensure we only recover when collation wasn't explicitly set and don't unnecessarily override collation. I think the test coverage for views with LIKE pattern was not much earlier and hence we couldn't catch this issue. |
||
| { | ||
| Node *unwrapped = left_arg; | ||
| while (IsA(unwrapped, RelabelType)) | ||
| unwrapped = (Node *) ((RelabelType *) unwrapped)->arg; | ||
|
|
||
| if (IsA(unwrapped, Var)) | ||
| { | ||
| Oid var_collation = ((Var *) unwrapped)->varcollid; | ||
|
|
||
| if (OidIsValid(var_collation) && | ||
| var_collation != op->inputcollid) | ||
| { | ||
| coll_info_t var_coll_info = | ||
| tsql_lookup_collation_table_internal(var_collation); | ||
|
|
||
| if (OidIsValid(var_coll_info.oid)) | ||
| { | ||
| op->inputcollid = var_collation; | ||
| coll_info_of_inputcollid = var_coll_info; | ||
| } | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
| /* | ||
| * We do not allow CREATE TABLE statements with CHECK constraint where | ||
| * the constraint has an ILIKE operator and the collation is ci_as. | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
comment over
fold_like_pattern_to_constclearly says following,In essence -- runtime value might be different from planned value. So I think we should avoid using it blindly. Instead we should only try to const fold CONVERT/TRY_CONVERT selectively.
btw, what reason CONVERT/TRY_CONVERT function are marked STABLE in Babelfish?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Agreed. Replaced blind estimate_expression_value with a guarded approach. We now selectively fold only CONVERT/TRY_CONVERT, TRY_CAST, and string concatenation stable functions. Any other stable/volatile function skips folding.
These functions even though marked STABLE, have deterministic output for constant literals. I think they are marked STABLE likely because they perform internal catalog lookups and as per PG doc functions accessing tables should not be IMMUTABLE. Reference: https://www.postgresql.org/docs/current/xfunc-volatility.html#:~:text=It%20is%20generally%20unwise%20to%20select%20from%20database%20tables%20within%20an%20IMMUTABLE%20function%20at%20all