Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
088e87f
Fix to enable index scan with LIKE operator for non-const nodes
Feb 20, 2026
21d6b84
Fix type reconciliation to always run regardless of RelabelType
Feb 21, 2026
868da18
Skipping Escapes from this optimization
Feb 21, 2026
f8cdd51
Correcting the optimization logic and adding test files
Feb 22, 2026
86997fe
Fixing collation and removing test files from unnecessary schedules
Feb 22, 2026
3357524
Fixing test failures
Feb 23, 2026
ffdb176
Removing like node check
Feb 23, 2026
ac796d5
Ficxing Collation
Feb 23, 2026
75140af
Testing to see if failures in views get resolved
Feb 23, 2026
cd87af9
Merge remote-tracking branch 'upstream/BABEL_6_X_DEV' into jira-babel…
Feb 23, 2026
69b694c
Removing changes made for handling dump restore and upgrade failures
Feb 23, 2026
71e35f2
Fixing views after restore and modifying test files accordingly
Feb 24, 2026
7917299
Fixing crash
Feb 24, 2026
7f34556
Fixing test failures
Feb 25, 2026
926c335
Merge remote-tracking branch 'upstream/BABEL_6_X_DEV' into jira-babel…
Feb 25, 2026
2b17d45
Empty commit to rerun workflow
Feb 25, 2026
2f55c79
Increasing sla
Feb 26, 2026
fdf6fca
Merge remote-tracking branch 'upstream/BABEL_6_X_DEV' into jira-babel…
Feb 26, 2026
afeb5b3
Removing default collation and AI indexes from test files
Mar 9, 2026
9822783
Merge remote-tracking branch 'upstream/BABEL_6_X_DEV' into jira-babel…
Mar 9, 2026
5197005
15_2 Before file
Mar 10, 2026
293001e
Correcting before file
Mar 10, 2026
aec3662
New Approach
Mar 11, 2026
710ceec
Merge remote-tracking branch 'upstream/BABEL_6_X_DEV' into jira-babel…
Mar 11, 2026
4a8fd1d
Adding function to check like_escape presence
Mar 11, 2026
c6d91e0
Resolving test failures
Mar 12, 2026
2021734
Resolving test failures
Mar 12, 2026
ba425e9
Allowing only known stable functions having deterministic output to fold
Mar 24, 2026
fea7f92
Merge remote-tracking branch 'upstream/BABEL_6_X_DEV' into jira-babel…
Mar 24, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
156 changes: 150 additions & 6 deletions contrib/babelfishpg_tsql/src/collation.c
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,20 @@
#include "utils/syscache.h"
#include "utils/memutils.h"
#include "utils/builtins.h"
#include "catalog/pg_proc.h"
#include "catalog/pg_type.h"
#include "catalog/pg_collation.h"
#include "catalog/namespace.h"
#include "tsearch/ts_locale.h"
#include "optimizer/clauses.h"
#include "optimizer/optimizer.h"
#include "parser/parser.h"
#include "parser/parse_coerce.h"
#include "parser/parse_type.h"
#include "parser/parse_oper.h"
#include "nodes/makefuncs.h"
#include "nodes/nodes.h"
#include "nodes/pathnodes.h"
#ifdef USE_ICU
#include <unicode/utrans.h>
#include "utils/removeaccent.map"
Expand Down Expand Up @@ -76,6 +80,87 @@ extern int pattern_fixed_prefix_wrapper(Const *patt,
static Node *transform_likenode_for_AI(OpExpr *op);
static Node *convert_node_to_funcexpr_for_like(Node *node, Oid inputcollid);

/*
* Check whether a like_escape call appears anywhere in the expression.
* Uses expression_tree_walker for safe traversal of all node types.
*/
static bool
contains_like_escape(Node *node, void *context)
{
if (node == NULL)
return false;
if (IsA(node, FuncExpr) &&
strcmp(get_func_name(((FuncExpr *) node)->funcid),
"like_escape") == 0)
return true;
return expression_tree_walker(node, contains_like_escape, context);
}

/*
* Check whether the expression contains any non-immutable function
* outside our allowlist of safe-to-fold stable functions.
* Returns true if an unsafe function is found.
*
* Allowlisted functions are STABLE only because plpgsql/C cannot be
* declared IMMUTABLE, but are deterministic for constant inputs:
* babelfish_conv_helper_* - CONVERT / TRY_CONVERT
* babelfish_try_cast_to_* - TRY_CAST
* babelfish_concat_wrapper* - string + operator
* concat - CONCAT()
*/
static bool
has_unsafe_stable_func(Node *node, void *context)
{
if (node == NULL)
return false;
if (IsA(node, FuncExpr))
{
FuncExpr *func = (FuncExpr *) node;

if (func_volatile(func->funcid) != PROVOLATILE_IMMUTABLE)
{
char *name = get_func_name(func->funcid);
bool safe = (name != NULL &&
(strncmp(name, "babelfish_conv_helper_", 22) == 0 ||
strncmp(name, "babelfish_try_cast_to_", 22) == 0 ||
strncmp(name, "babelfish_concat_wrapper", 24) == 0 ||
strcmp(name, "concat") == 0));

if (name)
pfree(name);
if (!safe)
return true;
}
}
return expression_tree_walker(node, has_unsafe_stable_func, context);
}

/*
* Fold a LIKE pattern to Const, restricted to expressions whose
* non-immutable functions are all known-safe stable functions.
* The original LIKE is always kept as a recheck, so correctness
* is guaranteed.
*/
static Node *
fold_like_pattern_to_const(Node *expr)
{
PlannerGlobal glob;
PlannerInfo root;

if (has_unsafe_stable_func(expr, NULL))
return expr;

memset(&glob, 0, sizeof(PlannerGlobal));
glob.type = T_PlannerGlobal;
glob.boundParams = NULL;

memset(&root, 0, sizeof(PlannerInfo));
root.type = T_PlannerInfo;
root.glob = &glob;

return estimate_expression_value(&root, expr);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

comment over fold_like_pattern_to_const clearly says following,

 * This function attempts to estimate the value of an expression for
 * planning purposes.  It is in essence a more aggressive version of
 * eval_const_expressions(): we will perform constant reductions that are
 * not necessarily 100% safe, but are reasonable for estimation purposes.

In essence -- runtime value might be different from planned value. So I think we should avoid using it blindly. Instead we should only try to const fold CONVERT/TRY_CONVERT selectively.

btw, what reason CONVERT/TRY_CONVERT function are marked STABLE in Babelfish?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed. Replaced blind estimate_expression_value with a guarded approach. We now selectively fold only CONVERT/TRY_CONVERT, TRY_CAST, and string concatenation stable functions. Any other stable/volatile function skips folding.

These functions even though marked STABLE, have deterministic output for constant literals. I think they are marked STABLE likely because they perform internal catalog lookups and as per PG doc functions accessing tables should not be IMMUTABLE. Reference: https://www.postgresql.org/docs/current/xfunc-volatility.html#:~:text=It%20is%20generally%20unwise%20to%20select%20from%20database%20tables%20within%20an%20IMMUTABLE%20function%20at%20all

}

/* pattern prefix status for pattern_fixed_prefix_wrapper
* Pattern_Prefix_None: no prefix found, this means the first character is a wildcard character
* Pattern_Prefix_Exact: the pattern doesn't include any wildcard character
Expand Down Expand Up @@ -327,8 +412,9 @@ create_collate_expr(Node *arg, Oid collid)
* function to remove the accents and optimize.
* If the node is OpExpr and the collation is cs_as, then simply use optimization:
*
* Case 1: if the pattern is a constant stirng
* col LIKE PATTERN -> col = PATTERN
* Case 1: if the pattern is a constant string
* col LIKE PATTERN -> col = PATTERN AND col LIKE PATTERN
* col NOT LIKE PATTERN -> col <> PATTERN OR col NOT LIKE PATTERN
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

With this OR clause, planner may loose an ability to choose index scan. Hence in most of the cases where we have correctness issue will be fixed with this but perf might get degraded. But I think tht should be okay

* Case 2: if the pattern have a constant prefix
* col LIKE PATTERN ->
* col LIKE PATTERN BETWEEN prefix AND prefix||E'\uFFFF'
Expand Down Expand Up @@ -386,9 +472,8 @@ optimise_likenode(Node *node, OpExpr *op, like_ilike_info_t like_entry, coll_inf
op->opno = like_entry.ilike_oid;
op->opfuncid = like_entry.ilike_opfuncid;
}

op->inputcollid = tsql_get_oid_from_collidx(collidx_of_cs_as);

op->inputcollid = tsql_get_oid_from_collidx(collidx_of_cs_as);

/* Remove CollateExpr as the op->inputcollid has already been set */
if (IsA(rightop, CollateExpr))
Expand All @@ -401,6 +486,27 @@ optimise_likenode(Node *node, OpExpr *op, like_ilike_info_t like_entry, coll_inf
linitial(op->args) = leftop = (Node*)((CollateExpr*) leftop)->arg;
}

/*
* Try to reduce rightop to a Const for prefix extraction.
*
* First use eval_const_expressions (folds immutable functions like
* string concatenation and type casts). If still not a Const, use
* fold_like_pattern_to_const which selectively folds only
* allowlisted stable functions (CONVERT, TRY_CONVERT, TRY_CAST,
* string +) via estimate_expression_value.
*
* Skip for like_escape: PG's like_escape converts escape sequences
* to backslash format, which doesn't match T-SQL bracket patterns.
* Folding it produces wrong prefix bounds.
*/
if (!contains_like_escape(rightop, NULL))
{
rightop = eval_const_expressions(NULL, rightop);
if (!IsA(rightop, Const))
rightop = fold_like_pattern_to_const(rightop);
lsecond(op->args) = rightop;
}

/*
* This is needed to process CI_AI for Const nodes
* Because after we call coerce_to_target_type for type conversion in transform_likenode_for_AI,
Expand Down Expand Up @@ -466,8 +572,10 @@ optimise_likenode(Node *node, OpExpr *op, like_ilike_info_t like_entry, coll_inf
{
RelabelType *relabel = (RelabelType *) leftop;
leftop = copyObject((Node*) relabel->arg);
prefix->consttype = rtypeId = ltypeId = exprType(leftop);
ltypeId = exprType(leftop);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is the necessity to make this change? Is there any test case which fails?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. Without this, BABEL_SCHEMATA-vu-verify crashes:

  use db_schemata_10;
  SELECT * FROM information_schema.schemata ORDER BY SCHEMA_NAME;
  TRAP: failed Assert("ltypeId == rtypeId")

rtypeId is set once at the top from original rightop. After evaluate_expr changes rightop to a Const, rtypeId becomes stale. Previously this code was never reached because rightop was never a Const for such queries. Now it is, and types mismatch.

}
/* Reconcile types — LIKE coerces to TEXT but we need original column type */
prefix->consttype = rtypeId = ltypeId;

/*
* We need to do this because the dump considers rightop as Const with COLLATE being added
Expand Down Expand Up @@ -506,7 +614,7 @@ optimise_likenode(Node *node, OpExpr *op, like_ilike_info_t like_entry, coll_inf
InvalidOid,
coll_info_of_inputcollid.oid,
oprfuncid(optup)));
ret = make_and_qual(ret, node);
ret = like_entry.is_not_match ? make_or_qual(ret, node) : make_and_qual(ret, node);
ReleaseSysCache(optup);
}
else
Expand Down Expand Up @@ -1053,6 +1161,42 @@ transform_likenode(Node *node, bool is_constraint)

get_remove_accents_internal_oid();

/*
* View definitions (after dump/restore) lose original column collation
* due to ::text casts, causing wrong LIKE/ILIKE operator selection.
* Skip if an explicit COLLATE clause is present.
*/

if (OidIsValid(like_entry.like_oid) && list_length(op->args) >= 2)
{
Node *left_arg = linitial(op->args);
Node *right_arg = lsecond(op->args);
if (!IsA(left_arg, CollateExpr) && !IsA(right_arg, CollateExpr) && !contain_var_clause(right_arg))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please explain this if condition

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These conditions check if it's safe to consider the column's collation for optimization:

  1. !IsA(left_arg, CollateExpr): If left has explicit COLLATE (eg Col COLLATE Latin1_General_CI_AI LIKE 'abc'), op->inputcollid already reflects it. Without this check, we would overwrite it with column's default collation, ignoring the explicit COLLATE.
  2. !IsA(right_arg, CollateExpr): If right has explicit COLLATE, evaluate_expr crashes on CollateExpr
  3. !contain_var_clause(right_arg): If pattern has column references (eg Col LIKE OtherCol), it cannot be reduced to a Const, so optimization won't apply.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems like we are putting some hack in our code, maybe in some cases that is unavoidable but in most of the conditions I see that if not done then this crashes. I am not comfortable with such explaination. Is there anyway to have a generic solution?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, agreed. I've implemented a better approach now which removes the need for all the manual guards that solely arose from use of evaluate_expr. Replaced it with PG's built-in eval_const_expressions + estimate_expression_value - these handle safety checks internally. Any unhandled expression simply skips optimization with correct results.

The conditions in this block are unrelated to index optimization. They are for collation recovery needed because views lose column collation during storage due to ::text casts. They ensure we only recover when collation wasn't explicitly set and don't unnecessarily override collation. I think the test coverage for views with LIKE pattern was not much earlier and hence we couldn't catch this issue.

{
Node *unwrapped = left_arg;
while (IsA(unwrapped, RelabelType))
unwrapped = (Node *) ((RelabelType *) unwrapped)->arg;

if (IsA(unwrapped, Var))
{
Oid var_collation = ((Var *) unwrapped)->varcollid;

if (OidIsValid(var_collation) &&
var_collation != op->inputcollid)
{
coll_info_t var_coll_info =
tsql_lookup_collation_table_internal(var_collation);

if (OidIsValid(var_coll_info.oid))
{
op->inputcollid = var_collation;
coll_info_of_inputcollid = var_coll_info;
}
}
}
}
}

/*
* We do not allow CREATE TABLE statements with CHECK constraint where
* the constraint has an ILIKE operator and the collation is ci_as.
Expand Down
4 changes: 2 additions & 2 deletions test/JDBC/expected/babel_collection.out
Original file line number Diff line number Diff line change
Expand Up @@ -323,8 +323,8 @@ GO
~~START~~
text
Query Text: select c1 from testing4 where c1 NOT LIKE 'jones'
Bitmap Heap Scan on testing4 (cost=11.56..31.31 rows=647 width=32)
Filter: ((c1 <> 'jones'::"varchar") AND ((c1)::text !~~* 'jones'::text COLLATE bbf_unicode_cp1_cs_as))
Bitmap Heap Scan on testing4 (cost=11.56..31.31 rows=650 width=32)
Filter: ((c1 <> 'jones'::"varchar") OR ((c1)::text !~~* 'jones'::text COLLATE bbf_unicode_cp1_cs_as))
-> Bitmap Index Scan on c1_idxtesting49a168d73f3ba5aacdfd495b931b8d187 (cost=0.00..11.40 rows=650 width=0)
~~END~~

Expand Down
4 changes: 2 additions & 2 deletions test/JDBC/expected/db_collation/babel_collection.out
Original file line number Diff line number Diff line change
Expand Up @@ -311,8 +311,8 @@ GO
~~START~~
text
Query Text: select c1 from testing4 where c1 NOT LIKE 'jones'
Bitmap Heap Scan on testing4 (cost=11.56..34.56 rows=647 width=32)
Filter: ((remove_accents_internal((c1)::text) <> 'jones'::nvarchar) AND ((remove_accents_internal((c1)::text))::text !~~* 'jones'::text COLLATE bbf_unicode_cp1_cs_as))
Bitmap Heap Scan on testing4 (cost=11.56..34.56 rows=650 width=32)
Filter: ((remove_accents_internal((c1)::text) <> 'jones'::nvarchar) OR ((remove_accents_internal((c1)::text))::text !~~* 'jones'::text COLLATE bbf_unicode_cp1_cs_as))
-> Bitmap Index Scan on c1_idxtesting49a168d73f3ba5aacdfd495b931b8d187 (cost=0.00..11.40 rows=650 width=0)
~~END~~

Expand Down
Loading
Loading