Remove unsafe encoding facilities from query parser

michaelpq · michaelpq · commit 44c6c60c0452 · 2025-10-07T14:04:51.000+09:00
These facilities were inherited from the psql query parser, but they are not required because the encoding is always known to be safe on the backend side. Hence, there is no need to perform a transformation of the query string. The code was not actually relying on this anyway: query_scan_setup() was being given PG_SQL_ASCII as its encoding (more precisely, a literal 0). Let's remove this code to ease future maintenance. Like 37a06b4, this is backpatched down to v17, where the query parser was introduced. Backpatch-through: 17
diff --git a/pg_hint_plan.c b/pg_hint_plan.c
@@ -2044,7 +2044,7 @@ get_hints_from_comment(const char *p)
 	sstate = query_scan_create();
 	query_buf = makeStringInfo();
 
-	query_scan_setup(sstate, p, strlen(p), 0,
+	query_scan_setup(sstate, p, strlen(p),
 					 standard_conforming_strings,
 					 pg_hint_plan_parse_message_level);
 	for (;;)
diff --git a/query_scan.h b/query_scan.h
@@ -31,7 +31,7 @@ typedef enum
 extern QueryScanState query_scan_create(void);
 extern void query_scan_setup(QueryScanState state,
 							 const char *line, int line_len,
-							 int encoding, bool std_strings,
+							 bool std_strings,
 							 int elevel);
 extern void query_scan_finish(QueryScanState state);
 extern QueryScanResult query_scan(QueryScanState state,
diff --git a/query_scan.l b/query_scan.l
@@ -826,35 +826,26 @@ query_scan_create(void)
  * a pointer to the storage at *line --- this string must not be altered
  * or freed until after query_scan_finish is called.
  *
- * encoding is the libpq identifier for the character encoding in use,
- * and std_strings says whether standard_conforming_strings is on.
+ * std_strings says whether standard_conforming_strings is on.
  */
 void
 query_scan_setup(QueryScanState state,
 				const char *line, int line_len,
-				int encoding, bool std_strings, int elevel)
+				bool std_strings, int elevel)
 {
 	/* Mustn't be scanning already */
 	Assert(state->scanbufhandle == NULL);
 
 	/* elevel for reports */
 	state->elevel = elevel;
 
-	/* Do we need to hack the character set encoding? */
-	state->encoding = encoding;
-	state->safe_encoding = pg_valid_server_encoding_id(encoding);
-
 	/* Save standard-strings flag as well */
 	state->std_strings = std_strings;
 
 	/* Set up flex input buffer with appropriate translation and padding */
 	state->scanbufhandle = query_scan_prepare_buffer(state, line, line_len,
 												   &state->scanbuf);
 	state->scanline = line;
-
-	/* Set lookaside data in case we have to map unsafe encoding */
-	state->curline = state->scanbuf;
-	state->refline = state->scanline;
 }
 
 /*
@@ -958,8 +949,7 @@ query_scan_finish(QueryScanState state)
 
 /*
  * Set up a flex input buffer to scan the given data.  We always make a
- * copy of the data.  If working in an unsafe encoding, the copy has
- * multibyte sequences replaced by FFs to avoid fooling the lexer rules.
+ * copy of the data.
  *
  * NOTE SIDE EFFECT: the new buffer is made the active flex input buffer.
  */
@@ -974,25 +964,8 @@ query_scan_prepare_buffer(QueryScanState state, const char *txt, int len,
 	*txtcopy = newtxt;
 	newtxt[len] = newtxt[len + 1] = YY_END_OF_BUFFER_CHAR;
 
-	if (state->safe_encoding)
-		memcpy(newtxt, txt, len);
-	else
-	{
-		/* Gotta do it the hard way */
-		int			i = 0;
-
-		while (i < len)
-		{
-			int			thislen = pg_encoding_mblen(state->encoding,
-									    txt + i);
-
-			/* first byte should always be okay... */
-			newtxt[i] = txt[i];
-			i++;
-			while (--thislen > 0 && i < len)
-				newtxt[i++] = (char) 0xFF;
-		}
-	}
+	/* Copy the text */
+	memcpy(newtxt, txt, len);
 
 	return yy_scan_buffer(newtxt, len + 2, state->scanner);
 }
@@ -1018,23 +991,6 @@ query_scan_emit(QueryScanState state, const char *txt, int len)
 {
 	StringInfo output_buf = state->output_buf;
 
-	if (state->safe_encoding)
-		appendBinaryStringInfo(output_buf, txt, len);
-	else
-	{
-		/* Gotta do it the hard way */
-		const char *reference = state->refline;
-		int			i;
-
-		reference += (txt - state->curline);
-
-		for (i = 0; i < len; i++)
-		{
-			char		ch = txt[i];
-
-			if (ch == (char) 0xFF)
-				ch = reference[i];
-			appendStringInfoChar(output_buf, ch);
-		}
-	}
+	/* Simply copy the data to the output */
+	appendBinaryStringInfo(output_buf, txt, len);
 }
diff --git a/query_scan_int.h b/query_scan_int.h
@@ -5,20 +5,6 @@
  *
  * This file declares the QueryScanStateData structure used by query_scan.l.
  *
- * One difficult aspect of this code is that we need to work in multibyte
- * encodings that are not ASCII-safe.  A "safe" encoding is one in which each
- * byte of a multibyte character has the high bit set (it's >= 0x80).  Since
- * all our lexing rules treat all high-bit-set characters alike, we don't
- * really need to care whether such a byte is part of a sequence or not.
- * In an "unsafe" encoding, we still expect the first byte of a multibyte
- * sequence to be >= 0x80, but later bytes might not be.  If we scan such
- * a sequence as-is, the lexing rules could easily be fooled into matching
- * such bytes to ordinary ASCII characters.  Our solution for this is to
- * substitute 0xFF for each non-first byte within the data presented to flex.
- * The flex rules will then pass the FF's through unmolested.  The
- * query_scan_emit() subroutine is responsible for looking back to the
- * original string and replacing FF's with the corresponding original bytes.
- *
  * Another interesting thing we do here is scan different parts of the same
  * input with physically separate flex lexers (ie, lexers written in separate
  * .l files).  We can get away with this because the only part of the
@@ -81,12 +67,7 @@ typedef struct QueryScanStateData
 	char	   *scanbuf;		/* start of outer-level input buffer */
 	const char *scanline;		/* current input line at outer level */
 
-	/* safe_encoding, curline, refline are used by emit() to replace FFs */
-	int			encoding;		/* encoding being used now */
-	bool		safe_encoding;	/* is current encoding "safe"? */
 	bool		std_strings;	/* are string literals standard? */
-	const char *curline;		/* actual flex input string for cur buf */
-	const char *refline;		/* original data for cur buffer */
 
 	/*
 	 * All this state lives across successive input lines.  start_state is