Skip to content

Commit 44c6c60

Browse files
committed
Remove unsafe encoding facilities from query parser
These have been inherited from the psql query parser, but are not required as the encoding is always known to be safe on the backend side. Hence, there is no need to perform a transformation of the query string. The code was not actually relying on this: the encoding given to query_scan_setup() was PG_SQL_ASCII (actually, a literal 0). Let's remove this code to ease future maintenance. Like 37a06b4, this is backpatched down to v17, where the query parser was introduced. Backpatch-through: 17
1 parent 11458d1 commit 44c6c60

4 files changed

Lines changed: 9 additions & 72 deletions

File tree

pg_hint_plan.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2044,7 +2044,7 @@ get_hints_from_comment(const char *p)
20442044
sstate = query_scan_create();
20452045
query_buf = makeStringInfo();
20462046

2047-
query_scan_setup(sstate, p, strlen(p), 0,
2047+
query_scan_setup(sstate, p, strlen(p),
20482048
standard_conforming_strings,
20492049
pg_hint_plan_parse_message_level);
20502050
for (;;)

query_scan.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ typedef enum
3131
extern QueryScanState query_scan_create(void);
3232
extern void query_scan_setup(QueryScanState state,
3333
const char *line, int line_len,
34-
int encoding, bool std_strings,
34+
bool std_strings,
3535
int elevel);
3636
extern void query_scan_finish(QueryScanState state);
3737
extern QueryScanResult query_scan(QueryScanState state,

query_scan.l

Lines changed: 7 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -826,35 +826,26 @@ query_scan_create(void)
826826
* a pointer to the storage at *line --- this string must not be altered
827827
* or freed until after query_scan_finish is called.
828828
*
829-
* encoding is the libpq identifier for the character encoding in use,
830-
* and std_strings says whether standard_conforming_strings is on.
829+
* std_strings says whether standard_conforming_strings is on.
831830
*/
832831
void
833832
query_scan_setup(QueryScanState state,
834833
const char *line, int line_len,
835-
int encoding, bool std_strings, int elevel)
834+
bool std_strings, int elevel)
836835
{
837836
/* Mustn't be scanning already */
838837
Assert(state->scanbufhandle == NULL);
839838

840839
/* elevel for reports */
841840
state->elevel = elevel;
842841

843-
/* Do we need to hack the character set encoding? */
844-
state->encoding = encoding;
845-
state->safe_encoding = pg_valid_server_encoding_id(encoding);
846-
847842
/* Save standard-strings flag as well */
848843
state->std_strings = std_strings;
849844

850845
/* Set up flex input buffer with appropriate translation and padding */
851846
state->scanbufhandle = query_scan_prepare_buffer(state, line, line_len,
852847
&state->scanbuf);
853848
state->scanline = line;
854-
855-
/* Set lookaside data in case we have to map unsafe encoding */
856-
state->curline = state->scanbuf;
857-
state->refline = state->scanline;
858849
}
859850

860851
/*
@@ -958,8 +949,7 @@ query_scan_finish(QueryScanState state)
958949

959950
/*
960951
* Set up a flex input buffer to scan the given data. We always make a
961-
* copy of the data. If working in an unsafe encoding, the copy has
962-
* multibyte sequences replaced by FFs to avoid fooling the lexer rules.
952+
* copy of the data.
963953
*
964954
* NOTE SIDE EFFECT: the new buffer is made the active flex input buffer.
965955
*/
@@ -974,25 +964,8 @@ query_scan_prepare_buffer(QueryScanState state, const char *txt, int len,
974964
*txtcopy = newtxt;
975965
newtxt[len] = newtxt[len + 1] = YY_END_OF_BUFFER_CHAR;
976966

977-
if (state->safe_encoding)
978-
memcpy(newtxt, txt, len);
979-
else
980-
{
981-
/* Gotta do it the hard way */
982-
int i = 0;
983-
984-
while (i < len)
985-
{
986-
int thislen = pg_encoding_mblen(state->encoding,
987-
txt + i);
988-
989-
/* first byte should always be okay... */
990-
newtxt[i] = txt[i];
991-
i++;
992-
while (--thislen > 0 && i < len)
993-
newtxt[i++] = (char) 0xFF;
994-
}
995-
}
967+
/* Copy the text */
968+
memcpy(newtxt, txt, len);
996969

997970
return yy_scan_buffer(newtxt, len + 2, state->scanner);
998971
}
@@ -1018,23 +991,6 @@ query_scan_emit(QueryScanState state, const char *txt, int len)
1018991
{
1019992
StringInfo output_buf = state->output_buf;
1020993

1021-
if (state->safe_encoding)
1022-
appendBinaryStringInfo(output_buf, txt, len);
1023-
else
1024-
{
1025-
/* Gotta do it the hard way */
1026-
const char *reference = state->refline;
1027-
int i;
1028-
1029-
reference += (txt - state->curline);
1030-
1031-
for (i = 0; i < len; i++)
1032-
{
1033-
char ch = txt[i];
1034-
1035-
if (ch == (char) 0xFF)
1036-
ch = reference[i];
1037-
appendStringInfoChar(output_buf, ch);
1038-
}
1039-
}
994+
/* Simply copy the data to the output */
995+
appendBinaryStringInfo(output_buf, txt, len);
1040996
}

query_scan_int.h

Lines changed: 0 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -5,20 +5,6 @@
55
*
66
* This file declares the QueryScanStateData structure used by query_scan.l.
77
*
8-
* One difficult aspect of this code is that we need to work in multibyte
9-
* encodings that are not ASCII-safe. A "safe" encoding is one in which each
10-
* byte of a multibyte character has the high bit set (it's >= 0x80). Since
11-
* all our lexing rules treat all high-bit-set characters alike, we don't
12-
* really need to care whether such a byte is part of a sequence or not.
13-
* In an "unsafe" encoding, we still expect the first byte of a multibyte
14-
* sequence to be >= 0x80, but later bytes might not be. If we scan such
15-
* a sequence as-is, the lexing rules could easily be fooled into matching
16-
* such bytes to ordinary ASCII characters. Our solution for this is to
17-
* substitute 0xFF for each non-first byte within the data presented to flex.
18-
* The flex rules will then pass the FF's through unmolested. The
19-
* query_scan_emit() subroutine is responsible for looking back to the
20-
* original string and replacing FF's with the corresponding original bytes.
21-
*
228
* Another interesting thing we do here is scan different parts of the same
239
* input with physically separate flex lexers (ie, lexers written in separate
2410
* .l files). We can get away with this because the only part of the
@@ -81,12 +67,7 @@ typedef struct QueryScanStateData
8167
char *scanbuf; /* start of outer-level input buffer */
8268
const char *scanline; /* current input line at outer level */
8369

84-
/* safe_encoding, curline, refline are used by emit() to replace FFs */
85-
int encoding; /* encoding being used now */
86-
bool safe_encoding; /* is current encoding "safe"? */
8770
bool std_strings; /* are string literals standard? */
88-
const char *curline; /* actual flex input string for cur buf */
89-
const char *refline; /* original data for cur buffer */
9071

9172
/*
9273
* All this state lives across successive input lines. start_state is

0 commit comments

Comments
 (0)