Skip to content

Commit e6596d8

Browse files
Further optimization to TextFormatEscaper
Quotes and backslashes are the most common characters that need escaping. For strings that contain only those (and otherwise printable ASCII), we can now escape directly on the string and avoid converting to UTF-8 bytes, while still making only one pass over the string when it needs no escaping at all. PiperOrigin-RevId: 862311595
1 parent 2390c69 commit e6596d8

File tree

1 file changed

+39
-7
lines changed

1 file changed

+39
-7
lines changed

java/core/src/main/java/com/google/protobuf/TextFormatEscaper.java

Lines changed: 39 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -88,19 +88,51 @@ static String escapeBytes(final ByteString input) {
8888
return escapeBytes(input.toByteArray());
8989
}
9090

91-
static boolean needsEscape(char c) {
92-
return c < 0x20 || c > 0x7e || c == '\'' || c == '"' || c == '\\';
93-
}
94-
9591
/** Like {@link #escapeBytes(ByteString)}, but escapes a text string. */
9692
static String escapeText(String input) {
97-
// Loop on the string to see if any character even needs escaping. If yes, then convert into
98-
// UTF-8 and then escape on those bytes. If not, we can just return the original input.
93+
boolean hasSingleQuote = false;
94+
boolean hasDoubleQuote = false;
95+
boolean hasBackslash = false;
96+
9997
for (int i = 0; i < input.length(); ++i) {
100-
if (needsEscape(input.charAt(i))) {
98+
char c = input.charAt(i);
99+
100+
// If there are any characters outside of ASCII range we eagerly convert to UTF and escape on
101+
// those bytes. Note that escaping to UTF8 bytes instead of \\u sequences is itself somewhat
102+
// nonsensical, but JavaProto has behaved this way for a long time, and changing the behavior
103+
// would be disruptive.
104+
if (c < 0x20 || c > 0x7e) {
101105
return escapeBytes(input.getBytes(Internal.UTF_8));
102106
}
107+
108+
// While in this loop, keep track if there are any single quotes, double quotes, or
109+
// backslashes. This can help avoid multiple passes over the string looking for each of the
110+
// bad characters.
111+
switch (c) {
112+
case '\'':
113+
hasSingleQuote = true;
114+
break;
115+
case '"':
116+
hasDoubleQuote = true;
117+
break;
118+
case '\\':
119+
hasBackslash = true;
120+
break;
121+
default:
122+
break;
123+
}
103124
}
125+
126+
if (hasSingleQuote) {
127+
input = input.replace("\'", "\\\'");
128+
}
129+
if (hasDoubleQuote) {
130+
input = input.replace("\"", "\\\"");
131+
}
132+
if (hasBackslash) {
133+
input = input.replace("\\", "\\\\");
134+
}
135+
104136
return input;
105137
}
106138

0 commit comments

Comments
 (0)