Look for the utf8 sentinel in a separate pass

papandreou · ljharb · commit 16c43b3688be · 2018-07-26T19:00:59.000+02:00
This means that the utf8 sentinel doesn't have to be the first parameter in order to affect the parsing of the entire query string. ljharb#268 (comment)
diff --git a/lib/parse.js b/lib/parse.js
@@ -8,12 +8,15 @@ var defaults = {
     allowDots: false,
     allowPrototypes: false,
     arrayLimit: 20,
+    charset: 'utf-8',
     decoder: utils.decode,
     delimiter: '&',
     depth: 5,
+    interpretNumericEntities: false,
     parameterLimit: 1000,
     plainObjects: false,
-    strictNullHandling: false
+    strictNullHandling: false,
+    utf8Sentinel: false
 };
 
 var interpretNumericEntities = function (str) {
@@ -25,24 +28,41 @@ var interpretNumericEntities = function (str) {
 // This is what browsers will submit when the ✓ character occurs in an
 // application/x-www-form-urlencoded body and the encoding of the page containing
 // the form is iso-8859-1, or when the submitted form has an accept-charset
-// attribute of iso-8859-1. Presumably also with other charsets that does no contain
+// attribute of iso-8859-1. Presumably also with other charsets that do not contain
 // the ✓ character, such as us-ascii.
-var numericCheckmark = '&#10003;';
+var isoSentinel = 'utf8=%26%2310003%3B'; // encodeURIComponent('&#10003;')
 
-// These are the raw utf-8 bytes of the checkmark as code points in a string.
-// It's what we end up with when the utf-8 sentinel parameter is interpreted
-// as iso-8859-1. When utf8Sentinel is enabled, we will use it to course-correct
-// and interpret the rest of the query string as utf-8.
-var misinterpretedCheckmark = '\xe2\x9c\x93';
+// These are the percent-encoded utf-8 octets representing a checkmark, indicating
+// that the request actually is utf-8 encoded.
+var utf8Sentinel = 'utf8=%E2%9C%93'; // encodeURIComponent('✓')
 
 var parseValues = function parseQueryStringValues(str, options) {
     var obj = {};
     var cleanStr = options.ignoreQueryPrefix ? str.replace(/^\?/, '') : str;
     var limit = options.parameterLimit === Infinity ? undefined : options.parameterLimit;
     var parts = cleanStr.split(options.delimiter, limit);
     var charset = options.charset;
+    var skipIndex = -1; // Keep track of where the utf8 sentinel was found
+    var i;
+
+    if (options.utf8Sentinel) {
+        for (i = 0; i < parts.length; ++i) {
+            if (parts[i].indexOf('utf8=') === 0) {
+                if (parts[i] === utf8Sentinel) {
+                    charset = 'utf-8';
+                } else if (parts[i] === isoSentinel) {
+                    charset = 'iso-8859-1';
+                }
+                skipIndex = i;
+                i = parts.length; // The eslint settings do not allow break;
+            }
+        }
+    }
 
-    for (var i = 0; i < parts.length; ++i) {
+    for (i = 0; i < parts.length; ++i) {
+        if (i === skipIndex) {
+            continue;
+        }
         var part = parts[i];
 
         var bracketEqualsPos = part.indexOf(']=');
@@ -57,21 +77,13 @@ var parseValues = function parseQueryStringValues(str, options) {
             val = options.decoder(part.slice(pos + 1), defaults.decoder, charset);
         }
 
-        if (key === 'utf8' && options.utf8Sentinel) {
-            if (val === '✓' || val === misinterpretedCheckmark) {
-                charset = 'utf-8';
-            } else if (val === numericCheckmark) {
-                charset = 'iso-8859-1';
-            }
+        if (options.interpretNumericEntities && charset === 'iso-8859-1') {
+            val = interpretNumericEntities(val);
+        }
+        if (has.call(obj, key)) {
+            obj[key] = [].concat(obj[key]).concat(val);
         } else {
-            if (options.interpretNumericEntities && charset === 'iso-8859-1') {
-                val = interpretNumericEntities(val);
-            }
-            if (has.call(obj, key)) {
-                obj[key] = [].concat(obj[key]).concat(val);
-            } else {
-                obj[key] = val;
-            }
+            obj[key] = val;
         }
     }
 
diff --git a/test/parse.js b/test/parse.js
@@ -597,6 +597,11 @@ test('parse()', function (t) {
         st.end();
     });
 
+    t.test('does not require the utf8 sentinel to be defined before the parameters whose decoding it affects', function (st) {
+        st.deepEqual(qs.parse('a=' + urlEncodedOSlashInUtf8 + '&utf8=' + urlEncodedNumCheckmark, { utf8Sentinel: true, charset: 'utf-8' }), { a: 'Ã¸' });
+        st.end();
+    });
+
     t.test('should ignore an utf8 sentinel with an unknown value', function (st) {
         st.deepEqual(qs.parse('utf8=foo&' + urlEncodedOSlashInUtf8 + '=' + urlEncodedOSlashInUtf8, { utf8Sentinel: true, charset: 'utf-8' }), { ø: 'ø' });
         st.end();