Skip to content

Commit 286c4bd

Browse files
authored
Merge pull request #268 from papandreou/feature/iso8859-1
[New] Add support for iso-8859-1, utf8 "sentinel" and numeric entities
2 parents 2b94ea7 + 7bcf2dd commit 286c4bd

File tree

7 files changed

+282
-19
lines changed

7 files changed

+282
-19
lines changed

.eslintrc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,8 @@
99
"func-name-matching": 0,
1010
"id-length": [2, { "min": 1, "max": 25, "properties": "never" }],
1111
"indent": [2, 4],
12-
"max-params": [2, 12],
13-
"max-statements": [2, 45],
12+
"max-params": [2, 14],
13+
"max-statements": [2, 52],
1414
"no-continue": 1,
1515
"no-magic-numbers": 0,
1616
"no-restricted-syntax": [2, "BreakStatement", "DebuggerStatement", "ForInStatement", "LabeledStatement", "WithStatement"],

README.md

Lines changed: 88 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,62 @@ var withDots = qs.parse('a.b=c', { allowDots: true });
146146
assert.deepEqual(withDots, { a: { b: 'c' } });
147147
```
148148

149+
If you have to deal with legacy browsers or services, there's
150+
also support for decoding percent-encoded octets as iso-8859-1:
151+
152+
```javascript
153+
var oldCharset = qs.parse('a=%A7', { charset: 'iso-8859-1' });
154+
assert.deepEqual(oldCharset, { a: '§' });
155+
```
156+
157+
Some services add an initial `utf8=✓` value to forms so that old
158+
Internet Explorer versions are more likely to submit the form as
159+
utf-8. Additionally, the server can check the value against wrong
160+
encodings of the checkmark character and detect that a query string
161+
or `application/x-www-form-urlencoded` body was *not* sent as
162+
utf-8, e.g. if the form had an `accept-charset` parameter or the
163+
containing page had a different character set.
164+
165+
**qs** supports this mechanism via the `charsetSentinel` option.
166+
If specified, the `utf8` parameter will be omitted from the
167+
returned object. It will be used to switch to `iso-8859-1`/`utf-8`
168+
mode depending on how the checkmark is encoded.
169+
170+
**Important**: When you specify both the `charset` option and the
171+
`charsetSentinel` option, the `charset` will be overridden when
172+
the request contains a `utf8` parameter from which the actual
173+
charset can be deduced. In that sense the `charset` will behave
174+
as the default charset rather than the authoritative charset.
175+
176+
```javascript
177+
var detectedAsUtf8 = qs.parse('utf8=%E2%9C%93&a=%C3%B8', {
178+
charset: 'iso-8859-1',
179+
charsetSentinel: true
180+
});
181+
assert.deepEqual(detectedAsUtf8, { a: 'ø' });
182+
183+
// Browsers encode the checkmark as &#10003; when submitting as iso-8859-1:
184+
var detectedAsIso8859_1 = qs.parse('utf8=%26%2310003%3B&a=%F8', {
185+
charset: 'utf-8',
186+
charsetSentinel: true
187+
});
188+
assert.deepEqual(detectedAsIso8859_1, { a: 'ø' });
189+
```
190+
191+
If you want to decode the `&#...;` syntax to the actual character,
192+
you can specify the `interpretNumericEntities` option as well:
193+
194+
```javascript
195+
var detectedAsIso8859_1 = qs.parse('a=%26%239786%3B', {
196+
charset: 'iso-8859-1',
197+
interpretNumericEntities: true
198+
});
199+
assert.deepEqual(detectedAsIso8859_1, { a: '☺' });
200+
```
201+
202+
It also works when the charset has been detected in `charsetSentinel`
203+
mode.
204+
149205
### Parsing Arrays
150206

151207
**qs** can also parse arrays using a similar `[]` notation:
@@ -426,10 +482,40 @@ var nullsSkipped = qs.stringify({ a: 'b', c: null}, { skipNulls: true });
426482
assert.equal(nullsSkipped, 'a=b');
427483
```
428484

485+
If you're communicating with legacy systems, you can switch to `iso-8859-1`
486+
using the `charset` option:
487+
488+
```javascript
489+
var iso = qs.stringify({ æ: 'æ' }, { charset: 'iso-8859-1' });
490+
assert.equal(iso, '%E6=%E6');
491+
```
492+
493+
Characters that don't exist in `iso-8859-1` will be converted to numeric
494+
entities, similar to what browsers do:
495+
496+
```javascript
497+
var numeric = qs.stringify({ a: '☺' }, { charset: 'iso-8859-1' });
498+
assert.equal(numeric, 'a=%26%239786%3B');
499+
```
500+
501+
You can use the `charsetSentinel` option to announce the character set by
502+
including a `utf8=✓` parameter with the proper encoding of the checkmark,
503+
similar to what Ruby on Rails and others do when submitting forms.
504+
505+
```javascript
506+
var sentinel = qs.stringify({ a: '☺' }, { charsetSentinel: true });
507+
assert.equal(sentinel, 'utf8=%E2%9C%93&a=%E2%98%BA');
508+
509+
var isoSentinel = qs.stringify({ a: 'æ' }, { charsetSentinel: true, charset: 'iso-8859-1' });
510+
assert.equal(isoSentinel, 'utf8=%26%2310003%3B&a=%E6');
511+
```
512+
429513
### Dealing with special character sets
430514

431-
By default the encoding and decoding of characters is done in `utf-8`. If you
432-
wish to encode querystrings to a different character set (i.e.
515+
By default the encoding and decoding of characters is done in `utf-8`,
516+
and `iso-8859-1` support is also built in via the `charset` parameter.
517+
518+
If you wish to encode querystrings to a different character set (i.e.
433519
[Shift JIS](https://en.wikipedia.org/wiki/Shift_JIS)) you can use the
434520
[`qs-iconv`](https://github.com/martinheidegger/qs-iconv) library:
435521

lib/parse.js

Lines changed: 51 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,33 +8,80 @@ var defaults = {
88
allowDots: false,
99
allowPrototypes: false,
1010
arrayLimit: 20,
11+
charset: 'utf-8',
12+
charsetSentinel: false,
1113
decoder: utils.decode,
1214
delimiter: '&',
1315
depth: 5,
16+
interpretNumericEntities: false,
1417
parameterLimit: 1000,
1518
plainObjects: false,
1619
strictNullHandling: false
1720
};
1821

22+
var interpretNumericEntities = function (str) {
23+
return str.replace(/&#(\d+);/g, function ($0, numberStr) {
24+
return String.fromCharCode(parseInt(numberStr, 10));
25+
});
26+
};
27+
28+
// This is what browsers will submit when the ✓ character occurs in an
29+
// application/x-www-form-urlencoded body and the encoding of the page containing
30+
// the form is iso-8859-1, or when the submitted form has an accept-charset
31+
// attribute of iso-8859-1. Presumably also with other charsets that do not contain
32+
// the ✓ character, such as us-ascii.
33+
var isoSentinel = 'utf8=%26%2310003%3B'; // encodeURIComponent('&#10003;')
34+
35+
// These are the percent-encoded utf-8 octets representing a checkmark, indicating
36+
// that the request actually is utf-8 encoded.
37+
var charsetSentinel = 'utf8=%E2%9C%93'; // encodeURIComponent('✓')
38+
1939
var parseValues = function parseQueryStringValues(str, options) {
2040
var obj = {};
2141
var cleanStr = options.ignoreQueryPrefix ? str.replace(/^\?/, '') : str;
2242
var limit = options.parameterLimit === Infinity ? undefined : options.parameterLimit;
2343
var parts = cleanStr.split(options.delimiter, limit);
44+
var charset = options.charset;
45+
var skipIndex = -1; // Keep track of where the utf8 sentinel was found
46+
var i;
47+
48+
if (charset !== undefined && charset !== 'utf-8' && charset !== 'iso-8859-1') {
49+
throw new Error('The charset option must be either utf-8, iso-8859-1, or undefined');
50+
}
51+
if (options.charsetSentinel) {
52+
for (i = 0; i < parts.length; ++i) {
53+
if (parts[i].indexOf('utf8=') === 0) {
54+
if (parts[i] === charsetSentinel) {
55+
charset = 'utf-8';
56+
} else if (parts[i] === isoSentinel) {
57+
charset = 'iso-8859-1';
58+
}
59+
skipIndex = i;
60+
i = parts.length; // The eslint settings do not allow break;
61+
}
62+
}
63+
}
2464

25-
for (var i = 0; i < parts.length; ++i) {
65+
for (i = 0; i < parts.length; ++i) {
66+
if (i === skipIndex) {
67+
continue;
68+
}
2669
var part = parts[i];
2770

2871
var bracketEqualsPos = part.indexOf(']=');
2972
var pos = bracketEqualsPos === -1 ? part.indexOf('=') : bracketEqualsPos + 1;
3073

3174
var key, val;
3275
if (pos === -1) {
33-
key = options.decoder(part, defaults.decoder);
76+
key = options.decoder(part, defaults.decoder, charset);
3477
val = options.strictNullHandling ? null : '';
3578
} else {
36-
key = options.decoder(part.slice(0, pos), defaults.decoder);
37-
val = options.decoder(part.slice(pos + 1), defaults.decoder);
79+
key = options.decoder(part.slice(0, pos), defaults.decoder, charset);
80+
val = options.decoder(part.slice(pos + 1), defaults.decoder, charset);
81+
}
82+
83+
if (options.interpretNumericEntities && charset === 'iso-8859-1') {
84+
val = interpretNumericEntities(val);
3885
}
3986
if (has.call(obj, key)) {
4087
obj[key] = [].concat(obj[key]).concat(val);

lib/stringify.js

Lines changed: 26 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,8 @@ var stringify = function stringify( // eslint-disable-line func-name-matching
4141
allowDots,
4242
serializeDate,
4343
formatter,
44-
encodeValuesOnly
44+
encodeValuesOnly,
45+
charset
4546
) {
4647
var obj = object;
4748
if (typeof filter === 'function') {
@@ -50,16 +51,16 @@ var stringify = function stringify( // eslint-disable-line func-name-matching
5051
obj = serializeDate(obj);
5152
} else if (obj === null) {
5253
if (strictNullHandling) {
53-
return encoder && !encodeValuesOnly ? encoder(prefix, defaults.encoder) : prefix;
54+
return encoder && !encodeValuesOnly ? encoder(prefix, defaults.encoder, charset) : prefix;
5455
}
5556

5657
obj = '';
5758
}
5859

5960
if (typeof obj === 'string' || typeof obj === 'number' || typeof obj === 'boolean' || utils.isBuffer(obj)) {
6061
if (encoder) {
61-
var keyValue = encodeValuesOnly ? prefix : encoder(prefix, defaults.encoder);
62-
return [formatter(keyValue) + '=' + formatter(encoder(obj, defaults.encoder))];
62+
var keyValue = encodeValuesOnly ? prefix : encoder(prefix, defaults.encoder, charset);
63+
return [formatter(keyValue) + '=' + formatter(encoder(obj, defaults.encoder, charset))];
6364
}
6465
return [formatter(prefix) + '=' + formatter(String(obj))];
6566
}
@@ -98,7 +99,8 @@ var stringify = function stringify( // eslint-disable-line func-name-matching
9899
allowDots,
99100
serializeDate,
100101
formatter,
101-
encodeValuesOnly
102+
encodeValuesOnly,
103+
charset
102104
));
103105
} else {
104106
values = values.concat(stringify(
@@ -113,7 +115,8 @@ var stringify = function stringify( // eslint-disable-line func-name-matching
113115
allowDots,
114116
serializeDate,
115117
formatter,
116-
encodeValuesOnly
118+
encodeValuesOnly,
119+
charset
117120
));
118121
}
119122
}
@@ -138,6 +141,11 @@ module.exports = function (object, opts) {
138141
var allowDots = typeof options.allowDots === 'undefined' ? false : options.allowDots;
139142
var serializeDate = typeof options.serializeDate === 'function' ? options.serializeDate : defaults.serializeDate;
140143
var encodeValuesOnly = typeof options.encodeValuesOnly === 'boolean' ? options.encodeValuesOnly : defaults.encodeValuesOnly;
144+
var charset = options.charset || 'utf-8';
145+
if (charset !== undefined && charset !== 'utf-8' && charset !== 'iso-8859-1') {
146+
throw new Error('The charset option must be either utf-8, iso-8859-1, or undefined');
147+
}
148+
141149
if (typeof options.format === 'undefined') {
142150
options.format = formats['default'];
143151
} else if (!Object.prototype.hasOwnProperty.call(formats.formatters, options.format)) {
@@ -199,12 +207,23 @@ module.exports = function (object, opts) {
199207
allowDots,
200208
serializeDate,
201209
formatter,
202-
encodeValuesOnly
210+
encodeValuesOnly,
211+
charset
203212
));
204213
}
205214

206215
var joined = keys.join(delimiter);
207216
var prefix = options.addQueryPrefix === true ? '?' : '';
208217

218+
if (options.charsetSentinel) {
219+
if (charset === 'iso-8859-1') {
220+
// encodeURIComponent('&#10003;'), the "numeric entity" representation of a checkmark
221+
prefix += 'utf8=%26%2310003%3B&';
222+
} else {
223+
// encodeURIComponent('✓')
224+
prefix += 'utf8=%E2%9C%93&';
225+
}
226+
}
227+
209228
return joined.length > 0 ? prefix + joined : '';
210229
};

lib/utils.js

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -107,15 +107,21 @@ var assign = function assignSingleSource(target, source) {
107107
}, target);
108108
};
109109

110-
var decode = function (str) {
110+
var decode = function (str, decoder, charset) {
111+
var strWithoutPlus = str.replace(/\+/g, ' ');
112+
if (charset === 'iso-8859-1') {
113+
// unescape never throws, no try...catch needed:
114+
return strWithoutPlus.replace(/%[0-9a-f]{2}/gi, unescape);
115+
}
116+
// utf-8
111117
try {
112-
return decodeURIComponent(str.replace(/\+/g, ' '));
118+
return decodeURIComponent(strWithoutPlus);
113119
} catch (e) {
114-
return str;
120+
return strWithoutPlus;
115121
}
116122
};
117123

118-
var encode = function encode(str) {
124+
var encode = function encode(str, defaultEncoder, charset) {
119125
// This code was originally written by Brian White (mscdex) for the io.js core querystring library.
120126
// It has been adapted here for stricter adherence to RFC 3986
121127
if (str.length === 0) {
@@ -124,6 +130,12 @@ var encode = function encode(str) {
124130

125131
var string = typeof str === 'string' ? str : String(str);
126132

133+
if (charset === 'iso-8859-1') {
134+
return escape(string).replace(/%u[0-9a-f]{4}/gi, function ($0) {
135+
return '%26%23' + parseInt($0.slice(2), 16) + '%3B';
136+
});
137+
}
138+
127139
var out = '';
128140
for (var i = 0; i < string.length; ++i) {
129141
var c = string.charCodeAt(i);

0 commit comments

Comments
 (0)