Skip to content

Commit afbef9b

Browse files
authored
Fix Encoding on Body (#143)
* fix: check encoding on body
1 parent 9d4c883 commit afbef9b

15 files changed

Lines changed: 7155 additions & 1034 deletions

dist/generate-custom-parser.js

Lines changed: 7010 additions & 988 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

dist/generate-custom-parser.js.map

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

dist/mercury.js

Lines changed: 9 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

dist/mercury.js.map

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

fixtures/nock/fetch-resource-test.js

Lines changed: 29 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

fixtures/nock/resource-test.js

Lines changed: 29 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

scripts/check-build.test.js

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,9 @@ if (process.env.CI) {
4343
assert.equal(article.title, result.title);
4444
done();
4545
}).catch((e) => {
46-
console.log('THIS WENT WRONG', e); // eslint-disable-line no-console
46+
console.log('There was an error', e.message); // eslint-disable-line no-console
47+
console.log('e.fileName', e.fileName);
48+
console.log('e.lineNumber', e.lineNumber);
4749
assert.equal(true, false);
4850
done();
4951
});

src/resource/index.js

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
import cheerio from 'cheerio';
2+
import iconv from 'iconv-lite';
23

4+
import { getEncoding } from 'utils/text';
35
import { fetchResource } from './utils';
46
import {
57
normalizeMetaTags,
@@ -51,7 +53,7 @@ const Resource = {
5153
throw new Error('Content does not appear to be text.');
5254
}
5355

54-
let $ = cheerio.load(content);
56+
let $ = this.encodeDoc({ content, contentType });
5557

5658
if ($.root().children().length === 0) {
5759
throw new Error('No children, likely a bad parse.');
@@ -63,6 +65,24 @@ const Resource = {
6365

6466
return $;
6567
},
68+
69+
encodeDoc({ content, contentType }) {
70+
const encoding = getEncoding(contentType);
71+
let decodedContent = iconv.decode(content, encoding);
72+
let $ = cheerio.load(decodedContent);
73+
74+
// after first cheerio.load, check to see if encoding matches
75+
const metaContentType = $('meta[http-equiv=content-type]').attr('content');
76+
const properEncoding = getEncoding(metaContentType);
77+
78+
// if encodings in the header/body don't match, use the one in the body
79+
if (properEncoding !== encoding) {
80+
decodedContent = iconv.decode(content, properEncoding);
81+
$ = cheerio.load(decodedContent);
82+
}
83+
84+
return $;
85+
},
6686
};
6787

6888
export default Resource;

src/resource/index.test.js

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import assert from 'assert';
22
import cheerio from 'cheerio';
33
import { Errors } from 'utils';
4+
import { getEncoding } from 'utils/text';
45

56
import { record } from 'test-helpers';
67
import Resource from './index';
@@ -24,18 +25,31 @@ describe('Resource', () => {
2425

2526
assert.equal(error, Errors.badUrl);
2627
});
27-
});
2828

29-
describe('generateDoc({ body, response })', () => {
30-
it('returns a cheerio object if valid', () => {
31-
const response = { headers: { 'content-type': 'text/html' } };
29+
it('fetches with different encoding on body', async () => {
30+
const url = 'http://www.playnation.de/spiele-news/kojima-productions/hideo-kojima-reflektiert-ueber-seinen-werdegang-bei-konami-id68950.html';
31+
const $ = await Resource.create(url);
32+
const metaContentType = $('meta[http-equiv=content-type]').attr('value');
33+
34+
assert.equal(getEncoding(metaContentType), 'iso-8859-1');
35+
const encodedU = /ü/g;
36+
37+
assert.equal(encodedU.test($.html()), true);
38+
assert.equal(typeof $, 'function');
39+
});
40+
41+
it('handles special encoding', async () => {
42+
const url = 'http://www.elmundo.es/opinion/2016/11/19/582f476846163fc65a8b4578.html';
43+
const $ = await Resource.create(url);
3244

33-
const body = '<div><p>Hi</p></div>';
34-
const $ = Resource.generateDoc({ body, response });
45+
const badEncodingRe = /\ufffd/g;
3546

36-
assert.equal($.html(), body);
47+
assert.equal(badEncodingRe.test($.html()), false);
48+
assert.equal(typeof $, 'function');
3749
});
50+
});
3851

52+
describe('generateDoc({ body, response })', () => {
3953
it('throws an error if the content is not text', () => {
4054
const response = {
4155
headers: {

src/resource/utils/fetch-resource.js

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
11
import URL from 'url';
22
import request from 'request';
3-
import iconv from 'iconv-lite';
43
import { Errors } from 'utils';
5-
import { getEncoding } from 'utils/text';
64

75
import {
86
REQUEST_HEADERS,
@@ -17,12 +15,6 @@ function get(options) {
1715
if (err) {
1816
reject(err);
1917
} else {
20-
const encoding = getEncoding(response.headers['content-type']);
21-
22-
if (iconv.encodingExists(encoding)) {
23-
body = iconv.decode(body, encoding);
24-
}
25-
2618
resolve({ body, response });
2719
}
2820
});
@@ -97,9 +89,6 @@ export default async function fetchResource(url, parsedUrl) {
9789
url: parsedUrl.href,
9890
headers: { ...REQUEST_HEADERS },
9991
timeout: FETCH_TIMEOUT,
100-
// Don't set encoding; fixes issues
101-
// w/gzipped responses
102-
encoding: null,
10392
// Accept cookies
10493
jar: true,
10594
// Accept and decode gzip

0 commit comments

Comments
 (0)