Skip to content

Commit 512cd7a

Browse files
authored
parse: fix support for literal URLs
Related-to GH-479. Closes GH-478. Closes GH-481.
1 parent 7be53be commit 512cd7a

File tree

12 files changed

+5097
-420
lines changed

12 files changed

+5097
-420
lines changed

packages/remark-parse/lib/locate/url.js

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,22 +2,25 @@
22

33
module.exports = locate
44

5-
var protocols = ['https://', 'http://', 'mailto:']
5+
var values = ['www.', 'http://', 'https://']
66

77
function locate(value, fromIndex) {
8-
var length = protocols.length
9-
var index = -1
108
var min = -1
9+
var index
10+
var length
1111
var position
1212

1313
if (!this.options.gfm) {
14-
return -1
14+
return min
1515
}
1616

17+
length = values.length
18+
index = -1
19+
1720
while (++index < length) {
18-
position = value.indexOf(protocols[index], fromIndex)
21+
position = value.indexOf(values[index], fromIndex)
1922

20-
if (position !== -1 && (position < min || min === -1)) {
23+
if (position !== -1 && (min === -1 || position < min)) {
2124
min = position
2225
}
2326
}
Lines changed: 152 additions & 95 deletions
Original file line numberDiff line numberDiff line change
@@ -1,153 +1,210 @@
11
'use strict'
22

3+
var ccount = require('ccount')
34
var decode = require('parse-entities')
5+
var decimal = require('is-decimal')
6+
var alphabetical = require('is-alphabetical')
47
var whitespace = require('is-whitespace-character')
58
var locate = require('../locate/url')
69

710
module.exports = url
811
url.locator = locate
912
url.notInLink = true
1013

11-
var quotationMark = '"'
12-
var apostrophe = "'"
13-
var leftParenthesis = '('
14-
var rightParenthesis = ')'
15-
var comma = ','
16-
var dot = '.'
17-
var colon = ':'
18-
var semicolon = ';'
19-
var lessThan = '<'
20-
var atSign = '@'
21-
var leftSquareBracket = '['
22-
var rightSquareBracket = ']'
23-
24-
var http = 'http://'
25-
var https = 'https://'
26-
var mailto = 'mailto:'
27-
28-
var protocols = [http, https, mailto]
29-
30-
var protocolsLength = protocols.length
14+
var exclamationMark = 33 // '!'
15+
var ampersand = 38 // '&'
16+
var rightParenthesis = 41 // ')'
17+
var asterisk = 42 // '*'
18+
var comma = 44 // ','
19+
var dash = 45 // '-'
20+
var dot = 46 // '.'
21+
var colon = 58 // ':'
22+
var semicolon = 59 // ';'
23+
var questionMark = 63 // '?'
24+
var lessThan = 60 // '<'
25+
var underscore = 95 // '_'
26+
var tilde = 126 // '~'
27+
28+
var leftParenthesisCharacter = '('
29+
var rightParenthesisCharacter = ')'
3130

3231
function url(eat, value, silent) {
3332
var self = this
34-
var subvalue
35-
var content
36-
var character
33+
var gfm = self.options.gfm
34+
var tokenizers = self.inlineTokenizers
35+
var length = value.length
36+
var previousDot = -1
37+
var protocolless = false
38+
var dots
39+
var lastTwoPartsStart
40+
var start
3741
var index
38-
var position
39-
var protocol
40-
var match
41-
var length
42-
var queue
43-
var parenCount
44-
var nextCharacter
45-
var tokenizers
42+
var pathStart
43+
var path
44+
var code
45+
var end
46+
var leftCount
47+
var rightCount
48+
var content
49+
var children
50+
var url
4651
var exit
4752

48-
if (!self.options.gfm) {
53+
if (!gfm) {
54+
return
55+
}
56+
57+
// `WWW.` doesn’t work.
58+
if (value.slice(0, 4) === 'www.') {
59+
protocolless = true
60+
index = 4
61+
} else if (value.slice(0, 7).toLowerCase() === 'http://') {
62+
index = 7
63+
} else if (value.slice(0, 8).toLowerCase() === 'https://') {
64+
index = 8
65+
} else {
4966
return
5067
}
5168

52-
subvalue = ''
53-
index = -1
69+
// Act as if the starting boundary is a dot.
70+
previousDot = index - 1
5471

55-
while (++index < protocolsLength) {
56-
protocol = protocols[index]
57-
match = value.slice(0, protocol.length)
72+
// Parse a valid domain.
73+
start = index
74+
dots = []
5875

59-
if (match.toLowerCase() === protocol) {
60-
subvalue = match
61-
break
76+
while (index < length) {
77+
code = value.charCodeAt(index)
78+
79+
if (code === dot) {
80+
// Dots may not appear after each other.
81+
if (previousDot === index - 1) {
82+
break
83+
}
84+
85+
dots.push(index)
86+
previousDot = index
87+
index++
88+
continue
6289
}
90+
91+
if (
92+
decimal(code) ||
93+
alphabetical(code) ||
94+
code === dash ||
95+
code === underscore
96+
) {
97+
index++
98+
continue
99+
}
100+
101+
break
102+
}
103+
104+
// Ignore a final dot:
105+
if (code === dot) {
106+
dots.pop()
107+
index--
63108
}
64109

65-
if (!subvalue) {
110+
// If there are no dots, exit.
111+
if (dots[0] === undefined) {
66112
return
67113
}
68114

69-
index = subvalue.length
70-
length = value.length
71-
queue = ''
72-
parenCount = 0
115+
// If there is an underscore in the last two domain parts, exit:
116+
// `www.example.c_m` and `www.ex_ample.com` are not OK, but
117+
// `www.sub_domain.example.com` is.
118+
lastTwoPartsStart = dots.length < 2 ? start : dots[dots.length - 2] + 1
73119

120+
if (value.slice(lastTwoPartsStart, index).indexOf('_') !== -1) {
121+
return
122+
}
123+
124+
/* istanbul ignore if - never used (yet) */
125+
if (silent) {
126+
return true
127+
}
128+
129+
end = index
130+
pathStart = index
131+
132+
// Parse a path.
74133
while (index < length) {
75-
character = value.charAt(index)
134+
code = value.charCodeAt(index)
76135

77-
if (whitespace(character) || character === lessThan) {
136+
if (whitespace(code) || code === lessThan) {
78137
break
79138
}
80139

140+
index++
141+
81142
if (
82-
character === dot ||
83-
character === comma ||
84-
character === colon ||
85-
character === semicolon ||
86-
character === quotationMark ||
87-
character === apostrophe ||
88-
character === rightParenthesis ||
89-
character === rightSquareBracket
143+
code === exclamationMark ||
144+
code === asterisk ||
145+
code === comma ||
146+
code === dot ||
147+
code === colon ||
148+
code === questionMark ||
149+
code === underscore ||
150+
code === tilde
90151
) {
91-
nextCharacter = value.charAt(index + 1)
92-
93-
if (!nextCharacter || whitespace(nextCharacter)) {
94-
break
95-
}
152+
// Empty
153+
} else {
154+
end = index
96155
}
156+
}
97157

98-
if (character === leftParenthesis || character === leftSquareBracket) {
99-
parenCount++
100-
}
158+
index = end
101159

102-
if (character === rightParenthesis || character === rightSquareBracket) {
103-
parenCount--
160+
// If the path ends in a closing paren, and the count of closing parens is
161+
// higher than the opening count, then remove the superfluous closing parens.
162+
if (value.charCodeAt(index - 1) === rightParenthesis) {
163+
path = value.slice(pathStart, index)
164+
leftCount = ccount(path, leftParenthesisCharacter)
165+
rightCount = ccount(path, rightParenthesisCharacter)
104166

105-
if (parenCount < 0) {
106-
break
107-
}
167+
while (rightCount > leftCount) {
168+
index = pathStart + path.lastIndexOf(rightParenthesisCharacter)
169+
path = value.slice(pathStart, index)
170+
rightCount--
108171
}
109-
110-
queue += character
111-
index++
112172
}
113173

114-
if (!queue) {
115-
return
116-
}
174+
if (value.charCodeAt(index - 1) === semicolon) {
175+
// GitHub doesn’t document this, but final semicolons aren’t part of the
176+
// URL either.
177+
index--
117178

118-
subvalue += queue
119-
content = subvalue
179+
// If the path ends in what looks like an entity, it’s not part of the path.
180+
if (alphabetical(value.charCodeAt(index - 1))) {
181+
end = index - 2
120182

121-
if (protocol === mailto) {
122-
position = queue.indexOf(atSign)
183+
while (alphabetical(value.charCodeAt(end))) {
184+
end--
185+
}
123186

124-
if (position === -1 || position === length - 1) {
125-
return
187+
if (value.charCodeAt(end) === ampersand) {
188+
index = end
189+
}
126190
}
127-
128-
content = content.slice(mailto.length)
129191
}
130192

131-
/* istanbul ignore if - never used (yet) */
132-
if (silent) {
133-
return true
193+
content = value.slice(0, index)
194+
url = decode(content, {nonTerminated: false})
195+
196+
if (protocolless) {
197+
url = 'http://' + url
134198
}
135199

136200
exit = self.enterLink()
137201

138202
// Temporarily remove all tokenizers except text in url.
139-
tokenizers = self.inlineTokenizers
140203
self.inlineTokenizers = {text: tokenizers.text}
141-
142-
content = self.tokenizeInline(content, eat.now())
143-
204+
children = self.tokenizeInline(content, eat.now())
144205
self.inlineTokenizers = tokenizers
206+
145207
exit()
146208

147-
return eat(subvalue)({
148-
type: 'link',
149-
title: null,
150-
url: decode(subvalue, {nonTerminated: false}),
151-
children: content
152-
})
209+
return eat(content)({type: 'link', title: null, url: url, children: children})
153210
}

packages/remark-parse/package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
"types/index.d.ts"
3838
],
3939
"dependencies": {
40+
"ccount": "^1.0.0",
4041
"collapse-white-space": "^1.0.2",
4142
"is-alphabetical": "^1.0.0",
4243
"is-decimal": "^1.0.0",

0 commit comments

Comments
 (0)