|
1 | 1 | 'use strict' |
2 | 2 |
|
| 3 | +var ccount = require('ccount') |
3 | 4 | var decode = require('parse-entities') |
| 5 | +var decimal = require('is-decimal') |
| 6 | +var alphabetical = require('is-alphabetical') |
4 | 7 | var whitespace = require('is-whitespace-character') |
5 | 8 | var locate = require('../locate/url') |
6 | 9 |
|
7 | 10 | module.exports = url |
8 | 11 | url.locator = locate |
9 | 12 | url.notInLink = true |
10 | 13 |
|
11 | | -var quotationMark = '"' |
12 | | -var apostrophe = "'" |
13 | | -var leftParenthesis = '(' |
14 | | -var rightParenthesis = ')' |
15 | | -var comma = ',' |
16 | | -var dot = '.' |
17 | | -var colon = ':' |
18 | | -var semicolon = ';' |
19 | | -var lessThan = '<' |
20 | | -var atSign = '@' |
21 | | -var leftSquareBracket = '[' |
22 | | -var rightSquareBracket = ']' |
23 | | - |
24 | | -var http = 'http://' |
25 | | -var https = 'https://' |
26 | | -var mailto = 'mailto:' |
27 | | - |
28 | | -var protocols = [http, https, mailto] |
29 | | - |
30 | | -var protocolsLength = protocols.length |
| 14 | +var exclamationMark = 33 // '!' |
| 15 | +var ampersand = 38 // '&' |
| 16 | +var rightParenthesis = 41 // ')' |
| 17 | +var asterisk = 42 // '*' |
| 18 | +var comma = 44 // ',' |
| 19 | +var dash = 45 // '-' |
| 20 | +var dot = 46 // '.' |
| 21 | +var colon = 58 // ':' |
| 22 | +var semicolon = 59 // ';' |
| 23 | +var questionMark = 63 // '?' |
| 24 | +var lessThan = 60 // '<' |
| 25 | +var underscore = 95 // '_' |
| 26 | +var tilde = 126 // '~' |
| 27 | + |
| 28 | +var leftParenthesisCharacter = '(' |
| 29 | +var rightParenthesisCharacter = ')' |
31 | 30 |
|
32 | 31 | function url(eat, value, silent) { |
33 | 32 | var self = this |
34 | | - var subvalue |
35 | | - var content |
36 | | - var character |
| 33 | + var gfm = self.options.gfm |
| 34 | + var tokenizers = self.inlineTokenizers |
| 35 | + var length = value.length |
| 36 | + var previousDot = -1 |
| 37 | + var protocolless = false |
| 38 | + var dots |
| 39 | + var lastTwoPartsStart |
| 40 | + var start |
37 | 41 | var index |
38 | | - var position |
39 | | - var protocol |
40 | | - var match |
41 | | - var length |
42 | | - var queue |
43 | | - var parenCount |
44 | | - var nextCharacter |
45 | | - var tokenizers |
| 42 | + var pathStart |
| 43 | + var path |
| 44 | + var code |
| 45 | + var end |
| 46 | + var leftCount |
| 47 | + var rightCount |
| 48 | + var content |
| 49 | + var children |
| 50 | + var url |
46 | 51 | var exit |
47 | 52 |
|
48 | | - if (!self.options.gfm) { |
| 53 | + if (!gfm) { |
| 54 | + return |
| 55 | + } |
| 56 | + |
| 57 | + // `WWW.` doesn’t work. |
| 58 | + if (value.slice(0, 4) === 'www.') { |
| 59 | + protocolless = true |
| 60 | + index = 4 |
| 61 | + } else if (value.slice(0, 7).toLowerCase() === 'http://') { |
| 62 | + index = 7 |
| 63 | + } else if (value.slice(0, 8).toLowerCase() === 'https://') { |
| 64 | + index = 8 |
| 65 | + } else { |
49 | 66 | return |
50 | 67 | } |
51 | 68 |
|
52 | | - subvalue = '' |
53 | | - index = -1 |
| 69 | + // Act as if the starting boundary is a dot. |
| 70 | + previousDot = index - 1 |
54 | 71 |
|
55 | | - while (++index < protocolsLength) { |
56 | | - protocol = protocols[index] |
57 | | - match = value.slice(0, protocol.length) |
| 72 | + // Parse a valid domain. |
| 73 | + start = index |
| 74 | + dots = [] |
58 | 75 |
|
59 | | - if (match.toLowerCase() === protocol) { |
60 | | - subvalue = match |
61 | | - break |
| 76 | + while (index < length) { |
| 77 | + code = value.charCodeAt(index) |
| 78 | + |
| 79 | + if (code === dot) { |
| 80 | + // Dots may not appear after each other. |
| 81 | + if (previousDot === index - 1) { |
| 82 | + break |
| 83 | + } |
| 84 | + |
| 85 | + dots.push(index) |
| 86 | + previousDot = index |
| 87 | + index++ |
| 88 | + continue |
62 | 89 | } |
| 90 | + |
| 91 | + if ( |
| 92 | + decimal(code) || |
| 93 | + alphabetical(code) || |
| 94 | + code === dash || |
| 95 | + code === underscore |
| 96 | + ) { |
| 97 | + index++ |
| 98 | + continue |
| 99 | + } |
| 100 | + |
| 101 | + break |
| 102 | + } |
| 103 | + |
| 104 | + // Ignore a final dot: |
| 105 | + if (code === dot) { |
| 106 | + dots.pop() |
| 107 | + index-- |
63 | 108 | } |
64 | 109 |
|
65 | | - if (!subvalue) { |
| 110 | + // If there are no dots, exit. |
| 111 | + if (dots[0] === undefined) { |
66 | 112 | return |
67 | 113 | } |
68 | 114 |
|
69 | | - index = subvalue.length |
70 | | - length = value.length |
71 | | - queue = '' |
72 | | - parenCount = 0 |
| 115 | + // If there is an underscore in the last two domain parts, exit: |
| 116 | + // `www.example.c_m` and `www.ex_ample.com` are not OK, but |
| 117 | + // `www.sub_domain.example.com` is. |
| 118 | + lastTwoPartsStart = dots.length < 2 ? start : dots[dots.length - 2] + 1 |
73 | 119 |
|
| 120 | + if (value.slice(lastTwoPartsStart, index).indexOf('_') !== -1) { |
| 121 | + return |
| 122 | + } |
| 123 | + |
| 124 | + /* istanbul ignore if - never used (yet) */ |
| 125 | + if (silent) { |
| 126 | + return true |
| 127 | + } |
| 128 | + |
| 129 | + end = index |
| 130 | + pathStart = index |
| 131 | + |
| 132 | + // Parse a path. |
74 | 133 | while (index < length) { |
75 | | - character = value.charAt(index) |
| 134 | + code = value.charCodeAt(index) |
76 | 135 |
|
77 | | - if (whitespace(character) || character === lessThan) { |
| 136 | + if (whitespace(code) || code === lessThan) { |
78 | 137 | break |
79 | 138 | } |
80 | 139 |
|
| 140 | + index++ |
| 141 | + |
81 | 142 | if ( |
82 | | - character === dot || |
83 | | - character === comma || |
84 | | - character === colon || |
85 | | - character === semicolon || |
86 | | - character === quotationMark || |
87 | | - character === apostrophe || |
88 | | - character === rightParenthesis || |
89 | | - character === rightSquareBracket |
| 143 | + code === exclamationMark || |
| 144 | + code === asterisk || |
| 145 | + code === comma || |
| 146 | + code === dot || |
| 147 | + code === colon || |
| 148 | + code === questionMark || |
| 149 | + code === underscore || |
| 150 | + code === tilde |
90 | 151 | ) { |
91 | | - nextCharacter = value.charAt(index + 1) |
92 | | - |
93 | | - if (!nextCharacter || whitespace(nextCharacter)) { |
94 | | - break |
95 | | - } |
| 152 | + // Empty |
| 153 | + } else { |
| 154 | + end = index |
96 | 155 | } |
| 156 | + } |
97 | 157 |
|
98 | | - if (character === leftParenthesis || character === leftSquareBracket) { |
99 | | - parenCount++ |
100 | | - } |
| 158 | + index = end |
101 | 159 |
|
102 | | - if (character === rightParenthesis || character === rightSquareBracket) { |
103 | | - parenCount-- |
| 160 | + // If the path ends in a closing paren, and the count of closing parens is |
| 161 | + // higher than the opening count, then remove the superfluous closing parens. |
| 162 | + if (value.charCodeAt(index - 1) === rightParenthesis) { |
| 163 | + path = value.slice(pathStart, index) |
| 164 | + leftCount = ccount(path, leftParenthesisCharacter) |
| 165 | + rightCount = ccount(path, rightParenthesisCharacter) |
104 | 166 |
|
105 | | - if (parenCount < 0) { |
106 | | - break |
107 | | - } |
| 167 | + while (rightCount > leftCount) { |
| 168 | + index = pathStart + path.lastIndexOf(rightParenthesisCharacter) |
| 169 | + path = value.slice(pathStart, index) |
| 170 | + rightCount-- |
108 | 171 | } |
109 | | - |
110 | | - queue += character |
111 | | - index++ |
112 | 172 | } |
113 | 173 |
|
114 | | - if (!queue) { |
115 | | - return |
116 | | - } |
| 174 | + if (value.charCodeAt(index - 1) === semicolon) { |
| 175 | + // GitHub doesn’t document this, but final semicolons aren’t part of the |
| 176 | + // URL either. |
| 177 | + index-- |
117 | 178 |
|
118 | | - subvalue += queue |
119 | | - content = subvalue |
| 179 | + // If the path ends in what looks like an entity, it’s not part of the path. |
| 180 | + if (alphabetical(value.charCodeAt(index - 1))) { |
| 181 | + end = index - 2 |
120 | 182 |
|
121 | | - if (protocol === mailto) { |
122 | | - position = queue.indexOf(atSign) |
| 183 | + while (alphabetical(value.charCodeAt(end))) { |
| 184 | + end-- |
| 185 | + } |
123 | 186 |
|
124 | | - if (position === -1 || position === length - 1) { |
125 | | - return |
| 187 | + if (value.charCodeAt(end) === ampersand) { |
| 188 | + index = end |
| 189 | + } |
126 | 190 | } |
127 | | - |
128 | | - content = content.slice(mailto.length) |
129 | 191 | } |
130 | 192 |
|
131 | | - /* istanbul ignore if - never used (yet) */ |
132 | | - if (silent) { |
133 | | - return true |
| 193 | + content = value.slice(0, index) |
| 194 | + url = decode(content, {nonTerminated: false}) |
| 195 | + |
| 196 | + if (protocolless) { |
| 197 | + url = 'http://' + url |
134 | 198 | } |
135 | 199 |
|
136 | 200 | exit = self.enterLink() |
137 | 201 |
|
138 | 202 | // Temporarily remove all tokenizers except text in url. |
139 | | - tokenizers = self.inlineTokenizers |
140 | 203 | self.inlineTokenizers = {text: tokenizers.text} |
141 | | - |
142 | | - content = self.tokenizeInline(content, eat.now()) |
143 | | - |
| 204 | + children = self.tokenizeInline(content, eat.now()) |
144 | 205 | self.inlineTokenizers = tokenizers |
| 206 | + |
145 | 207 | exit() |
146 | 208 |
|
147 | | - return eat(subvalue)({ |
148 | | - type: 'link', |
149 | | - title: null, |
150 | | - url: decode(subvalue, {nonTerminated: false}), |
151 | | - children: content |
152 | | - }) |
| 209 | + return eat(content)({type: 'link', title: null, url: url, children: children}) |
153 | 210 | } |
0 commit comments