WIP: safen the text regex via linear-time scans

davisjam · davisjam · commit 24d4a5efb365 · 2018-04-27T22:29:21.000-04:00
Sketch implementing text regex as a linear-time RegExp imitator.
- A few nits here and there
- I haven't tested all of the offsetOfX routines, so 'npm run test' hangs on some bug
diff --git a/lib/marked.js b/lib/marked.js
@@ -4,7 +4,7 @@
  * https://github.com/markedjs/marked
  */
 
-var NEW_TEXT = false;
+var NEW_TEXT = true;
 
 var doLog = false;
 function log(msg) {
@@ -526,15 +526,146 @@ var inline = {
   code: /^(`+)\s*([\s\S]*?[^`]?)\s*\1(?!`)/,
   br: /^ {2,}\n(?!\s*$)/,
   del: noop,
-  text: /^[\s\S]+?(?=[\\<!\[`*]|\b_| {2,}\n|$)/
+  text: /^[\s\S]+?(?=[\\<!\[`*]|\b_| {2,}\n|$)/ // TODO Vulnerable
 };
 
+// Helper for the offsetOf routines: offset of the match regex.exec finds in str, or -1 if none.
+function offsetOfRegex(str, regex) {
+  var cap = regex.exec(str);
+  if (cap) {
+    log(`offsetOfRegex: str ${str} matches regex ${regex.source}`);
+    return cap.index;
+  }
+  return -1;
+}
+
+// Returns earliest offset of a "special character" (one of \ < ! [ ` *), or -1 if none
+function offsetOfSpecialChars(str) {
+  return offsetOfRegex(str, /[\\<!\[`*]/);
+}
+
+// Returns earliest offset of a possible italics marker, i.e. a '_' at a word boundary (/\b_/), or -1 if none
+function offsetOfItalics (str) {
+  return offsetOfRegex(str, /\b_/);
+}
+
+// Linear-time stand-in for / {2,}\n/: offset where the first run of 2+ spaces that is
+// immediately followed by a newline begins, or -1 if there is no such run.
+function offsetOfSpacesThenNewline(str) {
+  var runStart = -1; // offset where the current run of spaces began; -1 while outside a run
+  for (var pos = 0; pos < str.length; pos++) {
+    var ch = str[pos];
+    if (ch === ' ') {
+      if (runStart === -1) {
+        runStart = pos;
+      }
+      continue;
+    }
+    if (ch === '\n' && runStart !== -1 && pos - runStart >= 2) {
+      return runStart;
+    }
+    runStart = -1;
+  }
+  return -1;
+}
+
+// Returns earliest offset of an "http://" or "https://" scheme prefix, or -1 if none
+function offsetOfHTTP(str) {
+  return offsetOfRegex(str, /https?:\/\//);
+}
+
+// Returns earliest offset of an "ftp://" scheme prefix, or -1 if none
+function offsetOfFTP(str) {
+  return offsetOfRegex(str, /ftp:\/\//);
+}
+
+// Returns earliest offset of the literal "www." (how a www URL starts), or -1 if none
+function offsetOfWWW(str) {
+  return offsetOfRegex(str, /www\./);
+}
+
+// Returns earliest offset of an email-like token (1+ username chars, then '@'), or -1 if none.
+// Linear-time stand-in for the old /[a-zA-Z0-9.!#$%&'*+\/=?^_`{\|}~-]+@/ alternative.
+function offsetOfEmail(str) {
+  var emailUsernameChar = /^[a-zA-Z0-9.!#$%&'*+\/=?^_`{\|}~-]$/; // '^' restored from old regex
+  // Look at every '@'. Only -1 ends the scan, so an '@' at offset 0 cannot hide later ones.
+  for (var atSymbolIx = str.indexOf('@'); atSymbolIx !== -1; atSymbolIx = str.indexOf('@', atSymbolIx + 1)) {
+    // Found an @, work backwards through valid username chars until we run out of string.
+    var i = atSymbolIx;
+    while (0 < i && emailUsernameChar.test(str.charAt(i - 1))) {
+      i--;
+    }
+    // If we found any, this looks like an email.
+    if (i < atSymbolIx) {
+      return i;
+    }
+    // Otherwise resume after this '@'; searching again from atSymbolIx itself never advanced.
+  }
+  return -1;
+}
+
+// Imitates RegExp.exec for the text rule: returns null if str is empty, else an object
+// whose [0] is the text before the earliest break found by any of textBreakFinders (all
+// of str if none finds one) and whose index is the length of that text -- NOT the match
+// start a real RegExp.exec would report.
+// textBreakFinders: array of offsetOfX functions, each mapping a string to the earliest
+// offset of its kind of break, or -1 if there is none.
+function offsetOfTextBreak(str, textBreakFinders) {
+  // Clean code means doing several O(n) operations. A more complex state machine (like a
+  // linear-time regex) might test all options in parallel, but I don't know how to write one.
+  log(`Looking for tb in \'${str}\'`);
+  if (str.length === 0) {
+    return null;
+  }
+  // Must be at least one character of text before the break, so skip the first char.
+  // NOTE(review): offset 0 of strToSearch is treated as a string start, so /\b_/ can match
+  // there even when it would not match at offset 1 of str (e.g. 'a_b') -- confirm intent.
+  var strToSearch = str.slice(1);
+
+  // Find the earliest instance of each kind of text break.
+  var textBreaks = textBreakFinders.map(function (f) {
+    return f(strToSearch);
+  });
+  log(`textBreaks: ${textBreaks}`);
+
+  // Keep only the finders that actually found a break (-1 means "none").
+  var validTextBreaks = textBreaks.filter(function (brk) {
+    return 0 <= brk;
+  });
+
+  var earliestBreakOffset;
+  if (validTextBreaks.length) {
+    // Math.min takes its candidates as separate arguments; handed one array it yields NaN,
+    // hence apply. +1 because strToSearch is missing 1st char of str.
+    earliestBreakOffset = Math.min.apply(null, validTextBreaks) + 1;
+  } else {
+    // No text breaks? Then the whole string is text.
+    earliestBreakOffset = str.length;
+  }
+
+  // Mimic RegExp 'exec' for compatibility.
+  var result = {};
+  result[0] = str.slice(0, earliestBreakOffset);
+  result.index = earliestBreakOffset;
+  log(`Returning: earliestBreakOffset ${earliestBreakOffset} result ${JSON.stringify(result)}`);
+  return result;
+}
+
+// Find earliest text break according to the rules of the Inline Lexer: a special char,
+// an italics '_', or 2+ spaces then a newline. Imitates RegExp.exec (see offsetOfTextBreak)
+function offsetOfTextBreakInline(str) {
+  return offsetOfTextBreak(str, [offsetOfSpecialChars, offsetOfItalics, offsetOfSpacesThenNewline]);
+}
+
+// Find earliest text break according to the rules of the Inline GFM Lexer: the Inline Lexer
+// breaks plus http(s)://, ftp://, www. and email-like tokens. Imitates RegExp.exec
+function offsetOfTextBreakInlineGFM(str) {
+  return offsetOfTextBreak(str, [offsetOfSpecialChars, offsetOfItalics, offsetOfSpacesThenNewline, offsetOfHTTP, offsetOfFTP, offsetOfWWW, offsetOfEmail]);
+}
+
+// Override the vulnerable but readable regex with an object whose exec is the linear-time imitator
 if (NEW_TEXT) {
-  // TODO: If we replace ' {2,}\n' with '  \n' and address trailing whitespace,
-  //       we break the definition of GFM inline.breaks further down (affects the gfm_break test).
-  // Furthermore, we still have trouble with the email pattern substituted in: /|[...]+@/, which
-  // is vulnerable to REDOS just like /| {2,}\n/ was
-  inline.text = /[\s\S](?:[\\<!\[`*]|\b_| {2}\n|$)/;
+  inline.text = { exec: offsetOfTextBreakInline };
 }
 
 inline._escapes = /\\([!"#$%&'()*+,\-./:;<=>?@\[\]\\^_`{|}~])/g;
@@ -599,10 +730,7 @@ inline.gfm = merge({}, inline.normal, {
     .getRegex(),
   _backpedal: /(?:[^?!.,:;*_~()&]+|\([^)]*\)|&(?![a-zA-Z0-9]+;$)|[?!.,:;*_~)]+(?!$))+/,
   del: /^~~(?=\S)([\s\S]*?\S)~~/,
-  text: edit(inline.text)
-    .replace(']|', '~]|')
-    .replace('|', '|https?://|ftp://|www\\.|[a-zA-Z0-9.!#$%&\'*+/=?^_`{\\|}~-]+@|')
-    .getRegex()
+  text: { exec: offsetOfTextBreakInlineGFM } // TODO Missing: .replace(']|', '~]|')
 });
 
 /**
@@ -611,7 +739,7 @@ inline.gfm = merge({}, inline.normal, {
 
 inline.breaks = merge({}, inline.gfm, {
   br: edit(inline.br).replace('{2,}', '*').getRegex(),
-  text: edit(inline.gfm.text).replace('{2,}', '*').getRegex()
+  text: { exec: offsetOfTextBreakInlineGFM } // TODO Missing: inline.gfm.text.replace('{2,}', '*')
 });
 
 /**
@@ -803,16 +931,22 @@ InlineLexer.prototype.output = function(src) {
     }
 
     // text
-    log(`lexer: Matching text: ${this.rules.text.source}\n  <${src}>`);
+    //log(`lexer: Matching text: ${this.rules.text.source}\n  <${src}>`);
     if (cap = this.rules.text.exec(src)) {
       if (NEW_TEXT) {
-        log(`lexer: Match: ${cap} ${cap.index}`);
+        log(`lexer: Match: ${JSON.stringify(cap)} ${cap.index}`);
         var textLen = cap.index + 1;
         // text is not in cap[0], so extract text before advancing src.
         out += this.renderer.text(escape(this.smartypants(src.substr(0, textLen))));
         src = src.substring(textLen);
         continue;
       } else {
+        var offInline = offsetOfTextBreakInline(src);
+        var offInlineGFM = offsetOfTextBreakInlineGFM(src);
+        console.log(`cap ${JSON.stringify(cap)}`);
+        console.log(`offInline ${JSON.stringify(offInline)}`);
+        console.log(`offInlineGFM ${JSON.stringify(offInlineGFM)}`);
+        console.log(`regex ${cap[0].length} offInline ${offInline[0].length} offInlineGFM ${offInlineGFM[0].length}`);
         src = src.substring(cap[0].length);
         out += this.renderer.text(escape(this.smartypants(cap[0])));
         continue;
@@ -1530,6 +1664,8 @@ marked.defaults = marked.getDefaults();
  * Expose
  */
 
+marked(' # # ####A');
+
 marked.Parser = Parser;
 marked.parser = Parser.parse;