44 * https://github.com/markedjs/marked
55 */
66
7- var NEW_TEXT = false ;
7+ var NEW_TEXT = true ;
88
99var doLog = false ;
1010function log ( msg ) {
@@ -526,15 +526,146 @@ var inline = {
526526 code : / ^ ( ` + ) \s * ( [ \s \S ] * ?[ ^ ` ] ? ) \s * \1(? ! ` ) / ,
527527 br : / ^ { 2 , } \n (? ! \s * $ ) / ,
528528 del : noop ,
529- text : / ^ [ \s \S ] + ?(? = [ \\ < ! \[ ` * ] | \b _ | { 2 , } \n | $ ) /
529+ text : / ^ [ \s \S ] + ?(? = [ \\ < ! \[ ` * ] | \b _ | { 2 , } \n | $ ) / // TODO Vulnerable
530530} ;
531531
/**
 * Helper for the offsetOf* routines: runs `regex.exec` on `str` and reports
 * where the match begins.
 *
 * @param {string} str - Text to search.
 * @param {RegExp} regex - Pattern to look for.
 * @returns {number} The `index` of the match reported by `regex.exec`, or -1
 *   when there is no match.
 */
function offsetOfRegex(str, regex) {
  var match = regex.exec(str);
  if (!match) {
    return -1;
  }
  log(`offsetOfRegex: str ${str} matches regex ${regex.source}`);
  return match.index;
}
541+
/**
 * Returns the earliest offset of a "special character": one of
 * backslash, `<`, `!`, `[`, backtick or `*`.
 *
 * @param {string} str - Text to search.
 * @returns {number} Offset of the first special character, or -1 if none.
 */
function offsetOfSpecialChars(str) {
  var specialChars = /[\\<!\[`*]/;
  return offsetOfRegex(str, specialChars);
}
546+
/**
 * Returns the earliest offset of an underscore that sits on a word boundary,
 * i.e. a candidate start of an italics command.
 *
 * @param {string} str - Text to search.
 * @returns {number} Offset of the first such underscore, or -1 if none.
 */
function offsetOfItalics(str) {
  var italicsStart = /\b_/;
  return offsetOfRegex(str, italicsStart);
}
551+
/**
 * Returns the earliest offset of a run of two or more spaces that is
 * immediately followed by a newline.
 *
 * Single-pass replacement for searching with / {2,}\n/.
 *
 * @param {string} str - Text to search.
 * @returns {number} Offset at which the qualifying run of spaces begins,
 *   or -1 if there is none.
 */
function offsetOfSpacesThenNewline(str) {
  var runStart = -1;
  var runLength = 0;
  for (var pos = 0; pos < str.length; pos++) {
    var ch = str.charAt(pos);
    if (ch === ' ') {
      // Remember where the current run of spaces started.
      if (runLength === 0) {
        runStart = pos;
      }
      runLength += 1;
      continue;
    }
    if (ch === '\n' && runLength >= 2) {
      return runStart;
    }
    // Any other character (or a newline after fewer than two spaces) ends the run.
    runLength = 0;
  }
  return -1;
}
571+
/**
 * Returns the earliest offset of an "http://" or "https://" protocol prefix.
 *
 * @param {string} str - Text to search.
 * @returns {number} Offset of the first prefix, or -1 if none.
 */
function offsetOfHTTP(str) {
  var httpPrefix = /https?:\/\//;
  return offsetOfRegex(str, httpPrefix);
}
576+
/**
 * Returns the earliest offset of an "ftp://" protocol prefix.
 *
 * @param {string} str - Text to search.
 * @returns {number} Offset of the first prefix, or -1 if none.
 */
function offsetOfFTP(str) {
  var ftpPrefix = /ftp:\/\//;
  return offsetOfRegex(str, ftpPrefix);
}
581+
/**
 * Returns the earliest offset of a "www." URL prefix.
 *
 * @param {string} str - Text to search.
 * @returns {number} Offset of the first prefix, or -1 if none.
 */
function offsetOfWWW(str) {
  var wwwPrefix = /www\./;
  return offsetOfRegex(str, wwwPrefix);
}
586+
/**
 * Returns the earliest offset of something that looks like the start of an
 * email address: one or more username characters immediately followed by '@'.
 *
 * Stand-in for searching with /[a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@/.
 *
 * @param {string} str - Text to search.
 * @returns {number} Offset of the first username character, or -1 if no
 *   '@' in `str` is preceded by a username character.
 */
function offsetOfEmail(str) {
  // '^' is included so the set matches the username class of the regex this replaces.
  var emailUsernameChar = /^[a-zA-Z0-9.!#$%&'*+\/=?^_`{\|}~-]$/;

  // Look for email-like things at every '@', including one at index 0.
  var atSymbolIx = str.indexOf('@');
  while (atSymbolIx !== -1) {
    // Walk backwards over valid username characters preceding the '@'.
    var i = atSymbolIx;
    while (0 < i && emailUsernameChar.test(str.charAt(i - 1))) {
      i--;
    }
    // If we consumed any, this looks like an email.
    if (i < atSymbolIx) {
      return i;
    }
    // Resume strictly after this '@'; searching from the same index would
    // find it again and never terminate.
    atSymbolIx = str.indexOf('@', atSymbolIx + 1);
  }

  return -1;
}
606+
/**
 * Finds the earliest text break in `str` and returns the text before it,
 * in a shape that imitates the result of RegExp.prototype.exec.
 *
 * Each finder is run over `str` minus its first character, because at least
 * one character of text must precede a break. This costs several O(n)
 * passes; a single state machine could test all options in parallel.
 *
 * NOTE(review): because finders never see the first character, a
 * context-sensitive pattern such as /\b_/ evaluates the boundary at offset 1
 * as if it were the start of the string — confirm this is acceptable.
 * NOTE(review): `index` is the length of the leading text, while the inline
 * lexer's NEW_TEXT branch advances by `cap.index + 1` — confirm the two agree.
 *
 * @param {string} str - Remaining source text.
 * @param {Array<function(string): number>} textBreakFinders - Subset of the
 *   offsetOfX functions; each returns an offset, or a negative number when
 *   it finds nothing.
 * @returns {?{0: string, index: number}} null for an empty `str`; otherwise
 *   an object whose [0] is the text before the earliest break (all of `str`
 *   if no finder reports a break) and whose `index` is that text's length.
 */
function offsetOfTextBreak(str, textBreakFinders) {
  log(`Looking for tb in \'${str}\'`);
  if (str.length === 0) {
    return null;
  }
  var strToSearch = str.slice(1); // Must be at least one character of text before the break.

  // Find the earliest instance of each kind of text break.
  var textBreaks = textBreakFinders.map(function (f) {
    return f(strToSearch);
  });
  log(`textBreaks: ${textBreaks}`);

  // Discard finders that reported no match.
  var validTextBreaks = textBreaks.filter(function (brk) {
    return 0 <= brk;
  });

  // Math.min takes its candidates as separate arguments, not as one array,
  // hence apply(). With no valid breaks the whole string is text.
  // The +1 compensates for strToSearch missing the first character of str.
  var earliestBreakOffset = validTextBreaks.length
    ? Math.min.apply(null, validTextBreaks) + 1
    : str.length;

  // Mimic RegExp 'exec' for compatibility.
  var result = {};
  result[0] = str.slice(0, earliestBreakOffset);
  result.index = earliestBreakOffset;
  log(`Returning: earliestBreakOffset ${earliestBreakOffset} result ${JSON.stringify(result)}`);
  return result;
}
653+
/**
 * Finds the earliest text break according to the rules of the Inline Lexer:
 * special characters, italics underscores, and 2+ spaces before a newline.
 * Imitates RegExp.exec.
 *
 * @param {string} str - Remaining source text.
 * @returns {*} Whatever offsetOfTextBreak returns for `str` with these finders.
 */
function offsetOfTextBreakInline(str) {
  var inlineFinders = [
    offsetOfSpecialChars,
    offsetOfItalics,
    offsetOfSpacesThenNewline
  ];
  return offsetOfTextBreak(str, inlineFinders);
}
659+
/**
 * Finds the earliest text break according to the rules of the Inline GFM
 * Lexer. Imitates RegExp.exec.
 *
 * On top of the plain inline breaks this also stops at '~' (so a '~~' span
 * following plain text is not swallowed as text), at http/https/ftp
 * protocols, at "www." and at email-like usernames.
 *
 * @param {string} str - Remaining source text.
 * @returns {*} Whatever offsetOfTextBreak returns for `str` with these finders.
 */
function offsetOfTextBreakInlineGFM(str) {
  // Same set as offsetOfSpecialChars plus '~', mirroring the
  // .replace(']|', '~]|') step of the regex-based GFM text rule.
  function offsetOfSpecialCharsGFM(s) {
    return offsetOfRegex(s, /[\\<!\[`*~]/);
  }

  return offsetOfTextBreak(str, [
    offsetOfSpecialCharsGFM,
    offsetOfItalics,
    offsetOfSpacesThenNewline,
    offsetOfHTTP,
    offsetOfFTP,
    offsetOfWWW,
    offsetOfEmail
  ]);
}
665+
666+ // Override vulnerable but readable regex
532667if ( NEW_TEXT ) {
533- // TODO: If we replace ' {2,}\n' with ' \n' and address trailing whitespace,
534- // we break the definition of GFM inline.breaks further down (affects the gfm_break test).
535- // Furthermore, we still have trouble with the email pattern substituted in: /|[...]+@/, which
536- // is vulnerable to REDOS just like /| {2,}\n/ was
537- inline . text = / [ \s \S ] (?: [ \\ < ! \[ ` * ] | \b _ | { 2 } \n | $ ) / ;
668+ inline . text = { exec : offsetOfTextBreakInline } ;
538669}
539670
540671inline . _escapes = / \\ ( [ ! " # $ % & ' ( ) * + , \- . / : ; < = > ? @ \[ \] \\ ^ _ ` { | } ~ ] ) / g;
@@ -599,10 +730,7 @@ inline.gfm = merge({}, inline.normal, {
599730 . getRegex ( ) ,
600731 _backpedal : / (?: [ ^ ? ! . , : ; * _ ~ ( ) & ] + | \( [ ^ ) ] * \) | & (? ! [ a - z A - Z 0 - 9 ] + ; $ ) | [ ? ! . , : ; * _ ~ ) ] + (? ! $ ) ) + / ,
601732 del : / ^ ~ ~ (? = \S ) ( [ \s \S ] * ?\S ) ~ ~ / ,
602- text : edit ( inline . text )
603- . replace ( ']|' , '~]|' )
604- . replace ( '|' , '|https?://|ftp://|www\\.|[a-zA-Z0-9.!#$%&\'*+/=?^_`{\\|}~-]+@|' )
605- . getRegex ( )
733+ text : { exec : offsetOfTextBreakInlineGFM } // TODO Missing: .replace(']|', '~]|')
606734} ) ;
607735
608736/**
@@ -611,7 +739,7 @@ inline.gfm = merge({}, inline.normal, {
611739
612740inline . breaks = merge ( { } , inline . gfm , {
613741 br : edit ( inline . br ) . replace ( '{2,}' , '*' ) . getRegex ( ) ,
614- text : edit ( inline . gfm . text ) . replace ( '{2,}' , '*' ) . getRegex ( )
742+ text : { exec : offsetOfTextBreakInlineGFM } // TODO Missing: inline.gfm.text.replace('{2,}', '*')
615743} ) ;
616744
617745/**
@@ -803,16 +931,22 @@ InlineLexer.prototype.output = function(src) {
803931 }
804932
805933 // text
806- log ( `lexer: Matching text: ${ this . rules . text . source } \n <${ src } >` ) ;
934+ // log(`lexer: Matching text: ${this.rules.text.source}\n <${src}>`);
807935 if ( cap = this . rules . text . exec ( src ) ) {
808936 if ( NEW_TEXT ) {
809- log ( `lexer: Match: ${ cap } ${ cap . index } ` ) ;
937+ log ( `lexer: Match: ${ JSON . stringify ( cap ) } ${ cap . index } ` ) ;
810938 var textLen = cap . index + 1 ;
811939 // text is not in cap[0], so extract text before advancing src.
812940 out += this . renderer . text ( escape ( this . smartypants ( src . substr ( 0 , textLen ) ) ) ) ;
813941 src = src . substring ( textLen ) ;
814942 continue ;
815943 } else {
944+ var offInline = offsetOfTextBreakInline ( src ) ;
945+ var offInlineGFM = offsetOfTextBreakInlineGFM ( src ) ;
946+ console . log ( `cap ${ JSON . stringify ( cap ) } ` ) ;
947+ console . log ( `offInline ${ JSON . stringify ( offInline ) } ` ) ;
948+ console . log ( `offInlineGFM ${ JSON . stringify ( offInlineGFM ) } ` ) ;
949+ console . log ( `regex ${ cap [ 0 ] . length } offInline ${ offInline [ 0 ] . length } offInlineGFM ${ offInlineGFM [ 0 ] . length } ` ) ;
816950 src = src . substring ( cap [ 0 ] . length ) ;
817951 out += this . renderer . text ( escape ( this . smartypants ( cap [ 0 ] ) ) ) ;
818952 continue ;
@@ -1530,6 +1664,8 @@ marked.defaults = marked.getDefaults();
15301664 * Expose
15311665 */
15321666
1667+ marked ( ' # # ####A' ) ;
1668+
15331669marked . Parser = Parser ;
15341670marked . parser = Parser . parse ;
15351671
0 commit comments