Unicode correctness

jamesdbrock · jamesdbrock · commit b8291d8dff53 · 2021-09-23T00:27:48.000+09:00
Correctly handle UTF-16 surrogate pairs in `String`s. We are not quite making the default `CodePoint`, as was discussed in #76 (comment) . Rather we are keeping most of the current API and making it work properly with astral Unicode. We keep the `Char` parsers for ergonomic reasons. For example the parser `char :: forall s m. Monad m => Char -> ParserT s m Char`. This parser is usually called with a literal like `char 'a'`. It would be annoying to call this parser with `char (codePointFromChar 'a')`. Add primitive parsers `anyCodePoint` and `satisfyCodePoint` for parsing `CodePoint`s. To make this library handle Unicode correctly, it is necessary to delete the `StringLike` class. `StringLike` has no laws, and during the five years of its life, no-one on Github has ever written another instance of `StringLike`. https://github.com/search?l=&q=StringLike+language%3APureScript&type=code Move `updatePosString` to the `Text.Parsing.Parser.String` module and don't export it. Add the `match` combinator. Change the definition of `whiteSpace` and `skipSpaces` to `Data.CodePoint.Unicode.isSpace`. Move the character class parsers from `Text.Parsing.Parser.Token` module into the `Text.Parsing.Parser.String` module. All prior tests pass with no modifications. Add a few new tests.
diff --git a/spago.dhall b/spago.dhall
@@ -20,6 +20,7 @@
   , "transformers"
   , "tuples"
   , "unicode"
+  , "unsafe-coerce"
   ]
 , packages = ./packages.dhall
 , sources = [ "src/**/*.purs", "test/**/*.purs" ]
diff --git a/src/Text/Parsing/Parser/Language.purs b/src/Text/Parsing/Parser/Language.purs
@@ -12,8 +12,8 @@ import Prelude
 
 import Control.Alt ((<|>))
 import Text.Parsing.Parser (ParserT)
-import Text.Parsing.Parser.String (char, oneOf)
-import Text.Parsing.Parser.Token (LanguageDef, TokenParser, GenLanguageDef(..), unGenLanguageDef, makeTokenParser, alphaNum, letter)
+import Text.Parsing.Parser.String (char, oneOf, alphaNum, letter)
+import Text.Parsing.Parser.Token (LanguageDef, TokenParser, GenLanguageDef(..), unGenLanguageDef, makeTokenParser)
 
 -----------------------------------------------------------
 -- Styles: haskellStyle, javaStyle
diff --git a/src/Text/Parsing/Parser/Pos.purs b/src/Text/Parsing/Parser/Pos.purs
@@ -1,10 +1,8 @@
 module Text.Parsing.Parser.Pos where
 
 import Prelude
+
 import Data.Generic.Rep (class Generic)
-import Data.Foldable (foldl)
-import Data.Newtype (wrap)
-import Data.String (split)
 
 -- | `Position` represents the position of the parser in the input.
 -- |
@@ -27,13 +25,3 @@ derive instance ordPosition :: Ord Position
 -- | The `Position` before any input has been parsed.
 initialPos :: Position
 initialPos = Position { line: 1, column: 1 }
-
--- | Updates a `Position` by adding the columns and lines in `String`.
-updatePosString :: Position -> String -> Position
-updatePosString pos' str = foldl updatePosChar pos' (split (wrap "") str)
-  where
-  updatePosChar (Position pos) c = case c of
-    "\n" -> Position { line: pos.line + 1, column: 1 }
-    "\r" -> Position { line: pos.line + 1, column: 1 }
-    "\t" -> Position { line: pos.line,     column: pos.column + 8 - ((pos.column - 1) `mod` 8) }
-    _    -> Position { line: pos.line,     column: pos.column + 1 }
diff --git a/src/Text/Parsing/Parser/String.purs b/src/Text/Parsing/Parser/String.purs
@@ -1,43 +1,67 @@
 -- | Primitive parsers for working with an input stream of type `String`.
-
-module Text.Parsing.Parser.String where
+-- |
+-- | These primitive parsers all operate on primitive `String` inputs.
+-- | In most JavaScript runtime environments, the `String` is encoded
+-- | as little-endian [UTF-16](https://en.wikipedia.org/wiki/UTF-16), but
+-- | these primitive parsers should work with any runtime encoding.
+-- |
+-- | The primitive parsers which return `Char` will only succeed when the character
+-- | being parsed is a code point in the
+-- | [Basic Multilingual Plane](https://en.wikipedia.org/wiki/Plane_(Unicode)#Basic_Multilingual_Plane)
+-- | (the “BMP”). These parsers can be convenient because of the good support
+-- | that PureScript has for writing `Char` literals like `'あ', 'β', 'C'`.
+-- |
+-- | The other primitive parsers, which return `CodePoint` and `String` types,
+-- | can parse the full Unicode character set. All of the primitive parsers
+-- | in this module can be used together.
+module Text.Parsing.Parser.String
+( string
+, eof
+, anyChar
+, anyCodePoint
+, satisfy
+, satisfyCodePoint
+, char
+, whiteSpace
+, skipSpaces
+, oneOf
+, noneOf
+, match
+, digit
+, hexDigit
+, octDigit
+, upper
+, space
+, letter
+, alphaNum
+)
+where
 
 import Prelude hiding (between)
 
-import Control.Monad.State (gets, modify_)
-import Data.Array (many)
-import Data.Foldable (elem, notElem)
+import Control.Monad.State (get, gets, modify_)
+import Data.Array (notElem)
+import Data.Char (fromCharCode)
+import Data.CodePoint.Unicode (isAlpha, isAlphaNum, isDecDigit, isHexDigit, isOctDigit, isSpace, isUpper)
+import Data.Foldable (elem)
 import Data.Maybe (Maybe(..))
 import Data.Newtype (wrap)
-import Data.String (Pattern)
-import Data.String as S
+import Data.String (CodePoint, codePointFromChar, null, stripPrefix, uncons)
 import Data.String.CodeUnits as SCU
+import Data.Tuple (Tuple(..), fst)
 import Text.Parsing.Parser (ParseState(..), ParserT, fail)
-import Text.Parsing.Parser.Combinators (tryRethrow, (<?>))
-import Text.Parsing.Parser.Pos (updatePosString)
-
--- | This class exists to abstract over streams which support the string-like
--- | operations which this modules needs.
-class StringLike s where
-  drop :: Int -> s -> s
-  stripPrefix :: Pattern -> s -> Maybe s
-  null :: s -> Boolean
-  uncons :: s -> Maybe { head :: Char, tail :: s }
-
-instance stringLikeString :: StringLike String where
-  uncons = SCU.uncons
-  drop = S.drop
-  stripPrefix = S.stripPrefix
-  null = S.null
+import Text.Parsing.Parser.Combinators (skipMany, tryRethrow, (<?>))
+import Text.Parsing.Parser.Pos (Position(..))
+import Unsafe.Coerce (unsafeCoerce)
 
 -- | Match end-of-file.
-eof :: forall s m. StringLike s => Monad m => ParserT s m Unit
+eof :: forall m. Monad m => ParserT String m Unit
 eof = do
   input <- gets \(ParseState input _ _) -> input
   unless (null input) (fail "Expected EOF")
 
 -- | Match the specified string.
-string :: forall s m. StringLike s => Monad m => String -> ParserT s m String
+string :: forall m. Monad m => String -> ParserT String m String
 string str = do
   input <- gets \(ParseState input _ _) -> input
   case stripPrefix (wrap str) input of
@@ -49,44 +73,127 @@ string str = do
       pure str
     _ -> fail ("Expected " <> show str)
 
--- | Match any character.
-anyChar :: forall s m. StringLike s => Monad m => ParserT s m Char
-anyChar = do
+-- | Match any BMP `Char`.
+-- | Parser will fail if the character is not in the Basic Multilingual Plane.
+anyChar :: forall m. Monad m => ParserT String m Char
+anyChar = tryRethrow do
+  cp :: Int <- unsafeCoerce <$> anyCodePoint
+  -- the `fromCharCode` function doesn't check if this is beyond the
+  -- BMP, so we check that ourselves.
+  -- https://github.com/purescript/purescript-strings/issues/153
+  if cp > 65535 -- BMP
+    then fail "Not a Char"
+  else case fromCharCode cp of
+    Nothing -> fail "Not a Char"
+    Just c -> pure c
+
+-- | Match any Unicode character.
+-- | Always succeeds.
+anyCodePoint :: forall m. Monad m => ParserT String m CodePoint
+anyCodePoint = do
   input <- gets \(ParseState input _ _) -> input
   case uncons input of
     Nothing -> fail "Unexpected EOF"
     Just { head, tail } -> do
       modify_ \(ParseState _ position _) ->
-        ParseState tail
-                   (updatePosString position (SCU.singleton head))
-                   true
+        ParseState tail (updatePosSingle position head) true
       pure head
 
--- | Match a character satisfying the specified predicate.
-satisfy :: forall s m. StringLike s => Monad m => (Char -> Boolean) -> ParserT s m Char
+-- | Match a BMP `Char` satisfying the predicate.
+satisfy :: forall m. Monad m => (Char -> Boolean) -> ParserT String m Char
 satisfy f = tryRethrow do
   c <- anyChar
-  if f c then pure c
-         else fail $ "Character '" <> SCU.singleton c <> "' did not satisfy predicate"
+  if f c
+    then pure c
+    else fail "Predicate unsatisfied"
+
+-- | Match a Unicode character satisfying the predicate.
+satisfyCodePoint :: forall m. Monad m => (CodePoint -> Boolean) -> ParserT String m CodePoint
+satisfyCodePoint f = tryRethrow do
+  c <- anyCodePoint
+  if f c
+    then pure c
+    else fail "Predicate unsatisfied"
 
--- | Match the specified character
-char :: forall s m. StringLike s => Monad m => Char -> ParserT s m Char
+-- | Match the specified BMP `Char`.
+char :: forall m. Monad m => Char -> ParserT String m Char
 char c = satisfy (_ == c) <?> show c
 
--- | Match zero or more whitespace characters.
-whiteSpace :: forall s m. StringLike s => Monad m => ParserT s m String
-whiteSpace = do
-  cs <- many $ satisfy \c -> c == '\n' || c == '\r' || c == ' ' || c == '\t'
-  pure $ SCU.fromCharArray cs
+-- | Match zero or more whitespace characters satisfying
+-- | `Data.CodePoint.Unicode.isSpace`.
+whiteSpace :: forall m. Monad m => ParserT String m String
+whiteSpace = fst <$> match skipSpaces
 
 -- | Skip whitespace characters.
-skipSpaces :: forall s m. StringLike s => Monad m => ParserT s m Unit
-skipSpaces = void whiteSpace
+skipSpaces :: forall m. Monad m => ParserT String m Unit
+skipSpaces = skipMany (satisfyCodePoint isSpace)
 
--- | Match one of the characters in the array.
-oneOf :: forall s m. StringLike s => Monad m => Array Char -> ParserT s m Char
+-- | Match one of the BMP `Char`s in the array.
+oneOf :: forall m. Monad m => Array Char -> ParserT String m Char
 oneOf ss = satisfy (flip elem ss) <?> ("one of " <> show ss)
 
--- | Match any character not in the array.
-noneOf :: forall s m. StringLike s => Monad m => Array Char -> ParserT s m Char
+-- | Match any BMP `Char` not in the array.
+noneOf :: forall m. Monad m => Array Char -> ParserT String m Char
 noneOf ss = satisfy (flip notElem ss) <?> ("none of " <> show ss)
+
+-- | Updates a `Position` by adding the columns and lines in `String`.
+updatePosString :: Position -> String -> Position
+updatePosString pos str = case uncons str of
+  Nothing -> pos
+  Just {head,tail} -> updatePosString (updatePosSingle pos head) tail -- tail recursive
+
+-- | Updates a `Position` by adding the columns and lines in a
+-- | single `CodePoint`.
+updatePosSingle :: Position -> CodePoint -> Position
+updatePosSingle (Position {line,column}) cp = case unsafeCoerce cp of
+  10 -> Position { line: line + 1, column: 1 } -- "\n"
+  13 -> Position { line: line + 1, column: 1 } -- "\r"
+  9  -> Position { line, column: column + 8 - ((column - 1) `mod` 8) } -- "\t" Who says that one tab is 8 columns?
+  _  -> Position { line, column: column + 1 }
+
+-- | Combinator which returns both the result of a parse and the portion of
+-- | the input that was consumed while it was being parsed.
+match :: forall m a. Monad m => ParserT String m a -> ParserT String m (Tuple String a)
+match p = do
+  ParseState input1 _ _ <- get
+  x <- p
+  ParseState input2 _ _ <- get
+  -- We use the `SCU.length`, which is in units of “code units”
+  -- instead of `Data.String.length`. which is in units of “code points”.
+  -- This is more efficient, and it will be correct as long as we can assume
+  -- the invariant that the `ParseState input` always begins on a code point
+  -- boundary.
+  pure $ Tuple (SCU.take (SCU.length input1 - SCU.length input2) input1) x
+
+-- Helper function.
+satisfyCP :: forall m . Monad m => (CodePoint -> Boolean) -> ParserT String m Char
+satisfyCP p = satisfy (p <<< codePointFromChar)
+
+-- | Parse a digit. Matches any BMP `Char` that satisfies `Data.CodePoint.Unicode.isDecDigit`.
+digit :: forall m . Monad m => ParserT String m Char
+digit = satisfyCP isDecDigit <?> "digit"
+
+-- | Parse a hex digit. Matches any BMP `Char` that satisfies `Data.CodePoint.Unicode.isHexDigit`.
+hexDigit :: forall m . Monad m => ParserT String m Char
+hexDigit = satisfyCP isHexDigit <?> "hex digit"
+
+-- | Parse an octal digit. Matches any BMP `Char` that satisfies `Data.CodePoint.Unicode.isOctDigit`.
+octDigit :: forall m . Monad m => ParserT String m Char
+octDigit = satisfyCP isOctDigit <?> "oct digit"
+
+-- | Parse an uppercase letter. Matches any BMP `Char` that satisfies `Data.CodePoint.Unicode.isUpper`.
+upper :: forall m . Monad m => ParserT String m Char
+upper = satisfyCP isUpper <?> "uppercase letter"
+
+-- | Parse a space character. Matches any BMP `Char` that satisfies `Data.CodePoint.Unicode.isSpace`.
+space :: forall m . Monad m => ParserT String m Char
+space = satisfyCP isSpace <?> "space"
+
+-- | Parse an alphabetical character. Matches any BMP `Char` that satisfies `Data.CodePoint.Unicode.isAlpha`.
+letter :: forall m . Monad m => ParserT String m Char
+letter = satisfyCP isAlpha <?> "letter"
+
+-- | Parse an alphabetical or numerical character.
+-- | Matches any BMP `Char` that satisfies `Data.CodePoint.Unicode.isAlphaNum`.
+alphaNum :: forall m . Monad m => ParserT String m Char
+alphaNum = satisfyCP isAlphaNum <?> "letter or digit"
diff --git a/src/Text/Parsing/Parser/Token.purs b/src/Text/Parsing/Parser/Token.purs
@@ -10,28 +10,17 @@ module Text.Parsing.Parser.Token
   , TokenParser
   , GenTokenParser
   , makeTokenParser
-  -- should these be exported?  Maybe they should go in a different module?
-  , digit
-  , hexDigit
-  , octDigit
-  , upper
-  , space
-  , letter
-  , alphaNum
   )
     where
 
-import Prelude hiding (when,between)
+import Prelude hiding (when, between)
 
 import Control.Lazy (fix)
 import Control.Monad.State (gets, modify_)
 import Control.MonadPlus (guard, (<|>))
 import Data.Array as Array
-import Data.String.CodeUnits (toChar, singleton) as CodeUnits
-import Data.String.CodePoints (CodePoint, codePointFromChar)
 import Data.Char (fromCharCode, toCharCode)
-import Data.CodePoint.Unicode (isAlpha, isAlphaNum, isDecDigit, isHexDigit, isOctDigit, isSpace, isUpper, hexDigitToInt)
-import Data.String.Unicode as Unicode
+import Data.CodePoint.Unicode (hexDigitToInt, isAlpha, isSpace)
 import Data.Either (Either(..))
 import Data.Foldable (foldl, foldr)
 import Data.Identity (Identity)
@@ -41,13 +30,16 @@ import Data.List as List
 import Data.List.NonEmpty (NonEmptyList)
 import Data.Maybe (Maybe(..), maybe)
 import Data.String (null, toLower)
+import Data.String.CodePoints (codePointFromChar)
+import Data.String.CodeUnits (toChar, singleton) as CodeUnits
 import Data.String.CodeUnits as SCU
+import Data.String.Unicode as Unicode
 import Data.Tuple (Tuple(..))
 import Math (pow)
 import Text.Parsing.Parser (ParseState(..), ParserT, fail)
 import Text.Parsing.Parser.Combinators (skipMany1, try, tryRethrow, skipMany, notFollowedBy, option, choice, between, sepBy1, sepBy, (<?>), (<??>))
 import Text.Parsing.Parser.Pos (Position)
-import Text.Parsing.Parser.String (satisfy, oneOf, noneOf, string, char)
+import Text.Parsing.Parser.String (char, digit, hexDigit, noneOf, octDigit, oneOf, satisfy, satisfyCodePoint, space, string, upper)
 
 -- | Create a parser which Returns the first token in the stream.
 token :: forall m a. Monad m => (a -> Position) -> ParserT (List a) m a
@@ -746,7 +738,7 @@ whiteSpace' langDef@(LanguageDef languageDef)
         skipMany (simpleSpace <|> oneLineComment langDef <|> multiLineComment langDef <?> "")
 
 simpleSpace :: forall m . Monad m => ParserT String m Unit
-simpleSpace = skipMany1 (satisfyCP isSpace)
+simpleSpace = skipMany1 (satisfyCodePoint isSpace)
 
 oneLineComment :: forall m . Monad m => GenLanguageDef String m -> ParserT String m Unit
 oneLineComment (LanguageDef languageDef) =
@@ -780,39 +772,3 @@ inCommentSingle (LanguageDef languageDef) =
   where
     startEnd :: Array Char
     startEnd = SCU.toCharArray languageDef.commentEnd <> SCU.toCharArray languageDef.commentStart
-
--------------------------------------------------------------------------
--- Helper functions that should maybe go in Text.Parsing.Parser.String --
--------------------------------------------------------------------------
-
-satisfyCP :: forall m . Monad m => (CodePoint -> Boolean) -> ParserT String m Char
-satisfyCP p = satisfy (p <<< codePointFromChar)
-
--- | Parse a digit.  Matches any char that satisfies `Data.CodePoint.Unicode.isDecDigit`.
-digit :: forall m . Monad m => ParserT String m Char
-digit = satisfyCP isDecDigit <?> "digit"
-
--- | Parse a hex digit.  Matches any char that satisfies `Data.CodePoint.Unicode.isHexDigit`.
-hexDigit :: forall m . Monad m => ParserT String m Char
-hexDigit = satisfyCP isHexDigit <?> "hex digit"
-
--- | Parse an octal digit.  Matches any char that satisfies `Data.CodePoint.Unicode.isOctDigit`.
-octDigit :: forall m . Monad m => ParserT String m Char
-octDigit = satisfyCP isOctDigit <?> "oct digit"
-
--- | Parse an uppercase letter.  Matches any char that satisfies `Data.CodePoint.Unicode.isUpper`.
-upper :: forall m . Monad m => ParserT String m Char
-upper = satisfyCP isUpper <?> "uppercase letter"
-
--- | Parse a space character.  Matches any char that satisfies `Data.CodePoint.Unicode.isSpace`.
-space :: forall m . Monad m => ParserT String m Char
-space = satisfyCP isSpace <?> "space"
-
--- | Parse an alphabetical character.  Matches any char that satisfies `Data.CodePoint.Unicode.isAlpha`.
-letter :: forall m . Monad m => ParserT String m Char
-letter = satisfyCP isAlpha <?> "letter"
-
--- | Parse an alphabetical or numerical character.
--- | Matches any char that satisfies `Data.CodePoint.Unicode.isAlphaNum`.
-alphaNum :: forall m . Monad m => ParserT String m Char
-alphaNum = satisfyCP isAlphaNum <?> "letter or digit"
diff --git a/test/Main.purs b/test/Main.purs

Original file line number	Diff line number	Diff line change
`@@ -20,6 +20,7 @@`
`20`	`20`	`, "transformers"`
`21`	`21`	`, "tuples"`
`22`	`22`	`, "unicode"`
	`23`	`+ , "unsafe-coerce"`
`23`	`24`	`]`
`24`	`25`	`, packages = ./packages.dhall`
`25`	`26`	`, sources = [ "src/*/.purs", "test/*/.purs" ]`