11-- | Primitive parsers for working with an input stream of type `String`.
2-
3- module Text.Parsing.Parser.String where
2+ -- |
3+ -- | These primitive parsers all operate on primitive `String` inputs.
4+ -- | In most JavaScript runtime environments, the `String` is encoded
5+ -- | as little-endian [UTF-16](https://en.wikipedia.org/wiki/UTF-16), but
6+ -- | these primitive parsers should work with any runtime encoding.
7+ -- |
8+ -- | The primitive parsers which return `Char` will only succeed when the character
9+ -- | being parsed is a code point in the
10+ -- | [Basic Multilingual Plane](https://en.wikipedia.org/wiki/Plane_(Unicode)#Basic_Multilingual_Plane)
11+ -- | (the “BMP”). These parsers can be convenient because of the good support
12+ -- | that PureScript has for writing `Char` literals like `'あ', 'β', 'C'`.
13+ -- |
14+ -- | The other primitive parsers, which return `CodePoint` and `String` types,
15+ -- | can parse the full Unicode character set. All of the primitive parsers
16+ -- | in this module can be used together.
17+ module Text.Parsing.Parser.String
18+ ( string
19+ , eof
20+ , anyChar
21+ , anyCodePoint
22+ , satisfy
23+ , satisfyCodePoint
24+ , char
25+ , whiteSpace
26+ , skipSpaces
27+ , oneOf
28+ , noneOf
29+ , match
30+ , digit
31+ , hexDigit
32+ , octDigit
33+ , upper
34+ , space
35+ , letter
36+ , alphaNum
37+ )
38+ where
439
540import Prelude hiding (between )
641
7- import Control.Monad.State (gets , modify_ )
8- import Data.Array (many )
9- import Data.Foldable (elem , notElem )
42+ import Control.Monad.State (get , gets , modify_ )
43+ import Data.Array (notElem )
44+ import Data.Char (fromCharCode )
45+ import Data.CodePoint.Unicode (isAlpha , isAlphaNum , isDecDigit , isHexDigit , isOctDigit , isSpace , isUpper )
46+ import Data.Foldable (elem )
1047import Data.Maybe (Maybe (..))
1148import Data.Newtype (wrap )
12- import Data.String (Pattern )
13- import Data.String as S
49+ import Data.String (CodePoint , codePointFromChar , null , stripPrefix , uncons )
1450import Data.String.CodeUnits as SCU
51+ import Data.Tuple (Tuple (..), fst )
1552import Text.Parsing.Parser (ParseState (..), ParserT , fail )
16- import Text.Parsing.Parser.Combinators (tryRethrow , (<?>))
17- import Text.Parsing.Parser.Pos (updatePosString )
18-
19- -- | This class exists to abstract over streams which support the string-like
20- -- | operations which this modules needs.
21- class StringLike s where
22- drop :: Int -> s -> s
23- stripPrefix :: Pattern -> s -> Maybe s
24- null :: s -> Boolean
25- uncons :: s -> Maybe { head :: Char , tail :: s }
26-
27- instance stringLikeString :: StringLike String where
28- uncons = SCU .uncons
29- drop = S .drop
30- stripPrefix = S .stripPrefix
31- null = S .null
53+ import Text.Parsing.Parser.Combinators (skipMany , tryRethrow , (<?>))
54+ import Text.Parsing.Parser.Pos (Position (..))
55+ import Unsafe.Coerce (unsafeCoerce )
3256
3357-- | Match end-of-file.
34- eof :: forall s m . StringLike s => Monad m => ParserT s m Unit
58+ eof :: forall m . Monad m => ParserT String m Unit
3559eof = do
3660 input <- gets \(ParseState input _ _) -> input
3761 unless (null input) (fail " Expected EOF" )
3862
3963-- | Match the specified string.
40- string :: forall s m . StringLike s => Monad m => String -> ParserT s m String
64+ string :: forall m . Monad m => String -> ParserT String m String
4165string str = do
4266 input <- gets \(ParseState input _ _) -> input
4367 case stripPrefix (wrap str) input of
@@ -49,44 +73,127 @@ string str = do
4973 pure str
5074 _ -> fail (" Expected " <> show str)
5175
52- -- | Match any character.
53- anyChar :: forall s m . StringLike s => Monad m => ParserT s m Char
54- anyChar = do
76+ -- | Match any BMP `Char`.
77+ -- | Parser will fail if the character is not in the Basic Multilingual Plane.
78+ anyChar :: forall m . Monad m => ParserT String m Char
79+ anyChar = tryRethrow do
80+ cp :: Int <- unsafeCoerce <$> anyCodePoint
81+ -- the `fromCharCode` function doesn't check if this is beyond the
82+ -- BMP, so we check that ourselves.
83+ -- https://github.com/purescript/purescript-strings/issues/153
84+ if cp > 65535 -- BMP
85+ then fail " Not a Char"
86+ else case fromCharCode cp of
87+ Nothing -> fail " Not a Char"
88+ Just c -> pure c
89+
90+ -- | Match any Unicode character.
91+ -- | Always succeeds.
92+ anyCodePoint :: forall m . Monad m => ParserT String m CodePoint
93+ anyCodePoint = do
5594 input <- gets \(ParseState input _ _) -> input
5695 case uncons input of
5796 Nothing -> fail " Unexpected EOF"
5897 Just { head, tail } -> do
5998 modify_ \(ParseState _ position _) ->
60- ParseState tail
61- (updatePosString position (SCU .singleton head))
62- true
99+ ParseState tail (updatePosSingle position head) true
63100 pure head
64101
65- -- | Match a character satisfying the specified predicate.
66- satisfy :: forall s m . StringLike s => Monad m => (Char -> Boolean ) -> ParserT s m Char
102+ -- | Match a BMP `Char` satisfying the predicate.
103+ satisfy :: forall m . Monad m => (Char -> Boolean ) -> ParserT String m Char
67104satisfy f = tryRethrow do
68105 c <- anyChar
69- if f c then pure c
70- else fail $ " Character '" <> SCU .singleton c <> " ' did not satisfy predicate"
106+ if f c
107+ then pure c
108+ else fail " Predicate unsatisfied"
109+
110+ -- | Match a Unicode character satisfying the predicate.
111+ satisfyCodePoint :: forall m . Monad m => (CodePoint -> Boolean ) -> ParserT String m CodePoint
112+ satisfyCodePoint f = tryRethrow do
113+ c <- anyCodePoint
114+ if f c
115+ then pure c
116+ else fail " Predicate unsatisfied"
71117
72- -- | Match the specified character
73- char :: forall s m . StringLike s => Monad m => Char -> ParserT s m Char
118+ -- | Match the specified BMP `Char`.
119+ char :: forall m . Monad m => Char -> ParserT String m Char
74120char c = satisfy (_ == c) <?> show c
75121
76- -- | Match zero or more whitespace characters.
77- whiteSpace :: forall s m . StringLike s => Monad m => ParserT s m String
78- whiteSpace = do
79- cs <- many $ satisfy \c -> c == ' \n ' || c == ' \r ' || c == ' ' || c == ' \t '
80- pure $ SCU .fromCharArray cs
122+ -- | Match zero or more whitespace characters satisfying
123+ -- | `Data.CodePoint.Unicode.isSpace`.
124+ whiteSpace :: forall m . Monad m => ParserT String m String
125+ whiteSpace = fst <$> match skipSpaces
81126
82127-- | Skip whitespace characters.
83- skipSpaces :: forall s m . StringLike s => Monad m => ParserT s m Unit
84- skipSpaces = void whiteSpace
128+ skipSpaces :: forall m . Monad m => ParserT String m Unit
129+ skipSpaces = skipMany (satisfyCodePoint isSpace)
85130
86- -- | Match one of the characters in the array.
87- oneOf :: forall s m . StringLike s => Monad m => Array Char -> ParserT s m Char
131+ -- | Match one of the BMP `Char`s in the array.
132+ oneOf :: forall m . Monad m => Array Char -> ParserT String m Char
88133oneOf ss = satisfy (flip elem ss) <?> (" one of " <> show ss)
89134
90- -- | Match any character not in the array.
91- noneOf :: forall s m . StringLike s => Monad m => Array Char -> ParserT s m Char
135+ -- | Match any BMP `Char` not in the array.
136+ noneOf :: forall m . Monad m => Array Char -> ParserT String m Char
92137noneOf ss = satisfy (flip notElem ss) <?> (" none of " <> show ss)
138+
139+ -- | Updates a `Position` by adding the columns and lines in `String`.
140+ updatePosString :: Position -> String -> Position
141+ updatePosString pos str = case uncons str of
142+ Nothing -> pos
143+ Just {head,tail} -> updatePosString (updatePosSingle pos head) tail -- tail recursive
144+
145+ -- | Updates a `Position` by adding the columns and lines in a
146+ -- | single `CodePoint`.
147+ updatePosSingle :: Position -> CodePoint -> Position
148+ updatePosSingle (Position {line,column}) cp = case unsafeCoerce cp of
149+ 10 -> Position { line: line + 1 , column: 1 } -- "\n"
150+ 13 -> Position { line: line + 1 , column: 1 } -- "\r"
151+ 9 -> Position { line, column: column + 8 - ((column - 1 ) `mod` 8 ) } -- "\t" Who says that one tab is 8 columns?
152+ _ -> Position { line, column: column + 1 }
153+
154+ -- | Combinator which returns both the result of a parse and the portion of
155+ -- | the input that was consumed while it was being parsed.
156+ match :: forall m a . Monad m => ParserT String m a -> ParserT String m (Tuple String a )
157+ match p = do
158+ ParseState input1 _ _ <- get
159+ x <- p
160+ ParseState input2 _ _ <- get
161+ -- We use the `SCU.length`, which is in units of “code units”
162+ -- instead of `Data.String.length`. which is in units of “code points”.
163+ -- This is more efficient, and it will be correct as long as we can assume
164+ -- the invariant that the `ParseState input` always begins on a code point
165+ -- boundary.
166+ pure $ Tuple (SCU .take (SCU .length input1 - SCU .length input2) input1) x
167+
168+ -- Helper function.
169+ satisfyCP :: forall m . Monad m => (CodePoint -> Boolean ) -> ParserT String m Char
170+ satisfyCP p = satisfy (p <<< codePointFromChar)
171+
172+ -- | Parse a digit. Matches any BMP `Char` that satisfies `Data.CodePoint.Unicode.isDecDigit`.
173+ digit :: forall m . Monad m => ParserT String m Char
174+ digit = satisfyCP isDecDigit <?> " digit"
175+
176+ -- | Parse a hex digit. Matches any BMP `Char` that satisfies `Data.CodePoint.Unicode.isHexDigit`.
177+ hexDigit :: forall m . Monad m => ParserT String m Char
178+ hexDigit = satisfyCP isHexDigit <?> " hex digit"
179+
180+ -- | Parse an octal digit. Matches any BMP `Char` that satisfies `Data.CodePoint.Unicode.isOctDigit`.
181+ octDigit :: forall m . Monad m => ParserT String m Char
182+ octDigit = satisfyCP isOctDigit <?> " oct digit"
183+
184+ -- | Parse an uppercase letter. Matches any BMP `Char` that satisfies `Data.CodePoint.Unicode.isUpper`.
185+ upper :: forall m . Monad m => ParserT String m Char
186+ upper = satisfyCP isUpper <?> " uppercase letter"
187+
188+ -- | Parse a space character. Matches any BMP `Char` that satisfies `Data.CodePoint.Unicode.isSpace`.
189+ space :: forall m . Monad m => ParserT String m Char
190+ space = satisfyCP isSpace <?> " space"
191+
192+ -- | Parse an alphabetical character. Matches any BMP `Char` that satisfies `Data.CodePoint.Unicode.isAlpha`.
193+ letter :: forall m . Monad m => ParserT String m Char
194+ letter = satisfyCP isAlpha <?> " letter"
195+
196+ -- | Parse an alphabetical or numerical character.
197+ -- | Matches any BMP `Char` that satisfies `Data.CodePoint.Unicode.isAlphaNum`.
198+ alphaNum :: forall m . Monad m => ParserT String m Char
199+ alphaNum = satisfyCP isAlphaNum <?> " letter or digit"
0 commit comments