Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 62 additions & 19 deletions src/Parsers.jl
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,8 @@ end
* `sentinel=nothing`: valid values include: `nothing` meaning don't check for sentinel values; `missing` meaning an "empty field" should be considered a sentinel value; or a `Vector{String}` of the various string values that should each be checked as a sentinel value. Note that sentinels will always be checked longest to shortest, with the longest valid match taking precedence.
* `wh1=' '`: the first ascii character to be considered when ignoring leading/trailing whitespace in value parsing
* `wh2='\t'`: the second ascii character to be considered when ignoring leading/trailing whitespace in value parsing
* `openquotechar='"'`: the ascii character that signals a "quoted" field while parsing; subsequent characters will be treated as non-significant until a valid `closequotechar` is detected
* `closequotechar='"'`: the ascii character that signals the end of a quoted field
* `openquotechar='"'`: the ascii character or string that signals a "quoted" field while parsing; subsequent characters will be treated as non-significant until a valid `closequotechar` is detected
* `closequotechar='"'`: the ascii character or string that signals the end of a quoted field
* `escapechar='"'`: an ascii character used to "escape" a `closequotechar` within a quoted field
* `delim=nothing`: if `nothing`, no delimiter will be checked for; if a `Char` or `String`, a delimiter will be checked for directly after parsing a value or `closequotechar`; a newline (`\n`), return (`\r`), or CRLF (`"\r\n"`) are always considered "delimiters", in addition to EOF
* `decimal='.'`: an ascii character to be used when parsing float values that separates a decimal value
Expand All @@ -65,8 +65,8 @@ struct Options
wh1::UInt8
wh2::UInt8
quoted::Bool
oq::UInt8
cq::UInt8
oq::Union{UInt8, PtrLen}
cq::Union{UInt8, PtrLen}
e::UInt8
delim::Union{Nothing, UInt8, PtrLen}
decimal::UInt8
Expand All @@ -83,11 +83,11 @@ asciival(c::Char) = isascii(c)
asciival(b::UInt8) = b < 0x80

function Options(
sentinel::Union{Nothing, Missing, Vector{String}},
sentinel::Union{Nothing, Missing, Vector{String}},
wh1::Union{UInt8, Char},
wh2::Union{UInt8, Char},
oq::Union{UInt8, Char},
cq::Union{UInt8, Char},
oq::Union{UInt8, Char, String},
cq::Union{UInt8, Char, String},
e::Union{UInt8, Char},
delim::Union{Nothing, UInt8, Char, String},
decimal::Union{UInt8, Char},
Expand All @@ -96,15 +96,20 @@ function Options(
dateformat::Union{Nothing, String, Dates.DateFormat, Format},
ignorerepeated, ignoreemptylines, comment, quoted, debug, stripwhitespace=false, stripquoted=false)
asciival(wh1) && asciival(wh2) || throw(ArgumentError("whitespace characters must be ASCII"))
asciival(oq) && asciival(cq) && asciival(e) || throw(ArgumentError("openquotechar, closequotechar, and escapechar must be ASCII characters"))
(oq isa String || asciival(oq)) && (cq isa String || asciival(cq)) && asciival(e) || throw(ArgumentError("openquotechar, closequotechar, and escapechar must be ASCII characters"))
# `&&` binds tighter than `||`, so the three comparisons must be grouped explicitly;
# ungrouped, the error was raised only when `e == delim`, and a delimiter equal to
# the open/close quote was silently accepted.
((oq == delim) || (cq == delim) || (e == delim)) && throw(ArgumentError("delim argument must be different than openquotechar, closequotechar, and escapechar arguments"))
Copy link
Collaborator Author

@nickrobinson251 nickrobinson251 Jun 10, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We need to think about how `delim` and `oq`/`cq` interact, e.g. when a `delim::Char` occurs within an `oq::String`.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should probably just check that `delim` and `oq`/`cq` (i) are not equal, and (ii) share no characters, i.e. none of the chars in one appear in the other.

if sentinel isa Vector{String}
for sent in sentinel
if startswith(sent, string(Char(wh1))) || startswith(sent, string(Char(wh2)))
throw(ArgumentError("sentinel value isn't allowed to start with wh1 or wh2 characters"))
end
if startswith(sent, string(Char(oq))) || startswith(sent, string(Char(cq)))
if (
((oq isa UInt8 || oq isa Char) && startswith(sent, string(Char(oq)))) ||
((cq isa UInt8 || cq isa Char) && startswith(sent, string(Char(cq))))
)
throw(ArgumentError("sentinel value isn't allowed to start with openquotechar, closequotechar, or escapechar characters"))
elseif (oq isa String && startswith(sent, oq)) || (cq isa String && startswith(sent, cq))
throw(ArgumentError("sentinel value isn't allowed to start with openquote or closequote string"))
end
if (delim isa UInt8 || delim isa Char) && startswith(sent, string(Char(delim)))
throw(ArgumentError("sentinel value isn't allowed to start with a delimiter character"))
Expand All @@ -117,6 +122,8 @@ function Options(
refs = [""]
end
sent = sentinel === nothing || sentinel === missing ? sentinel : prepare(sentinel)
openq = oq isa String ? ptrlen(oq) : oq % UInt8
closeq = cq isa String ? ptrlen(cq) : cq % UInt8
del = delim === nothing ? nothing : delim isa String ? ptrlen(delim) : delim % UInt8
if del isa UInt8
((wh1 % UInt8) == del || (wh2 % UInt8) == del) && throw(ArgumentError("whitespace characters (`wh1=' '` and `wh2='\\t'` default keyword arguments) must be different than delim argument"))
Expand All @@ -136,15 +143,15 @@ function Options(
cmt = ptrlen(comment)
end
df = dateformat === nothing ? nothing : dateformat isa String ? Format(dateformat) : dateformat isa Dates.DateFormat ? Format(dateformat) : dateformat
return Options(refs, sent, ignorerepeated, ignoreemptylines, wh1 % UInt8, wh2 % UInt8, quoted, oq % UInt8, cq % UInt8, e % UInt8, del, decimal % UInt8, trues, falses, df, cmt, stripwhitespace || stripquoted, stripquoted)
return Options(refs, sent, ignorerepeated, ignoreemptylines, wh1 % UInt8, wh2 % UInt8, quoted, openq, closeq, e % UInt8, del, decimal % UInt8, trues, falses, df, cmt, stripwhitespace || stripquoted, stripquoted)
end

Options(;
sentinel::Union{Nothing, Missing, Vector{String}}=nothing,
wh1::Union{UInt8, Char}=UInt8(' '),
wh2::Union{UInt8, Char}=UInt8('\t'),
openquotechar::Union{UInt8, Char}=UInt8('"'),
closequotechar::Union{UInt8, Char}=UInt8('"'),
openquotechar::Union{UInt8, Char, String}=UInt8('"'),
closequotechar::Union{UInt8, Char, String}=UInt8('"'),
escapechar::Union{UInt8, Char}=UInt8('"'),
delim::Union{Nothing, UInt8, Char, String}=nothing,
decimal::Union{UInt8, Char}=UInt8('.'),
Expand Down Expand Up @@ -209,7 +216,7 @@ A [`Parsers.Result`](@ref) struct is returned, with the following fields:
function xparse end

# for testing purposes only, it's much too slow to dynamically create Options for every xparse call
function xparse(::Type{T}, buf::Union{AbstractVector{UInt8}, AbstractString, IO}; pos::Integer=1, len::Integer=buf isa IO ? 0 : sizeof(buf), sentinel=nothing, wh1::Union{UInt8, Char}=UInt8(' '), wh2::Union{UInt8, Char}=UInt8('\t'), quoted::Bool=true, openquotechar::Union{UInt8, Char}=UInt8('"'), closequotechar::Union{UInt8, Char}=UInt8('"'), escapechar::Union{UInt8, Char}=UInt8('"'), ignorerepeated::Bool=false, ignoreemptylines::Bool=false, delim::Union{UInt8, Char, PtrLen, AbstractString, Nothing}=UInt8(','), decimal::Union{UInt8, Char}=UInt8('.'), comment=nothing, trues=nothing, falses=nothing, dateformat::Union{Nothing, String, Dates.DateFormat}=nothing, debug::Bool=false, stripwhitespace::Bool=false, stripquoted::Bool=false) where {T}
function xparse(::Type{T}, buf::Union{AbstractVector{UInt8}, AbstractString, IO}; pos::Integer=1, len::Integer=buf isa IO ? 0 : sizeof(buf), sentinel=nothing, wh1::Union{UInt8, Char}=UInt8(' '), wh2::Union{UInt8, Char}=UInt8('\t'), quoted::Bool=true, openquotechar::Union{UInt8, Char, String}=UInt8('"'), closequotechar::Union{UInt8, Char, String}=UInt8('"'), escapechar::Union{UInt8, Char}=UInt8('"'), ignorerepeated::Bool=false, ignoreemptylines::Bool=false, delim::Union{UInt8, Char, PtrLen, AbstractString, Nothing}=UInt8(','), decimal::Union{UInt8, Char}=UInt8('.'), comment=nothing, trues=nothing, falses=nothing, dateformat::Union{Nothing, String, Dates.DateFormat}=nothing, debug::Bool=false, stripwhitespace::Bool=false, stripquoted::Bool=false) where {T}
options = Options(sentinel, wh1, wh2, openquotechar, closequotechar, escapechar, delim, decimal, trues, falses, dateformat, ignorerepeated, ignoreemptylines, comment, quoted, debug, stripwhitespace, stripquoted)
return xparse(T, buf, pos, len, options)
end
Expand Down Expand Up @@ -286,12 +293,12 @@ function xparse(::Type{T}, source::Union{AbstractVector{UInt8}, IO}, pos, len, o
end
# check for start of quoted field
if options.quoted
quoted = b == options.oq
preqpos = pos
pos = checkquote(source, pos, len, options.oq)
quoted = pos > preqpos
if quoted
code = QUOTED
pos += 1
vstartpos = pos
incr!(source)
if eof(source, pos, len)
code |= INVALID_QUOTED_FIELD
@goto donedone
Expand Down Expand Up @@ -359,9 +366,9 @@ function xparse(::Type{T}, source::Union{AbstractVector{UInt8}, IO}, pos, len, o
same = options.cq == options.e
first = true
while true
pos += 1
incr!(source)
if same && b == options.e
pos += 1
incr!(source)
if eof(source, pos, len)
code |= EOF
if !first
Expand All @@ -378,6 +385,8 @@ function xparse(::Type{T}, source::Union{AbstractVector{UInt8}, IO}, pos, len, o
pos += 1
incr!(source)
elseif b == options.e
pos += 1
incr!(source)
if eof(source, pos, len)
code |= INVALID_QUOTED_FIELD | EOF
@goto donedone
Expand All @@ -386,6 +395,8 @@ function xparse(::Type{T}, source::Union{AbstractVector{UInt8}, IO}, pos, len, o
pos += 1
incr!(source)
elseif b == options.cq
pos += 1
incr!(source)
if !first
code |= INVALID
end
Expand All @@ -394,6 +405,22 @@ function xparse(::Type{T}, source::Union{AbstractVector{UInt8}, IO}, pos, len, o
@goto donedone
end
break
else
preqpos = pos
pos = checkquote(source, pos, len, options.cq)
if pos > preqpos
if !first
code |= INVALID
end
if eof(source, pos, len)
code |= EOF
@goto donedone
end
break
else
pos += 1
incr!(source)
end
end
if eof(source, pos, len)
code |= INVALID_QUOTED_FIELD | EOF
Expand Down Expand Up @@ -765,7 +792,23 @@ end
end
end

function checkdelim!(buf, pos, len, options::Options)
# Multi-byte quote check: try to match the quote string `q` at `pos`.
# Returns `pos` unchanged at EOF; otherwise returns whatever `checkdelim` reports.
function checkquote(source, pos, len, q::PtrLen)
    if eof(source, pos, len)
        return pos
    end
    # `checkdelim` calls `incr!` if `q` matches.
    return checkdelim(source, pos, len, q)
end

# Single-byte quote check: when the byte at `pos` equals `q`, step past it
# (advancing `source` with `incr!`) and return `pos + 1`.
# At EOF, or when the byte differs, `pos` is returned unchanged.
function checkquote(source, pos, len, q::UInt8)
    if !eof(source, pos, len) && peekbyte(source, pos) == q
        incr!(source)
        return pos + 1
    end
    return pos
end

# Used by CSV.jl
function checkdelim!(buf::AbstractVector{UInt8}, pos, len, options::Options)
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added a comment and a type annotation here because I ended up being led astray by this function... and then wondered why we had it at all 😂

pos > len && return pos
delim = options.delim
@inbounds b = buf[pos]
Expand Down
27 changes: 22 additions & 5 deletions src/strings.jl
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,11 @@ function xparse(::Type{T}, source::Union{AbstractVector{UInt8}, IO}, pos, len, o
end
# check for start of quoted field
if options.quoted
quoted = b == options.oq
preqpos = pos
pos = checkquote(source, pos, len, options.oq)
quoted = pos > preqpos
if quoted
code = QUOTED
pos += 1
incr!(source)
# since we're in quoted mode, reset vstartpos & vpos
vstartpos = vpos = pos
if eof(source, pos, len)
Expand Down Expand Up @@ -66,9 +66,9 @@ function xparse(::Type{T}, source::Union{AbstractVector{UInt8}, IO}, pos, len, o
same = options.cq == options.e
while true
vpos = pos
pos += 1
incr!(source)
if same && b == options.e
pos += 1
incr!(source)
if eof(source, pos, len)
code |= EOF
@goto donedone
Expand All @@ -79,6 +79,8 @@ function xparse(::Type{T}, source::Union{AbstractVector{UInt8}, IO}, pos, len, o
pos += 1
incr!(source)
elseif b == options.e
pos += 1
incr!(source)
if eof(source, pos, len)
code |= INVALID_QUOTED_FIELD | EOF
@goto donedone
Expand All @@ -87,11 +89,26 @@ function xparse(::Type{T}, source::Union{AbstractVector{UInt8}, IO}, pos, len, o
pos += 1
incr!(source)
elseif b == options.cq
pos += 1
incr!(source)
if eof(source, pos, len)
code |= EOF
@goto donedone
end
break
else
preqpos = pos
# We are inside a quoted field looking for its end, so the *closing* quote
# (`options.cq`) must be matched here, not the opening quote.
pos = checkquote(source, pos, len, options.cq)
if pos > preqpos
if eof(source, pos, len)
code |= EOF
@goto donedone
end
break
else
pos += 1
incr!(source)
end
end
if eof(source, pos, len)
code |= INVALID_QUOTED_FIELD | EOF
Expand Down
1 change: 1 addition & 0 deletions src/utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@ function checkdelim(source::IO, pos, len, (ptr, ptrlen))
return delimpos
end

# if `true`, `source` incremented past the match, else not incremented
@inline function match!(source::IO, ptr, ptrlen)
b = peekbyte(source)
c = unsafe_load(ptr)
Expand Down
32 changes: 29 additions & 3 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -204,10 +204,17 @@ testcases = [
(str="{1 } ,", kwargs=(stripquoted=true,delim=UInt8(',')), x=1, code=(OK | DELIMITED | QUOTED), vpos=2, vlen=1, tlen=6),
];

# Test helper producing the replacement text for a quote/escape option.
# A string option is passed through untouched.
function chars(s::AbstractString)
    return s
end

# A single-byte option is converted to the corresponding `Char`.
function chars(s::UInt8)
    return Char(s)
end
for useio in (false, true)
for (oq, cq, e) in ((UInt8('"'), UInt8('"'), UInt8('"')), (UInt8('"'), UInt8('"'), UInt8('\\')), (UInt8('{'), UInt8('}'), UInt8('\\')))
for (oq, cq, e) in (
(UInt8('"'), UInt8('"'), UInt8('"')),
(UInt8('"'), UInt8('"'), UInt8('\\')),
(UInt8('{'), UInt8('}'), UInt8('\\')),
("{{", "}}", UInt8('\\')),
)
for (i, case) in enumerate(testcases)
str = replace(replace(replace(case.str, '{'=>Char(oq)), '}'=>Char(cq)), '\\'=>Char(e))
str = replace(replace(replace(case.str, '{'=>chars(oq)), '}'=>chars(cq)), '\\'=>chars(e))
source = useio ? IOBuffer(str) : str
res = Parsers.xparse(Int64, source; openquotechar=oq, closequotechar=cq, escapechar=e, case.kwargs...)
x, code, tlen = res.val, res.code, res.tlen
Expand All @@ -216,7 +223,11 @@ for useio in (false, true)
@test x == case.x
end
@test code == case.code
@test tlen == case.tlen
if Parsers.invalidquotedfield(code) || Parsers.quoted(code)
@test tlen == length(str)
else
@test tlen == case.tlen
end
end
end
end
Expand All @@ -238,6 +249,8 @@ for (i, case) in enumerate(testcases)
end

# stripwhitespace
res = Parsers.xparse(String, "{{hey there }}"; openquotechar="{{", closequotechar="}}", stripwhitespace=true)
@test res.val.pos == 3 && res.val.len == 11
res = Parsers.xparse(String, "{hey there}"; openquotechar='{', closequotechar='}', stripwhitespace=true)
@test res.val.pos == 2 && res.val.len == 9
res = Parsers.xparse(String, "{hey there }"; openquotechar='{', closequotechar='}', stripwhitespace=true)
Expand All @@ -257,6 +270,8 @@ res = Parsers.xparse(String, " hey there "; stripwhitespace=true)
res = Parsers.xparse(String, " hey there "; delim=nothing, stripwhitespace=true)
@test res.val.pos == 2 && res.val.len == 9

res = Parsers.xparse(String, "{{hey there }}"; openquotechar="{{", closequotechar="}}", stripquoted=true)
@test res.val.pos == 3 && res.val.len == 11
res = Parsers.xparse(String, "{hey there}"; openquotechar='{', closequotechar='}', stripquoted=true)
@test res.val.pos == 2 && res.val.len == 9
res = Parsers.xparse(String, "{hey there }"; openquotechar='{', closequotechar='}', stripquoted=true)
Expand Down Expand Up @@ -442,6 +457,17 @@ pos += tlen
@test Parsers.checkdelim!(codeunits(",,"), 1, 2, Parsers.Options(ignorerepeated=true, delim=',')) == 3
@test Parsers.checkdelim!(codeunits("::::"), 1, 4, Parsers.Options(delim="::", ignorerepeated=true)) == 5

# matches
@test Parsers.checkquote(codeunits("{1}"), 1, 3, UInt8('{')) == 2
@test Parsers.checkquote( IOBuffer("{1}"), 1, 3, UInt8('{')) == 2
@test Parsers.checkquote(codeunits("{{1}}"), 1, 5, Parsers.ptrlen("{{")) == 3
@test Parsers.checkquote( IOBuffer("{{1}}"), 1, 5, Parsers.ptrlen("{{")) == 3
# non-matches
@test Parsers.checkquote(codeunits("{1}"), 1, 3, UInt8('[')) == 1
@test Parsers.checkquote( IOBuffer("{1}"), 1, 3, UInt8('[')) == 1
@test Parsers.checkquote(codeunits("{{1}}"), 1, 5, Parsers.ptrlen("[[")) == 1
@test Parsers.checkquote( IOBuffer("{{1}}"), 1, 5, Parsers.ptrlen("[[")) == 1

e = Parsers.Error(Vector{UInt8}("hey"), Int64, INVALID | EOF, 1, 3)
io = IOBuffer()
showerror(io, e)
Expand Down