Skip to content

Commit fe554b7

Browse files
authored
Rework annotation ordering/optimisations (#54289)
Improvements to the consistency and semantics of AnnotatedStrings, mainly around the ordering of annotations.
2 parents 831ebe0 + 4c18472 commit fe554b7

7 files changed

Lines changed: 122 additions & 83 deletions

File tree

base/strings/annotated.jl

Lines changed: 90 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,17 @@ and a value (`Any`), paired together as a `Pair{Symbol, <:Any}`.
2525
Labels do not need to be unique, the same region can hold multiple annotations
2626
with the same label.
2727
28+
Code written for `AnnotatedString`s in general should conserve the following
29+
properties:
30+
- Which characters an annotation is applied to
31+
- The order in which annotations are applied to each character
32+
33+
Additional semantics may be introduced by specific uses of `AnnotatedString`s.
34+
35+
A corollary of these rules is that adjacent, consecutively placed, annotations
36+
with identical labels and values are equivalent to a single annotation spanning
37+
the combined range.
38+
2839
See also [`AnnotatedChar`](@ref), [`annotatedstring`](@ref),
2940
[`annotations`](@ref), and [`annotate!`](@ref).
3041
@@ -255,56 +266,26 @@ annotatedstring(c::AnnotatedChar) =
255266

256267
AnnotatedString(s::SubString{<:AnnotatedString}) = annotatedstring(s)
257268

258-
"""
259-
annotatedstring_optimize!(str::AnnotatedString)
260-
261-
Merge contiguous identical annotations in `str`.
262-
"""
263-
function annotatedstring_optimize!(s::AnnotatedString)
264-
last_seen = Dict{Pair{Symbol, Any}, Int}()
265-
i = 1
266-
while i <= length(s.annotations)
267-
region, keyval = s.annotations[i]
268-
prev = get(last_seen, keyval, 0)
269-
if prev > 0
270-
lregion, _ = s.annotations[prev]
271-
if last(lregion) + 1 == first(region)
272-
s.annotations[prev] =
273-
setindex(s.annotations[prev],
274-
first(lregion):last(region),
275-
1)
276-
deleteat!(s.annotations, i)
277-
else
278-
delete!(last_seen, keyval)
279-
end
280-
else
281-
last_seen[keyval] = i
282-
i += 1
283-
end
284-
end
285-
s
286-
end
287-
288269
function repeat(str::AnnotatedString, r::Integer)
289270
r == 0 && return one(AnnotatedString)
290271
r == 1 && return str
291272
unannot = repeat(str.string, r)
292273
annotations = Vector{Tuple{UnitRange{Int}, Pair{Symbol, Any}}}()
293274
len = ncodeunits(str)
294275
fullregion = firstindex(str):lastindex(str)
295-
for (region, annot) in str.annotations
296-
if region == fullregion
297-
push!(annotations, (firstindex(unannot):lastindex(unannot), annot))
276+
if allequal(first, str.annotations) && first(first(str.annotations)) == fullregion
277+
newfullregion = firstindex(unannot):lastindex(unannot)
278+
for (_, annot) in str.annotations
279+
push!(annotations, (newfullregion, annot))
298280
end
299-
end
300-
for offset in 0:len:(r-1)*len
301-
for (region, annot) in str.annotations
302-
if region != fullregion
281+
else
282+
for offset in 0:len:(r-1)*len
283+
for (region, annot) in str.annotations
303284
push!(annotations, (region .+ offset, annot))
304285
end
305286
end
306287
end
307-
AnnotatedString(unannot, annotations) |> annotatedstring_optimize!
288+
AnnotatedString(unannot, annotations)
308289
end
309290

310291
repeat(str::SubString{<:AnnotatedString}, r::Integer) =
@@ -335,14 +316,9 @@ reverse(s::SubString{<:AnnotatedString}) = reverse(AnnotatedString(s))
335316
function _annotate!(annlist::Vector{Tuple{UnitRange{Int}, Pair{Symbol, Any}}}, range::UnitRange{Int}, @nospecialize(labelval::Pair{Symbol, <:Any}))
336317
label, val = labelval
337318
if val === nothing
338-
indices = searchsorted(annlist, (range,), by=first)
339-
labelindex = filter(i -> first(annlist[i][2]) === label, indices)
340-
for index in Iterators.reverse(labelindex)
341-
deleteat!(annlist, index)
342-
end
319+
deleteat!(annlist, findall(ann -> ann[1] == range && first(ann[2]) === label, annlist))
343320
else
344-
sortedindex = searchsortedlast(annlist, (range,), by=first) + 1
345-
insert!(annlist, sortedindex, (range, Pair{Symbol, Any}(label, val)))
321+
push!(annlist, (range, Pair{Symbol, Any}(label, val)))
346322
end
347323
end
348324

@@ -352,6 +328,9 @@ end
352328
353329
Annotate a `range` of `str` (or the entire string) with a labeled value (`label` => `value`).
354330
To remove existing `label` annotations, use a value of `nothing`.
331+
332+
The order in which annotations are applied to `str` is semantically meaningful,
333+
as described in [`AnnotatedString`](@ref).
355334
"""
356335
annotate!(s::AnnotatedString, range::UnitRange{Int}, @nospecialize(labelval::Pair{Symbol, <:Any})) =
357336
(_annotate!(s.annotations, range, labelval); s)
@@ -384,6 +363,9 @@ annotations that overlap with `position` will be returned.
384363
Annotations are provided together with the regions they apply to, in the form of
385364
a vector of region–annotation tuples.
386365
366+
In accordance with the semantics documented in [`AnnotatedString`](@ref), the
367+
order of annotations returned matches the order in which they were applied.
368+
387369
See also: `annotate!`.
388370
"""
389371
annotations(s::AnnotatedString) = s.annotations
@@ -518,10 +500,19 @@ function write(dest::AnnotatedIOBuffer, src::AnnotatedIOBuffer)
518500
nb
519501
end
520502

503+
"""
504+
_clear_annotations_in_region!(annotations::Vector{Tuple{UnitRange{Int}, Pair{Symbol, Any}}}, span::UnitRange{Int})
505+
506+
Erase the presence of `annotations` within a certain `span`.
507+
508+
This operates by removing all elements of `annotations` that are entirely
509+
contained in `span`, truncating ranges that partially overlap, and splitting
510+
annotations that subsume `span` to just exist either side of `span`.
511+
"""
521512
function _clear_annotations_in_region!(annotations::Vector{Tuple{UnitRange{Int}, Pair{Symbol, Any}}}, span::UnitRange{Int})
522513
# Clear out any overlapping pre-existing annotations.
523514
filter!(((region, _),) -> first(region) < first(span) || last(region) > last(span), annotations)
524-
extras = Tuple{UnitRange{Int}, Pair{Symbol, Any}}[]
515+
extras = Tuple{Int, Tuple{UnitRange{Int}, Pair{Symbol, Any}}}[]
525516
for i in eachindex(annotations)
526517
region, annot = annotations[i]
527518
# Test for partial overlap
@@ -532,31 +523,68 @@ function _clear_annotations_in_region!(annotations::Vector{Tuple{UnitRange{Int},
532523
# If `span` fits exactly within `region`, then we've only copied over
533524
# the beginning overhang, but also need to conserve the end overhang.
534525
if first(region) < first(span) && last(span) < last(region)
535-
push!(extras, (last(span)+1:last(region), annot))
526+
push!(extras, (i, (last(span)+1:last(region), annot)))
536527
end
537528
end
538-
# Insert any extra entries in the appropriate position
539-
for entry in extras
540-
sortedindex = searchsortedlast(annotations, (first(entry),), by=first) + 1
541-
insert!(annotations, sortedindex, entry)
542-
end
529+
end
530+
# Insert any extra entries in the appropriate position
531+
for (offset, (i, entry)) in enumerate(extras)
532+
insert!(annotations, i + offset, entry)
543533
end
544534
annotations
545535
end
546536

537+
"""
538+
_insert_annotations!(io::AnnotatedIOBuffer, annotations::Vector{Tuple{UnitRange{Int}, Pair{Symbol, Any}}}, offset::Int = position(io))
539+
540+
Register new `annotations` in `io`, applying an `offset` to their regions.
541+
542+
The largely consists of simply shifting the regions of `annotations` by `offset`
543+
and pushing them onto `io`'s annotations. However, when it is possible to merge
544+
the new annotations with recent annotations in accordance with the semantics
545+
outlined in [`AnnotatedString`](@ref), we do so. More specifically, when there
546+
is a run of the most recent annotations that are also present as the first
547+
`annotations`, with the same value and adjacent regions, the new annotations are
548+
merged into the existing recent annotations by simply extending their range.
549+
550+
This is implemented so that one can say write an `AnnotatedString` to an
551+
`AnnotatedIOBuffer` one character at a time without needlessly producing a
552+
new annotation for each character.
553+
"""
547554
function _insert_annotations!(io::AnnotatedIOBuffer, annotations::Vector{Tuple{UnitRange{Int}, Pair{Symbol, Any}}}, offset::Int = position(io))
548-
if !eof(io)
549-
for (region, annot) in annotations
550-
region = first(region)+offset:last(region)+offset
551-
sortedindex = searchsortedlast(io.annotations, (region,), by=first) + 1
552-
insert!(io.annotations, sortedindex, (region, annot))
553-
end
554-
else
555-
for (region, annot) in annotations
556-
region = first(region)+offset:last(region)+offset
557-
push!(io.annotations, (region, annot))
555+
run = 0
556+
if !isempty(io.annotations) && last(first(last(io.annotations))) == offset
557+
for i in reverse(axes(annotations, 1))
558+
annot = annotations[i]
559+
first(first(annot)) == 1 || continue
560+
if last(annot) == last(last(io.annotations))
561+
valid_run = true
562+
for runlen in 1:i
563+
new_range, new_annot = annotations[begin+runlen-1]
564+
old_range, old_annot = io.annotations[end-i+runlen]
565+
if last(old_range) != offset || first(new_range) != 1 || old_annot != new_annot
566+
valid_run = false
567+
break
568+
end
569+
end
570+
if valid_run
571+
run = i
572+
break
573+
end
574+
end
558575
end
559576
end
577+
for runindex in 0:run-1
578+
old_index = lastindex(io.annotations) - run + 1 + runindex
579+
old_region, annot = io.annotations[old_index]
580+
new_region, _ = annotations[begin+runindex]
581+
io.annotations[old_index] = (first(old_region):last(new_region)+offset, annot)
582+
end
583+
for index in run+1:lastindex(annotations)
584+
region, annot = annotations[index]
585+
start, stop = first(region), last(region)
586+
push!(io.annotations, (start+offset:stop+offset, annot))
587+
end
560588
end
561589

562590
function read(io::AnnotatedIOBuffer, ::Type{AnnotatedString{T}}) where {T <: AbstractString}
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
6969fb6d2e8585d26beef865910ec8ef
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
281292e8478d72ab66b84cbd4f42e5dc2dd5054e8c54a79de8f0c0537d28962b460e67fe71230ead6b02386b87d0423879d51ce53a2b2427ce55866d62d6ebde

deps/checksums/StyledStrings-bfdb4c3f73a93a956ad48b0f06f89eb1cd40ff6b.tar.gz/md5

Lines changed: 0 additions & 1 deletion
This file was deleted.

deps/checksums/StyledStrings-bfdb4c3f73a93a956ad48b0f06f89eb1cd40ff6b.tar.gz/sha512

Lines changed: 0 additions & 1 deletion
This file was deleted.

stdlib/StyledStrings.version

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
STYLEDSTRINGS_BRANCH = main
2-
STYLEDSTRINGS_SHA1 = bfdb4c3f73a93a956ad48b0f06f89eb1cd40ff6b
2+
STYLEDSTRINGS_SHA1 = ac472083359dde956aed8c61d43b8158ac84d9ce
33
STYLEDSTRINGS_GIT_URL := https://github.com/JuliaLang/StyledStrings.jl.git
44
STYLEDSTRINGS_TAR_URL = https://api.github.com/repos/JuliaLang/StyledStrings.jl/tarball/$1

test/strings/annotated.jl

Lines changed: 29 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,10 @@
2424
@test Base.AnnotatedString(str[3:4]) ==
2525
Base.AnnotatedString("me", [(1:2, :thing => 0x01), (1:2, :all => 0x03)])
2626
@test Base.AnnotatedString(str[3:6]) ==
27-
Base.AnnotatedString("me s", [(1:2, :thing => 0x01), (1:4, :all => 0x03), (4:4, :other => 0x02)])
28-
@test str == Base.AnnotatedString("some string", [(1:4, :thing => 0x01), (1:11, :all => 0x03), (6:11, :other => 0x02)])
27+
Base.AnnotatedString("me s", [(1:2, :thing => 0x01), (4:4, :other => 0x02), (1:4, :all => 0x03)])
28+
@test str == Base.AnnotatedString("some string", [(1:4, :thing => 0x01), (6:11, :other => 0x02), (1:11, :all => 0x03)])
2929
@test str != Base.AnnotatedString("some string")
30-
@test str != Base.AnnotatedString("some string", [(1:1, :thing => 0x01), (6:6, :other => 0x02), (11:11, :all => 0x03)])
30+
@test str != Base.AnnotatedString("some string", [(1:1, :thing => 0x01), (1:11, :all => 0x03), (6:6, :other => 0x02)])
3131
@test str != Base.AnnotatedString("some string", [(1:4, :thing => 0x11), (1:11, :all => 0x13), (6:11, :other => 0x12)])
3232
@test str != Base.AnnotatedString("some thingg", [(1:4, :thing => 0x01), (1:11, :all => 0x03), (6:11, :other => 0x02)])
3333
@test Base.AnnotatedString([Base.AnnotatedChar('a', [:a => 1]), Base.AnnotatedChar('b', [:b => 2])]) ==
@@ -51,15 +51,8 @@
5151
# @test collect(Base.eachstyle(str)) ==
5252
# [("some", [:thing => 0x01, :all => 0x03]),
5353
# (" string", [:all => 0x03, :other => 0x02])]
54-
@test ==(Base.annotatedstring_optimize!(
55-
Base.AnnotatedString("abc", [(1:1, :val => 1),
56-
(2:2, :val => 2),
57-
(2:2, :val => 1),
58-
(3:3, :val => 2)])),
59-
Base.AnnotatedString("abc", [(1:2, :val => 1),
60-
(2:3, :val => 2)]))
6154
@test chopprefix(sprint(show, str), "Base.") ==
62-
"AnnotatedString{String}(\"some string\", [(1:4, :thing => 0x01), (1:11, :all => 0x03), (6:11, :other => 0x02)])"
55+
"AnnotatedString{String}(\"some string\", [(1:4, :thing => 0x01), (6:11, :other => 0x02), (1:11, :all => 0x03)])"
6356
@test eval(Meta.parse(repr(str))) == str
6457
@test sprint(show, MIME("text/plain"), str) == "\"some string\""
6558
end
@@ -149,8 +142,8 @@ end
149142
# Check `annotate!`, including region sorting
150143
@test truncate(aio, 0).io.size == 0
151144
@test write(aio, "hello world") == ncodeunits("hello world")
152-
@test Base.annotate!(aio, 7:11, :tag => 2) === aio
153145
@test Base.annotate!(aio, 1:5, :tag => 1) === aio
146+
@test Base.annotate!(aio, 7:11, :tag => 2) === aio
154147
@test Base.annotations(aio) == [(1:5, :tag => 1), (7:11, :tag => 2)]
155148
# Reading
156149
@test read(seekstart(deepcopy(aio.io)), String) == "hello world"
@@ -178,24 +171,42 @@ end
178171
@test Base.annotations(aio) == [(1:5, :tag => 1), (7:11, :tag => 2)] # Should be unchanged
179172
@test write(seek(aio, 0), Base.AnnotatedString("hey-o", [(1:5, :hey => 'o')])) == 5
180173
@test read(seekstart(aio), String) == "hey-o alice"
181-
@test Base.annotations(aio) == [(1:5, :hey => 'o'), (7:11, :tag => 2)] # First annotation should have been entirely replaced
174+
@test Base.annotations(aio) == [(7:11, :tag => 2), (1:5, :hey => 'o')] # First annotation should have been entirely replaced
182175
@test write(seek(aio, 7), Base.AnnotatedString("bbi", [(1:3, :hey => 'a')])) == 3 # a[lic => bbi]e ('alice' => 'abbie')
183176
@test read(seekstart(aio), String) == "hey-o abbie"
184-
@test Base.annotations(aio) == [(1:5, :hey => 'o'), (7:7, :tag => 2), (8:10, :hey => 'a'), (11:11, :tag => 2)]
177+
@test Base.annotations(aio) == [(7:7, :tag => 2), (11:11, :tag => 2), (1:5, :hey => 'o'), (8:10, :hey => 'a')]
185178
@test write(seek(aio, 0), Base.AnnotatedString("ab")) == 2 # Check first annotation's region is adjusted correctly
186179
@test read(seekstart(aio), String) == "aby-o abbie"
187-
@test Base.annotations(aio) == [(3:5, :hey => 'o'), (7:7, :tag => 2), (8:10, :hey => 'a'), (11:11, :tag => 2)]
180+
@test Base.annotations(aio) == [(7:7, :tag => 2), (11:11, :tag => 2), (3:5, :hey => 'o'), (8:10, :hey => 'a')]
188181
@test write(seek(aio, 3), Base.AnnotatedString("ss")) == 2
189182
@test read(seekstart(aio), String) == "abyss abbie"
190-
@test Base.annotations(aio) == [(3:3, :hey => 'o'), (7:7, :tag => 2), (8:10, :hey => 'a'), (11:11, :tag => 2)]
183+
@test Base.annotations(aio) == [(7:7, :tag => 2), (11:11, :tag => 2), (3:3, :hey => 'o'), (8:10, :hey => 'a')]
191184
# Writing one buffer to another
192185
newaio = Base.AnnotatedIOBuffer()
193186
@test write(newaio, seekstart(aio)) == 11
194187
@test read(seekstart(newaio), String) == "abyss abbie"
195188
@test Base.annotations(newaio) == Base.annotations(aio)
196189
@test write(seek(newaio, 5), seek(aio, 5)) == 6
197-
@test Base.annotations(newaio) == Base.annotations(aio)
190+
@test sort(Base.annotations(newaio)) == sort(Base.annotations(aio))
198191
@test write(newaio, seek(aio, 5)) == 6
199192
@test read(seekstart(newaio), String) == "abyss abbie abbie"
200-
@test Base.annotations(newaio) == vcat(Base.annotations(aio), [(13:13, :tag => 2), (14:16, :hey => 'a'), (17:17, :tag => 2)])
193+
@test sort(Base.annotations(newaio)) == sort(vcat(Base.annotations(aio), [(13:13, :tag => 2), (14:16, :hey => 'a'), (17:17, :tag => 2)]))
194+
# The `_insert_annotations!` cautious-merging optimisation
195+
aio = Base.AnnotatedIOBuffer()
196+
@test write(aio, Base.AnnotatedChar('a', [:a => 1, :b => 2])) == 1
197+
@test Base.annotations(aio) == [(1:1, :a => 1), (1:1, :b => 2)]
198+
@test write(aio, Base.AnnotatedChar('b', [:a => 1, :b => 2])) == 1
199+
@test Base.annotations(aio) == [(1:2, :a => 1), (1:2, :b => 2)]
200+
let aio2 = copy(aio) # A different start makes merging too risky to do.
201+
@test write(aio2, Base.AnnotatedChar('c', [:a => 0, :b => 2])) == 1
202+
@test Base.annotations(aio2) == [(1:2, :a => 1), (1:2, :b => 2), (3:3, :a => 0), (3:3, :b => 2)]
203+
end
204+
let aio2 = copy(aio) # Merging some run of the most recent annotations is fine though.
205+
@test write(aio2, Base.AnnotatedChar('c', [:b => 2])) == 1
206+
@test Base.annotations(aio2) == [(1:2, :a => 1), (1:3, :b => 2)]
207+
end
208+
let aio2 = copy(aio) # ...and any subsequent annotations after a matching run can just be copied over.
209+
@test write(aio2, Base.AnnotatedChar('c', [:b => 2, :c => 3, :d => 4])) == 1
210+
@test Base.annotations(aio2) == [(1:2, :a => 1), (1:3, :b => 2), (3:3, :c => 3), (3:3, :d => 4)]
211+
end
201212
end

0 commit comments

Comments
 (0)