Skip to content
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name = "Loess"
uuid = "4345ca2d-374a-55d4-8d30-97f9976e7612"
version = "0.6.1"
version = "0.6.2"

[deps]
Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
Expand Down
74 changes: 43 additions & 31 deletions src/kd.jl
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@ function KDTree(
) where T <: AbstractFloat

n, m = size(xs)
perm = collect(1:n)

bounds = Array{T}(undef, 2, m)
for j in 1:m
Expand All @@ -63,14 +62,13 @@ function KDTree(
push!(verts, T[vert...])
end

perm = collect(1:n)
root = build_kdtree(xs, perm, bounds, leaf_size_cutoff, leaf_diameter_cutoff, verts)

KDTree(convert(Matrix{T}, xs), collect(1:n), root, verts, bounds)
KDTree(convert(Matrix{T}, xs), perm, root, verts, bounds)
end




"""
diameter(bounds)

Expand All @@ -88,6 +86,32 @@ function diameter(bounds::Matrix)
euclidean(vec(bounds[1,:]), vec(bounds[2,:]))
end

"""
_select_j(xs::AbstractMatrix{T})

Select the column for sorting the rows xs based on the column with the largest spread.
"""
function _select_j(xs::AbstractMatrix{T}) where {T <: AbstractFloat}
size(xs, 2) == 1 && return 1

# split on the dimension with the largest spread
# maxspread, j = findmax(maximum(xs[perm, k]) - minimum(xs[perm, k]) for k in 1:m)
j = 1
maxspread = 0
@inbounds for k in axes(xs, 2)
xmin = Inf
xmax = -Inf
@inbounds for i in axes(xs, 1)
xmin = min(xmin, xs[i, k])
xmax = max(xmax, xs[i, k])
end
if xmax - xmin > maxspread
maxspread = xmax - xmin
j = k
end
end
return j
end

"""
build_kdtree(xs, perm, bounds, leaf_size_cutoff, leaf_diameter_cutoff, verts)
Expand Down Expand Up @@ -121,30 +145,19 @@ function build_kdtree(xs::AbstractMatrix{T},
Base.require_one_based_indexing(xs)
Base.require_one_based_indexing(perm)

j = _select_j(xs)
n, m = size(xs)
if !issorted(view(xs, perm, j))
@debug "received unsorted data, sorting"
sortperm!(perm, view(xs, :, j))
end
xjs = view(xs, perm, j)

if length(perm) <= leaf_size_cutoff || diameter(bounds) <= leaf_diameter_cutoff
@debug "Creating leaf node" length(perm) leaf_size_cutoff diameter(bounds) leaf_diameter_cutoff
return nothing
end

# split on the dimension with the largest spread
# maxspread, j = findmax(maximum(xs[perm, k]) - minimum(xs[perm, k]) for k in 1:m)
j = 1
maxspread = 0
for k in 1:m
xmin = Inf
xmax = -Inf
for i in perm
xmin = min(xmin, xs[i, k])
xmax = max(xmax, xs[i, k])
end
if xmax - xmin > maxspread
maxspread = xmax - xmin
j = k
end
end

# Find the "median" and partition
#
# The aim of the algorithm is to split the data recursively in two roughly equally sized
Expand All @@ -165,37 +178,36 @@ function build_kdtree(xs::AbstractMatrix{T},
#
# The details here are reversed engineered from the C/Fortran implementation wrapped
# by R and also distribtued on NETLIB.
mid = (length(perm) + 1) ÷ 2
@debug "Candidate median index and median value" mid xs[perm[mid], j]
mid = (length(xjs) + 1) ÷ 2
@debug "Candidate median index and median value" mid xs[mid, j]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this be xjs[mid] instead of xs[mid, j] and likewise below?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

good catch, yes


offset = 0
local mid1, mid2
while true
mid1 = mid + offset
mid2 = mid1 + 1
if mid1 < 1
@debug "mid1 is zero. All elements are identical. Creating vertex and then two leaves" mid1 length(perm) xs[perm[mid], j]
@debug "mid1 is zero. All elements are identical. Creating vertex and then two leaves" mid1 length(xjs) xs[mid, j]
offset = mid1 = 0
mid2 = length(perm) + 1
mid2 = length(xjs) + 1
break
end
if mid2 > length(perm)
@debug "mid2 is out of bounds. Continuing with negative offset" mid2 length(perm) offset
if mid2 > length(xjs)
@debug "mid2 is out of bounds. Continuing with negative offset" mid2 length(xjs) offset
# This makes the offset 0, 1, -1, 2, -2, ...
offset = -offset + (offset <= 0)
continue
end
p12 = partialsort!(perm, mid1:mid2, by = i -> xs[i, j])
if xs[p12[1], j] == xs[p12[2], j]
@debug "tie! Adjusting offset" xs[p12[1], j] xs[p12[2], j] offset
if xjs[mid1] == xjs[mid2]
# @debug "tie! Adjusting offset" xs[p12[1], j] xs[p12[2], j] offset
# This makes the offset 0, 1, -1, 2, -2, ...
offset = -offset + (offset <= 0)
else
break
end
end
mid += offset
med = xs[perm[mid], j]
med = xjs[mid]
@debug "Accepted median index and median value" mid med

leftbounds = copy(bounds)
Expand Down
14 changes: 14 additions & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,20 @@ end
@test_broken predict(model, x)[end] ≈ pred[end] atol=1e-5
end

@testset "lots of ties" begin
# adapted from https://github.com/JuliaStats/Loess.jl/pull/74#discussion_r1294303522
x = repeat([π/4*i for i in -20:20], inner=101)
y = sin.(x)

model = loess(x,y; span=0.2)
for i in -3:3
@test predict(model, i * π) ≈ 0 atol=1e-12
# not great tolerance but loess also struggles to capture the sine peaks
@test abs(predict(model, i * π + π / 2 )) ≈ 0.9 atol=0.1
end

end

@test_throws DimensionMismatch loess([1.0 2.0; 3.0 4.0], [1.0])

@testset "Issue 28" begin
Expand Down