JuliaStats · andreasnoack · Apr 11, 2023 · Mar 29, 2023 · Apr 11, 2023
diff --git a/Project.toml b/Project.toml
@@ -12,8 +12,9 @@ Distances = "0.7, 0.8, 0.9, 0.10"
 julia = "0.7, 1"
 
 [extras]
+RDatasets = "ce6b1742-4840-55fa-b093-852dadbb1d8b"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["Random", "Test"]
+test = ["Random", "RDatasets", "Test"]
diff --git a/src/Loess.jl b/src/Loess.jl
@@ -29,21 +29,27 @@ Args:
   - `span`: The degree of smoothing, typically in [0,1]. Smaller values result in smaller
       local context in fitting.
   - `degree`: Polynomial degree.
+  - `cell`: Control parameter for bucket size. Internal interpolation nodes will be
+    added to the K-D tree until the number of bucket element is below `n * cell * span`.
 
 Returns:
   A fit `LoessModel`.
 
 """
-function loess(xs::AbstractMatrix{T}, ys::AbstractVector{T};
-               normalize::Bool=true,
-               span::AbstractFloat=0.75,
-               degree::Integer=2) where T<:AbstractFloat
+function loess(
+    xs::AbstractMatrix{T},
+    ys::AbstractVector{T};
+    normalize::Bool = true,
+    span::AbstractFloat = 0.75,
+    degree::Integer = 2,
+    cell::AbstractFloat = 0.2
+) where T<:AbstractFloat
     if size(xs, 1) != size(ys, 1)
         throw(DimensionMismatch("Predictor and response arrays must of the same length"))
     end
 
     n, m = size(xs)
-    q = ceil(Int, (span * n))
+    q = trunc(Int, (span * n))
     if q < degree + 1
         throw(ArgumentError("neighborhood size must be larger than degree+1=$(degree + 1) but was $q. Try increasing the value of span."))
     end
@@ -55,7 +61,7 @@ function loess(xs::AbstractMatrix{T}, ys::AbstractVector{T};
         xs = tnormalize!(copy(xs))
     end
 
-    kdtree = KDTree(xs, 0.05 * span)
+    kdtree = KDTree(xs, cell * span, 0.0)
 
     # map verticies to their index in the bs coefficient matrix
     verts = Dict{Vector{T}, Int}()
@@ -89,8 +95,8 @@ function loess(xs::AbstractMatrix{T}, ys::AbstractVector{T};
 
         for i in 1:q
             pᵢ = perm[i]
-            w = tricubic(ds[pᵢ] / dmax)
-            us[i,1] = w
+            w = sqrt(tricubic(ds[pᵢ] / dmax))
+            us[i, 1] = w
             for j in 1:m
                 x = xs[pᵢ, j]
                 wxl = w
@@ -135,12 +141,11 @@ end
 # Returns:
 #   A length n' vector of predicted response values.
 #
-function predict(model::LoessModel{T}, z::T) where T <: AbstractFloat
-    predict(model, T[z])
+function predict(model::LoessModel, z::Number)
+    predict(model, [z])
 end
 
-
-function predict(model::LoessModel{T}, zs::AbstractVector{T}) where T <: AbstractFloat
+function predict(model::LoessModel, zs::AbstractVector)
     m = size(model.xs, 2)
 
     # in the univariate case, interpret a non-singleton zs as vector of
@@ -178,14 +183,7 @@ function predict(model::LoessModel{T}, zs::AbstractVector{T}) where T <: Abstrac
 end
 
 
-function predict(model::LoessModel{T}, zs::AbstractMatrix{T}) where T <: AbstractFloat
-    ys = Array{T}(undef, size(zs, 1))
-    for i in 1:size(zs, 1)
-        # the vec() here is not necessary on 0.5 anymore
-        ys[i] = predict(model, vec(zs[i,:]))
-    end
-    ys
-end
+predict(model::LoessModel, zs::AbstractMatrix) = [predict(model, zs[i,:]) for i in 1:size(zs, 1)]
 
 """
     tricubic(u)

diff --git a/src/kd.jl b/src/kd.jl
@@ -40,8 +40,9 @@ Returns:
 
 """
 function KDTree(xs::AbstractMatrix{T},
-                leaf_size_factor=0.05,
-                leaf_diameter_factor=0.0) where T <: AbstractFloat
+    leaf_size_factor,
+    leaf_diameter_factor
+) where T <: AbstractFloat
 
     n, m = size(xs)
     perm = collect(1:n)
@@ -55,11 +56,14 @@ function KDTree(xs::AbstractMatrix{T},
 
     diam = diameter(bounds)
 
-    leaf_size_cutoff = ceil(Int, leaf_size_factor * n)
+    # leaf_size_cutoff = ceil(Int, leaf_size_factor * n)
+    leaf_size_cutoff = leaf_size_factor * n
     leaf_diameter_cutoff = leaf_diameter_factor * diam
     verts = Set{Vector{T}}()
 
     # Add a vertex for each corner of the hypercube
+    # This deviates from the original implementation from the paper where the outer verted were
+    # made a bit wider than the data limits.
     for vert in Iterators.product([bounds[:,j] for j in 1:m]...)
         push!(verts, T[vert...])
     end
@@ -113,14 +117,15 @@ Returns:
   Either a `KDLeafNode` or a `KDInternalNode`
 """
 function build_kdtree(xs::AbstractMatrix{T},
-                      perm::AbstractArray,
+                      perm::AbstractVector,
                       bounds::Matrix{T},
-                      leaf_size_cutoff::Int,
-                      leaf_diameter_cutoff::T,
+                      leaf_size_cutoff::Real,
+                      leaf_diameter_cutoff::Real,
                       verts::Set{Vector{T}}) where T
     n, m = size(xs)
 
     if length(perm) <= leaf_size_cutoff || diameter(bounds) <= leaf_diameter_cutoff
+        @debug "Creating leaf node" length(perm) leaf_size_cutoff diameter(bounds) leaf_diameter_cutoff
         return KDLeafNode()
     end
 
@@ -141,19 +146,56 @@ function build_kdtree(xs::AbstractMatrix{T},
         end
     end
 
-    # find the median and partition
-    if isodd(length(perm))
-        mid = (length(perm) + 1) ÷ 2
-        partialsort!(perm, mid, by=i -> xs[i, j])
-        med = xs[perm[mid], j]
-        mid1 = mid
-        mid2 = mid + 1
-    else
-        mid1 = length(perm) ÷ 2
+    # Find the "median" and partition
+    #
+    # The aim of the alogorihtm is to split the data recursively in two roughly equally sized
+    # subsets. To do so, we'll use the median element of x[:, j] but there are a
+    # few corner cases that require some care: sets with an even number of elements
+    # doesn't have a unique median and ties. Below, we'll list the possibilities.
+    #
+    # - Odd number of elements and no ties, e.g. [1, 2, 3]: the median is
+    #   unambiguously 2 and split the data into [1, 2] and [3]
+    #
+    # - Even number of element and no ties, e.g. [1, 2, 3, 4]: we choose the left middle
+    #   value 2 as median and split the dato into [1, 2] and [3, 4]
+    #
+    # - Ties will cause a search to the change in value that divides the set most evenly.
+    #   E.g. [1, 2, 2, 3] uses 2 as median value to split the data into [1, 2, 2] and [3]
+    #   but [1, 1, 2, 2, 2] uses 1 to split into [1, 1] and [2, 2, 2] even though 1 is
+    #   not a proper median value. This avoids that the same value in two buckets.
+    #
+    # The details here are reversed engineered from the C/Fortran implementation wrapped
+    # by R and also distribtued on NETLIB.
+    mid = (length(perm) + 1) ÷ 2
+    @debug "Candidate median index and median value" mid xs[perm[mid], j]
+
+    offset = 0
+    local mid1, mid2
+    while true
+        mid1 = mid + offset
         mid2 = mid1 + 1
-        partialsort!(perm, mid1:mid2, by=i -> xs[i, j])
-        med = (xs[perm[mid1], j] + xs[perm[mid2], j]) / 2
+        if mid1 < 1
+            @debug "mid1 is zero. All elements are identical. Creating vertex and then two leaves" mid1 length(perm) xs[perm[mid], j]
+            offset = mid1 = 0
+            mid2 = length(perm) + 1
+            break
+        end
+        if mid2 > length(perm)
+            @debug "mid2 is out of bounds. Continuing with negative offset" mid2 length(perm) offset
+            offset = -offset + (offset <= 0)
+            continue
+        end
+        p12 = partialsort!(perm, mid1:mid2, by = i -> xs[i, j])
+        if xs[p12[1], j] == xs[p12[2], j]
+            @debug "tie! Adjusting offset" xs[p12[1], j] xs[p12[2], j] offset
+            offset = -offset + (offset <= 0)
+        else
+            break
+        end
     end
+    mid += offset
+    med = xs[perm[mid], j]
+    @debug "Accepted median index and median value" mid med
 
     leftbounds = copy(bounds)
     leftbounds[2, j] = med
@@ -198,7 +240,7 @@ end
 Traverse the tree `kdtree` to the bottom and return the verticies of
 the bounding hypercube of the leaf node containing the point `x`.
 """
-function traverse(kdtree::KDTree{T}, x::AbstractVector{T}) where T
+function traverse(kdtree::KDTree, x::AbstractVector)
     m = size(kdtree.bounds, 2)
 
     if length(x) != m

diff --git a/test/runtests.jl b/test/runtests.jl
@@ -4,31 +4,38 @@ using Loess
 using Test
 using Random
 using Statistics
+using RDatasets
 
-Random.seed!(100)
-xs = 10 .* rand(100)
-ys = sin.(xs) .+ 0.5 * rand(100)
+@testset "basic" begin
+    Random.seed!(100)
+    xs = 10 .* rand(100)
+    ys = sin.(xs) .+ 0.5 * rand(100)
 
-model = loess(xs, ys)
+    model = loess(xs, ys)
 
-us = collect(minimum(xs):0.1:maximum(xs))
-vs = predict(model, us)
+    us = collect(minimum(xs):0.1:maximum(xs))
+    vs = predict(model, us)
 
-@test minimum(vs) >= -1.1
-@test maximum(vs) <= +1.1
+    @test minimum(vs) >= -1.1
+    @test maximum(vs) <= +1.1
 
+    x = [13.0,14.0,14.35,15.0,16.0]
+    y = [0.369486,  0.355579, 0.3545, 0.356952, 0.36883]
+    model = loess(x,y)
+    @test Loess.predict(model,x) ≈ y
+end
 
-x = [13.0,14.0,14.35,15.0,16.0]
-y = [0.369486,  0.355579, 0.3545, 0.356952, 0.36883]
-model = loess(x,y)
-@test Loess.predict(model,x) ≈ y
-
-let x = 1:10, y = sin.(1:10)
+@testset "sine" begin
+    x = 1:10
+    y = sin.(1:10)
     model = loess(x, y)
     @test model.xs == reshape(collect(Float64, 1:10), 10, 1)
     @test model.ys == y
-    pred = [1.02866, 0.798561, 0.533528, 0.253913, -0.0325918, -0.319578, -0.648763]
-    @test predict(model, 1.0:0.5:4.0) ≈ pred atol=1e-5
+    # Test values from R's loess
+    pred = [1.0291697629100347, 0.5449767438422979, -0.0006429381818782, -0.7073242734363788, -0.8962317382167664, -0.2611478760650811, 0.6140341389957057, 0.8482042073056935, 0.4572835622716546, -0.5459427790447537]
+    # The last element differs by a bit because the implementation that R uses widens the outer vertices a bit
+    @test predict(model, x)[1:end - 1] ≈ pred[1:end - 1] atol=1e-5
+    @test_broken predict(model, x)[end] ≈ pred[end] atol=1e-5
 end
 
 @test_throws DimensionMismatch loess([1.0 2.0; 3.0 4.0], [1.0])
@@ -38,18 +45,34 @@ end
         x = [1.0, 2.0, 3.0, 4.0]
         y = [1.0, 2.0, 3.0, 4.0]
         @test_throws ArgumentError("neighborhood size must be larger than degree+1=3 but was 1. Try increasing the value of span.") loess(x, y, span = 0.25)
-        @test_throws ArgumentError("neighborhood size must be larger than degree+1=3 but was 2. Try increasing the value of span.") loess(x, y, span = 0.33)
+        @test_throws ArgumentError("neighborhood size must be larger than degree+1=3 but was 1. Try increasing the value of span.") loess(x, y, span = 0.33)
         @test predict(loess(x, y), x) ≈ x
     end
 
     @testset "Example 2" begin
         x = [1.0, 1.0, 2.0, 3.0, 4.0, 4.0]
         y = [1.0, 1.0, 2.0, 3.0, 4.0, 4.0]
-        @test_throws ArgumentError("neighborhood size must be larger than degree+1=3 but was 2. Try increasing the value of span.") loess(x, y, span = 0.33)
-        # For 0.4 and 0.5 these current don't hit the middle values. I suspect
-        # the issue is related to the ties in x.
-        @test_broken predict(loess(x, y, span = 0.4), x) ≈ x
-        @test_broken predict(loess(x, y, span = 0.5), x) ≈ x
+        @test_throws ArgumentError("neighborhood size must be larger than degree+1=3 but was 1. Try increasing the value of span.") loess(x, y, span = 0.33)
+        @test_throws ArgumentError("neighborhood size must be larger than degree+1=3 but was 2. Try increasing the value of span.") predict(loess(x, y, span = 0.4), x) ≈ x
+        @test predict(loess(x, y, span = 0.5), x) ≈ x
         @test predict(loess(x, y, span = 0.6), x) ≈ x
     end
 end
+
+@testset "cars" begin
+    df = dataset("datasets", "cars")
+    ft = loess(df.Speed, df.Dist)
+
+    # Test values from R's loess expect outer vertices as they are made wider in the R/C/Fortran implementation
+    @testset "vertices" begin
+        @test sort(getindex.(keys(ft.verts))) == [4.0, 8.0, 10.0, 12.0, 13.0, 14.0, 15.0, 17.0, 19.0, 22.0, 25.0]
+    end
+
+    @testset "predict" begin
+        # In R this is `predict(cars.lo, data.frame(speed = seq(5, 25, 1)))`.
+        Rvals = [7.797353, 10.002308, 12.499786, 15.281082, 18.446568, 21.865315, 25.517015, 29.350386, 33.230660, 37.167935, 41.205226, 45.055736, 48.355889, 49.824812, 51.986702, 56.461318, 61.959729, 68.569313, 76.316068, 85.212121, 95.324047]
+        @test predict(ft, [10, 15, 22]) ≈ Rvals[[6, 11, 18]] rtol=1e-5
+        # The interpolated values are broken until https://github.com/JuliaStats/Loess.jl/pull/63 is merged
+        @test_broken predict(ft) ≈ Rvals rtol=1e-5
+    end
+end