Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,9 @@ Distances = "0.7, 0.8, 0.9, 0.10"
julia = "0.7, 1"

[extras]
RDatasets = "ce6b1742-4840-55fa-b093-852dadbb1d8b"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["Random", "Test"]
test = ["Random", "RDatasets", "Test"]
38 changes: 18 additions & 20 deletions src/Loess.jl
Original file line number Diff line number Diff line change
Expand Up @@ -29,21 +29,27 @@ Args:
- `span`: The degree of smoothing, typically in [0,1]. Smaller values result in smaller
local context in fitting.
- `degree`: Polynomial degree.
- `cell`: Control parameter for bucket size. Internal interpolation nodes will be
added to the K-D tree until the number of bucket element is below `n * cell * span`.

Returns:
A fit `LoessModel`.

"""
function loess(xs::AbstractMatrix{T}, ys::AbstractVector{T};
normalize::Bool=true,
span::AbstractFloat=0.75,
degree::Integer=2) where T<:AbstractFloat
function loess(
xs::AbstractMatrix{T},
ys::AbstractVector{T};
normalize::Bool = true,
span::AbstractFloat = 0.75,
degree::Integer = 2,
cell::AbstractFloat = 0.2
) where T<:AbstractFloat
if size(xs, 1) != size(ys, 1)
throw(DimensionMismatch("Predictor and response arrays must of the same length"))
end

n, m = size(xs)
q = ceil(Int, (span * n))
q = trunc(Int, (span * n))
Comment thread
andreasnoack marked this conversation as resolved.
Outdated
if q < degree + 1
throw(ArgumentError("neighborhood size must be larger than degree+1=$(degree + 1) but was $q. Try increasing the value of span."))
end
Expand All @@ -55,7 +61,7 @@ function loess(xs::AbstractMatrix{T}, ys::AbstractVector{T};
xs = tnormalize!(copy(xs))
end

kdtree = KDTree(xs, 0.05 * span)
kdtree = KDTree(xs, cell * span, 0.0)
Comment thread
andreasnoack marked this conversation as resolved.
Outdated

# map verticies to their index in the bs coefficient matrix
verts = Dict{Vector{T}, Int}()
Expand Down Expand Up @@ -89,8 +95,8 @@ function loess(xs::AbstractMatrix{T}, ys::AbstractVector{T};

Comment thread
andreasnoack marked this conversation as resolved.
for i in 1:q
pᵢ = perm[i]
w = tricubic(ds[pᵢ] / dmax)
us[i,1] = w
w = sqrt(tricubic(ds[pᵢ] / dmax))
us[i, 1] = w
Comment thread
andreasnoack marked this conversation as resolved.
for j in 1:m
x = xs[pᵢ, j]
wxl = w
Expand Down Expand Up @@ -135,12 +141,11 @@ end
# Returns:
# A length n' vector of predicted response values.
#
function predict(model::LoessModel{T}, z::T) where T <: AbstractFloat
predict(model, T[z])
function predict(model::LoessModel, z::Number)
Comment thread
andreasnoack marked this conversation as resolved.
Outdated
predict(model, [z])
Comment thread
andreasnoack marked this conversation as resolved.
end


function predict(model::LoessModel{T}, zs::AbstractVector{T}) where T <: AbstractFloat
function predict(model::LoessModel, zs::AbstractVector)
Comment thread
andreasnoack marked this conversation as resolved.
m = size(model.xs, 2)

# in the univariate case, interpret a non-singleton zs as vector of
Expand Down Expand Up @@ -178,14 +183,7 @@ function predict(model::LoessModel{T}, zs::AbstractVector{T}) where T <: Abstrac
end


function predict(model::LoessModel{T}, zs::AbstractMatrix{T}) where T <: AbstractFloat
ys = Array{T}(undef, size(zs, 1))
for i in 1:size(zs, 1)
# the vec() here is not necessary on 0.5 anymore
ys[i] = predict(model, vec(zs[i,:]))
end
ys
end
predict(model::LoessModel, zs::AbstractMatrix) = [predict(model, zs[i,:]) for i in 1:size(zs, 1)]
Comment thread
andreasnoack marked this conversation as resolved.
Outdated

"""
tricubic(u)
Expand Down
78 changes: 60 additions & 18 deletions src/kd.jl
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,9 @@ Returns:

"""
function KDTree(xs::AbstractMatrix{T},
leaf_size_factor=0.05,
leaf_diameter_factor=0.0) where T <: AbstractFloat
leaf_size_factor,
leaf_diameter_factor
) where T <: AbstractFloat

n, m = size(xs)
perm = collect(1:n)
Expand All @@ -55,11 +56,14 @@ function KDTree(xs::AbstractMatrix{T},

diam = diameter(bounds)

leaf_size_cutoff = ceil(Int, leaf_size_factor * n)
# leaf_size_cutoff = ceil(Int, leaf_size_factor * n)
leaf_size_cutoff = leaf_size_factor * n
Comment thread
andreasnoack marked this conversation as resolved.
Outdated
leaf_diameter_cutoff = leaf_diameter_factor * diam
verts = Set{Vector{T}}()

# Add a vertex for each corner of the hypercube
# This deviates from the original implementation from the paper where the outer verted were
# made a bit wider than the data limits.
for vert in Iterators.product([bounds[:,j] for j in 1:m]...)
push!(verts, T[vert...])
end
Expand Down Expand Up @@ -113,14 +117,15 @@ Returns:
Either a `KDLeafNode` or a `KDInternalNode`
"""
function build_kdtree(xs::AbstractMatrix{T},
perm::AbstractArray,
perm::AbstractVector,
bounds::Matrix{T},
leaf_size_cutoff::Int,
leaf_diameter_cutoff::T,
leaf_size_cutoff::Real,
leaf_diameter_cutoff::Real,
verts::Set{Vector{T}}) where T
n, m = size(xs)

if length(perm) <= leaf_size_cutoff || diameter(bounds) <= leaf_diameter_cutoff
@debug "Creating leaf node" length(perm) leaf_size_cutoff diameter(bounds) leaf_diameter_cutoff
return KDLeafNode()
end

Expand All @@ -141,19 +146,56 @@ function build_kdtree(xs::AbstractMatrix{T},
end
end

# find the median and partition
if isodd(length(perm))
mid = (length(perm) + 1) ÷ 2
partialsort!(perm, mid, by=i -> xs[i, j])
med = xs[perm[mid], j]
mid1 = mid
mid2 = mid + 1
else
mid1 = length(perm) ÷ 2
# Find the "median" and partition
#
# The aim of the alogorihtm is to split the data recursively in two roughly equally sized
Comment thread
andreasnoack marked this conversation as resolved.
Outdated
# subsets. To do so, we'll use the median element of x[:, j] but there are a
# few corner cases that require some care: sets with an even number of elements
# doesn't have a unique median and ties. Below, we'll list the possibilities.
#
# - Odd number of elements and no ties, e.g. [1, 2, 3]: the median is
# unambiguously 2 and split the data into [1, 2] and [3]
#
# - Even number of element and no ties, e.g. [1, 2, 3, 4]: we choose the left middle
# value 2 as median and split the dato into [1, 2] and [3, 4]
Comment thread
andreasnoack marked this conversation as resolved.
Outdated
#
# - Ties will cause a search to the change in value that divides the set most evenly.
# E.g. [1, 2, 2, 3] uses 2 as median value to split the data into [1, 2, 2] and [3]
# but [1, 1, 2, 2, 2] uses 1 to split into [1, 1] and [2, 2, 2] even though 1 is
# not a proper median value. This avoids that the same value in two buckets.
Comment thread
andreasnoack marked this conversation as resolved.
Outdated
#
# The details here are reversed engineered from the C/Fortran implementation wrapped
# by R and also distribtued on NETLIB.
mid = (length(perm) + 1) ÷ 2
@debug "Candidate median index and median value" mid xs[perm[mid], j]
Comment thread
andreasnoack marked this conversation as resolved.

offset = 0
local mid1, mid2
while true
mid1 = mid + offset
mid2 = mid1 + 1
partialsort!(perm, mid1:mid2, by=i -> xs[i, j])
med = (xs[perm[mid1], j] + xs[perm[mid2], j]) / 2
if mid1 < 1
@debug "mid1 is zero. All elements are identical. Creating vertex and then two leaves" mid1 length(perm) xs[perm[mid], j]
offset = mid1 = 0
mid2 = length(perm) + 1
break
end
if mid2 > length(perm)
@debug "mid2 is out of bounds. Continuing with negative offset" mid2 length(perm) offset
offset = -offset + (offset <= 0)
Comment thread
andreasnoack marked this conversation as resolved.
continue
end
p12 = partialsort!(perm, mid1:mid2, by = i -> xs[i, j])
if xs[p12[1], j] == xs[p12[2], j]
@debug "tie! Adjusting offset" xs[p12[1], j] xs[p12[2], j] offset
offset = -offset + (offset <= 0)
Comment thread
andreasnoack marked this conversation as resolved.
else
break
end
end
mid += offset
med = xs[perm[mid], j]
@debug "Accepted median index and median value" mid med

leftbounds = copy(bounds)
leftbounds[2, j] = med
Expand Down Expand Up @@ -198,7 +240,7 @@ end
Traverse the tree `kdtree` to the bottom and return the verticies of
the bounding hypercube of the leaf node containing the point `x`.
"""
function traverse(kdtree::KDTree{T}, x::AbstractVector{T}) where T
function traverse(kdtree::KDTree, x::AbstractVector)
m = size(kdtree.bounds, 2)

if length(x) != m
Expand Down
67 changes: 45 additions & 22 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,31 +4,38 @@ using Loess
using Test
using Random
using Statistics
using RDatasets

Random.seed!(100)
xs = 10 .* rand(100)
ys = sin.(xs) .+ 0.5 * rand(100)
@testset "basic" begin
Random.seed!(100)
xs = 10 .* rand(100)
ys = sin.(xs) .+ 0.5 * rand(100)

model = loess(xs, ys)
model = loess(xs, ys)

us = collect(minimum(xs):0.1:maximum(xs))
vs = predict(model, us)
us = collect(minimum(xs):0.1:maximum(xs))
vs = predict(model, us)

@test minimum(vs) >= -1.1
@test maximum(vs) <= +1.1
@test minimum(vs) >= -1.1
@test maximum(vs) <= +1.1

x = [13.0,14.0,14.35,15.0,16.0]
y = [0.369486, 0.355579, 0.3545, 0.356952, 0.36883]
model = loess(x,y)
@test Loess.predict(model,x) ≈ y
end

x = [13.0,14.0,14.35,15.0,16.0]
y = [0.369486, 0.355579, 0.3545, 0.356952, 0.36883]
model = loess(x,y)
@test Loess.predict(model,x) ≈ y

let x = 1:10, y = sin.(1:10)
@testset "sine" begin
x = 1:10
y = sin.(1:10)
model = loess(x, y)
@test model.xs == reshape(collect(Float64, 1:10), 10, 1)
@test model.ys == y
pred = [1.02866, 0.798561, 0.533528, 0.253913, -0.0325918, -0.319578, -0.648763]
@test predict(model, 1.0:0.5:4.0) ≈ pred atol=1e-5
# Test values from R's loess
pred = [1.0291697629100347, 0.5449767438422979, -0.0006429381818782, -0.7073242734363788, -0.8962317382167664, -0.2611478760650811, 0.6140341389957057, 0.8482042073056935, 0.4572835622716546, -0.5459427790447537]
# The last element differs by a bit because the implementation that R uses widens the outer vertices a bit
@test predict(model, x)[1:end - 1] ≈ pred[1:end - 1] atol=1e-5
@test_broken predict(model, x)[end] ≈ pred[end] atol=1e-5
end

@test_throws DimensionMismatch loess([1.0 2.0; 3.0 4.0], [1.0])
Expand All @@ -38,18 +45,34 @@ end
x = [1.0, 2.0, 3.0, 4.0]
y = [1.0, 2.0, 3.0, 4.0]
@test_throws ArgumentError("neighborhood size must be larger than degree+1=3 but was 1. Try increasing the value of span.") loess(x, y, span = 0.25)
@test_throws ArgumentError("neighborhood size must be larger than degree+1=3 but was 2. Try increasing the value of span.") loess(x, y, span = 0.33)
@test_throws ArgumentError("neighborhood size must be larger than degree+1=3 but was 1. Try increasing the value of span.") loess(x, y, span = 0.33)
@test predict(loess(x, y), x) ≈ x
end

@testset "Example 2" begin
x = [1.0, 1.0, 2.0, 3.0, 4.0, 4.0]
y = [1.0, 1.0, 2.0, 3.0, 4.0, 4.0]
@test_throws ArgumentError("neighborhood size must be larger than degree+1=3 but was 2. Try increasing the value of span.") loess(x, y, span = 0.33)
# For 0.4 and 0.5 these current don't hit the middle values. I suspect
# the issue is related to the ties in x.
@test_broken predict(loess(x, y, span = 0.4), x) ≈ x
@test_broken predict(loess(x, y, span = 0.5), x) ≈ x
@test_throws ArgumentError("neighborhood size must be larger than degree+1=3 but was 1. Try increasing the value of span.") loess(x, y, span = 0.33)
@test_throws ArgumentError("neighborhood size must be larger than degree+1=3 but was 2. Try increasing the value of span.") predict(loess(x, y, span = 0.4), x) ≈ x
@test predict(loess(x, y, span = 0.5), x) ≈ x
@test predict(loess(x, y, span = 0.6), x) ≈ x
end
end

@testset "cars" begin
df = dataset("datasets", "cars")
ft = loess(df.Speed, df.Dist)

# Test values from R's loess expect outer vertices as they are made wider in the R/C/Fortran implementation
@testset "vertices" begin
@test sort(getindex.(keys(ft.verts))) == [4.0, 8.0, 10.0, 12.0, 13.0, 14.0, 15.0, 17.0, 19.0, 22.0, 25.0]
end

@testset "predict" begin
# In R this is `predict(cars.lo, data.frame(speed = seq(5, 25, 1)))`.
Rvals = [7.797353, 10.002308, 12.499786, 15.281082, 18.446568, 21.865315, 25.517015, 29.350386, 33.230660, 37.167935, 41.205226, 45.055736, 48.355889, 49.824812, 51.986702, 56.461318, 61.959729, 68.569313, 76.316068, 85.212121, 95.324047]
@test predict(ft, [10, 15, 22]) ≈ Rvals[[6, 11, 18]] rtol=1e-5
# The interpolated values are broken until https://github.com/JuliaStats/Loess.jl/pull/63 is merged
@test_broken predict(ft) ≈ Rvals rtol=1e-5
end
end