diff --git a/NEWS.md b/NEWS.md
index fe15fce7c9..e6f706aa3f 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -76,6 +76,11 @@
 * Make `transform!` on `SubDataFrame` faster
   ([#3070](https://github.com/JuliaData/DataFrames.jl/pull/3070))
 
+## Integration changes
+
+* Support `Tables.subset` and move `ByRow` definition to Tables.jl
+  ([#3158](https://github.com/JuliaData/DataFrames.jl/pull/3158))
+
 # DataFrames.jl v1.3.4 Patch Release Notes
 
 ## Bug fixes
diff --git a/Project.toml b/Project.toml
index 8106f2cc4a..033bb55b91 100644
--- a/Project.toml
+++ b/Project.toml
@@ -36,7 +36,7 @@ Reexport = "0.1, 0.2, 1"
 ShiftedArrays = "1"
 SortingAlgorithms = "0.1, 0.2, 0.3, 1"
 TableTraits = "0.4, 1"
-Tables = "1.2"
+Tables = "1.8.1"
 Unitful = "1"
 julia = "1"
 
diff --git a/docs/src/lib/types.md b/docs/src/lib/types.md
index f48bbb44d1..4860a15ffd 100644
--- a/docs/src/lib/types.md
+++ b/docs/src/lib/types.md
@@ -126,7 +126,6 @@ without caution because:
 ```@docs
 AbstractDataFrame
 AsTable
-ByRow
 DataFrame
 DataFrameRow
 GroupedDataFrame
diff --git a/src/DataFrames.jl b/src/DataFrames.jl
index 60be63d8ca..90e22ff3a8 100644
--- a/src/DataFrames.jl
+++ b/src/DataFrames.jl
@@ -9,6 +9,7 @@ import LinearAlgebra: norm
 using Markdown
 using PrettyTables
 using Random
+using Tables: ByRow
 
 import DataAPI,
        DataAPI.allcombinations,
diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl
index da35a28a34..75e09991d2 100755
--- a/src/abstractdataframe/selection.jl
+++ b/src/abstractdataframe/selection.jl
@@ -309,33 +309,6 @@ function broadcast_pair(df::AbstractDataFrame, @nospecialize(p::AbstractVecOrMat
     end
 end
 
-"""
-    ByRow
-
-A type used for selection operations to signal that the wrapped function should
-be applied to each element (row) of the selection.
-
-The wrapped function is called exactly once for each element.
-This differs from `map` and `broadcast`, which assume for some types of
-source vectors (e.g. `SparseVector`) that the wrapped function is pure,
-allowing them to call the function only once for multiple equal values.
-When using such types, for maximal performance with pure functions
-which are relatively costly, use `x -> map(f, x)` instead of `ByRow(f)`.
-
-Note that `ByRow` always collects values returned by `fun` in a vector.
-"""
-struct ByRow{T} <: Function
-    fun::T
-end
-
-# invoke the generic AbstractVector function to ensure function is called
-# exactly once for each element
-(f::ByRow)(cols::AbstractVector...) =
-    invoke(map,
-           Tuple{typeof(f.fun), ntuple(i -> AbstractVector, length(cols))...},
-           f.fun, cols...)
-(f::ByRow)(table::NamedTuple) = [f.fun(nt) for nt in Tables.namedtupleiterator(table)]
-
 # add a method to funname defined in other/utils.jl
 funname(row::ByRow) = funname(row.fun)
 
diff --git a/src/other/tables.jl b/src/other/tables.jl
index 09cf1f634d..7aa31465f2 100644
--- a/src/other/tables.jl
+++ b/src/other/tables.jl
@@ -103,3 +103,16 @@ IteratorInterfaceExtensions.getiterator(df::AbstractDataFrame) =
     Tables.datavaluerows(Tables.columntable(df))
 IteratorInterfaceExtensions.isiterable(x::AbstractDataFrame) = true
 TableTraits.isiterabletable(x::AbstractDataFrame) = true
+
+# Tables.jl row-subsetting interface for data frames.
+# `view=true` returns a view of `df`; otherwise rows are selected with `df[inds, :]`.
+# A selection that yields a `DataFrameRow` is kept as is unless `view=false`,
+# in which case it is passed through `copy` (the tests expect a `NamedTuple`).
+@inline function Tables.subset(df::AbstractDataFrame, inds; view::Union{Bool, Nothing}=nothing)
+    res = view === true ? DataFrames.view(df, inds, :) : df[inds, :]
+    if res isa DataFrameRow && view === false
+        return copy(res)
+    else
+        return res
+    end
+end
diff --git a/test/grouping.jl b/test/grouping.jl
index b7017d6ad7..6fcc8cc70c 100644
--- a/test/grouping.jl
+++ b/test/grouping.jl
@@ -3461,7 +3461,9 @@ end
     df = DataFrame(id=[1, 1, 2, 3, 3, 1], x=1:6)
     gdf = groupby_checked(df, :id)
     @test_throws ArgumentError combine(gdf, :x, :x)
-    @test_throws ErrorException combine(gdf, :x => (x -> Dict("a" => [1])) => AsTable)
+    @test_throws ErrorException combine(gdf, :x => (x -> Dict("a" => 1)) => AsTable)
+    # changed in Tables.jl 1.8
+    @test combine(gdf, :x => (x -> Dict("a" => [1])) => AsTable) == DataFrame(id=1:3, a=1)
     @test_throws ErrorException combine(gdf, :x => (x -> Dict(:a => 1)) => AsTable)
     @test_throws ArgumentError combine(gdf, sdf -> sdf.id[1] == 1 ? Ref(1) : [1])
     @test_throws ArgumentError combine(gdf, sdf -> sdf.id[1] == 2 ? Ref(1) : [1])
diff --git a/test/tables.jl b/test/tables.jl
index 021bcfe880..5bad91216b 100644
--- a/test/tables.jl
+++ b/test/tables.jl
@@ -323,4 +323,48 @@ end
     @test DataFrame === @inferred Tables.materializer(DataFrames.DataFrameColumns)
 end
 
+@testset "Tables.subset" begin
+    df = DataFrame(a=1:3, b=4:6)
+
+    res = @inferred Tables.subset(df, :)
+    @test res isa DataFrame
+    @test res == DataFrame(a=1:3, b=4:6)
+    res = Tables.subset(df, :, view=false)
+    @test res isa DataFrame
+    @test res == DataFrame(a=1:3, b=4:6)
+    res = Tables.subset(df, :, view=true)
+    @test res isa SubDataFrame
+    @test res == DataFrame(a=1:3, b=4:6)
+
+    res = @inferred Tables.subset(df, [3, 1])
+    @test res isa DataFrame
+    @test res == DataFrame(a=[3, 1], b=[6, 4])
+    res = Tables.subset(df, [3, 1], view=false)
+    @test res isa DataFrame
+    @test res == DataFrame(a=[3, 1], b=[6, 4])
+    res = Tables.subset(df, [3, 1], view=true)
+    @test res isa SubDataFrame
+    @test res == DataFrame(a=[3, 1], b=[6, 4])
+
+    res = @inferred Tables.subset(df, [true, false, true])
+    @test res isa DataFrame
+    @test res == DataFrame(a=[1, 3], b=[4, 6])
+    res = Tables.subset(df, [true, false, true], view=false)
+    @test res isa DataFrame
+    @test res == DataFrame(a=[1, 3], b=[4, 6])
+    res = Tables.subset(df, [true, false, true], view=true)
+    @test res isa SubDataFrame
+    @test res == DataFrame(a=[1, 3], b=[4, 6])
+
+    res = @inferred Tables.subset(df, 2)
+    @test res isa DataFrameRow
+    @test res == DataFrame(a=2, b=5)[1, :]
+    res = Tables.subset(df, 2, view=false)
+    @test res isa NamedTuple{(:a, :b), Tuple{Int, Int}}
+    @test res == (a=2, b=5)
+    res = Tables.subset(df, 2, view=true)
+    @test res isa DataFrameRow
+    @test res == DataFrame(a=2, b=5)[1, :]
+end
+
 end # module