From 85813e49fa7ce012e433c35149f4d329c99c69ed Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Fri, 13 Feb 2026 10:47:37 +1300 Subject: [PATCH 1/3] allow option in unpack to use strings for names instead of symbols --- src/data/data.jl | 36 ++++++++++++++++++++++++++---------- test/data/data.jl | 9 +++++++++ 2 files changed, 35 insertions(+), 10 deletions(-) diff --git a/src/data/data.jl b/src/data/data.jl index 92fec35a..81edc7bb 100644 --- a/src/data/data.jl +++ b/src/data/data.jl @@ -201,18 +201,22 @@ _partition(X::Tuple, row_partition) = wrap_singles=false, shuffle=false, rng::Union{AbstractRNG,Int,Nothing}=nothing, + string_names=false, coerce_options...) -Horizontally split any Tables.jl compatible `table` into smaller -tables or vectors by making column selections determined by the -predicates `f1`, `f2`, ..., `fk`. Selection from the column names is -without replacement. A *predicate* is any object `f` such that -`f(name)` is `true` or `false` for each column `name::Symbol` of -`table`. +Horizontally split any Tables.jl compatible `table` into smaller tables or vectors by +making column selections determined by the predicates `f1`, `f2`, ..., `fk`, which are +applied to the column names. Selection from the column names is without replacement. A +*predicate* is any object `f` such that `f(name)` is `true` or `false` for each column +`name::Symbol` of `table`. For example, `=!(:id)` is a predicate for selecting every +column except the one with name `:id`. -Returns a tuple of tables/vectors with length one greater than the -number of supplied predicates, with the last component including all -previously unselected columns. +Predicates may also be formulated with the understanding that column names are strings +instead of symbols, by specifying `string_names=true`, or by + + +Returns a tuple of tables/vectors with length one greater than the number of supplied +predicates, with the last component including all previously unselected columns. 
```julia-repl julia> table = DataFrame(x=[1,2], y=['a', 'b'], z=[10.0, 20.0], w=["A", "B"]) @@ -241,8 +245,14 @@ julia> W # the column(s) left over 2-element Vector{String}: "A" "B" + +julia> YW, _ = unpack(table, in(["y", "w"]); string_names=true) +julia> YW + + ``` + Whenever a returned table contains a single column, it is converted to a vector unless `wrap_singles=true`. @@ -259,8 +269,14 @@ specifies the seed of an automatically generated Mersenne twister. If function unpack(X, predicates...; wrap_singles=false, shuffle=nothing, - rng=nothing, pairs...) + rng=nothing, + string_names=false, + pairs...) + if string_names + predicates = predicates .∘ string + end + # add a final predicate to unpack all remaining columns into to # the last return value: predicates = (predicates..., _ -> true) diff --git a/test/data/data.jl b/test/data/data.jl index 2cd854f6..52850692 100644 --- a/test/data/data.jl +++ b/test/data/data.jl @@ -227,6 +227,15 @@ end @test isempty(w) @test unpack(small, ==(:x), ==(:y); shuffle=true, rng=StableRNG(66)) == unpack(small, ==(:x), ==(:y); rng=StableRNG(66)) + + # string names: + x1 = categorical(["Female", "Male", "Female"]) + x2 = [185, 160, 175] + x3 = [10.1, 27.1, 25.7] + table = NamedTuple{Symbol.(("Gender", "age (years)", "T / °C"))}((x1, x2, x3)) + X1X2, X3 = unpack(table, !=("T / °C"), string_names=true) + @test X1X2 == selectcols(table, [1, 2]) + @test X3 == x3 end @testset "restrict and corestrict" begin From ea64bb04c660b1fc05a55497fb59393d3c14ab14 Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Fri, 13 Feb 2026 10:53:03 +1300 Subject: [PATCH 2/3] fix the unpack docstring --- src/data/data.jl | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/src/data/data.jl b/src/data/data.jl index 81edc7bb..0cc4503f 100644 --- a/src/data/data.jl +++ b/src/data/data.jl @@ -212,13 +212,14 @@ applied to the column names. 
Selection from the column names is without replacem column except the one with name `:id`. Predicates may also be formulated with the understanding that column names are strings -instead of symbols, by specifying `string_names=true`, or by +instead of symbols, by specifying `string_names=true`. Returns a tuple of tables/vectors with length one greater than the number of supplied predicates, with the last component including all previously unselected columns. ```julia-repl +julia> using DataFrames julia> table = DataFrame(x=[1,2], y=['a', 'b'], z=[10.0, 20.0], w=["A", "B"]) 2×4 DataFrame Row │ x y z w @@ -248,22 +249,26 @@ julia> W # the column(s) left over julia> YW, _ = unpack(table, in(["y", "w"]); string_names=true) julia> YW - +2×2 DataFrame + Row │ y w + │ Char String +─────┼────────────── + 1 │ a A + 2 │ b B ``` -Whenever a returned table contains a single column, it is converted to -a vector unless `wrap_singles=true`. +Whenever a returned table contains a single column, it is converted to a vector unless +`wrap_singles=true`. -If `coerce_options` are specified then `table` is first replaced -with `coerce(table, coerce_options)`. See -[`ScientificTypes.coerce`](@ref) for details. +If `coerce_options` are specified then `table` is first replaced with `coerce(table, +coerce_options)`. See [`ScientificTypes.coerce`](@ref) for details. -If `shuffle=true` then the rows of `table` are first shuffled, using -the global RNG, unless `rng` is specified; if `rng` is an integer, it -specifies the seed of an automatically generated Mersenne twister. If -`rng` is specified then `shuffle=true` is implicit. +If `shuffle=true` then the rows of `table` are first shuffled, using the global RNG, +unless `rng` is specified; if `rng` is an integer, it specifies the seed of an +automatically generated Mersenne twister. If `rng` is specified then `shuffle=true` is +implicit. 
""" function unpack(X, predicates...; @@ -276,7 +281,7 @@ function unpack(X, predicates...; if string_names predicates = predicates .∘ string end - + # add a final predicate to unpack all remaining columns into to # the last return value: predicates = (predicates..., _ -> true) From a89f795cad8043e7f0d3c3af5609c3e5d41af1f0 Mon Sep 17 00:00:00 2001 From: "Anthony D. Blaom" Date: Tue, 31 Mar 2026 12:03:00 +1300 Subject: [PATCH 3/3] docstring updates --- src/data/data.jl | 51 +++++++++++++++++++++++++++++++++++------------- 1 file changed, 37 insertions(+), 14 deletions(-) diff --git a/src/data/data.jl b/src/data/data.jl index 0cc4503f..efe9316c 100644 --- a/src/data/data.jl +++ b/src/data/data.jl @@ -211,10 +211,6 @@ applied to the column names. Selection from the column names is without replacem `name::Symbol` of `table`. For example, `=!(:id)` is a predicate for selecting every column except the one with name `:id`. -Predicates may also be formulated with the understanding that column names are strings -instead of symbols, by specifying `string_names=true`. - - Returns a tuple of tables/vectors with length one greater than the number of supplied predicates, with the last component including all previously unselected columns. @@ -246,7 +242,12 @@ julia> W # the column(s) left over 2-element Vector{String}: "A" "B" +``` +Predicates may also be formulated with the understanding that column names are strings +instead of symbols, by specifying `string_names=true`. + +```julia-repl julia> YW, _ = unpack(table, in(["y", "w"]); string_names=true) julia> YW 2×2 DataFrame @@ -255,15 +256,34 @@ julia> YW ─────┼────────────── 1 │ a A 2 │ b B - ``` - Whenever a returned table contains a single column, it is converted to a vector unless `wrap_singles=true`. -If `coerce_options` are specified then `table` is first replaced with `coerce(table, -coerce_options)`. See [`ScientificTypes.coerce`](@ref) for details. 
+If `coerce_options` are specified then the scitypes of the referenced columns of `table` +are first coerced, as shown in the following example: + +```julia-repl +julia> YW, _ = unpack(table, in([:y, :w]); y=OrderedFactor) # or `:y => OrderedFactor` +julia> YW +2×2 DataFrame + Row │ y w + │ Cat… String +─────┼────────────── + 1 │ a A + 2 │ b B + +julia> schema(YW) +┌───────┬──────────────────┬────────────────────────────────┐ +│ names │ scitypes │ types │ +├───────┼──────────────────┼────────────────────────────────┤ +│ y │ OrderedFactor{2} │ CategoricalValue{Char, UInt32} │ +│ w │ Textual │ String │ +└───────┴──────────────────┴────────────────────────────────┘ +``` + +For more flexible type coercion options see [`ScientificTypes.coerce`](@ref). If `shuffle=true` then the rows of `table` are first shuffled, using the global RNG, unless `rng` is specified; if `rng` is an integer, it specifies the seed of an @@ -271,12 +291,15 @@ automatically generated Mersenne twister. If `rng` is specified then `shuffle=tr implicit. """ -function unpack(X, predicates...; - wrap_singles=false, - shuffle=nothing, - rng=nothing, - string_names=false, - pairs...) +function unpack( + X, + predicates...; + wrap_singles=false, + shuffle=nothing, + rng=nothing, + string_names=false, + pairs..., + ) if string_names predicates = predicates .∘ string