Skip to content

Commit feaa418

Browse files
authored
Fix list slicing of np.ndarrays on CPU (#1817)
* Move `ListSlice` tests to a separate file * Add a failing test for slicing `np.ndarray` and make it pass * Adjust the `ListSlice` test for better CPU/GPU compatibility
1 parent 5c66dab commit feaa418

File tree

3 files changed

+135
-81
lines changed

3 files changed

+135
-81
lines changed

nvtabular/ops/list_slice.py

Lines changed: 23 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,11 @@ def transform(self, col_selector: ColumnSelector, df: DataFrameType) -> DataFram
7979
on_cpu = is_cpu_object(df)
8080
ret = type(df)()
8181

82+
if on_cpu:
83+
xp = np
84+
else:
85+
xp = cp
86+
8287
for col in col_selector.names:
8388
# handle CPU via normal python slicing (not very efficient)
8489
if on_cpu:
@@ -88,7 +93,11 @@ def transform(self, col_selector: ColumnSelector, df: DataFrameType) -> DataFram
8893
if self.pad:
8994
for v in values:
9095
if len(v) < self.max_elements:
91-
v.extend([self.pad_value] * (self.max_elements - len(v)))
96+
padding = [self.pad_value] * (self.max_elements - len(v))
97+
if isinstance(v, xp.ndarray):
98+
xp.append(v, padding)
99+
else:
100+
v.extend(padding)
92101

93102
ret[col] = values
94103
else:
@@ -115,11 +124,18 @@ def transform(self, col_selector: ColumnSelector, df: DataFrameType) -> DataFram
115124

116125
# create a new array for the sliced elements
117126
new_elements = cp.full(
118-
new_offsets[-1].item(), fill_value=self.pad_value, dtype=elements.dtype
127+
new_offsets[-1].item(),
128+
fill_value=self.pad_value,
129+
dtype=elements.dtype,
119130
)
120131
if new_elements.size:
121132
_slice_rows[blocks, threads](
122-
self.start, self.end, offsets, elements, new_offsets, new_elements
133+
self.start,
134+
self.end,
135+
offsets,
136+
elements,
137+
new_offsets,
138+
new_elements,
123139
)
124140

125141
# build up a list column with the sliced values
@@ -133,7 +149,10 @@ def _compute_dtype(self, col_schema, input_schema):
133149

134150
def _compute_properties(self, col_schema, input_schema):
135151
col_schema = super()._compute_properties(col_schema, input_schema)
136-
properties = {**col_schema.properties, **{"value_count": {"min": 0, "max": None}}}
152+
properties = {
153+
**col_schema.properties,
154+
**{"value_count": {"min": 0, "max": None}},
155+
}
137156
if self.max_elements != np.iinfo(np.int64).max:
138157
properties["value_count"]["max"] = self.max_elements
139158
if self.pad:

tests/unit/ops/test_list_slice.py

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
#
2+
# Copyright (c) 2023, NVIDIA CORPORATION.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
#
16+
import pytest
17+
18+
import nvtabular as nvt
19+
from merlin.core.compat import cudf, numpy, pandas
20+
from merlin.core.dispatch import make_df
21+
from nvtabular import ColumnSelector, ops
22+
from tests.conftest import assert_eq
23+
24+
if cudf:
25+
_CPU = [True, False]
26+
else:
27+
_CPU = [True]
28+
29+
30+
@pytest.mark.parametrize("cpu", _CPU)
31+
def test_list_slice(cpu):
32+
DataFrame = pandas.DataFrame if cpu else cudf.DataFrame
33+
34+
df = DataFrame({"y": [[0, 1, 2, 2, 767], [1, 2, 2, 3], [1, 223, 4]]})
35+
36+
op = ops.ListSlice(0, 2)
37+
selector = ColumnSelector(["y"])
38+
transformed = op.transform(selector, df)
39+
expected = DataFrame({"y": [[0, 1], [1, 2], [1, 223]]})
40+
assert_eq(transformed, expected)
41+
42+
op = ops.ListSlice(3, 5)
43+
transformed = op.transform(selector, df)
44+
expected = DataFrame({"y": [[2, 767], [3], []]})
45+
assert_eq(transformed, expected)
46+
47+
op = ops.ListSlice(4, 10)
48+
transformed = op.transform(selector, df)
49+
expected = DataFrame({"y": [[767], [], []]})
50+
assert_eq(transformed, expected)
51+
52+
op = ops.ListSlice(100, 20000)
53+
transformed = op.transform(selector, df)
54+
expected = DataFrame({"y": [[], [], []]})
55+
assert_eq(transformed, expected)
56+
57+
op = ops.ListSlice(-4)
58+
transformed = op.transform(selector, df)
59+
expected = DataFrame({"y": [[1, 2, 2, 767], [1, 2, 2, 3], [1, 223, 4]]})
60+
assert_eq(transformed, expected)
61+
62+
op = ops.ListSlice(-3, -1)
63+
transformed = op.transform(selector, df)
64+
expected = DataFrame({"y": [[2, 2], [2, 2], [1, 223]]})
65+
assert_eq(transformed, expected)
66+
67+
68+
@pytest.mark.parametrize("cpu", _CPU)
69+
def test_list_slice_pad(cpu):
70+
DataFrame = pandas.DataFrame if cpu else cudf.DataFrame
71+
df = DataFrame({"y": [[0, 1, 2, 2, 767], [1, 2, 2, 3], [1, 223, 4]]})
72+
73+
# 0 pad to 5 elements
74+
op = ops.ListSlice(5, pad=True)
75+
selector = ColumnSelector(["y"])
76+
transformed = op.transform(selector, df)
77+
expected = DataFrame({"y": [[0, 1, 2, 2, 767], [1, 2, 2, 3, 0], [1, 223, 4, 0, 0]]})
78+
assert_eq(transformed, expected)
79+
80+
# make sure we can also pad when start != 0, and when pad_value is set
81+
op = ops.ListSlice(1, 6, pad=True, pad_value=123)
82+
selector = ColumnSelector(["y"])
83+
transformed = op.transform(selector, df)
84+
expected = DataFrame({"y": [[1, 2, 2, 767, 123], [2, 2, 3, 123, 123], [223, 4, 123, 123, 123]]})
85+
assert_eq(transformed, expected)
86+
87+
# we should be able to do pad out negative offsets as well
88+
op = ops.ListSlice(-4, pad=True, pad_value=-1)
89+
selector = ColumnSelector(["y"])
90+
transformed = op.transform(selector, df)
91+
expected = DataFrame({"y": [[1, 2, 2, 767], [1, 2, 2, 3], [1, 223, 4, -1]]})
92+
assert_eq(transformed, expected)
93+
94+
op = ops.ListSlice(-4, -1, pad=True, pad_value=-1)
95+
selector = ColumnSelector(["y"])
96+
transformed = op.transform(selector, df)
97+
expected = DataFrame({"y": [[1, 2, 2], [1, 2, 2], [1, 223, -1]]})
98+
assert_eq(transformed, expected)
99+
100+
op = ops.ListSlice(-4, pad=True, pad_value=-1)
101+
selector = ColumnSelector(["y"])
102+
transformed = op.transform(selector, df)
103+
expected = DataFrame({"y": [[1, 2, 2, 767], [1, 2, 2, 3], [1, 223, 4, -1]]})
104+
assert_eq(transformed, expected)
105+
106+
107+
def test_slice_ndarrays():
108+
out = ["test"] >> nvt.ops.ListSlice(10, pad=True)
109+
workflow = nvt.Workflow(out)
110+
df = make_df({"test": [[x for x in numpy.asarray(range(1, 4)).astype(numpy.int32)]]})
111+
workflow.fit(nvt.Dataset(df, cpu=True))
112+
workflow.transform(nvt.Dataset(df, cpu=True)).compute()

tests/unit/ops/test_ops.py

Lines changed: 0 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -299,83 +299,6 @@ def test_data_stats(tmpdir, df, datasets, engine, cpu):
299299
)
300300

301301

302-
@pytest.mark.parametrize("cpu", _CPU)
303-
def test_list_slice(cpu):
304-
DataFrame = pd.DataFrame if cpu else cudf.DataFrame
305-
306-
df = DataFrame({"y": [[0, 1, 2, 2, 767], [1, 2, 2, 3], [1, 223, 4]]})
307-
308-
op = ops.ListSlice(0, 2)
309-
selector = ColumnSelector(["y"])
310-
transformed = op.transform(selector, df)
311-
expected = DataFrame({"y": [[0, 1], [1, 2], [1, 223]]})
312-
assert_eq(transformed, expected)
313-
314-
op = ops.ListSlice(3, 5)
315-
transformed = op.transform(selector, df)
316-
expected = DataFrame({"y": [[2, 767], [3], []]})
317-
assert_eq(transformed, expected)
318-
319-
op = ops.ListSlice(4, 10)
320-
transformed = op.transform(selector, df)
321-
expected = DataFrame({"y": [[767], [], []]})
322-
assert_eq(transformed, expected)
323-
324-
op = ops.ListSlice(100, 20000)
325-
transformed = op.transform(selector, df)
326-
expected = DataFrame({"y": [[], [], []]})
327-
assert_eq(transformed, expected)
328-
329-
op = ops.ListSlice(-4)
330-
transformed = op.transform(selector, df)
331-
expected = DataFrame({"y": [[1, 2, 2, 767], [1, 2, 2, 3], [1, 223, 4]]})
332-
assert_eq(transformed, expected)
333-
334-
op = ops.ListSlice(-3, -1)
335-
transformed = op.transform(selector, df)
336-
expected = DataFrame({"y": [[2, 2], [2, 2], [1, 223]]})
337-
assert_eq(transformed, expected)
338-
339-
340-
@pytest.mark.parametrize("cpu", _CPU)
341-
def test_list_slice_pad(cpu):
342-
DataFrame = pd.DataFrame if cpu else cudf.DataFrame
343-
df = DataFrame({"y": [[0, 1, 2, 2, 767], [1, 2, 2, 3], [1, 223, 4]]})
344-
345-
# 0 pad to 5 elements
346-
op = ops.ListSlice(5, pad=True)
347-
selector = ColumnSelector(["y"])
348-
transformed = op.transform(selector, df)
349-
expected = DataFrame({"y": [[0, 1, 2, 2, 767], [1, 2, 2, 3, 0], [1, 223, 4, 0, 0]]})
350-
assert_eq(transformed, expected)
351-
352-
# make sure we can also pad when start != 0, and when pad_value is set
353-
op = ops.ListSlice(1, 6, pad=True, pad_value=123)
354-
selector = ColumnSelector(["y"])
355-
transformed = op.transform(selector, df)
356-
expected = DataFrame({"y": [[1, 2, 2, 767, 123], [2, 2, 3, 123, 123], [223, 4, 123, 123, 123]]})
357-
assert_eq(transformed, expected)
358-
359-
# we should be able to do pad out negative offsets as well
360-
op = ops.ListSlice(-4, pad=True, pad_value=-1)
361-
selector = ColumnSelector(["y"])
362-
transformed = op.transform(selector, df)
363-
expected = DataFrame({"y": [[1, 2, 2, 767], [1, 2, 2, 3], [1, 223, 4, -1]]})
364-
assert_eq(transformed, expected)
365-
366-
op = ops.ListSlice(-4, -1, pad=True, pad_value=-1)
367-
selector = ColumnSelector(["y"])
368-
transformed = op.transform(selector, df)
369-
expected = DataFrame({"y": [[1, 2, 2], [1, 2, 2], [1, 223, -1]]})
370-
assert_eq(transformed, expected)
371-
372-
op = ops.ListSlice(-4, pad=True, pad_value=-1)
373-
selector = ColumnSelector(["y"])
374-
transformed = op.transform(selector, df)
375-
expected = DataFrame({"y": [[1, 2, 2, 767], [1, 2, 2, 3], [1, 223, 4, -1]]})
376-
assert_eq(transformed, expected)
377-
378-
379302
@pytest.mark.parametrize("cpu", _CPU)
380303
def test_rename(cpu):
381304
DataFrame = pd.DataFrame if cpu else cudf.DataFrame

0 commit comments

Comments
 (0)