@@ -89,11 +89,7 @@ class ARROW_EXPORT DictionaryMemoTable {
8989// / dense array
9090// /
9191// / Unlike other builders, dictionary builder does not completely
92- // / reset the state on Finish calls. The arrays built after the
93- // / initial Finish call will reuse the previously created encoding and
94- // / build a delta dictionary when new terms occur.
95- // /
96- // / data
92+ // / reset the state on Finish calls.
9793template <typename BuilderType, typename T>
9894class DictionaryBuilderBase : public ArrayBuilder {
9995 public:
@@ -230,44 +226,34 @@ class DictionaryBuilderBase : public ArrayBuilder {
230226 }
231227
232228 void Reset () override {
229+ // Perform a partial reset. Call ResetFull to also reset the accumulated
230+ // dictionary values
233231 ArrayBuilder::Reset ();
234232 indices_builder_.Reset ();
233+ }
234+
235+ // / \brief Reset and also clear accumulated dictionary values in memo table
236+ void ResetFull () {
237+ Reset ();
235238 memo_table_.reset (new internal::DictionaryMemoTable (value_type_));
236- delta_offset_ = 0 ;
237239 }
238240
239241 Status Resize (int64_t capacity) override {
240242 ARROW_RETURN_NOT_OK (CheckCapacity (capacity, capacity_));
241243 capacity = std::max (capacity, kMinBuilderCapacity );
242-
243- if (capacity_ == 0 ) {
244- // Initialize hash table
245- // XXX should we let the user pass additional size heuristics?
246- delta_offset_ = 0 ;
247- }
248244 ARROW_RETURN_NOT_OK (indices_builder_.Resize (capacity));
249245 capacity_ = indices_builder_.capacity ();
250246 return Status::OK ();
251247 }
252248
253- Status FinishInternal (std::shared_ptr<ArrayData>* out) override {
254- // Finalize indices array
255- ARROW_RETURN_NOT_OK (indices_builder_.FinishInternal (out));
256-
257- // Generate dictionary array from hash table contents
258- std::shared_ptr<ArrayData> dictionary_data;
259-
260- ARROW_RETURN_NOT_OK (
261- memo_table_->GetArrayData (pool_, delta_offset_, &dictionary_data));
262-
263- // Set type of array data to the right dictionary type
264- (*out)->type = type ();
265- (*out)->dictionary = MakeArray (dictionary_data);
266-
267- // Update internals for further uses of this DictionaryBuilder
268- delta_offset_ = memo_table_->size ();
269- indices_builder_.Reset ();
270-
249+ // / \brief Return dictionary indices and a delta dictionary since the last
250+ // / time that Finish or FinishDelta were called, and reset state of builder
251+ // / (except the memo table)
252+ Status FinishDelta (std::shared_ptr<Array>* out_indices,
253+ std::shared_ptr<Array>* out_delta) {
254+ std::shared_ptr<ArrayData> indices_data;
255+ ARROW_RETURN_NOT_OK (FinishWithDictOffset (delta_offset_, &indices_data, out_delta));
256+ *out_indices = MakeArray (indices_data);
271257 return Status::OK ();
272258 }
273259
@@ -277,17 +263,45 @@ class DictionaryBuilderBase : public ArrayBuilder {
277263
278264 Status Finish (std::shared_ptr<DictionaryArray>* out) { return FinishTyped (out); }
279265
280- // / is the dictionary builder in the delta building mode
281- bool is_building_delta () { return delta_offset_ > 0 ; }
282-
283266 std::shared_ptr<DataType> type () const override {
284267 return ::arrow::dictionary (indices_builder_.type (), value_type_);
285268 }
286269
287270 protected:
271+ Status FinishInternal (std::shared_ptr<ArrayData>* out) override {
272+ std::shared_ptr<Array> dictionary;
273+ ARROW_RETURN_NOT_OK (FinishWithDictOffset (/* offset=*/ 0 , out, &dictionary));
274+
275+ // Set type of array data to the right dictionary type
276+ (*out)->type = type ();
277+ (*out)->dictionary = dictionary;
278+ return Status::OK ();
279+ }
280+
281+ Status FinishWithDictOffset (int64_t dict_offset,
282+ std::shared_ptr<ArrayData>* out_indices,
283+ std::shared_ptr<Array>* out_dictionary) {
284+ // Finalize indices array
285+ ARROW_RETURN_NOT_OK (indices_builder_.FinishInternal (out_indices));
286+
287+ // Generate dictionary array from hash table contents
288+ std::shared_ptr<ArrayData> dictionary_data;
289+ ARROW_RETURN_NOT_OK (memo_table_->GetArrayData (pool_, dict_offset, &dictionary_data));
290+
291+ *out_dictionary = MakeArray (dictionary_data);
292+ delta_offset_ = memo_table_->size ();
293+
294+ // Update internals for further uses of this DictionaryBuilder
295+ ArrayBuilder::Reset ();
296+ return Status::OK ();
297+ }
298+
288299 std::unique_ptr<DictionaryMemoTable> memo_table_;
289300
301+ // The size of the dictionary memo at last invocation of Finish, to use in
302+ // FinishDelta for computing dictionary deltas
290303 int32_t delta_offset_;
304+
291305 // Only used for FixedSizeBinaryType
292306 int32_t byte_width_;
293307
@@ -380,6 +394,7 @@ class DictionaryBuilder : public internal::DictionaryBuilderBase<AdaptiveIntBuil
380394 const uint8_t * valid_bytes = NULLPTR) {
381395 int64_t null_count_before = this ->indices_builder_ .null_count ();
382396 ARROW_RETURN_NOT_OK (this ->indices_builder_ .AppendValues (values, length, valid_bytes));
397+ this ->capacity_ = this ->indices_builder_ .capacity ();
383398 this ->length_ += length;
384399 this ->null_count_ += this ->indices_builder_ .null_count () - null_count_before;
385400 return Status::OK ();
@@ -402,6 +417,7 @@ class Dictionary32Builder : public internal::DictionaryBuilderBase<Int32Builder,
402417 const uint8_t * valid_bytes = NULLPTR) {
403418 int64_t null_count_before = this ->indices_builder_ .null_count ();
404419 ARROW_RETURN_NOT_OK (this ->indices_builder_ .AppendValues (values, length, valid_bytes));
420+ this ->capacity_ = this ->indices_builder_ .capacity ();
405421 this ->length_ += length;
406422 this ->null_count_ += this ->indices_builder_ .null_count () - null_count_before;
407423 return Status::OK ();
0 commit comments