Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions cpp/src/gandiva/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -69,9 +69,11 @@ set(SRC_FILES
expression_registry.cc
exported_funcs_registry.cc
filter.cc
array_ops.cc
function_ir_builder.cc
function_registry.cc
function_registry_arithmetic.cc
function_registry_array.cc
function_registry_datetime.cc
function_registry_hash.cc
function_registry_math_ops.cc
Expand Down Expand Up @@ -237,6 +239,7 @@ add_gandiva_test(internals-test
random_generator_holder_test.cc
hash_utils_test.cc
gdv_function_stubs_test.cc
array_ops_test.cc
EXTRA_DEPENDENCIES
LLVM::LLVM_INTERFACE
${GANDIVA_OPENSSL_LIBS}
Expand Down
58 changes: 51 additions & 7 deletions cpp/src/gandiva/annotator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -46,15 +46,23 @@ FieldDescriptorPtr Annotator::MakeDesc(FieldPtr field, bool is_output) {
int data_idx = buffer_count_++;
int validity_idx = buffer_count_++;
int offsets_idx = FieldDescriptor::kInvalidIdx;
int child_offsets_idx = FieldDescriptor::kInvalidIdx;
if (arrow::is_binary_like(field->type()->id())) {
offsets_idx = buffer_count_++;
}

if (field->type()->id() == arrow::Type::LIST) {
offsets_idx = buffer_count_++;
if (arrow::is_binary_like(field->type()->field(0)->type()->id())) {
child_offsets_idx = buffer_count_++;
}
}
int data_buffer_ptr_idx = FieldDescriptor::kInvalidIdx;
if (is_output) {
data_buffer_ptr_idx = buffer_count_++;
}
return std::make_shared<FieldDescriptor>(field, data_idx, validity_idx, offsets_idx,
data_buffer_ptr_idx);
data_buffer_ptr_idx, child_offsets_idx);
}

void Annotator::PrepareBuffersForField(const FieldDescriptor& desc,
Expand All @@ -74,16 +82,52 @@ void Annotator::PrepareBuffersForField(const FieldDescriptor& desc,
if (desc.HasOffsetsIdx()) {
uint8_t* offsets_buf = const_cast<uint8_t*>(array_data.buffers[buffer_idx]->data());
eval_batch->SetBuffer(desc.offsets_idx(), offsets_buf, array_data.offset);
++buffer_idx;

if (desc.HasChildOffsetsIdx()) {
if (is_output) {
// if list field is output field, we should put buffer pointer into eval batch
// for resizing
uint8_t* child_offsets_buf = reinterpret_cast<uint8_t*>(
array_data.child_data.at(0)->buffers[buffer_idx].get());
eval_batch->SetBuffer(desc.child_data_offsets_idx(), child_offsets_buf,
array_data.child_data.at(0)->offset);
} else {
// if list field is input field, just put buffer data into eval batch
uint8_t* child_offsets_buf = const_cast<uint8_t*>(
array_data.child_data.at(0)->buffers[buffer_idx]->data());
eval_batch->SetBuffer(desc.child_data_offsets_idx(), child_offsets_buf,
array_data.child_data.at(0)->offset);
}
}
if (array_data.type->id() != arrow::Type::LIST ||
arrow::is_binary_like(array_data.type->field(0)->type()->id()))
// primitive type list data buffer index is 1
// binary like type list data buffer index is 2
++buffer_idx;
}

if (array_data.type->id() != arrow::Type::LIST) {
uint8_t* data_buf = const_cast<uint8_t*>(array_data.buffers[buffer_idx]->data());
eval_batch->SetBuffer(desc.data_idx(), data_buf, array_data.offset);
} else {
uint8_t* data_buf =
const_cast<uint8_t*>(array_data.child_data.at(0)->buffers[buffer_idx]->data());
eval_batch->SetBuffer(desc.data_idx(), data_buf, array_data.child_data.at(0)->offset);
}

uint8_t* data_buf = const_cast<uint8_t*>(array_data.buffers[buffer_idx]->data());
eval_batch->SetBuffer(desc.data_idx(), data_buf, array_data.offset);
if (is_output) {
// pass in the Buffer object for output data buffers. Can be used for resizing.
uint8_t* data_buf_ptr =
reinterpret_cast<uint8_t*>(array_data.buffers[buffer_idx].get());
eval_batch->SetBuffer(desc.data_buffer_ptr_idx(), data_buf_ptr, array_data.offset);
if (array_data.type->id() != arrow::Type::LIST) {
uint8_t* data_buf_ptr =
reinterpret_cast<uint8_t*>(array_data.buffers[buffer_idx].get());
eval_batch->SetBuffer(desc.data_buffer_ptr_idx(), data_buf_ptr, array_data.offset);
} else {
// list data buffer is in child data buffer
uint8_t* data_buf_ptr = reinterpret_cast<uint8_t*>(
array_data.child_data.at(0)->buffers[buffer_idx].get());
eval_batch->SetBuffer(desc.data_buffer_ptr_idx(), data_buf_ptr,
array_data.child_data.at(0)->offset);
}
}
}

Expand Down
76 changes: 76 additions & 0 deletions cpp/src/gandiva/array_ops.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include "gandiva/array_ops.h"

#include <cstring>

#include "arrow/util/value_parsing.h"
#include "gandiva/engine.h"
#include "gandiva/exported_funcs.h"

/// Array functions that can be accessed from LLVM or the pre-compiled library.

extern "C" {

/// Check whether a list of utf8/binary entries contains a given value.
///
/// The entries are laid out back to back starting at entry_buf; entry i is
/// entry_child_offsets[i + 1] - entry_child_offsets[i] bytes long.
///
/// \param context_ptr execution context handle (not read by this function)
/// \param entry_buf pointer to the first byte of the first entry
/// \param entry_child_offsets offsets delimiting the entries; elements
///        [0, entry_offsets_len] are read
/// \param entry_offsets_len number of entries to scan
/// \param contains_data value to look for; it does not need to be NUL-terminated
/// \param contains_data_length length of contains_data in bytes
/// \return true if some entry is byte-for-byte equal to contains_data
bool array_utf8_contains_utf8(int64_t context_ptr, const char* entry_buf,
                              int32_t* entry_child_offsets, int32_t entry_offsets_len,
                              const char* contains_data, int32_t contains_data_length) {
  // A negative length can never describe a real value; bail out before it could be
  // converted into a huge unsigned size for the comparison below.
  if (contains_data_length < 0) {
    return false;
  }

  const char* cursor = entry_buf;
  for (int32_t i = 0; i < entry_offsets_len; ++i) {
    const int32_t entry_len = entry_child_offsets[i + 1] - entry_child_offsets[i];
    // The entries are length-delimited, not NUL-terminated, so compare raw bytes:
    // strncmp would stop at the first embedded NUL and report a false match.
    // The zero-length case is answered without touching either pointer.
    if (entry_len == contains_data_length &&
        (entry_len == 0 ||
         memcmp(cursor, contains_data, static_cast<size_t>(entry_len)) == 0)) {
      return true;
    }
    cursor += entry_len;
  }
  return false;
}

/// Report the number of entries in a utf8 list.
///
/// The result is entry_offsets_len widened to 64 bits; context_ptr, entry_buf and
/// entry_child_offsets are accepted but not read.
///
/// \param context_ptr execution context handle (unused)
/// \param entry_buf pointer to the list's data bytes (unused)
/// \param entry_child_offsets offsets delimiting the entries (unused)
/// \param entry_offsets_len number of entries in the list
/// \return entry_offsets_len as an int64_t
int64_t array_utf8_length(int64_t context_ptr, const char* entry_buf,
                          int32_t* entry_child_offsets, int32_t entry_offsets_len) {
  return static_cast<int64_t>(entry_offsets_len);
}
}

namespace gandiva {
/// Register the array helper functions with the engine so that generated code can
/// call them by name: array_utf8_length first, then array_utf8_contains_utf8.
void ExportedArrayFunctions::AddMappings(Engine* engine) const {
  auto types = engine->types();

  // int64_t array_utf8_length(context, data, child offsets, child offsets length)
  std::vector<llvm::Type*> length_args = {
      types->i64_type(),      // int64_t execution_context
      types->i8_ptr_type(),   // int8_t* data ptr
      types->i32_ptr_type(),  // int32_t* child offsets ptr
      types->i32_type(),      // int32_t child offsets length
  };
  engine->AddGlobalMappingForFunc("array_utf8_length",
                                  types->i64_type() /*return_type*/, length_args,
                                  reinterpret_cast<void*>(array_utf8_length));

  // bool array_utf8_contains_utf8(context, data, child offsets, child offsets length,
  //                               contains data, contains data length)
  std::vector<llvm::Type*> contains_args = {
      types->i64_type(),      // int64_t execution_context
      types->i8_ptr_type(),   // int8_t* data ptr
      types->i32_ptr_type(),  // int32_t* child offsets ptr
      types->i32_type(),      // int32_t child offsets length
      types->i8_ptr_type(),   // const char* contains data buf
      types->i32_type(),      // int32_t contains data length
  };
  engine->AddGlobalMappingForFunc("array_utf8_contains_utf8",
                                  types->i1_type() /*return_type*/, contains_args,
                                  reinterpret_cast<void*>(array_utf8_contains_utf8));
}
} // namespace gandiva
33 changes: 33 additions & 0 deletions cpp/src/gandiva/array_ops.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <cstdint>

#include "gandiva/visibility.h"

/// Array functions that can be accessed from LLVM.
extern "C" {
/// Search a list of utf8 entries for a value.
///
/// Entry i has length entry_child_offsets[i + 1] - entry_child_offsets[i] and the
/// entries are read consecutively starting at entry_buf.
///
/// \param context_ptr execution context handle (not read by the implementation)
/// \param entry_buf pointer to the first byte of the first entry
/// \param entry_child_offsets offsets delimiting the entries
/// \param entry_offsets_len number of entries to scan
/// \param contains_data value to look for
/// \param contains_data_length length of contains_data in bytes
/// \return true if one of the entries has the same length as contains_data and
///         compares equal to it, false otherwise
GANDIVA_EXPORT
bool array_utf8_contains_utf8(int64_t context_ptr, const char* entry_buf,
int32_t* entry_child_offsets, int32_t entry_offsets_len,
const char* contains_data, int32_t contains_data_length);
/// Report the number of entries in a utf8 list.
///
/// \param context_ptr execution context handle (not read by the implementation)
/// \param entry_buf pointer to the list's data bytes (not read by the implementation)
/// \param entry_child_offsets offsets delimiting the entries (not read by the
///        implementation)
/// \param entry_offsets_len number of entries in the list
/// \return entry_offsets_len widened to int64_t
GANDIVA_EXPORT
int64_t array_utf8_length(int64_t context_ptr, const char* entry_buf,
int32_t* entry_child_offsets, int32_t entry_offsets_len);
}
52 changes: 52 additions & 0 deletions cpp/src/gandiva/array_ops_test.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include <gmock/gmock.h>
#include <gtest/gtest.h>

#include "gandiva/execution_context.h"
#include "gandiva/precompiled/types.h"

namespace gandiva {

// Checks array_utf8_contains_utf8 against a three-entry list: every entry must be
// found, and a value that is not in the list must be rejected.
TEST(TestArrayOps, TestUtf8ContainsUtf8) {
  gandiva::ExecutionContext ctx;
  // The function takes the context handle as a signed 64-bit integer.
  gdv_int64 ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
  const char* entry_buf = "trianglecirclerectangle";
  // "triangle" (8) + "circle" (6) + "rectangle" (9) = 23 bytes in total.
  int32_t entry_child_offsets[] = {0, 8, 14, 23};
  int32_t entry_offsets_len = 3;

  EXPECT_TRUE(array_utf8_contains_utf8(ctx_ptr, entry_buf, entry_child_offsets,
                                       entry_offsets_len, "triangle", 8));
  EXPECT_TRUE(array_utf8_contains_utf8(ctx_ptr, entry_buf, entry_child_offsets,
                                       entry_offsets_len, "circle", 6));
  EXPECT_TRUE(array_utf8_contains_utf8(ctx_ptr, entry_buf, entry_child_offsets,
                                       entry_offsets_len, "rectangle", 9));
  // Same length as "circle" but different bytes.
  EXPECT_FALSE(array_utf8_contains_utf8(ctx_ptr, entry_buf, entry_child_offsets,
                                        entry_offsets_len, "square", 6));
  // A length that matches no entry at all.
  EXPECT_FALSE(array_utf8_contains_utf8(ctx_ptr, entry_buf, entry_child_offsets,
                                        entry_offsets_len, "tri", 3));
}

// Checks that array_utf8_length reports the number of entries in the list.
TEST(TestArrayOps, TestUtf8Length) {
  gandiva::ExecutionContext ctx;
  // The function takes the context handle as a signed 64-bit integer.
  gdv_int64 ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
  const char* entry_buf = "trianglecirclerectangle";
  // "triangle" (8) + "circle" (6) + "rectangle" (9) = 23 bytes in total.
  int32_t entry_child_offsets[] = {0, 8, 14, 23};
  int32_t entry_offsets_len = 3;

  EXPECT_EQ(array_utf8_length(ctx_ptr, entry_buf, entry_child_offsets, entry_offsets_len),
            3);
}

} // namespace gandiva
28 changes: 28 additions & 0 deletions cpp/src/gandiva/dex.h
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,19 @@ class GANDIVA_EXPORT VectorReadFixedLenValueDex : public VectorReadBaseDex {
void Accept(DexVisitor& visitor) override { visitor.Visit(*this); }
};

/// Dex for the value component of a list ValueVector whose elements are fixed-length.
/// All buffer indices are forwarded from the wrapped field descriptor.
class GANDIVA_EXPORT VectorReadFixedLenValueListDex : public VectorReadBaseDex {
 public:
  explicit VectorReadFixedLenValueListDex(FieldDescriptorPtr field_desc)
      : VectorReadBaseDex(field_desc) {}

  /// Index of the data buffer, as recorded in the field descriptor.
  int DataIdx() const {
    return field_desc_->data_idx();
  }

  /// Index of the offsets buffer, as recorded in the field descriptor.
  int OffsetsIdx() const {
    return field_desc_->offsets_idx();
  }

  /// Dispatch to the visitor overload for this dex type.
  void Accept(DexVisitor& visitor) override {
    visitor.Visit(*this);
  }
};

/// value component of a variable-len ValueVector
class GANDIVA_EXPORT VectorReadVarLenValueDex : public VectorReadBaseDex {
public:
Expand All @@ -93,6 +106,21 @@ class GANDIVA_EXPORT VectorReadVarLenValueDex : public VectorReadBaseDex {
void Accept(DexVisitor& visitor) override { visitor.Visit(*this); }
};

/// Dex for the value component of a list ValueVector whose elements are
/// variable-length. All buffer indices are forwarded from the wrapped field descriptor.
class GANDIVA_EXPORT VectorReadVarLenValueListDex : public VectorReadBaseDex {
 public:
  explicit VectorReadVarLenValueListDex(FieldDescriptorPtr field_desc)
      : VectorReadBaseDex(field_desc) {}

  /// Index of the data buffer, as recorded in the field descriptor.
  int DataIdx() const {
    return field_desc_->data_idx();
  }

  /// Index of the offsets buffer, as recorded in the field descriptor.
  int OffsetsIdx() const {
    return field_desc_->offsets_idx();
  }

  /// Index of the child data offsets buffer, as recorded in the field descriptor.
  int ChildOffsetsIdx() const {
    return field_desc_->child_data_offsets_idx();
  }

  /// Dispatch to the visitor overload for this dex type.
  void Accept(DexVisitor& visitor) override {
    visitor.Visit(*this);
  }
};

/// validity based on a local bitmap.
class GANDIVA_EXPORT LocalBitMapValidityDex : public Dex {
public:
Expand Down
6 changes: 6 additions & 0 deletions cpp/src/gandiva/dex_visitor.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,9 @@ namespace gandiva {

class VectorReadValidityDex;
class VectorReadFixedLenValueDex;
class VectorReadFixedLenValueListDex;
class VectorReadVarLenValueDex;
class VectorReadVarLenValueListDex;
class LocalBitMapValidityDex;
class LiteralDex;
class TrueDex;
Expand All @@ -48,7 +50,9 @@ class GANDIVA_EXPORT DexVisitor {

virtual void Visit(const VectorReadValidityDex& dex) = 0;
virtual void Visit(const VectorReadFixedLenValueDex& dex) = 0;
virtual void Visit(const VectorReadFixedLenValueListDex& dex) = 0;
virtual void Visit(const VectorReadVarLenValueDex& dex) = 0;
virtual void Visit(const VectorReadVarLenValueListDex& dex) = 0;
virtual void Visit(const LocalBitMapValidityDex& dex) = 0;
virtual void Visit(const TrueDex& dex) = 0;
virtual void Visit(const FalseDex& dex) = 0;
Expand All @@ -72,7 +76,9 @@ class GANDIVA_EXPORT DexVisitor {
class GANDIVA_EXPORT DexDefaultVisitor : public DexVisitor {
VISIT_DCHECK(VectorReadValidityDex)
VISIT_DCHECK(VectorReadFixedLenValueDex)
VISIT_DCHECK(VectorReadFixedLenValueListDex)
VISIT_DCHECK(VectorReadVarLenValueDex)
VISIT_DCHECK(VectorReadVarLenValueListDex)
VISIT_DCHECK(LocalBitMapValidityDex)
VISIT_DCHECK(TrueDex)
VISIT_DCHECK(FalseDex)
Expand Down
6 changes: 6 additions & 0 deletions cpp/src/gandiva/exported_funcs.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,12 @@ class ExportedFuncsBase {
virtual void AddMappings(Engine* engine) const = 0;
};

// Class for exporting Array functions.
// AddMappings makes the array helper functions (array_utf8_length and
// array_utf8_contains_utf8) known to the engine under their C names.
class ExportedArrayFunctions : public ExportedFuncsBase {
// Adds one global mapping per exported array function to the given engine.
void AddMappings(Engine* engine) const override;
};
// NOTE(review): presumably adds this class to the exported-funcs registry so that
// AddMappings gets invoked -- confirm against the REGISTER_EXPORTED_FUNCS definition.
REGISTER_EXPORTED_FUNCS(ExportedArrayFunctions);

// Class for exporting Stub functions
class ExportedStubFunctions : public ExportedFuncsBase {
void AddMappings(Engine* engine) const override;
Expand Down
12 changes: 10 additions & 2 deletions cpp/src/gandiva/expr_decomposer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,16 @@ Status ExprDecomposer::Visit(const FieldNode& node) {

DexPtr validity_dex = std::make_shared<VectorReadValidityDex>(desc);
DexPtr value_dex;
if (desc->HasOffsetsIdx()) {
value_dex = std::make_shared<VectorReadVarLenValueDex>(desc);
if (desc->HasChildOffsetsIdx()) {
// handle list<binary> type
value_dex = std::make_shared<VectorReadVarLenValueListDex>(desc);
} else if (desc->HasOffsetsIdx()) {
if (desc->field()->type()->id() == arrow::Type::LIST) {
// handle list<primitive> type
value_dex = std::make_shared<VectorReadFixedLenValueListDex>(desc);
} else {
value_dex = std::make_shared<VectorReadVarLenValueDex>(desc);
}
} else {
value_dex = std::make_shared<VectorReadFixedLenValueDex>(desc);
}
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/gandiva/expr_validator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ Status ExprValidator::Validate(const ExpressionPtr& expr) {
}

Status ExprValidator::Visit(const FieldNode& node) {
auto llvm_type = types_->IRType(node.return_type()->id());
auto llvm_type = types_->DataVecType(node.return_type());
ARROW_RETURN_IF(llvm_type == nullptr,
Status::ExpressionValidationError("Field ", node.field()->name(),
" has unsupported data type ",
Expand Down Expand Up @@ -111,7 +111,7 @@ Status ExprValidator::Visit(const IfNode& node) {
}

Status ExprValidator::Visit(const LiteralNode& node) {
auto llvm_type = types_->IRType(node.return_type()->id());
auto llvm_type = types_->DataVecType(node.return_type());
ARROW_RETURN_IF(llvm_type == nullptr,
Status::ExpressionValidationError("Value ", ToString(node.holder()),
" has unsupported data type ",
Expand Down
Loading