Skip to content
This repository was archived by the owner on Jan 28, 2023. It is now read-only.

Commit da4c610

Browse files
committed
Merge branch 'master' of https://github.com/holgerbrandl/krangl
2 parents e3af6b0 + 1c6038f commit da4c610

File tree

8 files changed

+462
-87
lines changed

8 files changed

+462
-87
lines changed

build.gradle

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@ dependencies {
2323
compileOnly 'org.jetbrains.kotlin:kotlin-script-runtime:1.6.20'
2424

2525
api "org.apache.commons:commons-csv:1.6" // cant upgrade to 1.8 because of https://issues.apache.org/jira/browse/CSV-257
26+
api 'org.apache.arrow:arrow-vector:8.0.0'
27+
implementation 'org.apache.arrow:arrow-memory-netty:8.0.0'
2628
api 'org.apache.poi:poi-ooxml:5.2.2'
2729

2830
api 'com.beust:klaxon:5.6'// compile 'me.tongfei:progressbar:0.5.5'
@@ -98,7 +100,7 @@ test {
98100
//http://stackoverflow.com/questions/34377367/why-is-gradle-install-replacing-my-version-with-unspecified
99101
group 'com.github.holgerbrandl'
100102
//version '0.16.95'
101-
version '0.17.4-SNAPSHOT'
103+
version '0.17.4'
102104

103105

104106

src/main/kotlin/krangl/ArrowIO.kt

Lines changed: 318 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,318 @@
1+
package krangl
2+
3+
import org.apache.arrow.memory.BufferAllocator
4+
import org.apache.arrow.memory.RootAllocator
5+
import org.apache.arrow.vector.BaseFixedWidthVector
6+
import org.apache.arrow.vector.BigIntVector
7+
import org.apache.arrow.vector.BitVector
8+
import org.apache.arrow.vector.Float4Vector
9+
import org.apache.arrow.vector.Float8Vector
10+
import org.apache.arrow.vector.IntVector
11+
import org.apache.arrow.vector.SmallIntVector
12+
import org.apache.arrow.vector.TinyIntVector
13+
import org.apache.arrow.vector.VarCharVector
14+
import org.apache.arrow.vector.VectorSchemaRoot
15+
import org.apache.arrow.vector.ipc.ArrowFileReader
16+
import org.apache.arrow.vector.ipc.ArrowFileWriter
17+
import org.apache.arrow.vector.types.FloatingPointPrecision
18+
import org.apache.arrow.vector.types.pojo.ArrowType
19+
import org.apache.arrow.vector.types.pojo.Schema
20+
import org.apache.arrow.vector.util.ByteArrayReadableSeekableByteChannel
21+
import org.apache.arrow.vector.util.Text
22+
import java.io.ByteArrayOutputStream
23+
import java.io.File
24+
import java.nio.channels.*
25+
import java.nio.file.StandardOpenOption
26+
import java.util.*
27+
28+
/**
 * Copies every slot of an Arrow [VarCharVector] into a list of Kotlin strings.
 *
 * Each slot is read with `getObject` and converted with `toString()`; slots for which
 * `getObject` yields `null` are kept as `null` entries, so the result always has
 * `vector.valueCount` elements.
 */
internal fun unwrapStringArrayFromArrow(vector: VarCharVector): ArrayList<String?> =
    (0 until vector.valueCount).mapTo(ArrayList<String?>()) { slot ->
        vector.getObject(slot)?.toString()
    }
35+
36+
/**
 * Reads all `vector.valueCount` slots of a fixed-width Arrow vector and casts each value
 * to [ELEMENT_TYPE], keeping `null` slots as `null`.
 *
 * Because [ELEMENT_TYPE] is reified the cast is a checked one: the objects returned by
 * `getObject` must already be instances of [ELEMENT_TYPE], no numeric conversion is applied.
 *
 * @param vector source vector
 * @param elementClass only pins [ELEMENT_TYPE] at the call site; the body does not read it
 */
internal inline fun <reified ELEMENT_TYPE> unwrapNumericVectorFromArrow(
    vector: BaseFixedWidthVector,
    elementClass: Class<ELEMENT_TYPE>
): List<ELEMENT_TYPE?> {
    val slotCount = vector.valueCount
    return (0 until slotCount).map { slot -> vector.getObject(slot) as ELEMENT_TYPE? }
}
44+
45+
/**
 * Copies every slot of an Arrow [BitVector] into a list of nullable booleans.
 *
 * Values are taken from `getObject`, so a slot for which it returns `null` stays `null`.
 */
internal fun unwrapBooleanArrayFromArrow(vector: BitVector): ArrayList<Boolean?> =
    (0 until vector.valueCount).mapTo(ArrayList<Boolean?>()) { slot ->
        vector.getObject(slot)
    }
52+
53+
/** Creates a new [ArrowReader], the entry point for building a [DataFrame] from Arrow data. */
fun DataFrame.Companion.arrowReader(): ArrowReader = ArrowReader()
54+
55+
/**
 * Converts Apache Arrow data into krangl [DataFrame]s.
 *
 * Sources can be an in-memory [VectorSchemaRoot], a [SeekableByteChannel], a [ByteArray]
 * or a [File]; the channel/bytes/file variants are read with [ArrowFileReader].
 */
class ArrowReader() {
    /**
     * Internal low-level function.
     * Use this function if you are working with [VectorSchemaRoot]s directly in your project.
     *
     * Every field vector of [vectorSchemaRoot] becomes one column named after the vector:
     * - Utf8 -> [StringCol]
     * - signed 8/16/32-bit Int -> [IntCol], signed 64-bit Int -> [LongCol]
     * - single/double precision floating point -> [DoubleCol]
     * - Bool -> [BooleanCol]
     *
     * @throws Exception for list types, half-precision floats, unsigned integers and any
     * other Arrow type not listed above
     */
    fun fromVectorSchemaRoot(vectorSchemaRoot: VectorSchemaRoot): DataFrame {
        val kranglVectors: List<DataCol> = vectorSchemaRoot.fieldVectors.map { fieldVector ->
            val name = fieldVector.name
            when (val type = fieldVector.field.type) {
                is ArrowType.FixedSizeList, is ArrowType.List -> {
                    throw Exception("Matrices are not supported yet")
                }
                is ArrowType.Utf8 -> {
                    StringCol(name, unwrapStringArrayFromArrow(fieldVector as VarCharVector))
                }
                is ArrowType.Int -> {
                    // Unsigned fields are backed by UInt*Vector classes, so the casts below
                    // would fail with an opaque ClassCastException; reject them explicitly.
                    if (!type.getIsSigned()) {
                        throw Exception("Unsigned integer columns are not supported yet")
                    }
                    // The narrow vectors hand out Byte/Short objects, which cannot be cast
                    // to Int directly; unwrap with the native type and widen afterwards.
                    when (val bitWidth = type.bitWidth) {
                        8 -> IntCol(
                            name,
                            unwrapNumericVectorFromArrow(fieldVector as TinyIntVector, Byte::class.java)
                                .map { it?.toInt() }
                        )
                        16 -> IntCol(
                            name,
                            unwrapNumericVectorFromArrow(fieldVector as SmallIntVector, Short::class.java)
                                .map { it?.toInt() }
                        )
                        32 -> IntCol(name, unwrapNumericVectorFromArrow(fieldVector as IntVector, Int::class.java))
                        64 -> LongCol(name, unwrapNumericVectorFromArrow(fieldVector as BigIntVector, Long::class.java))
                        else -> throw java.lang.Exception("Incorrect Int.bitWidth ($bitWidth, should never happen)")
                    }
                }
                is ArrowType.FloatingPoint -> {
                    when (val precision = type.precision) {
                        // Previously the exception was created but never thrown.
                        FloatingPointPrecision.HALF -> throw java.lang.Exception("HALF float not supported")
                        // Float4Vector hands out Float objects; widen them to Double explicitly.
                        FloatingPointPrecision.SINGLE -> DoubleCol(
                            name,
                            unwrapNumericVectorFromArrow(fieldVector as Float4Vector, Float::class.java)
                                .map { it?.toDouble() }
                        )
                        FloatingPointPrecision.DOUBLE -> DoubleCol(
                            name,
                            unwrapNumericVectorFromArrow(fieldVector as Float8Vector, Double::class.java)
                        )
                        else -> throw java.lang.Exception("Incorrect FloatingPoint.precision ($precision, should never happen)")
                    }
                }
                is ArrowType.Bool -> {
                    BooleanCol(name, unwrapBooleanArrayFromArrow(fieldVector as BitVector))
                }
                else -> {
                    throw Exception("${type.typeID.name} is not supported yet")
                }
            }
        }

        return dataFrameOf(*kranglVectors.toTypedArray())
    }

    /**
     * Read [VectorSchemaRoot] from existing [channel] and convert it to [DataFrame].
     * Use this function if you want to manage channels yourself, make in-memory IPC sharing and so on.
     * If [allocator] is null, it will be created and closed inside.
     *
     * Only the first record batch is loaded (`loadNextBatch` is called once); further batches
     * in the channel are not read.
     */
    fun readFromChannel(channel: SeekableByteChannel, allocator: BufferAllocator?): DataFrame {
        if (allocator == null) {
            // Own the allocator for the duration of this single read.
            return RootAllocator().use { ownedAllocator -> readFromChannel(channel, ownedAllocator) }
        }
        return ArrowFileReader(channel, allocator).use { reader ->
            reader.loadNextBatch()
            // Conversion happens while the reader is still open, i.e. before its vectors are released.
            fromVectorSchemaRoot(reader.vectorSchemaRoot)
        }
    }

    /**
     * Read [VectorSchemaRoot] from [byteArray] and convert it to [DataFrame].
     */
    fun fromByteArray(byteArray: ByteArray): DataFrame =
        readFromChannel(ByteArrayReadableSeekableByteChannel(byteArray), null)

    /**
     * Read [VectorSchemaRoot] from [file] and convert it to [DataFrame].
     *
     * @throws Exception if [file] does not exist or is a directory
     */
    fun fromFile(file: File): DataFrame {
        if (!file.exists()) {
            throw Exception("${file.path} does not exist")
        }
        if (file.isDirectory) {
            throw Exception("${file.path} is directory")
        }
        return FileChannel.open(file.toPath(), StandardOpenOption.READ).use { channel ->
            readFromChannel(channel, null)
        }
    }

    /**
     * Read [VectorSchemaRoot] from file by [path] and convert it to [DataFrame].
     */
    fun fromFile(path: String): DataFrame = fromFile(File(path))
}
153+
154+
/** Wraps this [DataFrame] in an [ArrowWriter] that can export it in the Arrow format. */
fun DataFrame.arrowWriter(): ArrowWriter = ArrowWriter(this)
155+
156+
/**
 * Exports [dataFrame] to Apache Arrow: as an in-memory [VectorSchemaRoot], to a channel,
 * to a [ByteArray] or to a file (written with [ArrowFileWriter]).
 */
class ArrowWriter(val dataFrame: DataFrame) {
    /** Copies [column] into a new [VarCharVector] named after it; `null` values become null slots. */
    internal fun fromStringCol(column: StringCol, allocator: BufferAllocator): VarCharVector =
        VarCharVector(column.name, allocator).apply {
            allocateNew(column.length)
            column.values.forEachIndexed { index, value ->
                if (value == null) setNull(index) else setSafe(index, Text(value))
            }
            valueCount = column.length
        }

    /** Copies [column] into a new [BitVector]; `true` is stored as 1, `false` as 0, `null` as a null slot. */
    internal fun fromBooleanCol(column: BooleanCol, allocator: BufferAllocator): BitVector =
        BitVector(column.name, allocator).apply {
            allocateNew(column.length)
            column.values.forEachIndexed { index, value ->
                if (value == null) setNull(index) else setSafe(index, if (value) 1 else 0)
            }
            valueCount = column.length
        }

    /** Copies [column] into a new [IntVector] named after it; `null` values become null slots. */
    internal fun fromIntCol(column: IntCol, allocator: BufferAllocator): IntVector =
        IntVector(column.name, allocator).apply {
            allocateNew(column.length)
            column.values.forEachIndexed { index, value ->
                if (value == null) setNull(index) else setSafe(index, value)
            }
            valueCount = column.length
        }

    /** Copies [column] into a new [BigIntVector] named after it; `null` values become null slots. */
    internal fun fromLongCol(column: LongCol, allocator: BufferAllocator): BigIntVector =
        BigIntVector(column.name, allocator).apply {
            allocateNew(column.length)
            column.values.forEachIndexed { index, value ->
                if (value == null) setNull(index) else setSafe(index, value)
            }
            valueCount = column.length
        }

    /** Copies [column] into a new [Float8Vector] named after it; `null` values become null slots. */
    internal fun fromDoubleCol(column: DoubleCol, allocator: BufferAllocator): Float8Vector =
        Float8Vector(column.name, allocator).apply {
            allocateNew(column.length)
            column.values.forEachIndexed { index, value ->
                if (value == null) setNull(index) else setSafe(index, value)
            }
            valueCount = column.length
        }

    /**
     * Copies [column] into a new [VarCharVector], storing the `toString()` form of every
     * non-null value; `null` values become null slots.
     */
    internal fun fromAnyCol(column: AnyCol, allocator: BufferAllocator): VarCharVector =
        VarCharVector(column.name, allocator).apply {
            allocateNew(column.length)
            column.values.forEachIndexed { index, value ->
                if (value == null) setNull(index) else setSafe(index, Text(value.toString()))
            }
            valueCount = column.length
        }

    /**
     * Internal low-level function.
     * Use this function if you are working with [VectorSchemaRoot]s and [BufferAllocator]s directly in your project.
     *
     * One Arrow vector is allocated from [allocator] per column of [dataFrame]. The caller owns
     * the returned root and is responsible for closing it.
     *
     * @throws Exception if a column has a type other than String, Boolean, Int, Long, Double or Any;
     * vectors allocated before the failure are closed first
     */
    fun allocateVectorSchemaRoot(allocator: BufferAllocator): VectorSchemaRoot {
        val arrowVectors = ArrayList<org.apache.arrow.vector.FieldVector>()
        try {
            for (column in dataFrame.cols) {
                // add(), not +=: Arrow vectors are themselves Iterable, which would make += ambiguous.
                arrowVectors.add(
                    when (column) {
                        is StringCol -> fromStringCol(column, allocator)
                        is BooleanCol -> fromBooleanCol(column, allocator)
                        is IntCol -> fromIntCol(column, allocator)
                        is LongCol -> fromLongCol(column, allocator)
                        is DoubleCol -> fromDoubleCol(column, allocator)
                        is AnyCol -> fromAnyCol(column, allocator)
                        else -> throw Exception("Unknown column type ${column.javaClass.canonicalName}")
                    }
                )
            }
            return VectorSchemaRoot(arrowVectors)
        } catch (failure: Throwable) {
            // Release what was allocated so far; otherwise the buffers stay charged to the allocator.
            arrowVectors.forEach { it.close() }
            throw failure
        }
    }

    /**
     * Export [dataFrame] to [VectorSchemaRoot] and write it to any existing [channel].
     * Use this function if you want to manage channels yourself, make in-memory IPC sharing and so on.
     *
     * A temporary [RootAllocator] is created for the export and closed afterwards; the whole
     * data frame is written as a single record batch.
     */
    fun writeToChannel(channel: WritableByteChannel) {
        RootAllocator().use { allocator ->
            allocateVectorSchemaRoot(allocator).use { vectorSchemaRoot ->
                ArrowFileWriter(vectorSchemaRoot, null, channel).use { writer ->
                    writer.writeBatch()
                }
            }
        }
    }

    /**
     * Export [dataFrame] to [VectorSchemaRoot] and write it to new ByteArray.
     */
    fun toByteArray(): ByteArray {
        val buffer = ByteArrayOutputStream()
        Channels.newChannel(buffer).use { channel -> writeToChannel(channel) }
        return buffer.toByteArray()
    }

    /**
     * Export [dataFrame] to [VectorSchemaRoot] and write it to new or existing [file].
     * Temporary file is created if [file] argument is null.
     * Existing content of the target file is discarded.
     *
     * @return the file that was written
     */
    fun toFile(file: File?): File {
        val saveToFile = file ?: File.createTempFile("DataFrame", ".arrow")

        FileChannel.open(
            saveToFile.toPath(),
            StandardOpenOption.WRITE,
            StandardOpenOption.CREATE,
            StandardOpenOption.TRUNCATE_EXISTING
        ).use { channel ->
            writeToChannel(channel)
        }
        return saveToFile
    }

    /**
     * Export [dataFrame] to [VectorSchemaRoot] and write it to new or existing file by [path].
     * Temporary file is created if [path] argument is null.
     *
     * @return the file that was written
     */
    fun toFile(path: String?): File = toFile(path?.let { File(it) })
}

0 commit comments

Comments
 (0)