Commit fe91aa0

Build with eBay internal Spark

1 parent acc27a4 commit fe91aa0
24 files changed: +330 −99 lines changed
Lines changed: 137 additions & 0 deletions
@@ -0,0 +1,137 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name: Build eBay bundle package

env:
  ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
  M2_REPOSITORY: '/root/.m2/repository/'

on:
  push:
    branches:
      - 'ebay-build**'
    tags:
      - 'ebay-build**'

jobs:
  build-native-lib:
    runs-on: ubuntu-22.04
    env:
      OAUTH_KEY: ${{ secrets.OAUTH_KEY }}
    steps:
      - uses: actions/checkout@v4
      - name: Build Gluten velox third party
        run: |
          docker run -v $GITHUB_WORKSPACE:/work -w /work apache/gluten:vcpkg-centos-7 bash -c "
            df -a
            cd /work
            bash dev/ci-velox-buildstatic-centos-7.sh
            mkdir -p /work/.m2/repository/org/apache/arrow/
            cp -r $M2_REPOSITORY/org/apache/arrow arrow-package
          "
      - name: Upload native libs
        uses: actions/upload-artifact@v4
        with:
          path: ./cpp/build/releases/
          name: velox-native-lib-${{github.sha}}
          retention-days: 1
          if-no-files-found: error
      - name: Upload Artifact Arrow Jar
        uses: actions/upload-artifact@v4
        with:
          path: ./arrow-package
          name: velox-arrow-jar-centos-7-${{github.sha}}
          retention-days: 1
          if-no-files-found: error

  build-bundle-package-centos8:
    needs: build-native-lib
    runs-on: ubuntu-22.04
    env:
      OAUTH_KEY: ${{ secrets.OAUTH_KEY }}
    container: centos:8
    steps:
      - uses: actions/checkout@v1
        with:
          repository: wangyum/packages
          ref: master
          token: ${{ secrets.EBAY_TOKEN }}
      - name: Install packages to $M2_REPOSITORY
        run: |
          mkdir -p $M2_REPOSITORY
          rm -rf $M2_REPOSITORY/io && rm -rf $M2_REPOSITORY/org
          mv ../packages/* $M2_REPOSITORY
          cd $M2_REPOSITORY && find . -name "_*.repositories" | xargs rm -rf # Fix Could not find artifact io.ebay.rheos ...
      - uses: actions/checkout@v2
      - name: Download All Artifacts
        uses: actions/download-artifact@v4
        with:
          name: velox-native-lib-${{github.sha}}
          path: ./cpp/build/releases
      - name: Download All Arrow Jar Artifacts
        uses: actions/download-artifact@v4
        with:
          name: velox-arrow-jar-centos-7-${{github.sha}}
          path: /root/.m2/repository/org/apache/arrow/
      - name: Setup java and maven
        run: |
          sed -i 's/mirrorlist/#mirrorlist/g' /etc/yum.repos.d/CentOS-* && \
          sed -i 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-* && \
          yum update -y && yum install -y java-1.8.0-openjdk-devel wget && \
          wget https://dlcdn.apache.org/maven/maven-3/3.8.9/binaries/apache-maven-3.8.9-bin.tar.gz && \
          tar -xvf apache-maven-3.8.9-bin.tar.gz && \
          mv apache-maven-3.8.9 /usr/lib/maven
      - uses: actions/checkout@v1
        with:
          repository: wangyum/ebay-spark
          ref: gluten-build
          token: ${{ secrets.EBAY_TOKEN }}
      - name: Install ebay-spark to $M2_REPOSITORY
        run: |
          cd ../ebay-spark && build/mvn clean install -Dhadoop.version=3.3.3.1.0.39 -DskipTests=true -Dmaven.javadoc.skip=true
      - name: Build for eBay Spark 3.5
        run: |
          cd $GITHUB_WORKSPACE/ && \
          export MAVEN_HOME=/usr/lib/maven && \
          export PATH=${PATH}:${MAVEN_HOME}/bin && \
          mvn clean install -Pspark-3.5 -Dhadoop.version=3.3.3.1.0.39 -Pbackends-velox -Pceleborn -Piceberg -Pdelta -DskipTests=true -Dmaven.javadoc.skip=true -Dmaven.scaladoc.skip=true -Dmaven.source.skip -Dcyclonedx.skip=true
          ls -lh package/target/
      - name: Upload bundle package
        uses: actions/upload-artifact@v4
        with:
          name: gluten-velox-bundle-package
          path: package/target/gluten-velox-bundle-*.jar
          retention-days: 3
          if-no-files-found: error
      - name: Install ebay-spark to $M2_REPOSITORY with scala 2.13
        run: |
          cd ../ebay-spark && ./dev/change-scala-version.sh 2.13 && build/mvn clean install -Pscala-2.13 -Dhadoop.version=3.3.3.1.0.39 -DskipTests=true -Dmaven.javadoc.skip=true
      - name: Build for eBay Spark 3.5 with scala 2.13
        run: |
          cd $GITHUB_WORKSPACE/ && \
          export MAVEN_HOME=/usr/lib/maven && \
          export PATH=${PATH}:${MAVEN_HOME}/bin && \
          export SPARK_SCALA_VERSION=2.13 && \
          mvn clean install -Pspark-3.5 -Pscala-2.13 -Dhadoop.version=3.3.3.1.0.39 -Pbackends-velox -Pceleborn -Piceberg -Pdelta -DskipTests=true -Dmaven.javadoc.skip=true -Dmaven.scaladoc.skip=true -Dmaven.source.skip -Dcyclonedx.skip=true
          ls -lh package/target/
          md5sum package/target/gluten-velox-bundle-spark3.5*
      - name: Upload bundle package with scala 2.13
        uses: actions/upload-artifact@v4
        with:
          name: gluten-velox-bundle-package-scala2.13
          path: package/target/gluten-velox-bundle-*.jar
          retention-days: 3
          if-no-files-found: error

.idea/vcs.xml

Lines changed: 3 additions & 21 deletions
Some generated files are not rendered by default.

backends-velox/src/main/scala/org/apache/spark/sql/execution/VeloxColumnarWriteFilesExec.scala

Lines changed: 2 additions & 2 deletions
@@ -154,7 +154,7 @@ class VeloxColumnarWriteFilesRDD(
       } else {
         Some(
           WriteTaskResult(
-            new TaskCommitMessage(addedAbsPathFiles.toMap -> updatedPartitions),
+            new TaskCommitMessage(addedAbsPathFiles.toMap, updatedPartitions, numFiles),
             summary))
       }
     }
@@ -277,7 +277,7 @@ case class VeloxColumnarWriteFilesExec private (
     if (rdd.partitions.length == 0) {
       // SPARK-23271 If we are attempting to write a zero partition rdd, create a dummy single
       // partition rdd to make sure we at least set up one write task to write the metadata.
-      writeFilesForEmptyRDD(description, committer, jobTrackerID)
+      writeFilesForEmptyRDD(description, committer, jobTrackerID, writeFilesSpec)
     } else {
       new VeloxColumnarWriteFilesRDD(rdd, description, committer, jobTrackerID)
     }

dev/build_arrow.sh

Lines changed: 1 addition & 2 deletions
@@ -26,8 +26,7 @@ BUILD_TYPE=Release

 function prepare_arrow_build() {
   mkdir -p ${ARROW_PREFIX}/../ && pushd ${ARROW_PREFIX}/../ && sudo rm -rf arrow_ep/
-  wget_and_untar https://github.com/apache/arrow/archive/refs/tags/apache-arrow-${VELOX_ARROW_BUILD_VERSION}.tar.gz arrow_ep
-  #wget_and_untar https://archive.apache.org/dist/arrow/arrow-${VELOX_ARROW_BUILD_VERSION}/apache-arrow-${VELOX_ARROW_BUILD_VERSION}.tar.gz arrow_ep
+  git clone -b apache-arrow-15.0.0 https://github.com/apache/arrow.git arrow_ep
   cd arrow_ep
   patch -p1 < $CURRENT_DIR/../ep/build-velox/src/modify_arrow.patch
   patch -p1 < $CURRENT_DIR/../ep/build-velox/src/modify_arrow_dataset_scan_option.patch

dev/ci-velox-buildstatic-centos-7.sh

Lines changed: 1 addition & 1 deletion
@@ -19,5 +19,5 @@ set -e

 source /opt/rh/devtoolset-11/enable
 export NUM_THREADS=4
-./dev/builddeps-veloxbe.sh --enable_vcpkg=ON --build_arrow=OFF --build_tests=OFF --build_benchmarks=OFF \
+./dev/builddeps-veloxbe.sh --enable_vcpkg=ON --build_arrow=ON --build_tests=OFF --build_benchmarks=OFF \
   --build_examples=OFF --enable_s3=ON --enable_gcs=ON --enable_hdfs=ON --enable_abfs=ON

gluten-core/src/main/scala/org/apache/gluten/config/GlutenCoreConfig.scala

Lines changed: 1 addition & 1 deletion
@@ -80,7 +80,7 @@ object GlutenCoreConfig {
       .doc("Whether to enable gluten. Default value is true. Just an experimental property." +
         " Recommend to enable/disable Gluten through the setting for spark.plugins.")
       .booleanConf
-      .createWithDefault(true)
+      .createWithDefault(false)

   // Options used by RAS.
   val RAS_ENABLED =
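Note: with this change the Gluten enabled flag defaults to false in this build, so jobs have to opt in explicitly. Below is a minimal opt-in sketch, assuming the upstream key name spark.gluten.enabled, the plugin class org.apache.gluten.GlutenPlugin, and the usual off-heap requirements from the Gluten docs; none of these names appear in this diff.

import org.apache.spark.sql.SparkSession

object GlutenOptInSketch extends App {
  val spark = SparkSession.builder()
    .appName("gluten-opt-in-sketch")
    // Load the Gluten plugin; class name taken from upstream Gluten docs, not from this diff.
    .config("spark.plugins", "org.apache.gluten.GlutenPlugin")
    // Opt in explicitly now that createWithDefault is false in this build.
    .config("spark.gluten.enabled", "true")
    // Off-heap memory is required by the Velox backend (sizes here are placeholders).
    .config("spark.memory.offHeap.enabled", "true")
    .config("spark.memory.offHeap.size", "2g")
    .getOrCreate()

  spark.range(10).count() // trivial query to confirm the session comes up
  spark.stop()
}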

gluten-core/src/main/scala/org/apache/gluten/execution/GlutenPlan.scala

Lines changed: 2 additions & 0 deletions
@@ -57,6 +57,8 @@ trait GlutenPlan
     rowType() != Convention.RowType.None
   }

+  final override val supportsVectorExecution: Boolean = true
+
   override def batchType(): Convention.BatchType

   override def rowType0(): Convention.RowType

gluten-substrait/src/main/scala/org/apache/gluten/config/GlutenConfig.scala

Lines changed: 3 additions & 3 deletions
@@ -963,15 +963,15 @@ object GlutenConfig {
       .doc("The threshold to determine whether to use sort-based columnar shuffle. Sort-based " +
         "shuffle will be used if the number of partitions is greater than this threshold.")
       .intConf
-      .createWithDefault(4000)
+      .createWithDefault(0)

   val COLUMNAR_SHUFFLE_SORT_COLUMNS_THRESHOLD =
     buildConf("spark.gluten.sql.columnar.shuffle.sort.columns.threshold")
       .internal()
       .doc("The threshold to determine whether to use sort-based columnar shuffle. Sort-based " +
         "shuffle will be used if the number of columns is greater than this threshold.")
       .intConf
-      .createWithDefault(100000)
+      .createWithDefault(0)

   val COLUMNAR_TABLE_CACHE_ENABLED =
     buildConf("spark.gluten.sql.columnar.tableCache")
@@ -1559,7 +1559,7 @@ object GlutenConfig {
       .internal()
       .doc("If enabled, gluten will convert the viewfs path to hdfs path in scala side")
       .booleanConf
-      .createWithDefault(false)
+      .createWithDefault(true)

   val ENCRYPTED_PARQUET_FALLBACK_ENABLED =
     buildConf("spark.gluten.sql.fallbackEncryptedParquet")

gluten-substrait/src/main/scala/org/apache/gluten/execution/BatchScanExecTransformer.scala

Lines changed: 13 additions & 2 deletions
@@ -29,6 +29,7 @@ import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.QueryPlan
 import org.apache.spark.sql.catalyst.util.truncatedString
 import org.apache.spark.sql.connector.catalog.Table
+import org.apache.spark.sql.connector.catalog.functions.Reducer
 import org.apache.spark.sql.connector.read.{InputPartition, Scan}
 import org.apache.spark.sql.execution.datasources.v2.{BatchScanExecShim, FileScan}
 import org.apache.spark.sql.execution.metric.SQLMetric
@@ -42,7 +43,9 @@ case class BatchScanExecTransformer(
     override val keyGroupedPartitioning: Option[Seq[Expression]] = None,
     override val ordering: Option[Seq[SortOrder]] = None,
     @transient override val table: Table,
+    override val joinKeyPositions: Option[Seq[Int]] = None,
     override val commonPartitionValues: Option[Seq[(InternalRow, Int)]] = None,
+    override val reducers: Option[Seq[Option[Reducer[_, _]]]] = None,
     override val applyPartialClustering: Boolean = false,
     override val replicatePartitions: Boolean = false)
   extends BatchScanExecTransformerBase(
@@ -52,9 +55,12 @@ case class BatchScanExecTransformer(
     keyGroupedPartitioning,
     ordering,
     table,
+    joinKeyPositions,
     commonPartitionValues,
+    reducers,
     applyPartialClustering,
-    replicatePartitions) {
+    replicatePartitions
+  ) {

   protected[this] def supportsBatchScan(scan: Scan): Boolean = {
     scan.isInstanceOf[FileScan]
@@ -77,7 +83,9 @@ abstract class BatchScanExecTransformerBase(
     override val keyGroupedPartitioning: Option[Seq[Expression]] = None,
     override val ordering: Option[Seq[SortOrder]] = None,
     @transient override val table: Table,
+    override val joinKeyPositions: Option[Seq[Int]] = None,
     override val commonPartitionValues: Option[Seq[(InternalRow, Int)]] = None,
+    override val reducers: Option[Seq[Option[Reducer[_, _]]]] = None,
     override val applyPartialClustering: Boolean = false,
     override val replicatePartitions: Boolean = false)
   extends BatchScanExecShim(
@@ -87,9 +95,12 @@ abstract class BatchScanExecTransformerBase(
     keyGroupedPartitioning,
     ordering,
     table,
+    joinKeyPositions,
     commonPartitionValues,
+    reducers,
     applyPartialClustering,
-    replicatePartitions)
+    replicatePartitions
+  )
   with BasicScanExecTransformer {

   // Note: "metrics" is made transient to avoid sending driver-side metrics to tasks.

gluten-substrait/src/main/scala/org/apache/gluten/utils/InputPartitionsUtil.scala

Lines changed: 32 additions & 12 deletions
@@ -21,7 +21,9 @@ import org.apache.gluten.sql.shims.SparkShimLoader
 import org.apache.spark.internal.Logging
 import org.apache.spark.sql.catalyst.expressions.Attribute
 import org.apache.spark.sql.connector.read.InputPartition
+import org.apache.spark.sql.execution.PartitionedFileUtil
 import org.apache.spark.sql.execution.datasources.{BucketingUtils, FilePartition, HadoopFsRelation, PartitionDirectory}
+import org.apache.spark.sql.execution.datasources.FilePartition.{maxSplitBytesBySpecifiedNum, minPartitionNumBySpecifiedSize}
 import org.apache.spark.sql.types.StructType
 import org.apache.spark.util.collection.BitSet

@@ -47,9 +49,30 @@ case class InputPartitionsUtil(
   }

   private def genNonBuckedInputPartitionSeq(): Seq[InputPartition] = {
+    val originSize = FilePartition.maxSplitBytes(relation.sparkSession, selectedPartitions)
     val openCostInBytes = relation.sparkSession.sessionState.conf.filesOpenCostInBytes
     val maxSplitBytes =
-      FilePartition.maxSplitBytes(relation.sparkSession, selectedPartitions)
+      if (
+        relation.sparkSession.sessionState.conf.bucketingEnabled &&
+        relation.bucketSpec.isDefined
+      ) {
+        val partitionNum =
+          minPartitionNumBySpecifiedSize(relation.sparkSession, selectedPartitions, originSize)
+        val bucketNum = math.max(
+          relation.bucketSpec.get.numBuckets,
+          relation.sparkSession.sessionState.conf.numShufflePartitions)
+        val maxBucketScanParts = relation.sparkSession.sessionState.conf.filesMaxPartitionNum
+          .map(_.min(bucketNum))
+          .getOrElse(bucketNum)
+        if (partitionNum > maxBucketScanParts) {
+          maxSplitBytesBySpecifiedNum(relation.sparkSession, selectedPartitions, maxBucketScanParts)
+        } else {
+          originSize
+        }
+      } else {
+        originSize
+      }
+
     logInfo(
       s"Planning scan with bin packing, max size: $maxSplitBytes bytes, " +
         s"open cost is considered as scanning $openCostInBytes bytes.")
@@ -73,22 +96,19 @@ case class InputPartitionsUtil(
     val splitFiles = selectedPartitions
       .flatMap {
         partition =>
-          SparkShimLoader.getSparkShims.getFileStatus(partition).flatMap {
+          partition.files.flatMap {
             file =>
               // getPath() is very expensive so we only want to call it once in this block:
-              val filePath = file._1.getPath
+              val filePath = file.path
               if (shouldProcess(filePath)) {
                 val isSplitable =
                   SparkShimLoader.getSparkShims.isFileSplittable(relation, filePath, requiredSchema)
-                SparkShimLoader.getSparkShims.splitFiles(
-                  sparkSession = relation.sparkSession,
-                  file = file._1,
-                  filePath = filePath,
-                  isSplitable = isSplitable,
-                  maxSplitBytes = maxSplitBytes,
-                  partitionValues = partition.values,
-                  metadata = file._2
-                )
+                PartitionedFileUtil.splitFiles(
+                  relation.sparkSession,
+                  file,
+                  isSplitable,
+                  maxSplitBytes,
+                  partition.values)
               } else {
                 Seq.empty
               }
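Note: the new branch above caps how many scan splits a bucketed table can produce. The following standalone arithmetic sketch illustrates the effect with hypothetical numbers that do not come from this commit, and it ignores the per-file open cost that the real helpers account for.

object BucketedSplitSizeSketch extends App {
  val totalBytes           = 512L * 1024 * 1024 * 1024 // 512 GiB of selected files (assumed)
  val originSize           = 128L * 1024 * 1024        // default size-based split: 128 MiB
  val numBuckets           = 1000                      // bucket count of the table (assumed)
  val numShufflePartitions = 2000                      // spark.sql.shuffle.partitions (assumed)

  // Roughly what minPartitionNumBySpecifiedSize would report for this input size.
  val partitionNum = totalBytes / originSize            // 4096 size-based splits

  // No spark.sql.files.maxPartitionNum cap assumed, so the bucket-derived cap wins.
  val maxBucketScanParts = math.max(numBuckets, numShufflePartitions) // 2000

  // When the size-based count exceeds the cap, splits are enlarged to land near the cap.
  val maxSplitBytes =
    if (partitionNum > maxBucketScanParts) totalBytes / maxBucketScanParts // ~262 MiB per split
    else originSize

  println(s"size-based splits = $partitionNum, cap = $maxBucketScanParts, maxSplitBytes = $maxSplitBytes")
}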
