DO NOT REVIEW #9413

Closed
wants to merge 38 commits into from
Changes from all commits: 38 commits
ddbd045
Merge pull request #9293 from NVIDIA/branch-23.10
nvauto Sep 25, 2023
110be04
Merge pull request #9295 from NVIDIA/branch-23.10
nvauto Sep 25, 2023
49a277e
Init project version 23.12.0-SNAPSHOT (#9292)
pxLi Sep 26, 2023
5f84302
Merge pull request #9298 from NVIDIA/branch-23.10
nvauto Sep 26, 2023
2dc4d5b
Merge pull request #9307 from NVIDIA/branch-23.10
nvauto Sep 26, 2023
d66697e
Merge pull request #9312 from NVIDIA/branch-23.10
nvauto Sep 27, 2023
4825341
Merge pull request #9315 from NVIDIA/branch-23.10
nvauto Sep 27, 2023
5678328
Merge pull request #9316 from NVIDIA/branch-23.10
nvauto Sep 28, 2023
da17346
Merge pull request #9317 from NVIDIA/branch-23.10
nvauto Sep 28, 2023
6dcc63f
Merge pull request #9319 from NVIDIA/branch-23.10
nvauto Sep 28, 2023
a9403dc
Initiate arm64 CI support [skip ci] (#9308)
pxLi Sep 28, 2023
f3c0c43
Merge pull request #9323 from NVIDIA/branch-23.10
nvauto Sep 28, 2023
c6f165d
Merge pull request #9324 from NVIDIA/branch-23.10
nvauto Sep 28, 2023
c6664c5
Merge pull request #9333 from NVIDIA/branch-23.10
nvauto Sep 28, 2023
9f54c9a
Merge pull request #9336 from NVIDIA/branch-23.10
nvauto Sep 28, 2023
2eb3cce
Merge pull request #9339 from NVIDIA/branch-23.10
nvauto Sep 28, 2023
59a15b2
Merge pull request #9341 from NVIDIA/branch-23.10
nvauto Sep 29, 2023
9be814f
Merge pull request #9345 from NVIDIA/branch-23.10
nvauto Sep 29, 2023
bf6c51f
Merge pull request #9346 from NVIDIA/branch-23.10
nvauto Sep 29, 2023
07df6a2
Merge pull request #9355 from NVIDIA/branch-23.10
nvauto Sep 30, 2023
b7341fb
Merge pull request #9358 from NVIDIA/branch-23.10
nvauto Oct 2, 2023
c7ec6f8
Merge pull request #9360 from NVIDIA/branch-23.10
nvauto Oct 2, 2023
58d517a
Merge pull request #9361 from NVIDIA/branch-23.10
nvauto Oct 2, 2023
fbe81cb
Merge pull request #9363 from NVIDIA/branch-23.10
nvauto Oct 2, 2023
9f4b7fb
Merge pull request #9368 from NVIDIA/branch-23.10
nvauto Oct 3, 2023
d35610b
Merging branch-23.10 into branch-23.12
mattahrens Oct 3, 2023
08f083b
Merge pull request #9373 from mattahrens/fix-auto-merge-conflict-9372
jlowe Oct 4, 2023
a6b5520
Merge pull request #9378 from NVIDIA/branch-23.10
nvauto Oct 4, 2023
8eef296
Merge pull request #9379 from NVIDIA/branch-23.10
nvauto Oct 4, 2023
677dd4a
Improve JSON empty row fix to use less memory (#9369)
andygrove Oct 5, 2023
84937fd
Merge pull request #9389 from NVIDIA/branch-23.10
nvauto Oct 5, 2023
dcb03d5
Add developer documentation about working with data sources [skip ci]…
andygrove Oct 5, 2023
47e62cb
Merge pull request #9391 from NVIDIA/branch-23.10
nvauto Oct 5, 2023
54dccf1
Merge pull request #9402 from NVIDIA/branch-23.10
nvauto Oct 9, 2023
20fa0f3
Merge pull request #9406 from NVIDIA/branch-23.10
nvauto Oct 9, 2023
b68783e
Merge pull request #9410 from NVIDIA/branch-23.10
nvauto Oct 9, 2023
c08119d
test only
pxLi Oct 10, 2023
cd76d9f
revert test update
pxLi Oct 10, 2023
10 changes: 10 additions & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,16 @@ flag if cross-compilation is required.
mvn clean verify -Dbuildver=330 -P<jdk11|jdk17>
```

### Building and Testing with ARM

To build the project for the ARM platform, add `-Parm64` to your Maven commands.
NOTE: The build itself does not require an ARM machine, so if you only want to build the
artifacts on an x86 machine, also add `-DskipTests` to your commands.

```bash
mvn clean verify -Dbuildver=311 -Parm64
```

### Iterative development during local testing

When iterating on changes impacting the `dist` module artifact directly or via
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ as a `provided` dependency.
<dependency>
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark_2.12</artifactId>
<version>23.10.0-SNAPSHOT</version>
<version>23.12.0-SNAPSHOT</version>
<scope>provided</scope>
</dependency>
```
4 changes: 2 additions & 2 deletions aggregator/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,12 @@
<parent>
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-parent</artifactId>
<version>23.10.0-SNAPSHOT</version>
<version>23.12.0-SNAPSHOT</version>
</parent>
<artifactId>rapids-4-spark-aggregator_2.12</artifactId>
<name>RAPIDS Accelerator for Apache Spark Aggregator</name>
<description>Creates an aggregated shaded package of the RAPIDS plugin for Apache Spark</description>
<version>23.10.0-SNAPSHOT</version>
<version>23.12.0-SNAPSHOT</version>

<properties>
<!--
Expand Down
4 changes: 2 additions & 2 deletions api_validation/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,10 @@
<parent>
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-parent</artifactId>
<version>23.10.0-SNAPSHOT</version>
<version>23.12.0-SNAPSHOT</version>
</parent>
<artifactId>rapids-4-spark-api-validation</artifactId>
<version>23.10.0-SNAPSHOT</version>
<version>23.12.0-SNAPSHOT</version>

<profiles>
<profile>
Expand Down
6 changes: 3 additions & 3 deletions datagen/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,12 @@ corresponding profile flag `-P<jdk11|jdk17>`

After this the jar should be at
`target/datagen_2.12-$PLUGIN_VERSION-spark$SPARK_VERSION.jar`
for example a Spark 3.3.0 jar for the 23.10.0 release would be
`target/datagen_2.12-23.10.0-spark330.jar`
for example a Spark 3.3.0 jar for the 23.12.0 release would be
`target/datagen_2.12-23.12.0-spark330.jar`

To get a spark shell with this you can run
```shell
spark-shell --jars target/datagen_2.12-23.10.0-spark330.jar
spark-shell --jars target/datagen_2.12-23.12.0-spark330.jar
```

After that you should be good to go.
Expand Down
2 changes: 1 addition & 1 deletion datagen/ScaleTest.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ $SPARK_HOME/bin/spark-submit \
--conf spark.sql.parquet.datetimeRebaseModeInWrite=CORRECTED \
--class com.nvidia.rapids.tests.scaletest.ScaleTestDataGen \ # the main class
--jars $SPARK_HOME/examples/jars/scopt_2.12-3.7.1.jar \ # one dependency jar just shipped with Spark under $SPARK_HOME
./target/datagen_2.12-23.10.0-SNAPSHOT-spark332.jar \
./target/datagen_2.12-23.12.0-SNAPSHOT-spark332.jar \
1 \
10 \
parquet \
Expand Down
4 changes: 2 additions & 2 deletions datagen/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,12 @@
<parent>
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-parent</artifactId>
<version>23.10.0-SNAPSHOT</version>
<version>23.12.0-SNAPSHOT</version>
</parent>
<artifactId>datagen_2.12</artifactId>
<name>Data Generator</name>
<description>Tools for generating large amounts of data</description>
<version>23.10.0-SNAPSHOT</version>
<version>23.12.0-SNAPSHOT</version>
<properties>
<target.classifier/>
<rapids.default.jar.excludePattern>**/*</rapids.default.jar.excludePattern>
Expand Down
4 changes: 2 additions & 2 deletions delta-lake/delta-20x/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,14 @@
<parent>
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-parent</artifactId>
<version>23.10.0-SNAPSHOT</version>
<version>23.12.0-SNAPSHOT</version>
<relativePath>../../pom.xml</relativePath>
</parent>

<artifactId>rapids-4-spark-delta-20x_2.12</artifactId>
<name>RAPIDS Accelerator for Apache Spark Delta Lake 2.0.x Support</name>
<description>Delta Lake 2.0.x support for the RAPIDS Accelerator for Apache Spark</description>
<version>23.10.0-SNAPSHOT</version>
<version>23.12.0-SNAPSHOT</version>

<properties>
<rapids.compressed.artifact>false</rapids.compressed.artifact>
Expand Down
4 changes: 2 additions & 2 deletions delta-lake/delta-21x/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,14 @@
<parent>
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-parent</artifactId>
<version>23.10.0-SNAPSHOT</version>
<version>23.12.0-SNAPSHOT</version>
<relativePath>../../pom.xml</relativePath>
</parent>

<artifactId>rapids-4-spark-delta-21x_2.12</artifactId>
<name>RAPIDS Accelerator for Apache Spark Delta Lake 2.1.x Support</name>
<description>Delta Lake 2.1.x support for the RAPIDS Accelerator for Apache Spark</description>
<version>23.10.0-SNAPSHOT</version>
<version>23.12.0-SNAPSHOT</version>

<properties>
<rapids.compressed.artifact>false</rapids.compressed.artifact>
Expand Down
4 changes: 2 additions & 2 deletions delta-lake/delta-22x/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,14 @@
<parent>
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-parent</artifactId>
<version>23.10.0-SNAPSHOT</version>
<version>23.12.0-SNAPSHOT</version>
<relativePath>../../pom.xml</relativePath>
</parent>

<artifactId>rapids-4-spark-delta-22x_2.12</artifactId>
<name>RAPIDS Accelerator for Apache Spark Delta Lake 2.2.x Support</name>
<description>Delta Lake 2.2.x support for the RAPIDS Accelerator for Apache Spark</description>
<version>23.10.0-SNAPSHOT</version>
<version>23.12.0-SNAPSHOT</version>

<properties>
<rapids.compressed.artifact>false</rapids.compressed.artifact>
Expand Down
4 changes: 2 additions & 2 deletions delta-lake/delta-24x/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,14 @@
<parent>
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-parent</artifactId>
<version>23.10.0-SNAPSHOT</version>
<version>23.12.0-SNAPSHOT</version>
<relativePath>../../pom.xml</relativePath>
</parent>

<artifactId>rapids-4-spark-delta-24x_2.12</artifactId>
<name>RAPIDS Accelerator for Apache Spark Delta Lake 2.4.x Support</name>
<description>Delta Lake 2.4.x support for the RAPIDS Accelerator for Apache Spark</description>
<version>23.10.0-SNAPSHOT</version>
<version>23.12.0-SNAPSHOT</version>

<properties>
<rapids.compressed.artifact>false</rapids.compressed.artifact>
Expand Down
4 changes: 2 additions & 2 deletions delta-lake/delta-spark321db/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,14 @@
<parent>
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-parent</artifactId>
<version>23.10.0-SNAPSHOT</version>
<version>23.12.0-SNAPSHOT</version>
<relativePath>../../pom.xml</relativePath>
</parent>

<artifactId>rapids-4-spark-delta-spark321db_2.12</artifactId>
<name>RAPIDS Accelerator for Apache Spark Databricks 10.4 Delta Lake Support</name>
<description>Databricks 10.4 Delta Lake support for the RAPIDS Accelerator for Apache Spark</description>
<version>23.10.0-SNAPSHOT</version>
<version>23.12.0-SNAPSHOT</version>

<properties>
<rapids.compressed.artifact>false</rapids.compressed.artifact>
Expand Down
4 changes: 2 additions & 2 deletions delta-lake/delta-spark330db/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,14 @@
<parent>
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-parent</artifactId>
<version>23.10.0-SNAPSHOT</version>
<version>23.12.0-SNAPSHOT</version>
<relativePath>../../pom.xml</relativePath>
</parent>

<artifactId>rapids-4-spark-delta-spark330db_2.12</artifactId>
<name>RAPIDS Accelerator for Apache Spark Databricks 11.3 Delta Lake Support</name>
<description>Databricks 11.3 Delta Lake support for the RAPIDS Accelerator for Apache Spark</description>
<version>23.10.0-SNAPSHOT</version>
<version>23.12.0-SNAPSHOT</version>

<properties>
<rapids.compressed.artifact>false</rapids.compressed.artifact>
Expand Down
4 changes: 2 additions & 2 deletions delta-lake/delta-spark332db/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,14 @@
<parent>
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-parent</artifactId>
<version>23.10.0-SNAPSHOT</version>
<version>23.12.0-SNAPSHOT</version>
<relativePath>../../pom.xml</relativePath>
</parent>

<artifactId>rapids-4-spark-delta-spark332db_2.12</artifactId>
<name>RAPIDS Accelerator for Apache Spark Databricks 12.2 Delta Lake Support</name>
<description>Databricks 12.2 Delta Lake support for the RAPIDS Accelerator for Apache Spark</description>
<version>23.10.0-SNAPSHOT</version>
<version>23.12.0-SNAPSHOT</version>

<properties>
<rapids.compressed.artifact>false</rapids.compressed.artifact>
Expand Down
4 changes: 2 additions & 2 deletions delta-lake/delta-stub/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,14 @@
<parent>
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-parent</artifactId>
<version>23.10.0-SNAPSHOT</version>
<version>23.12.0-SNAPSHOT</version>
<relativePath>../../pom.xml</relativePath>
</parent>

<artifactId>rapids-4-spark-delta-stub_2.12</artifactId>
<name>RAPIDS Accelerator for Apache Spark Delta Lake Stub</name>
<description>Delta Lake stub for the RAPIDS Accelerator for Apache Spark</description>
<version>23.10.0-SNAPSHOT</version>
<version>23.12.0-SNAPSHOT</version>

<properties>
<rapids.compressed.artifact>false</rapids.compressed.artifact>
Expand Down
4 changes: 2 additions & 2 deletions dist/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,12 @@
<parent>
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-parent</artifactId>
<version>23.10.0-SNAPSHOT</version>
<version>23.12.0-SNAPSHOT</version>
</parent>
<artifactId>rapids-4-spark_2.12</artifactId>
<name>RAPIDS Accelerator for Apache Spark Distribution</name>
<description>Creates the distribution package of the RAPIDS plugin for Apache Spark</description>
<version>23.10.0-SNAPSHOT</version>
<version>23.12.0-SNAPSHOT</version>
<dependencies>
<dependency>
<groupId>com.nvidia</groupId>
Expand Down
2 changes: 1 addition & 1 deletion docs/configs.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ The following is the list of options that `rapids-plugin-4-spark` supports.
On startup use: `--conf [conf key]=[conf value]`. For example:

```
${SPARK_HOME}/bin/spark-shell --jars rapids-4-spark_2.12-23.10.0-SNAPSHOT-cuda11.jar \
${SPARK_HOME}/bin/spark-shell --jars rapids-4-spark_2.12-23.12.0-SNAPSHOT-cuda11.jar \
--conf spark.plugins=com.nvidia.spark.SQLPlugin \
--conf spark.rapids.sql.concurrentGpuTasks=2
```
Expand Down
6 changes: 6 additions & 0 deletions docs/dev/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ following topics:
* [How Spark Executes the Physical Plan](#how-spark-executes-the-physical-plan)
* [How the Plugin Works](#how-the-rapids-plugin-works)
* [Plugin Replacement Rules](#plugin-replacement-rules)
* [Working with Data Sources](#working-with-data-sources)
* [Guidelines for Replacing Catalyst Executors and Expressions](#guidelines-for-replacing-catalyst-executors-and-expressions)
* [Setting Up the Class](#setting-up-the-class)
* [Expressions](#expressions)
Expand Down Expand Up @@ -131,6 +132,11 @@ executor, expression, etc.), and applying the rule that matches. See the
There is a separate guide for working with
[Adaptive Query Execution](adaptive-query.md).

### Working with Data Sources

The plugin supports v1 and v2 data sources for file formats such as CSV,
ORC, JSON, and Parquet. See the [data source guide](data-sources.md) for more information.

## Guidelines for Replacing Catalyst Executors and Expressions
Most development work in the plugin involves translating various Catalyst
executor and expression nodes into new nodes that execute on the GPU. This
Expand Down
68 changes: 68 additions & 0 deletions docs/dev/data-sources.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
---
layout: page
title: Working with Spark Data Sources
nav_order: 2
parent: Developer Overview
---

# Working with Spark Data Sources

## Data Source API Versions

Spark has two major versions of its data source APIs, simply known as "v1" and "v2". There is a configuration
property `spark.sql.sources.useV1SourceList` which determines which API version is used when reading from data
sources such as CSV, ORC, and Parquet. The default value for this configuration option (as of Spark 3.4.0)
is `"avro,csv,json,kafka,orc,parquet,text"`, meaning that all of these data sources fall back to v1 by default.
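
As a debugging aid (a sketch, not project guidance), you can force a format onto the v2 code
path by removing it from the v1 list; the value below is the Spark 3.4.0 default list with
`parquet` removed, so Parquet reads appear in the plan as `BatchScanExec` instead of
`FileSourceScanExec`:

```shell
# Sketch: force Parquet onto the v2 DataSource API.
# The list is the Spark 3.4.0 default minus "parquet".
$SPARK_HOME/bin/spark-shell \
  --conf spark.sql.sources.useV1SourceList="avro,csv,json,kafka,orc,text"
```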

When using Spark SQL (including the DataFrame API), the representation of a read in the physical plan will be
different depending on the API version being used, and in the plugin we therefore have different code paths
for tagging and replacing these operations.

## V1 API

In the v1 API, a read from a file-based data source is represented by a `FileSourceScanExec`, which wraps
a `HadoopFsRelation`.

`HadoopFsRelation` is an important component in Apache Spark. It represents a relation based on data stored in the
Hadoop FileSystem. When we talk about the Hadoop FileSystem in this context, it encompasses various distributed
storage systems that are Hadoop-compatible, such as HDFS (Hadoop Distributed FileSystem), Amazon S3, and others.

`HadoopFsRelation` is not tied to a specific file format. Instead, it relies on implementations of the `FileFormat`
interface to read and write data.

This means that various file formats like CSV, Parquet, and ORC can have their implementations of the `FileFormat`
interface, and `HadoopFsRelation` will be able to work with any of them.

When overriding `FileSourceScanExec` in the plugin, there are a number of different places where tagging code can be
placed, depending on the file format. We start in GpuOverrides with a map entry `GpuOverrides.exec[FileSourceScanExec]`,
and then the hierarchical flow is typically as follows, although it may vary between shim versions:

```
FileSourceScanExecMeta.tagPlanForGpu
ScanExecShims.tagGpuFileSourceScanExecSupport
GpuFileSourceScanExec.tagSupport
```

`GpuFileSourceScanExec.tagSupport` will inspect the `FileFormat` and then call into one of the following:

- `GpuReadCSVFileFormat.tagSupport`, which calls `GpuCSVScan.tagSupport`
- `GpuReadOrcFileFormat.tagSupport`, which calls `GpuOrcScan.tagSupport`
- `GpuReadParquetFileFormat.tagSupport`, which calls `GpuParquetScan.tagSupport`

The classes `GpuCSVScan`, `GpuParquetScan`, `GpuOrcScan`, and `GpuJsonScan` are also called
from the v2 API, so this is a good place to put code that is not specific to either API
version. These scan classes also call into `FileFormatChecks.tag`.
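
The dispatch described above can be sketched with stand-in types. The names below mirror the
plugin's classes, but the types themselves are simplified placeholders, not the real Spark or
plugin API:

```java
// Simplified stand-in for the FileFormat dispatch performed by
// GpuFileSourceScanExec.tagSupport; the real code inspects the
// relation's FileFormat and calls the matching GpuRead*FileFormat.
enum FileFormat { CSV, ORC, PARQUET }

class TagSupportDemo {
    // Returns the name of the tagging entry point that would handle
    // this file format.
    static String tagFor(FileFormat format) {
        switch (format) {
            case CSV:     return "GpuCSVScan.tagSupport";
            case ORC:     return "GpuOrcScan.tagSupport";
            case PARQUET: return "GpuParquetScan.tagSupport";
            default:      throw new IllegalArgumentException("unknown format");
        }
    }
}
```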

## V2 API

When using the v2 API, the physical plan will contain a `BatchScanExec`, which wraps a scan that implements
the `org.apache.spark.sql.connector.read.Scan` trait. The scan implementations include `CsvScan`, `ParquetScan`,
and `OrcScan`. The GPU tagging code for these scans is shared with the v1 API, and can be
placed in one of the following methods:

- `GpuCSVScan.tagSupport`
- `GpuOrcScan.tagSupport`
- `GpuParquetScan.tagSupport`

When overriding v2 operators in the plugin, we can override both `BatchScanExec` and the individual scans, such
as `CsvScanExec`.
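
The v2 flow can be sketched the same way, again with placeholder types rather than the real
Spark API: the scan wrapped by `BatchScanExec` determines which GPU tagging method applies,
and an unrecognized scan type falls back to the CPU:

```java
// Stand-in for the v2 dispatch: the Scan wrapped by BatchScanExec
// determines which Gpu*Scan.tagSupport method applies.
interface Scan {}
class CsvScan implements Scan {}
class OrcScan implements Scan {}
class ParquetScan implements Scan {}

class V2TagDemo {
    static String tagFor(Scan scan) {
        if (scan instanceof CsvScan)     return "GpuCSVScan.tagSupport";
        if (scan instanceof OrcScan)     return "GpuOrcScan.tagSupport";
        if (scan instanceof ParquetScan) return "GpuParquetScan.tagSupport";
        return "fallback to CPU";  // no GPU replacement for this scan
    }
}
```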
12 changes: 6 additions & 6 deletions docs/dev/shims.md
Original file line number Diff line number Diff line change
Expand Up @@ -68,17 +68,17 @@ Using JarURLConnection URLs we create a Parallel World of the current version wi
Spark 3.0.2's URLs:

```text
jar:file:/home/spark/rapids-4-spark_2.12-23.10.0.jar!/
jar:file:/home/spark/rapids-4-spark_2.12-23.10.0.jar!/spark3xx-common/
jar:file:/home/spark/rapids-4-spark_2.12-23.10.0.jar!/spark302/
jar:file:/home/spark/rapids-4-spark_2.12-23.12.0.jar!/
jar:file:/home/spark/rapids-4-spark_2.12-23.12.0.jar!/spark3xx-common/
jar:file:/home/spark/rapids-4-spark_2.12-23.12.0.jar!/spark302/
```

Spark 3.2.0's URLs :

```text
jar:file:/home/spark/rapids-4-spark_2.12-23.10.0.jar!/
jar:file:/home/spark/rapids-4-spark_2.12-23.10.0.jar!/spark3xx-common/
jar:file:/home/spark/rapids-4-spark_2.12-23.10.0.jar!/spark320/
jar:file:/home/spark/rapids-4-spark_2.12-23.12.0.jar!/
jar:file:/home/spark/rapids-4-spark_2.12-23.12.0.jar!/spark3xx-common/
jar:file:/home/spark/rapids-4-spark_2.12-23.12.0.jar!/spark320/
```
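
The URL sets above follow a fixed pattern: the jar root, the shared `spark3xx-common/` world,
and the shim-specific world. A minimal sketch of how such a list could be built (the jar path
and shim directory name are illustrative, not the plugin's actual implementation):

```java
import java.util.List;
import java.util.stream.Collectors;

class ParallelWorldUrls {
    // Builds the "parallel world" jar: URLs for one shim: the jar root,
    // the common world, and the shim-specific world, in that order.
    static List<String> urlsFor(String jarPath, String shimDir) {
        return List.of("", "spark3xx-common/", shimDir + "/").stream()
            .map(world -> "jar:file:" + jarPath + "!/" + world)
            .collect(Collectors.toList());
    }
}
```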

### Late Inheritance in Public Classes
Expand Down
6 changes: 3 additions & 3 deletions integration_tests/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -250,7 +250,7 @@ individually, so you don't risk running unit tests along with the integration te
http://www.scalatest.org/user_guide/using_the_scalatest_shell

```shell
spark-shell --jars rapids-4-spark-tests_2.12-23.10.0-SNAPSHOT-tests.jar,rapids-4-spark-integration-tests_2.12-23.10.0-SNAPSHOT-tests.jar,scalatest_2.12-3.0.5.jar,scalactic_2.12-3.0.5.jar
spark-shell --jars rapids-4-spark-tests_2.12-23.12.0-SNAPSHOT-tests.jar,rapids-4-spark-integration-tests_2.12-23.12.0-SNAPSHOT-tests.jar,scalatest_2.12-3.0.5.jar,scalactic_2.12-3.0.5.jar
```

First you import the `scalatest_shell` and tell the tests where they can find the test files you
Expand All @@ -273,7 +273,7 @@ If you just want to verify the SQL replacement is working you will need to add t
assumes CUDA 11.0 is being used.

```
$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-23.10.0-SNAPSHOT-cuda11.jar" ./runtests.py
$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-23.12.0-SNAPSHOT-cuda11.jar" ./runtests.py
```

You don't have to enable the plugin for this to work, the test framework will do that for you.
Expand Down Expand Up @@ -372,7 +372,7 @@ To run cudf_udf tests, need following configuration changes:
As an example, here is the `spark-submit` command with the cudf_udf parameter on CUDA 11.0:

```
$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-23.10.0-SNAPSHOT-cuda11.jar,rapids-4-spark-tests_2.12-23.10.0-SNAPSHOT.jar" --conf spark.rapids.memory.gpu.allocFraction=0.3 --conf spark.rapids.python.memory.gpu.allocFraction=0.3 --conf spark.rapids.python.concurrentPythonWorkers=2 --py-files "rapids-4-spark_2.12-23.10.0-SNAPSHOT-cuda11.jar" --conf spark.executorEnv.PYTHONPATH="rapids-4-spark_2.12-23.10.0-SNAPSHOT-cuda11.jar" ./runtests.py --cudf_udf
$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-23.12.0-SNAPSHOT-cuda11.jar,rapids-4-spark-tests_2.12-23.12.0-SNAPSHOT.jar" --conf spark.rapids.memory.gpu.allocFraction=0.3 --conf spark.rapids.python.memory.gpu.allocFraction=0.3 --conf spark.rapids.python.concurrentPythonWorkers=2 --py-files "rapids-4-spark_2.12-23.12.0-SNAPSHOT-cuda11.jar" --conf spark.executorEnv.PYTHONPATH="rapids-4-spark_2.12-23.12.0-SNAPSHOT-cuda11.jar" ./runtests.py --cudf_udf
```

### Enabling fuzz tests
Expand Down