From 33032cba10c206eebdcb7fb053c168da87bf1a09 Mon Sep 17 00:00:00 2001
From: Simeon Simeonov
Date: Thu, 25 Jun 2020 01:17:20 -0400
Subject: [PATCH] Upgrades to Spark 3.0.0 and Scala 2.12.11

---
 .gitignore                                  |  1 +
 NOTICE                                      |  8 ++++++++
 README.md                                   | 20 +++++++++++++-------
 VERSION                                     |  2 +-
 .../spark/sql/catalyst/plans/PlanTest.scala |  4 ++--
 build.sbt                                   |  6 +++---
 project/build.properties                    |  2 +-
 project/plugins.sbt                         |  6 ++----
 8 files changed, 31 insertions(+), 18 deletions(-)
 create mode 100644 NOTICE

diff --git a/.gitignore b/.gitignore
index 5bdf0fa..581581d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -24,5 +24,6 @@ project/plugins/project/
 metastore_db/
 tmp/
+spark-warehouse/
 
 .DS_Store

diff --git a/NOTICE b/NOTICE
new file mode 100644
index 0000000..8cf6bfe
--- /dev/null
+++ b/NOTICE
@@ -0,0 +1,8 @@
+------------------------------------------------------------------------------------
+This product bundles various third-party components under other open source licenses.
+This section summarizes those components and their licenses.
+
+
+Apache Software Foundation License 2.0
+--------------------------------------
+alchemy/src/test/scala/org/apache/spark/*

diff --git a/README.md b/README.md
index 988b93e..653ca6c 100644
--- a/README.md
+++ b/README.md
@@ -6,8 +6,14 @@ Spark Alchemy is a collection of open-source Spark tools & frameworks that have
 data science teams at [Swoop](https://www.swoop.com) highly productive in our demanding petabyte-scale environment with rich data (thousands of columns).
 
+## Supported languages
+
+While `spark-alchemy`, like Spark itself, is written in Scala, much of its functionality, such as interoperable HyperLogLog functions, can be used from other Spark-supported languages such as SparkSQL and Python ([here is how](https://github.com/swoop-inc/spark-alchemy/issues/9#issuecomment-555155170)).
+
 ## Installation
 
+Versions 0.x target Spark 2.x and Scala 2.11. Versions 1.x target Spark 3.x and Scala 2.12.
+
 Add the following to your `libraryDependencies` in SBT:
 
 ```scala
@@ -20,31 +26,31 @@ You can find all released versions [here](https://github.com/swoop-inc/spark-alc
 
 ## For Spark users
 
-- Native [HyperLogLog functions](../../wiki/Spark-HyperLogLog-Functions) that offer reaggregatable fast approximate distinct counting capabilities far beyond those in OSS Spark with interoperability to Postgres and even JavaScript.
+- Native [HyperLogLog functions](../../wiki/Spark-HyperLogLog-Functions) that offer reaggregatable fast approximate distinct counting capabilities far beyond those in OSS Spark with interoperability to Postgres and even JavaScript. Like Spark's own native functions, once registered with Spark they can be used from SparkSQL, Python, etc.
 
 ## For Spark framework developers
 
 - Helpers for [native function registration](../../wiki/Spark-Native-Functions)
 
-## What's coming
+- Look at [`SparkSessionSpec`](alchemy/src/test/scala/com/swoop/test_utils/SparkSessionSpec.scala) as an example of how to reuse advanced Spark testing functionality from OSS Spark.
+
+## What we hope to open source in the future
 
 - Configuration Addressable Production (CAP), Automatic Lifecycle Management (ALM) and Just-in-time Dependency Resolution (JDR) as outlined in our Spark+AI Summit talk [Unafraid of Change: Optimizing ETL, ML, and AI in Fast-Paced Environments](https://databricks.com/session/unafraid-of-change-optimizing-etl-ml-ai-in-fast-paced-environments).
 
+- Utilities that make [Delta Lake](https://delta.io) development substantially more productive.
+
 - Hundreds of productivity-enhancing extensions to the core user-level data types: `Column`, `Dataset`, `SparkSession`, etc.
 
 - Data discovery and cleansing tools we use to ingest and clean up large amounts of dirty data from third parties.
 
 - Cross-cluster named lock manager, which simplifies data production by removing the need for workflow servers much of the time.
 
-- Versioned data source, which allows a new version to be written while the current version is being read.
-
 - `case class` code generation from Spark schema, with easy implementation customization.
 
 - Tools for deploying Spark ML pipelines to production.
 
-- Lots more, as we are constantly building up our internal toolset.
-
 ## More from Swoop
 
 - [spark-records](https://github.com/swoop-inc/spark-records): bulletproof Spark jobs with fast root cause analysis in the case of failures
@@ -57,4 +63,4 @@ Spark Alchemy is maintained by the team at [Swoop](https://www.swoop.com). If yo
 ## License
 
-`spark-alchemy` is Copyright © 2018 [Swoop, Inc.](https://www.swoop.com) It is free software, and may be redistributed under the terms of the LICENSE.
+`spark-alchemy` is Copyright © 2018-2020 [Swoop, Inc.](https://www.swoop.com) It is free software, and may be redistributed under the terms of the LICENSE.

diff --git a/VERSION b/VERSION
index 978ba2b..f755149 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-0.7.0-SNAPSHOT
+1.0.0-SNAPSHOT

diff --git a/alchemy/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala b/alchemy/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala
index fae2911..89d53e0 100644
--- a/alchemy/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala
+++ b/alchemy/src/test/scala/org/apache/spark/sql/catalyst/plans/PlanTest.scala
@@ -73,11 +73,11 @@ trait PlanTestBase extends PredicateHelper with SQLHelper {
           .reduce(And), child)
       case sample: Sample => sample.copy(seed = 0L)
-      case Join(left, right, joinType, condition) if condition.isDefined =>
+      case Join(left, right, joinType, condition, hint) if condition.isDefined =>
         val newCondition =
           splitConjunctivePredicates(condition.get).map(rewriteEqual).sortBy(_.hashCode())
             .reduce(And)
-        Join(left, right, joinType, Some(newCondition))
+        Join(left, right, joinType, Some(newCondition), hint)
     }
   }

diff --git a/build.sbt b/build.sbt
index 1e4e6f7..5fe4d44 100644
--- a/build.sbt
+++ b/build.sbt
@@ -1,14 +1,14 @@
 ThisBuild / organization := "com.swoop"
 ThisBuild / version := scala.io.Source.fromFile("VERSION").mkString.stripLineEnd
 
-ThisBuild / scalaVersion := "2.11.12"
-ThisBuild / crossScalaVersions := Seq("2.11.12")
+ThisBuild / scalaVersion := "2.12.11"
+ThisBuild / crossScalaVersions := Seq("2.12.11")
 
 ThisBuild / javacOptions ++= Seq("-source", "1.8", "-target", "1.8")
 
 val scalaTest = "org.scalatest" %% "scalatest" % "3.0.8"
 
-val sparkVersion = "2.4.5"
+val sparkVersion = "3.0.0"
 
 // https://bintray.com/swoop-inc/maven
 resolvers += Resolver.bintrayRepo("swoop-inc", "maven")

diff --git a/project/build.properties b/project/build.properties
index 6d44192..654fe70 100644
--- a/project/build.properties
+++ b/project/build.properties
@@ -1 +1 @@
-sbt.version=1.2.4
+sbt.version=1.3.12

diff --git a/project/plugins.sbt b/project/plugins.sbt
index b1a4123..1b44ca6 100644
--- a/project/plugins.sbt
+++ b/project/plugins.sbt
@@ -1,8 +1,6 @@
-import sbt.Resolver
+scalaVersion := "2.12.11"
 
-scalaVersion := "2.12.5"
-
-addSbtPlugin("com.47deg" % "sbt-microsites" % "0.7.27")
+addSbtPlugin("com.47deg" % "sbt-microsites" % "0.7.27")
 addSbtPlugin("com.eed3si9n" % "sbt-buildinfo" % "0.9.0")
 addSbtPlugin("com.typesafe.sbt" % "sbt-ghpages" % "0.6.2")
 addSbtPlugin("com.typesafe.sbt" % "sbt-git" % "1.0.0")