diff --git a/.travis.yml b/.travis.yml
index a776739e..96d98f7e 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -29,4 +29,4 @@ before_install:
 # Build
 install:
-- build/gimel -Dmaven.test.skip=true -pl gimel-dataapi/gimel-tools -am | egrep -v "Download|Copy|Including|WARNING"
+- sh -x build/gimel -Dmaven.test.skip=true -pl gimel-dataapi | egrep -iv "Download|Copy|Including|WARNING"
diff --git a/build/gimel b/build/gimel
index 0d4bf254..2b9ddfef 100755
--- a/build/gimel
+++ b/build/gimel
@@ -15,9 +15,81 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+echo "Present Working Dir"
+echo $PWD
+
 this_dir=`dirname build/install_dependencies`
 
-# Source reusable functions
-source ${this_dir}/gimel_functions
+
+#----------------------------function will check for error code & exit if failure, else proceed further----------------------------#
+
+#usage : check_error <$?>
+#Example: Check_error < pass $? from the shell command > < Custom Message for errorcode -gt 0 >
+
+check_error()
+{
+  cmd_error_code=$1
+  custom_message=$2
+  if [ ${cmd_error_code} -gt 0 ]; then
+    write_log "Error | Stage | ${custom_message}"
+    exit ${cmd_error_code}
+  else
+    write_log "Success | Stage | ${custom_message}"
+  fi
+}
+
+#----------------------------function will check for error code & warn if failure----------------------------#
+
+#usage : check_warning <$?>
+#Example: Check_warning < pass $? from the shell command > < Custom Message for errorcode -gt 0 >
+
+
+check_warning()
+{
+
+  cmd_error_code=$1
+  pgm_exit_code=$2
+  pgm_exit_msg=$3
+  if [ ${cmd_error_code} -gt 0 ]; then
+    write_log "WARNING ! ${cmd_error_code} ${pgm_exit_code} ${pgm_exit_msg}"
+  else
+    echo ""
+  fi
+}
+
+
+
+#----------------------------function will write the message to Console / Log File----------------------------#
+
+#Usage : write_log < Whatever message you need to log >
+
+write_log()
+{
+  msg=$1
+  to_be_logged="$(date '+%Y%m%d %H:%M:%S') | $msg"
+  echo ${to_be_logged}
+}
+
+#-----------------------------------Executes a Command--------------------------------------------------------#
+
+
+
+#Usage : run_cmd < The command to execute >
+
+run_cmd()
+{
+  cmd=$1
+  if [ -z $2 ]; then
+    fail_on_error="break_code"
+  else
+    fail_on_error=$2
+  fi
+  write_log "Executing Command --> $1"
+  $cmd
+  error_code=$?
+  if [ ! $fail_on_error = "ignore_errors" ]; then
+    check_error $error_code "$cmd"
+  fi
+}
 
 #--------------------------------Begin execution of Steps------------------------------------------------#
 
@@ -34,9 +106,29 @@ else
 fi
 
-build/install_dependencies
-check_error $? "build/install_dependencies"
+write_log "Installing dependencies [sh -x build/install_dependencies]"
+
+# All jars below are not present in Maven Central or any other public repository; that's why they are added manually while building gimel.
+
+mvn install:install-file -DgroupId=qubole-hive-JDBC -DartifactId=qubole-hive-JDBC -Dversion=0.0.7 -Dpackaging=jar -Dfile=${this_dir}/../lib/qubole-hive-JDBC.jar 1>>/dev/null 2>&1
+check_error $? "install qubole-hive-JDBC"
+
+mvn install:install-file -DgroupId=com.hortonworks -DartifactId=shc-core -Dversion=1.1.2-2.3-s_2.11 -Dpackaging=jar -Dfile=${this_dir}/../lib/shc-core.jar 1>>/dev/null 2>&1
+check_error $? "install shc-core"
+
+mvn install:install-file -DgroupId=com.osscube -DartifactId=aerospike-spark -Dversion=0.3-SNAPSHOT -Dpackaging=jar -Dfile=${this_dir}/../lib/aerospike-spark.jar 1>>/dev/null 2>&1
+check_error $? "install aerospike-spark"
+
+mvn install:install-file -DgroupId=com.twitter -DartifactId=zookeeper-client_2.10 -Dversion=2.0.0_fs-b -Dpackaging=jar -Dfile=${this_dir}/../lib/zookeeper-client_2.10-2.0.0_fs-b.jar 1>>/dev/null 2>&1
+check_error $? "install zookeeper-client"
+
+mvn install:install-file -DgroupId=com.teradata.jdbc -DartifactId=terajdbc4 -Dversion=15.10.00.22 -Dpackaging=jar -Dfile=${this_dir}/../lib/terajdbc4-15.10.00.22.jar 1>>/dev/null 2>&1
+check_error $? "install terajdbc4"
+
+mvn install:install-file -DgroupId=com.teradata.jdbc -DartifactId=tdgssconfig -Dversion=15.10.00.22 -Dpackaging=jar -Dfile=${this_dir}/../lib/tdgssconfig-15.10.00.22.jar 1>>/dev/null 2>&1
+check_error $? "install tdgssconfig"
+write_log "Building the project [mvn install ${user_args}]"
 mvn install "$@"
 check_error $? "mvn install $@"
diff --git a/build/install_dependencies b/build/install_dependencies
index 90159c1d..5f7c185a 100755
--- a/build/install_dependencies
+++ b/build/install_dependencies
@@ -18,8 +18,77 @@
 this_script=`pwd`/$BASH_SOURCE
 this_dir=`dirname $this_script`
 
-# Source reusable functions
-source ${this_dir}/gimel_functions
+
+#----------------------------function will check for error code & exit if failure, else proceed further----------------------------#
+
+#usage : check_error <$?>
+#Example: Check_error < pass $? from the shell command > < Custom Message for errorcode -gt 0 >
+
+check_error()
+{
+  cmd_error_code=$1
+  custom_message=$2
+  if [ ${cmd_error_code} -gt 0 ]; then
+    write_log "Error | Stage | ${custom_message}"
+    exit ${cmd_error_code}
+  else
+    write_log "Success | Stage | ${custom_message}"
+  fi
+}
+
+#----------------------------function will check for error code & warn if failure----------------------------#
+
+#usage : check_warning <$?>
+#Example: Check_warning < pass $? from the shell command > < Custom Message for errorcode -gt 0 >
+
+
+check_warning()
+{
+
+  cmd_error_code=$1
+  pgm_exit_code=$2
+  pgm_exit_msg=$3
+  if [ ${cmd_error_code} -gt 0 ]; then
+    write_log "WARNING ! ${cmd_error_code} ${pgm_exit_code} ${pgm_exit_msg}"
+  else
+    echo ""
+  fi
+}
+
+
+
+#----------------------------function will write the message to Console / Log File----------------------------#
+
+#Usage : write_log < Whatever message you need to log >
+
+write_log()
+{
+  msg=$1
+  to_be_logged="$(date '+%Y%m%d %H:%M:%S') | $msg"
+  echo ${to_be_logged}
+}
+
+#-----------------------------------Executes a Command--------------------------------------------------------#
+
+
+
+#Usage : run_cmd < The command to execute >
+
+run_cmd()
+{
+  cmd=$1
+  if [ -z $2 ]; then
+    fail_on_error="break_code"
+  else
+    fail_on_error=$2
+  fi
+  write_log "Executing Command --> $1"
+  $cmd
+  error_code=$?
+  if [ !
$fail_on_error = "ignore_errors" ]; then + check_error $error_code "$cmd" + fi +} #--------------------------------Begin execution of Steps------------------------------------------------# diff --git a/docs/getting-started/gimel-modules.md b/docs/getting-started/gimel-modules.md index 384c008f..ff8dab50 100644 --- a/docs/getting-started/gimel-modules.md +++ b/docs/getting-started/gimel-modules.md @@ -14,7 +14,7 @@ com.paypal.gimel gimel-tools - 2.0.0-SNAPSHOT + 2.4.7-SNAPSHOT provided ``` @@ -23,7 +23,7 @@ com.paypal.gimel gimel-sql - 2.0.0-SNAPSHOT + 2.4.7-SNAPSHOT provided ``` @@ -32,7 +32,7 @@ com.paypal.gimel gimel-core - 2.0.0-SNAPSHOT + 2.4.7-SNAPSHOT provided ``` diff --git a/docs/gimel-connectors/kafka2.md b/docs/gimel-connectors/kafka2.md index 7c9572b6..d2371f90 100644 --- a/docs/gimel-connectors/kafka2.md +++ b/docs/gimel-connectors/kafka2.md @@ -117,7 +117,7 @@ ```bash spark-shell --jars -gimel-tools-2.0.0-SNAPSHOT-uber.jar, +gimel-tools-2.4.7-SNAPSHOT-uber.jar, generic-deserializers-1.0-SNAPSHOT-uber.jar, generic-serializers-1.0-SNAPSHOT-uber.jar @@ -510,7 +510,7 @@ Users can implement their own logic for getting the kafka properties which will com.paypal.gimel gimel-common - 2.0.0-SNAPSHOT + 2.4.7-SNAPSHOT provided ``` diff --git a/docs/gimel-logging/gimel-logging.md b/docs/gimel-logging/gimel-logging.md index 679b5091..442e1fae 100644 --- a/docs/gimel-logging/gimel-logging.md +++ b/docs/gimel-logging/gimel-logging.md @@ -75,7 +75,7 @@ object SparkWordCount { Example: ```shell script -spark-shell --jars gimel-tools-2.0.0-SNAPSHOT-uber.jar \ +spark-shell --jars gimel-tools-2.4.7-SNAPSHOT-uber.jar \ --conf spark.driver.extraJavaOptions="-Dgimel.logger.properties.filepath=gimelLoggerConfig.properties" \ --conf spark.executor.extraJavaOptions="-Dgimel.logger.properties.filepath=gimelLoggerConfig.properties" \ --conf spark.files=/path/to/gimelLoggerConfig.properties \ diff --git a/docs/gimel-serde/gimel-serde.md b/docs/gimel-serde/gimel-serde.md index 14c36bd5..5d4de174 100644 --- a/docs/gimel-serde/gimel-serde.md +++ b/docs/gimel-serde/gimel-serde.md @@ -57,7 +57,7 @@ Example: spark-shell --jars \ generic-deserializers-1.0-SNAPSHOT-uber.jar,\ generic-serializers-1.0-SNAPSHOT-uber.jar,\ -gimel-tools-2.0.0-SNAPSHOT-uber.jar +gimel-tools-2.4.7-SNAPSHOT-uber.jar ``` ## Generic Deserializers diff --git a/docs/try-gimel/0-prerequisite.md b/docs/try-gimel/0-prerequisite.md index 61a3be11..8b2401e8 100644 --- a/docs/try-gimel/0-prerequisite.md +++ b/docs/try-gimel/0-prerequisite.md @@ -89,7 +89,7 @@ quickstart/start-gimel kafka,elasticsearch,hbase-master,hbase-regionserver ``` docker exec -it spark-master bash -c \ "export USER=an;export SPARK_HOME=/spark/;export PATH=$PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin; \ -/spark/bin/spark-shell --jars /root/gimel-sql-2.0.0-SNAPSHOT-uber.jar" +/spark/bin/spark-shell --jars /root/gimel-sql-2.4.7-SNAPSHOT-uber.jar" ``` **Note:** *You can view the Spark UI here* diff --git a/gimel-dataapi/gimel-common/pom.xml b/gimel-dataapi/gimel-common/pom.xml index dfb658e6..b0c3a867 100644 --- a/gimel-dataapi/gimel-common/pom.xml +++ b/gimel-dataapi/gimel-common/pom.xml @@ -23,13 +23,13 @@ under the License. gimel-dataapi com.paypal.gimel - 2.0.0-SNAPSHOT + 2.4.7-SNAPSHOT ../pom.xml 4.0.0 gimel-common - 2.0.0-SNAPSHOT + 2.4.7-SNAPSHOT @@ -55,9 +55,9 @@ under the License. 
${packaging.scope} - org.scala-lang - scala-xml - 2.11.0-M4 + org.scala-lang.modules + scala-xml_${scala.binary.version} + ${scala.xml.version} ${scala.packaging.scope} @@ -82,12 +82,6 @@ under the License. com.paypal.gimel gimel-logger ${gimel.version}-SNAPSHOT - - - org.apache.kafka - kafka-clients - - org.apache.kafka @@ -97,7 +91,7 @@ under the License. com.paypal.gimel serde-common - 1.0-SNAPSHOT + 2.4.7-SNAPSHOT ${packaging.scope} @@ -106,12 +100,6 @@ under the License. ${kafka.version} ${packaging.scope} - - com.databricks - spark-avro_${scala.binary.version} - 4.0.0 - ${packaging.scope} - org.apache.curator curator-framework @@ -162,7 +150,7 @@ under the License. org.apache.spark - spark-streaming-kafka-${kafka.binary.version}_${scala.binary.version} + spark-streaming-kafka-${spark.kafka.connector.version} ${spark.version} ${spark.packaging.scope} @@ -197,7 +185,7 @@ under the License. ${confluent.version} test - + io.netty netty @@ -211,7 +199,7 @@ under the License. test - net.jpountz.lz4 + net.jpountz.lz4 lz4 1.3.0 test @@ -220,16 +208,19 @@ under the License. com.fasterxml.jackson.core jackson-core ${jackson.version} + compile com.fasterxml.jackson.core jackson-annotations ${jackson.version} + compile com.fasterxml.jackson.core jackson-databind ${jackson.version} + compile @@ -277,7 +268,7 @@ under the License. org.apache.maven.plugins maven-shade-plugin - 3.2.1 + ${maven.shade.plugin.version} diff --git a/gimel-dataapi/gimel-connectors/gimel-aerospike-3.14/pom.xml b/gimel-dataapi/gimel-connectors/gimel-aerospike/pom.xml similarity index 95% rename from gimel-dataapi/gimel-connectors/gimel-aerospike-3.14/pom.xml rename to gimel-dataapi/gimel-connectors/gimel-aerospike/pom.xml index 5b781719..bd3e9355 100644 --- a/gimel-dataapi/gimel-connectors/gimel-aerospike-3.14/pom.xml +++ b/gimel-dataapi/gimel-connectors/gimel-aerospike/pom.xml @@ -23,13 +23,13 @@ under the License. gimel-dataapi com.paypal.gimel - 2.0.0-SNAPSHOT + 2.4.7-SNAPSHOT ../../pom.xml 4.0.0 - gimel-aerospike-3.14 - 2.0.0-SNAPSHOT + gimel-aerospike + 2.4.7-SNAPSHOT @@ -58,7 +58,7 @@ under the License. 
org.apache.maven.plugins maven-shade-plugin - 3.0.0 + ${maven.shade.plugin.version} diff --git a/gimel-dataapi/gimel-connectors/gimel-aerospike-3.14/src/main/scala/com/paypal/gimel/aerospike/DataSet.scala b/gimel-dataapi/gimel-connectors/gimel-aerospike/src/main/scala/com/paypal/gimel/aerospike/DataSet.scala similarity index 100% rename from gimel-dataapi/gimel-connectors/gimel-aerospike-3.14/src/main/scala/com/paypal/gimel/aerospike/DataSet.scala rename to gimel-dataapi/gimel-connectors/gimel-aerospike/src/main/scala/com/paypal/gimel/aerospike/DataSet.scala diff --git a/gimel-dataapi/gimel-connectors/gimel-aerospike-3.14/src/main/scala/com/paypal/gimel/aerospike/conf/AerospikeClientConfiguration.scala b/gimel-dataapi/gimel-connectors/gimel-aerospike/src/main/scala/com/paypal/gimel/aerospike/conf/AerospikeClientConfiguration.scala similarity index 100% rename from gimel-dataapi/gimel-connectors/gimel-aerospike-3.14/src/main/scala/com/paypal/gimel/aerospike/conf/AerospikeClientConfiguration.scala rename to gimel-dataapi/gimel-connectors/gimel-aerospike/src/main/scala/com/paypal/gimel/aerospike/conf/AerospikeClientConfiguration.scala diff --git a/gimel-dataapi/gimel-connectors/gimel-aerospike-3.14/src/main/scala/com/paypal/gimel/aerospike/conf/AerospikeConfigs.scala b/gimel-dataapi/gimel-connectors/gimel-aerospike/src/main/scala/com/paypal/gimel/aerospike/conf/AerospikeConfigs.scala similarity index 100% rename from gimel-dataapi/gimel-connectors/gimel-aerospike-3.14/src/main/scala/com/paypal/gimel/aerospike/conf/AerospikeConfigs.scala rename to gimel-dataapi/gimel-connectors/gimel-aerospike/src/main/scala/com/paypal/gimel/aerospike/conf/AerospikeConfigs.scala diff --git a/gimel-dataapi/gimel-connectors/gimel-aerospike-3.14/src/main/scala/com/paypal/gimel/aerospike/reader/AerospikeReader.scala b/gimel-dataapi/gimel-connectors/gimel-aerospike/src/main/scala/com/paypal/gimel/aerospike/reader/AerospikeReader.scala similarity index 100% rename from gimel-dataapi/gimel-connectors/gimel-aerospike-3.14/src/main/scala/com/paypal/gimel/aerospike/reader/AerospikeReader.scala rename to gimel-dataapi/gimel-connectors/gimel-aerospike/src/main/scala/com/paypal/gimel/aerospike/reader/AerospikeReader.scala diff --git a/gimel-dataapi/gimel-connectors/gimel-aerospike-3.14/src/main/scala/com/paypal/gimel/aerospike/utilities/AeroSparkConnector.scala b/gimel-dataapi/gimel-connectors/gimel-aerospike/src/main/scala/com/paypal/gimel/aerospike/utilities/AeroSparkConnector.scala similarity index 100% rename from gimel-dataapi/gimel-connectors/gimel-aerospike-3.14/src/main/scala/com/paypal/gimel/aerospike/utilities/AeroSparkConnector.scala rename to gimel-dataapi/gimel-connectors/gimel-aerospike/src/main/scala/com/paypal/gimel/aerospike/utilities/AeroSparkConnector.scala diff --git a/gimel-dataapi/gimel-connectors/gimel-aerospike-3.14/src/main/scala/com/paypal/gimel/aerospike/utilities/AerospikeUtilities.scala b/gimel-dataapi/gimel-connectors/gimel-aerospike/src/main/scala/com/paypal/gimel/aerospike/utilities/AerospikeUtilities.scala similarity index 98% rename from gimel-dataapi/gimel-connectors/gimel-aerospike-3.14/src/main/scala/com/paypal/gimel/aerospike/utilities/AerospikeUtilities.scala rename to gimel-dataapi/gimel-connectors/gimel-aerospike/src/main/scala/com/paypal/gimel/aerospike/utilities/AerospikeUtilities.scala index b34d1dff..ace5118f 100644 --- a/gimel-dataapi/gimel-connectors/gimel-aerospike-3.14/src/main/scala/com/paypal/gimel/aerospike/utilities/AerospikeUtilities.scala +++ 
b/gimel-dataapi/gimel-connectors/gimel-aerospike/src/main/scala/com/paypal/gimel/aerospike/utilities/AerospikeUtilities.scala @@ -57,7 +57,7 @@ object AerospikeUtilities { val aerospikeSet = conf.aerospikeSet val aerospikeRowKey = conf.aerospikeRowKey // For each partition of Dataframe, aerospike client is created and is used to write data to aerospike - dataFrame.foreachPartition { partition => + dataFrame.foreachPartition { partition: Iterator[Row] => val client = AerospikeAdminClient.createClientConnection(aerospikeHosts, aerospikePort.toInt) partition.foreach { row => val bins = columns.map(eachCol => new Bin(eachCol.toString, row.getAs(eachCol).toString)).toArray diff --git a/gimel-dataapi/gimel-connectors/gimel-aerospike-3.14/src/main/scala/com/paypal/gimel/aerospike/writer/AerospikeWriter.scala b/gimel-dataapi/gimel-connectors/gimel-aerospike/src/main/scala/com/paypal/gimel/aerospike/writer/AerospikeWriter.scala similarity index 100% rename from gimel-dataapi/gimel-connectors/gimel-aerospike-3.14/src/main/scala/com/paypal/gimel/aerospike/writer/AerospikeWriter.scala rename to gimel-dataapi/gimel-connectors/gimel-aerospike/src/main/scala/com/paypal/gimel/aerospike/writer/AerospikeWriter.scala diff --git a/gimel-dataapi/gimel-connectors/gimel-cassandra-2.0/pom.xml b/gimel-dataapi/gimel-connectors/gimel-cassandra/pom.xml similarity index 89% rename from gimel-dataapi/gimel-connectors/gimel-cassandra-2.0/pom.xml rename to gimel-dataapi/gimel-connectors/gimel-cassandra/pom.xml index 92cc4f81..ca83e106 100644 --- a/gimel-dataapi/gimel-connectors/gimel-cassandra-2.0/pom.xml +++ b/gimel-dataapi/gimel-connectors/gimel-cassandra/pom.xml @@ -23,13 +23,13 @@ under the License. gimel-dataapi com.paypal.gimel - 2.0.0-SNAPSHOT + 2.4.7-SNAPSHOT ../../pom.xml 4.0.0 - gimel-cassandra-2.0 - 2.0.0-SNAPSHOT + gimel-cassandra + 2.4.7-SNAPSHOT @@ -42,11 +42,11 @@ under the License. spark-cassandra-connector_${scala.binary.version} ${cassandra.spark.version} - - com.datastax.spark - spark-cassandra-connector-java_${scala.binary.version} - 1.5.2 - + + + + + org.scalatest scalatest_${scala.binary.version} @@ -62,7 +62,7 @@ under the License. 
org.apache.maven.plugins maven-shade-plugin - 3.0.0 + ${maven.shade.plugin.version} diff --git a/gimel-dataapi/gimel-connectors/gimel-cassandra-2.0/src/main/scala/com/paypal/gimel/cassandra/DataSet.scala b/gimel-dataapi/gimel-connectors/gimel-cassandra/src/main/scala/com/paypal/gimel/cassandra/DataSet.scala similarity index 100% rename from gimel-dataapi/gimel-connectors/gimel-cassandra-2.0/src/main/scala/com/paypal/gimel/cassandra/DataSet.scala rename to gimel-dataapi/gimel-connectors/gimel-cassandra/src/main/scala/com/paypal/gimel/cassandra/DataSet.scala diff --git a/gimel-dataapi/gimel-connectors/gimel-cassandra-2.0/src/main/scala/com/paypal/gimel/cassandra/conf/CassandraClientConfiguration.scala b/gimel-dataapi/gimel-connectors/gimel-cassandra/src/main/scala/com/paypal/gimel/cassandra/conf/CassandraClientConfiguration.scala similarity index 100% rename from gimel-dataapi/gimel-connectors/gimel-cassandra-2.0/src/main/scala/com/paypal/gimel/cassandra/conf/CassandraClientConfiguration.scala rename to gimel-dataapi/gimel-connectors/gimel-cassandra/src/main/scala/com/paypal/gimel/cassandra/conf/CassandraClientConfiguration.scala diff --git a/gimel-dataapi/gimel-connectors/gimel-cassandra-2.0/src/main/scala/com/paypal/gimel/cassandra/conf/CassandraConfigs.scala b/gimel-dataapi/gimel-connectors/gimel-cassandra/src/main/scala/com/paypal/gimel/cassandra/conf/CassandraConfigs.scala similarity index 100% rename from gimel-dataapi/gimel-connectors/gimel-cassandra-2.0/src/main/scala/com/paypal/gimel/cassandra/conf/CassandraConfigs.scala rename to gimel-dataapi/gimel-connectors/gimel-cassandra/src/main/scala/com/paypal/gimel/cassandra/conf/CassandraConfigs.scala diff --git a/gimel-dataapi/gimel-connectors/gimel-cassandra-2.0/src/main/scala/com/paypal/gimel/cassandra/reader/CassandraReader.scala b/gimel-dataapi/gimel-connectors/gimel-cassandra/src/main/scala/com/paypal/gimel/cassandra/reader/CassandraReader.scala similarity index 100% rename from gimel-dataapi/gimel-connectors/gimel-cassandra-2.0/src/main/scala/com/paypal/gimel/cassandra/reader/CassandraReader.scala rename to gimel-dataapi/gimel-connectors/gimel-cassandra/src/main/scala/com/paypal/gimel/cassandra/reader/CassandraReader.scala diff --git a/gimel-dataapi/gimel-connectors/gimel-cassandra-2.0/src/main/scala/com/paypal/gimel/cassandra/writer/CassandraWriter.scala b/gimel-dataapi/gimel-connectors/gimel-cassandra/src/main/scala/com/paypal/gimel/cassandra/writer/CassandraWriter.scala similarity index 100% rename from gimel-dataapi/gimel-connectors/gimel-cassandra-2.0/src/main/scala/com/paypal/gimel/cassandra/writer/CassandraWriter.scala rename to gimel-dataapi/gimel-connectors/gimel-cassandra/src/main/scala/com/paypal/gimel/cassandra/writer/CassandraWriter.scala diff --git a/gimel-dataapi/gimel-connectors/gimel-druid-0.82/pom.xml b/gimel-dataapi/gimel-connectors/gimel-druid-0.82/pom.xml deleted file mode 100644 index e6e60b26..00000000 --- a/gimel-dataapi/gimel-connectors/gimel-druid-0.82/pom.xml +++ /dev/null @@ -1,171 +0,0 @@ - - - - - - - gimel-dataapi - com.paypal.gimel - 2.0.0-SNAPSHOT - ../../pom.xml - - - 4.0.0 - gimel-druid-0.82 - 2.0.0-SNAPSHOT - - - - com.paypal.gimel - gimel-common - ${gimel.version}-SNAPSHOT - - - - io.druid - tranquility-core_${scala.binary.version} - ${tranquility.version} - - - com.fasterxml.jackson.core - jackson-databind - - - com.fasterxml.jackson.core - jackson-core - - - org.apache.derby - derbyclient - - - - - - - - com.fasterxml.jackson.core - jackson-annotations - ${fasterxml.jackson.core.version} - - - 
io.druid - tranquility-spark_${scala.binary.version} - ${tranquility.version} - - - org.scalatest - scalatest_${scala.binary.version} - ${scalatest.version} - test - - - - - src/main/scala - - - - net.alchim31.maven - scala-maven-plugin - 3.2.1 - - - - compile - testCompile - - - - - - -Xms64m - -Xmx1024m - - - - - org.scalatest - scalatest-maven-plugin - 1.0 - - ${project.build.directory}/surefire-reports - . - WDF TestSuite.txt - - - - test - - test - - - - - - - org.apache.maven.plugins - maven-shade-plugin - 3.0.0 - - - - scala.tools - gimel-shaded.scala.tools - - - com.google.common - gimel-shaded.com.google.common - - - com.sun.jersey - gimel-shaded.com.sun.jersey - - - - org.apache.hadoop - gimel-shaded.org.apache.hadoop - - - - : - *:* - - META-INF/*.SF - META-INF/*.DSA - META-INF/*.RSA - - - - - - - gimel-shading - package - - shade - - - - - - - - diff --git a/gimel-dataapi/gimel-connectors/gimel-druid-0.82/src/main/scala/com/paypal/gimel/druid/DataSet.scala b/gimel-dataapi/gimel-connectors/gimel-druid-0.82/src/main/scala/com/paypal/gimel/druid/DataSet.scala deleted file mode 100644 index aeee42e2..00000000 --- a/gimel-dataapi/gimel-connectors/gimel-druid-0.82/src/main/scala/com/paypal/gimel/druid/DataSet.scala +++ /dev/null @@ -1,242 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.paypal.gimel.druid - -import scala.language.implicitConversions -import scala.reflect.runtime.universe._ - -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{DataFrame, SparkSession} - -import com.paypal.gimel.datasetfactory.GimelDataSet -import com.paypal.gimel.druid.conf.{DruidClientConfiguration, DruidConfigs, DruidConstants} -import com.paypal.gimel.druid.util.DruidUtility -import com.paypal.gimel.druid.writer.DruidRealtimeWriter -import com.paypal.gimel.logger.Logger - -/** - * Concrete Implementation for Druid DataSet. - * - * @param sparkSession : SparkSession - */ - -class DataSet(sparkSession: SparkSession) extends GimelDataSet(sparkSession: SparkSession) { - - // GET LOGGER - val logger = Logger() - logger.info(s"Initiated --> ${this.getClass.getName}") - - /** - * Read Implementation for Casandra DataSet. - * - * @param dataset Name of the UDC Data Set. - * @param datasetProps Additional parameters for read and write operations in DataSet class. - * @return DataFrame - */ - override def read(dataset: String, datasetProps: Map[String, Any]): DataFrame = { - throw new Exception("Read for Druid Dataset is not enabled.") - } - - /** Write Implementation for Druid DataSet. - * - * @param dataset Name of the UDC Data Set. - * @param dataFrame The DataFrame to write to target. - * @param datasetProps Additional parameters for read and write operations in DataSet class. 
- * @return DataFrame - */ - - override def write(dataset: String, dataFrame: DataFrame, - datasetProps: Map[String, Any]): DataFrame = { - logger.info(s"Druid Dataset Write Initialized for ---> $dataset.") - logger.info(s"Scala Version Used ---> ${scala.util.Properties.versionString}") - - if (datasetProps.isEmpty) { - throw new DataSetException("Props Map Cannot be emtpy for DruidDataSet Write.") - } - - val allProps = datasetProps ++ - Map(DruidConfigs.FIELDS -> DruidUtility.getFieldNames(dataFrame)) - - logger.info(s"Begin Building DruidClientConfiguration") - logger.debug(s"Incoming Properties --> ${ - allProps.map(x => s"${x._1} -> ${x._2}") - .mkString("\n") - }") - - val conf = new DruidClientConfiguration(allProps) - - logger.debug(s"DruidClientConfiguration --> $conf") - logger.info(s"DruidClientConfiguration Building done --> " + - s"${conf.getClass.getName}") - - // Get load type from DruidClientConfiguration. - // i.e Real-time or Batch and then runs appropriate driver. - // Defaults to Real-time Driver. - conf.druidLoadType match { - case DruidConstants.REALTIME_LOAD => - DruidRealtimeWriter.writeToTable(sparkSession, conf, dataFrame) - - case DruidConstants.BATCH_LOAD => - val errorMsg = "Batch Load type for druid-connector has not been implemented." - throw new IllegalArgumentException(errorMsg) - - case _ => - DruidRealtimeWriter.writeToTable(sparkSession, conf, dataFrame) - } - - dataFrame - } - - // Add Additional Supported types to this list as and when we support other Types of RDD - // Example to support RDD[String], add to List - override val supportedTypesOfRDD: List[String] = List(typeOf[Map[String, Any]].toString) - - /** - * Writes a given dataframe to the actual target System. - * (Example Hive : DB.Table | HBASE namespace.Table) - * - * The inheriting DataSet Operators must typeCast the RDD to supported types. - * - * - *
- * instance#1:
- * ElasticSearchDataSet may support just RDD[Seq(Map[String, String])],
- * so Elastic Search must implement supported Type checking
- *
- * instance#2: Kafka, HDFS, HBASE throw Unsupported Operation Exception.
- * The exception should clearly educate users—Until they support an RDD operation for Any Type T.
- *
- * Additional parameters for read and write operations in DataSet class
- * Example: to write kafka with a specific parallelism:
- * {{{
- * val props = Map("parallelsPerPartition" -> 10)
- * Dataset(sc).write(clientDataFrame, props)
- * }}}
- *
- * @param dataset Name of the UDC Data Set.
- * @param rdd The RDD[T] to write into Target.
- * @param datasetProps Map containing dataset props
- * @return RDD[T]
- */
- def write[T: TypeTag](dataset: String, rdd: RDD[T], datasetProps: Map[String, Any]): RDD[T] = {
-   logger.info(s"Druid Dataset Write Initialized for ---> $dataset.")
-   logger.info(s"Scala Version Used ---> ${scala.util.Properties.versionString}")
-
-   if (!supportedTypesOfRDD.contains(typeOf[T].toString)) {
-     throw new UnsupportedOperationException(
-       s"""Invalid RDD Type. Supported Types :
-          |${supportedTypesOfRDD.mkString(" | ")}""".stripMargin)
-   }
-
-   if (datasetProps.isEmpty) {
-     throw new DataSetException("Props Map Cannot be emtpy for DruidDataSet Write.")
-   }
-
-   val allProps = datasetProps ++
-     Map(DruidConfigs.FIELDS -> DruidUtility.getFieldNames(dataset, sparkSession))
-
-   logger.info(s"Begin Building DruidClientConfiguration")
-   logger.debug(s"Incoming Properties --> ${
-     allProps.map(x => s"${x._1} -> ${x._2}")
-       .mkString("\n")
-   }")
-
-   val conf = new DruidClientConfiguration(allProps)
-
-   logger.debug(s"DruidClientConfiguration --> $conf")
-   logger.info(s"DruidClientConfiguration Building done --> " +
-     s"${conf.getClass.getName}")
-
-   // Get load type from DruidClientConfiguration.
-   // i.e Real-time or Batch and then runs appropriate driver.
-   // Defaults to Real-time Driver.
-   conf.druidLoadType match {
-     case DruidConstants.REALTIME_LOAD =>
-       DruidRealtimeWriter.writeToTable(sparkSession, conf, rdd.asInstanceOf[RDD[Map[String, Any]]])
-
-     case DruidConstants.BATCH_LOAD =>
-       val errorMsg = "Batch Load type for druid-connector has not been implemented."
- throw new IllegalArgumentException(errorMsg) - - case _ => - DruidRealtimeWriter.writeToTable(sparkSession, conf, rdd.asInstanceOf[RDD[Map[String, Any]]]) - } - - rdd - } - - /** - * - * @param dataset Name of the UDC Data Set - * @param dataSetProps - * * @return Boolean - */ - override def create(dataset: String, dataSetProps: Map[String, Any]): Unit = { - throw new Exception(s"DataSet create for druid currently not Supported") - } - - /** - * - * @param dataset Name of the UDC Data Set - * @param dataSetProps - * * @return Boolean - */ - override def drop(dataset: String, dataSetProps: Map[String, Any]): Unit = { - throw new Exception(s"DataSet drop for druid currently not Supported") - } - - /** - * - * @param dataset Name of the UDC Data Set - * @param dataSetProps - * * @return Boolean - */ - override def truncate(dataset: String, dataSetProps: Map[String, Any]): Unit = { - throw new Exception(s"DataSet truncate for druid currently not Supported") - } - - /** - * Save Checkpoint - */ - override def clearCheckPoint(): Unit = { - logger.info(s"Clear check Point functionality is not available for Druid Dataset") - } - - /** - * Clear Checkpoint - */ - override def saveCheckPoint(): Unit = { - logger.info(s"Save check Point functionality is not available for Druid Dataset") - } -} - -/** - * Custom Exception for DruidDataset initiation errors - * - * @param message Message to Throw - * @param cause A Throwable Cause - */ -private class DataSetException(message: String, cause: Throwable) - extends RuntimeException(message) { - if (cause != null) { - initCause(cause) - } - - def this(message: String) = this(message, null) -} diff --git a/gimel-dataapi/gimel-connectors/gimel-druid-0.82/src/main/scala/com/paypal/gimel/druid/conf/DruidClientConfiguration.scala b/gimel-dataapi/gimel-connectors/gimel-druid-0.82/src/main/scala/com/paypal/gimel/druid/conf/DruidClientConfiguration.scala deleted file mode 100644 index 130be1e6..00000000 --- a/gimel-dataapi/gimel-connectors/gimel-druid-0.82/src/main/scala/com/paypal/gimel/druid/conf/DruidClientConfiguration.scala +++ /dev/null @@ -1,200 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.paypal.gimel.druid.conf - -import scala.collection.immutable.Map -import scala.reflect.ClassTag - -import com.metamx.common.Granularity - -import com.paypal.gimel.common.catalog.DataSetProperties -import com.paypal.gimel.common.conf.GimelConstants -import com.paypal.gimel.druid.model.{DruidDimension, DruidMetric} -import com.paypal.gimel.druid.util.DruidUtility - -/** - * DruidClientConfiguration Class. Takes a map of properties and build its own properties. - * This Class extends Serializable as it is needed to be passed to the executors. 
- * - * @param props Map[String, Any] of the properties specified by the user. - */ -@SerialVersionUID(100L) -class DruidClientConfiguration(props: Map[String, Any]) extends Serializable { - - // This is the DataSet Properties - val datasetProps: DataSetProperties = props(GimelConstants.DATASET_PROPS).asInstanceOf[DataSetProperties] - val tableProps: Map[String, String] = datasetProps.props - - val druidLoadType: String = fetchProperty[String](DruidConfigs.LOAD_TYPE) - .getOrElse(DruidConstants.REALTIME_LOAD) - - // Zookeeper services running. Example: localhost:2121. Required Configuration - val zookeeper: String = fetchProperty[String](DruidConfigs.ZOOKEEPER, isRequired = true).get - - // Index Service as specified for druid cluster. Required Configuration - val indexService: String = fetchProperty[String](DruidConfigs.INDEX_SERVICE, isRequired = true).get - - // Duscovery Path as specified for druid cluster. Required Configuration - val discoveryPath: String = fetchProperty[String](DruidConfigs.DISCOVERY_PATH, isRequired = true).get - - // Datasource in Druid to index for. Required Configuration - val datasource: String = fetchProperty[String](DruidConfigs.DATASOURCE, isRequired = true).get - - val fieldNames: List[String] = - fetchProperty[List[String]](DruidConfigs.FIELDS, isRequired = true).get - - val timestamp_field: String = - fetchProperty[String](DruidConfigs.TIMESTAMP) - .getOrElse(DruidConstants.TIMESTAMP_FIELD_NAME) - - val timestamp_format: String = fetchProperty[String](DruidConfigs.TIMESTAMP_FORMAT) - .getOrElse(DruidConstants.TIMESTAMP_FORMAT) - - // Get Segment Granularity String from the props and convert it into com.metamx.common.Granularity - val segmentGranularity: Granularity = { - val granularityString = fetchProperty[String](DruidConfigs.SEGMENT_GRANULARITY) - .getOrElse(DruidConstants.SEGMENT_GRANULARITY_FIFTEEN_MINUTE) - - val granularity = Granularity.values.find(g => granularityString.equalsIgnoreCase(g.toString)) - - // If given Granularity is not found then throw an Error - if (granularity.isEmpty) { - val errorMsg = s"Specified Segment Granularity $granularityString is not a valid Granularity" - throw new IllegalArgumentException(errorMsg) - } - - granularity.get - } - - // Get Segment Granularity String from the props and convert it into com.metamx.common.Granularity - val queryGranularity: String = { - fetchProperty[String](DruidConfigs.QUERY_GRANULARITY) - .getOrElse(DruidConstants.QUERY_GRANULARITY_ONE_MINUTE) - } - - // Window Period for which druid will accept the incoming data. Defaults to PT10M - val windowPeriod: String = fetchProperty[String](DruidConfigs.WINDOW_PERIOD).getOrElse(DruidConstants.WINDOW_PERIOD) - - // Number of Partitions Defined - val numPartitions: Int = fetchProperty[Int](DruidConfigs.PARTITIONS).getOrElse(DruidConstants.PARTITIONS) - - // Number of Replicants Specified - val numReplicants: Int = fetchProperty[Int](DruidConfigs.REPLICANTS).getOrElse(DruidConstants.REPLICANTS) - - val ARROW = DruidConstants.ARROW - val NEW_LINE = DruidConstants.NEW_LINE - - // Get List of Druid Field names from props that is a string value for the list. - lazy val fields: List[DruidDimension] = fieldNames.map(DruidDimension(_)) - - // Get List of Druid Dimensions from props that is a string value for the list. 
- lazy val dimensions: List[DruidDimension] = { - errorIfMissing(DruidConfigs.DIMENSIONS) - - DruidUtility.parseString[List[String]]( - fetchProperty[String](DruidConfigs.DIMENSIONS, isRequired = true).get - ).map(DruidDimension(_)) - } - - // Get List of Druid Metric from props that is a string value for the list. - lazy val metrics: List[DruidMetric] = { - val metricString = fetchProperty[String](DruidConfigs.METRICS) - - // Check if metricString is not null or else return a Default count Metric - if (metricString.isDefined) { - DruidUtility.parseString[List[DruidMetric]](metricString.get) - } else { - List(DruidMetric.getDefaultMetric) - } - } - - /** - * Private Method to check if the key exists in the props. - * If the key doesnt exists than throw an error. - * - * @param key String value for the key - */ - private def errorIfMissing(key: String): Unit = { - if (tableProps.get(key).isEmpty && props.get(key).isEmpty) { - val errorMsg = s"Missing Property: $key for the Druid Client Configuration!" - throw new IllegalArgumentException(errorMsg) - } - } - - /** - * Method to fetch property value from props and tableProps. - * This methods first looks for a key in props, if not than looks for in tableProps. - * - * @param key Key of the property to be fetched - * @param isRequired If the key is required or not. - * If it is required than it throws an error if the key - * does not exist in either of props or tableProps - * @tparam T Type of the value to return for a given property - * @return An Option of the value or None if the property key does not exist. - */ - def fetchProperty[T](key: String, isRequired: Boolean = false) - (implicit tag: ClassTag[T]): Option[T] = { - // If isRequired is true, than throw an error if the key is missing - if (isRequired) errorIfMissing(key) - - val propValue = props.get(key).orElse(tableProps.get(key)) - - if (propValue.isDefined) { - propValue.get match { - case _: T => - Option(propValue.get.asInstanceOf[T]) - case _ => - val errorMsg = s"Value for Property Key: $key cannot be cast." - throw new IllegalArgumentException(errorMsg) - } - } else None - } - - /** - * Overriden Method to Print Configuration Variables for this config. - * - * @return Print message for this Coniguration. 
- */ - override def toString: String = { - var message = "Druid Client Configuration Parameters --->" + DruidConstants.NEW_LINE - - message += DruidConfigs.ZOOKEEPER + ARROW + this.zookeeper + NEW_LINE - message += DruidConfigs.INDEX_SERVICE + ARROW + this.indexService + NEW_LINE - message += DruidConfigs.DISCOVERY_PATH + ARROW + this.discoveryPath + NEW_LINE - message += DruidConfigs.DATASOURCE + ARROW + this.datasource + NEW_LINE - message += DruidConfigs.FIELDS + ARROW + this.fieldNames + NEW_LINE - message += DruidConfigs.DIMENSIONS + ARROW + - this.dimensions.map(_.name).mkString(",") + NEW_LINE - message += DruidConfigs.METRICS + ARROW + this.metrics.mkString(",") + NEW_LINE - message += DruidConfigs.TIMESTAMP + ARROW + this.timestamp_field + NEW_LINE - message += DruidConfigs.TIMESTAMP_FORMAT + ARROW + - this.timestamp_format + NEW_LINE - message += DruidConfigs.QUERY_GRANULARITY + ARROW + - this.queryGranularity + NEW_LINE - message += DruidConfigs.SEGMENT_GRANULARITY + ARROW + - this.segmentGranularity + NEW_LINE - message += DruidConfigs.WINDOW_PERIOD + ARROW + this.windowPeriod + NEW_LINE - message += DruidConfigs.PARTITIONS + ARROW + this.numPartitions + NEW_LINE - message += DruidConfigs.REPLICANTS + ARROW + this.numReplicants + NEW_LINE - message += DruidConfigs.LOAD_TYPE + ARROW + this.druidLoadType + NEW_LINE - - message - } -} diff --git a/gimel-dataapi/gimel-connectors/gimel-druid-0.82/src/main/scala/com/paypal/gimel/druid/conf/DruidConfigs.scala b/gimel-dataapi/gimel-connectors/gimel-druid-0.82/src/main/scala/com/paypal/gimel/druid/conf/DruidConfigs.scala deleted file mode 100644 index f934a40d..00000000 --- a/gimel-dataapi/gimel-connectors/gimel-druid-0.82/src/main/scala/com/paypal/gimel/druid/conf/DruidConfigs.scala +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.paypal.gimel.druid.conf - -/** - * Object Defining List of available Configuration Keys - */ -object DruidConfigs { - val ZOOKEEPER = "gimel.druid.zookeeper.hosts" - val INDEX_SERVICE = "gimel.druid.cluster.index.service" - val DISCOVERY_PATH = "gimel.druid.cluster.discovery.path" - val DATASOURCE = "gimel.druid.datasource.name" - val FIELDS = "gimel.druid.datasource.fields" - val DIMENSIONS = "gimel.druid.datasource.dimensions" - val METRICS = "gimel.druid.datasource.metrics" - val TIMESTAMP = "gimel.druid.timestamp.fieldname" - val TIMESTAMP_FORMAT = "gimel.druid.timestamp.format" - val QUERY_GRANULARITY = "gimel.druid.query.granularity" - val SEGMENT_GRANULARITY = "gimel.druid.segment.granularity" - val WINDOW_PERIOD = "gimel.druid.stream.window.period" - val PARTITIONS = "gimel.druid.datasource.partitions" - val REPLICANTS = "gimel.druid.datasource.replicas" - val LOAD_TYPE = "gimel.druid.ingestion.type" -} - diff --git a/gimel-dataapi/gimel-connectors/gimel-druid-0.82/src/main/scala/com/paypal/gimel/druid/conf/DruidConstants.scala b/gimel-dataapi/gimel-connectors/gimel-druid-0.82/src/main/scala/com/paypal/gimel/druid/conf/DruidConstants.scala deleted file mode 100644 index 35a184e8..00000000 --- a/gimel-dataapi/gimel-connectors/gimel-druid-0.82/src/main/scala/com/paypal/gimel/druid/conf/DruidConstants.scala +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.paypal.gimel.druid.conf - -/** - * Object Defining Default Values for Configuration - */ -object DruidConstants { - val TIMESTAMP_FIELD_NAME = "timestamp" - val TIMESTAMP_FORMAT = "millis" - val QUERY_GRANULARITY_ONE_MINUTE = "MINUTE" - val SEGMENT_GRANULARITY_FIFTEEN_MINUTE = "FIFTEEN_MINUTE" - val WINDOW_PERIOD = "PT10M" - val PARTITIONS = 1 - val REPLICANTS = 1 - val REALTIME_LOAD = "realtime" - val BATCH_LOAD = "batch" - val ARROW = "->" - val NEW_LINE = "\n" - val MILLISECONDS = "millis" - val SECONDS = "seconds" - val ISO = "iso" -} - diff --git a/gimel-dataapi/gimel-connectors/gimel-druid-0.82/src/main/scala/com/paypal/gimel/druid/model/DruidDimension.scala b/gimel-dataapi/gimel-connectors/gimel-druid-0.82/src/main/scala/com/paypal/gimel/druid/model/DruidDimension.scala deleted file mode 100644 index a8a28677..00000000 --- a/gimel-dataapi/gimel-connectors/gimel-druid-0.82/src/main/scala/com/paypal/gimel/druid/model/DruidDimension.scala +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.paypal.gimel.druid.model - -import org.json4s.FieldSerializer -import org.json4s.FieldSerializer.{renameFrom, renameTo} - -/** - * Druid Dimension Object. - * Class extends Serializable as it is passed to executors. - * - * @param name Name for the dimension - */ -@SerialVersionUID(100L) -case class DruidDimension(name: String) extends Serializable - -object DruidDimension { - object DimensionFieldNames { - val NAME = "name" - } - - // Deserializer for Druid Dimension. - // Rename name -> name - val drudDimensionSerializer: FieldSerializer[DruidDimension] = FieldSerializer[DruidDimension] ( - renameTo("name", DimensionFieldNames.NAME), - renameFrom(DimensionFieldNames.NAME, "name") - ) -} diff --git a/gimel-dataapi/gimel-connectors/gimel-druid-0.82/src/main/scala/com/paypal/gimel/druid/model/DruidMetric.scala b/gimel-dataapi/gimel-connectors/gimel-druid-0.82/src/main/scala/com/paypal/gimel/druid/model/DruidMetric.scala deleted file mode 100644 index 5280c619..00000000 --- a/gimel-dataapi/gimel-connectors/gimel-druid-0.82/src/main/scala/com/paypal/gimel/druid/model/DruidMetric.scala +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.paypal.gimel.druid.model - -import com.fasterxml.jackson.annotation.JsonIgnoreProperties -import io.druid.query.aggregation.{AggregatorFactory, CountAggregatorFactory, LongSumAggregatorFactory} -import io.druid.query.aggregation.hyperloglog.HyperUniquesAggregatorFactory -import org.json4s._ -import org.json4s.FieldSerializer._ - -import com.paypal.gimel.druid.model.DruidMetric.{MetricFieldNames, MetricTypes} - -/** - * Model class representing a DruidMetric. - * This case class extends Serializable. - * - * @param metricsType Type of the metric to be computed. 
- * @param fieldName Name of field to perform aggregation on - * @param name Name of the metric - */ -@SerialVersionUID(100L) -@JsonIgnoreProperties(ignoreUnknown = true) -case class DruidMetric( metricsType: String, fieldName: String, name: String) - extends Serializable { - - /** - * Method to initialize a DruidMetric from a map supplied - * - * @param map Map[String, String] having (key -> value) for a given druid metric - * @return DruidMetric object using values from the map - */ - def initializeFromMap(map: Map[String, String]): DruidMetric = { - DruidMetric(map.get(MetricFieldNames.TYPE).orNull, - map.get(MetricFieldNames.FIELD_NAME).orNull, - map.get(MetricFieldNames.NAME).orNull) - } - - /** - * Converts the given DruidMetric to its corresponding AggregatorFactory - * Object that is used by Tranquility. - * Supported MetricTypes - Count, LongSum, HyperUnique - * - * @return AggregatorFactory object corresponding to the given Metric Type - */ - def getAggregator: AggregatorFactory = { - metricsType match { - case MetricTypes.LONG_SUM => - new LongSumAggregatorFactory(name, fieldName) - case MetricTypes.COUNT => - new CountAggregatorFactory(name) - case MetricTypes.HYPER_UNIQUE => - new HyperUniquesAggregatorFactory(name, fieldName) - case otherType: String => - throw new Exception(s"Metric Type: $otherType is not supported.") - } - } -} - -object DruidMetric { - def getDefaultMetric: DruidMetric = { - DruidMetric(MetricTypes.COUNT, null, MetricTypes.COUNT) - } - - object MetricFieldNames { - val TYPE = "type" - val FIELD_NAME = "field_name" - val NAME = "name" - } - - object MetricTypes { - val LONG_SUM = "longSum" - val COUNT = "count" - val HYPER_UNIQUE = "hyperUnique" - } - - // Deserializer for Druid Metric. - // Ignore fieldName if does not exists. - // Rename metricsType -> type, fieldName -> field_name, name -> name - val drudMetricSerializer: FieldSerializer[DruidMetric] = FieldSerializer[DruidMetric] ( - ignore("fieldName") orElse renameTo("metricsType", MetricFieldNames.TYPE) orElse - renameTo("fieldName", MetricFieldNames.FIELD_NAME) orElse - renameTo("name", MetricFieldNames.NAME), - renameFrom(MetricFieldNames.TYPE, "metricsType") orElse - renameFrom(MetricFieldNames.FIELD_NAME, "fieldName") orElse - renameFrom(MetricFieldNames.NAME, "name") - ) -} diff --git a/gimel-dataapi/gimel-connectors/gimel-druid-0.82/src/main/scala/com/paypal/gimel/druid/reader/DruidReader.scala b/gimel-dataapi/gimel-connectors/gimel-druid-0.82/src/main/scala/com/paypal/gimel/druid/reader/DruidReader.scala deleted file mode 100644 index b6099278..00000000 --- a/gimel-dataapi/gimel-connectors/gimel-druid-0.82/src/main/scala/com/paypal/gimel/druid/reader/DruidReader.scala +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.paypal.gimel.druid.reader - -import org.apache.spark.sql.DataFrame -import org.apache.spark.sql.hive.HiveContext - -import com.paypal.gimel.druid.conf.DruidClientConfiguration -import com.paypal.gimel.logger.Logger - -/** - * DruidReader Object. Main class to implement Reader for Druid. - */ -object DruidReader { - private val logger = Logger() - - /** - * Method for reading from Druid. This is not yet implemented. - * - * @param hiveContext HiveContext Object to be used. - * @param conf DruidClientConfiguration specified. - * @return DataFrame after processing. - */ - def readTable(hiveContext: HiveContext, conf: DruidClientConfiguration): DataFrame = { - // TODO: Read Implementation not done - throw new Exception("Read for druid-connector is not implemented.") - } - -} diff --git a/gimel-dataapi/gimel-connectors/gimel-druid-0.82/src/main/scala/com/paypal/gimel/druid/util/DruidEventBeam.scala b/gimel-dataapi/gimel-connectors/gimel-druid-0.82/src/main/scala/com/paypal/gimel/druid/util/DruidEventBeam.scala deleted file mode 100644 index 7ad94f2d..00000000 --- a/gimel-dataapi/gimel-connectors/gimel-druid-0.82/src/main/scala/com/paypal/gimel/druid/util/DruidEventBeam.scala +++ /dev/null @@ -1,119 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.paypal.gimel.druid.util - -import com.metamx.tranquility.beam.{Beam, ClusteredBeamTuning} -import com.metamx.tranquility.druid.{DruidBeams, DruidLocation, DruidRollup, SpecificDruidDimensions} -import com.metamx.tranquility.spark.BeamFactory -import com.metamx.tranquility.typeclass.Timestamper -import io.druid.data.input.impl.TimestampSpec -import org.apache.curator.framework.CuratorFrameworkFactory -import org.apache.curator.retry.BoundedExponentialBackoffRetry -import org.joda.time.{DateTime, DateTimeZone, Period} - -import com.paypal.gimel.druid.conf.DruidClientConfiguration - -/** - * DruidEventBeam object. - * Given a DruidClientConfiguration, returns a singleton instance of DruidBeam. - * DruidBeam instance should be a singleton inorder to share the same connection - */ -object DruidEventBeam { - var druidConfig: DruidClientConfiguration = _ - - /** - * Method to initialize the required params for the DruidEventBeam instance. - * This method must be call before trying to fetch BeamInstance - * DruidClientConfiguration is a required param that needs to be set. - * - * @param configMgr DruidClientConfiguration object to be set for defining configuration. 
- */ - def init(configMgr: DruidClientConfiguration): Unit = { - druidConfig = configMgr - } - - /** - * Timestamper object that defines how to extract a timestamp from any custom object - */ - implicit val timestamper = new Timestamper[Map[String, Any]]() { - - /** - * Overriden method to extract timestamp from a given custom object. - * - * @param rowMap Map[String, String] representing a single row with - * (columnName -> columnValue) format map - * @return org.joda.time.DateTime by extracting timestamp from the rowMap - */ - override def timestamp(rowMap: Map[String, Any]): DateTime = { - new DateTime(rowMap(druidConfig.timestamp_field), DateTimeZone.UTC) - } - } - - /** - * Builds and stores a singleton instance of Beam[T] given the - * DruidClientConfiguration object for configuration. - */ - lazy val BeamInstance: Beam[Map[String, Any]] = { - - // Tranquility uses ZooKeeper (through Curator framework) for coordination. - val curator = CuratorFrameworkFactory.newClient( - druidConfig.zookeeper, - new BoundedExponentialBackoffRetry(100, 3000, 5) - ) - curator.start() - - // Transforms List[DruidDimensions] from the DruidClientConfiguration to List[String] - val dimensions = druidConfig - .dimensions - .map(_.name) - - // Transforms List[DruidMetrics] from the DruidClientConfiguration to List[AggregatorFactory] - val aggregators = druidConfig - .metrics - .map(_.getAggregator) - - // Building a Druid Beam - DruidBeams - .builder() - .curator(curator) - .discoveryPath(druidConfig.discoveryPath) - .location(DruidLocation.create(druidConfig.indexService, druidConfig.datasource)) - .rollup(DruidRollup(SpecificDruidDimensions(dimensions), - aggregators, DruidUtility.fetchQueryGranularity(druidConfig.queryGranularity))) - .tuning( - ClusteredBeamTuning ( - segmentGranularity = druidConfig.segmentGranularity, - windowPeriod = new Period(druidConfig.windowPeriod), - partitions = druidConfig.numPartitions, replicants = druidConfig.numReplicants - ) - ) - .timestampSpec(new TimestampSpec(druidConfig.timestamp_field, "iso", null)) - .buildBeam() - } -} - -class DruidEventBeam(config: DruidClientConfiguration) extends BeamFactory[Map[String, Any]] { - // Return a singleton, so the same connection is shared across all tasks in the same JVM. - def makeBeam: Beam[Map[String, Any]] = { - DruidEventBeam.init(config) - DruidEventBeam.BeamInstance - } -} - diff --git a/gimel-dataapi/gimel-connectors/gimel-druid-0.82/src/main/scala/com/paypal/gimel/druid/util/DruidUtility.scala b/gimel-dataapi/gimel-connectors/gimel-druid-0.82/src/main/scala/com/paypal/gimel/druid/util/DruidUtility.scala deleted file mode 100644 index 458df4c1..00000000 --- a/gimel-dataapi/gimel-connectors/gimel-druid-0.82/src/main/scala/com/paypal/gimel/druid/util/DruidUtility.scala +++ /dev/null @@ -1,168 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.paypal.gimel.druid.util - -import java.lang.reflect.Field - -import scala.reflect.ClassTag - -import io.druid.granularity.{QueryGranularities, QueryGranularity} -import org.apache.spark.sql.{DataFrame, SparkSession} -import org.apache.spark.sql.types.{StructField, StructType} -import org.joda.time.{DateTime, DateTimeZone} -import org.joda.time.format.DateTimeFormat -import org.json4s.{DefaultFormats, Formats} -import org.json4s.jackson.JsonMethods._ - -import com.paypal.gimel.druid.conf.DruidConstants -import com.paypal.gimel.druid.model.{DruidDimension, DruidMetric} - -/** - * Object instance for Druid Utility. Contains all the utility methods required for druid-connector - */ -object DruidUtility { - - // Format object to serialize and deserialize used by json4s - implicit val format: Formats = - DefaultFormats + DruidMetric.drudMetricSerializer + DruidDimension.drudDimensionSerializer - - /** - * Given a date or time in String, this method converts this datetime to - * org.joda.time.DateTime using the specified format. - * Returns current datetime if datetime string is null. - * Supports - millis, seconds and other DateTime format - * - * @param datetime Datetime in String. Can be time in millis, seconds or in any DATETIME format - * @param format String format for transforming string to org.joda.time.DateTime. Default: millis - * @return org.joda.time.DateTime given a datetime in string and the specified format. - */ - def extractDateTime(datetime: String, format: String = "millis"): DateTime = { - if (Option(datetime).isDefined) { - format match { - // Converts Milliseconds to org.joda.time.DateTime - case DruidConstants.MILLISECONDS => - new DateTime(datetime.toLong, DateTimeZone.UTC) - - // Converts Seconds to org.joda.time.DateTime - case DruidConstants.SECONDS => - new DateTime(toMillis(datetime.toLong), DateTimeZone.UTC) - - // Converts ISO datetime to org.joda.time.DateTime - case DruidConstants.ISO => - DateTime.parse(datetime).withZone(DateTimeZone.UTC) - - // Converts all the other DateTime formats to org.joda.time.DateTime - case otherFormat: String => - val formatter = DateTimeFormat.forPattern(otherFormat).withZoneUTC() - formatter.parseDateTime(datetime) - } - } else { - // Returns current time in UTC if datetime string is null. - new DateTime(DateTimeZone.UTC) - } - } - - /** - * Converts seconds to Milliseconds - * - * @param seconds Long seconds to be converted - * @return Long Milliseconds corresponding to the seconds - */ - def toMillis(seconds: Long): Long = seconds * 1000 - - /** - * Fetch List of class variables - * - * @param tag ClassTag[T] object - * @tparam T Class type passed - * @return List[Field] of fields that T class contains - */ - def fetchClassVariable[T](implicit tag: ClassTag[T]): List[Field] = - tag.runtimeClass.getDeclaredFields.toList - - - /** - * Get Hive Table Field names given the name of hive table - * - * @param dataset Hive Table name - * @return List[String] of Field names for the hive table - */ - def getFieldNames(dataset: String, sparkSession: SparkSession): List[String] = { - extractFields(sparkSession.read.table(dataset).schema) - } - - - /** - * Get Hive Table Field names given the Dataframe. 
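[Editorial sketch] To illustrate the removed DruidUtility.extractDateTime behaviour, a few hedged example calls; the literal values of DruidConstants.SECONDS and DruidConstants.ISO are assumed here to be "seconds" and "iso", since DruidConstants itself is not part of this diff:

    DruidUtility.extractDateTime("1514764800000")                              // default "millis" -> 2018-01-01T00:00:00.000Z
    DruidUtility.extractDateTime("1514764800", "seconds")                      // seconds -> the same instant
    DruidUtility.extractDateTime("2018-01-01T00:00:00Z", "iso")                // ISO-8601 input
    DruidUtility.extractDateTime("2018-01-01 00:00:00", "yyyy-MM-dd HH:mm:ss") // any other value is treated as a Joda pattern
    DruidUtility.extractDateTime(null)                                         // null -> current time in UTC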
- * - * @param dataFrame Dataframe for which schema is to be returned - * @return List[String] of Field names for the hive table - */ - def getFieldNames(dataFrame: DataFrame): List[String] = { - extractFields(dataFrame.schema) - } - - /** - * Given a Schema StructType, extract the field names. - * - * @param schema StructType Schema - * @return List[String] of field names - */ - def extractFields(schema: StructType): List[String] = { - Option(schema) - .getOrElse(StructType(List.empty[StructField])) - .map(_.name).toList - } - - /** - * Method to parse a string to a Custom object. - * - * @param value String value to be parsed. - * @tparam T Custom object to parse the String. - * @return Parsed object based on the value and T. - */ - def parseString[T: ClassTag](value: String)(implicit manifest: Manifest[T]): T = { - parse(s"""$value""") - .extract[T](format, mf = manifest) - } - - /** - * Method to Fetch Query Granularity based on the String Provided. - * - * @param granularityString Query Granularity String to be parsed - * @return QueryGranularity Object corresponding to the string - */ - def fetchQueryGranularity(granularityString: String): QueryGranularity = { - // Using Reflection, find a field with the same name - // as the query granularity string specified by the user - val granularityField = DruidUtility.fetchClassVariable[QueryGranularities] - .find(field => granularityString.equalsIgnoreCase(field.getName)) - - // If given Granularity is not found then throw an Error - if (granularityField.isEmpty) { - val errorMsg = s"Specified Query Granularity $granularityString is not a valid Granularity" - throw new IllegalArgumentException(errorMsg) - } - - // Extract QueryGranularity Variable value from the field - val queryGranularity = QueryGranularities.MINUTE - granularityField.get.get(queryGranularity).asInstanceOf[QueryGranularity] - } -} diff --git a/gimel-dataapi/gimel-connectors/gimel-druid-0.82/src/main/scala/com/paypal/gimel/druid/writer/DruidRealtimeWriter.scala b/gimel-dataapi/gimel-connectors/gimel-druid-0.82/src/main/scala/com/paypal/gimel/druid/writer/DruidRealtimeWriter.scala deleted file mode 100644 index ee211fcc..00000000 --- a/gimel-dataapi/gimel-connectors/gimel-druid-0.82/src/main/scala/com/paypal/gimel/druid/writer/DruidRealtimeWriter.scala +++ /dev/null @@ -1,112 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.paypal.gimel.druid.writer - -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{DataFrame, SparkSession} - -import com.paypal.gimel.druid.conf.DruidClientConfiguration -import com.paypal.gimel.druid.util.{DruidEventBeam, DruidUtility} - -/** - * DruidRealtimeWriter Object. - * Main method to implement writer for Druid Realtime Ingestion. 
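[Editorial sketch] A quick illustration of the reflection-based lookup in the removed fetchQueryGranularity, which matches the supplied string case-insensitively against the QueryGranularities fields:

    DruidUtility.fetchQueryGranularity("minute")     // resolves QueryGranularities.MINUTE
    DruidUtility.fetchQueryGranularity("MINUTE")     // same result, the match is case-insensitive
    DruidUtility.fetchQueryGranularity("fortnight")  // throws IllegalArgumentException: not a valid Granularity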
- * Extends DruidWriter trait. - */ -object DruidRealtimeWriter extends DruidWriter { - /** - * Write To Table for Druid Realtime Ingestion for a given RDD. - * - * @param sparkSession : SparkSession - * @param conf DruidClientConfiguration Object - * @param dataFrame Dataframe to be ingested to druid - * @return Dataframe - */ - def writeToTable(sparkSession: SparkSession, - conf: DruidClientConfiguration, dataFrame: DataFrame): DataFrame = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - - logger.info(" @Begin --> " + MethodName) - - // Convert to RDD of Map[String, String] - val eventsRDD: RDD[Map[String, Any]] = dataFrame.rdd - .map(row => { - conf.fields - .map(field => { - var fieldValue: Any = row.getAs[Any](field.name) - - if (field.name.equalsIgnoreCase(conf.timestamp_field)) { - fieldValue = DruidUtility - .extractDateTime(fieldValue.toString, conf.timestamp_format) - .toString - } - - field.name -> fieldValue - }) - .toMap - }) - - // Converting RDD to data frame - import com.metamx.tranquility.spark.BeamRDD._ - eventsRDD.propagate(new DruidEventBeam(conf)) - - dataFrame - } - - /** - * Write To Table for Druid Realtime Ingestion for a given RDD. - * - * @param sparkSession : SparkSession - * @param conf DruidClientConfiguration Object - * @param rdd RDD of Map[String, String] to be ingested to druid - * @return RDD - */ - def writeToTable(sparkSession: SparkSession, - conf: DruidClientConfiguration, - rdd: RDD[Map[String, Any]]): RDD[Map[String, Any]] = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - - logger.info(" @Begin --> " + MethodName) - - // Convert to RDD of Map[String, String] - val eventsRDD: RDD[Map[String, Any]] = rdd - .map(rowMap => { - conf.fields - .map(field => { - var fieldValue: Any = rowMap(field.name) - - if (field.name.equalsIgnoreCase(conf.timestamp_field)) { - fieldValue = DruidUtility - .extractDateTime(fieldValue.toString, conf.timestamp_format) - .toString - } - - field.name -> fieldValue - }) - .toMap - }) - - // Converting RDD to data frame - import com.metamx.tranquility.spark.BeamRDD._ - eventsRDD.propagate(new DruidEventBeam(conf)) - - eventsRDD - } -} diff --git a/gimel-dataapi/gimel-connectors/gimel-druid-0.82/src/main/scala/com/paypal/gimel/druid/writer/DruidWriter.scala b/gimel-dataapi/gimel-connectors/gimel-druid-0.82/src/main/scala/com/paypal/gimel/druid/writer/DruidWriter.scala deleted file mode 100644 index 74495544..00000000 --- a/gimel-dataapi/gimel-connectors/gimel-druid-0.82/src/main/scala/com/paypal/gimel/druid/writer/DruidWriter.scala +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
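[Editorial sketch] A condensed view of what the removed DruidRealtimeWriter did with each row before handing it to the beam; the field names and the `sparkSession`, `conf` and `df` values are illustrative and assumed to be in scope:

    // With conf.timestamp_field = "ts" and conf.timestamp_format = "millis",
    // a row (id = 1, ts = 1514764800000) becomes the event map
    //   Map("id" -> 1, "ts" -> "2018-01-01T00:00:00.000Z")
    // before being propagated through DruidEventBeam.
    val ingested: DataFrame = DruidRealtimeWriter.writeToTable(sparkSession, conf, df)  // returns the input DataFrame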
- */ - -package com.paypal.gimel.druid.writer - -import com.paypal.gimel.logger.Logger - -/** - * DruidWriter trait. - * Generic trait to be extended by all the DruidWriters - DruidRealtimeWriter, DruidBatchWriter - */ -trait DruidWriter { - protected val logger = Logger() -} diff --git a/gimel-dataapi/gimel-connectors/gimel-elasticsearch-6.2/pom.xml b/gimel-dataapi/gimel-connectors/gimel-elasticsearch/pom.xml similarity index 85% rename from gimel-dataapi/gimel-connectors/gimel-elasticsearch-6.2/pom.xml rename to gimel-dataapi/gimel-connectors/gimel-elasticsearch/pom.xml index 1f03c1c3..e39db857 100644 --- a/gimel-dataapi/gimel-connectors/gimel-elasticsearch-6.2/pom.xml +++ b/gimel-dataapi/gimel-connectors/gimel-elasticsearch/pom.xml @@ -23,13 +23,13 @@ under the License. gimel-dataapi com.paypal.gimel - 2.0.0-SNAPSHOT + 2.4.7-SNAPSHOT ../../pom.xml 4.0.0 - gimel-elasticsearch-6.2 - 2.0.0-SNAPSHOT + gimel-elasticsearch + 2.4.7-SNAPSHOT @@ -45,9 +45,23 @@ under the License. org.elasticsearch - elasticsearch-spark-20_${scala.binary.version} + elasticsearch-spark-20_${elastic.scala.binary.version} ${elasticsearch.version} ${packaging.scope} + + + org.apache.spark + * + + + org.slf4j + log4j-over-slf4j + + + org.apache.hive + * + + org.elasticsearch @@ -78,7 +92,7 @@ under the License. org.apache.maven.plugins maven-shade-plugin - 3.0.0 + ${maven.shade.plugin.version} diff --git a/gimel-dataapi/gimel-connectors/gimel-elasticsearch-6.2/src/main/scala/com/paypal/gimel/elasticsearch/DataSet.scala b/gimel-dataapi/gimel-connectors/gimel-elasticsearch/src/main/scala/com/paypal/gimel/elasticsearch/DataSet.scala similarity index 100% rename from gimel-dataapi/gimel-connectors/gimel-elasticsearch-6.2/src/main/scala/com/paypal/gimel/elasticsearch/DataSet.scala rename to gimel-dataapi/gimel-connectors/gimel-elasticsearch/src/main/scala/com/paypal/gimel/elasticsearch/DataSet.scala diff --git a/gimel-dataapi/gimel-connectors/gimel-elasticsearch-6.2/src/main/scala/com/paypal/gimel/elasticsearch/conf/ElasticSearchConfigs.scala b/gimel-dataapi/gimel-connectors/gimel-elasticsearch/src/main/scala/com/paypal/gimel/elasticsearch/conf/ElasticSearchConfigs.scala similarity index 100% rename from gimel-dataapi/gimel-connectors/gimel-elasticsearch-6.2/src/main/scala/com/paypal/gimel/elasticsearch/conf/ElasticSearchConfigs.scala rename to gimel-dataapi/gimel-connectors/gimel-elasticsearch/src/main/scala/com/paypal/gimel/elasticsearch/conf/ElasticSearchConfigs.scala diff --git a/gimel-dataapi/gimel-connectors/gimel-elasticsearch-6.2/src/main/scala/com/paypal/gimel/elasticsearch/conf/ElasticSearchConstants.scala b/gimel-dataapi/gimel-connectors/gimel-elasticsearch/src/main/scala/com/paypal/gimel/elasticsearch/conf/ElasticSearchConstants.scala similarity index 100% rename from gimel-dataapi/gimel-connectors/gimel-elasticsearch-6.2/src/main/scala/com/paypal/gimel/elasticsearch/conf/ElasticSearchConstants.scala rename to gimel-dataapi/gimel-connectors/gimel-elasticsearch/src/main/scala/com/paypal/gimel/elasticsearch/conf/ElasticSearchConstants.scala diff --git a/gimel-dataapi/gimel-connectors/gimel-elasticsearch-6.2/src/main/scala/com/paypal/gimel/elasticsearch/utilities/ElasticSearchUtilities.scala b/gimel-dataapi/gimel-connectors/gimel-elasticsearch/src/main/scala/com/paypal/gimel/elasticsearch/utilities/ElasticSearchUtilities.scala similarity index 100% rename from gimel-dataapi/gimel-connectors/gimel-elasticsearch-6.2/src/main/scala/com/paypal/gimel/elasticsearch/utilities/ElasticSearchUtilities.scala rename to 
gimel-dataapi/gimel-connectors/gimel-elasticsearch/src/main/scala/com/paypal/gimel/elasticsearch/utilities/ElasticSearchUtilities.scala diff --git a/gimel-dataapi/gimel-connectors/gimel-hbase-1.2/pom.xml b/gimel-dataapi/gimel-connectors/gimel-hbase-1.2/pom.xml deleted file mode 100644 index 690ad484..00000000 --- a/gimel-dataapi/gimel-connectors/gimel-hbase-1.2/pom.xml +++ /dev/null @@ -1,222 +0,0 @@ - - - - - - - gimel-dataapi - com.paypal.gimel - 2.0.0-SNAPSHOT - ../../pom.xml - - 4.0.0 - - gimel-hbase-1.2 - 2.0.0-SNAPSHOT - - - - com.paypal.gimel - gimel-common - ${gimel.version}-SNAPSHOT - ${packaging.scope} - - - org.scalatest - scalatest_${scala.binary.version} - ${scalatest.version} - test - - - com.hortonworks - shc-core - ${hortonworks.shc.version}-${spark.binary.version}-s_${scala.binary.version} - ${packaging.scope} - - - org.apache.hbase - hbase-common - ${hbase.version} - ${packaging.scope} - - - org.jboss.netty - netty - - - io.netty - netty - - - - - org.apache.hbase - hbase-protocol - ${hbase.version} - ${packaging.scope} - - - org.apache.hbase - hbase-server - ${hbase.version} - ${packaging.scope} - - - org.jboss.netty - netty - - - io.netty - netty - - - - - org.htrace - htrace-core - 3.0.4 - ${packaging.scope} - - - org.apache.hive - hive-hbase-handler - ${hive.version} - - ${packaging.scope} - - - org.apache.commons - * - - - - - - org.apache.hbase - hbase-testing-util - ${hbase.version} - test - - - - io.netty - netty - ${netty.hadoop.version} - test - - - io.netty - netty-all - ${netty.all.hadoop.version} - test - - - net.jpountz.lz4 - lz4 - 1.3.0 - test - - - - - src/main/scala - src/test/scala - - - net.alchim31.maven - scala-maven-plugin - 3.2.1 - - - - compile - testCompile - - - - - - -Xms64m - -Xmx1024m - - - - - org.scalatest - scalatest-maven-plugin - 1.0 - - ${project.build.directory}/surefire-reports - . - WDF TestSuite.txt - - - - test - - test - - - - - - org.apache.maven.plugins - maven-shade-plugin - 3.0.0 - - - - com.google.common - gimel-shaded.com.google.common - - - com.sun.jersey - gimel-shaded.com.sun.jersey - - - - org.apache.hadoop - gimel-shaded.org.apache.hadoop - - - - - *:* - - META-INF/*.SF - META-INF/*.DSA - META-INF/*.RSA - - - - - - - gimel-shading - package - - shade - - - - - - - diff --git a/gimel-dataapi/gimel-connectors/gimel-hbase-1.2/src/main/scala/com/paypal/gimel/hbase/DataSet.scala b/gimel-dataapi/gimel-connectors/gimel-hbase-1.2/src/main/scala/com/paypal/gimel/hbase/DataSet.scala deleted file mode 100644 index 34ab8269..00000000 --- a/gimel-dataapi/gimel-connectors/gimel-hbase-1.2/src/main/scala/com/paypal/gimel/hbase/DataSet.scala +++ /dev/null @@ -1,192 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.paypal.gimel.hbase - -import scala.reflect.runtime.universe._ - -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{DataFrame, SparkSession} - -import com.paypal.gimel.datasetfactory.GimelDataSet -import com.paypal.gimel.hbase.conf.{HbaseConfigs, HbaseConstants} -import com.paypal.gimel.hbase.utilities.{HBaseLookUp, HBasePut, HBaseSparkConnector, HBaseUtilities} -import com.paypal.gimel.logger.Logger - -/** - * Concrete Implementation for HBASE Dataset - * - * @param sparkSession : SparkSession - */ - -class DataSet(sparkSession: SparkSession) extends GimelDataSet(sparkSession: SparkSession) { - - // GET LOGGER - val logger = Logger() - /** - * Change this parameter with cluster config - */ - logger.info(s"Initiated --> ${this.getClass.getName}") - lazy val hbaseUtilities = HBaseUtilities(sparkSession) - lazy val hbaseLookUp = HBaseLookUp(sparkSession) - lazy val hbasePut = HBasePut(sparkSession) - lazy val hbaseSparkConnector = HBaseSparkConnector(sparkSession) - - /** - * - * @param dataset Name of the UDC Data Set - * @param dataSetProps - * props is the way to set various additional parameters for read and write operations in DataSet class - * Example Usecase : to get 10 factor parallelism (specifically) - * val props = Map("coalesceFactor" -> 10) - * val data = Dataset(sc).read("flights", props) - * data.coalesce(props.get("coalesceFactor")) - * @return DataFrame - */ - override def read(dataset: String, dataSetProps: Map[String, Any]): DataFrame = { - if (dataSetProps.isEmpty) throw new DataSetException("props cannot be empty !") - - val hbaseOperation = dataSetProps.getOrElse(HbaseConfigs.hbaseOperation, HbaseConstants.SCAN_OPERATION).toString - hbaseOperation match { - case HbaseConstants.GET_OPERATION => - logger.info("Reading through Java Get API.") - hbaseLookUp.get(dataset, dataSetProps) - case _ => - logger.info("Reading through SHC Connector.") - hbaseSparkConnector.read(dataset, dataSetProps) - } - } - - /** - * - * @param dataset Name of the UDC Data Set - * @param dataFrame The Dataframe to write into Target - * @param dataSetProps - * Example Usecase : we want only 1 executor for hbase (specifically) - * val props = Map("coalesceFactor" -> 1) - * Dataset(sc).write(clientDataFrame, props) - * Inside write implementation :: dataFrame.coalesce(props.get("coalesceFactor")) - * @return DataFrame - */ - - override def write(dataset: String, dataFrame: DataFrame, dataSetProps: Map[String, Any]): DataFrame = { - if (dataSetProps.isEmpty) { - throw new DataSetException("props cannot be empty !") - } - - val castedDataFrame = hbaseUtilities.castAllColsToString(dataFrame) - val hbaseOperation = dataSetProps.getOrElse(HbaseConfigs.hbaseOperation, HbaseConstants.SCAN_OPERATION).toString - hbaseOperation match { - case HbaseConstants.PUT_OPERATION => - logger.info("Writing through Java Put API.") - hbasePut.put(dataset, castedDataFrame, dataSetProps) - case _ => - logger.info("Writing through SHC Connector.") - hbaseSparkConnector.write(dataset, castedDataFrame, dataSetProps) - } - } - - // Add Additional Supported types to this list as and when we support other Types of RDD - // Example to start supporting RDD[String], add to List < typeOf[Seq[Map[String, String]]].toString) > - override val supportedTypesOfRDD: List[String] = List[String]() - - /** - * Function writes a given dataframe to the actual Target System (Example Hive : DB.Table | HBASE namespace.Table) - * - * @param dataset Name of the UDC Data Set - * @param rdd The RDD[T] to write into 
Target - * Note the RDD has to be typeCast to supported types by the inheriting DataSet Operators - * instance#1 : ElasticSearchDataSet may support just RDD[Seq(Map[String, String])], so Elastic Search must implement supported Type checking - * instance#2 : Kafka, HDFS, HBASE - Until they support an RDD operation for Any Type T : They throw Unsupporter Operation Exception & Educate Users Clearly ! - * @param dataSetProps - * props is the way to set various additional parameters for read and write operations in DataSet class - * Example Usecase : to write kafka with a specific parallelism : One can set something like below - - * val props = Map("parallelsPerPartition" -> 10) - * Dataset(sc).write(clientDataFrame, props) - * @return RDD[T] - */ - def write[T: TypeTag](dataset: String, rdd: RDD[T], dataSetProps: Map[String, Any]): RDD[T] = { - - if (!supportedTypesOfRDD.contains(typeOf[T].toString)) { - throw new UnsupportedOperationException(s"""Invalid RDD Type. Supported Types : ${supportedTypesOfRDD.mkString(" | ")}""") - } else { - // todo Implementation for Write - rdd - } - } - - /** - * - * @param dataset Name of the UDC Data Set - * @param dataSetProps - * * @return Boolean - */ - override def create(dataset: String, dataSetProps: Map[String, Any]): Unit = { - throw new UnsupportedOperationException(s"DataSet create for hbase currently not Supported") - } - - /** - * - * @param dataset Name of the UDC Data Set - * @param dataSetProps - * * @return Boolean - */ - override def drop(dataset: String, dataSetProps: Map[String, Any]): Unit = { - throw new UnsupportedOperationException(s"DataSet drop for hbase currently not Supported") - } - - /** - * - * @param dataset Name of the UDC Data Set - * @param dataSetProps - * * @return Boolean - */ - override def truncate(dataset: String, dataSetProps: Map[String, Any]): Unit = { - throw new UnsupportedOperationException(s"DataSet truncate for hbase currently not Supported") - } - - /** - * Save Checkpoint - */ - override def clearCheckPoint(): Unit = { - logger.info(s"Clear check Point functionality is not available for Hbase Dataset") - } - - /** - * Clear Checkpoint - */ - override def saveCheckPoint(): Unit = { - logger.info(s"Save check Point functionality is not available for Hbase Dataset") - } -} - -/** - * Custom Exception for HBase API initiation errors - * - * @param message Message to Throw - * @param cause A Throwable Cause - */ -private class DataSetException(message: String, cause: Throwable) - extends RuntimeException(message) { - if (cause != null) { - initCause(cause) - } - - def this(message: String) = this(message, null) -} diff --git a/gimel-dataapi/gimel-connectors/gimel-hbase-1.2/src/main/scala/com/paypal/gimel/hbase/conf/HbaseClientConfiguration.scala b/gimel-dataapi/gimel-connectors/gimel-hbase-1.2/src/main/scala/com/paypal/gimel/hbase/conf/HbaseClientConfiguration.scala deleted file mode 100644 index d9a0d530..00000000 --- a/gimel-dataapi/gimel-connectors/gimel-hbase-1.2/src/main/scala/com/paypal/gimel/hbase/conf/HbaseClientConfiguration.scala +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
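[Editorial sketch] For reference, the removed HBase DataSet dispatched on the gimel.hbase.operation property; a minimal usage sketch mirroring its own scaladoc examples (the dataset name udc.test123 and the dataSet/clientDataFrame handles are illustrative):

    // Read: "get" routes to the Java Get API, anything else (default "scan") to the SHC connector
    val getProps = Map(
      "gimel.hbase.operation"  -> "get",
      "gimel.hbase.get.filter" -> "rowKey=r1:toGet=cf1-c1,c12|cf2-c2")
    val lookupDf = dataSet.read("udc.test123", getProps)

    // Write: "put" routes to the Java Put API, anything else to the SHC connector
    dataSet.write("udc.test123", clientDataFrame, Map("gimel.hbase.operation" -> "put"))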
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.paypal.gimel.hbase.conf - -import scala.collection.immutable.Map -import scala.language.implicitConversions - -import com.paypal.gimel.common.catalog.DataSetProperties -import com.paypal.gimel.common.conf.{GimelConstants, GimelProperties} -import com.paypal.gimel.common.utilities.GenericUtils -import com.paypal.gimel.logger.Logger - -/** - * Gimel Client Configuration for Hbase Dataset Operations. - * - * @param props Hbase Client properties. - */ -class HbaseClientConfiguration(val props: Map[String, Any]) { - - private val logger = Logger() - logger.info(s"Begin Building --> ${this.getClass.getName}") - // logger.info(s"Incoming Properties --> ${props.map(x => s"${x._1} -> ${x._2}").mkString("\n")}") - - // Load Default Prop from Resource File - val pcatProps = GimelProperties() - - // appTag is used to maintain checkpoints & various other factors that are unique to the application - val appTag: String = props.getOrElse(GimelConstants.APP_TAG, "").toString - - // This is the DataSet Properties - val datasetProps: DataSetProperties = props(GimelConstants.DATASET_PROPS).asInstanceOf[DataSetProperties] - val tableProps: Map[String, String] = datasetProps.props - - val clusterName = com.paypal.gimel.common.utilities.DataSetUtils.getYarnClusterName() - val hbaseNameSpaceAndTable = GenericUtils.getValueFailIfEmpty(tableProps, HbaseConfigs.hbaseTableKey, - "HBase table name not found. Please set the property " + HbaseConfigs.hbaseTableKey) - val hbaseTableColumnMapping = tableProps.getOrElse(HbaseConfigs.hbaseColumnMappingKey, "") - val maxSampleRecordsForSchema = GenericUtils.getValue(tableProps, HbaseConfigs.hbaseMaxRecordsForSchema, HbaseConstants.MAX_SAMPLE_RECORDS_FOR_SCHEMA).toInt - val maxColumnsForSchema = GenericUtils.getValue(tableProps, HbaseConfigs.hbaseMaxColumnsForSchema, HbaseConstants.MAX_COLUMNS_FOR_SCHEMA).toInt - // If this property consists of namespace and tablename both separated by colon ":", take the table name by splitting this string - val hbaseTableNamespaceSplit = hbaseNameSpaceAndTable.split(":") - val hbaseTableName = if (hbaseTableNamespaceSplit.length > 1) { - hbaseTableNamespaceSplit(1) - } else { - hbaseNameSpaceAndTable - } - val hbaseNameSpace = tableProps.getOrElse(HbaseConfigs.hbaseNamespaceKey, HbaseConstants.DEFAULT_NAMESPACE) - // If ColumnFamily name needs to be appneded with Column Name in resultant Dataframe - val hbaseColumnNamewithColumnFamilyAppended = tableProps.getOrElse(HbaseConfigs.hbaseColumnNamewithColumnFamilyAppended, "false").toString.toBoolean - // HDFS path for hbase-site.xml - val hbaseSiteXMLHDFSPath = tableProps.getOrElse(HbaseConfigs.hbaseSiteXMLHDFSPathKey, HbaseConstants.NONE_STRING) - val schema: Array[String] = if (datasetProps.fields != null && datasetProps.fields.nonEmpty) { - datasetProps.fields.map(_.fieldName) - } else { - Array.empty[String] - } - - val getOption = tableProps.getOrElse(HbaseConfigs.hbaseFilter, "") - - // Getting Row Key from user otherwise from schema in UDC or hive table. 
If it is not present in schema also, set defaultValue - val hbaseRowKeys = tableProps.getOrElse(HbaseConfigs.hbaseRowKey, HbaseConstants.DEFAULT_ROW_KEY_COLUMN).split(",") - - logger.info(s"Fields Initiated --> ${this.getClass.getFields.map(f => s"${f.getName} --> ${f.get().toString}").mkString("\n")}") - logger.info(s"Completed Building --> ${this.getClass.getName}") - -} - diff --git a/gimel-dataapi/gimel-connectors/gimel-hbase-1.2/src/main/scala/com/paypal/gimel/hbase/conf/HbaseConfigs.scala b/gimel-dataapi/gimel-connectors/gimel-hbase-1.2/src/main/scala/com/paypal/gimel/hbase/conf/HbaseConfigs.scala deleted file mode 100644 index 6edcda3e..00000000 --- a/gimel-dataapi/gimel-connectors/gimel-hbase-1.2/src/main/scala/com/paypal/gimel/hbase/conf/HbaseConfigs.scala +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.paypal.gimel.hbase.conf - -// keys related to HBASE -object HbaseConfigs { - // metastore properties - val hbaseTableKey: String = "gimel.hbase.table.name" - val hbaseColumnMappingKey: String = "gimel.hbase.columns.mapping" - val hbaseNamespaceKey = "gimel.hbase.namespace.name" - - // misc properties for read/write - val hbaseStorageHandler: String = "org.apache.hadoop.hive.hbase.HBaseStorageHandler" - val hbaseOperation: String = "gimel.hbase.operation" - val hbaseFilter: String = "gimel.hbase.get.filter" - val hbaseRowKey: String = "gimel.hbase.rowkey" - val hbaseColumnNamewithColumnFamilyAppended: String = "gimel.hbase.colName.with.cfName.appended" - val hbaseSiteXMLHDFSPathKey: String = "gimel.hbase.site.xml.hdfs.path" - val hbaseMaxRecordsForSchema: String = "gimel.hbase.schema.max.records" - val hbaseMaxColumnsForSchema: String = "gimel.hbase.schema.max.columns" - -} - - diff --git a/gimel-dataapi/gimel-connectors/gimel-hbase-1.2/src/main/scala/com/paypal/gimel/hbase/conf/HbaseConstants.scala b/gimel-dataapi/gimel-connectors/gimel-hbase-1.2/src/main/scala/com/paypal/gimel/hbase/conf/HbaseConstants.scala deleted file mode 100644 index 1eb3c8f0..00000000 --- a/gimel-dataapi/gimel-connectors/gimel-hbase-1.2/src/main/scala/com/paypal/gimel/hbase/conf/HbaseConstants.scala +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
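[Editorial sketch] To make the removed configuration keys concrete, a sketch of the table properties HbaseClientConfiguration consumed; the values are illustrative, and the gimel.hbase.columns.mapping format lives in HBaseUtilities, which is not shown in this diff:

    val tableProps = Map(
      "gimel.hbase.table.name"     -> "my_namespace:my_table", // required; split on ':' into namespace and table name
      "gimel.hbase.namespace.name" -> "my_namespace",          // defaults to "default"
      "gimel.hbase.rowkey"         -> "id",                    // comma-separated row-key columns, defaults to "rowKey"
      "gimel.hbase.colName.with.cfName.appended" -> "false"    // if true, result columns are prefixed with their column family
    )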
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.paypal.gimel.hbase.conf - -object HbaseConstants { - // basic variable references - val DEFAULT_ROW_KEY_COLUMN = "rowKey" - val DEFAULT_NAMESPACE = "default" - - val SCAN_OPERATION = "scan" - val GET_OPERATION = "get" - val PUT_OPERATION = "put" - - val NONE_STRING = "NONE" - - val MAX_SAMPLE_RECORDS_FOR_SCHEMA = "1000" - val MAX_COLUMNS_FOR_SCHEMA = "100000" -} diff --git a/gimel-dataapi/gimel-connectors/gimel-hbase-1.2/src/main/scala/com/paypal/gimel/hbase/utilities/HBaseCatalog.scala b/gimel-dataapi/gimel-connectors/gimel-hbase-1.2/src/main/scala/com/paypal/gimel/hbase/utilities/HBaseCatalog.scala deleted file mode 100644 index 10b75758..00000000 --- a/gimel-dataapi/gimel-connectors/gimel-hbase-1.2/src/main/scala/com/paypal/gimel/hbase/utilities/HBaseCatalog.scala +++ /dev/null @@ -1,184 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.paypal.gimel.hbase.utilities - -import org.apache.spark.sql.types.StructType - -import com.paypal.gimel.logger.Logger - -/** - * This is a Template Implementation to Create a Catalog for HBASE Spark Connector - * Take a variety of parameters for mapping a DataFrame to its HBASE equivalent - * 1. NameSpace - * 2. TableName - * 3. Keys - * 4. Column Families with a list of columns to put in each column family - * 5. 
TableCoder - */ - - -object HBaseCatalog { - val logger = Logger() - - val holderNameSpace = "" - val holderTableName = "" - val holderTableCoder = "" - val holderKey = "" - val holderKeysAsCols = "" - val holderColumns = "" - val catalogTemplate: String = - s"""|{"table":{"namespace":"$holderNameSpace", "name":"$holderTableName", "tableCoder":"$holderTableCoder"}, - |"rowkey":"$holderKey", - |"columns":{ - |$holderKeysAsCols, - |$holderColumns - |} - |} - """.stripMargin - - /** - * This function creates fields as String for Catalog with column Family appending with Column Name - * - * @param fields Hbase Table namespace - * @param columnFamily Hbase table name - */ - def fieldsAsStringForCataLogAppendColumnFamily(fields: Array[String], columnFamily: String = "rowkey"): String = { - - var lengthString = "" - fields.map { - eachKey => - val hbaseCol = if (columnFamily == "rowkey") { - lengthString = """, "length":"50"""" - eachKey - } else eachKey - s""""$columnFamily""" + s"""_$eachKey":{"cf":"$columnFamily", "col":"$hbaseCol", "type":"string"$lengthString}""" - }.mkString("", ",\n", "") - } - - - /** - * This function creates fields as String for Catalog - * - * @param fields Hbase Table namespace - * @param columnFamily Hbase table name - */ - def fieldsAsStringForCataLog(fields: Array[String], columnFamily: String = "rowkey"): String = { - - var lengthString = "" - fields.map { - eachKey => - val hbaseCol = if (columnFamily == "rowkey") { - lengthString = """, "length":"50"""" - eachKey - } else eachKey - s""""$eachKey":{"cf":"$columnFamily", "col":"$hbaseCol", "type":"string"$lengthString}""" - }.mkString("", ",\n", "") - } - - /** - * This function creates a catalog for hbase table with single column family - * - * @param nameSpace Hbase Table namespace - * @param tableName Hbase table name - * @param dfSchema Array of columns in dataframe - * @param keys Array of row key columns - * @param columnFamily - * @param tableCoder - * @return String - */ - def apply(nameSpace: String, tableName: String, dfSchema: Array[String], keys: Array[String], columnFamily: String, tableCoder: String = "PrimitiveType"): String = { - val key = keys.mkString(":") - val keysAsCols = fieldsAsStringForCataLog(keys) - val columns = dfSchema.diff(keys) - val colsAsCols = fieldsAsStringForCataLog(columns, columnFamily) - val catalogString = catalogTemplate. 
- replaceAllLiterally(holderNameSpace, nameSpace) - .replaceAllLiterally(holderTableName, tableName) - .replaceAllLiterally(holderTableCoder, tableCoder) - .replaceAllLiterally(holderKey, key) - .replaceAllLiterally(holderColumns, colsAsCols) - .replaceAllLiterally(holderKeysAsCols, keysAsCols) - catalogString - } - - /** - * This function creates a catalog for hbase table with multiple column family - * - * @param nameSpace Hbase Table namespace - * @param tableName Hbase table name - * @param cfColsMap Map[Column Family -> Array[Column Names ] ] - * @param keys Array of row key columns - * @param tableCoder - * @return String - */ - - def apply(nameSpace: String, tableName: String, cfColsMap: Map[String, Array[String]], keys: Array[String], tableCoder: String, readWithColumnFamily: Boolean): String = { - val key = keys.mkString(":") - val keysAsCols = if (readWithColumnFamily) { - fieldsAsStringForCataLogAppendColumnFamily(keys) - } else { - fieldsAsStringForCataLog(keys) - } - val colsAsCols = if (readWithColumnFamily) { - cfColsMap.map { x => fieldsAsStringForCataLogAppendColumnFamily(x._2.diff(keys), x._1) }.mkString("", ",\n", "") - } - else { - cfColsMap.map { x => fieldsAsStringForCataLog(x._2.diff(keys), x._1) }.mkString("", ",\n", "") - } - val catalogString = catalogTemplate. - replaceAllLiterally(holderNameSpace, nameSpace) - .replaceAllLiterally(holderTableName, tableName) - .replaceAllLiterally(holderTableCoder, tableCoder) - .replaceAllLiterally(holderKey, key) - .replaceAllLiterally(holderColumns, colsAsCols) - .replaceAllLiterally(holderKeysAsCols, keysAsCols) - logger.info(catalogString) - logger.info("catalog is --> " + catalogString) - catalogString - } - - /** - * This function creates a catalog for hbase table with single column family from a dataframe schema - * - * @param nameSpace Hbase Table namespace - * @param tableName Hbase table name - * @param dfSchema Dataframe Schema - * @param keys Array of row key columns - * @param columnFamily - * @param tableCoder - * @return String - */ - def apply(nameSpace: String, tableName: String, dfSchema: StructType, keys: Array[String], columnFamily: String, tableCoder: String): String = { - val key = keys.mkString(":") - val keysAsCols = fieldsAsStringForCataLog(keys) - val columns = dfSchema.fieldNames.diff(keys) - val colsAsCols = fieldsAsStringForCataLog(columns, columnFamily) - val catalogString = catalogTemplate. - replaceAllLiterally(holderNameSpace, nameSpace) - .replaceAllLiterally(holderTableName, tableName) - .replaceAllLiterally(holderTableCoder, tableCoder) - .replaceAllLiterally(holderKey, key) - .replaceAllLiterally(holderColumns, colsAsCols) - .replaceAllLiterally(holderKeysAsCols, keysAsCols) - logger.info(catalogString) - logger.info("catalog is --> " + catalogString) - catalogString - } -} diff --git a/gimel-dataapi/gimel-connectors/gimel-hbase-1.2/src/main/scala/com/paypal/gimel/hbase/utilities/HBaseLookUp.scala b/gimel-dataapi/gimel-connectors/gimel-hbase-1.2/src/main/scala/com/paypal/gimel/hbase/utilities/HBaseLookUp.scala deleted file mode 100644 index c5b12b43..00000000 --- a/gimel-dataapi/gimel-connectors/gimel-hbase-1.2/src/main/scala/com/paypal/gimel/hbase/utilities/HBaseLookUp.scala +++ /dev/null @@ -1,231 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
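[Editorial sketch] To make the removed catalog template concrete, here is roughly what the single-column-family HBaseCatalog.apply produced; the namespace, table and column names are illustrative:

    val catalog = HBaseCatalog("my_namespace", "my_table",
      dfSchema = Array("id", "name", "age"),
      keys = Array("id"),
      columnFamily = "cf1")  // tableCoder defaults to "PrimitiveType"
    // renders roughly as:
    // {"table":{"namespace":"my_namespace", "name":"my_table", "tableCoder":"PrimitiveType"},
    //  "rowkey":"id",
    //  "columns":{
    //   "id":{"cf":"rowkey", "col":"id", "type":"string", "length":"50"},
    //   "name":{"cf":"cf1", "col":"name", "type":"string"},
    //   "age":{"cf":"cf1", "col":"age", "type":"string"}
    //  }}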
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.paypal.gimel.hbase.utilities - -import scala.collection.JavaConverters._ -import scala.collection.immutable.{Iterable, Map} - -import org.apache.hadoop.hbase.{CellUtil, HBaseConfiguration, TableName} -import org.apache.hadoop.hbase.client.{ConnectionFactory, Get, Result} -import org.apache.hadoop.hbase.util.Bytes -import org.apache.spark.sql.{DataFrame, SparkSession, SQLContext} -import spray.json._ -import spray.json.DefaultJsonProtocol._ - -import com.paypal.gimel.hbase.conf.{HbaseClientConfiguration, HbaseConfigs} -import com.paypal.gimel.logger.Logger - -object HBaseLookUp { - - def apply(sparkSession: SparkSession): HBaseLookUp = new HBaseLookUp(sparkSession) - -} - -class HBaseLookUp(sparkSession: SparkSession) { - - val logger = Logger() - - /** - * This function reads all or given columns in column family for a rowKey specified by user - * - * @param dataset Name - * @param dataSetProps - * props is the way to set various additional parameters for read and write operations in DataSet class - * Example Usecase : Hbase lookup for rowKey=r1 and columns c1, c12 of column family cf1 and c2 of cf2 - * val options: Map[String, Any] = Map("operation"->"get","filter"->"rowKey=r1:toGet=cf1-c1,c12|cf2-c2") - * val recsDF = dataSet.read("udc.test123", options); - * @return DataFrame - */ - def get(dataset: String, dataSetProps: Map[String, Any]): DataFrame = { - try { - - // Hbase configuration - val conf = new HbaseClientConfiguration(dataSetProps) - var options = Map.empty[String, String] - try { - if (conf.getOption.isEmpty) { - throw new IllegalArgumentException( - s""" - | HBase get filter condition not found. Please set the property ${HbaseConfigs.hbaseFilter}. - | Example: rowKey=1:toGet=personal - | where personal is the column family name - |""".stripMargin) - } - options = conf.getOption.split(":").map { x => x.split("=")(0) -> x.split("=")(1) }.toMap - if (!options.contains("rowKey")) { - throw new IllegalArgumentException( - s""" - | rowKey not present in the filter condition. Please check the property ${HbaseConfigs.hbaseFilter}. - | Examples: rowKey=1:toGet=personal - | where personal is the column family name - |""".stripMargin) - } - } catch { - case ex: Throwable => - logger.error( - s""" - | Unable to parse the filter condition. 
Please check the property ${HbaseConfigs.hbaseFilter} - | Example: rowKey=1:toGet=personal - | where personal is the column family name - |""".stripMargin) - ex.printStackTrace() - throw ex - } - - val rowKey = options("rowKey") - - val dataFromHBASE: Map[String, String] = if (!options.contains("toGet")) { - getColumnsInRowKey(conf.hbaseNameSpace + ":" + conf.hbaseTableName, rowKey) - } else { - val cfsAndCols = options("toGet") - // (Column family to Array[Columns]) mapping specified by user in toGet - val cfsSets: Map[String, Array[String]] = cfsAndCols.split('|').map { x => - if (x.split("-").length > 1) x.split('-')(0) -> x.split('-')(1).split(',') else x.split('-')(0) -> null - }.toMap - getColumnsInRowKey(conf.hbaseNameSpace + ":" + conf.hbaseTableName, rowKey, cfsSets) - } - val hbaseDataJSON = dataFromHBASE.toJson.compactPrint - val hbaseDf = jsonStringToDF(sparkSession, hbaseDataJSON) - hbaseDf - } catch { - case ex: Throwable => - ex.printStackTrace() - logger.error(s"Unable to get data from HBase table.") - throw ex - } - } - - /** - * Returns all/specified columns in column family for a rowKey specified by user - * - * @param hbaseTable Name of the Data Set - * @param rowKey row Key for the lookup - * @param cfsSets User Specified column family and columns - * @return Map[Column -> Column Value ] - */ - def getColumnsInRowKey(hbaseTable: String, rowKey: String, cfsSets: Map[String, Array[String]]): Map[String, String] = { - val k: Iterable[Map[String, String]] = cfsSets.map { x => - val cf1 = x._1 - val cols = x._2 - val hbaseData = getColumnsInFamily(hbaseTable, rowKey, cf1, cols) - hbaseData - } - val foldedMap: Map[String, String] = k.tail.foldLeft(k.head)((x, y) => x ++ y) - foldedMap - } - - /** - * Converts a String to DataFrame - * - * @param sqlCntxt SQLContext - * @param string Input String (must be JSON Format) - */ - def jsonStringToDF(sqlCntxt: SQLContext, string: String): DataFrame = { - val rdd = sqlCntxt.sparkContext.parallelize(Seq(string)) - sqlCntxt.read.json(rdd) - } - - /** - * Converts a String to DataFrame - * - * @param sparkSession : SparkSession - * @param string Input String (must be JSON Format) - */ - def jsonStringToDF(sparkSession: SparkSession, string: String): DataFrame = { - val rdd = sparkSession.sparkContext.parallelize(Seq(string)) - sparkSession.read.json(rdd) - } - - /** - * Returns Column Value for each column in a column family - * - * @param hbaseTable HBASE Table Name - * @param rowKey Row Key - * @param columnFamily Column Family Name - * @param columns Array of Column Names - * @return Map[Column -> Column Value ] - */ - def getColumnsInFamily(hbaseTable: String, rowKey: String, columnFamily: String, columns: Array[String]): Map[String, String] = { - try { - val hbaseColumnFamily: Array[Byte] = Bytes.toBytes(columnFamily) - val hTable = TableName.valueOf(hbaseTable) - val rowKeyBytes = Bytes.toBytes(rowKey) - val getRowKey: Get = new Get(rowKeyBytes) - // Configure And Connect - val conf = HBaseConfiguration.create() - val cnxn = ConnectionFactory.createConnection(conf) - // Get Operation - val tbl = cnxn.getTable(hTable) - val k: Result = tbl.get(getRowKey) - - // Get Column values of each column as Map of [Column Name -> Column Value] - val allColumns: Map[String, String] = columns match { - // If user specifies only column family, get all the columns in that column family otherwise get specified columns - case null => - k.getFamilyMap(Bytes.toBytes(columnFamily)).asScala.map(x => (Bytes.toString(x._1), Bytes.toString(x._2))).toMap 
- case _ => - // Columns Bytes - val hbaseColumns = columns.map(Bytes.toBytes) - // Mapping Cf with Columns into single collection - val cfAndColumns: Array[(Array[Byte], Array[Byte])] = hbaseColumns.map((hbaseColumnFamily, _)) - // Return requested Columns and their values in a Map - val allColumns = cfAndColumns.map { x => - Bytes.toString(x._2) -> Bytes.toString(k.getValue(x._1, x._2)) - }.toMap - allColumns - } - allColumns - } catch { - case ex: Throwable => - ex.printStackTrace() - throw ex - } - - } - - /** - * Returns all columns in all column families for a rowKey specified by user - * - * @param hbaseTable Name of the Data Set - * @param rowKey row Key for the lookup - * @return Map[Column -> Column Value ] - */ - def getColumnsInRowKey(hbaseTable: String, rowKey: String): Map[String, String] = { - try { - val hTable = TableName.valueOf(hbaseTable) - val rowKeyBytes = Bytes.toBytes(rowKey) - val getRowKey: Get = new Get(rowKeyBytes) - // Configure And Connect - val conf = HBaseConfiguration.create() - val cnxn = ConnectionFactory.createConnection(conf) - // Get Operation - val tbl = cnxn.getTable(hTable) - val k: Result = tbl.get(getRowKey) - val columnsVals = k.rawCells().map(cell => (Bytes.toString(CellUtil.cloneQualifier(cell)), Bytes.toString(CellUtil.cloneValue(cell)))).toMap - tbl.close() - columnsVals - } catch { - case ex: Throwable => - ex.printStackTrace() - throw ex - } - } - -} diff --git a/gimel-dataapi/gimel-connectors/gimel-hbase-1.2/src/main/scala/com/paypal/gimel/hbase/utilities/HBasePut.scala b/gimel-dataapi/gimel-connectors/gimel-hbase-1.2/src/main/scala/com/paypal/gimel/hbase/utilities/HBasePut.scala deleted file mode 100644 index d7bb489e..00000000 --- a/gimel-dataapi/gimel-connectors/gimel-hbase-1.2/src/main/scala/com/paypal/gimel/hbase/utilities/HBasePut.scala +++ /dev/null @@ -1,109 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
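[Editorial sketch] A small sketch of how the removed HBaseLookUp decoded the gimel.hbase.get.filter string, lifted directly from the deleted parsing logic:

    val filter = "rowKey=r1:toGet=cf1-c1,c12|cf2-c2"
    val options = filter.split(":").map(x => x.split("=")(0) -> x.split("=")(1)).toMap
    // Map("rowKey" -> "r1", "toGet" -> "cf1-c1,c12|cf2-c2")

    val cfsSets: Map[String, Array[String]] = options("toGet").split('|').map { x =>
      if (x.split("-").length > 1) x.split('-')(0) -> x.split('-')(1).split(',')
      else x.split('-')(0) -> null
    }.toMap
    // Map("cf1" -> Array("c1", "c12"), "cf2" -> Array("c2"));
    // leaving toGet out returns every column of every column family for that row key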
- */ - -package com.paypal.gimel.hbase.utilities - -import org.apache.hadoop.hbase.{HBaseConfiguration, TableName} -import org.apache.hadoop.hbase.client.{ConnectionFactory, Put} -import org.apache.hadoop.hbase.util.Bytes -import org.apache.spark.sql.{DataFrame, SparkSession} - -import com.paypal.gimel.hbase.conf.{HbaseClientConfiguration, HbaseConfigs} -import com.paypal.gimel.logger.Logger - -object HBasePut { - - def apply(sparkSession: SparkSession): HBasePut = new HBasePut(sparkSession) - -} - -class HBasePut(sparkSession: SparkSession) { - val logger = Logger() - lazy val hbaseUtilities = HBaseUtilities(sparkSession) - - /** - * This function performs put(insert/update) operation on each row of dataframe - * - * @param dataset Name - * @param dataFrame The Dataframe to write into Target - * @param dataSetProps - * props is the way to set various additional parameters for read and write operations in DataSet class - * Example Usecase : Hbase put - * val props = Map("operation" -> "put") - * val recsDF = dataSet.write("pcatalog.test123", df, options); - * @return DataFrame - */ - def put(dataset: String, dataFrame: DataFrame, dataSetProps: Map[String, Any]): DataFrame = { - try { - // Hbase configuration - val conf = new HbaseClientConfiguration(dataSetProps) - // Getting (Column family -> Array[Columns]) mapping - val columnFamilyToColumnMapping: Map[String, Array[String]] = hbaseUtilities.getColumnMappingForColumnFamily(conf.hbaseNameSpace, - conf.hbaseTableName, - conf.hbaseTableColumnMapping, - conf.maxSampleRecordsForSchema, - conf.maxColumnsForSchema) - logger.info("Column mapping -> " + columnFamilyToColumnMapping) - // Converting columnFamilyToColumnMapping to a map of (Column -> Column Family) - val columnToColumnFamilyMapping = columnFamilyToColumnMapping.flatMap(cfCols => cfCols._2.map(col => (col, cfCols._1))) - // Create Put object for each row in dataframe - putRows(conf.hbaseNameSpace + ":" + conf.hbaseTableName, dataFrame, conf.hbaseRowKeys.mkString(":"), dataFrame.columns, columnToColumnFamilyMapping) - dataFrame - } catch { - case ex: Throwable => - ex.printStackTrace() - logger.error(s"Unable to put data into HBase table.") - throw ex - } - } - - /** - * - * @param hbaseTable Hbase Table Name - * @param dataFrame The Dataframe to put into Target - * @param rowKeyColumn Name of the row Key column in hive table - * @param columns Array of Columns to be put - * @param cfColsMap Map of (Column -> Column Family) - */ - def putRows(hbaseTable: String, dataFrame: DataFrame, rowKeyColumn: String, columns: Array[String], cfColsMap: Map[String, String]) { - try { - // Configure And Connect - val conf = HBaseConfiguration.create() - val cnxn = ConnectionFactory.createConnection(conf) - // Create Connection to HBase table - val tbl = cnxn.getTable(TableName.valueOf(hbaseTable)) - val rows = dataFrame.rdd.map { row => - (row.getAs(rowKeyColumn).toString, - columns.map(eachCol => (cfColsMap.getOrElse(eachCol, ""), eachCol, row.getAs(eachCol).asInstanceOf[String])) - ) - }.collect() - // Performing put operation on each row of dataframe - rows.foreach { row => - val putRow: Put = new Put(Bytes.toBytes(row._1.asInstanceOf[String])) - row._2.foreach(x => if (x._2 != rowKeyColumn) putRow.addColumn(Bytes.toBytes(x._1), Bytes.toBytes(x._2), Bytes.toBytes(x._3))) - tbl.put(putRow) - } - tbl.close() - } catch { - case ex: Throwable => - ex.printStackTrace() - throw ex - } - } -} diff --git 
a/gimel-dataapi/gimel-connectors/gimel-hbase-1.2/src/main/scala/com/paypal/gimel/hbase/utilities/HBaseScanner.scala b/gimel-dataapi/gimel-connectors/gimel-hbase-1.2/src/main/scala/com/paypal/gimel/hbase/utilities/HBaseScanner.scala deleted file mode 100644 index 9049e93d..00000000 --- a/gimel-dataapi/gimel-connectors/gimel-hbase-1.2/src/main/scala/com/paypal/gimel/hbase/utilities/HBaseScanner.scala +++ /dev/null @@ -1,213 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.paypal.gimel.hbase.utilities - -import org.apache.commons.lang.StringEscapeUtils -import org.apache.hadoop.hbase.{CellUtil, HBaseConfiguration, TableName} -import org.apache.hadoop.hbase.client.{Connection, ConnectionFactory, Scan} -import org.apache.hadoop.hbase.filter.PageFilter -import org.apache.hadoop.hbase.util.Bytes -import scala.collection.JavaConverters._ - -import com.paypal.gimel.common.utilities.GenericUtils -import com.paypal.gimel.logger.Logger - -object HBaseScanner { - - def apply(): HBaseScanner = new HBaseScanner() - -} - -class HBaseScanner() { - - val logger = Logger(this.getClass) - - /** - * Returns schema of hbase table - * - * @param connection - * @param namespace - * @param tableName - * @param maxResults : Number of maximum records to be scanned - * @return Map of [Column Family -> Array[Columns] ] - */ - def getSchema(connection: Connection, namespace: String, tableName: String, rowKey: String, maxResults: Int): Map[String, Array[String]] = { - val table: TableName = TableName.valueOf(namespace + ":" + tableName) - val tbl = connection.getTable(table) - // INITIATE SCANNER - val scan = new Scan() - - // Setting the Page Filter to retrieve pageSize records from each region server - val pageSize = getPageSize(connection, table, maxResults) - logger.info("Setting the pageSize = " + pageSize) - val filter = new PageFilter(maxResults) - scan.setFilter(filter) - - var count = 0 - // Iterate through all the records retrieved from HBase and get column family and column names - GenericUtils.withResources(tbl.getScanner(scan)) { scanner => - val res = scanner.iterator().asScala.flatMap { result => - count = count + 1 - val cells = result.listCells().iterator().asScala - cells.map(cell => (Bytes.toString(CellUtil.cloneFamily(cell)), Bytes.toString(CellUtil.cloneQualifier(cell)))).toList - }.toList.distinct.groupBy(_._1).map(x => (x._1, x._2.map(p => p._2).toArray)) - logger.info(s"Records Count for ${tableName} : " + count) - val rowKeyMap = Map("rowKey" -> Array(rowKey)) - rowKeyMap ++ res - } - } - - /** - * Returns schema of hbase table with specified maximum number of columns and result size - * - * @param connection - * @param namespace - * @param tableName - * @param maxResults : Number of 
maximum records to be scanned - * @param maxColumns : Number of maximum columns to be scanned - * @param maxResultSize : Maximum result size in bytes - * @return Map of [Column Family -> Array[Columns] ] - */ - def getSchema(connection: Connection, namespace: String, tableName: String, maxResults: Int, maxColumns: Int, maxResultSize : Long): Map[String, Array[String]] = { - val table: TableName = TableName.valueOf(namespace + ":" + tableName) - val tbl = connection.getTable(table) - // INITIATE SCANNER - val scan = new Scan() - - // Setting the Page Filter to retrieve pageSize records from each region server - val pageSize = getPageSize(connection, table, maxResults) - logger.info("Setting the pageSize = " + pageSize) - val fil = new PageFilter(maxResults) - scan.setFilter(fil) - // Setting the maximum result size in bytes - scan.setMaxResultSize(maxResultSize) - - var count = 0 - var columnsCount = 0 - // Iterate through all the records retrieved from HBase and get column family and column names - GenericUtils.withResources(tbl.getScanner(scan)) { scanner => - val res = scanner.iterator().asScala.takeWhile(_ => columnsCount < maxColumns).flatMap { result => - count = count + 1 - val cells = result.listCells() - columnsCount = cells.size() - val cellsItr = cells.iterator().asScala - // Escape each column family and column in case of any special characters - cellsItr.map(cell => (StringEscapeUtils.escapeJava(Bytes.toString(CellUtil.cloneFamily(cell))), - StringEscapeUtils.escapeJava(Bytes.toString(CellUtil.cloneQualifier(cell))))).toList - }.toList.distinct.groupBy(_._1).map(x => (x._1, x._2.map(p => p._2).toArray)) - logger.info(s"Records Count for ${tableName} : " + count) - res - } - } - - /** - * Returns schema of hbase table with specified maximum number of columns - * - * @param connection - * @param namespace - * @param tableName - * @param maxResults : Number of maximum records to be scanned - * @param maxColumns : Number of maximum columns to be scanned - * @return Map of [Column Family -> Array[Columns] ] - */ - def getSchema(connection: Connection, namespace: String, tableName: String, maxResults: Int, maxColumns: Int): Map[String, Array[String]] = { - val table: TableName = TableName.valueOf(namespace + ":" + tableName) - val tbl = connection.getTable(table) - // INITIATE SCANNER - val scan = new Scan() - - // Setting the Page Filter to retrieve pageSize records from each region server - val pageSize = getPageSize(connection, table, maxResults) - logger.info("Setting the pageSize = " + pageSize) - val filter = new PageFilter(pageSize) - scan.setFilter(filter) - - var count = 0 - var columnsCount = 0 - // Iterate through all the records retrieved from HBase and get column family and column names - GenericUtils.withResources(tbl.getScanner(scan)) { scanner => - val res = scanner.iterator().asScala.takeWhile(_ => columnsCount < maxColumns).flatMap { result => - count = count + 1 - val cells = result.listCells() - columnsCount = cells.size() - val cellsItr = cells.iterator().asScala - // Escape each column family and column in case of any special characters - cellsItr.map(cell => (StringEscapeUtils.escapeJava(Bytes.toString(CellUtil.cloneFamily(cell))), - StringEscapeUtils.escapeJava(Bytes.toString(CellUtil.cloneQualifier(cell))))).toList - }.toList.distinct.groupBy(_._1).map(x => (x._1, x._2.map(p => p._2).toArray)) - logger.info(s"Records Count for ${tableName} : " + count) - res - } - } - - /** - * Returns page size based on the number of regions and maxResults size - * - * 
@param connection - * @param table - * @param maxResults : Number of maximum records to be scanned - * @return Page Size - */ - def getPageSize(connection: Connection, table: TableName, maxResults: Int): Int = { - // Getting total region servers to decide the PageFilter size - val regionLocator = connection.getRegionLocator(table) - val numRegionServers = regionLocator.getAllRegionLocations().asScala.map(eachRegion => eachRegion.getHostname()).distinct.size - if (numRegionServers == 0) { - 0 - } else { - Math.max(maxResults / numRegionServers, 1) - } - } - - /** - * Returns schema of hbase table by creating a connection - * - * @param namespace : Name of hbase name space - * @param tableName : Name of hbase table - * @param maxResults : Number of maximum records to be scanned - * @param maxColumns : Number of maximum columns to be scanned - * @return Map of [Column Family -> Array[Columns] ] - */ - def getSchema(namespace: String, tableName: String, maxResults: Int, maxColumns: Int): Map[String, Array[String]] = { - val conf = HBaseConfiguration.create() - GenericUtils.withResources(ConnectionFactory.createConnection(conf)) { - connection => - getSchema(connection, namespace, tableName, maxResults, maxColumns) - } - } - - /** - * Returns schema of hbase table by creating a connection with specified maximum number of columns - * - * @param namespace - * @param tableName - * @param maxResults : Number of maximum records to be scanned - * @param maxColumns : Number of maximum columns to be scanned - * @param maxResultSize : Maximum result size in bytes - * @return Map of [Column Family -> Array[Columns] ] - */ - def getSchema(namespace: String, tableName: String, maxResults: Int, maxColumns: Int, maxResultSize : Long): Map[String, Array[String]] = { - val conf = HBaseConfiguration.create() - GenericUtils.withResources(ConnectionFactory.createConnection(conf)) { - connection => - getSchema(connection, namespace, tableName, maxResults, maxColumns, maxResultSize) - } - } -} diff --git a/gimel-dataapi/gimel-connectors/gimel-hbase-1.2/src/main/scala/com/paypal/gimel/hbase/utilities/HBaseSparkConnector.scala b/gimel-dataapi/gimel-connectors/gimel-hbase-1.2/src/main/scala/com/paypal/gimel/hbase/utilities/HBaseSparkConnector.scala deleted file mode 100644 index 13669183..00000000 --- a/gimel-dataapi/gimel-connectors/gimel-hbase-1.2/src/main/scala/com/paypal/gimel/hbase/utilities/HBaseSparkConnector.scala +++ /dev/null @@ -1,242 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.paypal.gimel.hbase.utilities - -import org.apache.spark.sql.{DataFrame, SparkSession} -import org.apache.spark.sql.execution.datasources.hbase.{HBaseRelation, HBaseTableCatalog} - -import com.paypal.gimel.common.storageadmin.HBaseAdminClient -import com.paypal.gimel.hbase.conf.{HbaseClientConfiguration, HbaseConfigs, HbaseConstants} -import com.paypal.gimel.logger.Logger - -/** - * Spark Hbase Connector by Hortonworks implementations internal to Gimel - */ -object HBaseSparkConnector { - - def apply(sparkSession: SparkSession): HBaseSparkConnector = new HBaseSparkConnector(sparkSession) - -} - -class HBaseSparkConnector(sparkSession: SparkSession) { - val logger = Logger() - lazy val hbaseUtilities = HBaseUtilities(sparkSession) - - /** - * This function performs scan/bulkGet on hbase table - * - * @param dataset Name - * @param dataSetProps - * props is the way to set various additional parameters for read and write operations in DataSet class - * Example Usecase : to get 10 factor parallelism (specifically) - * val props = Map("coalesceFactor" -> 10) - * val data = Dataset(sc).read("flights", props) - * data.coalesce(props.get("coalesceFactor")) - * @return DataFrame - */ - def read(dataset: String, dataSetProps: Map[String, Any] = Map.empty): DataFrame = { - try { - - val conf = new HbaseClientConfiguration(dataSetProps) - // Setting the map (Column family -> Array of columns) - val columnFamilyToColumnMapping: Map[String, Array[String]] = hbaseUtilities.getColumnMappingForColumnFamily(conf.hbaseNameSpace, - conf.hbaseTableName, - conf.hbaseTableColumnMapping, - conf.maxSampleRecordsForSchema, - conf.maxColumnsForSchema) - logger.info("Column mapping -> " + columnFamilyToColumnMapping) - // Get the hbase-site.xml file location - val hbaseConfigFileLocation = HBaseAdminClient.getHbaseSiteXml(conf.hbaseSiteXMLHDFSPath) - // Create catalog for the SHC connector - val catalog = HBaseCatalog(conf.hbaseNameSpace, conf.hbaseTableName, columnFamilyToColumnMapping, conf.hbaseRowKeys, - "PrimitiveType", conf.hbaseColumnNamewithColumnFamilyAppended) - logger.info(s"Reading with catalog --> $catalog") - - val dataframe = conf.hbaseSiteXMLHDFSPath match { - case HbaseConstants.NONE_STRING => - readWithCatalog(catalog) - case _ => - readWithCatalog(catalog, hbaseConfigFileLocation) - } - dataframe - } catch { - case ex: Throwable => - ex.printStackTrace() - logger.error(s"Unable to read data from HBase table.") - throw ex - } - } - - /** - * This function reads data from HBase with catalog string. - * - * @param catalog - * @return - */ - private def readWithCatalog(catalog: String): DataFrame = { - try { - sparkSession - .read - .options(Map((HBaseTableCatalog.tableCatalog, catalog))) - .format("org.apache.spark.sql.execution.datasources.hbase") - .load() - } - catch { - case ex: Throwable => - ex.printStackTrace() - throw ex - } - } - - /** - * This function reads data from HBase with catalog string. 
- * - * @param catalog - * @param hbaseConfigFileLocation The HBASE Configuration File : hbase-site.xml - * @return DataFrame - */ - private def readWithCatalog(catalog: String, hbaseConfigFileLocation: String): DataFrame = { - try { - sparkSession - .read - .options(Map((HBaseTableCatalog.tableCatalog, catalog), (HBaseRelation.HBASE_CONFIGFILE, hbaseConfigFileLocation))) - .format("org.apache.spark.sql.execution.datasources.hbase") - .load() - } catch { - case ex: Throwable => - ex.printStackTrace() - throw ex - } - } - - /** - * This function performs bulk write into hbase table - * - * @param dataset Name - * @param dataFrame The Dataframe to write into Target - * @param dataSetProps - * Example Usecase : we want only 1 executor for hbase (specifically) - * val props = Map("coalesceFactor" -> 1) - * Dataset(sc).write(clientDataFrame, props) - * Inside write implementation :: dataFrame.coalesce(props.get("coalesceFactor")) - * @return DataFrame - */ - - def write(dataset: String, dataFrame: DataFrame, dataSetProps: Map[String, Any]): DataFrame = { - try { - val conf = new HbaseClientConfiguration(dataSetProps) - - if (conf.hbaseRowKeys.diff(dataFrame.columns.toSeq).nonEmpty) { - throw new IllegalArgumentException( - s""" - |Row Key columns not found in input dataframe. - |You can modify the value through ${HbaseConfigs.hbaseRowKey} parameter. - |Note: Default value is first column of the schema from UDC or ${HbaseConstants.DEFAULT_ROW_KEY_COLUMN}. - |""".stripMargin) - } - // Get columns in dataframe excluding row key columns - val dfColumns = dataFrame.columns.filter(x => !conf.hbaseRowKeys.contains(x)).toSeq - logger.info("Columns in dataframe -> " + dfColumns) - // Setting (Column family -> array of columns) mapping - val columnFamilyToColumnMapping: Map[String, Array[String]] = hbaseUtilities.getColumnMappingForColumnFamily(conf.hbaseNameSpace, - conf.hbaseTableName, - conf.hbaseTableColumnMapping, - conf.maxSampleRecordsForSchema, - conf.maxColumnsForSchema) - logger.info("Column mapping -> " + columnFamilyToColumnMapping) - val columnsInSchema = columnFamilyToColumnMapping.map(_._2).flatten.toSeq - logger.info("Columns in schema : " + columnsInSchema) - // Check what columns in the input hbase column mapping are not present in the input dataframe - val diff = columnsInSchema.diff(dfColumns) - if (diff.nonEmpty) { - throw new IllegalArgumentException( - s""" - |Columns : ${diff.mkString(",")} not found in dataframe schema. - |Please check the property : ${HbaseConfigs.hbaseColumnMappingKey} = ${conf.hbaseTableColumnMapping} - |""".stripMargin - ) - } - // Select columns provided in gimel.hbase.column.mapping property and row keys from the input dataframe. 
-      val dataFrameToWrite = dataFrame.selectExpr(columnsInSchema ++ conf.hbaseRowKeys: _*)
-      // Get the hbase-site.xml file location
-      val hbaseConfigFileLocation = HBaseAdminClient.getHbaseSiteXml(conf.hbaseSiteXMLHDFSPath)
-      // Create catalog for the SHC connector
-      val catalog = HBaseCatalog(conf.hbaseNameSpace, conf.hbaseTableName, columnFamilyToColumnMapping, conf.hbaseRowKeys, "PrimitiveType", false)
-      logger.info(s"Writing with catalog --> $catalog")
-      conf.hbaseSiteXMLHDFSPath match {
-        case HbaseConstants.NONE_STRING =>
-          logger.info(s"PLAIN WRITE")
-          writeWithCatalog(dataFrameToWrite, catalog)
-        case _ =>
-          logger.info(s"write with ${conf.hbaseSiteXMLHDFSPath}")
-          writeWithCatalog(dataFrameToWrite, catalog, hbaseConfigFileLocation)
-      }
-      dataFrame
-    } catch {
-      case ex: Throwable =>
-        ex.printStackTrace()
-        logger.error(s"Unable to write data to HBase table.")
-        throw ex
-    }
-  }
-
-  /**
-    * This function writes data to HBase with a catalog string.
-    *
-    * @param dataFrame DataFrame to write to HBase.
-    * @param catalog Catalog string holding the schema for the HBase table.
-    * @param hbaseConfigFileLocation The HBase configuration file: hbase-site.xml
-    */
-  // The HBaseTableCatalog.newTable property needs to be set as a default parameter because the SHC connector expects this argument, but it does not create the table again [SHC issue: https://github.com/hortonworks-spark/shc/issues/151]. If we build from the master branch, this parameter is not needed.
-  private def writeWithCatalog(dataFrame: DataFrame, catalog: String, hbaseConfigFileLocation: String) = {
-    try {
-      dataFrame
-        .write
-        .options(Map((HBaseTableCatalog.tableCatalog, catalog), (HBaseTableCatalog.newTable, "5"), (HBaseRelation.HBASE_CONFIGFILE, hbaseConfigFileLocation)))
-        .format("org.apache.spark.sql.execution.datasources.hbase")
-        .save()
-    } catch {
-      case ex: Throwable =>
-        ex.printStackTrace()
-        throw ex
-    }
-  }
-
-  /**
-    * This function writes data to HBase with a catalog string.
-    *
-    * @param dataFrame DataFrame to write to HBase.
-    * @param catalog Catalog string holding the schema for the HBase table.
-    */
-  // The HBaseTableCatalog.newTable property needs to be set as a default parameter because the SHC connector expects this argument, but it does not create the table again [SHC issue: https://github.com/hortonworks-spark/shc/issues/151]. If we build from the master branch, this parameter is not needed.
-  private def writeWithCatalog(dataFrame: DataFrame, catalog: String) = {
-    try {
-      dataFrame
-        .write
-        .options(Map((HBaseTableCatalog.tableCatalog, catalog), (HBaseTableCatalog.newTable, "5")))
-        .format("org.apache.spark.sql.execution.datasources.hbase")
-        .save()
-    } catch {
-      case ex: Throwable =>
-        ex.printStackTrace()
-        throw ex
-    }
-  }
-}
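Note (not part of this change): the read/write plumbing removed above ultimately just passes a catalog JSON to the `org.apache.spark.sql.execution.datasources.hbase` source. The sketch below is illustrative only; it assumes Spark with the Hortonworks SHC connector and an `hbase-site.xml` on the classpath, borrows the `test_table` / `personal` column-family and `id`, `name`, `address` column names from the test fixtures in this diff, and the object name `ShcRoundTripSketch` is made up.

```scala
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.execution.datasources.hbase.HBaseTableCatalog

object ShcRoundTripSketch {

  // Catalog JSON in the shape the removed HBaseCatalog builder produced:
  // a row key plus one column family, every column typed as string.
  val catalog: String =
    s"""{
       |"table":{"namespace":"default", "name":"test_table", "tableCoder":"PrimitiveType"},
       |"rowkey":"id",
       |"columns":{
       |"id":{"cf":"rowkey", "col":"id", "type":"string", "length":"50"},
       |"name":{"cf":"personal", "col":"name", "type":"string"},
       |"address":{"cf":"personal", "col":"address", "type":"string"}
       |}
       |}""".stripMargin

  def write(df: DataFrame): Unit = {
    df.write
      // newTable is still required by the connector even though the table already exists
      // (see https://github.com/hortonworks-spark/shc/issues/151).
      .options(Map(HBaseTableCatalog.tableCatalog -> catalog, HBaseTableCatalog.newTable -> "5"))
      .format("org.apache.spark.sql.execution.datasources.hbase")
      .save()
  }

  def read(spark: SparkSession): DataFrame = {
    spark.read
      .options(Map(HBaseTableCatalog.tableCatalog -> catalog))
      .format("org.apache.spark.sql.execution.datasources.hbase")
      .load()
  }
}
```

The catalog structure mirrors the expected strings asserted in HBaseCatalogTest further down in this diff.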
diff --git a/gimel-dataapi/gimel-connectors/gimel-hbase-1.2/src/main/scala/com/paypal/gimel/hbase/utilities/HBaseUtilities.scala b/gimel-dataapi/gimel-connectors/gimel-hbase-1.2/src/main/scala/com/paypal/gimel/hbase/utilities/HBaseUtilities.scala
deleted file mode 100644
index 7a91bf71..00000000
--- a/gimel-dataapi/gimel-connectors/gimel-hbase-1.2/src/main/scala/com/paypal/gimel/hbase/utilities/HBaseUtilities.scala
+++ /dev/null
@@ -1,151 +0,0 @@
-/*
- * Copyright 2018 PayPal Inc.
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.paypal.gimel.hbase.utilities
-
-import org.apache.commons.lang.StringEscapeUtils
-import org.apache.spark.sql.{DataFrame, SparkSession}
-import org.apache.spark.sql.functions._
-import org.apache.spark.sql.types.StringType
-
-import com.paypal.gimel.common.catalog.DataSetProperties
-import com.paypal.gimel.common.conf.GimelConstants
-import com.paypal.gimel.common.security.AuthHandler
-import com.paypal.gimel.hbase.conf.HbaseConfigs
-import com.paypal.gimel.logger.Logger
-
-/**
-  * HBASE implementations internal to Gimel
-  */
-object HBaseUtilities {
-
-  def apply(sparkSession: SparkSession): HBaseUtilities = new HBaseUtilities(sparkSession)
-
-}
-
-class HBaseUtilities(sparkSession: SparkSession) {
-  val logger = Logger()
-  val columnFamilyNamePattern = "(.+):(.+)".r
-  lazy val hbaseScanner = HBaseScanner()
-
-  /**
-    *
-    * @param dataFrame DataFrame to cast all columns to string format.
-    * @return DataFrame with all string data.
-    */
-  def castAllColsToString(dataFrame: DataFrame): DataFrame = {
-    def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName
-    logger.info(" @Begin --> " + MethodName)
-
-    logger.info("Casting All Columns as String")
-    val k = dataFrame.schema.fieldNames.foldRight(dataFrame) {
-      (column: String, df: DataFrame) => df.withColumn(column, df(column).cast(StringType))
-    }
-    logger.info("Coalescing All Columns with Null Values to Empty String")
-    val returningDF = k.schema.fieldNames.foldRight(k) {
-      (fieldName: String, df: DataFrame) => df.withColumn(fieldName, coalesce(df(fieldName), lit("")))
-    }
-    logger.info("Done with Column Coalesce operation")
-    returningDF
-  }
-
-  /**
-    * This function scans sample records from the HBase table if the column mapping parameter is empty.
-    *
-    * @param namespace String HBase Namespace Name
-    * @param tableName String HBase Table Name
-    * @param tableColumnMapping String (:key,cf1:c1,cf1:c2,cf2:c3)
-    * @return Map of [Column Family -> Array[Columns]]
-    */
-
-  def getColumnMappingForColumnFamily(namespace: String, tableName: String, tableColumnMapping: String, maxRecords: Int, maxColumns: Int): Map[String, Array[String]] = {
-    val schema = getColumnMappingForColumnFamily(tableColumnMapping)
-    if (schema.isEmpty) {
-      logger.info("Column family to column mapping is not present or is in wrong format, scanning the sample records.")
-      val schemaFromSampleRecords = hbaseScanner.getSchema(namespace, tableName, maxRecords, maxColumns)
-      if (schemaFromSampleRecords.isEmpty) {
-        throw new IllegalStateException("No columns found while scanning.
May be the table is empty.") - } - schemaFromSampleRecords - } else { - schema - } - } - - /** - * This function performs Table Column Mapping - * - * @param tableColumnMapping String (:key,cf1:c1,cf1:c2,cf2:c3) - * @return - */ - - def getColumnMappingForColumnFamily(tableColumnMapping: String): Map[String, Array[String]] = { - // to remove the exact location of :Key - val indexOfKey: Int = tableColumnMapping.split(",").indexOf(":key") - val updateMapping = if (indexOfKey != -1) { - val mappingBuffer = tableColumnMapping.split(",").toBuffer - mappingBuffer.remove(indexOfKey) - mappingBuffer.toArray.mkString(",") - } else { - tableColumnMapping - } - - try { - // checking if CF Mapping matches the pattern - val columnMapping = updateMapping.split(",").flatMap { - case columnFamilyNamePattern(cf, cname) => Some((StringEscapeUtils.escapeJava(cf), StringEscapeUtils.escapeJava(cname))) - case _ => throw new IllegalArgumentException( - s""" - |Column family to column mapping pattern is not correct -> ${tableColumnMapping} - |Please check the property ${HbaseConfigs.hbaseColumnMappingKey}, it should be in format -> cf1:c1,cf1:c2,cf2:c3 - |""".stripMargin) - }.groupBy(_._1).map { case (k, v) => (k, v.map(_._2)) } - columnMapping - } catch { - case ex: IllegalArgumentException => - logger.warning(ex.getMessage) - Map.empty[String, Array[String]] - } - } - - /** - * Authenticate Read/Write with HBASE Policies - * - * @param dataset - * @param operation - * @param dataSetProps - */ - def authenticateThroughRangerPolicies(dataset: String, operation: String, dataSetProps: Map[String, Any]): Unit = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - logger.info(" @Begin --> " + MethodName) - - val datasetProps: DataSetProperties = dataSetProps(GimelConstants.DATASET_PROPS).asInstanceOf[DataSetProperties] - val tableProperties = datasetProps.props - val hbaseTable = dataSetProps.getOrElse(HbaseConfigs.hbaseTableKey, tableProperties.getOrElse(HbaseConfigs.hbaseTableKey, "")).asInstanceOf[String] - val hbaseNameSpace = dataSetProps.getOrElse(GimelConstants.HBASE_NAMESPACE, tableProperties.getOrElse(GimelConstants.HBASE_NAMESPACE, "default")).asInstanceOf[String] - val hbaseTableName = hbaseTable.split(":")(1) - val hBaseNameSpaceAndTable = hbaseNameSpace + ":" + hbaseTableName - val clusterName = com.paypal.gimel.common.utilities.DataSetUtils.getYarnClusterName() - logger.info("hBaseNameSpaceAndTable and clusterName" + hBaseNameSpaceAndTable + clusterName) - val currentUser = datasetProps.props.getOrElse(GimelConstants.GTS_USER_CONFIG, sparkSession.sparkContext.sparkUser) - if (AuthHandler.isAuthRequired(sparkSession)) { - AuthHandler.authenticateHbasePolicy(currentUser, operation, hBaseNameSpaceAndTable, dataset, clusterName, dataSetProps) - } - } -} diff --git a/gimel-dataapi/gimel-connectors/gimel-hbase-1.2/src/test/scala/com/paypal/gimel/hbase/DataSetTest.scala b/gimel-dataapi/gimel-connectors/gimel-hbase-1.2/src/test/scala/com/paypal/gimel/hbase/DataSetTest.scala deleted file mode 100644 index f2eee94b..00000000 --- a/gimel-dataapi/gimel-connectors/gimel-hbase-1.2/src/test/scala/com/paypal/gimel/hbase/DataSetTest.scala +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.paypal.gimel.hbase - -import org.scalatest._ - -import com.paypal.gimel.common.catalog.DataSetProperties -import com.paypal.gimel.hbase.conf.HbaseConfigs -import com.paypal.gimel.hbase.utilities.HBaseLocalClient - -class DataSetTest extends HBaseLocalClient with Matchers { - test("Write operation") { - val props : Map[String, String] = Map(HbaseConfigs.hbaseNamespaceKey -> "default", - HbaseConfigs.hbaseTableKey -> s"""$tableName""", - HbaseConfigs.hbaseRowKey -> "id", - HbaseConfigs.hbaseColumnMappingKey -> "personal:name,personal:address,personal:age,professional:company,professional:designation,professional:salary") - val dataSetName = "HBase.Local.default." + tableName - val dataSetProperties = DataSetProperties(dataSetName, null, null, props) - val datasetProps : Map[String, Any] = Map("dataSetProperties" -> dataSetProperties) - val dataFrame = mockDataInDataFrame(10) - dataFrame.show(1) - val df = dataSet.write(dataSetName, dataFrame, datasetProps) - assert(df.count() == 10) - } - - test("Read operation") { - val props : Map[String, String] = Map(HbaseConfigs.hbaseNamespaceKey -> "default", - HbaseConfigs.hbaseTableKey -> s"""$tableName""", - HbaseConfigs.hbaseRowKey -> "id", - HbaseConfigs.hbaseColumnMappingKey -> "personal:name,personal:address,personal:age,professional:company,professional:designation,professional:salary") - val dataSetName = "HBase.Local.default." + tableName - val dataSetProperties = DataSetProperties(dataSetName, null, null, props) - val datasetProps : Map[String, Any] = Map("dataSetProperties" -> dataSetProperties) - val df = dataSet.read(dataSetName, datasetProps) - df.show(1) - assert(df.count() == 10) - } - - test("Write operation - column given in input via " + HbaseConfigs.hbaseColumnMappingKey + " not present in dataframe to write") { - val props : Map[String, String] = Map(HbaseConfigs.hbaseNamespaceKey -> "default", - HbaseConfigs.hbaseTableKey -> s"""$tableName""", - HbaseConfigs.hbaseRowKey -> "id", - HbaseConfigs.hbaseColumnMappingKey -> "personal:name,personal:address,personal:age,professional:company,professional:designation,professional:manager,professional:comp") - val dataSetName = "HBase.Local.default." 
+ tableName - val dataSetProperties = DataSetProperties(dataSetName, null, null, props) - val datasetProps : Map[String, Any] = Map("dataSetProperties" -> dataSetProperties) - val dataFrame = mockDataInDataFrame(10) - dataFrame.show(1) - val exception = intercept[Exception] { - dataSet.write(dataSetName, dataFrame, datasetProps) - } - assert(exception.getMessage.contains("Columns : manager,comp not found in dataframe schema") == true) - } - - test("Read operation - select specific columns") { - val props : Map[String, String] = Map(HbaseConfigs.hbaseNamespaceKey -> "default", - HbaseConfigs.hbaseTableKey -> s"""$tableName""", - HbaseConfigs.hbaseRowKey -> "id", - HbaseConfigs.hbaseColumnMappingKey -> "personal:name,personal:address,professional:company") - val dataSetName = "HBase.Local.default." + tableName - val dataSetProperties = DataSetProperties(dataSetName, null, null, props) - val datasetProps : Map[String, Any] = Map("dataSetProperties" -> dataSetProperties) - val df = dataSet.read(dataSetName, datasetProps) - df.show(1) - assert(df.columns.sameElements(Array("id", "company", "name", "address"))) - } - - test("Read operation - same column in 2 column families") { - val props : Map[String, String] = Map(HbaseConfigs.hbaseNamespaceKey -> "default", - HbaseConfigs.hbaseTableKey -> s"""$tableName""", - HbaseConfigs.hbaseRowKey -> "id", - HbaseConfigs.hbaseColumnMappingKey -> "personal:name,personal:address,professional:name", - HbaseConfigs.hbaseColumnNamewithColumnFamilyAppended -> "true") - val dataSetName = "HBase.Local.default." + tableName - val dataSetProperties = DataSetProperties(dataSetName, null, null, props) - val datasetProps : Map[String, Any] = Map("dataSetProperties" -> dataSetProperties) - val df = dataSet.read(dataSetName, datasetProps) - df.show(1) - assert(df.columns.sameElements(Array("rowkey_id", "professional_name", "personal_name", "personal_address"))) - } -} diff --git a/gimel-dataapi/gimel-connectors/gimel-hbase-1.2/src/test/scala/com/paypal/gimel/hbase/utilities/HBaseCatalogTest.scala b/gimel-dataapi/gimel-connectors/gimel-hbase-1.2/src/test/scala/com/paypal/gimel/hbase/utilities/HBaseCatalogTest.scala deleted file mode 100644 index cc768df3..00000000 --- a/gimel-dataapi/gimel-connectors/gimel-hbase-1.2/src/test/scala/com/paypal/gimel/hbase/utilities/HBaseCatalogTest.scala +++ /dev/null @@ -1,130 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.paypal.gimel.hbase.utilities - -import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} -import org.scalatest.{FunSpec, Matchers} - -class HBaseCatalogTest extends FunSpec with Matchers { - - describe("fieldsAsStringForCataLogAppendColumnFamily") { - it ("should create json of fields with type as string for Catalog with column Family appended with Column Name") { - HBaseCatalog.fieldsAsStringForCataLogAppendColumnFamily(columnsList, "cf1") should be ( - s""""cf1_c1":{"cf":"cf1", "col":"c1", "type":"string"}, - |"cf1_c2":{"cf":"cf1", "col":"c2", "type":"string"}, - |"cf1_c3":{"cf":"cf1", "col":"c3", "type":"string"}""".stripMargin) - - HBaseCatalog.fieldsAsStringForCataLogAppendColumnFamily(keyList, "rowkey") should be ( - s""""rowkey_key1":{"cf":"rowkey", "col":"key1", "type":"string", "length":"50"}, - |"rowkey_key2":{"cf":"rowkey", "col":"key2", "type":"string", "length":"50"}""".stripMargin) - } - } - - describe("fieldsAsStringForCataLog") { - it ("should create json of fields with type as string for Catalog") { - HBaseCatalog.fieldsAsStringForCataLog(columnsList, "cf1") should be ( - s""""c1":{"cf":"cf1", "col":"c1", "type":"string"}, - |"c2":{"cf":"cf1", "col":"c2", "type":"string"}, - |"c3":{"cf":"cf1", "col":"c3", "type":"string"}""".stripMargin) - - HBaseCatalog.fieldsAsStringForCataLog(keyList, "rowkey") should be ( - s""""key1":{"cf":"rowkey", "col":"key1", "type":"string", "length":"50"}, - |"key2":{"cf":"rowkey", "col":"key2", "type":"string", "length":"50"}""".stripMargin) - } - } - - describe("HBaseCatalog") { - it ("should create a catalog string with one column family and df columns array for shc connector") { - HBaseCatalog("namespace", "tablename", columnsList, keyList, "cf1") should be - s"""{"table":{"namespace":"namespace", "name":"tablename", "tableCoder":"PrimitiveType"}, - |"rowkey":"key1:key2", - |"columns":{ - |"key1":{"cf":"rowkey", "col":"key1", "type":"string", "length":"50"}, - |"key2":{"cf":"rowkey", "col":"key2", "type":"string", "length":"50"}, - |"c1":{"cf":"cf1", "col":"c1", "type":"string"}, - |"c2":{"cf":"cf1", "col":"c2", "type":"string"}, - |"c3":{"cf":"cf1", "col":"c3", "type":"string"} - |} - |} - |""".stripMargin - } - - it ("should create a catalog string with one column family and df schema for shc connector") { - HBaseCatalog("namespace", "tablename", schema, keyList, "cf1", "PrimitiveType") should be - s"""{"table":{"namespace":"namespace", "name":"tablename", "tableCoder":"PrimitiveType"}, - |"rowkey":"key1:key2", - |"columns":{ - |"key1":{"cf":"rowkey", "col":"key1", "type":"string", "length":"50"}, - |"key2":{"cf":"rowkey", "col":"key2", "type":"string", "length":"50"}, - |"num":{"cf":"cf1", "col":"num", "type":"string"}, - |"letter":{"cf":"cf1", "col":"letter", "type":"string"} - |} - |} - |""".stripMargin - } - - it ("should create a catalog string with multiple column families for shc connector") { - // With column family appended - HBaseCatalog("namespace", "tablename", columnFamilyToColumnMapping, keyList, "PrimitiveType", true) should be - s"""{"table":{"namespace":"namespace", "name":"tablename", "tableCoder":"PrimitiveType"}, - |"rowkey":"key1:key2", - |"columns":{ - |"rowkey_key1":{"cf":"rowkey", "col":"key1", "type":"string", "length":"50"}, - |"rowkey_key2":{"cf":"rowkey", "col":"key2", "type":"string", "length":"50"}, - |"cf1_c11":{"cf":"cf1", "col":"c11", "type":"string"}, - |"cf1_c12":{"cf":"cf1", "col":"c12", "type":"string"}, - |"cf2_c21":{"cf":"cf2", "col":"c21", 
"type":"string"}, - |"cf2_c22":{"cf":"cf2", "col":"c22", "type":"string"} - |} - |} - |""".stripMargin - - // Without column family appended - HBaseCatalog("namespace", "tablename", columnFamilyToColumnMapping, keyList, "PrimitiveType", false) should be - s"""{"table":{"namespace":"namespace", "name":"tablename", "tableCoder":"PrimitiveType"}, - |"rowkey":"key1:key2", - |"columns":{ - |"key1":{"cf":"rowkey", "col":"key1", "type":"string", "length":"50"}, - |"key2":{"cf":"rowkey", "col":"key2", "type":"string", "length":"50"}, - |"c11":{"cf":"cf1", "col":"c11", "type":"string"}, - |"c12":{"cf":"cf1", "col":"c12", "type":"string"}, - |"c21":{"cf":"cf2", "col":"c21", "type":"string"}, - |"c22":{"cf":"cf2", "col":"c22", "type":"string"} - |} - |} - |""".stripMargin - - } - } - - val schema: StructType = StructType( - List( - StructField("num", IntegerType, true), - StructField("letter", StringType, true) - ) - ) - - val columnFamilyToColumnMapping = Map("cf1" -> Array("c11", "c12"), - "cf2" -> Array("c21", "c22")) - - val keyList = Array("key1", "key2") - - val columnsList = Array("c1", "c2", "c3") -} diff --git a/gimel-dataapi/gimel-connectors/gimel-hbase-1.2/src/test/scala/com/paypal/gimel/hbase/utilities/HBaseLocalClient.scala b/gimel-dataapi/gimel-connectors/gimel-hbase-1.2/src/test/scala/com/paypal/gimel/hbase/utilities/HBaseLocalClient.scala deleted file mode 100644 index e8945f86..00000000 --- a/gimel-dataapi/gimel-connectors/gimel-hbase-1.2/src/test/scala/com/paypal/gimel/hbase/utilities/HBaseLocalClient.scala +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.paypal.gimel.hbase.utilities - -import java.io.File - -import scala.collection.mutable.ArrayBuffer - -import com.google.common.io.Files -import org.apache.hadoop.hbase.{HBaseTestingUtility, TableName} -import org.apache.hadoop.hbase.util.Bytes -import org.apache.spark.SparkConf -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{DataFrame, SparkSession} -import org.apache.spark.sql.execution.QueryExecution -import org.apache.spark.sql.execution.datasources.hbase.SparkHBaseConf -import org.apache.spark.sql.util._ -import org.scalatest.{BeforeAndAfterAll, FunSuite, Matchers} - -import com.paypal.gimel.common.catalog.Field -import com.paypal.gimel.hbase.DataSet - -class HBaseLocalClient extends FunSuite with Matchers with BeforeAndAfterAll { - - var sparkSession : SparkSession = _ - var dataSet: DataSet = _ - val hbaseTestingUtility = new HBaseTestingUtility() - val tableName = "test_table" - val cfs = Array("personal", "professional") - val columns = Array("id", "name", "age", "address", "company", "designation", "salary") - val fields = columns.map(col => new Field(col)) - - val metrics = ArrayBuffer.empty[(String, QueryExecution, Long)] - - protected override def beforeAll(): Unit = { - val tempDir: File = Files.createTempDir - tempDir.deleteOnExit - hbaseTestingUtility.startMiniCluster() - SparkHBaseConf.conf = hbaseTestingUtility.getConfiguration - createTable(tableName, cfs) - val conf = new SparkConf - conf.set(SparkHBaseConf.testConf, "true") - sparkSession = SparkSession.builder() - .master("local") - .appName("HBase Test") - .config(conf) - .getOrCreate() - - val listener = new QueryExecutionListener { - // Only test successful case here, so no need to implement `onFailure` - override def onFailure(funcName: String, qe: QueryExecution, exception: Exception): Unit = {} - override def onSuccess(funcName: String, qe: QueryExecution, duration: Long): Unit = { - metrics += ((funcName, qe, duration)) - } - } - sparkSession.listenerManager.register(listener) - sparkSession.sparkContext.setLogLevel("ERROR") - dataSet = new DataSet(sparkSession) - } - - protected override def afterAll(): Unit = { - hbaseTestingUtility.shutdownMiniCluster() - sparkSession.close() - } - - def createTable(name: String, cfs: Array[String]) { - val tName = Bytes.toBytes(name) - val bcfs = cfs.map(Bytes.toBytes(_)) - try { - hbaseTestingUtility.deleteTable(TableName.valueOf(tName)) - } catch { - case _ : Throwable => - println("No table = " + name + " found") - } - hbaseTestingUtility.createMultiRegionTable(TableName.valueOf(tName), bcfs) - } - - // Mocks data for testing - def mockDataInDataFrame(numberOfRows: Int): DataFrame = { - def stringed(n: Int) = s"""{"id": "$n","name": "MAC-$n", "address": "MAC-${n + 1}", "age": "${n + 1}", "company": "MAC-$n", "designation": "MAC-$n", "salary": "${n * 10000}" }""" - val texts: Seq[String] = (1 to numberOfRows).map { x => stringed(x) } - val rdd: RDD[String] = sparkSession.sparkContext.parallelize(texts) - val dataFrame: DataFrame = sparkSession.read.json(rdd) - dataFrame - } -} diff --git a/gimel-dataapi/gimel-connectors/gimel-hbase-1.2/src/test/scala/com/paypal/gimel/hbase/utilities/HBaseLookUpTest.scala b/gimel-dataapi/gimel-connectors/gimel-hbase-1.2/src/test/scala/com/paypal/gimel/hbase/utilities/HBaseLookUpTest.scala deleted file mode 100644 index 63dd6511..00000000 --- a/gimel-dataapi/gimel-connectors/gimel-hbase-1.2/src/test/scala/com/paypal/gimel/hbase/utilities/HBaseLookUpTest.scala +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright 
2018 PayPal Inc. - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.paypal.gimel.hbase.utilities - -import org.scalatest.{BeforeAndAfterAll, Matchers} - -import com.paypal.gimel.common.catalog.DataSetProperties -import com.paypal.gimel.hbase.conf.HbaseConfigs - -class HBaseLookUpTest extends HBaseLocalClient with Matchers with BeforeAndAfterAll { - ignore ("get") { - val props : Map[String, String] = Map(HbaseConfigs.hbaseNamespaceKey -> "default", - HbaseConfigs.hbaseTableKey -> s"""$tableName""", - HbaseConfigs.hbaseRowKey -> "id", - HbaseConfigs.hbaseFilter -> "rowKey=10", - HbaseConfigs.hbaseColumnMappingKey -> "personal:name,personal:address,personal:age,professional:company,professional:designation,professional:salary", - HbaseConfigs.hbaseOperation -> "get") - val dataSetName = "HBase.Local.default." + tableName - val dataSetProperties = DataSetProperties(dataSetName, null, null, props) - val datasetProps : Map[String, Any] = Map("dataSetProperties" -> dataSetProperties) - val dataFrame = mockDataInDataFrame(10) - dataFrame.show(1) - val df = HBasePut(sparkSession).put(dataSetName, dataFrame, datasetProps) - val dfLookUp = HBaseLookUp(sparkSession).get(dataSetName, datasetProps) - dfLookUp.show - } -} diff --git a/gimel-dataapi/gimel-connectors/gimel-hbase-1.2/src/test/scala/com/paypal/gimel/hbase/utilities/HBasePutTest.scala b/gimel-dataapi/gimel-connectors/gimel-hbase-1.2/src/test/scala/com/paypal/gimel/hbase/utilities/HBasePutTest.scala deleted file mode 100644 index 04800f50..00000000 --- a/gimel-dataapi/gimel-connectors/gimel-hbase-1.2/src/test/scala/com/paypal/gimel/hbase/utilities/HBasePutTest.scala +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.paypal.gimel.hbase.utilities - -import org.scalatest.{BeforeAndAfterAll, Matchers} - -import com.paypal.gimel.common.catalog.DataSetProperties -import com.paypal.gimel.hbase.conf.HbaseConfigs - -class HBasePutTest extends HBaseLocalClient with Matchers with BeforeAndAfterAll { - ignore ("put") { - val props : Map[String, String] = Map(HbaseConfigs.hbaseNamespaceKey -> "default", - HbaseConfigs.hbaseTableKey -> s"""$tableName""", - HbaseConfigs.hbaseRowKey -> "id", - HbaseConfigs.hbaseColumnMappingKey -> "personal:name,personal:address,personal:age,professional:company,professional:designation,professional:salary", - HbaseConfigs.hbaseOperation -> "put") - val dataSetName = "HBase.Local.default." + tableName - val dataSetProperties = DataSetProperties(dataSetName, null, null, props) - val datasetProps : Map[String, Any] = Map("dataSetProperties" -> dataSetProperties) - val dataFrame = mockDataInDataFrame(10) - dataFrame.show(1) - val df = HBasePut(sparkSession).put(dataSetName, dataFrame, datasetProps) - assert(df.count() == 10) - } -} diff --git a/gimel-dataapi/gimel-connectors/gimel-hbase-1.2/src/test/scala/com/paypal/gimel/hbase/utilities/HBaseScannerTest.scala b/gimel-dataapi/gimel-connectors/gimel-hbase-1.2/src/test/scala/com/paypal/gimel/hbase/utilities/HBaseScannerTest.scala deleted file mode 100644 index ba62c4c1..00000000 --- a/gimel-dataapi/gimel-connectors/gimel-hbase-1.2/src/test/scala/com/paypal/gimel/hbase/utilities/HBaseScannerTest.scala +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.paypal.gimel.hbase.utilities - -import org.scalatest.{BeforeAndAfterAll, Matchers} - -import com.paypal.gimel.common.catalog.DataSetProperties -import com.paypal.gimel.hbase.conf.HbaseConfigs - -class HBaseScannerTest extends HBaseLocalClient with Matchers with BeforeAndAfterAll { - ignore("getSchema") { - val props : Map[String, String] = Map(HbaseConfigs.hbaseNamespaceKey -> "default", - HbaseConfigs.hbaseTableKey -> s"""$tableName""", - HbaseConfigs.hbaseRowKey -> "id", - HbaseConfigs.hbaseColumnMappingKey -> "personal:name,personal:address,personal:age,professional:company,professional:designation,professional:salary") - val dataSetName = "HBase.Local.default." 
+ tableName - val dataSetProperties = DataSetProperties(dataSetName, null, null, props) - val datasetProps : Map[String, Any] = Map("dataSetProperties" -> dataSetProperties) - val dataFrame = mockDataInDataFrame(1000) - dataFrame.show(1) - HBaseSparkConnector(sparkSession).write(dataSetName, dataFrame, datasetProps) - val schema = HBaseScanner().getSchema("default", tableName, 100, 100000) - println(schema) - assert(schema.keys.sameElements(cfs)) - assert(schema("personal").sameElements(Array("name", "age", "address"))) - assert(schema("professional").sameElements(Array("company", "designation", "salary"))) - } -} diff --git a/gimel-dataapi/gimel-connectors/gimel-hbase-1.2/src/test/scala/com/paypal/gimel/hbase/utilities/HBaseSparkConnectorTest.scala b/gimel-dataapi/gimel-connectors/gimel-hbase-1.2/src/test/scala/com/paypal/gimel/hbase/utilities/HBaseSparkConnectorTest.scala deleted file mode 100644 index 657e7fef..00000000 --- a/gimel-dataapi/gimel-connectors/gimel-hbase-1.2/src/test/scala/com/paypal/gimel/hbase/utilities/HBaseSparkConnectorTest.scala +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.paypal.gimel.hbase.utilities - -import org.scalatest.{BeforeAndAfterAll, Matchers} - -import com.paypal.gimel.common.catalog.DataSetProperties -import com.paypal.gimel.common.conf.GimelConstants -import com.paypal.gimel.hbase.conf.HbaseConfigs - -class HBaseSparkConnectorTest extends HBaseLocalClient with Matchers with BeforeAndAfterAll { - - test("write operation") { - val props : Map[String, String] = Map(HbaseConfigs.hbaseNamespaceKey -> "default", - HbaseConfigs.hbaseTableKey -> s"""$tableName""", - HbaseConfigs.hbaseRowKey -> "id", - HbaseConfigs.hbaseColumnMappingKey -> "personal:name,personal:address,personal:age,professional:company,professional:designation,professional:salary") - val dataSetName = "HBase.Local.default." + tableName - val dataSetProperties = DataSetProperties(dataSetName, null, null, props) - val datasetProps : Map[String, Any] = Map("dataSetProperties" -> dataSetProperties) - val dataFrame = mockDataInDataFrame(1000) - dataFrame.show(1) - val df = HBaseSparkConnector(sparkSession).write(dataSetName, dataFrame, datasetProps) - assert(df.count() == 1000) - } - - test("read operation") { - val props : Map[String, String] = Map(HbaseConfigs.hbaseNamespaceKey -> "default", - HbaseConfigs.hbaseTableKey -> s"""$tableName""", - HbaseConfigs.hbaseRowKey -> "id", - HbaseConfigs.hbaseColumnMappingKey -> "personal:name,personal:address,personal:age,professional:company,professional:designation,professional:salary") - val dataSetName = "HBase.Local.default." 
+ tableName - val dataSetProperties = DataSetProperties(dataSetName, null, null, props) - val datasetProps : Map[String, Any] = Map("dataSetProperties" -> dataSetProperties) - val df = HBaseSparkConnector(sparkSession).read(dataSetName, datasetProps) - df.show(1) - assert(df.count() == 1000) - } - - test("read operation with page size") { - val props : Map[String, String] = Map(HbaseConfigs.hbaseNamespaceKey -> "default", - HbaseConfigs.hbaseTableKey -> s"""$tableName""", - HbaseConfigs.hbaseRowKey -> "id", - HbaseConfigs.hbaseColumnMappingKey -> "personal:name,personal:address,personal:age,professional:company,professional:designation,professional:salary") - sparkSession.conf.set(GimelConstants.HBASE_PAGE_SIZE, 20) - val dataSetName = "HBase.Local.default." + tableName - val dataSetProperties = DataSetProperties(dataSetName, null, null, props) - val datasetProps : Map[String, Any] = Map("dataSetProperties"->dataSetProperties) - val df = HBaseSparkConnector(sparkSession).read(dataSetName, datasetProps) - df.show(20) - val metricInsertQuery = metrics(metrics.length - 1) - val qe = metricInsertQuery._2 - println(qe.executedPlan.children(0).children(0).children(0).metrics) - val kafkaReadOutputRows = qe.executedPlan.children(0).children(0).children(0).metrics("numOutputRows").value - assert(kafkaReadOutputRows == 20) - sparkSession.conf.unset(GimelConstants.HBASE_PAGE_SIZE) - } -} diff --git a/gimel-dataapi/gimel-connectors/gimel-hbase-1.2/src/test/scala/com/paypal/gimel/hbase/utilities/HBaseUtilitiesTest.scala b/gimel-dataapi/gimel-connectors/gimel-hbase-1.2/src/test/scala/com/paypal/gimel/hbase/utilities/HBaseUtilitiesTest.scala deleted file mode 100644 index 3cd9e867..00000000 --- a/gimel-dataapi/gimel-connectors/gimel-hbase-1.2/src/test/scala/com/paypal/gimel/hbase/utilities/HBaseUtilitiesTest.scala +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.paypal.gimel.hbase.utilities - -import org.apache.spark.sql.types.StringType -import org.scalatest.{BeforeAndAfterAll, Matchers} - -import com.paypal.gimel.common.catalog.DataSetProperties -import com.paypal.gimel.hbase.conf.HbaseConfigs - -class HBaseUtilitiesTest extends HBaseLocalClient with Matchers with BeforeAndAfterAll { - - var hbaseUtilities : HBaseUtilities = new HBaseUtilities(sparkSession) - - test ("castAllColsToString") { - // it should cast all the columns in dataframe to string - assert (hbaseUtilities.castAllColsToString(mockDataInDataFrame(5)).schema.filter(col => col.dataType != StringType).length == 0) - } - - test ("getColumnMappingForColumnFamily") { - // it should return the map of column family to column with correct pattern - val mapping = hbaseUtilities.getColumnMappingForColumnFamily("cf1:c1,cf1:c2,cf1:c3,cf2:c4") - assert(mapping("cf1").sameElements(Array("c1", "c2", "c3"))) - assert(mapping("cf2").sameElements(Array("c4"))) - - // it should return the map of column family to column with correct pattern including :key - val mapping1 = hbaseUtilities.getColumnMappingForColumnFamily(":key,cf1:c1,cf1:c2,cf1:c3,cf2:c4") - assert(mapping1("cf1").sameElements(Array("c1", "c2", "c3"))) - assert(mapping1("cf2").sameElements(Array("c4"))) - - val mapping2 = hbaseUtilities.getColumnMappingForColumnFamily("cf1:c1,:key,cf1:c2,cf1:c3,cf2:c4") - assert(mapping2("cf1").sameElements(Array("c1", "c2", "c3"))) - assert(mapping2("cf2").sameElements(Array("c4"))) - } -} diff --git a/gimel-dataapi/gimel-connectors/gimel-hive-1.2/pom.xml b/gimel-dataapi/gimel-connectors/gimel-hive/pom.xml similarity index 97% rename from gimel-dataapi/gimel-connectors/gimel-hive-1.2/pom.xml rename to gimel-dataapi/gimel-connectors/gimel-hive/pom.xml index 15495433..f40e5ad0 100644 --- a/gimel-dataapi/gimel-connectors/gimel-hive-1.2/pom.xml +++ b/gimel-dataapi/gimel-connectors/gimel-hive/pom.xml @@ -23,13 +23,13 @@ under the License. 
gimel-dataapi com.paypal.gimel - 2.0.0-SNAPSHOT + 2.4.7-SNAPSHOT ../../pom.xml 4.0.0 - gimel-hive-1.2 - 2.0.0-SNAPSHOT + gimel-hive + 2.4.7-SNAPSHOT diff --git a/gimel-dataapi/gimel-connectors/gimel-hive-1.2/src/main/scala/com/paypal/gimel/hdfs/DataSet.scala b/gimel-dataapi/gimel-connectors/gimel-hive/src/main/scala/com/paypal/gimel/hdfs/DataSet.scala similarity index 100% rename from gimel-dataapi/gimel-connectors/gimel-hive-1.2/src/main/scala/com/paypal/gimel/hdfs/DataSet.scala rename to gimel-dataapi/gimel-connectors/gimel-hive/src/main/scala/com/paypal/gimel/hdfs/DataSet.scala diff --git a/gimel-dataapi/gimel-connectors/gimel-hive-1.2/src/main/scala/com/paypal/gimel/hdfs/conf/HdfsClientConfiguration.scala b/gimel-dataapi/gimel-connectors/gimel-hive/src/main/scala/com/paypal/gimel/hdfs/conf/HdfsClientConfiguration.scala similarity index 100% rename from gimel-dataapi/gimel-connectors/gimel-hive-1.2/src/main/scala/com/paypal/gimel/hdfs/conf/HdfsClientConfiguration.scala rename to gimel-dataapi/gimel-connectors/gimel-hive/src/main/scala/com/paypal/gimel/hdfs/conf/HdfsClientConfiguration.scala diff --git a/gimel-dataapi/gimel-connectors/gimel-hive-1.2/src/main/scala/com/paypal/gimel/hdfs/conf/HdfsConfigs.scala b/gimel-dataapi/gimel-connectors/gimel-hive/src/main/scala/com/paypal/gimel/hdfs/conf/HdfsConfigs.scala similarity index 100% rename from gimel-dataapi/gimel-connectors/gimel-hive-1.2/src/main/scala/com/paypal/gimel/hdfs/conf/HdfsConfigs.scala rename to gimel-dataapi/gimel-connectors/gimel-hive/src/main/scala/com/paypal/gimel/hdfs/conf/HdfsConfigs.scala diff --git a/gimel-dataapi/gimel-connectors/gimel-hive-1.2/src/main/scala/com/paypal/gimel/hdfs/conf/HdfsConstants.scala b/gimel-dataapi/gimel-connectors/gimel-hive/src/main/scala/com/paypal/gimel/hdfs/conf/HdfsConstants.scala similarity index 100% rename from gimel-dataapi/gimel-connectors/gimel-hive-1.2/src/main/scala/com/paypal/gimel/hdfs/conf/HdfsConstants.scala rename to gimel-dataapi/gimel-connectors/gimel-hive/src/main/scala/com/paypal/gimel/hdfs/conf/HdfsConstants.scala diff --git a/gimel-dataapi/gimel-connectors/gimel-hive-1.2/src/main/scala/com/paypal/gimel/hdfs/utilities/HDFSUtilities.scala b/gimel-dataapi/gimel-connectors/gimel-hive/src/main/scala/com/paypal/gimel/hdfs/utilities/HDFSUtilities.scala similarity index 100% rename from gimel-dataapi/gimel-connectors/gimel-hive-1.2/src/main/scala/com/paypal/gimel/hdfs/utilities/HDFSUtilities.scala rename to gimel-dataapi/gimel-connectors/gimel-hive/src/main/scala/com/paypal/gimel/hdfs/utilities/HDFSUtilities.scala diff --git a/gimel-dataapi/gimel-connectors/gimel-hive-1.2/src/main/scala/com/paypal/gimel/hive/DataSet.scala b/gimel-dataapi/gimel-connectors/gimel-hive/src/main/scala/com/paypal/gimel/hive/DataSet.scala similarity index 100% rename from gimel-dataapi/gimel-connectors/gimel-hive-1.2/src/main/scala/com/paypal/gimel/hive/DataSet.scala rename to gimel-dataapi/gimel-connectors/gimel-hive/src/main/scala/com/paypal/gimel/hive/DataSet.scala diff --git a/gimel-dataapi/gimel-connectors/gimel-hive-1.2/src/main/scala/com/paypal/gimel/hive/conf/HiveConfigs.scala b/gimel-dataapi/gimel-connectors/gimel-hive/src/main/scala/com/paypal/gimel/hive/conf/HiveConfigs.scala similarity index 100% rename from gimel-dataapi/gimel-connectors/gimel-hive-1.2/src/main/scala/com/paypal/gimel/hive/conf/HiveConfigs.scala rename to gimel-dataapi/gimel-connectors/gimel-hive/src/main/scala/com/paypal/gimel/hive/conf/HiveConfigs.scala diff --git 
a/gimel-dataapi/gimel-connectors/gimel-hive-1.2/src/main/scala/com/paypal/gimel/hive/conf/HiveConstants.scala b/gimel-dataapi/gimel-connectors/gimel-hive/src/main/scala/com/paypal/gimel/hive/conf/HiveConstants.scala similarity index 100% rename from gimel-dataapi/gimel-connectors/gimel-hive-1.2/src/main/scala/com/paypal/gimel/hive/conf/HiveConstants.scala rename to gimel-dataapi/gimel-connectors/gimel-hive/src/main/scala/com/paypal/gimel/hive/conf/HiveConstants.scala diff --git a/gimel-dataapi/gimel-connectors/gimel-hive-1.2/src/main/scala/com/paypal/gimel/hive/utilities/HiveJDBCUtils.scala b/gimel-dataapi/gimel-connectors/gimel-hive/src/main/scala/com/paypal/gimel/hive/utilities/HiveJDBCUtils.scala similarity index 100% rename from gimel-dataapi/gimel-connectors/gimel-hive-1.2/src/main/scala/com/paypal/gimel/hive/utilities/HiveJDBCUtils.scala rename to gimel-dataapi/gimel-connectors/gimel-hive/src/main/scala/com/paypal/gimel/hive/utilities/HiveJDBCUtils.scala diff --git a/gimel-dataapi/gimel-connectors/gimel-hive-1.2/src/main/scala/com/paypal/gimel/hive/utilities/HiveSchemaTemplates.scala b/gimel-dataapi/gimel-connectors/gimel-hive/src/main/scala/com/paypal/gimel/hive/utilities/HiveSchemaTemplates.scala similarity index 100% rename from gimel-dataapi/gimel-connectors/gimel-hive-1.2/src/main/scala/com/paypal/gimel/hive/utilities/HiveSchemaTemplates.scala rename to gimel-dataapi/gimel-connectors/gimel-hive/src/main/scala/com/paypal/gimel/hive/utilities/HiveSchemaTemplates.scala diff --git a/gimel-dataapi/gimel-connectors/gimel-hive-1.2/src/main/scala/com/paypal/gimel/hive/utilities/HiveSchemaUtils.scala b/gimel-dataapi/gimel-connectors/gimel-hive/src/main/scala/com/paypal/gimel/hive/utilities/HiveSchemaUtils.scala similarity index 100% rename from gimel-dataapi/gimel-connectors/gimel-hive-1.2/src/main/scala/com/paypal/gimel/hive/utilities/HiveSchemaUtils.scala rename to gimel-dataapi/gimel-connectors/gimel-hive/src/main/scala/com/paypal/gimel/hive/utilities/HiveSchemaUtils.scala diff --git a/gimel-dataapi/gimel-connectors/gimel-hive-1.2/src/main/scala/com/paypal/gimel/hive/utilities/HiveUtils.scala b/gimel-dataapi/gimel-connectors/gimel-hive/src/main/scala/com/paypal/gimel/hive/utilities/HiveUtils.scala similarity index 100% rename from gimel-dataapi/gimel-connectors/gimel-hive-1.2/src/main/scala/com/paypal/gimel/hive/utilities/HiveUtils.scala rename to gimel-dataapi/gimel-connectors/gimel-hive/src/main/scala/com/paypal/gimel/hive/utilities/HiveUtils.scala diff --git a/gimel-dataapi/gimel-connectors/gimel-hive-1.2/src/test/resources/hdfs_test.avro b/gimel-dataapi/gimel-connectors/gimel-hive/src/test/resources/hdfs_test.avro similarity index 100% rename from gimel-dataapi/gimel-connectors/gimel-hive-1.2/src/test/resources/hdfs_test.avro rename to gimel-dataapi/gimel-connectors/gimel-hive/src/test/resources/hdfs_test.avro diff --git a/gimel-dataapi/gimel-connectors/gimel-hive-1.2/src/test/resources/hdfs_test.csv b/gimel-dataapi/gimel-connectors/gimel-hive/src/test/resources/hdfs_test.csv similarity index 100% rename from gimel-dataapi/gimel-connectors/gimel-hive-1.2/src/test/resources/hdfs_test.csv rename to gimel-dataapi/gimel-connectors/gimel-hive/src/test/resources/hdfs_test.csv diff --git a/gimel-dataapi/gimel-connectors/gimel-hive-1.2/src/test/resources/hdfs_test.json b/gimel-dataapi/gimel-connectors/gimel-hive/src/test/resources/hdfs_test.json similarity index 100% rename from gimel-dataapi/gimel-connectors/gimel-hive-1.2/src/test/resources/hdfs_test.json rename to 
gimel-dataapi/gimel-connectors/gimel-hive/src/test/resources/hdfs_test.json diff --git a/gimel-dataapi/gimel-connectors/gimel-hive-1.2/src/test/resources/hdfs_test.parquet b/gimel-dataapi/gimel-connectors/gimel-hive/src/test/resources/hdfs_test.parquet similarity index 100% rename from gimel-dataapi/gimel-connectors/gimel-hive-1.2/src/test/resources/hdfs_test.parquet rename to gimel-dataapi/gimel-connectors/gimel-hive/src/test/resources/hdfs_test.parquet diff --git a/gimel-dataapi/gimel-connectors/gimel-hive-1.2/src/test/resources/hdfs_test.seq b/gimel-dataapi/gimel-connectors/gimel-hive/src/test/resources/hdfs_test.seq similarity index 100% rename from gimel-dataapi/gimel-connectors/gimel-hive-1.2/src/test/resources/hdfs_test.seq rename to gimel-dataapi/gimel-connectors/gimel-hive/src/test/resources/hdfs_test.seq diff --git a/gimel-dataapi/gimel-connectors/gimel-hive-1.2/src/test/resources/hdfs_test.txt b/gimel-dataapi/gimel-connectors/gimel-hive/src/test/resources/hdfs_test.txt similarity index 100% rename from gimel-dataapi/gimel-connectors/gimel-hive-1.2/src/test/resources/hdfs_test.txt rename to gimel-dataapi/gimel-connectors/gimel-hive/src/test/resources/hdfs_test.txt diff --git a/gimel-dataapi/gimel-connectors/gimel-hive-1.2/src/test/resources/hdfs_test.txt.gz b/gimel-dataapi/gimel-connectors/gimel-hive/src/test/resources/hdfs_test.txt.gz similarity index 100% rename from gimel-dataapi/gimel-connectors/gimel-hive-1.2/src/test/resources/hdfs_test.txt.gz rename to gimel-dataapi/gimel-connectors/gimel-hive/src/test/resources/hdfs_test.txt.gz diff --git a/gimel-dataapi/gimel-connectors/gimel-hive-1.2/src/test/scala/com/paypal/gimel/hdfs/DataSetTest.scala b/gimel-dataapi/gimel-connectors/gimel-hive/src/test/scala/com/paypal/gimel/hdfs/DataSetTest.scala similarity index 100% rename from gimel-dataapi/gimel-connectors/gimel-hive-1.2/src/test/scala/com/paypal/gimel/hdfs/DataSetTest.scala rename to gimel-dataapi/gimel-connectors/gimel-hive/src/test/scala/com/paypal/gimel/hdfs/DataSetTest.scala diff --git a/gimel-dataapi/gimel-connectors/gimel-jdbc/pom.xml b/gimel-dataapi/gimel-connectors/gimel-jdbc/pom.xml index 5ac357b3..8c6088ea 100644 --- a/gimel-dataapi/gimel-connectors/gimel-jdbc/pom.xml +++ b/gimel-dataapi/gimel-connectors/gimel-jdbc/pom.xml @@ -23,13 +23,13 @@ under the License. gimel-dataapi com.paypal.gimel - 2.0.0-SNAPSHOT + 2.4.7-SNAPSHOT ../../pom.xml 4.0.0 gimel-jdbc - 2.0.0-SNAPSHOT + 2.4.7-SNAPSHOT @@ -83,7 +83,7 @@ under the License. 
org.apache.maven.plugins maven-shade-plugin - 3.0.0 + ${maven.shade.plugin.version} diff --git a/gimel-dataapi/gimel-connectors/gimel-jdbc/src/main/scala/com/paypal/gimel/jdbc/utilities/ExtendedJdbcRDD.scala b/gimel-dataapi/gimel-connectors/gimel-jdbc/src/main/scala/com/paypal/gimel/jdbc/utilities/ExtendedJdbcRDD.scala index 9e7e3ecb..537b4eb6 100644 --- a/gimel-dataapi/gimel-connectors/gimel-jdbc/src/main/scala/com/paypal/gimel/jdbc/utilities/ExtendedJdbcRDD.scala +++ b/gimel-dataapi/gimel-connectors/gimel-jdbc/src/main/scala/com/paypal/gimel/jdbc/utilities/ExtendedJdbcRDD.scala @@ -98,7 +98,11 @@ class ExtendedJdbcRDD[T: ClassTag]( protected var finished = false val logger = Logger(this.getClass.getName) - context.addTaskCompletionListener { context => closeIfNeeded() } + context.addTaskCompletionListener{ + (context: org.apache.spark.TaskContext) => + closeIfNeeded() + "dummy" + } val conn: Connection = getConnection() diff --git a/gimel-dataapi/gimel-connectors/gimel-jdbc/src/main/scala/com/paypal/gimel/jdbc/utilities/JDBCUtilities.scala b/gimel-dataapi/gimel-connectors/gimel-jdbc/src/main/scala/com/paypal/gimel/jdbc/utilities/JDBCUtilities.scala index 62b8ed8b..1e02efc8 100644 --- a/gimel-dataapi/gimel-connectors/gimel-jdbc/src/main/scala/com/paypal/gimel/jdbc/utilities/JDBCUtilities.scala +++ b/gimel-dataapi/gimel-connectors/gimel-jdbc/src/main/scala/com/paypal/gimel/jdbc/utilities/JDBCUtilities.scala @@ -349,6 +349,9 @@ class JDBCUtilities(sparkSession: SparkSession) extends Serializable { st } +// private def handlePartition(partition:Iterator[Row]):Unit = { +// +// } /** * This method inserts into given table in given mode * @@ -374,10 +377,8 @@ class JDBCUtilities(sparkSession: SparkSession) extends Serializable { } case _ => // do nothing } - - // For each partition create a temp table to insert - dataFrame.foreachPartition { batch => + dataFrame.foreachPartition { batch: Iterator[Row] => // create logger inside the executor val logger = Logger(this.getClass.getName) @@ -572,7 +573,7 @@ class JDBCUtilities(sparkSession: SparkSession) extends Serializable { */ private def updateTable(dataFrame: DataFrame, jdbcConnectionUtility: JDBCConnectionUtility, jdbcHolder: JDBCArgsHolder) { - dataFrame.foreachPartition { batch => + dataFrame.foreachPartition { batch: Iterator[Row] => if (batch.nonEmpty) { // create logger inside the executor val logger = Logger(this.getClass.getName) @@ -630,7 +631,7 @@ class JDBCUtilities(sparkSession: SparkSession) extends Serializable { */ private def upsertTable(dataFrame: DataFrame, jDBCConnectionUtility: JDBCConnectionUtility, jdbcHolder: JDBCArgsHolder) { - dataFrame.foreachPartition { batch => + dataFrame.foreachPartition { batch: Iterator[Row] => // create logger inside the executor val logger = Logger(this.getClass.getName) if (batch.nonEmpty) { diff --git a/gimel-dataapi/gimel-connectors/gimel-kafka-0.10/pom.xml b/gimel-dataapi/gimel-connectors/gimel-kafka-0.10/pom.xml deleted file mode 100644 index 154762e2..00000000 --- a/gimel-dataapi/gimel-connectors/gimel-kafka-0.10/pom.xml +++ /dev/null @@ -1,120 +0,0 @@ - - - - - - - gimel-dataapi - com.paypal.gimel - 2.0.0-SNAPSHOT - ../../pom.xml - - 4.0.0 - - gimel-kafka-0.10 - 2.0.0-SNAPSHOT - - - - com.paypal.gimel - gimel-common - ${gimel.version}-SNAPSHOT - - - com.databricks - spark-avro_${scala.binary.version} - 3.2.0 - ${packaging.scope} - - - org.scalatest - scalatest_${scala.binary.version} - ${scalatest.version} - test - - - - - src/main/scala - src/test/scala - - - org.apache.maven.plugins - 
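The explicit (context: org.apache.spark.TaskContext) and batch: Iterator[Row] annotations added in the ExtendedJdbcRDD and JDBCUtilities hunks above are the usual workaround for overload ambiguity when moving to Spark 2.4 (note the 2.4.7-SNAPSHOT version bump), where a bare lambda can match both the Scala-function overload and the Java SAM overload. A minimal sketch of the two patterns as the patch applies them, assuming Spark 2.4 on the classpath; the object and method names here are illustrative, not part of the patch:

    import org.apache.spark.TaskContext
    import org.apache.spark.sql.{DataFrame, Row}

    object Spark24ClosureSketch {

      // Mirrors the ExtendedJdbcRDD hunk: returning a non-Unit value ("dummy") steers the call
      // to the f: TaskContext => U overload of addTaskCompletionListener rather than the
      // TaskCompletionListener SAM overload.
      def closeOnTaskCompletion(context: TaskContext)(closeIfNeeded: () => Unit): Unit = {
        context.addTaskCompletionListener { (_: TaskContext) =>
          closeIfNeeded()
          "dummy"
        }
      }

      // Mirrors the JDBCUtilities hunks: annotating the closure parameter as Iterator[Row]
      // points foreachPartition at the Scala iterator overload instead of the Java
      // ForeachPartitionFunction overload.
      def perPartition(dataFrame: DataFrame)(handler: Iterator[Row] => Unit): Unit = {
        dataFrame.foreachPartition { batch: Iterator[Row] =>
          handler(batch)
        }
      }
    }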
maven-shade-plugin - 3.0.0 - - - - com.google.common - gimel-shaded.com.google.common - - - com.sun.jersey - gimel-shaded.com.sun.jersey - - - - org.apache.hadoop - gimel-shaded.org.apache.hadoop - - - - - *:* - - META-INF/*.SF - META-INF/*.DSA - META-INF/*.RSA - - - - - - - gimel-shading - package - - shade - - - - - - org.scalatest - scalatest-maven-plugin - 1.0 - - ${project.build.directory}/surefire-reports - . - WDF TestSuite.txt - - - - test - - test - - - - - - - - diff --git a/gimel-dataapi/gimel-connectors/gimel-kafka-0.10/src/main/scala/com/paypal/gimel/kafka/DataSet.scala b/gimel-dataapi/gimel-connectors/gimel-kafka-0.10/src/main/scala/com/paypal/gimel/kafka/DataSet.scala deleted file mode 100644 index 93c614e3..00000000 --- a/gimel-dataapi/gimel-connectors/gimel-kafka-0.10/src/main/scala/com/paypal/gimel/kafka/DataSet.scala +++ /dev/null @@ -1,202 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.paypal.gimel.kafka - -import scala.language.implicitConversions -import scala.reflect.runtime.universe._ - -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{DataFrame, SparkSession} -import org.apache.spark.streaming.kafka010.OffsetRange - -import com.paypal.gimel.datasetfactory.GimelDataSet -import com.paypal.gimel.kafka.conf.KafkaClientConfiguration -import com.paypal.gimel.kafka.reader.KafkaBatchConsumer -import com.paypal.gimel.kafka.utilities.ImplicitZKCheckPointers._ -import com.paypal.gimel.kafka.utilities.ZooKeeperHostAndNodes -import com.paypal.gimel.kafka.writer.KafkaBatchProducer -import com.paypal.gimel.logger.Logger - -/** - * Concrete Implementation for Kafka DataSet - * - * @param sparkSession : SparkSession - */ - -class DataSet(sparkSession: SparkSession) extends GimelDataSet(sparkSession: SparkSession) { - - // GET LOGGER - val logger = Logger() - logger.info(s"Initiated --> ${this.getClass.getName}") - - var readTillOffsetRange: Option[Array[OffsetRange]] = None - var alreadyCheckPointed = false - // FIXME What happens if two users call read and write at the same time? Data race over conf? 
- private var conf: KafkaClientConfiguration = _ - - /** - * Saves Currently Read Offsets to Zookeeper - */ - def saveCheckPoint(): Unit = { - if (alreadyCheckPointed) { - logger.warning("Already Check-Pointed, Consume Again to Checkpoint !") - } else { - val zkNode = conf.zkCheckPoints - val zkHost = conf.zkHostAndPort - val zk = ZooKeeperHostAndNodes(zkHost, zkNode) - (zk, readTillOffsetRange.get).saveZkCheckPoint - alreadyCheckPointed = true - logger.info(s"Check-Point --> ${readTillOffsetRange.get.mkString("|")} | Success @ -> ${zk} ") - } - } - - /** - * Completely Clear the CheckPointed Offsets, leading to Read from Earliest offsets from Kafka - */ - def clearCheckPoint(): Unit = { - val zkNode = conf.zkCheckPoints - val zkHost = conf.zkHostAndPort - val zk = ZooKeeperHostAndNodes(zkHost, zkNode) - zk.deleteZkCheckPoint() - } - - /** Read Implementation for Kafka DataSet - * - * @param dataset Name of the PCatalog Data Set - * @param datasetProps - * props is the way to set various additional parameters for read and write operations in DataSet class - * Example Usecase : to read kafka from-to a certain offset range : One can set something like below - - * val props = Map("fromOffset" -> 10, "toOffset" -> 20) - * val data = Dataset(sc).read("flights.topic", props) - * @return DataFrame - */ - override def read(dataset: String, datasetProps: Map[String, Any]): DataFrame = { - - if (datasetProps.isEmpty) { - throw new DataSetException("Props Map Cannot be emtpy for KafkaDataSet Read.") - } - conf = new KafkaClientConfiguration(datasetProps) - val (data, toOffset) = KafkaBatchConsumer.consumeFromKakfa(sparkSession, conf) - alreadyCheckPointed = false - readTillOffsetRange = Some(toOffset) - data - } - - /** Write Implementation for Kafka DataSet - * - * @param dataset Name of the PCatalog Data Set - * @param dataFrame The DataFrame to write to target - * @param datasetProps - * props is the way to set various additional parameters for read and write operations in DataSet class - * Example Usecase : to write kafka with a specific parallelism : One can set something like below - - * val props = Map("parallelsPerPartition" -> 10) - * Dataset(sc).write(clientDataFrame, props) - * @return DataFrame - */ - - override def write(dataset: String, dataFrame: DataFrame, datasetProps: Map[String, Any]): DataFrame = { - - if (datasetProps.isEmpty) { - throw new DataSetException("Props Map Cannot be emtpy for KafkaDataSet Write.") - } - conf = new KafkaClientConfiguration(datasetProps) - KafkaBatchProducer.produceToKafka(conf, dataFrame) - dataFrame - } - - // Add Additional Supported types to this list as and when we support other Types of RDD - // Example to start supporting RDD[String], add to List < typeOf[Seq[Map[String, String]]].toString) > - override val supportedTypesOfRDD: List[String] = List[String](typeOf[String].toString, typeOf[Array[Byte]].toString) - - /** - * Function writes a given dataframe to the actual Target System (Example Hive : DB.Table | HBASE namespace.Table) - * - * @param dataset Name of the PCatalog Data Set - * @param rdd The RDD[T] to write into Target - * Note the RDD has to be typeCast to supported types by the inheriting DataSet Operators - * instance#1 : ElasticSearchDataSet may support just RDD[Seq(Map[String, String])], so Elastic Search must implement supported Type checking - * instance#2 : Kafka, HDFS, HBASE - Until they support an RDD operation for Any Type T : They throw Unsupporter Operation Exception & Educate Users Clearly ! 
- * @param datasetProps - * props is the way to set various additional parameters for read and write operations in DataSet class - * Example Usecase : to write kafka with a specific parallelism : One can set something like below - - * val props = Map("parallelsPerPartition" -> 10) - * Dataset(sc).write(clientDataFrame, props) - * @return RDD[T] - */ - def write[T: TypeTag](dataset: String, rdd: RDD[T], datasetProps: Map[String, Any]): RDD[T] = { - - if (!supportedTypesOfRDD.contains(typeOf[T].toString)) { - throw new UnsupportedOperationException(s"""Invalid RDD Type. Supported Types : ${supportedTypesOfRDD.mkString(" | ")}""") - } else { - if (datasetProps.isEmpty) { - throw new DataSetException("Props Map Cannot be emtpy for KafkaDataSet Write.") - } - conf = new KafkaClientConfiguration(datasetProps) - val rdd1: RDD[String] = rdd.asInstanceOf[RDD[String]] - KafkaBatchProducer.produceToKafka(conf, rdd1) - } - rdd - } - - /** - * - * @param dataset Name of the UDC Data Set - * @param dataSetProps - * * @return Boolean - */ - override def create(dataset: String, dataSetProps: Map[String, Any]): Unit = { - throw new Exception(s"DataSet create for kafka currently not Supported") - } - - /** - * - * @param dataset Name of the UDC Data Set - * @param dataSetProps - * * @return Boolean - */ - override def drop(dataset: String, dataSetProps: Map[String, Any]): Unit = { - throw new Exception(s"DataSet drop for kafka currently not Supported") - } - - /** - * - * @param dataset Name of the UDC Data Set - * @param dataSetProps - * * @return Boolean - */ - override def truncate(dataset: String, dataSetProps: Map[String, Any]): Unit = { - throw new Exception(s"DataSet truncate for kafka currently not Supported") - } -} - -/** - * Custom Exception for KafkaDataSet initiation errors - * - * @param message Message to Throw - * @param cause A Throwable Cause - */ -private class DataSetException(message: String, cause: Throwable) - extends RuntimeException(message) { - if (cause != null) { - initCause(cause) - } - - def this(message: String) = this(message, null) -} diff --git a/gimel-dataapi/gimel-connectors/gimel-kafka-0.10/src/main/scala/com/paypal/gimel/kafka/DataStream.scala b/gimel-dataapi/gimel-connectors/gimel-kafka-0.10/src/main/scala/com/paypal/gimel/kafka/DataStream.scala deleted file mode 100644 index c1e85f6c..00000000 --- a/gimel-dataapi/gimel-connectors/gimel-kafka-0.10/src/main/scala/com/paypal/gimel/kafka/DataStream.scala +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
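For reference, a usage sketch of the batch API this removed connector exposed, pieced together from the docstrings in the deleted DataSet.scala above. The dataset name and property values are illustrative, and in practice the props map also carries the catalog-supplied DataSetProperties entry that KafkaClientConfiguration expects.

    import org.apache.spark.sql.SparkSession
    import com.paypal.gimel.kafka.DataSet

    // Sketch only: this module is deleted by the patch; the calls mirror the
    // read/write/checkpoint contract documented in the removed source.
    val spark = SparkSession.builder().appName("kafka-dataset-sketch").getOrCreate()
    val kafkaDataSet = new DataSet(spark)

    // Batch read; the props map must not be empty, and "fromOffset"/"toOffset"
    // narrow the offset range read (per the read() docstring).
    val readProps: Map[String, Any] = Map("fromOffset" -> 10, "toOffset" -> 20)
    val df = kafkaDataSet.read("flights.topic", readProps)

    // Persist the consumed offset range to Zookeeper once the batch has been processed;
    // a second call without another read() only logs a warning.
    kafkaDataSet.saveCheckPoint()

    // Batch write; "parallelsPerPartition" throttles producer parallelism per the write() docstring.
    kafkaDataSet.write("flights.topic", df, Map("parallelsPerPartition" -> 10))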
- */ - -package com.paypal.gimel.kafka - -import scala.language.implicitConversions - -import org.apache.spark.streaming.StreamingContext - -import com.paypal.gimel.datastreamfactory.{GimelDataStream, StreamingResult} -import com.paypal.gimel.kafka.conf.KafkaClientConfiguration -import com.paypal.gimel.kafka.reader.KafkaStreamConsumer -import com.paypal.gimel.logger.Logger - -class DataStream(streamingContext: StreamingContext) extends GimelDataStream(streamingContext: StreamingContext) { - - // GET LOGGER - val logger = Logger() - logger.info(s"Initiated --> ${this.getClass.getName}") - - /** - * Provides DStream for a given configuration - * - * @param dataset Kafka Topic Name - * @param datasetProps Map of K->V kafka Properties - * @return Tuple2 Of - - * Dstream[GenericRecord , Its Equivalent JSON String] - * A Function That Takes (SQLContext, RDD[GenericRecord]) , and returns a DataFrame - */ - def read(dataset: String, datasetProps: Map[String, Any]): StreamingResult = { - - if (datasetProps.isEmpty) { - throw new DataStreamException("Props Map Cannot be empty for KafkaDataSet Read") - } - val conf = new KafkaClientConfiguration(datasetProps) - KafkaStreamConsumer.createDStream(streamingContext, conf) - } - -} - -/** - * Custom Exception for KafkaDataStream initiation errors - * - * @param message Message to Throw - * @param cause A Throwable Cause - */ -private class DataStreamException(message: String, cause: Throwable) - extends RuntimeException(message) { - if (cause != null) { - initCause(cause) - } - - def this(message: String) = this(message, null) -} diff --git a/gimel-dataapi/gimel-connectors/gimel-kafka-0.10/src/main/scala/com/paypal/gimel/kafka/avro/AvroToSQLSchemaConverter.scala b/gimel-dataapi/gimel-connectors/gimel-kafka-0.10/src/main/scala/com/paypal/gimel/kafka/avro/AvroToSQLSchemaConverter.scala deleted file mode 100644 index cf397e20..00000000 --- a/gimel-dataapi/gimel-connectors/gimel-kafka-0.10/src/main/scala/com/paypal/gimel/kafka/avro/AvroToSQLSchemaConverter.scala +++ /dev/null @@ -1,231 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.paypal.gimel.kafka.avro - -import java.nio.ByteBuffer -import java.util - -import scala.collection.JavaConverters._ - -import org.apache.avro.Schema -import org.apache.avro.Schema.Type._ -import org.apache.avro.generic.{GenericData, GenericRecord} -import org.apache.avro.generic.GenericData.Fixed -import org.apache.spark.sql.Row -import org.apache.spark.sql.types._ - -/** - * This looic is borrowed from databricks spark-avro-2_10.jar to aid in the conversion of avro RDD to DataFrame. 
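Similarly, a sketch of how the deleted DataStream wrapper was driven from a StreamingContext, based on its read() docstring; the batch interval and property contents are illustrative.

    import org.apache.spark.SparkConf
    import org.apache.spark.streaming.{Seconds, StreamingContext}
    import com.paypal.gimel.kafka.DataStream

    // Sketch only: per the read() docstring, the StreamingResult wraps the DStream plus a
    // function that converts RDD[GenericRecord] into a DataFrame.
    val sparkConf = new SparkConf().setAppName("kafka-dstream-sketch").setMaster("local[2]")
    val ssc = new StreamingContext(sparkConf, Seconds(30))
    val streamingResult = new DataStream(ssc).read("flights.topic", Map("illustrative.property" -> "value"))
    ssc.start()
    ssc.awaitTermination()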
- * - * https://github.com/databricks/spark-avro/blob/master/src/main/scala/com/databricks/spark/avro/SchemaConverters.scala - * - * This object contains method that are used to convert sparkSQL schemas to avro schemas and vice versa. - * - * Note that original code has been enhanced. Please ensure notes are maintained for new additions to track deviations from original code. - * - * 2017-08-19 : Added support for Set(STRING, LONG) : This enabled Reading FTPI data - */ -object AvroToSQLSchemaConverter { - - case class SchemaType(dataType: DataType, nullable: Boolean) - - /** - * This function takes an avro schema and returns a sql schema. - */ - def toSqlType(avroSchema: Schema): SchemaType = { - avroSchema.getType match { - case INT => - SchemaType(IntegerType, nullable = false) - case STRING => - SchemaType(StringType, nullable = false) - case BOOLEAN => - SchemaType(BooleanType, nullable = false) - case BYTES => - SchemaType(BinaryType, nullable = false) - case DOUBLE => - SchemaType(DoubleType, nullable = false) - case FLOAT => - SchemaType(FloatType, nullable = false) - case LONG => - SchemaType(LongType, nullable = false) - case FIXED => - SchemaType(BinaryType, nullable = false) - case ENUM => - SchemaType(StringType, nullable = false) - - case RECORD => - val fields = avroSchema.getFields.asScala.map { f => - val schemaType = toSqlType(f.schema()) - StructField(f.name, schemaType.dataType, schemaType.nullable) - } - - SchemaType(StructType(fields), nullable = false) - - case ARRAY => - val schemaType = toSqlType(avroSchema.getElementType) - SchemaType( - ArrayType(schemaType.dataType, containsNull = schemaType.nullable), - nullable = false) - - case MAP => - val schemaType = toSqlType(avroSchema.getValueType) - SchemaType( - MapType(StringType, schemaType.dataType, valueContainsNull = schemaType.nullable), - nullable = false) - - case UNION => - if (avroSchema.getTypes.asScala.exists(_.getType == NULL)) { - // In case of a union with null, eliminate it and make a recursive call - val remainingUnionTypes = avroSchema.getTypes.asScala.filterNot(_.getType == NULL).toList - if (remainingUnionTypes.size == 1) { - toSqlType(remainingUnionTypes.head).copy(nullable = true) - } else { - toSqlType(Schema.createUnion(remainingUnionTypes.asJava)).copy(nullable = true) - } - } else avroSchema.getTypes.asScala.map(_.getType) match { - case Seq(t1, t2) if Set(t1, t2) == Set(INT, LONG) => - SchemaType(LongType, nullable = false) - case Seq(t1, t2) if Set(t1, t2) == Set(FLOAT, DOUBLE) => - SchemaType(DoubleType, nullable = false) - case other => - throw new UnsupportedOperationException( - s"This mix of union types is not supported (see README): $other") - } - - case other => - throw new UnsupportedOperationException(s"Unsupported type $other") - } - } - - /** - * Returns a function that is used to convert avro types to their - * corresponding sparkSQL representations. - */ - def createConverterToSQL(schema: Schema): Any => Any = { - schema.getType match { - // Avro strings are in Utf8, so we have to call toString on them - case STRING | ENUM => - (item: Any) => if (item == null) null else item.toString - case INT | BOOLEAN | DOUBLE | FLOAT | LONG => - identity - // Byte arrays are reused by avro, so we have to make a copy of them. 
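A small worked example of what toSqlType above yields for a record containing a nullable union field, following the mapping rules in this deleted converter; the schema itself is illustrative.

    import org.apache.avro.Schema
    import com.paypal.gimel.kafka.avro.AvroToSQLSchemaConverter

    // A record with a required int and a nullable string (union with null).
    val avroJson =
      """{"type":"record","name":"flight","fields":[
        |  {"name":"id","type":"int"},
        |  {"name":"carrier","type":["null","string"]}
        |]}""".stripMargin
    val schemaType = AvroToSQLSchemaConverter.toSqlType((new Schema.Parser).parse(avroJson))

    // Per the mapping above: INT becomes IntegerType (not nullable), the ["null","string"] union
    // collapses to StringType with nullable = true, and the RECORD wraps both in a StructType.
    println(schemaType.dataType)
    // StructType(StructField(id,IntegerType,false), StructField(carrier,StringType,true))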
- case FIXED => - (item: Any) => - if (item == null) { - null - } else { - item.asInstanceOf[Fixed].bytes().clone() - } - case BYTES => - (item: Any) => - if (item == null) { - null - } else { - val bytes = item.asInstanceOf[ByteBuffer] - val javaBytes = new Array[Byte](bytes.remaining) - bytes.get(javaBytes) - javaBytes - } - case RECORD => - val fieldConverters = schema.getFields.asScala.map(f => createConverterToSQL(f.schema)) - (item: Any) => - if (item == null) { - null - } else { - val record = item.asInstanceOf[GenericRecord] - val converted = new Array[Any](fieldConverters.size) - var idx = 0 - while (idx < fieldConverters.size) { - converted(idx) = fieldConverters.apply(idx)(record.get(idx)) - idx += 1 - } - Row.fromSeq(converted.toSeq) - } - case ARRAY => - val elementConverter = createConverterToSQL(schema.getElementType) - (item: Any) => - if (item == null) { - null - } else { - item.asInstanceOf[GenericData.Array[Any]].asScala.map(elementConverter) - } - case MAP => - val valueConverter = createConverterToSQL(schema.getValueType) - (item: Any) => - if (item == null) { - null - } else { - item.asInstanceOf[util.HashMap[Any, Any]].asScala.map { case (k, v) => - (k.toString, valueConverter(v)) - }.toMap - } - case UNION => - if (schema.getTypes.asScala.exists(_.getType == NULL)) { - val remainingUnionTypes = schema.getTypes.asScala.filterNot(_.getType == NULL) - if (remainingUnionTypes.size == 1) { - createConverterToSQL(remainingUnionTypes.head) - } else { - createConverterToSQL(Schema.createUnion(remainingUnionTypes.asJava)) - } - } else schema.getTypes.asScala.map(_.getType) match { - case Seq(t1, t2) if Set(t1, t2) == Set(INT, LONG) => - (item: Any) => { - item match { - case l: Long => - l - case i: Int => - i.toLong - case null => - null - } - } - case Seq(t1, t2) if Set(t1, t2) == Set(FLOAT, DOUBLE) => - (item: Any) => { - item match { - case d: Double => - d - case f: Float => - f.toDouble - case null => - null - } - } - case Seq(t1, t2) if Set(t1, t2) == Set(STRING, LONG) => - (item: Any) => { - // @todo This fix is pending as currently we are unable to convert Avro to Spark types for this combination (STRING, LONG). Wip ! - item match { - case l: Long => - l - case js: org.apache.avro.util.Utf8 => - js.toString - case null => - null - } - } - case other => - throw new UnsupportedOperationException( - s"This mix of union types is not supported (see README): $other") - } - case other => - throw new UnsupportedOperationException(s"invalid avro type: $other") - } - } - -} diff --git a/gimel-dataapi/gimel-connectors/gimel-kafka-0.10/src/main/scala/com/paypal/gimel/kafka/avro/SparkAvroUtilities.scala b/gimel-dataapi/gimel-connectors/gimel-kafka-0.10/src/main/scala/com/paypal/gimel/kafka/avro/SparkAvroUtilities.scala deleted file mode 100644 index 7a50fbfb..00000000 --- a/gimel-dataapi/gimel-connectors/gimel-kafka-0.10/src/main/scala/com/paypal/gimel/kafka/avro/SparkAvroUtilities.scala +++ /dev/null @@ -1,326 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.paypal.gimel.kafka.avro - -import java.io.{ByteArrayInputStream, ByteArrayOutputStream} - -import io.confluent.kafka.schemaregistry.client.rest.RestService -import org.apache.avro.{specific, Schema} -import org.apache.avro.generic.{GenericData, GenericRecord} -import org.apache.avro.io.{DecoderFactory, EncoderFactory} -import org.apache.avro.specific.SpecificDatumWriter -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{DataFrame, Row, SQLContext} -import org.apache.spark.sql.types.StructType -import scala.collection.JavaConverters._ -import spray.json._ -import spray.json.DefaultJsonProtocol._ - -import com.paypal.gimel.kafka.conf.KafkaClientConfiguration -import com.paypal.gimel.logger.Logger - -/** - * Avro - Spark Conversion operations are implemented here - */ - -object SparkAvroUtilities { - - val logger = Logger() - - /** - * Converts a DataFrame into RDD[Avro Generic Record] - * - * @param dataFrame DataFrame - * @param avroSchemaString Avro Schema String - * @return RDD[GenericRecord] - */ - - def dataFrametoGenericRecord(dataFrame: DataFrame, avroSchemaString: String): RDD[GenericRecord] = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - - logger.info(" @Begin --> " + MethodName) - - try { - if (!isDFFieldsEqualAvroFields(dataFrame, avroSchemaString)) { - throw new SparkAvroConversionException(s"Incompatible DataFrame Schema Vs Provided Avro Schema.") - } - dataFrame.rdd.map { row => - val avroSchema = (new Schema.Parser).parse(avroSchemaString) - val fields = avroSchema.getFields.asScala.map { x => x.name() }.toArray - val cols: Map[String, Any] = row.getValuesMap(fields) - val genericRecord: GenericRecord = new GenericData.Record(avroSchema) - cols.foreach(x => genericRecord.put(x._1, x._2)) - genericRecord - } - } catch { - case ex: Throwable => - ex.printStackTrace() - throw new SparkAvroConversionException("Failed while converting DataFrame to Generic Record") - } - } - - /** - * Converts an RDD[Avro GenericRecord] into a DataFrame - * - * @param sqlContext SQLContext - * @param genericRecRDD RDD[GenericRecord] - * @param schemaString The AVRO schema String - * @return DataFrame - */ - def genericRecordtoDF(sqlContext: SQLContext, genericRecRDD: RDD[GenericRecord], schemaString: String): DataFrame = { - - genericRecordToDFViaAvroSQLConvertor(sqlContext, genericRecRDD, schemaString) - } - - /** - * Converts an RDD[Avro GenericRecord] into a DataFrame - * - * @param sqlContext SQLContext - * @param genericRecRDD RDD[GenericRecord] - * @param schemaString The AVRO schema String - * @return DataFrame - */ - - def genericRecordToDFViaAvroSQLConvertor(sqlContext: SQLContext, genericRecRDD: RDD[GenericRecord], schemaString: String): DataFrame = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - - logger.info(" @Begin --> " + MethodName) - import com.databricks.spark.avro.SchemaConverters._ - try { - val rowRDD: RDD[Row] = genericRecRDD.map { x => - val avroSchema: Schema = (new Schema.Parser).parse(schemaString) - val converter = 
AvroToSQLSchemaConverter.createConverterToSQL(avroSchema) - converter(x).asInstanceOf[Row] - } - val avroSchema: Schema = (new Schema.Parser).parse(schemaString) - val schemaType = toSqlType(avroSchema) - sqlContext.createDataFrame(rowRDD, schemaType.dataType.asInstanceOf[StructType]) - } catch { - case ex: Throwable => - ex.printStackTrace() - throw new SparkAvroConversionException("Failed while converting Generic Record to DataFrame") - } - } - - /** - * Compare Fields of Avro Schema with Fields of DataFrame - * Return true if both match false if there is any mismatch - * Also log/print the differences. - * - * @param dataFrame DataFrame - * @param avroSchemaString Avro Schema String - * @return Boolean - */ - def isDFFieldsEqualAvroFields(dataFrame: DataFrame, avroSchemaString: String): Boolean = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - - logger.info(" @Begin --> " + MethodName) - - try { - val dfFields = dataFrame.schema.fieldNames - val avroSchema = (new Schema.Parser).parse(avroSchemaString) - val avroFields = avroSchema.getFields.asScala.map { x => x.name() }.toArray - val inDFMissingInAvro = dfFields.diff(avroFields) - val inAvroMissingInDF = avroFields.diff(dfFields) - val isMatching = inDFMissingInAvro.isEmpty && inAvroMissingInDF.isEmpty - if (!isMatching) { - val warningMessage = - s""" - |Provided Avro Fields --> ${avroFields.mkString(",")} - |Determined DataFrame Fields --> ${dfFields.mkString(",")} - |Missing Fields in Avro --> ${inDFMissingInAvro.mkString(",")} - |Missing Fields in DataFrame --> ${inAvroMissingInDF.mkString(",")} - """.stripMargin - logger.warning(warningMessage) - } - isMatching - } catch { - case ex: Throwable => - ex.printStackTrace() - throw new SparkAvroConversionException(s"Failed While Comparing DF Fields match against Fields in Avro Schema String $avroSchemaString") - } - - } - - /** - * Gets the fields from a Avro Schema String - * - * @param avroSchema Avro Schema String - * @return Fields - */ - def getFieldsFromAvroSchemaString(avroSchema: String): Seq[String] = { - val schemaAsJsVal = avroSchema.parseJson // parse as JsValue - val schemaAsJsObject = schemaAsJsVal.asJsObject // Convert to JsObject - val schemaFields = schemaAsJsObject.getFields("fields").head.convertTo[Seq[JsValue]] - val existingFields = schemaFields.map { x => x.asJsObject.fields("name").toString().replace("\"", "") } - existingFields - } - - /** - * DeSerialize an Avro Generic Record - * - * @param serializedBytes A Serialized Byte Array (serialization should have been done through Avro Serialization) - * @param schemaString An Avro Schema String - * @return An Avro Generic Record - */ - - def bytesToGenericRecord(serializedBytes: Array[Byte], schemaString: String): GenericRecord = { - - try { - // Build Avro Schema From String - val avroSchema = (new Schema.Parser).parse(schemaString) - // Initiate AVRO Reader from Factory - val reader = new specific.SpecificDatumReader[GenericRecord](avroSchema) - // Initiate a new Java Byte Array Input Stream - val in = new ByteArrayInputStream(serializedBytes) - // Get appropriate AVRO Decoder from Factory - val decoder = DecoderFactory.get().binaryDecoder(in, null) - // Get AVRO generic record - val genericRecordRead = reader.read(null, decoder) - genericRecordRead - } catch { - case ex: Throwable => - ex.printStackTrace() - throw ex - } - } - - /** - * Copies to a new generic record - * - * @param genericRecord Input Generic Record - * @param avroSchemaString Avro Schema that can be used to 
parse input Generic Record - * @param newAvroString New Avro Schema for the Outgoing Generic Record - * @return Outgoing Generic Record copied from Input - */ - def copyToGenericRecord(genericRecord: GenericRecord, avroSchemaString: String, newAvroString: String): GenericRecord = { - val existingFields = getFieldsFromAvroSchemaString(avroSchemaString) - val newAvroSchema = (new Schema.Parser).parse(newAvroString) - val newGenericRec: GenericRecord = new GenericData.Record(newAvroSchema) - existingFields.foreach(field => newGenericRec.put(field, genericRecord.get(field))) - newGenericRec - } - - /** - * A Functionality to Perform 2nd level De Serialization in case the data is from CDH - * This is necessary since Actual Data in CDH is wrapped by a Raw Record which get Deserialized when read from Kafka - * When this functionality is called, we check if the data is CDH type, then do second level deserialization - * If the data is not of CDH type, then we skip 2nd level deserialization - * - * @param avroRecordRDD RDD[GenericRecord] - * @param conf KafkaClientConfiguration - * @return RDD[GenericRecord] - */ - def deserializeCurrentRecord(avroRecordRDD: RDD[GenericRecord], conf: KafkaClientConfiguration): RDD[GenericRecord] = { - val schemaRegistryClient = new RestService(conf.avroSchemaURL) - val schemaLookup: scala.collection.mutable.Map[Int, String] = scala.collection.mutable.Map() - val actualRecord = avroRecordRDD.map { eachRecord => - val eachRecordSchemaVersion: Int = eachRecord.get("schemaVersion").toString.toInt - val schemaForThisRecord = schemaLookup.get(eachRecordSchemaVersion) match { - case None => - val schema = schemaRegistryClient.getVersion(conf.avroSchemaKey, eachRecordSchemaVersion).getSchema - schemaLookup.put(eachRecordSchemaVersion, schema) - schema - case Some(x) => - x - } - - val eachRecordBytes: Array[Byte] = eachRecord.get("currentRecord").asInstanceOf[Array[Byte]] - bytesToGenericRecord(eachRecordBytes, schemaForThisRecord) - } - actualRecord - } - - /** - * Serialize Avro GenericRecord into Byte Array - * - * @param rec An Avro Generic Record - * @param schemaString An Avro Schema String - * @return Serialized Byte Array - */ - - def genericRecordToBytes(rec: GenericRecord, schemaString: String): Array[Byte] = { - - try { - // Build Avro Schema From String - val avroSchema = (new Schema.Parser).parse(schemaString) - // Initiate a new Java Byte Array Output Stream - val out = new ByteArrayOutputStream() - // Get appropriate AVRO Decoder from Factory - val encoder = EncoderFactory.get().binaryEncoder(out, null) - // Write the Encoded data's output (Byte Array) into the Output Stream - // Initiate AVRO Writer from Factory - val writer = new SpecificDatumWriter[GenericRecord](avroSchema) - writer.write(rec, encoder) - // Flushes Data to Actual Output Stream - encoder.flush() - // Close the Output Stream - out.close() - val serializedBytes: Array[Byte] = out.toByteArray - serializedBytes - } catch { - case ex: Throwable => - ex.printStackTrace() - throw ex - } - } - - /** - * Converts an RDD[Avro GenericRecord] into a DataFrame - * - * @param sqlContext SQLContext - * @param genericRecRDD RDD[GenericRecord] - * @param schemaString The AVRO schema String - * @return DataFrame - */ - def genericRecordToDataFrameViaJSON(sqlContext: SQLContext, genericRecRDD: RDD[GenericRecord], schemaString: String): DataFrame = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - - logger.info(" @Begin --> " + MethodName) - - try { - val avroSchema: Schema = 
(new Schema.Parser).parse(schemaString) - val fields: Seq[String] = avroSchema.getFields.asScala.map { x => x.name() }.toArray.toSeq - sqlContext.read.json(genericRecRDD.map(_.toString)).selectExpr(fields: _*) - } catch { - case ex: Throwable => - ex.printStackTrace() - throw new SparkAvroConversionException("Failed while converting Generic Record to DataFrame") - } - } - - /** - * Custom Exception - * - * @param message Message to Throw - * @param cause A Throwable Cause - */ - private class SparkAvroConversionException(message: String, cause: Throwable) - extends RuntimeException(message) { - if (cause != null) { - initCause(cause) - } - - def this(message: String) = this(message, null) - } - -} - - diff --git a/gimel-dataapi/gimel-connectors/gimel-kafka-0.10/src/main/scala/com/paypal/gimel/kafka/conf/KafkaClientConfiguration.scala b/gimel-dataapi/gimel-connectors/gimel-kafka-0.10/src/main/scala/com/paypal/gimel/kafka/conf/KafkaClientConfiguration.scala deleted file mode 100644 index 41f472b7..00000000 --- a/gimel-dataapi/gimel-connectors/gimel-kafka-0.10/src/main/scala/com/paypal/gimel/kafka/conf/KafkaClientConfiguration.scala +++ /dev/null @@ -1,188 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.paypal.gimel.kafka.conf - -import java.util.Properties - -import scala.collection.JavaConverters._ -import scala.collection.immutable.Map -import scala.language.implicitConversions - -import io.confluent.kafka.schemaregistry.client.rest.RestService - -import com.paypal.gimel.common.catalog.DataSetProperties -import com.paypal.gimel.common.conf.{CatalogProviderConstants, GimelConstants, GimelProperties} -import com.paypal.gimel.common.schema.SchemaRegistryLookUp -import com.paypal.gimel.logger.Logger - -/** - * Gimel Client Configuration for Kafka Dataset Operations. - * - * @param props Kafka Client properties. 
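bytesToGenericRecord and genericRecordToBytes above form an Avro binary encode/decode pair; a round-trip sketch under an illustrative one-field schema:

    import org.apache.avro.Schema
    import org.apache.avro.generic.{GenericData, GenericRecord}
    import com.paypal.gimel.kafka.avro.SparkAvroUtilities

    val schemaString = """{"type":"record","name":"flight","fields":[{"name":"id","type":"int"}]}"""
    val schema = (new Schema.Parser).parse(schemaString)

    // Encode a record to Avro binary and decode it back using the two helpers above.
    val record: GenericRecord = new GenericData.Record(schema)
    record.put("id", 42)
    val bytes: Array[Byte] = SparkAvroUtilities.genericRecordToBytes(record, schemaString)
    val decoded: GenericRecord = SparkAvroUtilities.bytesToGenericRecord(bytes, schemaString)
    assert(decoded.get("id") == 42)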
- */ -class KafkaClientConfiguration(val props: Map[String, Any]) { - - private val logger = Logger() - logger.info(s"Begin Building --> ${this.getClass.getName}") - // logger.info(s"Incoming Properties --> ${props.map(x => s"${x._1} -> ${x._2}").mkString("\n")}") - - // Load Default Prop from Resource File - val pcatProps = GimelProperties() - - // appTag is used to maintain checkpoints & various other factors that are unique to the application - val appTag: String = props.getOrElse(GimelConstants.APP_TAG, "").toString - - // This is the DataSet Properties - val datasetProps: DataSetProperties = props(GimelConstants.DATASET_PROPS).asInstanceOf[DataSetProperties] - val tableProps: Map[String, String] = datasetProps.props - val hiveDBName = tableProps.getOrElse(CatalogProviderConstants.PROPS_NAMESPACE, GimelConstants.PCATALOG_STRING) - val hiveTableName = tableProps(CatalogProviderConstants.DATASET_PROPS_DATASET) - val clusterName = props.getOrElse(KafkaConstants.cluster, "unknown") - - logger.info(s"Hive Table Props --> ${tableProps.map(x => s"${x._1} --> ${x._2}").mkString("\n")}") - - // Schema Source either comes from Table "INLINE" (as a property) or from confluent Schema Registry if its = "CDH" or "CSR" - val avroSchemaSource: String = tableProps.getOrElse(KafkaConfigs.avroSchemaSource, KafkaConstants.gimelKafkaAvroSchemaInline) - val avroSchemaURL: String = tableProps.getOrElse(KafkaConfigs.avroSchemaSourceUrl, pcatProps.confluentSchemaURL) - val avroSchemaWrapperKey: String = tableProps.getOrElse(KafkaConfigs.avroSchemaSourceWrapperKey, pcatProps.kafkaAvroSchemaKey) - val avroSchemaKey: String = tableProps.getOrElse(KafkaConfigs.avroSchemaSourceKey, "") - val (avroSchemaString, cdhTopicSchemaMetadata, cdhAllSchemaDetails) = - avroSchemaSource.toUpperCase() match { - case KafkaConstants.gimelKafkaAvroSchemaCDH => - val schemaRegistryClient = new RestService(avroSchemaURL) - val allSchemas = SchemaRegistryLookUp.getAllSubjectAndSchema(avroSchemaURL) - (schemaRegistryClient.getLatestVersion(avroSchemaWrapperKey).getSchema, - Some(allSchemas(avroSchemaKey)._1), - Some(allSchemas) - ) - case KafkaConstants.gimeKafkaAvroSchemaCSR => - val schemaRegistryClient = new RestService(avroSchemaURL) - (schemaRegistryClient.getLatestVersion(avroSchemaWrapperKey).getSchema, - None, - None - ) - case KafkaConstants.gimelKafkaAvroSchemaInline => - (tableProps.getOrElse(KafkaConfigs.avroSchemaStringKey, ""), None, None) - case _ => - throw new Exception(s"Unsupported Schema Source Supplied --> $avroSchemaSource") - } - - // Kafka Props - val randomId: String = scala.util.Random.nextInt.toString - val kafkaHostsAndPort: String = tableProps.getOrElse(KafkaConfigs.kafkaServerKey, pcatProps.kafkaBroker) - val KafkaConsumerGroupID: String = props.getOrElse(KafkaConfigs.kafkaGroupIdKey, tableProps.getOrElse(KafkaConfigs.kafkaGroupIdKey, randomId)).toString - val kafkaConsumerID: String = props.getOrElse(KafkaConfigs.consumerId, tableProps.getOrElse(KafkaConfigs.consumerId, appTag)).toString.replaceAllLiterally("/", "_").replaceAllLiterally(":", "_") - val kafkaZKTimeOutMilliSec: String = tableProps.getOrElse(KafkaConfigs.zookeeperConnectionTimeoutKey, 10000.toString) - val kafkaAutoOffsetReset: String = tableProps.getOrElse(KafkaConfigs.offsetResetKey, "smallest") - val kafkaCustomOffsetRange: String = tableProps.getOrElse(KafkaConfigs.customOffsetRange, "") - val consumerModeBatch: String = tableProps.getOrElse(KafkaConstants.gimelAuditRunTypeBatch, "BATCH") - val consumerModeStream: String = 
tableProps.getOrElse(KafkaConstants.gimelAuditRunTypeStream, "STREAM") - val kafkaTopics: String = tableProps.getOrElse(KafkaConfigs.whiteListTopicsKey, "") - - // Kafka Serde - val kafkaKeySerializer: String = tableProps.getOrElse(KafkaConfigs.serializerKey, KafkaConfigs.kafkaStringSerializer) - val kafkaValueSerializer: String = tableProps.getOrElse(KafkaConfigs.serializerValue, KafkaConfigs.kafkaByteSerializer) - val kafkaKeyDeSerializer: String = tableProps.getOrElse(KafkaConfigs.deSerializerKey, KafkaConfigs.kafkaStringDeSerializer) - val kafkaValueDeSerializer: String = tableProps.getOrElse(KafkaConfigs.deSerializerValue, KafkaConfigs.kafkaByteDeSerializer) - - // Kafka Message Value Type --> String, Byte, Avro, JSON - val kafkaMessageValueType: Option[String] = tableProps.get(KafkaConfigs.kafkaMessageValueType) - - // Zookeeper Details - val zkHostAndPort: String = tableProps.getOrElse(KafkaConfigs.zookeeperCheckpointHost, pcatProps.zkHostAndPort) - if (pcatProps.kafkaConsumerCheckPointRoot == "") throw new Exception("Root CheckPoint Path for ZK cannot be Empty") - if (appTag == "") throw new Exception("appTag cannot be Empty") - if (kafkaTopics == "") throw new Exception("kafkaTopics cannot be Empty") - val zkCheckPoints: Seq[String] = kafkaTopics.split(",").map{ kafkaTopic => - tableProps.getOrElse(KafkaConfigs.zookeeperCheckpointPath, pcatProps.kafkaConsumerCheckPointRoot) + "/" + appTag + "/" + kafkaTopic - } - - // Kafka Monitoring for PayPal - /* - val kafkaMetricsReporter = props.getOrElse(KafkaConfigs.paypalMetricsReporterKey, KafkaConfigs.paypalMetricsReporterValue).toString - val kafkaMonitoringCluster = props.getOrElse(KafkaConfigs.paypalKafkaClusterKey, "unknown").toString - val kafkaMonitoringColo = props.getOrElse(KafkaConfigs.paypalMonitoringColoKey, "unknown").toString - val kafkaMonitoringPoolDefault = kafkaConsumerID - val kafkaMonitoringPool = "Gimel-" + props.getOrElse(KafkaConfigs.paypalMonitoringPoolKey, kafkaMonitoringPoolDefault).toString - val kafkaInterceptorClasses = props.getOrElse(KafkaConfigs.paypalInterceptorClassesKey, KafkaConfigs.paypalInterceptorClassName).toString - val kafkaMetricsSamplingWindowMilliSec = props.getOrElse(KafkaConfigs.paypalMetricsSamplingMilliSecKey, "6000").toString -*/ - val clientProps = scala.collection.immutable.Map( - KafkaConfigs.kafkaServerKey -> kafkaHostsAndPort - , KafkaConfigs.kafkaGroupIdKey -> s"${KafkaConsumerGroupID}" - , KafkaConfigs.kafkaClientIdKey -> s"${scala.util.Random.nextInt.toString}_${kafkaConsumerID}".takeRight(128) - ) - -// val ppKafkaListnerProps = scala.collection.immutable.Map( -// KafkaConfigs.paypalMetricsReporterKey -> kafkaMetricsReporter -// , KafkaConfigs.paypalKafkaClusterKey -> kafkaMonitoringCluster -// , KafkaConfigs.paypalMonitoringColoKey -> kafkaMonitoringColo -// , KafkaConfigs.paypalMonitoringPoolKey -> kafkaMonitoringPool -// , KafkaConfigs.paypalInterceptorClassesKey -> kafkaInterceptorClasses -// , KafkaConfigs.paypalMetricsSamplingMilliSecKey -> kafkaMetricsSamplingWindowMilliSec -// ) - - // Explicitly Making a Map of Properties that are necessary to Connect to Kafka for Subscribes (Reads) - val kafkaConsumerProps: Map[String, String] = scala.collection.immutable.Map(KafkaConfigs.kafkaServerKey -> kafkaHostsAndPort - , KafkaConfigs.kafkaGroupIdKey -> KafkaConsumerGroupID - , KafkaConfigs.zookeeperConnectionTimeoutKey -> kafkaZKTimeOutMilliSec - , KafkaConfigs.offsetResetKey -> kafkaAutoOffsetReset - , KafkaConfigs.kafkaTopicKey -> kafkaTopics - , KafkaConfigs.serializerKey -> 
kafkaKeySerializer - , KafkaConfigs.serializerValue -> kafkaValueSerializer - , KafkaConfigs.deSerializerKey -> kafkaKeyDeSerializer - , KafkaConfigs.deSerializerValue -> kafkaValueDeSerializer - ) ++ clientProps - - logger.info(s"KafkaConsumerProps --> ${kafkaConsumerProps.mkString("\n")}") - - // Explicitly Making a Map of Properties that are necessary to Connect to Kafka for Publishes (Writes) - val kafkaProducerProps: Properties = new java.util.Properties() - val producerProps = scala.collection.immutable.Map(KafkaConfigs.kafkaServerKey -> kafkaHostsAndPort - , KafkaConfigs.serializerKey -> kafkaKeySerializer - , KafkaConfigs.serializerValue -> kafkaValueSerializer - , KafkaConfigs.kafkaTopicKey -> kafkaTopics) - producerProps.foreach { kvPair => kafkaProducerProps.put(kvPair._1.toString, kvPair._2.toString) } - - logger.info(s"kafkaProducerProps --> ${kafkaProducerProps.asScala.mkString("\n")}") - - // These are key throttling factors for Improved Performance in Batch Mode - val maxRecsPerPartition: Long = props.getOrElse(KafkaConfigs.maxRecordsPerPartition, 2500000).toString.toLong - val parallelsPerPartition: Int = props.getOrElse(KafkaConfigs.batchFetchSizeTemp, 250).toString.toInt - val minRowsPerParallel: Long = props.getOrElse(KafkaConfigs.minRowsPerParallelKey, 100000).toString.toLong - val fetchRowsOnFirstRun: Long = props.getOrElse(KafkaConfigs.rowCountOnFirstRunKey, 2500000).toString.toLong - val targetCoalesceFactor: Int = props.getOrElse(KafkaConfigs.targetCoalesceFactorKey, 1).toString.toInt - - // These are key throttling factors for Improved Performance in Streaming Mode - val maxRatePerPartition: String = props.getOrElse(KafkaConfigs.maxRatePerPartitionKey, 3600).toString - val streamParallelismFactor: Int = props.getOrElse(KafkaConfigs.streamParallelKey, 10).toString.toInt - val isStreamParallel: Boolean = props.getOrElse(KafkaConfigs.isStreamParallelKey, "true").toString.toBoolean - - // Resolve fields for empty kafka topic property - val fieldsBindToJSONString = tableProps.getOrElse(GimelConstants.FIELDS_BIND_TO_JSON, "") - - // Additional CDH Metadata Fields @todo this is not used in the code yet, KafkaUtilities implements this inside - this must superceed everywhere. - val additionalCDHFields = scala.collection.Map("gg_commit_timestamp" -> "opTs", "opt_type" -> "opType", "trail_seq_no" -> "trailSeqno", "trail_rba" -> "trailRba") - - logger.info(s"Fields Initiated --> ${this.getClass.getFields.map(f => s"${f.getName} --> ${f.get().toString}").mkString("\n")}") - logger.info(s"Completed Building --> ${this.getClass.getName}") - -} - diff --git a/gimel-dataapi/gimel-connectors/gimel-kafka-0.10/src/main/scala/com/paypal/gimel/kafka/conf/KafkaConfigs.scala b/gimel-dataapi/gimel-connectors/gimel-kafka-0.10/src/main/scala/com/paypal/gimel/kafka/conf/KafkaConfigs.scala deleted file mode 100644 index c47cfc5b..00000000 --- a/gimel-dataapi/gimel-connectors/gimel-kafka-0.10/src/main/scala/com/paypal/gimel/kafka/conf/KafkaConfigs.scala +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
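Most values in the deleted KafkaClientConfiguration above resolve with the same precedence: runtime props first, then the catalog's table properties, then a default. A condensed sketch of that lookup pattern, with an illustrative key and default:

    // Resolution order used by most fields in KafkaClientConfiguration:
    //   1. props passed to the read/write call
    //   2. tableProps from the catalog (DataSetProperties.props)
    //   3. a default, either hard-coded or taken from GimelProperties
    def resolve(key: String,
                props: Map[String, Any],
                tableProps: Map[String, String],
                default: String): String =
      props.getOrElse(key, tableProps.getOrElse(key, default)).toString

    // Example: the consumer group id falls back to a random id when neither map supplies one.
    val groupId = resolve("group.id", Map.empty, Map.empty, scala.util.Random.nextInt.toString)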
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.paypal.gimel.kafka.conf - -object KafkaConfigs { - - // kafka properties - val kafkaServerKey: String = "bootstrap.servers" - val kafkaGroupIdKey: String = "group.id" - val kafkaClientIdKey: String = "client.id" - val paypalMetricsReporterKey: String = "metric.reporters" - val paypalKafkaClusterKey: String = "kafka.monitoring.cluster" - val paypalMonitoringColoKey: String = "kafka.monitoring.colo" - val paypalMonitoringPoolKey: String = "kafka.monitoring.pool" - val paypalInterceptorClassesKey: String = "interceptor.classes" - val paypalMetricsSamplingMilliSecKey: String = "metrics.sample.window.ms" - val zookeeperConnectionTimeoutKey: String = "zookeeper.connection.timeout.ms" - val offsetResetKey: String = "auto.offset.reset" - val kafkaTopicKey: String = "kafka.topic" - val serializerKey: String = "key.serializer" - val serializerValue: String = "value.serializer" - val deSerializerKey: String = "key.deserializer" - val deSerializerValue: String = "value.deserializer" - val consumerId: String = "consumer.id" - // misc properties for read/write - val rowCountOnFirstRunKey: String = s"gimel.kafka.throttle.batch.fetchRowsOnFirstRun" - val targetCoalesceFactorKey: String = "gimel.kafka.throttle.batch.targetCoalesceFactor" - val minRowsPerParallelKey: String = s"gimel.kafka.throttle.batch.minRowsPerParallel" - val batchFetchSize: String = s"gimel.kafka.throttle.batch.parallelsPerPartition" - val maxRecordsPerPartition: String = s"gimel.kafka.throttle.batch.maxRecordsPerPartition" - val batchFetchSizeTemp: String = s"gimel.kafka.throttle.batch.parallelsPerPartition" - val messageColumnAliasKey: String = "gimel.kafka.message.column.alias" - val avroSchemaStringKey: String = "gimel.kafka.avro.schema.string" - val kafkaMessageValueType: String = "gimel.kafka.message.value.type" - // metastore properties - val zookeeperCheckpointHost: String = "gimel.kafka.checkpoint.zookeeper.host" - val zookeeperCheckpointPath: String = "gimel.kafka.checkpoint.zookeeper.path" - val avroSchemaSource: String = "gimel.kafka.avro.schema.source" - val avroSchemaSourceUrl: String = s"${avroSchemaSource}.url" - val avroSchemaSourceWrapperKey: String = s"${avroSchemaSource}.wrapper.key" - val avroSchemaSourceKey: String = s"${avroSchemaSource}.key" - val whiteListTopicsKey: String = "gimel.kafka.whitelist.topics" - // streaming properties - val defaultBatchInterval: String = "gimel.kafka.throttle.streaming.window.seconds" - val maxRatePerPartitionKey: String = "gimel.kafka.throttle.streaming.maxRatePerPartition" - val streamMaxRatePerPartitionKey: String = "gimel.kafka.spark.streaming.kafka.maxRatePerPartition" - val streamParallelKey: String = "gimel.kafka.throttle.streaming.parallelism.factor" - val isStreamParallelKey: String = "gimel.kafka.throttle.streaming.isParallel" - val isBackPressureEnabledKey: String = "gimel.kafka.spark.streaming.backpressure.enabled" - val streamaWaitTerminationOrTimeoutKey: String = "gimel.kafka.streaming.awaitTerminationOrTimeout" - val isStreamBatchSwitchEnabledKey: String = "gimel.kafka.stream.batch.switch.enabled" - val failStreamThresholdKey: String 
= "gimel.kafka.fail.stream.threshold.message.per.second" - val streamCutOffThresholdKey: String = "gimel.kafka.batch.to.stream.cutoff.threshold" - val streamFailureThresholdPerSecondKey: String = "gimel.kafka.fail.stream.threshold.message.per.second" - val streamFailureWindowFactorKey: String = "gimel.kafka.fail.stream.window.factor" - val kafkaConsumerReadCheckpointKey: String = "gimel.kafka.reader.checkpoint.save" - val kafkaConsumerClearCheckpointKey: String = "gimel.kafka.reader.checkpoint.clear" - val customOffsetRange: String = "gimel.kafka.custom.offset.range" - // default packages used in Kafka read/write API - val paypalMetricsReporterValue: String = "com.paypal.kafka.reporters.KafkaClientMetricsReporter" - val paypalInterceptorClassName: String = "com.paypal.kafka.clients.interceptors.MonitoringConsumerInterceptor" - val kafkaStorageHandler: String = "org.apache.hadoop.hive.kafka.KafkaStorageHandler" - val kafkaStringSerializer: String = "org.apache.kafka.common.serialization.StringSerializer" - val kafkaByteSerializer: String = "org.apache.kafka.common.serialization.ByteArraySerializer" - val kafkaStringDeSerializer: String = "org.apache.kafka.common.serialization.StringDeserializer" - val kafkaByteDeSerializer: String = "org.apache.kafka.common.serialization.ByteArrayDeserializer" -} - diff --git a/gimel-dataapi/gimel-connectors/gimel-kafka-0.10/src/main/scala/com/paypal/gimel/kafka/conf/KafkaConstants.scala b/gimel-dataapi/gimel-connectors/gimel-kafka-0.10/src/main/scala/com/paypal/gimel/kafka/conf/KafkaConstants.scala deleted file mode 100644 index bbd1f8f3..00000000 --- a/gimel-dataapi/gimel-connectors/gimel-kafka-0.10/src/main/scala/com/paypal/gimel/kafka/conf/KafkaConstants.scala +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.paypal.gimel.kafka.conf - -object KafkaConstants { - // basic variable references - val gimelKafkaAvroSchemaCDH = "CDH" - val gimeKafkaAvroSchemaCSR = "CSR" - val gimelKafkaAvroSchemaInline = "INLINE" - val gimelAuditRunTypeBatch = "BATCH" - val gimelAuditRunTypeStream = "STREAM" - val gimelAuditRunTypeIntelligent = "INTELLIGENT" - val cluster = "cluster" - // polling properties - val unknownContainerName = "unknown" - val kafkaAllTopics = "All" - val targetDb = "pcatalog" - val generateDdlKey = "generate_ddl_for" - val targetDbkey = "target_db" - val avroToHiveTypes = Map( - "null" -> "void", - "boolean" -> "boolean", - "int" -> "int", - "long" -> "bigint", - "float" -> "float", - "double" -> "double", - "bytes" -> "binary", - "string" -> "string", - "record" -> "struct", - "map" -> "map", - "list" -> "array", - "union" -> "union", - "enum" -> "string", - "fixed" -> "binary") - // STRUCTURED STREAMING SPARK CONSTANTS - val KAFKA_FORMAT: String = "org.apache.spark.sql.kafka010.KafkaSourceProvider" - val KAFKA_BOOTSTRAP_SERVERS: String = "kafka.bootstrap.servers" - val KAFKA_SUBSCRIBE: String = "subscribe" - val KAFKA_START_OFFSETS: String = "startingOffsets" - val KAFKA_END_OFFSETS: String = "endingOffsets" - val STREAM_FAIL_ON_DATA_LOSS: String = "failOnDataLoss" - val KAFKA_POLL_TIMEOUT: String = "kafkaConsumer.pollTimeoutMs" - val KAFKA_FETCH_RETRIES: String = "fetchOffset.numRetries" - val KAFKA_RETRY_INTERVAL: String = "fetchOffset.retryIntervalMs" - val earliestOffset: String = "earliest" -} - diff --git a/gimel-dataapi/gimel-connectors/gimel-kafka-0.10/src/main/scala/com/paypal/gimel/kafka/conf/KafkaJsonProtocol.scala b/gimel-dataapi/gimel-connectors/gimel-kafka-0.10/src/main/scala/com/paypal/gimel/kafka/conf/KafkaJsonProtocol.scala deleted file mode 100644 index 200de228..00000000 --- a/gimel-dataapi/gimel-connectors/gimel-kafka-0.10/src/main/scala/com/paypal/gimel/kafka/conf/KafkaJsonProtocol.scala +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
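The avroToHiveTypes map above is what DDL generation leans on; a sketch of turning an Avro record's top-level fields into Hive column definitions with it. The helper itself is hypothetical, only the map comes from the deleted file.

    import scala.collection.JavaConverters._
    import org.apache.avro.Schema
    import com.paypal.gimel.kafka.conf.KafkaConstants

    // Hypothetical helper: map each top-level Avro field to "<name> <hive type>", falling
    // back to "string" for anything avroToHiveTypes does not cover.
    def hiveColumns(avroSchemaString: String): Seq[String] = {
      val schema = (new Schema.Parser).parse(avroSchemaString)
      schema.getFields.asScala.map { field =>
        val hiveType = KafkaConstants.avroToHiveTypes.getOrElse(field.schema().getType.getName, "string")
        s"${field.name()} $hiveType"
      }
    }

    // hiveColumns("""{"type":"record","name":"f","fields":[{"name":"id","type":"long"}]}""")
    // returns Seq("id bigint")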
- */ - -package com.paypal.gimel.kafka.conf - -import spray.json.{DefaultJsonProtocol, RootJsonFormat} - -import com.paypal.gimel.kafka.utilities.{OffsetProperties, OffsetRangeProperties} - - -object KafkaJsonProtocol extends DefaultJsonProtocol { - implicit val offsetRangePropertiesFormat: RootJsonFormat[OffsetRangeProperties] = jsonFormat3(OffsetRangeProperties) - implicit val offsetPropertiesFormat: RootJsonFormat[OffsetProperties] = jsonFormat2(OffsetProperties) -} diff --git a/gimel-dataapi/gimel-connectors/gimel-kafka-0.10/src/main/scala/com/paypal/gimel/kafka/reader/KafkaBatchConsumer.scala b/gimel-dataapi/gimel-connectors/gimel-kafka-0.10/src/main/scala/com/paypal/gimel/kafka/reader/KafkaBatchConsumer.scala deleted file mode 100644 index 8144311e..00000000 --- a/gimel-dataapi/gimel-connectors/gimel-kafka-0.10/src/main/scala/com/paypal/gimel/kafka/reader/KafkaBatchConsumer.scala +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.paypal.gimel.kafka.reader - -import scala.collection.immutable.Map -import scala.language.implicitConversions - -import org.apache.spark.sql.{DataFrame, SparkSession} -import org.apache.spark.streaming.kafka010.OffsetRange - -import com.paypal.gimel.common.conf.GimelConstants -import com.paypal.gimel.common.utilities.BindToFieldsUtils._ -import com.paypal.gimel.kafka.conf.{KafkaClientConfiguration, KafkaConstants} -import com.paypal.gimel.kafka.utilities.{BrokersAndTopic, KafkaUtilitiesException} -import com.paypal.gimel.kafka.utilities.ImplicitKafkaConverters._ -import com.paypal.gimel.kafka.utilities.KafkaUtilities._ - -/** - * Implements Kafka Consumer Batch Here - */ -object KafkaBatchConsumer { - - val logger = com.paypal.gimel.logger.Logger() - - - /** - * Connects to Kafka, Deserializes data from Kafka, Attempts to Convert Avro to a DataFrame - * - * @param sparkSession : SparkSession - * @param conf KafkaClientConfiguration - * @return DataFrame - * @return Read Till Array[OffsetRange] - * - */ - - def consumeFromKakfa(sparkSession: SparkSession, conf: KafkaClientConfiguration): (DataFrame, Array[OffsetRange]) = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - - logger.info(" @Begin --> " + MethodName) - - val kafkaParams: Map[String, String] = conf.kafkaConsumerProps - try { - val finalOffsetRangesForReader: Array[OffsetRange] = - if (conf.kafkaCustomOffsetRange.isEmpty()) { - logger.info(s"""No custom offset information was given by the user""") - val lastCheckPoint: Option[Array[OffsetRange]] = getLastCheckPointFromZK(conf.zkHostAndPort, conf.zkCheckPoints) - val availableOffsetRange: Array[OffsetRange] = BrokersAndTopic(conf.kafkaHostsAndPort, 
conf.kafkaTopics).toKafkaOffsetsPerPartition - val newOffsetRangesForReader = getNewOffsetRangeForReader(lastCheckPoint, availableOffsetRange, conf.fetchRowsOnFirstRun) - logger.info("Offset Ranges From Difference -->") - newOffsetRangesForReader.foreach(x => logger.info(x.toString)) - newOffsetRangesForReader.applyThresholdPerPartition(conf.maxRecsPerPartition.toLong) // Restrict Offset Ranges By Applying Threshold Per Partition - } - else { - logger.info(s"""Custom offset information was given by the user""") - getCustomOffsetRangeForReader(conf.kafkaTopics.split(","), conf.kafkaCustomOffsetRange, KafkaConstants.gimelAuditRunTypeBatch) - } - logger.info("Offset Ranges After applying Threshold Per Partition/Custom Offsets -->") - finalOffsetRangesForReader.foreach(x => logger.info(x.toString)) - - // If kafka topic is empty return empty dataframe with the columns in gimel.fields.bind.to.json prop - val finalDF = if (isKafkaTopicEmpty(finalOffsetRangesForReader) && !conf.fieldsBindToJSONString.isEmpty) { - logger.info("Kafka Topic is Empty.") - logger.info("Returning Datafame with fields in " + GimelConstants.FIELDS_BIND_TO_JSON) - getEmptyDFBindToFields(sparkSession, conf.fieldsBindToJSONString) - } else { - val parallelizedRanges: Array[OffsetRange] = finalOffsetRangesForReader.parallelizeOffsetRanges(conf.parallelsPerPartition, conf.minRowsPerParallel) - logger.info("Final Array of OffsetRanges to Fetch from Kafka --> ") - parallelizedRanges.foreach(range => logger.info(range)) - if (parallelizedRanges.isEmpty) throw new KafkaUtilitiesException("There is an issue ! No Offset Range From Kafka ... Is the topic having any message at all ?") - val sqlContext = sparkSession.sqlContext - getAsDFFromKafka(sqlContext, conf, parallelizedRanges) - } - - (finalDF, finalOffsetRangesForReader) - } catch { - case ex: Throwable => - ex.printStackTrace() - val messageString = - s""" - |kafkaParams --> ${kafkaParams.mkString(" \n ")} - """.stripMargin - logger.error(s"An Error While Attempting to Consume From Kafka with Parameters --> $messageString") - throw ex - } - } - - /** - * Checks if the given kafka topics are empty - * - * @param offsetRanges : array of OffsetRanges for the topics to check - * @return - * - */ - def isKafkaTopicEmpty(offsetRanges: Array[OffsetRange]): Boolean = { - offsetRanges.isEmpty || offsetRanges.forall (each => (each.untilOffset - each.fromOffset) == 0) - } -} diff --git a/gimel-dataapi/gimel-connectors/gimel-kafka-0.10/src/main/scala/com/paypal/gimel/kafka/reader/KafkaStreamConsumer.scala b/gimel-dataapi/gimel-connectors/gimel-kafka-0.10/src/main/scala/com/paypal/gimel/kafka/reader/KafkaStreamConsumer.scala deleted file mode 100644 index e213cedd..00000000 --- a/gimel-dataapi/gimel-connectors/gimel-kafka-0.10/src/main/scala/com/paypal/gimel/kafka/reader/KafkaStreamConsumer.scala +++ /dev/null @@ -1,273 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
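// ---------------------------------------------------------------------------------------------
// Editor's aside (illustrative only, not part of the sources being removed in this diff):
// a standalone sketch of the custom-offset-range JSON that KafkaJsonProtocol above deserializes
// for the batch and stream readers. The case classes mirror the deleted OffsetRangeProperties /
// OffsetProperties definitions; the topic name and offset values are samples.
import spray.json._

case class OffsetRangeProperties(partition: Int, from: Long, to: Option[Long])
case class OffsetProperties(topic: String, offsetRange: Array[OffsetRangeProperties])

object CustomOffsetProtocolSketch extends DefaultJsonProtocol {
  implicit val offsetRangePropertiesFormat: RootJsonFormat[OffsetRangeProperties] = jsonFormat3(OffsetRangeProperties)
  implicit val offsetPropertiesFormat: RootJsonFormat[OffsetProperties] = jsonFormat2(OffsetProperties)

  def main(args: Array[String]): Unit = {
    // "to" is optional: the deleted stream reader falls back to -1 when it is absent,
    // while the batch reader requires it.
    val json = """[{"topic":"flights","offsetRange":[{"partition":0,"from":100,"to":200}]}]"""
    val custom: Seq[OffsetProperties] = json.parseJson.convertTo[Seq[OffsetProperties]]
    custom.foreach { t =>
      t.offsetRange.foreach(r => println(s"${t.topic} p${r.partition}: ${r.from} -> ${r.to.getOrElse(-1L)}"))
    }
  }
}
// ---------------------------------------------------------------------------------------------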
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.paypal.gimel.kafka.reader - -import scala.collection.immutable.Map -import scala.language.implicitConversions - -import org.apache.avro.generic.GenericRecord -import org.apache.kafka.clients.consumer._ -import org.apache.kafka.common.TopicPartition -import org.apache.spark.rdd.RDD -import org.apache.spark.sql._ -import org.apache.spark.sql.streaming.DataStreamReader -import org.apache.spark.streaming.StreamingContext -import org.apache.spark.streaming.dstream._ -import org.apache.spark.streaming.kafka010._ -import spray.json._ - -import com.paypal.gimel.common.catalog.GimelCatalogJsonProtocol._ -import com.paypal.gimel.datastreamfactory.{CheckPointHolder, StreamingResult, StructuredStreamingResult, WrappedData} -import com.paypal.gimel.kafka.avro.SparkAvroUtilities -import com.paypal.gimel.kafka.conf.{KafkaClientConfiguration, KafkaConstants} -import com.paypal.gimel.kafka.utilities.BrokersAndTopic -import com.paypal.gimel.kafka.utilities.ImplicitKafkaConverters._ -import com.paypal.gimel.kafka.utilities.KafkaUtilities._ - -/** - * Implements Kafka Stream Consumer Logic here - */ -object KafkaStreamConsumer { - - val logger = com.paypal.gimel.logger.Logger() - - /** - * - * Core Function to Provide Data Stream - * - * @param streamingContext StreamingContext - * @param conf KafkaClientConfiguration - * @return StreamingResult - */ - def createDStream(streamingContext: StreamingContext, conf: KafkaClientConfiguration): StreamingResult = { - def MethodName: String = new Exception().getStackTrace().apply(1).getMethodName() - logger.info(" @Begin --> " + MethodName) - - try { - val sparkConf = streamingContext.sparkContext.getConf - val streamRate = sparkConf.get("throttle.streaming.maxRatePerPartition", conf.maxRatePerPartition) - streamingContext.sparkContext.getConf - .set("spark.streaming.backpressure.enabled", "true") - .set("spark.streaming.kafka.maxRatePerPartition", streamRate) - val isStreamParallel = sparkConf.get("throttle.streaming.isParallel", conf.isStreamParallel.toString).toBoolean - val streamParallels = sparkConf.get("throttle.streaming.parallelism.factor", conf.streamParallelismFactor.toString).toInt - logger.debug( - s""" - |isStreamParallel --> ${isStreamParallel} - |streamParallels --> ${streamParallels} - """.stripMargin) - // Resolve all the Properties & Determine Kafka CheckPoint before reading from Kafka - val (schemaString, kafkaTopic, brokers) = (conf.avroSchemaString, conf.kafkaTopics, conf.kafkaHostsAndPort) - logger.info(s"Zookeeper Server : ${conf.zkHostAndPort}") - logger.info(s"Zookeeper Checkpoint : ${conf.zkCheckPoints}") - val startOffsetsForStream: Map[TopicPartition, Long] = - getStartOffsets(conf, kafkaTopic, brokers) - var kafkaParams: Map[String, Object] = setKafkaParams(conf) - val consumerStrategy = ConsumerStrategies.Subscribe[Any, Any](kafkaTopic.split(",").toSet, kafkaParams, startOffsetsForStream) - val locationStrategy = LocationStrategies.PreferConsistent - logger.info( - s""" - |consumerStrategy --> ${consumerStrategy} - |locationStrategy --> ${locationStrategy.toString} - |Initiating createDirectStream with 
above Parameters... - """.stripMargin) - val msg: InputDStream[ConsumerRecord[Any, Any]] = KafkaUtils.createDirectStream(streamingContext, locationStrategy, consumerStrategy) - var offsetRanges = Array[OffsetRange]() - val messages1: DStream[WrappedData] = msg.transform { rdd => - offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges - // CheckPointHolder().currentCheckPoint = offsetRanges - CheckPointHolder().setCurentCheckPoint(offsetRanges) - rdd - }.map { x => WrappedData(x.key(), x.value()) } - // CheckPointer Function - CheckPoints each window - val saveCheckPoint: (Array[OffsetRange]) => Boolean = inStreamCheckPoint(conf.zkHostAndPort, conf.zkCheckPoints, _) - // Convertor Function : takes Raw Data and Returns AvroGeneric Data - val bytesToGenericRDD: (RDD[WrappedData]) => RDD[GenericRecord] = - wrappedDataToAvro(_, conf.avroSchemaKey, conf.avroSchemaURL, conf.avroSchemaSource, conf.avroSchemaString, isStreamParallel, streamParallels, conf.cdhAllSchemaDetails) - val finalSchema = conf.avroSchemaSource.toUpperCase() match { - case "CDH" => addAdditionalFieldsToSchema(getAdditionalFields().keySet.toList, conf.cdhTopicSchemaMetadata.get) - case _ => conf.avroSchemaString - } - // Convertor Function - RDD[GenericRecord] => DataFrame - val genericRecToDF: (SQLContext, RDD[GenericRecord]) => DataFrame = SparkAvroUtilities.genericRecordtoDF(_, _, finalSchema) - // Provide Option to Clear CheckPoint - val deleteCheckPoint: (String) => Unit = clearCheckPoint(conf.zkHostAndPort, conf.zkCheckPoints, _: String) - // Provide Option to Get DataFrame for a Simple String Message from Kafka Topic - val columnAlias = kafkaMessageColumnAlias(conf) - // val wrappedDataToDF: (SQLContext, RDD[WrappedData]) => DataFrame = wrappedStringDataToDF(columnAlias, _, _) - val wrappedDatatoDF1: (SQLContext, RDD[WrappedData]) => DataFrame = rddToDF(_, conf.kafkaMessageValueType, conf.kafkaKeySerializer, conf.kafkaValueSerializer, _, "value", conf.avroSchemaString, conf.avroSchemaSource, conf.cdhTopicSchemaMetadata, conf.cdhAllSchemaDetails) - // Return a Wrapper of various functionalities to Client of this function - StreamingResult(messages1, bytesToGenericRDD, genericRecToDF, wrappedDatatoDF1, saveCheckPoint, deleteCheckPoint) - } - catch { - case ex: Throwable => { - ex.printStackTrace() - streamingContext.stop() - throw ex - } - } - } - - /** - * - * Function to set kafka parameters for stream - * - * @param conf KafkaClientConfiguration object that holds the configuration paremeters - * @return Kafka Parameters in a Map[String, Object] - */ - private def setKafkaParams(conf: KafkaClientConfiguration) = { - var kafkaParams: Map[String, Object] = Map() - conf.kafkaConsumerProps.foreach(x => kafkaParams += (x._1 -> x._2)) - val (keyDeSer, valDeSer) = (getSerDe(conf.kafkaKeyDeSerializer), getSerDe(conf.kafkaValueDeSerializer)) - kafkaParams += ("key.deserializer" -> keyDeSer, "value.deserializer" -> valDeSer) - kafkaParams - } - - /** - * - * Function to get the starting offsets for the stream to read from - * - * @param conf KafkaClientConfiguration object that holds the configuration paremeters - * @param kafkaTopic The kafkaTopics list to subscribe to - * @return Starting Offsets in a Map[TopicPartition, Long] - */ - private def getStartOffsets(conf: KafkaClientConfiguration, kafkaTopic: String, brokers: String) = { - if (conf.kafkaCustomOffsetRange.isEmpty()) { - val lastCheckPoint: Option[Array[OffsetRange]] = getLastCheckPointFromZK(conf.zkHostAndPort, conf.zkCheckPoints) - val availableOffsetRange: 
Array[OffsetRange] = BrokersAndTopic(brokers, kafkaTopic).toKafkaOffsetsPerPartition - if (lastCheckPoint == None) { - logger.info("No CheckPoint Found !") - if(conf.kafkaAutoOffsetReset.equals(KafkaConstants.earliestOffset)) { - logger.info("Fetching from the beginning") - availableOffsetRange.map { - x => (new TopicPartition(x.topic, x.partition) -> x.fromOffset) - }.toMap - } - else { - logger.info("Fetching from the latest offset") - availableOffsetRange.map { - x => (new TopicPartition(x.topic, x.partition) -> x.untilOffset) - }.toMap - } - } else { - logger.info(s"Found Checkpoint Value --> ${lastCheckPoint.get.mkString("|")}") - lastCheckPoint.get.map { - x => (new TopicPartition(x.topic, x.partition) -> x.untilOffset) - }.toMap - } - } - else { - val customOffsetRangesForStream: Array[OffsetRange] = getCustomOffsetRangeForReader(conf.kafkaTopics.split(","), conf.kafkaCustomOffsetRange, KafkaConstants.gimelAuditRunTypeStream) - customOffsetRangesForStream.map { - x => (new TopicPartition(x.topic, x.partition) -> x.fromOffset) - }.toMap - } - } - - /** - * - * Function to return the last saved checkpoint from zookeeper - * - * @param conf KafkaClientConfiguration object that holds the configuration paremeters - * @return Optional checkpoint Offsets in a Array[OffsetRange] - */ - private def getLastCheckPoint(conf: KafkaClientConfiguration) = { - val lastCheckPoint: Option[Array[OffsetRange]] = getLastCheckPointFromZK(conf.zkHostAndPort, conf.zkCheckPoints) - lastCheckPoint - } - - /** - * - * Core Function to create a structured stream - * - * @param sparkSession the spark session passed by the user - * @param conf KafkaClientConfiguration object that holds the configuration paremeters - * @return StreamingResult in a StructuredStreamingResult Object - */ - def createStructuredStream(sparkSession: SparkSession, conf: KafkaClientConfiguration): StructuredStreamingResult = { - try { - val sparkConf = sparkSession.sparkContext.getConf - val streamRate = sparkConf.get("throttle.streaming.maxRatePerPartition", conf.maxRatePerPartition) - sparkSession.sparkContext.getConf - .set("spark.streaming.backpressure.enabled", "true") - .set("spark.streaming.kafka.maxRatePerPartition", streamRate) - val isStreamParallel = sparkConf.get("throttle.streaming.isParallel", conf.isStreamParallel.toString).toBoolean - val streamParallels = sparkConf.get("throttle.streaming.parallelism.factor", conf.streamParallelismFactor.toString).toInt - logger.debug( - s""" - |isStreamParallel --> ${isStreamParallel} - |streamParallels --> ${streamParallels} - """.stripMargin) - // Resolve all the Properties & Determine Kafka CheckPoint before reading from Kafka - val (schemaString, kafkaTopic, brokers) = (conf.avroSchemaString, conf.kafkaTopics, conf.kafkaHostsAndPort) - logger.info(s"Zookeeper Server : ${conf.zkHostAndPort}") - logger.info(s"Zookeeper Checkpoint : ${conf.zkCheckPoints}") - val startOffsetsForStream: Map[TopicPartition, Long] = - getStartOffsets(conf, kafkaTopic, brokers) - val lastCheckPoint = getLastCheckPoint(conf) - val startOffsetsStructured = startOffsetsForStream.toList.groupBy(_._1.topic()) - .mapValues(_.map(x => - (x._1.partition().toString, x._2)).toMap) - val kafkaBootstrapServers = conf.kafkaHostsAndPort - val topics = conf.kafkaTopics - - val dataStreamReader: DataStreamReader = sparkSession - .readStream - .format(KafkaConstants.KAFKA_FORMAT) - .option(KafkaConstants.KAFKA_BOOTSTRAP_SERVERS, kafkaBootstrapServers) - .option(KafkaConstants.KAFKA_SUBSCRIBE, topics) - 
.options(conf.kafkaConsumerProps) - - val df = lastCheckPoint match { - case None => { - dataStreamReader.load() - } - case Some(lastCheckPoint) => { - dataStreamReader - .option(KafkaConstants.KAFKA_START_OFFSETS, startOffsetsStructured.toJson.toString()) - .load() - } - } - - // CheckPointer Function - CheckPoints each window - val saveCheckPoint: Unit = inStructuredStreamCheckPoint(sparkSession, conf.zkHostAndPort, conf.zkCheckPoints) - // Convertor Function : takes Raw Data and Returns AvroGeneric Data - val bytesToGenericRDD: (RDD[WrappedData]) => RDD[GenericRecord] = - wrappedDataToAvro(_, conf.avroSchemaKey, conf.avroSchemaURL, conf.avroSchemaSource, conf.avroSchemaString, isStreamParallel, streamParallels, conf.cdhAllSchemaDetails) - val finalSchema = conf.avroSchemaSource.toUpperCase() match { - case "CDH" => addAdditionalFieldsToSchema(getAdditionalFields().keySet.toList, conf.cdhTopicSchemaMetadata.get) - case _ => conf.avroSchemaString - } - // Provide Option to Clear CheckPoint - val deleteCheckPoint: (String) => Unit = clearCheckPoint(conf.zkHostAndPort, conf.zkCheckPoints, _: String) - // Provide Option to Get DataFrame for a Simple String Message from Kafka Topic - val columnAlias = kafkaMessageColumnAlias(conf) - // Return a Wrapper of various functionalities to Client of this function - StructuredStreamingResult(df, saveCheckPoint, deleteCheckPoint) - } - - catch { - case ex: Throwable => { - ex.printStackTrace() - throw ex - } - } - } -} diff --git a/gimel-dataapi/gimel-connectors/gimel-kafka-0.10/src/main/scala/com/paypal/gimel/kafka/utilities/ImplicitHDFSCheckPointers.scala b/gimel-dataapi/gimel-connectors/gimel-kafka-0.10/src/main/scala/com/paypal/gimel/kafka/utilities/ImplicitHDFSCheckPointers.scala deleted file mode 100644 index 30bbe0e4..00000000 --- a/gimel-dataapi/gimel-connectors/gimel-kafka-0.10/src/main/scala/com/paypal/gimel/kafka/utilities/ImplicitHDFSCheckPointers.scala +++ /dev/null @@ -1,127 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
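// ---------------------------------------------------------------------------------------------
// Editor's aside (illustrative only, not part of the sources being removed in this diff):
// a minimal sketch of the structured-streaming Kafka source options that createStructuredStream
// above assembles, shown with the short-form "kafka" format. Broker, topic and offset values are
// placeholders, and the spark-sql-kafka-0-10 package is assumed to be on the classpath.
import org.apache.spark.sql.SparkSession

object StructuredKafkaReadSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("structured-kafka-read-sketch").master("local[*]").getOrCreate()

    // Per-partition resume point, in the same JSON shape the deleted code serializes
    // from its ZooKeeper checkpoint: {"<topic>":{"<partition>":<offset>, ...}}
    val startingOffsets = """{"flights":{"0":120,"1":95}}"""

    val df = spark.readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "localhost:9092")
      .option("subscribe", "flights")
      .option("startingOffsets", startingOffsets)
      .option("failOnDataLoss", "false")
      .load()

    df.printSchema() // key, value, topic, partition, offset, timestamp, timestampType
  }
}
// ---------------------------------------------------------------------------------------------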
- */ - -package com.paypal.gimel.kafka.utilities - -import scala.language.implicitConversions - -import org.apache.hadoop.fs.{FileSystem, Path} -import org.apache.spark.streaming.kafka010.OffsetRange - -import com.paypal.gimel.common.storageadmin.HDFSAdminClient._ -import com.paypal.gimel.kafka.utilities.ImplicitKafkaConverters._ - -/** - * Provides Implicit, Convenience Functions for Developers to Do CheckPointing Operations - */ -object ImplicitHDFSCheckPointers { - - val logger = com.paypal.gimel.logger.Logger() - - /** - * @param offsetRangesAndCheckPointDirectory A Tuple of (Array[OffsetRange], checkPointDirectory) - */ - implicit class CheckPointers(offsetRangesAndCheckPointDirectory: (Array[OffsetRange], String)) { - /** - * CheckPoints a Tuple of (Array[OffsetRange], checkPointDirectory) - * - * @example (Array(OffsetRange("l1", 11, 1, 1)), "${USER_DEFINED_CHECKPOINT_PATH}").saveCheckPoint - * @return true if Success - * - */ - def saveCheckPoint: Boolean = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - - logger.info(" @Begin --> " + MethodName) - - val latestFile = "/latest" - val checkPointDir = offsetRangesAndCheckPointDirectory._2 - val checkPointFile = checkPointDir + latestFile - val contentToWrite = offsetRangesAndCheckPointDirectory._1.toStringOfKafkaOffsetRanges - try { - val conf = new org.apache.hadoop.conf.Configuration() - val fs = FileSystem.get(conf) - val latestHDFSPath = new Path(checkPointFile) - if (!fs.exists(latestHDFSPath)) { - writeHDFSFile(checkPointFile, contentToWrite) - } else { - val timeStamp = System.currentTimeMillis - val toRenameLatestPath = checkPointDir + s"/$timeStamp" - val toRenameLatestPathHDFS = new Path(toRenameLatestPath) - fs.rename(latestHDFSPath, toRenameLatestPathHDFS) - writeHDFSFile(checkPointFile, contentToWrite) - } - } catch { - case ex: Throwable => - throw ex - } - true - } - } - - - /** - * @param checkPointDirectoryPath A Tuple of (Array[OffsetRange], checkPointDirectory) - */ - implicit class CheckPointFetcher(checkPointDirectoryPath: String) { - /** - * Fetches CheckPoints as An Array[OffsetRange] - * - * @example ("USER_DEFINED_CHECKPOINT_PATH").fetchCheckPoint - * @return Some(Array[OffsetRange]) - * - */ - def fetchCheckPoint: Option[Array[OffsetRange]] = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - - logger.info(" @Begin --> " + MethodName) - - if (checkPointDirectoryPath.isEmpty) throw new HDFSCheckPointerException("Expected CheckPoint Directory, but got Empty String !") - val latestFile = "/latest" - val checkPointDir = checkPointDirectoryPath - val checkPointFile = checkPointDir + latestFile - val conf = new org.apache.hadoop.conf.Configuration() - val fs = FileSystem.get(conf) - val latestHDFSPath = new Path(checkPointFile) - if (fs.exists(latestHDFSPath)) { - val checkPointString = readHDFSFile(checkPointDirectoryPath + "/latest") - println("inside fetchCheckPoint ->" + checkPointString) - Some(checkPointString.split('|').map(x => CheckPointString(x)).toKafkaOffsetRanges) - } else { - None - } - } - } - -} - -/** - * Custom Exception - * - * @param message Message to Throw - * @param cause A Throwable Cause - */ -private class HDFSCheckPointerException(message: String, cause: Throwable) - extends RuntimeException(message) { - if (cause != null) { - initCause(cause) - } - - def this(message: String) = this(message, null) -} diff --git 
a/gimel-dataapi/gimel-connectors/gimel-kafka-0.10/src/main/scala/com/paypal/gimel/kafka/utilities/ImplicitKafkaConverters.scala b/gimel-dataapi/gimel-connectors/gimel-kafka-0.10/src/main/scala/com/paypal/gimel/kafka/utilities/ImplicitKafkaConverters.scala deleted file mode 100644 index bda8c51d..00000000 --- a/gimel-dataapi/gimel-connectors/gimel-kafka-0.10/src/main/scala/com/paypal/gimel/kafka/utilities/ImplicitKafkaConverters.scala +++ /dev/null @@ -1,343 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.paypal.gimel.kafka.utilities - -import java.{lang, util} -import java.util.{Collections, Properties} - -import scala.collection.mutable.ArrayBuffer -import scala.language.implicitConversions - -import org.apache.kafka.clients.admin.AdminClient -import org.apache.kafka.common.TopicPartition -import org.apache.spark.streaming.kafka010.OffsetRange - -import com.paypal.gimel.logger.Logger - -/** - * Case Class to Represent a CheckPoint String. Example "flights,1,1,100" - * - * @param checkPoint - */ -case class CheckPointString(checkPoint: String) - -/** - * Case Class to Represent Brokers and Topics - * - * @param brokers Example : kafka_broker_ip:8081 - * @param topic Example : flights - */ - -case class BrokersAndTopic(brokers: String, topic: String) - -/** - * Provides a set of Implicit , Convenience APIs for developers to use - */ - -object ImplicitKafkaConverters { - - val logger: Logger = Logger() - - /** - * @param offsetRanges An Array of OffsetRange - */ - implicit class OffsetsConverter(offsetRanges: Array[OffsetRange]) { - - /** - * Converts An Array OffsetRange to String of [CheckPoints (comma-separated)], each checkpoint Separated by Pipe - * - * @example Array(OffsetRange("test", 0, 1, 100),OffsetRange("test", 1, 1, 100)).toStringOfKafkaOffsetRanges - * @return String of [CheckPoints (comma-separated)], each checkpoint Separated by Pipe - */ - def toStringOfKafkaOffsetRanges: String = { - offsetRanges.map(offsetRange => offsetRange.toStringOfKafkaOffsetRange).mkString("|") - } - } - - - /** - * @param offsetRange A Kafka OffsetRange - */ - implicit class OffsetConverter(offsetRange: OffsetRange) { - /** - * Converts a Kafka OffsetRange to A CheckPoint (comma-separated) - * - * @return A CheckPoint (comma-separated) - * @example "test,0,0,4".toKafkaOffsetRanges - */ - def toStringOfKafkaOffsetRange: String = { - offsetRange.topic + "," + offsetRange.partition + "," + offsetRange.fromOffset + "," + offsetRange.untilOffset - } - } - - /** - * @param checkPointString A CheckPoint (comma-separated) - */ - implicit class CheckPointConverter(checkPointString: CheckPointString) { - /** - * Converts A CheckPoint (comma-separated) to An OffsetRange - * - * @return An OffsetRange - * 
@example "test,0,0,4".toKafkaOffsetRanges - */ - def toKafkaOffsetRange: OffsetRange = { - val splitString = checkPointString.checkPoint.split(",") - OffsetRange(splitString(0), splitString(1).toInt, splitString(2).toLong, splitString(3).toLong) - } - } - - /** - * @param checkPointsString an Array of CheckPoints (comma-separated) - */ - implicit class CheckPointsConverter(checkPointsString: Array[CheckPointString]) { - /** - * Converts an Array of CheckPoints (comma-separated) to An Array of OffsetRange - * - * @return An Array of OffsetRange - * @example "test,0,0,4|test,1,0,5".split("|").toKafkaOffsetRanges - */ - def toKafkaOffsetRanges: Array[OffsetRange] = { - checkPointsString.map(eachOffsetString => eachOffsetString.toKafkaOffsetRange) - } - } - - - /** - * @param brokersAndTopic A Tuple of (Comma-Separated Hosts, TopicString) - */ - implicit class TopicPartitionsConverter(brokersAndTopic: BrokersAndTopic) { - - val clientID: Int = scala.util.Random.nextLong().toInt - val brokers: Array[String] = brokersAndTopic.brokers.split(",") - val host1: String = brokers(0).split(":")(0) - val port1: Int = brokers(0).split(":")(1).toInt - val latestTime: Long = -1L - val earliestTime: Long = -2L - - /** - * Converts a given Tuple of KafkaBrokers & Topic into KafkaTopicAndPartitions - * - * @example val testing: Array[TopicAndPartition] = ("localhost:8080,localhost:8081", "test").toTopicAndPartitions - * @return Array[TopicAndPartition] - */ - def toTopicAndPartitions: Map[TopicPartition, (String, Int)] = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - - logger.info(" @Begin --> " + MethodName) - - val client = AdminClient.create(KafkaUtilities.getDefaultConsumerPropertiesPerBroker(brokersAndTopic.brokers)) - import scala.collection.JavaConverters._ - try { - client.describeTopics( - Collections.singletonList(brokersAndTopic.topic) - ).all().get().asScala.flatMap { topicMetadata => { - topicMetadata._2.partitions().asScala.map { - partitionMetadata => - partitionMetadata.isr() - (new TopicPartition(topicMetadata._1, partitionMetadata.partition()), - (partitionMetadata.leader().host(), partitionMetadata.leader().port())) - } - } - }.toMap - } finally { - client.close() - } - - } - - /** - * Converts a given Tuple of KafkaBrokers & Topic into Array[OffsetRange] available currently in Kafka Cluster - * - * @example val kafkaOffsets:Array[OffsetRange] = ("localhost:8080,localhost:8081", "test").toKafkaOffsetsPerPartition - * @return Array[OffsetRange] - * - */ - def toKafkaOffsetsPerPartition: Array[OffsetRange] = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - logger.info(" @Begin --> " + MethodName) - - val topicAndPartitions: Map[TopicPartition, (String, Int)] = brokersAndTopic.toTopicAndPartitions - import scala.collection.JavaConverters._ - val partitions = topicAndPartitions.keySet.asJava - - logger.info("The Topic And Partitions are --> ") - topicAndPartitions.foreach(println) - - val kafkaConsumer = KafkaUtilities.getKafkaConsumer(Some( - KafkaUtilities.getDefaultConsumerPropertiesPerBroker(brokersAndTopic.brokers) - )) - try { - val beginningOffsets: util.Map[TopicPartition, lang.Long] = kafkaConsumer.beginningOffsets(partitions) - val endOffsets: util.Map[TopicPartition, lang.Long] = kafkaConsumer.endOffsets(partitions) - topicAndPartitions.map { - topicAndPartition => - OffsetRange(topicAndPartition._1.topic, topicAndPartition._1.partition, - beginningOffsets.get(topicAndPartition._1), 
endOffsets.get(topicAndPartition._1)) - }.toArray - } finally { - kafkaConsumer.close() - } - } - - /** - * Take a TopicAndPartition and Returns a Tuple of leader Host & Port - * - * @param topicAndPartition Kafka TopicAndPartition - * @return Tuple(host, port) - */ - private def findLeader(topicAndPartition: (TopicPartition, (String, Int))): (String, Int) = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - - logger.info(" @Begin --> " + MethodName) - val leaderDetails: (String, Int) = (topicAndPartition._2._1, topicAndPartition._2._2) - leaderDetails - } - } - - - /** - * @param offsetRangePairs an Array of Tuple(OffsetRange, OffsetRange). LeftSide Should be Lower Than RightSize - */ - implicit class NewOffsetRangesProvider(offsetRangePairs: (Array[OffsetRange], Array[OffsetRange])) { - /** - * Calculates the New Range of Offsets to Read from Kafka based on a Pair of OffsetRange - * - * @return Array[OffsetRange] - * @example (Array(OffsetRange("a", 0, 1, 1), OffsetRange("a", 1, 2, 100)) ,Array( OffsetRange("a", 1, 2, 100),OffsetRange("a", 0, 1, 100))).toNewOffsetRange - */ - def toNewOffsetRanges: Array[OffsetRange] = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - - logger.info(" @Begin --> " + MethodName) - - val sortedLeft = offsetRangePairs._1.sortBy(offsetRange => offsetRange.partition) - val sortedRight = offsetRangePairs._2.sortBy(offsetRange => offsetRange.partition) - val combinedAfterSort = sortedLeft.zip(sortedRight) - combinedAfterSort.map { eachPair => - val left = eachPair._1 - val right = eachPair._2 - if (left.topic != right.topic) throw new KafkaOperationsException(s"Invalid Operation ! Seems we are comparing two different topics --> ${left.topic} <> ${right.topic} ") - if (left.untilOffset > right.untilOffset) throw new KafkaOperationsException(s"Left Side Until:Offset ${left.untilOffset} is Higher than Right Side Until:Offset ${right.untilOffset}") - if (left.fromOffset > right.untilOffset) throw new KafkaOperationsException(s"Left Side from:Offset ${left.fromOffset} is Already Beyond Right Side Until:Offset ${right.untilOffset}") - if (left.untilOffset < right.fromOffset) throw new KafkaOperationsException(s"Left Side from:Offset ${left.untilOffset} is Lower Than Right Side from:Offset ${right.untilOffset}. 
This usually indicates Data Loss !") - val fromOffset = { - if (left.untilOffset == right.untilOffset) { - right.untilOffset - } else { - left.untilOffset - } - } - OffsetRange(left.topic, left.partition, fromOffset, right.untilOffset) - } - } - } - - /** - * @param offsetRanges An Array of OffsetRange - */ - implicit class OffsetRangeRestriction(offsetRanges: Array[OffsetRange]) { - /** - * Limits the OffsetRanges to the given threshold per partition - * - * @example val kafkaOffsets:Array[OffsetRange] = Array(OffsetRange(("localhost:8080,localhost:8081", "test"))).applyThresholdPerPartition(100) - * @return Array[OffsetRange] - * - */ - def applyThresholdPerPartition(maxPerPartition: Long): Array[OffsetRange] = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - - logger.info(" @Begin --> " + MethodName) - - offsetRanges.map { - eachOffsetRange => - val fromOffset = eachOffsetRange.fromOffset - val maxUntil = fromOffset + maxPerPartition - val untilOffset = eachOffsetRange.untilOffset - val newUntilOffset = scala.math.min(untilOffset, maxUntil) - OffsetRange(eachOffsetRange.topic, eachOffsetRange.partition, eachOffsetRange.fromOffset, newUntilOffset) - } - } - - /** - * Parallelizes an Array of Offset Range, by applying parallelism factor on each Offset Range - * - * @param parallelism Number of parallel shards - * @return Array[OffsetRange] - */ - def parallelizeOffsetRanges(parallelism: Int, minRowsPerParallel: Long): Array[OffsetRange] = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - - logger.info(" @Begin --> " + MethodName) - - val returningRanges = offsetRanges.flatMap(erange => parallelizeOffsetRange(erange, parallelism, minRowsPerParallel)) - logger.info("Outgoing Array of OffsetRanges --> ") - returningRanges.foreach(logger.info(_)) - returningRanges - } - - // parallelizeOffsetRange(OffsetRange("a", 1, 1, 20), 3) - private def parallelizeOffsetRange(eachRange: OffsetRange, parallel: Int, minRowsPerParallel: Long): Array[OffsetRange] = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - - logger.info(" @Begin --> " + MethodName) - - val total = eachRange.untilOffset - eachRange.fromOffset - if ((total > minRowsPerParallel)) { - logger.info(s"Incoming Range --> $eachRange") - logger.info(s"Parallel Factor --> $parallel") - val returningRange: scala.collection.mutable.ArrayBuffer[OffsetRange] = ArrayBuffer() - - val recordsPer = scala.math.max(total / parallel, minRowsPerParallel) - var cntr = eachRange.fromOffset - val end = eachRange.untilOffset - while (cntr < end) { - returningRange.append(OffsetRange(eachRange.topic, eachRange.partition, cntr, cntr + recordsPer)) - cntr = cntr + recordsPer - if (cntr + recordsPer > end) { - returningRange.append(OffsetRange(eachRange.topic, eachRange.partition, cntr, end)) - cntr = end - } - } - logger.info("Parallelized Ranges for the given OffsetRange ..") - returningRange.foreach(logger.info(_)) - returningRange.toArray - } else { - logger.info(s"Not Applying Parallelism as the total rows : $total in this Offset Range < min rows per parallel : $minRowsPerParallel ") - Array(eachRange) - } - } - } - -} - -/** - * Custom Exception - * - * @param message Message to Throw - * @param cause A Throwable Cause - */ -private class KafkaOperationsException(message: String, cause: Throwable) - extends RuntimeException(message) { - if (cause != null) { - initCause(cause) - } - - def this(message: String) = this(message, null) -} diff --git 
a/gimel-dataapi/gimel-connectors/gimel-kafka-0.10/src/main/scala/com/paypal/gimel/kafka/utilities/ImplicitZKCheckPointers.scala b/gimel-dataapi/gimel-connectors/gimel-kafka-0.10/src/main/scala/com/paypal/gimel/kafka/utilities/ImplicitZKCheckPointers.scala deleted file mode 100644 index ba2d3d08..00000000 --- a/gimel-dataapi/gimel-connectors/gimel-kafka-0.10/src/main/scala/com/paypal/gimel/kafka/utilities/ImplicitZKCheckPointers.scala +++ /dev/null @@ -1,139 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.paypal.gimel.kafka.utilities - -import scala.language.implicitConversions - -import org.apache.spark.streaming.kafka010.OffsetRange - -import com.paypal.gimel.common.storageadmin.ZooKeeperAdminClient._ -import com.paypal.gimel.kafka.utilities.ImplicitKafkaConverters._ -import com.paypal.gimel.logger.Logger - -case class ZooKeeperHostAndNodes(host: String, nodes: Seq[String]) - -/** - * Provides Implicit, Convenience Functions for Developers to Do CheckPointing Operations - */ -object ImplicitZKCheckPointers { - - val logger = Logger() - - /** - * @param checkPointingInfo Tuple of (ZooKeeperHostAndNode, Array[Kafka OffsetRange]) - */ - implicit class ZKCheckPointers(checkPointingInfo: (ZooKeeperHostAndNodes, Array[OffsetRange])) { - /** - * CheckPoints a Tuple of (Array[OffsetRange], checkPointDirectory) - * - * @example (Array(OffsetRange("l1", 11, 1, 1)),"${USER_DEFINED_CHECKPOINT_PATH}").saveCheckPoint - * @return true if Success - * - */ - def saveZkCheckPoint: Boolean = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - - logger.info(" @Begin --> " + MethodName) - - val zkServers = checkPointingInfo._1.host - val zkNodes = checkPointingInfo._1.nodes - val contentToWrite = checkPointingInfo._2.toStringOfKafkaOffsetRanges - try { - zkNodes.map { zkNode => - writetoZK(zkServers, zkNode, contentToWrite) - } - } catch { - case ex: Throwable => - throw ex - } - true - } - - } - - - /** - * @param zooKeeperDetails ZooKeeperHostAndNode - */ - implicit class ZKCheckPointFetcher(zooKeeperDetails: ZooKeeperHostAndNodes) { - /** - * Fetches CheckPoints as An Array[OffsetRange] - * - * @example ("${USER_DEFINED_CHECKPOINT_PATH}").fetchCheckPoint - * @return Some(Array[OffsetRange]) - * - */ - def fetchZkCheckPoint: Option[Array[OffsetRange]] = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - logger.info(" @Begin --> " + MethodName) - val zkServers = zooKeeperDetails.host - val zkNodes = zooKeeperDetails.nodes - if (zkServers.isEmpty) throw new ZooKeeperCheckPointerException("Expected CheckPoint Directory, but got Empty String !") - val zkCheckPoints = zkNodes.flatMap { zkNode => - val checkPointString: Option[String] = 
readFromZK(zkServers, zkNode) - checkPointString match { - case None => - None - case _: Option[String] => - checkPointString.get.split('|').map(x => CheckPointString(x)).toKafkaOffsetRanges - } - }.filter { - None => true - }.toArray - if (zkCheckPoints.isEmpty) { - None - } - else { - Some(zkCheckPoints) - } - } - - /** - * Deletes a ZooKeeper CheckPoint - */ - def deleteZkCheckPoint(): Unit = { - logger.warning(s"WARNING !!!!! Deleting --> host : ${zooKeeperDetails.host} | node : ${zooKeeperDetails.nodes}") - try { - zooKeeperDetails.nodes.map { node => - deleteNodeOnZK(zooKeeperDetails.host, node) - } - } catch { - case ex: Throwable => - throw ex - } - } - } - -} - -/** - * Custom Exception - * - * @param message Message to Throw - * @param cause A Throwable Cause - */ -private class ZooKeeperCheckPointerException(message: String, cause: Throwable) - extends RuntimeException(message) { - if (cause != null) { - initCause(cause) - } - - def this(message: String) = this(message, null) -} diff --git a/gimel-dataapi/gimel-connectors/gimel-kafka-0.10/src/main/scala/com/paypal/gimel/kafka/utilities/KafkaUtilities.scala b/gimel-dataapi/gimel-connectors/gimel-kafka-0.10/src/main/scala/com/paypal/gimel/kafka/utilities/KafkaUtilities.scala deleted file mode 100644 index eccbb3c2..00000000 --- a/gimel-dataapi/gimel-connectors/gimel-kafka-0.10/src/main/scala/com/paypal/gimel/kafka/utilities/KafkaUtilities.scala +++ /dev/null @@ -1,1018 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
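// ---------------------------------------------------------------------------------------------
// Editor's aside (illustrative only, not part of the sources being removed in this diff):
// a standalone round-trip of the checkpoint string format used by the HDFS and ZooKeeper
// checkpointers above: one "topic,partition,fromOffset,untilOffset" token per partition,
// pipe-separated. Topic name and offsets are sample values; spark-streaming-kafka-0-10 is
// assumed on the classpath for OffsetRange.
import org.apache.spark.streaming.kafka010.OffsetRange

object CheckPointStringSketch {

  // Mirrors toStringOfKafkaOffsetRanges from ImplicitKafkaConverters earlier in this diff.
  def encode(ranges: Array[OffsetRange]): String =
    ranges.map(r => s"${r.topic},${r.partition},${r.fromOffset},${r.untilOffset}").mkString("|")

  // Mirrors toKafkaOffsetRanges: parse a saved checkpoint back into OffsetRange instances.
  def decode(checkpoint: String): Array[OffsetRange] =
    checkpoint.split('|').map { token =>
      val Array(topic, partition, from, until) = token.split(',')
      OffsetRange(topic, partition.toInt, from.toLong, until.toLong)
    }

  def main(args: Array[String]): Unit = {
    val saved = encode(Array(OffsetRange("flights", 0, 0L, 120L), OffsetRange("flights", 1, 0L, 95L)))
    println(saved)                 // flights,0,0,120|flights,1,0,95
    decode(saved).foreach(println) // round-trips to the same ranges
  }
}
// ---------------------------------------------------------------------------------------------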
- */ - -package com.paypal.gimel.kafka.utilities - -import java.io.{Closeable, Serializable} -import java.nio.ByteBuffer -import java.util.{Properties, UUID} - -import scala.collection.JavaConverters._ -import scala.collection.immutable.Map -import scala.collection.mutable -import scala.language.implicitConversions -import scala.reflect.runtime.universe._ -import scala.util.parsing.json.JSON - -import org.apache.avro.generic.GenericRecord -import org.apache.kafka.clients.consumer.{ConsumerConfig, ConsumerRecord, KafkaConsumer} -import org.apache.kafka.clients.producer.ProducerConfig -import org.apache.kafka.common.serialization._ -import org.apache.spark.rdd.RDD -import org.apache.spark.sql._ -import org.apache.spark.sql.streaming.StreamingQueryListener -import org.apache.spark.sql.streaming.StreamingQueryListener.{QueryProgressEvent, QueryStartedEvent, QueryTerminatedEvent} -import org.apache.spark.streaming.kafka010._ -import org.apache.spark.streaming.kafka010.KafkaUtils._ -import spray.json._ -import spray.json.DefaultJsonProtocol._ - -import com.paypal.gimel.common.catalog.CatalogProvider -import com.paypal.gimel.common.conf.GimelConstants -import com.paypal.gimel.common.schema.ConfluentSchemaRegistry -import com.paypal.gimel.common.storageadmin -import com.paypal.gimel.common.storageadmin.KafkaAdminUtils -import com.paypal.gimel.common.utilities.DataSetUtils._ -import com.paypal.gimel.datastreamfactory.{StreamCheckPointHolder, WrappedData} -import com.paypal.gimel.kafka.avro.SparkAvroUtilities._ -import com.paypal.gimel.kafka.conf._ -import com.paypal.gimel.kafka.conf.KafkaJsonProtocol.{offsetPropertiesFormat, offsetRangePropertiesFormat} -import com.paypal.gimel.kafka.utilities.ImplicitKafkaConverters._ -import com.paypal.gimel.kafka.utilities.ImplicitZKCheckPointers._ - - -case class MessageInfo[T: TypeTag](key: String, message: T, topic: String, partition: Int, offset: Long) - -/* -Case classes for reading custom offset properties from the user defined properties - */ -case class OffsetRangeProperties(partition: Int, - from: Long, - to: Option[Long]) - -case class OffsetProperties(topic: String, - offsetRange: Array[OffsetRangeProperties]) - -object KafkaUtilities { - - val logger = com.paypal.gimel.logger.Logger() - - /** - * This is a Map of Properties that will be used to set the batch parameters - * , based on the incoming volume of data & user supplied parameters - */ - val defaultRowsPerBatch: Map[Int, Map[String, String]] = Map( - 100000000 -> Map( - KafkaConfigs.batchFetchSize -> "500" - , KafkaConfigs.maxRecordsPerPartition -> "100000000" - , KafkaConfigs.minRowsPerParallelKey -> "100000" - ) - , 50000000 -> Map( - KafkaConfigs.batchFetchSize -> "500" - , KafkaConfigs.maxRecordsPerPartition -> "50000000" - , KafkaConfigs.minRowsPerParallelKey -> "100000" - ) - , 25000000 -> Map( - KafkaConfigs.batchFetchSize -> "250" - , KafkaConfigs.maxRecordsPerPartition -> "25000000" - , KafkaConfigs.minRowsPerParallelKey -> "100000" - ) - , 10000000 -> Map( - KafkaConfigs.batchFetchSize -> "100" - , KafkaConfigs.maxRecordsPerPartition -> "10000000" - , KafkaConfigs.minRowsPerParallelKey -> "100000" - ) - , 1000000 -> Map( - KafkaConfigs.batchFetchSize -> "20" - , KafkaConfigs.maxRecordsPerPartition -> "1000000" - , KafkaConfigs.minRowsPerParallelKey -> "100000" - ) - , 100000 -> Map( - KafkaConfigs.batchFetchSize -> "10" - , KafkaConfigs.maxRecordsPerPartition -> "100000" - , KafkaConfigs.minRowsPerParallelKey -> "10000" - ) - , 30000 -> Map( - KafkaConfigs.batchFetchSize -> "10" - , 
KafkaConfigs.maxRecordsPerPartition -> "100000" - , KafkaConfigs.minRowsPerParallelKey -> "10000" - ) - ) - - - /** - * Determines whether an incoming volume of messages - * from Kafka is Streamable with given parameters. - * - * @param sparkSession : SparkSession - * @param props Properties - * @param rowsInBatch RowsPerBatch Map - * @return true if data is within streaming capacity - * , false if we need to switch to batch - */ - def isStreamable(sparkSession: SparkSession, props: Map[String, String] - , rowsInBatch: Map[Int, Map[String, String]] = defaultRowsPerBatch): Boolean = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - - logger.info(" @Begin --> " + MethodName) - - // val dSet = com.paypal.gimel.DataSet(hiveContext) - val dataSet = props(GimelConstants.DATASET) - // import com.paypal.gimel.DataSetUtils._ - // This is the DataSet Properties - val datasetProps = CatalogProvider.getDataSetProperties(dataSet) - logger.info( - s"""DataSet Props --> - |${datasetProps.props.map(x => s"${x._1} --> ${x._2}").mkString("\n")}""".stripMargin) - val newProps: Map[String, Any] = getProps(props) ++ Map( - GimelConstants.DATASET_PROPS -> datasetProps, - GimelConstants.DATASET -> dataSet, - GimelConstants.RESOLVED_HIVE_TABLE -> resolveDataSetName(dataSet), - GimelConstants.APP_TAG -> getAppTag(sparkSession.sparkContext)) - val conf = new KafkaClientConfiguration(newProps) - logger.info(s"Zookeeper Details --> ${conf.zkHostAndPort} | ${conf.zkCheckPoints}") - val thresholdRows = 1000000000 - val lastCheckPoint: Option[Array[OffsetRange]] = getLastCheckPointFromZK(conf.zkHostAndPort - , conf.zkCheckPoints) - val availableOffsetRange: Array[OffsetRange] = { - BrokersAndTopic(conf.kafkaHostsAndPort, conf.kafkaTopics).toKafkaOffsetsPerPartition - } - if (lastCheckPoint.isDefined) { - logger.info(s"Offsets in CheckPoint --> ${lastCheckPoint.get.mkString("\n")}") - } - logger.info(s"Offsets in Kafka --> ${availableOffsetRange.mkString("\n")}") - val newOffsetRangesForReader: Array[OffsetRange] = { - getNewOffsetRangeForReader(lastCheckPoint, availableOffsetRange, thresholdRows) - } - logger.info(s"New Offsets to Fetch --> ${newOffsetRangesForReader.mkString("\n")}") - val totalMessages = newOffsetRangesForReader.map(oR => oR.untilOffset - oR.fromOffset).sum.toInt - logger.info(s"Total Messages from New Offsets to Fetch --> $totalMessages") - val userSuppliedMaxRows = { - sparkSession.conf.get(KafkaConfigs.rowCountOnFirstRunKey, totalMessages.toString) - } - val totalRows = if (lastCheckPoint.isEmpty) userSuppliedMaxRows.toInt else totalMessages - logger.info(s"Final Total Messages to Fetch --> $totalRows") - val streamCutOff = sparkSession.conf.get(KafkaConfigs.streamCutOffThresholdKey, "100000").toInt - val (batchProps, isStreamable) = totalRows match { - case n if 50000000 <= n => - (rowsInBatch(100000000), false) - case n if 25000000 <= n => - (rowsInBatch(50000000), false) - case n if 10000000 <= n => - (rowsInBatch(25000000), false) - case n if 1000000 <= n => - (rowsInBatch(10000000), false) - case n if streamCutOff <= n => - (rowsInBatch(1000000), false) - case _ => - (Map(), true) - } - logger.info(s"Batch Props --> $batchProps") - val resolvedProps = props ++ batchProps - logger.info(s"Resolved Props --> $resolvedProps") - logger.info(s"isStreamable --> $isStreamable") - resolvedProps.foreach(p => sparkSession.conf.set(p._1, p._2.toString)) - isStreamable - } - - /** - * Convenience Function to checkpoint a given OffsetRange - * - * @param zkHost Host Server 
for Zookeeper - * @param zkNodes Node where we want to checkPoint - * @param offsetRange Array[OffsetRange] - * @return Boolean indicating checkpointing status - */ - - def inStreamCheckPoint(zkHost: String, zkNodes: Seq[String] - , offsetRange: Array[OffsetRange]): Boolean = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - - logger.info(" @Begin --> " + MethodName) - - val zk = ZooKeeperHostAndNodes(zkHost, zkNodes) - (zk, offsetRange).saveZkCheckPoint - } - - /** - * Convenience Function to checkpoint a given OffsetRange - * - * @param sparkSession Spark Session - * @param zkHost Host Server for Zookeeper - * @param zkNodes Node where we want to checkPoint - * @return Boolean indicating checkpointing status - */ - - def inStructuredStreamCheckPoint(sparkSession: SparkSession, zkHost: String, zkNodes: Seq[String]): Unit = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - - logger.info(" @Begin --> " + MethodName) - sparkSession.streams.addListener(new StreamingQueryListener() { - override def onQueryStarted(event: QueryStartedEvent): Unit = Unit - override def onQueryProgress(event: QueryProgressEvent): Unit = { - val queryStatusMap = JSON.parseFull(event.progress.json).get.asInstanceOf[Map[String, Any]] - val endOffsetsMap: Map[String, Map[Any, Any]] = queryStatusMap.get("sources").head.asInstanceOf[List[Any]].head.asInstanceOf[Map[Any, Any]].get("endOffset").head.asInstanceOf[Map[String, Map[Any, Any]]] - val endOffsets = endOffsetsMap.flatMap { x => - x._2.map { y => - OffsetRange(topic = x._1, partition = y._1.asInstanceOf[String].toInt, fromOffset = 0L, untilOffset = y._2.asInstanceOf[Double].longValue()) - } - }.toArray - StreamCheckPointHolder().setCurentCheckPoint(endOffsets) - inStreamCheckPoint(zkHost, zkNodes, endOffsets) - } - override def onQueryTerminated(event: QueryTerminatedEvent): Unit = { - sparkSession.streams.removeListener(this) - } - }) - } - - /** - * Gets the Appropriate Serializer Class - * - * @param serializerClassName Name of the Serializer Class - * @return Serializer Class - */ - - def getSerializer(serializerClassName: String) - : Class[_ >: StringSerializer with ByteArraySerializer <: Serializer[_ >: String with Array[Byte]]] = { - serializerClassName match { - case "org.apache.kafka.common.serialization.StringSerializer" => { - classOf[org.apache.kafka.common.serialization.StringSerializer] - } - case "org.apache.kafka.common.serialization.ByteArraySerializer" => { - classOf[org.apache.kafka.common.serialization.ByteArraySerializer] - } - case _ => { - throw new Exception(s"UnSupported Serializer Class Requested : ${serializerClassName}") - } - } - } - - /** - * Gets the Appropriate DeSerializer Class - * - * @param deserializerClassName Name of the DeSerializer Class - * @return DeSerializer Class - */ - - def getDeserializer(deserializerClassName: String) - : Class[_ >: StringDeserializer with ByteArrayDeserializer <: Deserializer[_ >: String with Array[Byte]]] = { - deserializerClassName match { - case "org.apache.kafka.common.serialization.StringDeserializer" => { - classOf[org.apache.kafka.common.serialization.StringDeserializer] - } - case "org.apache.kafka.common.serialization.ByteArrayDeserializer" => { - classOf[org.apache.kafka.common.serialization.ByteArrayDeserializer] - } - case _ => { - throw new Exception(s"UnSupported DeSerializer Class Requested : ${deserializerClassName}") - } - } - } - - /** - * Gets the Appropriate De/Serializer Class - * - * @param serDe Name of the 
De/Serializer Class - * @return De/Serializer Class - */ - - def getSerDe(serDe: String): Class[_ >: StringDeserializer - with ByteArrayDeserializer with StringSerializer with ByteArraySerializer <: Closeable] = { - serDe match { - case "org.apache.kafka.common.serialization.StringDeserializer" => { - classOf[org.apache.kafka.common.serialization.StringDeserializer] - } - case "org.apache.kafka.common.serialization.ByteArrayDeserializer" => { - classOf[org.apache.kafka.common.serialization.ByteArrayDeserializer] - } - case "org.apache.kafka.common.serialization.StringSerializer" => { - classOf[org.apache.kafka.common.serialization.StringSerializer] - } - case "org.apache.kafka.common.serialization.ByteArraySerializer" => { - classOf[org.apache.kafka.common.serialization.ByteArraySerializer] - } - case _ => { - throw new Exception(s"UnSupported serDe Class Requested : ${serDe}") - } - } - } - - /** - * Converts RDD[WrappedData] to DataFrame with just 1 column - - * which is the entire message String from Kafka - * - * @param sqlContext SQLContext - * @param columnAlias Name of Column in DataFrame - * @param wrappedData WrappedData - * @return DataFrame - */ - def wrappedStringDataToDF(columnAlias: String, sqlContext: SQLContext - , wrappedData: RDD[WrappedData]): DataFrame = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - - logger.info(" @Begin --> " + MethodName) - - logger.info("Attempting to Convert Value in Wrapped Data to String Type") - try { - val rdd: RDD[(String, String)] = wrappedData.map { x => - (x.key.asInstanceOf[String], x.value.asInstanceOf[String]) - } - val df = rddAsDF(sqlContext, columnAlias, rdd) - logger.info("Completed --> Convert Value to String Type") - df - } catch { - case ex: Throwable => - ex.printStackTrace() - throw ex - } - - } - - /** - * Completely Clear the CheckPointed Offsets, leading to Read from Earliest offsets from Kafka - * - * @param zkHost Zookeeper Host - * @param zkNodes Zookeeper Path - * @param msg Some Message or A Reason for Clearing CheckPoint - */ - def clearCheckPoint(zkHost: String, zkNodes: Seq[String], msg: String): Unit = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - - logger.info(" @Begin --> " + MethodName) - - val zk = ZooKeeperHostAndNodes(zkHost, zkNodes) - zk.deleteZkCheckPoint() - } - - - /** - * Gets the Latest CheckPoint from Zookeeper, if available - * - * @param zkHost Host Server for Zookeeper - * @param zkNodes Node where we want to checkPoint - * @return Option[Array[OffsetRange] - */ - - def getLastCheckPointFromZK(zkHost: String, zkNodes: Seq[String]): Option[Array[OffsetRange]] = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - logger.info(" @Begin --> " + MethodName) - - try { - val zk = ZooKeeperHostAndNodes(zkHost, zkNodes) - val lastCheckPoint: Option[Array[OffsetRange]] = zk.fetchZkCheckPoint - lastCheckPoint - } catch { - case ex: Throwable => - ex.printStackTrace() - throw ex - } - } - - /** - * Function Gets - * Either : The difference between lastCheckPoint & latestOffsetRange - * Or : latestOffsetRange from Kafka - * - * @param lastCheckPoint savedCheckPoint, if available - * @param availableOffsetRange latestOfffsetRange from Kafka - * @param fetchRowsOnFirstRun This will be used if reading from kafka without - * any prior checkpoint, - * to ensure we read only last N messages - * from topic as requested by client - * @return Array[OffsetRange] - */ - - def getNewOffsetRangeForReader(lastCheckPoint: 
Option[Array[OffsetRange]] - , availableOffsetRange: Array[OffsetRange] - , fetchRowsOnFirstRun: Long): Array[OffsetRange] = { - def MethodName: String = new Exception().getStackTrace().apply(1).getMethodName() - logger.info(" @Begin --> " + MethodName) - - try { - val newOffsetRangesForReader = lastCheckPoint match { - case None => { - logger.warning( - s"""No CheckPoint Found. - |Reader will attempt to fetch "from beginning" From Kafka !""".stripMargin) - availableOffsetRange.map { - eachOffsetRange => - val fromOffset = scala.math.min(fetchRowsOnFirstRun - , eachOffsetRange.untilOffset - eachOffsetRange.fromOffset) - logger.info(s"Since this is first run," + - s" will try to fetch only ${fromOffset} rows from Kafka") - OffsetRange(eachOffsetRange.topic, eachOffsetRange.partition - , eachOffsetRange.untilOffset - fromOffset, eachOffsetRange.untilOffset) - } - } - case Some(lastCheckPoint) => { - logger.info("""Found CheckPoint """) - (lastCheckPoint, availableOffsetRange).toNewOffsetRanges - } - } - newOffsetRangesForReader - } - catch { - case ex: Throwable => { - ex.printStackTrace() - throw ex - } - } - } - - /** - * Function Gets - * a custom offset range as a JSON from the user defined properties - * Converts it to an array of offset ranges and returns them - * - * @param kafkaTopics sequence of topics - * @param offsetRange user given custom offset ranges, if available - * @return Array[OffsetRange] - */ - - def getCustomOffsetRangeForReader(kafkaTopics: Seq[String], offsetRange: String, consumerMode: String): Array[OffsetRange] = { - def MethodName: String = new Exception().getStackTrace().apply(1).getMethodName() - try { - val offsetRangeObject = offsetRange.parseJson.convertTo[Seq[OffsetProperties]] - val finalOffsetRanges = offsetRangeObject.flatMap { - eachTopicRange => - eachTopicRange.offsetRange.map { - eachOffsetRange => { - var toOffset = 0L - if (consumerMode == KafkaConstants.gimelAuditRunTypeStream) { - toOffset = eachOffsetRange.to.getOrElse(-1) - } - else if (consumerMode == KafkaConstants.gimelAuditRunTypeBatch) { - toOffset = eachOffsetRange.to.get - } - if(!kafkaTopics.contains(eachTopicRange.topic)) { - throw new Exception("The topic specified in custom offset range does not match the subscribed topic! Please unset the previous value or check your properties") - } - OffsetRange(eachTopicRange.topic, eachOffsetRange.partition, eachOffsetRange.from, toOffset) - } - } - }.toArray - finalOffsetRanges - } catch { - case ex: Throwable => - ex.printStackTrace() - throw ex - } - } - - /** - * Converts an RDD[Wrapped Data] into RDD[GenericRecord] - * - * @param wrappedDataRDD RDD[WrappedData] - * @param avroSchemaKey AvroSchemaKey | Example flights , flights.flights_log - * @param avroSchemaURL Confluent Schema Registry URL:Port - * @param avroSchemaSource Specifies whether schema is inline text or from CDH schema registry - * @param avroSchemaString Avro Schema String for flights - * @param isStreamParallel true indicates : can repartition data for parallelism. 
- * false is usually set for preserving ordering of data - * as received from kafka - * @param streamParallels Repartition factor, for example : 10 indicates repartition to - * 10 executors - * @return RDD[GenericRecord] - */ - def wrappedDataToAvro(wrappedDataRDD: RDD[WrappedData], avroSchemaKey: String, - avroSchemaURL: String, - avroSchemaSource: String, avroSchemaString: String, - isStreamParallel: Boolean, streamParallels: Int, - cdhAllSchemaDetails: Option[Map[String, - (String, mutable.Map[Int, String])]]): RDD[GenericRecord] = { - def MethodName: String = new Exception().getStackTrace().apply(1).getMethodName() - logger.info(" @Begin --> " + MethodName) - - try { - val parallelRDD = if (isStreamParallel) { - wrappedDataRDD.repartition(streamParallels) - } else { - wrappedDataRDD - } - val avroRecord: RDD[GenericRecord] = parallelRDD.map { - x => bytesToGenericRecord(x.value.asInstanceOf[Array[Byte]], avroSchemaString) - } - val finalAvroRecord = avroSchemaSource.toUpperCase() match { - case "CDH" => - deserializeCurRec(avroRecord, cdhAllSchemaDetails) - case _ => avroRecord - } - finalAvroRecord - } - catch { - case ex: Throwable => { - ex.printStackTrace() - throw ex - } - } - } - - /** - * Fetches the Schema for each Topic with version - * - * @param schemaSubject Schema Key - * @param avroSchemaURL Confluent Schema URL - * @return Map of Topic -> (Version & Schema) - */ - - def getAllSchemasForSubject(schemaSubject: String, avroSchemaURL: String) - : (String, mutable.Map[Int, String]) = { - def MethodName: String = new Exception().getStackTrace().apply(1).getMethodName() - logger.info(" @Begin --> " + MethodName) - - val schemaLookup: scala.collection.mutable.Map[Int, String] = scala.collection.mutable.Map() - val schemaRegistryClient = new ConfluentSchemaRegistry(avroSchemaURL) - val k = schemaRegistryClient.getAllVersions(schemaSubject).asScala - val k2 = k.map { eachVersion => - val version = eachVersion.toString.toInt - version -> schemaRegistryClient.getVersion(schemaSubject, version).getSchema - }.toMap - k2.foreach(entry => schemaLookup.put(entry._1, entry._2)) - val latestSchema = schemaRegistryClient.getLatestVersion(schemaSubject).getSchema - (latestSchema, schemaLookup) - } - - - /** - * Deserialize the CDH record (bytes) , get GenericRecord - * - * @param avroRecord Avro GenericRecord RDD - * @param cdhAllSchemaDetails All the Subjects with LatestSchema and EachVersion - * @return Avro GenericRecord RDD - */ - def deserializeCurRec(avroRecord: RDD[GenericRecord] - , cdhAllSchemaDetails: Option[Map[String, - (String, mutable.Map[Int, String])]]): RDD[GenericRecord] = { - def MethodName: String = new Exception().getStackTrace().apply(1).getMethodName() - logger.info(" @Begin --> " + MethodName) - - val additionalFields = getAdditionalFields() - try { - val actualRecord: RDD[GenericRecord] = avroRecord.map { eachRecord => - val eachRecordSchemaSubject = eachRecord.get("schemaSubject").toString - val schemaThisRec = cdhAllSchemaDetails.get(eachRecordSchemaSubject)._1 - val eachRec: Array[Byte] = eachRecord.get("currentRecord").asInstanceOf[ByteBuffer].array() - var metaColumnsMap = scala.collection.immutable.Map[String, String]() - // Add mandatory meta columns, gg commit timestamp, rba and opType - additionalFields.foreach { - field => metaColumnsMap += (field._1 -> eachRecord.get(field._2).toString) - } - val genericRecord: GenericRecord = bytesToGenericRecord(eachRec, schemaThisRec) - val newSchema = addAdditionalFieldsToSchema(additionalFields.keySet.toList, 
schemaThisRec) - val newGenericRecord = copyToGenericRecord(genericRecord, schemaThisRec, newSchema) - metaColumnsMap.foreach { kv => newGenericRecord.put(kv._1, kv._2) } - newGenericRecord - } - actualRecord - } - catch { - case ex: Throwable => { - ex.printStackTrace() - throw ex - } - } - } - - /** - * Lists Additional fields to pick from CDH metadata record. - * - * @return List of Metadata columns - */ - def getAdditionalFields(): scala.collection.immutable.Map[String, String] = - scala.collection.immutable.Map("gg_commit_timestamp" -> "opTs" - , "opt_type" -> "opType", "trail_seq_no" -> "trailSeqno", "trail_rba" -> "trailRba") - - - /** - * Adds additional fields to the Avro Schem - * - * @param additionalFields List of fields to Add - * @param schemaString Input Avro Schema - * @return Updated Avro Schema String - */ - def addAdditionalFieldsToSchema(additionalFields: List[String], schemaString: String) - : String = { - // Parse as JsValue - val schemaAsJsVal = schemaString.parseJson - // Convert to JsObject - val schemaAsJsObject = schemaAsJsVal.asJsObject - // Get the Map of each element & Value - val schemaElementsMap: Map[String, JsValue] = schemaAsJsObject.fields - // These fields will be added with "to-add" fields - val schemaFields = schemaAsJsObject.getFields("fields").head.convertTo[Seq[JsValue]] - val additionalFieldsJSON: List[String] = additionalFields.map { - x => s"""{"name":"${x}","type":["null","string"]}""".stripMargin - } // "to-add" fields - val additionalFieldsAsJsVal: List[JsValue] = additionalFieldsJSON.map { x => x.parseJson } - // added both fields - val combinedFields: Seq[JsValue] = schemaFields ++ additionalFieldsAsJsVal - // formation of a String so it can be inferred as JsVal - val combinedFieldsAsString = combinedFields.map { - x => x.asJsObject.compactPrint - }.mkString("[", ",", "]") - val combinedFieldsAsJsValue = combinedFieldsAsString.parseJson - val toOverride = scala.collection.Map("fields" -> combinedFieldsAsJsValue) - val k12 = schemaElementsMap ++ toOverride - k12.toJson.compactPrint - } - - /** - * Get the Column Alias Name for a Given Single Column DF to be read from Kafka Topic - * that has human readable message - * - * @param conf KafkaClientConfiguration - * @return column alias name - */ - def kafkaMessageColumnAlias(conf: KafkaClientConfiguration): String = { - conf.tableProps.getOrElse("kafka.message.column.alias", "message").toString - } - - - /** - * InTakes RDD And Converts to DataFrame - * - * @param sqlContext SQL Context - * @param messageColumnAlias Message Column Name - * @param rdd RDD[(String,String)] - * @return DataFrame - */ - def stringRddAsDF(sqlContext: SQLContext, messageColumnAlias: String - , rdd: RDD[(String, String)]): DataFrame = { - def MethodName: String = new Exception().getStackTrace().apply(1).getMethodName() - logger.info(" @Begin --> " + MethodName) - - try { - val dataIntermediate = sqlContext.createDataFrame(rdd) - .withColumnRenamed("_2", "message") - .withColumnRenamed("_1", "key") - val df = dataIntermediate.select("message").withColumnRenamed("message", messageColumnAlias) - df - } - catch { - case ex: Throwable => { - ex.printStackTrace() - logger.error(s"Failed While Attempting to Convert RDD to DF") - throw ex - } - } - } - - /** - * Converts RDD[WrappedData] to DataFrame - * - * @param sqlContext SQLContext - * @param valueMessageType Message Type From Kafka - such as string, json, binary.. 
- * @param keySerializer Key Serializer - * @param valueSerializer Value Serializer - * @param rdd RDD[Wrapped Data] - * @param kafkaValueMessageColAlias Column Alias in DataFrame for Messages from Kafka - * @param avroSchemaString Avro Schema String for Derserialization - * @param avroSchemaSource Avro Schema Source such as Inline or CDH Confluent Schema Registry - * @param cdhTopicSchemaMetadata CDH TopicSchema Details - * @param cdhAllSchemaDetails The Topic , Version, Schema information - * @return DataFrame - */ - - def rddToDF(sqlContext: SQLContext - , valueMessageType: Option[String] - , keySerializer: String - , valueSerializer: String - , rdd: RDD[WrappedData] - , kafkaValueMessageColAlias: String = "value" - , avroSchemaString: String - , avroSchemaSource: String - , cdhTopicSchemaMetadata: Option[String] - , cdhAllSchemaDetails: Option[Map[String, (String, mutable.Map[Int, String])]]) - : DataFrame = { - (valueMessageType, valueSerializer) match { - // Bytes Messages - case (Some("binary"), "org.apache.kafka.common.serialization.ByteArraySerializer") => - val rDD = rdd.map { x => (x.key.asInstanceOf[String], x.value.asInstanceOf[Array[Byte]]) } - // logger.info("Byte Messages -->"); - // rDD.cache.collect.take(10).foreach(x => logger.info(x)) - val columnAlias = kafkaValueMessageColAlias - byteRddAsDF(sqlContext, columnAlias, rDD) - // String Messages - case (Some("string"), "org.apache.kafka.common.serialization.StringSerializer") => - val rDD = rdd.map { x => (x.key.asInstanceOf[String], x.value.asInstanceOf[String]) } - // logger.info("String Messages -->"); - // rDD.cache.collect.take(10).foreach(x => logger.info(x)) - val columnAlias = kafkaValueMessageColAlias - stringRddAsDF(sqlContext, columnAlias, rDD) - // JSON Messages - case (Some("json"), "org.apache.kafka.common.serialization.StringSerializer") => - val rDD: RDD[String] = rdd.map { x => x.value.asInstanceOf[String] } - // logger.info("JSON Messages -->"); - // rDD.cache.collect.take(10).foreach(x => logger.info(x)) - sqlContext.read.json(rDD) - // Avro - CDH | Generic Avro - case (_, "org.apache.kafka.common.serialization.ByteArraySerializer") => - val rDD = rdd.map { x => (x.key, x.value.asInstanceOf[Array[Byte]]) } - // logger.info("Raw Messages -->"); - // rDD.cache.collect.take(10).foreach(x => logger.info(x)) - val avroRecord: RDD[GenericRecord] = rDD.map { x => - bytesToGenericRecord(x._2, avroSchemaString) - } - val (finalAvroRecord, finalSchema) = avroSchemaSource.toUpperCase() match { - case KafkaConstants.gimelKafkaAvroSchemaCDH => { - val newSchemaCDH = addAdditionalFieldsToSchema(getAdditionalFields().keySet.toList - , cdhTopicSchemaMetadata.get) - (deserializeCurRec(avroRecord, cdhAllSchemaDetails), newSchemaCDH) - } - case _ => (avroRecord, avroSchemaString) - } - genericRecordtoDF(sqlContext, finalAvroRecord, finalSchema) - // Other Types - case _ => throw new Exception("Unsupported Configuration or Serialization Techniques") - } - } - - /** - * Returns A Wrapped Message from Kafka - * - * @param sqlContext SQLContext - * @param conf KafkaClientConfiguration - * @param parallelizedRanges Array[OffsetRange] - * @return RDD[WrappedData] - */ - - def getFromKafkaAsWrappedData(sqlContext: SQLContext - , conf: KafkaClientConfiguration - , parallelizedRanges: Array[OffsetRange] - ): RDD[WrappedData] = { - def MethodName: String = new Exception().getStackTrace().apply(1).getMethodName() - logger.info(" @Begin --> " + MethodName) - - val (avroSchemaString, avroSchemaKey, avroSchemaURL) = 
(conf.avroSchemaString - , conf.avroSchemaKey - , conf.avroSchemaURL - ) - val kafkaParams: java.util.Map[String, Object] = new java.util.HashMap() - conf.kafkaConsumerProps.foreach { x => kafkaParams.put(x._1, x._2) } - logger.info(s"Final Kafka Params --> ${kafkaParams.asScala.mkString("\n", "\n", "\n")}") - logger.info( - s"""kafka.message.value.type --> ${conf.kafkaMessageValueType} - |\nValue Serializer --> ${conf.kafkaValueSerializer}""".stripMargin - ) - try { - - val rdd: RDD[_ >: (String, Array[Byte]) with (String, String) <: (String, Serializable)] = - (conf.kafkaMessageValueType, conf.kafkaValueSerializer) match { - // Bytes Messages - case (Some("binary"), "org.apache.kafka.common.serialization.ByteArraySerializer") => - val rDDConsumerRec: RDD[ConsumerRecord[String, Array[Byte]]] = - createRDD[String, Array[Byte]]( - sqlContext.sparkContext, kafkaParams - , parallelizedRanges, LocationStrategies.PreferConsistent) - rDDConsumerRec.map { x => (x.key(), x.value()) } - // String Messages - case (Some("string"), "org.apache.kafka.common.serialization.StringSerializer") => - val rDDConsumerRec: RDD[ConsumerRecord[String, String]] = - createRDD[String, String](sqlContext.sparkContext - , kafkaParams, parallelizedRanges, LocationStrategies.PreferConsistent) - rDDConsumerRec.map { x => (x.key(), x.value()) } - // JSON Messages - case (Some("json"), "org.apache.kafka.common.serialization.StringSerializer") => - val rDDConsumerRec: RDD[ConsumerRecord[String, String]] = - createRDD[String, String](sqlContext.sparkContext - , kafkaParams, parallelizedRanges, LocationStrategies.PreferConsistent) - rDDConsumerRec.map { x => (x.key(), x.value()) } - // Avro - CDH | Generic Avro - case (_, "org.apache.kafka.common.serialization.ByteArraySerializer") => - val rDDConsumerRec: RDD[ConsumerRecord[String, Array[Byte]]] = - createRDD[String, Array[Byte]](sqlContext.sparkContext - , kafkaParams, parallelizedRanges, LocationStrategies.PreferConsistent) - rDDConsumerRec.map { x => (x.key(), x.value()) } - // Other Types - case _ => throw new Exception("Unsupported Configuration or Serialization Techniques") - } - - rdd.map(x => WrappedData(x._1, x._2)) - } - catch { - case ex: Throwable => { - ex.printStackTrace() - val messageString = - s"""kafkaParams --> ${kafkaParams.asScala.mkString(" \n ")}""".stripMargin - logger.error(s"Unable to Fetch from Kafka for given parameters --> ${messageString}") - throw ex - } - } - } - - /** - * Returns DataFrame -fetching messages from Kafka - * - * @param sqlContext SQLContext - * @param conf KafkaClientConfiguration - * @param parallelizedRanges Array[OffsetRange] - * @return DataFrame - */ - - def getAsDFFromKafka(sqlContext: SQLContext, conf: KafkaClientConfiguration - , parallelizedRanges: Array[OffsetRange]): DataFrame = { - def MethodName: String = new Exception().getStackTrace().apply(1).getMethodName() - logger.info(" @Begin --> " + MethodName) - - val kafkaParams: java.util.Map[String, Object] = new java.util.HashMap() - conf.kafkaConsumerProps.foreach { x => kafkaParams.put(x._1, x._2) } - logger.info(s"Final Kafka Params --> ${kafkaParams.asScala.mkString("\n", "\n", "\n")}") - logger.info( - s"""kafka.message.value.type --> ${conf.kafkaMessageValueType} - |\nValue Serializer --> ${conf.kafkaValueSerializer}""".stripMargin) - val wrappedDataRdd: RDD[WrappedData] = getFromKafkaAsWrappedData(sqlContext, conf, parallelizedRanges) - rddToDF(sqlContext, conf.kafkaMessageValueType, conf.kafkaKeySerializer - , conf.kafkaValueSerializer, wrappedDataRdd, 
"value", conf.avroSchemaString - , conf.avroSchemaSource, conf.cdhTopicSchemaMetadata, conf.cdhAllSchemaDetails) - } - - /** - * Converts Avro RDD to Spark DataFrame - * - * @param avroRecord RDD Generic Record - * @param sqlContext SQLContext - * @param avroSchemaString Avro Schema String - * @param avroSchemaSource Avro Schema Source - * @param cdhTopicSchemaMetadata CDH Topic Metadata Details - * @param cdhAllSchemaDetails CDH Schema Details (Keys, Schemas..) - * @return DataFrame - */ - - @deprecated - def avroToDF1(avroRecord: RDD[GenericRecord] - , sqlContext: SQLContext - , avroSchemaString: String - , avroSchemaSource: String - , cdhTopicSchemaMetadata: Option[String] - , cdhAllSchemaDetails: Option[Map[String, (String, mutable.Map[Int, String])]]) - : DataFrame = { - def MethodName: String = new Exception().getStackTrace().apply(1).getMethodName() - logger.info(" @Begin --> " + MethodName) - - val (finalAvroRecord, finalSchema) = avroSchemaSource match { - case KafkaConstants.gimelKafkaAvroSchemaCDH => { - val newSchemaCDH = addAdditionalFieldsToSchema(getAdditionalFields().keySet.toList - , cdhTopicSchemaMetadata.get) - (deserializeCurRec(avroRecord, cdhAllSchemaDetails), newSchemaCDH) - } - case _ => (avroRecord, avroSchemaString) - } - val df = genericRecordtoDF(sqlContext, finalAvroRecord, finalSchema) - df - } - - /** - * InTakes RDD And Converts to DataFrame - * - * @param sqlContext SQL Context - * @param messageColumnAlias Message Column Name - * @param rdd RDD[(String, String)] - * @return DataFrame - */ - def rddAsDF(sqlContext: SQLContext, messageColumnAlias: String - , rdd: RDD[(String, String)]): DataFrame = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - - logger.info(" @Begin --> " + MethodName) - - try { - val dataIntermediate = sqlContext.createDataFrame(rdd) - .withColumnRenamed("_2", "message").withColumnRenamed("_1", "key") - dataIntermediate.select("message").withColumnRenamed("message", messageColumnAlias) - } catch { - case ex: Throwable => - ex.printStackTrace() - logger.error(s"Failed While Attempting to Convert RDD to DF") - throw ex - } - } - - /** - * InTakes RDD And Converts to DataFrame - * - * @param sqlContext SQL Context - * @param messageColumnAlias Message Column Name - * @param rdd RDD[(String,Array[Byte])] - * @return DataFrame - */ - def byteRddAsDF(sqlContext: SQLContext, messageColumnAlias: String - , rdd: RDD[(String, Array[Byte])]): DataFrame = { - def MethodName: String = new Exception().getStackTrace().apply(1).getMethodName() - logger.info(" @Begin --> " + MethodName) - - try { - val dataIntermediate = sqlContext.createDataFrame(rdd) - .withColumnRenamed("_2", "message").withColumnRenamed("_1", "key") - dataIntermediate.select("message").withColumnRenamed("message", messageColumnAlias) - } - catch { - case ex: Throwable => { - ex.printStackTrace() - logger.error(s"Failed While Attempting to Convert RDD to DF") - throw ex - } - } - } - - /** - * Creates a Topic in Kafka if it does not exists - * - * @param zookKeeperHostAndPort Zookeeper Host & Port | Example localhost:2181 - * @param kafkaTopicName Kafka Topic Name - * @param numberOfPartitions Number of Partitions - * @param numberOfReplica Number of Replicas - */ - def createTopicIfNotExists(zookKeeperHostAndPort: String, kafkaTopicName: String - , numberOfPartitions: Int, numberOfReplica: Int): Unit = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - - logger.info(" @Begin --> " + MethodName) - - 
KafkaAdminUtils.createTopicIfNotExists( - zookKeeperHostAndPort - , kafkaTopicName - , numberOfPartitions - , numberOfReplica - ) - } - - /** - * Delete a Topic if it exists - * - * @param zookKeeperHostAndPort Zookeeper Host & Port | Example localhost:2181 - * @param kafkaTopicName Kafka Topic Name - */ - def deleteTopicIfExists(zookKeeperHostAndPort: String, kafkaTopicName: String): Unit = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - - logger.info(" @Begin --> " + MethodName) - - storageadmin.KafkaAdminUtils.deleteTopicIfExists( - zookKeeperHostAndPort - , kafkaTopicName - ) - } - - /** - * - * @param properties - * @return - */ - def getKafkaConsumer(properties: Option[Properties] = None): KafkaConsumer[Object, Object] = { - val consumerProperties = new Properties() - if (properties.isDefined) { - consumerProperties.putAll(properties.get) - } - // Ensure the serializer configuration is set though its not needed - consumerProperties.setProperty(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, classOf[BytesDeserializer].getName) - consumerProperties.setProperty(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, classOf[BytesDeserializer].getName) - val group = consumerProperties.get(ConsumerConfig.GROUP_ID_CONFIG) - if (group == null) { - consumerProperties.put(ConsumerConfig.GROUP_ID_CONFIG, "kafka-consumer-offset-client-" + UUID.randomUUID) - } - new KafkaConsumer[Object, Object](consumerProperties) - } - - /** - * - * @param broker - * @return - */ - def getDefaultConsumerPropertiesPerBroker(broker: String): Properties = { - val props = new Properties() - props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, broker) - props - } -} - -/** - * Custom Exception for KafkaUtilities related errors - * - * @param message Message to Throw - * @param cause A Throwable Cause - */ -class KafkaUtilitiesException(message: String, cause: Throwable) - extends RuntimeException(message) { - if (cause != null) { - initCause(cause) - } - - def this(message: String) = this(message, null) -} diff --git a/gimel-dataapi/gimel-connectors/gimel-kafka-0.10/src/main/scala/com/paypal/gimel/kafka/writer/KafkaBatchProducer.scala b/gimel-dataapi/gimel-connectors/gimel-kafka-0.10/src/main/scala/com/paypal/gimel/kafka/writer/KafkaBatchProducer.scala deleted file mode 100644 index bb0c3413..00000000 --- a/gimel-dataapi/gimel-connectors/gimel-kafka-0.10/src/main/scala/com/paypal/gimel/kafka/writer/KafkaBatchProducer.scala +++ /dev/null @@ -1,153 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
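For context on the hunk above: the deleted KafkaUtilities.scala picks the reader's offset ranges in two ways. When no checkpoint exists it reads at most fetchRowsOnFirstRun messages from the tail of each partition; when a checkpoint is found it resumes from the stored offsets. Below is a minimal standalone sketch of the first-run trimming, using Spark's org.apache.spark.streaming.kafka010.OffsetRange just as the deleted code does; the helper name firstRunRanges is illustrative only and not part of the project.

    import org.apache.spark.streaming.kafka010.OffsetRange

    // With no checkpoint, read only the newest fetchRowsOnFirstRun messages per
    // partition: keep untilOffset and move fromOffset forward accordingly.
    def firstRunRanges(available: Array[OffsetRange], fetchRowsOnFirstRun: Long): Array[OffsetRange] =
      available.map { r =>
        val rows = math.min(fetchRowsOnFirstRun, r.untilOffset - r.fromOffset)
        OffsetRange(r.topic, r.partition, r.untilOffset - rows, r.untilOffset)
      }

Note that only the kafka-0.10 connector is dropped here; the kafka2 connector keeps its own KafkaUtilities.scala, which this patch simply moves under gimel-kafka.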
- */ - -package com.paypal.gimel.kafka.writer - -import java.util.Properties - -import scala.collection.JavaConverters._ -import scala.language.implicitConversions -import scala.reflect.runtime.universe._ - -import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord} -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.DataFrame - -import com.paypal.gimel.kafka.avro.SparkAvroUtilities._ -import com.paypal.gimel.kafka.conf.KafkaClientConfiguration -import com.paypal.gimel.kafka.utilities.KafkaUtilitiesException - -/** - * Implements Produce to Kafka Logic Here - */ -object KafkaBatchProducer { - - val logger = com.paypal.gimel.logger.Logger() - - /** - * InTakes a DataFrame - * Convert to Avro Record - * Serialize the record into Bytes - * Publish to Kafka - * - * @param conf KafkaClientConfiguration - * @param data RDD - */ - def produceToKafka[T: TypeTag](conf: KafkaClientConfiguration, data: RDD[T]): Unit = { - def MethodName: String = new Exception().getStackTrace().apply(1).getMethodName() - logger.info(" @Begin --> " + MethodName) - - val kafkaProps: Properties = conf.kafkaProducerProps - val kafkaTopic = conf.kafkaTopics - logger.info(s"Kafka Props for Producer -> ${kafkaProps.asScala.mkString("\n")}") - logger.info("Begin Publishing to Kafka....") - try { - data.foreachPartition { eachPartition => - val producer: KafkaProducer[Nothing, T] = new KafkaProducer(kafkaProps) - val resp = eachPartition.map { messageString => - val rec = new ProducerRecord(kafkaTopic, messageString) - producer.send(rec) - } - resp.length - producer.close() - } - } - catch { - case ex: Throwable => { - ex.printStackTrace() - val msg = - s""" - |kafkaTopic -> ${kafkaTopic} - |kafkaParams --> ${kafkaProps.asScala.mkString("\n")}} - """.stripMargin - throw new KafkaUtilitiesException(s"Failed While Pushing Data Into Kafka \n ${msg}") - } - } - logger.info("Publish to Kafka - Completed !") - } - - /** - * InTakes a DataFrame - * Convert to Avro Record - * Serialize the record into Bytes - * Publish to Kafka - * - * @param conf KafkaClientConfiguration - * @param dataFrame DataFrame - */ - def produceToKafka(conf: KafkaClientConfiguration, dataFrame: DataFrame): Unit = { - def MethodName: String = new Exception().getStackTrace().apply(1).getMethodName() - logger.info(" @Begin --> " + MethodName) - - logger.info(s"kafka.message.value.type --> ${conf.kafkaMessageValueType} \nValue Serializer --> ${conf.kafkaValueSerializer}") - (conf.kafkaMessageValueType, conf.kafkaValueSerializer) match { - case (Some("binary"), "org.apache.kafka.common.serialization.ByteArraySerializer") => - val rdd = dataFrame.rdd.map { x => x.getAs[Array[Byte]](0) } - produceToKafka(conf, rdd) - case (Some("string"), "org.apache.kafka.common.serialization.StringSerializer") => - val rdd = dataFrame.rdd.map { x => x.getAs[String](0) } - produceToKafka(conf, rdd) - case (Some("json"), "org.apache.kafka.common.serialization.StringSerializer") => - val rdd = dataFrame.toJSON.rdd - produceToKafka(conf, rdd) - case (_, "org.apache.kafka.common.serialization.ByteArraySerializer") => { - val kafkaProps: Properties = conf.kafkaProducerProps - val avroSchemaString = conf.avroSchemaString - val kafkaTopic = conf.kafkaTopics - logger.debug(s"Kafka Props for Producer -> ${kafkaProps.asScala.mkString("\n")}") - logger.debug(s"avro Schema --> ${avroSchemaString}") - logger.debug(s"dataframe Schema --> ${dataFrame.schema}") - try { - if (!isDFFieldsEqualAvroFields(dataFrame, avroSchemaString)) { - throw new 
KafkaUtilitiesException(s"Incompatible DataFrame Schema Vs Provided Avro Schema.") - } - val genericRecordRDD = dataFrametoGenericRecord(dataFrame, avroSchemaString) - val serializedRDD: RDD[Array[Byte]] = genericRecordRDD.map(genericRecord => genericRecordToBytes(genericRecord, avroSchemaString)) - logger.info("Begin Publishing to Kafka....") - serializedRDD.foreachPartition { - eachPartition => - val producer: KafkaProducer[Nothing, Array[Byte]] = new KafkaProducer(kafkaProps) - val resp = eachPartition.map { - arrayByte => - val rec = new ProducerRecord(kafkaTopic, arrayByte) - producer.send(rec) - } - resp.length - producer.close() - } - } - catch { - case ex: Throwable => { - ex.printStackTrace() - val msg = - s""" - |kafkaTopic -> ${kafkaTopic} - |kafkaParams --> ${kafkaProps.asScala.mkString("\n")}} - |avroSchemaString --> ${avroSchemaString} - """.stripMargin - throw new KafkaUtilitiesException(s"Failed While Pushing Data Into Kafka \n ${msg}") - } - } - logger.info("Publish to Kafka - Completed !") - } - case _ => throw new Exception(s"UnSupported Serialization --> ${conf.kafkaValueSerializer}") - } - - } -} diff --git a/gimel-dataapi/gimel-connectors/gimel-kafka-0.10/src/test/scala/com/paypal/gimel/kafka/utilities/KafkaConvertersTests.scala b/gimel-dataapi/gimel-connectors/gimel-kafka-0.10/src/test/scala/com/paypal/gimel/kafka/utilities/KafkaConvertersTests.scala deleted file mode 100644 index f3b698fb..00000000 --- a/gimel-dataapi/gimel-connectors/gimel-kafka-0.10/src/test/scala/com/paypal/gimel/kafka/utilities/KafkaConvertersTests.scala +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
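The KafkaConvertersTests.scala removed below pins down the checkpoint string format handled by ImplicitKafkaConverters: a single OffsetRange round-trips as "topic,partition,fromOffset,untilOffset" and multiple ranges are joined with "|". The implicit converters themselves are not part of this hunk, so the following is only an illustrative sketch of that round-trip under hypothetical helper names.

    import org.apache.spark.streaming.kafka010.OffsetRange

    // Serialize ranges into the "topic,partition,from,until|..." form the tests expect.
    def toCheckPointString(ranges: Array[OffsetRange]): String =
      ranges.map(r => s"${r.topic},${r.partition},${r.fromOffset},${r.untilOffset}").mkString("|")

    // Parse the same form back into OffsetRange instances.
    def fromCheckPointString(checkPoint: String): Array[OffsetRange] =
      checkPoint.split('|').map { part =>
        val Array(topic, partition, from, until) = part.split(',')
        OffsetRange(topic, partition.toInt, from.toLong, until.toLong)
      }

    // toCheckPointString(Array(OffsetRange("test", 0, 1, 100), OffsetRange("test", 1, 1, 100)))
    // yields "test,0,1,100|test,1,1,100", matching the deleted test's expectation.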
- */ - -package com.paypal.gimel.kafka.utilities - -import scala.language.implicitConversions - -import org.apache.spark.streaming.kafka010.OffsetRange -import org.scalatest._ - -import com.paypal.gimel.kafka.utilities.ImplicitKafkaConverters._ -import com.paypal.gimel.kafka.utilities.KafkaUtilities._ - -class KafkaConvertersTests extends FunSpec with Matchers { - - it("should convert array of offset ranges to a single parsable string") { - val sampleRange: Array[OffsetRange] = Array( - OffsetRange("test", 0, 1, 100), - OffsetRange("test", 1, 1, 100)) - val stringified = sampleRange.toStringOfKafkaOffsetRanges - stringified shouldBe "test,0,1,100|test,1,1,100" - } - - it("should converr offset Range to a single parsable checkPoint String") { - val sampleRange = OffsetRange("test", 0, 1, 100) - val stringiFied = sampleRange.toStringOfKafkaOffsetRange - stringiFied shouldBe "test,0,1,100" - } - - it("should convert a single parsable CheckPoint string to a valid offset Range") { - val sampleString = "test,0,1,100" - val offsetRange = CheckPointString(sampleString).toKafkaOffsetRange - offsetRange shouldBe OffsetRange("test", 0, 1, 100) - } - - it("should convert composite `CheckPoint (Array[String])` to a valid Array(Offset Range)") { - val expectedOffsetRanges = Array(OffsetRange("test", 0, 1, 100), OffsetRange("test", 1, 1, 101)) - val sampleString: Array[String] = "test,0,1,100|test,1,1,101".split('|') - val offsetRanges: Array[OffsetRange] = sampleString.map(CheckPointString).toKafkaOffsetRanges - offsetRanges shouldEqual expectedOffsetRanges - } - - it("should convert a json string of custom partition information to an array of offset ranges") { - val sampleRange: Array[OffsetRange] = Array( - OffsetRange("test", 0, 1, 100), - OffsetRange("test", 1, 1, 100)) - val defaultRange: Array[OffsetRange] = Array( - OffsetRange("test", 0, 1, 100), - OffsetRange("test", 2, 1, 100)) - val sampleJson: String = - """[{"topic":"test","offsetRange":[{"partition":0,"from":1,"to":100},{"partition":1,"from":1,"to":100}]}]""" - /* - Happy case for Batch - The value returned should be a valid conversion of the sampleJson to an Array[OffsetRange] - */ - val finalOffsetRanges: Array[OffsetRange] = getCustomOffsetRangeForReader("test".split(","), sampleJson, "BATCH") - finalOffsetRanges shouldEqual(sampleRange) - - val sampleRangeForStream: Array[OffsetRange] = Array( - OffsetRange("test", 0, 1, 100), - OffsetRange("test", 1, 1, -1)) - /* - To offset missing case for Stream - The value returned should be a valid conversion of the sampleJson to an Array[OffsetRange] with To offset as -1 - */ - val sampleJsonForStream: String = - """[{"topic":"test","offsetRange":[{"partition":0,"from":1,"to":100},{"partition":1,"from":1}]}]""" - val finalOffsetRangesForStreamWithoutTo: Array[OffsetRange] = getCustomOffsetRangeForReader("test".split(","), sampleJsonForStream, "STREAM") - finalOffsetRangesForStreamWithoutTo shouldEqual(sampleRangeForStream) - } - -} - diff --git a/gimel-dataapi/gimel-connectors/gimel-kafka-2.2/pom.xml b/gimel-dataapi/gimel-connectors/gimel-kafka/pom.xml similarity index 89% rename from gimel-dataapi/gimel-connectors/gimel-kafka-2.2/pom.xml rename to gimel-dataapi/gimel-connectors/gimel-kafka/pom.xml index f48ceae1..f6bba1b5 100644 --- a/gimel-dataapi/gimel-connectors/gimel-kafka-2.2/pom.xml +++ b/gimel-dataapi/gimel-connectors/gimel-kafka/pom.xml @@ -23,13 +23,13 @@ under the License. 
gimel-dataapi com.paypal.gimel - 2.0.0-SNAPSHOT + 2.4.7-SNAPSHOT ../../pom.xml 4.0.0 - gimel-kafka-2.2 - 2.0.0-SNAPSHOT + gimel-kafka + 2.4.7-SNAPSHOT @@ -37,6 +37,14 @@ under the License. gimel-common ${gimel.version}-SNAPSHOT + + org.apache.hbase + * + + + org.apache.hadoop + * + com.fasterxml.jackson.core jackson-core @@ -51,23 +59,11 @@ under the License. - - com.databricks - spark-avro_${scala.binary.version} - 3.2.0 - ${packaging.scope} - org.apache.spark spark-sql-kafka-${spark.kafka.connector.version} ${spark.version} ${packaging.scope} - - - org.apache.kafka - kafka-clients - - @@ -75,21 +71,6 @@ under the License. kafka-clients ${kafka.version} - - com.fasterxml.jackson.core - jackson-core - ${jackson.version} - - - com.fasterxml.jackson.core - jackson-annotations - ${jackson.version} - - - com.fasterxml.jackson.core - jackson-databind - ${jackson.version} - @@ -163,12 +144,6 @@ under the License. ${confluent.version} test - - org.apache.avro - avro - 1.7.7 - test - io.netty @@ -234,7 +209,7 @@ under the License. org.apache.maven.plugins maven-shade-plugin - 3.2.1 + ${maven.shade.plugin.version} diff --git a/gimel-dataapi/gimel-connectors/gimel-kafka-2.2/src/main/scala/com/paypal/gimel/kafka2/DataSet.scala b/gimel-dataapi/gimel-connectors/gimel-kafka/src/main/scala/com/paypal/gimel/kafka2/DataSet.scala similarity index 100% rename from gimel-dataapi/gimel-connectors/gimel-kafka-2.2/src/main/scala/com/paypal/gimel/kafka2/DataSet.scala rename to gimel-dataapi/gimel-connectors/gimel-kafka/src/main/scala/com/paypal/gimel/kafka2/DataSet.scala diff --git a/gimel-dataapi/gimel-connectors/gimel-kafka-2.2/src/main/scala/com/paypal/gimel/kafka2/DataStream.scala b/gimel-dataapi/gimel-connectors/gimel-kafka/src/main/scala/com/paypal/gimel/kafka2/DataStream.scala similarity index 100% rename from gimel-dataapi/gimel-connectors/gimel-kafka-2.2/src/main/scala/com/paypal/gimel/kafka2/DataStream.scala rename to gimel-dataapi/gimel-connectors/gimel-kafka/src/main/scala/com/paypal/gimel/kafka2/DataStream.scala diff --git a/gimel-dataapi/gimel-connectors/gimel-kafka-2.2/src/main/scala/com/paypal/gimel/kafka2/conf/KafkaClientConfiguration.scala b/gimel-dataapi/gimel-connectors/gimel-kafka/src/main/scala/com/paypal/gimel/kafka2/conf/KafkaClientConfiguration.scala similarity index 100% rename from gimel-dataapi/gimel-connectors/gimel-kafka-2.2/src/main/scala/com/paypal/gimel/kafka2/conf/KafkaClientConfiguration.scala rename to gimel-dataapi/gimel-connectors/gimel-kafka/src/main/scala/com/paypal/gimel/kafka2/conf/KafkaClientConfiguration.scala diff --git a/gimel-dataapi/gimel-connectors/gimel-kafka-2.2/src/main/scala/com/paypal/gimel/kafka2/conf/KafkaConfigs.scala b/gimel-dataapi/gimel-connectors/gimel-kafka/src/main/scala/com/paypal/gimel/kafka2/conf/KafkaConfigs.scala similarity index 100% rename from gimel-dataapi/gimel-connectors/gimel-kafka-2.2/src/main/scala/com/paypal/gimel/kafka2/conf/KafkaConfigs.scala rename to gimel-dataapi/gimel-connectors/gimel-kafka/src/main/scala/com/paypal/gimel/kafka2/conf/KafkaConfigs.scala diff --git a/gimel-dataapi/gimel-connectors/gimel-kafka-2.2/src/main/scala/com/paypal/gimel/kafka2/conf/KafkaConstants.scala b/gimel-dataapi/gimel-connectors/gimel-kafka/src/main/scala/com/paypal/gimel/kafka2/conf/KafkaConstants.scala similarity index 100% rename from gimel-dataapi/gimel-connectors/gimel-kafka-2.2/src/main/scala/com/paypal/gimel/kafka2/conf/KafkaConstants.scala rename to 
gimel-dataapi/gimel-connectors/gimel-kafka/src/main/scala/com/paypal/gimel/kafka2/conf/KafkaConstants.scala diff --git a/gimel-dataapi/gimel-connectors/gimel-kafka-2.2/src/main/scala/com/paypal/gimel/kafka2/conf/KafkaJsonProtocol.scala b/gimel-dataapi/gimel-connectors/gimel-kafka/src/main/scala/com/paypal/gimel/kafka2/conf/KafkaJsonProtocol.scala similarity index 100% rename from gimel-dataapi/gimel-connectors/gimel-kafka-2.2/src/main/scala/com/paypal/gimel/kafka2/conf/KafkaJsonProtocol.scala rename to gimel-dataapi/gimel-connectors/gimel-kafka/src/main/scala/com/paypal/gimel/kafka2/conf/KafkaJsonProtocol.scala diff --git a/gimel-dataapi/gimel-connectors/gimel-kafka-2.2/src/main/scala/com/paypal/gimel/kafka2/reader/KafkaBatchConsumer.scala b/gimel-dataapi/gimel-connectors/gimel-kafka/src/main/scala/com/paypal/gimel/kafka2/reader/KafkaBatchConsumer.scala similarity index 100% rename from gimel-dataapi/gimel-connectors/gimel-kafka-2.2/src/main/scala/com/paypal/gimel/kafka2/reader/KafkaBatchConsumer.scala rename to gimel-dataapi/gimel-connectors/gimel-kafka/src/main/scala/com/paypal/gimel/kafka2/reader/KafkaBatchConsumer.scala diff --git a/gimel-dataapi/gimel-connectors/gimel-kafka-2.2/src/main/scala/com/paypal/gimel/kafka2/reader/KafkaStreamConsumer.scala b/gimel-dataapi/gimel-connectors/gimel-kafka/src/main/scala/com/paypal/gimel/kafka2/reader/KafkaStreamConsumer.scala similarity index 100% rename from gimel-dataapi/gimel-connectors/gimel-kafka-2.2/src/main/scala/com/paypal/gimel/kafka2/reader/KafkaStreamConsumer.scala rename to gimel-dataapi/gimel-connectors/gimel-kafka/src/main/scala/com/paypal/gimel/kafka2/reader/KafkaStreamConsumer.scala diff --git a/gimel-dataapi/gimel-connectors/gimel-kafka-2.2/src/main/scala/com/paypal/gimel/kafka2/utilities/ImplicitKafkaConverters.scala b/gimel-dataapi/gimel-connectors/gimel-kafka/src/main/scala/com/paypal/gimel/kafka2/utilities/ImplicitKafkaConverters.scala similarity index 100% rename from gimel-dataapi/gimel-connectors/gimel-kafka-2.2/src/main/scala/com/paypal/gimel/kafka2/utilities/ImplicitKafkaConverters.scala rename to gimel-dataapi/gimel-connectors/gimel-kafka/src/main/scala/com/paypal/gimel/kafka2/utilities/ImplicitKafkaConverters.scala diff --git a/gimel-dataapi/gimel-connectors/gimel-kafka-2.2/src/main/scala/com/paypal/gimel/kafka2/utilities/ImplicitZKCheckPointers.scala b/gimel-dataapi/gimel-connectors/gimel-kafka/src/main/scala/com/paypal/gimel/kafka2/utilities/ImplicitZKCheckPointers.scala similarity index 100% rename from gimel-dataapi/gimel-connectors/gimel-kafka-2.2/src/main/scala/com/paypal/gimel/kafka2/utilities/ImplicitZKCheckPointers.scala rename to gimel-dataapi/gimel-connectors/gimel-kafka/src/main/scala/com/paypal/gimel/kafka2/utilities/ImplicitZKCheckPointers.scala diff --git a/gimel-dataapi/gimel-connectors/gimel-kafka-2.2/src/main/scala/com/paypal/gimel/kafka2/utilities/KafkaOptionsLoaderUtils.scala b/gimel-dataapi/gimel-connectors/gimel-kafka/src/main/scala/com/paypal/gimel/kafka2/utilities/KafkaOptionsLoaderUtils.scala similarity index 100% rename from gimel-dataapi/gimel-connectors/gimel-kafka-2.2/src/main/scala/com/paypal/gimel/kafka2/utilities/KafkaOptionsLoaderUtils.scala rename to gimel-dataapi/gimel-connectors/gimel-kafka/src/main/scala/com/paypal/gimel/kafka2/utilities/KafkaOptionsLoaderUtils.scala diff --git a/gimel-dataapi/gimel-connectors/gimel-kafka-2.2/src/main/scala/com/paypal/gimel/kafka2/utilities/KafkaUtilities.scala 
b/gimel-dataapi/gimel-connectors/gimel-kafka/src/main/scala/com/paypal/gimel/kafka2/utilities/KafkaUtilities.scala similarity index 100% rename from gimel-dataapi/gimel-connectors/gimel-kafka-2.2/src/main/scala/com/paypal/gimel/kafka2/utilities/KafkaUtilities.scala rename to gimel-dataapi/gimel-connectors/gimel-kafka/src/main/scala/com/paypal/gimel/kafka2/utilities/KafkaUtilities.scala diff --git a/gimel-dataapi/gimel-connectors/gimel-kafka-2.2/src/main/scala/com/paypal/gimel/kafka2/writer/KafkaBatchProducer.scala b/gimel-dataapi/gimel-connectors/gimel-kafka/src/main/scala/com/paypal/gimel/kafka2/writer/KafkaBatchProducer.scala similarity index 100% rename from gimel-dataapi/gimel-connectors/gimel-kafka-2.2/src/main/scala/com/paypal/gimel/kafka2/writer/KafkaBatchProducer.scala rename to gimel-dataapi/gimel-connectors/gimel-kafka/src/main/scala/com/paypal/gimel/kafka2/writer/KafkaBatchProducer.scala diff --git a/gimel-dataapi/gimel-connectors/gimel-kafka-2.2/src/main/scala/com/paypal/gimel/kafka2/writer/KafkaStreamProducer.scala b/gimel-dataapi/gimel-connectors/gimel-kafka/src/main/scala/com/paypal/gimel/kafka2/writer/KafkaStreamProducer.scala similarity index 100% rename from gimel-dataapi/gimel-connectors/gimel-kafka-2.2/src/main/scala/com/paypal/gimel/kafka2/writer/KafkaStreamProducer.scala rename to gimel-dataapi/gimel-connectors/gimel-kafka/src/main/scala/com/paypal/gimel/kafka2/writer/KafkaStreamProducer.scala diff --git a/gimel-dataapi/gimel-connectors/gimel-kafka-2.2/src/test/scala/com/paypal/gimel/kafka2/DataSetTest.scala b/gimel-dataapi/gimel-connectors/gimel-kafka/src/test/scala/com/paypal/gimel/kafka2/DataSetTest.scala similarity index 100% rename from gimel-dataapi/gimel-connectors/gimel-kafka-2.2/src/test/scala/com/paypal/gimel/kafka2/DataSetTest.scala rename to gimel-dataapi/gimel-connectors/gimel-kafka/src/test/scala/com/paypal/gimel/kafka2/DataSetTest.scala diff --git a/gimel-dataapi/gimel-connectors/gimel-kafka-2.2/src/test/scala/com/paypal/gimel/kafka2/reader/KafkaBatchConsumerTest.scala b/gimel-dataapi/gimel-connectors/gimel-kafka/src/test/scala/com/paypal/gimel/kafka2/reader/KafkaBatchConsumerTest.scala similarity index 100% rename from gimel-dataapi/gimel-connectors/gimel-kafka-2.2/src/test/scala/com/paypal/gimel/kafka2/reader/KafkaBatchConsumerTest.scala rename to gimel-dataapi/gimel-connectors/gimel-kafka/src/test/scala/com/paypal/gimel/kafka2/reader/KafkaBatchConsumerTest.scala diff --git a/gimel-dataapi/gimel-connectors/gimel-kafka-2.2/src/test/scala/com/paypal/gimel/kafka2/utilities/ImplicitKafkaConvertersTest.scala b/gimel-dataapi/gimel-connectors/gimel-kafka/src/test/scala/com/paypal/gimel/kafka2/utilities/ImplicitKafkaConvertersTest.scala similarity index 100% rename from gimel-dataapi/gimel-connectors/gimel-kafka-2.2/src/test/scala/com/paypal/gimel/kafka2/utilities/ImplicitKafkaConvertersTest.scala rename to gimel-dataapi/gimel-connectors/gimel-kafka/src/test/scala/com/paypal/gimel/kafka2/utilities/ImplicitKafkaConvertersTest.scala diff --git a/gimel-dataapi/gimel-connectors/gimel-kafka-2.2/src/test/scala/com/paypal/gimel/kafka2/utilities/ImplicitZKCheckPointersTest.scala b/gimel-dataapi/gimel-connectors/gimel-kafka/src/test/scala/com/paypal/gimel/kafka2/utilities/ImplicitZKCheckPointersTest.scala similarity index 100% rename from gimel-dataapi/gimel-connectors/gimel-kafka-2.2/src/test/scala/com/paypal/gimel/kafka2/utilities/ImplicitZKCheckPointersTest.scala rename to 
gimel-dataapi/gimel-connectors/gimel-kafka/src/test/scala/com/paypal/gimel/kafka2/utilities/ImplicitZKCheckPointersTest.scala diff --git a/gimel-dataapi/gimel-connectors/gimel-kafka-2.2/src/test/scala/com/paypal/gimel/kafka2/utilities/KafkaOptionsLoaderUtilsTest.scala b/gimel-dataapi/gimel-connectors/gimel-kafka/src/test/scala/com/paypal/gimel/kafka2/utilities/KafkaOptionsLoaderUtilsTest.scala similarity index 100% rename from gimel-dataapi/gimel-connectors/gimel-kafka-2.2/src/test/scala/com/paypal/gimel/kafka2/utilities/KafkaOptionsLoaderUtilsTest.scala rename to gimel-dataapi/gimel-connectors/gimel-kafka/src/test/scala/com/paypal/gimel/kafka2/utilities/KafkaOptionsLoaderUtilsTest.scala diff --git a/gimel-dataapi/gimel-connectors/gimel-kafka-2.2/src/test/scala/com/paypal/gimel/kafka2/utilities/KafkaUtilitiesTest.scala b/gimel-dataapi/gimel-connectors/gimel-kafka/src/test/scala/com/paypal/gimel/kafka2/utilities/KafkaUtilitiesTest.scala similarity index 100% rename from gimel-dataapi/gimel-connectors/gimel-kafka-2.2/src/test/scala/com/paypal/gimel/kafka2/utilities/KafkaUtilitiesTest.scala rename to gimel-dataapi/gimel-connectors/gimel-kafka/src/test/scala/com/paypal/gimel/kafka2/utilities/KafkaUtilitiesTest.scala diff --git a/gimel-dataapi/gimel-connectors/gimel-kafka-2.2/src/test/scala/com/paypal/gimel/kafka2/utilities/MockKafkaoptionsLoader.scala b/gimel-dataapi/gimel-connectors/gimel-kafka/src/test/scala/com/paypal/gimel/kafka2/utilities/MockKafkaoptionsLoader.scala similarity index 100% rename from gimel-dataapi/gimel-connectors/gimel-kafka-2.2/src/test/scala/com/paypal/gimel/kafka2/utilities/MockKafkaoptionsLoader.scala rename to gimel-dataapi/gimel-connectors/gimel-kafka/src/test/scala/com/paypal/gimel/kafka2/utilities/MockKafkaoptionsLoader.scala diff --git a/gimel-dataapi/gimel-connectors/gimel-kafka-2.2/src/test/scala/com/paypal/gimel/kafka2/writer/KafkaBatchProducerTest.scala b/gimel-dataapi/gimel-connectors/gimel-kafka/src/test/scala/com/paypal/gimel/kafka2/writer/KafkaBatchProducerTest.scala similarity index 100% rename from gimel-dataapi/gimel-connectors/gimel-kafka-2.2/src/test/scala/com/paypal/gimel/kafka2/writer/KafkaBatchProducerTest.scala rename to gimel-dataapi/gimel-connectors/gimel-kafka/src/test/scala/com/paypal/gimel/kafka2/writer/KafkaBatchProducerTest.scala diff --git a/gimel-dataapi/gimel-connectors/gimel-restapi/pom.xml b/gimel-dataapi/gimel-connectors/gimel-restapi/pom.xml index f4adc094..4d5d19a6 100644 --- a/gimel-dataapi/gimel-connectors/gimel-restapi/pom.xml +++ b/gimel-dataapi/gimel-connectors/gimel-restapi/pom.xml @@ -23,13 +23,13 @@ under the License. gimel-dataapi com.paypal.gimel - 2.0.0-SNAPSHOT + 2.4.7-SNAPSHOT ../../pom.xml 4.0.0 gimel-restapi - 2.0.0-SNAPSHOT + 2.4.7-SNAPSHOT @@ -47,12 +47,11 @@ under the License. src/main/scala - src/test/scala org.apache.maven.plugins maven-shade-plugin - 3.0.0 + ${maven.shade.plugin.version} diff --git a/gimel-dataapi/gimel-connectors/gimel-s3/pom.xml b/gimel-dataapi/gimel-connectors/gimel-s3/pom.xml index 6b41acdf..db84305b 100644 --- a/gimel-dataapi/gimel-connectors/gimel-s3/pom.xml +++ b/gimel-dataapi/gimel-connectors/gimel-s3/pom.xml @@ -25,14 +25,14 @@ under the License. gimel-dataapi com.paypal.gimel - 2.0.0-SNAPSHOT + 2.4.7-SNAPSHOT ../../pom.xml 4.0.0 com.paypal.gimel gimel-s3 - 2.0.0-SNAPSHOT + 2.4.7-SNAPSHOT @@ -69,7 +69,6 @@ under the License. src/main/scala - src/test/scala net.alchim31.maven @@ -111,7 +110,7 @@ under the License. 
org.apache.maven.plugins maven-shade-plugin - 3.0.0 + ${maven.shade.plugin.version} diff --git a/gimel-dataapi/gimel-connectors/gimel-sftp/pom.xml b/gimel-dataapi/gimel-connectors/gimel-sftp/pom.xml index dd3db8a9..5c2c66b0 100644 --- a/gimel-dataapi/gimel-connectors/gimel-sftp/pom.xml +++ b/gimel-dataapi/gimel-connectors/gimel-sftp/pom.xml @@ -23,13 +23,13 @@ under the License. gimel-dataapi com.paypal.gimel - 2.0.0-SNAPSHOT + 2.4.7-SNAPSHOT ../../pom.xml 4.0.0 gimel-sftp - 2.0.0-SNAPSHOT + 2.4.7-SNAPSHOT @@ -39,9 +39,19 @@ under the License. com.springml - spark-sftp_${scala.binary.version} + spark-sftp_${springml.scala.binary.version} ${spark.sftp.version} ${packaging.scope} + + + org.apache.spark + * + + + org.scala-lang + * + + org.scalatest @@ -53,12 +63,11 @@ under the License. src/main/scala - src/test/scala org.apache.maven.plugins maven-shade-plugin - 3.0.0 + ${maven.shade.plugin.version} diff --git a/gimel-dataapi/gimel-core/pom.xml b/gimel-dataapi/gimel-core/pom.xml index 94e58924..5fee5d06 100644 --- a/gimel-dataapi/gimel-core/pom.xml +++ b/gimel-dataapi/gimel-core/pom.xml @@ -23,12 +23,12 @@ under the License. gimel-dataapi com.paypal.gimel - 2.0.0-SNAPSHOT + 2.4.7-SNAPSHOT ../pom.xml 4.0.0 gimel-core - 2.0.0-SNAPSHOT + 2.4.7-SNAPSHOT @@ -87,17 +87,12 @@ under the License. com.paypal.gimel - gimel-kafka-0.10 + gimel-kafka ${gimel.version}-SNAPSHOT com.paypal.gimel - gimel-kafka-2.2 - ${gimel.version}-SNAPSHOT - - - com.paypal.gimel - gimel-elasticsearch-6.2 + gimel-elasticsearch ${gimel.version}-SNAPSHOT @@ -105,31 +100,31 @@ under the License. gimel-jdbc ${gimel.version}-SNAPSHOT + + + + + com.paypal.gimel - gimel-hbase-1.2 - ${gimel.version}-SNAPSHOT - - - com.paypal.gimel - gimel-aerospike-3.14 - ${gimel.version}-SNAPSHOT - - - com.paypal.gimel - gimel-hive-1.2 + gimel-aerospike ${gimel.version}-SNAPSHOT com.paypal.gimel - gimel-cassandra-2.0 + gimel-hive ${gimel.version}-SNAPSHOT com.paypal.gimel - gimel-druid-0.82 + gimel-cassandra ${gimel.version}-SNAPSHOT + + + + + com.paypal.gimel gimel-sftp @@ -245,7 +240,7 @@ under the License. 
org.apache.maven.plugins maven-shade-plugin - 3.0.0 + ${maven.shade.plugin.version} diff --git a/gimel-dataapi/gimel-core/src/main/scala/com/paypal/gimel/DataSet.scala b/gimel-dataapi/gimel-core/src/main/scala/com/paypal/gimel/DataSet.scala index 0f7e09c9..645b10af 100644 --- a/gimel-dataapi/gimel-core/src/main/scala/com/paypal/gimel/DataSet.scala +++ b/gimel-dataapi/gimel-core/src/main/scala/com/paypal/gimel/DataSet.scala @@ -38,9 +38,8 @@ import com.paypal.gimel.common.utilities.BindToFieldsUtils._ import com.paypal.gimel.common.utilities.DataSetUtils.propStringToMap import com.paypal.gimel.datasetfactory.GimelDataSet import com.paypal.gimel.elasticsearch.conf.ElasticSearchConfigs -import com.paypal.gimel.hbase.conf.HbaseConfigs import com.paypal.gimel.jdbc.conf.JdbcConfigs -import com.paypal.gimel.kafka.conf.{KafkaConfigs, KafkaConstants} +import com.paypal.gimel.kafka2.conf.{KafkaConfigs, KafkaConstants} import com.paypal.gimel.logger.Logger class DataSet(val sparkSession: SparkSession) { @@ -160,7 +159,7 @@ class DataSet(val sparkSession: SparkSession) { logger.logApiAccess(sparkSession.sparkContext.getConf.getAppId , sparkAppName , this.getClass.getName - , KafkaConstants.gimelAuditRunTypeBatch + , "BATCH" , clusterName , user , appTag.replaceAllLiterally("/", "_") @@ -190,7 +189,7 @@ class DataSet(val sparkSession: SparkSession) { logger.logApiAccess(sparkSession.sparkContext.getConf.getAppId , sparkAppName , this.getClass.getName - , KafkaConstants.gimelAuditRunTypeBatch + , "BATCH" , clusterName , user , appTag.replaceAllLiterally("/", "_") @@ -293,7 +292,7 @@ class DataSet(val sparkSession: SparkSession) { logger.logApiAccess(sparkSession.sparkContext.getConf.getAppId , sparkAppName , this.getClass.getName - , KafkaConstants.gimelAuditRunTypeBatch + , "BATCH" , clusterName , user , appTag.replaceAllLiterally("/", "_") @@ -324,7 +323,7 @@ class DataSet(val sparkSession: SparkSession) { logger.logApiAccess(sparkSession.sparkContext.getConf.getAppId , sparkAppName , this.getClass.getName - , KafkaConstants.gimelAuditRunTypeBatch + , "BATCH" , clusterName , user , appTag.replaceAllLiterally("/", "_") @@ -411,7 +410,7 @@ class DataSet(val sparkSession: SparkSession) { logger.logApiAccess(sparkSession.sparkContext.getConf.getAppId , sparkAppName , this.getClass.getName - , KafkaConstants.gimelAuditRunTypeBatch + , "BATCH" , clusterName , user , appTag.replaceAllLiterally("/", "_") @@ -440,7 +439,7 @@ class DataSet(val sparkSession: SparkSession) { logger.logApiAccess(sparkSession.sparkContext.getConf.getAppId , sparkAppName , this.getClass.getName - , KafkaConstants.gimelAuditRunTypeBatch + , "BATCH" , clusterName , user , appTag.replaceAllLiterally("/", "_") @@ -597,7 +596,7 @@ class DataSet(val sparkSession: SparkSession) { logger.logApiAccess(sparkSession.sparkContext.getConf.getAppId , sparkAppName , this.getClass.getName - , KafkaConstants.gimelAuditRunTypeBatch + , "BATCH" , clusterName , user , appTag.replaceAllLiterally("/", "_") @@ -627,7 +626,7 @@ class DataSet(val sparkSession: SparkSession) { logger.logApiAccess(sparkSession.sparkContext.getConf.getAppId , sparkAppName , this.getClass.getName - , KafkaConstants.gimelAuditRunTypeBatch + , "BATCH" , clusterName , user , appTag.replaceAllLiterally("/", "_") @@ -727,7 +726,7 @@ class DataSet(val sparkSession: SparkSession) { logger.logApiAccess(sparkSession.sparkContext.getConf.getAppId , sparkAppName , this.getClass.getName - , KafkaConstants.gimelAuditRunTypeBatch + , "BATCH" , clusterName , user , 
appTag.replaceAllLiterally("/", "_") @@ -758,7 +757,7 @@ class DataSet(val sparkSession: SparkSession) { logger.logApiAccess(sparkSession.sparkContext.getConf.getAppId , sparkAppName , this.getClass.getName - , KafkaConstants.gimelAuditRunTypeBatch + , "BATCH" , clusterName , user , appTag.replaceAllLiterally("/", "_") @@ -855,7 +854,7 @@ class DataSet(val sparkSession: SparkSession) { logger.logApiAccess(sparkSession.sparkContext.getConf.getAppId , sparkAppName , this.getClass.getName - , KafkaConstants.gimelAuditRunTypeBatch + , "BATCH" , clusterName , user , appTag.replaceAllLiterally("/", "_") @@ -886,7 +885,7 @@ class DataSet(val sparkSession: SparkSession) { logger.logApiAccess(sparkSession.sparkContext.getConf.getAppId , sparkAppName , this.getClass.getName - , KafkaConstants.gimelAuditRunTypeBatch + , "BATCH" , clusterName , user , appTag.replaceAllLiterally("/", "_") @@ -983,10 +982,6 @@ object DataSetUtils { def getDataSet(sparkSession: SparkSession, sourceType: DataSetType.SystemType): GimelDataSet = { sourceType match { - case DataSetType.KAFKA => - new com.paypal.gimel.kafka.DataSet(sparkSession) - case DataSetType.HBASE => - new com.paypal.gimel.hbase.DataSet(sparkSession) case DataSetType.HDFS => new com.paypal.gimel.hdfs.DataSet(sparkSession) case DataSetType.ES => @@ -999,12 +994,8 @@ object DataSetUtils { new com.paypal.gimel.cassandra.DataSet(sparkSession) case DataSetType.AEROSPIKE => new com.paypal.gimel.aerospike.DataSet(sparkSession) - case DataSetType.HDFS => - new com.paypal.gimel.hdfs.DataSet(sparkSession) case DataSetType.RESTAPI => new com.paypal.gimel.restapi.DataSet(sparkSession) - case DataSetType.DRUID => - new com.paypal.gimel.druid.DataSet(sparkSession) case DataSetType.SFTP => new com.paypal.gimel.sftp.DataSet(sparkSession) case DataSetType.KAFKA2 => @@ -1022,8 +1013,8 @@ object DataSetUtils { def getLatestKafkaDataSetReader(dataSet: DataSet): Option[GimelDataSet] = { Try { dataSet.latestDataSetReader.get match { - case kafka: com.paypal.gimel.kafka.DataSet => - kafka +// case kafka: com.paypal.gimel.kafka.DataSet => +// kafka case kafka2: com.paypal.gimel.kafka2.DataSet => kafka2 } diff --git a/gimel-dataapi/gimel-core/src/main/scala/com/paypal/gimel/DataStream.scala b/gimel-dataapi/gimel-core/src/main/scala/com/paypal/gimel/DataStream.scala deleted file mode 100644 index 37f90269..00000000 --- a/gimel-dataapi/gimel-core/src/main/scala/com/paypal/gimel/DataStream.scala +++ /dev/null @@ -1,352 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.paypal.gimel - -import scala.collection.immutable.Map -import scala.language.implicitConversions -import scala.util.{Success, Try} - -import org.apache.spark.{SparkConf, SparkContext} -import org.apache.spark.sql.SQLContext -import org.apache.spark.sql.hive.HiveContext -import org.apache.spark.streaming.{Seconds, StreamingContext} - -import com.paypal.gimel.common.catalog.{CatalogProvider, DataSetProperties} -import com.paypal.gimel.common.conf.{CatalogProviderConfigs, CatalogProviderConstants, GimelConstants} -import com.paypal.gimel.common.utilities.Timer -import com.paypal.gimel.datastreamfactory.{GimelDataStream, StreamingResult} -import com.paypal.gimel.kafka.conf.{KafkaConfigs, KafkaConstants} -import com.paypal.gimel.logger.Logger - -object DataStreamType extends Enumeration { - type SystemType = Value - val KAFKA = Value -} - -class DataStream(val streamingContext: StreamingContext) { - - import com.paypal.gimel.common.utilities.DataSetUtils._ - - val user: String = sys.env(GimelConstants.USER) - val sparkAppName: String = streamingContext.sparkContext.getConf.get(GimelConstants.SPARK_APP_NAME) - val appTag: String = getAppTag(streamingContext.sparkContext) - val sparkContext: SparkContext = streamingContext.sparkContext - val logger = Logger() - logger.setSparkVersion(streamingContext.sparkContext.version) - val latestDataStreamReader: Option[GimelDataStream] = None - var datasetSystemType: String = "KAFKA" - var additionalPropsToLog = scala.collection.mutable.Map[String, String]() - - // get gimel timer object - val gimelTimer = Timer() - - import DataStreamUtils._ - - def latestKafkaDataStreamReader: Option[com.paypal.gimel.kafka.DataStream] = { - getLatestKafkaDataStreamReader(this) - } - - /** - * Provides DStream for a given configuration - * - * @param sourceType DataStreamType.Type - * @param sourceName Kafka Topic Name - * @param props Map of K->V kafka Properties - * @return StreamingResult - */ - private def read(sourceType: DataStreamType.SystemType - , sourceName: String, props: Any): StreamingResult = { - val propsMap: Map[String, Any] = getProps(props) - val dataStream = DataStreamUtils.getDataStream(streamingContext, sourceType) - dataStream.read(sourceName, propsMap) - } - - /** - * Provides DStream for a given configuration - * - * @param dataSet Kafka Topic Name - * @param props Map of K->V kafka Properties - * @return StreamingResult - */ - def read(dataSet: String, props: Any = Map[String, Any]()): StreamingResult = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - - // get start time - val startTime = gimelTimer.start.get - - try { - - // Get catalog provider from run time hive context (1st Preference) - // if not available - check user props (2nd Preference) - // if not available - check Primary Provider of Catalog (Default) - val formattedProps: Map[String, Any] = - Map(CatalogProviderConfigs.CATALOG_PROVIDER -> CatalogProviderConstants.PRIMARY_CATALOG_PROVIDER, - GimelConstants.SPARK_APP_ID -> streamingContext.sparkContext.getConf.get(GimelConstants.SPARK_APP_ID), - GimelConstants.SPARK_APP_NAME -> streamingContext.sparkContext.getConf.get(GimelConstants.SPARK_APP_NAME), - GimelConstants.APP_TAG -> appTag) ++ - getProps(props) - val dataSetProperties: DataSetProperties = - CatalogProvider.getDataSetProperties(dataSet, formattedProps) - // dataSetProperties. 
- // val (systemType, hiveTableProps) = getSystemType(dataSet) - // val systemType = getSystemType1(dataSetProperties) - val newProps: Map[String, Any] = getProps(props) ++ Map( - GimelConstants.DATASET_PROPS -> dataSetProperties - , GimelConstants.DATASET -> dataSet - , GimelConstants.RESOLVED_HIVE_TABLE -> resolveDataSetName(dataSet) - , GimelConstants.APP_TAG -> appTag) - - // Why are we doing this? Elastic Search Cannot Accept "." in keys - val dataSetProps = dataSetProperties.props.map { case (k, v) => - k.replaceAllLiterally(".", "~") -> v - } - - val propsToLog = scala.collection.mutable.Map[String, String]() - dataSetProps.foreach(x => propsToLog.put(x._1, x._2)) - // additionalPropsToLog = propsToLog - - val data = this.read(DataStreamType.KAFKA, dataSet, newProps) - - - // update log variables to push logs - val endTime = gimelTimer.endTime.get - val executionTime: Double = gimelTimer.endWithMillSecRunTime - - // post audit logs to KAFKA - logger.logApiAccess(streamingContext.sparkContext.getConf.getAppId - , streamingContext.sparkContext.getConf.get("spark.app.name") - , this.getClass.getName - , KafkaConstants.gimelAuditRunTypeStream - , getYarnClusterName() - , user - , appTag.replaceAllLiterally("/", "_") - , MethodName - , dataSet - , datasetSystemType - , "" - , additionalPropsToLog - , GimelConstants.SUCCESS - , GimelConstants.EMPTY_STRING - , GimelConstants.EMPTY_STRING - , startTime - , endTime - , executionTime - ) - - data - } - catch { - case e: Throwable => - - logger.info(s"Pushing to logs: Error Description\n dataset=${dataSet}\n method=${MethodName}\n Error: ${e.printStackTrace()}") - - // update log variables to push logs - val endTime = System.currentTimeMillis() - val executionTime = endTime - startTime - - // post audit logs to KAFKA - logger.logApiAccess(streamingContext.sparkContext.getConf.getAppId - , streamingContext.sparkContext.getConf.get("spark.app.name") - , this.getClass.getName - , KafkaConstants.gimelAuditRunTypeStream - , getYarnClusterName() - , user - , appTag.replaceAllLiterally("/", "_") - , MethodName - , dataSet - , datasetSystemType - , "" - , additionalPropsToLog - , GimelConstants.FAILURE - , e.toString + "\n" + e.getStackTraceString - , GimelConstants.UNKNOWN_STRING - , startTime - , endTime - , executionTime - ) - - - // throw error to console - logger.throwError(e.toString) - - val msg = s"Error in DataSet ${MethodName} Operation: ${e.printStackTrace()}" - throw new DataSetOperationException(msg, e) - } - - } - -} - -/** - * Client API for initiating datastreams - */ - -object DataStream { - - val defaultBatchInterval = 25 - - import DataStreamUtils._ - - /** - * Client calls for a DataStream with SparkContext - * , we internally create an HiveContext & provide DataStream - * - * @param sparkContext SparkContext - * @return DataStream - */ - def apply(sparkContext: SparkContext): DataStream = { - // todo ADD LOGGING .... WARN USER of default value or pass a specific one explicitly - getOrCreateLogger(sparkContext).warning("Initiating New Spark Context. 
" + - "Please provide HiveContext if you already have One.") - val allConfs = sparkContext.getConf.getAll.toMap - val batchWindowSec = allConfs.getOrElse(KafkaConfigs.defaultBatchInterval - , defaultBatchInterval.toString).toInt - val ssc = new StreamingContext(sparkContext, Seconds(batchWindowSec)) - this (ssc) - } - - /** - * Client calls for a DataStream with already available HiveContext - * , we provide a DataStream API with the same HiveConext - * - * @param hiveContext HiveContext - * @return DataStream - */ - def apply(hiveContext: HiveContext): DataStream = { - getOrCreateLogger(hiveContext.sparkContext).warning("Initiating New Spark Context" + - ". Please provide HiveContext if you already have One.") - this (hiveContext.sparkContext) - } - - /** - * Client calls for a DataStream without any context (spark or hive) - * , we provide a DataStream API with the same HiveConext - * - * @return DataStream - */ - def apply(): DataStream = { - val sparkConf = new SparkConf().setAppName(sys.env(GimelConstants.USER) + "PCataLog-DataSet") - val sc = new SparkContext(sparkConf) - getOrCreateLogger(sc).warning("Initiating New Spark Context" + - ". Please provide HiveContext if you already have One.") - this (sc) - } - - /** - * Client calls for a DataStream with already available SQLContext - * , we provide a DataStream API with the equivalent HiveConext - * - * @param sqlContext SQLContext - * @return DataStream - */ - def apply(sqlContext: SQLContext): DataStream = { - getOrCreateLogger(sqlContext.sparkContext).warning("Initiating New Spark Context. " + - "Please provide HiveContext if you already have One.") - this (sqlContext.sparkContext) - } - - /** - * Client calls for a DataStream with SparkContext, - * we internally create an HiveContext & provide DataStream - * - * @param streamingContext StreamingContext - * @return DataStream - */ - def apply(streamingContext: StreamingContext): DataStream = { - new DataStream(streamingContext) - } - -} - -/** - * Custom Exception for DataStream initiation errors - * - * @param message Message to Throw - * @param cause A Throwable Cause - */ -private class DataStreamInitializationException(message: String, cause: Throwable) - extends RuntimeException(message) { - if (cause != null) { - initCause(cause) - } - - def this(message: String) = this(message, null) -} - - -/** - * Private Functionalities required for DataStream Initiation Operations - * Do Not Expose to Client - */ - -private object DataStreamUtils { - - - /** - * Convenience Method to Get or Create Logger - * - * @param sparkContext SparkContext - * @return Logger - */ - def getOrCreateLogger(sparkContext: SparkContext): Logger = { - val user = sys.env(GimelConstants.USER) - val sparkAppName = sparkContext.getConf.get(GimelConstants.SPARK_APP_NAME) - val appTag = s"${user}-${sparkAppName}" - val logger = Logger(appTag) - logger - } - - /** - * provides an appropriate DataStream - * - * @param sparkStreamingContext - * @param sourceType Type of System. 
Example - KAFKA - * @return DataStream - */ - - def getDataStream(sparkStreamingContext: StreamingContext - , sourceType: DataStreamType.SystemType): GimelDataStream = { - sourceType match { - case DataStreamType.KAFKA => - new com.paypal.gimel.kafka.DataStream(sparkStreamingContext) - } - } - - /** - * Gets the last user Kafka KafkaDataStream reader (if already use), else Returns None - * - * @param dataStream DataStream - * @return Option[KafkaDataStream] - */ - - def getLatestKafkaDataStreamReader(dataStream: DataStream) - : Option[com.paypal.gimel.kafka.DataStream] = { - val kafkaReader = Try { - dataStream.latestDataStreamReader.get.asInstanceOf[com.paypal.gimel.kafka.DataStream] - } - kafkaReader match { - case Success(x) => - Some(x) - case _ => - None - } - } - -} diff --git a/gimel-dataapi/gimel-core/src/main/scala/com/paypal/gimel/DataStream2.scala b/gimel-dataapi/gimel-core/src/main/scala/com/paypal/gimel/DataStream2.scala index 6f67ca92..92bba7d1 100644 --- a/gimel-dataapi/gimel-core/src/main/scala/com/paypal/gimel/DataStream2.scala +++ b/gimel-dataapi/gimel-core/src/main/scala/com/paypal/gimel/DataStream2.scala @@ -184,7 +184,7 @@ class DataStream2(val sparkSession: SparkSession) { logger.logApiAccess(sparkSession.sparkContext.getConf.getAppId , sparkContext.getConf.get("spark.app.name") , this.getClass.getName - , KafkaConstants.gimelAuditRunTypeStream + , "STREAM" , getYarnClusterName() , user , appTag.replaceAllLiterally("/", "_") @@ -300,7 +300,7 @@ class DataStream2(val sparkSession: SparkSession) { logger.logApiAccess(sparkSession.sparkContext.getConf.getAppId , sparkContext.getConf.get("spark.app.name") , this.getClass.getName - , KafkaConstants.gimelAuditRunTypeStream + , "STREAM" , getYarnClusterName() , user , appTag.replaceAllLiterally("/", "_") @@ -330,7 +330,7 @@ class DataStream2(val sparkSession: SparkSession) { logger.logApiAccess(sparkSession.sparkContext.getConf.getAppId , sparkContext.getConf.get("spark.app.name") , this.getClass.getName - , KafkaConstants.gimelAuditRunTypeStream + , "STREAM" , getYarnClusterName() , user , appTag.replaceAllLiterally("/", "_") diff --git a/gimel-dataapi/gimel-examples/pom.xml b/gimel-dataapi/gimel-examples/pom.xml deleted file mode 100644 index 1102fc1d..00000000 --- a/gimel-dataapi/gimel-examples/pom.xml +++ /dev/null @@ -1,83 +0,0 @@ - - - - - - - gimel-dataapi - com.paypal.gimel - 2.0.0-SNAPSHOT - ../pom.xml - - 4.0.0 - gimel-examples - 2.0.0-SNAPSHOT - - - - com.paypal.gimel - gimel-core - ${gimel.version}-SNAPSHOT - - - - - src/main/scala - - - - org.apache.maven.plugins - maven-shade-plugin - 3.0.0 - - - - com.google.common - gimel-shaded.com.google.common - - - com.sun.jersey - gimel-shaded.com.sun.jersey - - - - - *:* - - META-INF/*.SF - META-INF/*.DSA - META-INF/*.RSA - - - - - - - gimel-shading - package - - shade - - - - - - - diff --git a/gimel-dataapi/gimel-examples/src/main/scala/com/paypal/gimel/examples/APIUsageAcrossDataSets.scala b/gimel-dataapi/gimel-examples/src/main/scala/com/paypal/gimel/examples/APIUsageAcrossDataSets.scala deleted file mode 100644 index f1419742..00000000 --- a/gimel-dataapi/gimel-examples/src/main/scala/com/paypal/gimel/examples/APIUsageAcrossDataSets.scala +++ /dev/null @@ -1,179 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.paypal.gimel.examples - -import org.apache.spark.sql._ -import org.apache.spark.streaming.{Seconds, StreamingContext} - -import com.paypal.gimel.DataSet -import com.paypal.gimel.logger.Logger - -object APIUsageAcrossDataSets { - - // spark-shell --master yarn-client --driver-memory 4g \ - // --executor-memory 4g --executor-cores 1 --num-executors 2 --jars ~/pcatalog.jar - // Initiate Logger - val logger = Logger(this.getClass.getName) - // Specify Batch Interval for Streaming - val batchInterval = 5 - - val sparkSession = SparkSession - .builder() - .enableHiveSupport() - .getOrCreate() - val sc = sparkSession.sparkContext - sc.setLogLevel("ERROR") - val sqlContext = sparkSession.sqlContext - - /** - * --------------------- Context Initiation --------------------- - */ - - val ssc = new StreamingContext(sc, Seconds(batchInterval.toInt)) - - /** - * --------------------- DataSet Initiation --------------------- - */ - - // Initiate Pcatalog DataSet - // val systemType = DataSetType - val dataSet: DataSet = DataSet(sparkSession) - - /** - * --------------------- Begin KAFKA Params --------------------- - */ - // Create a Schema String for Avro SerDe - val schema: String = - """ { - | "type" : "record", - | "namespace" : "default", - | "name" : "flights", - | "fields" : [ - | { "name" : "month" , "type" : "string" }, - | { "name" : "dayofmonth" , "type" : "string" }, - | { "name" : "dayofweek" , "type" : "string" }, - | { "name" : "deptime" , "type" : "string" }, - | { "name" : "crsdeptime" , "type" : "string" }, - | { "name" : "arrtime" , "type" : "string" }, - | { "name" : "crsarrtime" , "type" : "string" }, - | { "name" : "uniquecarrier" , "type" : "string" }, - | { "name" : "flightnum" , "type" : "string" }, - | { "name" : "tailnum" , "type" : "string" }, - | { "name" : "actualelapsedtime" , "type" : "string" }, - | { "name" : "crselapsedtime" , "type" : "string" }, - | { "name" : "airtime" , "type" : "string" }, - | { "name" : "arrdelay" , "type" : "string" }, - | { "name" : "depdelay" , "type" : "string" }, - | { "name" : "origin" , "type" : "string" }, - | { "name" : "dest" , "type" : "string" }, - | { "name" : "distance" , "type" : "string" }, - | { "name" : "taxiin" , "type" : "string" }, - | { "name" : "taxiout" , "type" : "string" }, - | { "name" : "cancelled" , "type" : "string" }, - | { "name" : "cancellationcode" , "type" : "string" }, - | { "name" : "diverted" , "type" : "string" }, - | { "name" : "carrierdelay" , "type" : "string" }, - | { "name" : "weatherdelay" , "type" : "string" }, - | { "name" : "nasdelay" , "type" : "string" }, - | { "name" : "securitydelay" , "type" : "string" }, - | { "name" : "lateaircraftdelay" , "type" : "string" }, - | { "name" : "year" , "type" : "string" } - | ] - |} - """.stripMargin - // Create a Host:Port for Kafka, below works for Kafka installed on local machine - val hostAndPort = "localhost:6667" - val topic = "flights_avro_data1" - // Create 
Kafka Params for Consumer - val consumerParamsKafka: Map[String, String] = Map[String, String]("bootstrap.servers" -> hostAndPort, - "group.id" -> 111.toString, "zookeeper.connection.timeout.ms" -> 10000.toString, "auto.offset.reset" -> "smallest", - "avro.schema.string" -> schema) - // Create Kafka Params for Producer - val producerParamsKafka: Map[String, String] = Map[String, String]("bootstrap.servers" -> hostAndPort, - "key.serializer" -> "org.apache.kafka.common.serialization.StringSerializer", - "value.serializer" -> "org.apache.kafka.common.serialization.ByteArraySerializer", - "avro.schema.string" -> schema) - // Produce to Kafka - - - /** - * ------------------------- ES Props ------------------------------ - */ - - val esOptions: Map[String, String] = Map("pushdown" -> "true", "es.nodes" -> "localhost", "es.port" -> "9200", "es.index.auto.create" -> "true") - - /** - * ------------------------ HDFS Props ------------------------------ - */ - - val wrtoptionsParquet: Map[String, String] = Map("hiveDatabaseName" -> "default", "hdfsPath" -> "hdfs:///tmp/parquet_demo/parquet_out", "inDataFormat" -> "parquet", "compressionCodec" -> "gzip", "columnDelimiter" -> "20") - - /** - * --------------------- Begin Demo of API Usage --------------------- - */ - - - // Read Hive - - val flights_from_hive: DataFrame = dataSet.read("flights_1m") - flights_from_hive.show() - - - // Write Kafka - - dataSet.write(topic, flights_from_hive.limit(10000), producerParamsKafka) - - // Read Kafka - - val flights_from_kafka: DataFrame = dataSet.read(topic, consumerParamsKafka) - flights_from_kafka.show() - - // write HBase - - dataSet.write("flights_hbase", flights_from_kafka) - - // Read HBase - - val flights_from_hbase: DataFrame = dataSet.read("flights_hbase") - flights_from_hbase.show() - - // Write ES - - dataSet.write("flights/demo", flights_from_hbase, esOptions) - - // Read ES - - val flights_from_ES: DataFrame = dataSet.read("flights/demo", esOptions) - flights_from_ES.show() - - // Write HDFS - - dataSet.write("parquet_out", flights_from_ES, wrtoptionsParquet) - - // Read HDFS via Hive - - val flights_parquet_via_hive: DataFrame = dataSet.read("flights_parquet") - flights_parquet_via_hive.show() - - // Comparison of All Operations --> Expected 999999 - - flights_parquet_via_hive.unionAll(flights_from_hive).distinct().count() - -} diff --git a/gimel-dataapi/gimel-examples/src/main/scala/com/paypal/gimel/examples/APIUsageElasticSearchDataSet.scala b/gimel-dataapi/gimel-examples/src/main/scala/com/paypal/gimel/examples/APIUsageElasticSearchDataSet.scala deleted file mode 100644 index e658b891..00000000 --- a/gimel-dataapi/gimel-examples/src/main/scala/com/paypal/gimel/examples/APIUsageElasticSearchDataSet.scala +++ /dev/null @@ -1,107 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.paypal.gimel.examples - -import org.apache.spark.sql._ - -import com.paypal.gimel.DataSet - -object APIUsageElasticSearchDataSet { - - val sparkSession = SparkSession - .builder() - .enableHiveSupport() - .getOrCreate() - val sc = sparkSession.sparkContext - sc.setLogLevel("ERROR") - val sqlContext = sparkSession.sqlContext - - val nodes = "elastic_host_ip" - - val dataSet = DataSet(sparkSession) - - /* Use case for write as JSON for a given rdd */ - // WriteJSONforRDD - - val json1 = """{"reason2" : "business2", "airport2" : "SFO2"}""" - val json2 = """{"participants2" : 5, "airport2" : "OTP2"}""" - var options = Map("pushdown" -> "true", - "es.nodes" -> "localhost", "es.port" -> "9200", - "es.index.auto.create" -> "true", "JSON" -> "TRUE") - val airportsRDD = sc.makeRDD(Seq(json1, json2)) - - - /* Use case for Read API into a DF */ - // ReadEStoDF - options = Map("gimel.es.index.partitioned" -> "false" - , "gimel.es.index.partition.delimiter" -> "_" - , "gimel.es.index.partition" -> "20170602,20170603") - - - /* Use case for write API for a given rdd */ - // WriteESfromRdd - - val game = Map("name" -> "dheeraj3", "age" -> "28", "gender" -> "male") - val game1 = Map("name" -> "dheeraj4", "age" -> "28", "gender" -> "male") - val rdd = sc.makeRDD(Seq(game, game1)) - options = Map("pushdown" -> "true" - , "es.nodes" -> nodes, "es.port" -> "9200" - , "es.index.auto.create" -> "true" - , "gimel.es.index.partitioned" -> "true" - , "gimel.es.index.partition.delimiter" -> "_" - , "gimel.es.index.partition" -> "20170603") - - - /* Use case for Read API as JSON into a DF */ - // ReadasJSONintoDF - - options = Map("pushdown" -> "true" - , "es.nodes" -> "localhost" - , "es.port" -> "9200" - , "es.index.auto.create" -> "true" - , "JSON" -> "TRUE" - , "gimel.es.index.partitioned" -> "true" - , "gimel.es.index.partition.delimiter" -> "_" - , "gimel.es.index.partition" -> "20170602") - - /* Use case for Write API From a DF */ - // WriteESfromDF - - options = Map("gimel.es.index.partition" -> "20170602") - val json31 = s"""{"name" : "dheeraj11", "age" : "28","gender":"male"}""" - val json41 = s"""{"name" : "dheeraj12", "age" : "28","gender":"male"}""" - val rdd11 = sc.parallelize(Seq(json31, json41)) - val df12 = sqlContext.read.json(rdd11) - - /* Use case for Write API From a DF as JSON */ - // WriteasJSONfromDF - - options = Map("pushdown" -> "true" - , "es.nodes" -> nodes - , "es.port" -> "9200" - , "es.index.auto.create" -> "true" - , "JSON" -> "TRUE") - val json3 = """{"name" : "dheeraj", "age" : 28,","gender":"male"}""" - val json4 = """{"name" : "baskar", "age" : 16,","gender":"male"}""" - - val rdd12 = sc.parallelize(Seq(json3, json4)) - val df1 = sqlContext.read.json(rdd12) - -} diff --git a/gimel-dataapi/gimel-examples/src/main/scala/com/paypal/gimel/examples/APIUsageKafkaProduceConsume.scala b/gimel-dataapi/gimel-examples/src/main/scala/com/paypal/gimel/examples/APIUsageKafkaProduceConsume.scala deleted file mode 100644 index 9803c400..00000000 --- a/gimel-dataapi/gimel-examples/src/main/scala/com/paypal/gimel/examples/APIUsageKafkaProduceConsume.scala +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.paypal.gimel.examples - -import scala.language.implicitConversions - -import org.apache.spark.rdd.RDD -import org.apache.spark.sql._ -import org.apache.spark.streaming.{Seconds, StreamingContext} - -import com.paypal.gimel._ -import com.paypal.gimel.logger.Logger - -/** - * Demo's Kafka Producer and Consumer for DataSet - */ -object APIUsageKafkaProduceConsume extends App { - - // Option to Run the Code in spark-submit mode, - // if a table name is passed - it is considered. Else, default of kafka_testing_flights is read - val datasetName = if (args.isEmpty) { - "pcatalog.kafka_flights_log" - } else { - args(0) - } - // Initiate Logger - val logger = Logger(this.getClass.getName) - // Specify Batch Interval for Streaming - val batchInterval = 5 - // Context - val sparkSession = SparkSession - .builder() - .enableHiveSupport() - .getOrCreate() - val sc = sparkSession.sparkContext - sc.setLogLevel("ERROR") - val sqlContext = sparkSession.sqlContext - val ssc = new StreamingContext(sc, Seconds(batchInterval.toInt)) - - /** - * ---------Initiate DataSet----------- - */ - val dataSet: DataSet = DataSet(sparkSession) - - - /** - * ------------CDH Example ---------------- - */ - val options = "throttle.batch.fetchRowsOnFirstRun=2500000:throttle.batch.batch.parallelsPerPartition=250:throttle.batch.maxRecordsPerPartition=25000000" - - /** - * ---------Read from Kafka, using the Table Props----------- - */ - val recsDF = dataSet.read(datasetName, options) - recsDF.show - - /** - * ---------Get StateFul Kafka Operator before next read or any step of operation----------- - */ - val kafkaOperator = dataSet.latestKafkaDataSetReader.get - - /** - * ---------to clear Checkpoint (Ideally, one would not clear checkpoint in a continuous Batch or Stream in production)----------- - * This operation Deletes the Zookeeper Node where the checkpoint is being done - */ - - kafkaOperator.clearCheckPoint() - - /** - * ---------- Ability to check if already checkpointed ------- - * Once checkpoint is done - we set kafkaOperator.alreadyCheckPointed = true - * This prevents second time checkpointing (for protection) - * Below will return "true" - */ - - // val isAlreadyCheckPointed = kafkaOperator.alreadyCheckPointed - - /** - * Second call on CheckPoint function will not perform any save but throw a warning message to user - - * "Warning --> Already Check-Pointed, Consume Again to Checkpoint !" 
- */ - kafkaOperator.saveCheckPoint() - - /** - * ---------Write to Kafka Some Custom Data (NOT CDH !!)----------- - */ - - // Create Dummy Data Set for Write - def stringed(n: Int): String = { - s"""{"age": $n, "name": "MAC-$n", "rev": ${n * 10000}}""" - } - - val texts: Seq[String] = (1 to 20).map { x => stringed(x) } - val rdd: RDD[String] = sc.parallelize(texts) - val df: DataFrame = sqlContext.read.json(rdd) - - // Get a List of Supported Systems for DataSet Operations - // val systemType = DataSetType - - // DataSet Write API Call - dataSet.write(datasetName, df) - -} diff --git a/gimel-dataapi/gimel-examples/src/main/scala/com/paypal/gimel/examples/APIUsageKafkaStreamProduceConsume.scala b/gimel-dataapi/gimel-examples/src/main/scala/com/paypal/gimel/examples/APIUsageKafkaStreamProduceConsume.scala deleted file mode 100644 index 0f1ceed9..00000000 --- a/gimel-dataapi/gimel-examples/src/main/scala/com/paypal/gimel/examples/APIUsageKafkaStreamProduceConsume.scala +++ /dev/null @@ -1,119 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.paypal.gimel.examples - -import scala.language.implicitConversions - -import org.apache.avro.generic.GenericRecord -import org.apache.spark.rdd.RDD -import org.apache.spark.sql._ - -import com.paypal.gimel._ -import com.paypal.gimel.datastreamfactory.StreamingResult -import com.paypal.gimel.logger.Logger - -/** - * Demo's Kafka Producer and Consumer for DataStream - */ -object APIUsageKafkaStreamProduceConsume extends App { - - // Initiate Logger - val logger = Logger(this.getClass.getName) - // Specify Batch Interval for Streaming - val batchInterval = 10 - // Context - val sparkSession = SparkSession - .builder() - .enableHiveSupport() - .getOrCreate() - val sc = sparkSession.sparkContext - sc.setLogLevel("ERROR") - val sqlContext = sparkSession.sqlContext - - // Initiate DStream - val dataStream = DataStream(sc) - - // Option to Run the Code in spark-submit mode, - // if a table name is passed - it is considered. 
Else, default of kafka_testing_flights is read - val datasetName = if (args.isEmpty) { - "kafka_testing_flights" - } else { - args(0) - } - - // Get Reference to Stream - val streamingResult: StreamingResult = dataStream.read(datasetName) - - // Clear CheckPoint if necessary - streamingResult.clearCheckPoint("some message") - - // Helper for Clients - streamingResult.dStream.foreachRDD { rdd => - - val count = rdd.count() - - if (count > 0) { - - /** - * Mandatory | Get Offset for Current Window, so we can checkpoint at the end of this window's operation - */ - - streamingResult.getCurrentCheckPoint(rdd) - - /** - * Begin | User's Usecases - */ - - // Sample UseCase | Display Count - logger.debug("count is -->") - logger.debug(count) - - // Sample UseCase | Get Avro Generic Record - val rddAvro: RDD[GenericRecord] = streamingResult.convertBytesToAvro(rdd) - rddAvro.map(x => x.toString) - logger.debug("sample records from Avro-->") - rddAvro.map(x => x.toString).take(10).foreach(record => logger.debug(record)) - - // Sample UseCase | Convert to DataFrame - val df: DataFrame = streamingResult.convertAvroToDF(sqlContext, rddAvro) - logger.debug("sample records -->") - df.show(5) - - // JSON / String / Bytes (Avro) / Bytes (CDH) --> All can be deserialized into Spark DataFrame via this function - streamingResult.getAsDF(sqlContext, rdd) - - - /** - * End | User's Usecases - */ - - /** - * Mandatory | Save Current Window - CheckPoint - */ - - streamingResult.saveCurrentCheckPoint() - } - } - - // Start the Context - dataStream.streamingContext.start() - dataStream.streamingContext.awaitTermination() - -} diff --git a/gimel-dataapi/gimel-examples/src/main/scala/com/paypal/gimel/examples/GimelDruidRealtimeIngestion.scala b/gimel-dataapi/gimel-examples/src/main/scala/com/paypal/gimel/examples/GimelDruidRealtimeIngestion.scala deleted file mode 100644 index 44d84ae6..00000000 --- a/gimel-dataapi/gimel-examples/src/main/scala/com/paypal/gimel/examples/GimelDruidRealtimeIngestion.scala +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.paypal.gimel.examples - -import org.apache.avro.generic.GenericRecord -import org.apache.spark.rdd.RDD -import org.apache.spark.sql._ -import org.apache.spark.streaming.{Seconds, StreamingContext} - -import com.paypal.gimel.{DataSet, DataStream} -import com.paypal.gimel.datastreamfactory.StreamingResult -import com.paypal.gimel.logger.Logger - -object GimelDruidRealtimeIngestion { - val logger = Logger(this.getClass.getName) - - - def main(args: Array[String]): Unit = { - - val sparkSession = SparkSession - .builder() - .enableHiveSupport() - .getOrCreate() - val sc = sparkSession.sparkContext - sc.setLogLevel("ERROR") - val sqlContext = sparkSession.sqlContext - - // Create Streaming context - - val ssc = new StreamingContext(sc, Seconds(20)) - - val dataStream = DataStream(ssc) - - val streamingResult: StreamingResult = - dataStream - .read("pcatalog.kafka_flights_log") - - streamingResult.clearCheckPoint("Clearing Checkpoint.") - - streamingResult.dStream.foreachRDD { rdd => - val count = rdd.count() - - if (count > 0) { - /** - * Mandatory | Get Offset for Current Window, so we can checkpoint at the end of this window's operation - */ - streamingResult.getCurrentCheckPoint(rdd) - - logger.info(s"Count for current Checkpoint: $count") - logger.info(s"Scala Version Used ---> ${scala.util.Properties.versionString}") - - val rddAvro: RDD[GenericRecord] = streamingResult.convertBytesToAvro(rdd) - rddAvro.map(_.toString) - - val df: DataFrame = streamingResult.convertAvroToDF(sqlContext, rddAvro) - - // Call Druid Connector for realtime ingestion. - - val dataSet = new DataSet(sparkSession) - val dataSetProps = Map("load_type" -> "realtime") - dataSet.write("gimel.druid_flights_log", df, dataSetProps) - - streamingResult.saveCurrentCheckPoint() - } - } - - dataStream.streamingContext - - // Start the computation - ssc.start() - - // Wait for the computation to terminate - ssc.awaitTermination() - } - -} diff --git a/gimel-dataapi/gimel-examples/src/main/scala/com/paypal/gimel/examples/SparkStreamingKafkaMessageTesting.scala b/gimel-dataapi/gimel-examples/src/main/scala/com/paypal/gimel/examples/SparkStreamingKafkaMessageTesting.scala deleted file mode 100644 index 8b0c247c..00000000 --- a/gimel-dataapi/gimel-examples/src/main/scala/com/paypal/gimel/examples/SparkStreamingKafkaMessageTesting.scala +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.paypal.gimel.examples - -import org.apache.spark.rdd.RDD -import org.apache.spark.sql._ -import org.apache.spark.sql.functions._ -import org.apache.spark.streaming.{Seconds, StreamingContext} - -import com.paypal.gimel._ -import com.paypal.gimel.datastreamfactory._ -import com.paypal.gimel.logger.Logger - -object SparkStreamingKafkaMessageTesting extends App { - - // Initiate Logger - val logger = Logger(this.getClass.getName) - - import SparkStreamingKafkaStringMessageUtils._ - - var params = resolveRunTimeParameters(args) - val sourceName = params("source") - val targetName = params.getOrElse("target", "NA") - val messageFormat = params("messageFormat") - // Specify Batch Interval for Streaming - val batchInterval = params.getOrElse("batchInterval", "10").toInt - val timeOutSeconds = params.getOrElse("timeOutSeconds", "60").toInt - // Context - val sparkSession = SparkSession - .builder() - .enableHiveSupport() - .getOrCreate() - val sc = sparkSession.sparkContext - sc.setLogLevel("ERROR") - val sqlContext = sparkSession.sqlContext - import sqlContext.implicits._ - - val ssc = new StreamingContext(sc, Seconds(batchInterval.toInt)) - - val dataStream = DataStream(sc) - val dataSet = DataSet(sparkSession) - // Get Reference to Stream - val streamingResult: StreamingResult = dataStream.read(sourceName) - // Clear CheckPoint if necessary - streamingResult.clearCheckPoint("some message") - streamingResult.dStream.foreachRDD { rdd => - val k: RDD[WrappedData] = rdd - val count = rdd.count() - - logger.info(s"Count is --> ${count}") - logger.info(s"Message Type Specified is ${messageFormat}...") - if (count > 0) { - - val df1 = streamingResult.getAsDF(sqlContext, rdd) - df1.printSchema() - df1.show(10) - val updatedDataFrame: DataFrame = df1 - // updatedDataFrame.show - val col1 = date_format(from_unixtime(col("logtime").divide(1000)), "yyyyMMdd") - val dfWithDate = updatedDataFrame.withColumn("dfWithDate", col1) - val dateList = dfWithDate.select("dfWithDate").distinct().collect.flatMap(_.toSeq) - val dateListMap = dateList.map { date => - (date -> dfWithDate.where($"dfWithDate" <=> date)) - }.toMap - - dateListMap.foreach { case (key, dfes) => - val schemaMapping: String = s"""{"appStartTime": {"format": "strict_date_optional_time||epoch_millis", "type": "date" }, "appEndTime": {"format": "strict_date_optional_time||epoch_millis", "type": "date"},"jobStartTime": {"format": "strict_date_optional_time||epoch_millis", "type": "date"}, "jobEndTime": {"format": "strict_date_optional_time||epoch_millis", "type": "date"}, "logtime": { "format": "strict_date_optional_time||epoch_millis", "type": "date"}}""" - val options: Map[String, String] = Map("gimel.es.index.partition.suffix" -> s"$key", "gimel.es.schema.mapping" -> schemaMapping) - if (targetName != "NA") { - logger.info(s"Begin Writing To : ${targetName}") - val res = dataSet.write(targetName, dfes, options) - } - } - } - streamingResult.saveCurrentCheckPoint() - } - - dataStream.streamingContext.start() - dataStream.streamingContext.awaitTerminationOrTimeout(-1) - dataStream.streamingContext.stop(false, true) -} - -object SparkStreamingKafkaStringMessageUtils { - - val logger = Logger(this.getClass.getName) - - def resolveRunTimeParameters(allParams: Array[String]): Map[String, String] = { - def MethodName: String = new Exception().getStackTrace().apply(1).getMethodName() - logger.info(" @Begin --> " + MethodName) - var paramsMapBuilder: Map[String, String] = Map() - logger.info(s"All Params From User --> 
\n${allParams.mkString("\n")}") - if (allParams.length == 0) { - throw new Exception("Args Cannot be Empty") - } - - for (jobParams <- allParams) { - for (eachParam <- jobParams.split(" ")) { - paramsMapBuilder += (eachParam.split("=")(0) -> eachParam.split("=", 2)(1)) - } - } - logger.info(s"Resolved Params From Code --> ${paramsMapBuilder}") - paramsMapBuilder - } -} diff --git a/gimel-dataapi/gimel-examples/src/main/scala/com/paypal/gimel/examples/SparkStreamingPCatalogUSDemo.scala b/gimel-dataapi/gimel-examples/src/main/scala/com/paypal/gimel/examples/SparkStreamingPCatalogUSDemo.scala deleted file mode 100644 index e67abf9b..00000000 --- a/gimel-dataapi/gimel-examples/src/main/scala/com/paypal/gimel/examples/SparkStreamingPCatalogUSDemo.scala +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.paypal.gimel.examples - -import org.apache.spark.sql._ -import org.apache.spark.sql.expressions.UserDefinedFunction -import org.apache.spark.sql.functions._ -import org.apache.spark.streaming._ - -import com.paypal.gimel.{DataSet, DataStream} -import com.paypal.gimel.logger.Logger - -object SparkStreamingPCatalogUSDemo { - - // Define Geo Function - case class Geo(lat: Double, lon: Double) - - val myUDF: UserDefinedFunction = udf((lat: Double, lon: Double) => Geo(lat, lon)) - - def main(args: Array[String]) { - - // Creating SparkContext - val sparkSession = SparkSession - .builder() - .enableHiveSupport() - .getOrCreate() - val sc = sparkSession.sparkContext - sc.setLogLevel("ERROR") - val sqlContext = sparkSession.sqlContext - val ssc = new StreamingContext(sc, Seconds(10)) - val logger = Logger(this.getClass.getName) - - // Initiating PCatalog DataSet and DataStream - val dataSet = DataSet(sparkSession) - val dataStream = DataStream(ssc) - - // Reading from HDFS Dataset - logger.info("Reading address_geo HDFS Dataset") - val geoLookUpDF = dataSet.read("pcatalog.address_geo") - val geoLookUp = geoLookUpDF.withColumn("geo", myUDF(geoLookUpDF("lat"), geoLookUpDF("lon"))).drop("lat").drop("lon") - geoLookUp.cache() - logger.info("Read" + geoLookUp.count() + " records") - - // Reading from Kafka DataStream and Loading into Elastic Search Dataset - val streamingResult = dataStream.read("pcatalog.kafka_transactions") - streamingResult.clearCheckPoint("OneTimeOnly") - streamingResult.dStream.foreachRDD { rdd => - if (rdd.count() > 0) { - streamingResult.getCurrentCheckPoint(rdd) - val txnDF = streamingResult.convertAvroToDF(sqlContext, streamingResult.convertBytesToAvro(rdd)) - val resultSet = txnDF.join(geoLookUp, txnDF("account_number") === geoLookUp("customer_id")) - .selectExpr("CONCAT(time_created,'000') AS time_created", "geo", "usd_amount") - - 
dataSet.write("pcatalog.elastic_transactions_dmz", resultSet) - streamingResult.saveCurrentCheckPoint() - } - } - - // Start Streaming - dataStream.streamingContext.start() - dataStream.streamingContext.awaitTermination() - - sc.stop() - } -} diff --git a/gimel-dataapi/gimel-logger/pom.xml b/gimel-dataapi/gimel-logger/pom.xml index 92da4bc2..e0688787 100644 --- a/gimel-dataapi/gimel-logger/pom.xml +++ b/gimel-dataapi/gimel-logger/pom.xml @@ -23,13 +23,13 @@ under the License. gimel-dataapi com.paypal.gimel - 2.0.0-SNAPSHOT + 2.4.7-SNAPSHOT ../pom.xml 4.0.0 gimel-logger - 2.0.0-SNAPSHOT + 2.4.7-SNAPSHOT @@ -50,33 +50,6 @@ under the License. ${scala.version} ${scala.packaging.scope} - - com.paypal.gimel - gimel-logging_${gimel.logging.spark.binary.version} - ${gimel.logging.version} - - - org.slf4j - slf4j-log4j12 - - - log4j - log4j - - - org.apache.kafka - * - - - org.apache.kafka - kafka-log4j-appender - - - com.googlecode.protobuf-java-format - protobuf-java-format - - - src/main/scala diff --git a/gimel-dataapi/gimel-logger/src/main/scala/com/paypal/gimel/logger/Logger.scala b/gimel-dataapi/gimel-logger/src/main/scala/com/paypal/gimel/logger/Logger.scala index 6f683a60..2b14625c 100644 --- a/gimel-dataapi/gimel-logger/src/main/scala/com/paypal/gimel/logger/Logger.scala +++ b/gimel-dataapi/gimel-logger/src/main/scala/com/paypal/gimel/logger/Logger.scala @@ -24,7 +24,6 @@ import java.util.Calendar import scala.collection.JavaConverters._ import com.paypal.gimel.logger.conf.LoggerConstants -import com.paypal.gimel.logging.impl.JSONSystemLogger /** * com.paypal.gimel.logger.Logger SingleTon @@ -74,7 +73,6 @@ class Logger(config: Any) extends Serializable { private val _APP_reference = config.toString private val logModes = Map(4 -> "INFO", 3 -> "DEBUG", 2 -> "WARN", 1 -> "ERROR") @volatile private var logMode = 4 - lazy val logger: JSONSystemLogger = JSONSystemLogger.getInstance(getClass) private var logAudit = false var consolePrintEnabled = false // logic to attempt logging @@ -136,7 +134,6 @@ class Logger(config: Any) extends Serializable { case _ => if (!sendToKafka) s"[${_APP_reference}] : ${message.toString}" else message } - if (logMode >= 2 && auditingAndAlertingEnabled) logger.debug(finalMessage.asInstanceOf[Object]) if (consolePrintEnabled) println(s"GIMEL-LOGGER | ${Calendar.getInstance().getTime} | ${message}") } catch { case ex: Throwable => @@ -158,7 +155,6 @@ class Logger(config: Any) extends Serializable { case _ => if (!sendToKafka) s"[${_APP_reference}] : ${message.toString}" else message } - if (logMode >= 4 && auditingAndAlertingEnabled) logger.info(finalMessage.asInstanceOf[Object]) if (consolePrintEnabled) println(s"GIMEL-LOGGER | ${Calendar.getInstance().getTime} | ${message}") } catch { case ex: Throwable => @@ -180,7 +176,6 @@ class Logger(config: Any) extends Serializable { case _ => if (!sendToKafka) s"[${_APP_reference}] : ${message.toString}" else message } - if (logMode >= 3 && auditingAndAlertingEnabled) logger.warn(finalMessage.asInstanceOf[Object]) if (consolePrintEnabled) println(s"GIMEL-LOGGER | ${Calendar.getInstance().getTime} | ${message}") } catch { case ex: Throwable => @@ -201,7 +196,6 @@ class Logger(config: Any) extends Serializable { case _ => s"[${_APP_reference}] : ${message.toString}" } - if (logMode >= 1 && auditingAndAlertingEnabled) logger.error(finalMessage) if (consolePrintEnabled) println(s"GIMEL-LOGGER | ${Calendar.getInstance().getTime} | ${message}") } catch { case ex: Throwable => @@ -268,7 +262,6 @@ class Logger(config: Any) extends 
Serializable { if (logAudit) { this.info("Auditing Information being posted to Gimel Audit Log...") this.info(accessAuditInfo) - logger.info(accessAuditInfo.asJava) } accessAuditInfo } @@ -346,7 +339,6 @@ class Logger(config: Any) extends Serializable { if (logAudit) { this.info("Auditing Information being posted to Gimel Audit Log...") this.info(accessAuditInfo) - logger.info(accessAuditInfo.asJava) } this.logMethodAccess(yarnAppId diff --git a/gimel-dataapi/gimel-sql/pom.xml b/gimel-dataapi/gimel-sql/pom.xml deleted file mode 100644 index 32d24539..00000000 --- a/gimel-dataapi/gimel-sql/pom.xml +++ /dev/null @@ -1,141 +0,0 @@ - - - - gimel-dataapi - com.paypal.gimel - 2.0.0-SNAPSHOT - ../pom.xml - - 4.0.0 - - gimel-sql - 2.0.0-SNAPSHOT - - - - com.paypal.gimel - gimel-core - ${gimel.version}-SNAPSHOT - - - io.netty - netty-handler - - - - - - - org.scalatest - scalatest_${scala.binary.version} - ${scalatest.version} - test - - - org.scalamock - scalamock_${scala.binary.version} - ${scalamock.version} - test - - - - io.netty - netty - ${netty.hadoop.version} - test - - - io.netty - netty-all - ${netty.all.hadoop.version} - test - - - net.jpountz.lz4 - lz4 - 1.3.0 - test - - - - - src/main/scala - src/test/scala - - - net.alchim31.maven - scala-maven-plugin - 3.2.1 - - - - compile - testCompile - - - - - - -Xms64m - -Xmx1024m - - - - - org.scalatest - scalatest-maven-plugin - 1.0 - - ${project.build.directory}/surefire-reports - . - WDF TestSuite.txt - - - - test - - test - - - - - - org.apache.maven.plugins - maven-shade-plugin - 3.0.0 - - - - com.google.common - gimel-shaded.com.google.common - - - com.sun.jersey - gimel-shaded.com.sun.jersey - - - - - *:* - - META-INF/*.SF - META-INF/*.DSA - META-INF/*.RSA - - - - - - - gimel-shading - package - - shade - - - - - - - - diff --git a/gimel-dataapi/gimel-sql/src/main/scala/com/paypal/gimel/sql/GimelQueryProcessor.scala b/gimel-dataapi/gimel-sql/src/main/scala/com/paypal/gimel/sql/GimelQueryProcessor.scala deleted file mode 100644 index be1a9387..00000000 --- a/gimel-dataapi/gimel-sql/src/main/scala/com/paypal/gimel/sql/GimelQueryProcessor.scala +++ /dev/null @@ -1,1397 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.paypal.gimel.sql - -import scala.collection.immutable.Map -import scala.util.{Failure, Success, Try} - -import org.apache.spark.rdd.RDD -import org.apache.spark.sql._ -import org.apache.spark.sql.streaming.{StreamingQuery, Trigger} -import org.apache.spark.sql.types.StructField -import org.apache.spark.streaming.{Seconds, StreamingContext} - -import com.paypal.gimel._ -import com.paypal.gimel.common.catalog.{CatalogProvider, DataSetProperties} -import com.paypal.gimel.common.conf.{CatalogProviderConfigs, GimelConstants} -import com.paypal.gimel.common.gimelserde.GimelSerdeUtils -import com.paypal.gimel.common.query.guard.QueryGuard -import com.paypal.gimel.common.security.AuthHandler -import com.paypal.gimel.common.utilities.{DataSetType, DataSetUtils, Timer} -import com.paypal.gimel.datasetfactory.GimelDataSet -import com.paypal.gimel.datastreamfactory.{StreamingResult, StructuredStreamingResult, WrappedData} -import com.paypal.gimel.jdbc.conf.JdbcConfigs -import com.paypal.gimel.kafka.conf.{KafkaConfigs, KafkaConstants} -import com.paypal.gimel.logger.Logger -import com.paypal.gimel.logging.GimelStreamingListener -import com.paypal.gimel.parser.utilities.{QueryConstants, QueryParserUtils} - -object GimelQueryProcessor { - - val logger: Logger = Logger(this.getClass.getName) - lazy val pCatalogStreamingKafkaTmpTableName = "pcatalog_streaming_kafka_tmp_table" - val queryUtils = GimelQueryUtils - - import queryUtils._ - - val originalUser = sys.env("USER") - var user = originalUser - var isQueryFromGTS = false - val yarnCluster = com.paypal.gimel.common.utilities.DataSetUtils.getYarnClusterName() - var queryGuard: Option[QueryGuard] = None - - /** - * At Run Time - Set the Catalog Provider and The Name Space of the Catalog (like the Hive DB Name when catalog Provider = HIVE) - * - * @param sparkSession Spark Session - */ - def setCatalogProviderInfo(sparkSession: SparkSession): Unit = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - - logger.info(" @Begin --> " + MethodName) - - val catalogProvider: String = sparkSession.conf.get(CatalogProviderConfigs.CATALOG_PROVIDER, GimelConstants.UDC_STRING) - val catalogProviderName: String = sparkSession.conf.get(CatalogProviderConfigs.CATALOG_PROVIDER_NAME_SPACE, GimelConstants.UDC_STRING) - logger.info(s"Catalog Provider --> [${catalogProvider}] | Catalog Provider Name --> [${catalogProviderName}] ") - setCatalogProvider(catalogProvider) - setCatalogProviderName(catalogProviderName) - } - - /** - * Sets Spark GTS User Name if available - * - * @param sparkSession SparkSession - */ - def setGtsUser(sparkSession: SparkSession): Unit = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - - logger.info(" @Begin --> " + MethodName) - - val gtsUser: String = sparkSession.sparkContext.getLocalProperty(GimelConstants.GTS_USER_CONFIG) - val gts_default_user = GimelConstants.GTS_DEFAULT_USER(sparkSession.conf) - if (gtsUser != null && originalUser.equalsIgnoreCase(gts_default_user)) { - logger.info(s"GTS User [${gtsUser}] will be used to over ride executing user [${originalUser}] who started GTS.") - sparkSession.sql(s"set ${GimelConstants.GTS_USER_CONFIG}=${gtsUser}") - - // set jdbc username,if already not set in sparkconf - val jdbcUser: Option[String] = sparkSession.conf.getOption(JdbcConfigs.jdbcUserName) - if (jdbcUser.isEmpty) { - logger.info(s"Setting ${JdbcConfigs.jdbcUserName}=${gtsUser}") - sparkSession.sql(s"set ${JdbcConfigs.jdbcUserName}=${gtsUser}") - } - 
user = gtsUser - isQueryFromGTS = true - } - } - - /** - * This function guards any runtime changes attempted by users to override GTS specific configurations. - * - * @param sql - * @param sparkSession - */ - def guardGTSStatements(sql: String, sparkSession: SparkSession): Unit = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - - logger.info(" @Begin --> " + MethodName) - - // Guard only if user is GTS Super User - if (sparkSession.sparkContext.sparkUser.equalsIgnoreCase(GimelConstants.GTS_DEFAULT_USER(sparkSession.conf))) { - - val checkFlag = - // Impersonation Flag is not allowed to be set in GSQL - sql.toLowerCase.contains(GimelConstants.GTS_IMPERSONATION_FLAG) || - // JDBC User is not alloweed to be set in GSQL - // sql.toLowerCase.contains(JdbcConstants.jdbcUserName) || - // GTS User should not be overridded - sql.toLowerCase.contains(GimelConstants.GTS_USER_CONFIG) - - if (checkFlag) throw new Exception(s"SECURITY VIOLATION | Execution of this statement is not allowed: ${sql}") - } - - // Enable or stop query guard based on user config - // switchQueryGuard(sparkSession) - } - - /** - * - * @param sparkSession - */ - def switchQueryGuard(sparkSession: SparkSession): Unit = { - if (queryGuard.isEmpty) { - queryGuard = Some(new QueryGuard(sparkSession)) - } - // Turn ON and OFF Query guard - val queryGuardControl = if (sparkSession.conf.getOption(GimelConstants.GTS_SPARK_QUERY_GUARD).isDefined) { - sparkSession.conf.getOption(GimelConstants.GTS_SPARK_QUERY_GUARD) - } else if (sparkSession.conf.getOption(GimelConstants.GTS_QUERY_GUARD).isDefined) { - sparkSession.conf.getOption(GimelConstants.GTS_QUERY_GUARD) - } else { - None - } - queryGuardControl.foreach { - case control: String if control.toLowerCase == "true" => - // start - logger.info("Starting query guard") - queryGuard.get.start() - case control: String if control.toLowerCase == "false" => - // stop - logger.info("Starting query guard") - queryGuard.get.stop() - case _ => - // wrong config received do nothing - logger.info(s"Wrong config: $queryGuardControl received. So, stopping query guard") - queryGuard.get.stop() - } - } - - /** - * Core Function that will be called from SCAAS for executing a SQL - * - * @param sql SQL String supplied by client - * @param sparkSession : SparkSession - * @return Resulting String < either sample data for select queries, or "success" / "failed" for insert queries - */ - def executeBatch(sql: String, sparkSession: SparkSession): DataFrame = { - - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - - logger.info(" @Begin --> " + MethodName) - - val uniformSQL = sql.replace("\n", " ").trim - val sqlArray: Array[String] = uniformSQL.split(";") - val totalStatements = sqlArray.length - val dataFrames: Array[DataFrame] = sqlArray.zipWithIndex.map(eachSql => { - val sqlString = eachSql._1 - val index = eachSql._2 - logger.info(s"Executing statement: ${sqlString}") - try { - executeBatchStatement(sqlString, sparkSession) - } - catch { - case e: Throwable => - val errorMsg = - s""" - | Statements[${index}/${totalStatements}] successfully executed. 
- | Statement[${index + 1}] execution failed --> ${sqlString} - """.stripMargin - logger.throwError(s"${errorMsg}") - throw e - } - }) - logger.info(s"${totalStatements}/${totalStatements} statements successfully executed.") - dataFrames(totalStatements - 1) - } - - /** - * This method will process one statement from executebatch - * - * @param sql SQL String supplied by client - * @param sparkSession : SparkSession - * @return Resulting String < either sample data for select queries, or "success" / "failed" for insert queries - */ - def executeBatchStatement(sql: String, sparkSession: SparkSession): DataFrame = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - logger.info(" @Begin --> " + MethodName) - - logger.setSparkVersion(sparkSession.version) - - // Set gimel log level and flag to audit logs to kafka - DataSetUtils.setGimelLogLevel(sparkSession, logger) - guardGTSStatements(sql, sparkSession) - switchQueryGuard(sparkSession) - - val sparkAppName = sparkSession.conf.get("spark.app.name") - - try { - - // At Run Time - Set the Catalog Provider and The Name Space of the Catalog (like the Hive DB Name when catalog Provider = HIVE) - setCatalogProviderInfo(sparkSession) - - // If query comes from GTS - interpret the GTS user and set it - setGtsUser(sparkSession) - - val options = queryUtils.getOptions(sparkSession)._2 - - var resultingString = "" - // val queryTimer = Timer() - // val startTime = queryTimer.start - val isCheckPointEnabled = options(KafkaConfigs.kafkaConsumerReadCheckpointKey).toBoolean - // val isClearCheckPointEnabled = options(KafkaConfigs.kafkaConsumerClearCheckpointKey).toBoolean - - val sessionID = sparkSession.sparkContext.getLocalProperty(GimelConstants.GTS_GIMEL_LIVY_SESSION_ID) - - logger.debug(s"Is CheckPointing Requested By User --> $isCheckPointEnabled") - val dataSet: DataSet = DataSet(sparkSession) - - // Query is via GTS - val isGTSImpersonated = AuthHandler.isAuthRequired(sparkSession) - - // Query has Hive / HBASE related DML that requires authentication. - lazy val isDMLHiveOrHbase = queryUtils.isHiveHbaseDMLAndGTSUser(sql, options, sparkSession) - // Query is a DDL operation - lazy val isDDL = queryUtils.isDDL(sql, sparkSession) - - // Identify JDBC complete pushdown - val (isJdbcCompletePushDownEnabled, transformedSql, jdbcOptions) = - GimelQueryUtils.isJdbcCompletePushDownEnabled(sparkSession, sql) - - val data = if (isJdbcCompletePushDownEnabled) { - GimelQueryUtils.createPushDownQueryDataframe(sparkSession, transformedSql.get, jdbcOptions.get) - } else if (isGTSImpersonated && (isDDL || isDMLHiveOrHbase)) { - throw new UnsupportedOperationException( - s""" - | DDL or DML for [Hive | HBASE] are not supported in GTS (Gimel thrift server). - | Please run the query in separate spark session. - |""".stripMargin) - } else if (queryUtils.isUDCDataDefinition(sql)) { - logger.info("This path is dynamic dataset creation path") - var resultingStr = "" - Try( - handleDDLs(sql, sparkSession, dataSet, options) - ) match { - case Success(result) => - resultingStr = "Query Completed." - case Failure(e) => - resultingStr = s"Query Failed in function : $MethodName. Error --> \n\n ${ - e.toString - }" - logger.error(resultingStr) - throw e - } - stringToDF(sparkSession, resultingStr) - } else { - // Allow thrift server to execute the Query for all other cases. 
- val isSelectFromHiveOrHBase = queryUtils.isSelectFromHiveHbaseAndGTSUser(sql, options, sparkSession) - logger.info(s"isSelectFromHiveOrHBase -> $isSelectFromHiveOrHBase") - if (isSelectFromHiveOrHBase) { - logger.info("Select query consists of Hive or HBase dataset, authenticating access through ranger.") - queryUtils.authenticateAccess(sql, sparkSession, options) - } - - // Set HBase Page Size for optimization if selecting from HBase with limit - if (QueryParserUtils.isHavingLimit(sql)) { - setLimitForHBase(sql, options, sparkSession) - } - - val (originalSQL, destination, selectSQL, kafkaDataSets, queryPushDownFlag) = - resolveSQL(sql, sparkSession, dataSet) - destination match { - case Some(target) => - logger.info(s"Target Exists --> ${target}") - Try( - executeResolvedQuery(originalSQL, destination, selectSQL, sparkSession, dataSet, queryPushDownFlag) - ) match { - case Success(result) => - resultingString = result - case Failure(e) => - resultingString = s"Query Failed in function : $MethodName. Error --> \n\n ${ - e.toString - }" - logger.error(resultingString) - throw e - } - - if (isCheckPointEnabled) { - saveCheckPointforKafka(kafkaDataSets) - } - import sparkSession.implicits._ - Seq(resultingString).toDF("Query Execution") - - case _ => - logger.info(s"No Target, returning DataFrame back to client.") - executeSelectClause(selectSQL, sparkSession, queryPushDownFlag) - } - } - - // pushing logs to ES - logger.logMethodAccess(sparkSession.sparkContext.getConf.getAppId - , sparkSession.conf.get("spark.app.name") - , this.getClass.getName - , KafkaConstants.gimelAuditRunTypeBatch - , yarnCluster - , user - , toLogFriendlyString(s"${yarnCluster}/${user}/${sparkAppName}") - , MethodName - , sql - , scala.collection.mutable.Map("sql" -> sql, "isQueryFromGTS" -> isQueryFromGTS.toString, "originalUser" -> originalUser) - , GimelConstants.SUCCESS - , GimelConstants.EMPTY_STRING - , GimelConstants.EMPTY_STRING - ) - - data - - } catch { - case e: Throwable => - - logger.logMethodAccess(sparkSession.sparkContext.getConf.getAppId - , sparkSession.conf.get("spark.app.name") - , this.getClass.getName - , KafkaConstants.gimelAuditRunTypeBatch - , yarnCluster - , user - , toLogFriendlyString(s"${yarnCluster}/${user}/${sparkAppName}") - , MethodName - , sql - , scala.collection.mutable.Map("sql" -> sql, "isQueryFromGTS" -> isQueryFromGTS.toString, "originalUser" -> originalUser) - , GimelConstants.FAILURE - , e.toString + "\n" + e.getStackTraceString - , GimelConstants.UNKNOWN_STRING - ) - - // throw error to console - logger.throwError(e.toString) - - throw new Exception(s"${e.getMessage}\n", e) - } finally { - logger.info("Unsetting the property -> " + GimelConstants.HBASE_PAGE_SIZE) - sparkSession.conf.unset(GimelConstants.HBASE_PAGE_SIZE) - } - } - - /** - * Core Function that will be called from SCAAS for executing a SQL - * Executes the executeBatch function in streaming window - * - * @param sql SQL String from client - * @param sparkSession : SparkSession - * @return Resulting String < either sample data for select queries, or "success" / "failed" for insert queries - */ - - def executeStream(sql: String, sparkSession: SparkSession): String = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - - logger.info(" @Begin --> " + MethodName) - logger.setSparkVersion(sparkSession.version) - val sparkAppName = sparkSession.conf.get("spark.app.name") - - // Set gimel log level and flag to audit logs to kafka - DataSetUtils.setGimelLogLevel(sparkSession, logger) - 
- // At Run Time - Set the Catalog Provider and The Name Space of the Catalog (like the Hive DB Name when catalog Provider = HIVE) - setCatalogProviderInfo(sparkSession) - - try { - - sparkSession.conf.set(GimelConstants.GIMEL_KAFKA_VERSION, GimelConstants.GIMEL_KAFKA_VERSION_ONE) - val options = queryUtils.getOptions(sparkSession)._2 - val batchInterval = options(KafkaConfigs.defaultBatchInterval).toInt - val streamRate = options(KafkaConfigs.maxRatePerPartitionKey) - val isBackPressureEnabled = options(KafkaConfigs.isBackPressureEnabledKey) - val isClearCheckPointEnabled = options(KafkaConfigs.kafkaConsumerClearCheckpointKey).toBoolean - val isSaveCheckPointEnabled = options(KafkaConfigs.kafkaConsumerReadCheckpointKey).toBoolean - val isStreamFailureBeyondThreshold = options.getOrElse(KafkaConfigs.isStreamBatchSwitchEnabledKey, "false").toBoolean - val streamFailureThresholdPerSecond = options.getOrElse(KafkaConfigs.failStreamThresholdKey, "1200").toInt - val streamFailureWindowFactor = options.getOrElse(KafkaConfigs.streamFailureWindowFactorKey, "10").toString.toInt - val isStreamParallel = options(KafkaConfigs.isStreamParallelKey) - val streamParallels = options(KafkaConfigs.streamParallelKey) - val streamawaitTerminationOrTimeout = options(KafkaConfigs.streamaWaitTerminationOrTimeoutKey).toLong - val sc = sparkSession.sparkContext - val sqlContext = sparkSession.sqlContext - val conf = new org.apache.spark.SparkConf() - val ssc = new StreamingContext(sc, Seconds(batchInterval)) - val listner: GimelStreamingListener = new GimelStreamingListener(conf) - ssc.addStreamingListener(listner) - logger.debug( - s""" - |isStreamParallel --> $isStreamParallel - |streamParallels --> $streamParallels - """.stripMargin) - ssc.sparkContext.getConf - .set(KafkaConfigs.isBackPressureEnabledKey, isBackPressureEnabled) - .set(KafkaConfigs.streamMaxRatePerPartitionKey, streamRate) - .set(KafkaConfigs.isStreamParallelKey, isStreamParallel) - .set(KafkaConfigs.streamParallelKey, streamParallels) - val dataStream = DataStream(ssc) - val sourceTables = getTablesFrom(sql) - val kafkaTables = sourceTables.filter { table => - DataSetUtils.getSystemType(table, sparkSession, options) == DataSetType.KAFKA - } - if (kafkaTables.isEmpty) { - throw new Exception("ERROR --> No Kafka Type DataSet In the Query To Stream !") - } else { - val tmpKafkaTable = pCatalogStreamingKafkaTmpTableName - val newSQL = sql.replaceAll(kafkaTables.head, tmpKafkaTable) - val streamingResult: StreamingResult = dataStream.read(kafkaTables.head, options) - if (isClearCheckPointEnabled) streamingResult.clearCheckPoint("Clearing CheckPoint As Requested By User") - try { - streamingResult.dStream.foreachRDD { (rdd, time) => - printStats(time, listner) - val count = rdd.count() - if (count > 0) { - if (isStreamFailureBeyondThreshold) { - if ((count / batchInterval) > streamFailureThresholdPerSecond) throw new Exception(s"Current Messages Per Second : ${count / batchInterval} exceeded Supplied Stream Capacity ${streamFailureThresholdPerSecond}") - else logger.info(s"Current Messages Per Second : ${count / batchInterval} within Supplied Stream Capacity ${streamFailureThresholdPerSecond}") - } - val failureThreshold = (batchInterval * streamFailureWindowFactor) - val totalDelay = (listner.totalDelay / 1000) - if (totalDelay > failureThreshold) { - throw new Exception( - s"""Current Total_Delay:$totalDelay exceeded $failureThreshold -If mode=intelligent, then Restarting will result in Batch Mode Execution first for catchup, and automatically 
migrate to stream mode ! - """.stripMargin - ) - } else logger.info(s"Current Total_Delay:$totalDelay within $failureThreshold ") - streamingResult.getCurrentCheckPoint(rdd) - streamingResult.getAsDF(sqlContext, rdd).registerTempTable(tmpKafkaTable) - try { - executeBatch(newSQL, sparkSession) - } catch { - case ex: Throwable => - // logger.error(s"Stream Query Failed in function : $MethodName. Error --> \n\n${ex.getStackTraceString}") - // ex.printStackTrace() - // logger.error("Force - Stopping Streaming Context") - ssc.sparkContext.stop() - throw ex - } - try { - if (isSaveCheckPointEnabled) streamingResult.saveCurrentCheckPoint() - if (isClearCheckPointEnabled) streamingResult.clearCheckPoint("Clearing CheckPoint as Requested by User") - } - catch { - case ex: Throwable => - // logger.error("Error in CheckPoint Operations in Streaming.") - // ex.printStackTrace() - ssc.sparkContext.stop() - } - } - } - } catch { - case ex: Throwable => - // logger.error(s"ERROR In Streaming Window --> \n\n${ex.getStackTraceString}") - // ex.printStackTrace() - ssc.sparkContext.stop() - throw ex - } - dataStream.streamingContext.start() - dataStream.streamingContext.awaitTerminationOrTimeout(streamawaitTerminationOrTimeout) - dataStream.streamingContext.stop(false, true) - - // push to logger - logger.logMethodAccess(sparkSession.sparkContext.getConf.getAppId - , sparkSession.conf.get("spark.app.name") - , this.getClass.getName - , KafkaConstants.gimelAuditRunTypeStream - , yarnCluster - , user - , toLogFriendlyString(s"${yarnCluster}/${user}/${sparkAppName}") - , MethodName - , sql - , scala.collection.mutable.Map("sql" -> sql) - , GimelConstants.SUCCESS - , GimelConstants.EMPTY_STRING - , GimelConstants.EMPTY_STRING - ) - "Success" - } - - } - catch { - case e: Throwable => - - logger.logMethodAccess(sparkSession.sparkContext.getConf.getAppId - , sparkSession.conf.get("spark.app.name") - , this.getClass.getName - , KafkaConstants.gimelAuditRunTypeStream - , yarnCluster - , user - , toLogFriendlyString(s"${yarnCluster}/${user}/${sparkAppName}") - , MethodName - , sql - , scala.collection.mutable.Map("sql" -> sql) - , GimelConstants.FAILURE - , e.toString + "\n" + e.getStackTraceString - , GimelConstants.UNKNOWN_STRING - ) - - // throw error to console - logger.throwError(e.toString) - - throw e - } - - } - - /** - * Core Function that will be called from SCAAS for executing a SQL - * Executes the executeBatch function in streaming window - * - * @param sql SQL String from client - * @param sparkSession : SparkSession - * @return Resulting String < either sample data for select queries, or "success" / "failed" for insert queries - */ - - def executeStream2(sql: String, sparkSession: SparkSession): String = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - - logger.info(" @Begin --> " + MethodName) - logger.setSparkVersion(sparkSession.version) - val sparkAppName = sparkSession.conf.get("spark.app.name") - - // Set gimel log level and flag to audit logs to kafka - DataSetUtils.setGimelLogLevel(sparkSession, logger) - - // At Run Time - Set the Catalog Provider and The Name Space of the Catalog (like the Hive DB Name when catalog Provider = HIVE) - setCatalogProviderInfo(sparkSession) - - try { - - val options = queryUtils.getOptions(sparkSession)._2 - val batchInterval = options(KafkaConfigs.defaultBatchInterval).toInt - val triggerInterval = options.getOrElse(GimelConstants.GIMEL_STREAMING_TRIGGER_INTERVAL, "").toString - val isClearCheckPointEnabled = 
options(KafkaConfigs.kafkaConsumerClearCheckpointKey).toBoolean - val sc = sparkSession.sparkContext - val conf = new org.apache.spark.SparkConf() - val ssc = new StreamingContext(sc, Seconds(batchInterval)) - val listener: GimelStreamingListener = new GimelStreamingListener(conf) - ssc.addStreamingListener(listener) - val dataStream = DataStream2(sparkSession) - val sourceTables = getTablesFrom(sql) - val targetTable = getTargetTables(sql) - val kafkaTables = sourceTables.filter { table => - val dataSetType = DataSetUtils.getSystemType(table, sparkSession, options) - (dataSetType == DataSetType.KAFKA || dataSetType == DataSetType.KAFKA2) - } - if (kafkaTables.isEmpty) { - throw new Exception("ERROR --> No Kafka Type DataSet In the Query To Stream !") - } else { - val tmpKafkaTable = pCatalogStreamingKafkaTmpTableName - val selectSQL = getSelectClause(sql) - val newSQL = selectSQL.toLowerCase().replaceAll(kafkaTables.head, tmpKafkaTable) - val datasetProps = CatalogProvider.getDataSetProperties(kafkaTables.head, options) - /* - * Sets the appropriate deserializer class based on the kafka.message.value.type and value.serializer properties - * This is mainly required for backward compatibility for KAFKA datasets - */ - val newOptions = GimelSerdeUtils.setGimelDeserializer(sparkSession, datasetProps, options, true) - val streamingResult: StructuredStreamingResult = dataStream.read(kafkaTables.head, newOptions) - val streamingDF = streamingResult.df - streamingDF.createOrReplaceTempView(tmpKafkaTable) - - val streamingSQLDF = sparkSession.sql(newSQL) - var writer: StreamingQuery = null - try { - val datastreamWriter = targetTable match { - case Some(target) => - val datasetProps = CatalogProvider.getDataSetProperties(target, options) - /* - * Sets the appropriate serializer class based on the kafka.message.value.type and value.serializer properties - * This is mainly required for backward compatibility for KAFKA datasets - */ - val newOptions = GimelSerdeUtils.setGimelSerializer(sparkSession, datasetProps, options, true) - dataStream.write(target, streamingSQLDF, newOptions) - case _ => - streamingSQLDF - .writeStream - .outputMode("append") - .format("console") - } - - writer = if (triggerInterval.isEmpty) { - datastreamWriter.start() - } else { - datastreamWriter - .trigger(Trigger.ProcessingTime(triggerInterval + " seconds")) - .start() - } - - } catch { - case ex: Throwable => - // logger.error(s"ERROR In Streaming Window --> \n\n${ex.getStackTraceString}") - // ex.printStackTrace() - if (writer != null) { - writer.stop - } - throw ex - } - - // push to logger - logger.logMethodAccess(sparkSession.sparkContext.getConf.getAppId - , sparkSession.conf.get("spark.app.name") - , this.getClass.getName - , KafkaConstants.gimelAuditRunTypeStream - , yarnCluster - , user - , toLogFriendlyString(s"${yarnCluster}/${user}/${sparkAppName}") - , MethodName - , sql - , scala.collection.mutable.Map("sql" -> sql) - , GimelConstants.SUCCESS - , GimelConstants.EMPTY_STRING - , GimelConstants.EMPTY_STRING - ) - "Success" - } - - } - catch { - case e: Throwable => - - logger.logMethodAccess(sparkSession.sparkContext.getConf.getAppId - , sparkSession.conf.get("spark.app.name") - , this.getClass.getName - , KafkaConstants.gimelAuditRunTypeStream - , yarnCluster - , user - , toLogFriendlyString(s"${yarnCluster}/${user}/${sparkAppName}") - , MethodName - , sql - , scala.collection.mutable.Map("sql" -> sql) - , GimelConstants.FAILURE - , e.toString + "\n" + e.getStackTraceString - , GimelConstants.UNKNOWN_STRING - 
) - - // throw error to console - logger.throwError(e.toString) - - throw e - } - - } - - /** - * Core Function that will be called from SCAAS for executing a SQL - * - * @return RDD[Resulting String < either sample data for select queries, or "success" / "failed" for insert queries] - */ - def executeBatchSparkMagic: (String, SparkSession) => RDD[String] = executeBatchSparkMagicRDD - - /** - * Core Function that will be called from SCAAS for executing a SQL - * Executes the executeBatchSparkMagicRDD function in streaming window - * - * @return RDD[Resulting String] < either sample data for select queries, or "success" / "failed" for insert queries - */ - def executeStreamSparkMagic: (String, SparkSession) => RDD[String] = executeStreamSparkMagicRDD - - /** - * Core Function that will be called from SCAAS for executing a SQL - * - * @param sql SQL String supplied by client - * @param sparkSession : SparkSession - * @return RDD[Resulting String < either sample data for select queries, or "success" / "failed" for insert queries] - */ - def executeBatchSparkMagicRDD(sql: String, sparkSession: SparkSession): RDD[String] = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - - logger.info(" @Begin --> " + MethodName) - logger.setSparkVersion(sparkSession.version) - - // Set gimel log level and flag to audit logs to kafka - DataSetUtils.setGimelLogLevel(sparkSession, logger) - - val sparkAppName = sparkSession.conf.get("spark.app.name") - - // At Run Time - Set the Catalog Provider and The Name Space of the Catalog (like the Hive DB Name when catalog Provider = HIVE) - setCatalogProviderInfo(sparkSession) - - try { - - val options = queryUtils.getOptions(sparkSession)._2 - - var resultingRDD: RDD[String] = sparkSession.sparkContext.parallelize(Seq("")) - val queryTimer = Timer() - val startTime = queryTimer.start - val isCheckPointEnabled = options(KafkaConfigs.kafkaConsumerReadCheckpointKey).toBoolean - val isClearCheckPointEnabled = options(KafkaConfigs.kafkaConsumerClearCheckpointKey).toBoolean - logger.debug(s"Is CheckPointing Requested By User --> ${ - isCheckPointEnabled - }") - val dataSet: DataSet = DataSet(sparkSession) - val (originalSQL, destination, selectSQL, kafkaDataSets, queryPushDownFlag) = resolveSQL(sql, sparkSession, dataSet) - Try(executeResolvedQuerySparkMagic(originalSQL, destination, selectSQL, sparkSession, dataSet, queryPushDownFlag)) match { - case Success(result) => - resultingRDD = result - case Failure(e) => - resultingRDD = sparkSession.sparkContext.parallelize(Seq( - s"""{"Batch Query Error" : "${ - e.getStackTraceString - }" """)) - val resultMsg = resultingRDD.collect().mkString("\n") - // logger.error(resultMsg) - throw new Exception(resultMsg) - } - if (isCheckPointEnabled) { - saveCheckPointforKafka(kafkaDataSets) - } - if (isClearCheckPointEnabled) { - clearCheckPointforKafka(kafkaDataSets) - } - - // push logs to ES - logger.logMethodAccess(sparkSession.sparkContext.getConf.getAppId - , sparkSession.conf.get("spark.app.name") - , this.getClass.getName - , KafkaConstants.gimelAuditRunTypeStream - , yarnCluster - , user - , toLogFriendlyString(s"${yarnCluster}/${user}/${sparkAppName}") - , MethodName - , sql - , scala.collection.mutable.Map("sql" -> sql) - , GimelConstants.SUCCESS - , GimelConstants.EMPTY_STRING - , GimelConstants.EMPTY_STRING - ) - resultingRDD - } - catch { - case e: Throwable => - - logger.logMethodAccess(sparkSession.sparkContext.getConf.getAppId - , sparkSession.conf.get("spark.app.name") - , 
this.getClass.getName - , KafkaConstants.gimelAuditRunTypeStream - , yarnCluster - , user - , toLogFriendlyString(s"${yarnCluster}/${user}/${sparkAppName}") - , MethodName - , sql - , scala.collection.mutable.Map("sql" -> sql) - , GimelConstants.FAILURE - , e.toString + "\n" + e.getStackTraceString - , GimelConstants.UNKNOWN_STRING - ) - // throw error to console - logger.throwError(e.toString) - - throw e - } - } - - - def saveCheckPointforKafka(kafkaDataSets: List[GimelDataSet]): Unit = { - kafkaDataSets.foreach { - case kafka: com.paypal.gimel.kafka.DataSet => - kafka.saveCheckPoint() - case kafka2: com.paypal.gimel.kafka2.DataSet => - kafka2.saveCheckPoint() - } - - } - - - def clearCheckPointforKafka(kafkaDataSets: List[GimelDataSet]): Unit = { - kafkaDataSets.foreach { - case kafka: com.paypal.gimel.kafka.DataSet => - kafka.clearCheckPoint() - case kafka2: com.paypal.gimel.kafka2.DataSet => - kafka2.clearCheckPoint() - } - } - - /** - * Core Function that will be called from SCAAS for executing a SQL - * Executes the executeBatchSparkMagicRDD function in streaming window - * - * @param sql SQL String from client - * @param sparkSession : SparkSession - * @return RDD[Resulting String] < either sample data for select queries, or "success" / "failed" for insert queries - */ - - def executeStreamSparkMagicRDD(sql: String, sparkSession: SparkSession): RDD[String] = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - - logger.info(" @Begin --> " + MethodName) - logger.setSparkVersion(sparkSession.version) - - // Set gimel log level and flag to audit logs to kafka - DataSetUtils.setGimelLogLevel(sparkSession, logger) - - val sparkAppName = sparkSession.conf.get("spark.app.name") - - // At Run Time - Set the Catalog Provider and The Name Space of the Catalog (like the Hive DB Name when catalog Provider = HIVE) - setCatalogProviderInfo(sparkSession) - - try { - - sparkSession.conf.set(GimelConstants.GIMEL_KAFKA_VERSION, GimelConstants.GIMEL_KAFKA_VERSION_ONE) - val options = getOptions(sparkSession)._2 - - val batchInterval = options(KafkaConfigs.defaultBatchInterval).toInt - val streamRate = options(KafkaConfigs.maxRatePerPartitionKey) - val isBackPressureEnabled = options(KafkaConfigs.isBackPressureEnabledKey) - val isClearCheckPointEnabled = options(KafkaConfigs.kafkaConsumerClearCheckpointKey).toBoolean - val isSaveCheckPointEnabled = options(KafkaConfigs.kafkaConsumerReadCheckpointKey).toBoolean - val isStreamParallel = options(KafkaConfigs.isStreamParallelKey) - val sc = sparkSession.sparkContext - val sqlContext = sparkSession.sqlContext - val ssc = new StreamingContext(sc, Seconds(batchInterval)) - val listner: GimelStreamingListener = new GimelStreamingListener(sc.getConf) - ssc.addStreamingListener(listner) - ssc.sparkContext.getConf - .set(KafkaConfigs.isBackPressureEnabledKey, isBackPressureEnabled) - .set(KafkaConfigs.streamMaxRatePerPartitionKey, streamRate) - .set(KafkaConfigs.isStreamParallelKey, isStreamParallel) - val dataStream = DataStream(ssc) - val sourceTables = getTablesFrom(sql) - val kafkaTables = sourceTables.filter { table => - val dataSetProperties: DataSetProperties = - CatalogProvider.getDataSetProperties(table, options) - DataSetUtils.getSystemType(dataSetProperties) == DataSetType.KAFKA - } - val data = if (kafkaTables.isEmpty) { - throw new Exception("ERROR --> No Kafka Type DataSet In the Query To Stream !") - } else { - try { - val tmpKafkaTable = pCatalogStreamingKafkaTmpTableName - val newSQL = 
sql.replaceAll(kafkaTables.head, tmpKafkaTable) - val streamingResult: StreamingResult = dataStream.read(kafkaTables.head, options) - if (isClearCheckPointEnabled) streamingResult.clearCheckPoint("Clearing CheckPoint As Requested By User") - streamingResult.dStream.foreachRDD { - (rdd, time) => - printStats(time, listner) - val k: RDD[WrappedData] = rdd - val count = rdd.count() - if (count > 0) { - streamingResult.getCurrentCheckPoint(rdd) - streamingResult.getAsDF(sqlContext, rdd).registerTempTable(tmpKafkaTable) - try { - executeBatchSparkMagicRDD(newSQL, sparkSession) - } - catch { - case ex: Throwable => - // logger.error(s"Stream Query Failed in function : $MethodName. Error --> \n\n${ex.getStackTraceString}") - // ex.printStackTrace() - // logger.error("Force - Stopping Streaming Context") - ssc.sparkContext.stop() - } - try { - if (isSaveCheckPointEnabled) streamingResult.saveCurrentCheckPoint() - if (isClearCheckPointEnabled) streamingResult.clearCheckPoint("Clearing CheckPoint as Requested by User") - } - catch { - case ex: Throwable => - // logger.error("Error in CheckPoint Operations in Streaming.") - // ex.printStackTrace() - ssc.sparkContext.stop() - } - } - } - dataStream.streamingContext.start() - dataStream.streamingContext.awaitTermination() - dataStream.streamingContext.sparkContext.parallelize(Seq(s"""{"Query" : "Running..." }""")) - } catch { - case ex: Throwable => - ex.printStackTrace() - val msg = - s"""{"Error" : "${ - ex.getStackTraceString - }" }""" - dataStream.streamingContext.stop() - // dataStream.streamingContext.spark.parallelize(Seq(s"""{"Error" : "${ex.getStackTraceString}" }""")) - throw new Exception(msg) - } - } - - // push logs to ES - logger.logMethodAccess(sparkSession.sparkContext.getConf.getAppId - , sparkSession.conf.get("spark.app.name") - , this.getClass.getName - , KafkaConstants.gimelAuditRunTypeStream - , yarnCluster - , user - , toLogFriendlyString(s"${yarnCluster}/${user}/${sparkAppName}") - , MethodName - , sql - , scala.collection.mutable.Map("sql" -> sql) - , GimelConstants.SUCCESS - , GimelConstants.EMPTY_STRING - , GimelConstants.EMPTY_STRING - ) - - data - } - catch { - case e: Throwable => - - logger.logMethodAccess(sparkSession.sparkContext.getConf.getAppId - , sparkSession.conf.get("spark.app.name") - , this.getClass.getName - , KafkaConstants.gimelAuditRunTypeStream - , yarnCluster - , user - , toLogFriendlyString(s"${yarnCluster}/${user}/${sparkAppName}") - , MethodName - , sql - , scala.collection.mutable.Map("sql" -> sql) - , GimelConstants.FAILURE - , e.toString + "\n" + e.getStackTraceString - , GimelConstants.UNKNOWN_STRING - ) - - // throw error to console - logger.throwError(e.toString) - - throw e - } - - } - - /** - * Core Function that will be called from SCAAS for executing a SQL - * - * @param sql SQL String supplied by client - * @param sparkSession : SparkSession - * @return Resulting String < either sample data for select queries, or "success" / "failed" for insert queries - */ - - @deprecated - def executeBatchSparkMagicJSON(sql: String, sparkSession: SparkSession): String = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - - logger.info(" @Begin --> " + MethodName) - val sparkAppName = sparkSession.conf.get("spark.app.name") - - // Set gimel log level and flag to audit logs to kafka - DataSetUtils.setGimelLogLevel(sparkSession, logger) - - try { - val options = queryUtils.getOptions(sparkSession)._2 - var resultSet = "" - val queryTimer = Timer() - val startTime = 
queryTimer.start - val isCheckPointEnabled = options(KafkaConfigs.kafkaConsumerReadCheckpointKey).toBoolean - val isClearCheckPointEnabled = options(KafkaConfigs.kafkaConsumerClearCheckpointKey).toBoolean - logger.debug(s"Is CheckPointing Requested By User --> ${ - isCheckPointEnabled - }") - val dataSet: DataSet = DataSet(sparkSession) - val (originalSQL, destination, selectSQL, kafkaDataSets, queryPushDownFlag) = resolveSQL(sql, sparkSession, dataSet) - Try(executeResolvedQuerySparkMagic(originalSQL, destination, selectSQL, sparkSession, dataSet, queryPushDownFlag)) match { - case Success(result) => - resultSet = - s"""{"Batch Query Result" : "${ - result.collect().mkString("[", ",", "]") - } }""" - case Failure(e) => - resultSet = - s"""{"Batch Query Error" : "${ - e.getStackTraceString - }" """ - // logger.error(resultSet) - throw new Exception(resultSet) - } - - if (isCheckPointEnabled) { - saveCheckPointforKafka(kafkaDataSets) - } - if (isClearCheckPointEnabled) { - clearCheckPointforKafka(kafkaDataSets) - } - - // push logs to ES - logger.logMethodAccess(sparkSession.sparkContext.getConf.getAppId - , sparkSession.conf.get("spark.app.name") - , this.getClass.getName - , KafkaConstants.gimelAuditRunTypeStream - , yarnCluster - , user - , toLogFriendlyString(s"${yarnCluster}/${user}/${sparkAppName}") - , MethodName - , sql - , scala.collection.mutable.Map("sql" -> sql) - , GimelConstants.SUCCESS - , GimelConstants.EMPTY_STRING - , GimelConstants.EMPTY_STRING - ) - resultSet - } - catch { - case e: Throwable => - - logger.logMethodAccess(sparkSession.sparkContext.getConf.getAppId - , sparkSession.conf.get("spark.app.name") - , this.getClass.getName - , KafkaConstants.gimelAuditRunTypeStream - , yarnCluster - , user - , toLogFriendlyString(s"${yarnCluster}/${user}/${sparkAppName}") - , MethodName - , sql - , scala.collection.mutable.Map("sql" -> sql) - , GimelConstants.FAILURE - , e.toString + "\n" + e.getStackTraceString - , GimelConstants.UNKNOWN_STRING - ) - - // throw error to console - logger.throwError(e.toString) - - throw e - } - - } - - /** - * Core Function that will be called from SCAAS for executing a SQL - * Executes the @executeBatchSparkMagicJSON function in streaming window - * - * @param sql SQL String from client - * @param sparkSession : SparkSession - * @return Resulting String < either sample data for select queries, or "success" / "failed" for insert queries - */ - - @deprecated - def executeStreamSparkMagicJSON(sql: String, sparkSession: SparkSession): String = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - - logger.info(" @Begin --> " + MethodName) - val sparkAppName = sparkSession.conf.get("spark.app.name") - var returnMsg = "" - - // Set gimel log level and flag to audit logs to kafka - DataSetUtils.setGimelLogLevel(sparkSession, logger) - - try { - sparkSession.conf.set(GimelConstants.GIMEL_KAFKA_VERSION, GimelConstants.GIMEL_KAFKA_VERSION_ONE) - val options = queryUtils.getOptions(sparkSession)._2 - val batchInterval = options(KafkaConfigs.defaultBatchInterval).toInt - val streamRate = options(KafkaConfigs.maxRatePerPartitionKey) - val isBackPressureEnabled = options(KafkaConfigs.isBackPressureEnabledKey) - val isClearCheckPointEnabled = options(KafkaConfigs.kafkaConsumerClearCheckpointKey).toBoolean - val isSaveCheckPointEnabled = options(KafkaConfigs.kafkaConsumerReadCheckpointKey).toBoolean - val isStreamParallel = options(KafkaConfigs.isStreamParallelKey) - val streamParallels = options(KafkaConfigs.streamParallelKey) - 
val sc = sparkSession.sparkContext - val sqlContext = sparkSession.sqlContext - val ssc = new StreamingContext(sc, Seconds(batchInterval)) - logger.debug( - s""" - |isStreamParallel --> ${ - isStreamParallel - } - |streamParallels --> ${ - streamParallels - } - """.stripMargin) - ssc.sparkContext.getConf - .set(KafkaConfigs.isBackPressureEnabledKey, isBackPressureEnabled) - .set(KafkaConfigs.streamMaxRatePerPartitionKey, streamRate) - .set(KafkaConfigs.isStreamParallelKey, isStreamParallel) - .set(KafkaConfigs.streamParallelKey, streamParallels) - val dataStream = DataStream(ssc) - val sourceTables = getTablesFrom(sql) - val kafkaTables = sourceTables.filter { table => - val dataSetProperties: DataSetProperties = - CatalogProvider.getDataSetProperties(table, options) - DataSetUtils.getSystemType(dataSetProperties) == DataSetType.KAFKA - } - if (kafkaTables.isEmpty) { - throw new Exception("ERROR --> No Kafka Type DataSet In the Query To Stream !") - } else { - val tmpKafkaTable = pCatalogStreamingKafkaTmpTableName - val newSQL = sql.replaceAll(kafkaTables.head, tmpKafkaTable) - val streamingResult: StreamingResult = dataStream.read(kafkaTables.head, options) - if (isClearCheckPointEnabled) streamingResult.clearCheckPoint("Clearing CheckPoint As Requested By User") - try { - streamingResult.dStream.foreachRDD { - rdd => - val k: RDD[WrappedData] = rdd - val count = rdd.count() - if (count > 0) { - streamingResult.getCurrentCheckPoint(rdd) - streamingResult.convertAvroToDF(sqlContext, streamingResult.convertBytesToAvro(rdd)).registerTempTable(tmpKafkaTable) - try { - executeBatchSparkMagicJSON(newSQL, sparkSession) - if (isSaveCheckPointEnabled) streamingResult.saveCurrentCheckPoint() - if (isClearCheckPointEnabled) streamingResult.clearCheckPoint("Clearing CheckPoint as Requested by User") - } catch { - case ex: Throwable => - returnMsg = - s"""{ "Stream Query Error" : "${ - ex.getStackTraceString - }" } """ - // logger.error(returnMsg) - // ex.printStackTrace() - logger.warning("Force - Stopping Streaming Context") - ssc.sparkContext.stop() - throw new Exception(returnMsg) - } - } - } - } catch { - case ex: Throwable => - returnMsg = - s"""{ "Stream Query ERROR" : "${ - ex.getStackTraceString - }" } """ - // logger.error(returnMsg) - // ex.printStackTrace() - logger.warning("Force - Stopping Streaming Context") - ssc.sparkContext.stop() - throw new Exception(returnMsg) - } - dataStream.streamingContext.start() - dataStream.streamingContext.awaitTermination() - dataStream.streamingContext.stop() - returnMsg = s"""{"Stream Query" : "SUCCESS"} """ - } - - // push logs to ES - logger.logMethodAccess(sparkSession.sparkContext.getConf.getAppId - , sparkSession.conf.get("spark.app.name") - , this.getClass.getName - , KafkaConstants.gimelAuditRunTypeStream - , yarnCluster - , user - , toLogFriendlyString(s"${yarnCluster}/${user}/${sparkAppName}") - , MethodName - , sql - , scala.collection.mutable.Map("sql" -> sql) - , GimelConstants.SUCCESS - , GimelConstants.EMPTY_STRING - , GimelConstants.EMPTY_STRING - ) - - returnMsg - } - catch { - case e: Throwable => - - logger.logMethodAccess(sparkSession.sparkContext.getConf.getAppId - , sparkSession.conf.get("spark.app.name") - , this.getClass.getName - , KafkaConstants.gimelAuditRunTypeStream - , yarnCluster - , user - , toLogFriendlyString(s"${yarnCluster}/${user}/${sparkAppName}") - , MethodName - , sql - , scala.collection.mutable.Map("sql" -> sql) - , GimelConstants.FAILURE - , e.toString + "\n" + e.getStackTraceString - , 
GimelConstants.UNKNOWN_STRING - ) - - // throw error to console - logger.throwError(e.toString) - - throw e - } - - } - - private def toLogFriendlyString(str: String): String = { - str.replaceAllLiterally("/", "_").replaceAllLiterally(" ", "-") - } - - /** - * handleDDLs will direct to respective data set create/drop/truncate based on the incoming DDL - * - * @param sql - SQL that is passed to create/drop/delete - * @param sparkSession - spark session - * @param dataSet - dataset name - * @param options - List of options - * @return - */ - def handleDDLs(sql: String, sparkSession: SparkSession, dataSet: DataSet, options: Map[String, String]): Unit = { - val uniformSQL = sql.replace("\n", " ") - val sqlParts: Array[String] = uniformSQL.split(" ") - // remove all additional white spaces in the DDL statment - val newSql = sqlParts.filter(x => !x.isEmpty).mkString(" ") - val newSqlParts = newSql.split(" ") - sqlParts.head.toUpperCase match { - // We have two "create ddl" paths. One with full create (plain) statement provided by the user - // the other where we have to construct from the dataframe after running select clause in given sql/ddl - // create table db.tablename(x int, y varchar(10) will be handled by handlePlainCreateDDL funcation - // create table db.tablename tblproperties("table_type":"SET") as select * from another_table. - case QueryConstants.DDL_CREATE_STRING => { - val index = sqlParts.indexWhere(_.toLowerCase().contains(GimelConstants.UDC_STRING)) - // Find out whether select is part of the create statement - val isHavingSelect = QueryParserUtils.isHavingSelect(sql) - isHavingSelect match { - case true => handleSelectDDL(newSqlParts, newSql, dataSet, options, sparkSession) - case false => handlePlainCreateDDL(newSqlParts, dataSet, options, sparkSession) - } - } - // following case will cover DROP DDL - case QueryConstants.DDL_DROP_STRING => { - val dataSetName = newSqlParts(2) - dataSet.drop(dataSetName, options) - } - // following case will cover TRUNCATE DDL - case QueryConstants.DDL_TRUNCATE_STRING => { - val dataSetName = newSqlParts(2) - dataSet.truncate(dataSetName, options) - } - // following case will cover both DELETE AND DELETE FROM DDL - case QueryConstants.DDL_DELETE_STRING => { - val dataSetName = newSqlParts.map(_.toUpperCase()).contains(QueryConstants.DDL_FROM_STRING) match { - case true => newSqlParts(2) - case _ => newSqlParts(1) - } - dataSet.truncate(dataSetName, options) - } - case _ => throw new Exception("Unexpected path at runtime. 
We should not arrive at this location !") - } - } - - /** - * handleSelectDDL - - * Strip out the the select statement - * Run the sql using executeBatch and get the data frame back - * Get the schema from data frame and pass it in options - * Strip out the table properties and pass it in options - * Create the object/table - * Call dataSet.Write to the object/table that got created - * - * @param sqlParts - each word in the sql comes as array - * @param sql - the full sql query - * @param dataSet - dataset Object itself - * @param options - options comings from user - * @param sparkSession - Spark session - * @return - */ - def handleSelectDDL(sqlParts: Array[String], sql: String, dataSet: DataSet, options: Map[String, String], sparkSession: SparkSession): Unit = { - val selectIndex = sqlParts.indexWhere(_.toUpperCase().contains(QueryConstants.SQL_SELECT_STRING)) - val selectClause = sqlParts.slice(selectIndex, sqlParts.length).mkString(" ") - val pcatalogIndex = sqlParts.indexWhere(_.toLowerCase().contains(GimelConstants.UDC_STRING)) - val datasetname = sqlParts(pcatalogIndex) - - // Run the Select statement and get the results in a dataframe - val selectDF = executeBatch(selectClause, sparkSession) - val schema: Array[StructField] = selectDF.schema.fields - - // Check if 'PARTITIONED' clause present in the sql. If so we want to get the partitioned fileds so that we will use it during creation of the table when building CREATE TABLE statement. - val partitionFields: Array[com.paypal.gimel.common.catalog.Field] = existsPartitionedByClause(sql) match { - case true => getPartitionsFields(sql) - case _ => Array[com.paypal.gimel.common.catalog.Field]() - } - - val newOptions: Map[String, Any] = options ++ Map[String, Any](GimelConstants.TABLE_FILEDS -> schema, GimelConstants.CREATE_STATEMENT_IS_PROVIDED -> "false", GimelConstants.TABLE_SQL -> sql, GimelConstants.HIVE_DDL_PARTITIONS_STR -> partitionFields) - - // Create the table and Write data into it from the selected dataframe - try { - dataSet.create(datasetname, newOptions) - logger.info("Table/object creation success") - dataSet.write(datasetname, selectDF, newOptions) - } catch { - case e: Throwable => - val msg = s"Error creating/writing table: ${e.getMessage}" - throw new Exception(msg, e) - } - } - - def handlePlainCreateDDL(sqlParts: Array[String], dataSet: DataSet, options: Map[String, String], sparkSession: SparkSession): Unit = { - - // Since select is not part of create statement it has to be full create statement - // We need to replace the pcatalog.storagetype.storagesystem.DB.Table with DB.Table - // So that we can pass the entire create statement as is to respective storage engines - val index = sqlParts.indexWhere(_.toLowerCase().contains(GimelConstants.UDC_STRING)) - - val datasetname = sqlParts(index) - val newSQL = sqlParts.map(element => { - if (element.toLowerCase().contains(GimelConstants.UDC_STRING + ".")) { - // we replace pcatalog.storagetype.storagesystem.DB.Table with DB.Table - element.split('.').tail.mkString(".").split('.').tail.mkString(".").split('.').tail.mkString(".") - } - else { - element - } - } - ).mkString(" ") - val newOptions = options ++ Map[String, String](GimelConstants.TABLE_SQL -> newSQL.toString, GimelConstants.CREATE_STATEMENT_IS_PROVIDED -> "true") - dataSet.create(datasetname, newOptions) - - } - - /** booltoDF will convert the boolean result to a dataframe - * - * @param spark - sparksessionboolToDF - * @param result - boolean return from the create/drop/truncate methods - * @return - */ - 
def boolToDFWithErrorString(spark: SparkSession, result: Boolean, addOnString: String): DataFrame = { - val resultStr = if (result) "success" else "failure" - import spark.implicits._ - result match { - case false => throw new Exception(s"${addOnString}\n") - case _ => Seq(resultStr).toDF("Query Execution") - } - } - - /** booltoDF will convert the boolean result to a dataframe - * - * @param spark - sparksession - * @param result - boolean return from the create/drop/truncate methods - * @return - */ - def boolToDF(spark: SparkSession, result: Boolean): DataFrame = { - val resultStr = if (result) "success" else "failure" - import spark.implicits._ - Seq(resultStr).toDF("Query Execution") - } - - /** stringToDF will convert the string result to a dataframe - * - * @param spark - sparksession - * @param result - boolean return from the create/drop/truncate methods - * @return - */ - def stringToDF(spark: SparkSession, result: String): DataFrame = { - import spark.implicits._ - Seq(result).toDF("Query Execution") - } - - /** - * From the create table SQL, parse the partitioned by clause and get all the partitions - * - * @param sql - Incoming sql - * @return - Array of Fields which has partition column name with data type hard coded as String for now as it is not going to be used elsewhere - */ - def getPartitionsFields(sql: String): Array[com.paypal.gimel.common.catalog.Field] = { - val pattern = """^.+PARTITIONED BY \((.*?)\).+""".r - val pattern(partitions) = sql.toUpperCase() - var fieldsList: Array[com.paypal.gimel.common.catalog.Field] = Array[com.paypal.gimel.common.catalog.Field]() - val listParts = partitions.split(",") - listParts.map(parts => fieldsList :+= com.paypal.gimel.common.catalog.Field(parts, "String")) - fieldsList - } - - - /** - * - * Method to check in special checks in SQL string - * - * @param sql - * @return - */ - def vulnerabilityCheck(sql: String): Unit = { - - val checkFlag = if (sql.toUpperCase.contains(s"SET ${JdbcConfigs.jdbcUserName}".toUpperCase)) { - true - } - else { - false - } - - if (checkFlag) { - throw new Exception( - s""" - |SECURITY VIOLATION | Execution of this statement is not allowed: ${sql} - """.stripMargin) - } - } - -} - diff --git a/gimel-dataapi/gimel-sql/src/main/scala/com/paypal/gimel/sql/GimelQueryUtils.scala b/gimel-dataapi/gimel-sql/src/main/scala/com/paypal/gimel/sql/GimelQueryUtils.scala deleted file mode 100644 index 2a9e8147..00000000 --- a/gimel-dataapi/gimel-sql/src/main/scala/com/paypal/gimel/sql/GimelQueryUtils.scala +++ /dev/null @@ -1,1742 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.paypal.gimel.sql - -import java.nio.charset.StandardCharsets -import java.sql.SQLException -import java.text.SimpleDateFormat -import java.util.Date - -import scala.collection.immutable.Map -import scala.collection.mutable -import scala.util.{Failure, Success, Try} - -import com.google.common.hash.Hashing -import org.apache.commons.lang3.ArrayUtils -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{DataFrame, Row, SparkSession} -import org.apache.spark.streaming.Time - -import com.paypal.gimel.common.catalog.{CatalogProvider, DataSetProperties} -import com.paypal.gimel.common.conf.{GimelConstants, _} -import com.paypal.gimel.common.gimelserde.GimelSerdeUtils -import com.paypal.gimel.common.utilities.{DataSetType, DataSetUtils, GenericUtils, RandomGenerator} -import com.paypal.gimel.common.utilities.DataSetUtils._ -import com.paypal.gimel.datasetfactory.GimelDataSet -import com.paypal.gimel.elasticsearch.conf.ElasticSearchConfigs -import com.paypal.gimel.hbase.conf.HbaseConfigs -import com.paypal.gimel.hbase.utilities.HBaseUtilities -import com.paypal.gimel.hive.conf.HiveConfigs -import com.paypal.gimel.hive.utilities.HiveUtils -import com.paypal.gimel.jdbc.conf.{JdbcConfigs, JdbcConstants} -import com.paypal.gimel.jdbc.utilities._ -import com.paypal.gimel.jdbc.utilities.JdbcAuxiliaryUtilities._ -import com.paypal.gimel.jdbc.utilities.PartitionUtils.ConnectionDetails -import com.paypal.gimel.kafka.conf.{KafkaConfigs, KafkaConstants} -import com.paypal.gimel.logger.Logger -import com.paypal.gimel.logging.GimelStreamingListener -import com.paypal.gimel.parser.utilities.{QueryParserUtils, SearchCriteria, SearchSchemaUtils, SQLNonANSIJoinParser} - -object GimelQueryUtils { - - val logger: Logger = Logger(this.getClass.getName) - /* - * Regex for substituting tmp table in a sql. - * This regex matches if key is preceded by any whitespace character - new line, tab, space - * and followed by (new line, tab, space, round brackets, semi colon and comma) or is at end of line ($$) - * ? 
inorder to get from all types of SQL pass searchlist - * to be List("into", "view", "table", "from", "join") - * @return Seq[Tables] - */ - - def getAllTableSources(sql: String, - searchList: Seq[SearchCriteria] = SearchSchemaUtils.ALL_TABLES_SEARCH_CRITERIA): Seq[String] = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - logger.info(" @Begin --> " + MethodName) - val finalList = QueryParserUtils.getAllSourceTables(sql, searchList) - logger.info(s"Final List of Tables --> ${finalList.mkString("[", " , ", "]")}") - finalList - } - - /** - * Sets the Catalog Provider - * - * @param provider Catalog Provider, say - UDC , PCATALOG , HIVE , USER - */ - def setCatalogProvider(provider: String): Unit = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - logger.info(" @Begin --> " + MethodName) - - logger.info(s"Supplied catalog provider --> [$provider]") - provider.toUpperCase() match { - case CatalogProviderConstants.HIVE_PROVIDER | CatalogProviderConstants.USER_PROVIDER => - catalogProvider = provider - case CatalogProviderConstants.PCATALOG_PROVIDER => - logger.warning(" ************************* WARNING ************************* ") - logger.warning(s"DEPRECATED Catalog Provider --> [${CatalogProviderConstants.PCATALOG_PROVIDER}]") - logger.warning(s"Please migrate to Catalog Provider --> [${CatalogProviderConstants.UDC_PROVIDER}]") - logger.warning(" ************************* WARNING ************************* ") - catalogProvider = provider - logger.info(s"Auto-Setting catalog provider Namespace to --> [${provider.toUpperCase}]") - setCatalogProviderName(provider.toUpperCase) - case CatalogProviderConstants.UDC_PROVIDER => - logger.info(s"Auto-Setting catalog provider Namespace to --> [${provider.toUpperCase}]") - catalogProvider = provider - setCatalogProviderName(provider.toUpperCase) - case _ => logger.warning( - s""" - |Invalid Catalog Provider --> [${provider}] - |Valid Options --> [ ${CatalogProviderConstants.HIVE_PROVIDER}| ${CatalogProviderConstants.UDC_PROVIDER}| ${CatalogProviderConstants.PCATALOG_PROVIDER}| ${CatalogProviderConstants.USER_PROVIDER} ] - """.stripMargin - ) - } - } - - /** - * Client Function to Get Catalog Provider - * - * @return The Catalog Provider - */ - def getCatalogProvider(): String = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - - logger.info(" @Begin --> " + MethodName) - - catalogProvider - } - - /** - * Sets the Catalog Provider Name - * - * @param providerNameSpace Catalog Provider, say - default, pcatalog, udc, any_other_hive_db_name - */ - - def setCatalogProviderName(providerNameSpace: String): Unit = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - - logger.info(" @Begin --> " + MethodName) - - val catalogProvider = getCatalogProvider - if (catalogProvider.equalsIgnoreCase(CatalogProviderConstants.HIVE_PROVIDER) | - catalogProvider.equalsIgnoreCase(CatalogProviderConstants.USER_PROVIDER)) { - logger.info(s"setting catalog provider Name to --> [$providerNameSpace]") - catalogProviderNameSpace = providerNameSpace - } - else catalogProviderNameSpace = catalogProvider.toLowerCase() - } - - /** - * Client Function to Get Catalog Provider Name - * - * @return The Catalog Provider Name Space, say the hive DB name - */ - - def getCatalogProviderName(): String = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - - logger.info(" @Begin --> " + MethodName) - - catalogProviderNameSpace - } - 
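// A minimal usage sketch for the catalog-provider accessors above; it is illustrative
// only and not part of the original file. It assumes GimelQueryUtils and
// CatalogProviderConstants are on the classpath, and the values shown in the comments are
// only what the code above implies (the UDC provider lower-cases to "udc"); the Hive
// database name "default" is an arbitrary example.
object CatalogProviderUsageSketch {
  import com.paypal.gimel.common.conf.CatalogProviderConstants
  import com.paypal.gimel.sql.GimelQueryUtils

  def main(args: Array[String]): Unit = {
    // UDC: setCatalogProvider also auto-sets the namespace to the lower-cased provider.
    GimelQueryUtils.setCatalogProvider(CatalogProviderConstants.UDC_PROVIDER)
    println(GimelQueryUtils.getCatalogProvider())     // the UDC provider string
    println(GimelQueryUtils.getCatalogProviderName()) // "udc"

    // HIVE: the namespace is the Hive DB name and has to be supplied separately.
    GimelQueryUtils.setCatalogProvider(CatalogProviderConstants.HIVE_PROVIDER)
    GimelQueryUtils.setCatalogProviderName("default")
    println(GimelQueryUtils.getCatalogProviderName()) // "default"
  }
}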
- /** - * Returns the individual works from the SQL as tokens - * - * @param sql SqlString - * @return String tokens - */ - def tokenizeSql(sql: String): Array[String] = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - logger.info(" @Begin --> " + MethodName) - QueryParserUtils.tokenize(sql) - } - - /** - * This search will return true if the hive query has a partitioning criteria in it. - * - * @param sql SqlString - * @return true, if query contains ; insert into partitions of target table. - */ - def isQueryContainingPartitioning(sql: String): Boolean = { - val tokens = tokenizeSql(sql.toLowerCase) - var isHaving = false - var tmp = "" - tokens.foreach { token => - if ((tmp == "partition" & token == "(") || tmp.contains("partition(")) isHaving = true - tmp = token - } - isHaving - } - - /** - * Gets the Tables List from SQL - * - * @param sql SQL String - * @return List of Tables - */ - def getTablesFrom(sql: String): Array[String] = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - - logger.info(" @Begin --> " + MethodName) - val otherCatalogProvider = getCatalogProviderName().toLowerCase match { - case GimelConstants.UDC_STRING => GimelConstants.PCATALOG_STRING - case GimelConstants.PCATALOG_STRING => GimelConstants.UDC_STRING - case _ => "hive" - - } - val allTables = getAllTableSources(sql) - val finalList = allTables.filter( - token => - token.toLowerCase.contains(s"${getCatalogProviderName().toLowerCase}.") || - token.toLowerCase.contains(s"$otherCatalogProvider.") - ) - logger.info(s"Source Catalog [udc/pcatalog] Tables from entire SQL --> ${finalList.mkString("[", " , ", "]")}") - finalList.toArray - } - - /** - * Gets the Tables List from SQL - * - * @param sql SQL String - * @return List of Tables - */ - @deprecated - def getTablesFrom1(sql: String): Array[String] = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - - logger.info(" @Begin --> " + MethodName) - val sqlLower = sql.toLowerCase - val searchList = List("insert", "select", "from", "join", "where") - var lastKey = if (searchList.contains(tokenizeSql(sqlLower).head)) { - tokenizeSql(sqlLower).head - } else { - "" - } - var currentKey = "" - val catalogProviderNameSpace = getCatalogProviderName.toLowerCase - // logger.info(s"catalogProviderNameSpace is --> [${catalogProviderNameSpace}]") - var catalogTables = List[String]() - val otherCatalogProvider = getCatalogProviderName.toLowerCase match { - case GimelConstants.UDC_STRING => GimelConstants.PCATALOG_STRING - case GimelConstants.PCATALOG_STRING => GimelConstants.UDC_STRING - case _ => "hive" - - } - // Pick each catalog.table only if its appearing at specific places in the SQL String - // This guard necessary if someone uses "catalog" as an alias, example - udc or pcatalog - tokenizeSql(sqlLower).tail.foreach { - token => - - currentKey = if (searchList.contains(token)) token else currentKey - val pickCriteriaMet = token.toLowerCase.contains(s"${getCatalogProviderName.toLowerCase}.") || - token.toLowerCase.contains(s"${otherCatalogProvider}.") - - if (pickCriteriaMet) { - if (lastKey == "from" & !(currentKey == "select")) catalogTables ++= List(token) - if (lastKey == "join" & !(currentKey == "select")) catalogTables ++= List(token) - } - lastKey = if (searchList.contains(token)) currentKey else lastKey - currentKey = "" - } - - val nonANSIJoinTables: Seq[String] = SQLNonANSIJoinParser.getSourceTablesFromNonAnsi(sql) - val nonANSIJoinTablesCatalog = 
nonANSIJoinTables.filter( - token => - token.toLowerCase.contains(s"${getCatalogProviderName.toLowerCase}.") || - token.toLowerCase.contains(s"${otherCatalogProvider}.") - ) - val finalList = (catalogTables.toArray ++ nonANSIJoinTablesCatalog).distinct - logger.info(s"Source Tables from Non-ANSI Join --> ${nonANSIJoinTables.mkString("[", " , ", "]")}") - logger.info(s"Source Catalog Tables from Non-ANSI Join --> ${nonANSIJoinTablesCatalog.mkString("[", " , ", "]")}") - logger.info(s"Source Catalog Tables from ANSI Join --> ${catalogTables.mkString("[", " , ", "]")}") - logger.info(s"Source Catalog Tables from entire SQL --> ${finalList.mkString("[", " , ", "]")}") - finalList - } - - - /** - * Prints Stats for Streaming Batch Window - * - * @param time Time Object - Spark Streaming - * @param listener GIMEL Streaming Listener - */ - def printStats(time: Time, listener: GimelStreamingListener): Unit = { - val batchTimeMS = time.milliseconds.toString - val batchDate = new Date(batchTimeMS.toLong) - val df = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss") - val batchTime = df.format(batchDate) - logger.info(s"Current Batch ID --> $time | $batchTime | $batchDate") - logger.info( - s"""|----------------------------------------------------------------------- - |Batch ID --> - |----------------------------------------------------------------------- - |time : $time - |batchTimeMS : $batchTimeMS - |batchTime : $batchTime - |----------------------------------------------------------------------- - |Listener Metrics --> - |----------------------------------------------------------------------- - |appProcessingDelay : ${listener.appProcessingDelay} - |appSchedulingDelay : ${listener.appSchedulingDelay} - |appTotalDelay : ${listener.appTotalDelay} - |processingDelay : ${listener.processingDelay} - |schedulingDelay : ${listener.schedulingDelay} - |totalDelay : ${listener.totalDelay} - |----------------------------------------------------------------------- - |""".stripMargin) - } - - /** - * Cache the DataSet (lazily) if its configured to be cached - by user in properties. 
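 * In other words (mirroring the boolean check in the body below): the DataFrame is cached only
 * when GimelConstants.DATA_CACHE_IS_ENABLED is "true" and, in addition, either the per-dataset
 * flag "<GimelConstants.DATA_CACHE_IS_ENABLED>.for.<dataSetName>" or
 * GimelConstants.DATA_CACHE_IS_ENABLED_FOR_ALL is "true".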
- * - * @param df DataFrame - * @param dataSetName DataSetName representing the DataFrame - * @param options Props - */ - def cacheIfRequested(df: DataFrame, dataSetName: String, options: Map[String, String]): Unit = { - val isCachingEnabled = ( - options.getOrElse(GimelConstants.DATA_CACHE_IS_ENABLED, "false").toBoolean - && ( - options.getOrElse(s"${GimelConstants.DATA_CACHE_IS_ENABLED}.for.$dataSetName", "false").toBoolean - || options.getOrElse(s"${GimelConstants.DATA_CACHE_IS_ENABLED_FOR_ALL}", "false").toBoolean - ) - ) - if (isCachingEnabled) df.cache() - } - - private def mergeAllConfs(sparkSession: SparkSession): Map[String, String] = { - sparkSession.conf.getAll ++ Map(CatalogProviderConfigs.CATALOG_PROVIDER -> sparkSession.conf.get( - CatalogProviderConfigs.CATALOG_PROVIDER, CatalogProviderConstants.PRIMARY_CATALOG_PROVIDER) - ) - } - - /** - * Resolves the Query by replacing Tmp Tables in the Query String - * For Each Tmp Table placed in the Query String - a DataSet.read is initiated - * For each Tmp Table - if the dataset is a Kafka DataSet - then each KafkaDataSet object is accumulated - * Accumulated KafkaDataSet Object will be used towards the end of the Query (on success) - - * to call check pointing for each topic consumed - * - * @param originalSQL SQLString - * @param selectSQL SQLString - * @param sparkSession : SparkSession - * @param dataSet Dataset Object - * @return Tuple of (Resolved Original SQL, Resolved Select SQL, List of (KafkaDataSet) - */ - def resolveSQLWithTmpTables(originalSQL: String, selectSQL: String, sparkSession: SparkSession, - dataSet: com.paypal.gimel.DataSet): (String, String, List[GimelDataSet], String) = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - - logger.info(" @Begin --> " + MethodName) - - // get queryPushDown flag - val queryPushDownFlag = getQueryPushDownFlag(originalSQL, selectSQL, sparkSession, dataSet) - - var kafkaDataSets: List[GimelDataSet] = List() - var sqlTmpString = selectSQL - var sqlOriginalString = originalSQL - val pCatalogTablesToReplaceAsTmpTable: Map[String, String] = getTablesFrom(selectSQL).map { - eachSource => - val options = getOptions(sparkSession)._2 - // Create a random string with random length for tmp table - val randomString = RandomGenerator.getRandomString( - RandomGenerator.getRandomInt(GimelConstants.GSQL_TMP_TABLE_RANDOM_GENERATOR_MIN, - GimelConstants.GSQL_TMP_TABLE_RANDOM_GENERATOR_MAX)) - val tmpTableName = "tmp_" + eachSource.replaceAll("[^\\w\\s]", "_") + "_" + randomString - - // do DataSet.read() only if queryPushDownFlag is set to "false" - queryPushDownFlag match { - case "false" => - logger.info(s"Setting transformation dataset.read for ${eachSource}") - logger.info("printing all options during read" + options.toString()) - val datasetProps = CatalogProvider.getDataSetProperties(eachSource, options) - /* - * Sets the appropriate deserializer class based on the kafka.message.value.type and value.serializer properties - * This is mainly required for backward compatibility for KAFKA datasets - */ - val newOptions = GimelSerdeUtils.setGimelDeserializer(sparkSession, datasetProps, options) - val df = dataSet.read(eachSource, newOptions) - cacheIfRequested(df, eachSource, newOptions) - df.createOrReplaceTempView(tmpTableName) - case _ => - // do nothing if query pushdown is true. 
No need to do dataset.read - } - - if (dataSet.latestKafkaDataSetReader.isDefined) { - logger.info(s"@$MethodName | Added Kafka Reader for Source --> $eachSource") - kafkaDataSets = kafkaDataSets ++ List(dataSet.latestKafkaDataSetReader.get) - } - (eachSource, tmpTableName) - }.toMap - - // replacing the dataset names with original tables names if queryPushDown is "true" - queryPushDownFlag match { - case "true" => - logger.info("PATH IS -> QUERY PUSH DOWN") - pCatalogTablesToReplaceAsTmpTable.foreach { kv => - val resolvedSourceTable = resolveDataSetName(kv._1) - val dataSetProperties: DataSetProperties = - CatalogProvider.getDataSetProperties(resolvedSourceTable, mergeAllConfs(sparkSession)) - val hiveTableParams = dataSetProperties.props - val jdbcTableName: String = hiveTableParams(JdbcConfigs.jdbcInputTableNameKey) - logger.info(s"JDBC input table name : ${jdbcTableName}") - logger.info(s"Setting JDBC URL : ${hiveTableParams(JdbcConfigs.jdbcUrl)}") - sparkSession.conf.set(JdbcConfigs.jdbcUrl, hiveTableParams(JdbcConfigs.jdbcUrl)) - logger.info(s"Setting JDBC driver Class : ${hiveTableParams(JdbcConfigs.jdbcDriverClassKey)}") - sparkSession.conf.set(JdbcConfigs.jdbcDriverClassKey, hiveTableParams(JdbcConfigs.jdbcDriverClassKey)) - sqlTmpString = getSQLWithTmpTable(sqlTmpString, kv._1, jdbcTableName) - sqlOriginalString = getSQLWithTmpTable(sqlOriginalString, kv._1, jdbcTableName) - } - case _ => - logger.info("PATH IS --> DEFAULT") - pCatalogTablesToReplaceAsTmpTable.foreach { kv => - sqlTmpString = getSQLWithTmpTable(sqlTmpString, kv._1, kv._2) - sqlOriginalString = getSQLWithTmpTable(sqlOriginalString, kv._1, kv._2) - } - } - - logger.info(s"incoming SQL --> $selectSQL") - logger.info(s"resolved SQL with Temp Table(s) --> $sqlTmpString") - (sqlOriginalString, sqlTmpString, kafkaDataSets, queryPushDownFlag) - } - - /* - * Substitutes dataset name with tmp table in sql using regex - * - * @param sql - * @param datasetName : Mame of dataset to substitute - * @param tmpTableName : Temp table name to substitute - * - * Example: - * sql = select * from udc.hive.test.flights - * key = udc.hive.test.flights - * This should match udc.hive.test.flights in the sql string. - * - * sql = select * fromudc.hive.test.flights - * key = udc.hive.test.flights - * This should not match udc.hive.test.flights in the sql string. - * - * sql = select * from udc.hive.test.flights_schedule - * key = udc.hive.test.flights - * This should not match udc.hive.test.flights in the sql string. - */ - def getSQLWithTmpTable(sql: String, datasetName: String, tmpTableName: String): String = { - sql.replaceAll(regexTmpTable.replace("key", datasetName), tmpTableName) - } - - /** - * Checks if a Query has Insert or if its just a select - * - * @param sql SQL String - * @return true - if there is an "insert" clause, else false - */ - def isHavingInsert(sql: String): Boolean = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - logger.info(" @Begin --> " + MethodName) - QueryParserUtils.isHavingInsert(sql) - } - - /** - * This function tokenize the incoming sql and parses it using JSQL parser and identify whether the query is of Insert type - * If it is a insert query, it checks whether it is of HIVE insert, which the caller will use it decide whether to execute it through Livy. 
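 * Concretely, per the match in the body below: the result is true only when the first SQL token
 * is "insert", the insert target resolves to DataSetType.HIVE or DataSetType.HBASE, and the
 * current Spark user equals GimelConstants.GTS_DEFAULT_USER(sparkSession.conf); every other
 * case returns false.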
- * - * @param sql - Incoming SQL - * @param options - set of Options from the user - * @param sparkSession - spark session - * @return - It returns a boolean that tells whether it is hive insert from GTS - */ - def isHiveHbaseDMLAndGTSUser(sql: String, options: Map[String, String], sparkSession: SparkSession): Boolean = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - logger.info(" @Begin --> " + MethodName) - - val nonEmptyStrTokenized = GimelQueryUtils.tokenizeSql(sql) - val isHive: Boolean = nonEmptyStrTokenized.head.toLowerCase match { - case "insert" => { - val insertTable = getTargetTables(sql) - getSystemType(insertTable.get, sparkSession, options) match { - case DataSetType.HIVE => { - if ( - sparkSession.sparkContext.sparkUser.equalsIgnoreCase(GimelConstants.GTS_DEFAULT_USER(sparkSession.conf)) - ) { - logger.info("Hive insert query and comes from GTS") - true - } else { - false - } - } - case DataSetType.HBASE => { - if ( - sparkSession.sparkContext.sparkUser.equalsIgnoreCase(GimelConstants.GTS_DEFAULT_USER(sparkSession.conf)) - ) { - logger.info("hBase insert query and comes from GTS") - true - } else { - false - } - } - case _ => false - } - } - case _ => - false - } - isHive - } - - /** - * This function tokenize the incoming sql and parses it using JSQL parser and identify whether the query is of Select type - * If it is a select query, it checks whether it is of HIVE or HBase, which the caller will use to decide whether to authenticate through ranger. - * - * @param sql - Incoming SQL - * @param options - set of Options from the user - * @param sparkSession - spark session - * @return - It returns a boolean that tells whether it is hive insert from GTS - */ - def isSelectFromHiveHbaseAndGTSUser(sql: String, options: Map[String, String], - sparkSession: SparkSession): Boolean = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - - logger.info(" @Begin --> " + MethodName) - var isHiveHbase: Boolean = false - Try { - val nonEmptyStrTokenized = GimelQueryUtils.tokenizeSql(sql) - isHiveHbase = nonEmptyStrTokenized.head.toLowerCase match { - case "select" => - val selectTables = getAllTableSources(sql) - if (selectTables.isEmpty) return false - selectTables.map(eachTable => getSystemType(eachTable, sparkSession, options) match { - case DataSetType.HIVE => - if (sparkSession.sparkContext.sparkUser.equalsIgnoreCase( - GimelConstants.GTS_DEFAULT_USER(sparkSession.conf))) { - logger.info("Hive select query and comes from GTS") - true - } else { - false - } - case DataSetType.HBASE => - if (sparkSession.sparkContext.sparkUser.equalsIgnoreCase( - GimelConstants.GTS_DEFAULT_USER(sparkSession.conf))) { - logger.info("hBase select query and comes from GTS") - true - } else { - false - } - case _ => false - }).reduce((x, y) => x | y) - case _ => - false - } - } match { - case Success(_) => - logger.info(s"Interpreted isSelectFromHiveHbaseAndGTSUser with $isHiveHbase") - case Failure(exception) => - logger.error(s"Exeception occurred while interpretting " + - s"isSelectFromHiveHbaseAndGTSUser with ${exception.getMessage}") - if (exception.getMessage.toLowerCase().contains("table not found")) { - logger.info("Suppressing the table not found exception") - } else { - throw exception - } - } - isHiveHbase - } - - /** - * Checks whether the sql is of drop table/view pattern and checks whether the table/view is a temp table - * This will help to take a path to whether to go in livy session or normal gsql session - * - * @param sql 
- incoming sql - * @param sparkSession - current spark session - * @return - true or false based on whether the dropped table/view is a temp (cached) table. - */ - def isDropTableATempTable(sql: String, sparkSession: SparkSession): Boolean = { - val dropTableIfExistsPattern = s"DROP TABLE IF EXISTS .(.*)".r - val dropViewIfExistsPattern = s"DROP VIEW IF EXISTS .(.*)".r - val dropTablePattern = s"DROP TABLE .(.*)".r - val dropViewPattern = s"DROP VIEW .(.*)".r - val uniformSQL = sql.replace("\n", " ") - val sqlParts: Array[String] = uniformSQL.split(" ") - val newSql = sqlParts.filter(x => !x.isEmpty).mkString(" ") - val tableName = newSql.toUpperCase() match { - case dropTableIfExistsPattern(_) | dropViewIfExistsPattern(_) => - newSql.split(" ")(newSql.split(" ").indexWhere(_.toUpperCase() == "EXISTS") + 1) - - case dropTablePattern(_) => - newSql.split(" ")(newSql.split(" ").indexWhere(_.toUpperCase() == "TABLE") + 1) - - case dropViewPattern(_) => - newSql.split(" ")(newSql.split(" ").indexWhere(_.toUpperCase() == "VIEW") + 1) - - case _ => "." - } - if (tableName.contains(".")) { - false - } else { - isSparkCachedTable(tableName, sparkSession) - } - } - - /** - * This function call will check SQL is a DDL - * - * @param sql - Incoming SQL - * @param sparkSession - Spark Session object - */ - - def isDDL(sql: String, sparkSession: SparkSession): Boolean = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - logger.info(" @Begin --> " + MethodName) - - QueryParserUtils.isDDL(sql, isDropTableATempTable(sql, sparkSession)) - } - - /** - * This function call will check whether SQL is setting conf, say - "SET key=val" - * - * @param sql - Incoming SQL - * @param sparkSession - Spark Session object - */ - - def isSetConf(sql: String, sparkSession: SparkSession): Boolean = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - logger.info(" @Begin --> " + MethodName) - - val tokenized = GimelQueryUtils.tokenizeSql(sql) - val nonEmptyStrTokenized = tokenized.filter(x => !x.isEmpty) - nonEmptyStrTokenized.head.toUpperCase.equals("SET") - } - - /** - * isDataDefinition - will find whether we need to take to the Data definition path or select/insert DML path - * - * @param sql SQL String from client - * @return Resulting Boolean - */ - - def isUDCDataDefinition(sql: String): Boolean = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - logger.info(" @Begin --> " + MethodName) - - // Add alter table - val catalogName = getCatalogProvider.toUpperCase - val createTablePattern = s"CREATE TABLE ${catalogName}.(.*)".r - val createExternalTablePattern = s"CREATE EXTERNAL TABLE ${catalogName}.(.*)".r - val multisetPattern = s"CREATE MULTISET TABLE ${catalogName}.(.*)".r - val setPattern = s"CREATE SET TABLE ${catalogName}.(.*)".r - val dropTablePattern = s"DROP TABLE ${catalogName}.(.*)".r - val truncateTablePattern = s"TRUNCATE TABLE ${catalogName}.(.*)".r - val deleteFromPattern = s"DELETE FROM ${catalogName}.(.*)".r - val deletePattern = s"DELETE ${catalogName}.(.*)".r - - val uniformSQL = sql.replace("\n", " ") - val sqlParts: Array[String] = uniformSQL.split(" ") - // remove all additional white spaces in the DDL statment - val newSql = sqlParts.filter(x => !x.isEmpty).mkString(" ") - newSql.toUpperCase() match { - case createTablePattern(_) | createExternalTablePattern(_) | multisetPattern(_) | setPattern(_) | dropTablePattern(_) | truncateTablePattern(_) | deleteFromPattern(_) | deletePattern(_) => { - true - 
} - case _ => { - false - } - } - } - - /** - * Parse the SQL and get the entire select clause - * - * @param sql SQL String - * @return SQL String - that has just the select clause - */ - def getSelectClause(sql: String): String = { - QueryParserUtils.getSelectClause(sql) - } - - /** - * Parse the SQL and get the entire select clause - * - * @param sql SQL String - * @return SQL String - that has just the select clause - */ - def getPlainSelectClause(sql: String): String = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - - logger.info(" @Begin --> " + MethodName) - - val uniformSQL = sql.replace("\n", " ") - val sqlParts: Array[String] = uniformSQL.split(" ") - val index = sqlParts.indexWhere(_.toUpperCase() == "SELECT") - val selectClauseOnly = sqlParts.slice(index, sqlParts.length).mkString(" ") - selectClauseOnly - } - - /** - * Gets the target table - * - * @param sql SQL String - * @return Table Name - */ - def getTargetTables(sql: String): Option[String] = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - logger.info(" @Begin --> " + MethodName) - SQLParser.getTargetTables(sql) - } - - /** - * getOptions - read the SparkSession options that was set by the user else add the default values - * - * @param sparkSession : SparkSession - * @return Tuple ( String with concatenated options read from the SparkSession , Same Props as a Map[String, String] ) - */ - - def getOptions(sparkSession: SparkSession): (String, Map[String, String]) = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - - logger.info(" @Begin --> " + MethodName) - - val hiveConf: Map[String, String] = sparkSession.conf.getAll - val optionsToCheck: Map[String, String] = Map( - KafkaConfigs.rowCountOnFirstRunKey -> "250" - , KafkaConfigs.batchFetchSize -> "250" - , KafkaConfigs.maxRecordsPerPartition -> "25000000" - , GimelConstants.LOG_LEVEL -> "ERROR" - , KafkaConfigs.kafkaConsumerReadCheckpointKey -> "true" - , KafkaConfigs.kafkaConsumerClearCheckpointKey -> "false" - , KafkaConfigs.maxRatePerPartitionKey -> "3600" - , KafkaConfigs.streamParallelKey -> "10" - , KafkaConfigs.defaultBatchInterval -> "30" - , KafkaConfigs.isStreamParallelKey -> "true" - , KafkaConfigs.streamaWaitTerminationOrTimeoutKey -> "-1" - , KafkaConfigs.isBackPressureEnabledKey -> "true" - , JdbcConfigs.teradataReadType -> "" - , HbaseConfigs.hbaseOperation -> "scan" - , HbaseConfigs.hbaseFilter -> "" - , GimelConstants.DATA_CACHE_IS_ENABLED -> "false" - , GimelConstants.DATA_CACHE_IS_ENABLED_FOR_ALL -> "true" - , KafkaConfigs.isStreamBatchSwitchEnabledKey -> "false" - , KafkaConfigs.streamFailureThresholdPerSecondKey -> "1500" - , ElasticSearchConfigs.esIsDailyIndex -> "false" - , CatalogProviderConfigs.CATALOG_PROVIDER -> CatalogProviderConstants.PRIMARY_CATALOG_PROVIDER - , GimelConstants.SPARK_APP_ID -> sparkSession.conf.get(GimelConstants.SPARK_APP_ID) - , GimelConstants.SPARK_APP_NAME -> sparkSession.conf.get(GimelConstants.SPARK_APP_NAME) - , GimelConstants.APP_TAG -> getAppTag(sparkSession.sparkContext) - ) - val resolvedOptions: Map[String, String] = optionsToCheck.map { kvPair => - (kvPair._1, hiveConf.getOrElse(kvPair._1, kvPair._2)) - } - resolvedOptions.foreach(conf => sparkSession.conf.set(conf._1, conf._2)) - (resolvedOptions.map(x => x._1 + "=" + x._2).mkString(":"), hiveConf ++ resolvedOptions) - } - - - /** - * Executes the SQL and Returns DataFrame - * - * @param selectSQL The Select SQL - * @param sparkSession : SparkSession - * 
@return DataFrame - */ - def executeSelectClause(selectSQL: String, sparkSession: SparkSession, queryPushDownFlag: String): DataFrame = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - logger.info(" @Begin --> " + MethodName) - - val selectDF: DataFrame = queryPushDownFlag match { - case "true" => - - // set the SparkContext as well as TaskContext property for JdbcPushDown flag to "false" - // logger.info(s"Setting jdbcPushDownFlag to false in SparkContext") - // sparkSession.conf.set(JdbcConfigs.jdbcPushDownEnabled, "false") - - logger.info(s"Executing Pushdown Query: ${selectSQL}") - val df = executePushdownQuery(selectSQL, sparkSession) - - df - case _ => - sparkSession.sql(selectSQL) - } - selectDF - } - - /** - * Executes the Resolved SQL Query by calling the DataSet code that has been generated - * - * @param clientSQL Original SQL String submitted by Client - * @param dest Target Table - * @param selectSQL SQl String for Select Clause alone - * @param sparkSession :SparkSession - * @param dataset DataSet - * @return Result String - */ - def executeResolvedQuery(clientSQL: String, dest: Option[String], selectSQL: String, sparkSession: SparkSession, - dataset: com.paypal.gimel.DataSet, queryPushDownFlag: String): String = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - - logger.info(" @Begin --> " + MethodName) - - logger.info(s"Client SQL is --> $clientSQL") - logger.info(s"Select SQL is --> $selectSQL") - var resultString = "" - if (dest.isDefined) { - logger.info(s"EXECUTION PATH ====== DATASET WRITE ======") - if (clientSQL.toLowerCase.contains("partition")) { - sparkSession.sql("set hive.exec.dynamic.partition.mode=nonstrict") - } - Try { - val options = getOptions(sparkSession)._2 - val selectDF = executeSelectClause(selectSQL, sparkSession, queryPushDownFlag) - // --- EXISTING LOGIC - // dataset.write(dest.get, selectDF, options) - // --- NEW LOGIC - // Get the DataSet Properties - - val tgt = dest.get - (tgt.split(",").length > 2) match { - case true => - case _ => - } - val dataSetProperties: DataSetProperties = CatalogProvider.getDataSetProperties(dest.get, options) - // val dataSetProperties = GimelServiceUtilities().getDataSetProperties(dest.get) - dataSetProperties.datasetType.toString match { - case "HIVE" | "NONE" => - // If Hive - val sqlToInsertIntoHive = queryPushDownFlag.toLowerCase match { - case "true" => - logger.info(s"Invoking write API in gimel with queryPushDownFlag=${queryPushDownFlag}...") - - // create a temp view for pushdown dataframe. 
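To make the pushdown write path in the lines that follow easier to trace, here is a minimal reproduction of the temp-view trick it relies on: the pushdown result is registered as a temp view and the original select clause is swapped for a SELECT over that view, so the enclosing INSERT can still be run by Spark unchanged. Everything below (session setup, table names, sample data) is illustrative and assumes a local Spark installation.

import org.apache.spark.sql.SparkSession

object PushDownViewSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("pushdown-view-sketch").master("local[1]").getOrCreate()
    import spark.implicits._

    // Stand-in for the DataFrame returned by executePushdownQuery(selectSQL, sparkSession).
    val pushedDown = Seq((1, "a"), (2, "b")).toDF("id", "value")
    pushedDown.createOrReplaceTempView("jdbcPushDownTempTable")

    val selectSQL = "select id, value from some_jdbc_table"
    val clientSQL = s"insert into target_table $selectSQL"

    // Same rewrite idea as in the code that follows: keep the INSERT, read from the materialised view.
    val rewritten = clientSQL.replace(selectSQL, "SELECT * FROM jdbcPushDownTempTable")
    println(rewritten) // insert into target_table SELECT * FROM jdbcPushDownTempTable

    spark.stop()
  }
}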
- val jdbcPushDownTempTable = "jdbcPushDownTempTable" - logger.info(s"Creating temp view for pushdown query dataframe as ${jdbcPushDownTempTable}") - selectDF.createOrReplaceTempView(jdbcPushDownTempTable) - - val pushDownSelectQuery = s"SELECT * FROM ${jdbcPushDownTempTable}" - - // replace selectSQL in clientSQL with pushDownSelectQuery - logger.info(s"Replacing ${selectSQL} in ${clientSQL} with ${pushDownSelectQuery}") - val pushDownSparkSql = clientSQL.replace(selectSQL, pushDownSelectQuery) - // dataset.write(dest.get, selectDF, options) - logger.info(s"Spark SQL after Pushdown Query: ${pushDownSparkSql}") - pushDownSparkSql - case _ => - logger.info(s"Invoking sparkSession.sql for write with queryPushDownFlag=${queryPushDownFlag}...") - // Get the DB.TBL from UDC - clientSQL - } - // execute on hive - val db = dataSetProperties.props(HiveConfigs.hiveDBName) - val tbl = dataSetProperties.props(HiveConfigs.hiveTableName) - val actual_db_tbl = s"${db}.${tbl}" - // Replace the SQL with DB.TBL - logger.info(s"Replacing ${dest.get} with ${actual_db_tbl}") - val sqlToExecute = sqlToInsertIntoHive.replaceAll(s"(?i)${dest.get}", actual_db_tbl) - logger.info(s"Passing through SQL to Spark for write since target [${actual_db_tbl}] is of data set type - HIVE ...") - logger.info(s"Final SQL to Run --> \n ${sqlToExecute}") - sparkSession.sql(sqlToExecute) - case _ => - // If Non-HIVE - logger.info(s"Invoking write API in gimel with queryPushDownFlag=${queryPushDownFlag}...") - /* - * Sets the appropriate serializer class based on the kafka.message.value.type and value.serializer properties - * This is mainly required for backward compatibility for KAFKA datasets - */ - val newOptions = GimelSerdeUtils.setGimelSerializer(sparkSession, dataSetProperties, options) - dataset.write(dest.get, selectDF, newOptions) - } - - } match { - case Success(_) => - resultString = "Query Completed." - logger.info(resultString) - case Failure(e) => - // e.printStackTrace() - resultString = - s"""Query Failed in function : $MethodName via path dataset.write. 
Error --> - | - |${e.toString}""".stripMargin - // logger.error(resultString) - throw e - } - } else { - logger.info(s"EXECUTION PATH ====== DATASET SELECT ======") - val selectDF: DataFrame = queryPushDownFlag match { - case "true" => - // logger.info(s"Setting jdbcPushDownFlag to false in SparkContext") - // sparkSession.conf.set(JdbcConfigs.jdbcPushDownEnabled, "false") - val df = executePushdownQuery(selectSQL, sparkSession) - df - case _ => - sparkSession.sql(selectSQL) - } - val count = selectDF.cache.count - val rowsToShow = sparkSession.conf.get(GimelConstants.MAX_RESULTS_TO_SHOW, "250").toInt - val showRowsOnly = sparkSession.conf.get(GimelConstants.SHOW_ROWS_ENABLED, "false").toBoolean - val resultSet = selectDF.take(rowsToShow).mkString("\n") - val marginString = "-------------------------------------------------------------------------------------------------------" - val extraMessage = - s""" - |$marginString - |Total Rows Returned from original Query --> $count - |Displaying Rows ${scala.math.min(rowsToShow, count)} of $count - | - |$userInfoString - """.stripMargin - resultString = - s"""${if (!showRowsOnly) extraMessage else ""} - |$marginString - |$resultSet - |$marginString""".stripMargin - } - resultString - } - - /** - * Executes the Resolved SQL Query by calling the DataSet code that has been generated - * - * @param clientSQL Original SQL String submitted by Client - * @param dest Target Table - * @param selectSQL SQl String for Select Clause alone - * @param sparkSession : SparkSession - * @param dataset DataSet - * @return RDD[Result JSON String] - */ - // def executeResolvedQuerySparkMagic(clientSQL: String, dest: Option[String], selectSQL: String, hiveContext: HiveContext, dataset: DataSet): RDD[String] = { - // def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - // - // logger.info(" @Begin --> " + MethodName) - // - // logger.info(s"Client SQL is --> $clientSQL") - // logger.info(s"Select SQL is --> $selectSQL") - // logger.silence - // val selectDF = hiveContext.sql(selectSQL) - // selectDF.toJSON - // } - - def executeResolvedQuerySparkMagic(clientSQL: String, dest: Option[String], selectSQL: String, sparkSession: SparkSession, dataset: com.paypal.gimel.DataSet, queryPushDownFlag: String): RDD[String] = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - - logger.info(" @Begin --> " + MethodName) - - logger.info(s"Client SQL is --> $clientSQL") - logger.info(s"Select SQL is --> $selectSQL") - var resultString = "" - if (dest.isDefined) { - logger.info(s"EXECUTION PATH ====== DATASET WRITE ======") - if (clientSQL.toLowerCase.contains("partition")) { - sparkSession.sql("set hive.exec.dynamic.partition.mode=nonstrict") - } - Try { - val (_, options) = getOptions(sparkSession) - val selectDF: DataFrame = queryPushDownFlag match { - case "true" => - // logger.info(s"Setting jdbcPushDownFlag to false in SparkContext") - // sparkSession.conf.set(JdbcConfigs.jdbcPushDownEnabled, "false") - val df = executePushdownQuery(selectSQL, sparkSession) - df - case _ => - sparkSession.sql(selectSQL) - } - dataset.write(dest.get, selectDF, options) - } match { - case Success(_) => - resultString = """{"Query Execution":"Success"}""" - logger.info(resultString) - sparkSession.read.json(sparkSession.sparkContext.parallelize(Seq(resultString))).toJSON.rdd - case Failure(e) => - // e.printStackTrace() - resultString = - s"""{"Query Execution Failed":${e.toString}}""" - // logger.error(resultString) - 
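The interactive select path above deliberately separates "how many rows exist" from "how many rows are rendered": the DataFrame is cached and counted once, but only the first N rows (a session conf, defaulting to 250) are materialised into the result string. A condensed sketch of that pattern; the conf key below is a placeholder standing in for GimelConstants.MAX_RESULTS_TO_SHOW.

import org.apache.spark.sql.{DataFrame, SparkSession}

// Cache once, count the full result, but only render the first `rowsToShow` rows.
def renderCapped(df: DataFrame, spark: SparkSession): String = {
  val rowsToShow = spark.conf.get("gimel.query.results.show.rows.threshold", "250").toInt // placeholder key
  val total      = df.cache().count()
  val preview    = df.take(rowsToShow).mkString("\n")
  s"Total Rows Returned from original Query --> $total\nDisplaying Rows ${math.min(rowsToShow, total)} of $total\n$preview"
}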
sparkSession.read.json(sparkSession.sparkContext.parallelize(Seq(resultString))).toJSON.rdd - // throw e - } - } else { - logger.info(s"EXECUTION PATH ====== DATASET SELECT ======") - val selectDF: DataFrame = queryPushDownFlag match { - case "true" => - // logger.info(s"Setting jdbcPushDownFlag to false in SparkContext") - // sparkSession.conf.set(JdbcConfigs.jdbcPushDownEnabled, "false") - val df = executePushdownQuery(selectSQL, sparkSession) - df - case _ => - sparkSession.sql(selectSQL) - } - // val count = selectDF.cache.count - val rowsToShow = sparkSession.conf.get(GimelConstants.MAX_RESULTS_TO_SHOW, "250").toInt - selectDF.registerTempTable("tmp_table_spark_magic") - val resultSet = sparkSession.sql(s"select * from tmp_table_spark_magic limit ${rowsToShow}").toJSON.rdd - resultSet - } - } - - /** - * This function parses the SQL and get all the source tables. - * It calls hiveutils.ranger authentication if it is a HIVE table (Either UDC or non UDC tables are covered) - * - * @param sql - incoming sql - * @param sparkSession - spark session object - * @param options - incoming user options - */ - - def authenticateAccess(sql: String, sparkSession: SparkSession, options: Map[String, String]): Unit = { - - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - - logger.info(" @Begin --> " + MethodName) - - val nonEmptyStrTokenized = GimelQueryUtils.tokenizeSql(sql) - val sqlToAuthenticate: Option[String] = nonEmptyStrTokenized.head.toLowerCase match { - case "select" => - // Handling a Select clause... - val userSuppliedPushDownFlag = sparkSession.conf.get(JdbcConfigs.jdbcPushDownEnabled, "false").toBoolean - // If the pushDownFlag is true it is a pure Teradata query and do not do authentication. - // So don't return any SQL for authentication - if (!userSuppliedPushDownFlag) Some(sql) else None - case "cache" => - logger.info("Handling Cache statement ...") - Some(getPlainSelectClause(sql)) - case "insert" => - logger.info("Handling Insert statement ...Do ranger checks for the select tables if they from hive or hbase") - Some(getPlainSelectClause(sql)) - case _ => None - } - - logger.info("The incoming SQL for authenticateRangerPolicies =>" + sql) - sqlToAuthenticate match { - case Some(sql) => authenticateRangerPolicies(sql, sparkSession, options) - case _ => logger.info("No SQL to Authenticate.") - } - - } - - /** - * Checks whether a table is cached in spark Catalog - * - * @param tableName - incoming table name - * @param sparkSession - spark session - * @return - A boolean value to tell whether the table is cached or not - */ - - def isSparkCachedTable(tableName: String, sparkSession: SparkSession): Boolean = { - - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - - logger.info(" @Begin --> " + MethodName) - val isCached = Try { - sparkSession.catalog.isCached(tableName) - } - match { - case Success(result) => { - result match { - case true => true - case _ => false - } - } - case Failure(e) => false - } - isCached match { - case true => logger.info(tableName + "====> a Cached table") - case _ => logger.info(tableName + "====> NOT a Cached table") - } - isCached - } - - /** - * This function parses the SQL and get all the source tables. 
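isSparkCachedTable above wraps spark.catalog.isCached in a Try because the call throws, rather than returning false, when the table or view is not present in the catalog; a failure is therefore folded into "not cached". The equivalent condensed form, with an illustrative name:

import scala.util.Try
import org.apache.spark.sql.SparkSession

// Treat "unknown to the catalog" the same as "not cached".
def isCachedSafely(tableName: String, spark: SparkSession): Boolean =
  Try(spark.catalog.isCached(tableName)).getOrElse(false)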
- * It calls hiveutils.ranger authentication if it is a HIVE table (Either UDC or non UDC tables are covered) - * - * @param sql - incoming sql - * @param sparkSession - spark session object - * @param options - incoming user options - */ - def authenticateRangerPolicies(sql: String, sparkSession: SparkSession, options: Map[String, String]): Unit = { - - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - - logger.info(" @Begin --> " + MethodName) - - val listTables: Seq[String] = getAllTableSources(sql) - val newList = listTables.toList.filter(dataSetName => { - logger.info("the current data set name is " + dataSetName) - if (dataSetName.contains(".")) { - true - } else { - !isSparkCachedTable(dataSetName, sparkSession) - } - }) - newList.foreach(dataSet => { - logger.info( - "Data Sets to be checked for Ranger authentication are " + dataSet) - authLogicWrapper(dataSet, sparkSession, options) - } - ) - } - - /** - * core logic to check each data set to see whether if it is HIVE or HBASE, if so do impersonation based on the impersonation flag. - * - * @param dataSet - data set name - * @param sparkSession - spark session - * @param options - user options - */ - def authLogicWrapper(dataSet: String, sparkSession: SparkSession, options: Map[String, String]): Unit = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - - logger.info("@Begin --> " + MethodName) - - logger.info("Data set name is --> " + dataSet) - val formattedProps: Map[String, Any] = DataSetUtils.getProps(options) ++ - Map(CatalogProviderConfigs.CATALOG_PROVIDER -> - sparkSession.conf.get(CatalogProviderConfigs.CATALOG_PROVIDER, - CatalogProviderConstants.PRIMARY_CATALOG_PROVIDER)) - - // if storage type unknown we will default to HIVE PROVIDER - if (DataSetUtils.isStorageTypeUnknown(dataSet)) { - formattedProps ++ Map(CatalogProviderConfigs.CATALOG_PROVIDER -> CatalogProviderConstants.HIVE_PROVIDER) - } - - val dataSetProperties: DataSetProperties = CatalogProvider.getDataSetProperties(dataSet, options) - logger.info("dataSetProperties ==> " + dataSetProperties.toString()) - val systemType = DataSetUtils.getSystemType(dataSetProperties) - - val newProps: Map[String, Any] = DataSetUtils.getProps(options) ++ Map( - GimelConstants.DATASET_PROPS -> dataSetProperties - , GimelConstants.DATASET -> dataSet - , GimelConstants.RESOLVED_HIVE_TABLE -> resolveDataSetName(dataSet)) - - systemType match { - case DataSetType.HIVE => - val hiveUtils = new HiveUtils - - // If its cross cluster access, do not allow dynamic dataset access as it would mean the dataset is not present in UDC - // and it will try to read from hive directly which would fail. - // Also, if HDFS location is not present, it may be a view, so abort it. - if (hiveUtils.isCrossCluster(dataSetProperties)) { - val isDynamicDataset = dataSetProperties.props.getOrElse(CatalogProviderConstants.DYNAMIC_DATASET, "false").toBoolean - if (isDynamicDataset) { - throw new Exception( - s""" - | Cross Cluster Access Detected. Cannot read dynamic dataset. - | This means the dataset does not exist in UDC. - """.stripMargin) - } - - if (!dataSetProperties.props.contains(HiveConfigs.dataLocation) || - dataSetProperties.props.get(HiveConfigs.dataLocation).get == GimelConstants.NOT_APPLICABLE) { - throw new Exception( - s""" - | Cross Cluster Access Detected. Cannot find ${HiveConfigs.dataLocation} property. - | Please check if it is a view as Gimel currently does not support cross cluster view access. 
- """.stripMargin) - } - hiveUtils.authenticateTableAndLocationPolicy(dataSet, options, sparkSession, GimelConstants.READ_OPERATION) - } else { - val hiveTableName = (dataSetProperties.props(GimelConstants.HIVE_DATABASE_NAME) + "." + dataSetProperties.props(GimelConstants.HIVE_TABLE_NAME)) - val hiveTableObject = CatalogProvider.getHiveTable(hiveTableName) - val tableType = hiveTableObject.getTableType - if (tableType == "VIRTUAL_VIEW") { - logger.info("Seems we are querying a view.") - val viewSql = hiveTableObject.getViewExpandedText() - logger.info(s"View SQL --> \n${viewSql}") - println(s"View SQL --> \n${viewSql}") - val allTableSources = getAllTableSources(viewSql) - logger.info(s"List of tables to be authenticated --> \n${allTableSources.mkString("\n")}") - println(s"List of tables to be authenticated --> \n${allTableSources.mkString("\n")}") - allTableSources.foreach(x => authLogicWrapper(x.replaceAll("`", ""), sparkSession, options)) - } else { - hiveUtils.authenticateTableAndLocationPolicy(dataSet, options, sparkSession, GimelConstants.READ_OPERATION) - } - } - case DataSetType.HBASE => - val hBASEUtilities = HBaseUtilities(sparkSession) - hBASEUtilities.authenticateThroughRangerPolicies(dataSet, GimelConstants.READ_OPERATION, newProps) - case _ => None - } - } - - - /** - * Checks if a Query has Cache statemnt - * - * @param sql SQL String - * @return true - if there is an "Cache" clause, else false - */ - def isHavingCache(sql: String): Boolean = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - - logger.info(" @Begin --> " + MethodName) - - GimelQueryUtils.tokenizeSql(sql).head.equalsIgnoreCase("cache") - } - - /** - * This function tokenizes the incoming sql and parses it using GSQL parser and identify whether the query is of Select type - * If it is a select query, it checks whether it is of HBase and has limit clause. 
- * - * @param sql - Incoming SQL - * @param options - set of Options from the user - * @param sparkSession - spark session - * @return - */ - def setLimitForHBase(sql: String, options: Map[String, String], - sparkSession: SparkSession): Unit = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - - logger.info(" @Begin --> " + MethodName) - Try { - val nonEmptyStrTokenized = GimelQueryUtils.tokenizeSql(sql) - nonEmptyStrTokenized.head.toLowerCase match { - case "select" => - val selectTables = getAllTableSources(sql) - // Checks if there is more than 1 source tables - if (selectTables.isEmpty || selectTables.length > 1) return - selectTables.map(eachTable => DataSetUtils.getSystemType( - eachTable, sparkSession, options) match { - case DataSetType.HBASE => - logger.info("Sql contains limit clause, setting the HBase Page Size.") - val limit = Try(QueryParserUtils.getLimit(sql)).get - sparkSession.conf.set(GimelConstants.HBASE_PAGE_SIZE, limit) - case _ => - return - }) - case _ => - return - } - } match { - case Success(_) => - case Failure(exception) => - logger.error(s"Exeception occurred while setting the limit for HBase -> ${exception.getMessage}") - throw exception - } - } - - /** - * Parse the SQL and get cache Query & select statement - * - * @param sql SQL String - */ - def splitCacheQuery(sql: String): (Option[String], String) = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - - logger.info(" @Begin --> " + MethodName) - - val uniformSQL = sql.replace("\n", " ") - val sqlParts: Array[String] = uniformSQL.split(" ") - if (isHavingCache(sql)) { - logger.info("Splitting sql since it contains cache table") - val index = sqlParts.indexWhere(_.toUpperCase() == "SELECT") - (Some(sqlParts.slice(0, index).mkString(" ")), sqlParts.slice(index, sqlParts.length).mkString(" ")) - } else { - (None, sqlParts.mkString(" ")) - } - } - - /** - * This method will execute the ' cache table t as...' query - * - * @param cacheStatment cache table statement - * @param dataFrame pushdown dataframe - * @param sparkSession sparksesssion - * @return dataframe - */ - def cachePushDownQuery(cacheStatment: String, dataFrame: DataFrame, sparkSession: SparkSession): DataFrame = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - - logger.info(" @Begin --> " + MethodName) - - // create a temp view for pushdown dataframe. - val pushDownCacheTempTable = "pushDownCacheTempTable" - logger.info(s"Creating temp view for pushdown query dataframe as ${pushDownCacheTempTable}") - dataFrame.createOrReplaceTempView(pushDownCacheTempTable) - - val sql = - s""" - | ${cacheStatment} SELECT * FROM ${pushDownCacheTempTable} - """.stripMargin - - // execute the cached statement - logger.info(s"Now caching dataframe for pushdown query: ${sql}") - sparkSession.sql(sql) - } - - /** - * Push downs the SELECT query to JDBC data source and executes using JDBC read. 
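splitCacheQuery above splits a statement such as CACHE TABLE t AS SELECT ... at the first SELECT token; the cache prefix is later replayed on top of the pushdown DataFrame by cachePushDownQuery. A condensed, self-contained version of that split (the function name is illustrative):

// Returns (optional cache prefix, select-only SQL). Whitespace is normalised first so the
// token index is stable regardless of line breaks in the incoming statement.
def splitAtSelect(sql: String): (Option[String], String) = {
  val tokens = sql.replace("\n", " ").split(" ").filter(_.nonEmpty)
  val selectIndex = tokens.indexWhere(_.equalsIgnoreCase("SELECT"))
  if (tokens.headOption.exists(_.equalsIgnoreCase("CACHE")) && selectIndex > 0) {
    (Some(tokens.take(selectIndex).mkString(" ")), tokens.drop(selectIndex).mkString(" "))
  } else {
    (None, tokens.mkString(" "))
  }
}

// splitAtSelect("CACHE TABLE t AS SELECT * FROM src") == (Some("CACHE TABLE t AS"), "SELECT * FROM src")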
- * - * @param inputSQL SELECT SQL string - * @param sparkSession : SparkSession - * @return DataFrame - */ - - def executePushdownQuery(inputSQL: String, sparkSession: SparkSession): DataFrame = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - - logger.info(" @Begin --> " + MethodName) - - // check if SQL contains cache query - val (cacheStatement, selectSQL) = splitCacheQuery(inputSQL) - - val dataSetProps = sparkSession.conf.getAll - val jdbcOptions: Map[String, String] = JdbcAuxiliaryUtilities.getJDBCOptions(dataSetProps) - - if (!jdbcOptions.contains(JdbcConfigs.jdbcUrl)) { - throw new IllegalArgumentException("No JDBC url found. Please verify the dataset name in query") - } - - val userSpecifiedFetchSize = dataSetProps.getOrElse("fetchSize", JdbcConstants.DEFAULT_READ_FETCH_SIZE).toString.toInt - - try { - val jdbcSystem = getJDBCSystem(jdbcOptions(JdbcConfigs.jdbcUrl)) - val pushDownDf = jdbcSystem match { - case JdbcConstants.TERADATA => - executeTeradataSelectPushDownQuery(sparkSession, selectSQL, dataSetProps, jdbcOptions, userSpecifiedFetchSize) - case _ => - val pushDownSqlAsTempTable = s"( $selectSQL ) as pushDownTempTable" - logger.info(s"Final SQL for Query Push Down --> $pushDownSqlAsTempTable") - val jdbcConnectionUtility: JDBCConnectionUtility = JDBCConnectionUtility(sparkSession, dataSetProps) - JdbcAuxiliaryUtilities.sparkJdbcRead(sparkSession, jdbcOptions(JdbcConfigs.jdbcUrl), pushDownSqlAsTempTable, - None, JdbcConstants.DEFAULT_LOWER_BOUND, JdbcConstants.DEFAULT_UPPER_BOUND, - 1, userSpecifiedFetchSize, jdbcConnectionUtility.getConnectionProperties) - } - - // cache query if inputSql contains cache query - cacheStatement match { - case Some(cacheTable) => - // cache the query results from pushdown - logger.info(s"Now caching the dataframe for -> $selectSQL") - cachePushDownQuery(cacheTable, pushDownDf, sparkSession) - case _ => - pushDownDf - } - } - catch { - case exec: SQLException => - val errors = new mutable.StringBuilder() - var ex: SQLException = exec - var lastException: SQLException = exec - while (ex != null) { - if (errors.nonEmpty) { - errors.append(s"${GimelConstants.COMMA} ") - } - errors.append(s = ex.getErrorCode().toString) - lastException = ex - ex = ex.getNextException - } - if (lastException != null) { - lastException.printStackTrace() - } - logger.error(s"SQLException: Error codes ${errors.toString()}") - throw exec - case e: Throwable => - throw e - } - finally { - // re-setting all configs for JDBC - JDBCCommons.resetPushDownConfigs(sparkSession) - } - } - - - def executeTeradataSelectPushDownQuery(sparkSession: SparkSession, selectSQL: String, - dataSetProps: Map[String, String], jdbcOptions: Map[String, String], - userSpecifiedFetchSize: Int): DataFrame = { - logger.info(s" @Begin --> ${new Exception().getStackTrace.apply(1).getMethodName}") - val jdbcConnectionUtility: JDBCConnectionUtility = JDBCConnectionUtility(sparkSession, dataSetProps) - import JDBCUtilities._ - val loggerOption = Some(logger) - val mutableJdbcOptions: mutable.Map[String, String] = scala.collection.mutable.Map(jdbcOptions.toSeq: _*) - var sqlToBeExecutedInJdbcRDD: String = selectSQL - logger.info(s"In query pushdown SQL to be executed --> $sqlToBeExecutedInJdbcRDD") - - // Get connection details per the explain plan of the incomingSql - import JDBCConnectionUtility.withResources - var (connectionDetails, connectionUtilityPerIncomingSQL): (ConnectionDetails, JDBCConnectionUtility) = - (null, jdbcConnectionUtility) - var 
partitionColumns: Seq[String] = Seq.empty - withResources(getOrCreateConnection(connectionUtilityPerIncomingSQL, logger = loggerOption)) { - connection => - // get the partition columns - partitionColumns = JdbcAuxiliaryUtilities.getAndSetPartitionParameters( - sparkSession, dataSetProps, userSpecifiedFetchSize, mutableJdbcOptions, connection) - val tuple = JdbcAuxiliaryUtilities.getConnectionInfo(sparkSession, - jdbcConnectionUtility, dataSetProps, sqlToBeExecutedInJdbcRDD, loggerOption, partitionColumns) - connectionDetails = tuple._1 - connectionUtilityPerIncomingSQL = tuple._2 - } - - // Create a new connection as per the new config - withResources(getOrCreateConnection(connectionUtilityPerIncomingSQL, logger = loggerOption)) { - connection => - // if partitions greater than 1 - if (connectionDetails.numOfPartitions > 1) { - // if sql having analytical functions - if (QueryParserUtils.isHavingAnalyticalFunction(selectSQL)) { - require(dataSetProps.contains(JdbcConfigs.jdbcTempDatabase), - s"Expecting CONF: ${JdbcConfigs.jdbcTempDatabase} to be available") - val tableName = - s"${dataSetProps(JdbcConfigs.jdbcTempDatabase)}.gimel_push_down_${ - Hashing.sha256().hashString(selectSQL, StandardCharsets.UTF_8) - .toString.substring(0, 7) - }" - logger.info(s"Resolved temp table name: $tableName") - // delete the temp table if it exists - JdbcAuxiliaryUtilities.dropTable(tableName, connection, logger = loggerOption) - // create volatile table as select with data - // Recording the time taken for the query execution - val createTableStatement: String = s"CREATE TABLE $tableName AS ( ${selectSQL.trim} ) WITH DATA " - logger.info(s"Proceeding to execute: $createTableStatement") - JdbcAuxiliaryUtilities.executeQueryStatement(createTableStatement, connection, - incomingLogger = loggerOption, recordTimeTakenToExecute = true) - // rewrite the selectSql with `select * from temp_table` and set JdbcConfigs.jdbcDbTable => temp_table - sqlToBeExecutedInJdbcRDD = s"SELECT * from $tableName" - mutableJdbcOptions += (JdbcConfigs.jdbcTempTable -> tableName) - mutableJdbcOptions += (JdbcConfigs.jdbcDbTable -> tableName) - // Set the first column name as partition column if data split is needed - if (!dataSetProps.contains(JdbcConfigs.jdbcPartitionColumns)) { - val tempTableSchema = JdbcReadUtility.resolveTable( - mutableJdbcOptions(JdbcConfigs.jdbcUrl), - sqlToBeExecutedInJdbcRDD, connection - ) - mutableJdbcOptions += (JdbcConfigs.jdbcPartitionColumns -> tempTableSchema.head.name) - } - } - } - - if (!selectSQL.equals(sqlToBeExecutedInJdbcRDD)) { - logger.info("Re-calculating the connection info as the SQL to be executed is changed ") - val tuple = JdbcAuxiliaryUtilities.getConnectionInfo(sparkSession, - jdbcConnectionUtility, dataSetProps, sqlToBeExecutedInJdbcRDD, loggerOption, partitionColumns) - // below syntax to override compilation error - connectionDetails = tuple._1 - connectionUtilityPerIncomingSQL = tuple._2 - } - - // create JDBC rdd - - logger.info(s"Final SQL for Query Push Down --> $sqlToBeExecutedInJdbcRDD") - val tableSchema = JdbcReadUtility.resolveTable( - mutableJdbcOptions(JdbcConfigs.jdbcUrl), - sqlToBeExecutedInJdbcRDD, - connection - ) - - JdbcAuxiliaryUtilities.createJdbcDataFrame(sparkSession, sqlToBeExecutedInJdbcRDD, - connectionDetails.fetchSize, connectionDetails.numOfPartitions, - connectionUtilityPerIncomingSQL, partitionColumns, tableSchema) - } - } - - def validateAllTablesAreFromSameJdbcSystem(sparkSession: SparkSession, - tables: Seq[String], - sqlToBeExecuted: String): 
(Boolean, Option[Map[String, String]]) = { - val dataSetPropertiesForAllTables: Iterable[Option[DataSetProperties]] = tables.map { - tableName => - Try(CatalogProvider.getDataSetProperties(tableName, mergeAllConfs(sparkSession))).toOption - } - if (dataSetPropertiesForAllTables.nonEmpty && dataSetPropertiesForAllTables.head.isDefined) { - var queryPushDownFlag: Boolean = false - val headJdbcUrl = dataSetPropertiesForAllTables.head.get.props.get(JdbcConfigs.jdbcUrl) - if (headJdbcUrl.isDefined) { - queryPushDownFlag = dataSetPropertiesForAllTables.forall { - dataSetProperty => - dataSetProperty.isDefined && dataSetProperty.get.datasetType == GimelConstants.STORAGE_TYPE_JDBC && - dataSetProperty.get.props.contains(JdbcConfigs.jdbcUrl) && - headJdbcUrl.get.equalsIgnoreCase(dataSetProperty.get.props(JdbcConfigs.jdbcUrl)) - } - } - if (queryPushDownFlag && JdbcAuxiliaryUtilities.validatePushDownQuery(sparkSession, - tables.head, sqlToBeExecuted)) { - // Getting connection info from dataset properties else from the incoming properties - (queryPushDownFlag, Some(JdbcAuxiliaryUtilities.getJDBCOptions( - Map(GimelConstants.DATASET_PROPS -> dataSetPropertiesForAllTables.head.get) - ))) - } else { - (false, None) - } - } else { - (false, None) - } - } - - def validateAllDatasetsAreFromSameJdbcSystem(datasets: Seq[String]): Boolean = { - var areAllDatasetFromSameJdbcSystem: Boolean = false - if (datasets.nonEmpty) { - import com.paypal.gimel.parser.utilities.QueryParserUtils._ - val storageSystemName = Try(extractSystemFromDatasetName(datasets.head)).toOption - if (storageSystemName.isDefined && - CatalogProvider.getStorageSystemProperties( - storageSystemName.get - )(GimelConstants.STORAGE_TYPE) == GimelConstants.STORAGE_TYPE_JDBC) { - areAllDatasetFromSameJdbcSystem = datasets.forall { - dataset => - Try { - val storageSystemProperties = - CatalogProvider.getStorageSystemProperties(extractSystemFromDatasetName(dataset)) - storageSystemProperties(GimelConstants.STORAGE_TYPE) == GimelConstants - .STORAGE_TYPE_JDBC && dataset.contains(storageSystemName.get) - }.getOrElse(false) - } - } - } - areAllDatasetFromSameJdbcSystem - } - - /** - * Returns the flag whether the query has to be pushed down to dataset or not based on dataset provided - * and user supplied flag for pushdown, this method is primarily called for select only clause - * - * @param originalSQL SQLString - * @param selectSQL SQLString - * @param sparkSession : SparkSession - * @param dataSet Dataset Object - * @return String flag to show whether to push down query or not - */ - def getQueryPushDownFlag(originalSQL: String, selectSQL: String, sparkSession: SparkSession, - dataSet: com.paypal.gimel.DataSet): String = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - - logger.info(" @Begin --> " + MethodName) - - val tables = getTablesFrom(selectSQL) - val userSuppliedPushDownFlag: Boolean = getQueryPushDownFlagFromConf(sparkSession) - - var queryPushDownFlag: Boolean = false - if (userSuppliedPushDownFlag && tables.nonEmpty) { - val (queryPushDownFlagR, jdbcOptions) = validateAllTablesAreFromSameJdbcSystem(sparkSession, tables, selectSQL) - if (queryPushDownFlagR) { - // if all the tables are from the same JDBC system then set query pushdown flag to be true - queryPushDownFlag = queryPushDownFlagR - logger.info(s"Since all the datasets are from same JDBC system overriding " + - s"User specified flag: $userSuppliedPushDownFlag -> true " + - s"with JDBC options: $jdbcOptions") - } else { - 
logger.info(s"Atleast one dataset is from an alternate JDBC system overriding " + - s"User specified flag: $userSuppliedPushDownFlag -> false") - } - } - - logger.info(s"queryPushDownFlag for data sets${ArrayUtils.toString(tables)}:" + - s" ${queryPushDownFlag.toString}") - queryPushDownFlag.toString - } - - /** - * Resolves the Query by replacing Tmp Tables in the Query String - * For Each Tmp Table placed in the Query String - a DataSet.read is initiated - * For each Tmp Table - if the dataset is a Kafka DataSet - then each KafkaDataSet object is accumulated - * Accumulated KafkaDataSet Object will be used towards the end of the Query (on success) - - * to call check pointing for each topic consumed - * - * @param sql SQL String - * @param sparkSession : SparkSession - * @param dataSet DataSet - * @return Tuple ( Target Table, select SQL String, List(KafkaDataSet) ) - */ - def resolveSQL(sql: String, sparkSession: SparkSession, dataSet: com.paypal.gimel.DataSet): - (String, Option[String], String, List[GimelDataSet], String) = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - - logger.info("@Begin --> " + MethodName) - - logger.info(s"incoming SQL --> $sql") - val uniformSQL = sql.replace("\n", " ") - val selectClauseOnly = getSelectClause(uniformSQL) - val (originalSQL, selectClause, kafkaDataSets, queryPushDownFlag) = - resolveSQLWithTmpTables(sql, selectClauseOnly, sparkSession, dataSet) - val targetTable = getTargetTables(sql) - logger.info(s"selectClause --> $selectClause") - logger.info(s"destination --> $targetTable") - (originalSQL, targetTable, selectClause, kafkaDataSets, queryPushDownFlag) - } - - /** - * Checks whether partitioned by clause is there so that we can pull out the partitions spec - * - * @param sql - incoming sql string - * @return - Boolean value to see whether partitioned clause presents or not - */ - def existsPartitionedByClause(sql: String): Boolean = { - def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName - logger.info(" @Begin --> " + MethodName) - sql.toUpperCase().contains(GimelConstants.HIVE_DDL_PARTITIONED_BY_CLAUSE) - } - - /** - * Checks the config to see if complete pushdown enabled, - * if enabled returns the transformed SQL and the JDBC options - * - * @param sparkSession -> Created SparkSession - * @param sql -> Incoming SQL to be executed - * @return - */ - def isJdbcCompletePushDownEnabled(sparkSession: SparkSession, - sql: String): (Boolean, Option[String], Option[Map[String, String]]) = { - logger.info(s"@Begin --> ${new Exception().getStackTrace.apply(1).getMethodName}") - val userSuppliedPushDownFlag: Boolean = getQueryPushDownFlagFromConf(sparkSession) - val isSelectQuery = QueryParserUtils.isSelectQuery(sql) - logger.info(s"Is select query: $isSelectQuery") - var resultTuple: (Boolean, Option[String], Option[Map[String, String]]) = (false, None, None) - if (userSuppliedPushDownFlag && !isSelectQuery) { - val tables = getAllTableSources(sql) - // val datasets = SQLDataTypesUtils.getDatasets(sql) - logger.info(s"Received tables: $tables for the query: $sql") - // if sql's target tables are of the same JDBC system - if (validateAllTablesAreFromSameJdbcSystem(sparkSession, tables, sqlToBeExecuted = sql)._1) { - logger.info("All datasets are from the same JDBC system") - // As tables emptiness is checked on the validateAllDatasetsAreFromSameJdbcSystem, getting the tables.head - - val transformedSQL = QueryParserUtils.transformUdcSQLtoJdbcSQL(sql, tables) - import 
com.paypal.gimel.common.utilities.DataSetUtils._ - val systemName = QueryParserUtils.extractSystemFromDatasetName(tables.head) - resultTuple = (true, Some(transformedSQL), Some(getJdbcConnectionOptions(systemName, sparkSession.conf.getAll))) - } else { - logger.info("Not all the datasets are from the same JDBC system") - } - } else if (userSuppliedPushDownFlag && isSelectQuery) { - // Set partitioning to be 1 - // sparkSession.conf.set(JdbcConfigs.jdbcCompletePushdownSelectEnabled, value = true) - logger.info(s"As we received a select query with pushdown flag enabled: $userSuppliedPushDownFlag," + - s" we redirect the output to dataset reader -> Query: $sql") - } - resultTuple - } - - private def getQueryPushDownFlagFromConf(sparkSession: SparkSession): Boolean = { - // User supplied push down flag will be overridden if all the datasets are from the same JDBC system - val userSuppliedPushDownFlag = Try( - sparkSession.conf.get(JdbcConfigs.jdbcPushDownEnabled, "true").toBoolean - ).getOrElse(true) - logger.info(s"User specified pushdown flag: $userSuppliedPushDownFlag") - userSuppliedPushDownFlag - } - - /** - * Utility for executing push down queries on the respective JDBC system, based on the incoming dataset's property - * - * @param sparkSession - * @param sql - * @param jdbcOptions - * @return - */ - def pushDownQueryAndReturnResult(sparkSession: SparkSession, - sql: String, - jdbcOptions: Map[String, String]): String = { - val jdbcConnectionUtility: JDBCConnectionUtility = validateAndGetJdbcConnectionUtility(sparkSession, jdbcOptions) - val functionName = s"[QueryHash: ${sql.hashCode}]" - logger.info(s"Proceeding to execute JDBC[System: ${jdbcConnectionUtility.jdbcSystem}," + - s" User: ${jdbcConnectionUtility.jdbcUser}] pushdown query$functionName: $sql") - GenericUtils.time(functionName, Some(logger)) { - val queryResult: String = - JDBCConnectionUtility.withResources( - JDBCUtilities.getOrCreateConnection(jdbcConnectionUtility, logger = Some(logger)) - ) { - connection => JdbcAuxiliaryUtilities.executeQueryAndReturnResultString(sql, connection) - } - queryResult - } - } - - private def validateAndGetJdbcConnectionUtility(sparkSession: SparkSession, - jdbcOptions: Map[String, String]): JDBCConnectionUtility = { - logger.info(s" @Begin --> ${new Exception().getStackTrace.apply(1).getMethodName}") - logger.info(s"Received JDBC options: $jdbcOptions") - if (!jdbcOptions.contains(JdbcConfigs.jdbcUrl)) { - throw new IllegalArgumentException("No JDBC url found. Please verify the dataset name in query") - } - - JDBCConnectionUtility(sparkSession, jdbcOptions) - } - - def createPushDownQueryDataframe(sparkSession: SparkSession, - sql: String, - jdbcOptions: Map[String, String]): DataFrame = { - val jdbcConnectionUtility: JDBCConnectionUtility = validateAndGetJdbcConnectionUtility(sparkSession, jdbcOptions) - val pushDownJdbcRDD = - new PushDownJdbcRDD(sparkSession.sparkContext, new DbConnection(jdbcConnectionUtility), sql) - sparkSession.createDataFrame(pushDownJdbcRDD, JdbcConstants.DEF_JDBC_PUSH_DOWN_SCHEMA) - } - - private lazy val userInfoString = - s""" - |------------------------------ - |User controllable Properties - |------------------------------ - | - |Query Results & Helper - |---------------------- - |${GimelConstants.SHOW_ROWS_ENABLED} --> Set this to "true" to stop getting all these messages. 
(Default : false) - |${GimelConstants.MAX_RESULTS_TO_SHOW} --> Number of rows to display in interactive mode (Default : 1000) - | - |Data Caching Options - |---------------------- - |${GimelConstants.DATA_CACHE_IS_ENABLED} --> true indicates dataset caching is enabled (Default : false) - |${GimelConstants.DATA_CACHE_IS_ENABLED}.for.pcatalog.flights --> if this = true & ${GimelConstants.DATA_CACHE_IS_ENABLED}=true, then only pcatalog.flights from query will be cached. (Default : false) - |${GimelConstants.DATA_CACHE_IS_ENABLED_FOR_ALL} --> if this = true, then all pcatalog datasets in query will be cached (Default : true) - | - |Logging Level - |---------------------- - |${GimelConstants.LOG_LEVEL} --> set to INFO, DEBUG, WARN, ERROR to get desired level of logging (Default : ERROR) - | - |kafka Checkpointing - |---------------------- - |${KafkaConfigs.kafkaConsumerReadCheckpointKey} --> true indicates check-pointing enabled (Default : true) - |${KafkaConfigs.kafkaConsumerClearCheckpointKey} --> true indicates checkpoint will be cleared before this run begins (Default : false) - | - |kafka Stream Throttle - |---------------------- - |${KafkaConfigs.maxRatePerPartitionKey} --> Spark Configuration for Streaming Rate (Default : 3600, empirically derived) - |${KafkaConfigs.isStreamParallelKey} --> true causes ordering to be lost, but performance gain via parallelism factor. (Default : true) - |${KafkaConfigs.streamParallelKey} --> Number of parallel threads to run while processing data after fetching from kafka (Default : 10) - |${KafkaConfigs.defaultBatchInterval} --> Streaming Window Seconds (Default : 30) - | - |kafka Batch Throttle - |---------------------- - |${KafkaConfigs.rowCountOnFirstRunKey} --> Fetches Only Supplied number of rows from Kafka (Default : 25 Million) - |${KafkaConfigs.maxRecordsPerPartition} --> Advanced options to further restrict how many messages we can read from each partition - in batch mode Kafka Read (Default 25 Million rows, for this to be effective, value should be <= throttle.batch.fetchRowsOnFirstRun) - |${KafkaConfigs.batchFetchSize} --> Advanced options to parallelize in batch mode Kafka Read (Default 250) --> This will parallelize 25 Million into 250 threads - | - |HBase - |----------------------- - |${HbaseConfigs.hbaseOperation} -> Type of operation to be performed on HBase. Can be scan for reading all data or get for lookup - |${HbaseConfigs.hbaseFilter} -> Filter condition for HBase lookup. Example: rowKey=1:toGet=cf1-c1,c2|cf2-c3 - | - |Elastic - |----------------------- - |${ElasticSearchConfigs.esIsPartitioned}-> Is the index partitioned or not ? - |${ElasticSearchConfigs.esDelimiter}-> What is the delimiter which separates the index name with the partition - |${ElasticSearchConfigs.esPartition}-> "*" -> wild card to include all the specific partitions - |${ElasticSearchConfigs.esDefaultReadForAllPartitions}-> flag which indicates whether to read all partitions or not - |${ElasticSearchConfigs.esMapping}-> flag which gets the schema from the user - """.stripMargin -} diff --git a/gimel-dataapi/gimel-sql/src/main/scala/com/paypal/gimel/sql/SQLParser.scala b/gimel-dataapi/gimel-sql/src/main/scala/com/paypal/gimel/sql/SQLParser.scala deleted file mode 100644 index 01be8062..00000000 --- a/gimel-dataapi/gimel-sql/src/main/scala/com/paypal/gimel/sql/SQLParser.scala +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.paypal.gimel.sql - -import scala.collection.mutable.ListBuffer -import scala.util._ - -import org.apache.hadoop.hive.ql.parse.{ASTNode, HiveParser, ParseDriver} - -object SQLParser { - - /** - * getSourceTables - Helper function to call a function which is recursive to get the source table names from the AST - * - * @param sql to be parsed - * @return - List of source table names - */ - - @deprecated - def getSourceTables(sql: String): ListBuffer[String] = { - val parsDri = new ParseDriver() - val ast_tree: ASTNode = parsDri.parse(sql) - getSourceTables(ast_tree) - } - - - /** - * getTargetTables1 - Helper function to call a function which is recursive to get the Target table names from the AST - * - * @param sql to be parsed - * @return - List of target tables if any. If it is select only table, it returns a None. - */ - - def getTargetTables(sql: String): Option[String] = { - Try { - GimelQueryUtils.isHavingInsert(sql) match { - case false => None - case true => - val lSql = sql.toLowerCase() - val tokens = GimelQueryUtils.tokenizeSql(lSql) - val tableIndex = tokens.contains("table") match { - case true => tokens.indexOf("table") - case false => tokens.indexOf("into") - } - Some(tokens(tableIndex + 1)) - } - } match { - case Success(x) => x - case Failure(f) => - throw new Exception( - s""" - |ERROR PARSING SQL IN Gimel --> ${sql} - |Exception --> ${f} - |PLEASE VALIDATE IF SQL IS FORMED CORRECTLY. - """.stripMargin) - } - } - - // TODO - Following two functions can be combined later. - - /** - * getSourceTables - Recursive function to get the source table names - * - * @param from - AST tree - * @param myList - list of source table names - */ - private def getSourceTables(from: ASTNode, - myList: ListBuffer[String] = new ListBuffer[String]()): ListBuffer[String] = { - var table: String = "" - - if (from != null) { - - if (HiveParser.TOK_TABREF == from.getType) { - val tabName = from.getChild(0) - - if (HiveParser.TOK_TABNAME == tabName.getType) { - if (tabName.getChildCount == 2) { - table = tabName.getChild(0).getText + "." + tabName.getChild(1).getText - } else { - table = tabName.getChild(0).getText - } - myList += table - } - } - - for (i <- 0 to from.getChildCount) { - val child = from.getChild(i) - if (child != null) { - getSourceTables(child.asInstanceOf[ASTNode], myList) - } - } - } - myList - } - -} diff --git a/gimel-dataapi/gimel-sql/src/test/scala/com/paypal/gimel/sql/GimelQueryProcessorTest.scala b/gimel-dataapi/gimel-sql/src/test/scala/com/paypal/gimel/sql/GimelQueryProcessorTest.scala deleted file mode 100644 index cffd38d2..00000000 --- a/gimel-dataapi/gimel-sql/src/test/scala/com/paypal/gimel/sql/GimelQueryProcessorTest.scala +++ /dev/null @@ -1,267 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. 
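getTargetTables above locates the write destination purely by token position: tokenize the lowered SQL, find "table" (falling back to "into"), and return the token that follows. A condensed sketch of that lookup, with an illustrative name and a bounds check added for the sketch:

def targetTableOf(sql: String): Option[String] = {
  val tokens = sql.toLowerCase.replace("\n", " ").split(" ").filter(_.nonEmpty)
  val anchor = if (tokens.contains("table")) tokens.indexOf("table") else tokens.indexOf("into")
  if (anchor >= 0 && anchor + 1 < tokens.length) Some(tokens(anchor + 1)) else None
}

// targetTableOf("insert into udc.hive.db.tbl select * from src") == Some("udc.hive.db.tbl")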
- * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.paypal.gimel.sql - -import org.apache.spark.sql.{DataFrame, SparkSession} -import org.scalatest._ - -class GimelQueryProcessorTest extends FunSpec with Matchers with BeforeAndAfter { - var sparkSession : SparkSession = _ - - before { - sparkSession = SparkSession.builder().appName("GimelQueryProcessor Test") - .master("local") - .getOrCreate() - - // HBaseLocalClient.startHbaseCluster(sparkSession) - } - - after { - sparkSession.close() - // HBaseLocalClient.stopHbaseCluster() - } - - /* - * The test cases are in the ignored scope due to https://github.com/elastic/elasticsearch-hadoop/issues/1097 - * To test the following: Change "ignore" to "it" - * Exclude either elasticsearch-hadoop pr elasticsearch-spark from the dependencies by changing their scope to provided in gimel-elasticsearch - * OR Change the elasticsearch-hadoop version to 6.6.0 or 7.0.0 - */ - - ignore("should test json in Hdfs dataset read via sql") { - val gsql: String => DataFrame = com.paypal.gimel.sql.GimelQueryProcessor.executeBatch(_: String, sparkSession) - val className = new com.paypal.gimel.hdfs.DataSet(sparkSession).getClass; - val resourcesPath = "file://" + (className.getResource("/hdfs_test.json")).getPath - val props : Map[String, String] = Map("gimel.hdfs.data.location" -> resourcesPath, - "gimel.hdfs.data.format" -> "json", - "gimel.hive.db.name" -> "db", - "gimel.hive.table.name" -> "table", - "gimel.hdfs.nn" -> "file:/") - gsql("set gimel.catalog.provider=USER") - val dataSetProperties = s""" - { - "datasetType" : "HDFS", - "fields" : [], - "partitionFields" : [], - "props": { - "gimel.hdfs.data.location" : "$resourcesPath", - "gimel.hdfs.data.format" : "json", - "gimel.hive.db.name" : "db", - "gimel.hive.table.name" : "table", - "gimel.hdfs.nn" : "file:/", - "datasetName" : "MyDataset" - } - }""" - gsql("set gimel.catalog.provider=USER") - gsql(s"""set udc.hdfs.json.dataSetProperties=$dataSetProperties""") - val res: DataFrame = gsql(s"""select * from udc.hdfs.json""") - assert(res.count() == 1) - } - - ignore("should test csv in Hdfs dataset read via sql") { - val gsql: String => DataFrame = com.paypal.gimel.sql.GimelQueryProcessor.executeBatch(_: String, sparkSession) - val className = new com.paypal.gimel.hdfs.DataSet(sparkSession).getClass; - val resourcesPath = "file://" + (className.getResource("/hdfs_test.csv")).getPath - val props : Map[String, String] = Map("gimel.hdfs.data.location" -> resourcesPath, - "gimel.hdfs.data.format" -> "json", - "gimel.hive.db.name" -> "db", - "gimel.hive.table.name" -> "table", - "gimel.hdfs.nn" -> "file:/") - gsql("set gimel.catalog.provider=USER") - val dataSetProperties = s""" - { - "datasetType" : "HDFS", - "fields" : [], - "partitionFields" : [], - 
"props": { - "gimel.hdfs.data.location" : "$resourcesPath", - "gimel.hdfs.data.format" : "json", - "gimel.hive.db.name" : "db", - "gimel.hive.table.name" : "table", - "gimel.hdfs.nn" : "file:/", - "datasetName" : "MyDataset" - } - }""" - gsql("set gimel.catalog.provider=USER") - gsql(s"""set udc.hdfs.json.dataSetProperties=$dataSetProperties""") - val res: DataFrame = gsql(s"""select * from udc.hdfs.json""") - assert(res.count() == 1) - } - - ignore("should test text in Hdfs dataset read via sql") { - val gsql: String => DataFrame = com.paypal.gimel.sql.GimelQueryProcessor.executeBatch(_: String, sparkSession) - val className = new com.paypal.gimel.hdfs.DataSet(sparkSession).getClass; - val resourcesPath = "file://" + (className.getResource("/hdfs_test.txt")).getPath - val props : Map[String, String] = Map("gimel.hdfs.data.location" -> resourcesPath, - "gimel.hdfs.data.format" -> "json", - "gimel.hive.db.name" -> "db", - "gimel.hive.table.name" -> "table", - "gimel.hdfs.nn" -> "file:/") - gsql("set gimel.catalog.provider=USER") - val dataSetProperties = s""" - { - "datasetType" : "HDFS", - "fields" : [], - "partitionFields" : [], - "props": { - "gimel.hdfs.data.location" : "$resourcesPath", - "gimel.hdfs.data.format" : "json", - "gimel.hive.db.name" : "db", - "gimel.hive.table.name" : "table", - "gimel.hdfs.nn" : "file:/", - "datasetName" : "MyDataset" - } - }""" - gsql("set gimel.catalog.provider=USER") - gsql(s"""set udc.hdfs.json.dataSetProperties=$dataSetProperties""") - val res: DataFrame = gsql(s"""select * from udc.hdfs.json""") - assert(res.count()==1) - } - - ignore("should test avro in Hdfs dataset read via sql") { - val gsql: String => DataFrame = com.paypal.gimel.sql.GimelQueryProcessor.executeBatch(_: String, sparkSession) - val className = new com.paypal.gimel.hdfs.DataSet(sparkSession).getClass; - val resourcesPath = "file://" + (className.getResource("/hdfs_test.avro")).getPath - val props : Map[String, String] = Map("gimel.hdfs.data.location" -> resourcesPath, - "gimel.hdfs.data.format" -> "json", - "gimel.hive.db.name" -> "db", - "gimel.hive.table.name" -> "table", - "gimel.hdfs.nn" -> "file:/") - gsql("set gimel.catalog.provider=USER") - val dataSetProperties = s""" - { - "datasetType" : "HDFS", - "fields" : [], - "partitionFields" : [], - "props": { - "gimel.hdfs.data.location" : "$resourcesPath", - "gimel.hdfs.data.format" : "json", - "gimel.hive.db.name" : "db", - "gimel.hive.table.name" : "table", - "gimel.hdfs.nn" : "file:/", - "datasetName" : "MyDataset" - } - }""" - gsql("set gimel.catalog.provider=USER") - gsql(s"""set udc.hdfs.json.dataSetProperties=$dataSetProperties""") - val res: DataFrame = gsql(s"""select * from udc.hdfs.json""") - assert(res.count() == 2) - } - - ignore("should test gz in Hdfs dataset read via sql") { - val gsql: String => DataFrame = com.paypal.gimel.sql.GimelQueryProcessor.executeBatch(_: String, sparkSession) - val className = new com.paypal.gimel.hdfs.DataSet(sparkSession).getClass; - val resourcesPath = "file://" + (className.getResource("/hdfs_test.txt.gz")).getPath - val props : Map[String, String] = Map("gimel.hdfs.data.location" -> resourcesPath, - "gimel.hdfs.data.format" -> "json", - "gimel.hive.db.name" -> "db", - "gimel.hive.table.name" -> "table", - "gimel.hdfs.nn" -> "file:/") - gsql("set gimel.catalog.provider=USER") - val dataSetProperties = s""" - { - "datasetType" : "HDFS", - "fields" : [], - "partitionFields" : [], - "props": { - "gimel.hdfs.data.location" : "$resourcesPath", - "gimel.hdfs.data.format" : "json", - 
"gimel.hive.db.name" : "db", - "gimel.hive.table.name" : "table", - "gimel.hdfs.nn" : "file:/", - "datasetName" : "MyDataset" - } - }""" - gsql("set gimel.catalog.provider=USER") - gsql(s"""set udc.hdfs.json.dataSetProperties=$dataSetProperties""") - val res: DataFrame = gsql(s"""select * from udc.hdfs.json""") - assert(res.count() == 1) - } - - ignore("should test sequence in Hdfs dataset read via sql") { - val gsql: String => DataFrame = com.paypal.gimel.sql.GimelQueryProcessor.executeBatch(_: String, sparkSession) - val className = new com.paypal.gimel.hdfs.DataSet(sparkSession).getClass; - val resourcesPath = "file://" + (className.getResource("/hdfs_test.seq")).getPath - val props : Map[String, String] = Map("gimel.hdfs.data.location" -> resourcesPath, - "gimel.hdfs.data.format" -> "json", - "gimel.hive.db.name" -> "db", - "gimel.hive.table.name" -> "table", - "gimel.hdfs.nn" -> "file:/") - gsql("set gimel.catalog.provider=USER") - val dataSetProperties = s""" - { - "datasetType" : "HDFS", - "fields" : [], - "partitionFields" : [], - "props": { - "gimel.hdfs.data.location" : "$resourcesPath", - "gimel.hdfs.data.format" : "json", - "gimel.hive.db.name" : "db", - "gimel.hive.table.name" : "table", - "gimel.hdfs.nn" : "file:/", - "datasetName" : "MyDataset" - } - }""" - gsql("set gimel.catalog.provider=USER") - gsql(s"""set udc.hdfs.json.dataSetProperties=$dataSetProperties""") - val res: DataFrame = gsql(s"""select * from udc.hdfs.json""") - assert(res.count() == 100) - } - -// ignore("should test hbase write") { -// val tableName = "test_table" -// val gsql: String => DataFrame = com.paypal.gimel.scaas.GimelQueryProcessor.executeBatch(_: String, sparkSession) -// gsql("set " + HbaseConfigs.hbaseRowKey + "=id") -// gsql("set " + HbaseConfigs.hbaseColumnMappingKey + "=personal:name,personal:address,personal:age,professional:company,professional:designation,professional:salary") -// val dataFrame = HBaseLocalClient.mockDataInDataFrame(sparkSession, 1000) -// dataFrame.registerTempTable("input_table") -// val sql = "insert into HBase.Local.default." + tableName + " select * from input_table" -// val df = gsql(sql) -// df.show -// } -// -// ignore("should test hbase read with limit") { -// val metrics = ArrayBuffer.empty[(String, QueryExecution, Long)] -// val listener = new QueryExecutionListener { -// // Only test successful case here, so no need to implement `onFailure` -// override def onFailure(funcName: String, qe: QueryExecution, exception: Exception): Unit = {} -// override def onSuccess(funcName: String, qe: QueryExecution, duration: Long): Unit = { -// metrics += ((funcName, qe, duration)) -// } -// } -// sparkSession.listenerManager.register(listener) -// val tableName = "test_table" -// val gsql: String => DataFrame = com.paypal.gimel.scaas.GimelQueryProcessor.executeBatch(_: String, sparkSession) -// gsql("set " + HbaseConfigs.hbaseRowKey + "=id") -// gsql("set " + HbaseConfigs.hbaseColumnMappingKey + "=personal:name,personal:address,personal:age,professional:company,professional:designation,professional:salary") -// sparkSession.conf.set(GimelConstants.HBASE_PAGE_SIZE, 20) -// val sql = "select * from HBase.Local.default." 
+ tableName + " limit 20" -// val df = gsql(sql) -// df.show(20) -// val metricInsertQuery = metrics(metrics.length - 1) -// val qe = metricInsertQuery._2 -// println(qe.executedPlan.children(0).children(0).children(0).metrics) -// val kafkaReadOutputRows = qe.executedPlan.children(0).children(0).children(0).metrics("numOutputRows").value -// assert(kafkaReadOutputRows == 20) -// sparkSession.conf.unset(GimelConstants.HBASE_PAGE_SIZE) -// -// } - -} diff --git a/gimel-dataapi/gimel-sql/src/test/scala/com/paypal/gimel/sql/GimelQueryUtilsSpec.scala b/gimel-dataapi/gimel-sql/src/test/scala/com/paypal/gimel/sql/GimelQueryUtilsSpec.scala deleted file mode 100644 index 812dfd43..00000000 --- a/gimel-dataapi/gimel-sql/src/test/scala/com/paypal/gimel/sql/GimelQueryUtilsSpec.scala +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.paypal.gimel.sql - -import org.scalatest.{BeforeAndAfterEach, FunSpec, Matchers} - -class GimelQueryUtilsSpec - extends FunSpec - with SharedSparkSession - with Matchers - with BeforeAndAfterEach { - - // add things to do before each test for this specific files - protected override def beforeEach(): Unit = { - GimelQueryUtils.setCatalogProvider("UDC") - } - - // add things to do after each test for this specific files - protected override def afterEach(): Unit = { - GimelQueryUtils.setCatalogProvider("UDC") - } - - describe("setCatalogProvider") { - it("should set user specified CatalogProvider") { - - // UDC Catalog provider - GimelQueryUtils.setCatalogProvider("UDC") - GimelQueryUtils.getCatalogProvider() should be("UDC") - - GimelQueryUtils.setCatalogProvider("HIVE") - GimelQueryUtils.getCatalogProvider() should be("HIVE") - - GimelQueryUtils.setCatalogProvider("PCATALOG") - GimelQueryUtils.getCatalogProvider() should be("PCATALOG") - - } - - it("should throw warning if the catalog provider is not UDC/HIVE/PCATALOG") { - GimelQueryUtils.setCatalogProvider("TEST") - GimelQueryUtils.getCatalogProvider() should be("UDC") - - } - } - - describe("tokenizeSql") { - it("should tokenize the string passed to it") { - GimelQueryUtils.tokenizeSql(SQLMasterList.simpleInsertSelect2) should be( - Array( - "INSERT", - "INTO", - "UDC.Mysql.datalake.test.YELP_REVIEW_WRITE", - "SELECT", - "*", - "FROM", - "udc.kafka.tau.yelp.review" - ) - ) - } - } - -} diff --git a/gimel-dataapi/gimel-sql/src/test/scala/com/paypal/gimel/sql/GimelQueryUtilsTest.scala b/gimel-dataapi/gimel-sql/src/test/scala/com/paypal/gimel/sql/GimelQueryUtilsTest.scala deleted file mode 100644 index 7066c9ae..00000000 --- a/gimel-dataapi/gimel-sql/src/test/scala/com/paypal/gimel/sql/GimelQueryUtilsTest.scala +++ /dev/null @@ -1,312 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. 
- * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.paypal.gimel.sql - -import scala.util.{Failure, Success, Try} - -import org.apache.commons.lang3.StringUtils -import org.mockito.Mockito._ -import org.scalamock.scalatest.MockFactory -import org.scalatest._ - -import com.paypal.gimel.common.catalog.CatalogProvider -import com.paypal.gimel.common.gimelservices.{GimelServicesProperties, GimelServiceUtilities} -import com.paypal.gimel.parser.utilities.{QueryParserUtils, SearchSchemaUtils} - -class GimelQueryUtilsTest extends FunSuite with Matchers with MockFactory { - - import com.paypal.gimel.parser.utilities.QueryParserUtils._ - - test("Test Extraction of tablename ") { - assert( - extractTableName( - "udc.teradata.test_cluster.yelp.review" - ) == "yelp.review" - ) - assert( - extractTableName( - "udc.Teradata.Test_cluster.yelp.business_details" - ) == "yelp.business_details" - ) - val tableName = "udc.teradata.test_cluster.yelp.review" - assert(extractTableName(tableName, 1) === "review") - assert(extractTableName(tableName, 2) === "yelp.review") - testErrorCase(extractTableName("", 2)) - testErrorCase( - extractTableName("yelp.review", 2) - ) - testErrorCase(extractTableName(null, 2)) - testErrorCase(extractTableName("tablename") === "tablename") - } - - test("Execute query ") { - println( - QueryParserUtils.isQueryOfGivenSeqType( - "sel * from udc.Teradata.Test_cluster.yelp.business_details sample 10;" - ) - ) - println( - QueryParserUtils.isQueryOfGivenSeqType( - "sel * fromudc.Teradata.Test_cluster.yelp.reviews sample 10;" - ) - ) - println( - QueryParserUtils.isQueryOfGivenSeqType( - "DELETE udc.Teradata.Test_cluster.yelp.business_details ALL" - ) - ) - } - - test("Transform SQL name") { - println( - QueryParserUtils.isQueryOfGivenSeqType( - "show select * from udc.Teradata.Test_cluster.yelp.business_details" - ) - ) - validateTransformSQL( - "DELETE udc.Teradata.Test_cluster.yelp.business_details ALL", - "DELETE yelp.business_details ALL" - ) - validateTransformSQL( - "show select * from udc.Teradata.Test_cluster.yelp.business_details", - "show select * from yelp.business_details" - ) - validateTransformSQL( - """ INSERT INTO udc.Teradata.Test_cluster.yelp.business_details ( - | id, - | created_date ) - | VALUES ('123fvf', '2019-08-09')""".stripMargin, - """ INSERT INTO yelp.business_details ( - | id, - | created_date ) - | VALUES ('123fvf', '2019-08-09')""".stripMargin - ) - } - - def testErrorCase[R](block: => R): Option[R] = { - Try(block) match { - case Success(value) => Option(value) - case Failure(exception) => - exception.printStackTrace() - None - } - } - - test("Replace SQL") { - val sql = - """INSERT INTO udc.teradata.test_cluster.yelp.business_details VALUES ('123fvfv', - |'2019-08-05')""".stripMargin - var 
transformedSQL = sql - val tables = GimelQueryUtils.getAllTableSources( - sql, - searchList = SearchSchemaUtils.ALL_TABLES_SEARCH_CRITERIA - ) - println("Tables -> " + tables) - tables.foreach( - tableName => - transformedSQL = - transformedSQL.replaceAll(tableName, extractTableName(tableName)) - ) - println("transformedSQL -> " + transformedSQL) - assert( - transformedSQL === - """INSERT INTO yelp.business_details VALUES ('123fvfv', - |'2019-08-05')""".stripMargin - ) - } - - private def validateTransformSQL(sql: String, assertString: String) = { - assert( - transformSQL(sql, QueryParserUtils.getDatasets(sql)) === assertString - ) - } - - def transformSQL(sql: String, datasets: Seq[String]): String = { - var transformedSQL = sql - datasets.foreach( - datasetName => - transformedSQL = StringUtils.replaceIgnoreCase( - transformedSQL, - datasetName, - extractTableName(datasetName) - ) - ) - - transformedSQL - } - - ignore("No connection to UDC service: TC 2") { - test("validateAllDatasetsAreFromSameJdbcSystem") { - - val gimelServiceProps = spy(new GimelServicesProperties()) - val serviceUtilities = mock[GimelServiceUtilities] - when( - serviceUtilities - .getSystemAttributesMapByName("udc.teradata.test_cluster.yelp.review") - ).thenReturn( - Map( - "gimel.storage.type" -> "JDBC", - "gimel.jdbc.url" -> "jdbc:teradata://teradata-host", - "gimel.jdbc.driver.class" -> "com.teradata.jdbc.TeraDriver", - "storageSystemID" -> "11" - ) - ) - println( - CatalogProvider - .getStorageSystemProperties("udc.teradata.test_cluster.yelp.review") - ) - println("Hello") - } - } - - test("extractSystemFromDatasetName") { - assert( - extractSystemFromDatasetName("udc.teradata.test_cluster.yelp.review") === "teradata.test_cluster" - ) - try { - extractSystemFromDatasetName("yelp.review") - } catch { - case e: IllegalStateException => e.printStackTrace() - } - try { - extractSystemFromDatasetName(null) - } catch { - case e: IllegalArgumentException => e.printStackTrace() - } - assert( - extractSystemFromDatasetName("udc. 
kafka.test_cluster.yelp.review ") === "kafka.test_cluster" - ) - } - - test(" IS Select Quey") { - assert( - QueryParserUtils.isSelectQuery( - "select * from udc.teradata.test_cluster.yelp.review sample 10;" - ) - ) - } - - test(" getTablesFrom SQL ") { - assert( - GimelQueryUtils - .getTablesFrom("help table udc.teradata.test_cluster.yelp.review;") - .sameElements(Array("udc.teradata.test_cluster.yelp.review")) - ) - - assert( - GimelQueryUtils - .getAllTableSources( - "help table udc.teradata.test_cluster.yelp.review;", - searchList = SearchSchemaUtils.TARGET_TABLES_SEARCH_CRITERIA - ) == List("udc.teradata.test_cluster.yelp.review") - ) - - assert( - GimelQueryUtils - .getAllTableSources( - """ - |create multiset table ${targetDb}.enriched_data as - |select - | review.review_id, - | review.review_text, - | review.user_id, - | review.review_date, - | review.business_id, - | business_details.name as business_name, - | postal_geo_map.latitude as business_latitude, - | postal_geo_map.longitude as business_longitude, - | yelp_user.name as user_name, - | yelp_user.review_count as user_review_count, - | yelp_user.yelping_since as user_yelping_since - |from - | pcatalog.teradata.tau.yelp.review review - |inner join - | pcatalog.teradata.tau.yelp.business_details business_details - |on - | review.business_id = business_details.business_id - |join - | pcatalog.teradata.tau.yelp.business_address business_address - |on - | review.business_id = business_address.business_id - |join - | pcatalog.teradata.tau.yelp.user yelp_user - |on - | yelp_user.user_id = review.user_id - |join - | pcatalog.teradata.tau.yelp.postal_geo_map - |on - | business_address.postal_code = postal_geo_map.postal_code - |where - | review.review_date > current_date -150 - |and - | review.business_id = 'ogpiys3gnfZNZBTEJw5-1Q' - |""".stripMargin, - searchList = SearchSchemaUtils.ALL_TABLES_SEARCH_CRITERIA - ).sorted.sameElements(Array( - "pcatalog.teradata.tau.yelp.review", - "${targetdb}.enriched_data", - "pcatalog.teradata.tau.yelp.business_details", - "pcatalog.teradata.tau.yelp.business_address", - "pcatalog.teradata.tau.yelp.user", - "pcatalog.teradata.tau.yelp.postal_geo_map" - ).sorted) - ) - } - - // Substitutes dataset name with tmp table in sql using regex - test ("getSQLWithTmpTable") { - // Should match as "udc.hive.test.flights" is preceded by space and is at end of the line - assert(GimelQueryUtils.getSQLWithTmpTable("select * from udc.hive.test.flights", - "udc.hive.test.flights", - "tmp_flights") - == "select * from tmp_flights") - - // Should not match as "udc.hive.test.flights" is not preceded by any white space - assert(GimelQueryUtils.getSQLWithTmpTable("select * fromudc.hive.test.flights", - "udc.hive.test.flights", - "tmp_flights") - == "select * fromudc.hive.test.flights") - - // Should not match as "udc.hive.test.flights" is not followed by any white space, ; or , - assert(GimelQueryUtils.getSQLWithTmpTable("select * from udc.hive.test.flights_schedule", - "udc.hive.test.flights", - "tmp_flights") - == "select * from udc.hive.test.flights_schedule") - - // Should match as "udc.hive.test.flights" is preceded by space and followed by new line - assert(GimelQueryUtils.getSQLWithTmpTable("select * from udc.hive.test.flights\n", - "udc.hive.test.flights", - "tmp_flights") - == "select * from tmp_flights\n") - - // Should match as "udc.hive.test.flights" is preceded by space and followed by , - assert(GimelQueryUtils.getSQLWithTmpTable("select * from udc.hive.test.flights, udc.hive.test.flights_schedule", - 
"udc.hive.test.flights", - "tmp_flights") - == "select * from tmp_flights, udc.hive.test.flights_schedule") - - // Should match as "udc.hive.test.flights" is preceded and followed by space - assert(GimelQueryUtils.getSQLWithTmpTable( - "select * from udc.hive.test.flights where flights_id = 123", - "udc.hive.test.flights", - "tmp_flights") - == "select * from tmp_flights where flights_id = 123") - } -} diff --git a/gimel-dataapi/gimel-sql/src/test/scala/com/paypal/gimel/sql/SQLMasterList.scala b/gimel-dataapi/gimel-sql/src/test/scala/com/paypal/gimel/sql/SQLMasterList.scala deleted file mode 100644 index a784c424..00000000 --- a/gimel-dataapi/gimel-sql/src/test/scala/com/paypal/gimel/sql/SQLMasterList.scala +++ /dev/null @@ -1,239 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.paypal.gimel.sql - -object SQLMasterList { - - val insertSQL1 = - """ - |INSERT INTO data_source_tab2 PARTITION (p1 = 'part1', p2) - | SELECT id, 'part' || id FROM RANGE(1, 3) - | - """.stripMargin - - val insertSQL2 = - """ - |INSERT INTO table data_source_tab2 PARTITION (p1 = 'part1', p2) - | SELECT id, 'part' || id FROM RANGE(1, 3) - | - """.stripMargin - - val insertSQL3 = - """ - |INSERT OVERWRITE TABLE data_source_tab2 PARTITION (p1 = 'partNew1', p2) - | VALUES (3, 'partNew2') - """.stripMargin - - val insertSQL4 = - """ - |INSERT OVERWRITE TABLE data_source_tab2 PARTITION (p1 = 'part1', p2) - | VALUES (5, 'part1') - """.stripMargin - - val insertSQL5 = - """insert into pcatalog.Elastic_Test_Cluster_yelp_review_data - |select * from pcatalog.kafka_test_cluster_yelp_review - | - """.stripMargin - - val insertSQL6 = - """ - |insert into tgt select * from src - """.stripMargin - - val insertPartitionedTable1 = - """ - |INSERT INTO TABLE temp_table2 PARTITION(col1) SELECT col1, col2, col3, col4 FROM temp_table1; - """.stripMargin - - val baSQL4 = - """ - |select yelp_user.*, last_user_review.last_review_date - |from pcatalog.teradata.tau.yelp.user yelp_user - |join ( - |select user_id, - |max(review_date) as last_review_date - |from pcatalog.teradata.tau.yelp.review review - |group by user_id - |) last_user_review - |on yelp_user.user_id = last_user_review.user_id - |where yelp_user.review_count > 100 - |and yelp_user.useful> 100 - |and fans > 100 - |and last_user_review.last_review_date < current_date - 180 - |sample 10 - """.stripMargin - - val sqlANSIanNonANSI = - """select t1.* from - |(select - |b.* - |from - |a k,b l, c l, d k ) join t2 - |on t1.id = t2.id where - |1=2 and - |(select * from k , l where k.id = l.id) - """.stripMargin - - val sqlANSISimple = - """ - |select - |b.* - |from - |a k,b l, c l, d k - """.stripMargin - - val sqlANSISimpleSubQuery = - """ - |select t.* from - |(select - |b.* - 
|from - |a k,b l, c l, d k ) t - """.stripMargin - - val sqlANSIOnly = - """ - |select * from - |testdb.emp d - |left join emp_loc f - |on d.id = f.id - """.stripMargin - val plainSQL = - """ - |select - | * from abc; - """.stripMargin - - val innerSQL = - """ - |select * from (select * from a) tbl - """.stripMargin - - val joinANSISQLViaUDC = - """ - |select t2.c1, t2.c2 - |, t1.* - |from - |testdb.emp t1 - |join ( - |select f1.c11, f2.c11 - |from udc.kafka.test.emp.address f1 - |join udc.kafka.test.emp.loc f2 - |on f1.id = f2.id - |) t2 - |on t1.id = t2.id - """.stripMargin - - - val mixANSINonANSISQL = - """ - |select * from - |testdb.emp s join - |( - |select - |* from - |udc.kafka.test.test.emp a, udc.hive.test_cluster.test.calendar b, c - |where a.id = b.id - |and c.id1 = c.id1 - |) t - |on s.key = b.key - |where 1= 2 - |and exists (select 1 from udc.teradata.test_cluster.testdb.lkp where lkp.id3 = s.id3); - | - """.stripMargin - - val commaTypeSQL = - """ - |select f.*, d.* - |from f , d - |where f.id = d.id - """.stripMargin - - val mixCaseSQL = "select * FRom tmp" - - val simpleSelect1 = "SELECT * FROM UDC.Mysql.datalake.test.YELP_REVIEW_READ" - - val simpleInsertSelect1 = - """ - | INSERT INTO UDC.Mysql.datalake.test.YELP_REVIEW_WRITE - | SELECT * FROM udc.kafka.tau.yelp.review - """.stripMargin - - val simpleInsertSelect2 = - "INSERT INTO UDC.Mysql.datalake.test.YELP_REVIEW_WRITE \n " + - "SELECT * FROM udc.kafka.tau.yelp.review" + "\t" + " " - - // All DDLs are addressed here - - val simpleCreateDDL = - "CREATE table tempTable (ageField int)" - - val complexCreateDDL = - """CREATE EXTERNAL TABLE pcatalog.elastic_smoke_test(data string) - |STORED AS TEXTFILE\nLOCATION 'hdfs:///tmp/pcatalog/elastic_smoke_test' - |TBLPROPERTIES ( - |'gimel.storage.type' = 'ELASTIC_SEARCH', - |'es.mapping.date.rich' = 'true', - |'es.nodes' = 'http://es-host', - |'es.port' = '8080', - |'es.resource' = 'flights/data') - """.stripMargin - - val dropIfExistsDDL = - """DROP TABLE IF EXISTS pcatalog.elastic_smoke_test""" - - val dropPlainDDL = - """DROP TABLE pcatalog.elastic_smoke_test""" - - val dropIfExistsViewDDL = - """DROP TABLE IF EXISTS pcatalog.elastic_smoke_test""" - - val dropPlainViewDDL = - """DROP TABLE pcatalog.elastic_smoke_test""" - - val truncateTableDDL = - """TRUNCATE TABLE pcatalog.elastic_smoke_test""" - - val createTablePattern = - """CREATE TABLE udc.mive.test_cluster.default.temp age (int)""" - - val createExternalTablePattern = - """CREATE EXTERNAL TABLE udc.mive.test_cluster.default.temp age (int)""" - - val multisetPattern = - """CREATE MULTISET TABLE udc.mive.test_cluster.default.temp age (int)""" - - val setPattern = - """CREATE SET TABLE udc.mive.test_cluster.default.temp age (int)""" - - val dropTablePattern = - """DROP TABLE udc.mive.test_cluster.default.temp""" - - val truncateTablePattern = - """TRUNCATE TABLE udc.mive.test_cluster.default.temp""" - - val deleteFromPattern = - """DELETE FROM udc.mive.test_cluster.default.temp""" - - val deletePattern = - """DELETE udc.mive.test_cluster.default.temp""" - -} diff --git a/gimel-dataapi/gimel-sql/src/test/scala/com/paypal/gimel/sql/SQLParseDDLSpec.scala b/gimel-dataapi/gimel-sql/src/test/scala/com/paypal/gimel/sql/SQLParseDDLSpec.scala deleted file mode 100644 index c5d39dab..00000000 --- a/gimel-dataapi/gimel-sql/src/test/scala/com/paypal/gimel/sql/SQLParseDDLSpec.scala +++ /dev/null @@ -1,172 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. 
- * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.paypal.gimel.sql - -import org.scalatest.{BeforeAndAfterEach, FunSpec, Matchers} - -class SQLParseDDLSpec - extends FunSpec - with SharedSparkSession - with Matchers - with BeforeAndAfterEach { - - // add things to do before each test for this specific files - protected override def beforeEach(): Unit = { - GimelQueryUtils.setCatalogProvider("UDC") - } - - // add things to do after each test for this specific files - protected override def afterEach(): Unit = { - GimelQueryUtils.setCatalogProvider("UDC") - } - - describe("DROP TABLE TEMP TABLE") { - it("It should return true") { - - GimelQueryUtils.isDropTableATempTable("DROP TABLE basu", spark) should be( - false - ) - } - } - - describe("DROP TABLE with IF exists") { - it("It should return true") { - - GimelQueryUtils.isDDL(SQLMasterList.dropIfExistsDDL, spark) should be( - true - ) - } - } - - describe("DROP TABLE without IF exists") { - it("It should return true") { - - GimelQueryUtils.isDDL(SQLMasterList.dropPlainDDL, spark) should be(true) - } - } - - describe("DROP view with IF exists") { - it("It should return true") { - - GimelQueryUtils.isDDL(SQLMasterList.dropIfExistsViewDDL, spark) should be( - true - ) - } - } - - describe("DROP view without IF exists") { - it("It should return true") { - - GimelQueryUtils.isDDL(SQLMasterList.dropPlainViewDDL, spark) should be( - true - ) - } - } - - describe("truncate table") { - it("It should return true") { - GimelQueryUtils.isDDL(SQLMasterList.truncateTableDDL, spark) should be( - true - ) - } - } - - describe("Complex Create External table") { - it("It should return true") { - - GimelQueryUtils.isDDL(SQLMasterList.complexCreateDDL, spark) should be( - true - ) - } - } - - describe("createTablePattern") { - it("It should return true") { - - GimelQueryUtils.isUDCDataDefinition(SQLMasterList.createTablePattern) should be( - true - ) - } - } - - describe("createExternalTablePattern") { - it("It should return true") { - - GimelQueryUtils.isUDCDataDefinition( - SQLMasterList.createExternalTablePattern - ) should be(true) - } - } - - describe("multisetPattern") { - it("It should return true") { - - GimelQueryUtils.isUDCDataDefinition(SQLMasterList.multisetPattern) should be( - true - ) - } - } - - describe("setPattern") { - it("It should return true") { - - GimelQueryUtils.isUDCDataDefinition(SQLMasterList.setPattern) should be( - true - ) - } - } - - describe("dropTablePattern") { - it("It should return true") { - - GimelQueryUtils.isUDCDataDefinition(SQLMasterList.dropTablePattern) should be( - true - ) - } - } - - describe("truncateTablePattern") { - it("It should return true") { - - GimelQueryUtils.isUDCDataDefinition(SQLMasterList.truncateTablePattern) should be( - true - ) - } - 
} - - describe("deleteFromPattern") { - it("It should return true") { - - GimelQueryUtils.isUDCDataDefinition(SQLMasterList.deleteFromPattern) should be( - true - ) - } - } - - describe("deletePattern") { - it("It should return true") { - - GimelQueryUtils.isUDCDataDefinition(SQLMasterList.deletePattern) should be( - true - ) - } - } - -} diff --git a/gimel-dataapi/gimel-sql/src/test/scala/com/paypal/gimel/sql/SQLParserSourceTableSpec.scala b/gimel-dataapi/gimel-sql/src/test/scala/com/paypal/gimel/sql/SQLParserSourceTableSpec.scala deleted file mode 100644 index 8d5d7d65..00000000 --- a/gimel-dataapi/gimel-sql/src/test/scala/com/paypal/gimel/sql/SQLParserSourceTableSpec.scala +++ /dev/null @@ -1,254 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.paypal.gimel.sql - -import org.scalatest.{FunSpec, Matchers} - -import com.paypal.gimel.logger.Logger -import com.paypal.gimel.parser.utilities.SQLNonANSIJoinParser -import com.paypal.gimel.sql.SQLMasterList._ - -class SQLParserSourceTableSpec extends FunSpec with Matchers { - - private val logger = Logger(this.getClass.getName) - - describe("getSourceTablesFromNonAnsi") { - it("should pick correct table names from the SELECT QUERY") { - - SQLNonANSIJoinParser.getSourceTablesFromNonAnsi(sqlANSIanNonANSI) should equal( - List("a", "b", "c", "d", "k", "l") - ) - SQLNonANSIJoinParser.getSourceTablesFromNonAnsi(sqlANSISimple) should equal( - List("a", "b", "c", "d") - ) - SQLNonANSIJoinParser.getSourceTablesFromNonAnsi(sqlANSISimpleSubQuery) should equal( - List("a", "b", "c", "d") - ) - SQLNonANSIJoinParser.getSourceTablesFromNonAnsi(sqlANSIOnly) should equal( - List() - ) - SQLNonANSIJoinParser.getSourceTablesFromNonAnsi(joinANSISQLViaUDC) should equal( - List() - ) - SQLNonANSIJoinParser.getSourceTablesFromNonAnsi(mixANSINonANSISQL) should equal( - List( - "udc.kafka.test.test.emp", - "udc.hive.test_cluster.test.calendar", - "c", - "udc.teradata.test_cluster.testdb.lkp" - ) - ) - } - } - - describe("getAll UDC TableSources") { - it("should pick correct table names from the SELECT QUERY") { - - GimelQueryUtils.getTablesFrom(sqlANSIanNonANSI) should equal(List()) - GimelQueryUtils.getTablesFrom(sqlANSISimple) should equal(List()) - GimelQueryUtils.getTablesFrom(sqlANSISimpleSubQuery) should equal(List()) - GimelQueryUtils.getTablesFrom(sqlANSIOnly) should equal(List()) - GimelQueryUtils.getTablesFrom(mixANSINonANSISQL) should equal( - List( - "udc.kafka.test.test.emp", - "udc.hive.test_cluster.test.calendar", - "udc.teradata.test_cluster.testdb.lkp" - ) - ) - GimelQueryUtils.getTablesFrom(baSQL4) should equal( - List( - "pcatalog.teradata.tau.yelp.review", - "pcatalog.teradata.tau.yelp.user" - ) - ) - 
GimelQueryUtils.getTablesFrom(joinANSISQLViaUDC).sorted should equal( - List( - "udc.kafka.test.emp.address", - "udc.kafka.test.emp.loc" - ).sorted - ) - GimelQueryUtils.getAllTableSources(joinANSISQLViaUDC).sorted should equal( - List( - "testdb.emp", - "udc.kafka.test.emp.address", - "udc.kafka.test.emp.loc" - ).sorted - ) - } - } - - describe("get All Source Tables") { - it("should pick correct table names from the SELECT QUERY") { - - GimelQueryUtils.getAllTableSources(sqlANSIanNonANSI) should equal( - List("a", "b", "c", "d", "k", "l", "t2") - ) - GimelQueryUtils.getAllTableSources(sqlANSISimple) should equal( - List("a", "b", "c", "d") - ) - GimelQueryUtils.getAllTableSources(sqlANSISimpleSubQuery) should equal( - List("a", "b", "c", "d") - ) - GimelQueryUtils.getAllTableSources(sqlANSIOnly) should equal( - List("emp_loc", "testdb.emp") - ) - GimelQueryUtils.getAllTableSources(joinANSISQLViaUDC).sorted should equal( - List( - "testdb.emp", - "udc.kafka.test.emp.address", - "udc.kafka.test.emp.loc" - ).sorted - ) - GimelQueryUtils.getAllTableSources(mixANSINonANSISQL).sorted should equal( - List( - "testdb.emp", - "udc.kafka.test.test.emp", - "udc.hive.test_cluster.test.calendar", - "c", - "udc.teradata.test_cluster.testdb.lkp" - ).sorted - ) - } - } - - describe("isSQLNonANSIJoin") { - it( - "should pick tell correctly if a SQL is ANSI only or has NON-ANSI joins as well" - ) { - - SQLNonANSIJoinParser.isSQLNonANSIJoin(sqlANSIanNonANSI) should equal(true) - } - } - - describe("All DDL DML type") { - it("should pick correct table names ") { - GimelQueryUtils.getAllTableSources( - "collect statistics on yelp.tmp_table" - ) should equal(List("yelp.tmp_table")) - GimelQueryUtils.getAllTableSources("DELETE ALL yelp.tmp_table") should equal( - List("yelp.tmp_table") - ) - GimelQueryUtils.getAllTableSources("DELETE yelp.tmp_table ALL") should equal( - List("yelp.tmp_table") - ) - GimelQueryUtils.getAllTableSources("DESCRIBE yelp.tmp_table") should equal( - List("yelp.tmp_table") - ) - GimelQueryUtils.getAllTableSources("HELP table yelp.tmp_table") should equal( - List("yelp.tmp_table") - ) - GimelQueryUtils.getAllTableSources("show view yelp.tmp_table") should equal( - List("yelp.tmp_table") - ) - } - it("should exclude join desc") { - GimelQueryUtils.getAllTableSources("DESC yelp.tmp_table") should equal( - List() - ) - } - it("should pick table names from CACHE table") { - assert( - GimelQueryUtils - .getAllTableSources("""cache table work_day_employees as - |select * from udc.SFTP.Test.default.Files;""".stripMargin) == List( - "work_day_employees", - "udc.sftp.test.default.files" - ) - ) - assert( - GimelQueryUtils - .getAllTableSources( - """cache table workday_dump1 as - |select - |lower(a.ntid) as username - |,a.`Employee QID` as employee_qid - |,a.`Emplyee Last Name` as last_name - |,a.`Employee First Name` as first_name - |,concat(a.`Emplyee Last Name`,',',a.`Employee First Name`) as full_name - |,a.`Org Description` as org_desc - |,a.`Org ID ` as org_id - |,a.`Loaction` as location - |,lower(a.`Manager ID`) as manager_qid - |,lower(b.ntid) as manager_username - |from work_day_employees a - |left join work_day_employees_b b - |on a.`Manager ID` = b.`Employee QID`;""".stripMargin - ) == List( - "workday_dump1", - "work_day_employees", - "work_day_employees_b" - ) - ) - - assert( - GimelQueryUtils - .getAllTableSources( - """set gimel.jdbc.p.strategy=file; - |set gimel.jdbc.p.file=/user/testuser/udc.prod.pass; - |set gimel.jdbc.username=testadmin; - | - |insert into 
udc.MySql.UDC.pcatalog.workday_dump - |select * from workday_dump1 """.stripMargin - ) == List("workday_dump1", "udc.mysql.udc.pcatalog.workday_dump") - ) - } - } - - describe("Check multiple match criteria within same SQL ") { - it("should extract valid table names ") { - logger.info( - GimelQueryUtils.getAllTableSources( - "drop table if exists udc.hive.test.testdb.emp" - ) - ) - - logger.info( - GimelQueryUtils.getAllTableSources( - """cache table td_views_hive_all - |select distinct * from - |( - |select * - | from - | udc.hive.test.default.teradata_db_views_test - |union - |select * - | from - | udc.hive.test.default.teradata_2_db_views_test - |)""".stripMargin - ) - ) - logger.info( - GimelQueryUtils.getAllTableSources( - """cache table td_views_hive_all - |select distinct tlb.* from - |( - |select * - |from - |udc.hive.test.default.teradata_db_views_test - |union - |select * - |from - |udc.hive.test.default.teradata_2_db_views_test - |) tlb""".stripMargin - ) - ) - } - } -} diff --git a/gimel-dataapi/gimel-sql/src/test/scala/com/paypal/gimel/sql/SQLParserSpec.scala b/gimel-dataapi/gimel-sql/src/test/scala/com/paypal/gimel/sql/SQLParserSpec.scala deleted file mode 100644 index 0e1325fc..00000000 --- a/gimel-dataapi/gimel-sql/src/test/scala/com/paypal/gimel/sql/SQLParserSpec.scala +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */
-
-package com.paypal.gimel.sql
-
-import org.scalatest.{FunSpec, Matchers}
-
-class SQLParserSpec extends FunSpec with Matchers {
-
-
-  it("should pick the Target Table Accurately from the SQL without 'table' keyword") {
-
-    com.paypal.gimel.sql.SQLParser.getTargetTables(
-      """
-        |INSERT INTO data_source_tab2 PARTITION (p1 = 'part1', p2)
-        | SELECT id, 'part' || id FROM RANGE(1, 3)
-        |
-      """.stripMargin) shouldBe Some("data_source_tab2")
-
-  }
-
-  it("should pick the Target Table Accurately from the SQL with 'table' keyword") {
-    com.paypal.gimel.sql.SQLParser.getTargetTables(
-      """
-        |INSERT INTO table data_source_tab2 PARTITION (p1 = 'part1', p2)
-        | SELECT id, 'part' || id FROM RANGE(1, 3)
-        |
-      """.stripMargin) shouldBe Some("data_source_tab2")
-
-  }
-
-  it("should pick the Target Table Accurately from the SQL when there is an 'override' keyword") {
-    com.paypal.gimel.sql.SQLParser.getTargetTables(
-      """
-        |INSERT OVERWRITE TABLE data_source_tab2 PARTITION (p1 = 'partNew1', p2)
-        | VALUES (3, 'partNew2')
-      """.stripMargin) shouldBe Some("data_source_tab2")
-
-  }
-
-  it("should pick the Target Table Accurately from the SQL when there is an 'override' keyword 1") {
-    com.paypal.gimel.sql.SQLParser.getTargetTables(
-      """
-        |INSERT OVERWRITE TABLE data_source_tab2 PARTITION (p1 = 'part1', p2)
-        | VALUES (5, 'part1')
-      """.stripMargin) shouldBe Some("data_source_tab2")
-  }
-
-  it("should pick the Target Table Accurately from the SQL when the SQL has a DB.Table format") {
-    com.paypal.gimel.sql.SQLParser.getTargetTables(
-      """insert into pcatalog.elastic_cluster_flights_log_notebook_data
-        |select * from pcatalog.kafka_flights_log
-        |
-      """.stripMargin) shouldBe Some("pcatalog.elastic_cluster_flights_log_notebook_data")
-  }
-}
diff --git a/gimel-dataapi/gimel-sql/src/test/scala/com/paypal/gimel/sql/SQLParserTargetTableSpec.scala b/gimel-dataapi/gimel-sql/src/test/scala/com/paypal/gimel/sql/SQLParserTargetTableSpec.scala
deleted file mode 100644
index d509095d..00000000
--- a/gimel-dataapi/gimel-sql/src/test/scala/com/paypal/gimel/sql/SQLParserTargetTableSpec.scala
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright 2018 PayPal Inc.
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.paypal.gimel.sql
-
-import org.scalatest.{FunSpec, Matchers}
-
-import com.paypal.gimel.sql.SQLMasterList._
-
-class SQLParserTargetTableSpec extends FunSpec with Matchers {
-
-
-  describe("getTargetTables") {
-
-    it("should pick the TARGET Table Accurately from the SQL without 'table' keyword") {
-      SQLParser.getTargetTables(insertSQL1) shouldBe Some("data_source_tab2")
-    }
-
-    it("should pick the TARGET Table Accurately from the SQL with 'table' keyword") {
-      SQLParser.getTargetTables(insertSQL2) shouldBe Some("data_source_tab2")
-    }
-
-    it("should pick the TARGET Table Accurately from the SQL when there is an 'override' keyword") {
-      SQLParser.getTargetTables(insertSQL3) shouldBe Some("data_source_tab2")
-    }
-
-    it("should pick the TARGET Table Accurately from the SQL when there is an 'override' keyword 1") {
-      SQLParser.getTargetTables(insertSQL4) shouldBe Some("data_source_tab2")
-    }
-
-    it("should pick the TARGET Table Accurately from the SQL when the SQL has a DB.Table format") {
-      SQLParser.getTargetTables(insertSQL5) shouldBe Some("pcatalog.elastic_test_cluster_yelp_review_data")
-    }
-
-    it("should pick correct table name from the SELECT QUERY") {
-      GimelQueryUtils.getTablesFrom(simpleSelect1) should equal(Array("udc.mysql.datalake.test.yelp_review_read"))
-    }
-
-    it("should pick proper table name from the only the SELECT Query") {
-      GimelQueryUtils.getTablesFrom(simpleInsertSelect1) should equal(Array("udc.kafka.tau.yelp.review",
-        "udc.mysql.datalake.test.yelp_review_write"))
-    }
-  }
-
-
-  describe("isQueryContainingPartitioning") {
-
-    it("should return true if query contains ; insert into partitions of target table. ") {
-      GimelQueryUtils.isQueryContainingPartitioning(insertPartitionedTable1) shouldBe (true)
-      GimelQueryUtils.isQueryContainingPartitioning(insertSQL1) shouldBe (true)
-      GimelQueryUtils.isQueryContainingPartitioning(insertSQL2) shouldBe (true)
-      GimelQueryUtils.isQueryContainingPartitioning(insertSQL3) shouldBe (true)
-      GimelQueryUtils.isQueryContainingPartitioning(insertSQL4) shouldBe (true)
-    }
-
-    it("should return false if query does not contain partition") {
-      GimelQueryUtils.isQueryContainingPartitioning(insertSQL5) shouldBe (false)
-      GimelQueryUtils.isQueryContainingPartitioning(insertSQL6) shouldBe (false)
-    }
-  }
-}
diff --git a/gimel-dataapi/gimel-sql/src/test/scala/com/paypal/gimel/sql/SharedSparkSession.scala b/gimel-dataapi/gimel-sql/src/test/scala/com/paypal/gimel/sql/SharedSparkSession.scala
deleted file mode 100644
index 9c0d19bd..00000000
--- a/gimel-dataapi/gimel-sql/src/test/scala/com/paypal/gimel/sql/SharedSparkSession.scala
+++ /dev/null
@@ -1,135 +0,0 @@
-/*
- * Copyright 2018 PayPal Inc.
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */ - -package com.paypal.gimel.sql - -import org.apache.spark.SparkConf -import org.apache.spark.sql.{SparkSession, SQLContext} -import org.apache.spark.sql.internal.SQLConf -import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, FunSpec, Suite} -import org.scalatest.concurrent.Eventually - -trait SharedSparkSession - extends FunSpec - with BeforeAndAfterEach - with BeforeAndAfterAll - with Eventually { self: Suite => - - /** - * The [[SparkSession]] to use for all tests in this suite. - * - * By default, the underlying [[org.apache.spark.SparkContext]] will be run in local - * mode with the default test configurations. - */ - @transient private var _spark: SparkSession = null - - /** - * Make sure the [[SparkSession]] is initialized before any tests are run. - */ - protected override def beforeAll(): Unit = { - initializeSession() - - // Ensure we have initialized the context before calling parent code - super.beforeAll() - } - - /** - * This is the SparkSession tio be accessed everywhere within the module for tests - */ - protected implicit def spark: SparkSession = _spark - - /** - * This is the SqlContext tio be accessed everywhere within the module for tests - */ - protected implicit def sqlContext: SQLContext = _spark.sqlContext - - /** - * Generally, this is just called from - * beforeAll; however, in test using styles other than FunSuite, there is - * often code that relies on the session between test group constructs and - * the actual tests, which may need this session. It is purely a semantic - * difference, but semantically, it makes more sense to call - * 'initializeSession' between a 'describe' and an 'it' call than it does to - * call 'beforeAll'. - */ - protected def initializeSession(): Unit = { - if (_spark == null) { - _spark = createSparkSession - } - } - - /** - * - * @return sparkSession - */ - protected def createSparkSession: SparkSession = { - SparkSession - .builder() - .master("local") - .appName("Spark Unit Tests") - .config(sparkConf) - .getOrCreate() - } - - // Here add all the spark confs to be initialized in order to start the sparksession with. - protected def sparkConf = { - new SparkConf() - .set("spark.unsafe.exceptionOnMemoryLeak", "true") - .set(SQLConf.CODEGEN_FALLBACK.key, "false") - } - - /** - * Stop the underlying [[org.apache.spark.SparkContext]], if any. 
- */ - protected override def afterAll(): Unit = { - try { - super.afterAll() - } finally { - try { - if (_spark != null) { - try { - _spark.sessionState.catalog.reset() - } finally { - _spark.stop() - _spark = null - } - } - } finally { - SparkSession.clearActiveSession() - SparkSession.clearDefaultSession() - } - } - } - - /** - * Things to do before each test - */ - protected override def beforeEach(): Unit = { - super.beforeEach() - } - - /** - * Things to do after each test - */ - protected override def afterEach(): Unit = { - super.afterEach() - // Clear all persistent datasets after each test - spark.sharedState.cacheManager.clearCache() - } -} diff --git a/gimel-dataapi/gimel-sql/src/test/scala/resources/hdfs_test.avro b/gimel-dataapi/gimel-sql/src/test/scala/resources/hdfs_test.avro deleted file mode 100644 index 8ffdc972..00000000 Binary files a/gimel-dataapi/gimel-sql/src/test/scala/resources/hdfs_test.avro and /dev/null differ diff --git a/gimel-dataapi/gimel-sql/src/test/scala/resources/hdfs_test.csv b/gimel-dataapi/gimel-sql/src/test/scala/resources/hdfs_test.csv deleted file mode 100644 index 59f3f4dc..00000000 --- a/gimel-dataapi/gimel-sql/src/test/scala/resources/hdfs_test.csv +++ /dev/null @@ -1 +0,0 @@ -a,b,c,d,e \ No newline at end of file diff --git a/gimel-dataapi/gimel-sql/src/test/scala/resources/hdfs_test.json b/gimel-dataapi/gimel-sql/src/test/scala/resources/hdfs_test.json deleted file mode 100644 index 715b02d4..00000000 --- a/gimel-dataapi/gimel-sql/src/test/scala/resources/hdfs_test.json +++ /dev/null @@ -1 +0,0 @@ -{"key":"value"} \ No newline at end of file diff --git a/gimel-dataapi/gimel-sql/src/test/scala/resources/hdfs_test.parquet b/gimel-dataapi/gimel-sql/src/test/scala/resources/hdfs_test.parquet deleted file mode 100644 index 2ae23dac..00000000 Binary files a/gimel-dataapi/gimel-sql/src/test/scala/resources/hdfs_test.parquet and /dev/null differ diff --git a/gimel-dataapi/gimel-sql/src/test/scala/resources/hdfs_test.seq b/gimel-dataapi/gimel-sql/src/test/scala/resources/hdfs_test.seq deleted file mode 100755 index 78822754..00000000 Binary files a/gimel-dataapi/gimel-sql/src/test/scala/resources/hdfs_test.seq and /dev/null differ diff --git a/gimel-dataapi/gimel-sql/src/test/scala/resources/hdfs_test.txt b/gimel-dataapi/gimel-sql/src/test/scala/resources/hdfs_test.txt deleted file mode 100644 index 808976a7..00000000 --- a/gimel-dataapi/gimel-sql/src/test/scala/resources/hdfs_test.txt +++ /dev/null @@ -1 +0,0 @@ -This is a test file for hdfs read api \ No newline at end of file diff --git a/gimel-dataapi/gimel-sql/src/test/scala/resources/hdfs_test.txt.gz b/gimel-dataapi/gimel-sql/src/test/scala/resources/hdfs_test.txt.gz deleted file mode 100644 index 3f41db74..00000000 Binary files a/gimel-dataapi/gimel-sql/src/test/scala/resources/hdfs_test.txt.gz and /dev/null differ diff --git a/gimel-dataapi/gimel-tools/pom.xml b/gimel-dataapi/gimel-tools/pom.xml deleted file mode 100644 index bf1b097b..00000000 --- a/gimel-dataapi/gimel-tools/pom.xml +++ /dev/null @@ -1,114 +0,0 @@ - - - - - - - gimel-dataapi - com.paypal.gimel - 2.0.0-SNAPSHOT - ../pom.xml - - 4.0.0 - - gimel-tools - 2.0.0-SNAPSHOT - - - - com.paypal.gimel - gimel-sql - ${gimel.version}-SNAPSHOT - - - org.scala-lang - * - - - org.apache.kafka - kafka-clients - - - - - org.apache.kafka - kafka-clients - ${kafka.version} - provided - - - - - src/main/scala - - - - org.apache.maven.plugins - maven-shade-plugin - 3.0.0 - - - - com.google.common - gimel-shaded.com.google.common - - - 
com.sun.jersey - gimel-shaded.com.sun.jersey - - - - - *:* - - META-INF/*.SF - META-INF/*.DSA - META-INF/*.RSA - - - - - - - - META-INF/services/org.apache.spark.sql.sources.DataSourceRegister - - - - - - - gimel-shading - package - - shade - - - - - - - - diff --git a/gimel-dataapi/gimel-tools/src/main/scala/com/paypal/gimel/tools/BenchMarkKafkaDataSetAPI.scala b/gimel-dataapi/gimel-tools/src/main/scala/com/paypal/gimel/tools/BenchMarkKafkaDataSetAPI.scala deleted file mode 100644 index fca1432b..00000000 --- a/gimel-dataapi/gimel-tools/src/main/scala/com/paypal/gimel/tools/BenchMarkKafkaDataSetAPI.scala +++ /dev/null @@ -1,158 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.paypal.gimel.tools - -import org.apache.spark.{SparkConf, SparkContext} -import org.apache.spark.sql._ - -import com.paypal.gimel.DataSet -import com.paypal.gimel.common.conf.GimelConstants -import com.paypal.gimel.common.storageadmin.HDFSAdminClient -import com.paypal.gimel.common.utilities.Timer -import com.paypal.gimel.kafka.conf.KafkaConfigs -import com.paypal.gimel.logger.Logger -import com.paypal.gimel.tools.conf.BenchmarkKafkaConstants - -@deprecated -object BenchMarkKafkaDataSetAPI extends App { - - // Logger Initiation - val logger = Logger(this.getClass.getName) - - val sparkSession = SparkSession - .builder() - .appName("SparkSessionZipsExample") - .enableHiveSupport() - .getOrCreate() - val sc = sparkSession.sparkContext - val sqlContext = sparkSession.sqlContext - - import BenchMarkHelperUtils._ - - val paramsMapBuilder = resolveRunTimeParameters(args) - lazy val appName = sparkSession.conf.get(GimelConstants.SPARK_APP_NAME, "NA") + "_" + sc.getConf.getAppId - lazy val path1 = "/tmp/" + sc.sparkUser + "_" + appName + "_" + ".Data_API.DataSet.benchmark.log" - val path = paramsMapBuilder.getOrElse("targetFile", path1) - val fetchRowsOnFirstRun = paramsMapBuilder(BenchmarkKafkaConstants.fetchRowsKey) - val maxRecordsPerPartition = paramsMapBuilder(BenchmarkKafkaConstants.maxRecordsPerPartitionKey) - val minRowsPerParallel = paramsMapBuilder(BenchmarkKafkaConstants.minRowsPerPartitionKey) - val datasetName = paramsMapBuilder("dataset") - - /** - * START DATASET API STATS CAPTURE - */ - val dataset = DataSet(sparkSession) - val props = s"""${KafkaConfigs.minRowsPerParallelKey}=$minRowsPerParallel:${KafkaConfigs.rowCountOnFirstRunKey}=$fetchRowsOnFirstRun:${KafkaConfigs.maxRecordsPerPartition}=$maxRecordsPerPartition""" - val dataDF = dataset.read(datasetName, props) - - // val timer = Timer() - // timer.start; - val timer = Timer() - timer.start - val myCount = dataDF.count() - val totalMS = timer.endWithMillSecRunTime - - val executorMemoryStatus = sc.getExecutorMemoryStatus.mkString("\n") - val 
totalExecutors = sc.getExecutorMemoryStatus.size - val executorStorageStatus = sc.getExecutorStorageStatus.map(x => "blockManagerId:" + x.blockManagerId + "|maxMem:" + x.maxMem + "|memUsed:" + x.memUsed + "|memRemaining:" + x.memRemaining).mkString("\n") - - val allConfs = sc.getConf.getAll.mkString("\n") - - /** - * COMPOSE STATS - */ - - val toWrite = - s""" - |DataAPI:BenchMark Count:$myCount - |DataAPI:totalExecutors:$totalExecutors - |DataAPI:TotalMS:$totalMS - """.stripMargin - - /** - * Write Stats - */ - - logger.info(s"Writing to Path --> $path") - HDFSAdminClient.writeHDFSFile(path, toWrite) - - sc.stop() - -} - -@deprecated -class CDHTimer(funcName: String) { - val logger = Logger() - - def timed[T](f: => T): T = { - val startTime = System.currentTimeMillis() - try f finally println(s"Function completed in: ${System.currentTimeMillis() - startTime} ms") - } - - var startTime: Long = -1L - - def start(): Unit = { - startTime = System.currentTimeMillis() - } - - def end(): Long = { - val endTime: Long = System.currentTimeMillis() - val elapsedTime = endTime - startTime - logger.info("TOTAL TIME " + funcName + " elapse time = " + elapsedTime) - elapsedTime - } -} - -@deprecated -object BenchMarkHelperUtils { - - val logger = Logger() - - /** - * Resolves RunTime Params - * - * @param allParams args - * @return Map[String, String] - */ - def resolveRunTimeParameters(allParams: Array[String]): Map[String, String] = { - - var paramsMapBuilder: Map[String, String] = Map() - logger.info(s"All Params From User --> ${allParams.mkString("\n")}") - val usage = - """ - |dataset=pcatalog.kafka_flights_log fetchRowsOnFirstRun=1000000 maxRecordsPerPartition=1000000 targetFile=/tmp/stats/log" - """.stripMargin - if (allParams.length == 0) { - println(usage) - throw new Exception("Args Cannot be Empty") - } - for (jobParams <- allParams) { - for (eachParam <- jobParams.split(" ")) { - paramsMapBuilder += (eachParam.split("=")(0) -> eachParam.split("=", 2)(1)) - } - } - if (!paramsMapBuilder.contains("dataset")) paramsMapBuilder += ("dataset" -> "pcatalog.kafka_flights_log") - if (!paramsMapBuilder.contains(BenchmarkKafkaConstants.fetchRowsKey)) paramsMapBuilder += (BenchmarkKafkaConstants.fetchRowsKey -> "1000000") - if (!paramsMapBuilder.contains(BenchmarkKafkaConstants.maxRecordsPerPartitionKey)) paramsMapBuilder += (BenchmarkKafkaConstants.maxRecordsPerPartitionKey -> "1000000") - if (!paramsMapBuilder.contains(BenchmarkKafkaConstants.minRowsPerPartitionKey)) paramsMapBuilder += (BenchmarkKafkaConstants.minRowsPerPartitionKey -> "100000") - logger.info(s"Resolved Params From Code --> $paramsMapBuilder") - paramsMapBuilder - } -} diff --git a/gimel-dataapi/gimel-tools/src/main/scala/com/paypal/gimel/tools/ExecSQLWrapper.scala b/gimel-dataapi/gimel-tools/src/main/scala/com/paypal/gimel/tools/ExecSQLWrapper.scala deleted file mode 100644 index 0ffb915d..00000000 --- a/gimel-dataapi/gimel-tools/src/main/scala/com/paypal/gimel/tools/ExecSQLWrapper.scala +++ /dev/null @@ -1,289 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.paypal.gimel.tools - -import java.util.Calendar - -import scala.collection.immutable.Map -import scala.language.implicitConversions - -import org.apache.spark.{SparkConf, SparkContext} -import org.apache.spark.sql.SparkSession - -import com.paypal.gimel.common.conf.GimelConstants -import com.paypal.gimel.common.storageadmin.HDFSAdminClient -import com.paypal.gimel.common.utilities.Timer -import com.paypal.gimel.hbase.conf.HbaseConfigs -import com.paypal.gimel.kafka.conf.{KafkaConfigs, KafkaConstants} -import com.paypal.gimel.logger.Logger -import com.paypal.gimel.sql.GimelQueryProcessor -import com.paypal.gimel.tools.conf.CopyDatasetConstants - -object CopyDataSet extends App { - - import CopyHelperUtils._ - import com.paypal.gimel.kafka.utilities.KafkaUtilities._ - - val logger = Logger(this.getClass.getName) - val user = sys.env("USER") - val sparkConf = new SparkConf() - val sparkSession = SparkSession - .builder() - .enableHiveSupport() - .getOrCreate() - val props = resolveRunTimeParameters(args) ++ Map(GimelConstants.SPARK_APP_ID -> sparkSession.conf.get(GimelConstants.SPARK_APP_ID), - GimelConstants.SPARK_APP_NAME -> sparkSession.conf.get(GimelConstants.SPARK_APP_NAME)) - props.foreach(prop => sparkSession.conf.set(prop._1, prop._2)) - logger.setSparkVersion(sparkSession.version) - val resolvedProps = getOptions(sparkSession) - val queryToExecute = getQuery(props) - val sparkAppName = sparkSession.conf.get("spark.app.name") - val yarnCluster = com.paypal.gimel.common.utilities.DataSetUtils.getYarnClusterName() - val runMode = props("mode") match { - case "stream" => KafkaConstants.gimelAuditRunTypeStream - case "batch" => KafkaConstants.gimelAuditRunTypeBatch - case "intelligent" => KafkaConstants.gimelAuditRunTypeIntelligent - case _ => GimelConstants.UNKNOWN_STRING.toLowerCase - } - - val hiveStagingDir = props.getOrElse("hiveStagingDir", "") - try { - props("mode").toLowerCase() match { - case CopyDatasetConstants.COPY_DATASET_STREAM_MODE => GimelQueryProcessor.executeStream(queryToExecute, sparkSession) - case CopyDatasetConstants.COPY_DATASET_BATCH_MODE => GimelQueryProcessor.executeBatch(queryToExecute, sparkSession) - case CopyDatasetConstants.COPY_DATASET_BATCH_INTERACTIVE_MODE => - val isBatchInfinite = props.getOrElse("isBatchRecursionInfinite", "false").toBoolean - val batchRecursionRequested = props.getOrElse("batchRecursionRequested", "100").toInt - val batchRecursinMins = props.getOrElse("batchRecursionMinutes", 30).toString.toInt - logger.info( - s""" - |-------------------------------------------------------------------- - || isBatchRecursionInfinite | ${isBatchInfinite} - || batchRecursionRequested | ${batchRecursionRequested} - || batchRecursinMins | ${batchRecursinMins} - |-------------------------------------------------------------------- - """.stripMargin) - val batchRecursionMilliSec: Double = batchRecursinMins * 60 * 1000D - var currentIteration = 1 - while (isBatchInfinite || (currentIteration <= batchRecursionRequested)) { - val startTime = Calendar.getInstance().getTime - logger.info( - s""" - 
|-------------------------------------------------------------------- - || Mode | ${props("mode")} - || Iteration | ${currentIteration} - || Start Time | ${Calendar.getInstance().getTime} - |-------------------------------------------------------------------- - """.stripMargin) - val timer = Timer() - timer.start - GimelQueryProcessor.executeBatch(queryToExecute, sparkSession) - val totalTimeMilliSec: Double = timer.endWithMillSecRunTime - val endTime = Calendar.getInstance().getTime - val sleepMilliSec = scala.math.max(0, batchRecursionMilliSec - totalTimeMilliSec) - logger.info( - s""" - |-------------------------------------------------------------------- - || (*) | Iteration | ${currentIteration} - || (*) | Start Time Execution | ${startTime} - || (*) | Start End Execution | ${endTime} - || (Y) | Time Taken for Execution (ms) | ${totalTimeMilliSec} - || (X) | Batch Iteration Request (ms) | ${batchRecursionMilliSec} - || (X-Y) | Time Remaining for Sleep (ms) | ${sleepMilliSec} - |-------------------------------------------------------------------- - """.stripMargin) - if (currentIteration == batchRecursionRequested) logger.info("All Iterations Completed !") - if (sleepMilliSec > 0 && currentIteration < batchRecursionRequested) { - logger.info(s"Going to Sleep at --> ${Calendar.getInstance().getTime}") - Thread.sleep(sleepMilliSec.toLong) - logger.info(s"Woke Up at --> ${Calendar.getInstance().getTime}") - } - currentIteration += 1 - } - case CopyDatasetConstants.COPY_DATASET_INTELLIGENT_MODE => - logger.info(s"Mode --> auto") - var batchRunCount = 0 - while (!isStreamable(sparkSession, props)) { - logger.info(s"====== BATCH Mode < Iteration --> ${batchRunCount} > ======") - val timer = Timer() - timer.start - GimelQueryProcessor.executeBatch(queryToExecute, sparkSession) - if (hiveStagingDir != "") sparkSession.sql(s"dfs -rm -r -f ${hiveStagingDir}") - timer.endWithMillSecRunTime - logger.info(s"====== BATCH Mode < Iteration --> ${batchRunCount} > Total Time Seconds --> ${timer.endWithMillSecRunTime / 1000} ====== ") - batchRunCount = batchRunCount + 1 - } - logger.info("====== STREAM Mode ======") - GimelQueryProcessor.executeStream(queryToExecute, sparkSession) - case _ => throw new Exception("Invalid Mode of Execution Must be one of these ") - } - - // push logs to KAFKA - logger.logMethodAccess(sparkSession.sparkContext.getConf.getAppId - , sparkSession.conf.get("spark.app.name") - , this.getClass.getName - , runMode - , yarnCluster - , user - , s"${yarnCluster}/${user}/${sparkAppName}".replaceAllLiterally("/", "_").replaceAllLiterally(" ", "-") - , "copyDataSet" - , s"${queryToExecute}" - , scala.collection.mutable.Map("sql" -> queryToExecute) - , GimelConstants.SUCCESS - , GimelConstants.EMPTY_STRING - , GimelConstants.EMPTY_STRING - ) - } - catch { - case e: Throwable => { - e.printStackTrace() - - // push logs to KAFKA - logger.logMethodAccess(sparkSession.sparkContext.getConf.getAppId - , sparkSession.conf.get("spark.app.name") - , this.getClass.getName - , runMode - , yarnCluster - , user - , s"${yarnCluster}/${user}/${sparkAppName}".replaceAllLiterally("/", "_").replaceAllLiterally(" ", "-") - , "copyDataSet" - , s"${queryToExecute}" - , scala.collection.mutable.Map("sql" -> queryToExecute) - , GimelConstants.FAILURE - , e.toString + "\n" + e.getStackTraceString - , GimelConstants.UNKNOWN_STRING - ) - - // throw error to console - logger.throwError(e.toString) - - throw e - } - } - -} - -object CopyHelperUtils { - - val logger = Logger(this.getClass.getName) - - /** - * 
Resolves RunTime Params - * - * @param allParams args - * @return Map[String, String] - */ - def resolveRunTimeParameters(allParams: Array[String]): Map[String, String] = { - def MethodName: String = new Exception().getStackTrace().apply(1).getMethodName() - - logger.info(" @Begin --> " + MethodName) - - var paramsMapBuilder: Map[String, String] = Map() - for (jobParams <- allParams) { - for (eachParam <- jobParams.split(" ")) { - paramsMapBuilder += (eachParam.split("=")(0) -> eachParam.split("=", 2)(1)) - } - } - logger.info(s"All Params From User --> ${paramsMapBuilder.mkString("\n")}") - - val usage = - """ - |For Details : https://github.com/Paypal/gimel/blob/oss/docs/gimel-tools/ExecSQLWrapper.md - """.stripMargin - - if (allParams.length == 0) { - logger.error(usage) - throw new Exception(s"Args Cannot be Empty. Usage --> \n${usage}") - } - - if (!paramsMapBuilder.contains("mode")) throw new Exception(s"mode must be supplied as either < batch|stream > Usage --> \n${usage}") - if (!paramsMapBuilder.contains("querySourceFile")) throw new Exception(s"querySourceFile must be supplied ! Usage --> \n${usage}") - - logger.info(s"Resolved Params From Code --> ${paramsMapBuilder}") - paramsMapBuilder - } - - /** - * getOptions - read the hive context options that was set by the user else add the default values - * - * @param sparkSession SparkSession - * @return - Tuple ( String with concatenated options read from the hivecontext , Same Props as a Map[String,String] ) - */ - - def getOptions(sparkSession: SparkSession): (String, Map[String, String]) = { - def MethodName: String = new Exception().getStackTrace().apply(1).getMethodName() - - logger.info(" @Begin --> " + MethodName) - - val hiveConf: Map[String, String] = sparkSession.conf.getAll - val optionsToCheck: Map[String, String] = Map( - KafkaConfigs.rowCountOnFirstRunKey -> "250" - , KafkaConfigs.batchFetchSize -> "250" - , KafkaConfigs.maxRecordsPerPartition -> "25000000" - , GimelConstants.LOG_LEVEL -> "ERROR" - , KafkaConfigs.kafkaConsumerReadCheckpointKey -> "true" - , KafkaConfigs.kafkaConsumerClearCheckpointKey -> "false" - , KafkaConfigs.maxRatePerPartitionKey -> "3600" - , KafkaConfigs.streamParallelKey -> "10" - , KafkaConfigs.defaultBatchInterval -> "30" - , KafkaConfigs.isStreamParallelKey -> "true" - , KafkaConfigs.isBackPressureEnabledKey -> "true" - , HbaseConfigs.hbaseOperation -> "scan" - , HbaseConfigs.hbaseFilter -> "" - , GimelConstants.DATA_CACHE_IS_ENABLED -> "false" - , GimelConstants.DATA_CACHE_IS_ENABLED_FOR_ALL -> "true" - ) - val resolvedOptions: Map[String, String] = optionsToCheck.map { kvPair => - (kvPair._1, hiveConf.getOrElse(kvPair._1, kvPair._2)) - } - resolvedOptions.foreach(conf => sparkSession.conf.set(conf._1, conf._2)) - (resolvedOptions.map(x => x._1 + "=" + x._2).mkString(":"), hiveConf ++ resolvedOptions) - } - - def getQuery(props: Map[String, String]): String = { - - val sql: String = { - logger.info(s"User Requested Execution of SQL from External File.") - val querySourceFile = props("querySourceFile") - val unresolvedQuery = HDFSAdminClient.readHDFSFile(querySourceFile) - logger.info(s"SQL From External File --> \n${unresolvedQuery}") - val replacementProps = props.filter(x => x._1.toUpperCase.startsWith("GIMEL.SQL.PARAM")) - logger.info( - s""" - |Following Props will be resolved in External File's SQL String --> - |${replacementProps.mkString("\n", "\n", "")} - """.stripMargin) - replacementProps.foldLeft(unresolvedQuery)((s, prop) => s.replaceAll(prop._1.toUpperCase, prop._2)) - } - 
logger.info(s"Resolved Query to Execute --> ${sql}") - sql - } - - /** - * getYarnClusterName - gets the yarn cluster from the hadoop config file - * - * @return - */ - def getYarnClusterName(): String = { - val hadoopConfiguration = new org.apache.hadoop.conf.Configuration() - val cluster = hadoopConfiguration.get(GimelConstants.FS_DEFAULT_NAME) - cluster.split("/").last - } -} diff --git a/gimel-dataapi/gimel-tools/src/main/scala/com/paypal/gimel/tools/conf/BenchmarkKafkaConstants.scala b/gimel-dataapi/gimel-tools/src/main/scala/com/paypal/gimel/tools/conf/BenchmarkKafkaConstants.scala deleted file mode 100644 index 91b4e1f3..00000000 --- a/gimel-dataapi/gimel-tools/src/main/scala/com/paypal/gimel/tools/conf/BenchmarkKafkaConstants.scala +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.paypal.gimel.tools.conf - -object BenchmarkKafkaConstants { - val minRowsPerPartitionKey: String = "minRowsPerParallel" - val maxRecordsPerPartitionKey: String = "maxRecordsPerPartition" - val fetchRowsKey: String = "fetchRowsOnFirstRun" -} diff --git a/gimel-dataapi/gimel-tools/src/main/scala/com/paypal/gimel/tools/conf/CopyDatasetConstants.scala b/gimel-dataapi/gimel-tools/src/main/scala/com/paypal/gimel/tools/conf/CopyDatasetConstants.scala deleted file mode 100644 index ed767d8a..00000000 --- a/gimel-dataapi/gimel-tools/src/main/scala/com/paypal/gimel/tools/conf/CopyDatasetConstants.scala +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.paypal.gimel.tools.conf - -// Copy Dataset Constants -object CopyDatasetConstants { - val COPY_DATASET_STREAM_MODE = "stream" - val COPY_DATASET_BATCH_MODE = "batch" - val COPY_DATASET_BATCH_INTERACTIVE_MODE = "batch_iterative" - val COPY_DATASET_INTELLIGENT_MODE = "intelligent" -} diff --git a/gimel-dataapi/gimel-tools/src/main/scala/com/paypal/gimel/tools/conf/SinkMetricsReconcilerConstants.scala b/gimel-dataapi/gimel-tools/src/main/scala/com/paypal/gimel/tools/conf/SinkMetricsReconcilerConstants.scala deleted file mode 100644 index ae64a281..00000000 --- a/gimel-dataapi/gimel-tools/src/main/scala/com/paypal/gimel/tools/conf/SinkMetricsReconcilerConstants.scala +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.paypal.gimel.tools.conf - - -object SinkMetricsReconcilerConstants { - - val statsTargetRecordsUpdatedFlag: String = "target_records_updated_flag" - val statsMetadataID: String = "metadata._id" - val statsTargetDataSetName: String = "target_data_set_name" - val statsGimelloadID: String = "gimel_load_id" - val statsTargetRecordsCount: String = "target_records_count" - val statsGimelMetricsLaggingBatchValidationLogType: String = "GimelMetricsLaggingRecords9" - val statsGimelMetricsBatchValidationLogType: String = "GimelMetricsBatchValidation9" - -} diff --git a/gimel-dataapi/pom.xml b/gimel-dataapi/pom.xml index c80bac24..0205e8c1 100644 --- a/gimel-dataapi/pom.xml +++ b/gimel-dataapi/pom.xml @@ -24,7 +24,7 @@ under the License. gimel com.paypal.gimel - 2.0.0-SNAPSHOT + 2.4.7-SNAPSHOT ../pom.xml 4.0.0 @@ -34,31 +34,25 @@ under the License. Gimel Data API 2017 - 2.0.0-SNAPSHOT + 2.4.7-SNAPSHOT gimel-logger gimel-common gimel-connectors/gimel-sftp - gimel-connectors/gimel-elasticsearch-6.2 - gimel-connectors/gimel-jdbc - gimel-connectors/gimel-hive-1.2 - gimel-connectors/gimel-hbase-1.2 - gimel-connectors/gimel-cassandra-2.0 - gimel-connectors/gimel-aerospike-3.14 - gimel-connectors/gimel-kafka-0.10 - gimel-connectors/gimel-kafka-2.2 - gimel-connectors/gimel-druid-0.82 + gimel-connectors/gimel-elasticsearch + gimel-connectors/gimel-hive gimel-connectors/gimel-restapi gimel-connectors/gimel-s3 + gimel-connectors/gimel-kafka + gimel-connectors/gimel-jdbc + gimel-connectors/gimel-cassandra + gimel-connectors/gimel-aerospike gimel-core - gimel-sql - gimel-tools - gimel-examples 3.1.5 - 2.0.6 + 2.4.3 3.3.0 6.2.1 16.0 @@ -68,20 +62,20 @@ under the License. 2.6.7 1.8 1.19.4 - 1.3.2 - 2.2.1 - 0-10 + + 2.1.1 + 1.4 2.3 0.4.3-SNAPSHOT - 0.8.2 + 0.8.3 2.13.0 0.0.7 1.3.3 15.10.00.22 3.4.13 2.13.0 - 0-10_2.11 + 0-10_2.12 1.1.3 3.9.9.Final @@ -215,7 +209,7 @@ under the License. 
org.apache.maven.plugins maven-shade-plugin - 3.0.0 + ${maven.shade.plugin.version} diff --git a/gimel-logging/gimel-logging_2.2/pom.xml b/gimel-logging/gimel-logging_2.2/pom.xml deleted file mode 100644 index eef63c33..00000000 --- a/gimel-logging/gimel-logging_2.2/pom.xml +++ /dev/null @@ -1,62 +0,0 @@ - - - - - - gimel-logging - com.paypal.gimel - 0.4.3-SNAPSHOT - ../pom.xml - - 4.0.0 - - gimel-logging_2.2 - - 2.2.0 - 2.2 - 2.11.8 - 2.11 - - - - org.apache.spark - spark-core_${scala.binary.version} - ${spark.version} - provided - - - org.apache.spark - spark-sql_${scala.binary.version} - ${spark.version} - provided - - - org.apache.spark - spark-hive_${scala.binary.version} - ${spark.version} - provided - - - org.apache.spark - spark-streaming_${scala.binary.version} - ${spark.version} - provided - - - - diff --git a/gimel-logging/gimel-logging_2.2/src/main/scala/com/paypal/gimel/logging/GimelSparkListener.scala b/gimel-logging/gimel-logging_2.2/src/main/scala/com/paypal/gimel/logging/GimelSparkListener.scala deleted file mode 100644 index 04b4979b..00000000 --- a/gimel-logging/gimel-logging_2.2/src/main/scala/com/paypal/gimel/logging/GimelSparkListener.scala +++ /dev/null @@ -1,576 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- * - */ - -package com.paypal.gimel.logging - -import java.util.{Date, Map => JMap} - -import scala.collection.JavaConverters._ -import scala.util.{Failure, Success, Try} - -import org.json4s.DefaultFormats -import org.apache.spark._ -import org.apache.spark.scheduler._ -import org.json4s.DefaultFormats - -import com.paypal.gimel.logging.Logging - -/** - * A {{SparkListener}} that captures and logs all metrics - * - * @param conf - */ -class GimelSparkListener(conf: SparkConf) extends SparkListener with Logging { - - def accumulateCrossJobs: Boolean = false - - private implicit def formats = DefaultFormats - - val DEFAULT_GROUP_ID: String = "DEFAULT_GROUP_ID" - - /** - * Cluster and HostName - */ - - private val hadoopConfiguration = new org.apache.hadoop.conf.Configuration() - val clusterUrl = hadoopConfiguration.get("fs.default.name") - val clusterName = new java.net.URI(clusterUrl).getHost() - val hostName = clusterName - - /** - * Application Level Metrics - */ - - var applicationId: String = "Unknown" - var appAttemptId: String = "" - var appName: String = "Unknown" - var driverLogs: String = "Unknown" - var appStartTime: Long = 0L - var appEndTime: Long = 0L - var appElapsedTimeInSecs: Float = 0.0f - var sparkLogLevel: String = "application" - var sparkWebUI: String = "" - var sparkEnvProperties: String = "" - var sparkUser: String = "" - var sparkVersion: String = "" - var sparkMaster: String = "" - var sparkDriverMemory: Long = 0L - var sparkExecutorMemory: Long = 0L - var sparkExecutorCores: Long = 0L - var sparkExecutorInstances: Long = 0L - - /** - * Application or Job Level Metrics - */ - var numberOfExecutors: Long = 0L - var startNumberOfExecutors: Long = 0L - var minNumberOfExecutors: Long = 0L - var maxNumberOfExecutors: Long = 0L - var endNumberOfExecutors: Long = 0L - - /** - * Job Level Metrics - */ - var sparkJobId: Long = 0L - var jobCompleted: Boolean = false - var jobStartTime: Long = 0L - var jobEndTime: Long = 0L - var jobElapsedTimeInSecs: Float = 0.0f - var jobSuccess: Long = 0L - var jobFailure: Long = 0L - var sparkJobResult: String = "" - var jobSuccessStatus: String = "" - var jobErrorMessage: String = "" - var jobErrorValue: String = "" - var jobErrorTrace: String = "" - - /** - * Application Level Metrics - */ - var appMetricExecutorRunTime = 0L - var appMetricJvmGCTime = 0L - var appMetricExecutorDeserializeTime = 0L - var appInputRecordsRead = 0L - var appInputBytesRead = 0L - var appOutputBytesWritten = 0L - var appOutputRecordsWritten = 0L - var appShuffleRecordsRead = 0L - var appShuffleRemoteBytesRead = 0L - var appShuffleRecordsWritten = 0L - var appShuffleBytesWritten = 0L - var appShuffleWriteTime = 0L - - /** - * Job Level Metrics - */ - var metricExecutorRunTime: Long = 0L - var metricJvmGCTime: Long = 0L - var metricExecutorDeserializeTime: Long = 0L - var metricResultSize: Long = 0L - var metricResultSerializationTime: Long = 0L - var metricMemoryBytesSpilled: Long = 0L - var metricDiskBytesSpilled: Long = 0L - var metricPeakExecutionMemory: Long = 0L - var inputRecordsRead: Long = 0L - var inputBytesRead: Long = 0L - var outputBytesWritten: Long = 0L - var outputRecordsWritten: Long = 0L - var shuffleRecordsRead: Long = 0L - var shuffleRemoteBytesRead: Long = 0L - var shuffleRecordsWritten: Long = 0L - var shuffleRemoteBlocksFetched: Long = 0L - var shuffleLocalBlocksFetched: Long = 0L - var shuffleFetchWaitTime: Long = 0L - var shuffleLocalBytesRead: Long = 0L - var shuffleBytesWritten: Long = 0L - var shuffleWriteTime: Long = 0L - - /** - * 
Generate Timestamp in YYYYMMDDHHMISS format - */ - val dateTimeFormat = new java.text.SimpleDateFormat("yyyyMMddhhmmss") - - def timeStamp: Long = { - dateTimeFormat.format(new Date()).toLong - } - - /** - * Generate Date in YYYYMMDD format - */ - val dateFormat = new java.text.SimpleDateFormat("yyyyMMdd") - - def date: Long = { - dateFormat.format(new Date()).toLong - } - - /** - * Convert to bytes - */ - def sizeStrToBytes(str: String): Long = { - val lower = str.toLowerCase - if (lower.endsWith("k")) { - lower.substring(0, lower.length - 1).toLong * 1024 - } else if (lower.endsWith("m")) { - lower.substring(0, lower.length - 1).toLong * 1024 * 1024 - } else if (lower.endsWith("g")) { - lower.substring(0, lower.length - 1).toLong * 1024 * 1024 * 1024 - } else if (lower.endsWith("t")) { - lower.substring(0, lower.length - 1).toLong * 1024 * 1024 * 1024 * 1024 - } else { - // no suffix, so it's just a number in bytes - lower.toLong - } - } - - /** - * Metrics that do not change in application should be set here. E.g. {{username}} - */ - def initMetrics: Unit = { - sparkUser = Try { - conf.get("spark.app.user") - } match { - case Success(prop) => prop.toString - case Failure(_) => "Unknown" - } - sparkVersion = Try { - conf.get("spark.version") - } match { - case Success(prop) => prop.toString - case Failure(strVal) => "Unknown" - } - sparkMaster = Try { - conf.get("spark.master") - } match { - case Success(prop) => prop.toString - case Failure(_) => "Unknown" - } - sparkDriverMemory = Try { - conf.get("spark.driver.memory") - } match { - case Success(prop) => sizeStrToBytes(prop.toString) - case Failure(_) => 0L - } - sparkExecutorMemory = Try { - conf.get("spark.executor.memory") - } match { - case Success(prop) => sizeStrToBytes(prop.toString) - case Failure(_) => 0L - } - sparkExecutorCores = Try { - conf.get("spark.executor.cores") - } match { - case Success(prop) => prop.toLong - case Failure(_) => 0L - } - sparkExecutorInstances = Try { - conf.get("spark.executor.instances") - } match { - case Success(prop) => prop.toLong - case Failure(_) => 0L - } - sparkEnvProperties = Try { - conf.get("spark.env.properties") - } match { - case Success(prop) => prop.toString - case Failure(_) => "Unknown" - } - sparkWebUI = Try { - conf.get("spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES") - } match { - case Success(prop) => prop.toString - case Failure(_) => "Unknown" - } - sparkLogLevel = Try { - conf.get("spark.gimel.log.level") - } match { - case Success(prop) => prop.toString.toLowerCase - case Failure(_) => "application" - } - if ((sparkLogLevel != "application") && (sparkLogLevel != "job")) { - println("Invalid sparkLogLevel (" + sparkLogLevel + "). Valid options: application or job. So, setting sparkLogLevel to application.") - sparkLogLevel = "application" - } - } - - /** - * Accumulate Job Level Metrics after each job completion to compute Application Level Metrics. 
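As a quick illustration of the size-string conversion used above for values such as spark.driver.memory and spark.executor.memory, here is a standalone copy with example inputs (the inputs are illustrative only):

// Standalone copy of the suffix-based size conversion, for illustration.
object SizeConversionSketch extends App {
  def sizeStrToBytes(str: String): Long = {
    val lower = str.toLowerCase
    if (lower.endsWith("k"))      lower.dropRight(1).toLong * 1024L
    else if (lower.endsWith("m")) lower.dropRight(1).toLong * 1024L * 1024L
    else if (lower.endsWith("g")) lower.dropRight(1).toLong * 1024L * 1024L * 1024L
    else if (lower.endsWith("t")) lower.dropRight(1).toLong * 1024L * 1024L * 1024L * 1024L
    else lower.toLong // no suffix: already a byte count
  }
  println(sizeStrToBytes("512m")) // 536870912
  println(sizeStrToBytes("4g"))   // 4294967296
}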
- */ - def accumMetrics: Unit = { - appMetricExecutorRunTime += metricExecutorRunTime - appMetricJvmGCTime += metricJvmGCTime - appMetricExecutorDeserializeTime += metricExecutorDeserializeTime - appInputRecordsRead += inputRecordsRead - appInputBytesRead += inputBytesRead - appOutputBytesWritten += outputBytesWritten - appOutputRecordsWritten += outputRecordsWritten - appShuffleRecordsRead += shuffleRecordsRead - appShuffleRemoteBytesRead += shuffleRemoteBytesRead - appShuffleRecordsWritten += shuffleRecordsWritten - appShuffleBytesWritten += shuffleBytesWritten - appShuffleWriteTime += shuffleWriteTime - } - - /** - * Reset Job Level Metrics after each job completion. - */ - def resetMetrics: Unit = { - sparkJobId = 0L - sparkJobResult = "" - jobSuccess = 0L - jobFailure = 0L - jobStartTime = 0L - jobEndTime = 0L - jobElapsedTimeInSecs = 0.0f - jobSuccessStatus = "" - jobErrorMessage = "" - jobErrorValue = "" - jobErrorTrace = "" - - startNumberOfExecutors = 0L - endNumberOfExecutors = 0L - metricExecutorRunTime = 0L - metricJvmGCTime = 0L - metricExecutorDeserializeTime = 0L - inputRecordsRead = 0L - inputBytesRead = 0L - outputBytesWritten = 0L - outputRecordsWritten = 0L - shuffleRecordsRead = 0L - shuffleRemoteBytesRead = 0L - shuffleRecordsWritten = 0L - shuffleBytesWritten = 0L - shuffleWriteTime = 0L - } - - /** - * Log all the metrics both in JSON format and into Kafka - * - */ - def logAppMetrics: Unit = { - kafkaLogger.info(this.appKafkaArgs) - } - - def appKafkaArgs: JMap[String, Any] = { - Map( - - // App Level Config Details - "logtime" -> java.lang.System.currentTimeMillis(), - "logType" -> "LivyMetrics", - "sparkLogLevel" -> "Application", - "host" -> hostName, - "cluster" -> clusterName, - "applicationId" -> applicationId, - "appAttemptId" -> appAttemptId, - "appName" -> appName, - "sparkWebUI" -> sparkWebUI, - "driverLogs" -> driverLogs, - "sparkMaster" -> sparkMaster, - "sparkUser" -> sparkUser, - "sparkVersion" -> sparkVersion, - "sparkEnvProperties" -> sparkEnvProperties, - - // App Level Metrics - "appStartTime" -> appStartTime, - "appEndTime" -> appEndTime, - "appElapsedTimeInSecs" -> appElapsedTimeInSecs, - "sparkDriverMemory" -> sparkDriverMemory, - "sparkExecutorMemory" -> sparkExecutorMemory, - "sparkExecutorCores" -> sparkExecutorCores, - "sparkExecutorInstances" -> sparkExecutorInstances, - "startNumberOfExecutors" -> startNumberOfExecutors, - "endNumberOfExecutors" -> endNumberOfExecutors, - "minNumberOfExecutors" -> minNumberOfExecutors, - "maxNumberOfExecutors" -> maxNumberOfExecutors, - - // App or Job Level Metrics - "appResult" -> sparkJobResult, - "appErrorMessage" -> jobErrorMessage, - "appErrorValue" -> jobErrorValue, - "appErrorTrace" -> jobErrorTrace, - "appSuccess" -> jobSuccess, - "appFailure" -> jobFailure, - - // App or Job Level Metrics - "metricExecutorRunTime" -> appMetricExecutorRunTime, - "metricJvmGCTime" -> appMetricJvmGCTime, - "metricExecutorDeserializeTime" -> appMetricExecutorDeserializeTime, - "inputRecordsRead" -> appInputRecordsRead, - "inputBytesRead" -> appInputBytesRead, - "outputBytesWritten" -> appOutputBytesWritten, - "outputRecordsWritten" -> appOutputRecordsWritten, - "shuffleRecordsRead" -> appShuffleRecordsRead, - "shuffleRemoteBytesRead" -> appShuffleRemoteBytesRead, - "shuffleRecordsWritten" -> appShuffleRecordsWritten, - "shuffleBytesWritten" -> appShuffleBytesWritten, - "shuffleWriteTime" -> appShuffleWriteTime - ).asJava - } - - def logJobMetrics: Unit = { - kafkaLogger.info(this.jobKafkaArgs) - } - - def 
jobKafkaArgs: JMap[String, Any] = { - Map( - - // App Level Config Details - "logtime" -> java.lang.System.currentTimeMillis(), - "logType" -> "LivyMetrics", - "sparkLogLevel" -> "Job", - "host" -> hostName, - "cluster" -> clusterName, - "applicationId" -> applicationId, - "appAttemptId" -> appAttemptId, - "appName" -> appName, - - // Job Level Config Details - "sparkJobId" -> sparkJobId, - "sparkJobResult" -> sparkJobResult, - "jobSuccessStatus" -> jobSuccessStatus, - "jobErrorMessage" -> jobErrorMessage, - "jobErrorValue" -> jobErrorValue, - "jobErrorTrace" -> jobErrorTrace, - - // Job Level Metrics - "jobSuccess" -> jobSuccess, - "jobFailure" -> jobFailure, - "jobStartTime" -> jobStartTime, - "jobEndTime" -> jobEndTime, - "startNumberOfExecutors" -> startNumberOfExecutors, - "endNumberOfExecutors" -> endNumberOfExecutors, - - // App or Job Level Metrics - "metricExecutorRunTime" -> metricExecutorRunTime, - "metricJvmGCTime" -> metricJvmGCTime, - "metricExecutorDeserializeTime" -> metricExecutorDeserializeTime, - "inputRecordsRead" -> inputRecordsRead, - "inputBytesRead" -> inputBytesRead, - "outputBytesWritten" -> outputBytesWritten, - "outputRecordsWritten" -> outputRecordsWritten, - "shuffleRecordsRead" -> shuffleRecordsRead, - "shuffleRemoteBytesRead" -> shuffleRemoteBytesRead, - "shuffleRecordsWritten" -> shuffleRecordsWritten, - "shuffleBytesWritten" -> shuffleBytesWritten, - "shuffleWriteTime" -> shuffleWriteTime - ).asJava - } - - def printMetrics: Unit = { - println("CUSTOM_LISTENER: sparkVersion = " + sparkVersion) - println("CUSTOM_LISTENER: logdate = " + date.toString) - println("CUSTOM_LISTENER: logtime = " + timeStamp.toString) - println("CUSTOM_LISTENER: host = " + hostName.toString) - println("CUSTOM_LISTENER: cluster = " + clusterName.toString) - println("CUSTOM_LISTENER: applicationId = " + applicationId.toString) - println("CUSTOM_LISTENER: appName = " + appName.toString) - println("CUSTOM_LISTENER: appAttemptId = " + appAttemptId.toString) - println("CUSTOM_LISTENER: appStartTime = " + appStartTime.toString) - println("CUSTOM_LISTENER: appEndTime = " + appEndTime.toString) - println("CUSTOM_LISTENER: appElapsedTimeInSecs = " + appElapsedTimeInSecs.toString) - println("CUSTOM_LISTENER: sparkMaster = " + sparkMaster.toString) - println("CUSTOM_LISTENER: sparkUser = " + sparkUser.toString) - println("CUSTOM_LISTENER: sparkEnvProperties = " + sparkEnvProperties.toString) - println("CUSTOM_LISTENER: sparkLogLevel = " + sparkLogLevel.toString) - println("CUSTOM_LISTENER: sparkDriverMemory = " + sparkDriverMemory.toString) - println("CUSTOM_LISTENER: sparkExecutorMemory = " + sparkExecutorMemory.toString) - println("CUSTOM_LISTENER: sparkExecutorCores = " + sparkExecutorCores.toString) - println("CUSTOM_LISTENER: sparkExecutorInstances = " + sparkExecutorInstances.toString) - println("CUSTOM_LISTENER: sparkWebUI = " + sparkWebUI.toString) - println("CUSTOM_LISTENER: driverLogs = " + driverLogs.toString) - println("CUSTOM_LISTENER: startNumberOfExecutors = " + startNumberOfExecutors.toString) - println("CUSTOM_LISTENER: endNumberOfExecutors = " + endNumberOfExecutors.toString) - println("CUSTOM_LISTENER: minNumberOfExecutors = " + minNumberOfExecutors.toString) - println("CUSTOM_LISTENER: maxNumberOfExecutors = " + maxNumberOfExecutors.toString) - println("CUSTOM_LISTENER: sparkJobId = " + sparkJobId.toString) - println("CUSTOM_LISTENER: sparkJobResult = " + sparkJobResult.toString) - println("CUSTOM_LISTENER: jobSuccess = " + jobSuccess.toString) - println("CUSTOM_LISTENER: 
jobFailure = " + jobFailure.toString) - println("CUSTOM_LISTENER: jobSuccessStatus = " + jobSuccessStatus) - println("CUSTOM_LISTENER: jobErrorMessage = " + jobErrorMessage) - println("CUSTOM_LISTENER: jobErrorValue = " + jobErrorValue) - println("CUSTOM_LISTENER: jobErrorTrace = " + jobErrorTrace) - println("CUSTOM_LISTENER: jobStartTime = " + jobStartTime.toString) - println("CUSTOM_LISTENER: jobEndTime = " + jobEndTime.toString) - println("CUSTOM_LISTENER: jobElapsedTimeInSecs = " + jobElapsedTimeInSecs.toString) - println("CUSTOM_LISTENER: appMetricExecutorRunTime = " + appMetricExecutorRunTime.toString) - println("CUSTOM_LISTENER: appMetricJvmGCTime = " + appMetricJvmGCTime.toString) - println("CUSTOM_LISTENER: appMetricExecutorDeserializeTime = " + appMetricExecutorDeserializeTime.toString) - println("CUSTOM_LISTENER: appInputRecordsRead = " + appInputRecordsRead.toString) - println("CUSTOM_LISTENER: appInputBytesRead = " + appInputBytesRead.toString) - println("CUSTOM_LISTENER: appOutputBytesWritten = " + appOutputBytesWritten.toString) - println("CUSTOM_LISTENER: appOutputRecordsWritten = " + appOutputRecordsWritten.toString) - println("CUSTOM_LISTENER: appShuffleRecordsRead = " + appShuffleRecordsRead.toString) - println("CUSTOM_LISTENER: appShuffleRemoteBytesRead = " + appShuffleRemoteBytesRead.toString) - println("CUSTOM_LISTENER: appShuffleRecordsWritten = " + appShuffleRecordsWritten.toString) - println("CUSTOM_LISTENER: appShuffleBytesWritten = " + appShuffleBytesWritten.toString) - println("CUSTOM_LISTENER: appShuffleWriteTime = " + appShuffleWriteTime.toString) - println("CUSTOM_LISTENER: metricExecutorRunTime = " + metricExecutorRunTime.toString) - println("CUSTOM_LISTENER: metricJvmGCTime = " + metricJvmGCTime.toString) - println("CUSTOM_LISTENER: metricExecutorDeserializeTime = " + metricExecutorDeserializeTime.toString) - println("CUSTOM_LISTENER: inputRecordsRead = " + inputRecordsRead.toString) - println("CUSTOM_LISTENER: inputBytesRead = " + inputBytesRead.toString) - println("CUSTOM_LISTENER: outputBytesWritten = " + outputBytesWritten.toString) - println("CUSTOM_LISTENER: outputRecordsWritten = " + outputRecordsWritten.toString) - println("CUSTOM_LISTENER: shuffleRecordsRead = " + shuffleRecordsRead.toString) - println("CUSTOM_LISTENER: shuffleRemoteBytesRead = " + shuffleRemoteBytesRead.toString) - println("CUSTOM_LISTENER: shuffleRecordsWritten = " + shuffleRecordsWritten.toString) - println("CUSTOM_LISTENER: shuffleBytesWritten = " + shuffleBytesWritten.toString) - println("CUSTOM_LISTENER: shuffleWriteTime = " + shuffleWriteTime.toString) - } - - override def onApplicationStart(appStart: SparkListenerApplicationStart): Unit = { - initMetrics - applicationId = appStart.appId.get - appName = appStart.appName - appStartTime = appStart.time - appStart.appAttemptId.foreach(appAttemptId = _) - sparkUser = appStart.sparkUser - appStart.driverLogs.foreach(logs => driverLogs = logs.toString) - } - - override def onApplicationEnd(appEnd: SparkListenerApplicationEnd): Unit = { - appEndTime = appEnd.time - appElapsedTimeInSecs = (appEndTime - appStartTime).toFloat / 1000.0f - if ((jobSuccess == 0) && (jobFailure == 0)) { - jobSuccess = 1 - sparkJobResult = "JobSucceeded" - } - logAppMetrics - printMetrics - - } - - override def onJobStart(jobStart: SparkListenerJobStart) { - resetMetrics - jobCompleted = false - sparkJobId = jobStart.jobId - jobStartTime = jobStart.time - startNumberOfExecutors = numberOfExecutors - if (sparkJobId == 0) { - minNumberOfExecutors = 
numberOfExecutors - } - } - - override def onJobEnd(jobEnd: SparkListenerJobEnd): Unit = { - jobCompleted = true - endNumberOfExecutors = numberOfExecutors - jobEndTime = jobEnd.time - jobElapsedTimeInSecs = (jobEndTime - jobStartTime).toFloat / 1000.0f - val jobResult = jobEnd.jobResult - sparkJobResult = jobResult.toString() - if ((sparkJobResult == "JobSucceeded") || - (jobSuccessStatus.endsWith("livy.repl.Interpreter$ExecuteSuccess"))) { - jobSuccess = 1 - jobFailure = 0 - } else { - jobSuccess = 0 - jobFailure = 1 - } - if (sparkLogLevel == "job") { - logJobMetrics - printMetrics - } - accumMetrics - } - - override def onExecutorAdded(executorAdded: SparkListenerExecutorAdded): Unit = { - numberOfExecutors += 1 - if (numberOfExecutors > maxNumberOfExecutors) { - maxNumberOfExecutors = numberOfExecutors - } - } - - override def onExecutorRemoved(executorRemoved: SparkListenerExecutorRemoved): Unit = { - numberOfExecutors -= 1 - if (numberOfExecutors < minNumberOfExecutors) { - minNumberOfExecutors = numberOfExecutors - } - } - - override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = { - taskEnd.taskMetrics match { - case null => - kafkaLogger.debug(s"Task metrics are not available for $taskEnd") - case metrics => - metricExecutorRunTime += metrics.executorRunTime - metricJvmGCTime += metrics.jvmGCTime - metricExecutorDeserializeTime += metrics.executorDeserializeTime - metricResultSize += metrics.resultSize - metricResultSerializationTime += metrics.resultSerializationTime - metricMemoryBytesSpilled += metrics.memoryBytesSpilled - metricDiskBytesSpilled += metrics.diskBytesSpilled - metricPeakExecutionMemory += metrics.peakExecutionMemory - - inputRecordsRead += metrics.inputMetrics.recordsRead - inputBytesRead += metrics.inputMetrics.bytesRead - - outputRecordsWritten += metrics.outputMetrics.recordsWritten - outputBytesWritten += metrics.outputMetrics.bytesWritten - - shuffleRecordsRead += metrics.shuffleReadMetrics.recordsRead - shuffleRemoteBytesRead += metrics.shuffleReadMetrics.remoteBytesRead - shuffleRemoteBlocksFetched += metrics.shuffleReadMetrics.remoteBlocksFetched - shuffleLocalBlocksFetched += metrics.shuffleReadMetrics.localBlocksFetched - shuffleFetchWaitTime += metrics.shuffleReadMetrics.fetchWaitTime - shuffleLocalBytesRead += metrics.shuffleReadMetrics.localBytesRead - - shuffleRecordsWritten += metrics.shuffleWriteMetrics.recordsWritten - shuffleBytesWritten += metrics.shuffleWriteMetrics.bytesWritten - shuffleWriteTime += metrics.shuffleWriteMetrics.writeTime - } - } -} diff --git a/gimel-logging/gimel-logging_2.2/src/main/scala/com/paypal/gimel/logging/GimelStreamingListener.scala b/gimel-logging/gimel-logging_2.2/src/main/scala/com/paypal/gimel/logging/GimelStreamingListener.scala deleted file mode 100644 index 1b1257aa..00000000 --- a/gimel-logging/gimel-logging_2.2/src/main/scala/com/paypal/gimel/logging/GimelStreamingListener.scala +++ /dev/null @@ -1,362 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- * - */ - -package com.paypal.gimel.logging - -import java.util.{Date, Map => JMap} - -import scala.collection.JavaConverters._ -import scala.util.{Failure, Success, Try} - -import org.apache.spark._ -import org.apache.spark.streaming.scheduler._ -import org.json4s.DefaultFormats - -import org.json4s.DefaultFormats - -/** - * A {{SparkListener}} that captures and logs all metrics - * - * @param conf - */ -class GimelStreamingListener(conf: SparkConf) extends StreamingListener with Logging { - - def accumulateCrossJobs: Boolean = false - - private implicit def formats = DefaultFormats - - val DEFAULT_GROUP_ID: String = "DEFAULT_GROUP_ID" - - var appNotStarted: Boolean = true - - /** - * Cluster and HostName - */ - - private val hadoopConfiguration = new org.apache.hadoop.conf.Configuration() - val clusterUrl: String = hadoopConfiguration.get("fs.default.name") - val clusterName: String = new java.net.URI(clusterUrl).getHost() - val hostName: String = clusterName - - /** - * Application Level Metrics - */ - - var appName: String = "Unknown" - var appId: String = "Unknown" - var driverLogs: String = "Unknown" - var appStartTime: Long = 0L - var appEndTime: Long = 0L - var appElapsedTimeInSecs: Float = 0.0f - var sparkLogLevel: String = "application" - var sparkWebUI: String = "" - var sparkEnvProperties: String = "" - var sparkUser: String = "" - var sparkVersion: String = "" - var sparkMaster: String = "" - var sparkDriverMemory: Long = 0L - var sparkExecutorMemory: Long = 0L - var sparkExecutorCores: Long = 0L - var sparkExecutorInstances: Long = 0L - - /** - * Job Level Metrics - */ - var sparkJobId: Long = 0L - var jobCompleted: Boolean = false - var jobStartTime: Long = 0L - var jobEndTime: Long = 0L - var jobElapsedTimeInSecs: Float = 0.0f - var jobSuccess: Long = 0L - var jobFailure: Long = 0L - var sparkJobResult: String = "" - var jobSuccessStatus: String = "" - var jobErrorMessage: String = "" - var jobErrorValue: String = "" - var jobErrorTrace: String = "" - - /** - * Application Level Metrics - */ - var appNumRecords = 0L - var appProcessingDelay = 0L - var appSchedulingDelay = 0L - var appTotalDelay = 0L - - /** - * Job Level Metrics - */ - var numRecords: Long = 0L - var processingDelay: Long = 0L - var schedulingDelay: Long = 0L - var totalDelay: Long = 0L - - /** - * Generate Timestamp in YYYYMMDDHHMISS format - */ - val dateTimeFormat = new java.text.SimpleDateFormat("yyyyMMddhhmmss") - - def timeStamp: Long = { - dateTimeFormat.format(new Date()).toLong - } - - /** - * Generate Date in YYYYMMDD format - */ - val dateFormat = new java.text.SimpleDateFormat("yyyyMMdd") - - def date: Long = { - dateFormat.format(new Date()).toLong - } - - /** - * Convert to bytes - */ - def sizeStrToBytes(str: String): Long = { - val lower = str.toLowerCase - if (lower.endsWith("k")) { - lower.substring(0, lower.length - 1).toLong * 1024 - } else if (lower.endsWith("m")) { - lower.substring(0, lower.length - 1).toLong * 1024 * 1024 - } else if (lower.endsWith("g")) { - lower.substring(0, lower.length - 1).toLong * 1024 * 1024 * 1024 - } else if (lower.endsWith("t")) { - lower.substring(0, lower.length - 1).toLong * 1024 * 1024 * 1024 * 1024 - } else { - // no suffix, so it's just a number in bytes - lower.toLong - } - } - - /** - * Metrics that do not change in application should be set here. E.g. 
{{username}} - */ - def initMetrics: Unit = { - sparkUser = Try { - conf.get("spark.app.user") - } match { - case Success(prop) => prop.toString - case Failure(_) => "Unknown" - } - appName = Try { - conf.get("spark.app.name") - } match { - case Success(prop) => prop.toString - case Failure(_) => "Unknown" - } - appId = Try { - conf.get("spark.app.id") - } match { - case Success(prop) => prop.toString - case Failure(_) => "Unknown" - } - sparkVersion = Try { - conf.get("spark.version") - } match { - case Success(prop) => prop.toString - case Failure(strVal) => "Unknown" - } - sparkMaster = Try { - conf.get("spark.master") - } match { - case Success(prop) => prop.toString - case Failure(_) => "Unknown" - } - sparkDriverMemory = Try { - conf.get("spark.driver.memory") - } match { - case Success(prop) => sizeStrToBytes(prop.toString) - case Failure(_) => 0L - } - sparkExecutorMemory = Try { - conf.get("spark.executor.memory") - } match { - case Success(prop) => sizeStrToBytes(prop.toString) - case Failure(_) => 0L - } - sparkExecutorCores = Try { - conf.get("spark.executor.cores") - } match { - case Success(prop) => prop.toLong - case Failure(_) => 0L - } - sparkExecutorInstances = Try { - conf.get("spark.executor.instances") - } match { - case Success(prop) => prop.toLong - case Failure(_) => 0L - } - sparkEnvProperties = Try { - conf.get("spark.env.properties") - } match { - case Success(prop) => prop.toString - case Failure(_) => "Unknown" - } - sparkWebUI = Try { - conf.get("spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES") - } match { - case Success(prop) => prop.toString - case Failure(_) => "Unknown" - } - sparkLogLevel = Try { - conf.get("spark.gimel.log.level") - } match { - case Success(prop) => prop.toString.toLowerCase - case Failure(_) => "application" - } - if ((sparkLogLevel != "application") && (sparkLogLevel != "job")) { - println("Invalid sparkLogLevel (" + sparkLogLevel + "). Valid options: application or job. So, setting sparkLogLevel to application.") - sparkLogLevel = "application" - } - } - - /** - * Accumulate Job Level Metrics after each job completion to compute Application Level Metrics. - */ - def accumMetrics: Unit = { - appNumRecords += numRecords - appProcessingDelay += processingDelay - appSchedulingDelay += schedulingDelay - appTotalDelay += totalDelay - } - - /** - * Reset Job Level Metrics after each job completion. 
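Each of the conf reads above follows the same Try-with-fallback shape; a compact, hypothetical helper (confOrElse is not part of Gimel or Spark) that captures the pattern looks like this:

import scala.util.Try
import org.apache.spark.SparkConf

// Hypothetical helper condensing the Try { conf.get(key) } match { ... }
// fallback pattern used throughout initMetrics.
object ConfFallbackSketch {
  def confOrElse(conf: SparkConf, key: String, default: String): String =
    Try(conf.get(key)).getOrElse(default)

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf(loadDefaults = false).set("spark.master", "local[2]")
    println(confOrElse(conf, "spark.master", "Unknown"))               // local[2]
    println(confOrElse(conf, "spark.gimel.log.level", "application"))  // application
  }
}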
- */ - def resetMetrics: Unit = { - sparkJobId = 0L - sparkJobResult = "" - jobSuccess = 0L - jobFailure = 0L - jobStartTime = 0L - jobEndTime = 0L - jobElapsedTimeInSecs = 0.0f - jobSuccessStatus = "" - jobErrorMessage = "" - jobErrorValue = "" - jobErrorTrace = "" - numRecords = 0L - processingDelay = 0L - schedulingDelay = 0L - totalDelay = 0L - } - - /** - * Log all the metrics both in JSON format and into Kafka - * - */ - - def logJobMetrics: Unit = { - kafkaLogger.info(this.jobKafkaArgs) - } - - def jobKafkaArgs: JMap[String, Any] = { - Map( - - // App Level Config Details - "sparkVersion" -> sparkVersion, - "logtime" -> java.lang.System.currentTimeMillis(), - "logType" -> "LivyMetrics", - "sparkLogLevel" -> "Streaming", - "host" -> hostName, - "cluster" -> clusterName, - "appName" -> appName, - "appId" -> appId, - "sparkMaster" -> sparkMaster, - "sparkUser" -> sparkUser, - "sparkDriverMemory" -> sparkDriverMemory, - "sparkExecutorMemory" -> sparkExecutorMemory, - "sparkExecutorCores" -> sparkExecutorCores, - "sparkExecutorCores" -> sparkExecutorCores, - "sparkExecutorInstances" -> sparkExecutorInstances, - "sparkWebUI" -> sparkWebUI, - - // App or Job Level Metrics - "appNumRecords" -> appNumRecords, - "appProcessingDelay" -> appProcessingDelay, - "appSchedulingDelay" -> appSchedulingDelay, - "appTotalDelay" -> appTotalDelay, - - // Job Level Config Details - "sparkJobId" -> sparkJobId, - "sparkJobResult" -> sparkJobResult, - "jobStartTime" -> jobStartTime, - "jobEndTime" -> jobEndTime, - "jobElapsedTimeInSecs" -> jobElapsedTimeInSecs, - "numRecords" -> numRecords, - "processingDelay" -> processingDelay, - "schedulingDelay" -> schedulingDelay, - "totalDelay" -> totalDelay - ).asJava - } - - def printMetrics: Unit = { - println("CUSTOM_LISTENER: sparkVersion = " + sparkVersion) - println("CUSTOM_LISTENER: logdate = " + date.toString) - println("CUSTOM_LISTENER: logtime = " + timeStamp.toString) - println("CUSTOM_LISTENER: host = " + hostName.toString) - println("CUSTOM_LISTENER: cluster = " + clusterName.toString) - println("CUSTOM_LISTENER: appName = " + appName.toString) - println("CUSTOM_LISTENER: appId = " + appId.toString) - println("CUSTOM_LISTENER: sparkMaster = " + sparkMaster.toString) - println("CUSTOM_LISTENER: sparkUser = " + sparkUser.toString) - println("CUSTOM_LISTENER: sparkEnvProperties = " + sparkEnvProperties.toString) - println("CUSTOM_LISTENER: sparkLogLevel = " + sparkLogLevel.toString) - println("CUSTOM_LISTENER: sparkDriverMemory = " + sparkDriverMemory.toString) - println("CUSTOM_LISTENER: sparkExecutorMemory = " + sparkExecutorMemory.toString) - println("CUSTOM_LISTENER: sparkExecutorCores = " + sparkExecutorCores.toString) - println("CUSTOM_LISTENER: sparkExecutorInstances = " + sparkExecutorInstances.toString) - println("CUSTOM_LISTENER: sparkWebUI = " + sparkWebUI.toString) - println("CUSTOM_LISTENER: appNumRecords = " + appNumRecords.toString) - println("CUSTOM_LISTENER: appProcessingDelay = " + appProcessingDelay.toString) - println("CUSTOM_LISTENER: appSchedulingDelay = " + appSchedulingDelay.toString) - println("CUSTOM_LISTENER: appTotalDelay = " + appTotalDelay.toString) - println("CUSTOM_LISTENER: sparkJobId = " + sparkJobId.toString) - println("CUSTOM_LISTENER: sparkJobResult = " + sparkJobResult.toString) - println("CUSTOM_LISTENER: jobStartTime = " + jobStartTime.toString) - println("CUSTOM_LISTENER: jobEndTime = " + jobEndTime.toString) - println("CUSTOM_LISTENER: jobElapsedTimeInSecs = " + jobElapsedTimeInSecs.toString) - 
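The payload handed to kafkaLogger.info is a Scala Map converted to a java.util.Map via asJava; a small self-contained sketch of that conversion, with made-up metric values, is shown below:

import java.util.{Map => JMap}
import scala.collection.JavaConverters._

// Illustrative only: how a Scala Map of metrics becomes the java.util.Map
// payload consumed by the Kafka-backed logger.
object KafkaPayloadSketch extends App {
  val payload: JMap[String, Any] = Map(
    "logtime"    -> System.currentTimeMillis(),
    "logType"    -> "LivyMetrics",
    "numRecords" -> 42L,
    "totalDelay" -> 1500L
  ).asJava
  println(payload.get("logType")) // LivyMetrics
}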
println("CUSTOM_LISTENER: numRecords = " + numRecords.toString) - println("CUSTOM_LISTENER: processingDelay = " + processingDelay.toString) - println("CUSTOM_LISTENER: schedulingDelay = " + schedulingDelay.toString) - println("CUSTOM_LISTENER: totalDelay = " + totalDelay.toString) - } - - override def onBatchStarted(batchStarted: StreamingListenerBatchStarted) { - resetMetrics - } - - override def onBatchCompleted(batchCompleted: StreamingListenerBatchCompleted) { - if (appNotStarted) { - initMetrics - appNotStarted = false - } - sparkJobId = batchCompleted.batchInfo.batchTime.toString.replaceAll(" ms", "").toLong - numRecords = batchCompleted.batchInfo.numRecords - processingDelay = batchCompleted.batchInfo.processingDelay.get - schedulingDelay = batchCompleted.batchInfo.schedulingDelay.get - totalDelay = batchCompleted.batchInfo.totalDelay.get - jobStartTime = batchCompleted.batchInfo.processingStartTime.get - jobEndTime = batchCompleted.batchInfo.processingEndTime.get - jobElapsedTimeInSecs = (jobEndTime - jobStartTime).toFloat / 1000.0f - sparkJobResult = "" - accumMetrics - logJobMetrics - printMetrics - } - -} diff --git a/gimel-logging/gimel-logging_2.3/pom.xml b/gimel-logging/gimel-logging_2.3/pom.xml deleted file mode 100644 index fdc77710..00000000 --- a/gimel-logging/gimel-logging_2.3/pom.xml +++ /dev/null @@ -1,56 +0,0 @@ - - - - - - gimel-logging - com.paypal.gimel - 0.4.3-SNAPSHOT - ../pom.xml - - 4.0.0 - - gimel-logging_2.3 - - - org.apache.spark - spark-core_${scala.binary.version} - ${spark.version} - provided - - - org.apache.spark - spark-sql_${scala.binary.version} - ${spark.version} - provided - - - org.apache.spark - spark-hive_${scala.binary.version} - ${spark.version} - provided - - - org.apache.spark - spark-streaming_${scala.binary.version} - ${spark.version} - provided - - - - diff --git a/gimel-logging/gimel-logging_2.3/src/main/scala/com/paypal/gimel/logging/GimelSparkListener.scala b/gimel-logging/gimel-logging_2.3/src/main/scala/com/paypal/gimel/logging/GimelSparkListener.scala deleted file mode 100644 index 3c1c7d45..00000000 --- a/gimel-logging/gimel-logging_2.3/src/main/scala/com/paypal/gimel/logging/GimelSparkListener.scala +++ /dev/null @@ -1,640 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- * - */ - -package com.paypal.gimel.logging - -import java.util.{Date, Map => JMap} - -import scala.collection.JavaConverters._ -import scala.util.{Failure, Success, Try} -import org.apache.spark._ -import org.apache.spark.executor.TaskMetrics -import org.apache.spark.scheduler._ -import org.json4s.DefaultFormats -import com.paypal.gimel.logging.Logging - -/** - * A {{SparkListener}} that captures and logs all metrics - * - * @param conf - */ -class GimelSparkListener(conf: SparkConf) extends SparkListener with Logging { - - def accumulateCrossJobs: Boolean = false - - private implicit def formats = DefaultFormats - - val DEFAULT_GROUP_ID: String = "DEFAULT_GROUP_ID" - - /** - * Cluster and HostName - */ - - private val hadoopConfiguration = new org.apache.hadoop.conf.Configuration() - val clusterUrl = hadoopConfiguration.get("fs.default.name") - val clusterName = new java.net.URI(clusterUrl).getHost() - val hostName = clusterName - - /** - * Application Level Metrics - */ - - var applicationId: String = "Unknown" - var appAttemptId: String = "" - var appName: String = "Unknown" - var driverLogs: String = "Unknown" - var appStartTime: Long = 0L - var appEndTime: Long = 0L - var appElapsedTimeInSecs: Float = 0.0f - var sparkLogLevel: String = "application" - var sparkWebUI: String = "" - var sparkEnvProperties: String = "" - var sparkUser: String = "" - var sparkVersion: String = "" - var sparkMaster: String = "" - var sparkDriverMemory: Long = 0L - var sparkExecutorMemory: Long = 0L - var sparkExecutorCores: Long = 0L - var sparkExecutorInstances: Long = 0L - - /** - * Application or Job Level Metrics - */ - var numberOfExecutors: Long = 0L - var startNumberOfExecutors: Long = 0L - var minNumberOfExecutors: Long = 0L - var maxNumberOfExecutors: Long = 0L - var endNumberOfExecutors: Long = 0L - - /** - * Job Level Metrics - */ - var sparkJobId: Long = 0L - var jobCompleted: Boolean = false - var jobStartTime: Long = 0L - var jobEndTime: Long = 0L - var jobElapsedTimeInSecs: Float = 0.0f - var jobSuccess: Long = 0L - var jobFailure: Long = 0L - var sparkJobResult: String = "" - var jobSuccessStatus: String = "" - var jobErrorMessage: String = "" - var jobErrorValue: String = "" - var jobErrorTrace: String = "" - - /** - * Application Level Metrics - */ - var appMetricExecutorRunTime = 0L - var appMetricJvmGCTime = 0L - var appMetricExecutorDeserializeTime = 0L - var appInputRecordsRead = 0L - var appInputBytesRead = 0L - var appOutputBytesWritten = 0L - var appOutputRecordsWritten = 0L - var appShuffleRecordsRead = 0L - var appShuffleRemoteBytesRead = 0L - var appShuffleRecordsWritten = 0L - var appShuffleBytesWritten = 0L - var appShuffleWriteTime = 0L - - /** - * Job Level Metrics - */ - var metricExecutorRunTime: Long = 0L - var metricJvmGCTime: Long = 0L - var metricExecutorDeserializeTime: Long = 0L - var metricResultSize: Long = 0L - var metricResultSerializationTime: Long = 0L - var metricMemoryBytesSpilled: Long = 0L - var metricDiskBytesSpilled: Long = 0L - var metricPeakExecutionMemory: Long = 0L - var inputRecordsRead: Long = 0L - var inputBytesRead: Long = 0L - var outputBytesWritten: Long = 0L - var outputRecordsWritten: Long = 0L - var shuffleRecordsRead: Long = 0L - var shuffleRemoteBytesRead: Long = 0L - var shuffleRecordsWritten: Long = 0L - var shuffleRemoteBlocksFetched: Long = 0L - var shuffleLocalBlocksFetched: Long = 0L - var shuffleFetchWaitTime: Long = 0L - var shuffleLocalBytesRead: Long = 0L - var shuffleBytesWritten: Long = 0L - var shuffleWriteTime: Long = 0L - - 
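For context, a minimal custom SparkListener that tracks executor counts from the same scheduler events (onExecutorAdded/onExecutorRemoved) is sketched below; the class name is hypothetical, and registration via spark.extraListeners is the standard Spark mechanism rather than anything Gimel-specific:

import org.apache.spark.scheduler.{SparkListener, SparkListenerExecutorAdded, SparkListenerExecutorRemoved}

// Minimal sketch: count live executors from scheduler events, as the Gimel
// listener does before publishing min/max executor metrics.
class ExecutorCountListener extends SparkListener {
  @volatile var numberOfExecutors: Long = 0L
  @volatile var maxNumberOfExecutors: Long = 0L

  override def onExecutorAdded(executorAdded: SparkListenerExecutorAdded): Unit = {
    numberOfExecutors += 1
    maxNumberOfExecutors = math.max(maxNumberOfExecutors, numberOfExecutors)
  }

  override def onExecutorRemoved(executorRemoved: SparkListenerExecutorRemoved): Unit =
    numberOfExecutors -= 1
}
// Registration (e.g. on spark-submit):
//   --conf spark.extraListeners=com.example.ExecutorCountListener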
/** - * Generate Timestamp in YYYYMMDDHHMISS format - */ - val dateTimeFormat = new java.text.SimpleDateFormat("yyyyMMddhhmmss") - - def timeStamp: Long = { - dateTimeFormat.format(new Date()).toLong - } - - /** - * Generate Date in YYYYMMDD format - */ - val dateFormat = new java.text.SimpleDateFormat("yyyyMMdd") - - def date: Long = { - dateFormat.format(new Date()).toLong - } - - /** - * Convert to bytes - */ - def sizeStrToBytes(str: String): Long = { - val lower = str.toLowerCase - if (lower.endsWith("k")) { - lower.substring(0, lower.length - 1).toLong * 1024 - } else if (lower.endsWith("m")) { - lower.substring(0, lower.length - 1).toLong * 1024 * 1024 - } else if (lower.endsWith("g")) { - lower.substring(0, lower.length - 1).toLong * 1024 * 1024 * 1024 - } else if (lower.endsWith("t")) { - lower.substring(0, lower.length - 1).toLong * 1024 * 1024 * 1024 * 1024 - } else { - // no suffix, so it's just a number in bytes - lower.toLong - } - } - - /** - * Metrics that do not change in application should be set here. E.g. {{username}} - */ - def initMetrics: Unit = { - sparkUser = Try { - conf.get("spark.app.user") - } match { - case Success(prop) => prop.toString - case Failure(_) => "Unknown" - } - sparkVersion = Try { - conf.get("spark.version") - } match { - case Success(prop) => prop.toString - case Failure(strVal) => "Unknown" - } - sparkMaster = Try { - conf.get("spark.master") - } match { - case Success(prop) => prop.toString - case Failure(_) => "Unknown" - } - sparkDriverMemory = Try { - conf.get("spark.driver.memory") - } match { - case Success(prop) => sizeStrToBytes(prop.toString) - case Failure(_) => 0L - } - sparkExecutorMemory = Try { - conf.get("spark.executor.memory") - } match { - case Success(prop) => sizeStrToBytes(prop.toString) - case Failure(_) => 0L - } - sparkExecutorCores = Try { - conf.get("spark.executor.cores") - } match { - case Success(prop) => prop.toLong - case Failure(_) => 0L - } - sparkExecutorInstances = Try { - conf.get("spark.executor.instances") - } match { - case Success(prop) => prop.toLong - case Failure(_) => 0L - } - sparkEnvProperties = Try { - conf.get("spark.env.properties") - } match { - case Success(prop) => prop.toString - case Failure(_) => "Unknown" - } - sparkWebUI = Try { - conf.get("spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES") - } match { - case Success(prop) => prop.toString - case Failure(_) => "Unknown" - } - sparkLogLevel = Try { - conf.get("spark.gimel.log.level") - } match { - case Success(prop) => prop.toString.toLowerCase - case Failure(_) => "application" - } - if ((sparkLogLevel != "application") && (sparkLogLevel != "job") && (sparkLogLevel != "task")) { - println("Invalid sparkLogLevel (" + sparkLogLevel + "). Valid options: [ application | job | task ]. So, setting sparkLogLevel to application.") - } - } - - /** - * Accumulate Job Level Metrics after each job completion to compute Application Level Metrics. 
- */ - def accumMetrics: Unit = { - appMetricExecutorRunTime += metricExecutorRunTime - appMetricJvmGCTime += metricJvmGCTime - appMetricExecutorDeserializeTime += metricExecutorDeserializeTime - appInputRecordsRead += inputRecordsRead - appInputBytesRead += inputBytesRead - appOutputBytesWritten += outputBytesWritten - appOutputRecordsWritten += outputRecordsWritten - appShuffleRecordsRead += shuffleRecordsRead - appShuffleRemoteBytesRead += shuffleRemoteBytesRead - appShuffleRecordsWritten += shuffleRecordsWritten - appShuffleBytesWritten += shuffleBytesWritten - appShuffleWriteTime += shuffleWriteTime - } - - /** - * Reset Job Level Metrics after each job completion. - */ - def resetMetrics: Unit = { - sparkJobId = 0L - sparkJobResult = "" - jobSuccess = 0L - jobFailure = 0L - jobStartTime = 0L - jobEndTime = 0L - jobElapsedTimeInSecs = 0.0f - jobSuccessStatus = "" - jobErrorMessage = "" - jobErrorValue = "" - jobErrorTrace = "" - - startNumberOfExecutors = 0L - endNumberOfExecutors = 0L - metricExecutorRunTime = 0L - metricJvmGCTime = 0L - metricExecutorDeserializeTime = 0L - inputRecordsRead = 0L - inputBytesRead = 0L - outputBytesWritten = 0L - outputRecordsWritten = 0L - shuffleRecordsRead = 0L - shuffleRemoteBytesRead = 0L - shuffleRecordsWritten = 0L - shuffleBytesWritten = 0L - shuffleWriteTime = 0L - } - - /** - * Log all the metrics both in JSON format and into Kafka - * - */ - def logAppMetrics: Unit = { - kafkaLogger.info(this.appKafkaArgs) - } - - def appKafkaArgs: JMap[String, Any] = { - Map( - - // App Level Config Details - "logtime" -> java.lang.System.currentTimeMillis(), - "logType" -> "LivyMetrics", - "sparkLogLevel" -> "Application", - "host" -> hostName, - "cluster" -> clusterName, - "applicationId" -> applicationId, - "appAttemptId" -> appAttemptId, - "appName" -> appName, - "sparkWebUI" -> sparkWebUI, - "driverLogs" -> driverLogs, - "sparkMaster" -> sparkMaster, - "sparkUser" -> sparkUser, - "sparkVersion" -> sparkVersion, - "sparkEnvProperties" -> sparkEnvProperties, - - // App Level Metrics - "appStartTime" -> appStartTime, - "appEndTime" -> appEndTime, - "appElapsedTimeInSecs" -> appElapsedTimeInSecs, - "sparkDriverMemory" -> sparkDriverMemory, - "sparkExecutorMemory" -> sparkExecutorMemory, - "sparkExecutorCores" -> sparkExecutorCores, - "sparkExecutorInstances" -> sparkExecutorInstances, - "startNumberOfExecutors" -> startNumberOfExecutors, - "endNumberOfExecutors" -> endNumberOfExecutors, - "minNumberOfExecutors" -> minNumberOfExecutors, - "maxNumberOfExecutors" -> maxNumberOfExecutors, - - // App or Job Level Metrics - "appResult" -> sparkJobResult, - "appErrorMessage" -> jobErrorMessage, - "appErrorValue" -> jobErrorValue, - "appErrorTrace" -> jobErrorTrace, - "appSuccess" -> jobSuccess, - "appFailure" -> jobFailure, - - // App or Job Level Metrics - "metricExecutorRunTime" -> appMetricExecutorRunTime, - "metricJvmGCTime" -> appMetricJvmGCTime, - "metricExecutorDeserializeTime" -> appMetricExecutorDeserializeTime, - "inputRecordsRead" -> appInputRecordsRead, - "inputBytesRead" -> appInputBytesRead, - "outputBytesWritten" -> appOutputBytesWritten, - "outputRecordsWritten" -> appOutputRecordsWritten, - "shuffleRecordsRead" -> appShuffleRecordsRead, - "shuffleRemoteBytesRead" -> appShuffleRemoteBytesRead, - "shuffleRecordsWritten" -> appShuffleRecordsWritten, - "shuffleBytesWritten" -> appShuffleBytesWritten, - "shuffleWriteTime" -> appShuffleWriteTime - ).asJava - } - - def logJobMetrics: Unit = { - kafkaLogger.info(this.jobKafkaArgs) - } - - def 
jobKafkaArgs: JMap[String, Any] = { - Map( - - // App Level Config Details - "logtime" -> java.lang.System.currentTimeMillis(), - "logType" -> "GimelMetrics", - "sparkLogLevel" -> "Job", - "host" -> hostName, - "cluster" -> clusterName, - "applicationId" -> applicationId, - "appAttemptId" -> appAttemptId, - "appName" -> appName, - - // Job Level Config Details - "sparkJobId" -> sparkJobId, - "sparkJobResult" -> sparkJobResult, - "jobSuccessStatus" -> jobSuccessStatus, - "jobErrorMessage" -> jobErrorMessage, - "jobErrorValue" -> jobErrorValue, - "jobErrorTrace" -> jobErrorTrace, - - // Job Level Metrics - "jobSuccess" -> jobSuccess, - "jobFailure" -> jobFailure, - "jobStartTime" -> jobStartTime, - "jobEndTime" -> jobEndTime, - "startNumberOfExecutors" -> startNumberOfExecutors, - "endNumberOfExecutors" -> endNumberOfExecutors, - - // App or Job Level Metrics - "metricExecutorRunTime" -> metricExecutorRunTime, - "metricJvmGCTime" -> metricJvmGCTime, - "metricExecutorDeserializeTime" -> metricExecutorDeserializeTime, - "inputRecordsRead" -> inputRecordsRead, - "inputBytesRead" -> inputBytesRead, - "outputBytesWritten" -> outputBytesWritten, - "outputRecordsWritten" -> outputRecordsWritten, - "shuffleRecordsRead" -> shuffleRecordsRead, - "shuffleRemoteBytesRead" -> shuffleRemoteBytesRead, - "shuffleRecordsWritten" -> shuffleRecordsWritten, - "shuffleBytesWritten" -> shuffleBytesWritten, - "shuffleWriteTime" -> shuffleWriteTime - ).asJava - } - - def printMetrics: Unit = { - println("CUSTOM_LISTENER: sparkVersion = " + sparkVersion) - println("CUSTOM_LISTENER: logdate = " + date.toString) - println("CUSTOM_LISTENER: logtime = " + timeStamp.toString) - println("CUSTOM_LISTENER: host = " + hostName.toString) - println("CUSTOM_LISTENER: cluster = " + clusterName.toString) - println("CUSTOM_LISTENER: applicationId = " + applicationId.toString) - println("CUSTOM_LISTENER: appName = " + appName.toString) - println("CUSTOM_LISTENER: appAttemptId = " + appAttemptId.toString) - println("CUSTOM_LISTENER: appStartTime = " + appStartTime.toString) - println("CUSTOM_LISTENER: appEndTime = " + appEndTime.toString) - println("CUSTOM_LISTENER: appElapsedTimeInSecs = " + appElapsedTimeInSecs.toString) - println("CUSTOM_LISTENER: sparkMaster = " + sparkMaster.toString) - println("CUSTOM_LISTENER: sparkUser = " + sparkUser.toString) - println("CUSTOM_LISTENER: sparkEnvProperties = " + sparkEnvProperties.toString) - println("CUSTOM_LISTENER: sparkLogLevel = " + sparkLogLevel.toString) - println("CUSTOM_LISTENER: sparkDriverMemory = " + sparkDriverMemory.toString) - println("CUSTOM_LISTENER: sparkExecutorMemory = " + sparkExecutorMemory.toString) - println("CUSTOM_LISTENER: sparkExecutorCores = " + sparkExecutorCores.toString) - println("CUSTOM_LISTENER: sparkExecutorInstances = " + sparkExecutorInstances.toString) - println("CUSTOM_LISTENER: sparkWebUI = " + sparkWebUI.toString) - println("CUSTOM_LISTENER: driverLogs = " + driverLogs.toString) - println("CUSTOM_LISTENER: startNumberOfExecutors = " + startNumberOfExecutors.toString) - println("CUSTOM_LISTENER: endNumberOfExecutors = " + endNumberOfExecutors.toString) - println("CUSTOM_LISTENER: minNumberOfExecutors = " + minNumberOfExecutors.toString) - println("CUSTOM_LISTENER: maxNumberOfExecutors = " + maxNumberOfExecutors.toString) - println("CUSTOM_LISTENER: sparkJobId = " + sparkJobId.toString) - println("CUSTOM_LISTENER: sparkJobResult = " + sparkJobResult.toString) - println("CUSTOM_LISTENER: jobSuccess = " + jobSuccess.toString) - println("CUSTOM_LISTENER: 
jobFailure = " + jobFailure.toString) - println("CUSTOM_LISTENER: jobSuccessStatus = " + jobSuccessStatus) - println("CUSTOM_LISTENER: jobErrorMessage = " + jobErrorMessage) - println("CUSTOM_LISTENER: jobErrorValue = " + jobErrorValue) - println("CUSTOM_LISTENER: jobErrorTrace = " + jobErrorTrace) - println("CUSTOM_LISTENER: jobStartTime = " + jobStartTime.toString) - println("CUSTOM_LISTENER: jobEndTime = " + jobEndTime.toString) - println("CUSTOM_LISTENER: jobElapsedTimeInSecs = " + jobElapsedTimeInSecs.toString) - println("CUSTOM_LISTENER: appMetricExecutorRunTime = " + appMetricExecutorRunTime.toString) - println("CUSTOM_LISTENER: appMetricJvmGCTime = " + appMetricJvmGCTime.toString) - println("CUSTOM_LISTENER: appMetricExecutorDeserializeTime = " + appMetricExecutorDeserializeTime.toString) - println("CUSTOM_LISTENER: appInputRecordsRead = " + appInputRecordsRead.toString) - println("CUSTOM_LISTENER: appInputBytesRead = " + appInputBytesRead.toString) - println("CUSTOM_LISTENER: appOutputBytesWritten = " + appOutputBytesWritten.toString) - println("CUSTOM_LISTENER: appOutputRecordsWritten = " + appOutputRecordsWritten.toString) - println("CUSTOM_LISTENER: appShuffleRecordsRead = " + appShuffleRecordsRead.toString) - println("CUSTOM_LISTENER: appShuffleRemoteBytesRead = " + appShuffleRemoteBytesRead.toString) - println("CUSTOM_LISTENER: appShuffleRecordsWritten = " + appShuffleRecordsWritten.toString) - println("CUSTOM_LISTENER: appShuffleBytesWritten = " + appShuffleBytesWritten.toString) - println("CUSTOM_LISTENER: appShuffleWriteTime = " + appShuffleWriteTime.toString) - println("CUSTOM_LISTENER: metricExecutorRunTime = " + metricExecutorRunTime.toString) - println("CUSTOM_LISTENER: metricJvmGCTime = " + metricJvmGCTime.toString) - println("CUSTOM_LISTENER: metricExecutorDeserializeTime = " + metricExecutorDeserializeTime.toString) - println("CUSTOM_LISTENER: inputRecordsRead = " + inputRecordsRead.toString) - println("CUSTOM_LISTENER: inputBytesRead = " + inputBytesRead.toString) - println("CUSTOM_LISTENER: outputBytesWritten = " + outputBytesWritten.toString) - println("CUSTOM_LISTENER: outputRecordsWritten = " + outputRecordsWritten.toString) - println("CUSTOM_LISTENER: shuffleRecordsRead = " + shuffleRecordsRead.toString) - println("CUSTOM_LISTENER: shuffleRemoteBytesRead = " + shuffleRemoteBytesRead.toString) - println("CUSTOM_LISTENER: shuffleRecordsWritten = " + shuffleRecordsWritten.toString) - println("CUSTOM_LISTENER: shuffleBytesWritten = " + shuffleBytesWritten.toString) - println("CUSTOM_LISTENER: shuffleWriteTime = " + shuffleWriteTime.toString) - } - - override def onApplicationStart(appStart: SparkListenerApplicationStart): Unit = { - initMetrics - applicationId = appStart.appId.get - appName = appStart.appName - appStartTime = appStart.time - appStart.appAttemptId.foreach(appAttemptId = _) - sparkUser = appStart.sparkUser - appStart.driverLogs.foreach(logs => driverLogs = logs.toString) - } - - override def onApplicationEnd(appEnd: SparkListenerApplicationEnd): Unit = { - appEndTime = appEnd.time - appElapsedTimeInSecs = (appEndTime - appStartTime).toFloat / 1000.0f - if ((jobSuccess == 0) && (jobFailure == 0)) { - jobSuccess = 1 - sparkJobResult = "JobSucceeded" - } - logAppMetrics - printMetrics - - } - - override def onJobStart(jobStart: SparkListenerJobStart) { - resetMetrics - jobCompleted = false - sparkJobId = jobStart.jobId - jobStartTime = jobStart.time - startNumberOfExecutors = numberOfExecutors - if (sparkJobId == 0) { - minNumberOfExecutors = 
numberOfExecutors - } - } - - override def onJobEnd(jobEnd: SparkListenerJobEnd): Unit = { - jobCompleted = true - endNumberOfExecutors = numberOfExecutors - jobEndTime = jobEnd.time - jobElapsedTimeInSecs = (jobEndTime - jobStartTime).toFloat / 1000.0f - val jobResult = jobEnd.jobResult - sparkJobResult = jobResult.toString() - if ((sparkJobResult == "JobSucceeded") || - (jobSuccessStatus.endsWith("livy.repl.Interpreter$ExecuteSuccess"))) { - jobSuccess = 1 - jobFailure = 0 - } else { - jobSuccess = 0 - jobFailure = 1 - } - if (sparkLogLevel == "job") { - logJobMetrics - printMetrics - } - accumMetrics - } - - override def onExecutorAdded(executorAdded: SparkListenerExecutorAdded): Unit = { - numberOfExecutors += 1 - if (numberOfExecutors > maxNumberOfExecutors) { - maxNumberOfExecutors = numberOfExecutors - } - } - - override def onExecutorRemoved(executorRemoved: SparkListenerExecutorRemoved): Unit = { - numberOfExecutors -= 1 - if (numberOfExecutors < minNumberOfExecutors) { - minNumberOfExecutors = numberOfExecutors - } - } - - /** - * Utility function to get required Task Metrics - * - * @param taskEnd SparkListenerTaskEnd - * @return Task Metrics - */ - def getTaskMetrics(taskEnd: SparkListenerTaskEnd): Map[String, Any] = { - val metrics: TaskMetrics = taskEnd.taskMetrics - Map( - // App Level Config Details - "logtime" -> java.lang.System.currentTimeMillis(), - "logType" -> "GimelMetrics", - "sparkLogLevel" -> "Task", - "host" -> hostName, - "cluster" -> clusterName, - "applicationId" -> applicationId, - "appAttemptId" -> appAttemptId, - "appName" -> appName, - // Task Level - "taskId" -> taskEnd.taskInfo.taskId, - "taskType" -> taskEnd.taskType, - "stageId" -> taskEnd.stageId, - "stageAttemptId" -> taskEnd.stageAttemptId, - "gettingResultTime" -> taskEnd.taskInfo.gettingResultTime, - "finishTime" -> taskEnd.taskInfo.finishTime, - "executorId" -> taskEnd.taskInfo.executorId, - "status" -> taskEnd.taskInfo.status, - "attemptNumber" -> taskEnd.taskInfo.attemptNumber, - "duration" -> taskEnd.taskInfo.duration, - "isSpeculative" -> taskEnd.taskInfo.speculative, - "launchTime" -> taskEnd.taskInfo.launchTime, - "host" -> taskEnd.taskInfo.host - , "metricExecutorRunTime" -> metrics.executorRunTime - , "metricJvmGCTime" -> metrics.jvmGCTime - , "metricExecutorDeserializeTime" -> metrics.executorDeserializeTime - , "metricResultSize" -> metrics.resultSize - , "metricResultSerializationTime" -> metrics.resultSerializationTime - , "metricMemoryBytesSpilled" -> metrics.memoryBytesSpilled - , "metricDiskBytesSpilled" -> metrics.diskBytesSpilled - , "metricPeakExecutionMemory" -> metrics.peakExecutionMemory - , "inputRecordsRead" -> metrics.inputMetrics.recordsRead - , "inputBytesRead" -> metrics.inputMetrics.bytesRead - , "outputRecordsWritten" -> metrics.outputMetrics.recordsWritten - , "outputBytesWritten" -> metrics.outputMetrics.bytesWritten - , "shuffleRecordsRead" -> metrics.shuffleReadMetrics.recordsRead - , "shuffleRemoteBytesRead" -> metrics.shuffleReadMetrics.remoteBytesRead - , "shuffleRemoteBlocksFetched" -> metrics.shuffleReadMetrics.remoteBlocksFetched - , "shuffleLocalBlocksFetched" -> metrics.shuffleReadMetrics.localBlocksFetched - , "shuffleFetchWaitTime" -> metrics.shuffleReadMetrics.fetchWaitTime - , "shuffleLocalBytesRead" -> metrics.shuffleReadMetrics.localBytesRead - , "shuffleRecordsWritten" -> metrics.shuffleWriteMetrics.recordsWritten - , "shuffleBytesWritten" -> metrics.shuffleWriteMetrics.bytesWritten - , "shuffleWriteTime" -> metrics.shuffleWriteMetrics.writeTime - ) 
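
Note (illustrative, not part of this patch): the task-level map above is built inside a SparkListener. A minimal Scala sketch of the same pattern, with an assumed class name and only a few of the counters, registers on the SparkContext the same way the removed listener would be:

import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskEnd}
import org.apache.spark.sql.SparkSession

// Stripped-down stand-in for the listener deleted above; accumulates a few
// of the same TaskMetrics counters across tasks.
class MinimalMetricsListener extends SparkListener {
  var executorRunTime: Long = 0L
  var inputRecordsRead: Long = 0L
  var outputRecordsWritten: Long = 0L

  override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = {
    // taskMetrics can be null when a task dies before reporting metrics
    Option(taskEnd.taskMetrics).foreach { m =>
      executorRunTime += m.executorRunTime
      inputRecordsRead += m.inputMetrics.recordsRead
      outputRecordsWritten += m.outputMetrics.recordsWritten
    }
  }
}

object MinimalMetricsListenerDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("listener-demo").getOrCreate()
    // Programmatic registration; spark.extraListeners=<fully qualified class name> also works.
    spark.sparkContext.addSparkListener(new MinimalMetricsListener)
    spark.range(0, 1000).count() // any action will now feed onTaskEnd
    spark.stop()
  }
}
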
- } - - /** - * Posts the Task Metrics to Sink System - * - * @param taskEnd SparkListenerTaskEnd - */ - def logTaskMetrics(taskEnd: SparkListenerTaskEnd): Unit = { - val metricsToPost = getTaskMetrics(taskEnd) - kafkaLogger.info(metricsToPost.asJava) - } - - override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = { - taskEnd.taskMetrics match { - case null => - kafkaLogger.debug(s"Task metrics are not available for $taskEnd") - case metrics => - logTaskMetrics(taskEnd) - metricExecutorRunTime += metrics.executorRunTime - metricJvmGCTime += metrics.jvmGCTime - metricExecutorDeserializeTime += metrics.executorDeserializeTime - metricResultSize += metrics.resultSize - metricResultSerializationTime += metrics.resultSerializationTime - metricMemoryBytesSpilled += metrics.memoryBytesSpilled - metricDiskBytesSpilled += metrics.diskBytesSpilled - metricPeakExecutionMemory += metrics.peakExecutionMemory - - inputRecordsRead += metrics.inputMetrics.recordsRead - inputBytesRead += metrics.inputMetrics.bytesRead - - outputRecordsWritten += metrics.outputMetrics.recordsWritten - outputBytesWritten += metrics.outputMetrics.bytesWritten - - shuffleRecordsRead += metrics.shuffleReadMetrics.recordsRead - shuffleRemoteBytesRead += metrics.shuffleReadMetrics.remoteBytesRead - shuffleRemoteBlocksFetched += metrics.shuffleReadMetrics.remoteBlocksFetched - shuffleLocalBlocksFetched += metrics.shuffleReadMetrics.localBlocksFetched - shuffleFetchWaitTime += metrics.shuffleReadMetrics.fetchWaitTime - shuffleLocalBytesRead += metrics.shuffleReadMetrics.localBytesRead - - shuffleRecordsWritten += metrics.shuffleWriteMetrics.recordsWritten - shuffleBytesWritten += metrics.shuffleWriteMetrics.bytesWritten - shuffleWriteTime += metrics.shuffleWriteMetrics.writeTime - } - } -} \ No newline at end of file diff --git a/gimel-logging/gimel-logging_2.3/src/main/scala/com/paypal/gimel/logging/GimelStreamingListener.scala b/gimel-logging/gimel-logging_2.3/src/main/scala/com/paypal/gimel/logging/GimelStreamingListener.scala deleted file mode 100644 index 09e1110a..00000000 --- a/gimel-logging/gimel-logging_2.3/src/main/scala/com/paypal/gimel/logging/GimelStreamingListener.scala +++ /dev/null @@ -1,364 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- * - */ - -package com.paypal.gimel.logging - -import java.util.{Date, Map => JMap} - -import scala.collection.JavaConverters._ -import scala.util.{Failure, Success, Try} -import org.json4s.DefaultFormats - -import org.apache.spark.{SparkConf} -import org.apache.spark.streaming.scheduler.{ - StreamingListener, - StreamingListenerBatchStarted, - StreamingListenerBatchCompleted -} - -/** - * A {{SparkListener}} that captures and logs all metrics - * - * @param conf - */ -class GimelStreamingListener(conf: SparkConf) extends StreamingListener with Logging { - - def accumulateCrossJobs: Boolean = false - - private implicit def formats = DefaultFormats - - val DEFAULT_GROUP_ID: String = "DEFAULT_GROUP_ID" - - var appNotStarted: Boolean = true - - /** - * Cluster and HostName - */ - - private val hadoopConfiguration = new org.apache.hadoop.conf.Configuration() - val clusterUrl: String = hadoopConfiguration.get("fs.default.name") - val clusterName: String = new java.net.URI(clusterUrl).getHost() - val hostName: String = clusterName - - /** - * Application Level Metrics - */ - - var appName: String = "Unknown" - var appId: String = "Unknown" - var driverLogs: String = "Unknown" - var appStartTime: Long = 0L - var appEndTime: Long = 0L - var appElapsedTimeInSecs: Float = 0.0f - var sparkLogLevel: String = "application" - var sparkWebUI: String = "" - var sparkEnvProperties: String = "" - var sparkUser: String = "" - var sparkVersion: String = "" - var sparkMaster: String = "" - var sparkDriverMemory: Long = 0L - var sparkExecutorMemory: Long = 0L - var sparkExecutorCores: Long = 0L - var sparkExecutorInstances: Long = 0L - - /** - * Job Level Metrics - */ - var sparkJobId: Long = 0L - var jobCompleted: Boolean = false - var jobStartTime: Long = 0L - var jobEndTime: Long = 0L - var jobElapsedTimeInSecs: Float = 0.0f - var jobSuccess: Long = 0L - var jobFailure: Long = 0L - var sparkJobResult: String = "" - var jobSuccessStatus: String = "" - var jobErrorMessage: String = "" - var jobErrorValue: String = "" - var jobErrorTrace: String = "" - - /** - * Application Level Metrics - */ - var appNumRecords = 0L - var appProcessingDelay = 0L - var appSchedulingDelay = 0L - var appTotalDelay = 0L - - /** - * Job Level Metrics - */ - var numRecords: Long = 0L - var processingDelay: Long = 0L - var schedulingDelay: Long = 0L - var totalDelay: Long = 0L - - /** - * Generate Timestamp in YYYYMMDDHHMISS format - */ - val dateTimeFormat = new java.text.SimpleDateFormat("yyyyMMddhhmmss") - - def timeStamp: Long = { - dateTimeFormat.format(new Date()).toLong - } - - /** - * Generate Date in YYYYMMDD format - */ - val dateFormat = new java.text.SimpleDateFormat("yyyyMMdd") - - def date: Long = { - dateFormat.format(new Date()).toLong - } - - /** - * Convert to bytes - */ - def sizeStrToBytes(str: String): Long = { - val lower = str.toLowerCase - if (lower.endsWith("k")) { - lower.substring(0, lower.length - 1).toLong * 1024 - } else if (lower.endsWith("m")) { - lower.substring(0, lower.length - 1).toLong * 1024 * 1024 - } else if (lower.endsWith("g")) { - lower.substring(0, lower.length - 1).toLong * 1024 * 1024 * 1024 - } else if (lower.endsWith("t")) { - lower.substring(0, lower.length - 1).toLong * 1024 * 1024 * 1024 * 1024 - } else { - // no suffix, so it's just a number in bytes - lower.toLong - } - } - - /** - * Metrics that do not change in application should be set here. E.g. 
{{username}} - */ - def initMetrics: Unit = { - sparkUser = Try { - conf.get("spark.app.user") - } match { - case Success(prop) => prop.toString - case Failure(_) => "Unknown" - } - appName = Try { - conf.get("spark.app.name") - } match { - case Success(prop) => prop.toString - case Failure(_) => "Unknown" - } - appId = Try { - conf.get("spark.app.id") - } match { - case Success(prop) => prop.toString - case Failure(_) => "Unknown" - } - sparkVersion = Try { - conf.get("spark.version") - } match { - case Success(prop) => prop.toString - case Failure(strVal) => "Unknown" - } - sparkMaster = Try { - conf.get("spark.master") - } match { - case Success(prop) => prop.toString - case Failure(_) => "Unknown" - } - sparkDriverMemory = Try { - conf.get("spark.driver.memory") - } match { - case Success(prop) => sizeStrToBytes(prop.toString) - case Failure(_) => 0L - } - sparkExecutorMemory = Try { - conf.get("spark.executor.memory") - } match { - case Success(prop) => sizeStrToBytes(prop.toString) - case Failure(_) => 0L - } - sparkExecutorCores = Try { - conf.get("spark.executor.cores") - } match { - case Success(prop) => prop.toLong - case Failure(_) => 0L - } - sparkExecutorInstances = Try { - conf.get("spark.executor.instances") - } match { - case Success(prop) => prop.toLong - case Failure(_) => 0L - } - sparkEnvProperties = Try { - conf.get("spark.env.properties") - } match { - case Success(prop) => prop.toString - case Failure(_) => "Unknown" - } - sparkWebUI = Try { - conf.get("spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES") - } match { - case Success(prop) => prop.toString - case Failure(_) => "Unknown" - } - sparkLogLevel = Try { - conf.get("spark.gimel.log.level") - } match { - case Success(prop) => prop.toString.toLowerCase - case Failure(_) => "application" - } - if ((sparkLogLevel != "application") && (sparkLogLevel != "job")) { - println("Invalid sparkLogLevel (" + sparkLogLevel + "). Valid options: application or job. So, setting sparkLogLevel to application.") - sparkLogLevel = "application" - } - } - - /** - * Accumulate Job Level Metrics after each job completion to compute Application Level Metrics. - */ - def accumMetrics: Unit = { - appNumRecords += numRecords - appProcessingDelay += processingDelay - appSchedulingDelay += schedulingDelay - appTotalDelay += totalDelay - } - - /** - * Reset Job Level Metrics after each job completion. 
- */ - def resetMetrics: Unit = { - sparkJobId = 0L - sparkJobResult = "" - jobSuccess = 0L - jobFailure = 0L - jobStartTime = 0L - jobEndTime = 0L - jobElapsedTimeInSecs = 0.0f - jobSuccessStatus = "" - jobErrorMessage = "" - jobErrorValue = "" - jobErrorTrace = "" - numRecords = 0L - processingDelay = 0L - schedulingDelay = 0L - totalDelay = 0L - } - - /** - * Log all the metrics both in JSON format and into Kafka - * - */ - - def logJobMetrics: Unit = { - kafkaLogger.info(this.jobKafkaArgs) - } - - def jobKafkaArgs: JMap[String, Any] = { - Map( - - // App Level Config Details - "sparkVersion" -> sparkVersion, - "logtime" -> java.lang.System.currentTimeMillis(), - "logType" -> "LivyMetrics", - "sparkLogLevel" -> "Streaming", - "host" -> hostName, - "cluster" -> clusterName, - "appName" -> appName, - "appId" -> appId, - "sparkMaster" -> sparkMaster, - "sparkUser" -> sparkUser, - "sparkDriverMemory" -> sparkDriverMemory, - "sparkExecutorMemory" -> sparkExecutorMemory, - "sparkExecutorCores" -> sparkExecutorCores, - "sparkExecutorCores" -> sparkExecutorCores, - "sparkExecutorInstances" -> sparkExecutorInstances, - "sparkWebUI" -> sparkWebUI, - - // App or Job Level Metrics - "appNumRecords" -> appNumRecords, - "appProcessingDelay" -> appProcessingDelay, - "appSchedulingDelay" -> appSchedulingDelay, - "appTotalDelay" -> appTotalDelay, - - // Job Level Config Details - "sparkJobId" -> sparkJobId, - "sparkJobResult" -> sparkJobResult, - "jobStartTime" -> jobStartTime, - "jobEndTime" -> jobEndTime, - "jobElapsedTimeInSecs" -> jobElapsedTimeInSecs, - "numRecords" -> numRecords, - "processingDelay" -> processingDelay, - "schedulingDelay" -> schedulingDelay, - "totalDelay" -> totalDelay - ).asJava - } - - def printMetrics: Unit = { - println("CUSTOM_LISTENER: sparkVersion = " + sparkVersion) - println("CUSTOM_LISTENER: logdate = " + date.toString) - println("CUSTOM_LISTENER: logtime = " + timeStamp.toString) - println("CUSTOM_LISTENER: host = " + hostName.toString) - println("CUSTOM_LISTENER: cluster = " + clusterName.toString) - println("CUSTOM_LISTENER: appName = " + appName.toString) - println("CUSTOM_LISTENER: appId = " + appId.toString) - println("CUSTOM_LISTENER: sparkMaster = " + sparkMaster.toString) - println("CUSTOM_LISTENER: sparkUser = " + sparkUser.toString) - println("CUSTOM_LISTENER: sparkEnvProperties = " + sparkEnvProperties.toString) - println("CUSTOM_LISTENER: sparkLogLevel = " + sparkLogLevel.toString) - println("CUSTOM_LISTENER: sparkDriverMemory = " + sparkDriverMemory.toString) - println("CUSTOM_LISTENER: sparkExecutorMemory = " + sparkExecutorMemory.toString) - println("CUSTOM_LISTENER: sparkExecutorCores = " + sparkExecutorCores.toString) - println("CUSTOM_LISTENER: sparkExecutorInstances = " + sparkExecutorInstances.toString) - println("CUSTOM_LISTENER: sparkWebUI = " + sparkWebUI.toString) - println("CUSTOM_LISTENER: appNumRecords = " + appNumRecords.toString) - println("CUSTOM_LISTENER: appProcessingDelay = " + appProcessingDelay.toString) - println("CUSTOM_LISTENER: appSchedulingDelay = " + appSchedulingDelay.toString) - println("CUSTOM_LISTENER: appTotalDelay = " + appTotalDelay.toString) - println("CUSTOM_LISTENER: sparkJobId = " + sparkJobId.toString) - println("CUSTOM_LISTENER: sparkJobResult = " + sparkJobResult.toString) - println("CUSTOM_LISTENER: jobStartTime = " + jobStartTime.toString) - println("CUSTOM_LISTENER: jobEndTime = " + jobEndTime.toString) - println("CUSTOM_LISTENER: jobElapsedTimeInSecs = " + jobElapsedTimeInSecs.toString) - 
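
Usage note (illustrative, not part of this patch): the GimelStreamingListener being removed here is a regular StreamingListener, so it would be attached to the StreamingContext before the streams start, roughly as follows:

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import com.paypal.gimel.logging.GimelStreamingListener // class deleted in this patch

object StreamingListenerWiring {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("streaming-listener-demo").setMaster("local[2]")
    val ssc = new StreamingContext(conf, Seconds(10))
    // Register the listener; onBatchCompleted then reports numRecords and the
    // scheduling/processing/total delays for every batch.
    ssc.addStreamingListener(new GimelStreamingListener(conf))
    // ... define the input streams and transformations here ...
    // ssc.start(); ssc.awaitTermination()
  }
}
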
println("CUSTOM_LISTENER: numRecords = " + numRecords.toString) - println("CUSTOM_LISTENER: processingDelay = " + processingDelay.toString) - println("CUSTOM_LISTENER: schedulingDelay = " + schedulingDelay.toString) - println("CUSTOM_LISTENER: totalDelay = " + totalDelay.toString) - } - - override def onBatchStarted(batchStarted: StreamingListenerBatchStarted) { - resetMetrics - } - - override def onBatchCompleted(batchCompleted: StreamingListenerBatchCompleted) { - if (appNotStarted) { - initMetrics - appNotStarted = false - } - sparkJobId = batchCompleted.batchInfo.batchTime.toString.replaceAll(" ms", "").toLong - numRecords = batchCompleted.batchInfo.numRecords - processingDelay = batchCompleted.batchInfo.processingDelay.get - schedulingDelay = batchCompleted.batchInfo.schedulingDelay.get - totalDelay = batchCompleted.batchInfo.totalDelay.get - jobStartTime = batchCompleted.batchInfo.processingStartTime.get - jobEndTime = batchCompleted.batchInfo.processingEndTime.get - jobElapsedTimeInSecs = (jobEndTime - jobStartTime).toFloat / 1000.0f - sparkJobResult = "" - accumMetrics - logJobMetrics - printMetrics - } - -} \ No newline at end of file diff --git a/gimel-logging/pom.xml b/gimel-logging/pom.xml deleted file mode 100644 index 1c16e24e..00000000 --- a/gimel-logging/pom.xml +++ /dev/null @@ -1,318 +0,0 @@ - - - - - gimel - com.paypal.gimel - 2.0.0-SNAPSHOT - ../pom.xml - - - 4.0.0 - com.paypal.gimel - gimel-logging - pom - 0.4.3-SNAPSHOT - - gimel-logging_2.2 - gimel-logging_2.3 - - - - UTF-8 - 2.2.4 - 2.3.0 - 1.7 - 2.10.0.pr1 - 2.5.0 - 1.4 - 0.10.2.0 - 1.10.19 - 1.6.2 - 4.13.1 - - - - joda-time - joda-time - 2.5 - provided - - - com.fasterxml.jackson.core - jackson-databind - ${jackson.databind.version} - provided - - - com.google.protobuf - protobuf-java - ${protobuf.version} - - - com.googlecode.protobuf-java-format - protobuf-java-format - ${protobuf.javaformatter.version} - - - org.apache.kafka - kafka_2.10 - ${kafka.version} - - - com.yammer.metrics - metrics-core - - - - org.apache.zookeeper - zookeeper - - - net.sf.jopt-simple - jopt-simpler - - - - - junit - junit - ${junit.version} - test - - - org.mockito - mockito-all - ${mockito.version} - test - - - org.powermock - powermock-module-junit4 - ${powermock.version} - test - - - org.powermock - powermock-api-mockito - ${powermock.version} - test - - - - - - - org.apache.maven.plugins - maven-enforcer-plugin - 1.4.1 - - - enforce-versions - - enforce - - - - - ${java.version} - - - - - - - - org.apache.maven.plugins - maven-surefire-plugin - 2.20 - - - net.alchim31.maven - scala-maven-plugin - 3.2.1 - - - - compile - testCompile - - - - - - -Xms64m - -Xmx1024m - - - - - org.apache.maven.plugins - maven-compiler-plugin - 3.6.1 - - ${java.version} - ${java.version} - - - - org.codehaus.mojo - build-helper-maven-plugin - 1.10 - - - add-scala-sources - generate-sources - - add-source - - - - ${project.basedir}/../src/main/scala - ${project.basedir}/../src/main/java - - - - - add-scala-tests - generate-sources - - add-test-source - - - - ${project.basedir}/src/test/scala - ${project.basedir}/src/test/java - - - - - add-resource - generate-resources - - add-resource - - - - - ${project.basedir}/../src/main/resources - - - - - - - - - org.scala-tools - maven-scala-plugin - 2.15.2 - - - org.apache.maven.plugins - maven-pmd-plugin - 3.7 - - - utf-8 - 100 - 1.5 - - com/paypal/scaas/message/* - - - - - org.apache.maven.plugins - maven-shade-plugin - 3.0.0 - - - - org.apache.kafka - com.paypal.shaded.org.apache.kafka - - - kafka - 
com.paypal.shaded.org.kafka - - - org.xerial.snappy - com.paypal.shaded.org.xerial.snappy - - - org.I0Itec.zkclient - com.paypal.shaded.org.I0Itec.zkclient - - - net.jpountz.lz4 - com.paypal.shaded.net.jpountz.lz4 - - - net.jpountz.util - com.paypal.shaded.net.jpountz.util - - - net.jpountz.xxhash - com.paypal.shaded.net.jpountz.xxhash - - - com.google - com.paypal.shaded.com.google - - - com.googlecode - com.paypal.shaded.com.googlecode - - - - - com.yammer.metrics:metrics-core - org.apache.zookeeper:zookeeper - org.scala-lang:scala-library - org.slf4j:slf4j-api - org.slf4j:slf4j-log4j12 - net.sf.jopt-simple:jopt-simple - - - - - - package - - shade - - - - - - org.apache.maven.plugins - maven-release-plugin - 2.5.3 - - v@{project.version} - - - - - diff --git a/gimel-logging/src/main/java/com/paypal/gimel/logging/Constants.java b/gimel-logging/src/main/java/com/paypal/gimel/logging/Constants.java deleted file mode 100644 index 32c93682..00000000 --- a/gimel-logging/src/main/java/com/paypal/gimel/logging/Constants.java +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - */ - -package com.paypal.gimel.logging; - -public class Constants { - - // Topics in Kafka - public final static String GIMEL_LOGGER_PROPERTY_PREFIX = "gimel.logger.kafka"; - public final static String GIMEL_LOGGER_SYSTEM_TOPIC_KEY = "gimel.logger.system.topic"; - public final static String GIMEL_LOGGER_APPMETRICS_TOPIC_KEY = "gimel.logger.appMetrics.topic"; - public final static String GIMEL_LOGGER_PROPERTIES_FILEPATH_KEY = "gimel.logger.properties.filepath"; - // The messages types which shall be passed into google proto as the object type - // So deserializer can know the type of message - public enum MessageType { - SYSTEM(6); - - int messageId; - - MessageType(final int messageName) { - this.messageId = messageName; - } - - public int getValue() { - return this.messageId; - } - } - public enum TopicType { - SYSTEM("gimel.logger.system.topic"), - APPLICATION("gimel.logger.appMetrics.topic"); - - String messageName; - - TopicType(final String messageName) { - this.messageName = messageName; - } - - @Override - public String toString() { - return this.messageName; - } - } -} diff --git a/gimel-logging/src/main/java/com/paypal/gimel/logging/GimelLogger.java b/gimel-logging/src/main/java/com/paypal/gimel/logging/GimelLogger.java deleted file mode 100644 index 5bff6ff2..00000000 --- a/gimel-logging/src/main/java/com/paypal/gimel/logging/GimelLogger.java +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - */ - -package com.paypal.gimel.logging; - -import java.util.Map; -import java.util.Properties; - -public interface GimelLogger { - - - /** - * Logs the arguments to log file and sends to kafka. - * - * @param args - */ - void logCustomMetrics(Object... args); - - void initCustomMetricsProperties(Properties props); - - void initCustomMetricsProperties(Properties props, String topic); - -} diff --git a/gimel-logging/src/main/java/com/paypal/gimel/logging/LogProvider.java b/gimel-logging/src/main/java/com/paypal/gimel/logging/LogProvider.java deleted file mode 100644 index 35c98dc5..00000000 --- a/gimel-logging/src/main/java/com/paypal/gimel/logging/LogProvider.java +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - */ - -package com.paypal.gimel.logging; - -import java.util.Enumeration; -import java.util.Properties; - -import com.paypal.gimel.logging.utils.Configuration; -import org.apache.kafka.clients.producer.KafkaProducer; - -/** - * Provides logger for different types of messages. - * - */ -public class LogProvider { - - private KafkaProducer kafkaProducer; - private KafkaProducer customKafkaProducer; - private Configuration config = Configuration.getInstance(); - private final Properties kafkaProps = config.getKafkaProperties(); - private final Properties topics = config.getKafkaTopics(); - - - public String getTopicName(Constants.TopicType topicType) { - return topics.get(topicType.toString()).toString(); - } - - - public LogProvider(final String className) { - } - - /** - * Returns a logger for the given type of {@linkplain Constants.TopicType}. 
- * - * @param topicType - * @return logger instance - */ - public KafkaProducer getKafkaProducer(final Constants.TopicType topicType) { - return this.getKafkaProducer(topicType, this.kafkaProps); - } - - public KafkaProducer getKafkaProducer(final Constants.TopicType topicType, final Properties kafkaProps) { - switch (topicType) { - case APPLICATION: - if (this.customKafkaProducer == null) { - this.customKafkaProducer = new KafkaProducer(kafkaProps); - } - return this.customKafkaProducer; - default: { - if (kafkaProducer == null) { - kafkaProducer = new KafkaProducer(config.getKafkaProperties()); - } - return this.kafkaProducer; - } - - } - } - - public KafkaProducer getDefaultLogger() { - return this.kafkaProducer; - } - - public KafkaProducer getSystemLogger() { - return this.getKafkaProducer(Constants.TopicType.SYSTEM); - } - - - public KafkaProducer getApplicationLogger() { - return this.getKafkaProducer(Constants.TopicType.APPLICATION); - } - - public void initCustomMetrics(Properties customProps) { - - for (Enumeration propertyNames = customProps.propertyNames(); - propertyNames.hasMoreElements(); ) { - Object key = propertyNames.nextElement(); - if (key.toString().indexOf(Constants.GIMEL_LOGGER_PROPERTY_PREFIX) != -1) - kafkaProps.put(key, customProps.get(key)); - - } - } - - public void initCustomMetrics(Properties customProps, String topic) { - initCustomMetrics(customProps); - topics.put(Constants.GIMEL_LOGGER_APPMETRICS_TOPIC_KEY, topic); - } -} diff --git a/gimel-logging/src/main/java/com/paypal/gimel/logging/SystemLogger.java b/gimel-logging/src/main/java/com/paypal/gimel/logging/SystemLogger.java deleted file mode 100644 index f4834c2d..00000000 --- a/gimel-logging/src/main/java/com/paypal/gimel/logging/SystemLogger.java +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - */ - -package com.paypal.gimel.logging; - -public interface SystemLogger extends GimelLogger { - - /** - * Logs the arguments to log file and sends to kafka. - * - * @param args - */ - void debug(Object... args); - - /** - * Logs the arguments to log file and sends to kafka. - * - * @param args - */ - void info(Object... args); - - - /** - * Logs the arguments to log file and sends to kafka. - * - * @param args - */ - void warn(Object... args); - - /** - * Logs the arguments to log file and sends to kafka. - * - * @param error - */ - void error(String error); - -} diff --git a/gimel-logging/src/main/java/com/paypal/gimel/logging/impl/BaseLogger.java b/gimel-logging/src/main/java/com/paypal/gimel/logging/impl/BaseLogger.java deleted file mode 100644 index 79223027..00000000 --- a/gimel-logging/src/main/java/com/paypal/gimel/logging/impl/BaseLogger.java +++ /dev/null @@ -1,147 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - */ - -package com.paypal.gimel.logging.impl; - -import java.util.Map; -import java.util.Properties; - -import com.fasterxml.jackson.annotation.JsonInclude; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.fasterxml.jackson.databind.node.ObjectNode; -import com.google.protobuf.Message; -import com.googlecode.protobuf.format.JsonFormat; -import org.apache.kafka.clients.producer.Callback; -import org.apache.kafka.clients.producer.KafkaProducer; -import org.apache.kafka.clients.producer.ProducerRecord; -import org.apache.kafka.clients.producer.RecordMetadata; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; - -import com.paypal.gimel.logging.Constants; -import com.paypal.gimel.logging.LogProvider; -import com.paypal.gimel.logging.GimelLogger; -import com.paypal.gimel.logging.utils.Context; - -/** - * Contains the basic logger functionalities define in {@link GimelLogger} - * - */ -public class BaseLogger implements GimelLogger { - - protected LogProvider logProvider; - private JsonFormat protoBufFormat; - private final Context context = new Context(); - protected final ObjectMapper jsonizer = new ObjectMapper(); - - private final Logger logger = LogManager.getLogger(this.getClass().toString()); - - protected BaseLogger(String className) { - this.init(className); - this.jsonizer.setSerializationInclusion(JsonInclude.Include.NON_NULL); - } - - @Override - public void logCustomMetrics(Object... 
args) { - if (args == null) { - return; - } - if (args.length == 1 && args[0] instanceof String) { - logger.info(args[0]); - return; - } - final ObjectNode js = this.encodeArgs(args); - this.publishToKafka(Constants.TopicType.APPLICATION, js.toString()); - } - - @Override - public void initCustomMetricsProperties(Properties props) { - this.logProvider.initCustomMetrics(props); - } - - @Override - public void initCustomMetricsProperties(Properties props, String topic) { - this.logProvider.initCustomMetrics(props, topic); - } - - - @SuppressWarnings("deprecation") - protected ObjectNode encodeArgs(final Object[] args) { - final ObjectNode result = this.jsonizer.getNodeFactory().objectNode(); - if (args.length == 1 && args[0] instanceof Map) { - return this.jsonizer.valueToTree(args[0]); - } - // arguments are alternating name and value; if there's an - // odd number of arguments, it's a value for field "data" - for (int i = 0; i < args.length; i += 2) { - try { - if (i + 1 < args.length) { - result.put(args[i].toString(), this.jsonizer.valueToTree(args[i + 1])); - } else { - result.put("data", this.jsonizer.valueToTree(args[i])); - } - } catch (final Exception jse) { - jse.printStackTrace(); - // ignore any exceptions and keep marching - } - } - return result; - } - - private void init(String classname) { - - this.logProvider = new LogProvider(classname); - this.protoBufFormat = new JsonFormat(); - } - - - protected void publishToKafka(Constants.TopicType topicType, String value) { - String currentTopic = this.logProvider.getTopicName(topicType); - KafkaProducer kafka = this.logProvider.getKafkaProducer(topicType); - - // if kafka producer is not created, ignore the error and march - // TODO: Needs to be handled better way. But for now we decided to ignore and not fail - // applications. - if (kafka == null) { - logger.error("Unable to send metrics to kafka. Printing the metrics to console."); - System.out.println(value); - return; - } - kafka.send(new ProducerRecord(currentTopic, value.getBytes()), new Callback() { - @Override - public void onCompletion(final RecordMetadata metadata, final Exception e) { - if (e != null) { - logger.error("Unable to write to Kafka in appender [ " + metadata + "]", e); - } - } - }); - - } - - /** - * Converts {@link Message} to json string - * - * @param message - * @return json string of the given message - */ - protected String messageToString(final Message message) { - String json = null; - json = this.protoBufFormat.printToString(message); - return json; - } - -} \ No newline at end of file diff --git a/gimel-logging/src/main/java/com/paypal/gimel/logging/impl/JSONSystemLogger.java b/gimel-logging/src/main/java/com/paypal/gimel/logging/impl/JSONSystemLogger.java deleted file mode 100644 index 19ff527e..00000000 --- a/gimel-logging/src/main/java/com/paypal/gimel/logging/impl/JSONSystemLogger.java +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
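
For reference (illustrative, not part of this patch): publishToKafka above is a fire-and-forget byte-array produce with a logging-only callback. The same pattern as a standalone Scala sketch, using the broker and topic sample values from gimelLoggerConfig.properties, looks like this:

import java.util.Properties
import org.apache.kafka.clients.producer.{Callback, KafkaProducer, ProducerRecord, RecordMetadata}

object KafkaFireAndForget {
  def main(args: Array[String]): Unit = {
    val props = new Properties()
    props.put("bootstrap.servers", "kafka_broker_1:9092") // sample value
    props.put("key.serializer", "org.apache.kafka.common.serialization.ByteArraySerializer")
    props.put("value.serializer", "org.apache.kafka.common.serialization.ByteArraySerializer")

    val producer = new KafkaProducer[Array[Byte], Array[Byte]](props)
    val payload = """{"logType":"GimelMetrics"}""".getBytes("UTF-8")

    // Same fire-and-forget style as BaseLogger.publishToKafka: errors are only logged.
    producer.send(new ProducerRecord[Array[Byte], Array[Byte]]("gimel_logging_kafka_topic", payload),
      new Callback {
        override def onCompletion(metadata: RecordMetadata, e: Exception): Unit =
          if (e != null) System.err.println(s"Unable to write to Kafka: ${e.getMessage}")
      })
    producer.close()
  }
}
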
- * - */ - -package com.paypal.gimel.logging.impl; - -import com.fasterxml.jackson.databind.node.ObjectNode; -import com.paypal.gimel.logging.Constants; -import com.paypal.gimel.logging.SystemLogger; -import com.paypal.gimel.logging.utils.Context; - -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; - -/** - * Logs messages in JSON format and sends relevant messages to Kafka. - * - */ -public class JSONSystemLogger extends BaseLogger implements SystemLogger { - - private static JSONSystemLogger instance; - - private final Logger logger = LogManager.getLogger(this.getClass().toString()); - - protected JSONSystemLogger(String className) { - super(className); - } - - /** - * Gives instance of {@code Context} - *
    - * Note: {@code ipAddress} and {@code hostname} will be initialized with the current system details. Those values - *
    - * - * @return {@code Context} instance. - */ - public static JSONSystemLogger getInstance(Class clazz) { - if (instance == null) { - synchronized (Context.class) { - if (instance == null) { - instance = new JSONSystemLogger(clazz.getName()); - } - } - } - return instance; - } - - - /** - * Converts the arguments to JSON format and logs to log file and sends to kafka. - * - * @param args - */ - @Override - public void debug(final Object... args) { - if (args == null) { - return; - } - if (args.length == 1 && args[0] instanceof String) { - logger.debug(args[0]); - return; - } - - final ObjectNode js = this.encodeArgs(args); - this.publishToKafka(Constants.TopicType.SYSTEM, js.toString()); - logger.debug(js.toString()); - } - - /** - * Converts the arguments to JSON format and logs to log file and sends to kafka. - * - * @param args - */ - @Override - public void info(final Object... args) { - if (args == null) { - return; - } - if (args.length == 1 && args[0] instanceof String) { - logger.info(args[0]); - return; - } - final ObjectNode js = this.encodeArgs(args); - this.publishToKafka(Constants.TopicType.SYSTEM, js.toString()); - logger.info(js.toString()); - } - - /** - * Converts the arguments to JSON format and logs to log file and sends to kafka. - * - * @param args - */ - @Override - public void warn(final Object... args) { - if (args == null) { - return; - } - if (args.length == 1 && args[0] instanceof String) { - logger.warn(args[0]); - return; - } - final ObjectNode js = this.encodeArgs(args); - this.publishToKafka(Constants.TopicType.SYSTEM, js.toString()); - this.logger.warn(js.toString()); - } - - @Override - public void error(final String message) { - this.publishToKafka(Constants.TopicType.SYSTEM, message); - } - -} diff --git a/gimel-logging/src/main/java/com/paypal/gimel/logging/utils/Configuration.java b/gimel-logging/src/main/java/com/paypal/gimel/logging/utils/Configuration.java deleted file mode 100644 index 0377740f..00000000 --- a/gimel-logging/src/main/java/com/paypal/gimel/logging/utils/Configuration.java +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - */ - -package com.paypal.gimel.logging.utils; - -import java.io.FileInputStream; -import java.io.IOException; -import java.io.InputStream; -import java.util.Properties; - -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; -import org.apache.spark.SparkFiles; - -import com.paypal.gimel.logging.Constants; - -public class Configuration { - - private static Configuration instance = null; - private Properties properties = null; - private final Logger logger = LogManager.getLogger(this.getClass().toString()); - - /** - * It's a singleton class. 
Use {@link #getInstance()} - */ - private Configuration() { - readConfiguration(); - } - - public static synchronized Configuration getInstance() { - if (instance == null) { - instance = new Configuration(); - } - return instance; - } - - /** - * Read the configuration file gimelLoggerConfig.properties, and load to properties - */ - - public void readConfiguration() { - properties = new Properties(); - try { - this.logger.debug("Reading gimel logger properties."); - String filePathDefault = "/gimelLoggerConfig.properties"; - InputStream configStream; - if (System.getProperty(Constants.GIMEL_LOGGER_PROPERTIES_FILEPATH_KEY) == null) { - configStream = this.getClass().getResourceAsStream(filePathDefault); - } else { - String filePathPropertyValue = (String) SparkFiles.get(System.getProperty(Constants.GIMEL_LOGGER_PROPERTIES_FILEPATH_KEY)); - configStream = new FileInputStream(filePathPropertyValue); - } - properties.load(configStream); - configStream.close(); - } catch (IOException e) { - e.printStackTrace(); - } - } - - public Object get(String key) { - return properties.get(key); - } - - /** - * From the configuration file gimelLoggerConfig.properties, it reads the kafka properties and returns as properties - */ - public Properties getKafkaProperties() { - Properties props = new Properties(); - for (Object key : properties.keySet()) { - if (key.toString().indexOf(Constants.GIMEL_LOGGER_PROPERTY_PREFIX) != -1) { - props.put(key.toString().substring(Constants.GIMEL_LOGGER_PROPERTY_PREFIX.length() + 1, key.toString().length()), properties.get(key)); - } - - } - return props; - } - - /** - * From the configuration file gimelLoggerConfig.properties, it reads the topics and returns as properties - */ - public Properties getKafkaTopics() { - Properties props = new Properties(); - for (Object key : properties.keySet()) { - if (key.toString().contains(".topic")) { - props.put(key.toString(), properties.get(key)); - } - - } - return props; - } -} diff --git a/gimel-logging/src/main/java/com/paypal/gimel/logging/utils/Context.java b/gimel-logging/src/main/java/com/paypal/gimel/logging/utils/Context.java deleted file mode 100644 index 39ee8594..00000000 --- a/gimel-logging/src/main/java/com/paypal/gimel/logging/utils/Context.java +++ /dev/null @@ -1,149 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - */ - -package com.paypal.gimel.logging.utils; - -import java.net.InetAddress; -import java.net.UnknownHostException; -import java.util.HashMap; -import java.util.Map; - - -/** - * Contains information which can be reused. - *
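
Worked example (illustrative): Configuration.getKafkaProperties above keeps only keys under the gimel.logger.kafka prefix and strips that prefix before handing them to the Kafka producer. In Scala terms, using sample values from gimelLoggerConfig.properties:

// REPL-style sketch of the prefix handling in Configuration.getKafkaProperties.
val prefix = "gimel.logger.kafka"
val raw = Map(
  "gimel.logger.kafka.bootstrap.servers" -> "kafka_broker_1:9092,kafka_broker_2:9092",
  "gimel.logger.kafka.acks"              -> "0",
  "gimel.logger.system.topic"            -> "gimel_logging_kafka_topic" // not under the prefix, so ignored here
)
val kafkaProps = raw.collect {
  case (k, v) if k.startsWith(prefix) => k.drop(prefix.length + 1) -> v
}
// kafkaProps: Map("bootstrap.servers" -> "kafka_broker_1:9092,kafka_broker_2:9092", "acks" -> "0")
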
    - * Note: {@code Context} is a singleton instance. - *
    - * - */ -public class Context { - - private String host; - private String ipAddress; - - // Sccas metrics details - private String profile; - private String tenant; - private String appServiceName; - private Map dimensions; - private int resolution; - - public Context() { - InetAddress inetAddress = null; - try { - inetAddress = InetAddress.getLocalHost(); - this.host = inetAddress.getHostName(); - this.ipAddress = inetAddress.getHostAddress(); - this.dimensions = new HashMap(); - } - catch (final UnknownHostException e) { - e.printStackTrace(); - } - } - - /** - * @return the profile - */ - public String getProfile() { - return this.profile; - } - - /** - * @param profileName - * the profile to set - */ - public void setProfileName(final String profileName) { - this.profile = profileName; - } - - /** - * @return the appServiceName - */ - public String getAppServiceName() { - return this.appServiceName; - } - - /** - * @return the dimensions - */ - public Map getDimensions() { - return this.dimensions; - } - - /** - * @param appServiceName - * the appServiceName to set - */ - public void setAppServiceName(final String appServiceName) { - this.appServiceName = appServiceName; - } - - /** - * @return the tenant - */ - public String getTenant() { - return this.tenant; - } - - /** - * @param tenant - * the tenant to set - */ - public void setTenant(final String tenant) { - this.tenant = tenant; - } - - /** - * @return the ipAddress - */ - public String getIpAddress() { - return this.ipAddress; - } - - /** - * @param ipAddress - * the ipAddress to set - */ - public void setIpAddress(final String ipAddress) { - this.ipAddress = ipAddress; - } - - /** - * @return the host - */ - public String getHost() { - return this.host; - } - - /** - * @return the resolution - */ - public int getResolution() { - return resolution; - } - - /** - * Set the hostname - * - * @param host - * the host to set - */ - public void setHost(final String host) { - this.host = host; - } - -} diff --git a/gimel-logging/src/main/java/com/paypal/gimel/logging/utils/Utilities.java b/gimel-logging/src/main/java/com/paypal/gimel/logging/utils/Utilities.java deleted file mode 100644 index ab9f6c04..00000000 --- a/gimel-logging/src/main/java/com/paypal/gimel/logging/utils/Utilities.java +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- * - */ - -package com.paypal.gimel.logging.utils; - -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; - -import java.io.IOException; -import java.net.Socket; -import java.net.SocketAddress; -import java.net.InetAddress; -import java.net.InetSocketAddress; -import java.net.UnknownHostException; - -public class Utilities { - - private final static Logger LOGGER = LogManager.getLogger(Utilities.class); - - /** - * Get system hostname - * - * @return hostname if available otherwise returns empty string - */ - public static String getHostname() { - InetAddress ip; - String hostname = ""; - try { - ip = InetAddress.getLocalHost(); - hostname = ip.getHostName(); - } catch (UnknownHostException e) { - // Ignore the exception and keep marching - LOGGER.error("Unable to fetch local hostname" + e.getMessage()); - } - return hostname; - } - - public static boolean isReachable(String hostname, int port) { - - try (Socket socket = new Socket()) { - SocketAddress socketAddress = new InetSocketAddress(hostname, port); - int timeout = 100; - socket.connect(socketAddress, timeout); - return true; - } catch (IOException ex) { - // Ignore and march - LOGGER.warn(String.format("Host %s is not listening on %d", hostname, port)); - } - - return false; - } - -} \ No newline at end of file diff --git a/gimel-logging/src/main/resources/gimelLoggerConfig.properties b/gimel-logging/src/main/resources/gimelLoggerConfig.properties deleted file mode 100644 index ecfcc420..00000000 --- a/gimel-logging/src/main/resources/gimelLoggerConfig.properties +++ /dev/null @@ -1,31 +0,0 @@ -# -# Copyright 2018 PayPal Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# - -# kafka -gimel.logger.system.topic=gimel_logging_kafka_topic -gimel.logger.appMetrics.topic=gimel_app_metrics_kafka_topic - -# Kafka connection properties. -gimel.logger.kafka.bootstrap.servers = kafka_broker_1:9092,kafka_broker_2:9092 -gimel.logger.kafka.key.serializer=com.paypal.shaded.org.apache.kafka.common.serialization.ByteArraySerializer -gimel.logger.kafka.value.serializer=com.paypal.shaded.org.apache.kafka.common.serialization.ByteArraySerializer -gimel.logger.kafka.acks=0 -gimel.logger.kafka.retries=3 - - - - diff --git a/gimel-logging/src/main/resources/log4j.properties b/gimel-logging/src/main/resources/log4j.properties deleted file mode 100644 index 00745f63..00000000 --- a/gimel-logging/src/main/resources/log4j.properties +++ /dev/null @@ -1,38 +0,0 @@ -# -# Copyright 2018 PayPal Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
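
Usage note (illustrative, not part of this patch): Configuration falls back to the bundled gimelLoggerConfig.properties shown above unless the gimel.logger.properties.filepath system property is set, in which case the value is resolved through SparkFiles. A sketch of overriding it, with a placeholder file name:

object LoggerConfigOverride {
  def main(args: Array[String]): Unit = {
    // Ship the custom file with the job, e.g.
    //   spark-submit --files /path/to/myGimelLogger.properties ...
    // and point the logger at it before any logging call is made.
    // Configuration resolves this value via SparkFiles.get(<file name>).
    System.setProperty("gimel.logger.properties.filepath", "myGimelLogger.properties")
    // ... build the SparkSession and run the job as usual ...
  }
}
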
-# See the License for the specific language governing permissions and -# limitations under the License. -# -# - -log4j.logger.com.paypal.gimel=INFO,stdout, stderr, file - -log4j.appender.stdout=org.apache.log4j.ConsoleAppender -log4j.appender.stdout.layout=org.apache.log4j.PatternLayout -log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n - -# Redirect log messages to a log file, support file rolling. -log4j.appender.file=org.apache.log4j.RollingFileAppender -log4j.appender.file.File=logs/gimel_log.log -log4j.appender.file.MaxFileSize=5MB -log4j.appender.file.MaxBackupIndex=10 -log4j.appender.file.layout=org.apache.log4j.PatternLayout -log4j.appender.file.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n - -# Send WARN or higher to stderr -log4j.appender.stderr=org.apache.log4j.ConsoleAppender -log4j.appender.stderr.Threshold=ERROR -log4j.appender.stderr.Target =System.err -log4j.appender.stderr.layout=org.apache.log4j.PatternLayout -log4j.appender.stderr.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n - diff --git a/gimel-logging/src/main/scala/com/paypal/gimel/logging/Logging.scala b/gimel-logging/src/main/scala/com/paypal/gimel/logging/Logging.scala deleted file mode 100644 index c00fd13e..00000000 --- a/gimel-logging/src/main/scala/com/paypal/gimel/logging/Logging.scala +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - */ - -package com.paypal.gimel.logging - -import com.paypal.gimel.logging.impl.JSONSystemLogger -import org.slf4j.LoggerFactory - -trait Logging { - lazy val kafkaLogger = JSONSystemLogger.getInstance(getClass()) - - lazy val logger = LoggerFactory.getLogger(this.getClass) - - def trace(message: => Any): Unit = { - if (logger.isTraceEnabled) { - logger.trace(message.toString) - } - } - - def debug(message: => Any): Unit = { - if (logger.isDebugEnabled) { - logger.debug(message.toString) - } - } - - def info(message: => Any): Unit = { - if (logger.isInfoEnabled) { - logger.info(message.toString) - } - } - - def warn(message: => Any): Unit = { - logger.warn(message.toString) - } - - def error(message: => Any, t: Throwable): Unit = { - logger.error(message.toString, t) - } - - def error(message: => Any): Unit = { - logger.error(message.toString) - } -} diff --git a/gimel-logging/src/proto/proto-to-json.sh b/gimel-logging/src/proto/proto-to-json.sh deleted file mode 100644 index df1fdc3a..00000000 --- a/gimel-logging/src/proto/proto-to-json.sh +++ /dev/null @@ -1,18 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
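
Usage note (illustrative, not part of this patch): the Logging trait deleted above gives a class both a standard slf4j logger and the Kafka-backed kafkaLogger. A minimal sketch, with a placeholder class name and metric keys:

import scala.collection.JavaConverters._
import com.paypal.gimel.logging.Logging // trait deleted in this patch

class SampleJob extends Logging {
  def run(): Unit = {
    info("starting sample job") // plain slf4j logging
    // kafkaLogger is a JSONSystemLogger: a single Map argument is serialized
    // to JSON and published to the configured system topic.
    kafkaLogger.info(Map(
      "logType" -> "GimelMetrics",
      "jobName" -> "sample-job" // placeholder key/value
    ).asJava)
  }
}
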
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -protoc -I=. --java_out=. *.proto - diff --git a/gimel-logging/src/test/java/com/paypal/gimel/logging/message/JSONLoggerTest.java b/gimel-logging/src/test/java/com/paypal/gimel/logging/message/JSONLoggerTest.java deleted file mode 100644 index 87347e84..00000000 --- a/gimel-logging/src/test/java/com/paypal/gimel/logging/message/JSONLoggerTest.java +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - */ - -package com.paypal.gimel.logging.message; - -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; -import java.util.HashMap; -import java.util.Map; -import org.junit.Test; - - -/** - */ -public class JSONLoggerTest { - - SystemLoggerMocked logger = new SystemLoggerMocked(""); - - @Test - public void testEncodeArgsArray() { - final String arrayJson = this.logger.toJson(new int[] { 1, 2, 3 }); - assertTrue(arrayJson != null); - assertEquals(arrayJson, "{\"data\":[1,2,3]}"); - } - - @Test - public void testEncodeArgsMap() { - final Map data = new HashMap<>(); - data.put("key", "value"); - final String arrayJson = this.logger.toJson(data); - assertTrue(arrayJson != null); - assertEquals(arrayJson, "{\"key\":\"value\"}"); - } - -} diff --git a/gimel-logging/src/test/java/com/paypal/gimel/logging/message/SystemLoggerMocked.java b/gimel-logging/src/test/java/com/paypal/gimel/logging/message/SystemLoggerMocked.java deleted file mode 100644 index fff610a4..00000000 --- a/gimel-logging/src/test/java/com/paypal/gimel/logging/message/SystemLoggerMocked.java +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - */ - -package com.paypal.gimel.logging.message; - -import com.fasterxml.jackson.databind.node.ObjectNode; -import com.paypal.gimel.logging.impl.JSONSystemLogger; - - -/** - */ -public class SystemLoggerMocked extends JSONSystemLogger { - - public SystemLoggerMocked(final String className) { - super(className); - } - - public String toJson(final Object... 
args) { - final ObjectNode js = this.encodeArgs(args); - return js.toString(); - } - -} diff --git a/gimel-logging/src/test/java/com/paypal/gimel/logging/message/TestHelper.java b/gimel-logging/src/test/java/com/paypal/gimel/logging/message/TestHelper.java deleted file mode 100644 index 423f8826..00000000 --- a/gimel-logging/src/test/java/com/paypal/gimel/logging/message/TestHelper.java +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Copyright 2018 PayPal Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - */ - -package com.paypal.gimel.logging.message; - -import java.io.ByteArrayInputStream; -import java.lang.reflect.Field; -import java.lang.reflect.Method; -import java.util.Properties; -import com.google.protobuf.Message; -import com.google.protobuf.Message.Builder; -import com.googlecode.protobuf.format.JsonFormat; -import com.paypal.gimel.logging.LogProvider; - - -/** - */ -public class TestHelper { - - private static JsonFormat jsonFormat; - - static { - jsonFormat = new JsonFormat(); - } - - /** - * Currently mocking for a private member of a class is not available. This method will use reflections to set the - * mocked member (Technically hack it). - * - * @param logger - * @param logProvider - */ - public static void setLogProvider(final SystemLoggerMocked logger, final LogProvider logProvider) { - if (logger == null) { - return; - } - try { - final Field field = logger.getClass().getDeclaredField("logProvider"); - field.setAccessible(true); - field.set(logger, logProvider); - } - catch (final Exception e) { - e.printStackTrace(); - } - } - - /** - * Converts given message to Proto Buf message. 
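
For reference (illustrative): the tests above pin down BaseLogger.encodeArgs, which treats its varargs as alternating name/value pairs, passes a single Map through as-is, and files an unpaired trailing value under a "data" key. In Scala, with a placeholder caller:

import scala.collection.JavaConverters._
import com.paypal.gimel.logging.impl.JSONSystemLogger // class deleted in this patch

object EncodeArgsDemo {
  def main(args: Array[String]): Unit = {
    val logger = JSONSystemLogger.getInstance(getClass)
    logger.info("stage", "read", "records", 42)  // -> {"stage":"read","records":42}
    logger.info(Map("key" -> "value").asJava)    // -> {"key":"value"}  (single Map passed through)
    logger.info(Seq(1, 2, 3).asJava)             // -> {"data":[1,2,3]} (unpaired value goes under "data")
    logger.info("just a plain string")           // logged as plain text only, not sent to Kafka
  }
}
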
- * - * @param message - * @return protoBufMessage if given message is parsable; null otherwise - */ - public static Message getProtobufMessage(final String message, final Class protoMsg) { - - try { - if (protoMsg == null) { - return null; - } - final Method method = protoMsg.getMethod("newBuilder", null); - final Object builder = method.invoke(null); - TestHelper.jsonFormat.merge(new ByteArrayInputStream(message.getBytes()), (Builder) builder); - System.out.println(builder.getClass()); - final Method buildMethod = builder.getClass().getMethod("build", null); - final Object obj = buildMethod.invoke(builder, null); - return (Message) obj; - } - catch (final Exception e) { - e.printStackTrace(); - } - return null; - } - - public static Properties getLog4jConfig() { - final Properties props = new Properties(); - props.put("log4j.rootLogger", "INFO"); - props.put("log4j.logger.testKafka", "DEBUG, KAFKA"); - props.put("log4j.appender.KAFKA.layout", "org.apache.log4j.PatternLayout"); - props.put("log4j.appender.KAFKA.layout.ConversionPattern", "%-5p: %c - %m%n"); - props.put("log4j.appender.KAFKA.BrokerList", "localhost:9093"); - props.put("log4j.appender.KAFKA.Topic", "test-topic"); - props.put("log4j.appender.KAFKA.RequiredNumAcks", "1"); - props.put("log4j.appender.KAFKA.SyncSend", "false"); - return props; - } - -} diff --git a/gimel-parser/pom.xml b/gimel-parser/pom.xml index 87df46a1..9896b5e3 100644 --- a/gimel-parser/pom.xml +++ b/gimel-parser/pom.xml @@ -4,20 +4,20 @@ gimel com.paypal.gimel - 2.0.0-SNAPSHOT + 2.4.7-SNAPSHOT ../pom.xml 4.0.0 gimel-parser - 2.0.0-SNAPSHOT + 2.4.7-SNAPSHOT - org.scala-lang - scala-xml - 2.11.0-M4 + org.scala-lang.modules + scala-xml_${scala.binary.version} + ${scala.xml.version} ${scala.packaging.scope} diff --git a/gimel-serde/gimel-deserializers/generic-deserializers/pom.xml b/gimel-serde/gimel-deserializers/generic-deserializers/pom.xml index 96cb0fc4..9dceaa9e 100644 --- a/gimel-serde/gimel-deserializers/generic-deserializers/pom.xml +++ b/gimel-serde/gimel-deserializers/generic-deserializers/pom.xml @@ -5,20 +5,21 @@ com.paypal.gimel gimel-serde - 1.0-SNAPSHOT + 2.4.7-SNAPSHOT ../../pom.xml 4.0.0 com.paypal.gimel generic-deserializers - 1.0-SNAPSHOT + 2.4.7-SNAPSHOT com.paypal.gimel serde-common - 1.0-SNAPSHOT + 2.4.7-SNAPSHOT + ${packaging.scope} org.scala-lang @@ -43,7 +44,7 @@ com.paypal.gimel serde-common - 1.0-SNAPSHOT + 2.4.7-SNAPSHOT test-jar test @@ -57,24 +58,10 @@ org.apache.kafka kafka_${scala.binary.version} ${kafka.version} - test - - - io.confluent - kafka-schema-registry - ${confluent.version} - javax.ws.rs - javax.ws.rs-api - - - org.glassfish.jersey.containers - jersey-container-servlet - - - org.glassfish.jersey.core - jersey-server + com.fasterxml.jackson.* + * test @@ -90,13 +77,29 @@ javax.ws.rs javax.ws.rs-api - - org.glassfish.jersey.containers - jersey-container-servlet - test + + + + + + + + + + + + + + + + + + + + org.apache.curator curator-test @@ -192,7 +195,7 @@ test - test + diff --git a/gimel-serde/gimel-serializers/generic-serializers/pom.xml b/gimel-serde/gimel-serializers/generic-serializers/pom.xml index a15fbeb9..92b27242 100644 --- a/gimel-serde/gimel-serializers/generic-serializers/pom.xml +++ b/gimel-serde/gimel-serializers/generic-serializers/pom.xml @@ -5,20 +5,21 @@ gimel-serde com.paypal.gimel - 1.0-SNAPSHOT + 2.4.7-SNAPSHOT ../../pom.xml 4.0.0 com.paypal.gimel generic-serializers - 1.0-SNAPSHOT + 2.4.7-SNAPSHOT com.paypal.gimel serde-common - 1.0-SNAPSHOT + 2.4.7-SNAPSHOT + ${packaging.scope} org.scala-lang 
@@ -49,7 +50,7 @@ com.paypal.gimel serde-common - 1.0-SNAPSHOT + 2.4.7-SNAPSHOT test-jar test @@ -63,6 +64,12 @@ org.apache.kafka kafka_${scala.binary.version} ${kafka.version} + + + com.fasterxml.jackson.* + * + + test @@ -96,29 +103,37 @@ javax.ws.rs javax.ws.rs-api - - org.glassfish.jersey.containers - jersey-container-servlet - - - org.glassfish.jersey.core - jersey-server - test + + + + + + + + + + + + + + + + + + + + + + org.apache.curator curator-test 2.9.0 test - - io.confluent - kafka-avro-serializer - ${confluent.version} - test - junit junit @@ -168,7 +183,7 @@ src/main/scala - + src/test/scala net.alchim31.maven @@ -202,7 +217,7 @@ test - test + diff --git a/gimel-serde/pom.xml b/gimel-serde/pom.xml index 3ddbcfe5..41c0b3e5 100644 --- a/gimel-serde/pom.xml +++ b/gimel-serde/pom.xml @@ -5,7 +5,7 @@ gimel com.paypal.gimel - 2.0.0-SNAPSHOT + 2.4.7-SNAPSHOT ../pom.xml 4.0.0 @@ -13,11 +13,11 @@ com.paypal.gimel gimel-serde pom - 1.0-SNAPSHOT + 2.4.7-SNAPSHOT + serde-common gimel-deserializers/generic-deserializers gimel-serializers/generic-serializers - serde-common @@ -38,7 +38,7 @@ 3.9.9.Final 4.1.17.Final - 2.2.1 + 2.1.1
@@ -97,14 +97,6 @@
-
- org.apache.avro
- avro-maven-plugin
-
-
- String
-
-
 org.apache.maven.plugins
 maven-surefire-plugin
@@ -117,7 +109,7 @@
 compile
- testCompile
+
@@ -164,7 +156,7 @@
 org.apache.maven.plugins
 maven-shade-plugin
- 3.0.0
+ ${maven.shade.plugin.version}
diff --git a/gimel-serde/serde-common/pom.xml b/gimel-serde/serde-common/pom.xml
index 2bb54284..be29c4a0 100644
--- a/gimel-serde/serde-common/pom.xml
+++ b/gimel-serde/serde-common/pom.xml
@@ -5,14 +5,14 @@
 com.paypal.gimel
 gimel-serde
- 1.0-SNAPSHOT
+ 2.4.7-SNAPSHOT
 ../pom.xml
 4.0.0
 com.paypal.gimel
 serde-common
- 1.0-SNAPSHOT
+ 2.4.7-SNAPSHOT
@@ -50,12 +50,12 @@
-
- com.databricks
- spark-avro_${scala.binary.version}
- 4.0.0
- ${packaging.scope}
-
+
+ org.apache.spark
+ spark-avro_2.12
+ ${spark.version}
+ ${packaging.scope}
+
 io.spray
 spray-json_${scala.binary.version}
@@ -66,9 +66,11 @@
 com.paypal.gimel
 gimel-logger
 ${gimel.version}-SNAPSHOT
+ ${packaging.scope}
+
 org.apache.kafka
 kafka-clients
@@ -79,6 +81,12 @@
 org.apache.kafka
 kafka_${scala.binary.version}
 ${kafka.version}
+
+
+ com.fasterxml.jackson.*
+ *
+
+
 test
diff --git a/gimel-serde/serde-common/src/main/scala/com/paypal/gimel/serde/common/avro/AvroUtils.scala b/gimel-serde/serde-common/src/main/scala/com/paypal/gimel/serde/common/avro/AvroUtils.scala
index 9173ec9d..0d0bf0c1 100644
--- a/gimel-serde/serde-common/src/main/scala/com/paypal/gimel/serde/common/avro/AvroUtils.scala
+++ b/gimel-serde/serde-common/src/main/scala/com/paypal/gimel/serde/common/avro/AvroUtils.scala
@@ -25,16 +25,15 @@
 import scala.collection.JavaConverters._
 import scala.collection.immutable.Map
 import scala.collection.mutable
-import com.databricks.spark.avro.SchemaConverters._
 import org.apache.avro.Schema
 import org.apache.avro.generic.{GenericData, GenericRecord}
 import org.apache.avro.io.DecoderFactory
 import org.apache.avro.io.EncoderFactory
 import org.apache.avro.specific.SpecificDatumWriter
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.{DataFrame, Row, SparkSession}
-import org.apache.spark.sql.catalyst.encoders.RowEncoder
-import org.apache.spark.sql.types.StructType
+import org.apache.spark.sql.DataFrame
+import org.apache.spark.sql.avro._
+import org.apache.spark.sql.functions._
 import spray.json._
 import spray.json.DefaultJsonProtocol._
 import spray.json.JsValue
@@ -109,6 +108,26 @@ object AvroUtils extends Serializable {
     newGenericRec
   }
 
+  /**
+    * Takes An Avro Schema String and Returns the list of field names in the "fields" list
+    * @param schemaString
+    * @return List(fieldNames)
+    */
+  def getTopLevelFieldNamesFromAvro(schemaString: String): Seq[String] = {
+    // Parse as JsValue
+    val schemaAsJsVal = schemaString.parseJson
+    // Convert to JsObject
+    val schemaAsJsObject = schemaAsJsVal.asJsObject
+    // Get the Map of each element & Value
+    val schemaElementsMap: Map[String, JsValue] = schemaAsJsObject.fields
+    // These fields will be added with "to-add" fields
+    val schemaFields: Seq[JsValue] = schemaAsJsObject.getFields("fields").head.convertTo[Seq[JsValue]]
+    schemaFields.map{ x =>
+      x.asJsObject.fields.head._2.toString().replace(""""""", "")
+    }
+
+  }
+
   /**
     * Adds additional fields to the Avro Schema
     *
@@ -189,39 +208,24 @@
     */
   def getDeserializedDataFrame(dataframe: DataFrame, columnToDeserialize: String, avroSchemaString: String): DataFrame = {
     val originalFields: Array[String] = dataframe.columns.filter(field => field != columnToDeserialize)
-    val newAvroSchemaString = addAdditionalFieldsToSchema(originalFields.toList, avroSchemaString)
-    try {
-      dataframe.map { eachRow =>
-        val recordToDeserialize: Array[Byte] = eachRow.getAs(columnToDeserialize).asInstanceOf[Array[Byte]]
-        val originalColumnsMap = originalFields.map {
-          field => {
-            val index = eachRow.fieldIndex(field)
-            if (eachRow.isNullAt(index)) {
-              (field -> "null")
-            } else {
-              (field -> eachRow.getAs(field).toString)
-            }
-          }
-        }
-        val deserializedGenericRecord: GenericRecord = bytesToGenericRecordWithSchemaRecon(recordToDeserialize, avroSchemaString, avroSchemaString)
-        val newDeserializedGenericRecord: GenericRecord = copyToGenericRecord(deserializedGenericRecord, avroSchemaString, newAvroSchemaString)
-        originalColumnsMap.foreach { kv => newDeserializedGenericRecord.put(kv._1, kv._2) }
-        val avroSchemaObj: Schema = (new Schema.Parser).parse(newAvroSchemaString)
-        val converter = AvroToSQLSchemaConverter.createConverterToSQL(avroSchemaObj)
-        converter(newDeserializedGenericRecord).asInstanceOf[Row]
-      } {
-        val avroSchema: Schema = (new Schema.Parser).parse(newAvroSchemaString)
-        val schemaType: SchemaType = toSqlType(avroSchema)
-        val encoder = RowEncoder(schemaType.dataType.asInstanceOf[StructType])
-        encoder
-      }.toDF
-    } catch {
-      case ex: Throwable => {
-        ex.printStackTrace()
-        throw ex
-      }
-    }
+    logger.debug(s"Original Fields \n${originalFields}")
+    logger.debug(s"schema \n${avroSchemaString}")
+    val fieldsInAvro = getTopLevelFieldNamesFromAvro(avroSchemaString )
+    logger.debug(s"Avro Fields \n${fieldsInAvro}")
+    logger.debug(s"**************** schema before deserialize ************************")
+    dataframe.printSchema()
+    val op = dataframe.withColumn("avro", from_avro(col(columnToDeserialize), avroSchemaString) )
+    logger.debug(s"**************** schema after deserialize ************************")
+    op.printSchema()
+    op.show(2)
+    logger.debug(s"**************** Fields in avro that will be projected in dataFrame ************************")
+    logger.debug(fieldsInAvro.mkString(","))
+    val colsToSelect: Seq[String] = fieldsInAvro.map{ x => s"avro.${x}"}
+    logger.debug(colsToSelect.mkString(","))
+    val k = op.select(colsToSelect.head, colsToSelect.tail: _*)
+    k
+
   }
 
   /**
diff --git a/pom.xml b/pom.xml
index 327fe26d..1d5c6342 100644
--- a/pom.xml
+++ b/pom.xml
@@ -28,9 +28,8 @@ under the License.
 Gimel Data API
 2017
- 2.0.0-SNAPSHOT
+ 2.4.7-SNAPSHOT
- gimel-logging
 gimel-dataapi
 gimel-parser
 gimel-serde
@@ -62,10 +61,10 @@ under the License.
- general
+ dataproc_1.5x
- general
+ dataproc_1.5x
 true
@@ -76,110 +75,26 @@ under the License.
provided provided 3.4.13 - 2.11 + 2.12 3.0.1 4.1.0 - 2.0.0 - 2.11.8 - 2.3 - 2.3.0 - 2.7.3 - 1.2.1 - 1.1.2 + 2.4.7 + 2.12.10 + 1.3.0 + 2.4 + 2.4.7 + 2.10.0 + 2.3.7 + 1.5.0 1.8 - 2.0.0 + 0.17.1 2.8.0 + 2.11 + 2.11 + 3.2.4 - - hwx-2.6.3.11-1 - - - hortonworks-2.6.3.11-1_spark-2.3.0 - - false - - - compile - provided - compile - provided - provided - 2.0.0 - 2.11 - 2.11.8 - 3.0.1 - 2.3 - 2.3.0.2.6.3.11-1 - 2.7.3.2.6.3.11-1 - 1.2.1000.2.6.3.11-1 - 1.1.2.2.6.3.11-1 - 3.4.6.2.6.3.12-2 - 2.8.0 - - - - - hwx-2.6.3.5-4 - - - hortonworks-2.6.3.5-4_spark-2.3.0 - - false - - - compile - provided - compile - provided - provided - 2.0.0 - 2.11 - 2.11.8 - 3.0.1 - 3.0.1 - 4.1.0 - 2.3 - 2.3.0.2.6.3.5-4 - 2.7.3.2.6.3.5-4 - 1.2.1000.2.6.3.5-4 - 1.1.2.2.6.3.5-4 - 3.4.6.2.6.3.5-4 - 2.8.0 - - - - - hwx-2.6.5.0-292 - - - hortonworks-2.6.5.0-292_spark-2.3.0 - - - - compile - provided - compile - provided - provided - 2.11.8 - 2.11 - 4.1.0 - 2.3 - 2.3 - 2.3.0.2.6.5.0-292 - 2.7.3.2.6.5.0-292 - 1.2.1000.2.6.5.0-292 - 1.1.2.2.6.5.0-292 - 3.4.13.2.6.5.0-292 - hwx-2.6.5.0-292_s-${spark.binary.version} - 2.0.0 - 3.0.1 - 2.8.0 - - - standalone @@ -279,18 +194,6 @@ under the License. [ci skip] v-@{project.version} - - - org.apache.maven.scm - maven-scm-api - 1.8.1 - - - org.apache.maven.scm - maven-scm-provider-gitexe - 1.8.1 - -
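
Note on the Avro-related changes above: with the parent pom now on Spark 2.4.7 / Scala 2.12 and org.apache.spark:spark-avro_2.12 replacing the retired com.databricks:spark-avro artifact, AvroUtils.getDeserializedDataFrame decodes the payload column with the built-in from_avro function and then projects the record's top-level fields. The snippet below is a self-contained sketch of that decode-then-project pattern, assuming only Spark 2.4.x with the spark-avro module on the classpath; the FlightAvroRoundTrip object, the sample schema, and the column names are invented for illustration and are not part of Gimel.

import java.io.ByteArrayOutputStream

import org.apache.avro.Schema
import org.apache.avro.generic.{GenericData, GenericDatumWriter, GenericRecord}
import org.apache.avro.io.EncoderFactory
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.avro._            // from_avro lives in this package object in Spark 2.4.x
import org.apache.spark.sql.functions.col

object FlightAvroRoundTrip {

  // Hypothetical schema for the serialized payload; not a Gimel schema.
  val schemaJson: String =
    """{"type":"record","name":"Flight","fields":[
      |  {"name":"origin","type":"string"},
      |  {"name":"destination","type":"string"}
      |]}""".stripMargin

  // Avro-encode one record to bytes, mimicking a payload column read from Kafka.
  def encode(origin: String, destination: String): Array[Byte] = {
    val schema = new Schema.Parser().parse(schemaJson)
    val record: GenericRecord = new GenericData.Record(schema)
    record.put("origin", origin)
    record.put("destination", destination)
    val out = new ByteArrayOutputStream()
    val encoder = EncoderFactory.get().binaryEncoder(out, null)
    new GenericDatumWriter[GenericRecord](schema).write(record, encoder)
    encoder.flush()
    out.toByteArray
  }

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("from-avro-sketch").master("local[*]").getOrCreate()
    import spark.implicits._

    // A DataFrame with a single binary column holding Avro-encoded records.
    val serialized = Seq(Tuple1(encode("LAX", "JFK")), Tuple1(encode("SFO", "SEA"))).toDF("payload")

    // Decode into a struct column, then project its top-level fields --
    // the same withColumn + "avro.<field>" select pattern used in getDeserializedDataFrame.
    val deserialized = serialized
      .withColumn("avro", from_avro(col("payload"), schemaJson))
      .select("avro.origin", "avro.destination")

    deserialized.show(false)
    spark.stop()
  }
}
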
diff --git a/quickstart/set-env b/quickstart/set-env
index 35b77311..cb0af616 100644
--- a/quickstart/set-env
+++ b/quickstart/set-env
@@ -20,6 +20,6 @@ if [ -z $GIMEL_HOME ]; then
 fi
 export gimel_repo_home=${GIMEL_HOME}
 export standalone_dir=${gimel_repo_home}/gimel-dataapi/gimel-standalone
-export gimel_jar_name=gimel-sql-2.0.0-SNAPSHOT-uber.jar
+export gimel_jar_name=gimel-sql-2.4.7-SNAPSHOT-uber.jar
 export final_jar=${standalone_dir}/lib/$gimel_jar_name
 export FLIGHTS_DATA_PATH=$GIMEL_HOME/gimel-dataapi/gimel-quickstart/flights
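
For reference, the new AvroUtils.getTopLevelFieldNamesFromAvro helper added above simply reads the top-level "fields" array of an Avro schema with spray-json, so the decoded struct can be projected field by field. Below is a minimal standalone approximation, not the Gimel implementation itself (it looks the field name up by key rather than by position, and the sample schema is invented for illustration; only spray-json is required).

import spray.json._
import spray.json.DefaultJsonProtocol._

object TopLevelAvroFields {

  // Return the "name" of every entry in the schema's top-level "fields" array.
  def topLevelFieldNames(schemaString: String): Seq[String] = {
    val schemaAsJsObject = schemaString.parseJson.asJsObject
    val schemaFields: Seq[JsValue] = schemaAsJsObject.fields("fields").convertTo[Seq[JsValue]]
    schemaFields.map(field => field.asJsObject.fields("name").convertTo[String])
  }

  def main(args: Array[String]): Unit = {
    val sampleSchema =
      """{"type":"record","name":"Flight","fields":[
        |  {"name":"origin","type":"string"},
        |  {"name":"destination","type":"string"},
        |  {"name":"delay_minutes","type":["null","int"],"default":null}
        |]}""".stripMargin

    // Prints: List(origin, destination, delay_minutes)
    println(topLevelFieldNames(sampleSchema))
  }
}
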