diff --git a/gimel-dataapi/gimel-connectors/gimel-druid/pom.xml b/gimel-dataapi/gimel-connectors/gimel-druid/pom.xml
deleted file mode 100644
index 939557b8..00000000
--- a/gimel-dataapi/gimel-connectors/gimel-druid/pom.xml
+++ /dev/null
@@ -1,175 +0,0 @@
-
-
-
-
-
-
- gimel-dataapi
- com.paypal.gimel
- 2.4.7-SNAPSHOT
- ../../pom.xml
-
-
- 4.0.0
- gimel-druid-0.82
- 2.4.7-SNAPSHOT
-
-
-
- com.paypal.gimel
- gimel-common
- ${gimel.version}-SNAPSHOT
-
-
-
- io.druid
- tranquility-core_2.11
- ${tranquility.version}
-
-
- com.fasterxml.jackson.core
- jackson-databind
-
-
- org.scala-lang
- *
-
-
- com.fasterxml.jackson.core
- jackson-core
-
-
- org.apache.derby
- derbyclient
-
-
-
-
-
-
-
- com.fasterxml.jackson.core
- jackson-annotations
- ${fasterxml.jackson.core.version}
-
-
- io.druid
- tranquility-spark_${scala.binary.version}
- ${tranquility.version}
-
-
- org.scalatest
- scalatest_${scala.binary.version}
- ${scalatest.version}
- test
-
-
-
-
- src/main/scala
-
-
-
- net.alchim31.maven
- scala-maven-plugin
- 3.2.1
-
-
-
- compile
- testCompile
-
-
-
-
-
- -Xms64m
- -Xmx1024m
-
-
-
-
- org.scalatest
- scalatest-maven-plugin
- 1.0
-
- ${project.build.directory}/surefire-reports
- .
- WDF TestSuite.txt
-
-
-
- test
-
- test
-
-
-
-
-
-
- org.apache.maven.plugins
- maven-shade-plugin
- 3.0.0
-
-
-
- scala.tools
- gimel-shaded.scala.tools
-
-
- com.google.common
- gimel-shaded.com.google.common
-
-
- com.sun.jersey
- gimel-shaded.com.sun.jersey
-
-
-
- org.apache.hadoop
- gimel-shaded.org.apache.hadoop
-
-
-
- :
- *:*
-
- META-INF/*.SF
- META-INF/*.DSA
- META-INF/*.RSA
-
-
-
-
-
-
- gimel-shading
- package
-
- shade
-
-
-
-
-
-
-
-
diff --git a/gimel-dataapi/gimel-connectors/gimel-druid/src/main/scala/com/paypal/gimel/druid/DataSet.scala b/gimel-dataapi/gimel-connectors/gimel-druid/src/main/scala/com/paypal/gimel/druid/DataSet.scala
deleted file mode 100644
index aeee42e2..00000000
--- a/gimel-dataapi/gimel-connectors/gimel-druid/src/main/scala/com/paypal/gimel/druid/DataSet.scala
+++ /dev/null
@@ -1,242 +0,0 @@
-/*
- * Copyright 2018 PayPal Inc.
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.paypal.gimel.druid
-
-import scala.language.implicitConversions
-import scala.reflect.runtime.universe._
-
-import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.{DataFrame, SparkSession}
-
-import com.paypal.gimel.datasetfactory.GimelDataSet
-import com.paypal.gimel.druid.conf.{DruidClientConfiguration, DruidConfigs, DruidConstants}
-import com.paypal.gimel.druid.util.DruidUtility
-import com.paypal.gimel.druid.writer.DruidRealtimeWriter
-import com.paypal.gimel.logger.Logger
-
-/**
- * Concrete Implementation for Druid DataSet.
- *
- * @param sparkSession : SparkSession
- */
-
-class DataSet(sparkSession: SparkSession) extends GimelDataSet(sparkSession: SparkSession) {
-
- // GET LOGGER
- val logger = Logger()
- logger.info(s"Initiated --> ${this.getClass.getName}")
-
- /**
- * Read Implementation for Druid DataSet.
- *
- * @param dataset Name of the UDC Data Set.
- * @param datasetProps Additional parameters for read and write operations in DataSet class.
- * @return DataFrame
- */
- override def read(dataset: String, datasetProps: Map[String, Any]): DataFrame = {
- throw new Exception("Read for Druid Dataset is not enabled.")
- }
-
- /** Write Implementation for Druid DataSet.
- *
- * @param dataset Name of the UDC Data Set.
- * @param dataFrame The DataFrame to write to target.
- * @param datasetProps Additional parameters for read and write operations in DataSet class.
- * @return DataFrame
- */
-
- override def write(dataset: String, dataFrame: DataFrame,
- datasetProps: Map[String, Any]): DataFrame = {
- logger.info(s"Druid Dataset Write Initialized for ---> $dataset.")
- logger.info(s"Scala Version Used ---> ${scala.util.Properties.versionString}")
-
- if (datasetProps.isEmpty) {
- throw new DataSetException("Props Map Cannot be empty for DruidDataSet Write.")
- }
-
- val allProps = datasetProps ++
- Map(DruidConfigs.FIELDS -> DruidUtility.getFieldNames(dataFrame))
-
- logger.info(s"Begin Building DruidClientConfiguration")
- logger.debug(s"Incoming Properties --> ${
- allProps.map(x => s"${x._1} -> ${x._2}")
- .mkString("\n")
- }")
-
- val conf = new DruidClientConfiguration(allProps)
-
- logger.debug(s"DruidClientConfiguration --> $conf")
- logger.info(s"DruidClientConfiguration Building done --> " +
- s"${conf.getClass.getName}")
-
- // Get the load type from DruidClientConfiguration,
- // i.e. real-time or batch, and run the appropriate driver.
- // Defaults to the real-time driver.
- conf.druidLoadType match {
- case DruidConstants.REALTIME_LOAD =>
- DruidRealtimeWriter.writeToTable(sparkSession, conf, dataFrame)
-
- case DruidConstants.BATCH_LOAD =>
- val errorMsg = "Batch Load type for druid-connector has not been implemented."
- throw new IllegalArgumentException(errorMsg)
-
- case _ =>
- DruidRealtimeWriter.writeToTable(sparkSession, conf, dataFrame)
- }
-
- dataFrame
- }
-
- // Add Additional Supported types to this list as and when we support other Types of RDD
- // Example to support RDD[String], add to List
- override val supportedTypesOfRDD: List[String] = List(typeOf[Map[String, Any]].toString)
-
- /**
- * Writes a given RDD to the actual target system.
- * (Example Hive : DB.Table | HBASE namespace.Table)
- *
- * The inheriting DataSet Operators must typeCast the RDD to supported types.
- *
- *
- * instance#1:
- * ElasticSearchDataSet may support just RDD[Seq(Map[String, String])],
- * so Elastic Search must implement supported Type checking
- *
- * instance#2: Kafka, HDFS and HBase throw an UnsupportedOperationException
- * until they support an RDD operation for any type T; the exception should clearly educate users.
- *
- * Additional parameters for read and write operations in DataSet class
- * Example: to write kafka with a specific parallelism:
- * {{{
- * val props = Map("parallelsPerPartition" -> 10)
- * Dataset(sc).write(clientDataFrame, props)
- * }}}
- *
- * @param dataset Name of the UDC Data Set.
- * @param rdd The RDD[T] to write into Target.
- * @param datasetProps Map containing dataset props
- * @return RDD[T]
- */
- def write[T: TypeTag](dataset: String, rdd: RDD[T], datasetProps: Map[String, Any]): RDD[T] = {
- logger.info(s"Druid Dataset Write Initialized for ---> $dataset.")
- logger.info(s"Scala Version Used ---> ${scala.util.Properties.versionString}")
-
- if (!supportedTypesOfRDD.contains(typeOf[T].toString)) {
- throw new UnsupportedOperationException(
- s"""Invalid RDD Type. Supported Types :
- |${supportedTypesOfRDD.mkString(" | ")}""".stripMargin)
- }
-
- if (datasetProps.isEmpty) {
- throw new DataSetException("Props Map Cannot be empty for DruidDataSet Write.")
- }
-
- val allProps = datasetProps ++
- Map(DruidConfigs.FIELDS -> DruidUtility.getFieldNames(dataset, sparkSession))
-
- logger.info(s"Begin Building DruidClientConfiguration")
- logger.debug(s"Incoming Properties --> ${
- allProps.map(x => s"${x._1} -> ${x._2}")
- .mkString("\n")
- }")
-
- val conf = new DruidClientConfiguration(allProps)
-
- logger.debug(s"DruidClientConfiguration --> $conf")
- logger.info(s"DruidClientConfiguration Building done --> " +
- s"${conf.getClass.getName}")
-
- // Get the load type from DruidClientConfiguration,
- // i.e. real-time or batch, and run the appropriate driver.
- // Defaults to the real-time driver.
- conf.druidLoadType match {
- case DruidConstants.REALTIME_LOAD =>
- DruidRealtimeWriter.writeToTable(sparkSession, conf, rdd.asInstanceOf[RDD[Map[String, Any]]])
-
- case DruidConstants.BATCH_LOAD =>
- val errorMsg = "Batch Load type for druid-connector has not been implemented."
- throw new IllegalArgumentException(errorMsg)
-
- case _ =>
- DruidRealtimeWriter.writeToTable(sparkSession, conf, rdd.asInstanceOf[RDD[Map[String, Any]]])
- }
-
- rdd
- }
-
- /**
- *
- * @param dataset Name of the UDC Data Set
- * @param dataSetProps Additional parameters for the create operation
- */
- override def create(dataset: String, dataSetProps: Map[String, Any]): Unit = {
- throw new Exception(s"DataSet create for druid currently not Supported")
- }
-
- /**
- *
- * @param dataset Name of the UDC Data Set
- * @param dataSetProps Additional parameters for the drop operation
- */
- override def drop(dataset: String, dataSetProps: Map[String, Any]): Unit = {
- throw new Exception(s"DataSet drop for druid currently not Supported")
- }
-
- /**
- *
- * @param dataset Name of the UDC Data Set
- * @param dataSetProps Additional parameters for the truncate operation
- */
- override def truncate(dataset: String, dataSetProps: Map[String, Any]): Unit = {
- throw new Exception(s"DataSet truncate for druid currently not Supported")
- }
-
- /**
- * Clear Checkpoint
- */
- override def clearCheckPoint(): Unit = {
- logger.info(s"Clear check Point functionality is not available for Druid Dataset")
- }
-
- /**
- * Save Checkpoint
- */
- override def saveCheckPoint(): Unit = {
- logger.info(s"Save check Point functionality is not available for Druid Dataset")
- }
-}
-
-/**
- * Custom Exception for DruidDataset initiation errors
- *
- * @param message Message to Throw
- * @param cause A Throwable Cause
- */
-private class DataSetException(message: String, cause: Throwable)
- extends RuntimeException(message) {
- if (cause != null) {
- initCause(cause)
- }
-
- def this(message: String) = this(message, null)
-}
diff --git a/gimel-dataapi/gimel-connectors/gimel-druid/src/main/scala/com/paypal/gimel/druid/conf/DruidClientConfiguration.scala b/gimel-dataapi/gimel-connectors/gimel-druid/src/main/scala/com/paypal/gimel/druid/conf/DruidClientConfiguration.scala
deleted file mode 100644
index 130be1e6..00000000
--- a/gimel-dataapi/gimel-connectors/gimel-druid/src/main/scala/com/paypal/gimel/druid/conf/DruidClientConfiguration.scala
+++ /dev/null
@@ -1,200 +0,0 @@
-/*
- * Copyright 2018 PayPal Inc.
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.paypal.gimel.druid.conf
-
-import scala.collection.immutable.Map
-import scala.reflect.ClassTag
-
-import com.metamx.common.Granularity
-
-import com.paypal.gimel.common.catalog.DataSetProperties
-import com.paypal.gimel.common.conf.GimelConstants
-import com.paypal.gimel.druid.model.{DruidDimension, DruidMetric}
-import com.paypal.gimel.druid.util.DruidUtility
-
-/**
- * DruidClientConfiguration Class. Takes a map of properties and build its own properties.
- * This Class extends Serializable as it is needed to be passed to the executors.
- *
- * @param props Map[String, Any] of the properties specified by the user.
- */
-@SerialVersionUID(100L)
-class DruidClientConfiguration(props: Map[String, Any]) extends Serializable {
-
- // This is the DataSet Properties
- val datasetProps: DataSetProperties = props(GimelConstants.DATASET_PROPS).asInstanceOf[DataSetProperties]
- val tableProps: Map[String, String] = datasetProps.props
-
- val druidLoadType: String = fetchProperty[String](DruidConfigs.LOAD_TYPE)
- .getOrElse(DruidConstants.REALTIME_LOAD)
-
- // Zookeeper services running. Example: localhost:2121. Required Configuration
- val zookeeper: String = fetchProperty[String](DruidConfigs.ZOOKEEPER, isRequired = true).get
-
- // Index Service as specified for druid cluster. Required Configuration
- val indexService: String = fetchProperty[String](DruidConfigs.INDEX_SERVICE, isRequired = true).get
-
- // Discovery Path as specified for druid cluster. Required Configuration
- val discoveryPath: String = fetchProperty[String](DruidConfigs.DISCOVERY_PATH, isRequired = true).get
-
- // Datasource in Druid to index for. Required Configuration
- val datasource: String = fetchProperty[String](DruidConfigs.DATASOURCE, isRequired = true).get
-
- val fieldNames: List[String] =
- fetchProperty[List[String]](DruidConfigs.FIELDS, isRequired = true).get
-
- val timestamp_field: String =
- fetchProperty[String](DruidConfigs.TIMESTAMP)
- .getOrElse(DruidConstants.TIMESTAMP_FIELD_NAME)
-
- val timestamp_format: String = fetchProperty[String](DruidConfigs.TIMESTAMP_FORMAT)
- .getOrElse(DruidConstants.TIMESTAMP_FORMAT)
-
- // Get Segment Granularity String from the props and convert it into com.metamx.common.Granularity
- val segmentGranularity: Granularity = {
- val granularityString = fetchProperty[String](DruidConfigs.SEGMENT_GRANULARITY)
- .getOrElse(DruidConstants.SEGMENT_GRANULARITY_FIFTEEN_MINUTE)
-
- val granularity = Granularity.values.find(g => granularityString.equalsIgnoreCase(g.toString))
-
- // If given Granularity is not found then throw an Error
- if (granularity.isEmpty) {
- val errorMsg = s"Specified Segment Granularity $granularityString is not a valid Granularity"
- throw new IllegalArgumentException(errorMsg)
- }
-
- granularity.get
- }
-
- // Get Query Granularity string from the props. Kept as a String and resolved later by DruidUtility.fetchQueryGranularity
- val queryGranularity: String = {
- fetchProperty[String](DruidConfigs.QUERY_GRANULARITY)
- .getOrElse(DruidConstants.QUERY_GRANULARITY_ONE_MINUTE)
- }
-
- // Window Period for which druid will accept the incoming data. Defaults to PT10M
- val windowPeriod: String = fetchProperty[String](DruidConfigs.WINDOW_PERIOD).getOrElse(DruidConstants.WINDOW_PERIOD)
-
- // Number of Partitions Defined
- val numPartitions: Int = fetchProperty[Int](DruidConfigs.PARTITIONS).getOrElse(DruidConstants.PARTITIONS)
-
- // Number of Replicants Specified
- val numReplicants: Int = fetchProperty[Int](DruidConfigs.REPLICANTS).getOrElse(DruidConstants.REPLICANTS)
-
- val ARROW = DruidConstants.ARROW
- val NEW_LINE = DruidConstants.NEW_LINE
-
- // Get List of Druid Field names from props that is a string value for the list.
- lazy val fields: List[DruidDimension] = fieldNames.map(DruidDimension(_))
-
- // Get List of Druid Dimensions from props that is a string value for the list.
- lazy val dimensions: List[DruidDimension] = {
- errorIfMissing(DruidConfigs.DIMENSIONS)
-
- DruidUtility.parseString[List[String]](
- fetchProperty[String](DruidConfigs.DIMENSIONS, isRequired = true).get
- ).map(DruidDimension(_))
- }
-
- // Get List of Druid Metric from props that is a string value for the list.
- lazy val metrics: List[DruidMetric] = {
- val metricString = fetchProperty[String](DruidConfigs.METRICS)
-
- // Check if metricString is not null or else return a Default count Metric
- if (metricString.isDefined) {
- DruidUtility.parseString[List[DruidMetric]](metricString.get)
- } else {
- List(DruidMetric.getDefaultMetric)
- }
- }
-
- /**
- * Private Method to check if the key exists in the props.
- * If the key does not exist, throw an error.
- *
- * @param key String value for the key
- */
- private def errorIfMissing(key: String): Unit = {
- if (tableProps.get(key).isEmpty && props.get(key).isEmpty) {
- val errorMsg = s"Missing Property: $key for the Druid Client Configuration!"
- throw new IllegalArgumentException(errorMsg)
- }
- }
-
- /**
- * Method to fetch property value from props and tableProps.
- * This method first looks for the key in props and, if not found, then looks in tableProps.
- *
- * @param key Key of the property to be fetched
- * @param isRequired If the key is required or not.
- * If it is required, then it throws an error if the key
- * does not exist in either of props or tableProps
- * @tparam T Type of the value to return for a given property
- * @return An Option of the value or None if the property key does not exist.
- */
- def fetchProperty[T](key: String, isRequired: Boolean = false)
- (implicit tag: ClassTag[T]): Option[T] = {
- // If isRequired is true, then throw an error if the key is missing
- if (isRequired) errorIfMissing(key)
-
- val propValue = props.get(key).orElse(tableProps.get(key))
-
- if (propValue.isDefined) {
- propValue.get match {
- case _: T =>
- Option(propValue.get.asInstanceOf[T])
- case _ =>
- val errorMsg = s"Value for Property Key: $key cannot be cast."
- throw new IllegalArgumentException(errorMsg)
- }
- } else None
- }
-
- /**
- * Overridden method to print the configuration variables for this config.
- *
- * @return Print message for this Configuration.
- */
- override def toString: String = {
- var message = "Druid Client Configuration Parameters --->" + DruidConstants.NEW_LINE
-
- message += DruidConfigs.ZOOKEEPER + ARROW + this.zookeeper + NEW_LINE
- message += DruidConfigs.INDEX_SERVICE + ARROW + this.indexService + NEW_LINE
- message += DruidConfigs.DISCOVERY_PATH + ARROW + this.discoveryPath + NEW_LINE
- message += DruidConfigs.DATASOURCE + ARROW + this.datasource + NEW_LINE
- message += DruidConfigs.FIELDS + ARROW + this.fieldNames + NEW_LINE
- message += DruidConfigs.DIMENSIONS + ARROW +
- this.dimensions.map(_.name).mkString(",") + NEW_LINE
- message += DruidConfigs.METRICS + ARROW + this.metrics.mkString(",") + NEW_LINE
- message += DruidConfigs.TIMESTAMP + ARROW + this.timestamp_field + NEW_LINE
- message += DruidConfigs.TIMESTAMP_FORMAT + ARROW +
- this.timestamp_format + NEW_LINE
- message += DruidConfigs.QUERY_GRANULARITY + ARROW +
- this.queryGranularity + NEW_LINE
- message += DruidConfigs.SEGMENT_GRANULARITY + ARROW +
- this.segmentGranularity + NEW_LINE
- message += DruidConfigs.WINDOW_PERIOD + ARROW + this.windowPeriod + NEW_LINE
- message += DruidConfigs.PARTITIONS + ARROW + this.numPartitions + NEW_LINE
- message += DruidConfigs.REPLICANTS + ARROW + this.numReplicants + NEW_LINE
- message += DruidConfigs.LOAD_TYPE + ARROW + this.druidLoadType + NEW_LINE
-
- message
- }
-}
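The precedence that fetchProperty and errorIfMissing implement boils down to the following standalone mirror (a sketch, not the class itself; the ClassTag-based cast check is omitted): caller-supplied props win, catalog table props are the fallback, and required keys fail fast.

    // Standalone mirror of the lookup order inside DruidClientConfiguration.fetchProperty.
    def lookup(key: String,
               props: Map[String, Any],
               tableProps: Map[String, String],
               isRequired: Boolean = false): Option[Any] = {
      val value = props.get(key).orElse(tableProps.get(key))
      if (isRequired && value.isEmpty) {
        throw new IllegalArgumentException(
          s"Missing Property: $key for the Druid Client Configuration!")
      }
      value
    }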
diff --git a/gimel-dataapi/gimel-connectors/gimel-druid/src/main/scala/com/paypal/gimel/druid/conf/DruidConfigs.scala b/gimel-dataapi/gimel-connectors/gimel-druid/src/main/scala/com/paypal/gimel/druid/conf/DruidConfigs.scala
deleted file mode 100644
index f934a40d..00000000
--- a/gimel-dataapi/gimel-connectors/gimel-druid/src/main/scala/com/paypal/gimel/druid/conf/DruidConfigs.scala
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Copyright 2018 PayPal Inc.
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.paypal.gimel.druid.conf
-
-/**
- * Object Defining List of available Configuration Keys
- */
-object DruidConfigs {
- val ZOOKEEPER = "gimel.druid.zookeeper.hosts"
- val INDEX_SERVICE = "gimel.druid.cluster.index.service"
- val DISCOVERY_PATH = "gimel.druid.cluster.discovery.path"
- val DATASOURCE = "gimel.druid.datasource.name"
- val FIELDS = "gimel.druid.datasource.fields"
- val DIMENSIONS = "gimel.druid.datasource.dimensions"
- val METRICS = "gimel.druid.datasource.metrics"
- val TIMESTAMP = "gimel.druid.timestamp.fieldname"
- val TIMESTAMP_FORMAT = "gimel.druid.timestamp.format"
- val QUERY_GRANULARITY = "gimel.druid.query.granularity"
- val SEGMENT_GRANULARITY = "gimel.druid.segment.granularity"
- val WINDOW_PERIOD = "gimel.druid.stream.window.period"
- val PARTITIONS = "gimel.druid.datasource.partitions"
- val REPLICANTS = "gimel.druid.datasource.replicas"
- val LOAD_TYPE = "gimel.druid.ingestion.type"
-}
-
diff --git a/gimel-dataapi/gimel-connectors/gimel-druid/src/main/scala/com/paypal/gimel/druid/conf/DruidConstants.scala b/gimel-dataapi/gimel-connectors/gimel-druid/src/main/scala/com/paypal/gimel/druid/conf/DruidConstants.scala
deleted file mode 100644
index 35a184e8..00000000
--- a/gimel-dataapi/gimel-connectors/gimel-druid/src/main/scala/com/paypal/gimel/druid/conf/DruidConstants.scala
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright 2018 PayPal Inc.
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.paypal.gimel.druid.conf
-
-/**
- * Object Defining Default Values for Configuration
- */
-object DruidConstants {
- val TIMESTAMP_FIELD_NAME = "timestamp"
- val TIMESTAMP_FORMAT = "millis"
- val QUERY_GRANULARITY_ONE_MINUTE = "MINUTE"
- val SEGMENT_GRANULARITY_FIFTEEN_MINUTE = "FIFTEEN_MINUTE"
- val WINDOW_PERIOD = "PT10M"
- val PARTITIONS = 1
- val REPLICANTS = 1
- val REALTIME_LOAD = "realtime"
- val BATCH_LOAD = "batch"
- val ARROW = "->"
- val NEW_LINE = "\n"
- val MILLISECONDS = "millis"
- val SECONDS = "seconds"
- val ISO = "iso"
-}
-
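Taken together, the two objects above define the knobs a realtime Druid write expected. A minimal sketch of a property map wiring them up, assuming a Tranquility-backed Druid cluster; every host, path, field and metric below is illustrative:

    import com.paypal.gimel.druid.conf.{DruidConfigs, DruidConstants}

    val druidProps: Map[String, Any] = Map(
      DruidConfigs.ZOOKEEPER -> "localhost:2181",              // illustrative
      DruidConfigs.INDEX_SERVICE -> "druid/overlord",          // illustrative
      DruidConfigs.DISCOVERY_PATH -> "/druid/discovery",       // illustrative
      DruidConfigs.DATASOURCE -> "pageviews",                  // illustrative
      DruidConfigs.DIMENSIONS -> """["country", "device"]""",  // parsed into List[DruidDimension]
      DruidConfigs.METRICS ->
        """[{"type": "longSum", "field_name": "clicks", "name": "total_clicks"}]""",
      DruidConfigs.TIMESTAMP -> DruidConstants.TIMESTAMP_FIELD_NAME,    // "timestamp"
      DruidConfigs.TIMESTAMP_FORMAT -> DruidConstants.TIMESTAMP_FORMAT, // "millis"
      DruidConfigs.QUERY_GRANULARITY -> DruidConstants.QUERY_GRANULARITY_ONE_MINUTE,
      DruidConfigs.SEGMENT_GRANULARITY -> DruidConstants.SEGMENT_GRANULARITY_FIFTEEN_MINUTE,
      DruidConfigs.WINDOW_PERIOD -> DruidConstants.WINDOW_PERIOD,       // "PT10M"
      DruidConfigs.LOAD_TYPE -> DruidConstants.REALTIME_LOAD
    )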
diff --git a/gimel-dataapi/gimel-connectors/gimel-druid/src/main/scala/com/paypal/gimel/druid/model/DruidDimension.scala b/gimel-dataapi/gimel-connectors/gimel-druid/src/main/scala/com/paypal/gimel/druid/model/DruidDimension.scala
deleted file mode 100644
index a8a28677..00000000
--- a/gimel-dataapi/gimel-connectors/gimel-druid/src/main/scala/com/paypal/gimel/druid/model/DruidDimension.scala
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright 2018 PayPal Inc.
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.paypal.gimel.druid.model
-
-import org.json4s.FieldSerializer
-import org.json4s.FieldSerializer.{renameFrom, renameTo}
-
-/**
- * Druid Dimension Object.
- * Class extends Serializable as it is passed to executors.
- *
- * @param name Name for the dimension
- */
-@SerialVersionUID(100L)
-case class DruidDimension(name: String) extends Serializable
-
-object DruidDimension {
- object DimensionFieldNames {
- val NAME = "name"
- }
-
- // Deserializer for Druid Dimension.
- // Rename name -> name
- val drudDimensionSerializer: FieldSerializer[DruidDimension] = FieldSerializer[DruidDimension] (
- renameTo("name", DimensionFieldNames.NAME),
- renameFrom(DimensionFieldNames.NAME, "name")
- )
-}
diff --git a/gimel-dataapi/gimel-connectors/gimel-druid/src/main/scala/com/paypal/gimel/druid/model/DruidMetric.scala b/gimel-dataapi/gimel-connectors/gimel-druid/src/main/scala/com/paypal/gimel/druid/model/DruidMetric.scala
deleted file mode 100644
index 5280c619..00000000
--- a/gimel-dataapi/gimel-connectors/gimel-druid/src/main/scala/com/paypal/gimel/druid/model/DruidMetric.scala
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- * Copyright 2018 PayPal Inc.
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.paypal.gimel.druid.model
-
-import com.fasterxml.jackson.annotation.JsonIgnoreProperties
-import io.druid.query.aggregation.{AggregatorFactory, CountAggregatorFactory, LongSumAggregatorFactory}
-import io.druid.query.aggregation.hyperloglog.HyperUniquesAggregatorFactory
-import org.json4s._
-import org.json4s.FieldSerializer._
-
-import com.paypal.gimel.druid.model.DruidMetric.{MetricFieldNames, MetricTypes}
-
-/**
- * Model class representing a DruidMetric.
- * This case class extends Serializable.
- *
- * @param metricsType Type of the metric to be computed.
- * @param fieldName Name of field to perform aggregation on
- * @param name Name of the metric
- */
-@SerialVersionUID(100L)
-@JsonIgnoreProperties(ignoreUnknown = true)
-case class DruidMetric( metricsType: String, fieldName: String, name: String)
- extends Serializable {
-
- /**
- * Method to initialize a DruidMetric from a map supplied
- *
- * @param map Map[String, String] having (key -> value) for a given druid metric
- * @return DruidMetric object using values from the map
- */
- def initializeFromMap(map: Map[String, String]): DruidMetric = {
- DruidMetric(map.get(MetricFieldNames.TYPE).orNull,
- map.get(MetricFieldNames.FIELD_NAME).orNull,
- map.get(MetricFieldNames.NAME).orNull)
- }
-
- /**
- * Converts the given DruidMetric to its corresponding AggregatorFactory
- * Object that is used by Tranquility.
- * Supported MetricTypes - Count, LongSum, HyperUnique
- *
- * @return AggregatorFactory object corresponding to the given Metric Type
- */
- def getAggregator: AggregatorFactory = {
- metricsType match {
- case MetricTypes.LONG_SUM =>
- new LongSumAggregatorFactory(name, fieldName)
- case MetricTypes.COUNT =>
- new CountAggregatorFactory(name)
- case MetricTypes.HYPER_UNIQUE =>
- new HyperUniquesAggregatorFactory(name, fieldName)
- case otherType: String =>
- throw new Exception(s"Metric Type: $otherType is not supported.")
- }
- }
-}
-
-object DruidMetric {
- def getDefaultMetric: DruidMetric = {
- DruidMetric(MetricTypes.COUNT, null, MetricTypes.COUNT)
- }
-
- object MetricFieldNames {
- val TYPE = "type"
- val FIELD_NAME = "field_name"
- val NAME = "name"
- }
-
- object MetricTypes {
- val LONG_SUM = "longSum"
- val COUNT = "count"
- val HYPER_UNIQUE = "hyperUnique"
- }
-
- // Deserializer for Druid Metric.
- // Ignore fieldName if it does not exist.
- // Rename metricsType -> type, fieldName -> field_name, name -> name
- val drudMetricSerializer: FieldSerializer[DruidMetric] = FieldSerializer[DruidMetric] (
- ignore("fieldName") orElse renameTo("metricsType", MetricFieldNames.TYPE) orElse
- renameTo("fieldName", MetricFieldNames.FIELD_NAME) orElse
- renameTo("name", MetricFieldNames.NAME),
- renameFrom(MetricFieldNames.TYPE, "metricsType") orElse
- renameFrom(MetricFieldNames.FIELD_NAME, "fieldName") orElse
- renameFrom(MetricFieldNames.NAME, "name")
- )
-}
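A minimal sketch of the deserialization the field serializer above enables, mirroring the Formats object that DruidUtility builds later in this diff; the JSON payload is illustrative:

    import org.json4s.{DefaultFormats, Formats}
    import org.json4s.jackson.JsonMethods.parse

    import com.paypal.gimel.druid.model.{DruidDimension, DruidMetric}

    implicit val formats: Formats =
      DefaultFormats + DruidMetric.drudMetricSerializer + DruidDimension.drudDimensionSerializer

    // "type" and "field_name" in the JSON map onto metricsType and fieldName on the case class.
    val metricsJson = """[{"type": "longSum", "field_name": "clicks", "name": "total_clicks"}]"""
    val metrics = parse(metricsJson).extract[List[DruidMetric]]

    // Each parsed metric can then be turned into the Tranquility aggregator it represents.
    val aggregators = metrics.map(_.getAggregator)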
diff --git a/gimel-dataapi/gimel-connectors/gimel-druid/src/main/scala/com/paypal/gimel/druid/reader/DruidReader.scala b/gimel-dataapi/gimel-connectors/gimel-druid/src/main/scala/com/paypal/gimel/druid/reader/DruidReader.scala
deleted file mode 100644
index b6099278..00000000
--- a/gimel-dataapi/gimel-connectors/gimel-druid/src/main/scala/com/paypal/gimel/druid/reader/DruidReader.scala
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * Copyright 2018 PayPal Inc.
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.paypal.gimel.druid.reader
-
-import org.apache.spark.sql.DataFrame
-import org.apache.spark.sql.hive.HiveContext
-
-import com.paypal.gimel.druid.conf.DruidClientConfiguration
-import com.paypal.gimel.logger.Logger
-
-/**
- * DruidReader Object. Main class to implement Reader for Druid.
- */
-object DruidReader {
- private val logger = Logger()
-
- /**
- * Method for reading from Druid. This is not yet implemented.
- *
- * @param hiveContext HiveContext Object to be used.
- * @param conf DruidClientConfiguration specified.
- * @return DataFrame after processing.
- */
- def readTable(hiveContext: HiveContext, conf: DruidClientConfiguration): DataFrame = {
- // TODO: Read Implementation not done
- throw new Exception("Read for druid-connector is not implemented.")
- }
-
-}
diff --git a/gimel-dataapi/gimel-connectors/gimel-druid/src/main/scala/com/paypal/gimel/druid/util/DruidEventBeam.scala b/gimel-dataapi/gimel-connectors/gimel-druid/src/main/scala/com/paypal/gimel/druid/util/DruidEventBeam.scala
deleted file mode 100644
index 7ad94f2d..00000000
--- a/gimel-dataapi/gimel-connectors/gimel-druid/src/main/scala/com/paypal/gimel/druid/util/DruidEventBeam.scala
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- * Copyright 2018 PayPal Inc.
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.paypal.gimel.druid.util
-
-import com.metamx.tranquility.beam.{Beam, ClusteredBeamTuning}
-import com.metamx.tranquility.druid.{DruidBeams, DruidLocation, DruidRollup, SpecificDruidDimensions}
-import com.metamx.tranquility.spark.BeamFactory
-import com.metamx.tranquility.typeclass.Timestamper
-import io.druid.data.input.impl.TimestampSpec
-import org.apache.curator.framework.CuratorFrameworkFactory
-import org.apache.curator.retry.BoundedExponentialBackoffRetry
-import org.joda.time.{DateTime, DateTimeZone, Period}
-
-import com.paypal.gimel.druid.conf.DruidClientConfiguration
-
-/**
- * DruidEventBeam object.
- * Given a DruidClientConfiguration, returns a singleton instance of DruidBeam.
- * The DruidBeam instance should be a singleton in order to share the same connection.
- */
-object DruidEventBeam {
- var druidConfig: DruidClientConfiguration = _
-
- /**
- * Method to initialize the required params for the DruidEventBeam instance.
- * This method must be called before trying to fetch BeamInstance.
- * DruidClientConfiguration is a required param that needs to be set.
- *
- * @param configMgr DruidClientConfiguration object to be set for defining configuration.
- */
- def init(configMgr: DruidClientConfiguration): Unit = {
- druidConfig = configMgr
- }
-
- /**
- * Timestamper object that defines how to extract a timestamp from any custom object
- */
- implicit val timestamper = new Timestamper[Map[String, Any]]() {
-
- /**
- * Overridden method to extract the timestamp from a given custom object.
- *
- * @param rowMap Map[String, Any] representing a single row as a
- * (columnName -> columnValue) map
- * @return org.joda.time.DateTime by extracting timestamp from the rowMap
- */
- override def timestamp(rowMap: Map[String, Any]): DateTime = {
- new DateTime(rowMap(druidConfig.timestamp_field), DateTimeZone.UTC)
- }
- }
-
- /**
- * Builds and stores a singleton instance of Beam[T] given the
- * DruidClientConfiguration object for configuration.
- */
- lazy val BeamInstance: Beam[Map[String, Any]] = {
-
- // Tranquility uses ZooKeeper (through Curator framework) for coordination.
- val curator = CuratorFrameworkFactory.newClient(
- druidConfig.zookeeper,
- new BoundedExponentialBackoffRetry(100, 3000, 5)
- )
- curator.start()
-
- // Transforms List[DruidDimensions] from the DruidClientConfiguration to List[String]
- val dimensions = druidConfig
- .dimensions
- .map(_.name)
-
- // Transforms List[DruidMetrics] from the DruidClientConfiguration to List[AggregatorFactory]
- val aggregators = druidConfig
- .metrics
- .map(_.getAggregator)
-
- // Building a Druid Beam
- DruidBeams
- .builder()
- .curator(curator)
- .discoveryPath(druidConfig.discoveryPath)
- .location(DruidLocation.create(druidConfig.indexService, druidConfig.datasource))
- .rollup(DruidRollup(SpecificDruidDimensions(dimensions),
- aggregators, DruidUtility.fetchQueryGranularity(druidConfig.queryGranularity)))
- .tuning(
- ClusteredBeamTuning (
- segmentGranularity = druidConfig.segmentGranularity,
- windowPeriod = new Period(druidConfig.windowPeriod),
- partitions = druidConfig.numPartitions, replicants = druidConfig.numReplicants
- )
- )
- .timestampSpec(new TimestampSpec(druidConfig.timestamp_field, "iso", null))
- .buildBeam()
- }
-}
-
-class DruidEventBeam(config: DruidClientConfiguration) extends BeamFactory[Map[String, Any]] {
- // Return a singleton, so the same connection is shared across all tasks in the same JVM.
- def makeBeam: Beam[Map[String, Any]] = {
- DruidEventBeam.init(config)
- DruidEventBeam.BeamInstance
- }
-}
-
diff --git a/gimel-dataapi/gimel-connectors/gimel-druid/src/main/scala/com/paypal/gimel/druid/util/DruidUtility.scala b/gimel-dataapi/gimel-connectors/gimel-druid/src/main/scala/com/paypal/gimel/druid/util/DruidUtility.scala
deleted file mode 100644
index 458df4c1..00000000
--- a/gimel-dataapi/gimel-connectors/gimel-druid/src/main/scala/com/paypal/gimel/druid/util/DruidUtility.scala
+++ /dev/null
@@ -1,168 +0,0 @@
-/*
- * Copyright 2018 PayPal Inc.
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.paypal.gimel.druid.util
-
-import java.lang.reflect.Field
-
-import scala.reflect.ClassTag
-
-import io.druid.granularity.{QueryGranularities, QueryGranularity}
-import org.apache.spark.sql.{DataFrame, SparkSession}
-import org.apache.spark.sql.types.{StructField, StructType}
-import org.joda.time.{DateTime, DateTimeZone}
-import org.joda.time.format.DateTimeFormat
-import org.json4s.{DefaultFormats, Formats}
-import org.json4s.jackson.JsonMethods._
-
-import com.paypal.gimel.druid.conf.DruidConstants
-import com.paypal.gimel.druid.model.{DruidDimension, DruidMetric}
-
-/**
- * Object instance for Druid Utility. Contains all the utility methods required for druid-connector
- */
-object DruidUtility {
-
- // Format object to serialize and deserialize used by json4s
- implicit val format: Formats =
- DefaultFormats + DruidMetric.drudMetricSerializer + DruidDimension.drudDimensionSerializer
-
- /**
- * Given a date or time in String, this method converts this datetime to
- * org.joda.time.DateTime using the specified format.
- * Returns current datetime if datetime string is null.
- * Supports - millis, seconds and other DateTime format
- *
- * @param datetime Datetime in String. Can be time in millis, seconds or in any DATETIME format
- * @param format String format for transforming string to org.joda.time.DateTime. Default: millis
- * @return org.joda.time.DateTime given a datetime in string and the specified format.
- */
- def extractDateTime(datetime: String, format: String = "millis"): DateTime = {
- if (Option(datetime).isDefined) {
- format match {
- // Converts Milliseconds to org.joda.time.DateTime
- case DruidConstants.MILLISECONDS =>
- new DateTime(datetime.toLong, DateTimeZone.UTC)
-
- // Converts Seconds to org.joda.time.DateTime
- case DruidConstants.SECONDS =>
- new DateTime(toMillis(datetime.toLong), DateTimeZone.UTC)
-
- // Converts ISO datetime to org.joda.time.DateTime
- case DruidConstants.ISO =>
- DateTime.parse(datetime).withZone(DateTimeZone.UTC)
-
- // Converts all the other DateTime formats to org.joda.time.DateTime
- case otherFormat: String =>
- val formatter = DateTimeFormat.forPattern(otherFormat).withZoneUTC()
- formatter.parseDateTime(datetime)
- }
- } else {
- // Returns current time in UTC if datetime string is null.
- new DateTime(DateTimeZone.UTC)
- }
- }
-
- /**
- * Converts seconds to Milliseconds
- *
- * @param seconds Long seconds to be converted
- * @return Long Milliseconds corresponding to the seconds
- */
- def toMillis(seconds: Long): Long = seconds * 1000
-
- /**
- * Fetch List of class variables
- *
- * @param tag ClassTag[T] object
- * @tparam T Class type passed
- * @return List[Field] of fields that T class contains
- */
- def fetchClassVariable[T](implicit tag: ClassTag[T]): List[Field] =
- tag.runtimeClass.getDeclaredFields.toList
-
-
- /**
- * Get Hive Table Field names given the name of hive table
- *
- * @param dataset Hive Table name
- * @return List[String] of Field names for the hive table
- */
- def getFieldNames(dataset: String, sparkSession: SparkSession): List[String] = {
- extractFields(sparkSession.read.table(dataset).schema)
- }
-
-
- /**
- * Get Hive Table Field names given the Dataframe.
- *
- * @param dataFrame Dataframe for which schema is to be returned
- * @return List[String] of Field names for the hive table
- */
- def getFieldNames(dataFrame: DataFrame): List[String] = {
- extractFields(dataFrame.schema)
- }
-
- /**
- * Given a Schema StructType, extract the field names.
- *
- * @param schema StructType Schema
- * @return List[String] of field names
- */
- def extractFields(schema: StructType): List[String] = {
- Option(schema)
- .getOrElse(StructType(List.empty[StructField]))
- .map(_.name).toList
- }
-
- /**
- * Method to parse a string to a Custom object.
- *
- * @param value String value to be parsed.
- * @tparam T Custom object to parse the String.
- * @return Parsed object based on the value and T.
- */
- def parseString[T: ClassTag](value: String)(implicit manifest: Manifest[T]): T = {
- parse(s"""$value""")
- .extract[T](format, mf = manifest)
- }
-
- /**
- * Method to Fetch Query Granularity based on the String Provided.
- *
- * @param granularityString Query Granularity String to be parsed
- * @return QueryGranularity Object corresponding to the string
- */
- def fetchQueryGranularity(granularityString: String): QueryGranularity = {
- // Using Reflection, find a field with the same name
- // as the query granularity string specified by the user
- val granularityField = DruidUtility.fetchClassVariable[QueryGranularities]
- .find(field => granularityString.equalsIgnoreCase(field.getName))
-
- // If given Granularity is not found then throw an Error
- if (granularityField.isEmpty) {
- val errorMsg = s"Specified Query Granularity $granularityString is not a valid Granularity"
- throw new IllegalArgumentException(errorMsg)
- }
-
- // Extract QueryGranularity Variable value from the field
- val queryGranularity = QueryGranularities.MINUTE
- granularityField.get.get(queryGranularity).asInstanceOf[QueryGranularity]
- }
-}
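A minimal usage sketch of extractDateTime covering the formats it recognizes; the timestamp values are illustrative:

    import com.paypal.gimel.druid.util.DruidUtility

    DruidUtility.extractDateTime("1514764800000")                              // epoch millis (default)
    DruidUtility.extractDateTime("1514764800", "seconds")                      // epoch seconds
    DruidUtility.extractDateTime("2018-01-01T00:00:00Z", "iso")                // ISO-8601
    DruidUtility.extractDateTime("2018-01-01 00:00:00", "yyyy-MM-dd HH:mm:ss") // any Joda pattern
    DruidUtility.extractDateTime(null)                                         // current UTC time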
diff --git a/gimel-dataapi/gimel-connectors/gimel-druid/src/main/scala/com/paypal/gimel/druid/writer/DruidRealtimeWriter.scala b/gimel-dataapi/gimel-connectors/gimel-druid/src/main/scala/com/paypal/gimel/druid/writer/DruidRealtimeWriter.scala
deleted file mode 100644
index ee211fcc..00000000
--- a/gimel-dataapi/gimel-connectors/gimel-druid/src/main/scala/com/paypal/gimel/druid/writer/DruidRealtimeWriter.scala
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- * Copyright 2018 PayPal Inc.
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.paypal.gimel.druid.writer
-
-import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.{DataFrame, SparkSession}
-
-import com.paypal.gimel.druid.conf.DruidClientConfiguration
-import com.paypal.gimel.druid.util.{DruidEventBeam, DruidUtility}
-
-/**
- * DruidRealtimeWriter Object.
- * Main method to implement writer for Druid Realtime Ingestion.
- * Extends DruidWriter trait.
- */
-object DruidRealtimeWriter extends DruidWriter {
- /**
- * Write To Table for Druid Realtime Ingestion for a given DataFrame.
- *
- * @param sparkSession : SparkSession
- * @param conf DruidClientConfiguration Object
- * @param dataFrame Dataframe to be ingested to druid
- * @return Dataframe
- */
- def writeToTable(sparkSession: SparkSession,
- conf: DruidClientConfiguration, dataFrame: DataFrame): DataFrame = {
- def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName
-
- logger.info(" @Begin --> " + MethodName)
-
- // Convert to RDD of Map[String, Any]
- val eventsRDD: RDD[Map[String, Any]] = dataFrame.rdd
- .map(row => {
- conf.fields
- .map(field => {
- var fieldValue: Any = row.getAs[Any](field.name)
-
- if (field.name.equalsIgnoreCase(conf.timestamp_field)) {
- fieldValue = DruidUtility
- .extractDateTime(fieldValue.toString, conf.timestamp_format)
- .toString
- }
-
- field.name -> fieldValue
- })
- .toMap
- })
-
- // Propagate the events RDD to Druid through the Tranquility BeamRDD API
- import com.metamx.tranquility.spark.BeamRDD._
- eventsRDD.propagate(new DruidEventBeam(conf))
-
- dataFrame
- }
-
- /**
- * Write To Table for Druid Realtime Ingestion for a given RDD.
- *
- * @param sparkSession : SparkSession
- * @param conf DruidClientConfiguration Object
- * @param rdd RDD of Map[String, Any] to be ingested to druid
- * @return RDD
- */
- def writeToTable(sparkSession: SparkSession,
- conf: DruidClientConfiguration,
- rdd: RDD[Map[String, Any]]): RDD[Map[String, Any]] = {
- def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName
-
- logger.info(" @Begin --> " + MethodName)
-
- // Convert to RDD of Map[String, Any]
- val eventsRDD: RDD[Map[String, Any]] = rdd
- .map(rowMap => {
- conf.fields
- .map(field => {
- var fieldValue: Any = rowMap(field.name)
-
- if (field.name.equalsIgnoreCase(conf.timestamp_field)) {
- fieldValue = DruidUtility
- .extractDateTime(fieldValue.toString, conf.timestamp_format)
- .toString
- }
-
- field.name -> fieldValue
- })
- .toMap
- })
-
- // Propagate the events RDD to Druid through the Tranquility BeamRDD API
- import com.metamx.tranquility.spark.BeamRDD._
- eventsRDD.propagate(new DruidEventBeam(conf))
-
- eventsRDD
- }
-}
diff --git a/gimel-dataapi/gimel-connectors/gimel-druid/src/main/scala/com/paypal/gimel/druid/writer/DruidWriter.scala b/gimel-dataapi/gimel-connectors/gimel-druid/src/main/scala/com/paypal/gimel/druid/writer/DruidWriter.scala
deleted file mode 100644
index 74495544..00000000
--- a/gimel-dataapi/gimel-connectors/gimel-druid/src/main/scala/com/paypal/gimel/druid/writer/DruidWriter.scala
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- * Copyright 2018 PayPal Inc.
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.paypal.gimel.druid.writer
-
-import com.paypal.gimel.logger.Logger
-
-/**
- * DruidWriter trait.
- * Generic trait to be extended by all the DruidWriters - DruidRealtimeWriter, DruidBatchWriter
- */
-trait DruidWriter {
- protected val logger = Logger()
-}
diff --git a/gimel-dataapi/gimel-connectors/gimel-hbase/pom.xml b/gimel-dataapi/gimel-connectors/gimel-hbase/pom.xml
deleted file mode 100644
index d0165507..00000000
--- a/gimel-dataapi/gimel-connectors/gimel-hbase/pom.xml
+++ /dev/null
@@ -1,232 +0,0 @@
-
-
-
-
-
-
- gimel-dataapi
- com.paypal.gimel
- 2.4.7-SNAPSHOT
- ../../pom.xml
-
- 4.0.0
-
- gimel-hbase-1.2
- 2.4.7-SNAPSHOT
-
-
-
- com.paypal.gimel
- gimel-common
- ${gimel.version}-SNAPSHOT
- ${packaging.scope}
-
-
- org.scalatest
- scalatest_${scala.binary.version}
- ${scalatest.version}
- test
-
-
- com.hortonworks
- shc-core
- 1.1.1-2.1-s_2.11
- ${packaging.scope}
-
-
- org.apache.spark
- *
-
-
- org.scala-lang
- *
-
-
-
-
- org.apache.hbase
- hbase-common
- ${hbase.version}
- ${packaging.scope}
-
-
- org.jboss.netty
- netty
-
-
- io.netty
- netty
-
-
-
-
- org.apache.hbase
- hbase-protocol
- ${hbase.version}
- ${packaging.scope}
-
-
- org.apache.hbase
- hbase-server
- ${hbase.version}
- ${packaging.scope}
-
-
- org.jboss.netty
- netty
-
-
- io.netty
- netty
-
-
-
-
- org.htrace
- htrace-core
- 3.0.4
- ${packaging.scope}
-
-
- org.apache.hive
- hive-hbase-handler
- ${hive.version}
-
- ${packaging.scope}
-
-
- org.apache.commons
- *
-
-
-
-
-
- org.apache.hbase
- hbase-testing-util
- ${hbase.version}
- test
-
-
-
- io.netty
- netty
- ${netty.hadoop.version}
- test
-
-
- io.netty
- netty-all
- ${netty.all.hadoop.version}
- test
-
-
- net.jpountz.lz4
- lz4
- 1.3.0
- test
-
-
-
-
- src/main/scala
- src/test/scala
-
-
- net.alchim31.maven
- scala-maven-plugin
- 3.2.1
-
-
-
- compile
- testCompile
-
-
-
-
-
- -Xms64m
- -Xmx1024m
-
-
-
-
- org.scalatest
- scalatest-maven-plugin
- 1.0
-
- ${project.build.directory}/surefire-reports
- .
- WDF TestSuite.txt
-
-
-
- test
-
- test
-
-
-
-
-
- org.apache.maven.plugins
- maven-shade-plugin
- ${maven.shade.plugin.version}
-
-
-
- com.google.common
- gimel-shaded.com.google.common
-
-
- com.sun.jersey
- gimel-shaded.com.sun.jersey
-
-
-
- org.apache.hadoop
- gimel-shaded.org.apache.hadoop
-
-
-
-
- *:*
-
- META-INF/*.SF
- META-INF/*.DSA
- META-INF/*.RSA
-
-
-
-
-
-
- gimel-shading
- package
-
- shade
-
-
-
-
-
-
-
diff --git a/gimel-dataapi/gimel-connectors/gimel-hbase/src/main/scala/com/paypal/gimel/hbase/DataSet.scala b/gimel-dataapi/gimel-connectors/gimel-hbase/src/main/scala/com/paypal/gimel/hbase/DataSet.scala
deleted file mode 100644
index 34ab8269..00000000
--- a/gimel-dataapi/gimel-connectors/gimel-hbase/src/main/scala/com/paypal/gimel/hbase/DataSet.scala
+++ /dev/null
@@ -1,192 +0,0 @@
-/*
- * Copyright 2018 PayPal Inc.
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.paypal.gimel.hbase
-
-import scala.reflect.runtime.universe._
-
-import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.{DataFrame, SparkSession}
-
-import com.paypal.gimel.datasetfactory.GimelDataSet
-import com.paypal.gimel.hbase.conf.{HbaseConfigs, HbaseConstants}
-import com.paypal.gimel.hbase.utilities.{HBaseLookUp, HBasePut, HBaseSparkConnector, HBaseUtilities}
-import com.paypal.gimel.logger.Logger
-
-/**
- * Concrete Implementation for HBASE Dataset
- *
- * @param sparkSession : SparkSession
- */
-
-class DataSet(sparkSession: SparkSession) extends GimelDataSet(sparkSession: SparkSession) {
-
- // GET LOGGER
- val logger = Logger()
- /**
- * Change this parameter with cluster config
- */
- logger.info(s"Initiated --> ${this.getClass.getName}")
- lazy val hbaseUtilities = HBaseUtilities(sparkSession)
- lazy val hbaseLookUp = HBaseLookUp(sparkSession)
- lazy val hbasePut = HBasePut(sparkSession)
- lazy val hbaseSparkConnector = HBaseSparkConnector(sparkSession)
-
- /**
- *
- * @param dataset Name of the UDC Data Set
- * @param dataSetProps
- * props is the way to set various additional parameters for read and write operations in DataSet class
- * Example Usecase : to get 10 factor parallelism (specifically)
- * val props = Map("coalesceFactor" -> 10)
- * val data = Dataset(sc).read("flights", props)
- * data.coalesce(props.get("coalesceFactor"))
- * @return DataFrame
- */
- override def read(dataset: String, dataSetProps: Map[String, Any]): DataFrame = {
- if (dataSetProps.isEmpty) throw new DataSetException("props cannot be empty !")
-
- val hbaseOperation = dataSetProps.getOrElse(HbaseConfigs.hbaseOperation, HbaseConstants.SCAN_OPERATION).toString
- hbaseOperation match {
- case HbaseConstants.GET_OPERATION =>
- logger.info("Reading through Java Get API.")
- hbaseLookUp.get(dataset, dataSetProps)
- case _ =>
- logger.info("Reading through SHC Connector.")
- hbaseSparkConnector.read(dataset, dataSetProps)
- }
- }
-
- /**
- *
- * @param dataset Name of the UDC Data Set
- * @param dataFrame The Dataframe to write into Target
- * @param dataSetProps
- * Example Usecase : we want only 1 executor for hbase (specifically)
- * val props = Map("coalesceFactor" -> 1)
- * Dataset(sc).write(clientDataFrame, props)
- * Inside write implementation :: dataFrame.coalesce(props.get("coalesceFactor"))
- * @return DataFrame
- */
-
- override def write(dataset: String, dataFrame: DataFrame, dataSetProps: Map[String, Any]): DataFrame = {
- if (dataSetProps.isEmpty) {
- throw new DataSetException("props cannot be empty !")
- }
-
- val castedDataFrame = hbaseUtilities.castAllColsToString(dataFrame)
- val hbaseOperation = dataSetProps.getOrElse(HbaseConfigs.hbaseOperation, HbaseConstants.SCAN_OPERATION).toString
- hbaseOperation match {
- case HbaseConstants.PUT_OPERATION =>
- logger.info("Writing through Java Put API.")
- hbasePut.put(dataset, castedDataFrame, dataSetProps)
- case _ =>
- logger.info("Writing through SHC Connector.")
- hbaseSparkConnector.write(dataset, castedDataFrame, dataSetProps)
- }
- }
-
- // Add Additional Supported types to this list as and when we support other Types of RDD
- // Example to start supporting RDD[String], add to List < typeOf[Seq[Map[String, String]]].toString) >
- override val supportedTypesOfRDD: List[String] = List[String]()
-
- /**
- * Function writes a given RDD to the actual Target System (Example Hive : DB.Table | HBASE namespace.Table)
- *
- * @param dataset Name of the UDC Data Set
- * @param rdd The RDD[T] to write into Target
- * Note the RDD has to be typeCast to supported types by the inheriting DataSet Operators
- * instance#1 : ElasticSearchDataSet may support just RDD[Seq(Map[String, String])], so Elastic Search must implement supported Type checking
- * instance#2 : Kafka, HDFS, HBASE - until they support an RDD operation for any type T, they throw an UnsupportedOperationException that clearly educates users
- * @param dataSetProps
- * props is the way to set various additional parameters for read and write operations in DataSet class
- * Example Usecase : to write kafka with a specific parallelism : One can set something like below -
- * val props = Map("parallelsPerPartition" -> 10)
- * Dataset(sc).write(clientDataFrame, props)
- * @return RDD[T]
- */
- def write[T: TypeTag](dataset: String, rdd: RDD[T], dataSetProps: Map[String, Any]): RDD[T] = {
-
- if (!supportedTypesOfRDD.contains(typeOf[T].toString)) {
- throw new UnsupportedOperationException(s"""Invalid RDD Type. Supported Types : ${supportedTypesOfRDD.mkString(" | ")}""")
- } else {
- // todo Implementation for Write
- rdd
- }
- }
-
- /**
- *
- * @param dataset Name of the UDC Data Set
- * @param dataSetProps Additional parameters for the create operation
- */
- override def create(dataset: String, dataSetProps: Map[String, Any]): Unit = {
- throw new UnsupportedOperationException(s"DataSet create for hbase currently not Supported")
- }
-
- /**
- *
- * @param dataset Name of the UDC Data Set
- * @param dataSetProps Additional parameters for the drop operation
- */
- override def drop(dataset: String, dataSetProps: Map[String, Any]): Unit = {
- throw new UnsupportedOperationException(s"DataSet drop for hbase currently not Supported")
- }
-
- /**
- *
- * @param dataset Name of the UDC Data Set
- * @param dataSetProps Additional parameters for the truncate operation
- */
- override def truncate(dataset: String, dataSetProps: Map[String, Any]): Unit = {
- throw new UnsupportedOperationException(s"DataSet truncate for hbase currently not Supported")
- }
-
- /**
- * Clear Checkpoint
- */
- override def clearCheckPoint(): Unit = {
- logger.info(s"Clear check Point functionality is not available for Hbase Dataset")
- }
-
- /**
- * Save Checkpoint
- */
- override def saveCheckPoint(): Unit = {
- logger.info(s"Save check Point functionality is not available for Hbase Dataset")
- }
-}
-
-/**
- * Custom Exception for HBase API initiation errors
- *
- * @param message Message to Throw
- * @param cause A Throwable Cause
- */
-private class DataSetException(message: String, cause: Throwable)
- extends RuntimeException(message) {
- if (cause != null) {
- initCause(cause)
- }
-
- def this(message: String) = this(message, null)
-}
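
For orientation, a hedged usage sketch of the write path above: the `gimel.hbase.operation` property selects between the Java Put API branch and the default SHC connector branch of `write()`. The `dataSet` and `dataFrame` values, the dataset name, and the table properties are illustrative placeholders modelled on the unit tests further below.

```scala
import com.paypal.gimel.common.catalog.DataSetProperties
import com.paypal.gimel.hbase.conf.{HbaseConfigs, HbaseConstants}

// Table-level properties, as in DataSetTest below (values are made up).
val tableProps: Map[String, String] = Map(
  HbaseConfigs.hbaseNamespaceKey     -> "default",
  HbaseConfigs.hbaseTableKey         -> "default:test_table",
  HbaseConfigs.hbaseRowKey           -> "id",
  HbaseConfigs.hbaseColumnMappingKey -> "personal:name,personal:age")
val dataSetProperties = DataSetProperties("HBase.Local.default.test_table", null, null, tableProps)

// gimel.hbase.operation -> "put" routes write() through HBasePut; any other value uses the SHC connector.
val props: Map[String, Any] = Map(
  "dataSetProperties"         -> dataSetProperties,
  HbaseConfigs.hbaseOperation -> HbaseConstants.PUT_OPERATION)

val written = dataSet.write("HBase.Local.default.test_table", dataFrame, props) // dataSet, dataFrame: placeholders
```
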
diff --git a/gimel-dataapi/gimel-connectors/gimel-hbase/src/main/scala/com/paypal/gimel/hbase/conf/HbaseClientConfiguration.scala b/gimel-dataapi/gimel-connectors/gimel-hbase/src/main/scala/com/paypal/gimel/hbase/conf/HbaseClientConfiguration.scala
deleted file mode 100644
index d9a0d530..00000000
--- a/gimel-dataapi/gimel-connectors/gimel-hbase/src/main/scala/com/paypal/gimel/hbase/conf/HbaseClientConfiguration.scala
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * Copyright 2018 PayPal Inc.
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.paypal.gimel.hbase.conf
-
-import scala.collection.immutable.Map
-import scala.language.implicitConversions
-
-import com.paypal.gimel.common.catalog.DataSetProperties
-import com.paypal.gimel.common.conf.{GimelConstants, GimelProperties}
-import com.paypal.gimel.common.utilities.GenericUtils
-import com.paypal.gimel.logger.Logger
-
-/**
- * Gimel Client Configuration for Hbase Dataset Operations.
- *
- * @param props Hbase Client properties.
- */
-class HbaseClientConfiguration(val props: Map[String, Any]) {
-
- private val logger = Logger()
- logger.info(s"Begin Building --> ${this.getClass.getName}")
- // logger.info(s"Incoming Properties --> ${props.map(x => s"${x._1} -> ${x._2}").mkString("\n")}")
-
- // Load Default Prop from Resource File
- val pcatProps = GimelProperties()
-
- // appTag is used to maintain checkpoints & various other factors that are unique to the application
- val appTag: String = props.getOrElse(GimelConstants.APP_TAG, "").toString
-
- // This is the DataSet Properties
- val datasetProps: DataSetProperties = props(GimelConstants.DATASET_PROPS).asInstanceOf[DataSetProperties]
- val tableProps: Map[String, String] = datasetProps.props
-
- val clusterName = com.paypal.gimel.common.utilities.DataSetUtils.getYarnClusterName()
- val hbaseNameSpaceAndTable = GenericUtils.getValueFailIfEmpty(tableProps, HbaseConfigs.hbaseTableKey,
- "HBase table name not found. Please set the property " + HbaseConfigs.hbaseTableKey)
- val hbaseTableColumnMapping = tableProps.getOrElse(HbaseConfigs.hbaseColumnMappingKey, "")
- val maxSampleRecordsForSchema = GenericUtils.getValue(tableProps, HbaseConfigs.hbaseMaxRecordsForSchema, HbaseConstants.MAX_SAMPLE_RECORDS_FOR_SCHEMA).toInt
- val maxColumnsForSchema = GenericUtils.getValue(tableProps, HbaseConfigs.hbaseMaxColumnsForSchema, HbaseConstants.MAX_COLUMNS_FOR_SCHEMA).toInt
- // If this property contains both the namespace and the table name separated by a colon ":", take the table name by splitting the string
- val hbaseTableNamespaceSplit = hbaseNameSpaceAndTable.split(":")
- val hbaseTableName = if (hbaseTableNamespaceSplit.length > 1) {
- hbaseTableNamespaceSplit(1)
- } else {
- hbaseNameSpaceAndTable
- }
- val hbaseNameSpace = tableProps.getOrElse(HbaseConfigs.hbaseNamespaceKey, HbaseConstants.DEFAULT_NAMESPACE)
- // Whether the column family name needs to be appended to the column name in the resultant DataFrame
- val hbaseColumnNamewithColumnFamilyAppended = tableProps.getOrElse(HbaseConfigs.hbaseColumnNamewithColumnFamilyAppended, "false").toString.toBoolean
- // HDFS path for hbase-site.xml
- val hbaseSiteXMLHDFSPath = tableProps.getOrElse(HbaseConfigs.hbaseSiteXMLHDFSPathKey, HbaseConstants.NONE_STRING)
- val schema: Array[String] = if (datasetProps.fields != null && datasetProps.fields.nonEmpty) {
- datasetProps.fields.map(_.fieldName)
- } else {
- Array.empty[String]
- }
-
- val getOption = tableProps.getOrElse(HbaseConfigs.hbaseFilter, "")
-
- // Get the row key from the user, otherwise from the schema in UDC or the Hive table. If it is not present in the schema either, use the default value
- val hbaseRowKeys = tableProps.getOrElse(HbaseConfigs.hbaseRowKey, HbaseConstants.DEFAULT_ROW_KEY_COLUMN).split(",")
-
- logger.info(s"Fields Initiated --> ${this.getClass.getFields.map(f => s"${f.getName} --> ${f.get().toString}").mkString("\n")}")
- logger.info(s"Completed Building --> ${this.getClass.getName}")
-
-}
-
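
As a quick recap of the namespace/table handling above, a minimal sketch (values are made up) of how `gimel.hbase.table.name` is split when it carries both parts:

```scala
// Illustrative only; mirrors the split logic in HbaseClientConfiguration.
val hbaseNameSpaceAndTable = "ns1:orders"          // value of gimel.hbase.table.name
val parts = hbaseNameSpaceAndTable.split(":")
val hbaseTableName = if (parts.length > 1) parts(1) else hbaseNameSpaceAndTable
// => "orders"; a plain "orders" (no colon) is kept as-is.
// The namespace is read separately from gimel.hbase.namespace.name and defaults to "default".
```
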
diff --git a/gimel-dataapi/gimel-connectors/gimel-hbase/src/main/scala/com/paypal/gimel/hbase/conf/HbaseConfigs.scala b/gimel-dataapi/gimel-connectors/gimel-hbase/src/main/scala/com/paypal/gimel/hbase/conf/HbaseConfigs.scala
deleted file mode 100644
index 6edcda3e..00000000
--- a/gimel-dataapi/gimel-connectors/gimel-hbase/src/main/scala/com/paypal/gimel/hbase/conf/HbaseConfigs.scala
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright 2018 PayPal Inc.
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.paypal.gimel.hbase.conf
-
-// keys related to HBASE
-object HbaseConfigs {
- // metastore properties
- val hbaseTableKey: String = "gimel.hbase.table.name"
- val hbaseColumnMappingKey: String = "gimel.hbase.columns.mapping"
- val hbaseNamespaceKey = "gimel.hbase.namespace.name"
-
- // misc properties for read/write
- val hbaseStorageHandler: String = "org.apache.hadoop.hive.hbase.HBaseStorageHandler"
- val hbaseOperation: String = "gimel.hbase.operation"
- val hbaseFilter: String = "gimel.hbase.get.filter"
- val hbaseRowKey: String = "gimel.hbase.rowkey"
- val hbaseColumnNamewithColumnFamilyAppended: String = "gimel.hbase.colName.with.cfName.appended"
- val hbaseSiteXMLHDFSPathKey: String = "gimel.hbase.site.xml.hdfs.path"
- val hbaseMaxRecordsForSchema: String = "gimel.hbase.schema.max.records"
- val hbaseMaxColumnsForSchema: String = "gimel.hbase.schema.max.columns"
-
-}
-
-
diff --git a/gimel-dataapi/gimel-connectors/gimel-hbase/src/main/scala/com/paypal/gimel/hbase/conf/HbaseConstants.scala b/gimel-dataapi/gimel-connectors/gimel-hbase/src/main/scala/com/paypal/gimel/hbase/conf/HbaseConstants.scala
deleted file mode 100644
index 1eb3c8f0..00000000
--- a/gimel-dataapi/gimel-connectors/gimel-hbase/src/main/scala/com/paypal/gimel/hbase/conf/HbaseConstants.scala
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * Copyright 2018 PayPal Inc.
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.paypal.gimel.hbase.conf
-
-object HbaseConstants {
- // basic variable references
- val DEFAULT_ROW_KEY_COLUMN = "rowKey"
- val DEFAULT_NAMESPACE = "default"
-
- val SCAN_OPERATION = "scan"
- val GET_OPERATION = "get"
- val PUT_OPERATION = "put"
-
- val NONE_STRING = "NONE"
-
- val MAX_SAMPLE_RECORDS_FOR_SCHEMA = "1000"
- val MAX_COLUMNS_FOR_SCHEMA = "100000"
-}
diff --git a/gimel-dataapi/gimel-connectors/gimel-hbase/src/main/scala/com/paypal/gimel/hbase/utilities/HBaseCatalog.scala b/gimel-dataapi/gimel-connectors/gimel-hbase/src/main/scala/com/paypal/gimel/hbase/utilities/HBaseCatalog.scala
deleted file mode 100644
index 10b75758..00000000
--- a/gimel-dataapi/gimel-connectors/gimel-hbase/src/main/scala/com/paypal/gimel/hbase/utilities/HBaseCatalog.scala
+++ /dev/null
@@ -1,184 +0,0 @@
-/*
- * Copyright 2018 PayPal Inc.
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.paypal.gimel.hbase.utilities
-
-import org.apache.spark.sql.types.StructType
-
-import com.paypal.gimel.logger.Logger
-
-/**
- * This is a template implementation to create a catalog for the HBASE Spark Connector.
- * It takes a variety of parameters for mapping a DataFrame to its HBASE equivalent:
- * 1. NameSpace
- * 2. TableName
- * 3. Keys
- * 4. Column Families with a list of columns to put in each column family
- * 5. TableCoder
- */
-
-
-object HBaseCatalog {
- val logger = Logger()
-
- val holderNameSpace = ""
- val holderTableName = ""
- val holderTableCoder = ""
- val holderKey = ""
- val holderKeysAsCols = ""
- val holderColumns = ""
- val catalogTemplate: String =
- s"""|{"table":{"namespace":"$holderNameSpace", "name":"$holderTableName", "tableCoder":"$holderTableCoder"},
- |"rowkey":"$holderKey",
- |"columns":{
- |$holderKeysAsCols,
- |$holderColumns
- |}
- |}
- """.stripMargin
-
- /**
- * This function creates fields as String for the catalog, with the column family name appended to the column name
- *
- * @param fields Array of field (column) names
- * @param columnFamily Hbase column family name (defaults to "rowkey")
- */
- def fieldsAsStringForCataLogAppendColumnFamily(fields: Array[String], columnFamily: String = "rowkey"): String = {
-
- var lengthString = ""
- fields.map {
- eachKey =>
- val hbaseCol = if (columnFamily == "rowkey") {
- lengthString = """, "length":"50""""
- eachKey
- } else eachKey
- s""""$columnFamily""" + s"""_$eachKey":{"cf":"$columnFamily", "col":"$hbaseCol", "type":"string"$lengthString}"""
- }.mkString("", ",\n", "")
- }
-
-
- /**
- * This function creates fields as String for Catalog
- *
- * @param fields Array of field (column) names
- * @param columnFamily Hbase column family name (defaults to "rowkey")
- */
- def fieldsAsStringForCataLog(fields: Array[String], columnFamily: String = "rowkey"): String = {
-
- var lengthString = ""
- fields.map {
- eachKey =>
- val hbaseCol = if (columnFamily == "rowkey") {
- lengthString = """, "length":"50""""
- eachKey
- } else eachKey
- s""""$eachKey":{"cf":"$columnFamily", "col":"$hbaseCol", "type":"string"$lengthString}"""
- }.mkString("", ",\n", "")
- }
-
- /**
- * This function creates a catalog for hbase table with single column family
- *
- * @param nameSpace Hbase Table namespace
- * @param tableName Hbase table name
- * @param dfSchema Array of columns in dataframe
- * @param keys Array of row key columns
- * @param columnFamily
- * @param tableCoder
- * @return String
- */
- def apply(nameSpace: String, tableName: String, dfSchema: Array[String], keys: Array[String], columnFamily: String, tableCoder: String = "PrimitiveType"): String = {
- val key = keys.mkString(":")
- val keysAsCols = fieldsAsStringForCataLog(keys)
- val columns = dfSchema.diff(keys)
- val colsAsCols = fieldsAsStringForCataLog(columns, columnFamily)
- val catalogString = catalogTemplate.
- replaceAllLiterally(holderNameSpace, nameSpace)
- .replaceAllLiterally(holderTableName, tableName)
- .replaceAllLiterally(holderTableCoder, tableCoder)
- .replaceAllLiterally(holderKey, key)
- .replaceAllLiterally(holderColumns, colsAsCols)
- .replaceAllLiterally(holderKeysAsCols, keysAsCols)
- catalogString
- }
-
- /**
- * This function creates a catalog for an hbase table with multiple column families
- *
- * @param nameSpace Hbase Table namespace
- * @param tableName Hbase table name
- * @param cfColsMap Map[Column Family -> Array[Column Names ] ]
- * @param keys Array of row key columns
- * @param tableCoder
- * @return String
- */
-
- def apply(nameSpace: String, tableName: String, cfColsMap: Map[String, Array[String]], keys: Array[String], tableCoder: String, readWithColumnFamily: Boolean): String = {
- val key = keys.mkString(":")
- val keysAsCols = if (readWithColumnFamily) {
- fieldsAsStringForCataLogAppendColumnFamily(keys)
- } else {
- fieldsAsStringForCataLog(keys)
- }
- val colsAsCols = if (readWithColumnFamily) {
- cfColsMap.map { x => fieldsAsStringForCataLogAppendColumnFamily(x._2.diff(keys), x._1) }.mkString("", ",\n", "")
- }
- else {
- cfColsMap.map { x => fieldsAsStringForCataLog(x._2.diff(keys), x._1) }.mkString("", ",\n", "")
- }
- val catalogString = catalogTemplate.
- replaceAllLiterally(holderNameSpace, nameSpace)
- .replaceAllLiterally(holderTableName, tableName)
- .replaceAllLiterally(holderTableCoder, tableCoder)
- .replaceAllLiterally(holderKey, key)
- .replaceAllLiterally(holderColumns, colsAsCols)
- .replaceAllLiterally(holderKeysAsCols, keysAsCols)
- logger.info(catalogString)
- logger.info("catalog is --> " + catalogString)
- catalogString
- }
-
- /**
- * This function creates a catalog for hbase table with single column family from a dataframe schema
- *
- * @param nameSpace Hbase Table namespace
- * @param tableName Hbase table name
- * @param dfSchema Dataframe Schema
- * @param keys Array of row key columns
- * @param columnFamily
- * @param tableCoder
- * @return String
- */
- def apply(nameSpace: String, tableName: String, dfSchema: StructType, keys: Array[String], columnFamily: String, tableCoder: String): String = {
- val key = keys.mkString(":")
- val keysAsCols = fieldsAsStringForCataLog(keys)
- val columns = dfSchema.fieldNames.diff(keys)
- val colsAsCols = fieldsAsStringForCataLog(columns, columnFamily)
- val catalogString = catalogTemplate.
- replaceAllLiterally(holderNameSpace, nameSpace)
- .replaceAllLiterally(holderTableName, tableName)
- .replaceAllLiterally(holderTableCoder, tableCoder)
- .replaceAllLiterally(holderKey, key)
- .replaceAllLiterally(holderColumns, colsAsCols)
- .replaceAllLiterally(holderKeysAsCols, keysAsCols)
- logger.info(catalogString)
- logger.info("catalog is --> " + catalogString)
- catalogString
- }
-}
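
To make the template above concrete, a hedged example of the catalog string the single-column-family `apply` would roughly produce; the namespace, table, and field names are made up.

```scala
// Hypothetical invocation of HBaseCatalog.apply (default tableCoder = "PrimitiveType").
val catalog = HBaseCatalog(
  nameSpace = "default",
  tableName = "test_table",
  dfSchema = Array("id", "name", "age"),
  keys = Array("id"),
  columnFamily = "personal")

// Roughly, ignoring whitespace:
// {"table":{"namespace":"default", "name":"test_table", "tableCoder":"PrimitiveType"},
// "rowkey":"id",
// "columns":{
// "id":{"cf":"rowkey", "col":"id", "type":"string", "length":"50"},
// "name":{"cf":"personal", "col":"name", "type":"string"},
// "age":{"cf":"personal", "col":"age", "type":"string"}
// }
// }
```
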
diff --git a/gimel-dataapi/gimel-connectors/gimel-hbase/src/main/scala/com/paypal/gimel/hbase/utilities/HBaseLookUp.scala b/gimel-dataapi/gimel-connectors/gimel-hbase/src/main/scala/com/paypal/gimel/hbase/utilities/HBaseLookUp.scala
deleted file mode 100644
index c5b12b43..00000000
--- a/gimel-dataapi/gimel-connectors/gimel-hbase/src/main/scala/com/paypal/gimel/hbase/utilities/HBaseLookUp.scala
+++ /dev/null
@@ -1,231 +0,0 @@
-/*
- * Copyright 2018 PayPal Inc.
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.paypal.gimel.hbase.utilities
-
-import scala.collection.JavaConverters._
-import scala.collection.immutable.{Iterable, Map}
-
-import org.apache.hadoop.hbase.{CellUtil, HBaseConfiguration, TableName}
-import org.apache.hadoop.hbase.client.{ConnectionFactory, Get, Result}
-import org.apache.hadoop.hbase.util.Bytes
-import org.apache.spark.sql.{DataFrame, SparkSession, SQLContext}
-import spray.json._
-import spray.json.DefaultJsonProtocol._
-
-import com.paypal.gimel.hbase.conf.{HbaseClientConfiguration, HbaseConfigs}
-import com.paypal.gimel.logger.Logger
-
-object HBaseLookUp {
-
- def apply(sparkSession: SparkSession): HBaseLookUp = new HBaseLookUp(sparkSession)
-
-}
-
-class HBaseLookUp(sparkSession: SparkSession) {
-
- val logger = Logger()
-
- /**
- * This function reads all or given columns in column family for a rowKey specified by user
- *
- * @param dataset Name
- * @param dataSetProps
- * props is the way to set various additional parameters for read and write operations in DataSet class
- * Example Usecase : Hbase lookup for rowKey=r1 and columns c1, c12 of column family cf1 and c2 of cf2
- * val options: Map[String, Any] = Map("operation"->"get","filter"->"rowKey=r1:toGet=cf1-c1,c12|cf2-c2")
- * val recsDF = dataSet.read("udc.test123", options);
- * @return DataFrame
- */
- def get(dataset: String, dataSetProps: Map[String, Any]): DataFrame = {
- try {
-
- // Hbase configuration
- val conf = new HbaseClientConfiguration(dataSetProps)
- var options = Map.empty[String, String]
- try {
- if (conf.getOption.isEmpty) {
- throw new IllegalArgumentException(
- s"""
- | HBase get filter condition not found. Please set the property ${HbaseConfigs.hbaseFilter}.
- | Example: rowKey=1:toGet=personal
- | where personal is the column family name
- |""".stripMargin)
- }
- options = conf.getOption.split(":").map { x => x.split("=")(0) -> x.split("=")(1) }.toMap
- if (!options.contains("rowKey")) {
- throw new IllegalArgumentException(
- s"""
- | rowKey not present in the filter condition. Please check the property ${HbaseConfigs.hbaseFilter}.
- | Examples: rowKey=1:toGet=personal
- | where personal is the column family name
- |""".stripMargin)
- }
- } catch {
- case ex: Throwable =>
- logger.error(
- s"""
- | Unable to parse the filter condition. Please check the property ${HbaseConfigs.hbaseFilter}
- | Example: rowKey=1:toGet=personal
- | where personal is the column family name
- |""".stripMargin)
- ex.printStackTrace()
- throw ex
- }
-
- val rowKey = options("rowKey")
-
- val dataFromHBASE: Map[String, String] = if (!options.contains("toGet")) {
- getColumnsInRowKey(conf.hbaseNameSpace + ":" + conf.hbaseTableName, rowKey)
- } else {
- val cfsAndCols = options("toGet")
- // (Column family to Array[Columns]) mapping specified by user in toGet
- val cfsSets: Map[String, Array[String]] = cfsAndCols.split('|').map { x =>
- if (x.split("-").length > 1) x.split('-')(0) -> x.split('-')(1).split(',') else x.split('-')(0) -> null
- }.toMap
- getColumnsInRowKey(conf.hbaseNameSpace + ":" + conf.hbaseTableName, rowKey, cfsSets)
- }
- val hbaseDataJSON = dataFromHBASE.toJson.compactPrint
- val hbaseDf = jsonStringToDF(sparkSession, hbaseDataJSON)
- hbaseDf
- } catch {
- case ex: Throwable =>
- ex.printStackTrace()
- logger.error(s"Unable to get data from HBase table.")
- throw ex
- }
- }
-
- /**
- * Returns all/specified columns in column family for a rowKey specified by user
- *
- * @param hbaseTable Name of the Data Set
- * @param rowKey row Key for the lookup
- * @param cfsSets User Specified column family and columns
- * @return Map[Column -> Column Value ]
- */
- def getColumnsInRowKey(hbaseTable: String, rowKey: String, cfsSets: Map[String, Array[String]]): Map[String, String] = {
- val k: Iterable[Map[String, String]] = cfsSets.map { x =>
- val cf1 = x._1
- val cols = x._2
- val hbaseData = getColumnsInFamily(hbaseTable, rowKey, cf1, cols)
- hbaseData
- }
- val foldedMap: Map[String, String] = k.tail.foldLeft(k.head)((x, y) => x ++ y)
- foldedMap
- }
-
- /**
- * Converts a String to DataFrame
- *
- * @param sqlCntxt SQLContext
- * @param string Input String (must be JSON Format)
- */
- def jsonStringToDF(sqlCntxt: SQLContext, string: String): DataFrame = {
- val rdd = sqlCntxt.sparkContext.parallelize(Seq(string))
- sqlCntxt.read.json(rdd)
- }
-
- /**
- * Converts a String to DataFrame
- *
- * @param sparkSession : SparkSession
- * @param string Input String (must be JSON Format)
- */
- def jsonStringToDF(sparkSession: SparkSession, string: String): DataFrame = {
- val rdd = sparkSession.sparkContext.parallelize(Seq(string))
- sparkSession.read.json(rdd)
- }
-
- /**
- * Returns Column Value for each column in a column family
- *
- * @param hbaseTable HBASE Table Name
- * @param rowKey Row Key
- * @param columnFamily Column Family Name
- * @param columns Array of Column Names
- * @return Map[Column -> Column Value ]
- */
- def getColumnsInFamily(hbaseTable: String, rowKey: String, columnFamily: String, columns: Array[String]): Map[String, String] = {
- try {
- val hbaseColumnFamily: Array[Byte] = Bytes.toBytes(columnFamily)
- val hTable = TableName.valueOf(hbaseTable)
- val rowKeyBytes = Bytes.toBytes(rowKey)
- val getRowKey: Get = new Get(rowKeyBytes)
- // Configure And Connect
- val conf = HBaseConfiguration.create()
- val cnxn = ConnectionFactory.createConnection(conf)
- // Get Operation
- val tbl = cnxn.getTable(hTable)
- val k: Result = tbl.get(getRowKey)
-
- // Get Column values of each column as Map of [Column Name -> Column Value]
- val allColumns: Map[String, String] = columns match {
- // If user specifies only column family, get all the columns in that column family otherwise get specified columns
- case null =>
- k.getFamilyMap(Bytes.toBytes(columnFamily)).asScala.map(x => (Bytes.toString(x._1), Bytes.toString(x._2))).toMap
- case _ =>
- // Columns Bytes
- val hbaseColumns = columns.map(Bytes.toBytes)
- // Mapping Cf with Columns into single collection
- val cfAndColumns: Array[(Array[Byte], Array[Byte])] = hbaseColumns.map((hbaseColumnFamily, _))
- // Return requested Columns and their values in a Map
- val allColumns = cfAndColumns.map { x =>
- Bytes.toString(x._2) -> Bytes.toString(k.getValue(x._1, x._2))
- }.toMap
- allColumns
- }
- allColumns
- } catch {
- case ex: Throwable =>
- ex.printStackTrace()
- throw ex
- }
-
- }
-
- /**
- * Returns all columns in all column families for a rowKey specified by user
- *
- * @param hbaseTable Name of the Data Set
- * @param rowKey row Key for the lookup
- * @return Map[Column -> Column Value ]
- */
- def getColumnsInRowKey(hbaseTable: String, rowKey: String): Map[String, String] = {
- try {
- val hTable = TableName.valueOf(hbaseTable)
- val rowKeyBytes = Bytes.toBytes(rowKey)
- val getRowKey: Get = new Get(rowKeyBytes)
- // Configure And Connect
- val conf = HBaseConfiguration.create()
- val cnxn = ConnectionFactory.createConnection(conf)
- // Get Operation
- val tbl = cnxn.getTable(hTable)
- val k: Result = tbl.get(getRowKey)
- val columnsVals = k.rawCells().map(cell => (Bytes.toString(CellUtil.cloneQualifier(cell)), Bytes.toString(CellUtil.cloneValue(cell)))).toMap
- tbl.close()
- columnsVals
- } catch {
- case ex: Throwable =>
- ex.printStackTrace()
- throw ex
- }
- }
-
-}
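
A hedged read sketch mirroring the `get` scaladoc above: the filter string names the row key plus an optional `toGet` list of `columnFamily-columns` groups separated by `|`. The dataset name and the `dataSet` instance are placeholders.

```scala
import com.paypal.gimel.hbase.conf.{HbaseConfigs, HbaseConstants}

// Look up rowKey r1: columns c1, c12 of column family cf1 and c2 of cf2.
val props: Map[String, Any] = Map(
  HbaseConfigs.hbaseOperation -> HbaseConstants.GET_OPERATION,       // gimel.hbase.operation = "get"
  HbaseConfigs.hbaseFilter    -> "rowKey=r1:toGet=cf1-c1,c12|cf2-c2" // gimel.hbase.get.filter
)
// Omitting toGet (filter = "rowKey=r1") returns every column in every column family for that row.
val recsDF = dataSet.read("udc.hbase.test_table", props) // plus the usual dataSetProperties entry
```
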
diff --git a/gimel-dataapi/gimel-connectors/gimel-hbase/src/main/scala/com/paypal/gimel/hbase/utilities/HBasePut.scala b/gimel-dataapi/gimel-connectors/gimel-hbase/src/main/scala/com/paypal/gimel/hbase/utilities/HBasePut.scala
deleted file mode 100644
index d7bb489e..00000000
--- a/gimel-dataapi/gimel-connectors/gimel-hbase/src/main/scala/com/paypal/gimel/hbase/utilities/HBasePut.scala
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- * Copyright 2018 PayPal Inc.
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.paypal.gimel.hbase.utilities
-
-import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
-import org.apache.hadoop.hbase.client.{ConnectionFactory, Put}
-import org.apache.hadoop.hbase.util.Bytes
-import org.apache.spark.sql.{DataFrame, SparkSession}
-
-import com.paypal.gimel.hbase.conf.{HbaseClientConfiguration, HbaseConfigs}
-import com.paypal.gimel.logger.Logger
-
-object HBasePut {
-
- def apply(sparkSession: SparkSession): HBasePut = new HBasePut(sparkSession)
-
-}
-
-class HBasePut(sparkSession: SparkSession) {
- val logger = Logger()
- lazy val hbaseUtilities = HBaseUtilities(sparkSession)
-
- /**
- * This function performs put(insert/update) operation on each row of dataframe
- *
- * @param dataset Name
- * @param dataFrame The Dataframe to write into Target
- * @param dataSetProps
- * props is the way to set various additional parameters for read and write operations in DataSet class
- * Example Usecase : Hbase put
- * val props = Map("operation" -> "put")
- * val recsDF = dataSet.write("pcatalog.test123", df, options);
- * @return DataFrame
- */
- def put(dataset: String, dataFrame: DataFrame, dataSetProps: Map[String, Any]): DataFrame = {
- try {
- // Hbase configuration
- val conf = new HbaseClientConfiguration(dataSetProps)
- // Getting (Column family -> Array[Columns]) mapping
- val columnFamilyToColumnMapping: Map[String, Array[String]] = hbaseUtilities.getColumnMappingForColumnFamily(conf.hbaseNameSpace,
- conf.hbaseTableName,
- conf.hbaseTableColumnMapping,
- conf.maxSampleRecordsForSchema,
- conf.maxColumnsForSchema)
- logger.info("Column mapping -> " + columnFamilyToColumnMapping)
- // Converting columnFamilyToColumnMapping to a map of (Column -> Column Family)
- val columnToColumnFamilyMapping = columnFamilyToColumnMapping.flatMap(cfCols => cfCols._2.map(col => (col, cfCols._1)))
- // Create Put object for each row in dataframe
- putRows(conf.hbaseNameSpace + ":" + conf.hbaseTableName, dataFrame, conf.hbaseRowKeys.mkString(":"), dataFrame.columns, columnToColumnFamilyMapping)
- dataFrame
- } catch {
- case ex: Throwable =>
- ex.printStackTrace()
- logger.error(s"Unable to put data into HBase table.")
- throw ex
- }
- }
-
- /**
- *
- * @param hbaseTable Hbase Table Name
- * @param dataFrame The Dataframe to put into Target
- * @param rowKeyColumn Name of the row Key column in hive table
- * @param columns Array of Columns to be put
- * @param cfColsMap Map of (Column -> Column Family)
- */
- def putRows(hbaseTable: String, dataFrame: DataFrame, rowKeyColumn: String, columns: Array[String], cfColsMap: Map[String, String]) {
- try {
- // Configure And Connect
- val conf = HBaseConfiguration.create()
- val cnxn = ConnectionFactory.createConnection(conf)
- // Create Connection to HBase table
- val tbl = cnxn.getTable(TableName.valueOf(hbaseTable))
- val rows = dataFrame.rdd.map { row =>
- (row.getAs(rowKeyColumn).toString,
- columns.map(eachCol => (cfColsMap.getOrElse(eachCol, ""), eachCol, row.getAs(eachCol).asInstanceOf[String]))
- )
- }.collect()
- // Performing put operation on each row of dataframe
- rows.foreach { row =>
- val putRow: Put = new Put(Bytes.toBytes(row._1.asInstanceOf[String]))
- row._2.foreach(x => if (x._2 != rowKeyColumn) putRow.addColumn(Bytes.toBytes(x._1), Bytes.toBytes(x._2), Bytes.toBytes(x._3)))
- tbl.put(putRow)
- }
- tbl.close()
- } catch {
- case ex: Throwable =>
- ex.printStackTrace()
- throw ex
- }
- }
-}
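
For reference, a minimal standalone sketch of the per-row HBase `Put` that `putRows` assembles above; the table, row key, and values are placeholders.

```scala
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Put}
import org.apache.hadoop.hbase.util.Bytes

// One row: rowKey "1", column family "personal", columns name and age.
val conf = HBaseConfiguration.create()
val connection = ConnectionFactory.createConnection(conf)
val table = connection.getTable(TableName.valueOf("default:test_table"))

val put = new Put(Bytes.toBytes("1"))
put.addColumn(Bytes.toBytes("personal"), Bytes.toBytes("name"), Bytes.toBytes("alice"))
put.addColumn(Bytes.toBytes("personal"), Bytes.toBytes("age"), Bytes.toBytes("30"))
table.put(put)

table.close()
connection.close()
```
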
diff --git a/gimel-dataapi/gimel-connectors/gimel-hbase/src/main/scala/com/paypal/gimel/hbase/utilities/HBaseScanner.scala b/gimel-dataapi/gimel-connectors/gimel-hbase/src/main/scala/com/paypal/gimel/hbase/utilities/HBaseScanner.scala
deleted file mode 100644
index 9049e93d..00000000
--- a/gimel-dataapi/gimel-connectors/gimel-hbase/src/main/scala/com/paypal/gimel/hbase/utilities/HBaseScanner.scala
+++ /dev/null
@@ -1,213 +0,0 @@
-/*
- * Copyright 2018 PayPal Inc.
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.paypal.gimel.hbase.utilities
-
-import org.apache.commons.lang.StringEscapeUtils
-import org.apache.hadoop.hbase.{CellUtil, HBaseConfiguration, TableName}
-import org.apache.hadoop.hbase.client.{Connection, ConnectionFactory, Scan}
-import org.apache.hadoop.hbase.filter.PageFilter
-import org.apache.hadoop.hbase.util.Bytes
-import scala.collection.JavaConverters._
-
-import com.paypal.gimel.common.utilities.GenericUtils
-import com.paypal.gimel.logger.Logger
-
-object HBaseScanner {
-
- def apply(): HBaseScanner = new HBaseScanner()
-
-}
-
-class HBaseScanner() {
-
- val logger = Logger(this.getClass)
-
- /**
- * Returns schema of hbase table
- *
- * @param connection
- * @param namespace
- * @param tableName
- * @param maxResults : Number of maximum records to be scanned
- * @return Map of [Column Family -> Array[Columns] ]
- */
- def getSchema(connection: Connection, namespace: String, tableName: String, rowKey: String, maxResults: Int): Map[String, Array[String]] = {
- val table: TableName = TableName.valueOf(namespace + ":" + tableName)
- val tbl = connection.getTable(table)
- // INITIATE SCANNER
- val scan = new Scan()
-
- // Setting the Page Filter to retrieve pageSize records from each region server
- val pageSize = getPageSize(connection, table, maxResults)
- logger.info("Setting the pageSize = " + pageSize)
- val filter = new PageFilter(pageSize)
- scan.setFilter(filter)
-
- var count = 0
- // Iterate through all the records retrieved from HBase and get column family and column names
- GenericUtils.withResources(tbl.getScanner(scan)) { scanner =>
- val res = scanner.iterator().asScala.flatMap { result =>
- count = count + 1
- val cells = result.listCells().iterator().asScala
- cells.map(cell => (Bytes.toString(CellUtil.cloneFamily(cell)), Bytes.toString(CellUtil.cloneQualifier(cell)))).toList
- }.toList.distinct.groupBy(_._1).map(x => (x._1, x._2.map(p => p._2).toArray))
- logger.info(s"Records Count for ${tableName} : " + count)
- val rowKeyMap = Map("rowKey" -> Array(rowKey))
- rowKeyMap ++ res
- }
- }
-
- /**
- * Returns schema of hbase table with specified maximum number of columns and result size
- *
- * @param connection
- * @param namespace
- * @param tableName
- * @param maxResults : Number of maximum records to be scanned
- * @param maxColumns : Number of maximum columns to be scanned
- * @param maxResultSize : Maximum result size in bytes
- * @return Map of [Column Family -> Array[Columns] ]
- */
- def getSchema(connection: Connection, namespace: String, tableName: String, maxResults: Int, maxColumns: Int, maxResultSize : Long): Map[String, Array[String]] = {
- val table: TableName = TableName.valueOf(namespace + ":" + tableName)
- val tbl = connection.getTable(table)
- // INITIATE SCANNER
- val scan = new Scan()
-
- // Setting the Page Filter to retrieve pageSize records from each region server
- val pageSize = getPageSize(connection, table, maxResults)
- logger.info("Setting the pageSize = " + pageSize)
- val fil = new PageFilter(pageSize)
- scan.setFilter(fil)
- // Setting the maximum result size in bytes
- scan.setMaxResultSize(maxResultSize)
-
- var count = 0
- var columnsCount = 0
- // Iterate through all the records retrieved from HBase and get column family and column names
- GenericUtils.withResources(tbl.getScanner(scan)) { scanner =>
- val res = scanner.iterator().asScala.takeWhile(_ => columnsCount < maxColumns).flatMap { result =>
- count = count + 1
- val cells = result.listCells()
- columnsCount = cells.size()
- val cellsItr = cells.iterator().asScala
- // Escape each column family and column in case of any special characters
- cellsItr.map(cell => (StringEscapeUtils.escapeJava(Bytes.toString(CellUtil.cloneFamily(cell))),
- StringEscapeUtils.escapeJava(Bytes.toString(CellUtil.cloneQualifier(cell))))).toList
- }.toList.distinct.groupBy(_._1).map(x => (x._1, x._2.map(p => p._2).toArray))
- logger.info(s"Records Count for ${tableName} : " + count)
- res
- }
- }
-
- /**
- * Returns schema of hbase table with specified maximum number of columns
- *
- * @param connection
- * @param namespace
- * @param tableName
- * @param maxResults : Number of maximum records to be scanned
- * @param maxColumns : Number of maximum columns to be scanned
- * @return Map of [Column Family -> Array[Columns] ]
- */
- def getSchema(connection: Connection, namespace: String, tableName: String, maxResults: Int, maxColumns: Int): Map[String, Array[String]] = {
- val table: TableName = TableName.valueOf(namespace + ":" + tableName)
- val tbl = connection.getTable(table)
- // INITIATE SCANNER
- val scan = new Scan()
-
- // Setting the Page Filter to retrieve pageSize records from each region server
- val pageSize = getPageSize(connection, table, maxResults)
- logger.info("Setting the pageSize = " + pageSize)
- val filter = new PageFilter(pageSize)
- scan.setFilter(filter)
-
- var count = 0
- var columnsCount = 0
- // Iterate through all the records retrieved from HBase and get column family and column names
- GenericUtils.withResources(tbl.getScanner(scan)) { scanner =>
- val res = scanner.iterator().asScala.takeWhile(_ => columnsCount < maxColumns).flatMap { result =>
- count = count + 1
- val cells = result.listCells()
- columnsCount = cells.size()
- val cellsItr = cells.iterator().asScala
- // Escape each column family and column in case of any special characters
- cellsItr.map(cell => (StringEscapeUtils.escapeJava(Bytes.toString(CellUtil.cloneFamily(cell))),
- StringEscapeUtils.escapeJava(Bytes.toString(CellUtil.cloneQualifier(cell))))).toList
- }.toList.distinct.groupBy(_._1).map(x => (x._1, x._2.map(p => p._2).toArray))
- logger.info(s"Records Count for ${tableName} : " + count)
- res
- }
- }
-
- /**
- * Returns page size based on the number of regions and maxResults size
- *
- * @param connection
- * @param table
- * @param maxResults : Number of maximum records to be scanned
- * @return Page Size
- */
- def getPageSize(connection: Connection, table: TableName, maxResults: Int): Int = {
- // Getting total region servers to decide the PageFilter size
- val regionLocator = connection.getRegionLocator(table)
- val numRegionServers = regionLocator.getAllRegionLocations().asScala.map(eachRegion => eachRegion.getHostname()).distinct.size
- if (numRegionServers == 0) {
- 0
- } else {
- Math.max(maxResults / numRegionServers, 1)
- }
- }
-
- /**
- * Returns schema of hbase table by creating a connection
- *
- * @param namespace : Name of hbase name space
- * @param tableName : Name of hbase table
- * @param maxResults : Number of maximum records to be scanned
- * @param maxColumns : Number of maximum columns to be scanned
- * @return Map of [Column Family -> Array[Columns] ]
- */
- def getSchema(namespace: String, tableName: String, maxResults: Int, maxColumns: Int): Map[String, Array[String]] = {
- val conf = HBaseConfiguration.create()
- GenericUtils.withResources(ConnectionFactory.createConnection(conf)) {
- connection =>
- getSchema(connection, namespace, tableName, maxResults, maxColumns)
- }
- }
-
- /**
- * Returns schema of hbase table by creating a connection with specified maximum number of columns
- *
- * @param namespace
- * @param tableName
- * @param maxResults : Number of maximum records to be scanned
- * @param maxColumns : Number of maximum columns to be scanned
- * @param maxResultSize : Maximum result size in bytes
- * @return Map of [Column Family -> Array[Columns] ]
- */
- def getSchema(namespace: String, tableName: String, maxResults: Int, maxColumns: Int, maxResultSize : Long): Map[String, Array[String]] = {
- val conf = HBaseConfiguration.create()
- GenericUtils.withResources(ConnectionFactory.createConnection(conf)) {
- connection =>
- getSchema(connection, namespace, tableName, maxResults, maxColumns, maxResultSize)
- }
- }
-}
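
A brief usage sketch of the schema scan above; the namespace, table, and limits are illustrative. The page size handed to the `PageFilter` is `max(maxResults / numRegionServers, 1)`, so e.g. 1000 requested records across 4 region servers gives a page size of 250.

```scala
// Hypothetical call; returns Map(columnFamily -> Array(columnNames)) from a sample scan.
val scanner = HBaseScanner()
val schema: Map[String, Array[String]] =
  scanner.getSchema(namespace = "default", tableName = "test_table",
    maxResults = 1000, maxColumns = 100000)
schema.foreach { case (cf, cols) => println(s"$cf -> ${cols.mkString(",")}") }
```
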
diff --git a/gimel-dataapi/gimel-connectors/gimel-hbase/src/main/scala/com/paypal/gimel/hbase/utilities/HBaseSparkConnector.scala b/gimel-dataapi/gimel-connectors/gimel-hbase/src/main/scala/com/paypal/gimel/hbase/utilities/HBaseSparkConnector.scala
deleted file mode 100644
index 13669183..00000000
--- a/gimel-dataapi/gimel-connectors/gimel-hbase/src/main/scala/com/paypal/gimel/hbase/utilities/HBaseSparkConnector.scala
+++ /dev/null
@@ -1,242 +0,0 @@
-/*
- * Copyright 2018 PayPal Inc.
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.paypal.gimel.hbase.utilities
-
-import org.apache.spark.sql.{DataFrame, SparkSession}
-import org.apache.spark.sql.execution.datasources.hbase.{HBaseRelation, HBaseTableCatalog}
-
-import com.paypal.gimel.common.storageadmin.HBaseAdminClient
-import com.paypal.gimel.hbase.conf.{HbaseClientConfiguration, HbaseConfigs, HbaseConstants}
-import com.paypal.gimel.logger.Logger
-
-/**
- * Hortonworks Spark HBase Connector (SHC) implementations internal to Gimel
- */
-object HBaseSparkConnector {
-
- def apply(sparkSession: SparkSession): HBaseSparkConnector = new HBaseSparkConnector(sparkSession)
-
-}
-
-class HBaseSparkConnector(sparkSession: SparkSession) {
- val logger = Logger()
- lazy val hbaseUtilities = HBaseUtilities(sparkSession)
-
- /**
- * This function performs scan/bulkGet on hbase table
- *
- * @param dataset Name
- * @param dataSetProps
- * props is the way to set various additional parameters for read and write operations in DataSet class
- * Example Usecase : to get 10 factor parallelism (specifically)
- * val props = Map("coalesceFactor" -> 10)
- * val data = Dataset(sc).read("flights", props)
- * data.coalesce(props.get("coalesceFactor"))
- * @return DataFrame
- */
- def read(dataset: String, dataSetProps: Map[String, Any] = Map.empty): DataFrame = {
- try {
-
- val conf = new HbaseClientConfiguration(dataSetProps)
- // Setting the map (Column family -> Array of columns)
- val columnFamilyToColumnMapping: Map[String, Array[String]] = hbaseUtilities.getColumnMappingForColumnFamily(conf.hbaseNameSpace,
- conf.hbaseTableName,
- conf.hbaseTableColumnMapping,
- conf.maxSampleRecordsForSchema,
- conf.maxColumnsForSchema)
- logger.info("Column mapping -> " + columnFamilyToColumnMapping)
- // Get the hbase-site.xml file location
- val hbaseConfigFileLocation = HBaseAdminClient.getHbaseSiteXml(conf.hbaseSiteXMLHDFSPath)
- // Create catalog for the SHC connector
- val catalog = HBaseCatalog(conf.hbaseNameSpace, conf.hbaseTableName, columnFamilyToColumnMapping, conf.hbaseRowKeys,
- "PrimitiveType", conf.hbaseColumnNamewithColumnFamilyAppended)
- logger.info(s"Reading with catalog --> $catalog")
-
- val dataframe = conf.hbaseSiteXMLHDFSPath match {
- case HbaseConstants.NONE_STRING =>
- readWithCatalog(catalog)
- case _ =>
- readWithCatalog(catalog, hbaseConfigFileLocation)
- }
- dataframe
- } catch {
- case ex: Throwable =>
- ex.printStackTrace()
- logger.error(s"Unable to read data from HBase table.")
- throw ex
- }
- }
-
- /**
- * This function reads data from HBase with catalog string.
- *
- * @param catalog
- * @return
- */
- private def readWithCatalog(catalog: String): DataFrame = {
- try {
- sparkSession
- .read
- .options(Map((HBaseTableCatalog.tableCatalog, catalog)))
- .format("org.apache.spark.sql.execution.datasources.hbase")
- .load()
- }
- catch {
- case ex: Throwable =>
- ex.printStackTrace()
- throw ex
- }
- }
-
- /**
- * This function reads data from HBase with catalog string.
- *
- * @param catalog
- * @param hbaseConfigFileLocation The HBASE Configuration File : hbase-site.xml
- * @return DataFrame
- */
- private def readWithCatalog(catalog: String, hbaseConfigFileLocation: String): DataFrame = {
- try {
- sparkSession
- .read
- .options(Map((HBaseTableCatalog.tableCatalog, catalog), (HBaseRelation.HBASE_CONFIGFILE, hbaseConfigFileLocation)))
- .format("org.apache.spark.sql.execution.datasources.hbase")
- .load()
- } catch {
- case ex: Throwable =>
- ex.printStackTrace()
- throw ex
- }
- }
-
- /**
- * This function performs bulk write into hbase table
- *
- * @param dataset Name
- * @param dataFrame The Dataframe to write into Target
- * @param dataSetProps
- * Example Usecase : we want only 1 executor for hbase (specifically)
- * val props = Map("coalesceFactor" -> 1)
- * Dataset(sc).write(clientDataFrame, props)
- * Inside write implementation :: dataFrame.coalesce(props.get("coalesceFactor"))
- * @return DataFrame
- */
-
- def write(dataset: String, dataFrame: DataFrame, dataSetProps: Map[String, Any]): DataFrame = {
- try {
- val conf = new HbaseClientConfiguration(dataSetProps)
-
- if (conf.hbaseRowKeys.diff(dataFrame.columns.toSeq).nonEmpty) {
- throw new IllegalArgumentException(
- s"""
- |Row Key columns not found in input dataframe.
- |You can modify the value through ${HbaseConfigs.hbaseRowKey} parameter.
- |Note: Default value is first column of the schema from UDC or ${HbaseConstants.DEFAULT_ROW_KEY_COLUMN}.
- |""".stripMargin)
- }
- // Get columns in dataframe excluding row key columns
- val dfColumns = dataFrame.columns.filter(x => !conf.hbaseRowKeys.contains(x)).toSeq
- logger.info("Columns in dataframe -> " + dfColumns)
- // Setting (Column family -> array of columns) mapping
- val columnFamilyToColumnMapping: Map[String, Array[String]] = hbaseUtilities.getColumnMappingForColumnFamily(conf.hbaseNameSpace,
- conf.hbaseTableName,
- conf.hbaseTableColumnMapping,
- conf.maxSampleRecordsForSchema,
- conf.maxColumnsForSchema)
- logger.info("Column mapping -> " + columnFamilyToColumnMapping)
- val columnsInSchema = columnFamilyToColumnMapping.map(_._2).flatten.toSeq
- logger.info("Columns in schema : " + columnsInSchema)
- // Check what columns in the input hbase column mapping are not present in the input dataframe
- val diff = columnsInSchema.diff(dfColumns)
- if (diff.nonEmpty) {
- throw new IllegalArgumentException(
- s"""
- |Columns : ${diff.mkString(",")} not found in dataframe schema.
- |Please check the property : ${HbaseConfigs.hbaseColumnMappingKey} = ${conf.hbaseTableColumnMapping}
- |""".stripMargin
- )
- }
- // Select columns provided in the gimel.hbase.columns.mapping property and row keys from the input dataframe.
- val dataFrameToWrite = dataFrame.selectExpr(columnsInSchema ++ conf.hbaseRowKeys: _*)
- // Get the hbase-site.xml file location
- val hbaseConfigFileLocation = HBaseAdminClient.getHbaseSiteXml(conf.hbaseSiteXMLHDFSPath)
- // Create catalog for the SHC connector
- val catalog = HBaseCatalog(conf.hbaseNameSpace, conf.hbaseTableName, columnFamilyToColumnMapping, conf.hbaseRowKeys, "PrimitiveType", false)
- logger.info(s"Writing with catalog --> $catalog")
- conf.hbaseSiteXMLHDFSPath match {
- case HbaseConstants.NONE_STRING =>
- logger.info(s"PLAIN WRITE")
- writeWithCatalog(dataFrameToWrite, catalog)
- case _ =>
- logger.info(s"write with ${conf.hbaseSiteXMLHDFSPath}")
- writeWithCatalog(dataFrameToWrite, catalog, hbaseConfigFileLocation)
- }
- dataFrame
- } catch {
- case ex: Throwable =>
- ex.printStackTrace()
- logger.error(s"Unable to write data to HBase table.")
- throw ex
- }
- }
-
- /**
- * This function writes data to HBase with catalog string.
- *
- * @param dataFrame DataFrame to write to HBase.
- * @param hbaseConfigFileLocation The HBASE Configuration File : hbase-site.xml
- * @param catalog Catalog string holding the schema for the HBase table.
- */
- // The HBaseTableCatalog.newTable property needs to be set as a default parameter because the SHC connector expects this argument, but it does not create the table again (SHC issue: https://github.com/hortonworks-spark/shc/issues/151). If we take the master branch and build it, we don't need this parameter.
- private def writeWithCatalog(dataFrame: DataFrame, catalog: String, hbaseConfigFileLocation: String) = {
- try {
- dataFrame
- .write
- .options(Map((HBaseTableCatalog.tableCatalog, catalog), (HBaseTableCatalog.newTable, "5"), (HBaseRelation.HBASE_CONFIGFILE, hbaseConfigFileLocation)))
- .format("org.apache.spark.sql.execution.datasources.hbase")
- .save()
- } catch {
- case ex: Throwable =>
- ex.printStackTrace()
- throw ex
- }
- }
-
- /**
- * This function writes data to HBase with catalog string.
- *
- * @param dataFrame DataFrame to write to HBase.
- * @param catalog Catalog string holding the schema for the HBase table.
- */
- // The HBaseTableCatalog.newTable property needs to be set as a default parameter because the SHC connector expects this argument, but it does not create the table again (SHC issue: https://github.com/hortonworks-spark/shc/issues/151). If we take the master branch and build it, we don't need this parameter.
- private def writeWithCatalog(dataFrame: DataFrame, catalog: String) = {
- try {
- dataFrame
- .write
- .options(Map((HBaseTableCatalog.tableCatalog, catalog), (HBaseTableCatalog.newTable, "5")))
- .format("org.apache.spark.sql.execution.datasources.hbase")
- .save()
- } catch {
- case ex: Throwable =>
- ex.printStackTrace()
- throw ex
- }
- }
-}
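
A hedged sketch of pointing the SHC read/write above at a specific hbase-site.xml: the path is read from the dataset-level props (`DataSetProperties.props`), and any value other than "NONE" is resolved and passed to the connector as `HBaseRelation.HBASE_CONFIGFILE`. Names and paths are placeholders.

```scala
import com.paypal.gimel.common.catalog.DataSetProperties
import com.paypal.gimel.hbase.conf.HbaseConfigs

val tableProps: Map[String, String] = Map(
  HbaseConfigs.hbaseTableKey           -> "default:test_table",
  HbaseConfigs.hbaseRowKey             -> "id",
  HbaseConfigs.hbaseColumnMappingKey   -> "personal:name,personal:age",
  HbaseConfigs.hbaseSiteXMLHDFSPathKey -> "hdfs:///config/hbase/hbase-site.xml") // hypothetical path
val dataSetProperties = DataSetProperties("HBase.Local.default.test_table", null, null, tableProps)

val df = dataSet.read("HBase.Local.default.test_table", Map("dataSetProperties" -> dataSetProperties))
```
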
diff --git a/gimel-dataapi/gimel-connectors/gimel-hbase/src/main/scala/com/paypal/gimel/hbase/utilities/HBaseUtilities.scala b/gimel-dataapi/gimel-connectors/gimel-hbase/src/main/scala/com/paypal/gimel/hbase/utilities/HBaseUtilities.scala
deleted file mode 100644
index 7a91bf71..00000000
--- a/gimel-dataapi/gimel-connectors/gimel-hbase/src/main/scala/com/paypal/gimel/hbase/utilities/HBaseUtilities.scala
+++ /dev/null
@@ -1,151 +0,0 @@
-/*
- * Copyright 2018 PayPal Inc.
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.paypal.gimel.hbase.utilities
-
-import org.apache.commons.lang.StringEscapeUtils
-import org.apache.spark.sql.{DataFrame, SparkSession}
-import org.apache.spark.sql.functions._
-import org.apache.spark.sql.types.StringType
-
-import com.paypal.gimel.common.catalog.DataSetProperties
-import com.paypal.gimel.common.conf.GimelConstants
-import com.paypal.gimel.common.security.AuthHandler
-import com.paypal.gimel.hbase.conf.HbaseConfigs
-import com.paypal.gimel.logger.Logger
-
-/**
- * HBASE implementations internal to Gimel
- */
-object HBaseUtilities {
-
- def apply(sparkSession: SparkSession): HBaseUtilities = new HBaseUtilities(sparkSession)
-
-}
-
-class HBaseUtilities(sparkSession: SparkSession) {
- val logger = Logger()
- val columnFamilyNamePattern = "(.+):(.+)".r
- lazy val hbaseScanner = HBaseScanner()
-
- /**
- *
- * @param dataFrame DataFrame to cast all columns to string format.
- * @return DataFrame with all string data.
- */
- def castAllColsToString(dataFrame: DataFrame): DataFrame = {
- def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName
- logger.info(" @Begin --> " + MethodName)
-
- logger.info("Casting All Columns as String")
- val k = dataFrame.schema.fieldNames.foldRight(dataFrame) {
- (column: String, df: DataFrame) => df.withColumn(column, df(column).cast(StringType))
- }
- logger.info("Coalescing All Columns with Null Values to Empty String")
- val returningDF = k.schema.fieldNames.foldRight(k) {
- (fieldName: String, df: DataFrame) => df.withColumn(fieldName, coalesce(df(fieldName), lit("")))
- }
- logger.info("Done with Column Coalesce operation")
- returningDF
- }
-
- /**
- * This function scans the sample records from hbase table if column mapping parameter is empty
- *
- * @param namespace String HBase Namespace Name
- * @param tableName String HBase Table Name
- * @param tableColumnMapping String (:key,cf1:c1,cf1:c2,cf2:c3)
- * @return
- */
-
- def getColumnMappingForColumnFamily(namespace: String, tableName: String, tableColumnMapping: String, maxRecords: Int, maxColumns: Int): Map[String, Array[String]] = {
- val schema = getColumnMappingForColumnFamily(tableColumnMapping)
- if (schema.isEmpty) {
- logger.info("Column family to column mapping is not present or is in wrong format, scanning the sample records.")
- val schemaFromSampleRecords = hbaseScanner.getSchema(namespace, tableName, maxRecords, maxColumns)
- if (schemaFromSampleRecords.isEmpty) {
- throw new IllegalStateException("No columns found while scanning. Maybe the table is empty.")
- }
- schemaFromSampleRecords
- } else {
- schema
- }
- }
-
- /**
- * This function performs Table Column Mapping
- *
- * @param tableColumnMapping String (:key,cf1:c1,cf1:c2,cf2:c3)
- * @return
- */
-
- def getColumnMappingForColumnFamily(tableColumnMapping: String): Map[String, Array[String]] = {
- // Remove the ":key" entry from the column mapping, if present
- val indexOfKey: Int = tableColumnMapping.split(",").indexOf(":key")
- val updateMapping = if (indexOfKey != -1) {
- val mappingBuffer = tableColumnMapping.split(",").toBuffer
- mappingBuffer.remove(indexOfKey)
- mappingBuffer.toArray.mkString(",")
- } else {
- tableColumnMapping
- }
-
- try {
- // checking if CF Mapping matches the pattern
- val columnMapping = updateMapping.split(",").flatMap {
- case columnFamilyNamePattern(cf, cname) => Some((StringEscapeUtils.escapeJava(cf), StringEscapeUtils.escapeJava(cname)))
- case _ => throw new IllegalArgumentException(
- s"""
- |Column family to column mapping pattern is not correct -> ${tableColumnMapping}
- |Please check the property ${HbaseConfigs.hbaseColumnMappingKey}, it should be in format -> cf1:c1,cf1:c2,cf2:c3
- |""".stripMargin)
- }.groupBy(_._1).map { case (k, v) => (k, v.map(_._2)) }
- columnMapping
- } catch {
- case ex: IllegalArgumentException =>
- logger.warning(ex.getMessage)
- Map.empty[String, Array[String]]
- }
- }
-
- /**
- * Authenticate Read/Write with HBASE Policies
- *
- * @param dataset
- * @param operation
- * @param dataSetProps
- */
- def authenticateThroughRangerPolicies(dataset: String, operation: String, dataSetProps: Map[String, Any]): Unit = {
- def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName
- logger.info(" @Begin --> " + MethodName)
-
- val datasetProps: DataSetProperties = dataSetProps(GimelConstants.DATASET_PROPS).asInstanceOf[DataSetProperties]
- val tableProperties = datasetProps.props
- val hbaseTable = dataSetProps.getOrElse(HbaseConfigs.hbaseTableKey, tableProperties.getOrElse(HbaseConfigs.hbaseTableKey, "")).asInstanceOf[String]
- val hbaseNameSpace = dataSetProps.getOrElse(GimelConstants.HBASE_NAMESPACE, tableProperties.getOrElse(GimelConstants.HBASE_NAMESPACE, "default")).asInstanceOf[String]
- val hbaseTableName = hbaseTable.split(":")(1)
- val hBaseNameSpaceAndTable = hbaseNameSpace + ":" + hbaseTableName
- val clusterName = com.paypal.gimel.common.utilities.DataSetUtils.getYarnClusterName()
- logger.info("hBaseNameSpaceAndTable and clusterName" + hBaseNameSpaceAndTable + clusterName)
- val currentUser = datasetProps.props.getOrElse(GimelConstants.GTS_USER_CONFIG, sparkSession.sparkContext.sparkUser)
- if (AuthHandler.isAuthRequired(sparkSession)) {
- AuthHandler.authenticateHbasePolicy(currentUser, operation, hBaseNameSpaceAndTable, dataset, clusterName, dataSetProps)
- }
- }
-}
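
To illustrate the column-mapping parser above, a small hedged example of its input and output; the column names are made up and `sparkSession` is assumed to exist.

```scala
val utils = HBaseUtilities(sparkSession)

// ":key" entries are dropped; the rest is grouped by column family.
val mapping: Map[String, Array[String]] =
  utils.getColumnMappingForColumnFamily(":key,personal:name,personal:age,professional:company")

// mapping("personal")     => Array("name", "age")
// mapping("professional") => Array("company")
// A string that does not match "cf:col" for every entry logs a warning and yields an empty map.
```
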
diff --git a/gimel-dataapi/gimel-connectors/gimel-hbase/src/test/scala/com/paypal/gimel/hbase/DataSetTest.scala b/gimel-dataapi/gimel-connectors/gimel-hbase/src/test/scala/com/paypal/gimel/hbase/DataSetTest.scala
deleted file mode 100644
index f2eee94b..00000000
--- a/gimel-dataapi/gimel-connectors/gimel-hbase/src/test/scala/com/paypal/gimel/hbase/DataSetTest.scala
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * Copyright 2018 PayPal Inc.
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.paypal.gimel.hbase
-
-import org.scalatest._
-
-import com.paypal.gimel.common.catalog.DataSetProperties
-import com.paypal.gimel.hbase.conf.HbaseConfigs
-import com.paypal.gimel.hbase.utilities.HBaseLocalClient
-
-class DataSetTest extends HBaseLocalClient with Matchers {
- test("Write operation") {
- val props : Map[String, String] = Map(HbaseConfigs.hbaseNamespaceKey -> "default",
- HbaseConfigs.hbaseTableKey -> s"""$tableName""",
- HbaseConfigs.hbaseRowKey -> "id",
- HbaseConfigs.hbaseColumnMappingKey -> "personal:name,personal:address,personal:age,professional:company,professional:designation,professional:salary")
- val dataSetName = "HBase.Local.default." + tableName
- val dataSetProperties = DataSetProperties(dataSetName, null, null, props)
- val datasetProps : Map[String, Any] = Map("dataSetProperties" -> dataSetProperties)
- val dataFrame = mockDataInDataFrame(10)
- dataFrame.show(1)
- val df = dataSet.write(dataSetName, dataFrame, datasetProps)
- assert(df.count() == 10)
- }
-
- test("Read operation") {
- val props : Map[String, String] = Map(HbaseConfigs.hbaseNamespaceKey -> "default",
- HbaseConfigs.hbaseTableKey -> s"""$tableName""",
- HbaseConfigs.hbaseRowKey -> "id",
- HbaseConfigs.hbaseColumnMappingKey -> "personal:name,personal:address,personal:age,professional:company,professional:designation,professional:salary")
- val dataSetName = "HBase.Local.default." + tableName
- val dataSetProperties = DataSetProperties(dataSetName, null, null, props)
- val datasetProps : Map[String, Any] = Map("dataSetProperties" -> dataSetProperties)
- val df = dataSet.read(dataSetName, datasetProps)
- df.show(1)
- assert(df.count() == 10)
- }
-
- test("Write operation - column given in input via " + HbaseConfigs.hbaseColumnMappingKey + " not present in dataframe to write") {
- val props : Map[String, String] = Map(HbaseConfigs.hbaseNamespaceKey -> "default",
- HbaseConfigs.hbaseTableKey -> s"""$tableName""",
- HbaseConfigs.hbaseRowKey -> "id",
- HbaseConfigs.hbaseColumnMappingKey -> "personal:name,personal:address,personal:age,professional:company,professional:designation,professional:manager,professional:comp")
- val dataSetName = "HBase.Local.default." + tableName
- val dataSetProperties = DataSetProperties(dataSetName, null, null, props)
- val datasetProps : Map[String, Any] = Map("dataSetProperties" -> dataSetProperties)
- val dataFrame = mockDataInDataFrame(10)
- dataFrame.show(1)
- val exception = intercept[Exception] {
- dataSet.write(dataSetName, dataFrame, datasetProps)
- }
- assert(exception.getMessage.contains("Columns : manager,comp not found in dataframe schema"))
- }
-
- test("Read operation - select specific columns") {
- val props : Map[String, String] = Map(HbaseConfigs.hbaseNamespaceKey -> "default",
- HbaseConfigs.hbaseTableKey -> s"""$tableName""",
- HbaseConfigs.hbaseRowKey -> "id",
- HbaseConfigs.hbaseColumnMappingKey -> "personal:name,personal:address,professional:company")
- val dataSetName = "HBase.Local.default." + tableName
- val dataSetProperties = DataSetProperties(dataSetName, null, null, props)
- val datasetProps : Map[String, Any] = Map("dataSetProperties" -> dataSetProperties)
- val df = dataSet.read(dataSetName, datasetProps)
- df.show(1)
- assert(df.columns.sameElements(Array("id", "company", "name", "address")))
- }
-
- test("Read operation - same column in 2 column families") {
- val props : Map[String, String] = Map(HbaseConfigs.hbaseNamespaceKey -> "default",
- HbaseConfigs.hbaseTableKey -> s"""$tableName""",
- HbaseConfigs.hbaseRowKey -> "id",
- HbaseConfigs.hbaseColumnMappingKey -> "personal:name,personal:address,professional:name",
- HbaseConfigs.hbaseColumnNamewithColumnFamilyAppended -> "true")
- val dataSetName = "HBase.Local.default." + tableName
- val dataSetProperties = DataSetProperties(dataSetName, null, null, props)
- val datasetProps : Map[String, Any] = Map("dataSetProperties" -> dataSetProperties)
- val df = dataSet.read(dataSetName, datasetProps)
- df.show(1)
- assert(df.columns.sameElements(Array("rowkey_id", "professional_name", "personal_name", "personal_address")))
- }
-}
diff --git a/gimel-dataapi/gimel-connectors/gimel-hbase/src/test/scala/com/paypal/gimel/hbase/utilities/HBaseCatalogTest.scala b/gimel-dataapi/gimel-connectors/gimel-hbase/src/test/scala/com/paypal/gimel/hbase/utilities/HBaseCatalogTest.scala
deleted file mode 100644
index cc768df3..00000000
--- a/gimel-dataapi/gimel-connectors/gimel-hbase/src/test/scala/com/paypal/gimel/hbase/utilities/HBaseCatalogTest.scala
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Copyright 2018 PayPal Inc.
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.paypal.gimel.hbase.utilities
-
-import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
-import org.scalatest.{FunSpec, Matchers}
-
-class HBaseCatalogTest extends FunSpec with Matchers {
-
- describe("fieldsAsStringForCataLogAppendColumnFamily") {
- it ("should create json of fields with type as string for Catalog with column Family appended with Column Name") {
- HBaseCatalog.fieldsAsStringForCataLogAppendColumnFamily(columnsList, "cf1") should be (
- s""""cf1_c1":{"cf":"cf1", "col":"c1", "type":"string"},
- |"cf1_c2":{"cf":"cf1", "col":"c2", "type":"string"},
- |"cf1_c3":{"cf":"cf1", "col":"c3", "type":"string"}""".stripMargin)
-
- HBaseCatalog.fieldsAsStringForCataLogAppendColumnFamily(keyList, "rowkey") should be (
- s""""rowkey_key1":{"cf":"rowkey", "col":"key1", "type":"string", "length":"50"},
- |"rowkey_key2":{"cf":"rowkey", "col":"key2", "type":"string", "length":"50"}""".stripMargin)
- }
- }
-
- describe("fieldsAsStringForCataLog") {
- it ("should create json of fields with type as string for Catalog") {
- HBaseCatalog.fieldsAsStringForCataLog(columnsList, "cf1") should be (
- s""""c1":{"cf":"cf1", "col":"c1", "type":"string"},
- |"c2":{"cf":"cf1", "col":"c2", "type":"string"},
- |"c3":{"cf":"cf1", "col":"c3", "type":"string"}""".stripMargin)
-
- HBaseCatalog.fieldsAsStringForCataLog(keyList, "rowkey") should be (
- s""""key1":{"cf":"rowkey", "col":"key1", "type":"string", "length":"50"},
- |"key2":{"cf":"rowkey", "col":"key2", "type":"string", "length":"50"}""".stripMargin)
- }
- }
-
- describe("HBaseCatalog") {
- it ("should create a catalog string with one column family and df columns array for shc connector") {
- HBaseCatalog("namespace", "tablename", columnsList, keyList, "cf1") should be
- s"""{"table":{"namespace":"namespace", "name":"tablename", "tableCoder":"PrimitiveType"},
- |"rowkey":"key1:key2",
- |"columns":{
- |"key1":{"cf":"rowkey", "col":"key1", "type":"string", "length":"50"},
- |"key2":{"cf":"rowkey", "col":"key2", "type":"string", "length":"50"},
- |"c1":{"cf":"cf1", "col":"c1", "type":"string"},
- |"c2":{"cf":"cf1", "col":"c2", "type":"string"},
- |"c3":{"cf":"cf1", "col":"c3", "type":"string"}
- |}
- |}
- |""".stripMargin
- }
-
- it ("should create a catalog string with one column family and df schema for shc connector") {
- HBaseCatalog("namespace", "tablename", schema, keyList, "cf1", "PrimitiveType") should be
- s"""{"table":{"namespace":"namespace", "name":"tablename", "tableCoder":"PrimitiveType"},
- |"rowkey":"key1:key2",
- |"columns":{
- |"key1":{"cf":"rowkey", "col":"key1", "type":"string", "length":"50"},
- |"key2":{"cf":"rowkey", "col":"key2", "type":"string", "length":"50"},
- |"num":{"cf":"cf1", "col":"num", "type":"string"},
- |"letter":{"cf":"cf1", "col":"letter", "type":"string"}
- |}
- |}
- |""".stripMargin
- }
-
- it ("should create a catalog string with multiple column families for shc connector") {
- // With column family appended
- HBaseCatalog("namespace", "tablename", columnFamilyToColumnMapping, keyList, "PrimitiveType", true) should be
- s"""{"table":{"namespace":"namespace", "name":"tablename", "tableCoder":"PrimitiveType"},
- |"rowkey":"key1:key2",
- |"columns":{
- |"rowkey_key1":{"cf":"rowkey", "col":"key1", "type":"string", "length":"50"},
- |"rowkey_key2":{"cf":"rowkey", "col":"key2", "type":"string", "length":"50"},
- |"cf1_c11":{"cf":"cf1", "col":"c11", "type":"string"},
- |"cf1_c12":{"cf":"cf1", "col":"c12", "type":"string"},
- |"cf2_c21":{"cf":"cf2", "col":"c21", "type":"string"},
- |"cf2_c22":{"cf":"cf2", "col":"c22", "type":"string"}
- |}
- |}
- |""".stripMargin
-
- // Without column family appended
- HBaseCatalog("namespace", "tablename", columnFamilyToColumnMapping, keyList, "PrimitiveType", false) should be
- s"""{"table":{"namespace":"namespace", "name":"tablename", "tableCoder":"PrimitiveType"},
- |"rowkey":"key1:key2",
- |"columns":{
- |"key1":{"cf":"rowkey", "col":"key1", "type":"string", "length":"50"},
- |"key2":{"cf":"rowkey", "col":"key2", "type":"string", "length":"50"},
- |"c11":{"cf":"cf1", "col":"c11", "type":"string"},
- |"c12":{"cf":"cf1", "col":"c12", "type":"string"},
- |"c21":{"cf":"cf2", "col":"c21", "type":"string"},
- |"c22":{"cf":"cf2", "col":"c22", "type":"string"}
- |}
- |}
- |""".stripMargin
-
- }
- }
-
- val schema: StructType = StructType(
- List(
- StructField("num", IntegerType, true),
- StructField("letter", StringType, true)
- )
- )
-
- val columnFamilyToColumnMapping = Map("cf1" -> Array("c11", "c12"),
- "cf2" -> Array("c21", "c22"))
-
- val keyList = Array("key1", "key2")
-
- val columnsList = Array("c1", "c2", "c3")
-}
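
The catalog JSON built by HBaseCatalog above follows the layout consumed by the SHC (Spark-HBase connector) data source. A minimal sketch of how such a catalog is typically handed to that connector; the SparkSession, table and column names here are illustrative, and Gimel's own plumbing may wire this differently:

  import org.apache.spark.sql.{DataFrame, SparkSession}
  import org.apache.spark.sql.execution.datasources.hbase.HBaseTableCatalog

  import com.paypal.gimel.hbase.utilities.HBaseCatalog

  val spark: SparkSession = SparkSession.builder().master("local").appName("shc-sketch").getOrCreate()

  // Single column family "cf1", row key column "id" (illustrative values).
  val catalog: String = HBaseCatalog("default", "test_table", Array("c1", "c2", "c3"), Array("id"), "cf1")

  // Standard SHC read path: the catalog JSON travels under HBaseTableCatalog.tableCatalog.
  val df: DataFrame = spark.read
    .options(Map(HBaseTableCatalog.tableCatalog -> catalog))
    .format("org.apache.spark.sql.execution.datasources.hbase")
    .load()
  df.show(5)
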
diff --git a/gimel-dataapi/gimel-connectors/gimel-hbase/src/test/scala/com/paypal/gimel/hbase/utilities/HBaseLocalClient.scala b/gimel-dataapi/gimel-connectors/gimel-hbase/src/test/scala/com/paypal/gimel/hbase/utilities/HBaseLocalClient.scala
deleted file mode 100644
index e8945f86..00000000
--- a/gimel-dataapi/gimel-connectors/gimel-hbase/src/test/scala/com/paypal/gimel/hbase/utilities/HBaseLocalClient.scala
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * Copyright 2018 PayPal Inc.
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.paypal.gimel.hbase.utilities
-
-import java.io.File
-
-import scala.collection.mutable.ArrayBuffer
-
-import com.google.common.io.Files
-import org.apache.hadoop.hbase.{HBaseTestingUtility, TableName}
-import org.apache.hadoop.hbase.util.Bytes
-import org.apache.spark.SparkConf
-import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.{DataFrame, SparkSession}
-import org.apache.spark.sql.execution.QueryExecution
-import org.apache.spark.sql.execution.datasources.hbase.SparkHBaseConf
-import org.apache.spark.sql.util._
-import org.scalatest.{BeforeAndAfterAll, FunSuite, Matchers}
-
-import com.paypal.gimel.common.catalog.Field
-import com.paypal.gimel.hbase.DataSet
-
-class HBaseLocalClient extends FunSuite with Matchers with BeforeAndAfterAll {
-
- var sparkSession : SparkSession = _
- var dataSet: DataSet = _
- val hbaseTestingUtility = new HBaseTestingUtility()
- val tableName = "test_table"
- val cfs = Array("personal", "professional")
- val columns = Array("id", "name", "age", "address", "company", "designation", "salary")
- val fields = columns.map(col => new Field(col))
-
- val metrics = ArrayBuffer.empty[(String, QueryExecution, Long)]
-
- protected override def beforeAll(): Unit = {
- val tempDir: File = Files.createTempDir
- tempDir.deleteOnExit
- hbaseTestingUtility.startMiniCluster()
- SparkHBaseConf.conf = hbaseTestingUtility.getConfiguration
- createTable(tableName, cfs)
- val conf = new SparkConf
- conf.set(SparkHBaseConf.testConf, "true")
- sparkSession = SparkSession.builder()
- .master("local")
- .appName("HBase Test")
- .config(conf)
- .getOrCreate()
-
- val listener = new QueryExecutionListener {
- // Only test successful case here, so no need to implement `onFailure`
- override def onFailure(funcName: String, qe: QueryExecution, exception: Exception): Unit = {}
- override def onSuccess(funcName: String, qe: QueryExecution, duration: Long): Unit = {
- metrics += ((funcName, qe, duration))
- }
- }
- sparkSession.listenerManager.register(listener)
- sparkSession.sparkContext.setLogLevel("ERROR")
- dataSet = new DataSet(sparkSession)
- }
-
- protected override def afterAll(): Unit = {
- hbaseTestingUtility.shutdownMiniCluster()
- sparkSession.close()
- }
-
- def createTable(name: String, cfs: Array[String]) {
- val tName = Bytes.toBytes(name)
- val bcfs = cfs.map(Bytes.toBytes(_))
- try {
- hbaseTestingUtility.deleteTable(TableName.valueOf(tName))
- } catch {
- case _ : Throwable =>
- println("No table = " + name + " found")
- }
- hbaseTestingUtility.createMultiRegionTable(TableName.valueOf(tName), bcfs)
- }
-
- // Mocks data for testing
- def mockDataInDataFrame(numberOfRows: Int): DataFrame = {
- def stringed(n: Int) = s"""{"id": "$n","name": "MAC-$n", "address": "MAC-${n + 1}", "age": "${n + 1}", "company": "MAC-$n", "designation": "MAC-$n", "salary": "${n * 10000}" }"""
- val texts: Seq[String] = (1 to numberOfRows).map { x => stringed(x) }
- val rdd: RDD[String] = sparkSession.sparkContext.parallelize(texts)
- val dataFrame: DataFrame = sparkSession.read.json(rdd)
- dataFrame
- }
-}
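
HBaseLocalClient is the shared fixture for the suites that follow: it starts an HBaseTestingUtility mini-cluster, pre-creates the test table, exposes dataSet, tableName and mockDataInDataFrame, and records every (funcName, QueryExecution, duration) triple in the metrics buffer. A minimal sketch of a new suite built on top of it (the suite name and column mapping are illustrative):

  package com.paypal.gimel.hbase.utilities

  import org.scalatest.Matchers

  import com.paypal.gimel.common.catalog.DataSetProperties
  import com.paypal.gimel.hbase.conf.HbaseConfigs

  class HBaseRoundTripSketch extends HBaseLocalClient with Matchers {
    test("write then read back the mocked rows") {
      val props = Map(
        HbaseConfigs.hbaseNamespaceKey -> "default",
        HbaseConfigs.hbaseTableKey -> tableName,
        HbaseConfigs.hbaseRowKey -> "id",
        HbaseConfigs.hbaseColumnMappingKey -> "personal:name,personal:age")
      val dataSetName = "HBase.Local.default." + tableName
      val datasetProps: Map[String, Any] =
        Map("dataSetProperties" -> DataSetProperties(dataSetName, null, null, props))

      dataSet.write(dataSetName, mockDataInDataFrame(5), datasetProps)
      assert(dataSet.read(dataSetName, datasetProps).count() == 5)
    }
  }
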
diff --git a/gimel-dataapi/gimel-connectors/gimel-hbase/src/test/scala/com/paypal/gimel/hbase/utilities/HBaseLookUpTest.scala b/gimel-dataapi/gimel-connectors/gimel-hbase/src/test/scala/com/paypal/gimel/hbase/utilities/HBaseLookUpTest.scala
deleted file mode 100644
index 63dd6511..00000000
--- a/gimel-dataapi/gimel-connectors/gimel-hbase/src/test/scala/com/paypal/gimel/hbase/utilities/HBaseLookUpTest.scala
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * Copyright 2018 PayPal Inc.
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.paypal.gimel.hbase.utilities
-
-import org.scalatest.{BeforeAndAfterAll, Matchers}
-
-import com.paypal.gimel.common.catalog.DataSetProperties
-import com.paypal.gimel.hbase.conf.HbaseConfigs
-
-class HBaseLookUpTest extends HBaseLocalClient with Matchers with BeforeAndAfterAll {
- ignore ("get") {
- val props : Map[String, String] = Map(HbaseConfigs.hbaseNamespaceKey -> "default",
- HbaseConfigs.hbaseTableKey -> s"""$tableName""",
- HbaseConfigs.hbaseRowKey -> "id",
- HbaseConfigs.hbaseFilter -> "rowKey=10",
- HbaseConfigs.hbaseColumnMappingKey -> "personal:name,personal:address,personal:age,professional:company,professional:designation,professional:salary",
- HbaseConfigs.hbaseOperation -> "get")
- val dataSetName = "HBase.Local.default." + tableName
- val dataSetProperties = DataSetProperties(dataSetName, null, null, props)
- val datasetProps : Map[String, Any] = Map("dataSetProperties" -> dataSetProperties)
- val dataFrame = mockDataInDataFrame(10)
- dataFrame.show(1)
- val df = HBasePut(sparkSession).put(dataSetName, dataFrame, datasetProps)
- val dfLookUp = HBaseLookUp(sparkSession).get(dataSetName, datasetProps)
- dfLookUp.show
- }
-}
diff --git a/gimel-dataapi/gimel-connectors/gimel-hbase/src/test/scala/com/paypal/gimel/hbase/utilities/HBasePutTest.scala b/gimel-dataapi/gimel-connectors/gimel-hbase/src/test/scala/com/paypal/gimel/hbase/utilities/HBasePutTest.scala
deleted file mode 100644
index 04800f50..00000000
--- a/gimel-dataapi/gimel-connectors/gimel-hbase/src/test/scala/com/paypal/gimel/hbase/utilities/HBasePutTest.scala
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Copyright 2018 PayPal Inc.
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.paypal.gimel.hbase.utilities
-
-import org.scalatest.{BeforeAndAfterAll, Matchers}
-
-import com.paypal.gimel.common.catalog.DataSetProperties
-import com.paypal.gimel.hbase.conf.HbaseConfigs
-
-class HBasePutTest extends HBaseLocalClient with Matchers with BeforeAndAfterAll {
- ignore ("put") {
- val props : Map[String, String] = Map(HbaseConfigs.hbaseNamespaceKey -> "default",
- HbaseConfigs.hbaseTableKey -> s"""$tableName""",
- HbaseConfigs.hbaseRowKey -> "id",
- HbaseConfigs.hbaseColumnMappingKey -> "personal:name,personal:address,personal:age,professional:company,professional:designation,professional:salary",
- HbaseConfigs.hbaseOperation -> "put")
- val dataSetName = "HBase.Local.default." + tableName
- val dataSetProperties = DataSetProperties(dataSetName, null, null, props)
- val datasetProps : Map[String, Any] = Map("dataSetProperties" -> dataSetProperties)
- val dataFrame = mockDataInDataFrame(10)
- dataFrame.show(1)
- val df = HBasePut(sparkSession).put(dataSetName, dataFrame, datasetProps)
- assert(df.count() == 10)
- }
-}
diff --git a/gimel-dataapi/gimel-connectors/gimel-hbase/src/test/scala/com/paypal/gimel/hbase/utilities/HBaseScannerTest.scala b/gimel-dataapi/gimel-connectors/gimel-hbase/src/test/scala/com/paypal/gimel/hbase/utilities/HBaseScannerTest.scala
deleted file mode 100644
index ba62c4c1..00000000
--- a/gimel-dataapi/gimel-connectors/gimel-hbase/src/test/scala/com/paypal/gimel/hbase/utilities/HBaseScannerTest.scala
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright 2018 PayPal Inc.
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.paypal.gimel.hbase.utilities
-
-import org.scalatest.{BeforeAndAfterAll, Matchers}
-
-import com.paypal.gimel.common.catalog.DataSetProperties
-import com.paypal.gimel.hbase.conf.HbaseConfigs
-
-class HBaseScannerTest extends HBaseLocalClient with Matchers with BeforeAndAfterAll {
- ignore("getSchema") {
- val props : Map[String, String] = Map(HbaseConfigs.hbaseNamespaceKey -> "default",
- HbaseConfigs.hbaseTableKey -> s"""$tableName""",
- HbaseConfigs.hbaseRowKey -> "id",
- HbaseConfigs.hbaseColumnMappingKey -> "personal:name,personal:address,personal:age,professional:company,professional:designation,professional:salary")
- val dataSetName = "HBase.Local.default." + tableName
- val dataSetProperties = DataSetProperties(dataSetName, null, null, props)
- val datasetProps : Map[String, Any] = Map("dataSetProperties" -> dataSetProperties)
- val dataFrame = mockDataInDataFrame(1000)
- dataFrame.show(1)
- HBaseSparkConnector(sparkSession).write(dataSetName, dataFrame, datasetProps)
- val schema = HBaseScanner().getSchema("default", tableName, 100, 100000)
- println(schema)
- assert(schema.keys.sameElements(cfs))
- assert(schema("personal").sameElements(Array("name", "age", "address")))
- assert(schema("professional").sameElements(Array("company", "designation", "salary")))
- }
-}
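
getSchema samples the table and returns a column-family-to-columns map (Map[String, Array[String]], as the assertions above imply). A small sketch, under that assumption, of turning the result into the comma-separated "cf:column" mapping string used with HbaseConfigs.hbaseColumnMappingKey in the other suites:

  // Illustrative helper; the schema shape is inferred from the assertions above.
  def toColumnMapping(schema: Map[String, Array[String]]): String =
    schema.toSeq
      .sortBy { case (cf, _) => cf }
      .flatMap { case (cf, cols) => cols.map(col => s"$cf:$col") }
      .mkString(",")

  // toColumnMapping(Map("personal" -> Array("name", "age"), "professional" -> Array("company")))
  //   => "personal:name,personal:age,professional:company"
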
diff --git a/gimel-dataapi/gimel-connectors/gimel-hbase/src/test/scala/com/paypal/gimel/hbase/utilities/HBaseSparkConnectorTest.scala b/gimel-dataapi/gimel-connectors/gimel-hbase/src/test/scala/com/paypal/gimel/hbase/utilities/HBaseSparkConnectorTest.scala
deleted file mode 100644
index 657e7fef..00000000
--- a/gimel-dataapi/gimel-connectors/gimel-hbase/src/test/scala/com/paypal/gimel/hbase/utilities/HBaseSparkConnectorTest.scala
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Copyright 2018 PayPal Inc.
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.paypal.gimel.hbase.utilities
-
-import org.scalatest.{BeforeAndAfterAll, Matchers}
-
-import com.paypal.gimel.common.catalog.DataSetProperties
-import com.paypal.gimel.common.conf.GimelConstants
-import com.paypal.gimel.hbase.conf.HbaseConfigs
-
-class HBaseSparkConnectorTest extends HBaseLocalClient with Matchers with BeforeAndAfterAll {
-
- test("write operation") {
- val props : Map[String, String] = Map(HbaseConfigs.hbaseNamespaceKey -> "default",
- HbaseConfigs.hbaseTableKey -> s"""$tableName""",
- HbaseConfigs.hbaseRowKey -> "id",
- HbaseConfigs.hbaseColumnMappingKey -> "personal:name,personal:address,personal:age,professional:company,professional:designation,professional:salary")
- val dataSetName = "HBase.Local.default." + tableName
- val dataSetProperties = DataSetProperties(dataSetName, null, null, props)
- val datasetProps : Map[String, Any] = Map("dataSetProperties" -> dataSetProperties)
- val dataFrame = mockDataInDataFrame(1000)
- dataFrame.show(1)
- val df = HBaseSparkConnector(sparkSession).write(dataSetName, dataFrame, datasetProps)
- assert(df.count() == 1000)
- }
-
- test("read operation") {
- val props : Map[String, String] = Map(HbaseConfigs.hbaseNamespaceKey -> "default",
- HbaseConfigs.hbaseTableKey -> s"""$tableName""",
- HbaseConfigs.hbaseRowKey -> "id",
- HbaseConfigs.hbaseColumnMappingKey -> "personal:name,personal:address,personal:age,professional:company,professional:designation,professional:salary")
- val dataSetName = "HBase.Local.default." + tableName
- val dataSetProperties = DataSetProperties(dataSetName, null, null, props)
- val datasetProps : Map[String, Any] = Map("dataSetProperties" -> dataSetProperties)
- val df = HBaseSparkConnector(sparkSession).read(dataSetName, datasetProps)
- df.show(1)
- assert(df.count() == 1000)
- }
-
- test("read operation with page size") {
- val props : Map[String, String] = Map(HbaseConfigs.hbaseNamespaceKey -> "default",
- HbaseConfigs.hbaseTableKey -> s"""$tableName""",
- HbaseConfigs.hbaseRowKey -> "id",
- HbaseConfigs.hbaseColumnMappingKey -> "personal:name,personal:address,personal:age,professional:company,professional:designation,professional:salary")
- sparkSession.conf.set(GimelConstants.HBASE_PAGE_SIZE, 20)
- val dataSetName = "HBase.Local.default." + tableName
- val dataSetProperties = DataSetProperties(dataSetName, null, null, props)
- val datasetProps : Map[String, Any] = Map("dataSetProperties"->dataSetProperties)
- val df = HBaseSparkConnector(sparkSession).read(dataSetName, datasetProps)
- df.show(20)
- val metricInsertQuery = metrics(metrics.length - 1)
- val qe = metricInsertQuery._2
- println(qe.executedPlan.children(0).children(0).children(0).metrics)
- val hbaseReadOutputRows = qe.executedPlan.children(0).children(0).children(0).metrics("numOutputRows").value
- assert(hbaseReadOutputRows == 20)
- sparkSession.conf.unset(GimelConstants.HBASE_PAGE_SIZE)
- }
-}
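
The page-size test reaches three levels into the executed plan to read the scan node's numOutputRows metric from the QueryExecution captured by the fixture's listener. A slightly more defensive sketch of the same lookup, assuming Spark 2.x's TreeNode.collectLeaves is available and that the connector's leaf scan node exposes a numOutputRows metric:

  import org.apache.spark.sql.execution.QueryExecution

  // Sums numOutputRows across leaf (scan) nodes instead of hard-coding children(0).children(0).children(0).
  def scannedRows(qe: QueryExecution): Long =
    qe.executedPlan.collectLeaves()
      .flatMap(_.metrics.get("numOutputRows"))
      .map(_.value)
      .sum

  // Against the fixture's buffer: scannedRows(metrics.last._2) should equal the configured page size.
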
diff --git a/gimel-dataapi/gimel-connectors/gimel-hbase/src/test/scala/com/paypal/gimel/hbase/utilities/HBaseUtilitiesTest.scala b/gimel-dataapi/gimel-connectors/gimel-hbase/src/test/scala/com/paypal/gimel/hbase/utilities/HBaseUtilitiesTest.scala
deleted file mode 100644
index 3cd9e867..00000000
--- a/gimel-dataapi/gimel-connectors/gimel-hbase/src/test/scala/com/paypal/gimel/hbase/utilities/HBaseUtilitiesTest.scala
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright 2018 PayPal Inc.
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.paypal.gimel.hbase.utilities
-
-import org.apache.spark.sql.types.StringType
-import org.scalatest.{BeforeAndAfterAll, Matchers}
-
-import com.paypal.gimel.common.catalog.DataSetProperties
-import com.paypal.gimel.hbase.conf.HbaseConfigs
-
-class HBaseUtilitiesTest extends HBaseLocalClient with Matchers with BeforeAndAfterAll {
-
- var hbaseUtilities : HBaseUtilities = new HBaseUtilities(sparkSession)
-
- test ("castAllColsToString") {
- // it should cast all the columns in dataframe to string
- assert (hbaseUtilities.castAllColsToString(mockDataInDataFrame(5)).schema.filter(col => col.dataType != StringType).length == 0)
- }
-
- test ("getColumnMappingForColumnFamily") {
- // it should return the map of column family to column with correct pattern
- val mapping = hbaseUtilities.getColumnMappingForColumnFamily("cf1:c1,cf1:c2,cf1:c3,cf2:c4")
- assert(mapping("cf1").sameElements(Array("c1", "c2", "c3")))
- assert(mapping("cf2").sameElements(Array("c4")))
-
- // it should return the map of column family to column with correct pattern including :key
- val mapping1 = hbaseUtilities.getColumnMappingForColumnFamily(":key,cf1:c1,cf1:c2,cf1:c3,cf2:c4")
- assert(mapping1("cf1").sameElements(Array("c1", "c2", "c3")))
- assert(mapping1("cf2").sameElements(Array("c4")))
-
- val mapping2 = hbaseUtilities.getColumnMappingForColumnFamily("cf1:c1,:key,cf1:c2,cf1:c3,cf2:c4")
- assert(mapping2("cf1").sameElements(Array("c1", "c2", "c3")))
- assert(mapping2("cf2").sameElements(Array("c4")))
- }
-}
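
getColumnMappingForColumnFamily accepts the "cf:column" comma-separated mapping and tolerates an HBase-style ":key" entry anywhere in the list. A minimal re-implementation sketch matching the behaviour asserted above (not the actual Gimel implementation):

  // Groups "cf:column" entries by column family and ignores the ":key" row-key marker.
  def parseColumnMapping(mapping: String): Map[String, Array[String]] =
    mapping.split(",")
      .map(_.trim)
      .filter(entry => entry.nonEmpty && !entry.startsWith(":"))
      .map { entry =>
        val Array(cf, col) = entry.split(":", 2)
        (cf, col)
      }
      .groupBy { case (cf, _) => cf }
      .map { case (cf, pairs) => cf -> pairs.map { case (_, col) => col } }

  // parseColumnMapping(":key,cf1:c1,cf1:c2,cf2:c4")
  //   => Map("cf1" -> Array("c1", "c2"), "cf2" -> Array("c4"))
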
diff --git a/gimel-dataapi/gimel-examples/pom.xml b/gimel-dataapi/gimel-examples/pom.xml
deleted file mode 100644
index 498f8fcc..00000000
--- a/gimel-dataapi/gimel-examples/pom.xml
+++ /dev/null
@@ -1,83 +0,0 @@
-
-
-
-
-
-
- gimel-dataapi
- com.paypal.gimel
- 2.4.7-SNAPSHOT
- ../pom.xml
-
- 4.0.0
- gimel-examples
- 2.4.7-SNAPSHOT
-
-
-
- com.paypal.gimel
- gimel-core
- ${gimel.version}-SNAPSHOT
-
-
-
-
- src/main/scala
-
-
-
- org.apache.maven.plugins
- maven-shade-plugin
- 3.0.0
-
-
-
- com.google.common
- gimel-shaded.com.google.common
-
-
- com.sun.jersey
- gimel-shaded.com.sun.jersey
-
-
-
-
- *:*
-
- META-INF/*.SF
- META-INF/*.DSA
- META-INF/*.RSA
-
-
-
-
-
-
- gimel-shading
- package
-
- shade
-
-
-
-
-
-
-
diff --git a/gimel-dataapi/gimel-examples/src/main/scala/com/paypal/gimel/examples/APIUsageAcrossDataSets.scala b/gimel-dataapi/gimel-examples/src/main/scala/com/paypal/gimel/examples/APIUsageAcrossDataSets.scala
deleted file mode 100644
index f1419742..00000000
--- a/gimel-dataapi/gimel-examples/src/main/scala/com/paypal/gimel/examples/APIUsageAcrossDataSets.scala
+++ /dev/null
@@ -1,179 +0,0 @@
-/*
- * Copyright 2018 PayPal Inc.
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.paypal.gimel.examples
-
-import org.apache.spark.sql._
-import org.apache.spark.streaming.{Seconds, StreamingContext}
-
-import com.paypal.gimel.DataSet
-import com.paypal.gimel.logger.Logger
-
-object APIUsageAcrossDataSets {
-
- // spark-shell --master yarn-client --driver-memory 4g \
- // --executor-memory 4g --executor-cores 1 --num-executors 2 --jars ~/pcatalog.jar
- // Initiate Logger
- val logger = Logger(this.getClass.getName)
- // Specify Batch Interval for Streaming
- val batchInterval = 5
-
- val sparkSession = SparkSession
- .builder()
- .enableHiveSupport()
- .getOrCreate()
- val sc = sparkSession.sparkContext
- sc.setLogLevel("ERROR")
- val sqlContext = sparkSession.sqlContext
-
- /**
- * --------------------- Context Initiation ---------------------
- */
-
- val ssc = new StreamingContext(sc, Seconds(batchInterval.toInt))
-
- /**
- * --------------------- DataSet Initiation ---------------------
- */
-
- // Initiate Pcatalog DataSet
- // val systemType = DataSetType
- val dataSet: DataSet = DataSet(sparkSession)
-
- /**
- * --------------------- Begin KAFKA Params ---------------------
- */
- // Create a Schema String for Avro SerDe
- val schema: String =
- """ {
- | "type" : "record",
- | "namespace" : "default",
- | "name" : "flights",
- | "fields" : [
- | { "name" : "month" , "type" : "string" },
- | { "name" : "dayofmonth" , "type" : "string" },
- | { "name" : "dayofweek" , "type" : "string" },
- | { "name" : "deptime" , "type" : "string" },
- | { "name" : "crsdeptime" , "type" : "string" },
- | { "name" : "arrtime" , "type" : "string" },
- | { "name" : "crsarrtime" , "type" : "string" },
- | { "name" : "uniquecarrier" , "type" : "string" },
- | { "name" : "flightnum" , "type" : "string" },
- | { "name" : "tailnum" , "type" : "string" },
- | { "name" : "actualelapsedtime" , "type" : "string" },
- | { "name" : "crselapsedtime" , "type" : "string" },
- | { "name" : "airtime" , "type" : "string" },
- | { "name" : "arrdelay" , "type" : "string" },
- | { "name" : "depdelay" , "type" : "string" },
- | { "name" : "origin" , "type" : "string" },
- | { "name" : "dest" , "type" : "string" },
- | { "name" : "distance" , "type" : "string" },
- | { "name" : "taxiin" , "type" : "string" },
- | { "name" : "taxiout" , "type" : "string" },
- | { "name" : "cancelled" , "type" : "string" },
- | { "name" : "cancellationcode" , "type" : "string" },
- | { "name" : "diverted" , "type" : "string" },
- | { "name" : "carrierdelay" , "type" : "string" },
- | { "name" : "weatherdelay" , "type" : "string" },
- | { "name" : "nasdelay" , "type" : "string" },
- | { "name" : "securitydelay" , "type" : "string" },
- | { "name" : "lateaircraftdelay" , "type" : "string" },
- | { "name" : "year" , "type" : "string" }
- | ]
- |}
- """.stripMargin
- // Create a Host:Port for Kafka, below works for Kafka installed on local machine
- val hostAndPort = "localhost:6667"
- val topic = "flights_avro_data1"
- // Create Kafka Params for Consumer
- val consumerParamsKafka: Map[String, String] = Map[String, String]("bootstrap.servers" -> hostAndPort,
- "group.id" -> 111.toString, "zookeeper.connection.timeout.ms" -> 10000.toString, "auto.offset.reset" -> "smallest",
- "avro.schema.string" -> schema)
- // Create Kafka Params for Producer
- val producerParamsKafka: Map[String, String] = Map[String, String]("bootstrap.servers" -> hostAndPort,
- "key.serializer" -> "org.apache.kafka.common.serialization.StringSerializer",
- "value.serializer" -> "org.apache.kafka.common.serialization.ByteArraySerializer",
- "avro.schema.string" -> schema)
- // Produce to Kafka
-
-
- /**
- * ------------------------- ES Props ------------------------------
- */
-
- val esOptions: Map[String, String] = Map("pushdown" -> "true", "es.nodes" -> "localhost", "es.port" -> "9200", "es.index.auto.create" -> "true")
-
- /**
- * ------------------------ HDFS Props ------------------------------
- */
-
- val wrtoptionsParquet: Map[String, String] = Map("hiveDatabaseName" -> "default", "hdfsPath" -> "hdfs:///tmp/parquet_demo/parquet_out", "inDataFormat" -> "parquet", "compressionCodec" -> "gzip", "columnDelimiter" -> "20")
-
- /**
- * --------------------- Begin Demo of API Usage ---------------------
- */
-
-
- // Read Hive
-
- val flights_from_hive: DataFrame = dataSet.read("flights_1m")
- flights_from_hive.show()
-
-
- // Write Kafka
-
- dataSet.write(topic, flights_from_hive.limit(10000), producerParamsKafka)
-
- // Read Kafka
-
- val flights_from_kafka: DataFrame = dataSet.read(topic, consumerParamsKafka)
- flights_from_kafka.show()
-
- // write HBase
-
- dataSet.write("flights_hbase", flights_from_kafka)
-
- // Read HBase
-
- val flights_from_hbase: DataFrame = dataSet.read("flights_hbase")
- flights_from_hbase.show()
-
- // Write ES
-
- dataSet.write("flights/demo", flights_from_hbase, esOptions)
-
- // Read ES
-
- val flights_from_ES: DataFrame = dataSet.read("flights/demo", esOptions)
- flights_from_ES.show()
-
- // Write HDFS
-
- dataSet.write("parquet_out", flights_from_ES, wrtoptionsParquet)
-
- // Read HDFS via Hive
-
- val flights_parquet_via_hive: DataFrame = dataSet.read("flights_parquet")
- flights_parquet_via_hive.show()
-
- // Comparison of All Operations --> Expected 999999
-
- flights_parquet_via_hive.unionAll(flights_from_hive).distinct().count()
-
-}
diff --git a/gimel-dataapi/gimel-examples/src/main/scala/com/paypal/gimel/examples/APIUsageElasticSearchDataSet.scala b/gimel-dataapi/gimel-examples/src/main/scala/com/paypal/gimel/examples/APIUsageElasticSearchDataSet.scala
deleted file mode 100644
index e658b891..00000000
--- a/gimel-dataapi/gimel-examples/src/main/scala/com/paypal/gimel/examples/APIUsageElasticSearchDataSet.scala
+++ /dev/null
@@ -1,107 +0,0 @@
-/*
- * Copyright 2018 PayPal Inc.
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.paypal.gimel.examples
-
-import org.apache.spark.sql._
-
-import com.paypal.gimel.DataSet
-
-object APIUsageElasticSearchDataSet {
-
- val sparkSession = SparkSession
- .builder()
- .enableHiveSupport()
- .getOrCreate()
- val sc = sparkSession.sparkContext
- sc.setLogLevel("ERROR")
- val sqlContext = sparkSession.sqlContext
-
- val nodes = "elastic_host_ip"
-
- val dataSet = DataSet(sparkSession)
-
- /* Use case for write as JSON for a given rdd */
- // WriteJSONforRDD
-
- val json1 = """{"reason2" : "business2", "airport2" : "SFO2"}"""
- val json2 = """{"participants2" : 5, "airport2" : "OTP2"}"""
- var options = Map("pushdown" -> "true",
- "es.nodes" -> "localhost", "es.port" -> "9200",
- "es.index.auto.create" -> "true", "JSON" -> "TRUE")
- val airportsRDD = sc.makeRDD(Seq(json1, json2))
-
-
- /* Use case for Read API into a DF */
- // ReadEStoDF
- options = Map("gimel.es.index.partitioned" -> "false"
- , "gimel.es.index.partition.delimiter" -> "_"
- , "gimel.es.index.partition" -> "20170602,20170603")
-
-
- /* Use case for write API for a given rdd */
- // WriteESfromRdd
-
- val game = Map("name" -> "dheeraj3", "age" -> "28", "gender" -> "male")
- val game1 = Map("name" -> "dheeraj4", "age" -> "28", "gender" -> "male")
- val rdd = sc.makeRDD(Seq(game, game1))
- options = Map("pushdown" -> "true"
- , "es.nodes" -> nodes, "es.port" -> "9200"
- , "es.index.auto.create" -> "true"
- , "gimel.es.index.partitioned" -> "true"
- , "gimel.es.index.partition.delimiter" -> "_"
- , "gimel.es.index.partition" -> "20170603")
-
-
- /* Use case for Read API as JSON into a DF */
- // ReadasJSONintoDF
-
- options = Map("pushdown" -> "true"
- , "es.nodes" -> "localhost"
- , "es.port" -> "9200"
- , "es.index.auto.create" -> "true"
- , "JSON" -> "TRUE"
- , "gimel.es.index.partitioned" -> "true"
- , "gimel.es.index.partition.delimiter" -> "_"
- , "gimel.es.index.partition" -> "20170602")
-
- /* Use case for Write API From a DF */
- // WriteESfromDF
-
- options = Map("gimel.es.index.partition" -> "20170602")
- val json31 = s"""{"name" : "dheeraj11", "age" : "28","gender":"male"}"""
- val json41 = s"""{"name" : "dheeraj12", "age" : "28","gender":"male"}"""
- val rdd11 = sc.parallelize(Seq(json31, json41))
- val df12 = sqlContext.read.json(rdd11)
-
- /* Use case for Write API From a DF as JSON */
- // WriteasJSONfromDF
-
- options = Map("pushdown" -> "true"
- , "es.nodes" -> nodes
- , "es.port" -> "9200"
- , "es.index.auto.create" -> "true"
- , "JSON" -> "TRUE")
- val json3 = """{"name" : "dheeraj", "age" : 28,","gender":"male"}"""
- val json4 = """{"name" : "baskar", "age" : 16,","gender":"male"}"""
-
- val rdd12 = sc.parallelize(Seq(json3, json4))
- val df1 = sqlContext.read.json(rdd12)
-
-}
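
The blocks above only assemble option maps and sample data; the corresponding Data API calls are omitted. A hedged sketch of how one of those maps would typically be used end to end; the dataset name pcatalog.flights_log_es is purely hypothetical and would need to exist in the catalog:

  // Hypothetical catalog entry; substitute a real ES dataset name.
  val esDataSetName = "pcatalog.flights_log_es"

  val writeOptions: Map[String, String] = Map(
    "pushdown" -> "true",
    "es.nodes" -> nodes, "es.port" -> "9200",
    "es.index.auto.create" -> "true",
    "gimel.es.index.partitioned" -> "true",
    "gimel.es.index.partition.delimiter" -> "_",
    "gimel.es.index.partition" -> "20170602")

  // Write the JSON-derived DataFrame built above, then read the same partition back.
  dataSet.write(esDataSetName, df12, writeOptions)
  val readBack = dataSet.read(esDataSetName, Map("gimel.es.index.partition" -> "20170602"))
  readBack.show(5)
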
diff --git a/gimel-dataapi/gimel-examples/src/main/scala/com/paypal/gimel/examples/APIUsageKafkaProduceConsume.scala b/gimel-dataapi/gimel-examples/src/main/scala/com/paypal/gimel/examples/APIUsageKafkaProduceConsume.scala
deleted file mode 100644
index 9803c400..00000000
--- a/gimel-dataapi/gimel-examples/src/main/scala/com/paypal/gimel/examples/APIUsageKafkaProduceConsume.scala
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Copyright 2018 PayPal Inc.
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.paypal.gimel.examples
-
-import scala.language.implicitConversions
-
-import org.apache.spark.rdd.RDD
-import org.apache.spark.sql._
-import org.apache.spark.streaming.{Seconds, StreamingContext}
-
-import com.paypal.gimel._
-import com.paypal.gimel.logger.Logger
-
-/**
- * Demos the Kafka Producer and Consumer for DataSet
- */
-object APIUsageKafkaProduceConsume extends App {
-
- // Option to run the code in spark-submit mode:
- // if a table name is passed, it is used. Else, the default pcatalog.kafka_flights_log is read
- val datasetName = if (args.isEmpty) {
- "pcatalog.kafka_flights_log"
- } else {
- args(0)
- }
- // Initiate Logger
- val logger = Logger(this.getClass.getName)
- // Specify Batch Interval for Streaming
- val batchInterval = 5
- // Context
- val sparkSession = SparkSession
- .builder()
- .enableHiveSupport()
- .getOrCreate()
- val sc = sparkSession.sparkContext
- sc.setLogLevel("ERROR")
- val sqlContext = sparkSession.sqlContext
- val ssc = new StreamingContext(sc, Seconds(batchInterval.toInt))
-
- /**
- * ---------Initiate DataSet-----------
- */
- val dataSet: DataSet = DataSet(sparkSession)
-
-
- /**
- * ------------CDH Example ----------------
- */
- val options = "throttle.batch.fetchRowsOnFirstRun=2500000:throttle.batch.batch.parallelsPerPartition=250:throttle.batch.maxRecordsPerPartition=25000000"
-
- /**
- * ---------Read from Kafka, using the Table Props-----------
- */
- val recsDF = dataSet.read(datasetName, options)
- recsDF.show
-
- /**
- * ---------Get the stateful Kafka operator before the next read or any other operation-----------
- */
- val kafkaOperator = dataSet.latestKafkaDataSetReader.get
-
- /**
- * ---------To clear the checkpoint (ideally, one would not clear the checkpoint in a continuous batch or stream in production)-----------
- * This operation deletes the ZooKeeper node where the checkpoint is stored
- */
-
- kafkaOperator.clearCheckPoint()
-
- /**
- * ---------- Ability to check if already checkpointed -------
- * Once checkpoint is done - we set kafkaOperator.alreadyCheckPointed = true
- * This prevents second time checkpointing (for protection)
- * Below will return "true"
- */
-
- // val isAlreadyCheckPointed = kafkaOperator.alreadyCheckPointed
-
- /**
- * A second call to the checkpoint function will not perform any save; it only warns the user -
- * "Warning --> Already Check-Pointed, Consume Again to Checkpoint !"
- */
- kafkaOperator.saveCheckPoint()
-
- /**
- * ---------Write to Kafka Some Custom Data (NOT CDH !!)-----------
- */
-
- // Create Dummy Data Set for Write
- def stringed(n: Int): String = {
- s"""{"age": $n, "name": "MAC-$n", "rev": ${n * 10000}}"""
- }
-
- val texts: Seq[String] = (1 to 20).map { x => stringed(x) }
- val rdd: RDD[String] = sc.parallelize(texts)
- val df: DataFrame = sqlContext.read.json(rdd)
-
- // Get a List of Supported Systems for DataSet Operations
- // val systemType = DataSetType
-
- // DataSet Write API Call
- dataSet.write(datasetName, df)
-
-}
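
Note that dataSet.read above accepts the throttle settings as a single colon-delimited key=value string rather than a Map. A small sketch, assuming exactly that "k1=v1:k2=v2" convention, of turning such a string into a Map[String, String]:

  // Illustrative conversion for the colon-delimited option string shown above.
  def optionStringToMap(options: String): Map[String, String] =
    options.split(":")
      .filter(_.contains("="))
      .map { kv =>
        val Array(key, value) = kv.split("=", 2)
        key -> value
      }
      .toMap

  // optionStringToMap("throttle.batch.fetchRowsOnFirstRun=2500000:throttle.batch.maxRecordsPerPartition=25000000")
  //   => Map("throttle.batch.fetchRowsOnFirstRun" -> "2500000",
  //          "throttle.batch.maxRecordsPerPartition" -> "25000000")
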
diff --git a/gimel-dataapi/gimel-examples/src/main/scala/com/paypal/gimel/examples/APIUsageKafkaStreamProduceConsume.scala b/gimel-dataapi/gimel-examples/src/main/scala/com/paypal/gimel/examples/APIUsageKafkaStreamProduceConsume.scala
deleted file mode 100644
index 0f1ceed9..00000000
--- a/gimel-dataapi/gimel-examples/src/main/scala/com/paypal/gimel/examples/APIUsageKafkaStreamProduceConsume.scala
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- * Copyright 2018 PayPal Inc.
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.paypal.gimel.examples
-
-import scala.language.implicitConversions
-
-import org.apache.avro.generic.GenericRecord
-import org.apache.spark.rdd.RDD
-import org.apache.spark.sql._
-
-import com.paypal.gimel._
-import com.paypal.gimel.datastreamfactory.StreamingResult
-import com.paypal.gimel.logger.Logger
-
-/**
- * Demos the Kafka Producer and Consumer for DataStream
- */
-object APIUsageKafkaStreamProduceConsume extends App {
-
- // Initiate Logger
- val logger = Logger(this.getClass.getName)
- // Specify Batch Interval for Streaming
- val batchInterval = 10
- // Context
- val sparkSession = SparkSession
- .builder()
- .enableHiveSupport()
- .getOrCreate()
- val sc = sparkSession.sparkContext
- sc.setLogLevel("ERROR")
- val sqlContext = sparkSession.sqlContext
-
- // Initiate DStream
- val dataStream = DataStream(sc)
-
- // Option to run the code in spark-submit mode:
- // if a table name is passed, it is used. Else, the default kafka_testing_flights is read
- val datasetName = if (args.isEmpty) {
- "kafka_testing_flights"
- } else {
- args(0)
- }
-
- // Get Reference to Stream
- val streamingResult: StreamingResult = dataStream.read(datasetName)
-
- // Clear CheckPoint if necessary
- streamingResult.clearCheckPoint("some message")
-
- // Helper for Clients
- streamingResult.dStream.foreachRDD { rdd =>
-
- val count = rdd.count()
-
- if (count > 0) {
-
- /**
- * Mandatory | Get Offset for Current Window, so we can checkpoint at the end of this window's operation
- */
-
- streamingResult.getCurrentCheckPoint(rdd)
-
- /**
- * Begin | User's Usecases
- */
-
- // Sample UseCase | Display Count
- logger.debug("count is -->")
- logger.debug(count)
-
- // Sample UseCase | Get Avro Generic Record
- val rddAvro: RDD[GenericRecord] = streamingResult.convertBytesToAvro(rdd)
- rddAvro.map(x => x.toString)
- logger.debug("sample records from Avro-->")
- rddAvro.map(x => x.toString).take(10).foreach(record => logger.debug(record))
-
- // Sample UseCase | Convert to DataFrame
- val df: DataFrame = streamingResult.convertAvroToDF(sqlContext, rddAvro)
- logger.debug("sample records -->")
- df.show(5)
-
- // JSON / String / Bytes (Avro) / Bytes (CDH) --> All can be deserialized into Spark DataFrame via this function
- streamingResult.getAsDF(sqlContext, rdd)
-
-
- /**
- * End | User's Usecases
- */
-
- /**
- * Mandatory | Save Current Window - CheckPoint
- */
-
- streamingResult.saveCurrentCheckPoint()
- }
- }
-
- // Start the Context
- dataStream.streamingContext.start()
- dataStream.streamingContext.awaitTermination()
-
-}
diff --git a/gimel-dataapi/gimel-examples/src/main/scala/com/paypal/gimel/examples/GimelDruidRealtimeIngestion.scala b/gimel-dataapi/gimel-examples/src/main/scala/com/paypal/gimel/examples/GimelDruidRealtimeIngestion.scala
deleted file mode 100644
index 44d84ae6..00000000
--- a/gimel-dataapi/gimel-examples/src/main/scala/com/paypal/gimel/examples/GimelDruidRealtimeIngestion.scala
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- * Copyright 2018 PayPal Inc.
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.paypal.gimel.examples
-
-import org.apache.avro.generic.GenericRecord
-import org.apache.spark.rdd.RDD
-import org.apache.spark.sql._
-import org.apache.spark.streaming.{Seconds, StreamingContext}
-
-import com.paypal.gimel.{DataSet, DataStream}
-import com.paypal.gimel.datastreamfactory.StreamingResult
-import com.paypal.gimel.logger.Logger
-
-object GimelDruidRealtimeIngestion {
- val logger = Logger(this.getClass.getName)
-
-
- def main(args: Array[String]): Unit = {
-
- val sparkSession = SparkSession
- .builder()
- .enableHiveSupport()
- .getOrCreate()
- val sc = sparkSession.sparkContext
- sc.setLogLevel("ERROR")
- val sqlContext = sparkSession.sqlContext
-
- // Create Streaming context
-
- val ssc = new StreamingContext(sc, Seconds(20))
-
- val dataStream = DataStream(ssc)
-
- val streamingResult: StreamingResult =
- dataStream
- .read("pcatalog.kafka_flights_log")
-
- streamingResult.clearCheckPoint("Clearing Checkpoint.")
-
- streamingResult.dStream.foreachRDD { rdd =>
- val count = rdd.count()
-
- if (count > 0) {
- /**
- * Mandatory | Get Offset for Current Window, so we can checkpoint at the end of this window's operation
- */
- streamingResult.getCurrentCheckPoint(rdd)
-
- logger.info(s"Count for current Checkpoint: $count")
- logger.info(s"Scala Version Used ---> ${scala.util.Properties.versionString}")
-
- val rddAvro: RDD[GenericRecord] = streamingResult.convertBytesToAvro(rdd)
- rddAvro.map(_.toString)
-
- val df: DataFrame = streamingResult.convertAvroToDF(sqlContext, rddAvro)
-
- // Call Druid Connector for realtime ingestion.
-
- val dataSet = new DataSet(sparkSession)
- val dataSetProps = Map("load_type" -> "realtime")
- dataSet.write("gimel.druid_flights_log", df, dataSetProps)
-
- streamingResult.saveCurrentCheckPoint()
- }
- }
-
- dataStream.streamingContext
-
- // Start the computation
- ssc.start()
-
- // Wait for the computation to terminate
- ssc.awaitTermination()
- }
-
-}
diff --git a/gimel-dataapi/gimel-examples/src/main/scala/com/paypal/gimel/examples/SparkStreamingKafkaMessageTesting.scala b/gimel-dataapi/gimel-examples/src/main/scala/com/paypal/gimel/examples/SparkStreamingKafkaMessageTesting.scala
deleted file mode 100644
index 8b0c247c..00000000
--- a/gimel-dataapi/gimel-examples/src/main/scala/com/paypal/gimel/examples/SparkStreamingKafkaMessageTesting.scala
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Copyright 2018 PayPal Inc.
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.paypal.gimel.examples
-
-import org.apache.spark.rdd.RDD
-import org.apache.spark.sql._
-import org.apache.spark.sql.functions._
-import org.apache.spark.streaming.{Seconds, StreamingContext}
-
-import com.paypal.gimel._
-import com.paypal.gimel.datastreamfactory._
-import com.paypal.gimel.logger.Logger
-
-object SparkStreamingKafkaMessageTesting extends App {
-
- // Initiate Logger
- val logger = Logger(this.getClass.getName)
-
- import SparkStreamingKafkaStringMessageUtils._
-
- var params = resolveRunTimeParameters(args)
- val sourceName = params("source")
- val targetName = params.getOrElse("target", "NA")
- val messageFormat = params("messageFormat")
- // Specify Batch Interval for Streaming
- val batchInterval = params.getOrElse("batchInterval", "10").toInt
- val timeOutSeconds = params.getOrElse("timeOutSeconds", "60").toInt
- // Context
- val sparkSession = SparkSession
- .builder()
- .enableHiveSupport()
- .getOrCreate()
- val sc = sparkSession.sparkContext
- sc.setLogLevel("ERROR")
- val sqlContext = sparkSession.sqlContext
- import sqlContext.implicits._
-
- val ssc = new StreamingContext(sc, Seconds(batchInterval.toInt))
-
- val dataStream = DataStream(sc)
- val dataSet = DataSet(sparkSession)
- // Get Reference to Stream
- val streamingResult: StreamingResult = dataStream.read(sourceName)
- // Clear CheckPoint if necessary
- streamingResult.clearCheckPoint("some message")
- streamingResult.dStream.foreachRDD { rdd =>
- val k: RDD[WrappedData] = rdd
- val count = rdd.count()
-
- logger.info(s"Count is --> ${count}")
- logger.info(s"Message Type Specified is ${messageFormat}...")
- if (count > 0) {
-
- val df1 = streamingResult.getAsDF(sqlContext, rdd)
- df1.printSchema()
- df1.show(10)
- val updatedDataFrame: DataFrame = df1
- // updatedDataFrame.show
- val col1 = date_format(from_unixtime(col("logtime").divide(1000)), "yyyyMMdd")
- val dfWithDate = updatedDataFrame.withColumn("dfWithDate", col1)
- val dateList = dfWithDate.select("dfWithDate").distinct().collect.flatMap(_.toSeq)
- val dateListMap = dateList.map { date =>
- (date -> dfWithDate.where($"dfWithDate" <=> date))
- }.toMap
-
- dateListMap.foreach { case (key, dfes) =>
- val schemaMapping: String = s"""{"appStartTime": {"format": "strict_date_optional_time||epoch_millis", "type": "date" }, "appEndTime": {"format": "strict_date_optional_time||epoch_millis", "type": "date"},"jobStartTime": {"format": "strict_date_optional_time||epoch_millis", "type": "date"}, "jobEndTime": {"format": "strict_date_optional_time||epoch_millis", "type": "date"}, "logtime": { "format": "strict_date_optional_time||epoch_millis", "type": "date"}}"""
- val options: Map[String, String] = Map("gimel.es.index.partition.suffix" -> s"$key", "gimel.es.schema.mapping" -> schemaMapping)
- if (targetName != "NA") {
- logger.info(s"Begin Writing To : ${targetName}")
- val res = dataSet.write(targetName, dfes, options)
- }
- }
- }
- streamingResult.saveCurrentCheckPoint()
- }
-
- dataStream.streamingContext.start()
- dataStream.streamingContext.awaitTerminationOrTimeout(timeOutSeconds * 1000)
- dataStream.streamingContext.stop(false, true)
-}
-
-object SparkStreamingKafkaStringMessageUtils {
-
- val logger = Logger(this.getClass.getName)
-
- def resolveRunTimeParameters(allParams: Array[String]): Map[String, String] = {
- def MethodName: String = new Exception().getStackTrace().apply(1).getMethodName()
- logger.info(" @Begin --> " + MethodName)
- var paramsMapBuilder: Map[String, String] = Map()
- logger.info(s"All Params From User --> \n${allParams.mkString("\n")}")
- if (allParams.length == 0) {
- throw new Exception("Args Cannot be Empty")
- }
-
- for (jobParams <- allParams) {
- for (eachParam <- jobParams.split(" ")) {
- paramsMapBuilder += (eachParam.split("=")(0) -> eachParam.split("=", 2)(1))
- }
- }
- logger.info(s"Resolved Params From Code --> ${paramsMapBuilder}")
- paramsMapBuilder
- }
-}
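
resolveRunTimeParameters expects every program argument to carry space-separated key=value pairs, so the whole configuration can arrive as one quoted string. A usage sketch with the keys this job reads (the jar name and values are illustrative):

  // e.g. spark-submit ... --class com.paypal.gimel.examples.SparkStreamingKafkaMessageTesting app.jar \
  //        "source=pcatalog.kafka_flights_log target=NA messageFormat=json batchInterval=10"
  val sampleArgs = Array("source=pcatalog.kafka_flights_log target=NA messageFormat=json batchInterval=10")

  val params = SparkStreamingKafkaStringMessageUtils.resolveRunTimeParameters(sampleArgs)
  // params => Map(
  //   "source" -> "pcatalog.kafka_flights_log",
  //   "target" -> "NA",
  //   "messageFormat" -> "json",
  //   "batchInterval" -> "10")
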
diff --git a/gimel-dataapi/gimel-examples/src/main/scala/com/paypal/gimel/examples/SparkStreamingPCatalogUSDemo.scala b/gimel-dataapi/gimel-examples/src/main/scala/com/paypal/gimel/examples/SparkStreamingPCatalogUSDemo.scala
deleted file mode 100644
index e67abf9b..00000000
--- a/gimel-dataapi/gimel-examples/src/main/scala/com/paypal/gimel/examples/SparkStreamingPCatalogUSDemo.scala
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Copyright 2018 PayPal Inc.
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.paypal.gimel.examples
-
-import org.apache.spark.sql._
-import org.apache.spark.sql.expressions.UserDefinedFunction
-import org.apache.spark.sql.functions._
-import org.apache.spark.streaming._
-
-import com.paypal.gimel.{DataSet, DataStream}
-import com.paypal.gimel.logger.Logger
-
-object SparkStreamingPCatalogUSDemo {
-
- // Define Geo Function
- case class Geo(lat: Double, lon: Double)
-
- val myUDF: UserDefinedFunction = udf((lat: Double, lon: Double) => Geo(lat, lon))
-
- def main(args: Array[String]) {
-
- // Creating SparkContext
- val sparkSession = SparkSession
- .builder()
- .enableHiveSupport()
- .getOrCreate()
- val sc = sparkSession.sparkContext
- sc.setLogLevel("ERROR")
- val sqlContext = sparkSession.sqlContext
- val ssc = new StreamingContext(sc, Seconds(10))
- val logger = Logger(this.getClass.getName)
-
- // Initiating PCatalog DataSet and DataStream
- val dataSet = DataSet(sparkSession)
- val dataStream = DataStream(ssc)
-
- // Reading from HDFS Dataset
- logger.info("Reading address_geo HDFS Dataset")
- val geoLookUpDF = dataSet.read("pcatalog.address_geo")
- val geoLookUp = geoLookUpDF.withColumn("geo", myUDF(geoLookUpDF("lat"), geoLookUpDF("lon"))).drop("lat").drop("lon")
- geoLookUp.cache()
- logger.info("Read" + geoLookUp.count() + " records")
-
- // Reading from Kafka DataStream and Loading into Elastic Search Dataset
- val streamingResult = dataStream.read("pcatalog.kafka_transactions")
- streamingResult.clearCheckPoint("OneTimeOnly")
- streamingResult.dStream.foreachRDD { rdd =>
- if (rdd.count() > 0) {
- streamingResult.getCurrentCheckPoint(rdd)
- val txnDF = streamingResult.convertAvroToDF(sqlContext, streamingResult.convertBytesToAvro(rdd))
- val resultSet = txnDF.join(geoLookUp, txnDF("account_number") === geoLookUp("customer_id"))
- .selectExpr("CONCAT(time_created,'000') AS time_created", "geo", "usd_amount")
-
- dataSet.write("pcatalog.elastic_transactions_dmz", resultSet)
- streamingResult.saveCurrentCheckPoint()
- }
- }
-
- // Start Streaming
- dataStream.streamingContext.start()
- dataStream.streamingContext.awaitTermination()
-
- sc.stop()
- }
-}
diff --git a/gimel-dataapi/gimel-sql/pom.xml b/gimel-dataapi/gimel-sql/pom.xml
deleted file mode 100644
index 9c2747a5..00000000
--- a/gimel-dataapi/gimel-sql/pom.xml
+++ /dev/null
@@ -1,141 +0,0 @@
-
-
-
- gimel-dataapi
- com.paypal.gimel
- 2.4.7-SNAPSHOT
- ../pom.xml
-
- 4.0.0
-
- gimel-sql
- 2.4.7-SNAPSHOT
-
-
-
- com.paypal.gimel
- gimel-core
- ${gimel.version}-SNAPSHOT
-
-
- io.netty
- netty-handler
-
-
-
-
-
-
- org.scalatest
- scalatest_${scala.binary.version}
- ${scalatest.version}
- test
-
-
- org.scalamock
- scalamock_${scala.binary.version}
- ${scalamock.version}
- test
-
-
-
- io.netty
- netty
- ${netty.hadoop.version}
- test
-
-
- io.netty
- netty-all
- ${netty.all.hadoop.version}
- test
-
-
- net.jpountz.lz4
- lz4
- 1.3.0
- test
-
-
-
-
- src/main/scala
- src/test/scala
-
-
- net.alchim31.maven
- scala-maven-plugin
- 3.2.1
-
-
-
- compile
- testCompile
-
-
-
-
-
- -Xms64m
- -Xmx1024m
-
-
-
-
- org.scalatest
- scalatest-maven-plugin
- 1.0
-
- ${project.build.directory}/surefire-reports
- .
- WDF TestSuite.txt
-
-
-
- test
-
- test
-
-
-
-
-
- org.apache.maven.plugins
- maven-shade-plugin
- 3.0.0
-
-
-
- com.google.common
- gimel-shaded.com.google.common
-
-
- com.sun.jersey
- gimel-shaded.com.sun.jersey
-
-
-
-
- *:*
-
- META-INF/*.SF
- META-INF/*.DSA
- META-INF/*.RSA
-
-
-
-
-
-
- gimel-shading
- package
-
- shade
-
-
-
-
-
-
-
-
diff --git a/gimel-dataapi/gimel-sql/src/main/scala/com/paypal/gimel/sql/GimelQueryProcessor.scala b/gimel-dataapi/gimel-sql/src/main/scala/com/paypal/gimel/sql/GimelQueryProcessor.scala
deleted file mode 100644
index be1a9387..00000000
--- a/gimel-dataapi/gimel-sql/src/main/scala/com/paypal/gimel/sql/GimelQueryProcessor.scala
+++ /dev/null
@@ -1,1397 +0,0 @@
-/*
- * Copyright 2018 PayPal Inc.
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.paypal.gimel.sql
-
-import scala.collection.immutable.Map
-import scala.util.{Failure, Success, Try}
-
-import org.apache.spark.rdd.RDD
-import org.apache.spark.sql._
-import org.apache.spark.sql.streaming.{StreamingQuery, Trigger}
-import org.apache.spark.sql.types.StructField
-import org.apache.spark.streaming.{Seconds, StreamingContext}
-
-import com.paypal.gimel._
-import com.paypal.gimel.common.catalog.{CatalogProvider, DataSetProperties}
-import com.paypal.gimel.common.conf.{CatalogProviderConfigs, GimelConstants}
-import com.paypal.gimel.common.gimelserde.GimelSerdeUtils
-import com.paypal.gimel.common.query.guard.QueryGuard
-import com.paypal.gimel.common.security.AuthHandler
-import com.paypal.gimel.common.utilities.{DataSetType, DataSetUtils, Timer}
-import com.paypal.gimel.datasetfactory.GimelDataSet
-import com.paypal.gimel.datastreamfactory.{StreamingResult, StructuredStreamingResult, WrappedData}
-import com.paypal.gimel.jdbc.conf.JdbcConfigs
-import com.paypal.gimel.kafka.conf.{KafkaConfigs, KafkaConstants}
-import com.paypal.gimel.logger.Logger
-import com.paypal.gimel.logging.GimelStreamingListener
-import com.paypal.gimel.parser.utilities.{QueryConstants, QueryParserUtils}
-
-object GimelQueryProcessor {
-
- val logger: Logger = Logger(this.getClass.getName)
- lazy val pCatalogStreamingKafkaTmpTableName = "pcatalog_streaming_kafka_tmp_table"
- val queryUtils = GimelQueryUtils
-
- import queryUtils._
-
- val originalUser = sys.env("USER")
- var user = originalUser
- var isQueryFromGTS = false
- val yarnCluster = com.paypal.gimel.common.utilities.DataSetUtils.getYarnClusterName()
- var queryGuard: Option[QueryGuard] = None
-
- /**
- * At Run Time - Set the Catalog Provider and The Name Space of the Catalog (like the Hive DB Name when catalog Provider = HIVE)
- *
- * @param sparkSession Spark Session
- */
- def setCatalogProviderInfo(sparkSession: SparkSession): Unit = {
- def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName
-
- logger.info(" @Begin --> " + MethodName)
-
- val catalogProvider: String = sparkSession.conf.get(CatalogProviderConfigs.CATALOG_PROVIDER, GimelConstants.UDC_STRING)
- val catalogProviderName: String = sparkSession.conf.get(CatalogProviderConfigs.CATALOG_PROVIDER_NAME_SPACE, GimelConstants.UDC_STRING)
- logger.info(s"Catalog Provider --> [${catalogProvider}] | Catalog Provider Name --> [${catalogProviderName}] ")
- setCatalogProvider(catalogProvider)
- setCatalogProviderName(catalogProviderName)
- }
-
- /**
- * Sets Spark GTS User Name if available
- *
- * @param sparkSession SparkSession
- */
- def setGtsUser(sparkSession: SparkSession): Unit = {
- def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName
-
- logger.info(" @Begin --> " + MethodName)
-
- val gtsUser: String = sparkSession.sparkContext.getLocalProperty(GimelConstants.GTS_USER_CONFIG)
- val gts_default_user = GimelConstants.GTS_DEFAULT_USER(sparkSession.conf)
- if (gtsUser != null && originalUser.equalsIgnoreCase(gts_default_user)) {
- logger.info(s"GTS User [${gtsUser}] will be used to over ride executing user [${originalUser}] who started GTS.")
- sparkSession.sql(s"set ${GimelConstants.GTS_USER_CONFIG}=${gtsUser}")
-
- // set jdbc username, if not already set in spark conf
- val jdbcUser: Option[String] = sparkSession.conf.getOption(JdbcConfigs.jdbcUserName)
- if (jdbcUser.isEmpty) {
- logger.info(s"Setting ${JdbcConfigs.jdbcUserName}=${gtsUser}")
- sparkSession.sql(s"set ${JdbcConfigs.jdbcUserName}=${gtsUser}")
- }
- user = gtsUser
- isQueryFromGTS = true
- }
- }
-
- /**
- * This function guards any runtime changes attempted by users to override GTS specific configurations.
- *
- * @param sql
- * @param sparkSession
- */
- def guardGTSStatements(sql: String, sparkSession: SparkSession): Unit = {
- def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName
-
- logger.info(" @Begin --> " + MethodName)
-
- // Guard only if user is GTS Super User
- if (sparkSession.sparkContext.sparkUser.equalsIgnoreCase(GimelConstants.GTS_DEFAULT_USER(sparkSession.conf))) {
-
- val checkFlag =
- // Impersonation Flag is not allowed to be set in GSQL
- sql.toLowerCase.contains(GimelConstants.GTS_IMPERSONATION_FLAG) ||
- // JDBC User is not allowed to be set in GSQL
- // sql.toLowerCase.contains(JdbcConstants.jdbcUserName) ||
- // GTS User should not be overridden
- sql.toLowerCase.contains(GimelConstants.GTS_USER_CONFIG)
-
- if (checkFlag) throw new Exception(s"SECURITY VIOLATION | Execution of this statement is not allowed: ${sql}")
- }
-
- // Enable or stop query guard based on user config
- // switchQueryGuard(sparkSession)
- }
-
- /**
- *
- * @param sparkSession
- */
- def switchQueryGuard(sparkSession: SparkSession): Unit = {
- if (queryGuard.isEmpty) {
- queryGuard = Some(new QueryGuard(sparkSession))
- }
- // Turn ON and OFF Query guard
- val queryGuardControl = if (sparkSession.conf.getOption(GimelConstants.GTS_SPARK_QUERY_GUARD).isDefined) {
- sparkSession.conf.getOption(GimelConstants.GTS_SPARK_QUERY_GUARD)
- } else if (sparkSession.conf.getOption(GimelConstants.GTS_QUERY_GUARD).isDefined) {
- sparkSession.conf.getOption(GimelConstants.GTS_QUERY_GUARD)
- } else {
- None
- }
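- // Precedence (descriptive note): the spark-scoped GimelConstants.GTS_SPARK_QUERY_GUARD setting,
- // when present, wins over GimelConstants.GTS_QUERY_GUARD; any value other than "true" stops the guard.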
- queryGuardControl.foreach {
- case control: String if control.toLowerCase == "true" =>
- // start
- logger.info("Starting query guard")
- queryGuard.get.start()
- case control: String if control.toLowerCase == "false" =>
- // stop
- logger.info("Starting query guard")
- queryGuard.get.stop()
- case _ =>
- // wrong config received do nothing
- logger.info(s"Wrong config: $queryGuardControl received. So, stopping query guard")
- queryGuard.get.stop()
- }
- }
-
- /**
- * Core Function that will be called from SCAAS for executing a SQL
- *
- * @param sql SQL String supplied by client
- * @param sparkSession : SparkSession
- * @return Resulting String < either sample data for select queries, or "success" / "failed" for insert queries
- */
- def executeBatch(sql: String, sparkSession: SparkSession): DataFrame = {
-
- def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName
-
- logger.info(" @Begin --> " + MethodName)
-
- val uniformSQL = sql.replace("\n", " ").trim
- val sqlArray: Array[String] = uniformSQL.split(";")
- val totalStatements = sqlArray.length
- val dataFrames: Array[DataFrame] = sqlArray.zipWithIndex.map(eachSql => {
- val sqlString = eachSql._1
- val index = eachSql._2
- logger.info(s"Executing statement: ${sqlString}")
- try {
- executeBatchStatement(sqlString, sparkSession)
- }
- catch {
- case e: Throwable =>
- val errorMsg =
- s"""
- | Statements[${index}/${totalStatements}] successfully executed.
- | Statement[${index + 1}] execution failed --> ${sqlString}
- """.stripMargin
- logger.throwError(s"${errorMsg}")
- throw e
- }
- })
- logger.info(s"${totalStatements}/${totalStatements} statements successfully executed.")
- dataFrames(totalStatements - 1)
- }
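-
- // Illustrative usage sketch (dataset names are hypothetical): executeBatch splits the input on ';',
- // runs each statement in order via executeBatchStatement, and returns the DataFrame of the last one.
- // val df = executeBatch("select * from udc.dataset_a; select * from udc.dataset_b", sparkSession)
- //   // df holds the result of the second select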
-
- /**
- * This method will process one statement from executeBatch
- *
- * @param sql SQL String supplied by client
- * @param sparkSession : SparkSession
- * @return Resulting String < either sample data for select queries, or "success" / "failed" for insert queries
- */
- def executeBatchStatement(sql: String, sparkSession: SparkSession): DataFrame = {
- def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName
- logger.info(" @Begin --> " + MethodName)
-
- logger.setSparkVersion(sparkSession.version)
-
- // Set gimel log level and flag to audit logs to kafka
- DataSetUtils.setGimelLogLevel(sparkSession, logger)
- guardGTSStatements(sql, sparkSession)
- switchQueryGuard(sparkSession)
-
- val sparkAppName = sparkSession.conf.get("spark.app.name")
-
- try {
-
- // At Run Time - Set the Catalog Provider and The Name Space of the Catalog (like the Hive DB Name when catalog Provider = HIVE)
- setCatalogProviderInfo(sparkSession)
-
- // If query comes from GTS - interpret the GTS user and set it
- setGtsUser(sparkSession)
-
- val options = queryUtils.getOptions(sparkSession)._2
-
- var resultingString = ""
- // val queryTimer = Timer()
- // val startTime = queryTimer.start
- val isCheckPointEnabled = options(KafkaConfigs.kafkaConsumerReadCheckpointKey).toBoolean
- // val isClearCheckPointEnabled = options(KafkaConfigs.kafkaConsumerClearCheckpointKey).toBoolean
-
- val sessionID = sparkSession.sparkContext.getLocalProperty(GimelConstants.GTS_GIMEL_LIVY_SESSION_ID)
-
- logger.debug(s"Is CheckPointing Requested By User --> $isCheckPointEnabled")
- val dataSet: DataSet = DataSet(sparkSession)
-
- // Query is via GTS
- val isGTSImpersonated = AuthHandler.isAuthRequired(sparkSession)
-
- // Query has Hive / HBASE related DML that requires authentication.
- lazy val isDMLHiveOrHbase = queryUtils.isHiveHbaseDMLAndGTSUser(sql, options, sparkSession)
- // Query is a DDL operation
- lazy val isDDL = queryUtils.isDDL(sql, sparkSession)
-
- // Identify JDBC complete pushdown
- val (isJdbcCompletePushDownEnabled, transformedSql, jdbcOptions) =
- GimelQueryUtils.isJdbcCompletePushDownEnabled(sparkSession, sql)
-
- val data = if (isJdbcCompletePushDownEnabled) {
- GimelQueryUtils.createPushDownQueryDataframe(sparkSession, transformedSql.get, jdbcOptions.get)
- } else if (isGTSImpersonated && (isDDL || isDMLHiveOrHbase)) {
- throw new UnsupportedOperationException(
- s"""
- | DDL or DML for [Hive | HBASE] is not supported in GTS (Gimel Thrift Server).
- | Please run the query in a separate Spark session.
- |""".stripMargin)
- } else if (queryUtils.isUDCDataDefinition(sql)) {
- logger.info("This path is dynamic dataset creation path")
- var resultingStr = ""
- Try(
- handleDDLs(sql, sparkSession, dataSet, options)
- ) match {
- case Success(result) =>
- resultingStr = "Query Completed."
- case Failure(e) =>
- resultingStr = s"Query Failed in function : $MethodName. Error --> \n\n ${
- e.toString
- }"
- logger.error(resultingStr)
- throw e
- }
- stringToDF(sparkSession, resultingStr)
- } else {
- // Allow thrift server to execute the Query for all other cases.
- val isSelectFromHiveOrHBase = queryUtils.isSelectFromHiveHbaseAndGTSUser(sql, options, sparkSession)
- logger.info(s"isSelectFromHiveOrHBase -> $isSelectFromHiveOrHBase")
- if (isSelectFromHiveOrHBase) {
- logger.info("Select query consists of Hive or HBase dataset, authenticating access through ranger.")
- queryUtils.authenticateAccess(sql, sparkSession, options)
- }
-
- // Set HBase Page Size for optimization if selecting from HBase with limit
- if (QueryParserUtils.isHavingLimit(sql)) {
- setLimitForHBase(sql, options, sparkSession)
- }
-
- val (originalSQL, destination, selectSQL, kafkaDataSets, queryPushDownFlag) =
- resolveSQL(sql, sparkSession, dataSet)
- destination match {
- case Some(target) =>
- logger.info(s"Target Exists --> ${target}")
- Try(
- executeResolvedQuery(originalSQL, destination, selectSQL, sparkSession, dataSet, queryPushDownFlag)
- ) match {
- case Success(result) =>
- resultingString = result
- case Failure(e) =>
- resultingString = s"Query Failed in function : $MethodName. Error --> \n\n ${
- e.toString
- }"
- logger.error(resultingString)
- throw e
- }
-
- if (isCheckPointEnabled) {
- saveCheckPointforKafka(kafkaDataSets)
- }
- import sparkSession.implicits._
- Seq(resultingString).toDF("Query Execution")
-
- case _ =>
- logger.info(s"No Target, returning DataFrame back to client.")
- executeSelectClause(selectSQL, sparkSession, queryPushDownFlag)
- }
- }
-
- // pushing logs to ES
- logger.logMethodAccess(sparkSession.sparkContext.getConf.getAppId
- , sparkSession.conf.get("spark.app.name")
- , this.getClass.getName
- , KafkaConstants.gimelAuditRunTypeBatch
- , yarnCluster
- , user
- , toLogFriendlyString(s"${yarnCluster}/${user}/${sparkAppName}")
- , MethodName
- , sql
- , scala.collection.mutable.Map("sql" -> sql, "isQueryFromGTS" -> isQueryFromGTS.toString, "originalUser" -> originalUser)
- , GimelConstants.SUCCESS
- , GimelConstants.EMPTY_STRING
- , GimelConstants.EMPTY_STRING
- )
-
- data
-
- } catch {
- case e: Throwable =>
-
- logger.logMethodAccess(sparkSession.sparkContext.getConf.getAppId
- , sparkSession.conf.get("spark.app.name")
- , this.getClass.getName
- , KafkaConstants.gimelAuditRunTypeBatch
- , yarnCluster
- , user
- , toLogFriendlyString(s"${yarnCluster}/${user}/${sparkAppName}")
- , MethodName
- , sql
- , scala.collection.mutable.Map("sql" -> sql, "isQueryFromGTS" -> isQueryFromGTS.toString, "originalUser" -> originalUser)
- , GimelConstants.FAILURE
- , e.toString + "\n" + e.getStackTraceString
- , GimelConstants.UNKNOWN_STRING
- )
-
- // throw error to console
- logger.throwError(e.toString)
-
- throw new Exception(s"${e.getMessage}\n", e)
- } finally {
- logger.info("Unsetting the property -> " + GimelConstants.HBASE_PAGE_SIZE)
- sparkSession.conf.unset(GimelConstants.HBASE_PAGE_SIZE)
- }
- }
-
- /**
- * Core Function that will be called from SCAAS for executing a SQL
- * Executes the executeBatch function in streaming window
- *
- * @param sql SQL String from client
- * @param sparkSession : SparkSession
- * @return Resulting String < either sample data for select queries, or "success" / "failed" for insert queries
- */
-
- def executeStream(sql: String, sparkSession: SparkSession): String = {
- def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName
-
- logger.info(" @Begin --> " + MethodName)
- logger.setSparkVersion(sparkSession.version)
- val sparkAppName = sparkSession.conf.get("spark.app.name")
-
- // Set gimel log level and flag to audit logs to kafka
- DataSetUtils.setGimelLogLevel(sparkSession, logger)
-
- // At Run Time - Set the Catalog Provider and The Name Space of the Catalog (like the Hive DB Name when catalog Provider = HIVE)
- setCatalogProviderInfo(sparkSession)
-
- try {
-
- sparkSession.conf.set(GimelConstants.GIMEL_KAFKA_VERSION, GimelConstants.GIMEL_KAFKA_VERSION_ONE)
- val options = queryUtils.getOptions(sparkSession)._2
- val batchInterval = options(KafkaConfigs.defaultBatchInterval).toInt
- val streamRate = options(KafkaConfigs.maxRatePerPartitionKey)
- val isBackPressureEnabled = options(KafkaConfigs.isBackPressureEnabledKey)
- val isClearCheckPointEnabled = options(KafkaConfigs.kafkaConsumerClearCheckpointKey).toBoolean
- val isSaveCheckPointEnabled = options(KafkaConfigs.kafkaConsumerReadCheckpointKey).toBoolean
- val isStreamFailureBeyondThreshold = options.getOrElse(KafkaConfigs.isStreamBatchSwitchEnabledKey, "false").toBoolean
- val streamFailureThresholdPerSecond = options.getOrElse(KafkaConfigs.failStreamThresholdKey, "1200").toInt
- val streamFailureWindowFactor = options.getOrElse(KafkaConfigs.streamFailureWindowFactorKey, "10").toString.toInt
- val isStreamParallel = options(KafkaConfigs.isStreamParallelKey)
- val streamParallels = options(KafkaConfigs.streamParallelKey)
- val streamawaitTerminationOrTimeout = options(KafkaConfigs.streamaWaitTerminationOrTimeoutKey).toLong
- val sc = sparkSession.sparkContext
- val sqlContext = sparkSession.sqlContext
- val conf = new org.apache.spark.SparkConf()
- val ssc = new StreamingContext(sc, Seconds(batchInterval))
- val listner: GimelStreamingListener = new GimelStreamingListener(conf)
- ssc.addStreamingListener(listner)
- logger.debug(
- s"""
- |isStreamParallel --> $isStreamParallel
- |streamParallels --> $streamParallels
- """.stripMargin)
- ssc.sparkContext.getConf
- .set(KafkaConfigs.isBackPressureEnabledKey, isBackPressureEnabled)
- .set(KafkaConfigs.streamMaxRatePerPartitionKey, streamRate)
- .set(KafkaConfigs.isStreamParallelKey, isStreamParallel)
- .set(KafkaConfigs.streamParallelKey, streamParallels)
- val dataStream = DataStream(ssc)
- val sourceTables = getTablesFrom(sql)
- val kafkaTables = sourceTables.filter { table =>
- DataSetUtils.getSystemType(table, sparkSession, options) == DataSetType.KAFKA
- }
- if (kafkaTables.isEmpty) {
- throw new Exception("ERROR --> No Kafka Type DataSet In the Query To Stream !")
- } else {
- val tmpKafkaTable = pCatalogStreamingKafkaTmpTableName
- val newSQL = sql.replaceAll(kafkaTables.head, tmpKafkaTable)
- val streamingResult: StreamingResult = dataStream.read(kafkaTables.head, options)
- if (isClearCheckPointEnabled) streamingResult.clearCheckPoint("Clearing CheckPoint As Requested By User")
- try {
- streamingResult.dStream.foreachRDD { (rdd, time) =>
- printStats(time, listner)
- val count = rdd.count()
- if (count > 0) {
- if (isStreamFailureBeyondThreshold) {
- if ((count / batchInterval) > streamFailureThresholdPerSecond) throw new Exception(s"Current Messages Per Second : ${count / batchInterval} exceeded Supplied Stream Capacity ${streamFailureThresholdPerSecond}")
- else logger.info(s"Current Messages Per Second : ${count / batchInterval} within Supplied Stream Capacity ${streamFailureThresholdPerSecond}")
- }
- val failureThreshold = (batchInterval * streamFailureWindowFactor)
- val totalDelay = (listner.totalDelay / 1000)
- if (totalDelay > failureThreshold) {
- throw new Exception(
- s"""Current Total_Delay:$totalDelay exceeded $failureThreshold
-If mode=intelligent, restarting will first run in batch mode to catch up, then automatically migrate back to stream mode!
- """.stripMargin
- )
- } else logger.info(s"Current Total_Delay:$totalDelay within $failureThreshold ")
- streamingResult.getCurrentCheckPoint(rdd)
- streamingResult.getAsDF(sqlContext, rdd).registerTempTable(tmpKafkaTable)
- try {
- executeBatch(newSQL, sparkSession)
- } catch {
- case ex: Throwable =>
- // logger.error(s"Stream Query Failed in function : $MethodName. Error --> \n\n${ex.getStackTraceString}")
- // ex.printStackTrace()
- // logger.error("Force - Stopping Streaming Context")
- ssc.sparkContext.stop()
- throw ex
- }
- try {
- if (isSaveCheckPointEnabled) streamingResult.saveCurrentCheckPoint()
- if (isClearCheckPointEnabled) streamingResult.clearCheckPoint("Clearing CheckPoint as Requested by User")
- }
- catch {
- case ex: Throwable =>
- // logger.error("Error in CheckPoint Operations in Streaming.")
- // ex.printStackTrace()
- ssc.sparkContext.stop()
- }
- }
- }
- } catch {
- case ex: Throwable =>
- // logger.error(s"ERROR In Streaming Window --> \n\n${ex.getStackTraceString}")
- // ex.printStackTrace()
- ssc.sparkContext.stop()
- throw ex
- }
- dataStream.streamingContext.start()
- dataStream.streamingContext.awaitTerminationOrTimeout(streamawaitTerminationOrTimeout)
- dataStream.streamingContext.stop(false, true)
-
- // push to logger
- logger.logMethodAccess(sparkSession.sparkContext.getConf.getAppId
- , sparkSession.conf.get("spark.app.name")
- , this.getClass.getName
- , KafkaConstants.gimelAuditRunTypeStream
- , yarnCluster
- , user
- , toLogFriendlyString(s"${yarnCluster}/${user}/${sparkAppName}")
- , MethodName
- , sql
- , scala.collection.mutable.Map("sql" -> sql)
- , GimelConstants.SUCCESS
- , GimelConstants.EMPTY_STRING
- , GimelConstants.EMPTY_STRING
- )
- "Success"
- }
-
- }
- catch {
- case e: Throwable =>
-
- logger.logMethodAccess(sparkSession.sparkContext.getConf.getAppId
- , sparkSession.conf.get("spark.app.name")
- , this.getClass.getName
- , KafkaConstants.gimelAuditRunTypeStream
- , yarnCluster
- , user
- , toLogFriendlyString(s"${yarnCluster}/${user}/${sparkAppName}")
- , MethodName
- , sql
- , scala.collection.mutable.Map("sql" -> sql)
- , GimelConstants.FAILURE
- , e.toString + "\n" + e.getStackTraceString
- , GimelConstants.UNKNOWN_STRING
- )
-
- // throw error to console
- logger.throwError(e.toString)
-
- throw e
- }
-
- }
-
- /**
- * Core Function that will be called from SCAAS for executing a SQL
- * Executes the executeBatch function in streaming window
- *
- * @param sql SQL String from client
- * @param sparkSession : SparkSession
- * @return Resulting String < either sample data for select queries, or "success" / "failed" for insert queries
- */
-
- def executeStream2(sql: String, sparkSession: SparkSession): String = {
- def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName
-
- logger.info(" @Begin --> " + MethodName)
- logger.setSparkVersion(sparkSession.version)
- val sparkAppName = sparkSession.conf.get("spark.app.name")
-
- // Set gimel log level and flag to audit logs to kafka
- DataSetUtils.setGimelLogLevel(sparkSession, logger)
-
- // At Run Time - Set the Catalog Provider and The Name Space of the Catalog (like the Hive DB Name when catalog Provider = HIVE)
- setCatalogProviderInfo(sparkSession)
-
- try {
-
- val options = queryUtils.getOptions(sparkSession)._2
- val batchInterval = options(KafkaConfigs.defaultBatchInterval).toInt
- val triggerInterval = options.getOrElse(GimelConstants.GIMEL_STREAMING_TRIGGER_INTERVAL, "").toString
- val isClearCheckPointEnabled = options(KafkaConfigs.kafkaConsumerClearCheckpointKey).toBoolean
- val sc = sparkSession.sparkContext
- val conf = new org.apache.spark.SparkConf()
- val ssc = new StreamingContext(sc, Seconds(batchInterval))
- val listener: GimelStreamingListener = new GimelStreamingListener(conf)
- ssc.addStreamingListener(listener)
- val dataStream = DataStream2(sparkSession)
- val sourceTables = getTablesFrom(sql)
- val targetTable = getTargetTables(sql)
- val kafkaTables = sourceTables.filter { table =>
- val dataSetType = DataSetUtils.getSystemType(table, sparkSession, options)
- (dataSetType == DataSetType.KAFKA || dataSetType == DataSetType.KAFKA2)
- }
- if (kafkaTables.isEmpty) {
- throw new Exception("ERROR --> No Kafka Type DataSet In the Query To Stream !")
- } else {
- val tmpKafkaTable = pCatalogStreamingKafkaTmpTableName
- val selectSQL = getSelectClause(sql)
- val newSQL = selectSQL.toLowerCase().replaceAll(kafkaTables.head, tmpKafkaTable)
- val datasetProps = CatalogProvider.getDataSetProperties(kafkaTables.head, options)
- /*
- * Sets the appropriate deserializer class based on the kafka.message.value.type and value.serializer properties
- * This is mainly required for backward compatibility for KAFKA datasets
- */
- val newOptions = GimelSerdeUtils.setGimelDeserializer(sparkSession, datasetProps, options, true)
- val streamingResult: StructuredStreamingResult = dataStream.read(kafkaTables.head, newOptions)
- val streamingDF = streamingResult.df
- streamingDF.createOrReplaceTempView(tmpKafkaTable)
-
- val streamingSQLDF = sparkSession.sql(newSQL)
- var writer: StreamingQuery = null
- try {
- val datastreamWriter = targetTable match {
- case Some(target) =>
- val datasetProps = CatalogProvider.getDataSetProperties(target, options)
- /*
- * Sets the appropriate serializer class based on the kafka.message.value.type and value.serializer properties
- * This is mainly required for backward compatibility for KAFKA datasets
- */
- val newOptions = GimelSerdeUtils.setGimelSerializer(sparkSession, datasetProps, options, true)
- dataStream.write(target, streamingSQLDF, newOptions)
- case _ =>
- streamingSQLDF
- .writeStream
- .outputMode("append")
- .format("console")
- }
-
- writer = if (triggerInterval.isEmpty) {
- datastreamWriter.start()
- } else {
- datastreamWriter
- .trigger(Trigger.ProcessingTime(triggerInterval + " seconds"))
- .start()
- }
-
- } catch {
- case ex: Throwable =>
- // logger.error(s"ERROR In Streaming Window --> \n\n${ex.getStackTraceString}")
- // ex.printStackTrace()
- if (writer != null) {
- writer.stop
- }
- throw ex
- }
-
- // push to logger
- logger.logMethodAccess(sparkSession.sparkContext.getConf.getAppId
- , sparkSession.conf.get("spark.app.name")
- , this.getClass.getName
- , KafkaConstants.gimelAuditRunTypeStream
- , yarnCluster
- , user
- , toLogFriendlyString(s"${yarnCluster}/${user}/${sparkAppName}")
- , MethodName
- , sql
- , scala.collection.mutable.Map("sql" -> sql)
- , GimelConstants.SUCCESS
- , GimelConstants.EMPTY_STRING
- , GimelConstants.EMPTY_STRING
- )
- "Success"
- }
-
- }
- catch {
- case e: Throwable =>
-
- logger.logMethodAccess(sparkSession.sparkContext.getConf.getAppId
- , sparkSession.conf.get("spark.app.name")
- , this.getClass.getName
- , KafkaConstants.gimelAuditRunTypeStream
- , yarnCluster
- , user
- , toLogFriendlyString(s"${yarnCluster}/${user}/${sparkAppName}")
- , MethodName
- , sql
- , scala.collection.mutable.Map("sql" -> sql)
- , GimelConstants.FAILURE
- , e.toString + "\n" + e.getStackTraceString
- , GimelConstants.UNKNOWN_STRING
- )
-
- // throw error to console
- logger.throwError(e.toString)
-
- throw e
- }
-
- }
-
- /**
- * Core Function that will be called from SCAAS for executing a SQL
- *
- * @return RDD[Resulting String < either sample data for select queries, or "success" / "failed" for insert queries]
- */
- def executeBatchSparkMagic: (String, SparkSession) => RDD[String] = executeBatchSparkMagicRDD
-
- /**
- * Core Function that will be called from SCAAS for executing a SQL
- * Executes the executeBatchSparkMagicRDD function in streaming window
- *
- * @return RDD[Resulting String] < either sample data for select queries, or "success" / "failed" for insert queries
- */
- def executeStreamSparkMagic: (String, SparkSession) => RDD[String] = executeStreamSparkMagicRDD
-
- /**
- * Core Function that will be called from SCAAS for executing a SQL
- *
- * @param sql SQL String supplied by client
- * @param sparkSession : SparkSession
- * @return RDD[Resulting String < either sample data for select queries, or "success" / "failed" for insert queries]
- */
- def executeBatchSparkMagicRDD(sql: String, sparkSession: SparkSession): RDD[String] = {
- def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName
-
- logger.info(" @Begin --> " + MethodName)
- logger.setSparkVersion(sparkSession.version)
-
- // Set gimel log level and flag to audit logs to kafka
- DataSetUtils.setGimelLogLevel(sparkSession, logger)
-
- val sparkAppName = sparkSession.conf.get("spark.app.name")
-
- // At Run Time - Set the Catalog Provider and The Name Space of the Catalog (like the Hive DB Name when catalog Provider = HIVE)
- setCatalogProviderInfo(sparkSession)
-
- try {
-
- val options = queryUtils.getOptions(sparkSession)._2
-
- var resultingRDD: RDD[String] = sparkSession.sparkContext.parallelize(Seq(""))
- val queryTimer = Timer()
- val startTime = queryTimer.start
- val isCheckPointEnabled = options(KafkaConfigs.kafkaConsumerReadCheckpointKey).toBoolean
- val isClearCheckPointEnabled = options(KafkaConfigs.kafkaConsumerClearCheckpointKey).toBoolean
- logger.debug(s"Is CheckPointing Requested By User --> ${
- isCheckPointEnabled
- }")
- val dataSet: DataSet = DataSet(sparkSession)
- val (originalSQL, destination, selectSQL, kafkaDataSets, queryPushDownFlag) = resolveSQL(sql, sparkSession, dataSet)
- Try(executeResolvedQuerySparkMagic(originalSQL, destination, selectSQL, sparkSession, dataSet, queryPushDownFlag)) match {
- case Success(result) =>
- resultingRDD = result
- case Failure(e) =>
- resultingRDD = sparkSession.sparkContext.parallelize(Seq(
- s"""{"Batch Query Error" : "${
- e.getStackTraceString
- }" """))
- val resultMsg = resultingRDD.collect().mkString("\n")
- // logger.error(resultMsg)
- throw new Exception(resultMsg)
- }
- if (isCheckPointEnabled) {
- saveCheckPointforKafka(kafkaDataSets)
- }
- if (isClearCheckPointEnabled) {
- clearCheckPointforKafka(kafkaDataSets)
- }
-
- // push logs to ES
- logger.logMethodAccess(sparkSession.sparkContext.getConf.getAppId
- , sparkSession.conf.get("spark.app.name")
- , this.getClass.getName
- , KafkaConstants.gimelAuditRunTypeStream
- , yarnCluster
- , user
- , toLogFriendlyString(s"${yarnCluster}/${user}/${sparkAppName}")
- , MethodName
- , sql
- , scala.collection.mutable.Map("sql" -> sql)
- , GimelConstants.SUCCESS
- , GimelConstants.EMPTY_STRING
- , GimelConstants.EMPTY_STRING
- )
- resultingRDD
- }
- catch {
- case e: Throwable =>
-
- logger.logMethodAccess(sparkSession.sparkContext.getConf.getAppId
- , sparkSession.conf.get("spark.app.name")
- , this.getClass.getName
- , KafkaConstants.gimelAuditRunTypeStream
- , yarnCluster
- , user
- , toLogFriendlyString(s"${yarnCluster}/${user}/${sparkAppName}")
- , MethodName
- , sql
- , scala.collection.mutable.Map("sql" -> sql)
- , GimelConstants.FAILURE
- , e.toString + "\n" + e.getStackTraceString
- , GimelConstants.UNKNOWN_STRING
- )
- // throw error to console
- logger.throwError(e.toString)
-
- throw e
- }
- }
-
-
- def saveCheckPointforKafka(kafkaDataSets: List[GimelDataSet]): Unit = {
- kafkaDataSets.foreach {
- case kafka: com.paypal.gimel.kafka.DataSet =>
- kafka.saveCheckPoint()
- case kafka2: com.paypal.gimel.kafka2.DataSet =>
- kafka2.saveCheckPoint()
- }
-
- }
-
-
- def clearCheckPointforKafka(kafkaDataSets: List[GimelDataSet]): Unit = {
- kafkaDataSets.foreach {
- case kafka: com.paypal.gimel.kafka.DataSet =>
- kafka.clearCheckPoint()
- case kafka2: com.paypal.gimel.kafka2.DataSet =>
- kafka2.clearCheckPoint()
- }
- }
-
- /**
- * Core Function that will be called from SCAAS for executing a SQL
- * Executes the executeBatchSparkMagicRDD function in streaming window
- *
- * @param sql SQL String from client
- * @param sparkSession : SparkSession
- * @return RDD[Resulting String] < either sample data for select queries, or "success" / "failed" for insert queries
- */
-
- def executeStreamSparkMagicRDD(sql: String, sparkSession: SparkSession): RDD[String] = {
- def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName
-
- logger.info(" @Begin --> " + MethodName)
- logger.setSparkVersion(sparkSession.version)
-
- // Set gimel log level and flag to audit logs to kafka
- DataSetUtils.setGimelLogLevel(sparkSession, logger)
-
- val sparkAppName = sparkSession.conf.get("spark.app.name")
-
- // At Run Time - Set the Catalog Provider and The Name Space of the Catalog (like the Hive DB Name when catalog Provider = HIVE)
- setCatalogProviderInfo(sparkSession)
-
- try {
-
- sparkSession.conf.set(GimelConstants.GIMEL_KAFKA_VERSION, GimelConstants.GIMEL_KAFKA_VERSION_ONE)
- val options = getOptions(sparkSession)._2
-
- val batchInterval = options(KafkaConfigs.defaultBatchInterval).toInt
- val streamRate = options(KafkaConfigs.maxRatePerPartitionKey)
- val isBackPressureEnabled = options(KafkaConfigs.isBackPressureEnabledKey)
- val isClearCheckPointEnabled = options(KafkaConfigs.kafkaConsumerClearCheckpointKey).toBoolean
- val isSaveCheckPointEnabled = options(KafkaConfigs.kafkaConsumerReadCheckpointKey).toBoolean
- val isStreamParallel = options(KafkaConfigs.isStreamParallelKey)
- val sc = sparkSession.sparkContext
- val sqlContext = sparkSession.sqlContext
- val ssc = new StreamingContext(sc, Seconds(batchInterval))
- val listner: GimelStreamingListener = new GimelStreamingListener(sc.getConf)
- ssc.addStreamingListener(listner)
- ssc.sparkContext.getConf
- .set(KafkaConfigs.isBackPressureEnabledKey, isBackPressureEnabled)
- .set(KafkaConfigs.streamMaxRatePerPartitionKey, streamRate)
- .set(KafkaConfigs.isStreamParallelKey, isStreamParallel)
- val dataStream = DataStream(ssc)
- val sourceTables = getTablesFrom(sql)
- val kafkaTables = sourceTables.filter { table =>
- val dataSetProperties: DataSetProperties =
- CatalogProvider.getDataSetProperties(table, options)
- DataSetUtils.getSystemType(dataSetProperties) == DataSetType.KAFKA
- }
- val data = if (kafkaTables.isEmpty) {
- throw new Exception("ERROR --> No Kafka Type DataSet In the Query To Stream !")
- } else {
- try {
- val tmpKafkaTable = pCatalogStreamingKafkaTmpTableName
- val newSQL = sql.replaceAll(kafkaTables.head, tmpKafkaTable)
- val streamingResult: StreamingResult = dataStream.read(kafkaTables.head, options)
- if (isClearCheckPointEnabled) streamingResult.clearCheckPoint("Clearing CheckPoint As Requested By User")
- streamingResult.dStream.foreachRDD {
- (rdd, time) =>
- printStats(time, listner)
- val k: RDD[WrappedData] = rdd
- val count = rdd.count()
- if (count > 0) {
- streamingResult.getCurrentCheckPoint(rdd)
- streamingResult.getAsDF(sqlContext, rdd).registerTempTable(tmpKafkaTable)
- try {
- executeBatchSparkMagicRDD(newSQL, sparkSession)
- }
- catch {
- case ex: Throwable =>
- // logger.error(s"Stream Query Failed in function : $MethodName. Error --> \n\n${ex.getStackTraceString}")
- // ex.printStackTrace()
- // logger.error("Force - Stopping Streaming Context")
- ssc.sparkContext.stop()
- }
- try {
- if (isSaveCheckPointEnabled) streamingResult.saveCurrentCheckPoint()
- if (isClearCheckPointEnabled) streamingResult.clearCheckPoint("Clearing CheckPoint as Requested by User")
- }
- catch {
- case ex: Throwable =>
- // logger.error("Error in CheckPoint Operations in Streaming.")
- // ex.printStackTrace()
- ssc.sparkContext.stop()
- }
- }
- }
- dataStream.streamingContext.start()
- dataStream.streamingContext.awaitTermination()
- dataStream.streamingContext.sparkContext.parallelize(Seq(s"""{"Query" : "Running..." }"""))
- } catch {
- case ex: Throwable =>
- ex.printStackTrace()
- val msg =
- s"""{"Error" : "${
- ex.getStackTraceString
- }" }"""
- dataStream.streamingContext.stop()
- // dataStream.streamingContext.spark.parallelize(Seq(s"""{"Error" : "${ex.getStackTraceString}" }"""))
- throw new Exception(msg)
- }
- }
-
- // push logs to ES
- logger.logMethodAccess(sparkSession.sparkContext.getConf.getAppId
- , sparkSession.conf.get("spark.app.name")
- , this.getClass.getName
- , KafkaConstants.gimelAuditRunTypeStream
- , yarnCluster
- , user
- , toLogFriendlyString(s"${yarnCluster}/${user}/${sparkAppName}")
- , MethodName
- , sql
- , scala.collection.mutable.Map("sql" -> sql)
- , GimelConstants.SUCCESS
- , GimelConstants.EMPTY_STRING
- , GimelConstants.EMPTY_STRING
- )
-
- data
- }
- catch {
- case e: Throwable =>
-
- logger.logMethodAccess(sparkSession.sparkContext.getConf.getAppId
- , sparkSession.conf.get("spark.app.name")
- , this.getClass.getName
- , KafkaConstants.gimelAuditRunTypeStream
- , yarnCluster
- , user
- , toLogFriendlyString(s"${yarnCluster}/${user}/${sparkAppName}")
- , MethodName
- , sql
- , scala.collection.mutable.Map("sql" -> sql)
- , GimelConstants.FAILURE
- , e.toString + "\n" + e.getStackTraceString
- , GimelConstants.UNKNOWN_STRING
- )
-
- // throw error to console
- logger.throwError(e.toString)
-
- throw e
- }
-
- }
-
- /**
- * Core Function that will be called from SCAAS for executing a SQL
- *
- * @param sql SQL String supplied by client
- * @param sparkSession : SparkSession
- * @return Resulting String < either sample data for select queries, or "success" / "failed" for insert queries
- */
-
- @deprecated
- def executeBatchSparkMagicJSON(sql: String, sparkSession: SparkSession): String = {
- def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName
-
- logger.info(" @Begin --> " + MethodName)
- val sparkAppName = sparkSession.conf.get("spark.app.name")
-
- // Set gimel log level and flag to audit logs to kafka
- DataSetUtils.setGimelLogLevel(sparkSession, logger)
-
- try {
- val options = queryUtils.getOptions(sparkSession)._2
- var resultSet = ""
- val queryTimer = Timer()
- val startTime = queryTimer.start
- val isCheckPointEnabled = options(KafkaConfigs.kafkaConsumerReadCheckpointKey).toBoolean
- val isClearCheckPointEnabled = options(KafkaConfigs.kafkaConsumerClearCheckpointKey).toBoolean
- logger.debug(s"Is CheckPointing Requested By User --> ${
- isCheckPointEnabled
- }")
- val dataSet: DataSet = DataSet(sparkSession)
- val (originalSQL, destination, selectSQL, kafkaDataSets, queryPushDownFlag) = resolveSQL(sql, sparkSession, dataSet)
- Try(executeResolvedQuerySparkMagic(originalSQL, destination, selectSQL, sparkSession, dataSet, queryPushDownFlag)) match {
- case Success(result) =>
- resultSet =
- s"""{"Batch Query Result" : "${
- result.collect().mkString("[", ",", "]")
- } }"""
- case Failure(e) =>
- resultSet =
- s"""{"Batch Query Error" : "${
- e.getStackTraceString
- }" """
- // logger.error(resultSet)
- throw new Exception(resultSet)
- }
-
- if (isCheckPointEnabled) {
- saveCheckPointforKafka(kafkaDataSets)
- }
- if (isClearCheckPointEnabled) {
- clearCheckPointforKafka(kafkaDataSets)
- }
-
- // push logs to ES
- logger.logMethodAccess(sparkSession.sparkContext.getConf.getAppId
- , sparkSession.conf.get("spark.app.name")
- , this.getClass.getName
- , KafkaConstants.gimelAuditRunTypeStream
- , yarnCluster
- , user
- , toLogFriendlyString(s"${yarnCluster}/${user}/${sparkAppName}")
- , MethodName
- , sql
- , scala.collection.mutable.Map("sql" -> sql)
- , GimelConstants.SUCCESS
- , GimelConstants.EMPTY_STRING
- , GimelConstants.EMPTY_STRING
- )
- resultSet
- }
- catch {
- case e: Throwable =>
-
- logger.logMethodAccess(sparkSession.sparkContext.getConf.getAppId
- , sparkSession.conf.get("spark.app.name")
- , this.getClass.getName
- , KafkaConstants.gimelAuditRunTypeStream
- , yarnCluster
- , user
- , toLogFriendlyString(s"${yarnCluster}/${user}/${sparkAppName}")
- , MethodName
- , sql
- , scala.collection.mutable.Map("sql" -> sql)
- , GimelConstants.FAILURE
- , e.toString + "\n" + e.getStackTraceString
- , GimelConstants.UNKNOWN_STRING
- )
-
- // throw error to console
- logger.throwError(e.toString)
-
- throw e
- }
-
- }
-
- /**
- * Core Function that will be called from SCAAS for executing a SQL
- * Executes the @executeBatchSparkMagicJSON function in streaming window
- *
- * @param sql SQL String from client
- * @param sparkSession : SparkSession
- * @return Resulting String < either sample data for select queries, or "success" / "failed" for insert queries
- */
-
- @deprecated
- def executeStreamSparkMagicJSON(sql: String, sparkSession: SparkSession): String = {
- def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName
-
- logger.info(" @Begin --> " + MethodName)
- val sparkAppName = sparkSession.conf.get("spark.app.name")
- var returnMsg = ""
-
- // Set gimel log level and flag to audit logs to kafka
- DataSetUtils.setGimelLogLevel(sparkSession, logger)
-
- try {
- sparkSession.conf.set(GimelConstants.GIMEL_KAFKA_VERSION, GimelConstants.GIMEL_KAFKA_VERSION_ONE)
- val options = queryUtils.getOptions(sparkSession)._2
- val batchInterval = options(KafkaConfigs.defaultBatchInterval).toInt
- val streamRate = options(KafkaConfigs.maxRatePerPartitionKey)
- val isBackPressureEnabled = options(KafkaConfigs.isBackPressureEnabledKey)
- val isClearCheckPointEnabled = options(KafkaConfigs.kafkaConsumerClearCheckpointKey).toBoolean
- val isSaveCheckPointEnabled = options(KafkaConfigs.kafkaConsumerReadCheckpointKey).toBoolean
- val isStreamParallel = options(KafkaConfigs.isStreamParallelKey)
- val streamParallels = options(KafkaConfigs.streamParallelKey)
- val sc = sparkSession.sparkContext
- val sqlContext = sparkSession.sqlContext
- val ssc = new StreamingContext(sc, Seconds(batchInterval))
- logger.debug(
- s"""
- |isStreamParallel --> $isStreamParallel
- |streamParallels --> $streamParallels
- """.stripMargin)
- ssc.sparkContext.getConf
- .set(KafkaConfigs.isBackPressureEnabledKey, isBackPressureEnabled)
- .set(KafkaConfigs.streamMaxRatePerPartitionKey, streamRate)
- .set(KafkaConfigs.isStreamParallelKey, isStreamParallel)
- .set(KafkaConfigs.streamParallelKey, streamParallels)
- val dataStream = DataStream(ssc)
- val sourceTables = getTablesFrom(sql)
- val kafkaTables = sourceTables.filter { table =>
- val dataSetProperties: DataSetProperties =
- CatalogProvider.getDataSetProperties(table, options)
- DataSetUtils.getSystemType(dataSetProperties) == DataSetType.KAFKA
- }
- if (kafkaTables.isEmpty) {
- throw new Exception("ERROR --> No Kafka Type DataSet In the Query To Stream !")
- } else {
- val tmpKafkaTable = pCatalogStreamingKafkaTmpTableName
- val newSQL = sql.replaceAll(kafkaTables.head, tmpKafkaTable)
- val streamingResult: StreamingResult = dataStream.read(kafkaTables.head, options)
- if (isClearCheckPointEnabled) streamingResult.clearCheckPoint("Clearing CheckPoint As Requested By User")
- try {
- streamingResult.dStream.foreachRDD {
- rdd =>
- val k: RDD[WrappedData] = rdd
- val count = rdd.count()
- if (count > 0) {
- streamingResult.getCurrentCheckPoint(rdd)
- streamingResult.convertAvroToDF(sqlContext, streamingResult.convertBytesToAvro(rdd)).registerTempTable(tmpKafkaTable)
- try {
- executeBatchSparkMagicJSON(newSQL, sparkSession)
- if (isSaveCheckPointEnabled) streamingResult.saveCurrentCheckPoint()
- if (isClearCheckPointEnabled) streamingResult.clearCheckPoint("Clearing CheckPoint as Requested by User")
- } catch {
- case ex: Throwable =>
- returnMsg =
- s"""{ "Stream Query Error" : "${
- ex.getStackTraceString
- }" } """
- // logger.error(returnMsg)
- // ex.printStackTrace()
- logger.warning("Force - Stopping Streaming Context")
- ssc.sparkContext.stop()
- throw new Exception(returnMsg)
- }
- }
- }
- } catch {
- case ex: Throwable =>
- returnMsg =
- s"""{ "Stream Query ERROR" : "${
- ex.getStackTraceString
- }" } """
- // logger.error(returnMsg)
- // ex.printStackTrace()
- logger.warning("Force - Stopping Streaming Context")
- ssc.sparkContext.stop()
- throw new Exception(returnMsg)
- }
- dataStream.streamingContext.start()
- dataStream.streamingContext.awaitTermination()
- dataStream.streamingContext.stop()
- returnMsg = s"""{"Stream Query" : "SUCCESS"} """
- }
-
- // push logs to ES
- logger.logMethodAccess(sparkSession.sparkContext.getConf.getAppId
- , sparkSession.conf.get("spark.app.name")
- , this.getClass.getName
- , KafkaConstants.gimelAuditRunTypeStream
- , yarnCluster
- , user
- , toLogFriendlyString(s"${yarnCluster}/${user}/${sparkAppName}")
- , MethodName
- , sql
- , scala.collection.mutable.Map("sql" -> sql)
- , GimelConstants.SUCCESS
- , GimelConstants.EMPTY_STRING
- , GimelConstants.EMPTY_STRING
- )
-
- returnMsg
- }
- catch {
- case e: Throwable =>
-
- logger.logMethodAccess(sparkSession.sparkContext.getConf.getAppId
- , sparkSession.conf.get("spark.app.name")
- , this.getClass.getName
- , KafkaConstants.gimelAuditRunTypeStream
- , yarnCluster
- , user
- , toLogFriendlyString(s"${yarnCluster}/${user}/${sparkAppName}")
- , MethodName
- , sql
- , scala.collection.mutable.Map("sql" -> sql)
- , GimelConstants.FAILURE
- , e.toString + "\n" + e.getStackTraceString
- , GimelConstants.UNKNOWN_STRING
- )
-
- // throw error to console
- logger.throwError(e.toString)
-
- throw e
- }
-
- }
-
- private def toLogFriendlyString(str: String): String = {
- str.replaceAllLiterally("/", "_").replaceAllLiterally(" ", "-")
- }
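-
- // e.g. toLogFriendlyString("clusterA/someUser/my app") returns "clusterA_someUser_my-app"
- // (hypothetical values; '/' becomes '_' and ' ' becomes '-')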
-
- /**
- * handleDDLs routes the incoming DDL to the respective dataset create/drop/truncate operation
- *
- * @param sql - SQL that is passed to create/drop/delete
- * @param sparkSession - spark session
- * @param dataSet - dataset object
- * @param options - Map of options
- * @return
- */
- def handleDDLs(sql: String, sparkSession: SparkSession, dataSet: DataSet, options: Map[String, String]): Unit = {
- val uniformSQL = sql.replace("\n", " ")
- val sqlParts: Array[String] = uniformSQL.split(" ")
- // remove all additional white spaces in the DDL statement
- val newSql = sqlParts.filter(x => !x.isEmpty).mkString(" ")
- val newSqlParts = newSql.split(" ")
- sqlParts.head.toUpperCase match {
- // We have two "create ddl" paths. One with full create (plain) statement provided by the user
- // the other where we have to construct from the dataframe after running select clause in given sql/ddl
- // create table db.tablename(x int, y varchar(10) will be handled by handlePlainCreateDDL funcation
- // create table db.tablename tblproperties("table_type":"SET") as select * from another_table.
- case QueryConstants.DDL_CREATE_STRING => {
- val index = sqlParts.indexWhere(_.toLowerCase().contains(GimelConstants.UDC_STRING))
- // Find out whether select is part of the create statement
- val isHavingSelect = QueryParserUtils.isHavingSelect(sql)
- isHavingSelect match {
- case true => handleSelectDDL(newSqlParts, newSql, dataSet, options, sparkSession)
- case false => handlePlainCreateDDL(newSqlParts, dataSet, options, sparkSession)
- }
- }
- // following case will cover DROP DDL
- case QueryConstants.DDL_DROP_STRING => {
- val dataSetName = newSqlParts(2)
- dataSet.drop(dataSetName, options)
- }
- // following case will cover TRUNCATE DDL
- case QueryConstants.DDL_TRUNCATE_STRING => {
- val dataSetName = newSqlParts(2)
- dataSet.truncate(dataSetName, options)
- }
- // following case will cover both DELETE AND DELETE FROM DDL
- case QueryConstants.DDL_DELETE_STRING => {
- val dataSetName = newSqlParts.map(_.toUpperCase()).contains(QueryConstants.DDL_FROM_STRING) match {
- case true => newSqlParts(2)
- case _ => newSqlParts(1)
- }
- dataSet.truncate(dataSetName, options)
- }
- case _ => throw new Exception("Unexpected path at runtime. We should not arrive at this location !")
- }
- }
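-
- // Routing sketch (dataset names are hypothetical): "DROP TABLE udc.some.dataset" resolves
- // newSqlParts(2) as the dataset and goes to dataSet.drop(...); "TRUNCATE TABLE udc.some.dataset"
- // goes to dataSet.truncate(...); "DELETE FROM udc.some.dataset" and "DELETE udc.some.dataset"
- // both map to dataSet.truncate(...).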
-
- /**
- * handleSelectDDL -
- * Strip out the select statement
- * Run the sql using executeBatch and get the data frame back
- * Get the schema from data frame and pass it in options
- * Strip out the table properties and pass it in options
- * Create the object/table
- * Call dataSet.Write to the object/table that got created
- *
- * @param sqlParts - each word in the sql comes as array
- * @param sql - the full sql query
- * @param dataSet - dataset Object itself
- * @param options - options comings from user
- * @param sparkSession - Spark session
- * @return
- */
- def handleSelectDDL(sqlParts: Array[String], sql: String, dataSet: DataSet, options: Map[String, String], sparkSession: SparkSession): Unit = {
- val selectIndex = sqlParts.indexWhere(_.toUpperCase().contains(QueryConstants.SQL_SELECT_STRING))
- val selectClause = sqlParts.slice(selectIndex, sqlParts.length).mkString(" ")
- val pcatalogIndex = sqlParts.indexWhere(_.toLowerCase().contains(GimelConstants.UDC_STRING))
- val datasetname = sqlParts(pcatalogIndex)
-
- // Run the Select statement and get the results in a dataframe
- val selectDF = executeBatch(selectClause, sparkSession)
- val schema: Array[StructField] = selectDF.schema.fields
-
- // Check if a 'PARTITIONED BY' clause is present in the sql. If so, extract the partition fields so they can be used when building the CREATE TABLE statement.
- val partitionFields: Array[com.paypal.gimel.common.catalog.Field] = existsPartitionedByClause(sql) match {
- case true => getPartitionsFields(sql)
- case _ => Array[com.paypal.gimel.common.catalog.Field]()
- }
-
- val newOptions: Map[String, Any] = options ++ Map[String, Any](GimelConstants.TABLE_FILEDS -> schema, GimelConstants.CREATE_STATEMENT_IS_PROVIDED -> "false", GimelConstants.TABLE_SQL -> sql, GimelConstants.HIVE_DDL_PARTITIONS_STR -> partitionFields)
-
- // Create the table and Write data into it from the selected dataframe
- try {
- dataSet.create(datasetname, newOptions)
- logger.info("Table/object creation success")
- dataSet.write(datasetname, selectDF, newOptions)
- } catch {
- case e: Throwable =>
- val msg = s"Error creating/writing table: ${e.getMessage}"
- throw new Exception(msg, e)
- }
- }
-
- def handlePlainCreateDDL(sqlParts: Array[String], dataSet: DataSet, options: Map[String, String], sparkSession: SparkSession): Unit = {
-
- // Since select is not part of the create statement, it has to be a full create statement
- // We need to replace the pcatalog.storagetype.storagesystem.DB.Table with DB.Table
- // So that we can pass the entire create statement as is to respective storage engines
- val index = sqlParts.indexWhere(_.toLowerCase().contains(GimelConstants.UDC_STRING))
-
- val datasetname = sqlParts(index)
- val newSQL = sqlParts.map(element => {
- if (element.toLowerCase().contains(GimelConstants.UDC_STRING + ".")) {
- // we replace pcatalog.storagetype.storagesystem.DB.Table with DB.Table
- element.split('.').tail.mkString(".").split('.').tail.mkString(".").split('.').tail.mkString(".")
- }
- else {
- element
- }
- }
- ).mkString(" ")
- val newOptions = options ++ Map[String, String](GimelConstants.TABLE_SQL -> newSQL.toString, GimelConstants.CREATE_STATEMENT_IS_PROVIDED -> "true")
- dataSet.create(datasetname, newOptions)
-
- }
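-
- // Worked example (hypothetical dataset name): handlePlainCreateDDL's triple split-and-drop turns
- // "udc.hive.clusterX.mydb.mytable" into "mydb.mytable", i.e. the first three dot-separated
- // components (catalog, storage type, storage system) are removed before handing the DDL to the store.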
-
- /** boolToDFWithErrorString will convert the boolean result to a dataframe, throwing an exception with the supplied error string on failure
- *
- * @param spark - spark session
- * @param result - boolean return from the create/drop/truncate methods
- * @param addOnString - error message used when result is false
- * @return
- */
- def boolToDFWithErrorString(spark: SparkSession, result: Boolean, addOnString: String): DataFrame = {
- val resultStr = if (result) "success" else "failure"
- import spark.implicits._
- result match {
- case false => throw new Exception(s"${addOnString}\n")
- case _ => Seq(resultStr).toDF("Query Execution")
- }
- }
-
- /** boolToDF will convert the boolean result to a dataframe
- *
- * @param spark - sparksession
- * @param result - boolean return from the create/drop/truncate methods
- * @return
- */
- def boolToDF(spark: SparkSession, result: Boolean): DataFrame = {
- val resultStr = if (result) "success" else "failure"
- import spark.implicits._
- Seq(resultStr).toDF("Query Execution")
- }
-
- /** stringToDF will convert the string result to a dataframe
- *
- * @param spark - sparksession
- * @param result - string result to be wrapped in a dataframe
- * @return
- */
- def stringToDF(spark: SparkSession, result: String): DataFrame = {
- import spark.implicits._
- Seq(result).toDF("Query Execution")
- }
-
- /**
- * From the create table SQL, parse the partitioned by clause and get all the partitions
- *
- * @param sql - Incoming sql
- * @return - Array of Fields holding the partition column names, with the data type hard-coded to String since it is not used elsewhere
- */
- def getPartitionsFields(sql: String): Array[com.paypal.gimel.common.catalog.Field] = {
- val pattern = """^.+PARTITIONED BY \((.*?)\).+""".r
- val pattern(partitions) = sql.toUpperCase()
- var fieldsList: Array[com.paypal.gimel.common.catalog.Field] = Array[com.paypal.gimel.common.catalog.Field]()
- val listParts = partitions.split(",")
- listParts.map(parts => fieldsList :+= com.paypal.gimel.common.catalog.Field(parts, "String"))
- fieldsList
- }
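-
- // Worked example (hypothetical DDL): for
- //   "create table db.t (x int) partitioned by (dt, hr) stored as orc"
- // the uppercased SQL matches the pattern and getPartitionsFields returns
- // Field("DT", "String") and Field(" HR", "String") (values are not trimmed).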
-
-
- /**
- *
- * Method to check in special checks in SQL string
- *
- * @param sql
- * @return
- */
- def vulnerabilityCheck(sql: String): Unit = {
-
- val checkFlag = if (sql.toUpperCase.contains(s"SET ${JdbcConfigs.jdbcUserName}".toUpperCase)) {
- true
- }
- else {
- false
- }
-
- if (checkFlag) {
- throw new Exception(
- s"""
- |SECURITY VIOLATION | Execution of this statement is not allowed: ${sql}
- """.stripMargin)
- }
- }
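-
- // e.g. vulnerabilityCheck(s"set ${JdbcConfigs.jdbcUserName}=someUser") throws, since overriding
- // the JDBC user through SQL is treated as a security violation ("someUser" is a hypothetical value).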
-
-}
-
diff --git a/gimel-dataapi/gimel-sql/src/main/scala/com/paypal/gimel/sql/GimelQueryUtils.scala b/gimel-dataapi/gimel-sql/src/main/scala/com/paypal/gimel/sql/GimelQueryUtils.scala
deleted file mode 100644
index 2a9e8147..00000000
--- a/gimel-dataapi/gimel-sql/src/main/scala/com/paypal/gimel/sql/GimelQueryUtils.scala
+++ /dev/null
@@ -1,1742 +0,0 @@
-/*
- * Copyright 2018 PayPal Inc.
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.paypal.gimel.sql
-
-import java.nio.charset.StandardCharsets
-import java.sql.SQLException
-import java.text.SimpleDateFormat
-import java.util.Date
-
-import scala.collection.immutable.Map
-import scala.collection.mutable
-import scala.util.{Failure, Success, Try}
-
-import com.google.common.hash.Hashing
-import org.apache.commons.lang3.ArrayUtils
-import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.{DataFrame, Row, SparkSession}
-import org.apache.spark.streaming.Time
-
-import com.paypal.gimel.common.catalog.{CatalogProvider, DataSetProperties}
-import com.paypal.gimel.common.conf.{GimelConstants, _}
-import com.paypal.gimel.common.gimelserde.GimelSerdeUtils
-import com.paypal.gimel.common.utilities.{DataSetType, DataSetUtils, GenericUtils, RandomGenerator}
-import com.paypal.gimel.common.utilities.DataSetUtils._
-import com.paypal.gimel.datasetfactory.GimelDataSet
-import com.paypal.gimel.elasticsearch.conf.ElasticSearchConfigs
-import com.paypal.gimel.hbase.conf.HbaseConfigs
-import com.paypal.gimel.hbase.utilities.HBaseUtilities
-import com.paypal.gimel.hive.conf.HiveConfigs
-import com.paypal.gimel.hive.utilities.HiveUtils
-import com.paypal.gimel.jdbc.conf.{JdbcConfigs, JdbcConstants}
-import com.paypal.gimel.jdbc.utilities._
-import com.paypal.gimel.jdbc.utilities.JdbcAuxiliaryUtilities._
-import com.paypal.gimel.jdbc.utilities.PartitionUtils.ConnectionDetails
-import com.paypal.gimel.kafka.conf.{KafkaConfigs, KafkaConstants}
-import com.paypal.gimel.logger.Logger
-import com.paypal.gimel.logging.GimelStreamingListener
-import com.paypal.gimel.parser.utilities.{QueryParserUtils, SearchCriteria, SearchSchemaUtils, SQLNonANSIJoinParser}
-
-object GimelQueryUtils {
-
- val logger: Logger = Logger(this.getClass.getName)
- /*
- * Regex for substituting a tmp table in a sql.
- * This regex matches if the key is preceded by any whitespace character - new line, tab, space -
- * and followed by a new line, tab, space, round bracket, semicolon or comma, or is at the end of the line ($$).
- */
-
- /**
- * Gets all the source tables referenced in a SQL string.
- * In order to get tables from all types of SQL, pass searchList as List("into", "view", "table", "from", "join").
- *
- * @return Seq[Tables]
- */
- def getAllTableSources(sql: String,
- searchList: Seq[SearchCriteria] = SearchSchemaUtils.ALL_TABLES_SEARCH_CRITERIA): Seq[String] = {
- def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName
- logger.info(" @Begin --> " + MethodName)
- val finalList = QueryParserUtils.getAllSourceTables(sql, searchList)
- logger.info(s"Final List of Tables --> ${finalList.mkString("[", " , ", "]")}")
- finalList
- }
-
- /**
- * Sets the Catalog Provider
- *
- * @param provider Catalog Provider, say - UDC , PCATALOG , HIVE , USER
- */
- def setCatalogProvider(provider: String): Unit = {
- def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName
- logger.info(" @Begin --> " + MethodName)
-
- logger.info(s"Supplied catalog provider --> [$provider]")
- provider.toUpperCase() match {
- case CatalogProviderConstants.HIVE_PROVIDER | CatalogProviderConstants.USER_PROVIDER =>
- catalogProvider = provider
- case CatalogProviderConstants.PCATALOG_PROVIDER =>
- logger.warning(" ************************* WARNING ************************* ")
- logger.warning(s"DEPRECATED Catalog Provider --> [${CatalogProviderConstants.PCATALOG_PROVIDER}]")
- logger.warning(s"Please migrate to Catalog Provider --> [${CatalogProviderConstants.UDC_PROVIDER}]")
- logger.warning(" ************************* WARNING ************************* ")
- catalogProvider = provider
- logger.info(s"Auto-Setting catalog provider Namespace to --> [${provider.toUpperCase}]")
- setCatalogProviderName(provider.toUpperCase)
- case CatalogProviderConstants.UDC_PROVIDER =>
- logger.info(s"Auto-Setting catalog provider Namespace to --> [${provider.toUpperCase}]")
- catalogProvider = provider
- setCatalogProviderName(provider.toUpperCase)
- case _ => logger.warning(
- s"""
- |Invalid Catalog Provider --> [${provider}]
- |Valid Options --> [ ${CatalogProviderConstants.HIVE_PROVIDER}| ${CatalogProviderConstants.UDC_PROVIDER}| ${CatalogProviderConstants.PCATALOG_PROVIDER}| ${CatalogProviderConstants.USER_PROVIDER} ]
- """.stripMargin
- )
- }
- }
-
- /**
- * Client Function to Get Catalog Provider
- *
- * @return The Catalog Provider
- */
- def getCatalogProvider(): String = {
- def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName
-
- logger.info(" @Begin --> " + MethodName)
-
- catalogProvider
- }
-
- /**
- * Sets the Catalog Provider Name
- *
- * @param providerNameSpace Catalog Provider, say - default, pcatalog, udc, any_other_hive_db_name
- */
-
- def setCatalogProviderName(providerNameSpace: String): Unit = {
- def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName
-
- logger.info(" @Begin --> " + MethodName)
-
- val catalogProvider = getCatalogProvider
- if (catalogProvider.equalsIgnoreCase(CatalogProviderConstants.HIVE_PROVIDER) |
- catalogProvider.equalsIgnoreCase(CatalogProviderConstants.USER_PROVIDER)) {
- logger.info(s"setting catalog provider Name to --> [$providerNameSpace]")
- catalogProviderNameSpace = providerNameSpace
- }
- else catalogProviderNameSpace = catalogProvider.toLowerCase()
- }
-
- /**
- * Client Function to Get Catalog Provider Name
- *
- * @return The Catalog Provider Name Space, say the hive DB name
- */
-
- def getCatalogProviderName(): String = {
- def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName
-
- logger.info(" @Begin --> " + MethodName)
-
- catalogProviderNameSpace
- }
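A minimal sketch of how the provider and namespace setters above interact (assuming CatalogProviderConstants.UDC_PROVIDER and HIVE_PROVIDER resolve to the strings "UDC" and "HIVE"):

  import com.paypal.gimel.sql.GimelQueryUtils._

  setCatalogProvider("UDC")
  getCatalogProvider()           // "UDC"
  getCatalogProviderName()       // "udc" - auto-set to the lower-cased provider

  setCatalogProvider("HIVE")
  setCatalogProviderName("default")
  getCatalogProviderName()       // "default" - for HIVE/USER the caller supplies the namespace (e.g. a Hive DB name)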
-
- /**
- * Returns the individual words from the SQL as tokens
- *
- * @param sql SqlString
- * @return String tokens
- */
- def tokenizeSql(sql: String): Array[String] = {
- def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName
- logger.info(" @Begin --> " + MethodName)
- QueryParserUtils.tokenize(sql)
- }
-
- /**
- * This search will return true if the hive query has a partitioning criterion in it.
- *
- * @param sql SqlString
- * @return true, if the query inserts into partitions of the target table.
- */
- def isQueryContainingPartitioning(sql: String): Boolean = {
- val tokens = tokenizeSql(sql.toLowerCase)
- var isHaving = false
- var tmp = ""
- tokens.foreach { token =>
- if ((tmp == "partition" & token == "(") || tmp.contains("partition(")) isHaving = true
- tmp = token
- }
- isHaving
- }
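For example (a sketch):

  import com.paypal.gimel.sql.GimelQueryUtils

  GimelQueryUtils.isQueryContainingPartitioning(
    "insert into db.tbl partition(dt) select * from src")   // true
  GimelQueryUtils.isQueryContainingPartitioning(
    "insert into db.tbl select * from src")                 // false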
-
- /**
- * Gets the Tables List from SQL
- *
- * @param sql SQL String
- * @return List of Tables
- */
- def getTablesFrom(sql: String): Array[String] = {
- def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName
-
- logger.info(" @Begin --> " + MethodName)
- val otherCatalogProvider = getCatalogProviderName().toLowerCase match {
- case GimelConstants.UDC_STRING => GimelConstants.PCATALOG_STRING
- case GimelConstants.PCATALOG_STRING => GimelConstants.UDC_STRING
- case _ => "hive"
-
- }
- val allTables = getAllTableSources(sql)
- val finalList = allTables.filter(
- token =>
- token.toLowerCase.contains(s"${getCatalogProviderName().toLowerCase}.") ||
- token.toLowerCase.contains(s"$otherCatalogProvider.")
- )
- logger.info(s"Source Catalog [udc/pcatalog] Tables from entire SQL --> ${finalList.mkString("[", " , ", "]")}")
- finalList.toArray
- }
-
- /**
- * Gets the Tables List from SQL
- *
- * @param sql SQL String
- * @return List of Tables
- */
- @deprecated
- def getTablesFrom1(sql: String): Array[String] = {
- def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName
-
- logger.info(" @Begin --> " + MethodName)
- val sqlLower = sql.toLowerCase
- val searchList = List("insert", "select", "from", "join", "where")
- var lastKey = if (searchList.contains(tokenizeSql(sqlLower).head)) {
- tokenizeSql(sqlLower).head
- } else {
- ""
- }
- var currentKey = ""
- val catalogProviderNameSpace = getCatalogProviderName.toLowerCase
- // logger.info(s"catalogProviderNameSpace is --> [${catalogProviderNameSpace}]")
- var catalogTables = List[String]()
- val otherCatalogProvider = getCatalogProviderName.toLowerCase match {
- case GimelConstants.UDC_STRING => GimelConstants.PCATALOG_STRING
- case GimelConstants.PCATALOG_STRING => GimelConstants.UDC_STRING
- case _ => "hive"
-
- }
- // Pick each catalog.table only if it appears at specific places in the SQL String
- // This guard is necessary if someone uses "catalog" as an alias, example - udc or pcatalog
- tokenizeSql(sqlLower).tail.foreach {
- token =>
-
- currentKey = if (searchList.contains(token)) token else currentKey
- val pickCriteriaMet = token.toLowerCase.contains(s"${getCatalogProviderName.toLowerCase}.") ||
- token.toLowerCase.contains(s"${otherCatalogProvider}.")
-
- if (pickCriteriaMet) {
- if (lastKey == "from" & !(currentKey == "select")) catalogTables ++= List(token)
- if (lastKey == "join" & !(currentKey == "select")) catalogTables ++= List(token)
- }
- lastKey = if (searchList.contains(token)) currentKey else lastKey
- currentKey = ""
- }
-
- val nonANSIJoinTables: Seq[String] = SQLNonANSIJoinParser.getSourceTablesFromNonAnsi(sql)
- val nonANSIJoinTablesCatalog = nonANSIJoinTables.filter(
- token =>
- token.toLowerCase.contains(s"${getCatalogProviderName.toLowerCase}.") ||
- token.toLowerCase.contains(s"${otherCatalogProvider}.")
- )
- val finalList = (catalogTables.toArray ++ nonANSIJoinTablesCatalog).distinct
- logger.info(s"Source Tables from Non-ANSI Join --> ${nonANSIJoinTables.mkString("[", " , ", "]")}")
- logger.info(s"Source Catalog Tables from Non-ANSI Join --> ${nonANSIJoinTablesCatalog.mkString("[", " , ", "]")}")
- logger.info(s"Source Catalog Tables from ANSI Join --> ${catalogTables.mkString("[", " , ", "]")}")
- logger.info(s"Source Catalog Tables from entire SQL --> ${finalList.mkString("[", " , ", "]")}")
- finalList
- }
-
-
- /**
- * Prints Stats for Streaming Batch Window
- *
- * @param time Time Object - Spark Streaming
- * @param listener GIMEL Streaming Listener
- */
- def printStats(time: Time, listener: GimelStreamingListener): Unit = {
- val batchTimeMS = time.milliseconds.toString
- val batchDate = new Date(batchTimeMS.toLong)
- val df = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss")
- val batchTime = df.format(batchDate)
- logger.info(s"Current Batch ID --> $time | $batchTime | $batchDate")
- logger.info(
- s"""|-----------------------------------------------------------------------
- |Batch ID -->
- |-----------------------------------------------------------------------
- |time : $time
- |batchTimeMS : $batchTimeMS
- |batchTime : $batchTime
- |-----------------------------------------------------------------------
- |Listener Metrics -->
- |-----------------------------------------------------------------------
- |appProcessingDelay : ${listener.appProcessingDelay}
- |appSchedulingDelay : ${listener.appSchedulingDelay}
- |appTotalDelay : ${listener.appTotalDelay}
- |processingDelay : ${listener.processingDelay}
- |schedulingDelay : ${listener.schedulingDelay}
- |totalDelay : ${listener.totalDelay}
- |-----------------------------------------------------------------------
- |""".stripMargin)
- }
-
- /**
- * Cache the DataSet (lazily) if it is configured to be cached by the user in properties.
- *
- * @param df DataFrame
- * @param dataSetName DataSetName representing the DataFrame
- * @param options Props
- */
- def cacheIfRequested(df: DataFrame, dataSetName: String, options: Map[String, String]): Unit = {
- val isCachingEnabled = (
- options.getOrElse(GimelConstants.DATA_CACHE_IS_ENABLED, "false").toBoolean
- && (
- options.getOrElse(s"${GimelConstants.DATA_CACHE_IS_ENABLED}.for.$dataSetName", "false").toBoolean
- || options.getOrElse(s"${GimelConstants.DATA_CACHE_IS_ENABLED_FOR_ALL}", "false").toBoolean
- )
- )
- if (isCachingEnabled) df.cache()
- }
-
- private def mergeAllConfs(sparkSession: SparkSession): Map[String, String] = {
- sparkSession.conf.getAll ++ Map(CatalogProviderConfigs.CATALOG_PROVIDER -> sparkSession.conf.get(
- CatalogProviderConfigs.CATALOG_PROVIDER, CatalogProviderConstants.PRIMARY_CATALOG_PROVIDER)
- )
- }
-
- /**
- * Resolves the Query by replacing Tmp Tables in the Query String
- * For Each Tmp Table placed in the Query String - a DataSet.read is initiated
- * For each Tmp Table - if the dataset is a Kafka DataSet - then each KafkaDataSet object is accumulated
- * Accumulated KafkaDataSet Object will be used towards the end of the Query (on success) -
- * to call check pointing for each topic consumed
- *
- * @param originalSQL SQLString
- * @param selectSQL SQLString
- * @param sparkSession : SparkSession
- * @param dataSet Dataset Object
- * @return Tuple of (Resolved Original SQL, Resolved Select SQL, List of KafkaDataSet, Query Push Down Flag)
- */
- def resolveSQLWithTmpTables(originalSQL: String, selectSQL: String, sparkSession: SparkSession,
- dataSet: com.paypal.gimel.DataSet): (String, String, List[GimelDataSet], String) = {
- def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName
-
- logger.info(" @Begin --> " + MethodName)
-
- // get queryPushDown flag
- val queryPushDownFlag = getQueryPushDownFlag(originalSQL, selectSQL, sparkSession, dataSet)
-
- var kafkaDataSets: List[GimelDataSet] = List()
- var sqlTmpString = selectSQL
- var sqlOriginalString = originalSQL
- val pCatalogTablesToReplaceAsTmpTable: Map[String, String] = getTablesFrom(selectSQL).map {
- eachSource =>
- val options = getOptions(sparkSession)._2
- // Create a random string with random length for tmp table
- val randomString = RandomGenerator.getRandomString(
- RandomGenerator.getRandomInt(GimelConstants.GSQL_TMP_TABLE_RANDOM_GENERATOR_MIN,
- GimelConstants.GSQL_TMP_TABLE_RANDOM_GENERATOR_MAX))
- val tmpTableName = "tmp_" + eachSource.replaceAll("[^\\w\\s]", "_") + "_" + randomString
-
- // do DataSet.read() only if queryPushDownFlag is set to "false"
- queryPushDownFlag match {
- case "false" =>
- logger.info(s"Setting transformation dataset.read for ${eachSource}")
- logger.info("printing all options during read" + options.toString())
- val datasetProps = CatalogProvider.getDataSetProperties(eachSource, options)
- /*
- * Sets the appropriate deserializer class based on the kafka.message.value.type and value.serializer properties
- * This is mainly required for backward compatibility for KAFKA datasets
- */
- val newOptions = GimelSerdeUtils.setGimelDeserializer(sparkSession, datasetProps, options)
- val df = dataSet.read(eachSource, newOptions)
- cacheIfRequested(df, eachSource, newOptions)
- df.createOrReplaceTempView(tmpTableName)
- case _ =>
- // do nothing if query pushdown is true. No need to do dataset.read
- }
-
- if (dataSet.latestKafkaDataSetReader.isDefined) {
- logger.info(s"@$MethodName | Added Kafka Reader for Source --> $eachSource")
- kafkaDataSets = kafkaDataSets ++ List(dataSet.latestKafkaDataSetReader.get)
- }
- (eachSource, tmpTableName)
- }.toMap
-
- // replacing the dataset names with original tables names if queryPushDown is "true"
- queryPushDownFlag match {
- case "true" =>
- logger.info("PATH IS -> QUERY PUSH DOWN")
- pCatalogTablesToReplaceAsTmpTable.foreach { kv =>
- val resolvedSourceTable = resolveDataSetName(kv._1)
- val dataSetProperties: DataSetProperties =
- CatalogProvider.getDataSetProperties(resolvedSourceTable, mergeAllConfs(sparkSession))
- val hiveTableParams = dataSetProperties.props
- val jdbcTableName: String = hiveTableParams(JdbcConfigs.jdbcInputTableNameKey)
- logger.info(s"JDBC input table name : ${jdbcTableName}")
- logger.info(s"Setting JDBC URL : ${hiveTableParams(JdbcConfigs.jdbcUrl)}")
- sparkSession.conf.set(JdbcConfigs.jdbcUrl, hiveTableParams(JdbcConfigs.jdbcUrl))
- logger.info(s"Setting JDBC driver Class : ${hiveTableParams(JdbcConfigs.jdbcDriverClassKey)}")
- sparkSession.conf.set(JdbcConfigs.jdbcDriverClassKey, hiveTableParams(JdbcConfigs.jdbcDriverClassKey))
- sqlTmpString = getSQLWithTmpTable(sqlTmpString, kv._1, jdbcTableName)
- sqlOriginalString = getSQLWithTmpTable(sqlOriginalString, kv._1, jdbcTableName)
- }
- case _ =>
- logger.info("PATH IS --> DEFAULT")
- pCatalogTablesToReplaceAsTmpTable.foreach { kv =>
- sqlTmpString = getSQLWithTmpTable(sqlTmpString, kv._1, kv._2)
- sqlOriginalString = getSQLWithTmpTable(sqlOriginalString, kv._1, kv._2)
- }
- }
-
- logger.info(s"incoming SQL --> $selectSQL")
- logger.info(s"resolved SQL with Temp Table(s) --> $sqlTmpString")
- (sqlOriginalString, sqlTmpString, kafkaDataSets, queryPushDownFlag)
- }
-
- /*
- * Substitutes dataset name with tmp table in sql using regex
- *
- * @param sql
- * @param datasetName : Name of dataset to substitute
- * @param tmpTableName : Temp table name to substitute
- *
- * Example:
- * sql = select * from udc.hive.test.flights
- * key = udc.hive.test.flights
- * This should match udc.hive.test.flights in the sql string.
- *
- * sql = select * fromudc.hive.test.flights
- * key = udc.hive.test.flights
- * This should not match udc.hive.test.flights in the sql string.
- *
- * sql = select * from udc.hive.test.flights_schedule
- * key = udc.hive.test.flights
- * This should not match udc.hive.test.flights in the sql string.
- */
- def getSQLWithTmpTable(sql: String, datasetName: String, tmpTableName: String): String = {
- sql.replaceAll(regexTmpTable.replace("key", datasetName), tmpTableName)
- }
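The examples in the comment above translate into calls like this (a sketch, assuming regexTmpTable defined in this object keeps the word-boundary semantics described earlier):

  import com.paypal.gimel.sql.GimelQueryUtils

  val sql = "select * from udc.hive.test.flights f join udc.hive.test.flights_schedule s on f.id = s.id"
  GimelQueryUtils.getSQLWithTmpTable(sql, "udc.hive.test.flights", "tmp_udc_hive_test_flights_ab12")
  // only the exact dataset reference is replaced; udc.hive.test.flights_schedule is left untouched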
-
- /**
- * Checks if a Query has Insert or if its just a select
- *
- * @param sql SQL String
- * @return true - if there is an "insert" clause, else false
- */
- def isHavingInsert(sql: String): Boolean = {
- def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName
- logger.info(" @Begin --> " + MethodName)
- QueryParserUtils.isHavingInsert(sql)
- }
-
- /**
- * This function tokenizes the incoming sql, parses it using the JSQL parser and identifies whether the query is of Insert type.
- * If it is an insert query, it checks whether it is a HIVE insert, which the caller will use to decide whether to execute it through Livy.
- *
- * @param sql - Incoming SQL
- * @param options - set of Options from the user
- * @param sparkSession - spark session
- * @return - a boolean that tells whether it is a hive/hbase insert coming from a GTS user
- */
- def isHiveHbaseDMLAndGTSUser(sql: String, options: Map[String, String], sparkSession: SparkSession): Boolean = {
- def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName
- logger.info(" @Begin --> " + MethodName)
-
- val nonEmptyStrTokenized = GimelQueryUtils.tokenizeSql(sql)
- val isHive: Boolean = nonEmptyStrTokenized.head.toLowerCase match {
- case "insert" => {
- val insertTable = getTargetTables(sql)
- getSystemType(insertTable.get, sparkSession, options) match {
- case DataSetType.HIVE => {
- if (
- sparkSession.sparkContext.sparkUser.equalsIgnoreCase(GimelConstants.GTS_DEFAULT_USER(sparkSession.conf))
- ) {
- logger.info("Hive insert query and comes from GTS")
- true
- } else {
- false
- }
- }
- case DataSetType.HBASE => {
- if (
- sparkSession.sparkContext.sparkUser.equalsIgnoreCase(GimelConstants.GTS_DEFAULT_USER(sparkSession.conf))
- ) {
- logger.info("hBase insert query and comes from GTS")
- true
- } else {
- false
- }
- }
- case _ => false
- }
- }
- case _ =>
- false
- }
- isHive
- }
-
- /**
- * This function tokenizes the incoming sql, parses it using the JSQL parser and identifies whether the query is of Select type.
- * If it is a select query, it checks whether it reads from HIVE or HBase, which the caller will use to decide whether to authenticate through ranger.
- *
- * @param sql - Incoming SQL
- * @param options - set of Options from the user
- * @param sparkSession - spark session
- * @return - a boolean that tells whether it is a hive/hbase select coming from a GTS user
- */
- def isSelectFromHiveHbaseAndGTSUser(sql: String, options: Map[String, String],
- sparkSession: SparkSession): Boolean = {
- def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName
-
- logger.info(" @Begin --> " + MethodName)
- var isHiveHbase: Boolean = false
- Try {
- val nonEmptyStrTokenized = GimelQueryUtils.tokenizeSql(sql)
- isHiveHbase = nonEmptyStrTokenized.head.toLowerCase match {
- case "select" =>
- val selectTables = getAllTableSources(sql)
- if (selectTables.isEmpty) return false
- selectTables.map(eachTable => getSystemType(eachTable, sparkSession, options) match {
- case DataSetType.HIVE =>
- if (sparkSession.sparkContext.sparkUser.equalsIgnoreCase(
- GimelConstants.GTS_DEFAULT_USER(sparkSession.conf))) {
- logger.info("Hive select query and comes from GTS")
- true
- } else {
- false
- }
- case DataSetType.HBASE =>
- if (sparkSession.sparkContext.sparkUser.equalsIgnoreCase(
- GimelConstants.GTS_DEFAULT_USER(sparkSession.conf))) {
- logger.info("hBase select query and comes from GTS")
- true
- } else {
- false
- }
- case _ => false
- }).reduce((x, y) => x | y)
- case _ =>
- false
- }
- } match {
- case Success(_) =>
- logger.info(s"Interpreted isSelectFromHiveHbaseAndGTSUser with $isHiveHbase")
- case Failure(exception) =>
- logger.error(s"Exeception occurred while interpretting " +
- s"isSelectFromHiveHbaseAndGTSUser with ${exception.getMessage}")
- if (exception.getMessage.toLowerCase().contains("table not found")) {
- logger.info("Suppressing the table not found exception")
- } else {
- throw exception
- }
- }
- isHiveHbase
- }
-
- /**
- * Checks whether the sql is of drop table/view pattern and checks whether the table/view is a temp table
- * This will help to take a path to whether to go in livy session or normal gsql session
- *
- * @param sql - incoming sql
- * @param sparkSession - current spark session
- * @return - true or false based on whether the dropped table/view is a temp (cached) table.
- */
- def isDropTableATempTable(sql: String, sparkSession: SparkSession): Boolean = {
- val dropTableIfExistsPattern = s"DROP TABLE IF EXISTS .(.*)".r
- val dropViewIfExistsPattern = s"DROP VIEW IF EXISTS .(.*)".r
- val dropTablePattern = s"DROP TABLE .(.*)".r
- val dropViewPattern = s"DROP VIEW .(.*)".r
- val uniformSQL = sql.replace("\n", " ")
- val sqlParts: Array[String] = uniformSQL.split(" ")
- val newSql = sqlParts.filter(x => !x.isEmpty).mkString(" ")
- val tableName = newSql.toUpperCase() match {
- case dropTableIfExistsPattern(_) | dropViewIfExistsPattern(_) =>
- newSql.split(" ")(newSql.split(" ").indexWhere(_.toUpperCase() == "EXISTS") + 1)
-
- case dropTablePattern(_) =>
- newSql.split(" ")(newSql.split(" ").indexWhere(_.toUpperCase() == "TABLE") + 1)
-
- case dropViewPattern(_) =>
- newSql.split(" ")(newSql.split(" ").indexWhere(_.toUpperCase() == "VIEW") + 1)
-
- case _ => "."
- }
- if (tableName.contains(".")) {
- false
- } else {
- isSparkCachedTable(tableName, sparkSession)
- }
- }
-
- /**
- * This function call will check whether the SQL is a DDL
- *
- * @param sql - Incoming SQL
- * @param sparkSession - Spark Session object
- */
-
- def isDDL(sql: String, sparkSession: SparkSession): Boolean = {
- def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName
- logger.info(" @Begin --> " + MethodName)
-
- QueryParserUtils.isDDL(sql, isDropTableATempTable(sql, sparkSession))
- }
-
- /**
- * This function call will check whether SQL is setting conf, say - "SET key=val"
- *
- * @param sql - Incoming SQL
- * @param sparkSession - Spark Session object
- */
-
- def isSetConf(sql: String, sparkSession: SparkSession): Boolean = {
- def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName
- logger.info(" @Begin --> " + MethodName)
-
- val tokenized = GimelQueryUtils.tokenizeSql(sql)
- val nonEmptyStrTokenized = tokenized.filter(x => !x.isEmpty)
- nonEmptyStrTokenized.head.toUpperCase.equals("SET")
- }
-
- /**
- * isUDCDataDefinition - will find whether we need to take the Data definition path or the select/insert DML path
- *
- * @param sql SQL String from client
- * @return Resulting Boolean
- */
-
- def isUDCDataDefinition(sql: String): Boolean = {
- def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName
- logger.info(" @Begin --> " + MethodName)
-
- // Add alter table
- val catalogName = getCatalogProvider.toUpperCase
- val createTablePattern = s"CREATE TABLE ${catalogName}.(.*)".r
- val createExternalTablePattern = s"CREATE EXTERNAL TABLE ${catalogName}.(.*)".r
- val multisetPattern = s"CREATE MULTISET TABLE ${catalogName}.(.*)".r
- val setPattern = s"CREATE SET TABLE ${catalogName}.(.*)".r
- val dropTablePattern = s"DROP TABLE ${catalogName}.(.*)".r
- val truncateTablePattern = s"TRUNCATE TABLE ${catalogName}.(.*)".r
- val deleteFromPattern = s"DELETE FROM ${catalogName}.(.*)".r
- val deletePattern = s"DELETE ${catalogName}.(.*)".r
-
- val uniformSQL = sql.replace("\n", " ")
- val sqlParts: Array[String] = uniformSQL.split(" ")
- // remove all additional white spaces in the DDL statement
- val newSql = sqlParts.filter(x => !x.isEmpty).mkString(" ")
- newSql.toUpperCase() match {
- case createTablePattern(_) | createExternalTablePattern(_) | multisetPattern(_) | setPattern(_) | dropTablePattern(_) | truncateTablePattern(_) | deleteFromPattern(_) | deletePattern(_) => {
- true
- }
- case _ => {
- false
- }
- }
- }
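With the catalog provider set to UDC, the patterns above classify statements roughly like this (a sketch, assuming CatalogProviderConstants.UDC_PROVIDER is the string "UDC"):

  import com.paypal.gimel.sql.GimelQueryUtils

  GimelQueryUtils.setCatalogProvider("UDC")
  GimelQueryUtils.isUDCDataDefinition("CREATE TABLE udc.teradata.dc.db.tbl (id INT)")  // true
  GimelQueryUtils.isUDCDataDefinition("DROP TABLE udc.teradata.dc.db.tbl")             // true
  GimelQueryUtils.isUDCDataDefinition("SELECT * FROM udc.teradata.dc.db.tbl")          // false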
-
- /**
- * Parse the SQL and get the entire select clause
- *
- * @param sql SQL String
- * @return SQL String - that has just the select clause
- */
- def getSelectClause(sql: String): String = {
- QueryParserUtils.getSelectClause(sql)
- }
-
- /**
- * Parse the SQL with a plain token split and get everything from the first SELECT onward
- *
- * @param sql SQL String
- * @return SQL String - that has just the select clause
- */
- def getPlainSelectClause(sql: String): String = {
- def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName
-
- logger.info(" @Begin --> " + MethodName)
-
- val uniformSQL = sql.replace("\n", " ")
- val sqlParts: Array[String] = uniformSQL.split(" ")
- val index = sqlParts.indexWhere(_.toUpperCase() == "SELECT")
- val selectClauseOnly = sqlParts.slice(index, sqlParts.length).mkString(" ")
- selectClauseOnly
- }
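A quick usage example (sketch):

  import com.paypal.gimel.sql.GimelQueryUtils

  GimelQueryUtils.getPlainSelectClause("cache table t as select a, b from udc.hive.db.tbl")
  // -> "select a, b from udc.hive.db.tbl"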
-
- /**
- * Gets the target table
- *
- * @param sql SQL String
- * @return Table Name
- */
- def getTargetTables(sql: String): Option[String] = {
- def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName
- logger.info(" @Begin --> " + MethodName)
- SQLParser.getTargetTables(sql)
- }
-
- /**
- * getOptions - read the SparkSession options that were set by the user, else add the default values
- *
- * @param sparkSession : SparkSession
- * @return Tuple ( String with concatenated options read from the SparkSession , Same Props as a Map[String, String] )
- */
-
- def getOptions(sparkSession: SparkSession): (String, Map[String, String]) = {
- def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName
-
- logger.info(" @Begin --> " + MethodName)
-
- val hiveConf: Map[String, String] = sparkSession.conf.getAll
- val optionsToCheck: Map[String, String] = Map(
- KafkaConfigs.rowCountOnFirstRunKey -> "250"
- , KafkaConfigs.batchFetchSize -> "250"
- , KafkaConfigs.maxRecordsPerPartition -> "25000000"
- , GimelConstants.LOG_LEVEL -> "ERROR"
- , KafkaConfigs.kafkaConsumerReadCheckpointKey -> "true"
- , KafkaConfigs.kafkaConsumerClearCheckpointKey -> "false"
- , KafkaConfigs.maxRatePerPartitionKey -> "3600"
- , KafkaConfigs.streamParallelKey -> "10"
- , KafkaConfigs.defaultBatchInterval -> "30"
- , KafkaConfigs.isStreamParallelKey -> "true"
- , KafkaConfigs.streamaWaitTerminationOrTimeoutKey -> "-1"
- , KafkaConfigs.isBackPressureEnabledKey -> "true"
- , JdbcConfigs.teradataReadType -> ""
- , HbaseConfigs.hbaseOperation -> "scan"
- , HbaseConfigs.hbaseFilter -> ""
- , GimelConstants.DATA_CACHE_IS_ENABLED -> "false"
- , GimelConstants.DATA_CACHE_IS_ENABLED_FOR_ALL -> "true"
- , KafkaConfigs.isStreamBatchSwitchEnabledKey -> "false"
- , KafkaConfigs.streamFailureThresholdPerSecondKey -> "1500"
- , ElasticSearchConfigs.esIsDailyIndex -> "false"
- , CatalogProviderConfigs.CATALOG_PROVIDER -> CatalogProviderConstants.PRIMARY_CATALOG_PROVIDER
- , GimelConstants.SPARK_APP_ID -> sparkSession.conf.get(GimelConstants.SPARK_APP_ID)
- , GimelConstants.SPARK_APP_NAME -> sparkSession.conf.get(GimelConstants.SPARK_APP_NAME)
- , GimelConstants.APP_TAG -> getAppTag(sparkSession.sparkContext)
- )
- val resolvedOptions: Map[String, String] = optionsToCheck.map { kvPair =>
- (kvPair._1, hiveConf.getOrElse(kvPair._1, kvPair._2))
- }
- resolvedOptions.foreach(conf => sparkSession.conf.set(conf._1, conf._2))
- (resolvedOptions.map(x => x._1 + "=" + x._2).mkString(":"), hiveConf ++ resolvedOptions)
- }
-
-
- /**
- * Executes the SQL and Returns DataFrame
- *
- * @param selectSQL The Select SQL
- * @param sparkSession : SparkSession
- * @return DataFrame
- */
- def executeSelectClause(selectSQL: String, sparkSession: SparkSession, queryPushDownFlag: String): DataFrame = {
- def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName
- logger.info(" @Begin --> " + MethodName)
-
- val selectDF: DataFrame = queryPushDownFlag match {
- case "true" =>
-
- // set the SparkContext as well as TaskContext property for JdbcPushDown flag to "false"
- // logger.info(s"Setting jdbcPushDownFlag to false in SparkContext")
- // sparkSession.conf.set(JdbcConfigs.jdbcPushDownEnabled, "false")
-
- logger.info(s"Executing Pushdown Query: ${selectSQL}")
- val df = executePushdownQuery(selectSQL, sparkSession)
-
- df
- case _ =>
- sparkSession.sql(selectSQL)
- }
- selectDF
- }
-
- /**
- * Executes the Resolved SQL Query by calling the DataSet code that has been generated
- *
- * @param clientSQL Original SQL String submitted by Client
- * @param dest Target Table
- * @param selectSQL SQl String for Select Clause alone
- * @param sparkSession :SparkSession
- * @param dataset DataSet
- * @return Result String
- */
- def executeResolvedQuery(clientSQL: String, dest: Option[String], selectSQL: String, sparkSession: SparkSession,
- dataset: com.paypal.gimel.DataSet, queryPushDownFlag: String): String = {
- def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName
-
- logger.info(" @Begin --> " + MethodName)
-
- logger.info(s"Client SQL is --> $clientSQL")
- logger.info(s"Select SQL is --> $selectSQL")
- var resultString = ""
- if (dest.isDefined) {
- logger.info(s"EXECUTION PATH ====== DATASET WRITE ======")
- if (clientSQL.toLowerCase.contains("partition")) {
- sparkSession.sql("set hive.exec.dynamic.partition.mode=nonstrict")
- }
- Try {
- val options = getOptions(sparkSession)._2
- val selectDF = executeSelectClause(selectSQL, sparkSession, queryPushDownFlag)
- // --- EXISTING LOGIC
- // dataset.write(dest.get, selectDF, options)
- // --- NEW LOGIC
- // Get the DataSet Properties
-
- val tgt = dest.get
- (tgt.split(",").length > 2) match {
- case true =>
- case _ =>
- }
- val dataSetProperties: DataSetProperties = CatalogProvider.getDataSetProperties(dest.get, options)
- // val dataSetProperties = GimelServiceUtilities().getDataSetProperties(dest.get)
- dataSetProperties.datasetType.toString match {
- case "HIVE" | "NONE" =>
- // If Hive
- val sqlToInsertIntoHive = queryPushDownFlag.toLowerCase match {
- case "true" =>
- logger.info(s"Invoking write API in gimel with queryPushDownFlag=${queryPushDownFlag}...")
-
- // create a temp view for pushdown dataframe.
- val jdbcPushDownTempTable = "jdbcPushDownTempTable"
- logger.info(s"Creating temp view for pushdown query dataframe as ${jdbcPushDownTempTable}")
- selectDF.createOrReplaceTempView(jdbcPushDownTempTable)
-
- val pushDownSelectQuery = s"SELECT * FROM ${jdbcPushDownTempTable}"
-
- // replace selectSQL in clientSQL with pushDownSelectQuery
- logger.info(s"Replacing ${selectSQL} in ${clientSQL} with ${pushDownSelectQuery}")
- val pushDownSparkSql = clientSQL.replace(selectSQL, pushDownSelectQuery)
- // dataset.write(dest.get, selectDF, options)
- logger.info(s"Spark SQL after Pushdown Query: ${pushDownSparkSql}")
- pushDownSparkSql
- case _ =>
- logger.info(s"Invoking sparkSession.sql for write with queryPushDownFlag=${queryPushDownFlag}...")
- // Get the DB.TBL from UDC
- clientSQL
- }
- // execute on hive
- val db = dataSetProperties.props(HiveConfigs.hiveDBName)
- val tbl = dataSetProperties.props(HiveConfigs.hiveTableName)
- val actual_db_tbl = s"${db}.${tbl}"
- // Replace the SQL with DB.TBL
- logger.info(s"Replacing ${dest.get} with ${actual_db_tbl}")
- val sqlToExecute = sqlToInsertIntoHive.replaceAll(s"(?i)${dest.get}", actual_db_tbl)
- logger.info(s"Passing through SQL to Spark for write since target [${actual_db_tbl}] is of data set type - HIVE ...")
- logger.info(s"Final SQL to Run --> \n ${sqlToExecute}")
- sparkSession.sql(sqlToExecute)
- case _ =>
- // If Non-HIVE
- logger.info(s"Invoking write API in gimel with queryPushDownFlag=${queryPushDownFlag}...")
- /*
- * Sets the appropriate serializer class based on the kafka.message.value.type and value.serializer properties
- * This is mainly required for backward compatibility for KAFKA datasets
- */
- val newOptions = GimelSerdeUtils.setGimelSerializer(sparkSession, dataSetProperties, options)
- dataset.write(dest.get, selectDF, newOptions)
- }
-
- } match {
- case Success(_) =>
- resultString = "Query Completed."
- logger.info(resultString)
- case Failure(e) =>
- // e.printStackTrace()
- resultString =
- s"""Query Failed in function : $MethodName via path dataset.write. Error -->
- |
- |${e.toString}""".stripMargin
- // logger.error(resultString)
- throw e
- }
- } else {
- logger.info(s"EXECUTION PATH ====== DATASET SELECT ======")
- val selectDF: DataFrame = queryPushDownFlag match {
- case "true" =>
- // logger.info(s"Setting jdbcPushDownFlag to false in SparkContext")
- // sparkSession.conf.set(JdbcConfigs.jdbcPushDownEnabled, "false")
- val df = executePushdownQuery(selectSQL, sparkSession)
- df
- case _ =>
- sparkSession.sql(selectSQL)
- }
- val count = selectDF.cache.count
- val rowsToShow = sparkSession.conf.get(GimelConstants.MAX_RESULTS_TO_SHOW, "250").toInt
- val showRowsOnly = sparkSession.conf.get(GimelConstants.SHOW_ROWS_ENABLED, "false").toBoolean
- val resultSet = selectDF.take(rowsToShow).mkString("\n")
- val marginString = "-------------------------------------------------------------------------------------------------------"
- val extraMessage =
- s"""
- |$marginString
- |Total Rows Returned from original Query --> $count
- |Displaying Rows ${scala.math.min(rowsToShow, count)} of $count
- |
- |$userInfoString
- """.stripMargin
- resultString =
- s"""${if (!showRowsOnly) extraMessage else ""}
- |$marginString
- |$resultSet
- |$marginString""".stripMargin
- }
- resultString
- }
-
- /**
- * Executes the Resolved SQL Query by calling the DataSet code that has been generated
- *
- * @param clientSQL Original SQL String submitted by Client
- * @param dest Target Table
- * @param selectSQL SQl String for Select Clause alone
- * @param sparkSession : SparkSession
- * @param dataset DataSet
- * @return RDD[Result JSON String]
- */
- // def executeResolvedQuerySparkMagic(clientSQL: String, dest: Option[String], selectSQL: String, hiveContext: HiveContext, dataset: DataSet): RDD[String] = {
- // def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName
- //
- // logger.info(" @Begin --> " + MethodName)
- //
- // logger.info(s"Client SQL is --> $clientSQL")
- // logger.info(s"Select SQL is --> $selectSQL")
- // logger.silence
- // val selectDF = hiveContext.sql(selectSQL)
- // selectDF.toJSON
- // }
-
- def executeResolvedQuerySparkMagic(clientSQL: String, dest: Option[String], selectSQL: String, sparkSession: SparkSession, dataset: com.paypal.gimel.DataSet, queryPushDownFlag: String): RDD[String] = {
- def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName
-
- logger.info(" @Begin --> " + MethodName)
-
- logger.info(s"Client SQL is --> $clientSQL")
- logger.info(s"Select SQL is --> $selectSQL")
- var resultString = ""
- if (dest.isDefined) {
- logger.info(s"EXECUTION PATH ====== DATASET WRITE ======")
- if (clientSQL.toLowerCase.contains("partition")) {
- sparkSession.sql("set hive.exec.dynamic.partition.mode=nonstrict")
- }
- Try {
- val (_, options) = getOptions(sparkSession)
- val selectDF: DataFrame = queryPushDownFlag match {
- case "true" =>
- // logger.info(s"Setting jdbcPushDownFlag to false in SparkContext")
- // sparkSession.conf.set(JdbcConfigs.jdbcPushDownEnabled, "false")
- val df = executePushdownQuery(selectSQL, sparkSession)
- df
- case _ =>
- sparkSession.sql(selectSQL)
- }
- dataset.write(dest.get, selectDF, options)
- } match {
- case Success(_) =>
- resultString = """{"Query Execution":"Success"}"""
- logger.info(resultString)
- sparkSession.read.json(sparkSession.sparkContext.parallelize(Seq(resultString))).toJSON.rdd
- case Failure(e) =>
- // e.printStackTrace()
- resultString =
- s"""{"Query Execution Failed":${e.toString}}"""
- // logger.error(resultString)
- sparkSession.read.json(sparkSession.sparkContext.parallelize(Seq(resultString))).toJSON.rdd
- // throw e
- }
- } else {
- logger.info(s"EXECUTION PATH ====== DATASET SELECT ======")
- val selectDF: DataFrame = queryPushDownFlag match {
- case "true" =>
- // logger.info(s"Setting jdbcPushDownFlag to false in SparkContext")
- // sparkSession.conf.set(JdbcConfigs.jdbcPushDownEnabled, "false")
- val df = executePushdownQuery(selectSQL, sparkSession)
- df
- case _ =>
- sparkSession.sql(selectSQL)
- }
- // val count = selectDF.cache.count
- val rowsToShow = sparkSession.conf.get(GimelConstants.MAX_RESULTS_TO_SHOW, "250").toInt
- selectDF.registerTempTable("tmp_table_spark_magic")
- val resultSet = sparkSession.sql(s"select * from tmp_table_spark_magic limit ${rowsToShow}").toJSON.rdd
- resultSet
- }
- }
-
- /**
- * This function parses the SQL and gets all the source tables.
- * It calls hiveutils' ranger authentication if it is a HIVE table (either UDC or non-UDC tables are covered)
- *
- * @param sql - incoming sql
- * @param sparkSession - spark session object
- * @param options - incoming user options
- */
-
- def authenticateAccess(sql: String, sparkSession: SparkSession, options: Map[String, String]): Unit = {
-
- def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName
-
- logger.info(" @Begin --> " + MethodName)
-
- val nonEmptyStrTokenized = GimelQueryUtils.tokenizeSql(sql)
- val sqlToAuthenticate: Option[String] = nonEmptyStrTokenized.head.toLowerCase match {
- case "select" =>
- // Handling a Select clause...
- val userSuppliedPushDownFlag = sparkSession.conf.get(JdbcConfigs.jdbcPushDownEnabled, "false").toBoolean
- // If the pushDownFlag is true it is a pure Teradata query and do not do authentication.
- // So don't return any SQL for authentication
- if (!userSuppliedPushDownFlag) Some(sql) else None
- case "cache" =>
- logger.info("Handling Cache statement ...")
- Some(getPlainSelectClause(sql))
- case "insert" =>
- logger.info("Handling Insert statement ...Do ranger checks for the select tables if they from hive or hbase")
- Some(getPlainSelectClause(sql))
- case _ => None
- }
-
- logger.info("The incoming SQL for authenticateRangerPolicies =>" + sql)
- sqlToAuthenticate match {
- case Some(sql) => authenticateRangerPolicies(sql, sparkSession, options)
- case _ => logger.info("No SQL to Authenticate.")
- }
-
- }
-
- /**
- * Checks whether a table is cached in spark Catalog
- *
- * @param tableName - incoming table name
- * @param sparkSession - spark session
- * @return - A boolean value to tell whether the table is cached or not
- */
-
- def isSparkCachedTable(tableName: String, sparkSession: SparkSession): Boolean = {
-
- def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName
-
- logger.info(" @Begin --> " + MethodName)
- val isCached = Try {
- sparkSession.catalog.isCached(tableName)
- } match {
- case Success(result) => result
- case Failure(_) => false
- }
- isCached match {
- case true => logger.info(tableName + "====> a Cached table")
- case _ => logger.info(tableName + "====> NOT a Cached table")
- }
- isCached
- }
-
- /**
- * This function parses the SQL and gets all the source tables.
- * It calls hiveutils' ranger authentication if it is a HIVE table (either UDC or non-UDC tables are covered)
- *
- * @param sql - incoming sql
- * @param sparkSession - spark session object
- * @param options - incoming user options
- */
- def authenticateRangerPolicies(sql: String, sparkSession: SparkSession, options: Map[String, String]): Unit = {
-
- def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName
-
- logger.info(" @Begin --> " + MethodName)
-
- val listTables: Seq[String] = getAllTableSources(sql)
- val newList = listTables.toList.filter(dataSetName => {
- logger.info("the current data set name is " + dataSetName)
- if (dataSetName.contains(".")) {
- true
- } else {
- !isSparkCachedTable(dataSetName, sparkSession)
- }
- })
- newList.foreach(dataSet => {
- logger.info(
- "Data Sets to be checked for Ranger authentication are " + dataSet)
- authLogicWrapper(dataSet, sparkSession, options)
- }
- )
- }
-
- /**
- * Core logic to check each data set to see whether it is HIVE or HBASE; if so, do impersonation based on the impersonation flag.
- *
- * @param dataSet - data set name
- * @param sparkSession - spark session
- * @param options - user options
- */
- def authLogicWrapper(dataSet: String, sparkSession: SparkSession, options: Map[String, String]): Unit = {
- def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName
-
- logger.info("@Begin --> " + MethodName)
-
- logger.info("Data set name is --> " + dataSet)
- val formattedProps: Map[String, Any] = DataSetUtils.getProps(options) ++
- Map(CatalogProviderConfigs.CATALOG_PROVIDER ->
- sparkSession.conf.get(CatalogProviderConfigs.CATALOG_PROVIDER,
- CatalogProviderConstants.PRIMARY_CATALOG_PROVIDER))
-
- // if storage type unknown we will default to HIVE PROVIDER
- if (DataSetUtils.isStorageTypeUnknown(dataSet)) {
- formattedProps ++ Map(CatalogProviderConfigs.CATALOG_PROVIDER -> CatalogProviderConstants.HIVE_PROVIDER)
- }
-
- val dataSetProperties: DataSetProperties = CatalogProvider.getDataSetProperties(dataSet, options)
- logger.info("dataSetProperties ==> " + dataSetProperties.toString())
- val systemType = DataSetUtils.getSystemType(dataSetProperties)
-
- val newProps: Map[String, Any] = DataSetUtils.getProps(options) ++ Map(
- GimelConstants.DATASET_PROPS -> dataSetProperties
- , GimelConstants.DATASET -> dataSet
- , GimelConstants.RESOLVED_HIVE_TABLE -> resolveDataSetName(dataSet))
-
- systemType match {
- case DataSetType.HIVE =>
- val hiveUtils = new HiveUtils
-
- // If its cross cluster access, do not allow dynamic dataset access as it would mean the dataset is not present in UDC
- // and it will try to read from hive directly which would fail.
- // Also, if HDFS location is not present, it may be a view, so abort it.
- if (hiveUtils.isCrossCluster(dataSetProperties)) {
- val isDynamicDataset = dataSetProperties.props.getOrElse(CatalogProviderConstants.DYNAMIC_DATASET, "false").toBoolean
- if (isDynamicDataset) {
- throw new Exception(
- s"""
- | Cross Cluster Access Detected. Cannot read dynamic dataset.
- | This means the dataset does not exist in UDC.
- """.stripMargin)
- }
-
- if (!dataSetProperties.props.contains(HiveConfigs.dataLocation) ||
- dataSetProperties.props.get(HiveConfigs.dataLocation).get == GimelConstants.NOT_APPLICABLE) {
- throw new Exception(
- s"""
- | Cross Cluster Access Detected. Cannot find ${HiveConfigs.dataLocation} property.
- | Please check if it is a view as Gimel currently does not support cross cluster view access.
- """.stripMargin)
- }
- hiveUtils.authenticateTableAndLocationPolicy(dataSet, options, sparkSession, GimelConstants.READ_OPERATION)
- } else {
- val hiveTableName = (dataSetProperties.props(GimelConstants.HIVE_DATABASE_NAME) + "." + dataSetProperties.props(GimelConstants.HIVE_TABLE_NAME))
- val hiveTableObject = CatalogProvider.getHiveTable(hiveTableName)
- val tableType = hiveTableObject.getTableType
- if (tableType == "VIRTUAL_VIEW") {
- logger.info("Seems we are querying a view.")
- val viewSql = hiveTableObject.getViewExpandedText()
- logger.info(s"View SQL --> \n${viewSql}")
- println(s"View SQL --> \n${viewSql}")
- val allTableSources = getAllTableSources(viewSql)
- logger.info(s"List of tables to be authenticated --> \n${allTableSources.mkString("\n")}")
- println(s"List of tables to be authenticated --> \n${allTableSources.mkString("\n")}")
- allTableSources.foreach(x => authLogicWrapper(x.replaceAll("`", ""), sparkSession, options))
- } else {
- hiveUtils.authenticateTableAndLocationPolicy(dataSet, options, sparkSession, GimelConstants.READ_OPERATION)
- }
- }
- case DataSetType.HBASE =>
- val hBASEUtilities = HBaseUtilities(sparkSession)
- hBASEUtilities.authenticateThroughRangerPolicies(dataSet, GimelConstants.READ_OPERATION, newProps)
- case _ => None
- }
- }
-
-
- /**
- * Checks if a Query has a Cache statement
- *
- * @param sql SQL String
- * @return true - if there is a "Cache" clause, else false
- */
- def isHavingCache(sql: String): Boolean = {
- def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName
-
- logger.info(" @Begin --> " + MethodName)
-
- GimelQueryUtils.tokenizeSql(sql).head.equalsIgnoreCase("cache")
- }
-
- /**
- * This function tokenizes the incoming sql, parses it using the GSQL parser and identifies whether the query is of Select type.
- * If it is a select query, it checks whether it reads from HBase and has a limit clause.
- *
- * @param sql - Incoming SQL
- * @param options - set of Options from the user
- * @param sparkSession - spark session
- * @return
- */
- def setLimitForHBase(sql: String, options: Map[String, String],
- sparkSession: SparkSession): Unit = {
- def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName
-
- logger.info(" @Begin --> " + MethodName)
- Try {
- val nonEmptyStrTokenized = GimelQueryUtils.tokenizeSql(sql)
- nonEmptyStrTokenized.head.toLowerCase match {
- case "select" =>
- val selectTables = getAllTableSources(sql)
- // Checks if there is more than 1 source tables
- if (selectTables.isEmpty || selectTables.length > 1) return
- selectTables.map(eachTable => DataSetUtils.getSystemType(
- eachTable, sparkSession, options) match {
- case DataSetType.HBASE =>
- logger.info("Sql contains limit clause, setting the HBase Page Size.")
- val limit = Try(QueryParserUtils.getLimit(sql)).get
- sparkSession.conf.set(GimelConstants.HBASE_PAGE_SIZE, limit)
- case _ =>
- return
- })
- case _ =>
- return
- }
- } match {
- case Success(_) =>
- case Failure(exception) =>
- logger.error(s"Exeception occurred while setting the limit for HBase -> ${exception.getMessage}")
- throw exception
- }
- }
-
- /**
- * Parse the SQL and get cache Query & select statement
- *
- * @param sql SQL String
- */
- def splitCacheQuery(sql: String): (Option[String], String) = {
- def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName
-
- logger.info(" @Begin --> " + MethodName)
-
- val uniformSQL = sql.replace("\n", " ")
- val sqlParts: Array[String] = uniformSQL.split(" ")
- if (isHavingCache(sql)) {
- logger.info("Splitting sql since it contains cache table")
- val index = sqlParts.indexWhere(_.toUpperCase() == "SELECT")
- (Some(sqlParts.slice(0, index).mkString(" ")), sqlParts.slice(index, sqlParts.length).mkString(" "))
- } else {
- (None, sqlParts.mkString(" "))
- }
- }
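To illustrate the split (a sketch, assuming the tokenizer recognises the leading CACHE keyword):

  import com.paypal.gimel.sql.GimelQueryUtils

  val (cachePart, selectPart) = GimelQueryUtils.splitCacheQuery(
    "CACHE TABLE flights_cached AS SELECT * FROM udc.hive.test.flights")
  // cachePart  -> Some("CACHE TABLE flights_cached AS")
  // selectPart -> "SELECT * FROM udc.hive.test.flights"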
-
- /**
- * This method will execute the ' cache table t as...' query
- *
- * @param cacheStatement cache table statement
- * @param dataFrame pushdown dataframe
- * @param sparkSession SparkSession
- * @return dataframe
- */
- def cachePushDownQuery(cacheStatement: String, dataFrame: DataFrame, sparkSession: SparkSession): DataFrame = {
- def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName
-
- logger.info(" @Begin --> " + MethodName)
-
- // create a temp view for pushdown dataframe.
- val pushDownCacheTempTable = "pushDownCacheTempTable"
- logger.info(s"Creating temp view for pushdown query dataframe as ${pushDownCacheTempTable}")
- dataFrame.createOrReplaceTempView(pushDownCacheTempTable)
-
- val sql =
- s"""
- | ${cacheStatement} SELECT * FROM ${pushDownCacheTempTable}
- """.stripMargin
-
- // execute the cached statement
- logger.info(s"Now caching dataframe for pushdown query: ${sql}")
- sparkSession.sql(sql)
- }
-
- /**
- * Push downs the SELECT query to JDBC data source and executes using JDBC read.
- *
- * @param inputSQL SELECT SQL string
- * @param sparkSession : SparkSession
- * @return DataFrame
- */
-
- def executePushdownQuery(inputSQL: String, sparkSession: SparkSession): DataFrame = {
- def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName
-
- logger.info(" @Begin --> " + MethodName)
-
- // check if SQL contains cache query
- val (cacheStatement, selectSQL) = splitCacheQuery(inputSQL)
-
- val dataSetProps = sparkSession.conf.getAll
- val jdbcOptions: Map[String, String] = JdbcAuxiliaryUtilities.getJDBCOptions(dataSetProps)
-
- if (!jdbcOptions.contains(JdbcConfigs.jdbcUrl)) {
- throw new IllegalArgumentException("No JDBC url found. Please verify the dataset name in query")
- }
-
- val userSpecifiedFetchSize = dataSetProps.getOrElse("fetchSize", JdbcConstants.DEFAULT_READ_FETCH_SIZE).toString.toInt
-
- try {
- val jdbcSystem = getJDBCSystem(jdbcOptions(JdbcConfigs.jdbcUrl))
- val pushDownDf = jdbcSystem match {
- case JdbcConstants.TERADATA =>
- executeTeradataSelectPushDownQuery(sparkSession, selectSQL, dataSetProps, jdbcOptions, userSpecifiedFetchSize)
- case _ =>
- val pushDownSqlAsTempTable = s"( $selectSQL ) as pushDownTempTable"
- logger.info(s"Final SQL for Query Push Down --> $pushDownSqlAsTempTable")
- val jdbcConnectionUtility: JDBCConnectionUtility = JDBCConnectionUtility(sparkSession, dataSetProps)
- JdbcAuxiliaryUtilities.sparkJdbcRead(sparkSession, jdbcOptions(JdbcConfigs.jdbcUrl), pushDownSqlAsTempTable,
- None, JdbcConstants.DEFAULT_LOWER_BOUND, JdbcConstants.DEFAULT_UPPER_BOUND,
- 1, userSpecifiedFetchSize, jdbcConnectionUtility.getConnectionProperties)
- }
-
- // cache query if inputSql contains cache query
- cacheStatement match {
- case Some(cacheTable) =>
- // cache the query results from pushdown
- logger.info(s"Now caching the dataframe for -> $selectSQL")
- cachePushDownQuery(cacheTable, pushDownDf, sparkSession)
- case _ =>
- pushDownDf
- }
- }
- catch {
- case exec: SQLException =>
- val errors = new mutable.StringBuilder()
- var ex: SQLException = exec
- var lastException: SQLException = exec
- while (ex != null) {
- if (errors.nonEmpty) {
- errors.append(s"${GimelConstants.COMMA} ")
- }
- errors.append(ex.getErrorCode().toString)
- lastException = ex
- ex = ex.getNextException
- }
- if (lastException != null) {
- lastException.printStackTrace()
- }
- logger.error(s"SQLException: Error codes ${errors.toString()}")
- throw exec
- case e: Throwable =>
- throw e
- }
- finally {
- // re-setting all configs for JDBC
- JDBCCommons.resetPushDownConfigs(sparkSession)
- }
- }
-
-
- def executeTeradataSelectPushDownQuery(sparkSession: SparkSession, selectSQL: String,
- dataSetProps: Map[String, String], jdbcOptions: Map[String, String],
- userSpecifiedFetchSize: Int): DataFrame = {
- logger.info(s" @Begin --> ${new Exception().getStackTrace.apply(1).getMethodName}")
- val jdbcConnectionUtility: JDBCConnectionUtility = JDBCConnectionUtility(sparkSession, dataSetProps)
- import JDBCUtilities._
- val loggerOption = Some(logger)
- val mutableJdbcOptions: mutable.Map[String, String] = scala.collection.mutable.Map(jdbcOptions.toSeq: _*)
- var sqlToBeExecutedInJdbcRDD: String = selectSQL
- logger.info(s"In query pushdown SQL to be executed --> $sqlToBeExecutedInJdbcRDD")
-
- // Get connection details per the explain plan of the incomingSql
- import JDBCConnectionUtility.withResources
- var (connectionDetails, connectionUtilityPerIncomingSQL): (ConnectionDetails, JDBCConnectionUtility) =
- (null, jdbcConnectionUtility)
- var partitionColumns: Seq[String] = Seq.empty
- withResources(getOrCreateConnection(connectionUtilityPerIncomingSQL, logger = loggerOption)) {
- connection =>
- // get the partition columns
- partitionColumns = JdbcAuxiliaryUtilities.getAndSetPartitionParameters(
- sparkSession, dataSetProps, userSpecifiedFetchSize, mutableJdbcOptions, connection)
- val tuple = JdbcAuxiliaryUtilities.getConnectionInfo(sparkSession,
- jdbcConnectionUtility, dataSetProps, sqlToBeExecutedInJdbcRDD, loggerOption, partitionColumns)
- connectionDetails = tuple._1
- connectionUtilityPerIncomingSQL = tuple._2
- }
-
- // Create a new connection as per the new config
- withResources(getOrCreateConnection(connectionUtilityPerIncomingSQL, logger = loggerOption)) {
- connection =>
- // if partitions greater than 1
- if (connectionDetails.numOfPartitions > 1) {
- // if the sql has analytical functions
- if (QueryParserUtils.isHavingAnalyticalFunction(selectSQL)) {
- require(dataSetProps.contains(JdbcConfigs.jdbcTempDatabase),
- s"Expecting CONF: ${JdbcConfigs.jdbcTempDatabase} to be available")
- val tableName =
- s"${dataSetProps(JdbcConfigs.jdbcTempDatabase)}.gimel_push_down_${
- Hashing.sha256().hashString(selectSQL, StandardCharsets.UTF_8)
- .toString.substring(0, 7)
- }"
- logger.info(s"Resolved temp table name: $tableName")
- // delete the temp table if it exists
- JdbcAuxiliaryUtilities.dropTable(tableName, connection, logger = loggerOption)
- // create volatile table as select with data
- // Recording the time taken for the query execution
- val createTableStatement: String = s"CREATE TABLE $tableName AS ( ${selectSQL.trim} ) WITH DATA "
- logger.info(s"Proceeding to execute: $createTableStatement")
- JdbcAuxiliaryUtilities.executeQueryStatement(createTableStatement, connection,
- incomingLogger = loggerOption, recordTimeTakenToExecute = true)
- // rewrite the selectSql with `select * from temp_table` and set JdbcConfigs.jdbcDbTable => temp_table
- sqlToBeExecutedInJdbcRDD = s"SELECT * from $tableName"
- mutableJdbcOptions += (JdbcConfigs.jdbcTempTable -> tableName)
- mutableJdbcOptions += (JdbcConfigs.jdbcDbTable -> tableName)
- // Set the first column name as partition column if data split is needed
- if (!dataSetProps.contains(JdbcConfigs.jdbcPartitionColumns)) {
- val tempTableSchema = JdbcReadUtility.resolveTable(
- mutableJdbcOptions(JdbcConfigs.jdbcUrl),
- sqlToBeExecutedInJdbcRDD, connection
- )
- mutableJdbcOptions += (JdbcConfigs.jdbcPartitionColumns -> tempTableSchema.head.name)
- }
- }
- }
-
- if (!selectSQL.equals(sqlToBeExecutedInJdbcRDD)) {
- logger.info("Re-calculating the connection info as the SQL to be executed is changed ")
- val tuple = JdbcAuxiliaryUtilities.getConnectionInfo(sparkSession,
- jdbcConnectionUtility, dataSetProps, sqlToBeExecutedInJdbcRDD, loggerOption, partitionColumns)
- // below syntax to override compilation error
- connectionDetails = tuple._1
- connectionUtilityPerIncomingSQL = tuple._2
- }
-
- // create JDBC rdd
-
- logger.info(s"Final SQL for Query Push Down --> $sqlToBeExecutedInJdbcRDD")
- val tableSchema = JdbcReadUtility.resolveTable(
- mutableJdbcOptions(JdbcConfigs.jdbcUrl),
- sqlToBeExecutedInJdbcRDD,
- connection
- )
-
- JdbcAuxiliaryUtilities.createJdbcDataFrame(sparkSession, sqlToBeExecutedInJdbcRDD,
- connectionDetails.fetchSize, connectionDetails.numOfPartitions,
- connectionUtilityPerIncomingSQL, partitionColumns, tableSchema)
- }
- }
-
- def validateAllTablesAreFromSameJdbcSystem(sparkSession: SparkSession,
- tables: Seq[String],
- sqlToBeExecuted: String): (Boolean, Option[Map[String, String]]) = {
- val dataSetPropertiesForAllTables: Iterable[Option[DataSetProperties]] = tables.map {
- tableName =>
- Try(CatalogProvider.getDataSetProperties(tableName, mergeAllConfs(sparkSession))).toOption
- }
- if (dataSetPropertiesForAllTables.nonEmpty && dataSetPropertiesForAllTables.head.isDefined) {
- var queryPushDownFlag: Boolean = false
- val headJdbcUrl = dataSetPropertiesForAllTables.head.get.props.get(JdbcConfigs.jdbcUrl)
- if (headJdbcUrl.isDefined) {
- queryPushDownFlag = dataSetPropertiesForAllTables.forall {
- dataSetProperty =>
- dataSetProperty.isDefined && dataSetProperty.get.datasetType == GimelConstants.STORAGE_TYPE_JDBC &&
- dataSetProperty.get.props.contains(JdbcConfigs.jdbcUrl) &&
- headJdbcUrl.get.equalsIgnoreCase(dataSetProperty.get.props(JdbcConfigs.jdbcUrl))
- }
- }
- if (queryPushDownFlag && JdbcAuxiliaryUtilities.validatePushDownQuery(sparkSession,
- tables.head, sqlToBeExecuted)) {
- // Getting connection info from dataset properties else from the incoming properties
- (queryPushDownFlag, Some(JdbcAuxiliaryUtilities.getJDBCOptions(
- Map(GimelConstants.DATASET_PROPS -> dataSetPropertiesForAllTables.head.get)
- )))
- } else {
- (false, None)
- }
- } else {
- (false, None)
- }
- }
-
- def validateAllDatasetsAreFromSameJdbcSystem(datasets: Seq[String]): Boolean = {
- var areAllDatasetFromSameJdbcSystem: Boolean = false
- if (datasets.nonEmpty) {
- import com.paypal.gimel.parser.utilities.QueryParserUtils._
- val storageSystemName = Try(extractSystemFromDatasetName(datasets.head)).toOption
- if (storageSystemName.isDefined &&
- CatalogProvider.getStorageSystemProperties(
- storageSystemName.get
- )(GimelConstants.STORAGE_TYPE) == GimelConstants.STORAGE_TYPE_JDBC) {
- areAllDatasetFromSameJdbcSystem = datasets.forall {
- dataset =>
- Try {
- val storageSystemProperties =
- CatalogProvider.getStorageSystemProperties(extractSystemFromDatasetName(dataset))
- storageSystemProperties(GimelConstants.STORAGE_TYPE) == GimelConstants
- .STORAGE_TYPE_JDBC && dataset.contains(storageSystemName.get)
- }.getOrElse(false)
- }
- }
- }
- areAllDatasetFromSameJdbcSystem
- }
-
- /**
- * Returns a flag indicating whether the query has to be pushed down to the dataset or not, based on the datasets provided
- * and the user-supplied pushdown flag; this method is primarily called for a select-only clause
- *
- * @param originalSQL SQLString
- * @param selectSQL SQLString
- * @param sparkSession : SparkSession
- * @param dataSet Dataset Object
- * @return String flag indicating whether to push the query down or not
- */
- def getQueryPushDownFlag(originalSQL: String, selectSQL: String, sparkSession: SparkSession,
- dataSet: com.paypal.gimel.DataSet): String = {
- def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName
-
- logger.info(" @Begin --> " + MethodName)
-
- val tables = getTablesFrom(selectSQL)
- val userSuppliedPushDownFlag: Boolean = getQueryPushDownFlagFromConf(sparkSession)
-
- var queryPushDownFlag: Boolean = false
- if (userSuppliedPushDownFlag && tables.nonEmpty) {
- val (queryPushDownFlagR, jdbcOptions) = validateAllTablesAreFromSameJdbcSystem(sparkSession, tables, selectSQL)
- if (queryPushDownFlagR) {
- // if all the tables are from the same JDBC system then set query pushdown flag to be true
- queryPushDownFlag = queryPushDownFlagR
- logger.info(s"Since all the datasets are from same JDBC system overriding " +
- s"User specified flag: $userSuppliedPushDownFlag -> true " +
- s"with JDBC options: $jdbcOptions")
- } else {
- logger.info(s"Atleast one dataset is from an alternate JDBC system overriding " +
- s"User specified flag: $userSuppliedPushDownFlag -> false")
- }
- }
-
- logger.info(s"queryPushDownFlag for data sets${ArrayUtils.toString(tables)}:" +
- s" ${queryPushDownFlag.toString}")
- queryPushDownFlag.toString
- }
-
- /**
- * Resolves the query by replacing tmp tables in the query string.
- * For each tmp table placed in the query string, a DataSet.read is initiated.
- * For each tmp table, if the dataset is a Kafka DataSet, the KafkaDataSet object is accumulated.
- * The accumulated KafkaDataSet objects are used at the end of the query (on success)
- * to trigger checkpointing for each topic consumed.
- *
- * @param sql SQL String
- * @param sparkSession : SparkSession
- * @param dataSet DataSet
- * @return Tuple (original SQL, target table, select SQL string, List(GimelDataSet), query pushdown flag)
- */
- def resolveSQL(sql: String, sparkSession: SparkSession, dataSet: com.paypal.gimel.DataSet):
- (String, Option[String], String, List[GimelDataSet], String) = {
- def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName
-
- logger.info("@Begin --> " + MethodName)
-
- logger.info(s"incoming SQL --> $sql")
- val uniformSQL = sql.replace("\n", " ")
- val selectClauseOnly = getSelectClause(uniformSQL)
- val (originalSQL, selectClause, kafkaDataSets, queryPushDownFlag) =
- resolveSQLWithTmpTables(sql, selectClauseOnly, sparkSession, dataSet)
- val targetTable = getTargetTables(sql)
- logger.info(s"selectClause --> $selectClause")
- logger.info(s"destination --> $targetTable")
- (originalSQL, targetTable, selectClause, kafkaDataSets, queryPushDownFlag)
- }
-
- /**
- * Checks whether a PARTITIONED BY clause is present so that we can pull out the partition spec
- *
- * @param sql - incoming sql string
- * @return - Boolean indicating whether a partitioned-by clause is present or not
- */
- def existsPartitionedByClause(sql: String): Boolean = {
- def MethodName: String = new Exception().getStackTrace.apply(1).getMethodName
- logger.info(" @Begin --> " + MethodName)
- sql.toUpperCase().contains(GimelConstants.HIVE_DDL_PARTITIONED_BY_CLAUSE)
- }
-
- /**
- * Checks the config to see if complete pushdown is enabled;
- * if so, returns the transformed SQL and the JDBC options
- *
- * @param sparkSession -> Created SparkSession
- * @param sql -> Incoming SQL to be executed
- * @return
- */
- def isJdbcCompletePushDownEnabled(sparkSession: SparkSession,
- sql: String): (Boolean, Option[String], Option[Map[String, String]]) = {
- logger.info(s"@Begin --> ${new Exception().getStackTrace.apply(1).getMethodName}")
- val userSuppliedPushDownFlag: Boolean = getQueryPushDownFlagFromConf(sparkSession)
- val isSelectQuery = QueryParserUtils.isSelectQuery(sql)
- logger.info(s"Is select query: $isSelectQuery")
- var resultTuple: (Boolean, Option[String], Option[Map[String, String]]) = (false, None, None)
- if (userSuppliedPushDownFlag && !isSelectQuery) {
- val tables = getAllTableSources(sql)
- // val datasets = SQLDataTypesUtils.getDatasets(sql)
- logger.info(s"Received tables: $tables for the query: $sql")
- // if sql's target tables are of the same JDBC system
- if (validateAllTablesAreFromSameJdbcSystem(sparkSession, tables, sqlToBeExecuted = sql)._1) {
- logger.info("All datasets are from the same JDBC system")
- // Table-list emptiness is already checked in validateAllTablesAreFromSameJdbcSystem, so tables.head is safe
-
- val transformedSQL = QueryParserUtils.transformUdcSQLtoJdbcSQL(sql, tables)
- import com.paypal.gimel.common.utilities.DataSetUtils._
- val systemName = QueryParserUtils.extractSystemFromDatasetName(tables.head)
- resultTuple = (true, Some(transformedSQL), Some(getJdbcConnectionOptions(systemName, sparkSession.conf.getAll)))
- } else {
- logger.info("Not all the datasets are from the same JDBC system")
- }
- } else if (userSuppliedPushDownFlag && isSelectQuery) {
- // Set partitioning to be 1
- // sparkSession.conf.set(JdbcConfigs.jdbcCompletePushdownSelectEnabled, value = true)
- logger.info(s"As we received a select query with pushdown flag enabled: $userSuppliedPushDownFlag," +
- s" we redirect the output to dataset reader -> Query: $sql")
- }
- resultTuple
- }
-
- private def getQueryPushDownFlagFromConf(sparkSession: SparkSession): Boolean = {
- // User supplied push down flag will be overridden if all the datasets are from the same JDBC system
- val userSuppliedPushDownFlag = Try(
- sparkSession.conf.get(JdbcConfigs.jdbcPushDownEnabled, "true").toBoolean
- ).getOrElse(true)
- logger.info(s"User specified pushdown flag: $userSuppliedPushDownFlag")
- userSuppliedPushDownFlag
- }
-
- /**
- * Utility for executing pushdown queries on the respective JDBC system, based on the incoming dataset's properties
- *
- * @param sparkSession
- * @param sql
- * @param jdbcOptions
- * @return
- */
- def pushDownQueryAndReturnResult(sparkSession: SparkSession,
- sql: String,
- jdbcOptions: Map[String, String]): String = {
- val jdbcConnectionUtility: JDBCConnectionUtility = validateAndGetJdbcConnectionUtility(sparkSession, jdbcOptions)
- val functionName = s"[QueryHash: ${sql.hashCode}]"
- logger.info(s"Proceeding to execute JDBC[System: ${jdbcConnectionUtility.jdbcSystem}," +
- s" User: ${jdbcConnectionUtility.jdbcUser}] pushdown query$functionName: $sql")
- GenericUtils.time(functionName, Some(logger)) {
- val queryResult: String =
- JDBCConnectionUtility.withResources(
- JDBCUtilities.getOrCreateConnection(jdbcConnectionUtility, logger = Some(logger))
- ) {
- connection => JdbcAuxiliaryUtilities.executeQueryAndReturnResultString(sql, connection)
- }
- queryResult
- }
- }
-
- private def validateAndGetJdbcConnectionUtility(sparkSession: SparkSession,
- jdbcOptions: Map[String, String]): JDBCConnectionUtility = {
- logger.info(s" @Begin --> ${new Exception().getStackTrace.apply(1).getMethodName}")
- logger.info(s"Received JDBC options: $jdbcOptions")
- if (!jdbcOptions.contains(JdbcConfigs.jdbcUrl)) {
- throw new IllegalArgumentException("No JDBC url found. Please verify the dataset name in query")
- }
-
- JDBCConnectionUtility(sparkSession, jdbcOptions)
- }
-
- def createPushDownQueryDataframe(sparkSession: SparkSession,
- sql: String,
- jdbcOptions: Map[String, String]): DataFrame = {
- val jdbcConnectionUtility: JDBCConnectionUtility = validateAndGetJdbcConnectionUtility(sparkSession, jdbcOptions)
- val pushDownJdbcRDD =
- new PushDownJdbcRDD(sparkSession.sparkContext, new DbConnection(jdbcConnectionUtility), sql)
- sparkSession.createDataFrame(pushDownJdbcRDD, JdbcConstants.DEF_JDBC_PUSH_DOWN_SCHEMA)
- }
-
- private lazy val userInfoString =
- s"""
- |------------------------------
- |User controllable Properties
- |------------------------------
- |
- |Query Results & Helper
- |----------------------
- |${GimelConstants.SHOW_ROWS_ENABLED} --> Set this to "true" to stop getting all these messages. (Default : false)
- |${GimelConstants.MAX_RESULTS_TO_SHOW} --> Number of rows to display in interactive mode (Default : 1000)
- |
- |Data Caching Options
- |----------------------
- |${GimelConstants.DATA_CACHE_IS_ENABLED} --> true indicates dataset caching is enabled (Default : false)
- |${GimelConstants.DATA_CACHE_IS_ENABLED}.for.pcatalog.flights --> if this = true & ${GimelConstants.DATA_CACHE_IS_ENABLED}=true, then only pcatalog.flights from query will be cached. (Default : false)
- |${GimelConstants.DATA_CACHE_IS_ENABLED_FOR_ALL} --> if this = true, then all pcatalog datasets in query will be cached (Default : true)
- |
- |Logging Level
- |----------------------
- |${GimelConstants.LOG_LEVEL} --> set to INFO, DEBUG, WARN, ERROR to get desired level of logging (Default : ERROR)
- |
- |kafka Checkpointing
- |----------------------
- |${KafkaConfigs.kafkaConsumerReadCheckpointKey} --> true indicates check-pointing enabled (Default : true)
- |${KafkaConfigs.kafkaConsumerClearCheckpointKey} --> true indicates checkpoint will be cleared before this run begins (Default : false)
- |
- |kafka Stream Throttle
- |----------------------
- |${KafkaConfigs.maxRatePerPartitionKey} --> Spark Configuration for Streaming Rate (Default : 3600, empirically derived)
- |${KafkaConfigs.isStreamParallelKey} --> true causes ordering to be lost, but performance gain via parallelism factor. (Default : true)
- |${KafkaConfigs.streamParallelKey} --> Number of parallel threads to run while processing data after fetching from kafka (Default : 10)
- |${KafkaConfigs.defaultBatchInterval} --> Streaming Window Seconds (Default : 30)
- |
- |kafka Batch Throttle
- |----------------------
- |${KafkaConfigs.rowCountOnFirstRunKey} --> Fetches Only Supplied number of rows from Kafka (Default : 25 Million)
- |${KafkaConfigs.maxRecordsPerPartition} --> Advanced options to further restrict how many messages we can read from each partition - in batch mode Kafka Read (Default 25 Million rows, for this to be effective, value should be <= throttle.batch.fetchRowsOnFirstRun)
- |${KafkaConfigs.batchFetchSize} --> Advanced options to parallelize in batch mode Kafka Read (Default 250) --> This will parallelize 25 Million into 250 threads
- |
- |HBase
- |-----------------------
- |${HbaseConfigs.hbaseOperation} -> Type of operation to be performed on HBase. Can be scan for reading all data or get for lookup
- |${HbaseConfigs.hbaseFilter} -> Filter condition for HBase lookup. Example: rowKey=1:toGet=cf1-c1,c2|cf2-c3
- |
- |Elastic
- |-----------------------
- |${ElasticSearchConfigs.esIsPartitioned}-> Is the index partitioned or not ?
- |${ElasticSearchConfigs.esDelimiter}-> What is the delimiter which separates the index name with the partition
- |${ElasticSearchConfigs.esPartition}-> "*" -> wild card to include all the specific partitions
- |${ElasticSearchConfigs.esDefaultReadForAllPartitions}-> flag which indicates whether to read all partitions or not
- |${ElasticSearchConfigs.esMapping}-> flag which gets the schema from the user
- """.stripMargin
-}
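For readers following the complete-pushdown flow removed above, a minimal usage sketch is shown below. It relies only on the method signatures visible in the deleted code (isJdbcCompletePushDownEnabled, pushDownQueryAndReturnResult); the enclosing object is assumed to be GimelQueryUtils, matching the tests further down, and the wrapper name tryCompletePushDown is illustrative only.

    import org.apache.spark.sql.SparkSession

    // Illustrative wrapper (not part of the deleted file): run a non-select statement
    // directly on the JDBC system when complete pushdown applies, otherwise signal the
    // caller to fall back to the regular dataset reader.
    def tryCompletePushDown(spark: SparkSession, sql: String): Option[String] =
      GimelQueryUtils.isJdbcCompletePushDownEnabled(spark, sql) match {
        case (true, Some(transformedSql), Some(jdbcOptions)) =>
          // All referenced tables resolve to the same JDBC system; execute the rewritten SQL there.
          Some(GimelQueryUtils.pushDownQueryAndReturnResult(spark, transformedSql, jdbcOptions))
        case _ =>
          // Select queries, or SQL spanning multiple systems, are handled by the dataset reader.
          None
      }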
diff --git a/gimel-dataapi/gimel-sql/src/main/scala/com/paypal/gimel/sql/SQLParser.scala b/gimel-dataapi/gimel-sql/src/main/scala/com/paypal/gimel/sql/SQLParser.scala
deleted file mode 100644
index 01be8062..00000000
--- a/gimel-dataapi/gimel-sql/src/main/scala/com/paypal/gimel/sql/SQLParser.scala
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * Copyright 2018 PayPal Inc.
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.paypal.gimel.sql
-
-import scala.collection.mutable.ListBuffer
-import scala.util._
-
-import org.apache.hadoop.hive.ql.parse.{ASTNode, HiveParser, ParseDriver}
-
-object SQLParser {
-
- /**
- * getSourceTables - Helper that invokes the recursive AST traversal to get the source table names
- *
- * @param sql to be parsed
- * @return - List of source table names
- */
-
- @deprecated
- def getSourceTables(sql: String): ListBuffer[String] = {
- val parsDri = new ParseDriver()
- val ast_tree: ASTNode = parsDri.parse(sql)
- getSourceTables(ast_tree)
- }
-
-
- /**
- * getTargetTables - Extracts the target table name (if any) from the SQL by tokenizing it
- *
- * @param sql to be parsed
- * @return - The target table, if any. For a select-only query, it returns None.
- */
-
- def getTargetTables(sql: String): Option[String] = {
- Try {
- GimelQueryUtils.isHavingInsert(sql) match {
- case false => None
- case true =>
- val lSql = sql.toLowerCase()
- val tokens = GimelQueryUtils.tokenizeSql(lSql)
- val tableIndex = tokens.contains("table") match {
- case true => tokens.indexOf("table")
- case false => tokens.indexOf("into")
- }
- Some(tokens(tableIndex + 1))
- }
- } match {
- case Success(x) => x
- case Failure(f) =>
- throw new Exception(
- s"""
- |ERROR PARSING SQL IN Gimel --> ${sql}
- |Exception --> ${f}
- |PLEASE VALIDATE IF SQL IS FORMED CORRECTLY.
- """.stripMargin)
- }
- }
-
- // TODO - Following two functions can be combined later.
-
- /**
- * getSourceTables - Recursive function to get the source table names
- *
- * @param from - AST tree
- * @param myList - list of source table names
- */
- private def getSourceTables(from: ASTNode,
- myList: ListBuffer[String] = new ListBuffer[String]()): ListBuffer[String] = {
- var table: String = ""
-
- if (from != null) {
-
- if (HiveParser.TOK_TABREF == from.getType) {
- val tabName = from.getChild(0)
-
- if (HiveParser.TOK_TABNAME == tabName.getType) {
- if (tabName.getChildCount == 2) {
- table = tabName.getChild(0).getText + "." + tabName.getChild(1).getText
- } else {
- table = tabName.getChild(0).getText
- }
- myList += table
- }
- }
-
- for (i <- 0 until from.getChildCount) {
- val child = from.getChild(i)
- if (child != null) {
- getSourceTables(child.asInstanceOf[ASTNode], myList)
- }
- }
- }
- myList
- }
-
-}
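A brief, hedged usage sketch of the removed parser helpers; the behaviour mirrors the SQLParserSpec expectations further below, and the expected outputs in the comments are illustrative.

    import com.paypal.gimel.sql.SQLParser

    // Target-table extraction tokenizes the SQL and takes the word after
    // "table" (or after "into" when "table" is absent).
    val target = SQLParser.getTargetTables(
      "INSERT INTO data_source_tab2 SELECT id FROM src")        // Some("data_source_tab2")

    // Source-table extraction (deprecated) walks the Hive AST produced by ParseDriver.
    val sources = SQLParser.getSourceTables(
      "SELECT a.id FROM db.src a JOIN db.lkp b ON a.id = b.id") // expected: ListBuffer(db.src, db.lkp)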
diff --git a/gimel-dataapi/gimel-sql/src/test/scala/com/paypal/gimel/sql/GimelQueryProcessorTest.scala b/gimel-dataapi/gimel-sql/src/test/scala/com/paypal/gimel/sql/GimelQueryProcessorTest.scala
deleted file mode 100644
index cffd38d2..00000000
--- a/gimel-dataapi/gimel-sql/src/test/scala/com/paypal/gimel/sql/GimelQueryProcessorTest.scala
+++ /dev/null
@@ -1,267 +0,0 @@
-/*
- * Copyright 2018 PayPal Inc.
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.paypal.gimel.sql
-
-import org.apache.spark.sql.{DataFrame, SparkSession}
-import org.scalatest._
-
-class GimelQueryProcessorTest extends FunSpec with Matchers with BeforeAndAfter {
- var sparkSession : SparkSession = _
-
- before {
- sparkSession = SparkSession.builder().appName("GimelQueryProcessor Test")
- .master("local")
- .getOrCreate()
-
- // HBaseLocalClient.startHbaseCluster(sparkSession)
- }
-
- after {
- sparkSession.close()
- // HBaseLocalClient.stopHbaseCluster()
- }
-
- /*
- * The test cases below are ignored due to https://github.com/elastic/elasticsearch-hadoop/issues/1097
- * To run them: change "ignore" to "it", then
- * exclude either elasticsearch-hadoop or elasticsearch-spark from the dependencies by changing their scope to provided in gimel-elasticsearch
- * OR Change the elasticsearch-hadoop version to 6.6.0 or 7.0.0
- */
-
- ignore("should test json in Hdfs dataset read via sql") {
- val gsql: String => DataFrame = com.paypal.gimel.sql.GimelQueryProcessor.executeBatch(_: String, sparkSession)
- val className = new com.paypal.gimel.hdfs.DataSet(sparkSession).getClass;
- val resourcesPath = "file://" + (className.getResource("/hdfs_test.json")).getPath
- val props : Map[String, String] = Map("gimel.hdfs.data.location" -> resourcesPath,
- "gimel.hdfs.data.format" -> "json",
- "gimel.hive.db.name" -> "db",
- "gimel.hive.table.name" -> "table",
- "gimel.hdfs.nn" -> "file:/")
- gsql("set gimel.catalog.provider=USER")
- val dataSetProperties = s"""
- {
- "datasetType" : "HDFS",
- "fields" : [],
- "partitionFields" : [],
- "props": {
- "gimel.hdfs.data.location" : "$resourcesPath",
- "gimel.hdfs.data.format" : "json",
- "gimel.hive.db.name" : "db",
- "gimel.hive.table.name" : "table",
- "gimel.hdfs.nn" : "file:/",
- "datasetName" : "MyDataset"
- }
- }"""
- gsql("set gimel.catalog.provider=USER")
- gsql(s"""set udc.hdfs.json.dataSetProperties=$dataSetProperties""")
- val res: DataFrame = gsql(s"""select * from udc.hdfs.json""")
- assert(res.count() == 1)
- }
-
- ignore("should test csv in Hdfs dataset read via sql") {
- val gsql: String => DataFrame = com.paypal.gimel.sql.GimelQueryProcessor.executeBatch(_: String, sparkSession)
- val className = new com.paypal.gimel.hdfs.DataSet(sparkSession).getClass;
- val resourcesPath = "file://" + (className.getResource("/hdfs_test.csv")).getPath
- val props : Map[String, String] = Map("gimel.hdfs.data.location" -> resourcesPath,
- "gimel.hdfs.data.format" -> "json",
- "gimel.hive.db.name" -> "db",
- "gimel.hive.table.name" -> "table",
- "gimel.hdfs.nn" -> "file:/")
- gsql("set gimel.catalog.provider=USER")
- val dataSetProperties = s"""
- {
- "datasetType" : "HDFS",
- "fields" : [],
- "partitionFields" : [],
- "props": {
- "gimel.hdfs.data.location" : "$resourcesPath",
- "gimel.hdfs.data.format" : "json",
- "gimel.hive.db.name" : "db",
- "gimel.hive.table.name" : "table",
- "gimel.hdfs.nn" : "file:/",
- "datasetName" : "MyDataset"
- }
- }"""
- gsql("set gimel.catalog.provider=USER")
- gsql(s"""set udc.hdfs.json.dataSetProperties=$dataSetProperties""")
- val res: DataFrame = gsql(s"""select * from udc.hdfs.json""")
- assert(res.count() == 1)
- }
-
- ignore("should test text in Hdfs dataset read via sql") {
- val gsql: String => DataFrame = com.paypal.gimel.sql.GimelQueryProcessor.executeBatch(_: String, sparkSession)
- val className = new com.paypal.gimel.hdfs.DataSet(sparkSession).getClass;
- val resourcesPath = "file://" + (className.getResource("/hdfs_test.txt")).getPath
- val props : Map[String, String] = Map("gimel.hdfs.data.location" -> resourcesPath,
- "gimel.hdfs.data.format" -> "json",
- "gimel.hive.db.name" -> "db",
- "gimel.hive.table.name" -> "table",
- "gimel.hdfs.nn" -> "file:/")
- gsql("set gimel.catalog.provider=USER")
- val dataSetProperties = s"""
- {
- "datasetType" : "HDFS",
- "fields" : [],
- "partitionFields" : [],
- "props": {
- "gimel.hdfs.data.location" : "$resourcesPath",
- "gimel.hdfs.data.format" : "json",
- "gimel.hive.db.name" : "db",
- "gimel.hive.table.name" : "table",
- "gimel.hdfs.nn" : "file:/",
- "datasetName" : "MyDataset"
- }
- }"""
- gsql("set gimel.catalog.provider=USER")
- gsql(s"""set udc.hdfs.json.dataSetProperties=$dataSetProperties""")
- val res: DataFrame = gsql(s"""select * from udc.hdfs.json""")
- assert(res.count()==1)
- }
-
- ignore("should test avro in Hdfs dataset read via sql") {
- val gsql: String => DataFrame = com.paypal.gimel.sql.GimelQueryProcessor.executeBatch(_: String, sparkSession)
- val className = new com.paypal.gimel.hdfs.DataSet(sparkSession).getClass;
- val resourcesPath = "file://" + (className.getResource("/hdfs_test.avro")).getPath
- val props : Map[String, String] = Map("gimel.hdfs.data.location" -> resourcesPath,
- "gimel.hdfs.data.format" -> "json",
- "gimel.hive.db.name" -> "db",
- "gimel.hive.table.name" -> "table",
- "gimel.hdfs.nn" -> "file:/")
- gsql("set gimel.catalog.provider=USER")
- val dataSetProperties = s"""
- {
- "datasetType" : "HDFS",
- "fields" : [],
- "partitionFields" : [],
- "props": {
- "gimel.hdfs.data.location" : "$resourcesPath",
- "gimel.hdfs.data.format" : "json",
- "gimel.hive.db.name" : "db",
- "gimel.hive.table.name" : "table",
- "gimel.hdfs.nn" : "file:/",
- "datasetName" : "MyDataset"
- }
- }"""
- gsql("set gimel.catalog.provider=USER")
- gsql(s"""set udc.hdfs.json.dataSetProperties=$dataSetProperties""")
- val res: DataFrame = gsql(s"""select * from udc.hdfs.json""")
- assert(res.count() == 2)
- }
-
- ignore("should test gz in Hdfs dataset read via sql") {
- val gsql: String => DataFrame = com.paypal.gimel.sql.GimelQueryProcessor.executeBatch(_: String, sparkSession)
- val className = new com.paypal.gimel.hdfs.DataSet(sparkSession).getClass;
- val resourcesPath = "file://" + (className.getResource("/hdfs_test.txt.gz")).getPath
- val props : Map[String, String] = Map("gimel.hdfs.data.location" -> resourcesPath,
- "gimel.hdfs.data.format" -> "json",
- "gimel.hive.db.name" -> "db",
- "gimel.hive.table.name" -> "table",
- "gimel.hdfs.nn" -> "file:/")
- gsql("set gimel.catalog.provider=USER")
- val dataSetProperties = s"""
- {
- "datasetType" : "HDFS",
- "fields" : [],
- "partitionFields" : [],
- "props": {
- "gimel.hdfs.data.location" : "$resourcesPath",
- "gimel.hdfs.data.format" : "json",
- "gimel.hive.db.name" : "db",
- "gimel.hive.table.name" : "table",
- "gimel.hdfs.nn" : "file:/",
- "datasetName" : "MyDataset"
- }
- }"""
- gsql("set gimel.catalog.provider=USER")
- gsql(s"""set udc.hdfs.json.dataSetProperties=$dataSetProperties""")
- val res: DataFrame = gsql(s"""select * from udc.hdfs.json""")
- assert(res.count() == 1)
- }
-
- ignore("should test sequence in Hdfs dataset read via sql") {
- val gsql: String => DataFrame = com.paypal.gimel.sql.GimelQueryProcessor.executeBatch(_: String, sparkSession)
- val className = new com.paypal.gimel.hdfs.DataSet(sparkSession).getClass;
- val resourcesPath = "file://" + (className.getResource("/hdfs_test.seq")).getPath
- val props : Map[String, String] = Map("gimel.hdfs.data.location" -> resourcesPath,
- "gimel.hdfs.data.format" -> "json",
- "gimel.hive.db.name" -> "db",
- "gimel.hive.table.name" -> "table",
- "gimel.hdfs.nn" -> "file:/")
- gsql("set gimel.catalog.provider=USER")
- val dataSetProperties = s"""
- {
- "datasetType" : "HDFS",
- "fields" : [],
- "partitionFields" : [],
- "props": {
- "gimel.hdfs.data.location" : "$resourcesPath",
- "gimel.hdfs.data.format" : "json",
- "gimel.hive.db.name" : "db",
- "gimel.hive.table.name" : "table",
- "gimel.hdfs.nn" : "file:/",
- "datasetName" : "MyDataset"
- }
- }"""
- gsql("set gimel.catalog.provider=USER")
- gsql(s"""set udc.hdfs.json.dataSetProperties=$dataSetProperties""")
- val res: DataFrame = gsql(s"""select * from udc.hdfs.json""")
- assert(res.count() == 100)
- }
-
-// ignore("should test hbase write") {
-// val tableName = "test_table"
-// val gsql: String => DataFrame = com.paypal.gimel.scaas.GimelQueryProcessor.executeBatch(_: String, sparkSession)
-// gsql("set " + HbaseConfigs.hbaseRowKey + "=id")
-// gsql("set " + HbaseConfigs.hbaseColumnMappingKey + "=personal:name,personal:address,personal:age,professional:company,professional:designation,professional:salary")
-// val dataFrame = HBaseLocalClient.mockDataInDataFrame(sparkSession, 1000)
-// dataFrame.registerTempTable("input_table")
-// val sql = "insert into HBase.Local.default." + tableName + " select * from input_table"
-// val df = gsql(sql)
-// df.show
-// }
-//
-// ignore("should test hbase read with limit") {
-// val metrics = ArrayBuffer.empty[(String, QueryExecution, Long)]
-// val listener = new QueryExecutionListener {
-// // Only test successful case here, so no need to implement `onFailure`
-// override def onFailure(funcName: String, qe: QueryExecution, exception: Exception): Unit = {}
-// override def onSuccess(funcName: String, qe: QueryExecution, duration: Long): Unit = {
-// metrics += ((funcName, qe, duration))
-// }
-// }
-// sparkSession.listenerManager.register(listener)
-// val tableName = "test_table"
-// val gsql: String => DataFrame = com.paypal.gimel.scaas.GimelQueryProcessor.executeBatch(_: String, sparkSession)
-// gsql("set " + HbaseConfigs.hbaseRowKey + "=id")
-// gsql("set " + HbaseConfigs.hbaseColumnMappingKey + "=personal:name,personal:address,personal:age,professional:company,professional:designation,professional:salary")
-// sparkSession.conf.set(GimelConstants.HBASE_PAGE_SIZE, 20)
-// val sql = "select * from HBase.Local.default." + tableName + " limit 20"
-// val df = gsql(sql)
-// df.show(20)
-// val metricInsertQuery = metrics(metrics.length - 1)
-// val qe = metricInsertQuery._2
-// println(qe.executedPlan.children(0).children(0).children(0).metrics)
-// val kafkaReadOutputRows = qe.executedPlan.children(0).children(0).children(0).metrics("numOutputRows").value
-// assert(kafkaReadOutputRows == 20)
-// sparkSession.conf.unset(GimelConstants.HBASE_PAGE_SIZE)
-//
-// }
-
-}
diff --git a/gimel-dataapi/gimel-sql/src/test/scala/com/paypal/gimel/sql/GimelQueryUtilsSpec.scala b/gimel-dataapi/gimel-sql/src/test/scala/com/paypal/gimel/sql/GimelQueryUtilsSpec.scala
deleted file mode 100644
index 812dfd43..00000000
--- a/gimel-dataapi/gimel-sql/src/test/scala/com/paypal/gimel/sql/GimelQueryUtilsSpec.scala
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright 2018 PayPal Inc.
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package com.paypal.gimel.sql
-
-import org.scalatest.{BeforeAndAfterEach, FunSpec, Matchers}
-
-class GimelQueryUtilsSpec
- extends FunSpec
- with SharedSparkSession
- with Matchers
- with BeforeAndAfterEach {
-
- // add things to do before each test for this specific file
- protected override def beforeEach(): Unit = {
- GimelQueryUtils.setCatalogProvider("UDC")
- }
-
- // add things to do after each test for this specific file
- protected override def afterEach(): Unit = {
- GimelQueryUtils.setCatalogProvider("UDC")
- }
-
- describe("setCatalogProvider") {
- it("should set user specified CatalogProvider") {
-
- // UDC Catalog provider
- GimelQueryUtils.setCatalogProvider("UDC")
- GimelQueryUtils.getCatalogProvider() should be("UDC")
-
- GimelQueryUtils.setCatalogProvider("HIVE")
- GimelQueryUtils.getCatalogProvider() should be("HIVE")
-
- GimelQueryUtils.setCatalogProvider("PCATALOG")
- GimelQueryUtils.getCatalogProvider() should be("PCATALOG")
-
- }
-
- it("should throw warning if the catalog provider is not UDC/HIVE/PCATALOG") {
- GimelQueryUtils.setCatalogProvider("TEST")
- GimelQueryUtils.getCatalogProvider() should be("UDC")
-
- }
- }
-
- describe("tokenizeSql") {
- it("should tokenize the string passed to it") {
- GimelQueryUtils.tokenizeSql(SQLMasterList.simpleInsertSelect2) should be(
- Array(
- "INSERT",
- "INTO",
- "UDC.Mysql.datalake.test.YELP_REVIEW_WRITE",
- "SELECT",
- "*",
- "FROM",
- "udc.kafka.tau.yelp.review"
- )
- )
- }
- }
-
-}
diff --git a/gimel-dataapi/gimel-sql/src/test/scala/com/paypal/gimel/sql/GimelQueryUtilsTest.scala b/gimel-dataapi/gimel-sql/src/test/scala/com/paypal/gimel/sql/GimelQueryUtilsTest.scala
deleted file mode 100644
index 7066c9ae..00000000
--- a/gimel-dataapi/gimel-sql/src/test/scala/com/paypal/gimel/sql/GimelQueryUtilsTest.scala
+++ /dev/null
@@ -1,312 +0,0 @@
-/*
- * Copyright 2018 PayPal Inc.
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.paypal.gimel.sql
-
-import scala.util.{Failure, Success, Try}
-
-import org.apache.commons.lang3.StringUtils
-import org.mockito.Mockito._
-import org.scalamock.scalatest.MockFactory
-import org.scalatest._
-
-import com.paypal.gimel.common.catalog.CatalogProvider
-import com.paypal.gimel.common.gimelservices.{GimelServicesProperties, GimelServiceUtilities}
-import com.paypal.gimel.parser.utilities.{QueryParserUtils, SearchSchemaUtils}
-
-class GimelQueryUtilsTest extends FunSuite with Matchers with MockFactory {
-
- import com.paypal.gimel.parser.utilities.QueryParserUtils._
-
- test("Test Extraction of tablename ") {
- assert(
- extractTableName(
- "udc.teradata.test_cluster.yelp.review"
- ) == "yelp.review"
- )
- assert(
- extractTableName(
- "udc.Teradata.Test_cluster.yelp.business_details"
- ) == "yelp.business_details"
- )
- val tableName = "udc.teradata.test_cluster.yelp.review"
- assert(extractTableName(tableName, 1) === "review")
- assert(extractTableName(tableName, 2) === "yelp.review")
- testErrorCase(extractTableName("", 2))
- testErrorCase(
- extractTableName("yelp.review", 2)
- )
- testErrorCase(extractTableName(null, 2))
- testErrorCase(extractTableName("tablename") === "tablename")
- }
-
- test("Execute query ") {
- println(
- QueryParserUtils.isQueryOfGivenSeqType(
- "sel * from udc.Teradata.Test_cluster.yelp.business_details sample 10;"
- )
- )
- println(
- QueryParserUtils.isQueryOfGivenSeqType(
- "sel * fromudc.Teradata.Test_cluster.yelp.reviews sample 10;"
- )
- )
- println(
- QueryParserUtils.isQueryOfGivenSeqType(
- "DELETE udc.Teradata.Test_cluster.yelp.business_details ALL"
- )
- )
- }
-
- test("Transform SQL name") {
- println(
- QueryParserUtils.isQueryOfGivenSeqType(
- "show select * from udc.Teradata.Test_cluster.yelp.business_details"
- )
- )
- validateTransformSQL(
- "DELETE udc.Teradata.Test_cluster.yelp.business_details ALL",
- "DELETE yelp.business_details ALL"
- )
- validateTransformSQL(
- "show select * from udc.Teradata.Test_cluster.yelp.business_details",
- "show select * from yelp.business_details"
- )
- validateTransformSQL(
- """ INSERT INTO udc.Teradata.Test_cluster.yelp.business_details (
- | id,
- | created_date )
- | VALUES ('123fvf', '2019-08-09')""".stripMargin,
- """ INSERT INTO yelp.business_details (
- | id,
- | created_date )
- | VALUES ('123fvf', '2019-08-09')""".stripMargin
- )
- }
-
- def testErrorCase[R](block: => R): Option[R] = {
- Try(block) match {
- case Success(value) => Option(value)
- case Failure(exception) =>
- exception.printStackTrace()
- None
- }
- }
-
- test("Replace SQL") {
- val sql =
- """INSERT INTO udc.teradata.test_cluster.yelp.business_details VALUES ('123fvfv',
- |'2019-08-05')""".stripMargin
- var transformedSQL = sql
- val tables = GimelQueryUtils.getAllTableSources(
- sql,
- searchList = SearchSchemaUtils.ALL_TABLES_SEARCH_CRITERIA
- )
- println("Tables -> " + tables)
- tables.foreach(
- tableName =>
- transformedSQL =
- transformedSQL.replaceAll(tableName, extractTableName(tableName))
- )
- println("transformedSQL -> " + transformedSQL)
- assert(
- transformedSQL ===
- """INSERT INTO yelp.business_details VALUES ('123fvfv',
- |'2019-08-05')""".stripMargin
- )
- }
-
- private def validateTransformSQL(sql: String, assertString: String) = {
- assert(
- transformSQL(sql, QueryParserUtils.getDatasets(sql)) === assertString
- )
- }
-
- def transformSQL(sql: String, datasets: Seq[String]): String = {
- var transformedSQL = sql
- datasets.foreach(
- datasetName =>
- transformedSQL = StringUtils.replaceIgnoreCase(
- transformedSQL,
- datasetName,
- extractTableName(datasetName)
- )
- )
-
- transformedSQL
- }
-
- ignore("No connection to UDC service: TC 2") {
- test("validateAllDatasetsAreFromSameJdbcSystem") {
-
- val gimelServiceProps = spy(new GimelServicesProperties())
- val serviceUtilities = mock[GimelServiceUtilities]
- when(
- serviceUtilities
- .getSystemAttributesMapByName("udc.teradata.test_cluster.yelp.review")
- ).thenReturn(
- Map(
- "gimel.storage.type" -> "JDBC",
- "gimel.jdbc.url" -> "jdbc:teradata://teradata-host",
- "gimel.jdbc.driver.class" -> "com.teradata.jdbc.TeraDriver",
- "storageSystemID" -> "11"
- )
- )
- println(
- CatalogProvider
- .getStorageSystemProperties("udc.teradata.test_cluster.yelp.review")
- )
- println("Hello")
- }
- }
-
- test("extractSystemFromDatasetName") {
- assert(
- extractSystemFromDatasetName("udc.teradata.test_cluster.yelp.review") === "teradata.test_cluster"
- )
- try {
- extractSystemFromDatasetName("yelp.review")
- } catch {
- case e: IllegalStateException => e.printStackTrace()
- }
- try {
- extractSystemFromDatasetName(null)
- } catch {
- case e: IllegalArgumentException => e.printStackTrace()
- }
- assert(
- extractSystemFromDatasetName("udc. kafka.test_cluster.yelp.review ") === "kafka.test_cluster"
- )
- }
-
- test(" IS Select Quey") {
- assert(
- QueryParserUtils.isSelectQuery(
- "select * from udc.teradata.test_cluster.yelp.review sample 10;"
- )
- )
- }
-
- test(" getTablesFrom SQL ") {
- assert(
- GimelQueryUtils
- .getTablesFrom("help table udc.teradata.test_cluster.yelp.review;")
- .sameElements(Array("udc.teradata.test_cluster.yelp.review"))
- )
-
- assert(
- GimelQueryUtils
- .getAllTableSources(
- "help table udc.teradata.test_cluster.yelp.review;",
- searchList = SearchSchemaUtils.TARGET_TABLES_SEARCH_CRITERIA
- ) == List("udc.teradata.test_cluster.yelp.review")
- )
-
- assert(
- GimelQueryUtils
- .getAllTableSources(
- """
- |create multiset table ${targetDb}.enriched_data as
- |select
- | review.review_id,
- | review.review_text,
- | review.user_id,
- | review.review_date,
- | review.business_id,
- | business_details.name as business_name,
- | postal_geo_map.latitude as business_latitude,
- | postal_geo_map.longitude as business_longitude,
- | yelp_user.name as user_name,
- | yelp_user.review_count as user_review_count,
- | yelp_user.yelping_since as user_yelping_since
- |from
- | pcatalog.teradata.tau.yelp.review review
- |inner join
- | pcatalog.teradata.tau.yelp.business_details business_details
- |on
- | review.business_id = business_details.business_id
- |join
- | pcatalog.teradata.tau.yelp.business_address business_address
- |on
- | review.business_id = business_address.business_id
- |join
- | pcatalog.teradata.tau.yelp.user yelp_user
- |on
- | yelp_user.user_id = review.user_id
- |join
- | pcatalog.teradata.tau.yelp.postal_geo_map
- |on
- | business_address.postal_code = postal_geo_map.postal_code
- |where
- | review.review_date > current_date -150
- |and
- | review.business_id = 'ogpiys3gnfZNZBTEJw5-1Q'
- |""".stripMargin,
- searchList = SearchSchemaUtils.ALL_TABLES_SEARCH_CRITERIA
- ).sorted.sameElements(Array(
- "pcatalog.teradata.tau.yelp.review",
- "${targetdb}.enriched_data",
- "pcatalog.teradata.tau.yelp.business_details",
- "pcatalog.teradata.tau.yelp.business_address",
- "pcatalog.teradata.tau.yelp.user",
- "pcatalog.teradata.tau.yelp.postal_geo_map"
- ).sorted)
- )
- }
-
- // Substitutes dataset name with tmp table in sql using regex
- test ("getSQLWithTmpTable") {
- // Should match as "udc.hive.test.flights" is preceded by space and is at end of the line
- assert(GimelQueryUtils.getSQLWithTmpTable("select * from udc.hive.test.flights",
- "udc.hive.test.flights",
- "tmp_flights")
- == "select * from tmp_flights")
-
- // Should not match as "udc.hive.test.flights" is not preceded by any white space
- assert(GimelQueryUtils.getSQLWithTmpTable("select * fromudc.hive.test.flights",
- "udc.hive.test.flights",
- "tmp_flights")
- == "select * fromudc.hive.test.flights")
-
- // Should not match as "udc.hive.test.flights" is not followed by any white space, ; or ,
- assert(GimelQueryUtils.getSQLWithTmpTable("select * from udc.hive.test.flights_schedule",
- "udc.hive.test.flights",
- "tmp_flights")
- == "select * from udc.hive.test.flights_schedule")
-
- // Should match as "udc.hive.test.flights" is preceded by space and followed by new line
- assert(GimelQueryUtils.getSQLWithTmpTable("select * from udc.hive.test.flights\n",
- "udc.hive.test.flights",
- "tmp_flights")
- == "select * from tmp_flights\n")
-
- // Should match as "udc.hive.test.flights" is preceded by space and followed by ,
- assert(GimelQueryUtils.getSQLWithTmpTable("select * from udc.hive.test.flights, udc.hive.test.flights_schedule",
- "udc.hive.test.flights",
- "tmp_flights")
- == "select * from tmp_flights, udc.hive.test.flights_schedule")
-
- // Should match as "udc.hive.test.flights" is preceded and followed by space
- assert(GimelQueryUtils.getSQLWithTmpTable(
- "select * from udc.hive.test.flights where flights_id = 123",
- "udc.hive.test.flights",
- "tmp_flights")
- == "select * from tmp_flights where flights_id = 123")
- }
-}
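The getSQLWithTmpTable tests above encode specific word-boundary rules for substituting a dataset name with a temp table. A standalone sketch of that substitution behaviour (an illustration under the tests' stated rules, not the project's actual implementation) might look like:

    import scala.util.matching.Regex

    // Replace the dataset name only when it is preceded by whitespace (or start of
    // string) and followed by whitespace, ',', ';' or end of string.
    def substituteTmpTable(sql: String, dataset: String, tmpTable: String): String = {
      val pattern = new Regex("(?<![\\S])" + Regex.quote(dataset) + "(?=[\\s,;]|$)")
      pattern.replaceAllIn(sql, Regex.quoteReplacement(tmpTable))
    }

    substituteTmpTable("select * from udc.hive.test.flights", "udc.hive.test.flights", "tmp_flights")
    // -> "select * from tmp_flights"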
diff --git a/gimel-dataapi/gimel-sql/src/test/scala/com/paypal/gimel/sql/SQLMasterList.scala b/gimel-dataapi/gimel-sql/src/test/scala/com/paypal/gimel/sql/SQLMasterList.scala
deleted file mode 100644
index a784c424..00000000
--- a/gimel-dataapi/gimel-sql/src/test/scala/com/paypal/gimel/sql/SQLMasterList.scala
+++ /dev/null
@@ -1,239 +0,0 @@
-/*
- * Copyright 2018 PayPal Inc.
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.paypal.gimel.sql
-
-object SQLMasterList {
-
- val insertSQL1 =
- """
- |INSERT INTO data_source_tab2 PARTITION (p1 = 'part1', p2)
- | SELECT id, 'part' || id FROM RANGE(1, 3)
- |
- """.stripMargin
-
- val insertSQL2 =
- """
- |INSERT INTO table data_source_tab2 PARTITION (p1 = 'part1', p2)
- | SELECT id, 'part' || id FROM RANGE(1, 3)
- |
- """.stripMargin
-
- val insertSQL3 =
- """
- |INSERT OVERWRITE TABLE data_source_tab2 PARTITION (p1 = 'partNew1', p2)
- | VALUES (3, 'partNew2')
- """.stripMargin
-
- val insertSQL4 =
- """
- |INSERT OVERWRITE TABLE data_source_tab2 PARTITION (p1 = 'part1', p2)
- | VALUES (5, 'part1')
- """.stripMargin
-
- val insertSQL5 =
- """insert into pcatalog.Elastic_Test_Cluster_yelp_review_data
- |select * from pcatalog.kafka_test_cluster_yelp_review
- |
- """.stripMargin
-
- val insertSQL6 =
- """
- |insert into tgt select * from src
- """.stripMargin
-
- val insertPartitionedTable1 =
- """
- |INSERT INTO TABLE temp_table2 PARTITION(col1) SELECT col1, col2, col3, col4 FROM temp_table1;
- """.stripMargin
-
- val baSQL4 =
- """
- |select yelp_user.*, last_user_review.last_review_date
- |from pcatalog.teradata.tau.yelp.user yelp_user
- |join (
- |select user_id,
- |max(review_date) as last_review_date
- |from pcatalog.teradata.tau.yelp.review review
- |group by user_id
- |) last_user_review
- |on yelp_user.user_id = last_user_review.user_id
- |where yelp_user.review_count > 100
- |and yelp_user.useful> 100
- |and fans > 100
- |and last_user_review.last_review_date < current_date - 180
- |sample 10
- """.stripMargin
-
- val sqlANSIanNonANSI =
- """select t1.* from
- |(select
- |b.*
- |from
- |a k,b l, c l, d k ) join t2
- |on t1.id = t2.id where
- |1=2 and
- |(select * from k , l where k.id = l.id)
- """.stripMargin
-
- val sqlANSISimple =
- """
- |select
- |b.*
- |from
- |a k,b l, c l, d k
- """.stripMargin
-
- val sqlANSISimpleSubQuery =
- """
- |select t.* from
- |(select
- |b.*
- |from
- |a k,b l, c l, d k ) t
- """.stripMargin
-
- val sqlANSIOnly =
- """
- |select * from
- |testdb.emp d
- |left join emp_loc f
- |on d.id = f.id
- """.stripMargin
- val plainSQL =
- """
- |select
- | * from abc;
- """.stripMargin
-
- val innerSQL =
- """
- |select * from (select * from a) tbl
- """.stripMargin
-
- val joinANSISQLViaUDC =
- """
- |select t2.c1, t2.c2
- |, t1.*
- |from
- |testdb.emp t1
- |join (
- |select f1.c11, f2.c11
- |from udc.kafka.test.emp.address f1
- |join udc.kafka.test.emp.loc f2
- |on f1.id = f2.id
- |) t2
- |on t1.id = t2.id
- """.stripMargin
-
-
- val mixANSINonANSISQL =
- """
- |select * from
- |testdb.emp s join
- |(
- |select
- |* from
- |udc.kafka.test.test.emp a, udc.hive.test_cluster.test.calendar b, c
- |where a.id = b.id
- |and c.id1 = c.id1
- |) t
- |on s.key = b.key
- |where 1= 2
- |and exists (select 1 from udc.teradata.test_cluster.testdb.lkp where lkp.id3 = s.id3);
- |
- """.stripMargin
-
- val commaTypeSQL =
- """
- |select f.*, d.*
- |from f , d
- |where f.id = d.id
- """.stripMargin
-
- val mixCaseSQL = "select * FRom tmp"
-
- val simpleSelect1 = "SELECT * FROM UDC.Mysql.datalake.test.YELP_REVIEW_READ"
-
- val simpleInsertSelect1 =
- """
- | INSERT INTO UDC.Mysql.datalake.test.YELP_REVIEW_WRITE
- | SELECT * FROM udc.kafka.tau.yelp.review
- """.stripMargin
-
- val simpleInsertSelect2 =
- "INSERT INTO UDC.Mysql.datalake.test.YELP_REVIEW_WRITE \n " +
- "SELECT * FROM udc.kafka.tau.yelp.review" + "\t" + " "
-
- // All DDLs are addressed here
-
- val simpleCreateDDL =
- "CREATE table tempTable (ageField int)"
-
- val complexCreateDDL =
- """CREATE EXTERNAL TABLE pcatalog.elastic_smoke_test(data string)
- |STORED AS TEXTFILE\nLOCATION 'hdfs:///tmp/pcatalog/elastic_smoke_test'
- |TBLPROPERTIES (
- |'gimel.storage.type' = 'ELASTIC_SEARCH',
- |'es.mapping.date.rich' = 'true',
- |'es.nodes' = 'http://es-host',
- |'es.port' = '8080',
- |'es.resource' = 'flights/data')
- """.stripMargin
-
- val dropIfExistsDDL =
- """DROP TABLE IF EXISTS pcatalog.elastic_smoke_test"""
-
- val dropPlainDDL =
- """DROP TABLE pcatalog.elastic_smoke_test"""
-
- val dropIfExistsViewDDL =
- """DROP TABLE IF EXISTS pcatalog.elastic_smoke_test"""
-
- val dropPlainViewDDL =
- """DROP TABLE pcatalog.elastic_smoke_test"""
-
- val truncateTableDDL =
- """TRUNCATE TABLE pcatalog.elastic_smoke_test"""
-
- val createTablePattern =
- """CREATE TABLE udc.mive.test_cluster.default.temp age (int)"""
-
- val createExternalTablePattern =
- """CREATE EXTERNAL TABLE udc.mive.test_cluster.default.temp age (int)"""
-
- val multisetPattern =
- """CREATE MULTISET TABLE udc.mive.test_cluster.default.temp age (int)"""
-
- val setPattern =
- """CREATE SET TABLE udc.mive.test_cluster.default.temp age (int)"""
-
- val dropTablePattern =
- """DROP TABLE udc.mive.test_cluster.default.temp"""
-
- val truncateTablePattern =
- """TRUNCATE TABLE udc.mive.test_cluster.default.temp"""
-
- val deleteFromPattern =
- """DELETE FROM udc.mive.test_cluster.default.temp"""
-
- val deletePattern =
- """DELETE udc.mive.test_cluster.default.temp"""
-
-}
diff --git a/gimel-dataapi/gimel-sql/src/test/scala/com/paypal/gimel/sql/SQLParseDDLSpec.scala b/gimel-dataapi/gimel-sql/src/test/scala/com/paypal/gimel/sql/SQLParseDDLSpec.scala
deleted file mode 100644
index c5d39dab..00000000
--- a/gimel-dataapi/gimel-sql/src/test/scala/com/paypal/gimel/sql/SQLParseDDLSpec.scala
+++ /dev/null
@@ -1,172 +0,0 @@
-/*
- * Copyright 2018 PayPal Inc.
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.paypal.gimel.sql
-
-import org.scalatest.{BeforeAndAfterEach, FunSpec, Matchers}
-
-class SQLParseDDLSpec
- extends FunSpec
- with SharedSparkSession
- with Matchers
- with BeforeAndAfterEach {
-
- // add things to do before each test for this specific file
- protected override def beforeEach(): Unit = {
- GimelQueryUtils.setCatalogProvider("UDC")
- }
-
- // add things to do after each test for this specific file
- protected override def afterEach(): Unit = {
- GimelQueryUtils.setCatalogProvider("UDC")
- }
-
- describe("DROP TABLE TEMP TABLE") {
- it("It should return true") {
-
- GimelQueryUtils.isDropTableATempTable("DROP TABLE basu", spark) should be(
- false
- )
- }
- }
-
- describe("DROP TABLE with IF exists") {
- it("It should return true") {
-
- GimelQueryUtils.isDDL(SQLMasterList.dropIfExistsDDL, spark) should be(
- true
- )
- }
- }
-
- describe("DROP TABLE without IF exists") {
- it("It should return true") {
-
- GimelQueryUtils.isDDL(SQLMasterList.dropPlainDDL, spark) should be(true)
- }
- }
-
- describe("DROP view with IF exists") {
- it("It should return true") {
-
- GimelQueryUtils.isDDL(SQLMasterList.dropIfExistsViewDDL, spark) should be(
- true
- )
- }
- }
-
- describe("DROP view without IF exists") {
- it("It should return true") {
-
- GimelQueryUtils.isDDL(SQLMasterList.dropPlainViewDDL, spark) should be(
- true
- )
- }
- }
-
- describe("truncate table") {
- it("It should return true") {
- GimelQueryUtils.isDDL(SQLMasterList.truncateTableDDL, spark) should be(
- true
- )
- }
- }
-
- describe("Complex Create External table") {
- it("It should return true") {
-
- GimelQueryUtils.isDDL(SQLMasterList.complexCreateDDL, spark) should be(
- true
- )
- }
- }
-
- describe("createTablePattern") {
- it("It should return true") {
-
- GimelQueryUtils.isUDCDataDefinition(SQLMasterList.createTablePattern) should be(
- true
- )
- }
- }
-
- describe("createExternalTablePattern") {
- it("It should return true") {
-
- GimelQueryUtils.isUDCDataDefinition(
- SQLMasterList.createExternalTablePattern
- ) should be(true)
- }
- }
-
- describe("multisetPattern") {
- it("It should return true") {
-
- GimelQueryUtils.isUDCDataDefinition(SQLMasterList.multisetPattern) should be(
- true
- )
- }
- }
-
- describe("setPattern") {
- it("It should return true") {
-
- GimelQueryUtils.isUDCDataDefinition(SQLMasterList.setPattern) should be(
- true
- )
- }
- }
-
- describe("dropTablePattern") {
- it("It should return true") {
-
- GimelQueryUtils.isUDCDataDefinition(SQLMasterList.dropTablePattern) should be(
- true
- )
- }
- }
-
- describe("truncateTablePattern") {
- it("It should return true") {
-
- GimelQueryUtils.isUDCDataDefinition(SQLMasterList.truncateTablePattern) should be(
- true
- )
- }
- }
-
- describe("deleteFromPattern") {
- it("It should return true") {
-
- GimelQueryUtils.isUDCDataDefinition(SQLMasterList.deleteFromPattern) should be(
- true
- )
- }
- }
-
- describe("deletePattern") {
- it("It should return true") {
-
- GimelQueryUtils.isUDCDataDefinition(SQLMasterList.deletePattern) should be(
- true
- )
- }
- }
-
-}
diff --git a/gimel-dataapi/gimel-sql/src/test/scala/com/paypal/gimel/sql/SQLParserSourceTableSpec.scala b/gimel-dataapi/gimel-sql/src/test/scala/com/paypal/gimel/sql/SQLParserSourceTableSpec.scala
deleted file mode 100644
index 8d5d7d65..00000000
--- a/gimel-dataapi/gimel-sql/src/test/scala/com/paypal/gimel/sql/SQLParserSourceTableSpec.scala
+++ /dev/null
@@ -1,254 +0,0 @@
-/*
- * Copyright 2018 PayPal Inc.
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.paypal.gimel.sql
-
-import org.scalatest.{FunSpec, Matchers}
-
-import com.paypal.gimel.logger.Logger
-import com.paypal.gimel.parser.utilities.SQLNonANSIJoinParser
-import com.paypal.gimel.sql.SQLMasterList._
-
-class SQLParserSourceTableSpec extends FunSpec with Matchers {
-
- private val logger = Logger(this.getClass.getName)
-
- describe("getSourceTablesFromNonAnsi") {
- it("should pick correct table names from the SELECT QUERY") {
-
- SQLNonANSIJoinParser.getSourceTablesFromNonAnsi(sqlANSIanNonANSI) should equal(
- List("a", "b", "c", "d", "k", "l")
- )
- SQLNonANSIJoinParser.getSourceTablesFromNonAnsi(sqlANSISimple) should equal(
- List("a", "b", "c", "d")
- )
- SQLNonANSIJoinParser.getSourceTablesFromNonAnsi(sqlANSISimpleSubQuery) should equal(
- List("a", "b", "c", "d")
- )
- SQLNonANSIJoinParser.getSourceTablesFromNonAnsi(sqlANSIOnly) should equal(
- List()
- )
- SQLNonANSIJoinParser.getSourceTablesFromNonAnsi(joinANSISQLViaUDC) should equal(
- List()
- )
- SQLNonANSIJoinParser.getSourceTablesFromNonAnsi(mixANSINonANSISQL) should equal(
- List(
- "udc.kafka.test.test.emp",
- "udc.hive.test_cluster.test.calendar",
- "c",
- "udc.teradata.test_cluster.testdb.lkp"
- )
- )
- }
- }
-
- describe("getAll UDC TableSources") {
- it("should pick correct table names from the SELECT QUERY") {
-
- GimelQueryUtils.getTablesFrom(sqlANSIanNonANSI) should equal(List())
- GimelQueryUtils.getTablesFrom(sqlANSISimple) should equal(List())
- GimelQueryUtils.getTablesFrom(sqlANSISimpleSubQuery) should equal(List())
- GimelQueryUtils.getTablesFrom(sqlANSIOnly) should equal(List())
- GimelQueryUtils.getTablesFrom(mixANSINonANSISQL) should equal(
- List(
- "udc.kafka.test.test.emp",
- "udc.hive.test_cluster.test.calendar",
- "udc.teradata.test_cluster.testdb.lkp"
- )
- )
- GimelQueryUtils.getTablesFrom(baSQL4) should equal(
- List(
- "pcatalog.teradata.tau.yelp.review",
- "pcatalog.teradata.tau.yelp.user"
- )
- )
- GimelQueryUtils.getTablesFrom(joinANSISQLViaUDC).sorted should equal(
- List(
- "udc.kafka.test.emp.address",
- "udc.kafka.test.emp.loc"
- ).sorted
- )
- GimelQueryUtils.getAllTableSources(joinANSISQLViaUDC).sorted should equal(
- List(
- "testdb.emp",
- "udc.kafka.test.emp.address",
- "udc.kafka.test.emp.loc"
- ).sorted
- )
- }
- }
-
- describe("get All Source Tables") {
- it("should pick correct table names from the SELECT QUERY") {
-
- GimelQueryUtils.getAllTableSources(sqlANSIanNonANSI) should equal(
- List("a", "b", "c", "d", "k", "l", "t2")
- )
- GimelQueryUtils.getAllTableSources(sqlANSISimple) should equal(
- List("a", "b", "c", "d")
- )
- GimelQueryUtils.getAllTableSources(sqlANSISimpleSubQuery) should equal(
- List("a", "b", "c", "d")
- )
- GimelQueryUtils.getAllTableSources(sqlANSIOnly) should equal(
- List("emp_loc", "testdb.emp")
- )
- GimelQueryUtils.getAllTableSources(joinANSISQLViaUDC).sorted should equal(
- List(
- "testdb.emp",
- "udc.kafka.test.emp.address",
- "udc.kafka.test.emp.loc"
- ).sorted
- )
- GimelQueryUtils.getAllTableSources(mixANSINonANSISQL).sorted should equal(
- List(
- "testdb.emp",
- "udc.kafka.test.test.emp",
- "udc.hive.test_cluster.test.calendar",
- "c",
- "udc.teradata.test_cluster.testdb.lkp"
- ).sorted
- )
- }
- }
-
- describe("isSQLNonANSIJoin") {
- it(
- "should pick tell correctly if a SQL is ANSI only or has NON-ANSI joins as well"
- ) {
-
- SQLNonANSIJoinParser.isSQLNonANSIJoin(sqlANSIanNonANSI) should equal(true)
- }
- }
-
- describe("All DDL DML type") {
- it("should pick correct table names ") {
- GimelQueryUtils.getAllTableSources(
- "collect statistics on yelp.tmp_table"
- ) should equal(List("yelp.tmp_table"))
- GimelQueryUtils.getAllTableSources("DELETE ALL yelp.tmp_table") should equal(
- List("yelp.tmp_table")
- )
- GimelQueryUtils.getAllTableSources("DELETE yelp.tmp_table ALL") should equal(
- List("yelp.tmp_table")
- )
- GimelQueryUtils.getAllTableSources("DESCRIBE yelp.tmp_table") should equal(
- List("yelp.tmp_table")
- )
- GimelQueryUtils.getAllTableSources("HELP table yelp.tmp_table") should equal(
- List("yelp.tmp_table")
- )
- GimelQueryUtils.getAllTableSources("show view yelp.tmp_table") should equal(
- List("yelp.tmp_table")
- )
- }
- it("should exclude join desc") {
- GimelQueryUtils.getAllTableSources("DESC yelp.tmp_table") should equal(
- List()
- )
- }
- it("should pick table names from CACHE table") {
- assert(
- GimelQueryUtils
- .getAllTableSources("""cache table work_day_employees as
- |select * from udc.SFTP.Test.default.Files;""".stripMargin) == List(
- "work_day_employees",
- "udc.sftp.test.default.files"
- )
- )
- assert(
- GimelQueryUtils
- .getAllTableSources(
- """cache table workday_dump1 as
- |select
- |lower(a.ntid) as username
- |,a.`Employee QID` as employee_qid
- |,a.`Emplyee Last Name` as last_name
- |,a.`Employee First Name` as first_name
- |,concat(a.`Emplyee Last Name`,',',a.`Employee First Name`) as full_name
- |,a.`Org Description` as org_desc
- |,a.`Org ID ` as org_id
- |,a.`Loaction` as location
- |,lower(a.`Manager ID`) as manager_qid
- |,lower(b.ntid) as manager_username
- |from work_day_employees a
- |left join work_day_employees_b b
- |on a.`Manager ID` = b.`Employee QID`;""".stripMargin
- ) == List(
- "workday_dump1",
- "work_day_employees",
- "work_day_employees_b"
- )
- )
-
- assert(
- GimelQueryUtils
- .getAllTableSources(
- """set gimel.jdbc.p.strategy=file;
- |set gimel.jdbc.p.file=/user/testuser/udc.prod.pass;
- |set gimel.jdbc.username=testadmin;
- |
- |insert into udc.MySql.UDC.pcatalog.workday_dump
- |select * from workday_dump1 """.stripMargin
- ) == List("workday_dump1", "udc.mysql.udc.pcatalog.workday_dump")
- )
- }
- }
-
- describe("Check multiple match criteria within same SQL ") {
- it("should extract valid table names ") {
- logger.info(
- GimelQueryUtils.getAllTableSources(
- "drop table if exists udc.hive.test.testdb.emp"
- )
- )
-
- logger.info(
- GimelQueryUtils.getAllTableSources(
- """cache table td_views_hive_all
- |select distinct * from
- |(
- |select *
- | from
- | udc.hive.test.default.teradata_db_views_test
- |union
- |select *
- | from
- | udc.hive.test.default.teradata_2_db_views_test
- |)""".stripMargin
- )
- )
- logger.info(
- GimelQueryUtils.getAllTableSources(
- """cache table td_views_hive_all
- |select distinct tlb.* from
- |(
- |select *
- |from
- |udc.hive.test.default.teradata_db_views_test
- |union
- |select *
- |from
- |udc.hive.test.default.teradata_2_db_views_test
- |) tlb""".stripMargin
- )
- )
- }
- }
-}
diff --git a/gimel-dataapi/gimel-sql/src/test/scala/com/paypal/gimel/sql/SQLParserSpec.scala b/gimel-dataapi/gimel-sql/src/test/scala/com/paypal/gimel/sql/SQLParserSpec.scala
deleted file mode 100644
index 0e1325fc..00000000
--- a/gimel-dataapi/gimel-sql/src/test/scala/com/paypal/gimel/sql/SQLParserSpec.scala
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright 2018 PayPal Inc.
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.paypal.gimel.sql
-
-import org.scalatest.{FunSpec, Matchers}
-
-class SQLParserSpec extends FunSpec with Matchers {
-
-
- it("should pick the Target Table Accurately from the SQL without 'table' keyword") {
-
- com.paypal.gimel.sql.SQLParser.getTargetTables(
- """
- |INSERT INTO data_source_tab2 PARTITION (p1 = 'part1', p2)
- | SELECT id, 'part' || id FROM RANGE(1, 3)
- |
- """.stripMargin) shouldBe Some("data_source_tab2")
-
- }
-
- it("should pick the Target Table Accurately from the SQL with 'table' keyword") {
- com.paypal.gimel.sql.SQLParser.getTargetTables(
- """
- |INSERT INTO table data_source_tab2 PARTITION (p1 = 'part1', p2)
- | SELECT id, 'part' || id FROM RANGE(1, 3)
- |
- """.stripMargin) shouldBe Some("data_source_tab2")
-
- }
-
- it("should pick the Target Table Accurately from the SQL when there is an 'override' keyword") {
- com.paypal.gimel.sql.SQLParser.getTargetTables(
- """
- |INSERT OVERWRITE TABLE data_source_tab2 PARTITION (p1 = 'partNew1', p2)
- | VALUES (3, 'partNew2')
- """.stripMargin) shouldBe Some("data_source_tab2")
-
- }
-
- it("should pick the Target Table Accurately from the SQL when there is an 'override' keyword 1") {
- com.paypal.gimel.sql.SQLParser.getTargetTables(
- """
- |INSERT OVERWRITE TABLE data_source_tab2 PARTITION (p1 = 'part1', p2)
- | VALUES (5, 'part1')
- """.stripMargin) shouldBe Some("data_source_tab2")
- }
-
- it("should pick the Target Table Accurately from the SQL when the SQL has a DB.Table format") {
- com.paypal.gimel.sql.SQLParser.getTargetTables(
- """insert into pcatalog.elastic_cluster_flights_log_notebook_data
- |select * from pcatalog.kafka_flights_log
- |
- """.stripMargin) shouldBe Some("pcatalog.elastic_cluster_flights_log_notebook_data")
- }
-}
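
The INSERT-statement shapes covered by the removed spec above can be approximated with a single pattern; the snippet below is a hedged sketch for illustration only and is not the parser that the removed SQLParser.getTargetTables implemented.

// Illustrative sketch: extract the target table from the INSERT variants used in the spec above.
val insertTargetPattern =
  """(?is)insert\s+(?:into|overwrite)\s+(?:table\s+)?([\w.]+)""".r

def targetTable(sql: String): Option[String] =
  insertTargetPattern.findFirstMatchIn(sql).map(_.group(1).toLowerCase)

// targetTable("INSERT INTO table data_source_tab2 PARTITION (p1 = 'part1', p2) SELECT ...")
//   == Some("data_source_tab2")
// targetTable("insert into pcatalog.elastic_cluster_flights_log_notebook_data select * from ...")
//   == Some("pcatalog.elastic_cluster_flights_log_notebook_data")
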
diff --git a/gimel-dataapi/gimel-sql/src/test/scala/com/paypal/gimel/sql/SQLParserTargetTableSpec.scala b/gimel-dataapi/gimel-sql/src/test/scala/com/paypal/gimel/sql/SQLParserTargetTableSpec.scala
deleted file mode 100644
index d509095d..00000000
--- a/gimel-dataapi/gimel-sql/src/test/scala/com/paypal/gimel/sql/SQLParserTargetTableSpec.scala
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright 2018 PayPal Inc.
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.paypal.gimel.sql
-
-import org.scalatest.{FunSpec, Matchers}
-
-import com.paypal.gimel.sql.SQLMasterList._
-
-class SQLParserTargetTableSpec extends FunSpec with Matchers {
-
-
- describe("getTargetTables") {
-
- it("should pick the TARGET Table Accurately from the SQL without 'table' keyword") {
- SQLParser.getTargetTables(insertSQL1) shouldBe Some("data_source_tab2")
- }
-
- it("should pick the TARGET Table Accurately from the SQL with 'table' keyword") {
- SQLParser.getTargetTables(insertSQL2) shouldBe Some("data_source_tab2")
- }
-
- it("should pick the TARGET Table Accurately from the SQL when there is an 'override' keyword") {
- SQLParser.getTargetTables(insertSQL3) shouldBe Some("data_source_tab2")
- }
-
- it("should pick the TARGET Table Accurately from the SQL when there is an 'override' keyword 1") {
- SQLParser.getTargetTables(insertSQL4) shouldBe Some("data_source_tab2")
- }
-
- it("should pick the TARGET Table Accurately from the SQL when the SQL has a DB.Table format") {
- SQLParser.getTargetTables(insertSQL5) shouldBe Some("pcatalog.elastic_test_cluster_yelp_review_data")
- }
-
- it("should pick correct table name from the SELECT QUERY") {
- GimelQueryUtils.getTablesFrom(simpleSelect1) should equal(Array("udc.mysql.datalake.test.yelp_review_read"))
- }
-
- it("should pick proper table name from the only the SELECT Query") {
- GimelQueryUtils.getTablesFrom(simpleInsertSelect1) should equal(Array("udc.kafka.tau.yelp.review",
- "udc.mysql.datalake.test.yelp_review_write"))
- }
- }
-
-
- describe("isQueryContainingPartitioning") {
-
- it("should return true if query contains ; insert into partitions of target table. ") {
- GimelQueryUtils.isQueryContainingPartitioning(insertPartitionedTable1) shouldBe (true)
- GimelQueryUtils.isQueryContainingPartitioning(insertSQL1) shouldBe (true)
- GimelQueryUtils.isQueryContainingPartitioning(insertSQL2) shouldBe (true)
- GimelQueryUtils.isQueryContainingPartitioning(insertSQL3) shouldBe (true)
- GimelQueryUtils.isQueryContainingPartitioning(insertSQL4) shouldBe (true)
- }
-
- it("should return false if query does not contain partition") {
- GimelQueryUtils.isQueryContainingPartitioning(insertSQL5) shouldBe (false)
- GimelQueryUtils.isQueryContainingPartitioning(insertSQL6) shouldBe (false)
- }
- }
-}
diff --git a/gimel-dataapi/gimel-sql/src/test/scala/com/paypal/gimel/sql/SharedSparkSession.scala b/gimel-dataapi/gimel-sql/src/test/scala/com/paypal/gimel/sql/SharedSparkSession.scala
deleted file mode 100644
index 9c0d19bd..00000000
--- a/gimel-dataapi/gimel-sql/src/test/scala/com/paypal/gimel/sql/SharedSparkSession.scala
+++ /dev/null
@@ -1,135 +0,0 @@
-/*
- * Copyright 2018 PayPal Inc.
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.paypal.gimel.sql
-
-import org.apache.spark.SparkConf
-import org.apache.spark.sql.{SparkSession, SQLContext}
-import org.apache.spark.sql.internal.SQLConf
-import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, FunSpec, Suite}
-import org.scalatest.concurrent.Eventually
-
-trait SharedSparkSession
- extends FunSpec
- with BeforeAndAfterEach
- with BeforeAndAfterAll
- with Eventually { self: Suite =>
-
- /**
- * The [[SparkSession]] to use for all tests in this suite.
- *
- * By default, the underlying [[org.apache.spark.SparkContext]] will be run in local
- * mode with the default test configurations.
- */
- @transient private var _spark: SparkSession = null
-
- /**
- * Make sure the [[SparkSession]] is initialized before any tests are run.
- */
- protected override def beforeAll(): Unit = {
- initializeSession()
-
- // Ensure we have initialized the context before calling parent code
- super.beforeAll()
- }
-
- /**
- * This is the SparkSession to be accessed everywhere within the module for tests
- */
- protected implicit def spark: SparkSession = _spark
-
- /**
- * This is the SQLContext to be accessed everywhere within the module for tests
- */
- protected implicit def sqlContext: SQLContext = _spark.sqlContext
-
- /**
- * Generally, this is just called from
- * beforeAll; however, in tests using styles other than FunSuite, there is
- * often code that relies on the session between test group constructs and
- * the actual tests, which may need this session. It is purely a semantic
- * difference, but semantically, it makes more sense to call
- * 'initializeSession' between a 'describe' and an 'it' call than it does to
- * call 'beforeAll'.
- */
- protected def initializeSession(): Unit = {
- if (_spark == null) {
- _spark = createSparkSession
- }
- }
-
- /**
- *
- * @return sparkSession
- */
- protected def createSparkSession: SparkSession = {
- SparkSession
- .builder()
- .master("local")
- .appName("Spark Unit Tests")
- .config(sparkConf)
- .getOrCreate()
- }
-
- // Here add all the spark confs to be initialized in order to start the sparksession with.
- protected def sparkConf = {
- new SparkConf()
- .set("spark.unsafe.exceptionOnMemoryLeak", "true")
- .set(SQLConf.CODEGEN_FALLBACK.key, "false")
- }
-
- /**
- * Stop the underlying [[org.apache.spark.SparkContext]], if any.
- */
- protected override def afterAll(): Unit = {
- try {
- super.afterAll()
- } finally {
- try {
- if (_spark != null) {
- try {
- _spark.sessionState.catalog.reset()
- } finally {
- _spark.stop()
- _spark = null
- }
- }
- } finally {
- SparkSession.clearActiveSession()
- SparkSession.clearDefaultSession()
- }
- }
- }
-
- /**
- * Things to do before each test
- */
- protected override def beforeEach(): Unit = {
- super.beforeEach()
- }
-
- /**
- * Things to do after each test
- */
- protected override def afterEach(): Unit = {
- super.afterEach()
- // Clear all persistent datasets after each test
- spark.sharedState.cacheManager.clearCache()
- }
-}
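
For orientation, here is a minimal sketch of how a suite in this module could have mixed in the trait above; the suite name and the SQL are invented for the example and do not come from the deleted sources.

package com.paypal.gimel.sql

import org.scalatest.Matchers

// Hypothetical example suite: SharedSparkSession supplies a local SparkSession via `spark`
// before the tests run and stops it after the suite finishes.
class ExampleSharedSessionSpec extends SharedSparkSession with Matchers {

  describe("the local SparkSession provided by SharedSparkSession") {
    it("should execute a trivial SQL statement") {
      val df = spark.sql("SELECT 1 AS id")
      df.count() shouldBe 1L
    }
  }
}
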
diff --git a/gimel-dataapi/gimel-sql/src/test/scala/resources/hdfs_test.avro b/gimel-dataapi/gimel-sql/src/test/scala/resources/hdfs_test.avro
deleted file mode 100644
index 8ffdc972..00000000
Binary files a/gimel-dataapi/gimel-sql/src/test/scala/resources/hdfs_test.avro and /dev/null differ
diff --git a/gimel-dataapi/gimel-sql/src/test/scala/resources/hdfs_test.csv b/gimel-dataapi/gimel-sql/src/test/scala/resources/hdfs_test.csv
deleted file mode 100644
index 59f3f4dc..00000000
--- a/gimel-dataapi/gimel-sql/src/test/scala/resources/hdfs_test.csv
+++ /dev/null
@@ -1 +0,0 @@
-a,b,c,d,e
\ No newline at end of file
diff --git a/gimel-dataapi/gimel-sql/src/test/scala/resources/hdfs_test.json b/gimel-dataapi/gimel-sql/src/test/scala/resources/hdfs_test.json
deleted file mode 100644
index 715b02d4..00000000
--- a/gimel-dataapi/gimel-sql/src/test/scala/resources/hdfs_test.json
+++ /dev/null
@@ -1 +0,0 @@
-{"key":"value"}
\ No newline at end of file
diff --git a/gimel-dataapi/gimel-sql/src/test/scala/resources/hdfs_test.parquet b/gimel-dataapi/gimel-sql/src/test/scala/resources/hdfs_test.parquet
deleted file mode 100644
index 2ae23dac..00000000
Binary files a/gimel-dataapi/gimel-sql/src/test/scala/resources/hdfs_test.parquet and /dev/null differ
diff --git a/gimel-dataapi/gimel-sql/src/test/scala/resources/hdfs_test.seq b/gimel-dataapi/gimel-sql/src/test/scala/resources/hdfs_test.seq
deleted file mode 100755
index 78822754..00000000
Binary files a/gimel-dataapi/gimel-sql/src/test/scala/resources/hdfs_test.seq and /dev/null differ
diff --git a/gimel-dataapi/gimel-sql/src/test/scala/resources/hdfs_test.txt b/gimel-dataapi/gimel-sql/src/test/scala/resources/hdfs_test.txt
deleted file mode 100644
index 808976a7..00000000
--- a/gimel-dataapi/gimel-sql/src/test/scala/resources/hdfs_test.txt
+++ /dev/null
@@ -1 +0,0 @@
-This is a test file for hdfs read api
\ No newline at end of file
diff --git a/gimel-dataapi/gimel-sql/src/test/scala/resources/hdfs_test.txt.gz b/gimel-dataapi/gimel-sql/src/test/scala/resources/hdfs_test.txt.gz
deleted file mode 100644
index 3f41db74..00000000
Binary files a/gimel-dataapi/gimel-sql/src/test/scala/resources/hdfs_test.txt.gz and /dev/null differ
diff --git a/gimel-dataapi/gimel-tools/pom.xml b/gimel-dataapi/gimel-tools/pom.xml
deleted file mode 100644
index c2630dbd..00000000
--- a/gimel-dataapi/gimel-tools/pom.xml
+++ /dev/null
@@ -1,114 +0,0 @@
-
-
-
-
-
-
- gimel-dataapi
- com.paypal.gimel
- 2.4.7-SNAPSHOT
- ../pom.xml
-
- 4.0.0
-
- gimel-tools
- 2.4.7-SNAPSHOT
-
-
-
- com.paypal.gimel
- gimel-sql
- ${gimel.version}-SNAPSHOT
-
-
- org.scala-lang
- *
-
-
- org.apache.kafka
- kafka-clients
-
-
-
-
- org.apache.kafka
- kafka-clients
- ${kafka.version}
- provided
-
-
-
-
- src/main/scala
-
-
-
- org.apache.maven.plugins
- maven-shade-plugin
- 3.0.0
-
-
-
- com.google.common
- gimel-shaded.com.google.common
-
-
- com.sun.jersey
- gimel-shaded.com.sun.jersey
-
-
-
-
- *:*
-
- META-INF/*.SF
- META-INF/*.DSA
- META-INF/*.RSA
-
-
-
-
-
-
-
- META-INF/services/org.apache.spark.sql.sources.DataSourceRegister
-
-
-
-
-
-
- gimel-shading
- package
-
- shade
-
-
-
-
-
-
-
-
diff --git a/gimel-dataapi/gimel-tools/src/main/scala/com/paypal/gimel/tools/BenchMarkKafkaDataSetAPI.scala b/gimel-dataapi/gimel-tools/src/main/scala/com/paypal/gimel/tools/BenchMarkKafkaDataSetAPI.scala
deleted file mode 100644
index fca1432b..00000000
--- a/gimel-dataapi/gimel-tools/src/main/scala/com/paypal/gimel/tools/BenchMarkKafkaDataSetAPI.scala
+++ /dev/null
@@ -1,158 +0,0 @@
-/*
- * Copyright 2018 PayPal Inc.
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.paypal.gimel.tools
-
-import org.apache.spark.{SparkConf, SparkContext}
-import org.apache.spark.sql._
-
-import com.paypal.gimel.DataSet
-import com.paypal.gimel.common.conf.GimelConstants
-import com.paypal.gimel.common.storageadmin.HDFSAdminClient
-import com.paypal.gimel.common.utilities.Timer
-import com.paypal.gimel.kafka.conf.KafkaConfigs
-import com.paypal.gimel.logger.Logger
-import com.paypal.gimel.tools.conf.BenchmarkKafkaConstants
-
-@deprecated
-object BenchMarkKafkaDataSetAPI extends App {
-
- // Logger Initiation
- val logger = Logger(this.getClass.getName)
-
- val sparkSession = SparkSession
- .builder()
- .appName("SparkSessionZipsExample")
- .enableHiveSupport()
- .getOrCreate()
- val sc = sparkSession.sparkContext
- val sqlContext = sparkSession.sqlContext
-
- import BenchMarkHelperUtils._
-
- val paramsMapBuilder = resolveRunTimeParameters(args)
- lazy val appName = sparkSession.conf.get(GimelConstants.SPARK_APP_NAME, "NA") + "_" + sc.getConf.getAppId
- lazy val path1 = "/tmp/" + sc.sparkUser + "_" + appName + "_" + ".Data_API.DataSet.benchmark.log"
- val path = paramsMapBuilder.getOrElse("targetFile", path1)
- val fetchRowsOnFirstRun = paramsMapBuilder(BenchmarkKafkaConstants.fetchRowsKey)
- val maxRecordsPerPartition = paramsMapBuilder(BenchmarkKafkaConstants.maxRecordsPerPartitionKey)
- val minRowsPerParallel = paramsMapBuilder(BenchmarkKafkaConstants.minRowsPerPartitionKey)
- val datasetName = paramsMapBuilder("dataset")
-
- /**
- * START DATASET API STATS CAPTURE
- */
- val dataset = DataSet(sparkSession)
- val props = s"""${KafkaConfigs.minRowsPerParallelKey}=$minRowsPerParallel:${KafkaConfigs.rowCountOnFirstRunKey}=$fetchRowsOnFirstRun:${KafkaConfigs.maxRecordsPerPartition}=$maxRecordsPerPartition"""
- val dataDF = dataset.read(datasetName, props)
-
- // val timer = Timer()
- // timer.start;
- val timer = Timer()
- timer.start
- val myCount = dataDF.count()
- val totalMS = timer.endWithMillSecRunTime
-
- val executorMemoryStatus = sc.getExecutorMemoryStatus.mkString("\n")
- val totalExecutors = sc.getExecutorMemoryStatus.size
- val executorStorageStatus = sc.getExecutorStorageStatus.map(x => "blockManagerId:" + x.blockManagerId + "|maxMem:" + x.maxMem + "|memUsed:" + x.memUsed + "|memRemaining:" + x.memRemaining).mkString("\n")
-
- val allConfs = sc.getConf.getAll.mkString("\n")
-
- /**
- * COMPOSE STATS
- */
-
- val toWrite =
- s"""
- |DataAPI:BenchMark Count:$myCount
- |DataAPI:totalExecutors:$totalExecutors
- |DataAPI:TotalMS:$totalMS
- """.stripMargin
-
- /**
- * Write Stats
- */
-
- logger.info(s"Writing to Path --> $path")
- HDFSAdminClient.writeHDFSFile(path, toWrite)
-
- sc.stop()
-
-}
-
-@deprecated
-class CDHTimer(funcName: String) {
- val logger = Logger()
-
- def timed[T](f: => T): T = {
- val startTime = System.currentTimeMillis()
- try f finally println(s"Function completed in: ${System.currentTimeMillis() - startTime} ms")
- }
-
- var startTime: Long = -1L
-
- def start(): Unit = {
- startTime = System.currentTimeMillis()
- }
-
- def end(): Long = {
- val endTime: Long = System.currentTimeMillis()
- val elapsedTime = endTime - startTime
- logger.info("TOTAL TIME " + funcName + " elapse time = " + elapsedTime)
- elapsedTime
- }
-}
-
-@deprecated
-object BenchMarkHelperUtils {
-
- val logger = Logger()
-
- /**
- * Resolves RunTime Params
- *
- * @param allParams args
- * @return Map[String, String]
- */
- def resolveRunTimeParameters(allParams: Array[String]): Map[String, String] = {
-
- var paramsMapBuilder: Map[String, String] = Map()
- logger.info(s"All Params From User --> ${allParams.mkString("\n")}")
- val usage =
- """
- |dataset=pcatalog.kafka_flights_log fetchRowsOnFirstRun=1000000 maxRecordsPerPartition=1000000 targetFile=/tmp/stats/log
- """.stripMargin
- if (allParams.length == 0) {
- println(usage)
- throw new Exception("Args Cannot be Empty")
- }
- for (jobParams <- allParams) {
- for (eachParam <- jobParams.split(" ")) {
- paramsMapBuilder += (eachParam.split("=")(0) -> eachParam.split("=", 2)(1))
- }
- }
- if (!paramsMapBuilder.contains("dataset")) paramsMapBuilder += ("dataset" -> "pcatalog.kafka_flights_log")
- if (!paramsMapBuilder.contains(BenchmarkKafkaConstants.fetchRowsKey)) paramsMapBuilder += (BenchmarkKafkaConstants.fetchRowsKey -> "1000000")
- if (!paramsMapBuilder.contains(BenchmarkKafkaConstants.maxRecordsPerPartitionKey)) paramsMapBuilder += (BenchmarkKafkaConstants.maxRecordsPerPartitionKey -> "1000000")
- if (!paramsMapBuilder.contains(BenchmarkKafkaConstants.minRowsPerPartitionKey)) paramsMapBuilder += (BenchmarkKafkaConstants.minRowsPerPartitionKey -> "100000")
- logger.info(s"Resolved Params From Code --> $paramsMapBuilder")
- paramsMapBuilder
- }
-}
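
To illustrate the space-and-equals argument format expected by resolveRunTimeParameters above, here is a small standalone sketch; all values are invented for the example.

// Illustrative only: mirrors the split-on-space / split-on-'=' parsing in resolveRunTimeParameters.
object ArgParsingSketch extends App {
  val rawArgs = Array("dataset=pcatalog.kafka_flights_log fetchRowsOnFirstRun=500000 targetFile=/tmp/stats/log")

  val parsed: Map[String, String] = rawArgs
    .flatMap(_.split(" "))
    .map { kv =>
      val Array(key, value) = kv.split("=", 2)
      key -> value
    }
    .toMap

  // parsed("dataset")             == "pcatalog.kafka_flights_log"
  // parsed("fetchRowsOnFirstRun") == "500000"
  // maxRecordsPerPartition and minRowsPerParallel are absent here,
  // so the helper above would fall back to its defaults ("1000000" and "100000").
  println(parsed)
}
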
diff --git a/gimel-dataapi/gimel-tools/src/main/scala/com/paypal/gimel/tools/ExecSQLWrapper.scala b/gimel-dataapi/gimel-tools/src/main/scala/com/paypal/gimel/tools/ExecSQLWrapper.scala
deleted file mode 100644
index 0ffb915d..00000000
--- a/gimel-dataapi/gimel-tools/src/main/scala/com/paypal/gimel/tools/ExecSQLWrapper.scala
+++ /dev/null
@@ -1,289 +0,0 @@
-/*
- * Copyright 2018 PayPal Inc.
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.paypal.gimel.tools
-
-import java.util.Calendar
-
-import scala.collection.immutable.Map
-import scala.language.implicitConversions
-
-import org.apache.spark.{SparkConf, SparkContext}
-import org.apache.spark.sql.SparkSession
-
-import com.paypal.gimel.common.conf.GimelConstants
-import com.paypal.gimel.common.storageadmin.HDFSAdminClient
-import com.paypal.gimel.common.utilities.Timer
-import com.paypal.gimel.hbase.conf.HbaseConfigs
-import com.paypal.gimel.kafka.conf.{KafkaConfigs, KafkaConstants}
-import com.paypal.gimel.logger.Logger
-import com.paypal.gimel.sql.GimelQueryProcessor
-import com.paypal.gimel.tools.conf.CopyDatasetConstants
-
-object CopyDataSet extends App {
-
- import CopyHelperUtils._
- import com.paypal.gimel.kafka.utilities.KafkaUtilities._
-
- val logger = Logger(this.getClass.getName)
- val user = sys.env("USER")
- val sparkConf = new SparkConf()
- val sparkSession = SparkSession
- .builder()
- .enableHiveSupport()
- .getOrCreate()
- val props = resolveRunTimeParameters(args) ++ Map(GimelConstants.SPARK_APP_ID -> sparkSession.conf.get(GimelConstants.SPARK_APP_ID),
- GimelConstants.SPARK_APP_NAME -> sparkSession.conf.get(GimelConstants.SPARK_APP_NAME))
- props.foreach(prop => sparkSession.conf.set(prop._1, prop._2))
- logger.setSparkVersion(sparkSession.version)
- val resolvedProps = getOptions(sparkSession)
- val queryToExecute = getQuery(props)
- val sparkAppName = sparkSession.conf.get("spark.app.name")
- val yarnCluster = com.paypal.gimel.common.utilities.DataSetUtils.getYarnClusterName()
- val runMode = props("mode") match {
- case "stream" => KafkaConstants.gimelAuditRunTypeStream
- case "batch" => KafkaConstants.gimelAuditRunTypeBatch
- case "intelligent" => KafkaConstants.gimelAuditRunTypeIntelligent
- case _ => GimelConstants.UNKNOWN_STRING.toLowerCase
- }
-
- val hiveStagingDir = props.getOrElse("hiveStagingDir", "")
- try {
- props("mode").toLowerCase() match {
- case CopyDatasetConstants.COPY_DATASET_STREAM_MODE => GimelQueryProcessor.executeStream(queryToExecute, sparkSession)
- case CopyDatasetConstants.COPY_DATASET_BATCH_MODE => GimelQueryProcessor.executeBatch(queryToExecute, sparkSession)
- case CopyDatasetConstants.COPY_DATASET_BATCH_INTERACTIVE_MODE =>
- val isBatchInfinite = props.getOrElse("isBatchRecursionInfinite", "false").toBoolean
- val batchRecursionRequested = props.getOrElse("batchRecursionRequested", "100").toInt
- val batchRecursionMins = props.getOrElse("batchRecursionMinutes", "30").toInt
- logger.info(
- s"""
- |--------------------------------------------------------------------
- || isBatchRecursionInfinite | ${isBatchInfinite}
- || batchRecursionRequested | ${batchRecursionRequested}
- || batchRecursionMins | ${batchRecursionMins}
- |--------------------------------------------------------------------
- """.stripMargin)
- val batchRecursionMilliSec: Double = batchRecursionMins * 60 * 1000D
- var currentIteration = 1
- while (isBatchInfinite || (currentIteration <= batchRecursionRequested)) {
- val startTime = Calendar.getInstance().getTime
- logger.info(
- s"""
- |--------------------------------------------------------------------
- || Mode | ${props("mode")}
- || Iteration | ${currentIteration}
- || Start Time | ${Calendar.getInstance().getTime}
- |--------------------------------------------------------------------
- """.stripMargin)
- val timer = Timer()
- timer.start
- GimelQueryProcessor.executeBatch(queryToExecute, sparkSession)
- val totalTimeMilliSec: Double = timer.endWithMillSecRunTime
- val endTime = Calendar.getInstance().getTime
- val sleepMilliSec = scala.math.max(0, batchRecursionMilliSec - totalTimeMilliSec)
- logger.info(
- s"""
- |--------------------------------------------------------------------
- || (*) | Iteration | ${currentIteration}
- || (*) | Start Time Execution | ${startTime}
- || (*) | End Time Execution | ${endTime}
- || (Y) | Time Taken for Execution (ms) | ${totalTimeMilliSec}
- || (X) | Batch Iteration Request (ms) | ${batchRecursionMilliSec}
- || (X-Y) | Time Remaining for Sleep (ms) | ${sleepMilliSec}
- |--------------------------------------------------------------------
- """.stripMargin)
- if (currentIteration == batchRecursionRequested) logger.info("All Iterations Completed !")
- if (sleepMilliSec > 0 && currentIteration < batchRecursionRequested) {
- logger.info(s"Going to Sleep at --> ${Calendar.getInstance().getTime}")
- Thread.sleep(sleepMilliSec.toLong)
- logger.info(s"Woke Up at --> ${Calendar.getInstance().getTime}")
- }
- currentIteration += 1
- }
- case CopyDatasetConstants.COPY_DATASET_INTELLIGENT_MODE =>
- logger.info(s"Mode --> auto")
- var batchRunCount = 0
- while (!isStreamable(sparkSession, props)) {
- logger.info(s"====== BATCH Mode < Iteration --> ${batchRunCount} > ======")
- val timer = Timer()
- timer.start
- GimelQueryProcessor.executeBatch(queryToExecute, sparkSession)
- if (hiveStagingDir != "") sparkSession.sql(s"dfs -rm -r -f ${hiveStagingDir}")
- timer.endWithMillSecRunTime
- logger.info(s"====== BATCH Mode < Iteration --> ${batchRunCount} > Total Time Seconds --> ${timer.endWithMillSecRunTime / 1000} ====== ")
- batchRunCount = batchRunCount + 1
- }
- logger.info("====== STREAM Mode ======")
- GimelQueryProcessor.executeStream(queryToExecute, sparkSession)
- case _ => throw new Exception("Invalid Mode of Execution Must be one of these ")
- }
-
- // push logs to KAFKA
- logger.logMethodAccess(sparkSession.sparkContext.getConf.getAppId
- , sparkSession.conf.get("spark.app.name")
- , this.getClass.getName
- , runMode
- , yarnCluster
- , user
- , s"${yarnCluster}/${user}/${sparkAppName}".replaceAllLiterally("/", "_").replaceAllLiterally(" ", "-")
- , "copyDataSet"
- , s"${queryToExecute}"
- , scala.collection.mutable.Map("sql" -> queryToExecute)
- , GimelConstants.SUCCESS
- , GimelConstants.EMPTY_STRING
- , GimelConstants.EMPTY_STRING
- )
- }
- catch {
- case e: Throwable => {
- e.printStackTrace()
-
- // push logs to KAFKA
- logger.logMethodAccess(sparkSession.sparkContext.getConf.getAppId
- , sparkSession.conf.get("spark.app.name")
- , this.getClass.getName
- , runMode
- , yarnCluster
- , user
- , s"${yarnCluster}/${user}/${sparkAppName}".replaceAllLiterally("/", "_").replaceAllLiterally(" ", "-")
- , "copyDataSet"
- , s"${queryToExecute}"
- , scala.collection.mutable.Map("sql" -> queryToExecute)
- , GimelConstants.FAILURE
- , e.toString + "\n" + e.getStackTraceString
- , GimelConstants.UNKNOWN_STRING
- )
-
- // throw error to console
- logger.throwError(e.toString)
-
- throw e
- }
- }
-
-}
-
-object CopyHelperUtils {
-
- val logger = Logger(this.getClass.getName)
-
- /**
- * Resolves RunTime Params
- *
- * @param allParams args
- * @return Map[String, String]
- */
- def resolveRunTimeParameters(allParams: Array[String]): Map[String, String] = {
- def MethodName: String = new Exception().getStackTrace().apply(1).getMethodName()
-
- logger.info(" @Begin --> " + MethodName)
-
- var paramsMapBuilder: Map[String, String] = Map()
- for (jobParams <- allParams) {
- for (eachParam <- jobParams.split(" ")) {
- paramsMapBuilder += (eachParam.split("=")(0) -> eachParam.split("=", 2)(1))
- }
- }
- logger.info(s"All Params From User --> ${paramsMapBuilder.mkString("\n")}")
-
- val usage =
- """
- |For Details : https://github.com/Paypal/gimel/blob/oss/docs/gimel-tools/ExecSQLWrapper.md
- """.stripMargin
-
- if (allParams.length == 0) {
- logger.error(usage)
- throw new Exception(s"Args Cannot be Empty. Usage --> \n${usage}")
- }
-
- if (!paramsMapBuilder.contains("mode")) throw new Exception(s"mode must be supplied as either < batch|stream > Usage --> \n${usage}")
- if (!paramsMapBuilder.contains("querySourceFile")) throw new Exception(s"querySourceFile must be supplied ! Usage --> \n${usage}")
-
- logger.info(s"Resolved Params From Code --> ${paramsMapBuilder}")
- paramsMapBuilder
- }
-
- /**
- * getOptions - read the hive context options that were set by the user, else add the default values
- *
- * @param sparkSession SparkSession
- * @return - Tuple ( String with concatenated options read from the hivecontext , Same Props as a Map[String,String] )
- */
-
- def getOptions(sparkSession: SparkSession): (String, Map[String, String]) = {
- def MethodName: String = new Exception().getStackTrace().apply(1).getMethodName()
-
- logger.info(" @Begin --> " + MethodName)
-
- val hiveConf: Map[String, String] = sparkSession.conf.getAll
- val optionsToCheck: Map[String, String] = Map(
- KafkaConfigs.rowCountOnFirstRunKey -> "250"
- , KafkaConfigs.batchFetchSize -> "250"
- , KafkaConfigs.maxRecordsPerPartition -> "25000000"
- , GimelConstants.LOG_LEVEL -> "ERROR"
- , KafkaConfigs.kafkaConsumerReadCheckpointKey -> "true"
- , KafkaConfigs.kafkaConsumerClearCheckpointKey -> "false"
- , KafkaConfigs.maxRatePerPartitionKey -> "3600"
- , KafkaConfigs.streamParallelKey -> "10"
- , KafkaConfigs.defaultBatchInterval -> "30"
- , KafkaConfigs.isStreamParallelKey -> "true"
- , KafkaConfigs.isBackPressureEnabledKey -> "true"
- , HbaseConfigs.hbaseOperation -> "scan"
- , HbaseConfigs.hbaseFilter -> ""
- , GimelConstants.DATA_CACHE_IS_ENABLED -> "false"
- , GimelConstants.DATA_CACHE_IS_ENABLED_FOR_ALL -> "true"
- )
- val resolvedOptions: Map[String, String] = optionsToCheck.map { kvPair =>
- (kvPair._1, hiveConf.getOrElse(kvPair._1, kvPair._2))
- }
- resolvedOptions.foreach(conf => sparkSession.conf.set(conf._1, conf._2))
- (resolvedOptions.map(x => x._1 + "=" + x._2).mkString(":"), hiveConf ++ resolvedOptions)
- }
-
- def getQuery(props: Map[String, String]): String = {
-
- val sql: String = {
- logger.info(s"User Requested Execution of SQL from External File.")
- val querySourceFile = props("querySourceFile")
- val unresolvedQuery = HDFSAdminClient.readHDFSFile(querySourceFile)
- logger.info(s"SQL From External File --> \n${unresolvedQuery}")
- val replacementProps = props.filter(x => x._1.toUpperCase.startsWith("GIMEL.SQL.PARAM"))
- logger.info(
- s"""
- |Following Props will be resolved in External File's SQL String -->
- |${replacementProps.mkString("\n", "\n", "")}
- """.stripMargin)
- replacementProps.foldLeft(unresolvedQuery)((s, prop) => s.replaceAll(prop._1.toUpperCase, prop._2))
- }
- logger.info(s"Resolved Query to Execute --> ${sql}")
- sql
- }
-
- /**
- * getYarnClusterName - gets the yarn cluster from the hadoop config file
- *
- * @return
- */
- def getYarnClusterName(): String = {
- val hadoopConfiguration = new org.apache.hadoop.conf.Configuration()
- val cluster = hadoopConfiguration.get(GimelConstants.FS_DEFAULT_NAME)
- cluster.split("/").last
- }
-}
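
The GIMEL.SQL.PARAM substitution performed in getQuery above can be shown in isolation; the placeholder names, property values, and SQL below are made up purely for illustration.

// Illustrative only: mirrors the filter + foldLeft replacement used in getQuery.
object SqlParamSubstitutionSketch extends App {
  val unresolvedQuery =
    "select * from GIMEL.SQL.PARAM.SOURCE_TABLE where dt = 'GIMEL.SQL.PARAM.RUN_DATE'"

  val props = Map(
    "gimel.sql.param.source_table" -> "pcatalog.kafka_flights_log",
    "gimel.sql.param.run_date" -> "2018-01-01",
    "mode" -> "batch" // ignored: key does not start with GIMEL.SQL.PARAM
  )

  val replacementProps = props.filter(_._1.toUpperCase.startsWith("GIMEL.SQL.PARAM"))
  val resolved = replacementProps.foldLeft(unresolvedQuery) { (sql, prop) =>
    sql.replaceAll(prop._1.toUpperCase, prop._2)
  }

  // resolved == "select * from pcatalog.kafka_flights_log where dt = '2018-01-01'"
  println(resolved)
}
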
diff --git a/gimel-dataapi/gimel-tools/src/main/scala/com/paypal/gimel/tools/conf/BenchmarkKafkaConstants.scala b/gimel-dataapi/gimel-tools/src/main/scala/com/paypal/gimel/tools/conf/BenchmarkKafkaConstants.scala
deleted file mode 100644
index 91b4e1f3..00000000
--- a/gimel-dataapi/gimel-tools/src/main/scala/com/paypal/gimel/tools/conf/BenchmarkKafkaConstants.scala
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- * Copyright 2018 PayPal Inc.
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.paypal.gimel.tools.conf
-
-object BenchmarkKafkaConstants {
- val minRowsPerPartitionKey: String = "minRowsPerParallel"
- val maxRecordsPerPartitionKey: String = "maxRecordsPerPartition"
- val fetchRowsKey: String = "fetchRowsOnFirstRun"
-}
diff --git a/gimel-dataapi/gimel-tools/src/main/scala/com/paypal/gimel/tools/conf/CopyDatasetConstants.scala b/gimel-dataapi/gimel-tools/src/main/scala/com/paypal/gimel/tools/conf/CopyDatasetConstants.scala
deleted file mode 100644
index ed767d8a..00000000
--- a/gimel-dataapi/gimel-tools/src/main/scala/com/paypal/gimel/tools/conf/CopyDatasetConstants.scala
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * Copyright 2018 PayPal Inc.
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.paypal.gimel.tools.conf
-
-// Copy Dataset Constants
-object CopyDatasetConstants {
- val COPY_DATASET_STREAM_MODE = "stream"
- val COPY_DATASET_BATCH_MODE = "batch"
- val COPY_DATASET_BATCH_INTERACTIVE_MODE = "batch_iterative"
- val COPY_DATASET_INTELLIGENT_MODE = "intelligent"
-}
diff --git a/gimel-dataapi/gimel-tools/src/main/scala/com/paypal/gimel/tools/conf/SinkMetricsReconcilerConstants.scala b/gimel-dataapi/gimel-tools/src/main/scala/com/paypal/gimel/tools/conf/SinkMetricsReconcilerConstants.scala
deleted file mode 100644
index ae64a281..00000000
--- a/gimel-dataapi/gimel-tools/src/main/scala/com/paypal/gimel/tools/conf/SinkMetricsReconcilerConstants.scala
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * Copyright 2018 PayPal Inc.
- *
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.paypal.gimel.tools.conf
-
-
-object SinkMetricsReconcilerConstants {
-
- val statsTargetRecordsUpdatedFlag: String = "target_records_updated_flag"
- val statsMetadataID: String = "metadata._id"
- val statsTargetDataSetName: String = "target_data_set_name"
- val statsGimelloadID: String = "gimel_load_id"
- val statsTargetRecordsCount: String = "target_records_count"
- val statsGimelMetricsLaggingBatchValidationLogType: String = "GimelMetricsLaggingRecords9"
- val statsGimelMetricsBatchValidationLogType: String = "GimelMetricsBatchValidation9"
-
-}
diff --git a/gimel-dataapi/pom.xml b/gimel-dataapi/pom.xml
index e7ff3bce..0205e8c1 100644
--- a/gimel-dataapi/pom.xml
+++ b/gimel-dataapi/pom.xml
@@ -47,12 +47,7 @@ under the License.
gimel-connectors/gimel-jdbc
gimel-connectors/gimel-cassandra
gimel-connectors/gimel-aerospike
-
-
gimel-core
- gimel-sql
- gimel-tools
- gimel-examples