Merge pull request #9 from data-intuitive/develop
Merge recent developments
tverbeiren authored Feb 28, 2022
2 parents 6778eab + f640465 commit b18c632
Showing 51 changed files with 785 additions and 2,171 deletions.
35 changes: 35 additions & 0 deletions .github/workflows/sbt.yaml
@@ -0,0 +1,35 @@
name: Scala CI and Publish

on:
  push:
    branches: [ master ]
  pull_request:
    branches: [ master ]

jobs:
  build:
    runs-on: ${{ matrix.os }}
    if: "!contains(github.event.head_commit.message, 'ci skip')"
    strategy:
      matrix:
        os: [ ubuntu-latest ]

    steps:
      - uses: actions/checkout@v2
      - name: Set up Scala
        uses: olafurpg/setup-scala@v10
      - name: Run tests on Ubuntu
        if: startsWith(matrix.os, 'ubuntu')
        run: sbt 'testOnly -- -l com.dataintuitive.tags.IOtag'

  publish:
    name: Publish package
    needs: [build]
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - name: Set up Scala
        uses: olafurpg/setup-scala@v10
      - name: Upload
        run: sbt aetherDeploy
        env:
          GITHUB_TOKEN: ${{ secrets.PACKAGE_SECRET }}
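The `-l` option passed to ScalaTest via `testOnly` excludes tests carrying the `com.dataintuitive.tags.IOtag` tag, so IO-bound tests are skipped on CI. As a minimal sketch (the actual declaration in this repository may differ), such a tag is declared in ScalaTest like this:

package com.dataintuitive.tags

import org.scalatest.Tag

// Tests tagged with IOtag are excluded by: testOnly -- -l com.dataintuitive.tags.IOtag
object IOtag extends Tag("com.dataintuitive.tags.IOtag")

A suite then opts a test in with `test("loads the database from disk", IOtag) { ... }`.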
12 changes: 8 additions & 4 deletions README.md
@@ -1,19 +1,23 @@
# Introduction

This project provides the API for [LuciusWeb](https://github.com/data-intuitive/LuciusWeb) to talk to. The API is a [Spark Jobserver](https://github.com/spark-jobserver/spark-jobserver) project. It needs to be compiled and the resulting `jar` has to be uploaded to the Spark-Jobserver.
This project provides the API for [LuciusWeb](https://github.com/data-intuitive/LuciusWeb). The API is a [Spark Jobserver](https://github.com/spark-jobserver/spark-jobserver) project. It needs to be compiled and the resulting `jar` has to be uploaded to the Spark-Jobserver.

There's still a lot of work to be done on this (version numbers don't reflect everything).

__Please note that version 2.1.0 and onwards is for Spark 2.2.1.__
# Dependencies

| LuciusAPI | LuciusCore | Spark Jobserver | Spark |
|-----------|------------|-----------------|-------|
| 5.0.0 | 4.0.10 | 0.11.1 | 2.4.7 |
| 5.0.1 | 4.0.11 | 0.11.1 | 2.4.7 |

# API
# API Documentation

The documentation for the (__old__) version of the API is available in [postman](https://www.getpostman.com/) and [can be found here](https://www.getpostman.com/collections/cf537f6cae9b82c35034).

# Data

Public data is not available yet.
Public data is not available (yet).

# Local Deployment

35 changes: 20 additions & 15 deletions build.sbt
@@ -1,31 +1,39 @@
name := "LuciusAPI"

version := "3.3.6"
import aether.AetherKeys._

scalaVersion := "2.11.12"

resolvers += "Job Server Bintray" at "https://dl.bintray.com/spark-jobserver/maven"
version in ThisBuild := "5.0.1"

resolvers += "Local Ivy" at "file://Users/toni/.ivy2/local"
scalaVersion := "2.11.12"

resolvers += Resolver.githubPackages("data-intuitive")
resolvers += "Artifactory" at "https://sparkjobserver.jfrog.io/artifactory/jobserver/"

libraryDependencies ++= Seq(
"com.data-intuitive" %% "luciuscore" % "3.3.4",
"spark.jobserver" %% "job-server-api" % "0.8.0" % "provided",
"spark.jobserver" %% "job-server-extras" % "0.8.0" % "provided",
"com.data-intuitive" %% "luciuscore" % "4.0.11",
"spark.jobserver" %% "job-server-api" % "0.11.1" % "provided",
"spark.jobserver" %% "job-server-extras" % "0.11.1" % "provided",
"org.scalactic" %% "scalactic" % "3.0.7" % "test" ,
"org.scalatest" %% "scalatest" % "3.0.7" % "test" ,
"org.apache.spark" %% "spark-core" % "2.3.1" % "provided",
"org.apache.spark" %% "spark-sql" % "2.3.1" % "provided"
/* "org.scalaz" %% "scalaz-core" % "7.2.0" */
"org.apache.spark" %% "spark-core" % "2.4.7" % "provided",
"org.apache.spark" %% "spark-sql" % "2.4.7" % "provided"
)

test in assembly := {}

organization := "com.data-intuitive"
licenses += ("Apache-2.0", url("https://www.apache.org/licenses/LICENSE-2.0.html"))

// publish to github packages
publishTo := Some("GitHub data-intuitive Apache Maven Packages" at "https://maven.pkg.github.com/data-intuitive/luciusapi")
publishMavenStyle := true
credentials += Credentials(
"GitHub Package Registry",
"maven.pkg.github.com",
"tverbeiren",
System.getenv("GITHUB_TOKEN")
)

// Publish assembly jar as well
artifact in (Compile, assembly) := {
val art = (artifact in (Compile, assembly)).value
@@ -34,7 +42,4 @@ artifact in (Compile, assembly) := {

addArtifact(artifact in (Compile, assembly), assembly)

githubOwner := "data-intuitive"
githubRepository := "luciusapi"
githubTokenSource := TokenSource.GitConfig("github.token")
publishMavenStyle := true
aetherPackageMain := assembly.value
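With these settings, the `sbt aetherDeploy` step from the workflow above publishes the regular artifacts plus the assembly jar (registered through `addArtifact` and `aetherPackageMain`) to GitHub Packages, authenticating with the `GITHUB_TOKEN` environment variable.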
7 changes: 7 additions & 0 deletions config/example.conf
@@ -9,10 +9,17 @@
geneFeatures {
probesetID = probesetid,
dataType = dataType,
dataType2 = dataType2,
ENTREZID = entrezid,
ENSEMBL = ensemblid,
SYMBOL = symbol,
GENENAME = name,
GENEFAMILY = geneFamily
}
geneDataType {
"1-1" = "L1000"
"0-1" = "BING"
"0-0" = "AIG"
"1-0" = "INVALID"
}
}
2 changes: 1 addition & 1 deletion project/build.properties
@@ -1 +1 @@
sbt.version=1.2.8
sbt.version=1.3.10
4 changes: 2 additions & 2 deletions project/plugins.sbt
@@ -1,7 +1,7 @@
logLevel := Level.Warn

addSbtPlugin("com.codecommit" % "sbt-github-packages" % "0.4.2")
/* addSbtPlugin("org.foundweekends" % "sbt-bintray" % "0.5.4") */
addSbtPlugin("com.github.gseitz" % "sbt-release" % "1.0.11")
addSbtPlugin("com.timushev.sbt" % "sbt-updates" % "0.4.0")
addSbtPlugin("org.ensime" % "sbt-ensime" % "2.5.1")
addSbtPlugin("no.arktekk.sbt" % "aether-deploy" % "0.26.0")
addSbtPlugin("com.codecommit" % "sbt-github-packages" % "0.5.2")
4 changes: 4 additions & 0 deletions project/project/metals.sbt
@@ -0,0 +1,4 @@
// DO NOT EDIT! This file is auto-generated.
// This file enables sbt-bloop to create bloop config files.

addSbtPlugin("ch.epfl.scala" % "sbt-bloop" % "1.4.8")
4 changes: 4 additions & 0 deletions project/project/project/metals.sbt
@@ -0,0 +1,4 @@
// DO NOT EDIT! This file is auto-generated.
// This file enables sbt-bloop to create bloop config files.

addSbtPlugin("ch.epfl.scala" % "sbt-bloop" % "1.4.8")
106 changes: 64 additions & 42 deletions src/main/scala/com/dataintuitive/luciusapi/Common.scala
@@ -1,8 +1,10 @@
package com.dataintuitive.luciusapi

// LuciusCore
import com.dataintuitive.luciuscore.Model.DbRow
import com.dataintuitive.luciuscore.genes._
import com.dataintuitive.luciuscore._
import model.v4._
import genes._
import api._

// Jobserver
import spark.jobserver.api.{JobEnvironment, SingleProblem, ValidationProblem}
@@ -23,8 +25,6 @@ import org.apache.spark.broadcast.Broadcast
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

import com.dataintuitive.luciusapi.Model.FlatDbRow

import com.dataintuitive.jobserver.NamedDataSet
import com.dataintuitive.jobserver.DataSetPersister

@@ -40,8 +40,6 @@ object Common extends Serializable {
new BroadcastPersister[U]
implicit def DataSetPersister[T]: NamedObjectPersister[NamedDataSet[T]] = new DataSetPersister[T]

case class CachedData(db: Dataset[DbRow], flatDb: Dataset[FlatDbRow], genesDB: GenesDB)

object ParamHandlers {

def paramSignature(config: Config): List[String] Or One[ValidationProblem] = {
@@ -115,7 +113,12 @@
}

def validHeadTail(config: Config): Boolean Or One[ValidationProblem] = {
if (optParamHead(config) > 0 || optParamTail(config) > 0) Good(true)
// We want either head or tail but not both: an 'exclusive or' is needed instead of 'or', so use '!=' instead of '||'.
// (false, false) => false
// (true, false) => true
// (false, true) => true
// (true, true) => false
if (optParamHead(config) > 0 != optParamTail(config) > 0) Good(true)
else Bad(One(SingleProblem("Either head or tail count needs to be provided")))
}
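As a quick illustration (not part of the diff) of why Boolean `!=` behaves as exclusive or:

// '!=' on Booleans is true exactly when one operand is true and the other is false.
val headOnly = (10 > 0) != (0 > 0) // true:  head given, tail absent -> valid
val both     = (10 > 0) != (5 > 0) // false: both given -> invalid
val neither  = (0 > 0)  != (0 > 0) // false: neither given -> invalid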

@@ -156,8 +159,8 @@
}.toOption
.getOrElse(Seq())

def validVersion(config: Config): Boolean Or One[ValidationProblem] = {
if (VERSIONS contains optParamVersion(config)) Good(true)
def validVersion(config: Config): String Or One[ValidationProblem] = {
if (VERSIONS contains optParamVersion(config)) Good(optParamVersion(config))
else Bad(One(SingleProblem("Not a valid version identifier")))
}

@@ -169,13 +172,21 @@
Try(config.getString("limit").toInt).getOrElse(default)
}

def optParamLike(config: Config, default: List[String] = Nil): List[String] = {
Try(config.getString("like").split(" ").toList).getOrElse(default)
}

def optParamTrtType(config: Config, default: List[String] = Nil): List[String] = {
Try(config.getString("trtType").split(" ").toList).getOrElse(default)
}

def optParamFeatures(config: Config, default: List[String] = List(".*")): List[String] = {
Try(config.getString("features").toString.split(" ").toList).getOrElse(default)
}
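These optional parameters share one pattern: read a single space-separated string, split it, and fall back to a default when the key is absent. An illustrative sketch using the Typesafe Config API (the parameter values are made up):

import com.typesafe.config.ConfigFactory

val cfg = ConfigFactory.parseString("""trtType = "trt_cp trt_sh" """)
optParamTrtType(cfg)                    // List("trt_cp", "trt_sh")
optParamLike(ConfigFactory.empty())     // Nil, the default
optParamFeatures(ConfigFactory.empty()) // List(".*"), i.e. match everything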

def getDB(runtime: JobEnvironment): Dataset[DbRow] Or One[ValidationProblem] = {
def getDB(runtime: JobEnvironment): Dataset[Perturbation] Or One[ValidationProblem] = {
Try {
val NamedDataSet(db, _, _) = runtime.namedObjects.get[NamedDataSet[DbRow]]("db").get
val NamedDataSet(db, _, _) = runtime.namedObjects.get[NamedDataSet[Perturbation]]("db").get
db
}.map(db => Good(db))
.getOrElse(Bad(One(SingleProblem("Cached DB not available"))))
@@ -197,12 +208,44 @@
.getOrElse(Bad(One(SingleProblem("Broadcast genes not available"))))
}

def getFilters(runtime: JobEnvironment): Filters.FiltersDB Or One[ValidationProblem] = {
Try {
val NamedBroadcast(filters) = runtime.namedObjects.get[NamedBroadcast[Filters.FiltersDB]]("filters").get
filters.value
}.map(filters => Good(filters))
.getOrElse(Bad(One(SingleProblem("Broadcast filters not available"))))
}

def paramDb(config: Config): String Or One[ValidationProblem] = {
Try(config.getString("db.uri"))
.map(db => Good(db))
.getOrElse(Bad(One(SingleProblem("DB config parameter not provided"))))
}

def paramDbs(config: Config): List[String] Or One[ValidationProblem] = {
Try(config.getStringList("db.uris").asScala.toList)
.map(dbs => Good(dbs))
.getOrElse(Bad(One(SingleProblem("DB config parameter not provided"))))
}

/**
* Checks config for either db.uri or db.uris.
* This supports both the older format, db.uri as a single string,
* and the newer format, db.uris as a list of strings.
* Supporting the old format prevents existing config files from breaking.
*/
def paramDbOrDbs(config: Config): List[String] Or One[ValidationProblem] = {
val singleDb = paramDb(config)
val multipleDbs = paramDbs(config)

(singleDb.isGood, multipleDbs.isGood) match {
case (false, false) => Bad(One(SingleProblem("DB config parameter not provided")))
case (true, true) => Bad(One(SingleProblem("Only one declaration of db.uri or db.uris is allowed")))
case (true, false) => singleDb.map(List(_))
case (false, true) => multipleDbs
}
}
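An illustrative sketch of this backwards-compatible lookup (the paths are made up):

import com.typesafe.config.ConfigFactory

val oldStyle = ConfigFactory.parseString("""db.uri = "/data/db.parquet" """)
val newStyle = ConfigFactory.parseString("""db.uris = ["/data/a.parquet", "/data/b.parquet"]""")

paramDbOrDbs(oldStyle)              // Good(List("/data/db.parquet"))
paramDbOrDbs(newStyle)              // Good(List("/data/a.parquet", "/data/b.parquet"))
paramDbOrDbs(ConfigFactory.empty()) // Bad(...): DB config parameter not provided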

def paramGenes(config: Config): String Or One[ValidationProblem] = {
Try(config.getString("geneAnnotations"))
.map(ga => Good(ga))
@@ -252,37 +295,16 @@
.getOrElse(defaultDict)
}

}

object Variables {

// Calculated
val ZHANG = Set("zhang", "similarity", "Zhang", "Similarity")

// Sample
val ID = Set("id", "pwid")
val BATCH = Set("batch", "Batch")
val PLATEID = Set("plateid", "PlateId")
val WELL = Set("well", "Well")
val PROTOCOLNAME = Set("protocolname", "cellline", "CellLine", "ProtocolName", "protocol", "Protocol")
val CONCENTRATION = Set("concentration", "Concentration")
val YEAR = Set("year", "Year")
val TIME = Set("time", "Time")

// Compound
val COMPOUND_ID = Set("jnjs", "Jnjs", "cid", "pid", "compound_id")
val JNJB = Set("jnjb", "Jnjb")
val COMPOUND_SMILES = Set("Smiles", "smiles", "SMILES", "compound_smiles")
val COMPOUND_INCHIKEY = Set("inchikey", "Inchikey", "compound_inchikey")
val COMPOUND_NAME = Set("compoundname", "CompoundName", "Compoundname", "name", "Name", "compound_name")
val COMPOUND_TYPE = Set("Type", "type", "compound_type")
val COMPOUND_TARGETS = Set("targets", "knownTargets", "Targets", "compound_targets")

// Filters
val FILTERS = Set("filters", "Filters", "filter", "filters")

// Derived
val SIGNIFICANTGENES = Set("significantGenes")
/**
* geneDataType contains a mapping between the dataType as it is read and how it should be
* returned by the code.
* This is especially useful when two dataTypes are read in together to be concatenated.
*/
def paramGeneDataTypes(config: Config): Map[String, String] = {
Try(config.getObject("geneDataType")).toOption
.map(_.unwrapped.asScala.toMap.map{case (k,v) => (k.toString, v.toString)})
.getOrElse(Map.empty)
}
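An illustrative sketch, mirroring the `geneDataType` block added to `config/example.conf` above:

import com.typesafe.config.ConfigFactory

val cfg = ConfigFactory.parseString(
  """geneDataType {
    |  "1-1" = "L1000"
    |  "0-1" = "BING"
    |}""".stripMargin)

paramGeneDataTypes(cfg)                   // Map("1-1" -> "L1000", "0-1" -> "BING")
paramGeneDataTypes(ConfigFactory.empty()) // Map.empty, the default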

}

7 changes: 0 additions & 7 deletions src/main/scala/com/dataintuitive/luciusapi/Model.scala

This file was deleted.

