From 043892e4381c2ead0e12274fe8a9d5291647bb80 Mon Sep 17 00:00:00 2001 From: grapigeau Date: Thu, 19 Dec 2024 10:45:26 -0500 Subject: [PATCH] add an ignore section to configuration which will exclude analysis from those providers --- README.md | 22 +++++++++++-- .../oicr/gsi/cerberus/cli/Configuration.java | 11 +++++++ .../on/oicr/gsi/cerberus/cli/RunOnline.java | 11 ++++++- .../ca/on/oicr/gsi/cerberus/JoinSource.java | 2 +- .../vidarr/VidarrWorkflowRunSource.java | 32 ++++++++++++++----- 5 files changed, 65 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index a732349..f075da5 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,10 @@ create a configuration file ending in `.cerberus` as follows: }, "vidarr": { "prod": "http://vidarr-prod.example.com:8000" - } + }, + "ignore": [ + "example-bad-provider" + ] } The `"pinery"` section describes all Pinery instances that can be used LIMS @@ -32,8 +35,21 @@ The `"vidarr"` section describes all the Vidarr instances that should be used as file sources. The keys are the _internal name_ of that Vidarr instance and the value is the URL of that instance. +The `"ignore"` section contains all the LIMS provider names which are present +in the Vidarr instances' external keys but should NOT be merged when building +file provenance. If a Vidarr workflow run contains even a single external key with data +from one of these ignored providers, the entire workflow run will be excluded. 
+ +To build Cerberus locally: + + mvn clean install dependency:copy-dependencies + The Cerberus file provenance client can be used to produce a joined file provenance TSV in the traditional format using: - java -m ca.on.oicr.gsi.cerberus.cli/ca.on.oicr.gsi.cerberus.Main online \ - -c config.json -o output.json + java --module-path "$(find ./*/target/ ./*/target/dependency/ \ + -maxdepth 1 -mindepth 1 -iname "*.jar" | tr '\n' :)" \ + -m ca.on.oicr.gsi.cerberus.cli/ca.on.oicr.gsi.cerberus.cli.Main online \ + -c config.json -o output.tsv.gz + + diff --git a/cerberus-cli/src/main/java/ca/on/oicr/gsi/cerberus/cli/Configuration.java b/cerberus-cli/src/main/java/ca/on/oicr/gsi/cerberus/cli/Configuration.java index cd32da5..d29e5fa 100644 --- a/cerberus-cli/src/main/java/ca/on/oicr/gsi/cerberus/cli/Configuration.java +++ b/cerberus-cli/src/main/java/ca/on/oicr/gsi/cerberus/cli/Configuration.java @@ -1,11 +1,14 @@ package ca.on.oicr.gsi.cerberus.cli; +import java.util.ArrayList; +import java.util.List; import java.util.Map; public final class Configuration { private Map pinery; private Map vidarr; + private List ignore; public Map getPinery() { return pinery; @@ -15,6 +18,10 @@ public Map getVidarr() { return vidarr; } + public List getIgnore() { + return ignore; + } + public void setPinery(Map pinery) { this.pinery = pinery; } @@ -22,4 +29,8 @@ public void setPinery(Map pinery) { public void setVidarr(Map vidarr) { this.vidarr = vidarr; } + + public void setIgnore(ArrayList ignore) { + this.ignore = ignore; + } } diff --git a/cerberus-cli/src/main/java/ca/on/oicr/gsi/cerberus/cli/RunOnline.java b/cerberus-cli/src/main/java/ca/on/oicr/gsi/cerberus/cli/RunOnline.java index 22afb04..3b565e6 100644 --- a/cerberus-cli/src/main/java/ca/on/oicr/gsi/cerberus/cli/RunOnline.java +++ b/cerberus-cli/src/main/java/ca/on/oicr/gsi/cerberus/cli/RunOnline.java @@ -11,6 +11,9 @@ import java.nio.file.Files; import java.nio.file.Paths; import java.nio.file.StandardCopyOption; +import 
java.util.ArrayList; +import java.util.List; +import java.util.Objects; import java.util.concurrent.Callable; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -45,6 +48,9 @@ public Integer call() throws Exception { } try (final var output = new TabReportGenerator(tempOutputFileName)) { + + List ignoreProviders = + Objects.requireNonNullElse(configuration.getIgnore(), new ArrayList<>()); final var versions = configuration.getPinery().values().stream() .flatMap(pinery -> pinery.getVersions().stream()) @@ -54,7 +60,10 @@ public Integer call() throws Exception { JoinSource.join( JoinSource.all( configuration.getVidarr().entrySet().stream() - .map(e -> VidarrWorkflowRunSource.of(e.getKey(), e.getValue(), versions))), + .map( + e -> + VidarrWorkflowRunSource.of( + e.getKey(), e.getValue(), versions, ignoreProviders))), JoinSource.all( configuration.getPinery().entrySet().stream() .flatMap( diff --git a/cerberus-core/src/main/java/ca/on/oicr/gsi/cerberus/JoinSource.java b/cerberus-core/src/main/java/ca/on/oicr/gsi/cerberus/JoinSource.java index a45fe79..9a7622d 100644 --- a/cerberus-core/src/main/java/ca/on/oicr/gsi/cerberus/JoinSource.java +++ b/cerberus-core/src/main/java/ca/on/oicr/gsi/cerberus/JoinSource.java @@ -195,6 +195,6 @@ static JoinSource map(JoinSource source, Function source.fetch().map(mapper); } - /** Provided the stored dataa */ + /** Provided the stored data */ Stream fetch() throws Exception; } diff --git a/cerberus-core/src/main/java/ca/on/oicr/gsi/cerberus/vidarr/VidarrWorkflowRunSource.java b/cerberus-core/src/main/java/ca/on/oicr/gsi/cerberus/vidarr/VidarrWorkflowRunSource.java index 7464804..f4236fa 100644 --- a/cerberus-core/src/main/java/ca/on/oicr/gsi/cerberus/vidarr/VidarrWorkflowRunSource.java +++ b/cerberus-core/src/main/java/ca/on/oicr/gsi/cerberus/vidarr/VidarrWorkflowRunSource.java @@ -23,7 +23,9 @@ import java.net.http.HttpRequest.BodyPublishers; import java.time.Duration; import java.util.EnumSet; +import 
java.util.List; import java.util.Set; +import java.util.stream.Collectors; import java.util.stream.Stream; /** Incrementally fetch Vidarr workflow run data */ @@ -63,9 +65,9 @@ public static Stream key(ProvenanceWorkflowRun } public static JoinSource> of( - String instanceName, String baseUrl, Set versionTypes) { + String instanceName, String baseUrl, Set versionTypes, List ignoreProviders) { return IncrementalJoinSource.accumulating( - new VidarrWorkflowRunSource(instanceName, baseUrl, versionTypes)); + new VidarrWorkflowRunSource(instanceName, baseUrl, versionTypes, ignoreProviders)); } private final String baseUrl; @@ -73,11 +75,14 @@ public static JoinSource> of( private final String instanceName; private long lastTime; private final Set versionTypes; + private final List ignoreProviders; - public VidarrWorkflowRunSource(String instanceName, String baseUrl, Set versionTypes) { + public VidarrWorkflowRunSource( + String instanceName, String baseUrl, Set versionTypes, List ignoreProviders) { this.instanceName = instanceName; this.baseUrl = baseUrl; this.versionTypes = versionTypes; + this.ignoreProviders = ignoreProviders; } @Override @@ -107,16 +112,27 @@ public UpdateResult> update() throws Exceptio EPOCH.labels(baseUrl).set(body.getEpoch()); TIMESTAMP.labels(baseUrl).set(body.getTimestamp()); ERROR.labels(baseUrl).set(0); - for (final var workflowRun : body.getResults()) { - workflowRun.setInstanceName(instanceName); - } + List> validBodyResults; + validBodyResults = + body.getResults().stream() + .filter( + workflowRun -> + workflowRun.getExternalKeys().stream() + .noneMatch( + externalKey -> ignoreProviders.contains(externalKey.getProvider()))) + .map( + workflowRun -> { + workflowRun.setInstanceName(instanceName); + return workflowRun; + }) + .collect(Collectors.toList()); if (body.getEpoch() == epoch) { lastTime = body.getTimestamp(); - return UpdateResult.incremental(body.getResults()); + return UpdateResult.incremental(validBodyResults); } else { epoch = 
body.getEpoch(); lastTime = body.getTimestamp(); - return UpdateResult.restart(body.getResults()); + return UpdateResult.restart(validBodyResults); } } catch (Exception e) { ERROR.labels(baseUrl).set(1);