Skip to content

Commit

Permalink
add an ignore section to configuration which will exclude analysis fr…
Browse files Browse the repository at this point in the history
…om those providers
  • Loading branch information
grapigeau committed Dec 20, 2024
1 parent 5989b7a commit 043892e
Show file tree
Hide file tree
Showing 5 changed files with 65 additions and 13 deletions.
22 changes: 19 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,10 @@ create a configuration file ending in `.cerberus` as follows:
},
"vidarr": {
"prod": "http://vidarr-prod.example.com:8000"
}
},
"ignore": [
"example-bad-provider"
]
}

The `"pinery"` section describes all Pinery instances that can be used LIMS
Expand All @@ -32,8 +35,21 @@ The `"vidarr"` section describes all the Vidarr instances that should be used
as file sources. The keys are the _internal name_ of that Vidarr instance and
the value is the URL of that instance.

The `"ignore"` section contains all the LIMS provider names which are present
in the Vidarr instances' external keys but should NOT be merged when building
file provenance. If a Vidarr workflow run contains a single external key with data
from one of these ignore providers, the entire workflow run will be excluded.

To build Cerberus locally:

mvn clean install dependency:copy-dependencies

The Cerberus file provenance client can be used to produce a joined file
provenance TSV in the traditional format using:

java -m ca.on.oicr.gsi.cerberus.cli/ca.on.oicr.gsi.cerberus.Main online \
-c config.json -o output.json
java --module-path "$(find ./*/target/ ./*/target/dependency/ \
-maxdepth 1 -mindepth 1 -iname "*.jar" | tr '\n' :)" \
-m ca.on.oicr.gsi.cerberus.cli/ca.on.oicr.gsi.cerberus.cli.Main online \
-c config.json -o output.tsv.gz


Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
package ca.on.oicr.gsi.cerberus.cli;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;

public final class Configuration {

private Map<String, PineryConfiguration> pinery;
private Map<String, String> vidarr;
private List<String> ignore;

public Map<String, PineryConfiguration> getPinery() {
return pinery;
Expand All @@ -15,11 +18,19 @@ public Map<String, String> getVidarr() {
return vidarr;
}

public List<String> getIgnore() {
return ignore;
}

public void setPinery(Map<String, PineryConfiguration> pinery) {
this.pinery = pinery;
}

public void setVidarr(Map<String, String> vidarr) {
this.vidarr = vidarr;
}

public void setIgnore(ArrayList<String> ignore) {
this.ignore = ignore;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@
import java.nio.file.Files;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.concurrent.Callable;
import java.util.stream.Collectors;
import java.util.stream.Stream;
Expand Down Expand Up @@ -45,6 +48,9 @@ public Integer call() throws Exception {
}

try (final var output = new TabReportGenerator(tempOutputFileName)) {

List<String> ignoreProviders =
Objects.requireNonNullElse(configuration.getIgnore(), new ArrayList<>());
final var versions =
configuration.getPinery().values().stream()
.flatMap(pinery -> pinery.getVersions().stream())
Expand All @@ -54,7 +60,10 @@ public Integer call() throws Exception {
JoinSource.join(
JoinSource.all(
configuration.getVidarr().entrySet().stream()
.map(e -> VidarrWorkflowRunSource.of(e.getKey(), e.getValue(), versions))),
.map(
e ->
VidarrWorkflowRunSource.of(
e.getKey(), e.getValue(), versions, ignoreProviders))),
JoinSource.all(
configuration.getPinery().entrySet().stream()
.flatMap(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,6 @@ static <T, R> JoinSource<R> map(JoinSource<T> source, Function<? super T, ? exte
return () -> source.fetch().map(mapper);
}

/** Provided the stored dataa */
/** Provided the stored data */
Stream<T> fetch() throws Exception;
}
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,9 @@
import java.net.http.HttpRequest.BodyPublishers;
import java.time.Duration;
import java.util.EnumSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;

/** Incrementally fetch Vidarr workflow run data */
Expand Down Expand Up @@ -63,21 +65,24 @@ public static Stream<ExternalId> key(ProvenanceWorkflowRun<? extends ExternalId>
}

public static JoinSource<ProvenanceWorkflowRun<ExternalKey>> of(
String instanceName, String baseUrl, Set<String> versionTypes) {
String instanceName, String baseUrl, Set<String> versionTypes, List<String> ignoreProviders) {
return IncrementalJoinSource.accumulating(
new VidarrWorkflowRunSource(instanceName, baseUrl, versionTypes));
new VidarrWorkflowRunSource(instanceName, baseUrl, versionTypes, ignoreProviders));
}

private final String baseUrl;
private long epoch;
private final String instanceName;
private long lastTime;
private final Set<String> versionTypes;
private final List<String> ignoreProviders;

public VidarrWorkflowRunSource(String instanceName, String baseUrl, Set<String> versionTypes) {
public VidarrWorkflowRunSource(
String instanceName, String baseUrl, Set<String> versionTypes, List<String> ignoreProviders) {
this.instanceName = instanceName;
this.baseUrl = baseUrl;
this.versionTypes = versionTypes;
this.ignoreProviders = ignoreProviders;
}

@Override
Expand Down Expand Up @@ -107,16 +112,27 @@ public UpdateResult<ProvenanceWorkflowRun<ExternalKey>> update() throws Exceptio
EPOCH.labels(baseUrl).set(body.getEpoch());
TIMESTAMP.labels(baseUrl).set(body.getTimestamp());
ERROR.labels(baseUrl).set(0);
for (final var workflowRun : body.getResults()) {
workflowRun.setInstanceName(instanceName);
}
List<ProvenanceWorkflowRun<ExternalKey>> validBodyResults;
validBodyResults =
body.getResults().stream()
.filter(
workflowRun ->
workflowRun.getExternalKeys().stream()
.noneMatch(
externalKey -> ignoreProviders.contains(externalKey.getProvider())))
.map(
workflowRun -> {
workflowRun.setInstanceName(instanceName);
return workflowRun;
})
.collect(Collectors.toList());
if (body.getEpoch() == epoch) {
lastTime = body.getTimestamp();
return UpdateResult.incremental(body.getResults());
return UpdateResult.incremental(validBodyResults);
} else {
epoch = body.getEpoch();
lastTime = body.getTimestamp();
return UpdateResult.restart(body.getResults());
return UpdateResult.restart(validBodyResults);
}
} catch (Exception e) {
ERROR.labels(baseUrl).set(1);
Expand Down

0 comments on commit 043892e

Please sign in to comment.