-
Notifications
You must be signed in to change notification settings - Fork 4
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Implement an exception handler for api client #72
Merged
Merged
Changes from 1 commit
Commits
Show all changes
2 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
29 changes: 29 additions & 0 deletions
29
plugins/nf-nomad/src/main/nextflow/nomad/config/RetryConfig.groovy
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
package nextflow.nomad.config | ||
|
||
import groovy.transform.CompileStatic | ||
import nextflow.util.Duration | ||
|
||
|
||
@CompileStatic | ||
class RetryConfig { | ||
|
||
Duration delay = Duration.of('250ms') | ||
Duration maxDelay = Duration.of('90s') | ||
int maxAttempts = 10 | ||
double jitter = 0.25 | ||
|
||
RetryConfig(){ | ||
this(Collections.emptyMap()) | ||
} | ||
|
||
RetryConfig(Map config){ | ||
if( config.delay ) | ||
delay = config.delay as Duration | ||
if( config.maxDelay ) | ||
maxDelay = config.maxDelay as Duration | ||
if( config.maxAttempts ) | ||
maxAttempts = config.maxAttempts as int | ||
if( config.jitter ) | ||
jitter = config.jitter as double | ||
} | ||
} |
66 changes: 66 additions & 0 deletions
66
plugins/nf-nomad/src/main/nextflow/nomad/executor/FailsafeExecutor.groovy
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
package nextflow.nomad.executor | ||
|
||
import dev.failsafe.Failsafe | ||
import dev.failsafe.RetryPolicy | ||
import dev.failsafe.event.EventListener | ||
import dev.failsafe.event.ExecutionAttemptedEvent | ||
import dev.failsafe.function.CheckedSupplier | ||
import groovy.transform.CompileStatic | ||
import groovy.util.logging.Slf4j | ||
import io.nomadproject.client.ApiException | ||
import nextflow.nomad.config.RetryConfig | ||
|
||
import java.time.temporal.ChronoUnit | ||
import java.util.concurrent.TimeoutException | ||
import java.util.function.Predicate | ||
|
||
@Slf4j | ||
@CompileStatic | ||
class FailsafeExecutor { | ||
|
||
private RetryConfig config | ||
|
||
FailsafeExecutor(RetryConfig config){ | ||
this.config = config | ||
} | ||
|
||
protected <T> RetryPolicy<T> retryPolicy(Predicate<? extends Throwable> cond) { | ||
|
||
final listener = new EventListener<ExecutionAttemptedEvent<T>>() { | ||
@Override | ||
void accept(ExecutionAttemptedEvent<T> event) throws Throwable { | ||
log.debug("Nomad TooManyRequests response error - attempt: ${event.attemptCount}; reason: ${event.lastFailure.message}") | ||
} | ||
} | ||
return RetryPolicy.<T>builder() | ||
.handleIf(cond) | ||
.withBackoff(config.delay.toMillis(), config.maxDelay.toMillis(), ChronoUnit.MILLIS) | ||
.withMaxAttempts(config.maxAttempts) | ||
.withJitter(config.jitter) | ||
.onRetry(listener) | ||
.build() | ||
} | ||
|
||
final private static List<Integer> RETRY_CODES = List.of(408, 429, 500, 502, 503, 504) | ||
|
||
protected <T> T apply(CheckedSupplier<T> action) { | ||
// define the retry condition | ||
final cond = new Predicate<? extends Throwable>() { | ||
@Override | ||
boolean test(Throwable t) { | ||
if( t instanceof ApiException && t.code in RETRY_CODES ) | ||
return true | ||
if( t instanceof IOException || t.cause instanceof IOException ) | ||
return true | ||
if( t instanceof TimeoutException || t.cause instanceof TimeoutException ) | ||
return true | ||
return false | ||
} | ||
} | ||
// create the retry policy object | ||
final policy = retryPolicy(cond) | ||
// apply the action with | ||
return Failsafe.with(policy).get(action) | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -45,14 +45,14 @@ class NomadService implements Closeable{ | |
ApiClient apiClient | ||
JobsApi jobsApi | ||
VariablesApi variablesApi | ||
FailsafeExecutor safeExecutor | ||
|
||
NomadService(NomadConfig config) { | ||
this.config = config | ||
|
||
//TODO: Accommodate these connection level options in clientOpts() | ||
final CONNECTION_TIMEOUT_MILLISECONDS = 60000 | ||
final READ_TIMEOUT_MILLISECONDS = 60000 | ||
final WRITE_TIMEOUT_MILLISECONDS = 60000 | ||
final CONNECTION_TIMEOUT_MILLISECONDS = config.clientOpts().connectionTimeout | ||
final READ_TIMEOUT_MILLISECONDS = config.clientOpts().readTimeout | ||
final WRITE_TIMEOUT_MILLISECONDS = config.clientOpts().writeTimeout | ||
|
||
apiClient = new ApiClient( connectTimeout: CONNECTION_TIMEOUT_MILLISECONDS, readTimeout: READ_TIMEOUT_MILLISECONDS, writeTimeout: WRITE_TIMEOUT_MILLISECONDS) | ||
apiClient.basePath = config.clientOpts().address | ||
|
@@ -64,6 +64,8 @@ class NomadService implements Closeable{ | |
} | ||
this.jobsApi = new JobsApi(apiClient) | ||
this.variablesApi = new VariablesApi(apiClient) | ||
|
||
this.safeExecutor = new FailsafeExecutor(config.clientOpts().retryConfig) | ||
} | ||
|
||
|
||
|
@@ -96,8 +98,12 @@ class NomadService implements Closeable{ | |
} | ||
|
||
try { | ||
JobRegisterResponse jobRegisterResponse = jobsApi.registerJob(jobRegisterRequest, config.jobOpts().region, config.jobOpts().namespace, null, null) | ||
return jobRegisterResponse.evalID | ||
safeExecutor.apply { | ||
JobRegisterResponse jobRegisterResponse = jobsApi.registerJob(jobRegisterRequest, | ||
config.jobOpts().region, config.jobOpts().namespace, | ||
null, null) | ||
jobRegisterResponse.evalID | ||
} | ||
} catch (ApiException apiException) { | ||
log.debug("[NOMAD] Failed to submit ${job.name} -- Cause: ${apiException.responseBody ?: apiException}", apiException) | ||
throw new ProcessSubmitException("[NOMAD] Failed to submit ${job.name} -- Cause: ${apiException.responseBody ?: apiException}", apiException) | ||
|
@@ -110,7 +116,11 @@ class NomadService implements Closeable{ | |
|
||
String getJobState(String jobId){ | ||
try { | ||
List<AllocationListStub> allocations = jobsApi.getJobAllocations(jobId, config.jobOpts().region, config.jobOpts().namespace, null, null, null, null, null, null, null, null) | ||
List<AllocationListStub> allocations = safeExecutor.apply { | ||
jobsApi.getJobAllocations(jobId, config.jobOpts().region, config.jobOpts().namespace, | ||
null, null, null, null, null, null, | ||
null, null) | ||
} | ||
AllocationListStub last = allocations?.sort { | ||
it.modifyIndex | ||
}?.last() | ||
|
@@ -127,7 +137,10 @@ class NomadService implements Closeable{ | |
|
||
boolean checkIfRunning(String jobId){ | ||
try { | ||
Job job = jobsApi.getJob(jobId, config.jobOpts().region, config.jobOpts().namespace, null, null, null, null, null, null, null) | ||
Job job = safeExecutor.apply { | ||
jobsApi.getJob(jobId, config.jobOpts().region, config.jobOpts().namespace, | ||
null, null, null, null, null, null, null) | ||
} | ||
log.debug "[NOMAD] checkIfRunning jobID=$job.ID; status=$job.status" | ||
job.status == "running" | ||
}catch (Exception e){ | ||
|
@@ -138,7 +151,10 @@ class NomadService implements Closeable{ | |
|
||
boolean checkIfDead(String jobId){ | ||
try{ | ||
Job job = jobsApi.getJob(jobId, config.jobOpts().region, config.jobOpts().namespace, null, null, null, null, null, null, null) | ||
Job job = safeExecutor.apply { | ||
jobsApi.getJob(jobId, config.jobOpts().region, config.jobOpts().namespace, | ||
null, null, null, null, null, null, null) | ||
} | ||
log.debug "[NOMAD] checkIfDead jobID=$job.ID; status=$job.status" | ||
job.status == "dead" | ||
}catch (Exception e){ | ||
|
@@ -158,15 +174,22 @@ class NomadService implements Closeable{ | |
protected void purgeJob(String jobId, boolean purge){ | ||
log.debug "[NOMAD] purgeJob with jobId=${jobId}" | ||
try { | ||
jobsApi.deleteJob(jobId, config.jobOpts().region, config.jobOpts().namespace, null, null, purge, true) | ||
safeExecutor.apply { | ||
jobsApi.deleteJob(jobId, config.jobOpts().region, config.jobOpts().namespace, | ||
null, null, purge, true) | ||
} | ||
}catch(Exception e){ | ||
log.debug("[NOMAD] Failed to delete job ${jobId} -- Cause: ${e.message ?: e}", e) | ||
} | ||
} | ||
|
||
String getClientOfJob(String jobId) { | ||
try{ | ||
List<AllocationListStub> allocations = jobsApi.getJobAllocations(jobId, config.jobOpts().region, config.jobOpts().namespace, null, null, null, null, null, null, null, null) | ||
List<AllocationListStub> allocations = safeExecutor.apply { | ||
jobsApi.getJobAllocations(jobId, config.jobOpts().region, config.jobOpts().namespace, | ||
null, null, null, null, null, null, | ||
null, null) | ||
} | ||
if( !allocations ){ | ||
return null | ||
} | ||
|
@@ -183,10 +206,12 @@ class NomadService implements Closeable{ | |
} | ||
|
||
String getVariableValue(String path, String key){ | ||
var variable = variablesApi.getVariableQuery("$path/$key", | ||
config.jobOpts().region, | ||
config.jobOpts().namespace, | ||
null, null, null, null, null, null, null) | ||
var variable = safeExecutor.apply { | ||
variablesApi.getVariableQuery("$path/$key", | ||
config.jobOpts().region, | ||
config.jobOpts().namespace, | ||
null, null, null, null, null, null, null) | ||
} | ||
variable?.items?.find{ it.key == key }?.value | ||
} | ||
|
||
|
@@ -197,17 +222,22 @@ class NomadService implements Closeable{ | |
void setVariableValue(String path, String key, String value){ | ||
var content = Map.of(key,value) | ||
var variable = new Variable(path: path, items: content) | ||
variablesApi.postVariable("$path/$key", variable, | ||
config.jobOpts().region, | ||
config.jobOpts().namespace, | ||
null, null, null) | ||
safeExecutor.apply { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I really like this clean design @jagedn 🤩 |
||
variablesApi.postVariable("$path/$key", variable, | ||
config.jobOpts().region, | ||
config.jobOpts().namespace, | ||
null, null, null) | ||
} | ||
} | ||
|
||
List<String> getVariablesList(){ | ||
var listRequest = variablesApi.getVariablesListRequest( | ||
config.jobOpts().region, | ||
config.jobOpts().namespace, | ||
null, null, null, null, null, null, null) | ||
var listRequest = safeExecutor.apply { | ||
variablesApi.getVariablesListRequest( | ||
config.jobOpts().region, | ||
config.jobOpts().namespace, | ||
null, null, null, null, | ||
null, null, null) | ||
} | ||
String path = (config.jobOpts().secretOpts?.path ?: '')+"/" | ||
listRequest.collect{ it.path - path} | ||
} | ||
|
@@ -218,9 +248,11 @@ class NomadService implements Closeable{ | |
|
||
void deleteVariable(String path, String key){ | ||
var variable = new Variable( items: Map.of(key, "")) | ||
variablesApi.deleteVariable("$path/$key", variable, | ||
config.jobOpts().region, | ||
config.jobOpts().namespace, | ||
null, null, null) | ||
safeExecutor.apply { | ||
variablesApi.deleteVariable("$path/$key", variable, | ||
config.jobOpts().region, | ||
config.jobOpts().namespace, | ||
null, null, null) | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@jagedn, is there a specific dictionary/reference you used to focus only on these exit codes?
Maybe a good idea to add that in the comments or just explain the through process behind these exit codes.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
hahaha c&p from the azure plugin
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah, in that case these must be specific to the Azure Batch API 🤔
@tomiles @matthdsm @jhaezebr , are you aware of any Nomad specific error codes or does a Nomad client/server just propogate the task-and-OS level error code?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
don't think so, they are common http error codes
I've added descriptions for them
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Mmm, I was under the impression that nomad would have it's own set of error codes as well but judging from this issue hashicorp/nomad#17782 I think that for client/task errors it stores them in
exit code
Could be related to #77 🤔
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
yes, I agree but maybe better handle the ext code in the #77 and let this PR handle the infra/http errors
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sounds good - then let's merge and make an edge release :)