From de020316036e6e339f1c3577ee25f9ebe61293c3 Mon Sep 17 00:00:00 2001
From: Sebastian Echegaray
Date: Mon, 24 Oct 2022 16:43:43 -0700
Subject: [PATCH] Added custom retry with backoff for error not covered in BQ client (400, please retry with backoff)

---
NOTE(review): this patch was re-flowed from a copy whose line breaks, indentation and
blank lines had been lost. Indentation and the position of blank context lines were
reconstructed from the hunk line counts and may not match the target file exactly;
run `git apply --check` against the target tree before relying on it. The author
e-mail address was missing from the copy and has not been restored.

 .../gcp/bigquery/action/BigQueryExecute.java  | 73 +++++++++++++++++--
 1 file changed, 65 insertions(+), 8 deletions(-)

diff --git a/src/main/java/io/cdap/plugin/gcp/bigquery/action/BigQueryExecute.java b/src/main/java/io/cdap/plugin/gcp/bigquery/action/BigQueryExecute.java
index aa23fd5f66..ed6c343261 100644
--- a/src/main/java/io/cdap/plugin/gcp/bigquery/action/BigQueryExecute.java
+++ b/src/main/java/io/cdap/plugin/gcp/bigquery/action/BigQueryExecute.java
@@ -48,8 +48,11 @@
 import org.slf4j.LoggerFactory;
 
 import java.io.IOException;
+import java.util.ArrayList;
 import java.util.Collections;
+import java.util.List;
 import java.util.Map;
+import java.util.function.Function;
 import javax.annotation.Nullable;
 
 /**
@@ -100,9 +103,6 @@ public void run(ActionContext context) throws Exception {
     // Enable legacy SQL
     builder.setUseLegacySql(config.isLegacySQL());
 
-    // Location must match that of the dataset(s) referenced in the query.
-    JobId jobId = JobId.newBuilder().setRandomJob().setLocation(config.getLocation()).build();
-
     // API request - starts the query.
     Credentials credentials = config.getServiceAccount() == null ?
       null : GCPUtils.loadServiceAccountCredentials(config.getServiceAccount(),
@@ -126,13 +126,19 @@ public void run(ActionContext context) throws Exception {
 
     QueryJobConfiguration queryConfig = builder.build();
 
-    Job queryJob = bigQuery.create(JobInfo.newBuilder(queryConfig).setJobId(jobId).build());
-    LOG.info("Executing SQL as job {}.", jobId.getJob());
-    LOG.debug("The BigQuery SQL is {}", config.getSql());
+    // The BigQuery client does not retry when a job clashes with another job, because the error is a 400,
+    // so an external retry strategy is applied for that case.
 
-    // Wait for the query to complete
-    queryJob.waitFor();
+    final String retryableStringPattern = "Retrying the job with back-off";
+    List<Function<BigQueryException, Boolean>> retryRules = new ArrayList<>();
+    // Both getters are null-checked so the rule cannot throw and hide the original error.
+    retryRules.add(
+      (BigQueryException e) -> e.getCode() == 400
+        && ((e.getMessage() != null && e.getMessage().contains(retryableStringPattern))
+        || (e.getReason() != null && e.getReason().contains(retryableStringPattern)))
+    );
 
+    Job queryJob = executeQueryJobWithCustomRetry(bigQuery, queryConfig, retryRules);
 
     // Check for errors
     if (queryJob.getStatus().getError() != null) {
@@ -169,6 +175,57 @@ public void run(ActionContext context) throws Exception {
     context.getMetrics().gauge(RECORDS_PROCESSED, rows);
   }
 
+  /**
+   * Creates the query job and waits for it to complete, submitting it again under a new random job id when
+   * the attempt fails with a {@link BigQueryException} accepted by one of the retry rules. Waits between
+   * attempts use exponential backoff with jitter, following: https://cloud.google.com/bigquery/sla
+   *
+   * @param bigQuery client used to create the job
+   * @param queryConfig configuration of the query to execute
+   * @param retryRules rules deciding whether a failure is retried; a failure accepted by no rule is rethrown
+   * @return the job created by the first attempt that completed without throwing
+   * @throws BigQueryException if the failure is not retryable or the maximum number of retries was reached
+   * @throws InterruptedException if the thread is interrupted while backing off
+   */
+  private Job executeQueryJobWithCustomRetry(BigQuery bigQuery, QueryJobConfiguration queryConfig,
+                                             List<Function<BigQueryException, Boolean>> retryRules) throws Exception {
+    // The longest amount of time to wait in-between retries, in seconds.
+    final int maximumBackoff = 32;
+
+    // The maximum number of retries.
+    final int maxRetries = 20;
+
+    int retries = 0;
+
+    while (true) {
+      try {
+        // Location must match that of the dataset(s) referenced in the query.
+        JobId jobId = JobId.newBuilder().setRandomJob().setLocation(config.getLocation()).build();
+        Job queryJob = bigQuery.create(JobInfo.newBuilder(queryConfig).setJobId(jobId).build());
+        LOG.info("Executing SQL as job {}.", jobId.getJob());
+        LOG.debug("The BigQuery SQL is {}", config.getSql());
+
+        // Wait for the query to complete
+        // NOTE(review): the Job returned by waitFor() is discarded and the one from create() is returned;
+        // confirm that the status check done by the caller is meant to see that object.
+        queryJob.waitFor();
+        return queryJob;
+      } catch (BigQueryException bigQueryException) {
+        if (retries >= maxRetries) {
+          LOG.error("Run out of retries while executing query with backoff.");
+          throw bigQueryException;
+        }
+        if (retryRules.stream().noneMatch((f -> f.apply(bigQueryException)))) {
+          throw bigQueryException;
+        }
+        LOG.warn("Received {} error from BigQuery, retrying...", bigQueryException.getMessage());
+        // Sleep min(2^retries, maximumBackoff) seconds plus up to one second of random jitter.
+        Thread.sleep(Math.round((Math.min(Math.pow(2, retries), maximumBackoff) + Math.random()) * 1000));
+        retries += 1;
+      }
+    }
+  }
+
   @Override
   public AbstractBigQueryActionConfig getConfig() {
     return config;