Create concept index tables clustered by primary entity id (#1135)
dexamundsen authored Jan 23, 2025
1 parent 7ce376a commit dc5e139
Showing 24 changed files with 247 additions and 116 deletions.
7 changes: 7 additions & 0 deletions docs/generated/UNDERLAY_CONFIG.md
@@ -140,6 +140,13 @@ List of attributes grouped together for search optimization.

Order matters. Each entry is a list of attributes that are searched for together. For example, a search is typically performed on contig and position together.

### SZAttributeSearch.includeEntityMainColumns
**optional** boolean

Whether all columns in the entity main table should also be included in this search table. This improves performance when other attributes are also fetched while searching by these attributes.

*Default value:* `false`
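
The query-runner change later in this commit consults this flag when deciding whether a search-by-attributes table can serve a query on its own. A minimal sketch of that decision, assuming only the `includeEntityMainColumns()` and `getColumnNames()` accessors used in that change (the wrapper class and method here are hypothetical):

```java
import bio.terra.tanagra.underlay.indextable.ITEntitySearchByAttributes;
import java.util.HashSet;
import java.util.List;

/** Sketch only: can a search-by-attributes table answer a query without the entity main table? */
public final class SearchTableCoverage {
  private SearchTableCoverage() {}

  public static boolean coversQuery(
      ITEntitySearchByAttributes searchTable, List<String> selectFieldNames) {
    // The search table can stand in for the entity main table when it carries all entity main
    // columns (includeEntityMainColumns = true), or when it already contains every selected column.
    return searchTable.includeEntityMainColumns()
        || new HashSet<>(searchTable.getColumnNames()).containsAll(selectFieldNames);
  }
}
```

When neither condition holds, the query falls back to the entity main table, as the query-runner diff below shows.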

### SZAttributeSearch.includeNullValues
**optional** boolean

@@ -2,6 +2,7 @@

import bio.terra.tanagra.indexing.job.BigQueryJob;
import bio.terra.tanagra.indexing.job.dataflow.beam.BigQueryBeamUtils;
import bio.terra.tanagra.underlay.ColumnSchema;
import bio.terra.tanagra.underlay.entitymodel.Entity;
import bio.terra.tanagra.underlay.indextable.ITEntityMain;
import bio.terra.tanagra.underlay.indextable.ITEntitySearchByAttributes;
@@ -52,57 +53,69 @@ public JobStatus checkStatus() {

@Override
public void run(boolean isDryRun) {
List<Field> fields =
searchTable.getColumnSchemas().stream()
.map(
columnSchema ->
Field.newBuilder(
columnSchema.getColumnName(),
BigQueryBeamUtils.fromDataType(columnSchema.getDataType()))
.setMode(searchTable.includeNullValues() ? Mode.NULLABLE : Mode.REQUIRED)
.build())
.toList();

// Build a clustering specification.
Clustering clustering =
Clustering.newBuilder().setFields(searchTable.getAttributeNames()).build();

// Create an empty table with this schema.
TableId destinationTable =
TableId.of(
indexerConfig.bigQuery.indexData.projectId,
indexerConfig.bigQuery.indexData.datasetId,
getOutputTableName());
googleBigQuery.createTableFromSchema(destinationTable, Schema.of(fields), clustering, isDryRun);

// Build the query to insert to the search table using a select from the main entity table.
// Create table definition & build the query to insert into the search table
// using a select from the main entity table.
List<Field> fields = new ArrayList<>();
List<String> insertColumns = new ArrayList<>();
insertColumns.add(entity.getIdAttribute().getName());

List<String> selectColumns = new ArrayList<>();
selectColumns.add(entity.getIdAttribute().getName());

List<String> crossJoins = new ArrayList<>();
List<String> whereClauses = new ArrayList<>();

searchTable
.getAttributeNames()
.getColumnSchemas()
.forEach(
attribute -> {
colSchema -> {
String attribute = colSchema.getColumnName();
insertColumns.add(attribute);

if (entity.getAttribute(attribute).isDataTypeRepeated()) {
if (entityTable.getColumnSchemas().stream()
.filter(col -> col.getColumnName().equals(attribute))
.anyMatch(ColumnSchema::isDataTypeRepeated)
!= colSchema.isDataTypeRepeated()) {
// entityTable.repeated != searchTable.notRepeated
String alias = "flattened_" + attribute;
selectColumns.add(alias);
crossJoins.add(" CROSS JOIN UNNEST(" + attribute + ") AS " + alias);
} else {
selectColumns.add(attribute);
}

if (!searchTable.includeNullValues()) {
whereClauses.add(attribute + " IS NOT NULL");
Mode mode;
if (searchTable.getAttributeNames().contains(attribute)) {
if (searchTable.includeNullValues()) {
mode = Mode.NULLABLE;
} else {
mode = Mode.REQUIRED;
whereClauses.add(attribute + " IS NOT NULL");
}
} else {
// all other attributes
mode =
colSchema.isRequired()
? Mode.REQUIRED
: (colSchema.isDataTypeRepeated() ? Mode.REPEATED : Mode.NULLABLE);
}

fields.add(
Field.newBuilder(
colSchema.getColumnName(),
BigQueryBeamUtils.fromDataType(colSchema.getDataType()))
.setMode(mode)
.build());
});

// Build a clustering specification.
Clustering clustering =
Clustering.newBuilder().setFields(searchTable.getAttributeNames()).build();

// Create an empty table with this schema.
TableId destinationTable =
TableId.of(
indexerConfig.bigQuery.indexData.projectId,
indexerConfig.bigQuery.indexData.datasetId,
getOutputTableName());
googleBigQuery.createTableFromSchema(destinationTable, Schema.of(fields), clustering, isDryRun);

String whereSql =
whereClauses.isEmpty() ? StringUtils.EMPTY : " WHERE " + String.join(" AND ", whereClauses);

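For orientation, the insert columns, select columns, `CROSS JOIN UNNEST` clauses, and null-filtering `WHERE` clauses assembled above roughly combine into an `INSERT ... SELECT` from the entity main table into the clustered search table. The sketch below only approximates that shape; the actual SQL template sits past the visible portion of this diff and may differ:

```java
import java.util.List;
import org.apache.commons.lang3.StringUtils;

/** Sketch only: approximate shape of the populate query for the search table. */
public final class SearchTableInsertSqlSketch {
  private SearchTableInsertSqlSketch() {}

  public static String build(
      String searchTableName,
      String entityMainTableName,
      List<String> insertColumns,
      List<String> selectColumns,
      List<String> crossJoins,
      List<String> whereClauses) {
    // Only keep rows with non-null search attributes unless includeNullValues is set.
    String whereSql =
        whereClauses.isEmpty() ? StringUtils.EMPTY : " WHERE " + String.join(" AND ", whereClauses);
    return "INSERT INTO `"
        + searchTableName
        + "` ("
        + String.join(", ", insertColumns)
        + ") SELECT "
        + String.join(", ", selectColumns)
        + " FROM `"
        + entityMainTableName
        + "`"
        // Each cross join flattens a repeated entity-main column into scalar search values.
        + String.join("", crossJoins)
        + whereSql;
  }
}
```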
1 change: 1 addition & 0 deletions ui/src/tanagra-underlay/underlayConfig.ts
@@ -15,6 +15,7 @@ export type SZAttribute = {

export type SZAttributeSearch = {
attributes: string[];
includeEntityMainColumns?: boolean;
includeNullValues?: boolean;
};

@@ -57,8 +57,9 @@ public AttributeFilter(
this.values = ImmutableList.copyOf(values);
}

public Attribute getAttribute() {
return attribute;
@Override
public List<Attribute> getFilterAttributes() {
return List.of(attribute);
}

public UnaryOperator getUnaryOperator() {
@@ -1,6 +1,7 @@
package bio.terra.tanagra.api.filter;

import bio.terra.tanagra.exception.InvalidQueryException;
import bio.terra.tanagra.underlay.entitymodel.Attribute;
import bio.terra.tanagra.underlay.entitymodel.Entity;
import com.google.common.collect.ImmutableList;
import java.util.List;
@@ -43,6 +44,15 @@ private static Entity getSubFiltersEntity(List<EntityFilter> filters) {
return entity;
}

@Override
public List<Attribute> getFilterAttributes() {
return subFilters.stream()
.map(EntityFilter::getFilterAttributes)
.flatMap(List::stream)
.distinct()
.toList();
}

@Override
public boolean equals(Object o) {
if (!super.equals(o)) {
@@ -1,5 +1,7 @@
package bio.terra.tanagra.api.filter;

import bio.terra.tanagra.underlay.entitymodel.Attribute;
import java.util.List;
import java.util.Objects;
import org.slf4j.LoggerFactory;

@@ -18,6 +20,11 @@ public EntityFilter getSubFilter() {
return subFilter;
}

@Override
public List<Attribute> getFilterAttributes() {
return subFilter.getFilterAttributes();
}

@Override
public boolean equals(Object o) {
if (!super.equals(o)) {
@@ -1,6 +1,7 @@
package bio.terra.tanagra.api.filter;

import bio.terra.tanagra.underlay.Underlay;
import bio.terra.tanagra.underlay.entitymodel.Attribute;
import bio.terra.tanagra.underlay.entitymodel.Entity;
import java.util.List;
import java.util.Objects;
@@ -30,6 +31,11 @@ public Entity getEntity() {
return entity;
}

public List<Attribute> getFilterAttributes() {
// not supported or not implemented
return List.of();
}

// TODO: Add logic here to merge filters automatically to get a simpler filter overall.
public boolean isMergeable(EntityFilter entityFilter) {
return false;
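The base-class default above returns an empty list, so only filters that override `getFilterAttributes()` contribute attribute names. A minimal sketch of how a caller might feed those names into the search-table optimization used by the query runner later in this commit; `containsOptimizeSearchByAttributes` is the `Entity` method referenced there, while the helper class itself is hypothetical:

```java
import bio.terra.tanagra.api.filter.EntityFilter;
import bio.terra.tanagra.underlay.entitymodel.Attribute;
import bio.terra.tanagra.underlay.entitymodel.Entity;
import java.util.List;

/** Sketch only: route a query to a clustered search table when the filter's attributes match. */
public final class SearchTableRouting {
  private SearchTableRouting() {}

  public static boolean canUseSearchTable(Entity entity, EntityFilter filter) {
    // Filters that do not override getFilterAttributes() fall back to the base-class default
    // above and contribute an empty list.
    List<String> filterAttributeNames =
        filter.getFilterAttributes().stream().map(Attribute::getName).toList();
    return entity.containsOptimizeSearchByAttributes(filterAttributeNames);
  }
}
```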
@@ -1,9 +1,11 @@
package bio.terra.tanagra.api.filter;

import bio.terra.tanagra.underlay.Underlay;
import bio.terra.tanagra.underlay.entitymodel.Attribute;
import bio.terra.tanagra.underlay.entitymodel.Entity;
import bio.terra.tanagra.underlay.entitymodel.entitygroup.CriteriaOccurrence;
import jakarta.annotation.Nullable;
import java.util.List;
import java.util.Objects;
import org.slf4j.LoggerFactory;

@@ -25,6 +27,15 @@ public OccurrenceForPrimaryFilter(
this.criteriaSubFilter = criteriaSubFilter;
}

@Override
public List<Attribute> getFilterAttributes() {
Attribute attribute =
criteriaOccurrence
.getOccurrencePrimaryRelationship(getOccurrenceEntity().getName())
.getForeignKeyAttribute(getOccurrenceEntity());
return attribute != null ? List.of(attribute) : List.of();
}

public CriteriaOccurrence getCriteriaOccurrence() {
return criteriaOccurrence;
}
@@ -47,6 +47,12 @@ public RelationshipFilter(
this.groupByCountValue = groupByCountValue;
}

@Override
public List<Attribute> getFilterAttributes() {
Attribute attribute = relationship.getForeignKeyAttribute(getSelectEntity());
return attribute != null ? List.of(attribute) : List.of();
}

public EntityGroup getEntityGroup() {
return entityGroup;
}
@@ -4,6 +4,7 @@
import bio.terra.tanagra.underlay.entitymodel.Attribute;
import bio.terra.tanagra.underlay.entitymodel.Entity;
import jakarta.annotation.Nullable;
import java.util.List;
import java.util.Objects;
import org.slf4j.LoggerFactory;

@@ -30,13 +31,9 @@ public TextSearchFilter(
this.attribute = attribute;
}

public boolean isForSpecificAttribute() {
return attribute != null;
}

@Nullable
public Attribute getAttribute() {
return attribute;
@Override
public List<Attribute> getFilterAttributes() {
return (attribute != null) ? List.of(attribute) : List.of();
}

public TextSearchOperator getOperator() {
@@ -43,8 +43,7 @@ public BQExecutor(BQExecutorInfrastructure queryInfrastructure) {
}

public SqlQueryResult run(SqlQueryRequest queryRequest) {
// Log the SQL statement with parameters substituted locally (i.e. not by BQ) to help with
// debugging.
// Log the SQL statement with parameters substituted locally (i.e. not by BQ) for debugging.
String sqlNoParams = queryRequest.getSql();
for (String paramName : queryRequest.getSqlParams().getParamNamesLongestFirst()) {
sqlNoParams =
@@ -34,8 +34,9 @@
import bio.terra.tanagra.underlay.entitymodel.Attribute;
import bio.terra.tanagra.underlay.entitymodel.Entity;
import bio.terra.tanagra.underlay.indextable.ITEntityLevelDisplayHints;
import bio.terra.tanagra.underlay.indextable.ITEntityMain;
import bio.terra.tanagra.underlay.indextable.ITEntitySearchByAttributes;
import bio.terra.tanagra.underlay.indextable.ITInstanceLevelDisplayHints;
import bio.terra.tanagra.underlay.indextable.IndexTable;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.HashBasedTable;
import com.google.common.collect.Table;
@@ -48,6 +49,7 @@
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.commons.lang3.NotImplementedException;
import org.apache.commons.lang3.tuple.Pair;

@@ -395,17 +397,13 @@ private SqlQueryRequest buildQuerySqlAgainstIndexData(
BQApiTranslator bqTranslator = new BQApiTranslator();

// Build the list of entities we need for the select fields and filters.
Set<Entity> entities = new HashSet<>();
selectFields.forEach(selectField -> entities.add(selectField.getEntity()));
entities.addAll(filters.keySet());
Set<Entity> entities =
selectFields.stream().map(ValueDisplayField::getEntity).collect(Collectors.toSet());
if (entities.size() > 1) {
throw new NotImplementedException("Queries with more than one entity are not yet supported");
}

// Build the list of entity main tables we need to query or join.
Entity singleEntity = entities.iterator().next();
ITEntityMain entityMain = underlay.getIndexSchema().getEntityMain(singleEntity.getName());

List<String> selectFieldNames = new ArrayList<>();
List<String> selectFieldSqls = new ArrayList<>();
List<String> joinTableSqls = new ArrayList<>();
selectFields.forEach(
@@ -428,22 +426,47 @@ private SqlQueryRequest buildQuerySqlAgainstIndexData(
}

sqlQueryFields.forEach(
sqlQueryField -> selectFieldSqls.add(sqlQueryField.renderForSelect()));
sqlQueryField -> {
selectFieldNames.add(sqlQueryField.getField().getColumnName());
selectFieldSqls.add(sqlQueryField.renderForSelect());
});
});

// Build the list of entity tables we need to query or join.
Entity singleEntity = entities.iterator().next();
EntityFilter singleEntityFilter = filters.get(singleEntity);

// get a list of attributes filtered on, if implemented for the filter
List<String> filterAttributeNames =
singleEntityFilter != null
? singleEntityFilter.getFilterAttributes().stream().map(Attribute::getName).toList()
: List.of();

// default: use entityMain table
// check if: entity is optimized for search on filterAttributes &
// (either search table contains entityMain fields OR all selectFields)
IndexTable entityTable = underlay.getIndexSchema().getEntityMain(singleEntity.getName());
if (singleEntity.containsOptimizeSearchByAttributes(filterAttributeNames)) {
ITEntitySearchByAttributes searchTable =
underlay.getIndexSchema().getEntitySearchByAttributes(singleEntity, filterAttributeNames);
if (searchTable.includeEntityMainColumns()
|| new HashSet<>(searchTable.getColumnNames()).containsAll(selectFieldNames)) {
entityTable = searchTable;
}
}

// SELECT [select fields] FROM [entity main] JOIN [join tables]
sql.append("SELECT ")
.append(String.join(", ", selectFieldSqls))
.append(" FROM ")
.append(entityMain.getTablePointer().render());
.append(entityTable.getTablePointer().render());

// JOIN [join tables]
if (!joinTableSqls.isEmpty()) {
sql.append(" ").append(String.join(" ", joinTableSqls));
}

// WHERE [filter]
EntityFilter singleEntityFilter = filters.get(singleEntity);
if (singleEntityFilter != null) {
sql.append(" WHERE ")
.append(bqTranslator.translator(singleEntityFilter).buildSql(sqlParams, null));