Create concept index tables clustered by primary entity id (#1135)
dexamundsen authored Jan 23, 2025
1 parent 7ce376a commit dc5e139
Showing 24 changed files with 247 additions and 116 deletions.
7 changes: 7 additions & 0 deletions docs/generated/UNDERLAY_CONFIG.md
@@ -140,6 +140,13 @@ List of attributes grouped together for search optimization.

Order matters. Each entry is a list of attributes that are searched for together. For example, a search is typically performed on contig and position together.

### SZAttributeSearch.includeEntityMainColumns
**optional** boolean

Whether all columns in the entity main table should also be included in this search table. This improves performance when other attributes are also fetched while searching by these attributes.

*Default value:* `false`
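
The query-runner change later in this commit consults this flag when deciding whether a search-by-attributes table can serve a query on its own. A minimal sketch of that decision, assuming only the `includeEntityMainColumns()` and `getColumnNames()` accessors used in that change (the wrapper class and method here are hypothetical):

```java
import bio.terra.tanagra.underlay.indextable.ITEntitySearchByAttributes;
import java.util.HashSet;
import java.util.List;

/** Sketch only: can a search-by-attributes table answer a query without the entity main table? */
public final class SearchTableCoverage {
  private SearchTableCoverage() {}

  public static boolean coversQuery(
      ITEntitySearchByAttributes searchTable, List<String> selectFieldNames) {
    // The search table can stand in for the entity main table when it carries all entity main
    // columns (includeEntityMainColumns = true), or when it already contains every selected column.
    return searchTable.includeEntityMainColumns()
        || new HashSet<>(searchTable.getColumnNames()).containsAll(selectFieldNames);
  }
}
```

When neither condition holds, the query falls back to the entity main table, as the query-runner diff below shows.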

### SZAttributeSearch.includeNullValues
**optional** boolean

@@ -2,6 +2,7 @@

import bio.terra.tanagra.indexing.job.BigQueryJob;
import bio.terra.tanagra.indexing.job.dataflow.beam.BigQueryBeamUtils;
import bio.terra.tanagra.underlay.ColumnSchema;
import bio.terra.tanagra.underlay.entitymodel.Entity;
import bio.terra.tanagra.underlay.indextable.ITEntityMain;
import bio.terra.tanagra.underlay.indextable.ITEntitySearchByAttributes;
@@ -52,57 +53,69 @@ public JobStatus checkStatus() {

@Override
public void run(boolean isDryRun) {
List<Field> fields =
searchTable.getColumnSchemas().stream()
.map(
columnSchema ->
Field.newBuilder(
columnSchema.getColumnName(),
BigQueryBeamUtils.fromDataType(columnSchema.getDataType()))
.setMode(searchTable.includeNullValues() ? Mode.NULLABLE : Mode.REQUIRED)
.build())
.toList();

// Build a clustering specification.
Clustering clustering =
Clustering.newBuilder().setFields(searchTable.getAttributeNames()).build();

// Create an empty table with this schema.
TableId destinationTable =
TableId.of(
indexerConfig.bigQuery.indexData.projectId,
indexerConfig.bigQuery.indexData.datasetId,
getOutputTableName());
googleBigQuery.createTableFromSchema(destinationTable, Schema.of(fields), clustering, isDryRun);

// Build the query to insert to the search table using a select from the main entity table.
// Create table definition & build the query to insert into the search table
// using a select from the main entity table.
List<Field> fields = new ArrayList<>();
List<String> insertColumns = new ArrayList<>();
insertColumns.add(entity.getIdAttribute().getName());

List<String> selectColumns = new ArrayList<>();
selectColumns.add(entity.getIdAttribute().getName());

List<String> crossJoins = new ArrayList<>();
List<String> whereClauses = new ArrayList<>();

searchTable
.getAttributeNames()
.getColumnSchemas()
.forEach(
attribute -> {
colSchema -> {
String attribute = colSchema.getColumnName();
insertColumns.add(attribute);

if (entity.getAttribute(attribute).isDataTypeRepeated()) {
if (entityTable.getColumnSchemas().stream()
.filter(col -> col.getColumnName().equals(attribute))
.anyMatch(ColumnSchema::isDataTypeRepeated)
!= colSchema.isDataTypeRepeated()) {
// entityTable.repeated != searchTable.notRepeated
String alias = "flattened_" + attribute;
selectColumns.add(alias);
crossJoins.add(" CROSS JOIN UNNEST(" + attribute + ") AS " + alias);
} else {
selectColumns.add(attribute);
}

if (!searchTable.includeNullValues()) {
whereClauses.add(attribute + " IS NOT NULL");
Mode mode;
if (searchTable.getAttributeNames().contains(attribute)) {
if (searchTable.includeNullValues()) {
mode = Mode.NULLABLE;
} else {
mode = Mode.REQUIRED;
whereClauses.add(attribute + " IS NOT NULL");
}
} else {
// all other attributes
mode =
colSchema.isRequired()
? Mode.REQUIRED
: (colSchema.isDataTypeRepeated() ? Mode.REPEATED : Mode.NULLABLE);
}

fields.add(
Field.newBuilder(
colSchema.getColumnName(),
BigQueryBeamUtils.fromDataType(colSchema.getDataType()))
.setMode(mode)
.build());
});

// Build a clustering specification.
Clustering clustering =
Clustering.newBuilder().setFields(searchTable.getAttributeNames()).build();

// Create an empty table with this schema.
TableId destinationTable =
TableId.of(
indexerConfig.bigQuery.indexData.projectId,
indexerConfig.bigQuery.indexData.datasetId,
getOutputTableName());
googleBigQuery.createTableFromSchema(destinationTable, Schema.of(fields), clustering, isDryRun);

String whereSql =
whereClauses.isEmpty() ? StringUtils.EMPTY : " WHERE " + String.join(" AND ", whereClauses);

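For orientation, the insert columns, select columns, `CROSS JOIN UNNEST` clauses, and null-filtering `WHERE` clauses assembled above roughly combine into an `INSERT ... SELECT` from the entity main table into the clustered search table. The sketch below only approximates that shape; the actual SQL template sits past the visible portion of this diff and may differ:

```java
import java.util.List;
import org.apache.commons.lang3.StringUtils;

/** Sketch only: approximate shape of the populate query for the search table. */
public final class SearchTableInsertSqlSketch {
  private SearchTableInsertSqlSketch() {}

  public static String build(
      String searchTableName,
      String entityMainTableName,
      List<String> insertColumns,
      List<String> selectColumns,
      List<String> crossJoins,
      List<String> whereClauses) {
    // Only keep rows with non-null search attributes unless includeNullValues is set.
    String whereSql =
        whereClauses.isEmpty() ? StringUtils.EMPTY : " WHERE " + String.join(" AND ", whereClauses);
    return "INSERT INTO `"
        + searchTableName
        + "` ("
        + String.join(", ", insertColumns)
        + ") SELECT "
        + String.join(", ", selectColumns)
        + " FROM `"
        + entityMainTableName
        + "`"
        // Each cross join flattens a repeated entity-main column into scalar search values.
        + String.join("", crossJoins)
        + whereSql;
  }
}
```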
1 change: 1 addition & 0 deletions ui/src/tanagra-underlay/underlayConfig.ts
@@ -15,6 +15,7 @@ export type SZAttribute = {

export type SZAttributeSearch = {
attributes: string[];
includeEntityMainColumns?: boolean;
includeNullValues?: boolean;
};

@@ -57,8 +57,9 @@ public AttributeFilter(
this.values = ImmutableList.copyOf(values);
}

public Attribute getAttribute() {
return attribute;
@Override
public List<Attribute> getFilterAttributes() {
return List.of(attribute);
}

public UnaryOperator getUnaryOperator() {
@@ -1,6 +1,7 @@
package bio.terra.tanagra.api.filter;

import bio.terra.tanagra.exception.InvalidQueryException;
import bio.terra.tanagra.underlay.entitymodel.Attribute;
import bio.terra.tanagra.underlay.entitymodel.Entity;
import com.google.common.collect.ImmutableList;
import java.util.List;
@@ -43,6 +44,15 @@ private static Entity getSubFiltersEntity(List<EntityFilter> filters) {
return entity;
}

@Override
public List<Attribute> getFilterAttributes() {
return subFilters.stream()
.map(EntityFilter::getFilterAttributes)
.flatMap(List::stream)
.distinct()
.toList();
}

@Override
public boolean equals(Object o) {
if (!super.equals(o)) {
@@ -1,5 +1,7 @@
package bio.terra.tanagra.api.filter;

import bio.terra.tanagra.underlay.entitymodel.Attribute;
import java.util.List;
import java.util.Objects;
import org.slf4j.LoggerFactory;

@@ -18,6 +20,11 @@ public EntityFilter getSubFilter() {
return subFilter;
}

@Override
public List<Attribute> getFilterAttributes() {
return subFilter.getFilterAttributes();
}

@Override
public boolean equals(Object o) {
if (!super.equals(o)) {
@@ -1,6 +1,7 @@
package bio.terra.tanagra.api.filter;

import bio.terra.tanagra.underlay.Underlay;
import bio.terra.tanagra.underlay.entitymodel.Attribute;
import bio.terra.tanagra.underlay.entitymodel.Entity;
import java.util.List;
import java.util.Objects;
@@ -30,6 +31,11 @@ public Entity getEntity() {
return entity;
}

public List<Attribute> getFilterAttributes() {
// not supported or not implemented
return List.of();
}

// TODO: Add logic here to merge filters automatically to get a simpler filter overall.
public boolean isMergeable(EntityFilter entityFilter) {
return false;
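The base-class default above returns an empty list, so only filters that override `getFilterAttributes()` contribute attribute names. A minimal sketch of how a caller might feed those names into the search-table optimization used by the query runner later in this commit; `containsOptimizeSearchByAttributes` is the `Entity` method referenced there, while the helper class itself is hypothetical:

```java
import bio.terra.tanagra.api.filter.EntityFilter;
import bio.terra.tanagra.underlay.entitymodel.Attribute;
import bio.terra.tanagra.underlay.entitymodel.Entity;
import java.util.List;

/** Sketch only: route a query to a clustered search table when the filter's attributes match. */
public final class SearchTableRouting {
  private SearchTableRouting() {}

  public static boolean canUseSearchTable(Entity entity, EntityFilter filter) {
    // Filters that do not override getFilterAttributes() fall back to the base-class default
    // above and contribute an empty list.
    List<String> filterAttributeNames =
        filter.getFilterAttributes().stream().map(Attribute::getName).toList();
    return entity.containsOptimizeSearchByAttributes(filterAttributeNames);
  }
}
```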
@@ -1,9 +1,11 @@
package bio.terra.tanagra.api.filter;

import bio.terra.tanagra.underlay.Underlay;
import bio.terra.tanagra.underlay.entitymodel.Attribute;
import bio.terra.tanagra.underlay.entitymodel.Entity;
import bio.terra.tanagra.underlay.entitymodel.entitygroup.CriteriaOccurrence;
import jakarta.annotation.Nullable;
import java.util.List;
import java.util.Objects;
import org.slf4j.LoggerFactory;

@@ -25,6 +27,15 @@ public OccurrenceForPrimaryFilter(
this.criteriaSubFilter = criteriaSubFilter;
}

@Override
public List<Attribute> getFilterAttributes() {
Attribute attribute =
criteriaOccurrence
.getOccurrencePrimaryRelationship(getOccurrenceEntity().getName())
.getForeignKeyAttribute(getOccurrenceEntity());
return attribute != null ? List.of(attribute) : List.of();
}

public CriteriaOccurrence getCriteriaOccurrence() {
return criteriaOccurrence;
}
@@ -47,6 +47,12 @@ public RelationshipFilter(
this.groupByCountValue = groupByCountValue;
}

@Override
public List<Attribute> getFilterAttributes() {
Attribute attribute = relationship.getForeignKeyAttribute(getSelectEntity());
return attribute != null ? List.of(attribute) : List.of();
}

public EntityGroup getEntityGroup() {
return entityGroup;
}
@@ -4,6 +4,7 @@
import bio.terra.tanagra.underlay.entitymodel.Attribute;
import bio.terra.tanagra.underlay.entitymodel.Entity;
import jakarta.annotation.Nullable;
import java.util.List;
import java.util.Objects;
import org.slf4j.LoggerFactory;

@@ -30,13 +31,9 @@ public TextSearchFilter(
this.attribute = attribute;
}

public boolean isForSpecificAttribute() {
return attribute != null;
}

@Nullable
public Attribute getAttribute() {
return attribute;
@Override
public List<Attribute> getFilterAttributes() {
return (attribute != null) ? List.of(attribute) : List.of();
}

public TextSearchOperator getOperator() {
@@ -43,8 +43,7 @@ public BQExecutor(BQExecutorInfrastructure queryInfrastructure) {
}

public SqlQueryResult run(SqlQueryRequest queryRequest) {
// Log the SQL statement with parameters substituted locally (i.e. not by BQ) to help with
// debugging.
// Log the SQL statement with parameters substituted locally (i.e. not by BQ) for debugging.
String sqlNoParams = queryRequest.getSql();
for (String paramName : queryRequest.getSqlParams().getParamNamesLongestFirst()) {
sqlNoParams =
@@ -34,8 +34,9 @@
import bio.terra.tanagra.underlay.entitymodel.Attribute;
import bio.terra.tanagra.underlay.entitymodel.Entity;
import bio.terra.tanagra.underlay.indextable.ITEntityLevelDisplayHints;
import bio.terra.tanagra.underlay.indextable.ITEntityMain;
import bio.terra.tanagra.underlay.indextable.ITEntitySearchByAttributes;
import bio.terra.tanagra.underlay.indextable.ITInstanceLevelDisplayHints;
import bio.terra.tanagra.underlay.indextable.IndexTable;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.HashBasedTable;
import com.google.common.collect.Table;
@@ -48,6 +49,7 @@
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.commons.lang3.NotImplementedException;
import org.apache.commons.lang3.tuple.Pair;

@@ -395,17 +397,13 @@ private SqlQueryRequest buildQuerySqlAgainstIndexData(
BQApiTranslator bqTranslator = new BQApiTranslator();

// Build the list of entities we need for the select fields and filters.
Set<Entity> entities = new HashSet<>();
selectFields.forEach(selectField -> entities.add(selectField.getEntity()));
entities.addAll(filters.keySet());
Set<Entity> entities =
selectFields.stream().map(ValueDisplayField::getEntity).collect(Collectors.toSet());
if (entities.size() > 1) {
throw new NotImplementedException("Queries with more than one entity are not yet supported");
}

// Build the list of entity main tables we need to query or join.
Entity singleEntity = entities.iterator().next();
ITEntityMain entityMain = underlay.getIndexSchema().getEntityMain(singleEntity.getName());

List<String> selectFieldNames = new ArrayList<>();
List<String> selectFieldSqls = new ArrayList<>();
List<String> joinTableSqls = new ArrayList<>();
selectFields.forEach(
@@ -428,22 +426,47 @@ private SqlQueryRequest buildQuerySqlAgainstIndexData(
}

sqlQueryFields.forEach(
sqlQueryField -> selectFieldSqls.add(sqlQueryField.renderForSelect()));
sqlQueryField -> {
selectFieldNames.add(sqlQueryField.getField().getColumnName());
selectFieldSqls.add(sqlQueryField.renderForSelect());
});
});

// Build the list of entity tables we need to query or join.
Entity singleEntity = entities.iterator().next();
EntityFilter singleEntityFilter = filters.get(singleEntity);

// get a list of attributes filtered on, if implemented for the filter
List<String> filterAttributeNames =
singleEntityFilter != null
? singleEntityFilter.getFilterAttributes().stream().map(Attribute::getName).toList()
: List.of();

// default: use entityMain table
// check if: entity is optimized for search on filterAttributes &
// (either search table contains entityMain fields OR all selectFields)
IndexTable entityTable = underlay.getIndexSchema().getEntityMain(singleEntity.getName());
if (singleEntity.containsOptimizeSearchByAttributes(filterAttributeNames)) {
ITEntitySearchByAttributes searchTable =
underlay.getIndexSchema().getEntitySearchByAttributes(singleEntity, filterAttributeNames);
if (searchTable.includeEntityMainColumns()
|| new HashSet<>(searchTable.getColumnNames()).containsAll(selectFieldNames)) {
entityTable = searchTable;
}
}

// SELECT [select fields] FROM [entity main] JOIN [join tables]
sql.append("SELECT ")
.append(String.join(", ", selectFieldSqls))
.append(" FROM ")
.append(entityMain.getTablePointer().render());
.append(entityTable.getTablePointer().render());

// JOIN [join tables]
if (!joinTableSqls.isEmpty()) {
sql.append(" ").append(String.join(" ", joinTableSqls));
}

// WHERE [filter]
EntityFilter singleEntityFilter = filters.get(singleEntity);
if (singleEntityFilter != null) {
sql.append(" WHERE ")
.append(bqTranslator.translator(singleEntityFilter).buildSql(sqlParams, null));