-
Notifications
You must be signed in to change notification settings - Fork 240
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[DH-4306] Split db methods for cardinality and logs (#269)
* [DH-4306] Split db methods for cardinality and logs * Improve query history method for Snowflake and BigQuery * Document query_history endpoint
- Loading branch information
Showing
17 changed files
with
386 additions
and
60 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
from bson.objectid import ObjectId | ||
|
||
from dataherald.db_scanner.models.types import QueryHistory | ||
|
||
DB_COLLECTION = "query_history" | ||
|
||
|
||
class QueryHistoryRepository:
    """Reads and writes ``QueryHistory`` records in the ``DB_COLLECTION`` collection."""

    def __init__(self, storage):
        self.storage = storage

    def insert(self, query_history: QueryHistory) -> QueryHistory:
        """Store ``query_history`` and return it with its new ``id`` set.

        The ``id`` field is excluded from the stored document and
        ``db_connection_id`` is stored as an ``ObjectId`` instead of a string.
        """
        document = query_history.dict(exclude={"id"})
        document["db_connection_id"] = ObjectId(query_history.db_connection_id)
        inserted_id = self.storage.insert_one(DB_COLLECTION, document)
        query_history.id = str(inserted_id)
        return query_history

    def find_by(
        self, query: dict, page: int = 1, limit: int = 10
    ) -> list[QueryHistory]:
        """Return one page of the records matching ``query`` as models.

        ``page`` and ``limit`` are forwarded unchanged to the storage layer.
        """

        def to_model(row) -> QueryHistory:
            # Stored identifiers are converted to plain strings before the
            # row is handed to the model constructor.
            row["id"] = str(row["_id"])
            row["db_connection_id"] = str(row["db_connection_id"])
            return QueryHistory(**row)

        stored_rows = self.storage.find(DB_COLLECTION, query, page=page, limit=limit)
        return [to_model(row) for row in stored_rows]
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
from abc import ABC, abstractmethod | ||
|
||
from sqlalchemy.sql.schema import Column | ||
|
||
from dataherald.db_scanner.models.types import QueryHistory | ||
from dataherald.sql_database.base import SQLDatabase | ||
|
||
|
||
class AbstractScanner(ABC):
    """Interface every database-specific scanner has to implement."""

    @abstractmethod
    def cardinality_values(self, column: Column, db_engine: SQLDatabase) -> list | None:
        """Return the distinct values of ``column`` when it is a catalog.

        A catalog is a column holding a small set of distinct values.
        Implementations return ``None`` for any other column.
        """

    @abstractmethod
    def get_logs(
        self, table: str, db_engine: SQLDatabase, db_connection_id: str
    ) -> list[QueryHistory]:
        """Return the list of query-log entries found for ``table``."""
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
import sqlalchemy | ||
from overrides import override | ||
from sqlalchemy.sql import func | ||
from sqlalchemy.sql.schema import Column | ||
|
||
from dataherald.db_scanner.models.types import QueryHistory | ||
from dataherald.db_scanner.services.abstract_scanner import AbstractScanner | ||
from dataherald.sql_database.base import SQLDatabase | ||
|
||
MIN_CATEGORY_VALUE = 1 | ||
MAX_CATEGORY_VALUE = 100 | ||
|
||
|
||
class BaseScanner(AbstractScanner):
    """Scanner that relies only on generic SQLAlchemy queries."""

    @override
    def cardinality_values(self, column: Column, db_engine: SQLDatabase) -> list | None:
        """Return the distinct values of ``column`` if it looks categorical.

        A column is treated as categorical when it has more than
        ``MIN_CATEGORY_VALUE`` and at most ``MAX_CATEGORY_VALUE`` distinct
        values; the values are returned as strings. Otherwise ``None`` is
        returned.
        """
        # Fetch one row more than the maximum so that "exactly the maximum"
        # can be told apart from "more than the maximum".
        cardinality_query = sqlalchemy.select([func.distinct(column)]).limit(
            MAX_CATEGORY_VALUE + 1
        )
        cardinality = db_engine.engine.execute(cardinality_query).fetchall()

        # The upper bound is inclusive, matching the other scanners; the
        # previous strict comparison wrongly rejected columns with exactly
        # MAX_CATEGORY_VALUE distinct values.
        if MIN_CATEGORY_VALUE < len(cardinality) <= MAX_CATEGORY_VALUE:
            return [str(category[0]) for category in cardinality]
        return None

    @override
    def get_logs(
        self, table: str, db_engine: SQLDatabase, db_connection_id: str  # noqa: ARG002
    ) -> list[QueryHistory]:
        """Return an empty list: this scanner does not collect query logs."""
        return []
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
from datetime import datetime, timedelta | ||
|
||
import sqlalchemy | ||
from overrides import override | ||
from sqlalchemy.sql import func | ||
from sqlalchemy.sql.schema import Column | ||
|
||
from dataherald.db_scanner.models.types import QueryHistory | ||
from dataherald.db_scanner.services.abstract_scanner import AbstractScanner | ||
from dataherald.sql_database.base import SQLDatabase | ||
|
||
MIN_CATEGORY_VALUE = 1 | ||
MAX_CATEGORY_VALUE = 100 | ||
MAX_LOGS = 5_000 | ||
|
||
|
||
class BigQueryScanner(AbstractScanner):
    """Scanner implementation for BigQuery."""

    @override
    def cardinality_values(self, column: Column, db_engine: SQLDatabase) -> list | None:
        """Return the distinct values of ``column`` if it looks categorical.

        ``APPROX_COUNT_DISTINCT`` is run first; the distinct values are only
        fetched (as strings) when that count is greater than
        ``MIN_CATEGORY_VALUE`` and at most ``MAX_CATEGORY_VALUE``. Otherwise
        ``None`` is returned.
        """
        count_sql = f"SELECT APPROX_COUNT_DISTINCT({column.name}) FROM {column.table.name}"  # noqa: S608 E501
        count_rows = db_engine.engine.execute(count_sql).fetchall()

        # Nothing to inspect when the count query yields no usable cell.
        if len(count_rows) == 0 or len(count_rows[0]) == 0:
            return None
        if not (MIN_CATEGORY_VALUE < count_rows[0][0] <= MAX_CATEGORY_VALUE):
            return None

        distinct_query = sqlalchemy.select([func.distinct(column)]).limit(101)
        distinct_rows = db_engine.engine.execute(distinct_query).fetchall()
        return [str(value_row[0]) for value_row in distinct_rows]

    @override
    def get_logs(
        self, table: str, db_engine: SQLDatabase, db_connection_id: str
    ) -> list[QueryHistory]:
        """Return finished SELECT queries from the last 90 days that reference ``table``.

        Rows are read from ``region-us.INFORMATION_SCHEMA.JOBS``, grouped by
        query text and user e-mail, ordered by number of occurrences
        (descending) and capped at ``MAX_LOGS``.
        """
        ninety_days_ago = datetime.now() - timedelta(days=90)
        filter_date = ninety_days_ago.strftime("%Y-%m-%d")
        logs_sql = f"SELECT query, user_email, count(*) as occurrences FROM `region-us.INFORMATION_SCHEMA.JOBS`, UNNEST(referenced_tables) AS t where job_type = 'QUERY' and statement_type = 'SELECT' and t.table_id = '{table}' and state = 'DONE' and creation_time >='{filter_date}' group by query, user_email ORDER BY occurrences DESC limit {MAX_LOGS}"  # noqa: S608 E501
        log_rows = db_engine.engine.execute(logs_sql).fetchall()

        history = []
        for log_row in log_rows:
            history.append(
                QueryHistory(
                    db_connection_id=db_connection_id,
                    table_name=table,
                    query=log_row[0],
                    user=log_row[1],
                    occurrences=log_row[2],
                )
            )
        return history
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
from overrides import override | ||
from sqlalchemy.sql.schema import Column | ||
|
||
from dataherald.db_scanner.models.types import QueryHistory | ||
from dataherald.db_scanner.services.abstract_scanner import AbstractScanner | ||
from dataherald.sql_database.base import SQLDatabase | ||
|
||
MIN_CATEGORY_VALUE = 1 | ||
MAX_CATEGORY_VALUE = 100 | ||
|
||
|
||
class PostgreSqlScanner(AbstractScanner):
    """Scanner implementation for PostgreSQL."""

    @override
    def cardinality_values(self, column: Column, db_engine: SQLDatabase) -> list | None:
        """Return ``most_common_vals`` for ``column`` if it looks categorical.

        The values come from ``pg_catalog.pg_stats`` and are returned only
        when ``n_distinct`` is greater than ``MIN_CATEGORY_VALUE`` and at most
        ``MAX_CATEGORY_VALUE``. ``None`` is returned when no statistics row
        exists or the count is outside that range.
        """
        stats_sql = f"SELECT n_distinct, most_common_vals::TEXT::TEXT[] FROM pg_catalog.pg_stats WHERE tablename = '{column.table.name}' AND attname = '{column.name}'"  # noqa: S608 E501
        stats_rows = db_engine.engine.execute(stats_sql).fetchall()

        if len(stats_rows) == 0:
            return None

        column_stats = stats_rows[0]
        if MIN_CATEGORY_VALUE < column_stats["n_distinct"] <= MAX_CATEGORY_VALUE:
            return column_stats["most_common_vals"]
        return None

    @override
    def get_logs(
        self, table: str, db_engine: SQLDatabase, db_connection_id: str  # noqa: ARG002
    ) -> list[QueryHistory]:
        """Return an empty list: this scanner does not collect query logs."""
        return []
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
from datetime import datetime, timedelta | ||
|
||
import sqlalchemy | ||
from overrides import override | ||
from sqlalchemy.sql import func | ||
from sqlalchemy.sql.schema import Column | ||
|
||
from dataherald.db_scanner.models.types import QueryHistory | ||
from dataherald.db_scanner.services.abstract_scanner import AbstractScanner | ||
from dataherald.sql_database.base import SQLDatabase | ||
|
||
MIN_CATEGORY_VALUE = 1 | ||
MAX_CATEGORY_VALUE = 100 | ||
MAX_LOGS = 5_000 | ||
|
||
|
||
class SnowflakeScanner(AbstractScanner):
    """Scanner implementation for Snowflake."""

    @override
    def cardinality_values(self, column: Column, db_engine: SQLDatabase) -> list | None:
        """Return the distinct values of ``column`` if it looks categorical.

        ``HLL(column)`` is queried first; the distinct values are only
        fetched (as strings) when that result is greater than
        ``MIN_CATEGORY_VALUE`` and at most ``MAX_CATEGORY_VALUE``. Otherwise
        ``None`` is returned.
        """
        count_sql = f"select HLL({column.name}) from {column.table.name}"  # noqa: S608 E501
        count_rows = db_engine.engine.execute(count_sql).fetchall()

        # Nothing to inspect when the count query yields no usable cell.
        if len(count_rows) == 0 or len(count_rows[0]) == 0:
            return None
        if not (MIN_CATEGORY_VALUE < count_rows[0][0] <= MAX_CATEGORY_VALUE):
            return None

        distinct_query = sqlalchemy.select([func.distinct(column)]).limit(101)
        distinct_rows = db_engine.engine.execute(distinct_query).fetchall()
        return [str(value_row[0]) for value_row in distinct_rows]

    @override
    def get_logs(
        self, table: str, db_engine: SQLDatabase, db_connection_id: str
    ) -> list[QueryHistory]:
        """Return successful SELECT queries from the last 90 days that mention ``table``.

        Rows are read from ``INFORMATION_SCHEMA.QUERY_HISTORY()`` for the
        database named in the engine URL (the part before the first ``/``),
        matched with ``like '%FROM {table}%'``, grouped by query text and
        user name, ordered by number of occurrences (descending) and capped
        at ``MAX_LOGS``. Queries that mention ``QUERY_HISTORY`` are excluded.
        """
        database_name = db_engine.engine.url.database.split("/")[0]
        ninety_days_ago = datetime.now() - timedelta(days=90)
        filter_date = ninety_days_ago.strftime("%Y-%m-%d")
        logs_sql = f"select QUERY_TEXT, USER_NAME, count(*) as occurrences from TABLE(INFORMATION_SCHEMA.QUERY_HISTORY()) where DATABASE_NAME = '{database_name}' and QUERY_TYPE = 'SELECT' and EXECUTION_STATUS = 'SUCCESS' and START_TIME > '{filter_date}' and QUERY_TEXT like '%FROM {table}%' and QUERY_TEXT not like '%QUERY_HISTORY%' group by QUERY_TEXT, USER_NAME ORDER BY occurrences DESC limit {MAX_LOGS}"  # noqa: S608 E501
        log_rows = db_engine.engine.execute(logs_sql).fetchall()

        history = []
        for log_row in log_rows:
            history.append(
                QueryHistory(
                    db_connection_id=db_connection_id,
                    table_name=table,
                    query=log_row[0],
                    user=log_row[1],
                    occurrences=log_row[2],
                )
            )
        return history
Oops, something went wrong.