-
Notifications
You must be signed in to change notification settings - Fork 58
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
added optimize delta tables to support all delta tables #386
base: main
Are you sure you want to change the base?
Changes from 4 commits
6ca13cc
286a5ff
5aff7de
87415d9
a3fc2ec
f10ccaf
fa5fa14
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
import pandas as pd | ||
from typing import Optional, Union, List | ||
from uuid import UUID | ||
from sempy_labs._helper_functions import ( | ||
resolve_workspace_name_and_id, | ||
resolve_lakehouse_name_and_id, | ||
resolve_item_name_and_id, | ||
create_abfss_path, | ||
) | ||
from tqdm.auto import tqdm | ||
|
||
|
||
def optimize_delta_tables( | ||
tables: Optional[Union[str, List[str]]] = None, | ||
source: Optional[str | UUID] = None, | ||
source_type: str = "Lakehouse", | ||
workspace: Optional[str | UUID] = None, | ||
): | ||
""" | ||
Runs the `OPTIMIZE <https://docs.delta.io/latest/optimizations-oss.html>`_ function over the specified delta tables. | ||
|
||
Parameters | ||
---------- | ||
tables : str | List[str], default=None | ||
The table(s) to optimize. | ||
Defaults to None which resovles to optimizing all tables within the lakehouse. | ||
source : str | uuid.UUID, default=None | ||
The source location of the delta table (i.e. lakehouse). | ||
Defaults to None which resolves to the lakehouse attached to the notebook. | ||
source_type : str, default="Lakehouse" | ||
The source type (i.e. "Lakehouse", "SemanticModel") | ||
workspace : str | uuid.UUID, default=None | ||
The Fabric workspace name or ID used by the lakehouse. | ||
Defaults to None which resolves to the workspace of the attached lakehouse | ||
or if no lakehouse attached, resolves to the workspace of the notebook. | ||
""" | ||
|
||
from pyspark.sql import SparkSession | ||
from sempy_labs.lakehouse._get_lakehouse_tables import get_lakehouse_tables | ||
from delta import DeltaTable | ||
|
||
(workspace_name, workspace_id) = resolve_workspace_name_and_id(workspace) | ||
|
||
if source is None: | ||
(item_name, item_id) = resolve_lakehouse_name_and_id() | ||
else: | ||
(item_name, item_id) = resolve_item_name_and_id( | ||
item=source, type=source_type, workspace=workspace_id | ||
) | ||
|
||
if isinstance(tables, str): | ||
tables = [tables] | ||
|
||
if source_type == "Lakehouse": | ||
dfL = get_lakehouse_tables(lakehouse=item_name, workspace=workspace_id) | ||
dfL_delta = dfL[dfL["Format"] == "delta"] | ||
|
||
if tables is not None: | ||
delta_tables = dfL_delta[dfL_delta["Table Name"].isin(tables)] | ||
else: | ||
delta_tables = dfL_delta.copy() | ||
else: | ||
data = [] | ||
for t in tables: | ||
new_data = { | ||
"Table Name": t, | ||
"Location": create_abfss_path(workspace_id, item_id, t), | ||
} | ||
data.append(new_data) | ||
|
||
delta_tables = pd.DataFrame(data) | ||
|
||
spark = SparkSession.builder.getOrCreate() | ||
|
||
for _, r in (bar := tqdm(delta_tables.iterrows())): | ||
tableName = r["Table Name"] | ||
tablePath = r["Location"] | ||
bar.set_description(f"Optimizing the '{tableName}' table...") | ||
deltaTable = DeltaTable.forPath(spark, tablePath) | ||
deltaTable.optimize().executeCompaction() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why would you use the notebook instead of this? https://learn.microsoft.com/en-us/rest/api/fabric/lakehouse/background-jobs/run-on-demand-table-maintenance?tabs=HTTP There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. removed this function for now There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. function is still there? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. removed |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In general, I'd recommend using urllib.parse: https://docs.python.org/3/library/urllib.parse.html
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
fixed