Pull request #90 · Open
Wants to merge 9 commits into base: main from m-kovalsky/linguisticschema

Changes from all commits
4 changes: 4 additions & 0 deletions src/sempy_labs/__init__.py
@@ -13,6 +13,7 @@
    get_semantic_model_bim,
)
from sempy_labs._list_functions import (
    list_synonyms,
    list_reports_using_semantic_model,
    delete_custom_pool,
    list_semantic_model_objects,
@@ -57,6 +58,7 @@
)

from sempy_labs._helper_functions import (
    generate_synonyms,
    resolve_workspace_capacity,
    create_abfss_path,
    format_dax_object_name,
@@ -205,4 +207,6 @@
"resolve_capacity_name",
"run_model_bpa_bulk",
"create_model_bpa_semantic_model",
"list_synonyms",
"generate_synonyms",
]
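
Both helpers are re-exported at the package root, so downstream code can import them directly. A minimal sketch, assuming the package is installed in the target environment:

# Hedged sketch: the two new public helpers become importable from the package root.
from sempy_labs import list_synonyms, generate_synonyms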
30 changes: 30 additions & 0 deletions src/sempy_labs/_helper_functions.py
@@ -850,3 +850,33 @@ def pagination(client, response):
        continuation_uri = response_json.get("continuationUri")

    return responses


def validate_weight(weight: float):

    if weight is not None and (weight <= 0 or weight >= 1):
        raise ValueError(
            f"{icons.red_dot} Invalid weight parameter. Weight must be a value between 0 and 1."
        )


def generate_synonyms(word: str) -> List[str]:

    import nltk
    from nltk.corpus import wordnet
    from nltk.data import find

    # Download the WordNet corpus on first use; subsequent calls find it locally.
    try:
        find("corpora/wordnet.zip")
    except Exception:
        nltk.download("wordnet")

    synonyms = wordnet.synsets(word)
    synonym_list = []
    for syn in synonyms:
        for lemma in syn.lemmas():
            syn_word = lemma.name()
            if syn_word != word and "_" not in syn_word:
                synonym_list.append(syn_word)

    return synonym_list
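
A quick usage sketch for the new WordNet helper; the example word is arbitrary and the printed result depends on the locally installed corpus (the first call may trigger a one-time nltk download):

# Hedged example -- output varies with the installed WordNet corpus version.
from sempy_labs import generate_synonyms

print(generate_synonyms("revenue"))
# Returns single-word lemmas only: multi-word lemmas (containing "_") and the word itself are skipped.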
85 changes: 84 additions & 1 deletion src/sempy_labs/_list_functions.py
@@ -12,12 +12,13 @@
import pandas as pd
import base64
import requests
import time
import json
from pyspark.sql import SparkSession
from typing import Optional
from sempy._utils._log import log
import sempy_labs._icons as icons
from sempy.fabric.exceptions import FabricHTTPException
from collections import defaultdict


def get_object_level_security(
@@ -2578,3 +2579,85 @@ def list_reports_using_semantic_model(
        df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True)

    return df


@log
def list_synonyms(dataset: str, workspace: Optional[str] = None):

    from sempy_labs.tom import connect_semantic_model

    workspace = fabric.resolve_workspace_name(workspace)

    df = pd.DataFrame(
        columns=[
            "Culture Name",
            "Table Name",
            "Object Name",
            "Object Type",
            "Synonym",
            "Type",
            "State",
            "Weight",
            "Last Modified",
        ]
    )

    with connect_semantic_model(
        dataset=dataset, workspace=workspace, readonly=True
    ) as tom:
        for c in tom.model.Cultures:
            if c.LinguisticMetadata is not None:
                lm = json.loads(c.LinguisticMetadata.Content)
                if "Entities" in lm:
                    # Default to {} (not []) so .items() stays safe if "Entities" is absent.
                    for k, v in lm.get("Entities", {}).items():
                        binding = v.get("Definition", {}).get("Binding", {})

                        t_name = binding.get("ConceptualEntity")
                        object_name = binding.get("ConceptualProperty")

                        if object_name is None:
                            object_type = "Table"
                            object_name = t_name
                        elif any(
                            m.Name == object_name and m.Parent.Name == t_name
                            for m in tom.all_measures()
                        ):
                            object_type = "Measure"
                        elif any(
                            m.Name == object_name and m.Parent.Name == t_name
                            for m in tom.all_columns()
                        ):
                            object_type = "Column"
                        elif any(
                            m.Name == object_name and m.Parent.Name == t_name
                            for m in tom.all_hierarchies()
                        ):
                            object_type = "Hierarchy"
                        else:
                            # Guard against an unmatched binding leaving object_type undefined.
                            object_type = None

                        merged_terms = defaultdict(dict)
                        for t in v.get("Terms", []):
                            for term, properties in t.items():
                                normalized_term = term.lower()
                                merged_terms[normalized_term].update(properties)

                        for term, props in merged_terms.items():
                            new_data = {
                                "Culture Name": lm.get("Language"),
                                "Table Name": t_name,
                                "Object Name": object_name,
                                "Object Type": object_type,
                                "Synonym": term,
                                "Type": props.get("Type"),
                                "State": props.get("State"),
                                "Weight": props.get("Weight"),
                                "Last Modified": props.get("LastModified"),
                            }
                            df = pd.concat(
                                [df, pd.DataFrame(new_data, index=[0])],
                                ignore_index=True,
                            )

    df["Weight"] = df["Weight"].fillna(0).astype(float)
    df["Last Modified"] = pd.to_datetime(df["Last Modified"])

    return df
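
For orientation, a hypothetical call against a model that carries linguistic metadata; the dataset and workspace names below are placeholders, not part of this PR:

# Hedged example -- "AdventureWorks" and "Sales" are placeholder names.
from sempy_labs import list_synonyms

df = list_synonyms(dataset="AdventureWorks", workspace="Sales")
print(df[["Table Name", "Object Name", "Synonym", "Weight"]].head())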
10 changes: 6 additions & 4 deletions src/sempy_labs/directlake/_guardrails.py
@@ -52,14 +52,16 @@ def get_sku_size(workspace: Optional[str] = None) -> str:
    if len(dfW) == 0:
        raise ValueError(f"{icons.red_dot} The '{workspace}' is not a valid workspace.")

    capacity_id = dfW['Capacity Id'].iloc[0]
    capacity_id = dfW["Capacity Id"].iloc[0]
    dfC = fabric.list_capacities()
    dfC_filt = dfC[dfC['Id'] == capacity_id]
    dfC_filt = dfC[dfC["Id"] == capacity_id]

    if len(dfC_filt) == 0:
        raise ValueError(f"{icons.red_dot} The '{capacity_id}' Id is not a valid capacity Id.")
        raise ValueError(
            f"{icons.red_dot} The '{capacity_id}' Id is not a valid capacity Id."
        )

    return dfC_filt['Sku'].iloc[0]
    return dfC_filt["Sku"].iloc[0]


def get_directlake_guardrails_for_sku(sku_size: str) -> pd.DataFrame:
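The two functions in this module chain naturally, which is why the quote-style cleanup touches both. A hedged sketch, assuming both are exported from the directlake subpackage and using a placeholder workspace name:

# Resolve the workspace's capacity SKU, then look up its Direct Lake guardrails.
from sempy_labs.directlake import get_sku_size, get_directlake_guardrails_for_sku

sku = get_sku_size(workspace="Sales")
guardrails = get_directlake_guardrails_for_sku(sku)
print(guardrails)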
38 changes: 15 additions & 23 deletions src/sempy_labs/lakehouse/_get_lakehouse_tables.py
@@ -114,14 +114,14 @@ def get_lakehouse_tables(
    sku_value = get_sku_size(workspace)
    guardrail = get_directlake_guardrails_for_sku(sku_value)
    spark = SparkSession.builder.getOrCreate()
    df['Files'] = None
    df['Row Groups'] = None
    df['Table Size'] = None
    df["Files"] = None
    df["Row Groups"] = None
    df["Table Size"] = None
    if count_rows:
        df['Row Count'] = None
        df["Row Count"] = None
    for i, r in df.iterrows():
        tName = r['Table Name']
        if r['Type'] == 'Managed' and r['Format'] == 'delta':
        tName = r["Table Name"]
        if r["Type"] == "Managed" and r["Format"] == "delta":
            detail_df = spark.sql(f"DESCRIBE DETAIL `{tName}`").collect()[0]
            num_files = detail_df.numFiles
            size_in_bytes = detail_df.sizeInBytes
@@ -141,36 +141,28 @@
                        ).num_row_groups
                    except FileNotFoundError:
                        continue
            df.at[i, 'Files'] = num_files
            df.at[i, 'Row Groups'] = num_rowgroups
            df.at[i, 'Table Size'] = size_in_bytes
            df.at[i, "Files"] = num_files
            df.at[i, "Row Groups"] = num_rowgroups
            df.at[i, "Table Size"] = size_in_bytes
        if count_rows:
            num_rows = spark.table(tName).count()
            df.at[i, 'Row Count'] = num_rows
            df.at[i, "Row Count"] = num_rows

    if extended:
        intColumns = ["Files", "Row Groups", "Table Size"]
        df[intColumns] = df[intColumns].astype(int)
        df["SKU"] = guardrail["Fabric SKUs"].iloc[0]
        df["Parquet File Guardrail"] = guardrail[
            "Parquet files per table"
        ].iloc[0]
        df["Parquet File Guardrail"] = guardrail["Parquet files per table"].iloc[0]
        df["Row Group Guardrail"] = guardrail["Row groups per table"].iloc[0]
        df["Row Count Guardrail"] = (
            guardrail["Rows per table (millions)"].iloc[0] * 1000000
        )

        df["Parquet File Guardrail Hit"] = (
            df["Files"] > df["Parquet File Guardrail"]
        )
        df["Row Group Guardrail Hit"] = (
            df["Row Groups"] > df["Row Group Guardrail"]
        )
        df["Parquet File Guardrail Hit"] = df["Files"] > df["Parquet File Guardrail"]
        df["Row Group Guardrail Hit"] = df["Row Groups"] > df["Row Group Guardrail"]
        if count_rows:
            df['Row Count'] = df['Row Count'].astype(int)
            df["Row Count Guardrail Hit"] = (
                df["Row Count"] > df["Row Count Guardrail"]
            )
            df["Row Count"] = df["Row Count"].astype(int)
            df["Row Count Guardrail Hit"] = df["Row Count"] > df["Row Count Guardrail"]

    if export:
        lakeAttach = lakehouse_attached()
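
A sketch of how the guardrail columns computed above are typically consumed; the lakehouse name is a placeholder and the parameter names assume the function's existing signature:

# Hedged example: flag tables that exceed any Direct Lake guardrail for the SKU.
from sempy_labs.lakehouse import get_lakehouse_tables

df = get_lakehouse_tables(lakehouse="MyLakehouse", extended=True, count_rows=True)
flagged = df[
    df["Parquet File Guardrail Hit"]
    | df["Row Group Guardrail Hit"]
    | df["Row Count Guardrail Hit"]
]
print(flagged[["Table Name", "Files", "Row Groups", "Row Count"]])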
@@ -1,7 +1,5 @@
import sempy.fabric as fabric
import pandas as pd
import datetime
import time
from sempy_labs._list_functions import list_tables
from sempy_labs.directlake._get_shared_expression import get_shared_expression
from sempy_labs._helper_functions import resolve_lakehouse_name, retry