Pull request #90 · Open
Wants to merge 9 commits into base: main from m-kovalsky/linguisticschema

Changes from all commits
4 changes: 4 additions & 0 deletions src/sempy_labs/__init__.py
@@ -13,6 +13,7 @@
    get_semantic_model_bim,
)
from sempy_labs._list_functions import (
    list_synonyms,
    list_reports_using_semantic_model,
    delete_custom_pool,
    list_semantic_model_objects,
@@ -57,6 +58,7 @@
)

from sempy_labs._helper_functions import (
    generate_synonyms,
    resolve_workspace_capacity,
    create_abfss_path,
    format_dax_object_name,
@@ -205,4 +207,6 @@
"resolve_capacity_name",
"run_model_bpa_bulk",
"create_model_bpa_semantic_model",
"list_synonyms",
"generate_synonyms",
]
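
Both helpers are re-exported at the package root, so downstream code can import them directly. A minimal sketch, assuming the package is installed in the target environment:

# Hedged sketch: the two new public helpers become importable from the package root.
from sempy_labs import list_synonyms, generate_synonyms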
30 changes: 30 additions & 0 deletions src/sempy_labs/_helper_functions.py
@@ -850,3 +850,33 @@ def pagination(client, response):
        continuation_uri = response_json.get("continuationUri")

    return responses


def validate_weight(weight: float):

    if weight is not None and (weight <= 0 or weight >= 1):
        raise ValueError(
            f"{icons.red_dot} Invalid weight parameter. Weight must be a value between 0 and 1."
        )


def generate_synonyms(word: str) -> List[str]:

    import nltk
    from nltk.corpus import wordnet
    from nltk.data import find

    # Download the WordNet corpus on first use; subsequent calls find it locally.
    try:
        find("corpora/wordnet.zip")
    except Exception:
        nltk.download("wordnet")

    synonyms = wordnet.synsets(word)
    synonym_list = []
    for syn in synonyms:
        for lemma in syn.lemmas():
            syn_word = lemma.name()
            if syn_word != word and "_" not in syn_word:
                synonym_list.append(syn_word)

    return synonym_list
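
A quick usage sketch for the new WordNet helper; the example word is arbitrary and the printed result depends on the locally installed corpus (the first call may trigger a one-time nltk download):

# Hedged example -- output varies with the installed WordNet corpus version.
from sempy_labs import generate_synonyms

print(generate_synonyms("revenue"))
# Returns single-word lemmas only: multi-word lemmas (containing "_") and the word itself are skipped.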
85 changes: 84 additions & 1 deletion src/sempy_labs/_list_functions.py
@@ -12,12 +12,13 @@
import pandas as pd
import base64
import requests
import time
import json
from pyspark.sql import SparkSession
from typing import Optional
from sempy._utils._log import log
import sempy_labs._icons as icons
from sempy.fabric.exceptions import FabricHTTPException
from collections import defaultdict


def get_object_level_security(
@@ -2578,3 +2579,85 @@ def list_reports_using_semantic_model(
        df = pd.concat([df, pd.DataFrame(new_data, index=[0])], ignore_index=True)

    return df


@log
def list_synonyms(dataset: str, workspace: Optional[str] = None):

    from sempy_labs.tom import connect_semantic_model

    workspace = fabric.resolve_workspace_name(workspace)

    df = pd.DataFrame(
        columns=[
            "Culture Name",
            "Table Name",
            "Object Name",
            "Object Type",
            "Synonym",
            "Type",
            "State",
            "Weight",
            "Last Modified",
        ]
    )

    with connect_semantic_model(
        dataset=dataset, workspace=workspace, readonly=True
    ) as tom:
        for c in tom.model.Cultures:
            if c.LinguisticMetadata is not None:
                lm = json.loads(c.LinguisticMetadata.Content)
                if "Entities" in lm:
                    # Default to {} (not []) so .items() stays safe if "Entities" is absent.
                    for k, v in lm.get("Entities", {}).items():
                        binding = v.get("Definition", {}).get("Binding", {})

                        t_name = binding.get("ConceptualEntity")
                        object_name = binding.get("ConceptualProperty")

                        if object_name is None:
                            object_type = "Table"
                            object_name = t_name
                        elif any(
                            m.Name == object_name and m.Parent.Name == t_name
                            for m in tom.all_measures()
                        ):
                            object_type = "Measure"
                        elif any(
                            m.Name == object_name and m.Parent.Name == t_name
                            for m in tom.all_columns()
                        ):
                            object_type = "Column"
                        elif any(
                            m.Name == object_name and m.Parent.Name == t_name
                            for m in tom.all_hierarchies()
                        ):
                            object_type = "Hierarchy"
                        else:
                            # Guard against an unmatched binding leaving object_type undefined.
                            object_type = None

                        merged_terms = defaultdict(dict)
                        for t in v.get("Terms", []):
                            for term, properties in t.items():
                                normalized_term = term.lower()
                                merged_terms[normalized_term].update(properties)

                        for term, props in merged_terms.items():
                            new_data = {
                                "Culture Name": lm.get("Language"),
                                "Table Name": t_name,
                                "Object Name": object_name,
                                "Object Type": object_type,
                                "Synonym": term,
                                "Type": props.get("Type"),
                                "State": props.get("State"),
                                "Weight": props.get("Weight"),
                                "Last Modified": props.get("LastModified"),
                            }
                            df = pd.concat(
                                [df, pd.DataFrame(new_data, index=[0])],
                                ignore_index=True,
                            )

    df["Weight"] = df["Weight"].fillna(0).astype(float)
    df["Last Modified"] = pd.to_datetime(df["Last Modified"])

    return df
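
For orientation, a hypothetical call against a model that carries linguistic metadata; the dataset and workspace names below are placeholders, not part of this PR:

# Hedged example -- "AdventureWorks" and "Sales" are placeholder names.
from sempy_labs import list_synonyms

df = list_synonyms(dataset="AdventureWorks", workspace="Sales")
print(df[["Table Name", "Object Name", "Synonym", "Weight"]].head())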
10 changes: 6 additions & 4 deletions src/sempy_labs/directlake/_guardrails.py
@@ -52,14 +52,16 @@ def get_sku_size(workspace: Optional[str] = None) -> str:
    if len(dfW) == 0:
        raise ValueError(f"{icons.red_dot} The '{workspace}' is not a valid workspace.")

    capacity_id = dfW['Capacity Id'].iloc[0]
    capacity_id = dfW["Capacity Id"].iloc[0]
    dfC = fabric.list_capacities()
    dfC_filt = dfC[dfC['Id'] == capacity_id]
    dfC_filt = dfC[dfC["Id"] == capacity_id]

    if len(dfC_filt) == 0:
        raise ValueError(f"{icons.red_dot} The '{capacity_id}' Id is not a valid capacity Id.")
        raise ValueError(
            f"{icons.red_dot} The '{capacity_id}' Id is not a valid capacity Id."
        )

    return dfC_filt['Sku'].iloc[0]
    return dfC_filt["Sku"].iloc[0]


def get_directlake_guardrails_for_sku(sku_size: str) -> pd.DataFrame:
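The two functions in this module chain naturally, which is why the quote-style cleanup touches both. A hedged sketch, assuming both are exported from the directlake subpackage and using a placeholder workspace name:

# Resolve the workspace's capacity SKU, then look up its Direct Lake guardrails.
from sempy_labs.directlake import get_sku_size, get_directlake_guardrails_for_sku

sku = get_sku_size(workspace="Sales")
guardrails = get_directlake_guardrails_for_sku(sku)
print(guardrails)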
38 changes: 15 additions & 23 deletions src/sempy_labs/lakehouse/_get_lakehouse_tables.py
@@ -114,14 +114,14 @@ def get_lakehouse_tables(
    sku_value = get_sku_size(workspace)
    guardrail = get_directlake_guardrails_for_sku(sku_value)
    spark = SparkSession.builder.getOrCreate()
    df['Files'] = None
    df['Row Groups'] = None
    df['Table Size'] = None
    df["Files"] = None
    df["Row Groups"] = None
    df["Table Size"] = None
    if count_rows:
        df['Row Count'] = None
        df["Row Count"] = None
    for i, r in df.iterrows():
        tName = r['Table Name']
        if r['Type'] == 'Managed' and r['Format'] == 'delta':
        tName = r["Table Name"]
        if r["Type"] == "Managed" and r["Format"] == "delta":
            detail_df = spark.sql(f"DESCRIBE DETAIL `{tName}`").collect()[0]
            num_files = detail_df.numFiles
            size_in_bytes = detail_df.sizeInBytes
@@ -141,36 +141,28 @@
                        ).num_row_groups
                    except FileNotFoundError:
                        continue
            df.at[i, 'Files'] = num_files
            df.at[i, 'Row Groups'] = num_rowgroups
            df.at[i, 'Table Size'] = size_in_bytes
            df.at[i, "Files"] = num_files
            df.at[i, "Row Groups"] = num_rowgroups
            df.at[i, "Table Size"] = size_in_bytes
        if count_rows:
            num_rows = spark.table(tName).count()
            df.at[i, 'Row Count'] = num_rows
            df.at[i, "Row Count"] = num_rows

    if extended:
        intColumns = ["Files", "Row Groups", "Table Size"]
        df[intColumns] = df[intColumns].astype(int)
        df["SKU"] = guardrail["Fabric SKUs"].iloc[0]
        df["Parquet File Guardrail"] = guardrail[
            "Parquet files per table"
        ].iloc[0]
        df["Parquet File Guardrail"] = guardrail["Parquet files per table"].iloc[0]
        df["Row Group Guardrail"] = guardrail["Row groups per table"].iloc[0]
        df["Row Count Guardrail"] = (
            guardrail["Rows per table (millions)"].iloc[0] * 1000000
        )

        df["Parquet File Guardrail Hit"] = (
            df["Files"] > df["Parquet File Guardrail"]
        )
        df["Row Group Guardrail Hit"] = (
            df["Row Groups"] > df["Row Group Guardrail"]
        )
        df["Parquet File Guardrail Hit"] = df["Files"] > df["Parquet File Guardrail"]
        df["Row Group Guardrail Hit"] = df["Row Groups"] > df["Row Group Guardrail"]
        if count_rows:
            df['Row Count'] = df['Row Count'].astype(int)
            df["Row Count Guardrail Hit"] = (
                df["Row Count"] > df["Row Count Guardrail"]
            )
            df["Row Count"] = df["Row Count"].astype(int)
            df["Row Count Guardrail Hit"] = df["Row Count"] > df["Row Count Guardrail"]

    if export:
        lakeAttach = lakehouse_attached()
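
A sketch of how the guardrail columns computed above are typically consumed; the lakehouse name is a placeholder and the parameter names assume the function's existing signature:

# Hedged example: flag tables that exceed any Direct Lake guardrail for the SKU.
from sempy_labs.lakehouse import get_lakehouse_tables

df = get_lakehouse_tables(lakehouse="MyLakehouse", extended=True, count_rows=True)
flagged = df[
    df["Parquet File Guardrail Hit"]
    | df["Row Group Guardrail Hit"]
    | df["Row Count Guardrail Hit"]
]
print(flagged[["Table Name", "Files", "Row Groups", "Row Count"]])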
@@ -1,7 +1,5 @@
import sempy.fabric as fabric
import pandas as pd
import datetime
import time
from sempy_labs._list_functions import list_tables
from sempy_labs.directlake._get_shared_expression import get_shared_expression
from sempy_labs._helper_functions import resolve_lakehouse_name, retry