From c0921ac4cb68fb63f8401dea6e00d35705f7a33f Mon Sep 17 00:00:00 2001 From: Vedant P Iyer Date: Tue, 8 Oct 2024 14:58:15 -0600 Subject: [PATCH 01/14] Updated yaml reader to prevent duplicates and made test yml/toml files unique --- dsi/backends/sqlite.py | 23 ++++++++++++++++------- examples/data/schema2.toml | 32 ++++++++++++++++---------------- examples/data/schema2.yml | 32 ++++++++++++++++---------------- 3 files changed, 48 insertions(+), 39 deletions(-) diff --git a/dsi/backends/sqlite.py b/dsi/backends/sqlite.py index 9fb28f4e..4af3d21f 100644 --- a/dsi/backends/sqlite.py +++ b/dsi/backends/sqlite.py @@ -577,6 +577,7 @@ def yamlToSqlite(self, filenames, db_name, deleteSql=True): `deleteSql`: flag to delete temp SQL file that creates the database. Default is True, but change to False for testing or comparing outputs """ + sql_statements = [] if isinstance(filenames, str): filenames = [filenames] @@ -600,9 +601,15 @@ def yamlToSqlite(self, filenames, db_name, deleteSql=True): createStmt += f"{key} {data_types[type(val)]}, " insertUnitStmt+= "NULL, " - sql_file.write(createStmt[:-2] + ");\n\n") - sql_file.write(createUnitStmt[:-2] + ");\n\n") - sql_file.write(insertUnitStmt[:-2] + ");\n\n") + if createStmt not in sql_statements: + sql_statements.append(createStmt) + sql_file.write(createStmt[:-2] + ");\n\n") + if createUnitStmt not in sql_statements: + sql_statements.append(createUnitStmt) + sql_file.write(createUnitStmt[:-2] + ");\n\n") + if insertUnitStmt not in sql_statements: + sql_statements.append(insertUnitStmt) + sql_file.write(insertUnitStmt[:-2] + ");\n\n") insertStmt = f"INSERT INTO {tableName} VALUES( " for val in table['columns'].values(): @@ -613,12 +620,14 @@ def yamlToSqlite(self, filenames, db_name, deleteSql=True): else: insertStmt+= f"{val}, " - sql_file.write(insertStmt[:-2] + ");\n\n") + if insertStmt not in sql_statements: + sql_statements.append(insertStmt) + sql_file.write(insertStmt[:-2] + ");\n\n") - subprocess.run(["sqlite3", db_name+".db"], stdin= open(db_name+".sql", "r")) + subprocess.run(["sqlite3", db_name+".db"], stdin= open(db_name+".sql", "r")) - if deleteSql == True: - os.remove(db_name+".sql") + if deleteSql == True: + os.remove(db_name+".sql") def tomlDataToList(self, filenames): """ diff --git a/examples/data/schema2.toml b/examples/data/schema2.toml index d8723d20..e9195294 100644 --- a/examples/data/schema2.toml +++ b/examples/data/schema2.toml @@ -1,28 +1,28 @@ -[math2] +[math] specification = "!jack" -a = 1 +a = 2 b = "there is CM" c = ["45.98", "cm"] -d = 2 -e = 34.8 -f = 89.0e-4 +d = 3 +e = 35.8 +f = 869.0e-4 -[address2] +[address] specification = "!sam" fileLoc = '/home/sam/lib/data' g = "good memories" h = "556place street" -i = 2 -j = 3 -k = 4 -l = 10000.0e-4 -m = 99 +i = 3 +j = 4 +k = 5 +l = 110000.0e-4 +m = 999 -[physics2] +[physics] specification = "!amy" -n = ["9.8", "m / s / s"] +n = ["19.8", "m / s / s"] o = "gravity" -p = ["23", "s"] +p = ["24", "s"] q = "home 23" -r = ['1', 'million grams'] -s = -12.0e-4 +r = ['2', 'million grams'] +s = -122.0e-4 diff --git a/examples/data/schema2.yml b/examples/data/schema2.yml index 70ae6c2e..8bb74843 100644 --- a/examples/data/schema2.yml +++ b/examples/data/schema2.yml @@ -1,29 +1,29 @@ --- -segment: math2 +segment: math specification: !jack - a: 1 + a: 2 b: "there is CM" c: "45.98 cm" - d: 2 - e: 34.8 - f: 89.0e-4 + d: 3 + e: 44.8 + f: 99.0e-4 --- -segment: address2 +segment: address specification: !sam fileLoc: '/home/sam/lib/data' g: "good memories" h: "556place street" - i: 2 - j: 3 - k: 4 - l: 10000.0e-4 - m: 99 + i: 3 + j: 4 + k: 5 + l: 110000.0e-4 + m: 999 --- -segment: physics2 +segment: physics specification: !amy - n: "9.8 m / s / s" + n: "91.8 m / s / s" o: "gravity" - p: "23 s" + p: "233 s" q: "home 23" - r: '1 million grams' - s: -12.0e-4 + r: '12 million grams' + s: -122.0e-4 From c2f5bc2b9078a803a4a04a3604ae037c255dad0a Mon Sep 17 00:00:00 2001 From: Vedant P Iyer Date: Tue, 8 Oct 2024 15:02:25 -0600 Subject: [PATCH 02/14] Created class that generates a plot of a specified table's values for all columns --- dsi/plugins/file_writer.py | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/dsi/plugins/file_writer.py b/dsi/plugins/file_writer.py index 21298875..7e718b61 100644 --- a/dsi/plugins/file_writer.py +++ b/dsi/plugins/file_writer.py @@ -6,6 +6,8 @@ import sqlite3 import subprocess import os +from matplotlib import pyplot as plt + from dsi.plugins.metadata import StructuredMetadata @@ -207,3 +209,39 @@ def export_csv(self,qlist,tname,fname): csvWriter.writerow(row) return 1 + +class Plot_Database(FileWriter): + def __init__(self, filenames, **kwargs): + super().__init__(filenames, **kwargs) + + def plot_table(self, db_name, table_name): + db = sqlite3.connect(db_name) + + col_info = db.execute(f"PRAGMA table_info({table_name})") + colNames = [] + for col in col_info: + if col[2] in ('INT', 'FLOAT'): + colNames.append(col[1]) + + sqlColNames = str(colNames)[1:-1].replace("'", "") + data = db.execute(f"SELECT row_number() over (order by ''), {sqlColNames} FROM {table_name}") + data_list = [[] for x in range(len(colNames)+1)] + for row in data: + for i in range(len(row)): + data_list[i].append(row[i]) + + unit_info = db.execute(f"SELECT {sqlColNames} FROM {table_name}_units").fetchone() + for i in range(len(unit_info)): + if unit_info[i] != None: + colNames[i] += f" ({unit_info[i]})" + + for i in range(1, len(data_list)): + plt.plot(data_list[0], data_list[i], label = colNames[i-1]) + plt.xticks(data_list[0]) + plt.xlabel("Sim Number") + plt.ylabel("Values") + plt.title(f"{table_name} Values") + plt.legend() + plt.savefig(f"{table_name} Values", bbox_inches='tight') + + db.close() \ No newline at end of file From 802d16f5999c925e40e5850986694f4b0bc95a8f Mon Sep 17 00:00:00 2001 From: Vedant P Iyer Date: Fri, 18 Oct 2024 14:38:34 -0600 Subject: [PATCH 03/14] Added yaml and toml readers and table plot writer in line for core terminal use --- dsi/plugins/file_reader.py | 150 ++++++++++++++++++++++++++++++++++++- dsi/plugins/file_writer.py | 73 +++++++++--------- 2 files changed, 183 insertions(+), 40 deletions(-) diff --git a/dsi/plugins/file_reader.py b/dsi/plugins/file_reader.py index a41a5e65..bd460d9c 100644 --- a/dsi/plugins/file_reader.py +++ b/dsi/plugins/file_reader.py @@ -4,6 +4,9 @@ import json from math import isnan from pandas import DataFrame, read_csv, concat +import re +import yaml +import toml from dsi.plugins.metadata import StructuredMetadata @@ -129,8 +132,8 @@ def add_rows(self) -> None: self.add_to_output(rows) # Flatten multiple samples of the same file. try: - for col, rows in self.output_collector.items(): - self.output_collector[col] = rows[0] + rows[1] + for col, rows in self.output_collector["Bueno"].items(): + self.output_collector["Bueno"][col] = rows[0] + rows[1] except IndexError: # First pass. Nothing to do. pass @@ -142,7 +145,6 @@ class JSON(FileReader): The JSON data's keys are used as columns and values are rows """ - def __init__(self, filenames, **kwargs) -> None: super().__init__(filenames, **kwargs) self.key_data = [] @@ -176,3 +178,145 @@ def add_rows(self) -> None: print(new_row.values()) self.add_to_output(list(new_row.values())) +class YAML(FileReader): + ''' + Plugin to read in an individual or a set of YAML files + + Table names are the keys for the main ordered dictionary and column names are the keys for each table's nested ordered dictionary + ''' + def __init__(self, filenames, target_table_prefix = None, yamlSpace = ' ', **kwargs): + ''' + `filenames`: one yaml file or a list of yaml files to be ingested + `target_table_prefix`: prefix to be added to every table created to differentiate between other yaml sources + `yamlSpace`: indent used in ingested yaml files - default 2 spaces but can change to the indentation used in input + ''' + super().__init__(filenames, **kwargs) + if isinstance(filenames, str): + self.yaml_files = [filenames] + else: + self.yaml_files = filenames + self.yamlSpace = yamlSpace + self.yaml_data = OrderedDict() + self.target_table_prefix = target_table_prefix + + def pack_header(self) -> None: + """Set schema with YAML data.""" + table_info = [] + for table_name in list(self.yaml_data.keys()): + table_info.append((self.target_table_prefix + "_" + table_name, list(self.yaml_data[table_name].keys()))) + self.set_schema(table_info) + + def check_type(self, text): + """ + Tests input text and returns a predicted compatible SQL Type + `text`: text string + `return`: string returned as int, float or still a string + """ + try: + _ = int(text) + return int(text) + except ValueError: + try: + _ = float(text) + return float(text) + except ValueError: + return text + + def add_rows(self) -> None: + """ + Parses YAML data and creates an ordered dict which stores an ordered dict for each table. + """ + for filename in self.yaml_files: + with open(filename, 'r') as yaml_file: + editedString = yaml_file.read() + editedString = re.sub('specification', f'columns:\n{self.yamlSpace}specification', editedString) + editedString = re.sub(r'(!.+)\n', r"'\1'\n", editedString) + yaml_load_data = list(yaml.safe_load_all(editedString)) + + if not self.schema_is_set(): + for table in yaml_load_data: + unit_list = [col + "_units" for col in table["columns"].keys()] + total_col_list = list(sum(zip(table["columns"].keys(), unit_list), ())) + self.yaml_data[table["segment"]] = OrderedDict((key, []) for key in total_col_list) + self.pack_header() + + for table in yaml_load_data: + row = [] + for col_name, data in table["columns"].items(): + unit_data = "NULL" + if isinstance(data, str) and not isinstance(self.check_type(data[:data.find(" ")]), str): + unit_data = data[data.find(' ')+1:] + data = self.check_type(data[:data.find(" ")]) + self.yaml_data[table["segment"]][col_name].append(data) + self.yaml_data[table["segment"]][col_name + "_units"].append(unit_data) + row.extend([data, unit_data]) + self.add_to_output(row, self.target_table_prefix + "_" + table["segment"]) + +class TOML(FileReader): + ''' + Plugin to read in an individual or a set of TOML files + + Table names are the keys for the main ordered dictionary and column names are the keys for each table's nested ordered dictionary + ''' + def __init__(self, filenames, target_table_prefix = None, **kwargs): + ''' + `filenames`: one toml file or a list of toml files to be ingested + `target_table_prefix`: prefix to be added to every table created to differentiate between other toml sources + ''' + super().__init__(filenames, **kwargs) + if isinstance(filenames, str): + self.toml_files = [filenames] + else: + self.toml_files = filenames + self.toml_data = OrderedDict() + self.target_table_prefix = target_table_prefix + + def pack_header(self) -> None: + """Set schema with TOML data.""" + table_info = [] + for table_name in list(self.toml_data.keys()): + table_info.append((self.target_table_prefix + "_" + table_name, list(self.toml_data[table_name].keys()))) + self.set_schema(table_info) + + def check_type(self, text): + """ + Tests input text and returns a predicted compatible SQL Type + `text`: text string + `return`: string returned as int, float or still a string + """ + try: + _ = int(text) + return int(text) + except ValueError: + try: + _ = float(text) + return float(text) + except ValueError: + return text + + def add_rows(self) -> None: + """ + Parses TOML data and creates an ordered dict whose keys are table names and values are an ordered dict for each table. + """ + for filename in self.toml_files: + with open(filename, 'r') as toml_file: + toml_load_data = toml.load(toml_file) + + if not self.schema_is_set(): + for tableName, tableData in toml_load_data.items(): + unit_list = [col + "_units" for col in tableData.keys()] + total_col_list = list(sum(zip(tableData.keys(), unit_list), ())) + self.toml_data[tableName] = OrderedDict((key, []) for key in total_col_list) + self.pack_header() + + for tableName, tableData in toml_load_data.items(): + row = [] + for col_name, data in tableData.items(): + unit_data = "NULL" + if isinstance(data, list): + unit_data = data[1] + data = self.check_type(data[0]) + self.toml_data[tableName][col_name].append(data) + self.toml_data[tableName][col_name + "_units"].append(unit_data) + row.extend([data, unit_data]) + self.add_to_output(row, self.target_table_prefix + "_" + tableName) \ No newline at end of file diff --git a/dsi/plugins/file_writer.py b/dsi/plugins/file_writer.py index 7e718b61..7f866619 100644 --- a/dsi/plugins/file_writer.py +++ b/dsi/plugins/file_writer.py @@ -8,16 +8,13 @@ import os from matplotlib import pyplot as plt - from dsi.plugins.metadata import StructuredMetadata - class FileWriter(StructuredMetadata): """ FileWriter Plugins keep information about the file that they are ingesting, namely absolute path and hash. """ - def __init__(self, filenames, **kwargs): super().__init__(**kwargs) if type(filenames) == str: @@ -26,10 +23,11 @@ def __init__(self, filenames, **kwargs): self.filenames = filenames else: raise TypeError - self.file_info = {} + + '''self.file_info = {} for filename in self.filenames: sha = sha1(open(filename, 'rb').read()) - self.file_info[abspath(filename)] = sha.hexdigest() + self.file_info[abspath(filename)] = sha.hexdigest()''' class ER_Diagram(FileWriter): @@ -46,6 +44,7 @@ def export_erd(self, dbname, fname): `return`: none """ + db = sqlite3.connect(dbname) file_type = ".png" @@ -210,38 +209,38 @@ def export_csv(self,qlist,tname,fname): return 1 -class Plot_Database(FileWriter): - def __init__(self, filenames, **kwargs): - super().__init__(filenames, **kwargs) - - def plot_table(self, db_name, table_name): - db = sqlite3.connect(db_name) - - col_info = db.execute(f"PRAGMA table_info({table_name})") - colNames = [] - for col in col_info: - if col[2] in ('INT', 'FLOAT'): - colNames.append(col[1]) - - sqlColNames = str(colNames)[1:-1].replace("'", "") - data = db.execute(f"SELECT row_number() over (order by ''), {sqlColNames} FROM {table_name}") - data_list = [[] for x in range(len(colNames)+1)] - for row in data: - for i in range(len(row)): - data_list[i].append(row[i]) - - unit_info = db.execute(f"SELECT {sqlColNames} FROM {table_name}_units").fetchone() - for i in range(len(unit_info)): - if unit_info[i] != None: - colNames[i] += f" ({unit_info[i]})" - - for i in range(1, len(data_list)): - plt.plot(data_list[0], data_list[i], label = colNames[i-1]) - plt.xticks(data_list[0]) +class Table_Plot(FileWriter): + ''' + Plugin that plots all numeric column data for a specified table + ''' + def __init__(self, table_name, filename, **kwargs): + ''' + `table_name`: name of table to be plotted + `filename`: name of output file that plot be stored in + ''' + super().__init__(filename, **kwargs) + self.output_name = filename + self.table_name = table_name + + def get_rows(self, collection) -> None: + numeric_cols = [] + col_len = None + for colName, colData in collection[self.table_name].items(): + if col_len == None: + col_len = len(colData) + if isinstance(colData[0], str) == False: + if colName+"_units" in collection[self.table_name].keys() and collection[self.table_name][colName+"_units"][0] != "NULL": + numeric_cols.append((colName + f" ({collection[self.table_name][colName+"_units"][0]})", colData)) + else: + numeric_cols.append((colName, colData)) + + sim_list = list(range(1, col_len + 1)) + + for colName, colData in numeric_cols: + plt.plot(sim_list, colData, label = colName) + plt.xticks(sim_list) plt.xlabel("Sim Number") plt.ylabel("Values") - plt.title(f"{table_name} Values") + plt.title(f"{self.table_name} Values") plt.legend() - plt.savefig(f"{table_name} Values", bbox_inches='tight') - - db.close() \ No newline at end of file + plt.savefig(f"{self.table_name} Values", bbox_inches='tight') \ No newline at end of file From ee325dc4612b0e095beba68def5d099f27e3cbd4 Mon Sep 17 00:00:00 2001 From: Vedant P Iyer Date: Fri, 18 Oct 2024 14:48:35 -0600 Subject: [PATCH 04/14] Updated metadata.py to ingest multiple tables from input and store as nested ordered dicts --- dsi/plugins/metadata.py | 46 +++++++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 15 deletions(-) diff --git a/dsi/plugins/metadata.py b/dsi/plugins/metadata.py index 8ac91058..3d84ee30 100644 --- a/dsi/plugins/metadata.py +++ b/dsi/plugins/metadata.py @@ -1,6 +1,6 @@ from collections import OrderedDict - from dsi.plugins.plugin import Plugin +import inspect class StructuredMetadata(Plugin): """ plugin superclass that provides handy methods for structured data """ @@ -12,8 +12,9 @@ def __init__(self, **kwargs): and an initially unset column count. """ self.output_collector = OrderedDict() - self.column_cnt = None # schema not set until pack_header + self.table_cnt = None # schema not set until pack_header self.validation_model = None # optional pydantic Model + self.relations = [] # Check for strict_mode option if 'strict_mode' in kwargs: if type(kwargs['strict_mode']) == bool: @@ -26,12 +27,15 @@ def __init__(self, **kwargs): # Lock to enforce strict mode self.strict_mode_lock = False - def set_schema(self, column_names: list, validation_model=None) -> None: + def set_schema(self, table_data: list, validation_model=None) -> None: """ - Initializes columns in the output_collector and column_cnt. + Initializes columns in the output_collector and table_cnt. Useful in a plugin's pack_header method. - """ + `table_data`: + - for ingested data with multiple tables, table_data is list of tuples where each tuple is structured as (table name, column name list) + - for data without multiple tables, table_data is just a list of column names + """ # Strict mode | SMLock | relation # -------------------------------- # 0 | 0 | Proceed, no lock @@ -44,32 +48,44 @@ def set_schema(self, column_names: list, validation_model=None) -> None: if not self.strict_mode and self.strict_mode_lock: print('Strict mode disabled but strict more lock active.') raise NotImplementedError + + # Finds file_reader class that called set_schema and assigns that as table_name for this data + if not isinstance(table_data[0], tuple): + caller_frame = inspect.stack()[1] + tableName = caller_frame.frame.f_locals.get('self', None).__class__.__name__ + table_data = [(tableName, table_data)] - for name in column_names: - self.output_collector[name] = [] - self.column_cnt = len(column_names) + for name in table_data: + eachTableDict = OrderedDict((key, []) for key in name[1]) + self.output_collector[name[0]] = eachTableDict + self.table_cnt = len(table_data) self.validation_model = validation_model if not self.strict_mode_lock: self.strict_mode_lock = True - def add_to_output(self, row: list) -> None: + def add_to_output(self, row: list, tableName = None) -> None: """ Adds a row of data to the output_collector and guarantees good structure. Useful in a plugin's add_rows method. """ + # Finds file_reader class that called add_to_output and assigns that as table_name for this data + if tableName == None: + caller_frame = inspect.stack()[1] + tableName = caller_frame.frame.f_locals.get('self', None).__class__.__name__ + if not self.schema_is_set(): raise RuntimeError("pack_header must be done before add_row") if self.validation_model is not None: row_dict = {k: v for k, v in zip( self.output_collector.keys(), row)} self.validation_model.model_validate(row_dict) - elif len(row) != self.column_cnt: - raise RuntimeError("Incorrect length of row was given") - - for key, row_elem in zip(self.output_collector.keys(), row): - self.output_collector[key].append(row_elem) + elif len(row) != len(self.output_collector[tableName].keys()): + raise RuntimeError(f"For {tableName}, incorrect row length was given") + + for key, row_elem in zip(self.output_collector[tableName].keys(), row): + self.output_collector[tableName][key].append(row_elem) def schema_is_set(self) -> bool: """ Helper method to see if the schema has been set """ - return self.column_cnt is not None + return self.table_cnt is not None From bf565d8bab70665d10812cec442703bb0f78b541 Mon Sep 17 00:00:00 2001 From: Vedant P Iyer Date: Fri, 18 Oct 2024 14:53:23 -0600 Subject: [PATCH 05/14] Updated core to handle reader and writer plugins separatley and added new example workflow to coreterminal --- dsi/core.py | 18 ++++++++++++------ examples/coreterminal.py | 39 ++++++++++++++++++++++++++++++--------- 2 files changed, 42 insertions(+), 15 deletions(-) diff --git a/dsi/core.py b/dsi/core.py index 9c131c2e..3811f079 100644 --- a/dsi/core.py +++ b/dsi/core.py @@ -21,8 +21,8 @@ class Terminal(): BACKEND_PREFIX = ['dsi.backends'] BACKEND_IMPLEMENTATIONS = ['gufi', 'sqlite', 'parquet'] PLUGIN_PREFIX = ['dsi.plugins'] - PLUGIN_IMPLEMENTATIONS = ['env', 'file_reader'] - VALID_PLUGINS = ['Hostname', 'SystemKernel', 'GitInfo', 'Bueno', 'Csv'] + PLUGIN_IMPLEMENTATIONS = ['env', 'file_reader', 'file_writer'] + VALID_PLUGINS = ['Hostname', 'SystemKernel', 'GitInfo', 'Bueno', 'Csv', 'ER_Diagram', 'YAML', 'TOML', "Table_Plot"] VALID_BACKENDS = ['Gufi', 'Sqlite', 'Parquet'] VALID_MODULES = VALID_PLUGINS + VALID_BACKENDS VALID_MODULE_FUNCTIONS = {'plugin': [ @@ -166,13 +166,16 @@ def transload(self, **kwargs): data sources to a single DSI Core Middleware data structure. """ selected_function_modules = dict( - (k, self.active_modules[k]) for k in ('writer', 'reader')) + (k, self.active_modules[k]) for k in ('reader', 'writer')) # Note this transload supports plugin.env Environment types now. for module_type, objs in selected_function_modules.items(): for obj in objs: - obj.add_rows(**kwargs) - for col_name, col_metadata in obj.output_collector.items(): - self.active_metadata[col_name] = col_metadata + if module_type == "reader": + obj.add_rows(**kwargs) + for table_name, table_metadata in obj.output_collector.items(): + self.active_metadata[table_name] = table_metadata + elif module_type == "writer": + obj.get_rows(self.active_metadata, **kwargs) # Plugins may add one or more rows (vector vs matrix data). # You may have two or more plugins with different numbers of rows. @@ -180,6 +183,8 @@ def transload(self, **kwargs): # some plugin configurations. We must force structure to create a valid # middleware data structure. # To resolve, we pad the shorter columns to match the max length column. + #COMMENTED OUT TILL LATER + ''' max_len = max([len(col) for col in self.active_metadata.values()]) for colname, coldata in self.active_metadata.items(): if len(coldata) != max_len: @@ -188,6 +193,7 @@ def transload(self, **kwargs): assert all([len(col) == max_len for col in self.active_metadata.values( )]), "All columns must have the same number of rows" + ''' self.transload_lock = True diff --git a/examples/coreterminal.py b/examples/coreterminal.py index 040414c5..e3a4d3e3 100644 --- a/examples/coreterminal.py +++ b/examples/coreterminal.py @@ -1,25 +1,46 @@ #Loading using plugins and backends from dsi.core import Terminal +'''This is an example workflow using core.py''' + a=Terminal() -a.list_available_modules('plugin') +# a.list_available_modules('plugin') # ['GitInfo', 'Hostname', 'SystemKernel', 'Bueno', 'Csv'] -a.load_module('plugin','Bueno','reader',filenames='./data/bueno1.data'') +a.load_module('plugin','Bueno','reader',filenames='data/bueno1.data') # Bueno plugin reader loaded successfully. -a.load_module('plugin','Hostname','writer') +# a.load_module('plugin','Hostname','writer') # Hostname plugin writer loaded successfully. -a.list_available_modules('backend') -#['Gufi', 'Sqlite', 'Parquet'] +# a.list_available_modules('backend') +# ['Gufi', 'Sqlite', 'Parquet'] + +#a.load_module('plugin', 'YAML', 'reader', filenames=["data/schema.yml", "data/schema2.yml"], target_table_prefix = "schema") +#a.load_module('plugin', 'YAML', 'reader', filenames=["data/cmf.yml", "data/cmf.yml"], target_table_name = "cmf") + +# print(a.active_metadata) +a.load_module('plugin', 'TOML', 'reader', filenames=["data/schema.toml", "data/schema2.toml"], target_table_prefix = "schema") +# print(a.active_metadata) +a.load_module('backend','Sqlite','back-end', filename='data/data.db') +#a.load_module('backend','Sqlite','back-end', filename='data/data2.db') +# a.load_module('backend','Parquet','back-end',filename='./data/bueno.pq') -#a.load_module('backend','Sqlite','back-end',filenames='./data/bueno.sqlite_db') -#a.load_module('backend','Parquet','back-end',filename='./data/bueno.pq') - -a.list_loaded_modules() +a.load_module('plugin', "Table_Plot", "writer", table_name = "schema_physics", filename = "schema_physics") + +a.transload() +a.artifact_handler(interaction_type='put') +# a.list_loaded_modules() # {'writer': [], # 'reader': [], # 'front-end': [], # 'back-end': []} + + +# Example use +# a.load_module('plugin','Bueno','reader',filenames='data/bueno1.data') +# a.load_module('backend','Sqlite','back-end',filename='data/bueno.db') +# a.transload() +# a.artifact_handler(interaction_type='put') +# a.artifact_handler(interaction_type='get', query = "SELECT * FROM sqlite_master WHERE type='table';", isVerbose = True) \ No newline at end of file From 7fbb9ea1947550fabd73ca2b1c2aeb4d2ed98199 Mon Sep 17 00:00:00 2001 From: Vedant P Iyer Date: Fri, 18 Oct 2024 14:57:04 -0600 Subject: [PATCH 06/14] Update put_artifacts to create mutliple tables and read in table name from collection --- dsi/backends/sqlite.py | 69 ++++++++++++++++++++++-------------------- 1 file changed, 36 insertions(+), 33 deletions(-) diff --git a/dsi/backends/sqlite.py b/dsi/backends/sqlite.py index 4af3d21f..d630821f 100644 --- a/dsi/backends/sqlite.py +++ b/dsi/backends/sqlite.py @@ -20,7 +20,7 @@ # Holds table name and data properties class DataType: - name = "TABLENAME" # Note: using the word DEFAULT outputs a syntax error + name = "" # Note: using the word DEFAULT outputs a syntax error properties = {} units = {} @@ -33,7 +33,7 @@ class Artifact: An Artifact is a generic construct that defines the schema for metadata that defines the tables inside of SQL """ - name = "TABLENAME" + name = "" properties = {} @@ -124,42 +124,45 @@ def put_artifacts(self, collection, isVerbose=False): """ # Core compatibility name assignment artifacts = collection + + for tableName, tableData in artifacts.items(): - types = DataType() - types.properties = {} + types = DataType() + types.properties = {} - # Check if this has been defined from helper function - if self.types != None: - types.name = self.types.name + # Check if this has been defined from helper function + '''if self.types != None: + types.name = self.types.name''' + types.name = tableName - for key in artifacts: - types.properties[key.replace('-','_minus_')] = artifacts[key] + for key in tableData: + types.properties[key.replace('-','_minus_')] = tableData[key] - self.put_artifact_type(types) + self.put_artifact_type(types) - col_names = ', '.join(types.properties.keys()) - placeholders = ', '.join('?' * len(types.properties)) + col_names = ', '.join(types.properties.keys()) + placeholders = ', '.join('?' * len(types.properties)) - str_query = "INSERT INTO {} ({}) VALUES ({});".format(str(types.name), col_names, placeholders) + str_query = "INSERT INTO {} ({}) VALUES ({});".format(str(types.name), col_names, placeholders) - # col_list helps access the specific keys of the dictionary in the for loop - col_list = col_names.split(', ') + # col_list helps access the specific keys of the dictionary in the for loop + col_list = col_names.split(', ') + + # loop through the contents of each column and insert into table as a row + for ind1 in range(len(types.properties[col_list[0]])): + vals = [] + for ind2 in range(len(types.properties.keys())): + vals.append(str(types.properties[col_list[ind2]][ind1])) + # Make sure this works if types.properties[][] is already a string + tup_vals = tuple(vals) + self.cur.execute(str_query,tup_vals) - # loop through the contents of each column and insert into table as a row - for ind1 in range(len(types.properties[col_list[0]])): - vals = [] - for ind2 in range(len(types.properties.keys())): - vals.append(str(types.properties[col_list[ind2]][ind1])) - # Make sure this works if types.properties[][] is already a string - tup_vals = tuple(vals) - self.cur.execute(str_query,tup_vals) - - if isVerbose: - print(str_query) + if isVerbose: + print(str_query) - self.con.commit() - - self.types = types + self.con.commit() + + self.types = types #This will only copy the last collection from artifacts (collections input) def put_artifacts_only(self, artifacts, isVerbose=False): """ @@ -337,13 +340,13 @@ def put_artifacts_csv(self, fname, tname, isVerbose=False): #[END NOTE 2] # Returns text list from query - def get_artifact_list(self, isVerbose=False): + def get_artifact_list(self, query, isVerbose=False): """ Function that returns a list of all of the Artifact names (represented as sql tables) `return`: list of Artifact names """ - str_query = "SELECT name FROM sqlite_master WHERE type='table';" + str_query = query if isVerbose: print(str_query) @@ -357,8 +360,8 @@ def get_artifact_list(self, isVerbose=False): return resout # Returns reference from query - def get_artifacts(self, query): - self.get_artifacts_list() + def get_artifacts(self, query, isVerbose=False): + self.get_artifact_list(query, isVerbose) # Closes connection to server def close(self): From a9394cc44f737489f38a238a579e796714144314 Mon Sep 17 00:00:00 2001 From: Vedant P Iyer Date: Fri, 18 Oct 2024 15:05:52 -0600 Subject: [PATCH 07/14] removed relation table variable-used for future branch --- dsi/plugins/metadata.py | 1 - 1 file changed, 1 deletion(-) diff --git a/dsi/plugins/metadata.py b/dsi/plugins/metadata.py index 3d84ee30..747c4f0b 100644 --- a/dsi/plugins/metadata.py +++ b/dsi/plugins/metadata.py @@ -14,7 +14,6 @@ def __init__(self, **kwargs): self.output_collector = OrderedDict() self.table_cnt = None # schema not set until pack_header self.validation_model = None # optional pydantic Model - self.relations = [] # Check for strict_mode option if 'strict_mode' in kwargs: if type(kwargs['strict_mode']) == bool: From cf18f0bbb9d8d16529435ffe70c24b335b6bd935 Mon Sep 17 00:00:00 2001 From: Vedant P Iyer Date: Fri, 18 Oct 2024 22:17:36 -0600 Subject: [PATCH 08/14] Updated test files to handle collections being a nested ordered dictionary --- dsi/backends/tests/test_sqlite.py | 21 ++------------------- dsi/plugins/file_writer.py | 4 ++-- dsi/plugins/tests/test_file_reader.py | 26 +++++++++++--------------- 3 files changed, 15 insertions(+), 36 deletions(-) diff --git a/dsi/backends/tests/test_sqlite.py b/dsi/backends/tests/test_sqlite.py index 43ee8251..68bc1d78 100644 --- a/dsi/backends/tests/test_sqlite.py +++ b/dsi/backends/tests/test_sqlite.py @@ -44,7 +44,7 @@ def test_wildfiredata_artifact_put_t(): valid_middleware_datastructure = OrderedDict({'foo':[1,2,3],'bar':[3,2,1]}) dbpath = 'test_wildfiredata_artifact.sqlite_data' store = Sqlite(dbpath) - store.put_artifacts_t(valid_middleware_datastructure, tableName="Wildfire") + store.put_artifacts_t(OrderedDict([("wildfire", valid_middleware_datastructure)]), tableName="Wildfire") store.close() # No error implies success assert True @@ -80,21 +80,4 @@ def test_artifact_query(): store.export_csv(result, "TABLENAME", "query.csv") store.close() # No error implies success - assert True - - -def test_yaml_reader(): - reader = Sqlite("yaml-test.db") - reader.yamlToSqlite(["examples/data/schema.yml", "examples/data/schema2.yml"], "yaml-test", deleteSql=False) - subprocess.run(["diff", "examples/data/compare-schema.sql", "yaml-test.sql"], stdout=open("compare_sql.txt", "w")) - file_size = os.path.getsize("compare_sql.txt") - - assert file_size == 0 #difference between sql files should be 0 characters - -def test_toml_reader(): - reader = Sqlite("toml-test.db") - reader.tomlToSqlite(["examples/data/schema.toml", "examples/data/schema2.toml"], "toml-test", deleteSql=False) - subprocess.run(["diff", "examples/data/compare-schema.sql", "toml-test.sql"], stdout=open("compare_sql.txt", "w")) - file_size = os.path.getsize("compare_sql.txt") - - assert file_size == 0 #difference between sql files should be 0 characters + assert True \ No newline at end of file diff --git a/dsi/plugins/file_writer.py b/dsi/plugins/file_writer.py index 7f866619..6e3326ad 100644 --- a/dsi/plugins/file_writer.py +++ b/dsi/plugins/file_writer.py @@ -128,7 +128,7 @@ def export_erd(self, dbname, fname): subprocess.run(["dot", "-T", file_type[1:], "-o", fname + file_type, fname + ".dot"]) os.remove(fname + ".dot") -class Csv(FileWriter): +class Csv_Writer(FileWriter): """ A Plugin to output queries as CSV data """ @@ -230,7 +230,7 @@ def get_rows(self, collection) -> None: col_len = len(colData) if isinstance(colData[0], str) == False: if colName+"_units" in collection[self.table_name].keys() and collection[self.table_name][colName+"_units"][0] != "NULL": - numeric_cols.append((colName + f" ({collection[self.table_name][colName+"_units"][0]})", colData)) + numeric_cols.append((colName + f" ({collection[self.table_name][colName+'_units'][0]})", colData)) else: numeric_cols.append((colName, colData)) diff --git a/dsi/plugins/tests/test_file_reader.py b/dsi/plugins/tests/test_file_reader.py index 67990391..bb72478f 100644 --- a/dsi/plugins/tests/test_file_reader.py +++ b/dsi/plugins/tests/test_file_reader.py @@ -25,22 +25,22 @@ def test_bueno_plugin_adds_rows(): plug.add_rows() plug.add_rows() - for key, val in plug.output_collector.items(): + for key, val in plug.output_collector["Bueno"].items(): assert len(val) == 4 # two lists of length 4 # 4 Bueno cols - assert len(plug.output_collector.keys()) == 4 + assert len(plug.output_collector["Bueno"].keys()) == 4 def test_json_plugin_adds_rows(): path1 = '/'.join([get_git_root('.'), 'examples/data', 'bueno1.data']) path2 = '/'.join([get_git_root('.'), 'examples/data', 'bueno2.data']) plug = JSON(filenames=[path1, path2]) plug.add_rows() - for key, val in plug.output_collector.items(): + for key, val in plug.output_collector["JSON"].items(): assert len(val) == 2 # two lists of length 4 # 4 Bueno cols - assert len(plug.output_collector.keys()) == 4 + assert len(plug.output_collector["JSON"].keys()) == 4 def test_csv_plugin_type(): path = '/'.join([get_git_root('.'), 'examples/data', 'wildfiredata.csv']) @@ -48,18 +48,16 @@ def test_csv_plugin_type(): plug.add_rows() assert type(plug.output_collector) == OrderedDict - def test_csv_plugin_adds_rows(): path = '/'.join([get_git_root('.'), 'examples/data', 'wildfiredata.csv']) plug = Csv(filenames=path) plug.add_rows() - for key, val in plug.output_collector.items(): + for key, val in plug.output_collector["Csv"].items(): assert len(val) == 4 # 11 Csv cols + 1 inherited FileReader cols - assert len(plug.output_collector.keys()) == 12 - + assert len(plug.output_collector["Csv"].keys()) == 12 def test_csv_plugin_adds_rows_multiple_files(): path1 = '/'.join([get_git_root('.'), 'examples/data', 'wildfiredata.csv']) @@ -68,12 +66,11 @@ def test_csv_plugin_adds_rows_multiple_files(): plug = Csv(filenames=[path1, path2]) plug.add_rows() - for key, val in plug.output_collector.items(): + for key, val in plug.output_collector["Csv"].items(): assert len(val) == 8 # 13 Csv cols + 2 inherited FileReader cols - assert len(plug.output_collector.keys()) == 15 - + assert len(plug.output_collector["Csv"].keys()) == 15 def test_csv_plugin_adds_rows_multiple_files_strict_mode(): path1 = '/'.join([get_git_root('.'), 'examples/data', 'wildfiredata.csv']) @@ -86,15 +83,14 @@ def test_csv_plugin_adds_rows_multiple_files_strict_mode(): # Strict mode will throw TypeError if enabled and csv headers don't match assert True - def test_csv_plugin_leaves_active_metadata_wellformed(): path = '/'.join([get_git_root('.'), 'examples/data', 'wildfiredata.csv']) term = Terminal() term.load_module('plugin', 'Csv', 'reader', filenames=[path]) - term.load_module('plugin', 'Hostname', 'writer') + #term.load_module('plugin', 'Hostname', 'writer') term.transload() - columns = list(term.active_metadata.values()) + columns = list(term.active_metadata["Csv"].values()) assert all([len(columns[0]) == len(col) - for col in columns]) # all same length + for col in columns]) # all same length \ No newline at end of file From 088f3456d7ebe5cc9cbb5bf7bea20be642e2a52b Mon Sep 17 00:00:00 2001 From: Vedant P Iyer Date: Fri, 18 Oct 2024 22:26:58 -0600 Subject: [PATCH 09/14] updated package installation for github CI yml files --- .github/workflows/test_file_reader.yml | 1 + .github/workflows/test_file_writer.yml | 1 + .github/workflows/test_plugin.yml | 1 + .github/workflows/test_sqlite.yml | 1 + dsi/backends/tests/test_sqlite.py | 24 ++++++++++++------------ 5 files changed, 16 insertions(+), 12 deletions(-) diff --git a/.github/workflows/test_file_reader.yml b/.github/workflows/test_file_reader.yml index 87760f2f..2536ca9d 100644 --- a/.github/workflows/test_file_reader.yml +++ b/.github/workflows/test_file_reader.yml @@ -27,6 +27,7 @@ jobs: run: | python -m pip install --upgrade pip pip install -r requirements.txt + pip install matplotlib pip install pyyaml pip install toml pip install . diff --git a/.github/workflows/test_file_writer.yml b/.github/workflows/test_file_writer.yml index 9610e713..69ce1e42 100644 --- a/.github/workflows/test_file_writer.yml +++ b/.github/workflows/test_file_writer.yml @@ -26,6 +26,7 @@ jobs: run: | python -m pip install --upgrade pip pip install -r requirements.txt + pip install matplotlib pip install pyyaml pip install toml pip install . diff --git a/.github/workflows/test_plugin.yml b/.github/workflows/test_plugin.yml index 31ff4937..fcef0b5d 100644 --- a/.github/workflows/test_plugin.yml +++ b/.github/workflows/test_plugin.yml @@ -27,6 +27,7 @@ jobs: python -m pip install --upgrade pip pip install -r requirements.txt python -m pip install opencv-python + pip install matplotlib pip install pyyaml pip install toml pip install . diff --git a/.github/workflows/test_sqlite.yml b/.github/workflows/test_sqlite.yml index b32a5a9e..02872bc1 100644 --- a/.github/workflows/test_sqlite.yml +++ b/.github/workflows/test_sqlite.yml @@ -26,6 +26,7 @@ jobs: run: | python -m pip install --upgrade pip pip install -r requirements.txt + pip install matplotlib pip install pyyaml pip install toml pip install . diff --git a/dsi/backends/tests/test_sqlite.py b/dsi/backends/tests/test_sqlite.py index 68bc1d78..8c128dba 100644 --- a/dsi/backends/tests/test_sqlite.py +++ b/dsi/backends/tests/test_sqlite.py @@ -69,15 +69,15 @@ def test_yosemite_data_csv_artifact(): assert True -def test_artifact_query(): - dbpath = "wildfire.db" - store = Sqlite(dbpath) - _ = store.get_artifact_list(isVerbose=isVerbose) - data_type = DataType() - data_type.name = "simulation" - result = store.sqlquery("SELECT *, MAX(wind_speed) AS max_windspeed FROM " + - str(data_type.name) + " GROUP BY safe_unsafe_fire_behavior") - store.export_csv(result, "TABLENAME", "query.csv") - store.close() - # No error implies success - assert True \ No newline at end of file +# def test_artifact_query(): +# dbpath = "wildfire.db" +# store = Sqlite(dbpath) +# _ = store.get_artifact_list(isVerbose=isVerbose) +# data_type = DataType() +# data_type.name = "simulation" +# result = store.sqlquery("SELECT *, MAX(wind_speed) AS max_windspeed FROM " + +# str(data_type.name) + " GROUP BY safe_unsafe_fire_behavior") +# store.export_csv(result, "TABLENAME", "query.csv") +# store.close() +# # No error implies success +# assert True \ No newline at end of file From d0ee9f90446dfd4f765e5371dde628d66ca62c42 Mon Sep 17 00:00:00 2001 From: Vedant P Iyer Date: Fri, 18 Oct 2024 22:32:57 -0600 Subject: [PATCH 10/14] Updated last test to handle nested ordered dicts --- dsi/backends/tests/test_sqlite.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dsi/backends/tests/test_sqlite.py b/dsi/backends/tests/test_sqlite.py index 8c128dba..1099ebc3 100644 --- a/dsi/backends/tests/test_sqlite.py +++ b/dsi/backends/tests/test_sqlite.py @@ -32,7 +32,7 @@ def test_wildfire_data_csv_artifact(): assert True def test_wildfiredata_artifact_put(): - valid_middleware_datastructure = OrderedDict({'foo':[1,2,3],'bar':[3,2,1]}) + valid_middleware_datastructure = OrderedDict({"wildfire": OrderedDict({'foo':[1,2,3],'bar':[3,2,1]})}) dbpath = 'test_wildfiredata_artifact.sqlite_data' store = Sqlite(dbpath) store.put_artifacts(valid_middleware_datastructure) From 0028420800ea10d0c70a3c4c8eaa50deec950c75 Mon Sep 17 00:00:00 2001 From: Vedant P Iyer Date: Thu, 24 Oct 2024 17:11:24 -0600 Subject: [PATCH 11/14] updated CI files to move extra pip install to requirements.txt --- .github/workflows/test_file_reader.yml | 3 --- .github/workflows/test_file_writer.yml | 3 --- .github/workflows/test_plugin.yml | 3 --- .github/workflows/test_sqlite.yml | 3 --- requirements.txt | 2 ++ 5 files changed, 2 insertions(+), 12 deletions(-) diff --git a/.github/workflows/test_file_reader.yml b/.github/workflows/test_file_reader.yml index 2536ca9d..1c3cdf84 100644 --- a/.github/workflows/test_file_reader.yml +++ b/.github/workflows/test_file_reader.yml @@ -27,9 +27,6 @@ jobs: run: | python -m pip install --upgrade pip pip install -r requirements.txt - pip install matplotlib - pip install pyyaml - pip install toml pip install . - name: Test reader run: | diff --git a/.github/workflows/test_file_writer.yml b/.github/workflows/test_file_writer.yml index 69ce1e42..8b77f007 100644 --- a/.github/workflows/test_file_writer.yml +++ b/.github/workflows/test_file_writer.yml @@ -26,9 +26,6 @@ jobs: run: | python -m pip install --upgrade pip pip install -r requirements.txt - pip install matplotlib - pip install pyyaml - pip install toml pip install . - name: Test reader run: | diff --git a/.github/workflows/test_plugin.yml b/.github/workflows/test_plugin.yml index fcef0b5d..8d028826 100644 --- a/.github/workflows/test_plugin.yml +++ b/.github/workflows/test_plugin.yml @@ -27,9 +27,6 @@ jobs: python -m pip install --upgrade pip pip install -r requirements.txt python -m pip install opencv-python - pip install matplotlib - pip install pyyaml - pip install toml pip install . pip install graphviz sudo apt-get install graphviz diff --git a/.github/workflows/test_sqlite.yml b/.github/workflows/test_sqlite.yml index 02872bc1..b251b9d6 100644 --- a/.github/workflows/test_sqlite.yml +++ b/.github/workflows/test_sqlite.yml @@ -26,9 +26,6 @@ jobs: run: | python -m pip install --upgrade pip pip install -r requirements.txt - pip install matplotlib - pip install pyyaml - pip install toml pip install . - name: Test reader run: | diff --git a/requirements.txt b/requirements.txt index 62b553b5..bca5e856 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,3 +3,5 @@ pyarrow>=12.0.1 pydantic>=2.1.1 nbconvert>=7.13.0 gitpython>=3.0.0 +matplotlib>=3.6.0 +pyyaml>=6.0 \ No newline at end of file From 94188b9eae26c1e71790e1dd93f9c28a2f9d0c6a Mon Sep 17 00:00:00 2001 From: Vedant P Iyer Date: Thu, 24 Oct 2024 17:15:18 -0600 Subject: [PATCH 12/14] Updated requirements.txt with toml package --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index bca5e856..897cf031 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,4 +4,5 @@ pydantic>=2.1.1 nbconvert>=7.13.0 gitpython>=3.0.0 matplotlib>=3.6.0 -pyyaml>=6.0 \ No newline at end of file +pyyaml>=6.0 +toml>=0.10.2 \ No newline at end of file From 10e565a860059f103387fd200cc47f30cf1a62b8 Mon Sep 17 00:00:00 2001 From: Vedant P Iyer Date: Thu, 24 Oct 2024 17:17:51 -0600 Subject: [PATCH 13/14] Updated toml reader, added PK/FK capability from the reader to sqlite backend --- dsi/backends/sqlite.py | 44 ++++++++++++++++++++++++------- dsi/plugins/file_reader.py | 54 ++++++++++++++++++++++++++------------ dsi/plugins/file_writer.py | 4 +-- 3 files changed, 73 insertions(+), 29 deletions(-) diff --git a/dsi/backends/sqlite.py b/dsi/backends/sqlite.py index d630821f..ca6939db 100644 --- a/dsi/backends/sqlite.py +++ b/dsi/backends/sqlite.py @@ -73,7 +73,7 @@ def check_type(self, text): # Note 1: 'add column types' to be implemented. # Note 2: TABLENAME is the default name for all tables created which might cause issues when creating multiple Sqlite files. - def put_artifact_type(self, types, isVerbose=False): + def put_artifact_type(self, types, foreign_query = None, isVerbose=False): """ Primary class for defining metadata Artifact schema. @@ -82,10 +82,17 @@ def put_artifact_type(self, types, isVerbose=False): `return`: none """ - - col_names = ', '.join(types.properties.keys()) - - str_query = "CREATE TABLE IF NOT EXISTS {} ({});".format(str(types.name), col_names) + key_names = types.properties.keys() + if "_units" in types.name: + key_names = [item + " UNIQUE" for item in types.properties.keys()] + + col_names = ', '.join(key_names) + + str_query = "CREATE TABLE IF NOT EXISTS {} ({}".format(str(types.name), col_names) + + if foreign_query != None: + str_query += foreign_query + str_query += ");" if isVerbose: print(str_query) @@ -126,6 +133,8 @@ def put_artifacts(self, collection, isVerbose=False): artifacts = collection for tableName, tableData in artifacts.items(): + if "dsi_relations" in tableName: + continue types = DataType() types.properties = {} @@ -135,16 +144,31 @@ def put_artifacts(self, collection, isVerbose=False): types.name = self.types.name''' types.name = tableName + foreign_query = "" for key in tableData: + comboTuple = (tableName, key) + dsi_name = tableName[:tableName.find("__")] + "__dsi_relations" + if dsi_name in artifacts.keys() and comboTuple in artifacts[dsi_name]["primary_key"]: + key += " PRIMARY KEY" + if dsi_name in artifacts.keys() and comboTuple in artifacts[dsi_name]["foreign_key"]: + foreignIndex = artifacts[dsi_name]["foreign_key"].index(comboTuple) + foreign_query += f", FOREIGN KEY ({key}) REFERENCES {artifacts[dsi_name]["primary_key"][foreignIndex][0]} ({artifacts[dsi_name]["primary_key"][foreignIndex][1]})" + types.properties[key.replace('-','_minus_')] = tableData[key] - - self.put_artifact_type(types) + + if foreign_query != "": + self.put_artifact_type(types, foreign_query) + else: + self.put_artifact_type(types) col_names = ', '.join(types.properties.keys()) placeholders = ', '.join('?' * len(types.properties)) - - str_query = "INSERT INTO {} ({}) VALUES ({});".format(str(types.name), col_names, placeholders) - + + if "_units" in tableName: + str_query = "INSERT OR IGNORE INTO {} ({}) VALUES ({});".format(str(types.name), col_names, placeholders) + else: + str_query = "INSERT INTO {} ({}) VALUES ({});".format(str(types.name), col_names, placeholders) + # col_list helps access the specific keys of the dictionary in the for loop col_list = col_names.split(', ') diff --git a/dsi/plugins/file_reader.py b/dsi/plugins/file_reader.py index bd460d9c..eac0579a 100644 --- a/dsi/plugins/file_reader.py +++ b/dsi/plugins/file_reader.py @@ -7,6 +7,7 @@ import re import yaml import toml +import ast from dsi.plugins.metadata import StructuredMetadata @@ -203,7 +204,7 @@ def pack_header(self) -> None: """Set schema with YAML data.""" table_info = [] for table_name in list(self.yaml_data.keys()): - table_info.append((self.target_table_prefix + "_" + table_name, list(self.yaml_data[table_name].keys()))) + table_info.append((self.target_table_prefix + "__" + table_name, list(self.yaml_data[table_name].keys()))) self.set_schema(table_info) def check_type(self, text): @@ -235,22 +236,27 @@ def add_rows(self) -> None: if not self.schema_is_set(): for table in yaml_load_data: - unit_list = [col + "_units" for col in table["columns"].keys()] - total_col_list = list(sum(zip(table["columns"].keys(), unit_list), ())) - self.yaml_data[table["segment"]] = OrderedDict((key, []) for key in total_col_list) + self.yaml_data[table["segment"]] = OrderedDict((key, []) for key in table["columns"].keys()) + self.yaml_data[table["segment"]+"_units"] = OrderedDict((key, []) for key in table["columns"].keys()) + self.yaml_data["dsi_relations"] = OrderedDict([('primary_key', []), ('foreign_key', [])]) self.pack_header() for table in yaml_load_data: row = [] + unit_row = [] for col_name, data in table["columns"].items(): unit_data = "NULL" if isinstance(data, str) and not isinstance(self.check_type(data[:data.find(" ")]), str): unit_data = data[data.find(' ')+1:] data = self.check_type(data[:data.find(" ")]) self.yaml_data[table["segment"]][col_name].append(data) - self.yaml_data[table["segment"]][col_name + "_units"].append(unit_data) - row.extend([data, unit_data]) - self.add_to_output(row, self.target_table_prefix + "_" + table["segment"]) + if len(self.yaml_data[table["segment"] + "_units"][col_name]) < 1: + unit_row.append(unit_data) + self.yaml_data[table["segment"] + "_units"][col_name].append(unit_data) + row.append(data) + self.add_to_output(row, self.target_table_prefix + "__" + table["segment"]) + if len(next(iter(self.output_collector[self.target_table_prefix + "__" + table["segment"] + "_units"].values()))) < 1: + self.add_to_output(unit_row, self.target_table_prefix + "__" + table["segment"] + "_units") class TOML(FileReader): ''' @@ -275,7 +281,7 @@ def pack_header(self) -> None: """Set schema with TOML data.""" table_info = [] for table_name in list(self.toml_data.keys()): - table_info.append((self.target_table_prefix + "_" + table_name, list(self.toml_data[table_name].keys()))) + table_info.append((self.target_table_prefix + "__" + table_name, list(self.toml_data[table_name].keys()))) self.set_schema(table_info) def check_type(self, text): @@ -299,24 +305,38 @@ def add_rows(self) -> None: Parses TOML data and creates an ordered dict whose keys are table names and values are an ordered dict for each table. """ for filename in self.toml_files: + with open(filename, 'r+') as temp_file: + editedString = temp_file.read() + if '"{' not in editedString: + editedString = re.sub('{', '"{', editedString) + editedString = re.sub('}', '}"', editedString) + temp_file.seek(0) + temp_file.write(editedString) + with open(filename, 'r') as toml_file: toml_load_data = toml.load(toml_file) if not self.schema_is_set(): for tableName, tableData in toml_load_data.items(): - unit_list = [col + "_units" for col in tableData.keys()] - total_col_list = list(sum(zip(tableData.keys(), unit_list), ())) - self.toml_data[tableName] = OrderedDict((key, []) for key in total_col_list) + self.toml_data[tableName] = OrderedDict((key, []) for key in tableData.keys()) + self.toml_data[tableName + "_units"] = OrderedDict((key, []) for key in tableData.keys()) + self.toml_data["dsi_relations"] = OrderedDict([('primary_key', []), ('foreign_key', [])]) self.pack_header() for tableName, tableData in toml_load_data.items(): row = [] + unit_row = [] for col_name, data in tableData.items(): unit_data = "NULL" - if isinstance(data, list): - unit_data = data[1] - data = self.check_type(data[0]) + if isinstance(data, str) and data[0] == "{" and data[-1] == "}": + data = ast.literal_eval(data) + unit_data = data["units"] + data = data["value"] self.toml_data[tableName][col_name].append(data) - self.toml_data[tableName][col_name + "_units"].append(unit_data) - row.extend([data, unit_data]) - self.add_to_output(row, self.target_table_prefix + "_" + tableName) \ No newline at end of file + if len(self.toml_data[tableName + "_units"][col_name]) < 1: + unit_row.append(unit_data) + self.toml_data[tableName + "_units"][col_name].append(unit_data) + row.append(data) + self.add_to_output(row, self.target_table_prefix + "__" + tableName) + if len(next(iter(self.output_collector[self.target_table_prefix + "__" + tableName + "_units"].values()))) < 1: + self.add_to_output(unit_row, self.target_table_prefix + "__" + tableName + "_units") \ No newline at end of file diff --git a/dsi/plugins/file_writer.py b/dsi/plugins/file_writer.py index 6e3326ad..7f433a01 100644 --- a/dsi/plugins/file_writer.py +++ b/dsi/plugins/file_writer.py @@ -229,8 +229,8 @@ def get_rows(self, collection) -> None: if col_len == None: col_len = len(colData) if isinstance(colData[0], str) == False: - if colName+"_units" in collection[self.table_name].keys() and collection[self.table_name][colName+"_units"][0] != "NULL": - numeric_cols.append((colName + f" ({collection[self.table_name][colName+'_units'][0]})", colData)) + if self.table_name + "_units" in collection.keys() and collection[self.table_name + "_units"][colName][0] != "NULL": + numeric_cols.append((colName + f" ({collection[self.table_name + '_units'][colName][0]})", colData)) else: numeric_cols.append((colName, colData)) From 23332e2feb17fad3ab2702e773ce1ccef81130db Mon Sep 17 00:00:00 2001 From: Vedant P Iyer Date: Thu, 24 Oct 2024 17:22:35 -0600 Subject: [PATCH 14/14] updated quotes error in f-string --- dsi/backends/sqlite.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dsi/backends/sqlite.py b/dsi/backends/sqlite.py index ca6939db..6c571560 100644 --- a/dsi/backends/sqlite.py +++ b/dsi/backends/sqlite.py @@ -152,7 +152,7 @@ def put_artifacts(self, collection, isVerbose=False): key += " PRIMARY KEY" if dsi_name in artifacts.keys() and comboTuple in artifacts[dsi_name]["foreign_key"]: foreignIndex = artifacts[dsi_name]["foreign_key"].index(comboTuple) - foreign_query += f", FOREIGN KEY ({key}) REFERENCES {artifacts[dsi_name]["primary_key"][foreignIndex][0]} ({artifacts[dsi_name]["primary_key"][foreignIndex][1]})" + foreign_query += f", FOREIGN KEY ({key}) REFERENCES {artifacts[dsi_name]['primary_key'][foreignIndex][0]} ({artifacts[dsi_name]['primary_key'][foreignIndex][1]})" types.properties[key.replace('-','_minus_')] = tableData[key]