Add examples for Fastod algorithm

chernishev · Mar 29, 2024 · 3d1cf76 · 3d1cf76
1 parent d3941bb
commit 3d1cf76
Show file tree

Hide file tree

Showing 4 changed files with 341 additions and 0 deletions.
diff --git a/examples/datasets/position_distribution.csv b/examples/datasets/position_distribution.csv
@@ -0,0 +1,7 @@
+year,position,percent
+2020,director,10%
+2020,other,50%
+2020,manager,40%
+2021,manager,35%
+2021,other,55%
+2021,director,10%
diff --git a/examples/datasets/salary.csv b/examples/datasets/salary.csv
@@ -0,0 +1,9 @@
+year,employee_grade,avg_salary
+2020,24,1000
+2020,40,7000
+2020,32,5000
+2020,29,3000
+2020,49,10000
+2021,50,15000
+2021,25,1500
+2021,30,6000
diff --git a/examples/mining_od_fastod_1.py b/examples/mining_od_fastod_1.py
@@ -0,0 +1,132 @@
+import desbordante
+import pandas
+from tabulate import tabulate
+
+# Relative path (resolved against the current working directory) of the CSV
+# that is both loaded into the Fastod algorithm and read with pandas below.
+TABLE = 'examples/datasets/salary.csv'
+# Passed to the algorithm's execute() call as its time_limit argument.
+TIME_LIMIT_SECONDS = 3
+
+def print_data_frame(data_frame, title = None):
+    """Print a data frame through print_table with tabulate's 'keys' headers.
+
+    title -- optional caption forwarded to print_table.
+    """
+    print_table(data_frame, headers='keys', title=title)
+
+def print_table(table, headers = None, title = None):
+    """Print `table` rendered by tabulate in the 'psql' format.
+
+    headers -- forwarded unchanged to tabulate's `headers` argument.
+    title   -- when not None, printed on its own line before the table.
+    """
+    if title is not None:
+        print(title)
+
+    rendered = tabulate(table, headers=headers, tablefmt='psql')
+    print(rendered)
+
+def print_attribute_symbols(table):
+    """Print a legend that numbers the items yielded by iterating `table`.
+
+    Numbering starts at 1; one "<item> -- <number>" line is printed per item.
+    """
+    print('Attribute symbols:')
+
+    for number, name in enumerate(table, start=1):
+        print(f'{name} -- {number}')
+
+def print_desc_ods_with_comments(desc_ods):
+    """Print how many descending ODs were found (only the count is shown)."""
+    found = len(desc_ods)
+    print('descending ods:', found)
+
+def print_asc_ods_with_comments(asc_ods, table):
+    """Print the ascending ODs and walk through what the first two of them mean.
+
+    asc_ods -- sequence of ascending ODs; asc_ods[0] and asc_ods[1] are quoted
+               in the explanation, so fewer than two raises IndexError.
+    table   -- pandas data frame with the columns "year", "avg_salary" and
+               "employee_grade"; its rows for the years 2020 and 2021 are
+               shown unsorted and sorted by "avg_salary".
+    """
+    print('ascending ods:', len(asc_ods))
+
+    for od in asc_ods:
+        print(od)
+
+    print()
+    print(f'Dependency "{asc_ods[0]}" means that ordering the table')
+    print('inside each equivalence class from "year" by attribute "avg_salary"')
+    print('automatically entails ordering by attribute "employee_grade".')
+
+    print()
+    print('We have 2 equivalence classes in "year": [2020] and [2021].')
+    print('Let\'s split the table into two tables based on these classes.')
+
+    # (year of the class, ending of the closing remark for that class)
+    walkthrough = ((2020, '.'), (2021, ' too.'))
+
+    for part_number, (year, ending) in enumerate(walkthrough, start=1):
+        # Select the class by its "year" value instead of by row position, so
+        # the split stays correct if the CSV is reordered or gets more rows.
+        table_part = table[table['year'] == year]
+
+        print()
+        print_data_frame(table_part, f'Part {part_number}: this part of table corresponds to class [{year}]')
+
+        print()
+        print('Let\'s sort it by attribute "avg_salary".')
+
+        table_part_sorted = table_part.sort_values('avg_salary')
+
+        print()
+        print_data_frame(table_part_sorted, f'Sorted part {part_number}:')
+
+        print()
+        print('We can see that this sort entails automatic ordering by')
+        print(f'attribute "employee_grade"{ending}')
+
+    print()
+    print(f'Dependency "{asc_ods[1]}" is similar to the first and means that')
+    print('ordering the table inside each equivalence class from "year" by')
+    print('attribute "employee_grade" automatically entails ordering by')
+    print('attribute "avg_salary". This can be seen in the tables above.')
+
+    print()
+    print('In other words, these dependencies indicate that the ordering of')
+    print('average salary entails an automatic ordering of the employee grade')
+    print('and vice versa.')
+
+def print_simple_ods_with_comments(simple_ods, table):
+    """Print the simple ODs and explain the first one on "employee_grade".
+
+    simple_ods -- sequence of simple ODs; simple_ods[0] is quoted in the
+                  explanation, so an empty sequence raises IndexError.
+    table      -- mapping-like object (e.g. a pandas data frame) whose
+                  "employee_grade" entry can be iterated.
+    """
+    print('simple ods:', len(simple_ods))
+
+    for od in simple_ods:
+        print(od)
+
+    print()
+    print('These dependencies mean that inside each equivalence class from')
+    print('an attribute from their context the constancy of the attribute')
+    print('from the right side of the dependency can be traced.')
+
+    # An equivalence class is a distinct value, so keep each value once;
+    # dict.fromkeys preserves the order in which values first appear.
+    distinct_grades = list(dict.fromkeys(table['employee_grade']))
+    employee_grade_classes_str = ', '.join(f'[{grade}]' for grade in distinct_grades)
+
+    print()
+    print(f'For example, let\'s look at "{simple_ods[0]}". The context of this')
+    # The class count is derived from the data rather than hard-coded.
+    print(f'dependency is attribute "employee_grade". We have {len(distinct_grades)} equivalence classes')
+    print(f'in "employee_grade": {employee_grade_classes_str}.')
+    print('Since all the elements of attribute "employee_grade" are different,')
+    print('each of these classes contains only one element, so constancy within')
+    print('each class occurs automatically.')
+
+    print()
+    print('To better understand such dependencies, refer to the second example.')
+
+def main():
+    """Mine ODs from TABLE with Fastod and print the commented results."""
+    fastod = desbordante.od.algorithms.Fastod()
+    fastod.load_data(table=(TABLE, ',', True))
+    fastod.execute(time_limit=TIME_LIMIT_SECONDS)
+
+    ascending = fastod.get_asc_ods()
+    descending = fastod.get_desc_ods()
+    simple = fastod.get_simple_ods()
+
+    # The same CSV is read again with pandas for display and explanations.
+    data_frame = pandas.read_csv(TABLE)
+
+    print_data_frame(data_frame)
+    print()
+    print_attribute_symbols(data_frame)
+    print()
+    print_desc_ods_with_comments(descending)
+    print()
+    print_asc_ods_with_comments(ascending, data_frame)
+    print()
+    print_simple_ods_with_comments(simple, data_frame)
+
+if __name__ == '__main__':
+    main()
diff --git a/examples/mining_od_fastod_2.py b/examples/mining_od_fastod_2.py
@@ -0,0 +1,193 @@
+import desbordante
+import pandas
+from tabulate import tabulate
+
+# Relative path (resolved against the current working directory) of the CSV
+# that is both loaded into the Fastod algorithm and read with pandas below.
+TABLE = 'examples/datasets/position_distribution.csv'
+# Passed to the algorithm's execute() call as its time_limit argument.
+TIME_LIMIT_SECONDS = 3
+
+# Terminal escape sequences keyed by color name; each value is the ESC
+# character followed by '[<parameters>m'. 'default' is the sequence that
+# make_text_colored appends after the colored text.
+COLOR_CODES = {
+    name: f'\033[{parameters}m'
+    for name, parameters in (
+        ('bold_underline_red', '1;4;31'),
+        ('bold_red', '1;31'),
+        ('bold_green', '1;32'),
+        ('bold_yellow', '1;33'),
+        ('bold_blue', '1;34'),
+        ('bold_magenta', '1;35'),
+        ('bold_cyan', '1;36'),
+        ('bold_white', '1;37'),
+        ('red', '31'),
+        ('green', '32'),
+        ('yellow', '33'),
+        ('blue', '34'),
+        ('magenta', '35'),
+        ('cyan', '36'),
+        ('white', '37'),
+        ('default', '0'),
+    )
+}
+
+def make_text_colored(text, color):
+    """Return `text` wrapped in the escape code for `color` and the 'default' code.
+
+    color -- a key of COLOR_CODES; an unknown key raises KeyError.
+    """
+    start = COLOR_CODES[color]
+    return f'{start}{text}{COLOR_CODES["default"]}'
+
+def print_data_frame(data_frame, title = None):
+    """Print a data frame through print_table with tabulate's 'keys' headers.
+
+    title -- optional caption forwarded to print_table.
+    """
+    print_table(data_frame, headers='keys', title=title)
+
+def print_table(table, headers = None, title = None):
+    """Print `table` rendered by tabulate in the 'psql' format.
+
+    headers -- forwarded unchanged to tabulate's `headers` argument.
+    title   -- when not None, printed on its own line before the table.
+    """
+    if title is not None:
+        print(title)
+
+    rendered = tabulate(table, headers=headers, tablefmt='psql')
+    print(rendered)
+
+def print_attribute_symbols(table):
+    """Print a legend that numbers the items yielded by iterating `table`.
+
+    Numbering starts at 1; one "<item> -- <number>" line is printed per item.
+    """
+    print('Attribute symbols:')
+
+    for number, name in enumerate(table, start=1):
+        print(f'{name} -- {number}')
+
+def print_desc_ods_with_comments(desc_ods):
+    """Print how many descending ODs were found (only the count is shown)."""
+    found = len(desc_ods)
+    print('descending ods:', found)
+
+def print_asc_ods_with_comments(asc_ods, table):
+    """Print the ascending ODs, then explain the first two on the sorted table.
+
+    asc_ods -- sequence of ascending ODs; elements 0 and 1 are quoted in the
+               explanation.
+    table   -- pandas data frame; it is shown sorted by its "percent" column.
+    """
+    print('ascending ods:', len(asc_ods))
+
+    for dependency in asc_ods:
+        print(dependency)
+
+    print()
+    print(f'Dependency "{asc_ods[0]}" means that ordering the table by attribute\n'
+          '"percent" automatically entails ordering by attribute "position".\n'
+          'Moreover, this is observed regardless of other attributes, since the\n'
+          'dependency context is empty.')
+
+    print()
+    print('Let\'s sort it by attribute "percent".')
+
+    ordered_by_percent = table.sort_values('percent')
+
+    print()
+    print_data_frame(ordered_by_percent, 'Sorted table:')
+
+    print()
+    print('We can see that this sort entails automatic ordering by attribute\n'
+          '"position".')
+
+    print()
+    print(f'Dependency "{asc_ods[1]}" is similar to the first and means that\n'
+          'ordering the table by attribute "position" automatically entails\n'
+          'ordering by attribute "percent". This can be seen in the table above.')
+
+    print()
+    print('In other words, these dependencies indicate that the ordering of\n'
+          'percents entails an automatic ordering of the positions and vice\n'
+          'versa.')
+
+def _color_rows_by_class(rows, class_keys, palette):
+    """Return `rows` with every cell colored according to the row's class.
+
+    rows       -- list of rows, each a list of strings.
+    class_keys -- one hashable key per row; rows with equal keys share a color.
+    palette    -- COLOR_CODES names, handed out to classes in first-seen order
+                  and reused cyclically if there are more classes than colors.
+    """
+    class_colors = {}
+    colored_rows = []
+
+    for row, key in zip(rows, class_keys):
+        if key not in class_colors:
+            class_colors[key] = palette[len(class_colors) % len(palette)]
+        colored_rows.append([make_text_colored(cell, class_colors[key]) for cell in row])
+
+    return colored_rows
+
+def print_simple_ods_with_comments(simple_ods, table):
+    """Print the simple ODs and illustrate the first two on colored tables.
+
+    simple_ods -- sequence of simple ODs; elements 0 and 1 are quoted in the
+                  explanation.
+    table      -- pandas data frame with the columns "year", "position" and
+                  "percent" (in that order: the third cell of the first row is
+                  overwritten to build the violating example).
+    """
+    print('simple ods:', len(simple_ods))
+
+    for od in simple_ods:
+        print(od)
+
+    print()
+    print(f'Dependency "{simple_ods[0]}" means that inside each equivalence')
+    print('class from "percent" the constancy of the attribute "position" can')
+    print('be traced.')
+
+    # Count class sizes in one pass. A dict keeps first-seen order, so the
+    # listing below is identical on every run (iterating a set of strings is
+    # not, because string hashing is randomized per process).
+    class_sizes = {}
+    for value in table['percent']:
+        class_sizes[value] = class_sizes.get(value, 0) + 1
+
+    print()
+    print(f'We have {len(class_sizes)} equivalence classes in "percent":')
+
+    for value, size in class_sizes.items():
+        print(f'class [{value}] with {size} element{"" if size == 1 else "s"}')
+
+    print()
+    print('This table shows the constancy of values from attribute "position"')
+    print('within each equivalence class from "percent". For clarity, lines')
+    print('containing different equivalence classes are colored differently.')
+
+    headers = list(table)
+    plain_rows = [[str(cell) for cell in row] for row in table.values]
+
+    # Rows are colored by their actual "percent" class, so equal percents
+    # always share a color regardless of where the rows sit in the table.
+    table1_rows = _color_rows_by_class(
+        plain_rows, table['percent'],
+        ['bold_red', 'green', 'yellow', 'blue', 'magenta'])
+
+    print()
+    print_table(table1_rows, headers)
+
+    print()
+    print(f'Dependency "{simple_ods[1]}" contains 2 attributes ("year" and')
+    print('"position") in its context and means the following: in the context')
+    print('of one year and one position the constancy of percents is observed.')
+    print('That is, in those tuples in which the year and position are the same,')
+    print('the same percent value is observed.')
+
+    print()
+    print('The following table shows these observations.')
+
+    # Here a class is a distinct (year, position) pair.
+    table2_rows = _color_rows_by_class(
+        plain_rows, zip(table['year'], table['position']),
+        ['red', 'green', 'yellow', 'blue', 'magenta', 'cyan'])
+
+    print()
+    print_table(table2_rows, headers)
+
+    print()
+    print('Consider the following two tables. In the first, dependency')
+    print(f'"{simple_ods[1]}" continues to exist. But in the second one no')
+    print('longer exists, since it is violated in third tuple, where the pair')
+    print('(2020, director) corresponds to 20%.')
+
+    # Two extra copies of the first tuple keep the dependency intact.
+    table3_rows = [table2_rows[0], table2_rows[0]] + table2_rows
+
+    print()
+    print_table(table3_rows, headers, f'Dependency "{simple_ods[1]}" continues to exist:')
+
+    # Replace the third tuple with a copy of the first row whose percent
+    # differs, which breaks the dependency.
+    error_tuple = [str(cell) for cell in table.values[0]]
+    error_tuple[2] = '20%'
+    error_tuple = [make_text_colored(cell, 'bold_underline_red') for cell in error_tuple]
+
+    table4_rows = list(table3_rows)
+    table4_rows[2] = error_tuple
+
+    print()
+    print_table(table4_rows, headers, f'Dependency "{simple_ods[1]}" no longer exists:')
+
+def main():
+    """Mine ODs from TABLE with Fastod and print the commented results."""
+    fastod = desbordante.od.algorithms.Fastod()
+    fastod.load_data(table=(TABLE, ',', True))
+    fastod.execute(time_limit=TIME_LIMIT_SECONDS)
+
+    ascending = fastod.get_asc_ods()
+    descending = fastod.get_desc_ods()
+    simple = fastod.get_simple_ods()
+
+    # The same CSV is read again with pandas for display and explanations.
+    data_frame = pandas.read_csv(TABLE)
+
+    print_data_frame(data_frame)
+    print()
+    print_attribute_symbols(data_frame)
+    print()
+    print_desc_ods_with_comments(descending)
+    print()
+    print_asc_ods_with_comments(ascending, data_frame)
+    print()
+    print_simple_ods_with_comments(simple, data_frame)
+
+if __name__ == '__main__':
+    main()