Skip to content

Commit

Permalink
Add examples for Fastod algorithm
Browse files Browse the repository at this point in the history
  • Loading branch information
yakovypg authored and chernishev committed Mar 29, 2024
1 parent d3941bb commit 3d1cf76
Show file tree
Hide file tree
Showing 4 changed files with 341 additions and 0 deletions.
7 changes: 7 additions & 0 deletions examples/datasets/position_distribution.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
year,position,percent
2020,director,10%
2020,other,50%
2020,manager,40%
2021,manager,35%
2021,other,55%
2021,director,10%
9 changes: 9 additions & 0 deletions examples/datasets/salary.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
year,employee_grade,avg_salary
2020,24,1000
2020,40,7000
2020,32,5000
2020,29,3000
2020,49,10000
2021,50,15000
2021,25,1500
2021,30,6000
132 changes: 132 additions & 0 deletions examples/mining_od_fastod_1.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
import desbordante
import pandas
from tabulate import tabulate

# CSV dataset analysed by this example. The path is relative, so it is
# resolved against the current working directory when the script runs.
TABLE = 'examples/datasets/salary.csv'
# Passed as ``time_limit`` to ``Fastod.execute`` (seconds, going by the name).
TIME_LIMIT_SECONDS = 3

def print_data_frame(data_frame, title=None):
    """Print a data frame as a table, optionally preceded by a title line.

    Delegates to ``print_table`` with the ``'keys'`` header mode, so the
    header row is derived from the data frame itself.

    Args:
        data_frame: Tabular object to display (a pandas DataFrame here).
        title: Optional text printed on its own line before the table.
    """
    print_table(data_frame, headers='keys', title=title)

def print_table(table, headers=None, title=None):
    """Print ``table`` in tabulate's ``psql`` format, optionally under a title.

    Args:
        table: Tabular data accepted by ``tabulate`` (data frame, list of
            rows, ...).
        headers: Header specification forwarded to ``tabulate`` (for example
            ``'keys'`` or an explicit list of column names). ``None`` means
            "no header row".
        title: Optional text printed on its own line before the table.
    """
    if title is not None:
        print(title)

    # tabulate documents an empty sequence as its "no headers" default;
    # ``None`` is not a documented value, so translate it before forwarding.
    effective_headers = () if headers is None else headers

    print(tabulate(table, headers=effective_headers, tablefmt='psql'))

def print_attribute_symbols(table):
    """Print a legend that maps every column of ``table`` to a 1-based number.

    Args:
        table: Iterable that yields column names when iterated (iterating a
            pandas DataFrame yields its column labels).
    """
    print('Attribute symbols:')

    # Numbering starts at 1, one line per column in iteration order.
    for number, column in enumerate(table, start=1):
        print(f'{column} -- {number}')

def print_desc_ods_with_comments(desc_ods):
    """Print how many descending order dependencies were found.

    Args:
        desc_ods: Sized collection of descending ODs; only its length is used.
    """
    found = len(desc_ods)
    print('descending ods:', found)

def print_asc_ods_with_comments(asc_ods, table):
    """Print the ascending ODs with a worked explanation on the salary table.

    The walkthrough is written for the salary dataset: it refers to
    ``asc_ods[0]`` and ``asc_ods[1]`` directly, and it splits ``table`` by row
    position (first five rows versus the rest) to form the two "year"
    classes it describes.

    Args:
        asc_ods: Sequence of ascending ODs; at least two entries are indexed.
        table: Data frame with an ``avg_salary`` column, ordered so that the
            first five rows are the first "year" class.
    """
    print('ascending ods:', len(asc_ods))

    for dependency in asc_ods:
        print(dependency)

    print()
    print(f'Dependency "{asc_ods[0]}" means that ordering the table')
    print('inside each equivalence class from "year" by attribute "avg_salary"')
    print('automatically entails ordering by attribute "employee_grade".')

    print()
    print('We have 2 equivalence classes in "year": [2020] and [2021].')
    print('Let\'s split the table into two tables based on these classes.')

    # One entry per "year" class: its rows, the titles of the unsorted and
    # sorted tables, and the last line of the concluding remark.
    walkthrough = (
        (
            table.iloc[:5, :],
            'Part 1: this part of table corresponds to class [2020]',
            'Sorted part 1:',
            'attribute "employee_grade".',
        ),
        (
            table.iloc[5:, :],
            'Part 2: this part of table corresponds to class [2021]',
            'Sorted part 2:',
            'attribute "employee_grade" too.',
        ),
    )

    for part, part_title, sorted_title, closing_line in walkthrough:
        print()
        print_data_frame(part, part_title)

        print()
        print('Let\'s sort it by attribute "avg_salary".')

        print()
        print_data_frame(part.sort_values('avg_salary'), sorted_title)

        print()
        print('We can see that this sort entails automatic ordering by')
        print(closing_line)

    print()
    print(f'Dependency "{asc_ods[1]}" is similar to the first and means that')
    print('ordering the table inside each equivalence class from "year" by')
    print('attribute "employee_grade" automatically entails ordering by')
    print('attribute "avg_salary". This can be seen in the tables above.')

    print()
    print('In other words, these dependencies indicate that the ordering of')
    print('average salary entails an automatic ordering of the employee grade')
    print('and vice versa.')

def print_simple_ods_with_comments(simple_ods, table):
    """Print the simple ODs followed by an explanation of the first one.

    The explanation is written for the salary dataset: it lists every value
    of ``employee_grade`` as its own equivalence class and states a fixed
    class count in the text.

    Args:
        simple_ods: Sequence of simple ODs; the first entry is indexed.
        table: Data frame with an ``employee_grade`` column.
    """
    print('simple ods:', len(simple_ods))

    for dependency in simple_ods:
        print(dependency)

    print()
    print('These dependencies mean that inside each equivalence class from')
    print('an attribute from their context the constancy of the attribute')
    print('from the right side of the dependency can be traced.')

    # Render every grade as "[value]" and join them into one listing.
    grade_listing = ', '.join(
        f'[{grade}]' for grade in table['employee_grade']
    )

    print()
    print(f'For example, let\'s look at "{simple_ods[0]}". The context of this')
    print('dependency is attribute "employee_grade". We have 8 equivalence classes')
    print(f'in "employee_grade": {grade_listing}.')
    print('Since all the elements of attribute "employee_grade" are different,')
    print('each of these classes contains only one element, so constancy within')
    print('each class occurs automatically.')

    print()
    print('To better understand such dependencies, refer to the second example.')

if __name__ == '__main__':
    # Mine order dependencies from the dataset with Fastod.
    # NOTE(review): the tuple passed as ``table`` is presumably
    # (path, separator, has_header) -- confirm against the desbordante docs.
    fastod = desbordante.od.algorithms.Fastod()
    fastod.load_data(table=(TABLE, ',', True))
    fastod.execute(time_limit=TIME_LIMIT_SECONDS)

    ascending_ods = fastod.get_asc_ods()
    descending_ods = fastod.get_desc_ods()
    plain_ods = fastod.get_simple_ods()

    # The same CSV is loaded again with pandas, only for display purposes.
    salary_frame = pandas.read_csv(TABLE)

    print_data_frame(salary_frame)
    print()
    print_attribute_symbols(salary_frame)
    print()
    print_desc_ods_with_comments(descending_ods)
    print()
    print_asc_ods_with_comments(ascending_ods, salary_frame)
    print()
    print_simple_ods_with_comments(plain_ods, salary_frame)
193 changes: 193 additions & 0 deletions examples/mining_od_fastod_2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,193 @@
import desbordante
import pandas
from tabulate import tabulate

# CSV dataset analysed by this example. The path is relative, so it is
# resolved against the current working directory when the script runs.
TABLE = 'examples/datasets/position_distribution.csv'
# Passed as ``time_limit`` to ``Fastod.execute`` (seconds, going by the name).
TIME_LIMIT_SECONDS = 3

# Terminal escape sequences keyed by a readable style name; used by
# make_text_colored to wrap text. '\u001b' and '\033' are two spellings of
# the same ESC character (U+001B), so the entries below are all of one form.
# 'default' is the sequence appended after colored text to end the styling.
COLOR_CODES = {
    'bold_underline_red': '\u001b[1;4;31m',
    'bold_red': '\u001b[1;31m',
    'bold_green': '\033[1;32m',
    'bold_yellow': '\033[1;33m',
    'bold_blue': '\033[1;34m',
    'bold_magenta': '\033[1;35m',
    'bold_cyan': '\033[1;36m',
    'bold_white': '\033[1;37m',
    'red': '\u001b[31m',
    'green': '\033[32m',
    'yellow': '\033[33m',
    'blue': '\033[34m',
    'magenta': '\033[35m',
    'cyan': '\033[36m',
    'white': '\033[37m',
    'default': '\033[0m'
}

def make_text_colored(text, color):
    """Wrap ``text`` in the escape sequence for ``color`` plus a reset.

    Args:
        text: Value to display; it is formatted into the result string.
        color: Key of ``COLOR_CODES`` that selects the style.

    Returns:
        The style sequence, the formatted text and the ``'default'``
        sequence, concatenated in that order.

    Raises:
        KeyError: If ``color`` is not a key of ``COLOR_CODES``.
    """
    style_on = COLOR_CODES[color]
    style_off = COLOR_CODES['default']
    return f'{style_on}{text}{style_off}'

def print_data_frame(data_frame, title=None):
    """Print a data frame as a table, optionally preceded by a title line.

    Delegates to ``print_table`` with the ``'keys'`` header mode, so the
    header row is derived from the data frame itself.

    Args:
        data_frame: Tabular object to display (a pandas DataFrame here).
        title: Optional text printed on its own line before the table.
    """
    print_table(data_frame, headers='keys', title=title)

def print_table(table, headers=None, title=None):
    """Print ``table`` in tabulate's ``psql`` format, optionally under a title.

    Args:
        table: Tabular data accepted by ``tabulate`` (data frame, list of
            rows, ...).
        headers: Header specification forwarded to ``tabulate`` (for example
            ``'keys'`` or an explicit list of column names). ``None`` means
            "no header row".
        title: Optional text printed on its own line before the table.
    """
    if title is not None:
        print(title)

    # tabulate documents an empty sequence as its "no headers" default;
    # ``None`` is not a documented value, so translate it before forwarding.
    effective_headers = () if headers is None else headers

    print(tabulate(table, headers=effective_headers, tablefmt='psql'))

def print_attribute_symbols(table):
    """Print a legend that maps every column of ``table`` to a 1-based number.

    Args:
        table: Iterable that yields column names when iterated (iterating a
            pandas DataFrame yields its column labels).
    """
    print('Attribute symbols:')

    # Numbering starts at 1, one line per column in iteration order.
    for position, column in enumerate(table, start=1):
        print(f'{column} -- {position}')

def print_desc_ods_with_comments(desc_ods):
    """Print how many descending order dependencies were found.

    Args:
        desc_ods: Sized collection of descending ODs; only its length is used.
    """
    found = len(desc_ods)
    print('descending ods:', found)

def print_asc_ods_with_comments(asc_ods, table):
    """Print the ascending ODs with a worked explanation on the whole table.

    The walkthrough is written for the position-distribution dataset: it
    refers to ``asc_ods[0]`` and ``asc_ods[1]`` directly and shows the table
    sorted by its ``percent`` column.

    Args:
        asc_ods: Sequence of ascending ODs; at least two entries are indexed.
        table: Data frame with a ``percent`` column.
    """
    print('ascending ods:', len(asc_ods))

    for dependency in asc_ods:
        print(dependency)

    print()
    print(f'Dependency "{asc_ods[0]}" means that ordering the table by attribute')
    print('"percent" automatically entails ordering by attribute "position".')
    print('Moreover, this is observed regardless of other attributes, since the')
    print('dependency context is empty.')

    print()
    print('Let\'s sort it by attribute "percent".')

    # NOTE(review): the dataset stores values such as "10%", so this is
    # presumably a string sort rather than a numeric one -- confirm.
    ordered_by_percent = table.sort_values('percent')

    print()
    print_data_frame(ordered_by_percent, 'Sorted table:')

    print()
    print('We can see that this sort entails automatic ordering by attribute')
    print('"position".')

    print()
    print(f'Dependency "{asc_ods[1]}" is similar to the first and means that')
    print('ordering the table by attribute "position" automatically entails')
    print('ordering by attribute "percent". This can be seen in the table above.')

    print()
    print('In other words, these dependencies indicate that the ordering of')
    print('percents entails an automatic ordering of the positions and vice')
    print('versa.')

def print_simple_ods_with_comments(simple_ods, table):
    """Print the simple ODs with a worked, color-coded explanation.

    The walkthrough is written for the position-distribution dataset: it
    refers to ``simple_ods[0]`` and ``simple_ods[1]`` directly, colors the
    first six rows of ``table`` by fixed index, and builds two extra demo
    tables (one that keeps the second dependency, one that breaks it).

    Args:
        simple_ods: Sequence of simple ODs; at least two entries are indexed.
        table: Data frame with a ``percent`` column and at least six rows.
    """
    # Function-scope import so the block does not depend on file-level edits.
    from collections import Counter

    print('simple ods:', len(simple_ods))

    for od in simple_ods:
        print(od)

    print()
    print(f'Dependency "{simple_ods[0]}" means that inside each equivalence')
    print('class from "percent" the constancy of the attribute "position" can')
    print('be traced.')

    # Count each class once, in first-seen order. Deduplicating the rendered
    # strings through a set made the printed order vary between runs
    # (str hashes are randomized per process).
    percent_values = list(table['percent'])
    class_sizes = Counter(percent_values)
    percent_classes = [
        f'class [{value}] with {size} element{"" if size == 1 else "s"}'
        for value, size in class_sizes.items()
    ]

    print()
    print('We have 5 equivalence classes in "percent":')

    for c in percent_classes:
        print(c)

    print()
    print('This table shows the constancy of values from attribute "position"')
    print('within each equivalence class from "percent". For clarity, lines')
    print('containing different equivalence classes are colored differently.')

    table1_headers = list(table)
    table1_rows = [[str(cell) for cell in row] for row in table.values]

    # Rows 0 and 5 share a color because they belong to the same "percent"
    # class in the dataset.
    table1_colors = ('bold_red', 'green', 'yellow', 'blue', 'magenta', 'bold_red')
    for index, color in enumerate(table1_colors):
        table1_rows[index] = [
            make_text_colored(cell, color) for cell in table1_rows[index]
        ]

    print()
    print_table(table1_rows, table1_headers)

    print()
    print(f'Dependency "{simple_ods[1]}" contains 2 attributes ("year" and')
    print('"position") in its context and means the following: in the context')
    print('of one year and one position the constancy of percents is observed.')
    print('That is, in those tuples in which the year and position are the same,')
    print('the same percent value is observed.')

    print()
    print('The following table shows these observations.')

    table2_headers = list(table1_headers)
    table2_rows = [[str(cell) for cell in row] for row in table.values]

    # Every row gets its own color here: each (year, position) pair is a
    # separate class in the dataset.
    table2_colors = ('red', 'green', 'yellow', 'blue', 'magenta', 'cyan')
    for index, color in enumerate(table2_colors):
        table2_rows[index] = [
            make_text_colored(cell, color) for cell in table2_rows[index]
        ]

    print()
    print_table(table2_rows, table2_headers)

    print()
    print('Consider the following two tables. In the first, dependency')
    print(f'"{simple_ods[1]}" continues to exist. But in the second one no')
    print('longer exists, since it is violated in third tuple, where the pair')
    print('(2020, director) corresponds to 20%.')

    table3_headers = list(table1_headers)
    # Two extra copies of the first row in front: duplicates of an existing
    # tuple do not break the dependency.
    table3_rows = [table2_rows[0], table2_rows[0], *table2_rows]

    print()
    print_table(table3_rows, table3_headers, f'Dependency "{simple_ods[1]}" continues to exist:')

    table4_headers = list(table1_headers)
    table4_rows = list(table3_rows)

    # Same (year, position) as the first row but a different percent value,
    # which is what violates the dependency.
    error_tuple = [str(cell) for cell in table.values[0]]
    error_tuple[2] = '20%'
    error_tuple = [make_text_colored(cell, 'bold_underline_red') for cell in error_tuple]

    table4_rows[2] = error_tuple

    print()
    print_table(table4_rows, table4_headers, f'Dependency "{simple_ods[1]}" no longer exists:')

if __name__ == '__main__':
    # Mine order dependencies from the dataset with Fastod.
    # NOTE(review): the tuple passed as ``table`` is presumably
    # (path, separator, has_header) -- confirm against the desbordante docs.
    fastod = desbordante.od.algorithms.Fastod()
    fastod.load_data(table=(TABLE, ',', True))
    fastod.execute(time_limit=TIME_LIMIT_SECONDS)

    ascending_ods = fastod.get_asc_ods()
    descending_ods = fastod.get_desc_ods()
    plain_ods = fastod.get_simple_ods()

    # The same CSV is loaded again with pandas, only for display purposes.
    distribution_frame = pandas.read_csv(TABLE)

    print_data_frame(distribution_frame)
    print()
    print_attribute_symbols(distribution_frame)
    print()
    print_desc_ods_with_comments(descending_ods)
    print()
    print_asc_ods_with_comments(ascending_ods, distribution_frame)
    print()
    print_simple_ods_with_comments(plain_ods, distribution_frame)

0 comments on commit 3d1cf76

Please sign in to comment.