Skip to content

Commit

Permalink
Add CLI for INDs
Browse files Browse the repository at this point in the history
Add command-line interface for Spider and Faida
inclusion dependency detection algorithms
  • Loading branch information
p-senichenkov authored and chernishev committed Apr 2, 2024
1 parent 3de5147 commit d0a916c
Show file tree
Hide file tree
Showing 2 changed files with 120 additions and 13 deletions.
125 changes: 114 additions & 11 deletions cli/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from enum import StrEnum, auto
from time import process_time
from typing import Any, Callable
from os import scandir

import click
import desbordante
Expand All @@ -15,6 +16,7 @@ class Task(StrEnum):
afd = auto()
od = auto()
pfd = auto()
ind = auto()
fd_verification = auto()
afd_verification = auto()
mfd_verification = auto()
Expand All @@ -34,6 +36,8 @@ class Algorithm(StrEnum):
aid = auto()
fastod = auto()
order = auto()
spider = auto()
faida = auto()
naive_fd_verifier = auto()
naive_afd_verifier = auto()
icde09_mfd_verifier = auto()
Expand All @@ -47,6 +51,9 @@ class Algorithm(StrEnum):
VERBOSE = 'verbose'
ERROR = 'error'
ERROR_MEASURE = 'error_measure'
TABLES = 'tables'
TABLES_LIST = 'tables_list'
TABLES_DIRECTORY = 'tables_directory'

PRIMARY_HELP = '''The Desbordante data profiler is designed to help users
discover or verify various types of patterns in data. These patterns are
Expand Down Expand Up @@ -102,9 +109,10 @@ class Algorithm(StrEnum):
2) Discovery of approximate functional dependencies
3) Discovery of probabilistic functional dependencies
4) Discovery of exact order dependencies (set-based and list-based axiomatization)
5) Verification of exact functional dependencies
6) Verification of approximate functional dependencies
7) Verification of metric dependencies
5) Discovery of inclusion dependencies
6) Verification of exact functional dependencies
7) Verification of approximate functional dependencies
8) Verification of metric dependencies
If you need other types, you should look into the C++ code, the Python
bindings or the Web version.
Expand All @@ -117,7 +125,9 @@ class Algorithm(StrEnum):
specify the algorithm to run, e.g., PYRO
--table=TABLE
specify the input file to be processed by the algorithm
specify the input file to be processed by the algorithm.
Algorithms for some tasks (currently, only IND) accept multiple
input files; see --task=TASK for more information
--is_null_equal_null=BOOLEAN
specify whether two NULLs should be considered equal
Expand Down Expand Up @@ -165,6 +175,28 @@ class Algorithm(StrEnum):
Algorithms: PFDTANE
Default: PFDTANE
'''
# Help page for the inclusion dependency (IND) discovery task. It also
# documents the three ways of passing multiple input tables.
IND_HELP = '''Discover inclusion dependencies. For more information about
inclusion dependencies, refer to the "Inclusion Dependency Discovery: An
Experimental Evaluation of Thirteen Algorithms" by Falco Dürsch et al.
Algorithms for this task accept multiple input files. You can use one of the
following options:
--tables=TABLE
specify input files to be processed by the algorithm.
For multiple values, specify multiple times
(e.g., --tables=TABLE_1 --tables=TABLE_2)
--tables_list=FILENAME
specify file with list of input files (one on a line).
You can type --tables_list=- to use stdin
--tables_directory=FILENAME, STRING, BOOLEAN
specify directory with input files.
separator and has_header are applied to all tables
Algorithms: SPIDER, FAIDA
Default: SPIDER
'''
FD_VERIFICATION_HELP = '''Verify whether a given exact functional dependency
holds on the specified dataset. For more information about the primitive and
algorithms, refer to the “Functional dependency discovery: an experimental
Expand Down Expand Up @@ -244,6 +276,16 @@ class Algorithm(StrEnum):
“Approximate Discovery of Functional Dependencies for Large Datasets” paper
by T.Bleifus et al.
'''
# Help text describing the SPIDER algorithm (associated with Algorithm.spider
# in the algorithm help mapping of this file).
SPIDER_HELP = '''A disk-backed unary inclusion dependency mining algorithm.
For more information, refer to "Efficiently detecting inclusion dependencies"
by J. Bauckmann et al.
'''
# Help text describing the FAIDA algorithm (associated with Algorithm.faida
# in the algorithm help mapping of this file).
FAIDA_HELP = '''Both unary and n-ary inclusion dependency mining algorithm.
Unlike all other algorithms, it is approximate, i.e. it can
miss some dependencies or produce non-valid ones. In exchange,
it is significantly faster. For more information, refer to "Fast approximate
discovery of inclusion dependencies" by S. Kruse et al.
'''
FASTOD_HELP = '''A modern algorithm for discovery of canonical order
dependencies. For more information, refer to the “Effective and complete
discovery of order dependencies via set-based axiomatization” paper by
Expand Down Expand Up @@ -285,6 +327,7 @@ class Algorithm(StrEnum):
Task.afd: AFD_HELP,
Task.od: OD_HELP,
Task.pfd: PFD_HELP,
Task.ind: IND_HELP,
Task.fd_verification: FD_VERIFICATION_HELP,
Task.afd_verification: AFD_VERIFICATION_HELP,
Task.mfd_verification: MFD_VERIFICATION_HELP
Expand All @@ -304,6 +347,8 @@ class Algorithm(StrEnum):
Algorithm.aid: AID_HELP,
Algorithm.fastod: FASTOD_HELP,
Algorithm.order: ORDER_HELP,
Algorithm.spider: SPIDER_HELP,
Algorithm.faida: FAIDA_HELP,
Algorithm.naive_fd_verifier: NAIVE_FD_VERIFIER_HELP,
Algorithm.naive_afd_verifier: NAIVE_AFD_VERIFIER_HELP,
Algorithm.icde09_mfd_verifier: ICDE09_MFD_VERIFIER_HELP
Expand All @@ -322,6 +367,8 @@ class Algorithm(StrEnum):
Task.od: TaskInfo([Algorithm.fastod, Algorithm.order],
Algorithm.fastod),
Task.pfd: TaskInfo([Algorithm.pfdtane], Algorithm.pfdtane),
Task.ind: TaskInfo([Algorithm.spider, Algorithm.faida],
Algorithm.spider),
Task.fd_verification: TaskInfo([Algorithm.naive_fd_verifier],
Algorithm.naive_fd_verifier),
Task.afd_verification: TaskInfo([Algorithm.naive_afd_verifier],
Expand All @@ -344,6 +391,8 @@ class Algorithm(StrEnum):
Algorithm.aid: desbordante.fd.algorithms.Aid,
Algorithm.fastod: desbordante.od.algorithms.Fastod,
Algorithm.order: desbordante.od.algorithms.Order,
Algorithm.spider: desbordante.ind.algorithms.Spider,
Algorithm.faida: desbordante.ind.algorithms.Faida,
Algorithm.naive_fd_verifier: desbordante.fd_verification.algorithms.FDVerifier,
Algorithm.naive_afd_verifier: desbordante.afd_verification.algorithms.FDVerifier,
Algorithm.icde09_mfd_verifier: desbordante.mfd_verification.algorithms.MetricVerifier
Expand Down Expand Up @@ -393,6 +442,35 @@ def check_error_measure_option_presence(task: str | None, error_measure: str | N
sys.exit(1)


def parse_tables_list_file(file: click.File) \
        -> list[tuple[str, str, bool]]:
    """Parse a table-list file into (filename, separator, has_header) tuples.

    Every non-blank line must hold three whitespace-separated fields: the
    table path, the column separator and a boolean has-header flag. Lines are
    split from the right, so the path itself may contain whitespace.

    The flag is matched case-insensitively against true/t/yes/y/on/1 and
    false/f/no/n/off/0. Calling ``bool()`` on the raw text is not an option:
    it is True for every non-empty string, including "False" and "0".

    Blank lines (e.g. a trailing newline at the end of the file) are skipped.
    On a malformed line, an unrecognised flag or an I/O error, a message is
    echoed and the process exits with status 1.
    """
    true_words = frozenset(('true', 't', 'yes', 'y', 'on', '1'))
    false_words = frozenset(('false', 'f', 'no', 'n', 'off', '0'))
    try:
        result = []
        # Enumerate every physical line so reported line numbers stay
        # accurate even when blank lines are skipped.
        for line_num, line in enumerate(file, start=1):
            if not line.strip():
                continue
            table_tuple = line.rsplit(maxsplit=2)
            if len(table_tuple) != 3:
                click.echo(
                    f'ERROR: Invalid format of table description on line {line_num}: {line}')
                sys.exit(1)
            filename, separator, has_header_str = table_tuple
            flag = has_header_str.lower()
            if flag in true_words:
                has_header = True
            elif flag in false_words:
                has_header = False
            else:
                click.echo(
                    f'ERROR: Invalid has_header value on line {line_num}: {has_header_str}')
                sys.exit(1)
            result.append((filename, separator, has_header))
        return result
    except OSError as exc:
        click.echo(exc)
        sys.exit(1)


def parse_tables_directory(tp: tuple[click.Path, str, bool]) \
        -> list[tuple[str, str, bool]]:
    """Expand a (directory, separator, has_header) triple into table tuples.

    Returns one ``(path, separator, has_header)`` tuple per regular file found
    directly inside the directory; the same separator and has_header are
    applied to every file. Subdirectories and other non-file entries are
    ignored, since they cannot be read as tables. Paths are sorted so the
    table order does not depend on the filesystem's listing order.

    On an I/O error the exception is echoed and the process exits with
    status 1.
    """
    dir_name, separator, has_header = tp
    try:
        # The context manager closes the scandir iterator deterministically
        # instead of leaving the directory handle to the garbage collector.
        with scandir(dir_name) as entries:
            paths = sorted(entry.path for entry in entries if entry.is_file())
    except OSError as exc:
        click.echo(exc)
        sys.exit(1)
    return [(path, separator, has_header) for path in paths]


def is_omitted(value: Any) -> bool:
    """Tell whether an option value counts as "not provided".

    A value is treated as omitted when it is None or compares equal to the
    empty tuple.
    """
    if value is None:
        return True
    return value == ()

Expand Down Expand Up @@ -441,6 +519,8 @@ def get_algo_result(algo: desbordante.Algorithm, algo_name: str) -> Any:
result = algo.get_asc_ods() + algo.get_desc_ods() + algo.get_simple_ods()
case Algorithm.order:
result = algo.get_list_ods()
case algo_name if algo_name in TASK_INFO[Task.ind].algos:
result = algo.get_inds()
case _:
assert False, 'No matching get_result function.'
return result
Expand Down Expand Up @@ -478,7 +558,10 @@ def print_result(result: Any, filename: str | None) -> None:


def print_unused_opts(used_opts: set, provided_opts: set) -> None:
unused_opts = provided_opts - (used_opts | {TASK, ALGO, VERBOSE, FILENAME})
unused_opts = provided_opts - (used_opts | {TASK, ALGO, VERBOSE, FILENAME} |
({TABLES_LIST, TABLES_DIRECTORY}
if TABLES in used_opts
else set()))
if unused_opts:
click.echo(f'Unused options: {unused_opts}')

Expand All @@ -496,7 +579,7 @@ def print_algo_help_page(algo_name: str) -> None:
algo = ALGOS[Algorithm(algo_name)]()
help_info = ''
for opt in algo.get_possible_options():
if opt not in ('table', 'is_null_equal_null'):
if opt not in ('table', TABLES, 'is_null_equal_null'):
help_info += get_option_help_info(opt, algo)
click.echo(f'{ALGO_HELP_PAGES[Algorithm(algo_name)]}{help_info}')

Expand Down Expand Up @@ -534,6 +617,18 @@ def get_option_type_info() -> dict[str, Any]:
return option_type_info


def process_tables_options(opts: dict[str, Any], algo_name: str) -> dict[str, Any]:
    """Fold the table-list and table-directory options into the tables option.

    Works on a shallow copy of ``opts``. The TABLES_LIST and TABLES_DIRECTORY
    entries are removed from the copy; for each one that was actually
    provided, its parsed table tuples are appended to the TABLES entry, which
    becomes a list. ``algo_name`` is accepted but not used here.

    Returns the adjusted copy; ``opts`` itself is left untouched.
    """
    merged = opts.copy()
    parsers = {
        TABLES_LIST: parse_tables_list_file,
        TABLES_DIRECTORY: parse_tables_directory,
    }

    for source_opt, parse in parsers.items():
        raw = merged.pop(source_opt)
        if is_omitted(raw):
            continue
        tables = list(merged[TABLES])
        tables += parse(raw)
        merged[TABLES] = tables

    return merged


def algos_options() -> Callable:
option_type_info = get_option_type_info()

Expand All @@ -542,11 +637,14 @@ def decorator(func: Callable) -> Callable:
in option_type_info.items():
arg = f'--{opt_name}'
if opt_main_type == list:
click.option(arg, multiple=True,
if opt_additional_types[0] == desbordante.data_types.Table:
click.option(arg, type=(str, str, bool),
multiple=True)(func)
else:
click.option(arg, multiple=True,
type=opt_additional_types[0])(func)
elif opt_main_type == desbordante.data_types.Table:
click.option(arg, type=(str, str, bool),
required=True)(func)
click.option(arg, type=(str, str, bool))(func)
else:
click.option(arg, type=opt_main_type)(func)
return func
Expand All @@ -568,6 +666,9 @@ def decorator(func: Callable) -> Callable:
callback=get_algorithm, is_eager=True)
@click.option(f'--{FILENAME}', type=str)
@click.option(f'--{VERBOSE}', is_flag=True)
@click.option(f'--{TABLES_LIST}', type=click.File('r'))
@click.option(f'--{TABLES_DIRECTORY}', type=(click.Path(exists=True, file_okay=False,
dir_okay=True, resolve_path=True, allow_dash=False), str, bool))
@algos_options()
def desbordante_cli(**kwargs: Any) -> None:
"""Takes in options from console as a dictionary, sets these options
Expand All @@ -585,10 +686,12 @@ def desbordante_cli(**kwargs: Any) -> None:
check_error_option_presence(curr_task, error_opt)
check_error_measure_option_presence(curr_task, error_measure_opt)

opts = process_tables_options(kwargs, curr_algo_name)

start_point = process_time()
used_opts = set_algo_options(curr_algo, kwargs)
used_opts = set_algo_options(curr_algo, opts)
curr_algo.load_data()
used_opts |= set_algo_options(curr_algo, kwargs)
used_opts |= set_algo_options(curr_algo, opts)
provided_options = get_provided_options(kwargs)
print_unused_opts(used_opts, set(provided_options.keys()))
result = get_algo_result(curr_algo, curr_algo_name)
Expand Down
8 changes: 6 additions & 2 deletions src/core/config/descriptions.h
Original file line number Diff line number Diff line change
Expand Up @@ -86,8 +86,12 @@ constexpr auto kDBumpsLimit = "max considered intervals amount. Pass 0 to remove
constexpr auto kDTimeLimitSeconds = "max running time of the algorithm. Pass 0 to remove limit";
constexpr auto kDIterationsLimit = "limit for iterations of sampling";
constexpr auto kDACSeed = "seed, needed for choosing a data sample";
constexpr auto kDHllAccuracy = "HyperLogLog approximation accuracy";
constexpr auto kDSampleSize = "Size of a table sample";
constexpr auto kDHllAccuracy =
"HyperLogLog approximation accuracy. Must be positive\n"
"Closer to 0 - higher accuracy, more memory needed and slower the algorithm.\n";
constexpr auto kDSampleSize =
"Size of a table sample. Greater value - more correct answers, but higher memory "
"consumption.\n Applies to all tables";
constexpr auto kDFindNary = "Detect n-ary inclusion dependencies [true|false]";
constexpr auto kDIgnoreNullCols =
"Ignore INDs which contain columns filled only with NULLs. May increase "
Expand Down

0 comments on commit d0a916c

Please sign in to comment.