Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement set based aod verifier, support aod mining in fastod #468

Open
wants to merge 25 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
4267c23
Add ColumnIndexOption and move there ValidateIndex
polyntsov Sep 28, 2024
31f3c47
Add parameter to allow empty list of indices in IndicesOption
polyntsov Sep 28, 2024
1e76762
Move complex stripped partition swap and create definitions to cpp
polyntsov Sep 28, 2024
20592c8
Refactor Swap in complex stripped partition
polyntsov Sep 28, 2024
4d65979
Introduce od::Ordering enum and use it instead of bool Ascending
polyntsov Sep 28, 2024
f043b22
Introduce partition type for complex stripped partition as create param
polyntsov Sep 28, 2024
b452c28
Accept in CreateAttributeSet any range as list of attributes
polyntsov Sep 29, 2024
789c617
Store DataFrame in ComplexStrippedPartition as raw pointer
polyntsov Sep 29, 2024
0214e5d
Store DataFrame directly as value in Fastod
polyntsov Sep 29, 2024
eef5077
Move nd_verifier's VectorToString to general util and accept any range
polyntsov Sep 29, 2024
7cff09d
Add missing <vector> include to config/option.h
polyntsov Sep 30, 2024
cae91d2
Add method to convert fastod::AttributeSet to vector of column indices
polyntsov Sep 30, 2024
187a778
Add a callback to Option which is called before the option is set
polyntsov Sep 30, 2024
1f901a6
Implement getters for context and cols in canonical ods
polyntsov Sep 30, 2024
cdb4980
Implement a function to load algo data without configuring execute opts
polyntsov Sep 30, 2024
8b24e2f
Introduce is required callback to option
polyntsov Sep 30, 2024
d938c38
Allow absence of non-required options in algo factory
polyntsov Sep 30, 2024
fa7af04
Implement aod verifier and cover it with tests
polyntsov Sep 30, 2024
7d04aea
Implement python bindings to set based aod verifier
polyntsov Sep 30, 2024
60c2927
Implement error parameter for fastod and add tests for aod mining
polyntsov Sep 30, 2024
df33951
Specify in readme that we now support approximate set-based ODs
polyntsov Sep 30, 2024
4c350d0
Implement aod verification python example
polyntsov Sep 30, 2024
56a0f31
Avoid unnecessary copying of partitions in fastod partition cache
polyntsov Sep 30, 2024
a0e7af5
Don't store indices vectors via shared ptr in fastod complex partition
polyntsov Sep 30, 2024
3f6ec49
Fallback to split and swap validation when error is zero in canonical od
polyntsov Oct 1, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 7 additions & 6 deletions src/core/algorithms/od/fastod/fastod.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -148,8 +148,8 @@ void Fastod::ComputeODs() {

CCPut(context, context_cc);

AddCandidates<false>(context, del_attrs);
AddCandidates<true>(context, del_attrs);
AddCandidates<od::Ordering::descending>(context, del_attrs);
AddCandidates<od::Ordering::ascending>(context, del_attrs);
}

size_t delete_index = 0;
Expand Down Expand Up @@ -181,8 +181,8 @@ void Fastod::ComputeODs() {
}
});

CalculateODs<false>(context, del_attrs);
CalculateODs<true>(context, del_attrs);
CalculateODs<od::Ordering::descending>(context, del_attrs);
CalculateODs<od::Ordering::ascending>(context, del_attrs);
}
}

Expand All @@ -193,8 +193,9 @@ void Fastod::PruneLevels() {

for (auto attribute_set_it = context_in_current_level_.begin();
attribute_set_it != context_in_current_level_.end();) {
if (IsEmptySet(CCGet(*attribute_set_it)) && CSGet<true>(*attribute_set_it).empty() &&
CSGet<false>(*attribute_set_it).empty()) {
if (IsEmptySet(CCGet(*attribute_set_it)) &&
CSGet<od::Ordering::ascending>(*attribute_set_it).empty() &&
CSGet<od::Ordering::descending>(*attribute_set_it).empty()) {
context_in_current_level_.erase(attribute_set_it++);
} else {
++attribute_set_it;
Expand Down
40 changes: 20 additions & 20 deletions src/core/algorithms/od/fastod/fastod.h
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Again, a commit description would be nice here.

Original file line number Diff line number Diff line change
Expand Up @@ -68,36 +68,36 @@ class Fastod : public Algorithm {
void CCPut(AttributeSet const& key, AttributeSet attribute_set);
AttributeSet const& CCGet(AttributeSet const& key);

template <bool Ascending>
template <od::Ordering Ordering>
void CSPut(AttributeSet const& key, AttributePair const& value) {
if constexpr (Ascending) {
if constexpr (Ordering == +od::Ordering::ascending) {
cs_asc_[key].emplace(value);
} else {
cs_desc_[key].emplace(value);
}
}

template <bool Ascending>
template <od::Ordering Ordering>
void CSPut(AttributeSet const& key, AttributePair&& value) {
if constexpr (Ascending) {
if constexpr (Ordering == +od::Ordering::ascending) {
cs_asc_[key].emplace(std::move(value));
} else {
cs_desc_[key].emplace(std::move(value));
}
}

template <bool Ascending>
template <od::Ordering Ordering>
std::unordered_set<AttributePair>& CSGet(AttributeSet const& key) {
if constexpr (Ascending) {
if constexpr (Ordering == +od::Ordering::ascending) {
return cs_asc_[key];
} else {
return cs_desc_[key];
}
}

template <bool Ascending>
void AddToResult(fastod::CanonicalOD<Ascending>&& od) {
if constexpr (Ascending) {
template <od::Ordering Ordering>
void AddToResult(fastod::CanonicalOD<Ordering>&& od) {
if constexpr (Ordering == +od::Ordering::ascending) {
Comment on lines -73 to +103
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IMHO it is better to define

constexpr bool is_ascending(od::Ordering order) {
    return order == od::Ordering::ascending;
}

and then use if constexpr (is_ascending(Ordering))
Or maybe even

template <od::Ordering Ordering>
concept Ascending = (Ordering == od::Ordering::ascending);
template <od::Ordering Ordering>
concept Descending = (Ordering == od::Ordering::descending);

and do

    if constexpr (Ascending<Ordering>) {
        cs_asc_[key].emplace(value);
    } else {
        cs_desc_[key].emplace(value);
    }

result_asc_.emplace_back(std::move(od));
} else {
result_desc_.emplace_back(std::move(od));
Expand All @@ -108,31 +108,31 @@ class Fastod : public Algorithm {
result_simple_.emplace_back(std::move(od));
}

template <bool Ascending>
template <od::Ordering Ordering>
void AddCandidates(AttributeSet const& context,
std::vector<AttributeSet> const& deleted_attrs) {
if (level_ == 2) {
for (model::ColumnIndex i = 0; i < data_->GetColumnCount(); i++) {
for (model::ColumnIndex j = 0; j < data_->GetColumnCount(); j++) {
if (i == j) continue;
CSPut<Ascending>(fastod::CreateAttributeSet({i, j}, data_->GetColumnCount()),
AttributePair(i, j));
CSPut<Ordering>(fastod::CreateAttributeSet({i, j}, data_->GetColumnCount()),
AttributePair(i, j));
}
}
} else if (level_ > 2) {
context.Iterate([this, &deleted_attrs, &context](model::ColumnIndex attr) {
auto const& candidates = CSGet<Ascending>(deleted_attrs[attr]);
auto const& candidates = CSGet<Ordering>(deleted_attrs[attr]);

for (AttributePair const& attribute_pair : candidates) {
const AttributeSet context_delete_ab = fastod::DeleteAttribute(
AttributeSet const context_delete_ab = fastod::DeleteAttribute(
deleted_attrs[attribute_pair.left], attribute_pair.right);

bool add_context = true;

context_delete_ab.Iterate([this, &deleted_attrs, &attribute_pair,
&add_context](model::ColumnIndex attr) {
std::unordered_set<AttributePair> const& cs =
CSGet<Ascending>(deleted_attrs[attr]);
CSGet<Ordering>(deleted_attrs[attr]);

if (cs.find(attribute_pair) == cs.end()) {
add_context = false;
Expand All @@ -141,25 +141,25 @@ class Fastod : public Algorithm {
});

if (add_context) {
CSPut<Ascending>(context, attribute_pair);
CSPut<Ordering>(context, attribute_pair);
}
}
});
}
}

template <bool Ascending>
template <od::Ordering Ordering>
void CalculateODs(AttributeSet const& context, std::vector<AttributeSet> const& deleted_attrs) {
auto& cs_for_con = CSGet<Ascending>(context);
auto& cs_for_con = CSGet<Ordering>(context);

for (auto it = cs_for_con.begin(); it != cs_for_con.end();) {
model::ColumnIndex a = it->left;
model::ColumnIndex b = it->right;

if (ContainsAttribute(CCGet(deleted_attrs[b]), a) &&
ContainsAttribute(CCGet(deleted_attrs[a]), b)) {
fastod::CanonicalOD<Ascending> od(fastod::DeleteAttribute(deleted_attrs[a], b), a,
b);
fastod::CanonicalOD<Ordering> od(fastod::DeleteAttribute(deleted_attrs[a], b), a,
b);

if (od.IsValid(data_, partition_cache_)) {
AddToResult(std::move(od));
Expand Down
43 changes: 25 additions & 18 deletions src/core/algorithms/od/fastod/model/canonical_od.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,22 +4,23 @@

namespace algos::fastod {

template <bool Ascending>
CanonicalOD<Ascending>::CanonicalOD(AttributeSet const& context, model::ColumnIndex left,
model::ColumnIndex right)
template <od::Ordering Ordering>
CanonicalOD<Ordering>::CanonicalOD(AttributeSet const& context, model::ColumnIndex left,
model::ColumnIndex right)
: context_(std::move(context)), ap_(left, right) {}

template <bool Ascending>
bool CanonicalOD<Ascending>::IsValid(std::shared_ptr<DataFrame> data, PartitionCache& cache) const {
return !(cache.GetStrippedPartition(context_, data).Swap<Ascending>(ap_.left, ap_.right));
template <od::Ordering Ordering>
bool CanonicalOD<Ordering>::IsValid(std::shared_ptr<DataFrame> data, PartitionCache& cache) const {
return !(cache.GetStrippedPartition(context_, data).Swap<Ordering>(ap_.left, ap_.right));
}

template <bool Ascending>
std::string CanonicalOD<Ascending>::ToString() const {
template <od::Ordering Ordering>
std::string CanonicalOD<Ordering>::ToString() const {
std::stringstream result;

result << context_.ToString() << " : " << ap_.left + 1 << (Ascending ? "<=" : ">=") << " ~ "
<< ap_.right + 1 << "<=";
result << context_.ToString() << " : " << ap_.left + 1
<< ((Ordering == +od::Ordering::ascending) ? "<=" : ">=") << " ~ " << ap_.right + 1
<< "<=";

return result.str();
}
Expand All @@ -40,31 +41,37 @@ std::string SimpleCanonicalOD::ToString() const {
return result.str();
}

bool operator==(CanonicalOD<true> const& x, CanonicalOD<true> const& y) {
bool operator==(CanonicalOD<od::Ordering::ascending> const& x,
CanonicalOD<od::Ordering::ascending> const& y) {
return x.context_ == y.context_ && x.ap_ == y.ap_;
}

bool operator!=(CanonicalOD<true> const& x, CanonicalOD<true> const& y) {
bool operator!=(CanonicalOD<od::Ordering::ascending> const& x,
CanonicalOD<od::Ordering::ascending> const& y) {
return !(x == y);
}

bool operator<(CanonicalOD<true> const& x, CanonicalOD<true> const& y) {
bool operator<(CanonicalOD<od::Ordering::ascending> const& x,
CanonicalOD<od::Ordering::ascending> const& y) {
if (x.ap_ != y.ap_) {
return x.ap_ < y.ap_;
}

return x.context_ < y.context_;
}

bool operator==(CanonicalOD<false> const& x, CanonicalOD<false> const& y) {
bool operator==(CanonicalOD<od::Ordering::descending> const& x,
CanonicalOD<od::Ordering::descending> const& y) {
return x.context_ == y.context_ && x.ap_ == y.ap_;
}

bool operator!=(CanonicalOD<false> const& x, CanonicalOD<false> const& y) {
bool operator!=(CanonicalOD<od::Ordering::descending> const& x,
CanonicalOD<od::Ordering::descending> const& y) {
return !(x == y);
}

bool operator<(CanonicalOD<false> const& x, CanonicalOD<false> const& y) {
bool operator<(CanonicalOD<od::Ordering::descending> const& x,
CanonicalOD<od::Ordering::descending> const& y) {
Comment on lines +68 to +98
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Use AscCanonicalOD and DescCanonicalOD here

if (x.ap_ != y.ap_) {
return x.ap_ < y.ap_;
}
Expand All @@ -88,7 +95,7 @@ bool operator<(SimpleCanonicalOD const& x, SimpleCanonicalOD const& y) {
return x.context_ < y.context_;
}

template class CanonicalOD<true>;
template class CanonicalOD<false>;
template class CanonicalOD<od::Ordering::ascending>;
template class CanonicalOD<od::Ordering::descending>;

} // namespace algos::fastod
43 changes: 25 additions & 18 deletions src/core/algorithms/od/fastod/model/canonical_od.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,13 @@
#include <memory>

#include "algorithms/od/fastod/hashing/hashing.h"
#include "algorithms/od/fastod/od_ordering.h"
#include "algorithms/od/fastod/storage/partition_cache.h"
#include "attribute_pair.h"

namespace algos::fastod {

template <bool Ascending>
template <od::Ordering Ordering>
class CanonicalOD {
private:
AttributeSet context_;
Expand All @@ -21,18 +22,24 @@ class CanonicalOD {
bool IsValid(std::shared_ptr<DataFrame> data, PartitionCache& cache) const;
std::string ToString() const;

friend bool operator==(CanonicalOD<true> const& x, CanonicalOD<true> const& y);
friend bool operator!=(CanonicalOD<true> const& x, CanonicalOD<true> const& y);
friend bool operator<(CanonicalOD<true> const& x, CanonicalOD<true> const& y);
friend bool operator==(CanonicalOD<false> const& x, CanonicalOD<false> const& y);
friend bool operator!=(CanonicalOD<false> const& x, CanonicalOD<false> const& y);
friend bool operator<(CanonicalOD<false> const& x, CanonicalOD<false> const& y);

friend struct std::hash<CanonicalOD<Ascending>>;
friend bool operator==(CanonicalOD<od::Ordering::ascending> const& x,
CanonicalOD<od::Ordering::ascending> const& y);
friend bool operator!=(CanonicalOD<od::Ordering::ascending> const& x,
CanonicalOD<od::Ordering::ascending> const& y);
friend bool operator<(CanonicalOD<od::Ordering::ascending> const& x,
CanonicalOD<od::Ordering::ascending> const& y);
friend bool operator==(CanonicalOD<od::Ordering::descending> const& x,
CanonicalOD<od::Ordering::descending> const& y);
friend bool operator!=(CanonicalOD<od::Ordering::descending> const& x,
CanonicalOD<od::Ordering::descending> const& y);
friend bool operator<(CanonicalOD<od::Ordering::descending> const& x,
CanonicalOD<od::Ordering::descending> const& y);

friend struct std::hash<CanonicalOD<Ordering>>;
};

using AscCanonicalOD = CanonicalOD<true>;
using DescCanonicalOD = CanonicalOD<false>;
using AscCanonicalOD = CanonicalOD<od::Ordering::ascending>;
using DescCanonicalOD = CanonicalOD<od::Ordering::descending>;

class SimpleCanonicalOD {
private:
Expand All @@ -57,11 +64,11 @@ class SimpleCanonicalOD {

namespace std {

template <bool Ascending>
struct hash<algos::fastod::CanonicalOD<Ascending>> {
size_t operator()(algos::fastod::CanonicalOD<Ascending> const& od) const noexcept {
const size_t context_hash = hash<algos::fastod::AttributeSet>{}(od.context_);
const size_t ap_hash = hash<algos::fastod::AttributePair>{}(od.ap_);
template <algos::od::Ordering Ordering>
struct hash<algos::fastod::CanonicalOD<Ordering>> {
size_t operator()(algos::fastod::CanonicalOD<Ordering> const& od) const noexcept {
size_t const context_hash = hash<algos::fastod::AttributeSet>{}(od.context_);
size_t const ap_hash = hash<algos::fastod::AttributePair>{}(od.ap_);

return algos::fastod::hashing::CombineHashes(context_hash, ap_hash);
}
Expand All @@ -70,8 +77,8 @@ struct hash<algos::fastod::CanonicalOD<Ascending>> {
template <>
struct hash<algos::fastod::SimpleCanonicalOD> {
size_t operator()(algos::fastod::SimpleCanonicalOD const& od) const noexcept {
const size_t context_hash = hash<algos::fastod::AttributeSet>{}(od.context_);
const size_t right_hash = hash<model::ColumnIndex>{}(od.right_);
size_t const context_hash = hash<algos::fastod::AttributeSet>{}(od.context_);
size_t const right_hash = hash<model::ColumnIndex>{}(od.right_);

return algos::fastod::hashing::CombineHashes(context_hash, right_hash);
}
Expand Down
7 changes: 7 additions & 0 deletions src/core/algorithms/od/fastod/od_ordering.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#pragma once

#include <enum.h>

namespace algos::od {
// Sort direction of an order dependency. In this change it replaces the former
// `bool Ascending` non-type template parameter (e.g. CanonicalOD<Ordering>,
// CSGet<Ordering>, AddCandidates<Ordering>).
// Declared through the BETTER_ENUM macro from <enum.h> with `char` as the
// underlying type; `ascending` is explicitly 0 and `descending` is listed after
// it with no explicit value.
// NOTE(review): call sites compare with `Ordering == +od::Ordering::ascending`;
// the unary plus is presumably the Better Enums idiom for turning the raw
// enumerator into the wrapper type -- confirm against the library docs.
BETTER_ENUM(Ordering, char, ascending = 0, descending);
} // namespace algos::od
Loading