Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement set based aod verifier, support aod mining in fastod #468

Open
wants to merge 25 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
4267c23
Add ColumnIndexOption and move there ValidateIndex
polyntsov Sep 28, 2024
31f3c47
Add parameter to allow empty list of indices in IndicesOption
polyntsov Sep 28, 2024
1e76762
Move complex stripped partition swap and create definitions to cpp
polyntsov Sep 28, 2024
20592c8
Refactor Swap in complex stripped partition
polyntsov Sep 28, 2024
4d65979
Introduce od::Ordering enum and use it instead of bool Ascending
polyntsov Sep 28, 2024
f043b22
Introduce partition type for complex stripped partition as create param
polyntsov Sep 28, 2024
b452c28
Accept in CreateAttributeSet any range as list of attributes
polyntsov Sep 29, 2024
789c617
Store DataFrame in ComplexStrippedPartition as raw pointer
polyntsov Sep 29, 2024
0214e5d
Store DataFrame directly as value in Fastod
polyntsov Sep 29, 2024
eef5077
Move nd_verifier's VectorToString to general util and accept any range
polyntsov Sep 29, 2024
7cff09d
Add missing <vector> include to config/option.h
polyntsov Sep 30, 2024
cae91d2
Add method to convert fastod::AttributeSet to vector of column indices
polyntsov Sep 30, 2024
187a778
Add a callback to Option which is called before the option is set
polyntsov Sep 30, 2024
1f901a6
Implement getters for context and cols in canonical ods
polyntsov Sep 30, 2024
cdb4980
Implement a function to load algo data without configuring execute opts
polyntsov Sep 30, 2024
8b24e2f
Introduce is required callback to option
polyntsov Sep 30, 2024
d938c38
Allow absence of non-required options in algo factory
polyntsov Sep 30, 2024
fa7af04
Implement aod verifier and cover it with tests
polyntsov Sep 30, 2024
7d04aea
Implement python bindings to set based aod verifier
polyntsov Sep 30, 2024
60c2927
Implement error parameter for fastod and add tests for aod mining
polyntsov Sep 30, 2024
df33951
Specify in readme that we now support approximate set-based ODs
polyntsov Sep 30, 2024
4c350d0
Implement aod verification python example
polyntsov Sep 30, 2024
56a0f31
Avoid unnecessary copying of partitions in fastod partition cache
polyntsov Sep 30, 2024
a0e7af5
Don't store indices vectors via shared ptr in fastod complex partition
polyntsov Sep 30, 2024
3f6ec49
Fallback to split and swap validation when error is zero in canonical od
polyntsov Oct 1, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/core/algorithms/od/fastod/fastod.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ void Fastod::ComputeODs() {
[this, &context, &del_attrs, &cc](model::ColumnIndex attr) {
SimpleCanonicalOD od(del_attrs[attr], attr);

if (od.IsValid(data_, partition_cache_)) {
if (od.IsValid(*data_, partition_cache_)) {
AddToResult(std::move(od));
CCPut(context, fastod::DeleteAttribute(cc, attr));

Expand Down
2 changes: 1 addition & 1 deletion src/core/algorithms/od/fastod/fastod.h
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Again, a commit description would be nice here.

Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ class Fastod : public Algorithm {
fastod::CanonicalOD<Ordering> od(fastod::DeleteAttribute(deleted_attrs[a], b), a,
b);

if (od.IsValid(data_, partition_cache_)) {
if (od.IsValid(*data_, partition_cache_)) {
AddToResult(std::move(od));
cs_for_con.erase(it++);
} else {
Expand Down
4 changes: 2 additions & 2 deletions src/core/algorithms/od/fastod/model/canonical_od.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ CanonicalOD<Ordering>::CanonicalOD(AttributeSet const& context, model::ColumnInd
: context_(std::move(context)), ap_(left, right) {}

template <od::Ordering Ordering>
bool CanonicalOD<Ordering>::IsValid(std::shared_ptr<DataFrame> data, PartitionCache& cache) const {
bool CanonicalOD<Ordering>::IsValid(DataFrame const& data, PartitionCache& cache) const {
return !(cache.GetStrippedPartition(context_, data).Swap<Ordering>(ap_.left, ap_.right));
}

Expand All @@ -30,7 +30,7 @@ SimpleCanonicalOD::SimpleCanonicalOD() : right_(0) {}
SimpleCanonicalOD::SimpleCanonicalOD(AttributeSet const& context, model::ColumnIndex right)
: context_(context), right_(right) {}

bool SimpleCanonicalOD::IsValid(std::shared_ptr<DataFrame> data, PartitionCache& cache) const {
bool SimpleCanonicalOD::IsValid(DataFrame const& data, PartitionCache& cache) const {
return !(cache.GetStrippedPartition(context_, data).Split(right_));
}

Expand Down
4 changes: 2 additions & 2 deletions src/core/algorithms/od/fastod/model/canonical_od.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ class CanonicalOD {
CanonicalOD() noexcept = default;
CanonicalOD(AttributeSet const& context, model::ColumnIndex left, model::ColumnIndex right);

bool IsValid(std::shared_ptr<DataFrame> data, PartitionCache& cache) const;
bool IsValid(DataFrame const& data, PartitionCache& cache) const;
std::string ToString() const;

friend bool operator==(CanonicalOD<od::Ordering::ascending> const& x,
Expand Down Expand Up @@ -50,7 +50,7 @@ class SimpleCanonicalOD {
SimpleCanonicalOD();
SimpleCanonicalOD(AttributeSet const& context, model::ColumnIndex right);

bool IsValid(std::shared_ptr<DataFrame> data, PartitionCache& cache) const;
bool IsValid(DataFrame const& data, PartitionCache& cache) const;
std::string ToString() const;

friend bool operator==(SimpleCanonicalOD const& x, SimpleCanonicalOD const& y);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,42 +7,24 @@ namespace algos::fastod {
ComplexStrippedPartition::ComplexStrippedPartition()
: data_(nullptr), is_stripped_partition_(true), should_be_converted_to_sp_(false) {}

ComplexStrippedPartition::ComplexStrippedPartition(std::shared_ptr<DataFrame> data,
ComplexStrippedPartition::ComplexStrippedPartition(DataFrame const& data,
std::shared_ptr<std::vector<size_t>> indexes,
std::shared_ptr<std::vector<size_t>> begins)
: sp_indexes_(indexes),
sp_begins_(begins),
data_(data),
data_(&data),
is_stripped_partition_(true),
should_be_converted_to_sp_(false) {}

ComplexStrippedPartition::ComplexStrippedPartition(
std::shared_ptr<DataFrame> data, std::shared_ptr<std::vector<DataFrame::Range>> indexes,
DataFrame const& data, std::shared_ptr<std::vector<DataFrame::Range>> indexes,
std::shared_ptr<std::vector<size_t>> begins)
: rb_indexes_(indexes),
rb_begins_(begins),
data_(data),
data_(&data),
is_stripped_partition_(false),
should_be_converted_to_sp_(false) {}

ComplexStrippedPartition& ComplexStrippedPartition::operator=(
ComplexStrippedPartition const& other) {
if (this == &other) {
return *this;
}

sp_indexes_ = other.sp_indexes_;
sp_begins_ = other.sp_begins_;
rb_indexes_ = other.rb_indexes_;
rb_begins_ = other.rb_begins_;
data_ = other.data_;

should_be_converted_to_sp_ = other.should_be_converted_to_sp_;
is_stripped_partition_ = other.is_stripped_partition_;

return *this;
}

std::string ComplexStrippedPartition::ToString() const {
return is_stripped_partition_ ? CommonToString() : RangeBasedToString();
}
Expand Down Expand Up @@ -166,45 +148,44 @@ template bool ComplexStrippedPartition::Swap<od::Ordering::descending>(
model::ColumnIndex left, model::ColumnIndex right) const;

template <ComplexStrippedPartition::Type PartitionType>
ComplexStrippedPartition ComplexStrippedPartition::Create(std::shared_ptr<DataFrame> data) {
ComplexStrippedPartition ComplexStrippedPartition::Create(DataFrame const& data) {
if constexpr (PartitionType == Type::kRangeBased) {
auto rb_indexes = std::make_unique<std::vector<DataFrame::Range>>();
auto rb_begins = std::make_unique<std::vector<size_t>>();

size_t const tuple_count = data->GetTupleCount();
size_t const tuple_count = data.GetTupleCount();
rb_begins->push_back(0);

if (tuple_count != 0) {
rb_indexes->push_back({0, tuple_count - 1});
rb_begins->push_back(1);
}

return ComplexStrippedPartition(std::move(data), std::move(rb_indexes),
std::move(rb_begins));
return ComplexStrippedPartition(data, std::move(rb_indexes), std::move(rb_begins));
}

auto sp_indexes = std::make_unique<std::vector<size_t>>();
auto sp_begins = std::make_unique<std::vector<size_t>>();

sp_indexes->reserve(data->GetTupleCount());
sp_indexes->reserve(data.GetTupleCount());

for (size_t i = 0; i < data->GetTupleCount(); i++) {
for (size_t i = 0; i < data.GetTupleCount(); i++) {
sp_indexes->push_back(i);
}

if (data->GetTupleCount() != 0) {
if (data.GetTupleCount() != 0) {
sp_begins->push_back(0);
}

sp_begins->push_back(data->GetTupleCount());
sp_begins->push_back(data.GetTupleCount());

return ComplexStrippedPartition(std::move(data), std::move(sp_indexes), std::move(sp_begins));
return ComplexStrippedPartition(data, std::move(sp_indexes), std::move(sp_begins));
}

template ComplexStrippedPartition ComplexStrippedPartition::Create<
ComplexStrippedPartition::Type::kRangeBased>(std::shared_ptr<DataFrame> data);
template ComplexStrippedPartition ComplexStrippedPartition::Create<
ComplexStrippedPartition::Type::kStripped>(std::shared_ptr<DataFrame> data);
ComplexStrippedPartition::Type::kRangeBased>(DataFrame const& data);
template ComplexStrippedPartition
ComplexStrippedPartition::Create<ComplexStrippedPartition::Type::kStripped>(DataFrame const& data);

std::string ComplexStrippedPartition::CommonToString() const {
std::stringstream result;
Expand Down
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Store DataFrame in ComplexStrippedPartition as raw pointer:

A commit description explaining why would come in handy.

Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

#include "algorithms/od/fastod/storage/data_frame.h"
#include "algorithms/od/fastod/od_ordering.h"
#include "algorithms/od/fastod/model/removal_set.h"
#include "table/tuple_index.h"

namespace algos::fastod {
Expand All @@ -22,7 +23,7 @@ class ComplexStrippedPartition {
std::shared_ptr<std::vector<size_t>> sp_begins_;
std::shared_ptr<std::vector<DataFrame::Range>> rb_indexes_;
std::shared_ptr<std::vector<size_t>> rb_begins_;
std::shared_ptr<DataFrame> data_;
DataFrame const* data_;
bool is_stripped_partition_;
bool should_be_converted_to_sp_;

Expand All @@ -41,11 +42,10 @@ class ComplexStrippedPartition {
size_t group_start,
size_t group_end);

ComplexStrippedPartition(std::shared_ptr<DataFrame> data,
std::shared_ptr<std::vector<size_t>> indexes,
ComplexStrippedPartition(DataFrame const& data, std::shared_ptr<std::vector<size_t>> indexes,
std::shared_ptr<std::vector<size_t>> begins);

ComplexStrippedPartition(std::shared_ptr<DataFrame> data,
ComplexStrippedPartition(DataFrame const& data,
std::shared_ptr<std::vector<DataFrame::Range>> indexes,
std::shared_ptr<std::vector<size_t>> begins);
std::vector<Tuple> GetTuplesForColumns(model::ColumnIndex left, model::ColumnIndex right,
Expand All @@ -55,9 +55,6 @@ class ComplexStrippedPartition {
enum class Type { kStripped, kRangeBased };

ComplexStrippedPartition();
ComplexStrippedPartition(ComplexStrippedPartition const& origin) = default;

ComplexStrippedPartition& operator=(ComplexStrippedPartition const& other);

std::string ToString() const;
void Product(model::ColumnIndex attribute);
Expand All @@ -68,9 +65,8 @@ class ComplexStrippedPartition {

template <od::Ordering Ordering>
bool Swap(model::ColumnIndex left, model::ColumnIndex right) const;

template <Type PartitionType>
static ComplexStrippedPartition Create(std::shared_ptr<DataFrame> data);
static ComplexStrippedPartition Create(DataFrame const& data);
};

} // namespace algos::fastod
4 changes: 2 additions & 2 deletions src/core/algorithms/od/fastod/storage/partition_cache.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ class PartitionCache {
}

ComplexStrippedPartition GetStrippedPartition(AttributeSet const& attribute_set,
std::shared_ptr<DataFrame> data) {
DataFrame const& data) {
if (cache_.Contains(attribute_set)) {
return cache_.Get(attribute_set);
}
Expand All @@ -53,7 +53,7 @@ class PartitionCache {
bool is_product_called = CallProductWithAttributesInCache(result_partition, attribute_set);

if (!is_product_called) {
result_partition = data->IsAttributesMostlyRangeBased(attribute_set)
result_partition = data.IsAttributesMostlyRangeBased(attribute_set)
? ComplexStrippedPartition::Create<
ComplexStrippedPartition::Type::kRangeBased>(data)
: ComplexStrippedPartition::Create<
Expand Down