Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add dates to AC #332

Draft
wants to merge 6 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 14 additions & 11 deletions examples/algebraic_constraints.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,19 @@
import desbordante
import pandas
import operator
from datetime import datetime

TABLE = 'examples/datasets/cargo_march.csv'
# Note that dates in the given dataset must be in the format YYYY-MM-DD (e.g. 2008-08-02)
TABLE = 'examples/datasets/ACShippingDates.csv'
HEADER = 0
SEPARATOR = ','
P_FUZZ = 0.85
FUZZINESS = 0.2
P_FUZZ = 0.7
FUZZINESS = 0.3
BUMPS_LIMIT = 0
WEIGHT = 0.1
BIN_OPERATION = '-'
AC_SEED = 11
ITERATIONS_LIMIT = 4
ITERATIONS_LIMIT = 10
OPERATIONS = {
'+': (operator.add, 'Sum'),
'-': (operator.sub, 'Difference'),
Expand All @@ -23,7 +25,7 @@
algo = desbordante.ACAlgorithm()

df = pandas.read_csv(TABLE, sep=SEPARATOR, header=HEADER)
df_without_id = df[['Delivery date', 'Dispatch date']]
df_without_id = df[['deliveryDate', 'shipDate']]

algo.load_data(df=df_without_id)

Expand All @@ -33,18 +35,19 @@
ac_ranges = algo.get_ac_ranges()
for ac_range in ac_ranges:
l_col = df_without_id.columns[ac_range.column_indices[0]]
r_col = df_without_id.columns[ac_range.column_indices[1]]
r_col = df_without_id.columns[ac_range.column_indices[1]]
print(f'Discovered ranges for ({l_col} {BIN_OPERATION} {r_col}) are:')
print(ac_range.ranges)

ac_exceptions = algo.get_ac_exceptions()
print()
print(f'Rows in which the result of the chosen operation ({BIN_OPERATION}) is outside of discovered ranges:')
for ac_exception in ac_exceptions:
id, delivery_date, dispatch_date = df.iloc[ac_exception.row_index]
id, delievery_date, ship_date = df.iloc[ac_exception.row_index]
print(f'id: {id}')
print(f'Dispatch date: {dispatch_date}')
print(f'Delivery date: {delivery_date}')
print(f'{operation_name}: {operation(delivery_date, dispatch_date)}')
print(f'Shipping date : {ship_date}')
print(f'Delivery date: {delievery_date}')
date1 = datetime.strptime(ship_date, '%Y-%m-%d').date()
date2 = datetime.strptime(delievery_date, '%Y-%m-%d').date()
print(f'{operation_name}: {operation(date2, date1).days}')
print()

100 changes: 100 additions & 0 deletions examples/datasets/ACShippingDates.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
orderID,deliveryDate,shipDate
8E3,2008-08-02,2008-02-08
2Z8,2008-05-22,2008-02-25
0J0,2008-06-20,2008-06-12
2U0,2008-07-22,2008-05-13
4H8,2008-07-01,2008-05-26
0A4,2008-05-03,2008-04-28
6J2,2008-06-19,2008-04-11
0W5,2008-06-30,2008-05-25
7G9,2008-06-11,2008-01-23
0H6,2008-08-10,2008-07-23
9Y3,2008-07-30,2008-03-26
5N6,2008-04-17,2008-02-10
9N2,2008-05-07,2008-02-12
5Y8,2008-03-09,2008-02-21
1V2,2008-07-28,2008-06-14
7Q4,2008-07-24,2008-07-11
6H1,2008-07-24,2008-06-24
5J7,2008-06-26,2008-03-16
0H4,2008-07-22,2008-02-23
5Z8,2008-07-03,2008-02-01
7P2,2008-06-23,2008-06-21
1Q0,2008-07-18,2008-07-10
4W1,2008-07-21,2008-05-02
3T4,2008-07-19,2008-05-12
7C5,2008-08-03,2008-07-10
5Z6,2008-05-08,2008-04-25
9A6,2008-08-15,2008-06-25
2Q7,2008-08-08,2008-03-29
2V5,2008-04-15,2008-02-28
5A4,2008-08-12,2008-07-28
1Y3,2008-08-04,2008-07-17
4H1,2008-07-01,2008-06-14
1P9,2008-08-10,2008-03-29
9E0,2008-05-24,2008-05-08
6B4,2008-08-10,2008-03-31
9R1,2008-06-14,2008-04-12
0T9,2008-06-11,2008-06-09
8M1,2008-07-24,2008-04-05
1R7,2008-06-27,2008-05-29
0V7,2008-05-10,2008-05-09
2T1,2008-07-18,2008-04-30
5G4,2008-07-09,2008-04-24
2K8,2008-08-06,2008-08-04
4O6,2008-06-30,2008-05-06
0T4,2008-08-14,2008-07-26
3Y9,2008-05-25,2008-03-23
5S3,2008-05-28,2008-04-28
0E0,2008-08-12,2008-08-03
4H7,2008-08-09,2008-01-30
4Q7,2008-04-04,2008-01-11
4E2,2008-07-22,2008-05-10
2I6,2008-07-13,2008-04-23
8K2,2008-08-14,2008-02-05
4U2,2008-04-03,2008-01-17
6C9,2008-08-04,2008-03-17
0J5,2008-05-04,2008-02-27
0S3,2008-07-30,2008-07-22
3B0,2008-05-31,2008-03-01
4V4,2008-07-31,2008-03-09
3C9,2008-01-24,2008-01-23
2F1,2008-03-20,2008-01-14
4L3,2008-06-23,2008-05-13
1E4,2008-08-13,2008-08-12
5H1,2008-08-08,2008-07-20
2S5,2008-07-04,2008-04-09
9A0,2008-03-19,2008-03-14
0P0,2008-07-13,2008-01-15
6V5,2008-07-09,2008-02-12
0Z9,2008-07-10,2008-05-22
5T6,2008-07-24,2008-06-09
1E9,2008-08-02,2008-03-12
0R7,2008-04-22,2008-01-26
5G1,2008-05-04,2008-01-06
2U2,2008-05-26,2008-03-25
4H0,2008-06-24,2008-01-25
9P6,2008-08-14,2008-08-04
0V0,2008-08-01,2008-07-12
6Q0,2008-08-12,2008-07-21
8T9,2008-04-08,2008-01-17
6O4,2008-05-28,2008-05-11
8G4,2008-08-14,2008-07-29
4E5,2008-07-20,2008-07-14
2K6,2008-07-19,2008-06-16
8O5,2008-04-12,2008-01-24
8Q8,2008-08-10,2008-06-24
9R6,2008-03-15,2008-03-08
2C3,2008-04-30,2008-01-10
8F9,2008-07-21,2008-07-13
7E7,2008-07-26,2008-02-03
2C1,2008-07-19,2008-03-22
5F3,2008-08-15,2008-08-03
1H0,2008-07-20,2008-05-12
1R5,2008-08-11,2008-07-14
5C8,2008-06-13,2008-05-24
6E3,2008-06-05,2008-01-15
9E7,2008-02-02,2008-01-24
4C1,2008-03-31,2008-01-12
4N8,2008-03-28,2008-01-30
1S8,2008-08-07,2008-06-30
72 changes: 39 additions & 33 deletions src/core/algorithms/algebraic_constraints/ac_algorithm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,16 +28,20 @@ void ACAlgorithm::RegisterOptions() {
auto check_and_set_binop = [this](Binop bin_operation) {
switch (bin_operation) {
case +Binop::Addition:
binop_pointer_ = &model::INumericType::Add;
num_binop_pointer_ = &model::INumericType::Add;
// date_binop_pointer_ remains nullptr because dates do not support addition
break;
case +Binop::Subtraction:
binop_pointer_ = &model::INumericType::Sub;
num_binop_pointer_ = &model::INumericType::Sub;
date_binop_pointer_ = &model::DateType::SubDate;
break;
case +Binop::Multiplication:
binop_pointer_ = &model::INumericType::Mul;
num_binop_pointer_ = &model::INumericType::Mul;
// date_binop_pointer_ remains nullptr because dates do not support multiplication
break;
case +Binop::Division:
binop_pointer_ = &model::INumericType::Div;
num_binop_pointer_ = &model::INumericType::Div;
// date_binop_pointer_ remains nullptr because dates do not support division
break;
default:
throw config::ConfigurationError(
Expand Down Expand Up @@ -147,9 +151,8 @@ std::vector<std::byte const*> ACAlgorithm::Sampling(std::vector<model::TypedColu
++i;
}
RestrictRangesAmount(ranges);
ac_pairs_.emplace_back(ACPairsCollection(
model::CreateSpecificType<model::INumericType>(data.at(lhs_i).GetTypeId(), true),
std::move(ac_pairs), lhs_i, rhs_i));
ac_pairs_.emplace_back(
ACPairsCollection(data.at(lhs_i).GetTypeId(), std::move(ac_pairs), lhs_i, rhs_i));
return ranges;
}

Expand All @@ -160,7 +163,6 @@ std::vector<std::byte const*> ACAlgorithm::SamplingIteration(
std::vector<std::byte const*> const& rhs = data.at(rhs_i).GetData();
ac_pairs.clear();
std::mt19937 gen(seed_);

std::bernoulli_distribution d(probability);
for (size_t i = 0; i < lhs.size(); ++i) {
if (d(gen)) {
Expand All @@ -169,10 +171,12 @@ std::vector<std::byte const*> ACAlgorithm::SamplingIteration(
if (data[lhs_i].IsNullOrEmpty(i) || data[rhs_i].IsNullOrEmpty(i)) {
continue;
}
auto res = std::unique_ptr<std::byte[]>(num_type_->Allocate());
num_type_->ValueFromStr(res.get(), "0");
std::unique_ptr<std::byte[]> res =
std::unique_ptr<std::byte[]>(type_wrapper_.NumericAllocate());
type_wrapper_.NumericFromStr(res.get(), "0");

if (bin_operation_ == +Binop::Division &&
num_type_->Compare(r, res.get()) == model::CompareResult::kEqual) {
type_wrapper_.NumericCompare(r, res.get()) == model::CompareResult::kEqual) {
continue;
}
InvokeBinop(l, r, res.get());
Expand All @@ -186,7 +190,7 @@ std::vector<std::byte const*> ACAlgorithm::SamplingIteration(
std::sort(ac_pairs.begin(), ac_pairs.end(),
[this](std::unique_ptr<ACPair> const& a, std::unique_ptr<ACPair> const& b) {
return model::CompareResult::kLess ==
this->num_type_->Compare(a->GetRes(), b->GetRes());
this->type_wrapper_.NumericCompare(a->GetRes(), b->GetRes());
});

return ConstructDisjunctiveRanges(ac_pairs);
Expand All @@ -207,7 +211,7 @@ void ACAlgorithm::RestrictRangesAmount(std::vector<std::byte const*>& ranges) co
double min_dist = -1;
size_t min_index = 1;
for (size_t i = min_index; i < bumps * 2 - 1; i += 2) {
double dist = num_type_->Dist(ranges.at(i), ranges.at(i + 1));
double dist = type_wrapper_.Dist(ranges.at(i), ranges.at(i + 1));
if (min_dist == -1 || dist < min_dist) {
min_dist = dist;
min_index = i;
Expand Down Expand Up @@ -241,18 +245,21 @@ ACPairsCollection const& ACAlgorithm::GetACPairsByColumns(size_t lhs_i, size_t r
return *res;
}

void ACAlgorithm::PrintRanges(std::vector<model::TypedColumnData> const& data) const {
void ACAlgorithm::PrintRanges(std::vector<model::TypedColumnData> const& data) {
for (size_t i = 0; i < ranges_.size(); ++i) {
LOG(DEBUG) << "lhs: " << data.at(ranges_[i].col_pair.col_i.first).ToString() << std::endl;
LOG(DEBUG) << "rhs: " << data.at(ranges_[i].col_pair.col_i.second).ToString() << std::endl;
if (ranges_[i].ranges.empty()) {
LOG(DEBUG) << "No intervals were found." << std::endl;
continue;
}

for (size_t k = 0; k < ranges_[i].ranges.size() - 1; k += 2) {
auto* num_type = ranges_[i].col_pair.num_type.get();
LOG(DEBUG) << "[" << num_type->ValueToString(ranges_[i].ranges[k]) << ", "
<< num_type->ValueToString(ranges_[i].ranges[k + 1]) << "]";
LOG(DEBUG) << "["
<< ranges_[i].col_pair.type_wrapper.NumericToString(ranges_[i].ranges[k])
<< ", "
<< ranges_[i].col_pair.type_wrapper.NumericToString(ranges_[i].ranges[k + 1])
<< "]";
if (k != ranges_[i].ranges.size() - 2) {
LOG(DEBUG) << ", ";
}
Expand All @@ -273,11 +280,11 @@ std::vector<std::byte const*> ACAlgorithm::ConstructDisjunctiveRanges(
ACPair const* r_border = nullptr;

if (weight_ < 1) {
double delta = num_type_->Dist(ac_pairs.front()->GetRes(), ac_pairs.back()->GetRes()) *
double delta = type_wrapper_.Dist(ac_pairs.front()->GetRes(), ac_pairs.back()->GetRes()) *
(weight_ / (1 - weight_));

for (size_t i = 0; i < ac_pairs.size() - 1; ++i) {
if (num_type_->Dist(ac_pairs[i]->GetRes(), ac_pairs[i + 1]->GetRes()) <= delta) {
if (type_wrapper_.Dist(ac_pairs[i]->GetRes(), ac_pairs[i + 1]->GetRes()) <= delta) {
r_border = ac_pairs[i + 1].get();
} else {
ranges.emplace_back(l_border->GetRes());
Expand Down Expand Up @@ -305,9 +312,8 @@ RangesCollection ACAlgorithm::ReconstructRangesByColumns(size_t lhs_i, size_t rh
ACPairsCollection const& constraints_collection = GetACPairsByColumns(lhs_i, rhs_i);
ACPairs const& ac_pairs = constraints_collection.ac_pairs;
std::vector<std::byte const*> ranges = ConstructDisjunctiveRanges(ac_pairs);
model::TypeId type_id = constraints_collection.col_pair.num_type->GetTypeId();
return RangesCollection{model::CreateSpecificType<model::INumericType>(type_id, true),
std::move(ranges), lhs_i, rhs_i};
model::TypeId type_id = constraints_collection.col_pair.type_wrapper.GetTypeId();
return RangesCollection{type_id, std::move(ranges), lhs_i, rhs_i};
}

unsigned long long ACAlgorithm::ExecuteInternal() {
Expand All @@ -318,23 +324,23 @@ unsigned long long ACAlgorithm::ExecuteInternal() {
auto start_time = std::chrono::system_clock::now();

for (size_t col_i = 0; col_i < data.size() - 1; ++col_i) {
if (!data.at(col_i).GetType().IsNumeric()) continue;
num_type_ =
model::CreateSpecificType<model::INumericType>(data.at(col_i).GetTypeId(), true);
if (!(data.at(col_i).GetType().IsNumeric() || data.at(col_i).GetType().IsDate())) continue;
type_wrapper_.Set(data.at(col_i).GetTypeId());
for (size_t col_k = col_i + 1; col_k < data.size(); ++col_k) {
if (data.at(col_i).GetTypeId() == data.at(col_k).GetTypeId()) {
ranges_.emplace_back(
RangesCollection{model::CreateSpecificType<model::INumericType>(
data.at(col_i).GetTypeId(), true),
Sampling(data, col_i, col_k), col_i, col_k});
if (data.at(col_i).GetTypeId() == +model::TypeId::kDate &&
bin_operation_ != +Binop::Subtraction) {
continue;
}

ranges_.emplace_back(data.at(col_i).GetTypeId(), Sampling(data, col_i, col_k),
col_i, col_k);
/* Division is asymmetric and cannot be performed with a zero divisor, so ranges
* have to be rediscovered with the operands swapped.
* This is not needed for subtraction: (column1 - column2) lies in *some ranges*,
* so one column can be expressed through the other without such problems. */
if (bin_operation_ == +Binop::Division) {
ranges_.emplace_back(
RangesCollection{model::CreateSpecificType<model::INumericType>(
data.at(col_i).GetTypeId(), true),
Sampling(data, col_k, col_i), col_k, col_i});
ranges_.emplace_back(data.at(col_i).GetTypeId(), Sampling(data, col_k, col_i),
col_k, col_i);
}
}
}
Expand Down
14 changes: 10 additions & 4 deletions src/core/algorithms/algebraic_constraints/ac_algorithm.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include "model/table/column_layout_typed_relation_data.h"
#include "model/types/types.h"
#include "ranges_collection.h"
#include "type_wrapper.h"
#include "typed_column_pair.h"

namespace algos {
Expand Down Expand Up @@ -57,8 +58,9 @@ class ACAlgorithm : public Algorithm {
double seed_;
std::vector<ACPairsCollection> ac_pairs_;
std::vector<RangesCollection> ranges_;
model::INumericType::NumericBinop binop_pointer_ = nullptr;
std::unique_ptr<model::INumericType> num_type_;
model::INumericType::NumericBinop num_binop_pointer_ = nullptr;
model::DateType::DateBinop date_binop_pointer_ = nullptr;
TypeWrapper type_wrapper_;

/* Returns vector with ranges boundaries constructed for columns with lhs_i and rhs_i indices.
* Value pairs (by which ranges constructed) fall into sample selection with chosen probability.
Expand All @@ -84,7 +86,11 @@ class ACAlgorithm : public Algorithm {

public:
/* Applies the configured binary operation to the operands l and r and writes the
 * result into res.
 * l, r - pointers to the left and right operand values.
 * res  - pointer to the buffer that receives the result.
 * The operation is called through a member-function pointer that is selected
 * when the bin_operation option is set. */
void InvokeBinop(std::byte const* l, std::byte const* r, std::byte* res) const {
// NOTE(review): this call uses binop_pointer_ and num_type_, which the same change
// replaces with num_binop_pointer_ / type_wrapper_ -- it looks like the pre-change
// line shown next to its replacement below; confirm against the actual file.
std::invoke(binop_pointer_, num_type_, l, r, res);
// Dispatch on the kind of type currently held by type_wrapper_.
if (type_wrapper_.IsNumeric()) {
std::invoke(num_binop_pointer_, type_wrapper_.num_type, l, r, res);
} else {
// date_binop_pointer_ is only assigned for subtraction; it stays nullptr for the
// other operations, so this branch presumably must only be reached for subtraction
// (ExecuteInternal skips date columns otherwise) -- TODO confirm for other callers.
std::invoke(date_binop_pointer_, type_wrapper_.date_type, l, r, res);
}
}

size_t CalculateSampleSize(size_t k_bumps) const;
Expand All @@ -110,7 +116,7 @@ class ACAlgorithm : public Algorithm {
return bin_operation_;
}

void PrintRanges(std::vector<model::TypedColumnData> const& data) const;
void PrintRanges(std::vector<model::TypedColumnData> const& data);

void CollectACExceptions() const {
ac_exception_finder_->CollectExceptions(this);
Expand Down
Loading