
Commit

add splitter fixed
jonashering committed Jul 1, 2019
1 parent a7ce570 commit a255b82
Showing 2 changed files with 67 additions and 4 deletions.
58 changes: 58 additions & 0 deletions wtrec/splitter.py
@@ -0,0 +1,58 @@
+from random import sample
+import numpy as np
+import os
+
+
+Y_COLUMN = 'label'
+DROP_COLUMS = ['raw', 'path']
+
+
+def _shuffle(data):
+    return data.reindex(np.random.permutation(data.index))
+
+
+def split_random(data, test_size=0.3):
+    test_size = int(data.shape[0] * test_size)
+    train_size = len(data) - test_size
+
+    data = _shuffle(data)[:test_size + train_size]
+    data = data.reindex().drop(DROP_COLUMS, axis=1)
+
+    test = data[:test_size]
+    train = data[test_size:]
+
+    test_Y = test[Y_COLUMN]
+    test_X = test.drop(Y_COLUMN, axis=1)
+
+    train_Y = train[Y_COLUMN]
+    train_X = train.drop(Y_COLUMN, axis=1)
+
+    return train_X, train_Y, test_X, test_Y
+
+
+def _train_test_indices(num_samples, idx_file_path, test_size=0.3):
+    if not os.path.isfile(idx_file_path):
+        test_indices = np.array(sample(range(num_samples), k=int(num_samples * test_size)))
+        test_indices.dump(idx_file_path)
+
+    test_indices = np.load(idx_file_path)
+    train_indices = np.array([i for i in range(num_samples) if i not in test_indices])
+
+    return train_indices, test_indices
+
+
+def split_fixed(data, idx_file_path):
+    data = data.reindex().drop(DROP_COLUMS, axis=1)
+
+    train_indices, test_indices = _train_test_indices(data.shape[0], idx_file_path)
+
+    test = data.iloc[test_indices]
+    train = data.iloc[train_indices]
+
+    test_Y = test[Y_COLUMN]
+    test_X = test.drop(Y_COLUMN, axis=1)
+
+    train_Y = train[Y_COLUMN]
+    train_X = train.drop(Y_COLUMN, axis=1)
+
+    return train_X, train_Y, test_X, test_Y
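
For orientation, a minimal sketch of how the two new splitters might be called; the toy DataFrame, its column values, and the index file name are illustrative assumptions, not part of this commit.

import pandas as pd
from wtrec.splitter import split_random, split_fixed

# Toy stand-in for the real feature table; like the real one it carries
# the 'raw' and 'path' columns (which the splitter drops) and a 'label' column.
tables = pd.DataFrame({
    'raw': ['<table>...</table>'] * 10,
    'path': ['table_%d.html' % i for i in range(10)],
    'avg_length': range(10),
    'label': [0, 1] * 5,
})

# Random 70/30 split, reshuffled on every call.
train_X, train_Y, test_X, test_Y = split_random(tables, test_size=0.3)

# Fixed split: test indices are sampled once, pickled to the given file,
# and reloaded on later runs so the held-out set stays stable across runs.
# (Note: on NumPy >= 1.16.3, np.load only reads such pickled files with allow_pickle=True.)
train_X, train_Y, test_X, test_Y = split_fixed(tables, 'test_indices.npy')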
13 changes: 9 additions & 4 deletions wtrec/transformer.py
@@ -65,7 +65,9 @@ def _add_global_layout_features(self):

    def _add_layout_features(self):
        for i in self.cols:
-            total_rowspan = np.sum([int(bs(x, 'html.parser').find_all(['td', 'th'])[0].attrs.get('rowspan', 0)) for x in i[0]])
+            total_rowspan = np.sum(
+                [int(bs(x, 'html.parser').find_all(['td', 'th'])[0].attrs.get('rowspan', 0)) for x in i[0]]
+            )
            num_rowspan = len([1 for x in i[0] if 'rowspan' in bs(x, 'html.parser').find_all(['td', 'th'])[0].attrs])
            features = DataFrame({
                'avg_length': [np.mean([len(str(elem)) for elem in i[1]])],
@@ -77,7 +79,9 @@ def _add_layout_features(self):
            self.obj = concat([self.obj, features])

        for i in self.rows:
-            total_colspan = np.sum([int(bs(x, 'html.parser').find_all(['td', 'th'])[0].attrs.get('colspan', 0)) for x in i[0]])
+            total_colspan = np.sum(
+                [int(bs(x, 'html.parser').find_all(['td', 'th'])[0].attrs.get('colspan', 0)) for x in i[0]]
+            )
            num_colspan = len([1 for x in i[0] if 'colspan' in bs(x, 'html.parser').find_all(['td', 'th'])[0].attrs])
            features = DataFrame({
                'avg_length': [np.mean([len(str(elem)) for elem in i[1]])],
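
For readers skimming the wrapped expression, this is roughly what the per-cell attribute lookup evaluates to; the cell markup below is an invented example, not data from the repository.

from bs4 import BeautifulSoup as bs

cell = '<td rowspan="3">Q1</td>'
first = bs(cell, 'html.parser').find_all(['td', 'th'])[0]

print(int(first.attrs.get('rowspan', 0)))  # 3 (falls back to 0 when the attribute is absent)
print('rowspan' in first.attrs)            # True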
@@ -147,7 +151,7 @@ def transform_for_baseline(raw_dataframe):
        try:
            with_features = with_features.append(_BaselineSample(row).transform(), ignore_index=True)
        except IndexError: # FIXME: only tables with min shape 2x2 in dataset!
-            continue
+            print(row['path'])

    return with_features

@@ -176,6 +180,7 @@ def transform_for_approach(raw_dataframe):
    Args:
        Dataframe with columns raw and label
    Returns:
-        Dataframe with columns raw, label and feature space
+        Dataframe with columns raw, label and imagepath
+        Generates image representations of web table
    """
    pass
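
transform_for_approach remains a stub in this commit; purely as an illustration of the image generation the updated docstring promises, one conceivable helper is sketched below. imgkit (a wrapper around wkhtmltoimage) and the helper name are assumptions, not anything this commit introduces.

import imgkit

def _render_table(raw_html, out_path):
    # Render the table's HTML to an image file; wkhtmltoimage must be installed.
    imgkit.from_string(raw_html, out_path)
    return out_path

A full implementation would record the resulting path in the imagepath column the docstring describes.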
