
Commit

add splitter fixed
jonashering committed Jul 1, 2019
1 parent a7ce570 commit a255b82
Showing 2 changed files with 67 additions and 4 deletions.
58 changes: 58 additions & 0 deletions wtrec/splitter.py
@@ -0,0 +1,58 @@
+from random import sample
+import numpy as np
+import os
+
+
+Y_COLUMN = 'label'
+DROP_COLUMS = ['raw', 'path']
+
+
+def _shuffle(data):
+    return data.reindex(np.random.permutation(data.index))
+
+
+def split_random(data, test_size=0.3):
+    test_size = int(data.shape[0] * test_size)
+    train_size = len(data) - test_size
+
+    data = _shuffle(data)[:test_size + train_size]
+    data = data.reindex().drop(DROP_COLUMS, axis=1)
+
+    test = data[:test_size]
+    train = data[test_size:]
+
+    test_Y = test[Y_COLUMN]
+    test_X = test.drop(Y_COLUMN, axis=1)
+
+    train_Y = train[Y_COLUMN]
+    train_X = train.drop(Y_COLUMN, axis=1)
+
+    return train_X, train_Y, test_X, test_Y
+
+
+def _train_test_indices(num_samples, idx_file_path, test_size=0.3):
+    if not os.path.isfile(idx_file_path):
+        test_indices = np.array(sample(range(num_samples), k=int(num_samples * test_size)))
+        test_indices.dump(idx_file_path)
+
+    test_indices = np.load(idx_file_path)
+    train_indices = np.array([i for i in range(num_samples) if i not in test_indices])
+
+    return train_indices, test_indices
+
+
+def split_fixed(data, idx_file_path):
+    data = data.reindex().drop(DROP_COLUMS, axis=1)
+
+    train_indices, test_indices = _train_test_indices(data.shape[0], idx_file_path)
+
+    test = data.iloc[test_indices]
+    train = data.iloc[train_indices]
+
+    test_Y = test[Y_COLUMN]
+    test_X = test.drop(Y_COLUMN, axis=1)
+
+    train_Y = train[Y_COLUMN]
+    train_X = train.drop(Y_COLUMN, axis=1)
+
+    return train_X, train_Y, test_X, test_Y
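
For orientation, a minimal sketch of how the two new splitters might be called; the toy DataFrame, its column values, and the index file name are illustrative assumptions, not part of this commit.

import pandas as pd
from wtrec.splitter import split_random, split_fixed

# Toy stand-in for the real feature table; like the real one it carries
# the 'raw' and 'path' columns (which the splitter drops) and a 'label' column.
tables = pd.DataFrame({
    'raw': ['<table>...</table>'] * 10,
    'path': ['table_%d.html' % i for i in range(10)],
    'avg_length': range(10),
    'label': [0, 1] * 5,
})

# Random 70/30 split, reshuffled on every call.
train_X, train_Y, test_X, test_Y = split_random(tables, test_size=0.3)

# Fixed split: test indices are sampled once, pickled to the given file,
# and reloaded on later runs so the held-out set stays stable across runs.
# (Note: on NumPy >= 1.16.3, np.load only reads such pickled files with allow_pickle=True.)
train_X, train_Y, test_X, test_Y = split_fixed(tables, 'test_indices.npy')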
13 changes: 9 additions & 4 deletions wtrec/transformer.py
@@ -65,7 +65,9 @@ def _add_global_layout_features(self):

    def _add_layout_features(self):
        for i in self.cols:
-            total_rowspan = np.sum([int(bs(x, 'html.parser').find_all(['td', 'th'])[0].attrs.get('rowspan', 0)) for x in i[0]])
+            total_rowspan = np.sum(
+                [int(bs(x, 'html.parser').find_all(['td', 'th'])[0].attrs.get('rowspan', 0)) for x in i[0]]
+            )
            num_rowspan = len([1 for x in i[0] if 'rowspan' in bs(x, 'html.parser').find_all(['td', 'th'])[0].attrs])
            features = DataFrame({
                'avg_length': [np.mean([len(str(elem)) for elem in i[1]])],
@@ -77,7 +79,9 @@ def _add_layout_features(self):
            self.obj = concat([self.obj, features])

        for i in self.rows:
-            total_colspan = np.sum([int(bs(x, 'html.parser').find_all(['td', 'th'])[0].attrs.get('colspan', 0)) for x in i[0]])
+            total_colspan = np.sum(
+                [int(bs(x, 'html.parser').find_all(['td', 'th'])[0].attrs.get('colspan', 0)) for x in i[0]]
+            )
            num_colspan = len([1 for x in i[0] if 'colspan' in bs(x, 'html.parser').find_all(['td', 'th'])[0].attrs])
            features = DataFrame({
                'avg_length': [np.mean([len(str(elem)) for elem in i[1]])],
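
For readers skimming the wrapped expression, this is roughly what the per-cell attribute lookup evaluates to; the cell markup below is an invented example, not data from the repository.

from bs4 import BeautifulSoup as bs

cell = '<td rowspan="3">Q1</td>'
first = bs(cell, 'html.parser').find_all(['td', 'th'])[0]

print(int(first.attrs.get('rowspan', 0)))  # 3 (falls back to 0 when the attribute is absent)
print('rowspan' in first.attrs)            # True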
@@ -147,7 +151,7 @@ def transform_for_baseline(raw_dataframe):
        try:
            with_features = with_features.append(_BaselineSample(row).transform(), ignore_index=True)
        except IndexError: # FIXME: only tables with min shape 2x2 in dataset!
-            continue
+            print(row['path'])

    return with_features

@@ -176,6 +180,7 @@ def transform_for_approach(raw_dataframe):
    Args:
        Dataframe with columns raw and label
    Returns:
-        Dataframe with columns raw, label and feature space
+        Dataframe with columns raw, label and imagepath
+        Generates image representations of web table
    """
    pass
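
transform_for_approach remains a stub in this commit; purely as an illustration of the image generation the updated docstring promises, one conceivable helper is sketched below. imgkit (a wrapper around wkhtmltoimage) and the helper name are assumptions, not anything this commit introduces.

import imgkit

def _render_table(raw_html, out_path):
    # Render the table's HTML to an image file; wkhtmltoimage must be installed.
    imgkit.from_string(raw_html, out_path)
    return out_path

A full implementation would record the resulting path in the imagepath column the docstring describes.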
