-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdataset.py
129 lines (107 loc) · 4.47 KB
/
dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
# -*- coding: utf-8 -*-
"""
@author: Leon Scharwächter
"""
from scipy.io import arff
import scipy.signal
import pandas as pd
import torch
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns
def getDataset(seqlen: int = 51, SOS: int = np.nan, EOS: int = np.nan):
    '''
    Loads the NATOPS dataset from the arff format and returns it as two
    Torch tensors (train, test) of shape (N, S, E), where N is the number
    of multivariate sequences, S is the sequence length and E is the
    number of features (24 for NATOPS).

    Parameters
    ----------
    seqlen : int
        Target sequence length. If it differs from the native length of
        the recordings (51), each sequence is resampled along the time
        axis with scipy.signal.resample. (The original code claimed this
        but crashed for seqlen != 51.)
    SOS, EOS : int
        Accepted for interface compatibility; not used here. Use
        addTokens() or scale() to attach a start-of-sequence token.

    Returns
    -------
    (torch.Tensor, torch.Tensor)
        train_data and test_data, each of shape (N, seqlen, E).
    '''
    # Load the dataset; each remaining cell of the frame holds the
    # E per-feature series of one sample.
    data_train = arff.loadarff('NATOPS_TRAIN.arff')
    data_test = arff.loadarff('NATOPS_TEST.arff')
    df_train = pd.DataFrame(data_train[0]).drop(columns='classAttribute')
    df_test = pd.DataFrame(data_test[0]).drop(columns='classAttribute')
    train_data = _frameToTensor(df_train, seqlen)
    test_data = _frameToTensor(df_test, seqlen)
    return train_data, test_data

def _frameToTensor(df, seqlen: int):
    '''
    Converts a dataframe of nested arff series into a (N, seqlen, E)
    Torch tensor, resampling along the time axis when needed.
    '''
    # Sample count is taken from THIS frame (the original hard-wired the
    # train-set size when building the test tensor).
    nData = df.shape[0]
    nFeatures = df.iloc[0, 0].shape[0]   # 24 feature series per sample
    nativeLen = len(df.iloc[0, 0][0])    # recorded sequence length (51)
    data = np.zeros((nData, nativeLen, nFeatures))
    for i in range(nData):
        sample = df.iloc[i, 0]
        for f in range(nFeatures):
            data[i, :, f] = list(sample[f])
    # Honor the documented contract: rescale to seqlen when it differs
    # from the native recording length.
    if seqlen != nativeLen:
        data = scipy.signal.resample(data, seqlen, axis=1)
    return torch.from_numpy(data)
def getLabels():
    '''
    Returns the class labels of the NATOPS train and test datasets.

    Returns
    -------
    (np.ndarray, np.ndarray)
        train_labels and test_labels as arrays of ints in {1, ..., 6}.
    '''
    # Load the dataset
    data_train = arff.loadarff('NATOPS_TRAIN.arff')
    data_test = arff.loadarff('NATOPS_TEST.arff')
    df_train = pd.DataFrame(data_train[0])
    df_test = pd.DataFrame(data_test[0])
    # arff delivers the class attribute as bytes; remap to plain ints.
    labeldict = {b'1.0' : 1, b'2.0' : 2, b'3.0' : 3,
                 b'4.0' : 4, b'5.0' : 5, b'6.0' : 6}
    df_train = df_train.replace({'classAttribute': labeldict})
    df_test = df_test.replace({'classAttribute': labeldict})
    # Select the label column by name rather than by position: the
    # original iloc[:, 1] silently breaks if the column order changes.
    train_labels = df_train['classAttribute'].values
    test_labels = df_test['classAttribute'].values
    return train_labels, test_labels
def addTokens(dataset: torch.Tensor, SOS: int = np.nan, EOS: int = np.nan):
    '''
    Prepends a start-of-sequence (SOS) token frame to every sequence of
    the multivariate dataset.

    Parameters
    ----------
    dataset : torch.Tensor
        Tensor of shape (N, S, E).
    SOS : int
        Value used to fill the prepended token frame.
    EOS : int
        Accepted for interface compatibility but currently NOT appended;
        only the SOS token is added.

    Returns
    -------
    torch.Tensor
        Tensor of shape (N, S + 1, E) with dataset[:, 0, :] == SOS.
    '''
    N, _, E = dataset.shape
    sos_frame = torch.ones((N, 1, E)) * SOS
    return torch.cat((sos_frame, dataset), 1)
def batchify(dataset: torch.Tensor, batch_size: int):
    '''
    Splits the dataset into a list of batches along the first dimension.

    The samples are shuffled on every call, so repeated calls yield a
    different sample-to-batch assignment. The last batch may be smaller
    than batch_size when the dataset size is not divisible by it.

    Parameters
    ----------
    dataset : torch.Tensor
        Tensor of shape (N, S, E).
    batch_size : int
        Maximum number of samples per batch.

    Returns
    -------
    list[torch.Tensor]
        ceil(N / batch_size) batches of shape (<=batch_size, S, E).
    '''
    # Fancy indexing with a permutation already preserves the shape;
    # the original's trailing .view(dataset.size()) was a no-op.
    shuffled = dataset[torch.randperm(dataset.shape[0])]
    num_batches = math.ceil(shuffled.shape[0] / batch_size)
    return [shuffled[batch_size * b : batch_size * (b + 1)]
            for b in range(num_batches)]
def scale(dataset: torch.Tensor, SOS: int = np.nan, seqlen: int = 51):
    '''
    Resamples every sequence of the dataset to length seqlen using
    FFT-based resampling (scipy.signal.resample), after the dataset has
    been constructed.

    If SOS is given (non-NaN), the first time step of each sequence is
    assumed to be a SOS token: it is excluded from the resampling and
    re-attached afterwards, so the token still appears exactly once and
    the returned sequences have length seqlen + 1.

    Parameters
    ----------
    dataset : torch.Tensor
        Tensor of shape (N, S, E); with a SOS token, dataset[:, 0, :]
        holds the token frame.
    SOS : int
        SOS token value, or NaN when no token is present.
    seqlen : int
        Target sequence length (excluding the SOS token).

    Returns
    -------
    torch.Tensor
        (N, seqlen, E) without SOS, (N, seqlen + 1, E) with SOS.
    '''
    N, _, E = dataset.shape
    # NaN compares unequal to itself, and `SOS is np.nan` only matched
    # the np.nan singleton — a caller passing float('nan') fell into the
    # wrong branch. np.isnan handles every NaN (and plain ints).
    if np.isnan(SOS):
        new_dataset = np.zeros((N, seqlen, E))
        for i, seq in enumerate(dataset):
            new_dataset[i, :, :] = scipy.signal.resample(seq, seqlen, axis=0)
    else:
        new_dataset = np.zeros((N, seqlen + 1, E))
        new_dataset[:, 0, :] = SOS
        for i, seq in enumerate(dataset):
            # Skip the existing token frame, resample the payload only.
            new_dataset[i, 1:, :] = scipy.signal.resample(seq[1:], seqlen, axis=0)
    return torch.tensor(new_dataset)
#train_labels, test_labels = getLabels()
#train_data, test_data = getDataset()