# generate_emb.py
import os
import random
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
from scipy.stats import linregress
from concurrent.futures import ThreadPoolExecutor

tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

# Move the model to the GPU (when available) and wrap with DataParallel for multi-GPU support
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs")
    model = torch.nn.DataParallel(model)
model.to(device)
model.eval()  # inference only: disable dropout

# Example of the kind of description sentence that is embedded below
sentence_1 = ("Measurements of electric power consumption in one household with a "
              "one-minute sampling rate over a period of almost 4 years. The sampling "
              "period is minutely. The min value is 0.09, the max value is 100.")

# Custom Dataset that pairs each sentence with its index, so every embedding
# can be written back to the right file after batching
class SentenceDataset(Dataset):
    def __init__(self, sentences):
        self.sentences = sentences

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        return self.sentences[idx], idx

def collate_fn(batch, tokenizer):
    # Tokenize the batch of sentences with padding/truncation, and carry the
    # original dataset indices along with the encodings
    indices = [item[1] for item in batch]
    texts = [item[0] for item in batch]
    batch_encodings = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
    batch_encodings['indices'] = indices
    return batch_encodings

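# A minimal sketch of what one collated batch looks like (the exact sequence
# length depends on the longest sentence in the batch; values here are illustrative):
# batch = collate_fn([("first sentence", 0), ("a second, longer sentence", 1)], tokenizer)
# print(batch['input_ids'].shape, batch['indices'])  # e.g. torch.Size([2, 8]) [0, 1]
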
def get_gpt_embeddings(sentences, model, tokenizer, batch_size=64):
    # Create a custom dataset and a DataLoader that batches with the collate function
    dataset = SentenceDataset(sentences)
    dataloader = DataLoader(dataset, batch_size=batch_size,
                            collate_fn=lambda x: collate_fn(x, tokenizer))
    # Iterate over batches
    for batch in tqdm(dataloader):
        # Move the inputs to the same device as the model
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        indices = batch['indices']
        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state
        # Mean-pool over tokens, masking out padding so pad positions do not
        # dilute the sentence embedding, then L2-normalize
        mask = attention_mask.unsqueeze(-1).float()
        batch_embeddings = (hidden_states * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
        batch_embeddings = batch_embeddings / batch_embeddings.norm(dim=1, keepdim=True)
        batch_embeddings = batch_embeddings.cpu().numpy()
        # Save each embedding to its own .npy file, sharded by index so no
        # single directory accumulates too many files
        for k, idx in enumerate(indices):
            out_dir = f'/home/prquan/emb_data/{idx // 10**7}'
            os.makedirs(out_dir, exist_ok=True)  # ensure the shard directory exists
            np.save(f'{out_dir}/{idx}.npy', batch_embeddings[k])

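# A minimal read-back sketch, assuming the same /home/prquan/emb_data layout
# used above (index 0 lands in shard 0):
# emb = np.load('/home/prquan/emb_data/0/0.npy')
# print(emb.shape, np.linalg.norm(emb))  # (384,) for MiniLM-L6; norm ~1.0 after normalization
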
def analyze_time_series(ts: np.ndarray):
    # Estimate the overall trend as the slope of a linear regression against time
    x = np.arange(len(ts))
    slope, _, _, _, _ = linregress(x, ts)
    # Compute the (biased) autocorrelation estimate for all lags
    n = len(ts)
    ts_centered = ts - np.mean(ts)
    autocorr_full = np.correlate(ts_centered, ts_centered, mode='full') / (np.var(ts) * n + 1e-8)
    autocorr = autocorr_full[n - 1:]  # keep the second half (non-negative lags)
    # Find the strongest lag, excluding lag 0
    top_lag = np.argmax(autocorr[1:]) + 1  # +1 compensates for slicing off lag 0
    return slope, top_lag

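# Usage sketch on a synthetic series (illustrative only): a sine with period 24
# plus a 0.01 ramp. The slope should come back close to 0.01; note that with
# this biased estimator, small lags often score highest on smooth series.
# t = np.arange(192, dtype=np.float32)
# toy = np.sin(2 * np.pi * t / 24) + 0.01 * t
# slope, top_lag = analyze_time_series(toy)
# print(slope, top_lag)
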
# Build the text prompt for one window of the time series
def process_iteration(i, df, description, lag, length):
    template = ("The input has a minimum of {:.2f}, a maximum of {:.2f}, and a median of {:.2f}. "
                "The overall trend is {:.2f}. The top lag is {:.2f}.")
    dataset = df['source'].iloc[i]
    timestamp = df['date'].iloc[i]
    timestamp_str = f'The collection time of the input starts at {timestamp}. '
    # Pick one of the rephrased descriptions for this dataset at random
    desc = random.choice(description[dataset])
    ts = df['OT'].iloc[i:i + lag].to_numpy(dtype=np.float32)
    if len(ts) <= 1:
        # Too few points for regression/autocorrelation; fall back to zeros
        return desc + timestamp_str + template.format(0, 0, 0, 0, 0)
    slope, top_lag = analyze_time_series(ts)
    ts_min, ts_max, ts_med = ts.min(), ts.max(), np.median(ts)
    return desc + timestamp_str + template.format(ts_min, ts_max, ts_med, slope, top_lag)

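# With hypothetical values, one produced prompt reads roughly:
# "<dataset description> The collection time of the input starts at 2016-07-01 00:00:00.
#  The input has a minimum of 0.09, a maximum of 100.00, and a median of 4.52.
#  The overall trend is 0.01. The top lag is 24.00."
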
if __name__ == '__main__':
    csv_name = 'all_domain_m2.csv'
    # csv_name = "./dataloaders/data/informer/etth/ETTh3.csv"  # alternative input
    df = pd.read_csv(csv_name)
    description_name = 'rephrased_descriptions.csv'
    description = pd.read_csv(description_name)
    lag, horizon = 192, 48
    length = lag + horizon

    # Build one prompt per row. Iterating the futures dict in submission order
    # keeps `sentences` aligned with the row indices.
    sentences = []
    with ThreadPoolExecutor(max_workers=1) as executor:
        futures = {executor.submit(process_iteration, i, df, description, lag, length): i
                   for i in tqdm(range(len(df)))}
        for future in tqdm(futures, total=len(futures)):
            sentences.append(future.result())

    # Embed every prompt and save the embeddings to per-index .npy files
    get_gpt_embeddings(sentences, model, tokenizer, batch_size=256)
    # Keep the raw prompts alongside the embeddings for inspection
    pd.DataFrame({'text': sentences}).to_csv('text.csv', index=False)