# generate_emb.py
import os
import random
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
from scipy.stats import linregress
from concurrent.futures import ThreadPoolExecutor

tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

# Move the model to the GPU (when available) and wrap with DataParallel for multi-GPU support
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs")
    model = torch.nn.DataParallel(model)
model.to(device)
model.eval()  # inference only: disable dropout

# Example of the kind of description sentence that is embedded below
sentence_1 = ("Measurements of electric power consumption in one household with a "
              "one-minute sampling rate over a period of almost 4 years. The sampling "
              "period is minutely. The min value is 0.09, the max value is 100.")

# Custom Dataset that pairs each sentence with its index, so every embedding
# can be written back to the right file after batching
class SentenceDataset(Dataset):
    def __init__(self, sentences):
        self.sentences = sentences

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        return self.sentences[idx], idx

def collate_fn(batch, tokenizer):
    # Tokenize the batch of sentences with padding/truncation, and carry the
    # original dataset indices along with the encodings
    indices = [item[1] for item in batch]
    texts = [item[0] for item in batch]
    batch_encodings = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
    batch_encodings['indices'] = indices
    return batch_encodings

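# A minimal sketch of what one collated batch looks like (the exact sequence
# length depends on the longest sentence in the batch; values here are illustrative):
# batch = collate_fn([("first sentence", 0), ("a second, longer sentence", 1)], tokenizer)
# print(batch['input_ids'].shape, batch['indices'])  # e.g. torch.Size([2, 8]) [0, 1]
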
def get_gpt_embeddings(sentences, model, tokenizer, batch_size=64):
    # Create a custom dataset and a DataLoader that batches with the collate function
    dataset = SentenceDataset(sentences)
    dataloader = DataLoader(dataset, batch_size=batch_size,
                            collate_fn=lambda x: collate_fn(x, tokenizer))
    # Iterate over batches
    for batch in tqdm(dataloader):
        # Move the inputs to the same device as the model
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        indices = batch['indices']
        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state
        # Mean-pool over tokens, masking out padding so pad positions do not
        # dilute the sentence embedding, then L2-normalize
        mask = attention_mask.unsqueeze(-1).float()
        batch_embeddings = (hidden_states * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
        batch_embeddings = batch_embeddings / batch_embeddings.norm(dim=1, keepdim=True)
        batch_embeddings = batch_embeddings.cpu().numpy()
        # Save each embedding to its own .npy file, sharded by index so no
        # single directory accumulates too many files
        for k, idx in enumerate(indices):
            out_dir = f'/home/prquan/emb_data/{idx // 10**7}'
            os.makedirs(out_dir, exist_ok=True)  # ensure the shard directory exists
            np.save(f'{out_dir}/{idx}.npy', batch_embeddings[k])

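# A minimal read-back sketch, assuming the same /home/prquan/emb_data layout
# used above (index 0 lands in shard 0):
# emb = np.load('/home/prquan/emb_data/0/0.npy')
# print(emb.shape, np.linalg.norm(emb))  # (384,) for MiniLM-L6; norm ~1.0 after normalization
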
def analyze_time_series(ts: np.ndarray):
    # Estimate the overall trend as the slope of a linear regression against time
    x = np.arange(len(ts))
    slope, _, _, _, _ = linregress(x, ts)
    # Compute the (biased) autocorrelation estimate for all lags
    n = len(ts)
    ts_centered = ts - np.mean(ts)
    autocorr_full = np.correlate(ts_centered, ts_centered, mode='full') / (np.var(ts) * n + 1e-8)
    autocorr = autocorr_full[n - 1:]  # keep the second half (non-negative lags)
    # Find the strongest lag, excluding lag 0
    top_lag = np.argmax(autocorr[1:]) + 1  # +1 compensates for slicing off lag 0
    return slope, top_lag

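# Usage sketch on a synthetic series (illustrative only): a sine with period 24
# plus a 0.01 ramp. The slope should come back close to 0.01; note that with
# this biased estimator, small lags often score highest on smooth series.
# t = np.arange(192, dtype=np.float32)
# toy = np.sin(2 * np.pi * t / 24) + 0.01 * t
# slope, top_lag = analyze_time_series(toy)
# print(slope, top_lag)
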
# Build the text prompt for one window of the time series
def process_iteration(i, df, description, lag, length):
    template = ("The input has a minimum of {:.2f}, a maximum of {:.2f}, and a median of {:.2f}. "
                "The overall trend is {:.2f}. The top lag is {:.2f}.")
    dataset = df['source'].iloc[i]
    timestamp = df['date'].iloc[i]
    timestamp_str = f'The collection time of the input starts at {timestamp}. '
    # Pick one of the rephrased descriptions for this dataset at random
    desc = random.choice(description[dataset])
    ts = df['OT'].iloc[i:i + lag].to_numpy(dtype=np.float32)
    if len(ts) <= 1:
        # Too few points for regression/autocorrelation; fall back to zeros
        return desc + timestamp_str + template.format(0, 0, 0, 0, 0)
    slope, top_lag = analyze_time_series(ts)
    ts_min, ts_max, ts_med = ts.min(), ts.max(), np.median(ts)
    return desc + timestamp_str + template.format(ts_min, ts_max, ts_med, slope, top_lag)

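# With hypothetical values, one produced prompt reads roughly:
# "<dataset description> The collection time of the input starts at 2016-07-01 00:00:00.
#  The input has a minimum of 0.09, a maximum of 100.00, and a median of 4.52.
#  The overall trend is 0.01. The top lag is 24.00."
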
if __name__ == '__main__':
    csv_name = 'all_domain_m2.csv'
    # csv_name = "./dataloaders/data/informer/etth/ETTh3.csv"  # alternative input
    df = pd.read_csv(csv_name)
    description_name = 'rephrased_descriptions.csv'
    description = pd.read_csv(description_name)
    lag, horizon = 192, 48
    length = lag + horizon

    # Build one prompt per row. Iterating the futures dict in submission order
    # keeps `sentences` aligned with the row indices.
    sentences = []
    with ThreadPoolExecutor(max_workers=1) as executor:
        futures = {executor.submit(process_iteration, i, df, description, lag, length): i
                   for i in tqdm(range(len(df)))}
        for future in tqdm(futures, total=len(futures)):
            sentences.append(future.result())

    # Embed every prompt and save the embeddings to per-index .npy files
    get_gpt_embeddings(sentences, model, tokenizer, batch_size=256)
    # Keep the raw prompts alongside the embeddings for inspection
    pd.DataFrame({'text': sentences}).to_csv('text.csv', index=False)