From 258ba29013c435c6082d5f4e7930efd0377cb51c Mon Sep 17 00:00:00 2001
From: Damien Sileo
Date: Wed, 10 Jul 2024 17:06:38 +0200
Subject: [PATCH] new tasks

---
 .../.ipynb_checkpoints/access-checkpoint.py   | 108 +++++++
 .../preprocess-checkpoint.py                  | 295 ++++++++++++++++++
 .../.ipynb_checkpoints/recast-checkpoint.py   | 115 +++++++
 .../.ipynb_checkpoints/tasks-checkpoint.py    |  53 +++-
 src/tasksource/preprocess.py                  |  38 ++-
 src/tasksource/recast.py                      |   6 +-
 src/tasksource/tasks.py                       |  53 +++-
 7 files changed, 629 insertions(+), 39 deletions(-)
 create mode 100644 src/tasksource/.ipynb_checkpoints/access-checkpoint.py
 create mode 100755 src/tasksource/.ipynb_checkpoints/preprocess-checkpoint.py
 create mode 100644 src/tasksource/.ipynb_checkpoints/recast-checkpoint.py

diff --git a/src/tasksource/.ipynb_checkpoints/access-checkpoint.py b/src/tasksource/.ipynb_checkpoints/access-checkpoint.py
new file mode 100644
index 0000000..bb49a19
--- /dev/null
+++ b/src/tasksource/.ipynb_checkpoints/access-checkpoint.py
@@ -0,0 +1,108 @@
+from .preprocess import Preprocessing
+import re
+import pandas as pd
+from . import tasks, recast
+from .metadata import dataset_rank
+from datasets import load_dataset
+import funcy as fc
+import os
+import copy
+from sorcery import dict_of
+from functools import cache
+import random
+
+
+class lazy_mtasks:
+    def __getattr__(self, name):
+        from . import mtasks
+        return getattr(mtasks, name)
+
+    def __dir__(self):
+        from . import mtasks
+        return dir(mtasks)
+lmtasks=lazy_mtasks()
+
+def parse_var_name(s):
+    config_name,task_name = None,None
+    if '__' in s and '___' not in s: # dataset__task
+        dataset_name, task_name = s.split('__')
+    elif '__' not in s.replace('___','') and '___' in s: #dataset___config
+        dataset_name, config_name = s.split('___')
+    elif '___' in s and '__' in s.split('___')[1]: #dataset___config__task
+        dataset_name, config_task=s.split('___')
+        config_name,task_name = config_task.split('__')
+    else: # dataset
+        dataset_name = s
+    return dataset_name,config_name,task_name
+
+def pretty_name(x):
+    dn = x.dataset_name.split("/")[-1]
+    cn = x.config_name if x.config_name else ""
+    tn = x.task_name if x.task_name else ""
+    return f"{dn}/{cn}/{tn}".replace('//','/').rstrip('/')
+
+@cache
+def list_tasks(tasks_path=f'{os.path.dirname(__file__)}/tasks.py',multilingual=False,instruct=False, excluded=[]):
+    if multilingual:
+        tasks_path=tasks_path.replace('/tasks.py','/mtasks.py')
+    task_order = open(tasks_path).readlines()
+    task_order = [x.split('=')[0].rstrip() for x in task_order if '=' in x]
+    task_order = [x for x in task_order if x.isidentifier()]
+    task_order = fc.flip(dict(enumerate(task_order)))
+
+    l = []
+    _tasks = (lmtasks if multilingual else tasks)
+
+    for key in dir(_tasks):
+        if key not in task_order:
+            continue
+        value=getattr(_tasks, key)
+        if isinstance(value,Preprocessing):
+            dataset_name, config_name, task_name = parse_var_name(key)
+            dataset_name = (value.dataset_name if value.dataset_name else dataset_name)
+            config_name = (value.config_name if value.config_name else config_name)
+            hasattr(value,key)
+            l+=[{'dataset_name': dataset_name,
+                'config_name' : config_name,
+                'task_name': task_name,
+                'preprocessing_name': key,
+                'task_type': value.__class__.__name__,'mapping': value,
+                'rank':task_order.get(key,None)}]
+    df=pd.DataFrame(l).explode('config_name')
+    df = df.sort_values('rank').reset_index(drop=True)
+    df['id'] = df.apply(lambda x: pretty_name(x), axis=1)
+    df.insert(0, 'id', df.pop('id'))
+    del df['rank']
+    if instruct:
+        df=df[df.id.map(lambda x: not any(a in x for a in recast.improper_labels))]
+    df=df[df.id.map(lambda x: not any(x in a for a in excluded))]
+    return df
+
+#task_df =list_tasks()
+#mtask_df =list_tasks(multilingual=True)
+
+def dict_to_query(d=dict(), **kwargs):
+    d={**d,**kwargs}
+    return '&'.join([f'`{k}`=="{v}"' for k,v in d.items()])
+
+def load_preprocessing(tasks=tasks, **kwargs):
+    _tasks_df = list_tasks(multilingual=tasks==lmtasks)
+    y = _tasks_df.copy().query(dict_to_query(**kwargs)).iloc[0]
+    preprocessing= copy.copy(getattr(tasks, y.preprocessing_name))
+    for c in 'dataset_name','config_name':
+        if not isinstance(getattr(preprocessing,c), str):
+            setattr(preprocessing,c,getattr(y,c))
+    return preprocessing
+
+def load_task(id=None, dataset_name=None,config_name=None,task_name=None,preprocessing_name=None,
+              max_rows=None, max_rows_eval=None, multilingual=False, instruct=False, seed=0, **load_dataset_kwargs):
+    query = dict_of(id, dataset_name, config_name, task_name,preprocessing_name)
+    query = {k:v for k,v in query.items() if v}
+    _tasks = (lmtasks if multilingual else tasks)
+    preprocessing = load_preprocessing(_tasks, **query)
+    dataset = load_dataset(preprocessing.dataset_name, preprocessing.config_name, **load_dataset_kwargs)
+    dataset= preprocessing(dataset,max_rows, max_rows_eval)
+    dataset.task_type = preprocessing.__class__.__name__
+    if instruct:
+        dataset=recast.recast_instruct(dataset)
+    return dataset
\ No newline at end of file
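For orientation, the file above defines tasksource's naming convention: a variable name in tasks.py encodes a (dataset, config, task) triple, parse_var_name decodes it, and load_task resolves an id end to end via list_tasks and datasets.load_dataset. A minimal usage sketch, assuming these helpers are importable as in tasksource's access module; the task names below (e.g. "glue___cola", "glue/cola") are illustrative placeholders, not entries verified against this patch:

# Sketch only: exercising the naming convention parsed by parse_var_name.
from tasksource.access import parse_var_name, load_task

print(parse_var_name("anli__a1"))             # ('anli', None, 'a1')        dataset__task
print(parse_var_name("glue___cola"))          # ('glue', 'cola', None)      dataset___config
print(parse_var_name("blimp___syntax__npi"))  # ('blimp', 'syntax', 'npi')  dataset___config__task

# load_task: looks the id up in list_tasks(), loads the dataset with
# datasets.load_dataset, then applies the registered Preprocessing.
dataset = load_task(id="glue/cola", max_rows=1000)  # placeholder id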
diff --git a/src/tasksource/.ipynb_checkpoints/preprocess-checkpoint.py b/src/tasksource/.ipynb_checkpoints/preprocess-checkpoint.py
new file mode 100755
index 0000000..e737242
--- /dev/null
+++ b/src/tasksource/.ipynb_checkpoints/preprocess-checkpoint.py
@@ -0,0 +1,295 @@
+from collections.abc import Iterable
+from dotwiz import DotWiz
+from dataclasses import dataclass
+from typing import Union
+import itertools
+import funcy as fc
+import exrex
+import magicattr
+import numpy as np
+import copy
+import datasets
+import time
+
+MAX_MC_OPTIONS = 4
+
+def get_column_names(dataset):
+    cn = dataset.column_names
+    if type(cn)==dict:
+        return set(fc.flatten(cn.values()))
+    else:
+        return set(cn)
+
+
+def sample_dataset(dataset,n=10000, n_eval=1000,seed=0):
+    for k in dataset:
+        n_k=(n if k=='train' else n_eval)
+        if n_k and len(dataset[k])>n_k:
+            dataset[k]=dataset[k].train_test_split(train_size=n_k,seed=seed)['train']
+    return dataset
+
+class Preprocessing(DotWiz):
+    default_splits = ('train','validation','test')
+    _instances = []
+
+    def __post_init__(self):
+        Preprocessing._instances+=[self]
+
+    @staticmethod
+    def __map_to_target(x,fn=lambda x:None, target=None):
+        x[target]=fn(x)
+        return x
+
+    def load(self):
+        return self(datasets.load_dataset(self.dataset_name,self.config_name))
+
+    def __call__(self,dataset, max_rows=None, max_rows_eval=None,seed=0):
+        dataset = self.pre_process(dataset)
+
+        # manage splits
+        for k,v in zip(self.default_splits, self.splits):
+            if v and k!=v:
+                dataset[k]=dataset[v]
+                del dataset[v]
+            if k in dataset and not v: # obfuscated label
+                del dataset[k]
+        dataset = fix_splits(dataset)
+
+        for k in list(dataset.keys()):
+            if k not in self.default_splits:
+                del dataset[k]
+        dataset = sample_dataset(dataset, max_rows, max_rows_eval,seed=seed)
+
+        # field annotated with a string
+        substitutions = {v:k for k,v in self.to_dict().items()
+            if (k and k not in {'splits','dataset_name','config_name'}
+            and type(v)==str and k!=v)}
+
+        dataset=dataset.remove_columns([c for c in substitutions.values() if c in
+            dataset['train'].features and c not in substitutions])
+        dataset=dataset.rename_columns(substitutions)
+
+        # field annotated with a function
+        for k in self.to_dict().keys():
+            v=getattr(self, k)
+            if callable(v) and k not in {"post_process","pre_process","load"}:
+                dataset=dataset.map(self.__map_to_target,
+                    fn_kwargs={'fn':v,'target':k})
+
+        dataset=dataset.remove_columns(
+            get_column_names(dataset)-set(self.to_dict().keys()))
+        dataset = fix_labels(dataset)
+        dataset = fix_splits(dataset) # again: label mapping changed
+        dataset = self.post_process(dataset)
+        return dataset
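To make the string-annotation step in __call__ above concrete: every field of the mapping whose value is a plain string is inverted into a rename map, so source columns are renamed to the standardized field names. A self-contained sketch of that comprehension, with made-up column names ('premise', 'hypothesis', 'label' are illustrations, not taken from this patch):

# Sketch only: the `substitutions` logic above, in isolation.
fields = {'sentence1': 'premise', 'sentence2': 'hypothesis', 'labels': 'label'}
substitutions = {v: k for k, v in fields.items()
                 if k not in {'splits', 'dataset_name', 'config_name'}
                 and type(v) == str and k != v}
assert substitutions == {'premise': 'sentence1',
                         'hypothesis': 'sentence2',
                         'label': 'labels'}
# dataset.rename_columns(substitutions) then maps each source column
# onto the field name expected by the task type (e.g. ClassificationFields).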
+
+
+@dataclass
+class cat(Preprocessing):
+    fields:Union[str,list]=None
+    separator:str=' '
+
+    def __call__(self, example=None):
+        y=[np.char.array(example[f]) + sep
+            for f,sep in zip(self.fields[::-1],itertools.repeat(self.separator))]
+        y=list(sum(*y))
+        if len(y)==1:
+            y=y[0]
+        return y
+
+
+def pretty(f):
+    class pretty_f(DotWiz):
+        def __init__(self,*args):
+            self.__f_arg = f(*args)
+            for a in args:
+                setattr(self,'value',a)
+
+        def __call__(self, *args,**kwargs):
+            return self.__f_arg(*args,**kwargs)
+
+        def __repr__(self):
+            return f"{self.__f_arg.__qualname__ .split('.')[0]}({self.value})"
+    return pretty_f
+
+class dotgetter:
+    def __init__(self, path=''):
+        self.path=path
+
+    def __bool__(self):
+        return bool(self.path)
+
+    def __getattr__(self, k):
+        return self.__class__(f'{self.path}.{k}'.lstrip('.'))
+
+    def __getitem__(self, i):
+        return self.__class__(f'{self.path}[{i}]')
+
+    def __call__(self, example=None):
+        return magicattr.get(DotWiz(example), self.path)
+
+    def __hash__(self):
+        return hash(self.path)
+
+
+@dataclass
+class ClassificationFields(Preprocessing):
+    sentence1:str='sentence1'
+    sentence2:str='sentence2'
+    labels:str='labels'
+
+@dataclass
+class Seq2SeqLMFields(Preprocessing):
+    prompt:str='prompt'
+    output:str='output'
+
+@dataclass
+class TokenClassificationFields(Preprocessing):
+    tokens:str='tokens'
+    labels:str='labels'
+
+@dataclass
+class MultipleChoiceFields(Preprocessing):
+    inputs:str='input'
+    choices:Iterable=tuple()
+    labels:str='labels'
+    choices_list:str=None
+    def __post_init__(self):
+        for i, c in enumerate(self.choices):
+            setattr(self,f'choice{i}',c)
+        delattr(self,'choices')
+        if not self.choices_list:
+            delattr(self,'choices_list')
+
+    def __call__(self,dataset, *args, **kwargs):
+        dataset = super().__call__(dataset, *args, **kwargs)
+        if self.choices_list:
+            dataset = dataset.filter(lambda x: 1