tum-ai · zausin33 · Jan 9, 2025 · Jan 9, 2025 · Jan 11, 2025 · Jan 12, 2025
diff --git a/config/dataset_args/debug.yaml b/config/dataset_args/debug.yaml
@@ -0,0 +1 @@
+dataset_name: debug
diff --git a/config/dataset_args/multirc.yaml b/config/dataset_args/multirc.yaml
@@ -0,0 +1,2 @@
+dataset_name: multirc
+compute_number_metrics: false
diff --git a/config/dataset_args/rjokes.yaml b/config/dataset_args/rjokes.yaml
@@ -0,0 +1 @@
+dataset_name: rjokes
diff --git a/config/model_args/vanilla_t5.yaml b/config/model_args/vanilla_t5.yaml
@@ -1,3 +1,3 @@
 name: vanilla_t5
-config_name: t5-base
+config_name: t5-small
 number_encoding: none
diff --git a/config/model_args/vanilla_t5_custom_tokenizer.yaml b/config/model_args/vanilla_t5_custom_tokenizer.yaml
@@ -0,0 +1,4 @@
+name: vanilla_t5_custom_tokenizer
+config_name: t5-small
+number_encoding: none
+tokenizer_type: custom
diff --git a/config/model_args/vanilla_t5_ntl.yaml b/config/model_args/vanilla_t5_ntl.yaml
@@ -1,7 +1,6 @@
 name: vanilla_t5_ntl
-config_name: t5-base
+config_name: t5-small
 number_encoding: none
 number_token_loss: true
 number_token_loss_weight: 0.3
-number_token_loss_with_wasserstein: false
-#number_token_loss_function:
+number_token_loss_with_wasserstein: true
diff --git a/config/model_args/vanilla_t5_ntl_default_tokenizer.yaml b/config/model_args/vanilla_t5_ntl_default_tokenizer.yaml
@@ -0,0 +1,7 @@
+name: vanilla_t5_ntl_default_tokenizer
+config_name: t5-small
+number_encoding: none
+number_token_loss: true
+number_token_loss_weight: 0.3
+number_token_loss_with_wasserstein: true
+tokenizer_type: auto
diff --git a/config/model_args/vanilla_t5_regression_head.yaml b/config/model_args/vanilla_t5_regression_head.yaml
@@ -1,4 +1,4 @@
 name: vanilla_t5_regression_head
-config_name: t5-base
+config_name: t5-small
 number_encoding: none_regression_head
 log_scale_embeddings: false
diff --git a/config/run_specific_config/config.yaml b/config/run_specific_config/config.yaml
@@ -1,7 +1,9 @@
 training_args:
-  trial:
+  trial: ablation_studies
   special_name:
+  max_steps: 2500000
+  load_best_model_at_end: false
 
 model_args:
   model_name_or_path: google-t5/t5-small
-  config_name: t5-small
+  config_name: t5-small
diff --git a/config/training_args/train.yaml b/config/training_args/train.yaml
@@ -5,7 +5,7 @@ lr_scheduler_kwargs:
   factor: 0.5
   patience: 5
 weight_decay: 0.01
-num_train_epochs: 2000
+max_steps: 2500000
 save_total_limit: 2
 save_steps: 25000
 eval_steps: 25000

diff --git a/data/multirc/data/preprocessed/test_clean.jsonl b/data/multirc/data/preprocessed/test_clean.jsonl
diff --git a/data/multirc/data/preprocessed/train_clean.jsonl b/data/multirc/data/preprocessed/train_clean.jsonl
diff --git a/data/multirc/data/preprocessed/val_clean.jsonl b/data/multirc/data/preprocessed/val_clean.jsonl
diff --git a/data/multirc/data/test.jsonl b/data/multirc/data/test.jsonl
diff --git a/data/multirc/data/train.jsonl b/data/multirc/data/train.jsonl
diff --git a/data/multirc/data/val.jsonl b/data/multirc/data/val.jsonl
diff --git a/data/multirc/preprocess_data.py b/data/multirc/preprocess_data.py
@@ -0,0 +1,69 @@
+import json
+
+
+def create_clean_jsonl(input_file, output_file):
+    """
+    Reads the original MultiRC-style JSONL (with passage -> multiple questions -> answers)
+    and writes a new JSONL file in the format:
+
+    {
+      "question": <string>,
+      "answer": <string>
+    }
+
+    - "question": we combine the passage text and the question text into one.
+    - "answer": we concatenate all correct answers (label=1) from that question.
+    """
+
+    with open(input_file, 'r', encoding='utf-8') as fin, \
+            open(output_file, 'w', encoding='utf-8') as fout:  # mode 'w': an existing output file is overwritten
+
+        for line in fin:
+            line = line.strip()
+            if not line:  # skip blank lines
+                continue
+
+            # Each non-empty line is one JSON record: a passage with its list of questions
+            record = json.loads(line)
+            passage_text = record["passage"]["text"]
+            questions = record["passage"]["questions"]
+
+            # Emit one output record per question of this passage
+            for q in questions:
+                question_text = q["question"]
+                answers = q["answers"]
+
+                # Keep only answers marked correct (label == 1); a missing label counts as 0
+                correct_answers = [
+                    ans["text"] for ans in answers
+                    if ans.get("label", 0) == 1
+                ]
+
+                # No correct answers: the question is still written, with an empty answer
+                if not correct_answers:
+                    final_answer = ""
+                else:
+                    # Several correct answers are merged into one string, separated by " | "
+                    final_answer = " | ".join(correct_answers)
+
+                # "question" field: stripped passage, a blank line, then "Question: <stripped question>"
+                combined_question = (
+                    f"{passage_text.strip()}\n\n"
+                    f"Question: {question_text.strip()}"
+                )
+
+                # Output record with exactly two keys: "question" and "answer"
+                out_record = {
+                    "question": combined_question,
+                    "answer": final_answer
+                }
+
+                fout.write(json.dumps(out_record, ensure_ascii=False) + "\n")  # one JSON object per line, non-ASCII kept unescaped
+
+
+if __name__ == "__main__":  # script entry point: converts the single split hard-coded below
+    input_path = "../../data/multirc/data/val.jsonl"  # relative path, resolved against the current working directory
+    output_path = "../../data/multirc/data/preprocessed/val_clean.jsonl"  # NOTE(review): only val is listed; presumably train/test were produced by editing these paths - confirm
+
+    create_clean_jsonl(input_path, output_path)
+    print(f"Finished writing to {output_path}")
diff --git a/data/rjokes-dataset/data/dev.jsonl b/data/rjokes-dataset/data/dev.jsonl
diff --git a/data/rjokes-dataset/data/dev.tsv b/data/rjokes-dataset/data/dev.tsv
diff --git a/data/rjokes-dataset/data/dev_distribution.json b/data/rjokes-dataset/data/dev_distribution.json
@@ -0,0 +1,14 @@
+{
+    "1": 10382,
+    "0": 14962,
+    "3": 4194,
+    "2": 7715,
+    "6": 825,
+    "5": 1590,
+    "4": 2540,
+    "9": 172,
+    "7": 461,
+    "10": 160,
+    "8": 239,
+    "11": 6
+}
diff --git a/data/rjokes-dataset/data/test.jsonl b/data/rjokes-dataset/data/test.jsonl
diff --git a/data/rjokes-dataset/data/test.tsv b/data/rjokes-dataset/data/test.tsv
diff --git a/data/rjokes-dataset/data/test_distribution.json b/data/rjokes-dataset/data/test_distribution.json
@@ -0,0 +1,14 @@
+{
+    "0": 14942,
+    "7": 446,
+    "1": 10336,
+    "2": 7908,
+    "4": 2571,
+    "3": 4069,
+    "6": 829,
+    "5": 1558,
+    "8": 240,
+    "10": 163,
+    "9": 179,
+    "11": 5
+}
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		dataset_name: multirc
		compute_number_metrics: false