# construction_bool_to_query_training.py
import argparse
import os
from matplotlib import pyplot as plt
from tqdm import tqdm
import json
from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer
# Load a LLaMA tokenizer from "out_7b/" (presumably a local checkpoint
# directory -- TODO confirm). In the code visible here it is only referenced
# by the commented-out prompt-length calculations further down.
tokenizer = LlamaTokenizer.from_pretrained("out_7b/")
# Alpaca-style prompt template with a single ``{boolean}`` placeholder.
# In the code visible here it is only referenced by commented-out lines.
# NOTE(review): there is no space between "...boolean query." and
# "The effectiveness..." -- kept byte-for-byte; confirm whether intended.
Apaca_PROMPT = (
    'Below is an instruction that describes a task, paired with an input that provides further context. '
    'Write a response that appropriately completes the request.\n\n'
    '### Instruction:\n'
    ' Construct a high-quality natural language query for the provided systematic review boolean query.'
    'The effectiveness of the query will be determined by its capability to retrieve relevant documents when searching on a semantic-based search engine. '
    'The generated query can also be the same as the original topic, as long as it can achieve high effectiveness.'
    '\n\n ### Input:\n{boolean}\n\n '
    '### Response:\n'
)

# Instruction text stored in every generated training example.
# Two further sentences were disabled by the original author:
#   'The effectiveness of the query will be determined by its capability to retrieve relevant documents when searching on a semantic-based search engine. '
#   'The generated query can also be the same as the original topic, as long as it can achieve high effectiveness.'
Instruction = (
    'Construct a natural language query using the systematic review boolean query provided.'
)

# Disabled settings kept for reference:
#   metrics = ["last_rel", "ap", "recall1.0", "recall5.0", "recall10.0", "recall20.0", "wss_95", "wss_100"]
#   types = ["title_openai", "title_alpaca", "title_bioalpaca"]
#   extra collections: "clef-tar/CLEF-2019-dta/training",
#       "clef-tar/CLEF-2019-intervention/training", "sysrev-seed-collection/training"

# Collection sub-directories (relative to ``data/``) to generate files for.
collections = [
    "clef-tar/CLEF-2017",
    "clef-tar/CLEF-2018",
    "clef-tar/CLEF-2019-dta",
    "clef-tar/CLEF-2019-intervention",
    "sysrev-seed-collection",
]

# Template copied for each training example before "input"/"output" are added.
init_dict = dict(instruction=Instruction)
def _load_train_topic_ids(qrel_path):
    """Return the set of topic ids listed in the qrel file at *qrel_path*.

    Every non-blank line must hold exactly four whitespace-separated fields,
    read as ``qid``, an ignored field, ``docid`` and ``score``; any other
    shape raises ``ValueError``. Only the topic ids are kept because the
    generation step below tests topic membership and nothing else.
    """
    topic_ids = set()
    with open(qrel_path) as qrel_fh:
        for raw_line in qrel_fh:
            if not raw_line.strip():
                continue  # tolerate blank / trailing empty lines
            qid, _, _docid, _score = raw_line.split()
            topic_ids.add(qid)
    return topic_ids


def _build_examples(jsonl_path, topic_ids, base_example, multiple_outputs=False):
    """Build training examples from a JSONL file of generated queries.

    Each non-blank line is a JSON object with the keys ``"id"``,
    ``"boolean"`` and ``"generated_query"``. Records whose ``"id"`` is not in
    *topic_ids* are skipped.

    When *multiple_outputs* is false, ``"generated_query"`` is used as a
    single output. When true, it is iterated and one example is produced per
    element.

    Every example is a NEW dict made from *base_example* plus ``"input"`` and
    ``"output"``. Reusing one dict across several outputs would make all list
    entries alias the same object, so they would all serialise with the last
    output.
    """
    examples = []
    with open(jsonl_path) as jsonl_fh:
        for raw_line in jsonl_fh:
            if not raw_line.strip():
                continue  # tolerate blank / trailing empty lines
            record = json.loads(raw_line)
            generated = record["generated_query"]
            boolean_query = record["boolean"]
            topic_id = record["id"]
            # NOTE(review): topic_ids holds strings parsed from the qrel file;
            # this assumes record["id"] is a string too -- confirm.
            if topic_id not in topic_ids:
                continue
            outputs = generated if multiple_outputs else [generated]
            for output in outputs:
                example = dict(base_example)
                example["input"] = boolean_query
                example["output"] = output
                examples.append(example)
    return examples


def _write_examples(examples, out_path):
    """Print the number of *examples* and dump them to *out_path* as JSON."""
    print(len(examples))
    # Opened only after the examples are fully built, so a failure while
    # reading the input does not leave a truncated output file behind.
    with open(out_path, "w") as out_fh:
        json.dump(examples, out_fh, indent=4)


# For every collection, write two instruction-tuning files restricted to the
# topics present in that collection's training qrels: one built from the
# single-query generations and one from the multi-query generations.
for collection in collections:
    train_topic_ids = _load_train_topic_ids(
        os.path.join("data", collection, "qrel_abs_train_loo.txt")
    )

    single_query_examples = _build_examples(
        os.path.join("data", "generated_from_boolean_openai.jsonl"),
        train_topic_ids,
        init_dict,
    )
    _write_examples(
        single_query_examples,
        os.path.join("data", collection, "generation_model_boolean_to_query.json"),
    )

    multi_query_examples = _build_examples(
        os.path.join("data", "generated_from_booleans_openai.jsonl"),
        train_topic_ids,
        init_dict,
        multiple_outputs=True,
    )
    _write_examples(
        multi_query_examples,
        os.path.join("data", collection, "generation_model_boolean_to_queries.json"),
    )