diff --git a/README.md b/README.md
index 0da7166c..fbb8e160 100644
--- a/README.md
+++ b/README.md
@@ -51,8 +51,9 @@ This generates a `*.json` file in `./data/` corresponding to each `*.meta` file.
 
 ### Training Examples (proof steps)
 
-1. Proofs steps used in the paper are found in `processed.tar.gz`, which can be downloaded from the replication package link provided above. This should be copied into `TacTok/TacTok`
+1. Proof steps used in the paper are found in `processed.tar.gz`, which can be downloaded from the replication package link provided above. This should be copied into `TacTok/`
 2. To extract new proofs, run `python extract_proof_steps.py`.
+3. To generate new proof steps that have a `prev_tokens` field, run `python process_proof_steps.py`. This will generate `processed/proof_steps`.
 
 ## 4. Training TacTok
 
diff --git a/TacTok/options.py b/TacTok/options.py
index c2d1a66d..3885a3a9 100644
--- a/TacTok/options.py
+++ b/TacTok/options.py
@@ -20,7 +20,7 @@ def parse_args():
     # experimental setup
     parser.add_argument('--include_synthetic', action='store_true')
     parser.add_argument('--exp_id', type=str)
-    parser.add_argument('--datapath', type=str, default='proof_steps/human')
+    parser.add_argument('--datapath', type=str, default='processed/proof_steps')
     parser.add_argument('--projs_split', type=str, default='../projs_split.json')
     parser.add_argument('--num_epochs', type=int, default=4)
     parser.add_argument('--resume', type=str, help='the model checkpoint to resume')
diff --git a/TacTok/process_proof_steps.py b/TacTok/process_proof_steps.py
new file mode 100644
index 00000000..1029a422
--- /dev/null
+++ b/TacTok/process_proof_steps.py
@@ -0,0 +1,70 @@
+import torch
+from torch.utils.data import Dataset, DataLoader
+from options import parse_args
+import random
+from progressbar import ProgressBar
+import os
+import sys
+sys.setrecursionlimit(100000)
+import pickle
+from collections import defaultdict
+import numpy as np
+from glob import glob
+import json
+import pdb
+import string
+
+rem_punc = string.punctuation.replace('\'', '').replace('_', '')
+table = str.maketrans('', '', rem_punc)
+
+def tokenize_text(raw_text):
+    without_punc = raw_text.translate(table)
+    words = without_punc.split()
+    return words
+
+
+proof_steps = glob(os.path.join('proof_steps', 'train/*.pickle')) + \
+              glob(os.path.join('proof_steps', 'valid/*.pickle'))
+
+proofs = {}
+
+print(len(proof_steps))
+print("Collecting proofs from steps")
+for idx in range(len(proof_steps)):
+    f = open(proof_steps[idx], 'rb')
+    proof_step = pickle.load(f)
+    f.close()
+    key = (proof_step['file'], proof_step['proof_name'])
+    n_step = proof_step['n_step']
+    if key not in proofs:
+        proofs[key] = {}
+    proofs[key][n_step] = idx
+
+
+print("Generating new proof steps")
+for p_key in proofs:
+    seq_raw = []  # list of strings, each string is a command
+    seq_proc_separated = []  # list of lists of strings, each string is a token
+    seq_proc_complete = []  # list of strings, each string is a token
+    proof = proofs[p_key]  # dictionary mapping n_step -> index in proof_steps
+
+    for n_key in sorted(proof.keys()):
+        idx = proof[n_key]
+        f = open(proof_steps[idx], 'rb')
+        proof_step = pickle.load(f)
+        f.close()
+        proof_step['prev_strings'] = seq_raw
+        proof_step['prev_tactic_list'] = seq_proc_separated
+        proof_step['prev_tokens'] = seq_proc_complete
+        new_file_name = os.path.join('processed', proof_steps[idx])
+        # Create the output directory (e.g. processed/proof_steps/train) if needed.
+        os.makedirs(os.path.dirname(new_file_name), exist_ok=True)
+        new_f = open(new_file_name, 'wb')
+        pickle.dump(proof_step, new_f)
+        new_f.close()
+        raw_text = proof_step['tactic']['text']
+        text_words = tokenize_text(raw_text)
+
+        seq_raw.append(raw_text)
+        seq_proc_separated.append(text_words)
+        seq_proc_complete += text_words
diff --git a/TacTok/token_vocab.pickle b/TacTok/token_vocab.pickle
new file mode 100644
index 00000000..ca73c5b7
Binary files /dev/null and b/TacTok/token_vocab.pickle differ
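
As a quick illustration of what the new script's tokenization does, here is a minimal standalone sketch of `tokenize_text`; the sample tactic command is made up for illustration:

```python
import string

# Same construction as in process_proof_steps.py: delete all punctuation
# except ' and _, then split on whitespace.
rem_punc = string.punctuation.replace('\'', '').replace('_', '')
table = str.maketrans('', '', rem_punc)

def tokenize_text(raw_text):
    return raw_text.translate(table).split()

# Hypothetical tactic command for demonstration.
print(tokenize_text("induction n'; simpl. rewrite <- plus_n_O."))
# ['induction', "n'", 'simpl', 'rewrite', 'plus_n_O']
```

Note that `'` and `_` are exempted from punctuation stripping, so Coq identifiers like `n'` and `plus_n_O` survive tokenization intact.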
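
To sanity-check the regenerated proof steps, one can load a pickle and verify the new fields; this sketch assumes the script was run from `TacTok/`, so output landed in `processed/proof_steps/`:

```python
import glob
import pickle

# Load one regenerated proof step.
path = glob.glob('processed/proof_steps/train/*.pickle')[0]
with open(path, 'rb') as f:
    step = pickle.load(f)

# prev_strings: raw text of each earlier command in this proof;
# prev_tactic_list: the same commands, tokenized (one list per command);
# prev_tokens: those token lists flattened into a single sequence.
print(step['proof_name'], step['n_step'])
print(step['prev_strings'])
assert len(step['prev_strings']) == len(step['prev_tactic_list'])
assert sum(len(t) for t in step['prev_tactic_list']) == len(step['prev_tokens'])
```

The two asserts follow from how the script builds the `prev_*` fields: each processed command contributes one entry to `prev_strings` and `prev_tactic_list`, and its tokens are concatenated onto `prev_tokens`.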