Commit f7d6cb1

Added proof step processing script and token vocab. Edited default datapath in options.py and directions in README
efirst committed Feb 25, 2021
1 parent 44f5198 commit f7d6cb1
Showing 4 changed files with 71 additions and 2 deletions.
3 changes: 2 additions & 1 deletion README.md
@@ -51,8 +51,9 @@ This generates a `*.json` file in `./data/` corresponding to each `*.meta` file.

### Training Examples (proof steps)

-1. Proof steps used in the paper are found in `processed.tar.gz`, which can be downloaded from the replication package link provided above. This should be copied into `TacTok/TacTok`
+1. Proof steps used in the paper are found in `processed.tar.gz`, which can be downloaded from the replication package link provided above. This should be copied into `TacTok/`
 2. To extract new proofs, run `python extract_proof_steps.py`.
+3. To generate new proof steps that have a `prev_tokens` field, run `python process_proof_steps.py`. This will generate `processed/proof_steps`.

## 4. Training TacTok

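As a quick sanity check on the README steps above, a minimal sketch (not part of this commit) that counts the generated files; the `processed/proof_steps/{train,valid}` layout follows from `process_proof_steps.py` below, everything else is an assumption:

```python
# Count the processed proof-step files produced by the README steps above.
# Directory layout (processed/proof_steps/{train,valid}) mirrors what
# process_proof_steps.py writes; this checker itself is hypothetical.
import os
from glob import glob

for split in ('train', 'valid'):
    files = glob(os.path.join('processed', 'proof_steps', split, '*.pickle'))
    print(split, len(files), 'proof steps')
```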
2 changes: 1 addition & 1 deletion TacTok/options.py
@@ -20,7 +20,7 @@ def parse_args():
     # experimental setup
     parser.add_argument('--include_synthetic', action='store_true')
     parser.add_argument('--exp_id', type=str)
-    parser.add_argument('--datapath', type=str, default='proof_steps/human')
+    parser.add_argument('--datapath', type=str, default='processed/proof_steps')
     parser.add_argument('--projs_split', type=str, default='../projs_split.json')
     parser.add_argument('--num_epochs', type=int, default=4)
     parser.add_argument('--resume', type=str, help='the model checkpoint to resume')
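The effect of the new default, as a hedged sketch (assuming `parse_args()` returns the parsed argument namespace; the `main.py` invocation is hypothetical):

```python
# With the new default, --datapath can be omitted when training on the
# processed steps generated by process_proof_steps.py.
from options import parse_args

opts = parse_args()   # e.g. python main.py --exp_id demo  (hypothetical entry point)
print(opts.datapath)  # 'processed/proof_steps' unless overridden on the CLI
```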
68 changes: 68 additions & 0 deletions TacTok/process_proof_steps.py
@@ -0,0 +1,68 @@
import torch
from torch.utils.data import Dataset, DataLoader
from options import parse_args
import random
from progressbar import ProgressBar
import os
import sys
sys.setrecursionlimit(100000)
import pickle
from collections import defaultdict
import numpy as np
from glob import glob
import json
import pdb
import string

# Strip punctuation from tactic text, but keep apostrophes and underscores,
# which are meaningful inside Coq identifiers.
rem_punc = string.punctuation.replace('\'', '').replace('_', '')
table = str.maketrans('', '', rem_punc)

def tokenize_text(raw_text):
    without_punc = raw_text.translate(table)
    words = without_punc.split()
    return words


proof_steps = glob(os.path.join('proof_steps', 'train/*.pickle')) + \
              glob(os.path.join('proof_steps', 'valid/*.pickle'))

# Group the step files by proof: (source file, proof name) -> {step number: file index}.
proofs = {}

print(len(proof_steps), 'proof step files found')
print("Collecting proofs from steps")
for idx in range(len(proof_steps)):
    with open(proof_steps[idx], 'rb') as f:
        proof_step = pickle.load(f)
    key = (proof_step['file'], proof_step['proof_name'])
    n_step = proof_step['n_step']
    if key not in proofs:
        proofs[key] = {}
    proofs[key][n_step] = idx

# Create the output directories (processed/proof_steps/{train,valid})
# before writing, so the dumps below do not fail on a fresh checkout.
for split in ('train', 'valid'):
    os.makedirs(os.path.join('processed', 'proof_steps', split), exist_ok=True)

print("Generating new proof steps")
for p_key in proofs:
    seq_raw = []             # list of strings, each string is a command
    seq_proc_separated = []  # list of lists of strings, each string is a token
    seq_proc_complete = []   # list of strings, each string is a token
    proof = proofs[p_key]    # dictionary mapping step number -> file index

    # Walk the proof in step order so each step sees only the tactics before it.
    for n_key in sorted(proof.keys()):
        idx = proof[n_key]
        with open(proof_steps[idx], 'rb') as f:
            proof_step = pickle.load(f)
        # Attach the accumulated history. The dump below happens before the
        # current tactic is appended, so these fields exclude the current step.
        proof_step['prev_strings'] = seq_raw
        proof_step['prev_tactic_list'] = seq_proc_separated
        proof_step['prev_tokens'] = seq_proc_complete
        new_file_name = os.path.join('processed', proof_steps[idx])
        with open(new_file_name, 'wb') as new_f:
            pickle.dump(proof_step, new_f)
        raw_text = proof_step['tactic']['text']
        text_words = tokenize_text(raw_text)
        seq_raw.append(raw_text)
        seq_proc_separated.append(text_words)
        seq_proc_complete += text_words
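A hedged sketch of consuming the script's output; the file name is hypothetical, but the three `prev_*` fields are exactly the ones the loop above attaches to each step:

```python
# Inspect the history fields that process_proof_steps.py adds to each step.
import pickle

path = 'processed/proof_steps/train/00000000.pickle'  # hypothetical file name
with open(path, 'rb') as f:
    step = pickle.load(f)

print(step['prev_strings'])      # raw text of each earlier tactic in the proof
print(step['prev_tactic_list'])  # the same tactics, one token list per tactic
print(step['prev_tokens'])       # all earlier tokens flattened into one list
```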
Binary file added: TacTok/token_vocab.pickle (binary contents not shown)
