Commit f7d6cb1

Added proof step processing script and token vocab. Edited default datapath in options.py and directions in README
efirst committed Feb 25, 2021
1 parent 44f5198 commit f7d6cb1
Showing 4 changed files with 71 additions and 2 deletions.
3 changes: 2 additions & 1 deletion README.md
@@ -51,8 +51,9 @@ This generates a `*.json` file in `./data/` corresponding to each `*.meta` file.

### Training Examples (proof steps)

-1. Proof steps used in the paper are found in `processed.tar.gz`, which can be downloaded from the replication package link provided above. This should be copied into `TacTok/TacTok`
+1. Proof steps used in the paper are found in `processed.tar.gz`, which can be downloaded from the replication package link provided above. This should be copied into `TacTok/`
 2. To extract new proofs, run `python extract_proof_steps.py`.
+3. To generate new proof steps that have a `prev_tokens` field, run `python process_proof_steps.py`. This will generate `processed/proof_steps`.

## 4. Training TacTok

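As a quick sanity check on the README steps above, a minimal sketch (not part of this commit) that counts the generated files; the `processed/proof_steps/{train,valid}` layout follows from `process_proof_steps.py` below, everything else is an assumption:

```python
# Count the processed proof-step files produced by the README steps above.
# Directory layout (processed/proof_steps/{train,valid}) mirrors what
# process_proof_steps.py writes; this checker itself is hypothetical.
import os
from glob import glob

for split in ('train', 'valid'):
    files = glob(os.path.join('processed', 'proof_steps', split, '*.pickle'))
    print(split, len(files), 'proof steps')
```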
2 changes: 1 addition & 1 deletion TacTok/options.py
@@ -20,7 +20,7 @@ def parse_args():
     # experimental setup
     parser.add_argument('--include_synthetic', action='store_true')
     parser.add_argument('--exp_id', type=str)
-    parser.add_argument('--datapath', type=str, default='proof_steps/human')
+    parser.add_argument('--datapath', type=str, default='processed/proof_steps')
     parser.add_argument('--projs_split', type=str, default='../projs_split.json')
     parser.add_argument('--num_epochs', type=int, default=4)
     parser.add_argument('--resume', type=str, help='the model checkpoint to resume')
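The effect of the new default, as a hedged sketch (assuming `parse_args()` returns the parsed argument namespace; the `main.py` invocation is hypothetical):

```python
# With the new default, --datapath can be omitted when training on the
# processed steps generated by process_proof_steps.py.
from options import parse_args

opts = parse_args()   # e.g. python main.py --exp_id demo  (hypothetical entry point)
print(opts.datapath)  # 'processed/proof_steps' unless overridden on the CLI
```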
68 changes: 68 additions & 0 deletions TacTok/process_proof_steps.py
@@ -0,0 +1,68 @@
import torch
from torch.utils.data import Dataset, DataLoader
from options import parse_args
import random
from progressbar import ProgressBar
import os
import sys
sys.setrecursionlimit(100000)
import pickle
from collections import defaultdict
import numpy as np
from glob import glob
import json
import pdb
import string

# Strip punctuation from tactic text, but keep apostrophes and underscores,
# which are meaningful inside Coq identifiers.
rem_punc = string.punctuation.replace('\'', '').replace('_', '')
table = str.maketrans('', '', rem_punc)

def tokenize_text(raw_text):
    without_punc = raw_text.translate(table)
    words = without_punc.split()
    return words


proof_steps = glob(os.path.join('proof_steps', 'train/*.pickle')) + \
              glob(os.path.join('proof_steps', 'valid/*.pickle'))

# Group the step files by proof: (source file, proof name) -> {step number: file index}.
proofs = {}

print(len(proof_steps), 'proof step files found')
print("Collecting proofs from steps")
for idx in range(len(proof_steps)):
    with open(proof_steps[idx], 'rb') as f:
        proof_step = pickle.load(f)
    key = (proof_step['file'], proof_step['proof_name'])
    n_step = proof_step['n_step']
    if key not in proofs:
        proofs[key] = {}
    proofs[key][n_step] = idx

# Create the output directories (processed/proof_steps/{train,valid})
# before writing, so the dumps below do not fail on a fresh checkout.
for split in ('train', 'valid'):
    os.makedirs(os.path.join('processed', 'proof_steps', split), exist_ok=True)

print("Generating new proof steps")
for p_key in proofs:
    seq_raw = []             # list of strings, each string is a command
    seq_proc_separated = []  # list of lists of strings, each string is a token
    seq_proc_complete = []   # list of strings, each string is a token
    proof = proofs[p_key]    # dictionary mapping step number -> file index

    # Walk the proof in step order so each step sees only the tactics before it.
    for n_key in sorted(proof.keys()):
        idx = proof[n_key]
        with open(proof_steps[idx], 'rb') as f:
            proof_step = pickle.load(f)
        # Attach the accumulated history. The dump below happens before the
        # current tactic is appended, so these fields exclude the current step.
        proof_step['prev_strings'] = seq_raw
        proof_step['prev_tactic_list'] = seq_proc_separated
        proof_step['prev_tokens'] = seq_proc_complete
        new_file_name = os.path.join('processed', proof_steps[idx])
        with open(new_file_name, 'wb') as new_f:
            pickle.dump(proof_step, new_f)
        raw_text = proof_step['tactic']['text']
        text_words = tokenize_text(raw_text)
        seq_raw.append(raw_text)
        seq_proc_separated.append(text_words)
        seq_proc_complete += text_words
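A hedged sketch of consuming the script's output; the file name is hypothetical, but the three `prev_*` fields are exactly the ones the loop above attaches to each step:

```python
# Inspect the history fields that process_proof_steps.py adds to each step.
import pickle

path = 'processed/proof_steps/train/00000000.pickle'  # hypothetical file name
with open(path, 'rb') as f:
    step = pickle.load(f)

print(step['prev_strings'])      # raw text of each earlier tactic in the proof
print(step['prev_tactic_list'])  # the same tactics, one token list per tactic
print(step['prev_tokens'])       # all earlier tokens flattened into one list
```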
Binary file added: TacTok/token_vocab.pickle (binary contents not shown)
