'''
ELMo usage example with character inputs, adapted from the allenai/bilm-tf
usage example to embed source code tokens.
'''
import os
import tensorflow as tf
from bilm import Batcher, BidirectionalLanguageModel, weight_layers
# Location of the pretrained biLM. The upstream example uses the test
# fixtures; here we point at a model trained on a JavaScript corpus.
# data_dir = os.path.join('tests', 'fixtures', 'model')
data_dir = '/disk/scratch/mpatsis/eddie/data/phog/js/'
vocab_file = os.path.join(data_dir, 'vocab')
model_dir = '/disk/scratch/mpatsis/eddie/models/phog/js/elmo/1024/'
options_file = os.path.join(model_dir, 'query_options.json')
weight_file = os.path.join(model_dir, 'weights/weights.hdf5')
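# Note: the options JSON describes the biLM architecture and must match
# the architecture that produced weights.hdf5.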
# Create a Batcher to map text to character ids.
batcher = Batcher(vocab_file, 50)
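# The second argument is the maximum number of characters per token; it
# must agree with max_characters_per_token in the options file.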
# Input placeholders to the biLM.
code_character_ids = tf.placeholder('int32', shape=(None, None, 50))
# Build the biLM graph.
bilm = BidirectionalLanguageModel(options_file, weight_file)
# Get ops to compute the LM embeddings.
code_embeddings_op = bilm(code_character_ids)
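# In bilm-tf this returns a dict of ops, including 'lm_embeddings'
# (activations from all biLM layers), 'lengths' and 'mask'.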
# Get an op to compute ELMo (a learned weighted average of the internal
# biLM layers). This example computes a single ELMo representation for
# the code at the task input; to embed a second stream (e.g. a question)
# with the same ELMo weights, reuse the variable scope as sketched below.
elmo_code_rep_op = weight_layers('input', code_embeddings_op, l2_coef=0.0)
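# weight_layers returns a dict with 'weighted_op' (the ELMo vectors) and
# 'regularization_op' (an L2 penalty on the layer weights).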
# with tf.variable_scope('', reuse=True):
#     # the reuse=True scope reuses weights from the code for the question
#     elmo_question_input = weight_layers(
#         'input', question_embeddings_op, l2_coef=0.0
#     )
# Now we can compute embeddings.
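# Each token carries a lexical-category prefix matching the training
# vocabulary (apparently STD: for keywords/punctuation, ID: for
# identifiers and LIT: for literals).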
raw_code = [
    'STD:function STD:( ID:e STD:, ID:tags STD:) STD:{ ID:tags STD:. ID:should STD:. ' +
    'ID:have STD:. ID:lengthOf STD:( LIT:1 STD:) STD:; ID:done STD:( STD:) STD:; STD:} ' +
    'STD:) STD:; STD:} STD:) STD:; STD:} STD:) STD:; STD:} STD:) STD:; STD:} STD:) STD:; ' +
    'STD:} STD:) STD:; STD:} STD:) STD:; STD:} STD:) STD:; STD:} STD:) STD:; STD:} STD:) ' +
    'STD:; STD:} STD:) STD:;',
    'STD:var ID:gfm STD:= ID:require STD:( LIT:github-flavored-markdown STD:) STD:;'
]
tokenized_code = [sentence.split() for sentence in raw_code]
# tokenized_question = [
#     ['What', 'are', 'biLMs', 'useful', 'for', '?'],
# ]
with tf.Session() as sess:
    # It is necessary to initialize variables once before running inference.
    sess.run(tf.global_variables_initializer())
    # Create batches of data.
    code_ids = batcher.batch_sentences(tokenized_code)
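    # batch_sentences returns character ids shaped
    # (n_sentences, max_sentence_length + 2, max_token_length); the +2
    # accounts for the <S> and </S> markers the Batcher adds.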
    # question_ids = batcher.batch_sentences(tokenized_question)
    # Warm up the stateful LSTM, otherwise the embeddings will be
    # inconsistent across runs.
    for step in range(500):
        elmo_code_representation = sess.run(
            [elmo_code_rep_op['weighted_op']],
            feed_dict={code_character_ids: code_ids}
        )
    # Compute ELMo representations (here for the input only, for simplicity).
    elmo_code_representation = sess.run(
        [elmo_code_rep_op['weighted_op']],
        feed_dict={code_character_ids: code_ids}
    )
    print(elmo_code_representation)
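    # sess.run returns a list with one array holding one ELMo vector per
    # token, shaped roughly (n_sentences, max_sentence_length, lm_dim).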
    # elmo_question_input_ = sess.run(
    #     [elmo_question_input['weighted_op']],
    #     feed_dict={question_character_ids: question_ids}
    # )