-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpretrain.py
76 lines (64 loc) · 3.15 KB
/
pretrain.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import os
from gensim.models.word2vec import Word2Vec
import nltk
import json
from typing import List, Dict
import sys
from graph.detectors.models.deepwukong import model_args as dwk_model_args
from graph.detectors.models.devign import model_args as devign_model_args
from graph.detectors.models.reveal import model_args as reveal_model_args
from graph.detectors.models.ivdetect import model_args as ivdetect_model_args
from graph.detectors.detector_utils.ivdetect_util import lexical_parse
from sequence.detectors.models.tokenlstm import model_args as tl_model_args
# pretrain embedding models for detectors
window_size = 10
w2v_sizes = {
"deepwukong": dwk_model_args.vector_size,
"devign": devign_model_args.vector_size,
"reveal": reveal_model_args.vector_size,
"ivdetect": ivdetect_model_args.feature_representation_size,
"tokenlstm": tl_model_args.vector_size
}
class Sentences:
def __init__(self, datas: List[Dict], detector: str):
self.datas: List[Dict] = datas
self.detector: str = detector
def __iter__(self):
for graph_data in self.datas:
if self.detector in {"devign", "reveal", "ivdetect"}:
json_contents = graph_data["nodes"]
for json_content in json_contents:
graph_data: Dict = json.loads(json_content)
if self.detector == "ivdetect":
statement_after_split = lexical_parse(graph_data["contents"][0][1])
else:
statement_after_split = nltk.word_tokenize(graph_data["contents"][0][1])
yield statement_after_split
# deepwukong
elif self.detector == "deepwukong":
contents = graph_data["nodes-line-sym"]
for statement in contents:
statement_after_split = nltk.word_tokenize(statement)
yield statement_after_split
# Sequence Based Detector
else:
contents: List[str] = list()
json_contents: List[str] = graph_data["node-line-content"]
for json_content in json_contents:
statement_after_split = nltk.word_tokenize(json_content)
contents.extend(statement_after_split)
yield contents
if __name__ == '__main__':
# name of detector, choice is deepwukong, ivdetect, reveal, devign
detector_name = sys.argv[1]
# path to dataset, should include train_vul.json and train_normal.json
train_dataset_path = sys.argv[2]
# path to save
pretrain_doc2vec_model_path = sys.argv[3]
train_jsons: List[Dict] = json.load(open(os.path.join(train_dataset_path,
"train_vul.json"), 'r', encoding='utf-8')) + \
json.load(open(os.path.join(train_dataset_path,
"train_normal.json"), 'r', encoding='utf-8'))
sentences = Sentences(train_jsons, detector_name)
model = Word2Vec(sentences, size=w2v_sizes[detector_name], window=window_size, hs=1, min_count=1, iter=20)
model.save(pretrain_doc2vec_model_path)