import nltk
import os
import re
import sys
import numpy as np
# NOTE: This file only contains the hyperparameters at the most abstract level,
# those that are most likely to be tuned by the user. See relevant functions
# in Utils.py for finer tuning of parameters.
# NOTE: If you wish to keep the following values as defaults but try out other
# values, you can override the defaults by assigning variables in lda_config.py
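# For instance, a typical override pattern (assumed here; the exact mechanism
# lives in lda_config.py) is to re-export these defaults and shadow the ones
# you want to change:
#     # in lda_config.py
#     from defaults import *
#     ENTIRE_CORPUS = False # override the default of True
#     num_topics = 25 # override the default of 10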
### determine hyperparameters ###
### Data management hyperparameters
NN = False # Set to False for LDA, to True for neural network classification
ENTIRE_CORPUS = True # Are we using a random subset of comments, or the whole
# dataset? The names of model files and output directories will include the
# value of this variable (e.g. the default LDA output directory label is
# LDA_[ENTIRE_CORPUS]_[num_topics] )
OVERWRITE = False # Overwrites existing sampled comment indices. Only matters
# if ENTIRE_CORPUS = False
DOWNLOAD_RAW = True # If a raw data file is not available on disk, download it
# NOTE: Be mindful of possible changes to the compression algorithm used at
# https://files.pushshift.io/reddit/comments/ beyond 02-2019, as they would
# not be reflected in the parser's code, which assumes the latest files have
# .zst extensions
CLEAN_RAW = False # After parsing, delete the raw data file from disk if it was
# not downloaded during parsing
vote_counting = True # Record the fuzzed number of upvotes when parsing
WRITE_ORIGINAL = True # Write original comments to file when parsing
author = True # Write the username of each post's author to a separate file
sentiment = False # Write sentence- and document-level sentiment of a post to
# file (based on TextBlob and Vader)
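# An illustrative sketch of the per-post scores referenced above, assuming the
# textblob and vaderSentiment packages (text is a placeholder string):
#     from textblob import TextBlob
#     from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
#     blob_polarity = TextBlob(text).sentiment.polarity # in [-1, 1]
#     vader_compound = SentimentIntensityAnalyzer().polarity_scores(text)["compound"]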
add_sentiment = False # Add CoreNLP sentiment values as a post-parsing step
# NOTE: Make sure that Stanford CoreNLP's Python package is unzipped to the
# same directory as this file and CoreNLP_server.py is also available before
# running this function.
# NOTE: Because of incompatibility with batching and hyperthreading used in
# parsing, this function should be run sequentially from NN_Book_Keeping.py
num_cores = 4 # Number of threads for sentence-by-sentence parallelization of
# CoreNLP sentiment values. Only matters if add_sentiment == True
# NOTE: Expect a massive slow-down if this number is not slightly lower than
# the machine's number of cores
### Pre-processing hyperparameters
# NOTE: The values below matter for optimizing the parallelized pre-processing
# functions.
# NOTE: On Brown University's supercomputer, batches of 24 months were found to
# be optimal
MaxVocab = 200000 # maximum size of the vocabulary
FrequencyFilter = 1 # tokens with a frequency equal to or less than this number
# will be filtered out of the corpus (when NN=True)
no_below = 2 # tokens that appear in fewer than this number of documents in the
# corpus will be filtered out (when NN=False, i.e. for the LDA model)
no_above = 0.99 # tokens that appear in more than this fraction of documents in
# the corpus will be filtered out
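# A minimal sketch of how MaxVocab, no_below and no_above presumably feed into
# gensim's vocabulary pruning (dictionary is a placeholder gensim Dictionary
# built from the tokenized corpus):
#     dictionary.filter_extremes(no_below=no_below, no_above=no_above,
#                                keep_n=MaxVocab)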
training_fraction = 0.99 # fraction of the data that will be used for learning
# the LDA model. The rest of the dataset will be used as an evaluation set for
# calculating perplexity and preventing overfitting
NN_training_fraction = 0.80 # fraction of the data that is used for training
# the neural network. The remaining [1 - NN_training_fraction] of the dataset
# will be divided randomly and equally into evaluation and test sets
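# Worked example: with training_fraction = 0.99, 1% of the corpus is held out
# for the LDA perplexity checks; with NN_training_fraction = 0.80, the
# remaining 20% is split into a 10% evaluation set and a 10% test set.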
calculate_perc_rel = True # whether the percentage of relevant comments from
# each year should be calculated and written to file
num_process = 3 # the number of parallel processes to be executed for parsing
# NOTE: Uses Python's multiprocessing package
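# A minimal sketch of the implied pattern (parse_month is a hypothetical
# per-month worker; the real logic lives in reddit_parser.py, and dates is
# defined near the bottom of this file):
#     from multiprocessing import Pool
#     with Pool(num_process) as pool:
#         pool.map(parse_month, dates)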
Neural_Relevance_Filtering = True # The dataset will be cleaned from posts
# irrelevant to the topic using a pre-trained neural network model.
# NOTE: Needs results of parsing for the same dates with WRITE_ORIGINAL==True
# NOTE: Requires a pre-trained simpletransformers model. One such model trained
# for the marijuana legalization Reddit dataset is included in the repository.
# NOTE: Default model_path is [repository path]/Human_Ratings/1_1/full_1005/
# See ROBERTA_Classifier.py for training and evaluation details.
# NOTE: This task takes a long time to complete.
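# An illustrative sketch of loading and querying such a model, assuming the
# simpletransformers API (rel_model_path is defined under ### Paths below):
#     from simpletransformers.classification import ClassificationModel
#     clf = ClassificationModel("roberta", rel_model_path, use_cuda=False)
#     predictions, raw_outputs = clf.predict(["example comment text"])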
rel_sample_num = 200 # By default, a random sample of this size will be extracted
# from the dataset to evaluate the classification model.
balanced_rel_sample = True # whether the random filtering sample should be
# balanced across classification categories (relevant, irrelevant by default)
eval_relevance = False # Compute F1, recall, precision and accuracy for the
# sample derived from Neural_Relevance_Filtering. Requires the sample to be
# complemented by manual labels. The default location for the sample is
# [repository path]/auto_labels/sample_auto_labeled.csv
# NOTE: Set to False if you intend to extract the relevance sample, since the
# produced files will contain no human judgments and the eval_relevance results
# would be nonsensical
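# A minimal sketch of the metrics above, assuming scikit-learn (y_true are the
# manual labels from the CSV, y_pred the model's labels):
#     from sklearn.metrics import accuracy_score, precision_recall_fscore_support
#     accuracy = accuracy_score(y_true, y_pred)
#     precision, recall, f1, _ = precision_recall_fscore_support(
#         y_true, y_pred, average="binary")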
num_annot = 3 # number of relevance annotators. Used to divide [rel_sample_num]
# documents evenly between the annotators with specified overlap
# NOTE: [rel_sample_num] should be divisible by this number
overlap = 0.2 # degree of overlap between annotators. Multiplying [rel_sample_num]
# by this should result in an integer
# [repository path]/original_comm/sample_auto_labeled.csv
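# Worked example (hypothetical values, since the defaults above do not divide
# evenly): with rel_sample_num = 210, num_annot = 3 and overlap = 0.2, each
# annotator receives 210 / 3 = 70 documents, and 0.2 * 210 = 42 documents are
# labeled by more than one annotator.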
### LDA hyperparameters
# NOTE: The number of processes for parallelization is currently set manually.
# See notes in reddit_parser.py and Reddit_LDA_Analysis.py for more details
n_random_comments = 1500 # number of comments to sample from each year for
# training. Only matters if ENTIRE_CORPUS = False.
iterations = 1000 # number of times LDA posterior distributions will be sampled
num_threads = 5 # number of threads used for parallelized processing of comments
# Only matters if using _Threaded functions
num_topics = 10 # number of topics to be generated in each LDA sampling
alpha = 0.1 # determines how many high probability topics will be assigned to a
# document in general (not to be confused with NN l2regularization constant)
minimum_probability = 0.01 # minimum acceptable probability for an output topic
# across corpus
eta = 0.1 # determines how many high probability words will be assigned to a
# topic in general
minimum_phi_value = 0.01 # determines the lower bound on per-term topic
# probability. Only matters if per_word_topics = True.
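# A minimal sketch of how the values above plug into gensim's LdaModel, whose
# keyword arguments match these parameter names (corpus and dictionary are
# placeholders):
#     from gensim.models import LdaModel
#     lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics,
#                    alpha=alpha, eta=eta, iterations=iterations,
#                    minimum_probability=minimum_probability,
#                    minimum_phi_value=minimum_phi_value, per_word_topics=True)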
one_hot_topic_contributions = False # Determines how topic contributions are
# calculated. When set to True, the topic of each word is simply its most
# probable topic; when False, it is the entire probability distribution over
# [num_topics] topics.
# NOTE: With bad model fits, the sum of topic contributions for certain posts
# may not be close enough to 1, and the model would fail quality assurance
# assertion checks. You can examine the cases that have failed the assertion in
# a file named failures in [path], with the following format: post index, word
# index, sum of topic contributions, number of repetitions within the post
topic_cont_freq = "monthly" # The frequency of topic contribution calculation
topic_idf = False # whether an inverse document frequency term should be
# considered in determining the top topics in the corpus. If set to False,
# contribution calculation will only prioritize higher overall contribution
# #TODO: Debug
topic_idf_thresh = 0.1 # the proportion of contributions within a post beyond
# which a topic counts toward the frequency term that adversely affects its
# estimated contribution to the discourse. Only matters if topic_idf = True.
# Must be greater than zero and less than one.
# TODO: add support for a range of idf values to be tested automatically
calculate_perplexity = False # whether perplexity is calculated for the model
calculate_coherence = False # whether UMass coherence is calculated for the model
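# Illustrative sketches of both measures, assuming gensim (lda, corpus,
# dictionary and eval_corpus are placeholders):
#     perplexity_bound = lda.log_perplexity(eval_corpus) # variational bound
#     from gensim.models import CoherenceModel
#     umass = CoherenceModel(model=lda, corpus=corpus, dictionary=dictionary,
#                            coherence="u_mass").get_coherence()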
### Neural Network Hyperparameters
# TODO: create a function for sampling a hundred posts at random and compare
# the neural networks on them for sentiment analysis and pro/anti based on
# unsupervised sentiment training AND supervised movie review pre-training
## determine kind of network
special_doi = False # If False, the neural network will model sentiment.
# If True, it will perform classification on comments based on a user-defined
# "dimension of interest" #TODO: Clarify "dimension of interest"
pretrained = False # whether there is sentiment analysis pre-training.
# NOTE: Should only be set to True if special_doi is also True
# NOTE: For classifier pretraining, the code should first be run with
# special_doi = False & pretrained = False and param_path should be set
# according to the output_path that results from the first run of the code
# NOTE: If pre-training is on, network hyperparameters should not be changed for
# the DOI run from the ones used for pre-training #TODO: Remove this requirement
LDA_topics = True # whether the neural networks take as part of their input
# topic contributions to each post as determined by a previously-run LDA analysis
# NOTE: the path to the LDA output to be used needs to be entered below manually
authorship = True # whether the neural networks take as part of their input
# the username of a post's author.
# NOTE: When the username is missing for posts (e.g. in case the author deleted
# their Reddit account), the model assumes a different author for each anonymous
# posting
# NOTE: The functions assume that "author" files from pre-processing are
# available in the same folder as the one containing this file
use_simple_bert = True # presumably toggles a BERT-based classifier instead of
# the recurrent network configured below
## Training hyperparameters
epochs = 3 # number of epochs
learning_rate = 0.003 # learning rate
#TODO: write code for automatically testing a set of learning rates
batchSz = 50 # batch size (number of posts processed in parallel)
word_embedSz = 128 # word embedding size
hiddenSz = 512 # number of units in the recurrent layer
author_embedSz = 128 # author embedding size. Matters only if authorship == True
ff1Sz = 1000 # number of units in the first feedforward layer
ff2Sz = 1000 # number of units in the second feedforward layer
keepP = 0.5 # 1 - dropout rate
early_stopping = True # whether to stop training if development-set perplexity increases
l2regularization = False # whether the model will be penalized for longer weight vectors. Helps prevent overfitting
NN_alpha = 0.01 # L2 regularization constant
### LDA-based sampling hyperparameters
# TODO: use the top topics function in Gensim to get topic-specific coherence vals
top_topic_set = list(range(num_topics)) # Choose specific topics to sample
# comments and top words from. Set to None to use a threshold or a fraction of
# [num_topics] instead
sample_topics = 0.2 # proportion of topics that will be selected for reporting
# based on average yearly contribution. Set to None if choosing topics based on
# threshold instead.
# NOTE: Must be a valid proportion (not None) if topic_idf = True
top_topic_thresh = None # threshold for proportion contribution to the corpus
# determining topics to report. Only matters if topic_idf = False
topn = 40 # the number of high-probability words to export for each topic
# NOTE: Many of the words will inevitably be high-probability general
# non-content and non-framing words, so topn should be set significantly
# higher than the number of relevant words you wish to see
sample_comments = 5 # number of comments that will be sampled from top topics
min_comm_length = 20 # the minimum acceptable number of words in a sampled
# comment. Set to None for no length filtering
num_pop = None # number of the most up- or down-voted comments sampled for model
# comparison. Set to None for no sampling. Needs data parsed with
# WRITE_ORIGINAL = True
### Paths
## where the data is
file_path = os.path.abspath(__file__)
model_path = os.path.dirname(file_path)
# For the neural filtering
rel_model_path = model_path+"/Human_Ratings/full_774/"
data_path = model_path
# NOTE: if the data is not fully available on disk, set DOWNLOAD_RAW to True
# (source: http://files.pushshift.io/reddit/comments/)
# NOTE: if the data is not in the same directory as this file, change data_path
# accordingly
## Year/month combinations to get Reddit data for
dates = [] # initialize a list to contain the (year, month) tuples
months = range(1, 13) # month range
years = range(2008, 2020) # year range
for year in years:
    for month in months:
        dates.append((year, month))
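# e.g., dates[0] == (2008, 1) and dates[-1] == (2019, 12)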
## where the output will be stored
# NOTE: To avoid confusion between different kinds of models, always include doi
# and pre in the output directory's name. After those, record the variables most
# important to your iteration
if NN: # If running a neural network analysis
    output_path = model_path+"/"+"doi_"+str(special_doi)+"_pre_"+str(pretrained)+"_e_"+str(epochs)+"_"+"hd"+"_"+str(hiddenSz)
    if not os.path.exists(output_path):
        print("Creating directory to store the output")
        os.makedirs(output_path)
    ## where the saved parameters are
    # NOTE: Enter manually. Only matters if special_doi = True and pretrained = True
    param_path = model_path+"/doi_False_pre_False_e_3_hd_512/"
    if pretrained:
        if not os.path.exists(param_path):
            raise Exception("Could not find saved pre-trained parameter values.")
else: # if doing topic modeling
    # Force this import so output_path is correctly set
    from lda_config import ENTIRE_CORPUS
    output_path = model_path + "/LDA_" + str(ENTIRE_CORPUS) + "_" + str(num_topics)
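# e.g., with the defaults above (and no lda_config override of ENTIRE_CORPUS),
# the topic-modeling branch sets output_path to [repository path]/LDA_True_10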
### Preprocessing ###
### determine the set of stopwords used in preprocessing
keepers = ["how","should","should've","could","can","need","needn","why","few",
"more","most","all","any","against","because","ought","must","mustn","mustn't",
"shouldn","shouldn't","couldn't","couldn","shan't", "needn't"]
stop = []
for word in set(nltk.corpus.stopwords.words('english')):
    if word not in keepers:
        stop.append(str(word))
### Define the regex filter used for finding relevant comments
regex_iteration = 2
engineering = []
with open("engineering_" + str(regex_iteration) + ".txt", 'r') as f:
    for line in f:
        engineering.append(re.compile(line.lower().strip()))
genetic = []
with open("genetic_" + str(regex_iteration) + ".txt", 'r') as f:
    for line in f:
        genetic.append(re.compile(line.lower().strip()))
disease = []
with open("disease_" + str(regex_iteration) + ".txt", 'r') as f:
    for line in f:
        disease.append(re.compile(line.lower().strip()))
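# A hedged usage sketch (matches_all_categories is a hypothetical helper; the
# real filtering logic lives in the parser): a comment might count as relevant
# when it matches at least one pattern from each of the three lists:
#     def matches_all_categories(text):
#         text = text.lower()
#         return all(any(p.search(text) for p in plist)
#                    for plist in (engineering, genetic, disease))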