-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbgmmSensTest.sh
executable file
·361 lines (301 loc) · 16.8 KB
/
bgmmSensTest.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
#!/bin/sh
##########################################################################################
# File: bgmmSensTest.sh #
VERSION="v0.1.0" #
# Author: Justin C. Bagley #
# Date: created by Justin Bagley on Wed Jan 4 09:09:34 2017 -0600 #
# Last update: March 3, 2019 #
# Copyright (c) 2017-2019 Justin C. Bagley. All rights reserved. #
# Please report bugs to <[email protected]>. #
# #
# Description: #
# SHELL SCRIPT FOR CONDUCTING SIMPLE SENSITIVITY TESTING TO EXPLORE BELIEF-BASED #
# GAUSSIAN MIXTURE MODEL ROBUSTNESS TO VARYING PRIORS ON KNOWN OBSERVATIONS (P MATRIX) #
# #
##########################################################################################
if [[ "$1" == "-v" ]] || [[ "$1" == "--version" ]]; then
echo "$(basename $0) $VERSION";
exit
fi
echo "
##########################################################################################
# bgmmSensTest v0.1.0, December 2016 #
##########################################################################################
"
######################################## START ###########################################
echo "INFO | $(date) | Starting bgmmSensTest analysis... "
echo "INFO | $(date) | STEP #1: SETUP AND USER INPUT. "
###### Set paths and filetypes as different variables:
##--Set new path/dir environmental variable to the current directory:
# MY_PATH=`pwd -P`
MY_PATH="$(pwd -P)"
echo "INFO | $(date) | Setting working directory to: $MY_PATH "
CR=$(printf '\r')
calc () {
bc -l <<< "$@"
}
echo "INFO | $(date) | Reading in sensitivity test conditions and file names from configuration file. "
###### Use grep and awk to read in information from the user about the desired sensitivity
##--test conditions, by pulling the required information from the 'bgmm_sens_test.cfg'
##--configuration file supplied by the user, present in current working directory.
OPT_STRING_PART="$(grep -n "opt_string_part" ./bgmm_sens_test.cfg | awk -F"=" '{print $NF}')";
NUM_COMPONENTS="$(grep -n "num_clusters" ./bgmm_sens_test.cfg | awk -F"=" '{print $NF}')";
MY_DATA_FILE="$(grep -n "data_file" ./bgmm_sens_test.cfg | awk -F"=" '{print $NF}')";
MY_PROBS_FILE="$(grep -n "probs_file" ./bgmm_sens_test.cfg | awk -F"=" '{print $NF}')";
TARGET_VALUE="$(grep -n "target_diag_value" ./bgmm_sens_test.cfg | awk -F"=" '{print $NF}')";
NUM_TEST_VALUES="$(grep -n "num_test_vals" ./bgmm_sens_test.cfg | awk -F"=" '{print $NF}')";
TEST_MIN="$(grep -n "test_val_min" ./bgmm_sens_test.cfg | awk -F"=" '{print $NF}')";
TEST_MAX="$(grep -n "test_val_max" ./bgmm_sens_test.cfg | awk -F"=" '{print $NF}')";
TEST_INC="$(grep -n "test_val_inc" ./bgmm_sens_test.cfg | awk -F"=" '{print $NF}')";
echo "INFO | $(date) | STEP #2: MAKE DIFFERENT INPUT FILES FOR THE TEST, VARYING TEST STAT ACROSS A RANGE OF VALUES. "
###### Make functions that make new input files, each of which has the target value changed
##--to one test value in the range made up by the minimum, increment, and maximum values
##--given by the user above. The functions thus also have to change the corresponding off-
##--diagonal values, to the appropriate 1-y/k value (where y is the on-diagonal value). We
##--use a shell one-liner (using grep, sed, and head) to pull the off-diagonal values out
##--of the input file. Because we are dealing with probs matrices composed of probability
##--values, which sum to one thus can never individually be greater than 1, we are working
##--with float values, so our function must deal with float values throughout. We name
##--this function "makeInputFiles".
makeInputFiles () {
count=0
INPUT_NCOL=$(cat $MY_PROBS_FILE | awk '{print NF}' | head -n1);
NCOL_DROPONE=$(calc $INPUT_NCOL - 1);
AFTER_DECI="$(echo $TARGET_VALUE | sed 's/0\.//g')";
OFF_DIAG_VALUES=$(grep -n ''$TARGET_VALUE'\t' $MY_PROBS_FILE | \
sed 's/^[0-9\:A-Za-z\_\-]*[ ]*//g; s/0\.${AFTER_DECI} //g' | head | \
sed 's/[0-9\.]* //g' | head -n1);
NEW_OFF_DIAG_VALUES=$(calc $(calc 1 - $TARGET_VALUE) / $NCOL_DROPONE);
(
for i in $(seq $TEST_MIN $TEST_INC $TEST_MAX); do
IFIX="$(echo $i | sed 's/0\.//g')";
sed 's/0\.${AFTER_DECI} /0\.'$IFIX' /g' $MY_PROBS_FILE | sed 's/$OFF_DIAG_VALUES/$NEW_OFF_DIAG_VALUES/g' > ./run"$count"_$i.txt
MY_RUN_TXT_FILE=./run*.txt; ## There will only be one of these, which we will use and then move into a folder so that it does not conflict with the next txt file generated by this loop.
basename=$(echo $MY_RUN_TXT_FILE | sed 's/\.\///g; s/\.txt//g');
mkdir $basename;
mv $MY_RUN_TXT_FILE ./$basename/;
count=$((count+1))
done
)
}
##--Don't forget to run the function!
makeInputFiles
echo "INFO | $(date) | STEP #3: RUN ALGORITHM ACROSS TEST INPUT FILES, SAVE & COLLATE RESULTS. "
###### Move a copy of the data file into each test run folder created using the loop in
##--STEP #2 above, and then run the GaussClust 'lite' script (in wkdir) from within each
##--folder. We will specify options so that we run belief-based GMM using the 'bgmm' R
##--package, using the same exact parameters, on each input file, which are brought in
##--from the OPT_STRING_PART environmental variable (captured from cfg file). We'll also
##--make sure the output has appropriate names, and we will extract and format, as needed,
##--any desired information from results of each run.
(
for j in ./*/; do
cp ../GaussClust_lite.sh $MY_DATA_FILE $j;
cd $j;
MY_LOCAL_PROBS_FILE=./run*.txt;
./GaussClust_lite.sh $OPT_STRING_PART -p $MY_LOCAL_PROBS_FILE -c $NUM_COMPONENTS $MY_DATA_FILE;
cd ..;
done
)
##--Organize posterior probability matrices resulting from each run on a separate input
##--file created in STEP #2 above, by placing one copy named as 'tij' (posterior probs
##--matrix name; see bgmm R package documentation) followed by the test value (e.g. "tij_0.5")
##--into the working directory.
(
for k in ./*/; do
FOLDERNAME=$(echo $k | sed 's/\.\/run[0-9]*\_//g; s/\/$//g');
cp "$k"R/*_posteriorProbs.txt ./tij_$FOLDERNAME.txt;
done
)
###### Prep some values as environmental variables for use in Rscript below:
##--Make environmental variable containing posterior probability (PP) matrix names:
MY_PP_MATRICES=./tij_*.txt;
FUTURE_TIJ_OBJ_NAMES=$(echo $MY_PP_MATRICES | sed 's/\.\///g; s/.txt//g; s/\ /\,\ /g'); ## This is used in the Rscript below.
#
##--Account for possible odd or even value for $NUM_TEST_VALUES
EVEN_CHECK=$(( $NUM_TEST_VALUES % 2 ))
if [ $EVEN_CHECK -eq 0 ]; then
HALF_NUM_TEST_VALUES=$(calc $NUM_TEST_VALUES/2);
else
HALF_NUM_TEST_VALUES=$(calc $(calc $NUM_TEST_VALUES +1)/2);
fi
###### Prep some text and code for math/arithmetic work in Rscript below:
##--Make files with all pairs of combinations of the sequence from 1 to the total number of
##--tij matrices, which is already $NUM_TEST_VALUES.
set -- $(seq $NUM_TEST_VALUES)
for a; do shift; for b; do printf "%s %s\n" "$a" "$b"; done; done > pairs.txt
set -- $(seq $NUM_TEST_VALUES)
for a; do shift; for b; do printf "%s %s %s %s\n" "$a" "$b" "$a" "$b"; done; done > pairs_x2.txt
##--Make code for subtracting matrices to get pairwise difference matrices:
sed 's/\(^[0-9]*\)\ /\1/' pairs_x2.txt > pairs1.tmp;
sed 's/^/dif\_m/g' pairs1.tmp > pairs2.tmp;
sed 's/\ \([0-9]*\)\ \([0-9*]\)/\ \<\-\ m\1\ \-\ m\2\;\ /' pairs2.tmp > pairs3.tmp;
MAKE_PAIR_DIFF_MATRICES=$(cat ./pairs3.tmp); ## THIS goes into the Rscript section (STEP #4 below) as a one-liner, using only the name of the environmental variable. All text stored within this variable will be echoed into the script by the shell, and then the script is run at the end.
##--Make code for summing all difference matrices:
sed 's/\([0-9]*\)\ /dif_m\1/; s/$/\ /' ./pairs.txt | sed 's/\,\ $'$CR'^$//' > pairs4.tmp;
MY_PAIRS=$(cat ./pairs4.tmp);
SUM_ALL_DIFF_MATRICES=$(echo $MY_PAIRS | sed 's/\ /\,\ /g'); ## THIS goes into the Rscript section as 'sum(c($SUM_ALL_DIFF_MATRICES))' to sum all difference matrices, for overall comparison.
##--Make code for summing original tij matrices:
MATRIX_NUMBERS=$(echo $(seq 6) | sed 's/\([0-9*]\)/m\1/g');
SUM_TIJ_MATRICES=$(echo $MATRIX_NUMBERS | sed 's/\ /\,\ /g'); ## THIS goes into the Rscript section as 'sum(c($SUM_TIJ_MATRICES))'
##--Make code for calculating difference proportion of total (from original two tij matrices)
##--for each pairwise difference matrix. We will make the R code by using sed to modify the
##--pairs of combinations present in the "pairs.txt" file. This operation is more complex
##--than the simple sed commands in the preceding lines, and though I'd like to use comment
##--symbols (pound signs) in the code the shell doesn't handle those so well. So we will
##--write the code to an Rscript, which we will then source from within the Rscript below,
##--using the 'source' function. First, let's write the code into a txt file:
sed 's/\([0-9]*\)\ \([0-9]*\)/'$CR'x\ \<\-\ sum(dif_m\1\2)'$CR'y\ \<\-\ sum(m\1\,\ m\2)'$CR'diffpro_m\1\2\ \<\-\ x\/y'$CR'diffpro_m\1\2'$CR''$CR'/g' ./pairs.txt > ./diffProTests.txt ;
##--Now we have modified the pairs into a set of R code in a local txt file, so we need to
##--make an appropriate header for the Rscript (I will just echo it) and then concatenate
##--the header and the txt code into a '.r' file that R can call:
echo "
#!/usr/bin/env Rscript
" > Rtop.tmp
cat ./Rtop.tmp ./diffProTests.txt > ./diffProTests.r ;
#
##--Make code for logical testing of whether tij matrices are identical in their portions
##--containing known observations, or not, etc. We will make this code based on the total
##--number of rows with known observations in the :
cd ./run0*/txt/
NUM_KNOWN_ROWS=$(grep "\"known" ./mydata_names_df.txt | wc -l)
cd ..; cd ..;
##--Make txt file with the logical testing code and place in variable and in Rscript:
sed 's/\([0-9]*\)\ \([0-9]*\)/identical(tijObjList\[\1\]\[\[1\]\]\[1\:'"$NUM_KNOWN_ROWS"'\,\]\,\ tijObjList\[\2\]\[\[1\]\]\[1\:'"$NUM_KNOWN_ROWS"'\,\])/g; s/\ //g' ./pairs.txt > ./logicalTests.txt ;
IDENT_MATRIX_TESTS=$(cat ./logicalTests.txt | sed 's/)/)\;/g'); ## Place logical testing code into environmental variable.
cat ./Rtop.tmp ./logicalTests.txt > ./logicalTests.r ; ## Make final Rscript for this code, which you will source in the compareProbs script below.
rm ./Rtop.tmp;
##--Make code for combining all difference proportion of total ('diffpro_') calculations
##--in order to plot them:
COMBINE_DIFF_PROPS=$(echo $MY_PAIRS | sed 's/\ /\,\ /g; s/dif/diffpro/g'); ## THIS goes into the Rscript section as 'sum(c($SUM_ALL_DIFF_MATRICES))' to sum all difference matrices, for overall comparison.
echo "INFO | $(date) | STEP #4: MAKE R SCRIPT CONTAINING ENVIRONMENTAL VARIABLES AND ANALYSIS CODE FOR VISUALIZING AND COMPARING RESULTS... "
############ MAKE R SCRIPT
echo "
#!/usr/bin/env Rscript
################################### compareProbs.R #######################################
############ I. SETUP
setwd('$MY_PATH')
#
##--Load needed library, R code, or package stuff. Install packages if not present.
packages <- c('tools', 'ggfortify', 'grid', 'gplots')
if (length(setdiff(packages, rownames(installed.packages()))) > 0) {
install.packages(setdiff(packages, rownames(installed.packages())))
}
library(tools)
library(ggfortify)
library(grid)
library(gplots)
############ II. READ AND PREP tij MATRICES FOR PLOTTING
##--Create vector list of filenames for the posterior probability matrices from each run:
tijFileNames <- list.files(getwd(), pattern='^tij')
#
##--The following function will rotate the matrices into correct orientation during image
##--plotting; the idea for this rotation method is from the following site: https://blog.snap.uaf.edu/2012/06/08/matrix-rotation-for-image-and-contour-plots-in-r/
f <- function(m) t(m)[,nrow(m):1]
#
##--Loop through the filenames to read each file in and assign it to the appropriately
##--named object:
for(i in 1:length(tijFileNames)){
# x <- tij_file.names[i]
basename <- file_path_sans_ext(tijFileNames[i])
oname = paste(basename)
assign(basename, as.matrix(read.table(tijFileNames[i], header=TRUE, sep='\t')))
}
############ III. PLOT THE MATRICES IN A SINGLE GRAPH TO PERMIT BROAD VISUAL COMPARISON OF RESULTS ACROSS RANGE OF TEST VALUES
##--Plot the matrices in the quartz window with par-specified parameters (given above):
tijObjList <- list($FUTURE_TIJ_OBJ_NAMES)
pdf('plots.pdf')
##--Use par and col to specify the plot layout and color characteristics:
par(mfrow=c(3,2), mai = c(0.5, 0.1, 0.1, 0.1))
col<- colorRampPalette(c('ivory2', 'yellow', 'red'))(20)
for(i in tijObjList){
image(f(i), col=col)
}
dev.off()
##--Some logical testing to see if any matrices are the same (likely, they are NOT):
$IDENT_MATRIX_TESTS
##--Some mathematical/arithmetic testing:
##--Calculate the differences between the posterior probability matrices (tij's, placed in
##--'tijObjList' above), which gives one difference matrix for each pair of tij matrices.
##--Then sum across all difference matrices to get the cumulative difference across them.
##--Compare this to the sum of all the original tij matrices (which since each probs matrix
##--row sums to one is simply equal to the number of rows minus the header, in each probs
##--matrix). Two tests might be whether 1) the differences between any pair of matrices is
##--more/less than five percent of the sum of entries in the 2 matrices, or 2) whether the
##--sum of differences across all difference matrices is more/less than five percent of the
##--sum of entries across all difference matrices.
#
##--First, put all individual tij matrices, calculated from probs matrices with different
##--on/off-diagonal test values, into separate objects, using a for loop:
for(i in 1:6) {
name <- paste('m', i, sep = '')
assign(name, tijObjList[i][[1]])
}
##--Next, calculate a difference matrix for each pair of tij matrices:
$MAKE_PAIR_DIFF_MATRICES
#
##--Next, get the sum of all differences across the pairwise diff matrices, as well as the
##--sum of all tij matrices themselves. Then compare these two to do the 'test' #2 mentioned
##--above under 'Some mathematical/arithmetic testing', by simple division:
sum_difm <- sum(c($SUM_ALL_DIFF_MATRICES))
sum_difm
sum_tijm <- sum(c($SUM_TIJ_MATRICES))
sum_tijm
sum_difm/sum_tijm ## Is this value > < or = to 0.05?
#
##--Now, source the ./diffProTests.r code generated in the final section of the sh code
##--before this Rscript, in order to calculate the difference proportion of total (from
##--original two tij matrices) for each pairwise difference matrix.
source('diffProTests.r')
#
diffpro_list <- as.list(apropos('diffpro'))
#
for(i in 3:length(diffpro_list)){
cat('---', diffpro_list[i][[1]][1],'---\n');
print(get(diffpro_list[[i]][1]))
}
diffpros <- abs(c($COMBINE_DIFF_PROPS))
##--Print min, max, average, and range of difference proportions across diffpros values.
##--Ideally, all of these would be minimal, e.g. <0.05 (arbitrary value).
min(diffpros)
max(diffpros)
mean(diffpros)
range(diffpros)
##--Now, plot out the difference proportions of total across all pairs, and save the
##--resulting line graph as a PDF file. We can do this twice, once with the diff pros
##--sorted in ascending order, and the other with them in their original order:
pdf('diffpro_sort.pdf')
plot(sort(diffpros, decreasing=FALSE), type='o', col='blue')
dev.off()
#
pdf('diffpro_unsort.pdf')
plot(diffpros, type='o', col='blue')
dev.off()
#
pdf('diffpro_unsort_ylim0.1.pdf')
plot(diffpros, type='o', col='blue', xlim=c(0, 15), ylim=c(0, 0.1))
dev.off()
#
pdf('diffpro_unsort_ylim1.pdf')
plot(diffpros, type='o', col='blue', xlim=c(0, 15), ylim=c(0, 1))
dev.off()
#
######################################### END ############################################
" > comparePPMats.r
############ FINAL STEPS:
echo "INFO | $(date) | STEP #5: RUN THE R SCRIPT AND CONDUCT CLEANUP. "
R CMD BATCH ./comparePPMats.R ;
##--Cleanup remaining unnecessary files:
rm ./tij*.txt;
rm ./*.tmp;
rm ./pairs*.txt;
rm ./diffProTests.txt ./logicalTests.txt;
echo "INFO | $(date) | Done conducting sensitivity test(s) examining the effect of varying the 'prior' probabilities of known \
observations in the beliefs matrix supplied to bgmm, using bgmmSensTest."
echo "INFO | $(date) | Bye.
"
#
#
#
######################################### END ############################################
exit 0