-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathload_eqtls.R
91 lines (61 loc) · 3.23 KB
/
load_eqtls.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#Code to generate list of lists to be fed into glm_predict.R
library(sqldf)
library(plyr)
library(RSQLite)
library(BBmisc)
library(doParallel)
args <- list()
##USAGE <RESULTS_DIR> <DATABSE_FILE> <KFOLD> <SAMPLE_NUM> <SAMPLE_FILE> <THREADS>
oargs <- commandArgs(trailingOnly=TRUE)
args$RESULTS_DIR <- oargs[1]
args$DATABASE_FILE <- oargs[2]
args$KFOLD <- as.integer(oargs[3])
args$SAMPLE_NUM <- as.integer(oargs[4])
args$SAMPLE_FILE <- oargs[5]
args$THREADS <- as.integer(oargs[6])
eqtl.files <- dir(args$RESULTS_DIR,pattern="*.txt",full.names=T)
kfolds <- as.integer(gsub(".+s([0-9]+).txt","\\1",eqtl.files))
cis.transs <- gsub(".+(cis|trans)[0-9]+.txt","\\1",eqtl.files)
if(TRUE){
write.db.file <- function(eqtl.file,cis.trans,kfold,dbfile){
db <- dbConnect(drv=dbDriver("SQLite"),dbname=dbfile)
dbGetQuery(db,"pragma busy_timeout=20000")
dbGetQuery(db,"pragma main.page_size=4096")
dbGetQuery(db,"pragma main.cache_size=50000")
dbGetQuery(db,"pragma synchronous=0")
dbGetQuery(db,"pragma main.journal_mode=WAL")
dbGetQuery(db,"pragma temp_store=1")
dbGetQuery(db,"pragma temp_store_directory='.'")
eqtl <- read.csv.sql(eqtl.file,sep="\t",header=T,eol="\n")
eqtl$Kfold<-kfold
eqtl$CisTrans <- cis.trans
dbWriteTable(db,name="eqtls",eqtl,row.names=F,append=T)
dbDisconnect(db)
}
qtlargs <- data.frame(eqtl.file=eqtl.files,cis.trans=cis.transs,kfold=kfolds,dbfile=args$DATABASE_FILE,stringsAsFactors=F)
m_ply(.data=qtlargs,.fun=write.db.file,.parallel=F,.paropts=list(.inorder=F,.export="write.db.file",.packages=c("RSQLite","sqldf")),.inform=F,.progress="text")
}
db <- dbConnect(drv=dbDriver("SQLite"),dbname=args$DATABASE_FILE)
#########get just eqtls that are in every cross validation
#Subset on genes that have SOME prediction in every cross validation
#dbSendQuery(db,"create table ggenes as SELECT gene FROM (SELECT gene,count(distinct Kfold) from eqtls group by gene having count(distinct Kfold)>9)")
#dbSendQuery(db,"create index g on ggenes(gene)")
#distinct.genes <- dbGetQuery(db,"select distinct gene from ggenes")
#distinct.snps <- dbGetQuery(db,"select distinct SNP from eqtls,ggenes where eqtls.gene=ggenes.gene")
#write.table(distinct.snps,file="eqtlsnps.txt",sep="\t",col.names=T,row.names=F,quote=F)
#write.table(distinct.genes,file="eqtlgenes.txt",sep="\t",col.names=T,row.names=F,quote=F)
train.indices <- chunk(rep(1:args$SAMPLE_NUM,args$KFOLD),n.chunks=args$KFOLD)
test.indices <- chunk(1:args$SAMPLE_NUM,chunk.size=ceiling(args$SAMPLE_NUM/args$KFOLD))
train.indices <- mapply(FUN=function(x,y)x[-y],train.indices,test.indices,SIMPLIFY=F)
sample.cases <- scan(args$SAMPLE_FILE,what="character",sep="\t",nlines=1)[-1]
test.df <- do.call("rbind",lapply(1:length(test.indices),function(x){
data.frame(Sample=sample.cases[test.indices[[x]]],Kfold=x,Index=test.indices[[x]])
}))
train.df<- do.call("rbind",lapply(1:length(train.indices),function(x){
data.frame(Sample=sample.cases[train.indices[[x]]],Kfold=x,Index=train.indices[[x]])
}))
dbWriteTable(db,name="testSamples",test.df,row.names=F,overwrite=T,append=F)
dbWriteTable(db,name="trainSamples",train.df,row.names=F,overwrite=T,append=F)
dbSendQuery(db,"Create index k on testSamples(Kfold)")
dbSendQuery(db,"Create index ktr on trainSamples(Kfold)")
dbDisconnect(db)