-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparse_smm.r
47 lines (43 loc) · 2.04 KB
/
parse_smm.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
##jmd
##3.7.11
##parse_smm.r
#also deal w/: acp, coa?
#charge column mostly NA: oxidized/reduced e.g. Reduced-Quinones & Cytochromes-C-Reduced, charge e.g. FE+4?
#infer chem form from: fatty-acids e.g. cis-cis-D9-27-C46-2-ACPs, rxns w/ only 1 molecule w/o chem form?
##clean metacyc chem form in smm file
clean.meta.chem.form <- function(smm){
#clean
smm$CHEMICAL.FORMULA <- gsub('\\(|\\)', '', smm$CHEMICAL.FORMULA)
#extract hidden chem.forms
no.chem.ind <- which(smm$CHEM==''|is.na(smm$CHEM))
#add '$' to end
smm$CHEMICAL.FORMULA[no.chem.ind] <- sub('$', '$', smm$FRAME[no.chem.ind])
#find hidden chem forms
hidden.chem <- intersect(no.chem.ind, grep('-INST-.+-C[[:digit:]]', smm$CHEM))
#strip, add space, & turn '-' into '$'
smm$CHEMICAL.FORMULA[hidden.chem] <- gsub(pattern='([[:upper:]])([[:digit:]])', replacement='\\1 \\2',
gsub('-', '$', gsub(pattern='.+-(C[[:digit:]])', replacement='\\1', smm$CHEM[hidden.chem])))
return(smm)
}
#redox
#sm[grep('(^|-)ox(idized|$|-)', sm$FRAME, ignore.case=TRUE),]
##turn meta chem form into data.frame
chemForm2df <- function(chem.form){
chem.form.pairs <- unlist(strsplit(chem.form, split='\\$'))
elements <- unlist(strsplit(chem.form.pairs, split=' [[:digit:]]+$'))
count <- sub(pattern='(.+) ([[:digit:]]+)$', replacement='\\2', chem.form.pairs)
#when chem.form='1-3-alpha-D-Glucans' ie when it has no spaces, count gives the chem.form
normal.met.cf.ind <- grep(pattern=' [[:digit:]]+$', x=chem.form.pairs)
#can't use count[-grep(...)]=1 since when length(chem.form)=1,
#grep can come up empty and assigning 1 to count[character(0)]=count[numeric(0)] doesn't work
count[setdiff(1:length(count), normal.met.cf.ind)] <- 1
count <- as.numeric(count)
dat <- data.frame(els=elements, count=count)
#if elements are not unique
if (any(duplicated(dat$els))){
dat.v <- tapply(X=dat$count, INDEX=dat$els, FUN=sum)
dat <- data.frame(els=names(dat.v), count=dat.v)
rownames(dat) <- 1:length(dat.v)
}
return(dat)
}