This repository has been archived by the owner. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathudf_SNPscrape.R
51 lines (45 loc) · 1.87 KB
/
udf_SNPscrape.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
#' Scrape SNP identifiers from an Excel sheet or a plain-text file
#'
#' Reads the input, breaks every cell/line into tokens and keeps the tokens
#' that look like SNP identifiers: `rs123`, `chr1:123:I`, `chr1:123:D`.
#'
#' @param fileName Path to the input file. Supported extensions (matched
#'   case-insensitively): `.xlsx`, `.txt`.
#' @param excelSheetN Index of the sheet to read; only used for `.xlsx` input.
#' @return A sorted character vector of unique SNP identifiers; `character(0)`
#'   when none are found.
udf_SNPscrape <- function(fileName = "SNP_Tables.xlsx",
                          excelSheetN = 1) {
  # About -------------------------------------------------------------------
  # Date: 13/05/2014
  # Scrape SNP names from:
  #   - Excel file: SNP_Tables.xlsx [Status: Done]
  #   - Text file: SNP_Tables.txt (copy paste from ".doc" file) [Status: Done]
  # Future improvements:
  #   ?? - Word file: SNP_Tables.doc [Status: to be added] - requires AntiWord
  #   ?? - CSV file: ...
  #   ?? - flexible string split, or better regex...

  # Data prep ---------------------------------------------------------------
  # File extension, lower-cased so that e.g. ".TXT" and ".XLSX" are accepted.
  fileType <- tolower(tools::file_ext(fileName))

  # check file extension
  if (!(fileType %in% c("xlsx", "txt"))) {
    stop("Supported input files: .xlsx, .txt")
  }

  # read file
  dat <- switch(
    fileType,
    # Excel file
    xlsx = {
      # The xlsx package is only needed on this branch, so text input keeps
      # working on machines without it; fail loudly when it is missing.
      if (!requireNamespace("xlsx", quietly = TRUE)) {
        stop("Package 'xlsx' is required to read .xlsx files")
      }
      sheet <- xlsx::read.xlsx(
        file = fileName,
        sheetIndex = excelSheetN,
        stringsAsFactors = FALSE
      )
      # Flatten all cells (column-major) into one character vector;
      # as.character() also covers sheets that contain only numeric cells,
      # which strsplit() would otherwise reject.
      as.character(as.matrix(sheet))
    },
    # Text file: one element per line
    txt = readLines(con = fileName, n = -1, warn = FALSE)
  )

  # Scrape SNPs -------------------------------------------------------------
  # Split on "/", ",", whitespace, "*", "(" and ")"; this might need updating
  # depending on ugliness of input SNP tables.
  # as.character() turns the NULL produced by empty input into character(0).
  tokens <- as.character(unlist(strsplit(dat, "/|,|\\s|\\*|\\(|\\)")))

  # Keep tokens ending in an rs-number or a chr:pos:I / chr:pos:D indel ID.
  # The last class is [ID]: writing it as [I|D] would also accept a literal
  # "|" as the final character.
  isSNP <- grepl("rs[0-9]{1,}$|chr[0-9]{1,2}:[0-9]*:[ID]$", tokens)

  # Return output -----------------------------------------------------------
  sort(unique(tokens[isSNP]))
}