-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathocc_filter_v1.1.R
89 lines (78 loc) · 3.57 KB
/
occ_filter_v1.1.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
#This script filters the occurrences of motifs in a genome (mapped with BLAMM)
#Motifs are kept only if they start within 1500 bp of the gene start or of the gene end
#Before the occurrences can be filtered regarding the motif preferences, the gene orientations must be determined
# ATTENTION! The occurrence file from BLAMM can be very large.
# To keep the computational load low, it is highly recommended to split the occurrence file into smaller ones (e.g. 1 million lines each, matching outputSize in the bash snippet below)
#############################################################
######################################### USE THIS AS BASH ##
#project="Soly_blamm_20230417"
#inputFile="occurrences_0.00001.txt"
#outputSize="1000000"
#split -l $outputSize --numeric-suffixes $inputFile smallfile
#mkdir occ$project
#mv smallfile* occ$project/
#############################################################
# NOTE(review): hard-coded, machine-specific working directory. Every relative
# path below is resolved against it, so adjust it before running elsewhere.
setwd("G:/Machina_Eva/R_Home/moca_blue/moca_blue/2023_mo_slyc_spenn")
#############################################################
# Input directory with the split occurrence chunks (files named "smallfile*",
# see the bash snippet above) and the path of the combined output table.
dirpath <- "./occSpen_ch01_0e3"
output_file <- "./out/outSpen_ch01_0e3.txt"
# `pattern` is a regular expression, not a shell glob: "^smallfile" selects
# every file whose name starts with "smallfile". (The former "^smallfile*"
# made the trailing "e" optional/repeatable and so also matched "smallfil...".)
file_paths <- list.files(dirpath, pattern = "^smallfile", full.names = TRUE)
if (length(file_paths) == 0) {
  # Nothing to process: the script would otherwise silently write an empty
  # result, so make the situation visible without aborting.
  warning("No files starting with 'smallfile' found in ", dirpath,
          call. = FALSE)
}
#############################################################
library(tidyr)
library(dplyr)
library(readr)
library(magrittr)
#############################################################
# Read every occurrence chunk, keep only the motif occurrences that start close
# to either end of their region, and combine the per-file results into
# `combined_df`.
#
# Each chunk is read as a header-less, whitespace-separated table and is given
# nine column names; the first column ("loc") is split on ":" and "-"
# (presumably labels of the form "chr:gene_start-gene_end" -- verify against the
# BLAMM input).

# maximum distance (bp) of a motif start from either end of the region
flank_bp <- 1500
occ_colnames <- c("loc", "source", "motif", "mstart", "mend",
                  "score", "strand", "V7", "V8")

# Collect the filtered chunks in a preallocated list and bind them once after
# the loop; growing a data frame with rbind() inside the loop re-copies all
# previously collected rows on every iteration.
filtered_chunks <- vector("list", length(file_paths))

for (i in seq_along(file_paths)) {
  filepath <- file_paths[[i]]

  # fill = TRUE pads lines that have fewer fields with NA instead of aborting
  # the read; such lines are NOT skipped here.
  occurrence_df <- read.table(filepath, header = FALSE, fill = TRUE)
  # occurrence_df <- subset(occurrence_df, grepl("^epmSola", V3))
  colnames(occurrence_df) <- occ_colnames

  # Split "loc" into "chr", "gene_start" and "gene_end". With fill = "left" a
  # label lacking a separator leaves the left-hand piece as NA.
  selected_df <- occurrence_df %>%
    separate(loc,
             into = c("chr", "gene_loc"),
             sep = ":",
             fill = "left") %>%
    separate(gene_loc,
             into = c("gene_start", "gene_end"),
             sep = "-",
             fill = "left")

  # separate() removed the original "loc" column; rebuild it (as the last
  # column) from the pieces, appending "-gene_end" only when it is available.
  selected_df$loc <- paste(selected_df$chr,
                           selected_df$gene_start,
                           sep = ":")
  selected_df$loc <- ifelse(is.na(selected_df$gene_end),
                            selected_df$loc,
                            paste(selected_df$loc,
                                  selected_df$gene_end,
                                  sep = "-"))

  # Keep rows whose motif start lies within `flank_bp` of position 0 or within
  # `flank_bp` of the region length (gene_end - gene_start + 1). This assumes
  # `mstart` is a coordinate relative to the region -- TODO confirm.
  # filter() drops rows for which the condition evaluates to NA.
  filtered_chunks[[i]] <- selected_df %>%
    mutate(across(c(mstart, gene_start, gene_end),
                  as.numeric)) %>% # convert columns to numeric
    filter(mstart <= flank_bp |
             abs(mstart - (gene_end - gene_start + 1)) <= flank_bp)
}

# Bind all chunks in a single rbind() call; fall back to an empty data frame
# when there was no input file at all.
combined_df <- if (length(filtered_chunks) > 0) {
  do.call(rbind, filtered_chunks)
} else {
  data.frame()
}
# Write the combined, filtered occurrences as a tab-separated table with a
# header row and without row names.
# The output directory is created first (no-op if it already exists) so the
# write cannot fail on a missing folder after all chunks have been processed.
dir.create(dirname(output_file), showWarnings = FALSE, recursive = TRUE)
write.table(combined_df,
            output_file,
            sep = "\t",
            row.names = FALSE)