-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path00_fastqUtils.R
More file actions
118 lines (111 loc) · 4.63 KB
/
00_fastqUtils.R
File metadata and controls
118 lines (111 loc) · 4.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
## Utilities for fastq files
################## removing reads that still match any of the primers
fastqFilterPrimersMatchs <- function(fwdPrimer, revPrimer, fastqR1In, fastqR2In = NULL, fastqR1Out, fastqR2Out = NULL, sampleName = NULL, maxMismatch = 0, splitHeader = " ") {
require(ShortRead)
require(Biostrings)
require(dada2)
# reading both fastq
tmpR1 <- readFastq(fastqR1In)
if (!is.null(fastqR2In)) tmpR2 <- readFastq(fastqR2In)
# sample name
if (!is.null(sampleName)) sample.name <- sampleName else sample.name <- strsplit(basename(fastqR1In), ".fastq")[[1]][1]
if (!is.null(fastqR2In))
{
# IDs are supposed to be matching all along !
if (!identical(sapply(strsplit(as.vector(id(tmpR1)), splitHeader), `[`, 1), sapply(strsplit(as.vector(id(tmpR2)), splitHeader), `[`, 1)))
{
message("not the same order in the fastq...")
break
}
}
# make reverseComplement of both primers
fwd <- fwdPrimer
fwdRC <- dada2:::rc(fwdPrimer)
rev <- revPrimer
revRC <- dada2:::rc(rev)
for (p in c(fwd, fwdRC, rev, revRC))
{
# check fwd primer in all R1 reads
idx <- vmatchPattern(p, sread(tmpR1), fixed=FALSE, max.mismatch=maxMismatch)
# get the indexing
xx <- elementNROWS(idx)
tmpR1 <- tmpR1[which(xx==0)]
if (!is.null(fastqR2In)) tmpR2 <- tmpR2[which(xx==0)]
}
if (!is.null(fastqR2In))
{
for (p in c(fwd, fwdRC, rev, revRC))
{
# check fwd primer in all R2 reads
idy <- vmatchPattern(p, sread(tmpR2), fixed=FALSE, max.mismatch=maxMismatch)
# get the indexing
yy <- elementNROWS(idy)
tmpR1 <- tmpR1[which(yy==0)]
tmpR2 <- tmpR2[which(yy==0)]
}
}
# export the reads that did not match the primers
writeFastq(tmpR1, fastqR1Out, "a")
if (!is.null(fastqR2In)) writeFastq(tmpR2, fastqR2Out, "a")
return(list("sampleID" = sample.name, "readsCount" = length(tmpR1)))
}
############################################################################################################
############################################################################################################
################## filter by length fastq
fastqFilterWidth <- function(fastqR1In, fastqR2In = NULL, fastqR1Out, fastqR2Out = NULL, widthFilter = 20, sampleName = NULL, splitHeader = " ") {
require(ShortRead)
require(Biostrings)
# reading both fastq
tmpR1 <- readFastq(fastqR1In)
if (!is.null(fastqR2In)) tmpR2 <- readFastq(fastqR2In)
# sample name
if (!is.null(sampleName)) sample.name <- sampleName else sample.name <- strsplit(basename(fastqR1In), ".fastq")[[1]][1]
#sample.name <- paste0(strsplit(basename(fastqR1In), "_")[[1]][5], "_", strsplit(basename(fastqR1In), "_")[[1]][6]) ## deep-sea samples names
if (!is.null(fastqR2In))
{
# IDs are supposed to be matching all along !
if (!identical(sapply(strsplit(as.vector(id(tmpR1)), splitHeader), `[`, 1), sapply(strsplit(as.vector(id(tmpR2)), splitHeader), `[`, 1)))
{
message("not the same order in the fastq...check header of the fastq to split correctly?")
break
}
}
# Filter R1 & R2 with width of R1
keepFWD <- width(tmpR1) >= widthFilter
tmpR1 <- tmpR1[keepFWD]
if (!is.null(fastqR2In)) tmpR2 <- tmpR2[keepFWD]
# Filter R1 & R2 with width of R2
if (!is.null(fastqR2In))
{
keepREV <- width(tmpR2) >= widthFilter
tmpR1 <- tmpR1[keepREV]
tmpR2 <- tmpR2[keepREV]
}
# export the reads that did not match the primers
writeFastq(tmpR1, fastqR1Out, "a")
if (!is.null(fastqR2In)) writeFastq(tmpR2, fastqR2Out, "a")
return(list("sampleID" = sample.name, "readsCount" = length(tmpR1)))
}
############################################################################################################
############################################################################################################
################## Count reads
fastqCountReads <- function(fastqR1In, fastqR2In = NULL, sampleName = NULL, splitHeader = " ") {
require(ShortRead)
require(Biostrings)
# reading both fastq
tmpR1 <- readFastq(fastqR1In)
if (!is.null(fastqR2In)) tmpR2 <- readFastq(fastqR2In)
# sample name
if (!is.null(sampleName)) sample.name <- sampleName else sample.name <- strsplit(basename(fastqR1In), ".fastq")[[1]][1]
# IDs are supposed to be matching all along !
if (!is.null(fastqR2In))
{
if (!identical(sapply(strsplit(as.vector(id(tmpR1)), splitHeader), `[`, 1), sapply(strsplit(as.vector(id(tmpR2)), splitHeader), `[`, 1)))
{
message("not the same order in the fastq...check header of the fastq to split correctly?")
break
}
}
# retunr reads count
return(list("sampleID" = sample.name, "readsCount" = length(tmpR1)))
}