Introduction to Bioconductor in R
Paula Andrea Martinez, PhD.
Data Scientist
Remove duplicates or at least mark them
Mark duplicates using a threshold
library(ShortRead)
# Counting duplicates: TRUE is the number of duplicates
table(srduplicated(dfqsample))
FALSE TRUE
500 500
# Cleaning reads from duplicates, using the x[fun(x)] subsetting pattern:
# keep only the reads that srduplicated() does not flag.
cleanReads <- mydReads[!srduplicated(mydReads)]
# Counting duplicates again on the cleaned reads
table(srduplicated(cleanReads))
FALSE
500
srFilter
to filter based on a condition x[fun(x)]
Filter example
library(ShortRead)
# Use a custom filter to remove reads from fqsample.
# This filter removes reads shorter than a minimum number of bases.
# minWidth is looked up when the filter is applied, so it must exist by then;
# defining it first keeps the example readable top to bottom.
minWidth <- 51
readWidthCutOff <- srFilter(function(x) {
  width(x) >= minWidth
}, name = "MinWidth")
# Apply the filter with the x[fun(x)] subsetting pattern
fqsample[readWidthCutOff(fqsample)]
library(ShortRead)
# Save your filter; .name is optional
myFilter <- nFilter(threshold = 10, .name = "cleanNFilter")
# Use the filter at reading point
filtered <- readFastq(dirPath = "data", pattern = ".fastq", filter = myFilter)
# You will retrieve only those reads that have a maximum of 10 N's
filtered
library(ShortRead)
# ID filter example
myFilterID <- idFilter(regex = ":3:1")
# Will return only those ids that contain the regular expression.
# Optional parameters are .name, fixed and exclude.
# Use the filter at reading point
filtered <- readFastq(
  dirPath = "data",
  pattern = ".fastq",
  filter = myFilterID
)
# Filter to remove poly-A regions
myFilterPolyA <- polynFilter(threshold = 10, nuc = c("A"))
# Will return the sequences that have a maximum number of 10 consecutive A's
# Use the filter for subsetting
filtered[myFilterPolyA(filtered)]
Introduction to Bioconductor in R