Introduction to Bioconductor in R
Paula Andrea Martinez, PhD.
Data Scientist
Remove duplicates or at least mark them
Mark duplicates using a threshold
library(ShortRead)

# Counting duplicates: in the table below, the TRUE column is the number of
# reads flagged as duplicates by srduplicated().
# `dfqsample` is not defined in this snippet; it is assumed to be a set of
# reads loaded earlier -- TODO confirm.
table(srduplicated(dfqsample))
FALSE TRUE
500 500
# Cleaning reads from duplicates, using the x[fun(x)] subsetting pattern:
# keep only the reads that srduplicated() does not flag.
# `mydReads` is not defined in this snippet; assumed to be loaded earlier.
cleanReads <- mydReads[!srduplicated(mydReads)]

# Counting duplicates again: the table should now contain only FALSE.
table(srduplicated(cleanReads))
FALSE
500
Use srFilter() to build a filter from a condition, then apply it with x[fun(x)]
Filter example
library(ShortRead)

# Use a custom filter to remove reads from fqsample.
# This filter removes reads shorter than a minimum number of bases.

# Minimum read width to keep. The filter function below looks this value up
# when it is called, so it must exist before the filter is applied.
minWidth <- 51

# srFilter() wraps a function returning one logical per read into a filter.
readWidthCutOff <- srFilter(
  function(x) {
    width(x) >= minWidth
  },
  name = "MinWidth"
)

# Apply the filter with the x[fun(x)] pattern.
# `fqsample` is not defined in this snippet; assumed to be loaded earlier.
fqsample[readWidthCutOff(fqsample)]
library(ShortRead)

# Save your filter; .name is optional.
myFilter <- nFilter(threshold = 10, .name = "cleanNFilter")

# Use the filter at reading point.
# NOTE(review): `pattern` appears to be a grep-style regular expression, so
# the "." in ".fastq" would match any character -- confirm against the
# readFastq documentation.
filtered <- readFastq(dirPath = "data", pattern = ".fastq", filter = myFilter)

# You will retrieve only those reads that have a maximum of 10 N's.
filtered
library(ShortRead)

# id filter example
# Will return only those reads whose ids contain the regular expression.
# Optional parameters are .name, fixed and exclude.
myFilterID <- idFilter(regex = ":3:1")

# Use the filter at reading point.
filtered <- readFastq(
  dirPath = "data",
  pattern = ".fastq",
  filter = myFilterID
)

# Filter to remove poly-A regions.
# Will return the sequences that have a maximum number of 10 consecutive A's.
# NOTE(review): confirm in the ShortRead documentation whether `threshold`
# counts consecutive A's or total A's per read.
myFilterPolyA <- polynFilter(threshold = 10, nuc = c("A"))

# Use the filter for subsetting.
filtered[myFilterPolyA(filtered)]
Introduction to Bioconductor in R