From each sequence in my data, I want to extract the substring that lies between the L and R strings stored in my database.
My database includes several different L and R string combinations (three in the example below), and I want to test all of them.
One way is to write a for loop, but is there a more elegant way?
library(tidyverse)
# Example sequences, stored as a one-column data frame whose column is `seq`.
# The column is named at construction time instead of coercing a bare vector
# and then renaming whatever name the coercion produced by position.
data <- data.frame(
  seq = c(
    "CCACGAAGCTCTCCTACGTACGGTTATATTGACAGACCGAGGGCAGTCCAGCGCCAACCAGATAAGTGAAATCTAGTTCCA",
    "CCACGAAGCTCTCCTACGTACGGTTATATTGACAGACCGAGGGCAGTCCAGCGCCAACCAGATAAGTGAAATCTAGTTCCA",
    "CCACGAAGCTCTCCTAGGGGGGGGCTATTTTGGACTGCGTTACCAGTCCAGCGCCAACCAGATAAGTGGAATCTAGTTCGA",
    "CCACGTAGCTCTCCTCCGTGCGGTTATATTGACAGACCGAGGGCAGTCCAGCGCCAACCAGATAAGTGAAATCTAGTTCCA"
  )
)
# Lookup table of flanking strings: each row is one (L, R) pair, where L is
# the left flank and R the right flank of the region to extract.
database <- data.frame(
  L = c("CTACG", "CTAGG", "CTCCG"),
  R = rep("CAGTC", 3)
)
# Extract the text between the L and R flanks of the FIRST row of `database`
# only. The lookbehind on L and the lookahead on R keep both flanks out of the
# extracted string. `seq` is referenced as a column (data masking) rather than
# via `.$seq`, so the expression also behaves correctly on grouped data.
data %>%
  mutate(
    extracts = str_extract(
      seq,
      str_c("(?<=", database$L[1], ").*(?=", database$R[1], ")")
    )
  )
#> seq
#> 1 CCACGAAGCTCTCCTACGTACGGTTATATTGACAGACCGAGGGCAGTCCAGCGCCAACCAGATAAGTGAAATCTAGTTCCA
#> 2 CCACGAAGCTCTCCTACGTACGGTTATATTGACAGACCGAGGGCAGTCCAGCGCCAACCAGATAAGTGAAATCTAGTTCCA
#> 3 CCACGAAGCTCTCCTAGGGGGGGGCTATTTTGGACTGCGTTACCAGTCCAGCGCCAACCAGATAAGTGGAATCTAGTTCGA
#> 4 CCACGTAGCTCTCCTCCGTGCGGTTATATTGACAGACCGAGGGCAGTCCAGCGCCAACCAGATAAGTGAAATCTAGTTCCA
#> extracts
#> 1 TACGGTTATATTGACAGACCGAGGG
#> 2 TACGGTTATATTGACAGACCGAGGG
#> 3 <NA>
#> 4 <NA>
Created on 2022-02-01 by the reprex package (v2.0.1)
CodePudding user response:
I would build one regex pattern per L/R pair and apply each of them to the sequences with lapply(). Then you can just cbind() the results to your original data frame.
library(stringr)
# Build one regex per row of `database`: the L flank, a capture group for the
# text in between, then the R flank.
patterns <- paste0(database$L, "(.*)", database$R)
# Label the patterns by position so the names always match the number of L/R
# pairs in `database` instead of assuming there are exactly three.
names(patterns) <- paste0("pattern", seq_along(patterns))

# str_match() returns a matrix whose second column holds the first capture
# group. Each pattern yields one result column (named after the pattern),
# which is bound next to the original data.
cbind(
  data,
  lapply(
    patterns,
    \(x) str_match(data$seq, x)[, 2]
  )
)
#> seq
#> 1 CCACGAAGCTCTCCTACGTACGGTTATATTGACAGACCGAGGGCAGTCCAGCGCCAACCAGATAAGTGAAATCTAGTTCCA
#> 2 CCACGAAGCTCTCCTACGTACGGTTATATTGACAGACCGAGGGCAGTCCAGCGCCAACCAGATAAGTGAAATCTAGTTCCA
#> 3 CCACGAAGCTCTCCTAGGGGGGGGCTATTTTGGACTGCGTTACCAGTCCAGCGCCAACCAGATAAGTGGAATCTAGTTCGA
#> 4 CCACGTAGCTCTCCTCCGTGCGGTTATATTGACAGACCGAGGGCAGTCCAGCGCCAACCAGATAAGTGAAATCTAGTTCCA
#> pattern1 pattern2 pattern3
#> 1 TACGGTTATATTGACAGACCGAGGG <NA> <NA>
#> 2 TACGGTTATATTGACAGACCGAGGG <NA> <NA>
#> 3 <NA> GGGGGGCTATTTTGGACTGCGTTAC <NA>
#> 4 <NA> <NA> TGCGGTTATATTGACAGACCGAGGG
This only captures the first match for each pattern. If you need to capture additional matches, it gets a bit more complex. I think the easiest approach is to generate all unique combinations of the sequences and the patterns you want to check, then create a list column in mutate(). In this case we can go back to your original lookbehind/lookahead patterns and use str_extract_all().
library(dplyr)
library(tidyr)
# One lookbehind/lookahead regex per row of `database`; the flanks themselves
# are not part of the extracted text.
# NOTE(review): `.*` is greedy, so a match runs from the first L to the last R
# in a sequence; switch to `.*?` if separate, shortest matches are wanted.
patterns <- paste0("(?<=", database$L, ")(.*)(?=", database$R, ")")
# Name by position so the labels follow the number of L/R pairs in `database`.
names(patterns) <- paste0("pattern", seq_along(patterns))

# Cross every unique sequence with every pattern *name*, look the regex up by
# that name, and spread the list-column of matches to one column per pattern.
# Carrying the names through the grid means the output columns are labelled
# directly, without a positional rename after pivoting.
expand_grid(
  seq = unique(data$seq),
  pattern_name = names(patterns)
) %>%
  mutate(match = str_extract_all(seq, unname(patterns[pattern_name]))) %>%
  pivot_wider(
    names_from = "pattern_name",
    values_from = "match"
  )
#> # A tibble: 3 × 4
#> seq pattern1 pattern2 pattern3
#> <chr> <list> <list> <list>
#> 1 CCACGAAGCTCTCCTACGTACGGTTATATTGACAGACCGAGGGCAGTCCA… <chr> <chr> <chr>
#> 2 CCACGAAGCTCTCCTAGGGGGGGGCTATTTTGGACTGCGTTACCAGTCCA… <chr> <chr> <chr>
#> 3 CCACGTAGCTCTCCTCCGTGCGGTTATATTGACAGACCGAGGGCAGTCCA… <chr> <chr> <chr>
