I have a list as:
s <- c('peel', 'peer', 'pear', 'tggc', 'gcgt')
I would like to compare each string with every other string in the list and I use the following command:
z <- Map(utf8ToInt, s)
dmat <- outer(z, z, FUN=Vectorize(function(x, y) sum(bitwXor(x, y) > 0)))
However, I would like to output the number of character differences (instead of characters matching) based on position:
For example "tggc" when compared with the string "gcgt" should be output as 3.
CodePudding user response:
Just use a simple negation ! as per the following:
s <- c('peel', 'peer', 'pear', 'tggc', 'gcgt')
z <- Map(utf8ToInt, s)
dmat <- outer(z, z, FUN = Vectorize(function(x, y) sum(!bitwXor(x, y))))
dmat
Or use a straightforward equality comparison given that you've mapped the characters to integers.
dmat <- outer(z, z, FUN = Vectorize(function(x, y) sum(x == y)))
Both give output:
peel peer pear tggc gcgt
peel 4 3 2 0 0
peer 3 4 3 0 0
pear 2 3 4 0 0
tggc 0 0 0 4 1
gcgt 0 0 0 1 4
Note: If you have fixed string length, you can also use subtraction, but the above saves you from passing this explicitly, which adds a little generality.
CodePudding user response:
If performance is a concern:
s <- c('peel', 'peer', 'pear', 'tggc', 'gcgt')
z <- mapply(utf8ToInt, s)
n <- length(s)
n1 <- 1:(n - 1L)
replace(matrix(nrow = n, ncol = n),
sequence(n1, seq(n 1L, by = n, length.out = n - 1L)),
colSums(z[, sequence(n1)] == z[, rep.int(2:n, n1)]))
#> [,1] [,2] [,3] [,4] [,5]
#> [1,] NA 3 2 0 0
#> [2,] NA NA 3 0 0
#> [3,] NA NA NA 0 0
#> [4,] NA NA NA NA 1
#> [5,] NA NA NA NA NA
# benchmarking with a larger character vector
s <- mapply(FUN = function(x) paste0(sample(letters[1:4]), collapse = ""), 1:100)
microbenchmark::microbenchmark(bitwXor = {z <- Map(utf8ToInt, s)
outer(z, z, FUN = Vectorize(function(x, y) sum(!bitwXor(x, y))))},
logical = {z <- Map(utf8ToInt, s)
outer(z, z, FUN = Vectorize(function(x, y) sum(x == y)))},
mat = {z <- mapply(utf8ToInt, s)
n <- length(s)
n1 <- 1:(n - 1L)
replace(matrix(nrow = n, ncol = n),
sequence(n1, seq(n 1L, by = n, length.out = n - 1L)),
colSums(z[, sequence(n1)] == z[, rep.int(2:n, n1)]))})
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> bitwXor 23846.1 24875.6 26207.230 26120.95 27134.35 33842.8 100
#> logical 16645.5 17514.8 19020.051 18383.35 19875.15 32716.8 100
#> mat 387.4 455.0 511.322 482.70 544.05 1224.4 100
# confirm that the results are the same
z <- Map(utf8ToInt, s)
mat1 <- outer(z, z, FUN = Vectorize(function(x, y) sum(!bitwXor(x, y))))
mat2 <- outer(z, z, FUN = Vectorize(function(x, y) sum(x == y)))
z <- mapply(utf8ToInt, s)
n <- length(s)
n1 <- 1:(n - 1L)
mat3 <- replace(matrix(nrow = n, ncol = n), sequence(n1, seq(n 1L, by = n, length.out = n - 1L)), colSums(z[, sequence(n1)] == z[, rep.int(2:n, n1)]))
all.equal(mat1[upper.tri(mat1)], mat2[upper.tri(mat2)])
#> [1] TRUE
all.equal(mat1[upper.tri(mat1)], mat3[upper.tri(mat3)])
#> [1] TRUE
CodePudding user response:
A possible solution:
library(tidyverse)
sample <- c('peel','peer','pear','tggc','gcgt')
sample %>%
expand.grid(sample) %>%
rowwise %>%
mutate(cmp = mapply(function(x,y)
{ x != y}, x=str_split(Var1, ""), y=str_split(Var2, "")) %>% sum)
#> # A tibble: 25 × 3
#> # Rowwise:
#> Var1 Var2 cmp
#> <fct> <fct> <int>
#> 1 peel peel 0
#> 2 peer peel 1
#> 3 pear peel 2
#> 4 tggc peel 4
#> 5 gcgt peel 4
#> 6 peel peer 1
#> 7 peer peer 0
#> 8 pear peer 1
#> 9 tggc peer 4
#> 10 gcgt peer 4
#> # … with 15 more rows
