Home > OS >  How to compare list of strings and output number of strings that are not matching?
How to compare list of strings and output number of strings that are not matching?

Time:01-15

I have a list as:

s <- c('peel', 'peer', 'pear', 'tggc', 'gcgt')

I would like to compare each string with every other string in the list and I use the following command:

z <- Map(utf8ToInt, s)
dmat <- outer(z, z, FUN=Vectorize(function(x, y) sum(bitwXor(x, y) > 0)))

However, I would like to output the number of character differences (instead of characters matching) based on position:

For example "tggc" when compared with the string "gcgt" should be output as 3.

CodePudding user response:

Just use a simple negation ! as per the following:

s <- c('peel', 'peer', 'pear', 'tggc', 'gcgt')
z <- Map(utf8ToInt, s)
dmat <- outer(z, z, FUN = Vectorize(function(x, y) sum(!bitwXor(x, y))))
dmat

Or use a straightforward equality comparison given that you've mapped the characters to integers.

dmat <- outer(z, z, FUN = Vectorize(function(x, y) sum(x == y)))

Both give output:

     peel peer pear tggc gcgt
peel    4    3    2    0    0
peer    3    4    3    0    0
pear    2    3    4    0    0
tggc    0    0    0    4    1
gcgt    0    0    0    1    4

Note: If you have fixed string length, you can also use subtraction, but the above saves you from passing this explicitly, which adds a little generality.

CodePudding user response:

If performance is a concern:

s <- c('peel', 'peer', 'pear', 'tggc', 'gcgt')
z <- mapply(utf8ToInt, s)
n <- length(s)
n1 <- 1:(n - 1L)
replace(matrix(nrow = n, ncol = n),
        sequence(n1, seq(n   1L, by = n, length.out = n - 1L)),
        colSums(z[, sequence(n1)] == z[, rep.int(2:n, n1)]))
#>      [,1] [,2] [,3] [,4] [,5]
#> [1,]   NA    3    2    0    0
#> [2,]   NA   NA    3    0    0
#> [3,]   NA   NA   NA    0    0
#> [4,]   NA   NA   NA   NA    1
#> [5,]   NA   NA   NA   NA   NA

# benchmarking with a larger character vector
s <- mapply(FUN = function(x) paste0(sample(letters[1:4]), collapse = ""), 1:100)
microbenchmark::microbenchmark(bitwXor = {z <- Map(utf8ToInt, s)
                                          outer(z, z, FUN = Vectorize(function(x, y) sum(!bitwXor(x, y))))},
                               logical = {z <- Map(utf8ToInt, s)
                                          outer(z, z, FUN = Vectorize(function(x, y) sum(x == y)))},
                               mat = {z <- mapply(utf8ToInt, s)
                                      n <- length(s)
                                      n1 <- 1:(n - 1L)
                                      replace(matrix(nrow = n, ncol = n),
                                              sequence(n1, seq(n   1L, by = n, length.out = n - 1L)),
                                              colSums(z[, sequence(n1)] == z[, rep.int(2:n, n1)]))})
#> Unit: microseconds
#>     expr     min      lq      mean   median       uq     max neval
#>  bitwXor 23846.1 24875.6 26207.230 26120.95 27134.35 33842.8   100
#>  logical 16645.5 17514.8 19020.051 18383.35 19875.15 32716.8   100
#>      mat   387.4   455.0   511.322   482.70   544.05  1224.4   100

# confirm that the results are the same
z <- Map(utf8ToInt, s)
mat1 <- outer(z, z, FUN = Vectorize(function(x, y) sum(!bitwXor(x, y))))
mat2 <- outer(z, z, FUN = Vectorize(function(x, y) sum(x == y)))
z <- mapply(utf8ToInt, s)
n <- length(s)
n1 <- 1:(n - 1L)
mat3 <- replace(matrix(nrow = n, ncol = n), sequence(n1, seq(n   1L, by = n, length.out = n - 1L)), colSums(z[, sequence(n1)] == z[, rep.int(2:n, n1)]))
all.equal(mat1[upper.tri(mat1)], mat2[upper.tri(mat2)])
#> [1] TRUE
all.equal(mat1[upper.tri(mat1)], mat3[upper.tri(mat3)])
#> [1] TRUE

CodePudding user response:

A possible solution:

library(tidyverse)

sample <- c('peel','peer','pear','tggc','gcgt')

sample %>% 
  expand.grid(sample) %>% 
  rowwise %>% 
  mutate(cmp = mapply(function(x,y) 
                { x != y}, x=str_split(Var1, ""), y=str_split(Var2, "")) %>% sum)

#> # A tibble: 25 × 3
#> # Rowwise: 
#>    Var1  Var2    cmp
#>    <fct> <fct> <int>
#>  1 peel  peel      0
#>  2 peer  peel      1
#>  3 pear  peel      2
#>  4 tggc  peel      4
#>  5 gcgt  peel      4
#>  6 peel  peer      1
#>  7 peer  peer      0
#>  8 pear  peer      1
#>  9 tggc  peer      4
#> 10 gcgt  peer      4
#> # … with 15 more rows
  •  Tags:  
  • Related