How to left rows with the same values in columns using dplyr package in R?-CodePudding

I wonder how can we filter rows with the same values in columns using dplyr package? I tried doing something in opposite to what was asked and proposed in this

CodePudding user response：

Not quite native tidyverse, but does what you desire?

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union

set.seed(2022)
test <- tibble(id = floor(runif(10, min = 0, max = 111)),
               var1 = ceiling(runif(10, min = 5, max = 10)),
               var2 = c(6, 5, 4, 8, 12, 1223, 14, 1, 90, 1),
               var3 = c(6, 3, 4, 8, 11, 45, 56, 78, 0, 9))

test %>% filter(sapply(1:nrow(test), function(i) {all(diff(as.numeric(test[i,2:4])) == 0)}))
#> # A tibble: 2 × 4
#>      id  var1  var2  var3
#>   <dbl> <dbl> <dbl> <dbl>
#> 1    90     6     6     6
#> 2    60     8     8     8

^{Created on 2022-01-14 by the reprex package (v2.0.1)}

EDIT:

Here is an entirely tidyverse way:

test %>% 
  rowwise() %>% 
  filter(all(diff(var1:var3) == 0))

CodePudding user response：

This is follow-up to @Yuriy Saraykin's answer, whose solution I think is the most representative for the tidyverse way and uses the robust n_distinct function. His answer achieves the outcome you are looking for in virtually all circumstances, but I wanted to expand a bit on using it with missing data because NA is always a unique value.

library(dplyr)
library(tidyr)  

set.seed(2022)
test <- tibble(id = floor(runif(10, min = 0, max = 111)),
               var1 = c(6, 6, 6, 8, 9, NA, NA, 8, 8, NA),
               var2 = c(6, 5, 4, 8, 7, 12, NA, 1, 9, NA),
               var3 = c(6, 3, 4, 8, 7, 45, NA, 78, 0, 3))

# count number of distinct values by row across columns -- NAs are all counted as unique values
test %>% 
  rowwise() %>% 
  mutate(unique =  n_distinct(c_across(var1:var3)))
#> # A tibble: 10 x 5
#> # Rowwise: 
#>       id  var1  var2  var3 unique
#>    <dbl> <dbl> <dbl> <dbl>  <int>
#>  1    90     6     6     6      1
#>  2    71     6     5     3      3
#>  3    13     6     4     4      2
#>  4    60     8     8     8      1
#>  5    20     9     7     7      2
#>  6    70    NA    12    45      3
#>  7     8    NA    NA    NA      1
#>  8     4     8     1    78      3
#>  9    41     8     9     0      3
#> 10    84    NA    NA     3      2

# Filter out rows with same value on colums -- NAs are all counted as unique values
test %>% 
  rowwise() %>% 
  filter(n_distinct(c_across(var1:var3)) == 1) 
#> # A tibble: 3 x 4
#> # Rowwise: 
#>      id  var1  var2  var3
#>   <dbl> <dbl> <dbl> <dbl>
#> 1    90     6     6     6
#> 2    60     8     8     8
#> 3     8    NA    NA    NA


# count number of distinct values by row across columns -- NAs excluded from count
test %>% 
  rowwise() %>% 
  mutate(unique =  n_distinct(c_across(var1:var3), na.rm = TRUE))
#> # A tibble: 10 x 5
#> # Rowwise: 
#>       id  var1  var2  var3 unique
#>    <dbl> <dbl> <dbl> <dbl>  <int>
#>  1    90     6     6     6      1
#>  2    71     6     5     3      3
#>  3    13     6     4     4      2
#>  4    60     8     8     8      1
#>  5    20     9     7     7      2
#>  6    70    NA    12    45      2
#>  7     8    NA    NA    NA      0
#>  8     4     8     1    78      3
#>  9    41     8     9     0      3
#> 10    84    NA    NA     3      1

# Filter out rows with same value on colums -- NAs excluded from count
test %>% 
  rowwise() %>% 
  filter(n_distinct(c_across(var1:var3), na.rm = TRUE) == 1) 
#> # A tibble: 3 x 4
#> # Rowwise: 
#>      id  var1  var2  var3
#>   <dbl> <dbl> <dbl> <dbl>
#> 1    90     6     6     6
#> 2    60     8     8     8
#> 3    84    NA    NA     3


# count number of distinct values by row across columns -- NAs excluded altogether
test %>% 
  tidyr::drop_na(var1:var3) %>%
  rowwise() %>% 
  mutate(unique =  n_distinct(c_across(var1:var3), na.rm = TRUE))
#> # A tibble: 7 x 5
#> # Rowwise: 
#>      id  var1  var2  var3 unique
#>   <dbl> <dbl> <dbl> <dbl>  <int>
#> 1    90     6     6     6      1
#> 2    71     6     5     3      3
#> 3    13     6     4     4      2
#> 4    60     8     8     8      1
#> 5    20     9     7     7      2
#> 6     4     8     1    78      3
#> 7    41     8     9     0      3

# Filter out rows with same value on colums -- NAs excluded altogether
test %>% 
  tidyr::drop_na(var1:var3) %>%
  rowwise() %>% 
  filter(n_distinct(c_across(var1:var3), na.rm = TRUE) == 1)
#> # A tibble: 2 x 4
#> # Rowwise: 
#>      id  var1  var2  var3
#>   <dbl> <dbl> <dbl> <dbl>
#> 1    90     6     6     6
#> 2    60     8     8     8

^{Created on 2022-01-14 by the reprex package (v2.0.1)}

CodePudding user response：

You may select the rows whose rowwise standard deviation or variance is 0.

test[matrixStats::rowSds(as.matrix(test[-1])) == 0, ]

# A tibble: 2 x 4
#     id  var1  var2  var3
#  <dbl> <dbl> <dbl> <dbl>
#1    90     6     6     6
#2    60     8     8     8