Home > Back-end >  How to left rows with the same values in columns using dplyr package in R?
How to left rows with the same values in columns using dplyr package in R?

Time:01-15

I wonder how can we filter rows with the same values in columns using dplyr package? I tried doing something in opposite to what was asked and proposed in this enter image description here

CodePudding user response:

Not quite native tidyverse, but does what you desire?

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union

set.seed(2022)
test <- tibble(id = floor(runif(10, min = 0, max = 111)),
               var1 = ceiling(runif(10, min = 5, max = 10)),
               var2 = c(6, 5, 4, 8, 12, 1223, 14, 1, 90, 1),
               var3 = c(6, 3, 4, 8, 11, 45, 56, 78, 0, 9))

test %>% filter(sapply(1:nrow(test), function(i) {all(diff(as.numeric(test[i,2:4])) == 0)}))
#> # A tibble: 2 × 4
#>      id  var1  var2  var3
#>   <dbl> <dbl> <dbl> <dbl>
#> 1    90     6     6     6
#> 2    60     8     8     8

Created on 2022-01-14 by the reprex package (v2.0.1)

EDIT:

Here is an entirely tidyverse way:

test %>% 
  rowwise() %>% 
  filter(all(diff(var1:var3) == 0))

CodePudding user response:

This is follow-up to @Yuriy Saraykin's answer, whose solution I think is the most representative for the tidyverse way and uses the robust n_distinct function. His answer achieves the outcome you are looking for in virtually all circumstances, but I wanted to expand a bit on using it with missing data because NA is always a unique value.

library(dplyr)
library(tidyr)  

set.seed(2022)
test <- tibble(id = floor(runif(10, min = 0, max = 111)),
               var1 = c(6, 6, 6, 8, 9, NA, NA, 8, 8, NA),
               var2 = c(6, 5, 4, 8, 7, 12, NA, 1, 9, NA),
               var3 = c(6, 3, 4, 8, 7, 45, NA, 78, 0, 3))

# count number of distinct values by row across columns -- NAs are all counted as unique values
test %>% 
  rowwise() %>% 
  mutate(unique =  n_distinct(c_across(var1:var3)))
#> # A tibble: 10 x 5
#> # Rowwise: 
#>       id  var1  var2  var3 unique
#>    <dbl> <dbl> <dbl> <dbl>  <int>
#>  1    90     6     6     6      1
#>  2    71     6     5     3      3
#>  3    13     6     4     4      2
#>  4    60     8     8     8      1
#>  5    20     9     7     7      2
#>  6    70    NA    12    45      3
#>  7     8    NA    NA    NA      1
#>  8     4     8     1    78      3
#>  9    41     8     9     0      3
#> 10    84    NA    NA     3      2

# Filter out rows with same value on colums -- NAs are all counted as unique values
test %>% 
  rowwise() %>% 
  filter(n_distinct(c_across(var1:var3)) == 1) 
#> # A tibble: 3 x 4
#> # Rowwise: 
#>      id  var1  var2  var3
#>   <dbl> <dbl> <dbl> <dbl>
#> 1    90     6     6     6
#> 2    60     8     8     8
#> 3     8    NA    NA    NA


# count number of distinct values by row across columns -- NAs excluded from count
test %>% 
  rowwise() %>% 
  mutate(unique =  n_distinct(c_across(var1:var3), na.rm = TRUE))
#> # A tibble: 10 x 5
#> # Rowwise: 
#>       id  var1  var2  var3 unique
#>    <dbl> <dbl> <dbl> <dbl>  <int>
#>  1    90     6     6     6      1
#>  2    71     6     5     3      3
#>  3    13     6     4     4      2
#>  4    60     8     8     8      1
#>  5    20     9     7     7      2
#>  6    70    NA    12    45      2
#>  7     8    NA    NA    NA      0
#>  8     4     8     1    78      3
#>  9    41     8     9     0      3
#> 10    84    NA    NA     3      1

# Filter out rows with same value on colums -- NAs excluded from count
test %>% 
  rowwise() %>% 
  filter(n_distinct(c_across(var1:var3), na.rm = TRUE) == 1) 
#> # A tibble: 3 x 4
#> # Rowwise: 
#>      id  var1  var2  var3
#>   <dbl> <dbl> <dbl> <dbl>
#> 1    90     6     6     6
#> 2    60     8     8     8
#> 3    84    NA    NA     3


# count number of distinct values by row across columns -- NAs excluded altogether
test %>% 
  tidyr::drop_na(var1:var3) %>%
  rowwise() %>% 
  mutate(unique =  n_distinct(c_across(var1:var3), na.rm = TRUE))
#> # A tibble: 7 x 5
#> # Rowwise: 
#>      id  var1  var2  var3 unique
#>   <dbl> <dbl> <dbl> <dbl>  <int>
#> 1    90     6     6     6      1
#> 2    71     6     5     3      3
#> 3    13     6     4     4      2
#> 4    60     8     8     8      1
#> 5    20     9     7     7      2
#> 6     4     8     1    78      3
#> 7    41     8     9     0      3

# Filter out rows with same value on colums -- NAs excluded altogether
test %>% 
  tidyr::drop_na(var1:var3) %>%
  rowwise() %>% 
  filter(n_distinct(c_across(var1:var3), na.rm = TRUE) == 1)
#> # A tibble: 2 x 4
#> # Rowwise: 
#>      id  var1  var2  var3
#>   <dbl> <dbl> <dbl> <dbl>
#> 1    90     6     6     6
#> 2    60     8     8     8

Created on 2022-01-14 by the reprex package (v2.0.1)

CodePudding user response:

You may select the rows whose rowwise standard deviation or variance is 0.

test[matrixStats::rowSds(as.matrix(test[-1])) == 0, ]

# A tibble: 2 x 4
#     id  var1  var2  var3
#  <dbl> <dbl> <dbl> <dbl>
#1    90     6     6     6
#2    60     8     8     8
  •  Tags:  
  • Related