I wonder how can we filter rows with the same values in columns using dplyr package? I tried doing something in opposite to what was asked and proposed in this 
CodePudding user response:
Not quite native tidyverse, but does what you desire?
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
set.seed(2022)
test <- tibble(id = floor(runif(10, min = 0, max = 111)),
var1 = ceiling(runif(10, min = 5, max = 10)),
var2 = c(6, 5, 4, 8, 12, 1223, 14, 1, 90, 1),
var3 = c(6, 3, 4, 8, 11, 45, 56, 78, 0, 9))
test %>% filter(sapply(1:nrow(test), function(i) {all(diff(as.numeric(test[i,2:4])) == 0)}))
#> # A tibble: 2 × 4
#> id var1 var2 var3
#> <dbl> <dbl> <dbl> <dbl>
#> 1 90 6 6 6
#> 2 60 8 8 8
Created on 2022-01-14 by the reprex package (v2.0.1)
EDIT:
Here is an entirely tidyverse way:
test %>%
rowwise() %>%
filter(all(diff(var1:var3) == 0))
CodePudding user response:
This is follow-up to @Yuriy Saraykin's answer, whose solution I think is the most representative for the tidyverse way and uses the robust n_distinct function. His answer achieves the outcome you are looking for in virtually all circumstances, but I wanted to expand a bit on using it with missing data because NA is always a unique value.
library(dplyr)
library(tidyr)
set.seed(2022)
test <- tibble(id = floor(runif(10, min = 0, max = 111)),
var1 = c(6, 6, 6, 8, 9, NA, NA, 8, 8, NA),
var2 = c(6, 5, 4, 8, 7, 12, NA, 1, 9, NA),
var3 = c(6, 3, 4, 8, 7, 45, NA, 78, 0, 3))
# count number of distinct values by row across columns -- NAs are all counted as unique values
test %>%
rowwise() %>%
mutate(unique = n_distinct(c_across(var1:var3)))
#> # A tibble: 10 x 5
#> # Rowwise:
#> id var1 var2 var3 unique
#> <dbl> <dbl> <dbl> <dbl> <int>
#> 1 90 6 6 6 1
#> 2 71 6 5 3 3
#> 3 13 6 4 4 2
#> 4 60 8 8 8 1
#> 5 20 9 7 7 2
#> 6 70 NA 12 45 3
#> 7 8 NA NA NA 1
#> 8 4 8 1 78 3
#> 9 41 8 9 0 3
#> 10 84 NA NA 3 2
# Filter out rows with same value on colums -- NAs are all counted as unique values
test %>%
rowwise() %>%
filter(n_distinct(c_across(var1:var3)) == 1)
#> # A tibble: 3 x 4
#> # Rowwise:
#> id var1 var2 var3
#> <dbl> <dbl> <dbl> <dbl>
#> 1 90 6 6 6
#> 2 60 8 8 8
#> 3 8 NA NA NA
# count number of distinct values by row across columns -- NAs excluded from count
test %>%
rowwise() %>%
mutate(unique = n_distinct(c_across(var1:var3), na.rm = TRUE))
#> # A tibble: 10 x 5
#> # Rowwise:
#> id var1 var2 var3 unique
#> <dbl> <dbl> <dbl> <dbl> <int>
#> 1 90 6 6 6 1
#> 2 71 6 5 3 3
#> 3 13 6 4 4 2
#> 4 60 8 8 8 1
#> 5 20 9 7 7 2
#> 6 70 NA 12 45 2
#> 7 8 NA NA NA 0
#> 8 4 8 1 78 3
#> 9 41 8 9 0 3
#> 10 84 NA NA 3 1
# Filter out rows with same value on colums -- NAs excluded from count
test %>%
rowwise() %>%
filter(n_distinct(c_across(var1:var3), na.rm = TRUE) == 1)
#> # A tibble: 3 x 4
#> # Rowwise:
#> id var1 var2 var3
#> <dbl> <dbl> <dbl> <dbl>
#> 1 90 6 6 6
#> 2 60 8 8 8
#> 3 84 NA NA 3
# count number of distinct values by row across columns -- NAs excluded altogether
test %>%
tidyr::drop_na(var1:var3) %>%
rowwise() %>%
mutate(unique = n_distinct(c_across(var1:var3), na.rm = TRUE))
#> # A tibble: 7 x 5
#> # Rowwise:
#> id var1 var2 var3 unique
#> <dbl> <dbl> <dbl> <dbl> <int>
#> 1 90 6 6 6 1
#> 2 71 6 5 3 3
#> 3 13 6 4 4 2
#> 4 60 8 8 8 1
#> 5 20 9 7 7 2
#> 6 4 8 1 78 3
#> 7 41 8 9 0 3
# Filter out rows with same value on colums -- NAs excluded altogether
test %>%
tidyr::drop_na(var1:var3) %>%
rowwise() %>%
filter(n_distinct(c_across(var1:var3), na.rm = TRUE) == 1)
#> # A tibble: 2 x 4
#> # Rowwise:
#> id var1 var2 var3
#> <dbl> <dbl> <dbl> <dbl>
#> 1 90 6 6 6
#> 2 60 8 8 8
Created on 2022-01-14 by the reprex package (v2.0.1)
CodePudding user response:
You may select the rows whose rowwise standard deviation or variance is 0.
test[matrixStats::rowSds(as.matrix(test[-1])) == 0, ]
# A tibble: 2 x 4
# id var1 var2 var3
# <dbl> <dbl> <dbl> <dbl>
#1 90 6 6 6
#2 60 8 8 8
