Using a column as row names and set the other columns as binary values based on strings-CodePudding

I am sorry if my English is bad but I will try to summarize my problem.

I need one of my columns to be set as row name, accounting for duplicates, and the other columns should be separated by unique values into new columns assuming binary values if they are related to the row name.

Example: ORIGINAL DATAFRAME

> df <- data.frame(module = c("M1","M1","M1","M1","M1","M2"),
                   GO = c("inflama","inflama","ciclo","inflama","ciclo","sinapse"),
                   gene = c("PPARG","RELA","RELA","IRF5","ACKR1","GATA3"))
> df
  module      GO  gene
1     M1 inflama PPARG
2     M1 inflama  RELA
3     M1   ciclo  RELA
4     M1 inflama  IRF5
5     M1   ciclo ACKR1
6     M2 sinapse GATA3

FINAL DATAFRAME

> df2 <- data.frame(gene = c("PPARG","RELA","IRF5","ACKR1","GATA3"),
                    M1 = c(1,1,1,1,0),
                    M2 = c(0,0,0,0,1),
                    inflama = c(1,1,1,0,0),
                    ciclo = c(0,1,0,1,0), sinapse = c(0,0,0,0,1))
> df2
   gene M1 M2 inflama ciclo sinapse
1 PPARG  1  0       1     0       0
2  RELA  1  0       1     1       0
3  IRF5  1  0       1     0       0
4 ACKR1  1  0       0     1       0
5 GATA3  0  1       0     0       1

Thank you in advance for your help!

CodePudding user response：

We may either use pivot_wider (from tidyr) to reshape or with dummy_cols (from fastDummies)

library(fastDummies)
library(dplyr)
library(stringr)
df %>% 
 dummy_cols(c('module', 'GO'), remove_selected_columns = TRUE) %>%
 group_by(gene) %>% 
 summarise(across(everything(),  ~   (any(.x)))) %>%
 rename_with(~ str_remove(., ".*_"), contains("_"))

-output

# A tibble: 5 × 6
  gene     M1    M2 ciclo inflama sinapse
  <chr> <int> <int> <int>   <int>   <int>
1 ACKR1     1     0     1       0       0
2 GATA3     0     1     0       0       1
3 IRF5      1     0     0       1       0
4 PPARG     1     0     0       1       0
5 RELA      1     0     1       1       0

CodePudding user response：

The Matrix package has a very efficient fac2sparse function for this purpose. If you downloaded R from CRAN, then you already have it installed.

library("Matrix")
fac2dense <- function(x) as(t(fac2sparse(x)), "matrix")
df2 <- data.frame(df["gene"], fac2dense(df[["module"]]), fac2dense(df[["GO"]]))
df2

   gene M1 M2 ciclo inflama sinapse
1 PPARG  1  0     0       1       0
2  RELA  1  0     0       1       0
3  RELA  1  0     1       0       0
4  IRF5  1  0     0       1       0
5 ACKR1  1  0     1       0       0
6 GATA3  0  1     0       0       1

aggregate(. ~ gene, df2, max)

   gene M1 M2 ciclo inflama sinapse
1 ACKR1  1  0     1       0       0
2 GATA3  0  1     0       0       1
3  IRF5  1  0     0       1       0
4 PPARG  1  0     0       1       0
5  RELA  1  0     1       1       0

CodePudding user response：

I think calling pivot_wider twice is good enough here.

library(tidyr)

df %>%
    pivot_wider(names_from = module,
                values_from = module,
                values_fill = 0,
                values_fn = \(x)1) %>%
    pivot_wider(names_from = GO,
                values_from = GO,
                values_fill = 0,
                values_fn = \(x)1)

# A tibble: 5 × 6
  gene     M1    M2 inflama ciclo sinapse
  <chr> <dbl> <dbl>   <dbl> <dbl>   <dbl>
1 PPARG     1     0       1     0       0
2 RELA      1     0       1     1       0
3 IRF5      1     0       1     0       0
4 ACKR1     1     0       0     1       0
5 GATA3     0     1       0     0       1

If we want to make it a bit simpler, we can define a pivoting function:

pivoting<-function(x, y) pivot_wider(x,
                        names_from = y,
                        values_from = y,
                        values_fill = 0,
                        values_fn = \(x)1)
df %>%
    pivoting('module') %>%
    pivoting('GO')

But I think this is too much trouble as there is already a package for that. As mentioned by @akrun, I would go with dummy_cols

CodePudding user response：

Using data.table is also an option:

library(data.table)

setDT(df)

Use dcast() to aggregate:

DT1 = dcast(df, gene ~ GO, fill = 0L, fun.agg = \(x) sum(!is.na(x)))

DT2 = dcast(df, gene ~ module, fill = 0L, fun.agg = \(x) ifelse(x == 0, 0, 1))

Then you can join the tables:

DT3 = DT1[DT2, on = "gene"]

DT3

Output

    gene ciclo inflama sinapse M1 M2
1: ACKR1     1       0       0  1  0
2: GATA3     0       0       1  0  1
3:  IRF5     0       1       0  1  0
4: PPARG     0       1       0  1  0
5:  RELA     1       1       0  1  0

CodePudding user response：

You should check out dplyr, tidyr, and tidyverse libraries for data wrangling in R.

## Loading the required libraries 
library(dplyr)
library(tidyr)
library(tidyverse)

## Creating a pivot table
df_pivot = df %>% 
  pivot_longer(!gene, names_to = 'Attributes', values_to = "AttributeValue") %>% ## Wider to longer
  group_by(gene,AttributeValue) %>% ## Grouping
  summarise(Count = n()) %>% ## Aggregation
  pivot_wider(names_from = AttributeValue , values_from = Count) ## Longer to wider

## Replacing NA with 0
df_pivot[is.na(df_pivot)] = 0

df_pivot
# A tibble: 5 x 6
# Groups:   gene [5]
  gene  ciclo    M1    M2 sinapse inflama
  <chr> <int> <int> <int>   <int>   <int>
1 ACKR1     1     1     0       0       0
2 GATA3     0     0     1       1       0
3 IRF5      0     1     0       0       1
4 PPARG     0     1     0       0       1
5 RELA      1     2     0       0       1