I have the following dataset which has firm identifiers in column id_isin and a series of covariates (features) in the column covariate. The dataset comes in wide format as each covariate is observed at multiple time periods. These are organized in columns (y2010 to y2020). I report the input data called forSO below:
library(data.table)
forSO = fread("~/Desktop/forSO.csv")
forSO
#> id_isin covariate y2010 y2011 y2012
#> 1: ZAE000255915 NET INC BEFORE_EXTRA/PFDDIVS 8118000.0 9674000.0 8.3930e 06
#> 2: ZAE000255915 OPERATING INCOME 11756000.0 14134000.0 1.2266e 07
#> 3: ZAE000255915 RETURN ON ASSETS 2.5 2.3 1.7800e 00
#> 4: ZAE000198289 NET INC BEFORE EXTRA/PFD DIVS NA NA NA
#> 5: ZAE000198289 OPERATING INCOME NA NA NA
#> 6: ZAE000198289 RETURN ON ASSETS NA NA NA
#> y2013 y2014 y2015 y2016 y2017 y2018
#> 1: 1.1981e 07 1.3216e 07 14331000 14708000.00 13823000.00 13917000.00
#> 2: 1.7975e 07 1.9921e 07 21227000 22210000.00 21329000.00 21772000.00
#> 3: 1.8400e 00 1.9300e 00 2 2.06 2.01 1.91
#> 4: NA NA 40811 559094.00 786806.00 814462.00
#> 5: NA NA 48190 233141.00 299230.00 307252.00
#> 6: NA NA NA 10.84 12.86 11.76
#> y2019 y2020
#> 1: 14256000.00 5880000.00
#> 2: 21820000.00 10765000.00
#> 3: 1.87 1.09
#> 4: 920734.00 485423.00
#> 5: 368575.00 326465.00
#> 6: 11.24 5.57
Created on 2021-12-21 by the reprex package (v2.0.1)
I want to transpose the dataset into a panel data structure which looks like the following:
library(data.table)
output = fread("~/Desktop/minimal.csv")
output
#> id_isin year NET INC BEFORE_EXTRA/PFDDIVS OPERATING INCOME
#> 1: ZAE000255915 2010 8118000 11756000
#> 2: ZAE000255915 2011 9674000 14134000
#> 3: ZAE000255915 2012 8393000 12266000
#> 4: ZAE000255915 2013 11981000 17975000
#> 5: ZAE000255915 2014 13216000 19921000
#> 6: ZAE000255915 2015 14331000 21227000
#> 7: ZAE000255915 2016 14708000 22210000
#> 8: ZAE000255915 2017 13823000 21329000
#> 9: ZAE000255915 2018 13917000 21772000
#> 10: ZAE000255915 2019 14256000 21820000
#> 11: ZAE000255915 2020 5880000 10765000
#> 12: ZAE000198289 2010 NA NA
#> 13: ZAE000198289 2011 NA NA
#> 14: ZAE000198289 2012 NA NA
#> 15: ZAE000198289 2013 NA NA
#> 16: ZAE000198289 2014 NA NA
#> 17: ZAE000198289 2015 40811 48190
#> 18: ZAE000198289 2016 559094 233141
#> 19: ZAE000198289 2017 786806 299230
#> 20: ZAE000198289 2018 814462 307252
#> 21: ZAE000198289 2019 920734 368575
#> 22: ZAE000198289 2020 485423 326465
#> id_isin year NET INC BEFORE_EXTRA/PFDDIVS OPERATING INCOME
#> RETURN ON ASSETS
#> 1: 2.50
#> 2: 2.30
#> 3: 1.78
#> 4: 1.84
#> 5: 1.93
#> 6: 2.00
#> 7: 2.06
#> 8: 2.01
#> 9: 1.91
#> 10: 1.87
#> 11: 1.09
#> 12: NA
#> 13: NA
#> 14: NA
#> 15: NA
#> 16: NA
#> 17: NA
#> 18: 10.84
#> 19: 12.86
#> 20: 11.76
#> 21: 11.24
#> 22: 5.57
#> RETURN ON ASSETS
Created on 2021-12-21 by the reprex package (v2.0.1)
Please find both datasets to import in R below.
Any suggestion is appreciated!
Input Dataset
structure(list(id_isin = c("ZAE000255915", "ZAE000255915", "ZAE000255915",
"ZAE000198289", "ZAE000198289", "ZAE000198289"),
covariate = c("NET INC BEFORE_EXTRA/PFDDIVS",
"OPERATING INCOME", "RETURN ON ASSETS", "NET INC BEFORE EXTRA/PFD DIVS",
"OPERATING INCOME", "RETURN ON ASSETS"),
y2010 = c(8118000, 11756000, 2.5, NA, NA, NA),
y2011 = c(9674000, 14134000, 2.3, NA, NA, NA),
y2012 = c(8393000, 12266000, 1.78, NA, NA, NA),
y2013 = c(11981000, 17975000, 1.84, NA, NA, NA),
y2014 = c(13216000, 19921000, 1.93, NA, NA, NA),
y2015 = c(14331000L, 21227000L, 2L, 40811L, 48190L, NA),
y2016 = c(14708000, 22210000, 2.06, 559094, 233141, 10.84),
y2017 = c(13823000, 21329000, 2.01, 786806, 299230, 12.86),
y2018 = c(13917000, 21772000, 1.91, 814462, 307252, 11.76),
y2019 = c(14256000, 21820000, 1.87, 920734, 368575, 11.24),
y2020 = c(5880000, 10765000, 1.09, 485423, 326465, 5.57)),
row.names = c(NA, -6L), class = c("data.table", "data.frame" ))
Desired Outcome
structure(list(id_isin = c("ZAE000255915", "ZAE000255915", "ZAE000255915",
"ZAE000255915", "ZAE000255915", "ZAE000255915", "ZAE000255915",
"ZAE000255915", "ZAE000255915", "ZAE000255915", "ZAE000255915",
"ZAE000198289", "ZAE000198289", "ZAE000198289", "ZAE000198289",
"ZAE000198289", "ZAE000198289", "ZAE000198289", "ZAE000198289",
"ZAE000198289", "ZAE000198289", "ZAE000198289"),
year = c(2010L,
2011L, 2012L, 2013L, 2014L, 2015L, 2016L, 2017L, 2018L, 2019L,
2020L, 2010L, 2011L, 2012L, 2013L, 2014L, 2015L, 2016L, 2017L,
2018L, 2019L, 2020L),
`NET INC BEFORE_EXTRA/PFDDIVS` = c(8118000L,
9674000L, 8393000L, 11981000L, 13216000L, 14331000L, 14708000L,
13823000L, 13917000L, 14256000L, 5880000L, NA, NA, NA, NA, NA,
40811L, 559094L, 786806L, 814462L, 920734L, 485423L),
`OPERATING INCOME` = c(11756000L,
14134000L, 12266000L, 17975000L, 19921000L, 21227000L, 22210000L,
21329000L, 21772000L, 21820000L, 10765000L, NA, NA, NA, NA, NA,
48190L, 233141L, 299230L, 307252L, 368575L, 326465L),
`RETURN ON ASSETS` = c(2.5,
2.3, 1.78, 1.84, 1.93, 2, 2.06, 2.01, 1.91, 1.87, 1.09, NA, NA,
NA, NA, NA, NA, 10.84, 12.86, 11.76, 11.24, 5.57)),
row.names = c(NA, -22L), class = c("data.table", "data.frame"))
CodePudding user response:
This should get you started:
x <- structure(list(id_isin = c("ZAE000255915", "ZAE000255915", "ZAE000255915",
"ZAE000198289", "ZAE000198289", "ZAE000198289"),
covariate = c("NET INC BEFORE_EXTRA/PFDDIVS",
"OPERATING INCOME", "RETURN ON ASSETS", "NET INC BEFORE EXTRA/PFD DIVS",
"OPERATING INCOME", "RETURN ON ASSETS"),
y2010 = c(8118000, 11756000, 2.5, NA, NA, NA),
y2011 = c(9674000, 14134000, 2.3, NA, NA, NA),
y2012 = c(8393000, 12266000, 1.78, NA, NA, NA),
y2013 = c(11981000, 17975000, 1.84, NA, NA, NA),
y2014 = c(13216000, 19921000, 1.93, NA, NA, NA),
y2015 = c(14331000L, 21227000L, 2L, 40811L, 48190L, NA),
y2016 = c(14708000, 22210000, 2.06, 559094, 233141, 10.84),
y2017 = c(13823000, 21329000, 2.01, 786806, 299230, 12.86),
y2018 = c(13917000, 21772000, 1.91, 814462, 307252, 11.76),
y2019 = c(14256000, 21820000, 1.87, 920734, 368575, 11.24),
y2020 = c(5880000, 10765000, 1.09, 485423, 326465, 5.57)),
row.names = c(NA, -6L), class = c("data.table", "data.frame" ))
library(tidyr)
x %>%
pivot_longer(-c(id_isin, covariate) ) %>%
pivot_wider(names_from = "covariate") %>%
mutate(year = as.numeric(stringr::str_remove(name, "y")))%>%
select(id_isin, year, `NET INC BEFORE_EXTRA/PFDDIVS`, `OPERATING INCOME`)
Which gives us the following:
# A tibble: 22 × 4
id_isin year `NET INC BEFORE_EXTRA/PFDDIVS` `OPERATING INCOME`
<chr> <dbl> <dbl> <dbl>
1 ZAE000255915 2010 8118000 11756000
2 ZAE000255915 2011 9674000 14134000
3 ZAE000255915 2012 8393000 12266000
4 ZAE000255915 2013 11981000 17975000
CodePudding user response:
This should do the trick:
library(tidyverse)
df <- structure(list(id_isin = c("ZAE000255915", "ZAE000255915", "ZAE000255915",
"ZAE000198289", "ZAE000198289", "ZAE000198289"),
covariate = c("NET INC BEFORE_EXTRA/PFDDIVS",
"OPERATING INCOME", "RETURN ON ASSETS", "NET INC BEFORE EXTRA/PFD DIVS",
"OPERATING INCOME", "RETURN ON ASSETS"),
y2010 = c(8118000, 11756000, 2.5, NA, NA, NA),
y2011 = c(9674000, 14134000, 2.3, NA, NA, NA),
y2012 = c(8393000, 12266000, 1.78, NA, NA, NA),
y2013 = c(11981000, 17975000, 1.84, NA, NA, NA),
y2014 = c(13216000, 19921000, 1.93, NA, NA, NA),
y2015 = c(14331000L, 21227000L, 2L, 40811L, 48190L, NA),
y2016 = c(14708000, 22210000, 2.06, 559094, 233141, 10.84),
y2017 = c(13823000, 21329000, 2.01, 786806, 299230, 12.86),
y2018 = c(13917000, 21772000, 1.91, 814462, 307252, 11.76),
y2019 = c(14256000, 21820000, 1.87, 920734, 368575, 11.24),
y2020 = c(5880000, 10765000, 1.09, 485423, 326465, 5.57)),
row.names = c(NA, -6L), class = c("data.table", "data.frame" ))
df %>%
pivot_longer(cols = c(paste("y",2010:2020,sep = "")), names_to = "year", values_to = "HHA") %>%
pivot_wider(names_from = "covariate", values_from = "HHA") %>%
mutate(`NET INC BEFORE_EXTRA/PFDDIVS` = coalesce(`NET INC BEFORE_EXTRA/PFDDIVS`,`NET INC BEFORE EXTRA/PFD DIVS`),
year = str_remove(year, "y")) %>%
select(-`NET INC BEFORE EXTRA/PFD DIVS`)
Output:
# A tibble: 22 x 5
id_isin year `NET INC BEFORE_EXTRA/PFDDIVS` `OPERATING INCOME` `RETURN ON ASSETS`
<chr> <chr> <dbl> <dbl> <dbl>
1 ZAE000255915 2010 8118000 11756000 2.5
2 ZAE000255915 2011 9674000 14134000 2.3
3 ZAE000255915 2012 8393000 12266000 1.78
4 ZAE000255915 2013 11981000 17975000 1.84
5 ZAE000255915 2014 13216000 19921000 1.93
6 ZAE000255915 2015 14331000 21227000 2
7 ZAE000255915 2016 14708000 22210000 2.06
8 ZAE000255915 2017 13823000 21329000 2.01
9 ZAE000255915 2018 13917000 21772000 1.91
10 ZAE000255915 2019 14256000 21820000 1.87
# ... with 12 more rows
CodePudding user response:
If you prefer data.table functions/syntax, perhaps:
library(data.table)
x <- structure(list(id_isin = c("ZAE000255915", "ZAE000255915", "ZAE000255915",
"ZAE000198289", "ZAE000198289", "ZAE000198289"),
covariate = c("NET INC BEFORE_EXTRA/PFDDIVS",
"OPERATING INCOME", "RETURN ON ASSETS", "NET INC BEFORE EXTRA/PFD DIVS",
"OPERATING INCOME", "RETURN ON ASSETS"),
y2010 = c(8118000, 11756000, 2.5, NA, NA, NA),
y2011 = c(9674000, 14134000, 2.3, NA, NA, NA),
y2012 = c(8393000, 12266000, 1.78, NA, NA, NA),
y2013 = c(11981000, 17975000, 1.84, NA, NA, NA),
y2014 = c(13216000, 19921000, 1.93, NA, NA, NA),
y2015 = c(14331000L, 21227000L, 2L, 40811L, 48190L, NA),
y2016 = c(14708000, 22210000, 2.06, 559094, 233141, 10.84),
y2017 = c(13823000, 21329000, 2.01, 786806, 299230, 12.86),
y2018 = c(13917000, 21772000, 1.91, 814462, 307252, 11.76),
y2019 = c(14256000, 21820000, 1.87, 920734, 368575, 11.24),
y2020 = c(5880000, 10765000, 1.09, 485423, 326465, 5.57)),
row.names = c(NA, -6L), class = c("data.table", "data.frame" ))
x.m2 <- melt(x, id.vars = c("id_isin", "covariate"))
#> Warning in melt.data.table(x, id.vars = c("id_isin", "covariate")):
#> 'measure.vars' [y2010, y2011, y2012, y2013, ...] are not all of the same type.
#> By order of hierarchy, the molten data value column will be of type 'double'.
#> All measure variables not of type 'double' will be coerced too. Check DETAILS
#> in ?melt.data.table for more on coercion.
x.m3 <- dcast(x.m2, formula = id_isin variable ~ covariate, value.var = "value")
x.m3$year <- as.integer(gsub(x = x.m3$variable, pattern = "y", replacement = ""))
x.m4 <- x.m3[,variable := NULL]
x.m5 <- x.m4[,`NET INC BEFORE_EXTRA/PFDDIVS` := .(fcoalesce(`NET INC BEFORE_EXTRA/PFDDIVS`,`NET INC BEFORE EXTRA/PFD DIVS`))]
x.m6 <- x.m5[,`NET INC BEFORE EXTRA/PFD DIVS` := NULL]
x.m6
outcome <- structure(list(id_isin = c("ZAE000255915", "ZAE000255915", "ZAE000255915",
"ZAE000255915", "ZAE000255915", "ZAE000255915", "ZAE000255915",
"ZAE000255915", "ZAE000255915", "ZAE000255915", "ZAE000255915",
"ZAE000198289", "ZAE000198289", "ZAE000198289", "ZAE000198289",
"ZAE000198289", "ZAE000198289", "ZAE000198289", "ZAE000198289",
"ZAE000198289", "ZAE000198289", "ZAE000198289"),
year = c(2010L,
2011L, 2012L, 2013L, 2014L, 2015L, 2016L, 2017L, 2018L, 2019L,
2020L, 2010L, 2011L, 2012L, 2013L, 2014L, 2015L, 2016L, 2017L,
2018L, 2019L, 2020L),
`NET INC BEFORE_EXTRA/PFDDIVS` = c(8118000L,
9674000L, 8393000L, 11981000L, 13216000L, 14331000L, 14708000L,
13823000L, 13917000L, 14256000L, 5880000L, NA, NA, NA, NA, NA,
40811L, 559094L, 786806L, 814462L, 920734L, 485423L),
`OPERATING INCOME` = c(11756000L,
14134000L, 12266000L, 17975000L, 19921000L, 21227000L, 22210000L,
21329000L, 21772000L, 21820000L, 10765000L, NA, NA, NA, NA, NA,
48190L, 233141L, 299230L, 307252L, 368575L, 326465L),
`RETURN ON ASSETS` = c(2.5,
2.3, 1.78, 1.84, 1.93, 2, 2.06, 2.01, 1.91, 1.87, 1.09, NA, NA,
NA, NA, NA, NA, 10.84, 12.86, 11.76, 11.24, 5.57)),
row.names = c(NA, -22L), class = c("data.table", "data.frame"))
dplyr::all_equal(x.m6, outcome)
#> [1] "- Different types for column `NET INC BEFORE_EXTRA/PFDDIVS`: double vs integer\n- Different types for column `OPERATING INCOME`: double vs integer\n"
Created on 2021-12-21 by the reprex package (v2.0.1)
CodePudding user response:
concise data.table approach (x is your input structure, above):
dcast(melt(
x[,covariate:=fifelse(
grepl("^NET",covariate),
"NET INC BEFORE_EXTRA/PFDDIVS",
covariate)],
,id=c(1,2),variable.name = "year")[
,year:=as.integer(gsub("y","",year))],
id_isin year~covariate,value.var = "value"
)
