I am trying to pivot a table that has headings and sub-headings, so that the headings go into a column "date", and the subheadings are two columns instead of repeating.
Here is an example of my data.
This was produced using dput(), so while in the original excel file, each date spanned over both sub-headings ("blue" and "green"), once in R, these blank cells were re-named X.1, X.2, etc.
table <- " X X.1 X02.Jul.12 X.2 X03.Jul.12 X.3 X04.Jul.12 X.4
1 category number blue green blue green blue green
2 G 1 1 0 1 0 1 0
3 G 2 2 99 2 99 1 99
4 G 3 1 1 1 99 1 99
5 G 4 1 1 1 1 2 99
6 G 5 1 0 1 0 1 99
7 G 6 1 99 1 1 1 99
8 G 7 1 0 1 0 1 0
9 G 8 1 1 1 1 1 99
10 G 9 1 1 1 1 1 1
11 H 1 1 1 1 1 1 1
12 H 2 1 99 1 0 1 0
13 H 3 1 1 1 1 1 99
14 H 4 1 99 1 2 1 99
15 H 5 1 1 1 1 1 1
16 H 6 1 0 1 0 1 99
17 H 7 1 1 2 1 1 99
18 H 8 2 0 2 0 1 1
19 H 9 2 0 2 0 1 1"
#Create a dataframe with the above table
df <- read.table(text=table, header = TRUE)
df
Here is an example of what it looks like in Excel:
This is the desired output I am trying to achieve:
While this can be done manually in Excel, I have multiple files with over 100 dates/columns, so would prefer to find a way to clean it in R.
Any help would be appreciated!
Excel Reprex
Here is a reprex of the dataset, as if it were read from Excel without name correction:
structure(
list(
c("category", "G", "G", "G", "G", "G", "G", "G", "G", "G", "H", "H", "H", "H", "H", "H", "H", "H", "H"),
c("number", "1", "2", "3", "4", "5", "6", "7", "8", "9", "1", "2", "3", "4", "5", "6", "7", "8", "9"),
`02.Jul.12` = c("blue", "1", "2", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "2", "2"),
c("green", "0", "99", "1", "1", "0", "99", "0", "1", "1", "1", "99", "1", "99", "1", "0", "1", "0", "0"),
`03.Jul.12` = c("blue", "1", "2", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "2", "2", "2"),
c("green", "0", "99", "99", "1", "0", "1", "0", "1", "1", "1", "0", "1", "2", "1", "0", "1", "0", "0"),
`04.Jul.12` = c("blue", "1", "1", "1", "2", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1"),
c("green", "0", "99", "99", "99", "99", "99", "0", "99", "1", "1", "0", "99", "99", "1", "99", "99", "1", "1")
),
class = "data.frame",
row.names = c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19")
)
CodePudding user response:
The following code should produce your desired output, though others may have more elegant solutions:
#Create a dataframe with the above table
df <- read.table(text=table, header = TRUE)
df
names(df) <- df[1,]
library(lubridate); library(tidyr)
startdate <- dmy("02-Jul-12")
for (i in seq(3, ncol(df), by = 2)){
names(df)[i:(i 1)] <- paste0(startdate, ":", names(df)[i:(i 1)])
startdate <- startdate 1
}
df.tdy <- df[-1,] %>% pivot_longer(-c("category","number"), names_to = "datecol", values_to = "value") %>%
separate(datecol, c("date","color"), sep = ":") %>%
pivot_wider(names_from = "color") %>%
arrange(date,category,number)
# category number date blue green
# <chr> <chr> <chr> <chr> <chr>
# 1 G 1 2012-07-02 1 0
# 2 G 2 2012-07-02 2 99
# 3 G 3 2012-07-02 1 1
# 4 G 4 2012-07-02 1 1
# 5 G 5 2012-07-02 1 0
# 6 G 6 2012-07-02 1 99
# 7 G 7 2012-07-02 1 0
# 8 G 8 2012-07-02 1 1
# 9 G 9 2012-07-02 1 1
# 10 H 1 2012-07-02 1 1
CodePudding user response:
Here is another option using a combination of base R and tidyverse. Here, I first clean up the column names by including the date in the column names for the column to the left (i.e., "green), so that each column has a date. Then, I concatenate the header with the sub-heading, except for the first 2 columns (i.e., category and number). Then, I remove the first row and pivot to a long format with the date in one column and the colors stay in their own column.
library(tidyverse)
colnames(df)[seq(2, ncol(df), 2)] <- colnames(df)[seq(1, ncol(df), 2)]
colnames(df) <-
c(df[1, 1], df[1, 2], paste(sep = '_', colnames(df)[3:ncol(df)], as.character(unlist(df[1, 3:ncol(df)]))))
df %>%
slice(-1) %>%
pivot_longer(-c(category, number),
names_to = c("Date", ".value"),
names_sep = "_") %>%
arrange(Date, category, number) %>%
mutate(Date = dmy(Date))
Output
# A tibble: 54 × 5
category number Date blue green
<chr> <chr> <date> <chr> <chr>
1 G 1 2012-07-02 1 0
2 G 2 2012-07-02 2 99
3 G 3 2012-07-02 1 1
4 G 4 2012-07-02 1 1
5 G 5 2012-07-02 1 0
6 G 6 2012-07-02 1 99
7 G 7 2012-07-02 1 0
8 G 8 2012-07-02 1 1
9 G 9 2012-07-02 1 1
10 H 1 2012-07-02 1 1
# … with 44 more rows
Data
df <- structure(
list(
c("category", "G", "G", "G", "G", "G", "G", "G", "G", "G", "H", "H", "H", "H", "H", "H", "H", "H", "H"),
c("number", "1", "2", "3", "4", "5", "6", "7", "8", "9", "1", "2", "3", "4", "5", "6", "7", "8", "9"),
`02.Jul.12` = c("blue", "1", "2", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "2", "2"),
c("green", "0", "99", "1", "1", "0", "99", "0", "1", "1", "1", "99", "1", "99", "1", "0", "1", "0", "0"),
`03.Jul.12` = c("blue", "1", "2", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "2", "2", "2"),
c("green", "0", "99", "99", "1", "0", "1", "0", "1", "1", "1", "0", "1", "2", "1", "0", "1", "0", "0"),
`04.Jul.12` = c("blue", "1", "1", "1", "2", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1"),
c("green", "0", "99", "99", "99", "99", "99", "0", "99", "1", "1", "0", "99", "99", "1", "99", "99", "1", "1")
),
class = "data.frame",
row.names = c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19")
)
CodePudding user response:
df %>%
set_names(enframe(unlist(df[1,])) %>%
mutate(name = na_if(name, ''))%>%
fill(name)%>%
transmute(nms = coalesce(str_c(name, value, sep='_'), value)) %>%
pull(nms)) %>%
slice(-1)%>%
type.convert(as.is = TRUE)%>%
pivot_longer(-c(category, number), names_to = c('Date', '.value'),
names_sep = '_', names_transform = list(Date = dmy)) %>%
arrange(category, Date, number)
# A tibble: 54 x 5
category number Date blue green
<chr> <int> <date> <int> <int>
1 G 1 2012-07-02 1 0
2 G 2 2012-07-02 2 99
3 G 3 2012-07-02 1 1
4 G 4 2012-07-02 1 1
5 G 5 2012-07-02 1 0
6 G 6 2012-07-02 1 99
7 G 7 2012-07-02 1 0
8 G 8 2012-07-02 1 1
9 G 9 2012-07-02 1 1
10 G 1 2012-07-03 1 0
# ... with 44 more rows
CodePudding user response:
Here's a tidyverse solution that doesn't rely on splicing any column names:
Solution
First import the tidyverse:
# Load the tidyverse.
library(tidyverse)
Then read the Excel file and extract the headers as manipulable rows.
# Read the Excel file as an unnamed dataset.
df <- readxl::read_xlsx("./reprex.xlsx", col_names = FALSE)
# Extract the dates (from the first row) and fill the blanks rightwards.
dates <- df[1, ] %>% t() %>% as_tibble() %>% fill(1, .direction = "down") %>% .[[1]]
# Extract the intended column names (from the second row).
names <- df[2, ] %>% t() %>% as.vector()
Finally, use this workflow to properly name and pivot the data.
# Cut out the headers from the data.
df <- df[-(1:2), ] %>%
# Properly name the dataset.
set_names(nm = names) %>%
# Pivot the color columns.
pivot_longer(cols = !c(category, number), names_to = "color") %>%
# Convert to the proper datatypes.
mutate(
category = as.character(category),
number = as.integer(number),
value = as.numeric(value)
) %>%
# Identify each "clump" of colors by the one row from which it originated;
# where {'category', 'number'} uniquely identify each such row.
group_by(category, number) %>%
# Map the date names to each clump.
mutate(
# Index the entries in each clump.
date = row_number(),
# Map each date to its corresponding entry.
date = dates[!is.na(dates)][date],
# Convert to date objects.
date = lubridate::as_datetime(date, format = "%d.%b.%y")
) %>% ungroup() %>%
# Pivot the colors into consolidated columns: one for each color.
pivot_wider(names_from = color, values_from = value) %>%
# Sort as desired.
arrange(date, category, number)
Results
Given a reprex.xlsx like that reproduced by this code
openxlsx::write.xlsx(
file = "./reprex.xlsx",
x = structure(
list(
c("category", "G", "G", "G", "G", "G", "G", "G", "G", "G", "H", "H", "H", "H", "H", "H", "H", "H", "H"),
c("number", "1", "2", "3", "4", "5", "6", "7", "8", "9", "1", "2", "3", "4", "5", "6", "7", "8", "9"),
"02.Jul.12" = c("blue", "1", "2", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "2", "2"),
c("green", "0", "99", "1", "1", "0", "99", "0", "1", "1", "1", "99", "1", "99", "1", "0", "1", "0", "0"),
"03.Jul.12" = c("blue", "1", "2", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "2", "2", "2"),
c("green", "0", "99", "99", "1", "0", "1", "0", "1", "1", "1", "0", "1", "2", "1", "0","1", "0", "0"),
"04.Jul.12" = c("blue", "1", "1", "1", "2", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1"),
c("green", "0", "99", "99", "99", "99", "99", "0", "99", "1", "1", "0", "99", "99", "1", "99", "99", "1", "1")
),
class = "data.frame",
row.names = c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19")
)
)
this solution should yield the following result for df:
# A tibble: 54 x 5
category number date blue green
<chr> <int> <dttm> <dbl> <dbl>
1 G 1 2012-07-02 00:00:00 1 0
2 G 2 2012-07-02 00:00:00 2 99
3 G 3 2012-07-02 00:00:00 1 1
4 G 4 2012-07-02 00:00:00 1 1
5 G 5 2012-07-02 00:00:00 1 0
6 G 6 2012-07-02 00:00:00 1 99
7 G 7 2012-07-02 00:00:00 1 0
8 G 8 2012-07-02 00:00:00 1 1
9 G 9 2012-07-02 00:00:00 1 1
10 H 1 2012-07-02 00:00:00 1 1
11 H 2 2012-07-02 00:00:00 1 99
12 H 3 2012-07-02 00:00:00 1 1
13 H 4 2012-07-02 00:00:00 1 99
14 H 5 2012-07-02 00:00:00 1 1
15 H 6 2012-07-02 00:00:00 1 0
16 H 7 2012-07-02 00:00:00 1 1
17 H 8 2012-07-02 00:00:00 2 0
18 H 9 2012-07-02 00:00:00 2 0
19 G 1 2012-07-03 00:00:00 1 0
20 G 2 2012-07-03 00:00:00 2 99
21 G 3 2012-07-03 00:00:00 1 99
22 G 4 2012-07-03 00:00:00 1 1
23 G 5 2012-07-03 00:00:00 1 0
24 G 6 2012-07-03 00:00:00 1 1
25 G 7 2012-07-03 00:00:00 1 0
26 G 8 2012-07-03 00:00:00 1 1
27 G 9 2012-07-03 00:00:00 1 1
28 H 1 2012-07-03 00:00:00 1 1
29 H 2 2012-07-03 00:00:00 1 0
30 H 3 2012-07-03 00:00:00 1 1
31 H 4 2012-07-03 00:00:00 1 2
32 H 5 2012-07-03 00:00:00 1 1
33 H 6 2012-07-03 00:00:00 1 0
34 H 7 2012-07-03 00:00:00 2 1
35 H 8 2012-07-03 00:00:00 2 0
36 H 9 2012-07-03 00:00:00 2 0
37 G 1 2012-07-04 00:00:00 1 0
38 G 2 2012-07-04 00:00:00 1 99
39 G 3 2012-07-04 00:00:00 1 99
40 G 4 2012-07-04 00:00:00 2 99
41 G 5 2012-07-04 00:00:00 1 99
42 G 6 2012-07-04 00:00:00 1 99
43 G 7 2012-07-04 00:00:00 1 0
44 G 8 2012-07-04 00:00:00 1 99
45 G 9 2012-07-04 00:00:00 1 1
46 H 1 2012-07-04 00:00:00 1 1
47 H 2 2012-07-04 00:00:00 1 0
48 H 3 2012-07-04 00:00:00 1 99
49 H 4 2012-07-04 00:00:00 1 99
50 H 5 2012-07-04 00:00:00 1 1
51 H 6 2012-07-04 00:00:00 1 99
52 H 7 2012-07-04 00:00:00 1 99
53 H 8 2012-07-04 00:00:00 1 1
54 H 9 2012-07-04 00:00:00 1 1
CodePudding user response:
A base R option using resahpe
u <- type.convert(setNames(df[-1, ], df[1, ]), as.is = TRUE)
transform(
reshape(
cbind(
u[1:2],
setNames(
u[-c(1:2)],
paste0(
names(u)[-c(1:2)],
".",
ave(seq(length(u) - 2), names(u)[-c(1:2)], FUN = seq_along)
)
)
),
direction = "long",
idvar = c("category", "number"),
varying = -c(1:2),
timevar = "date"
),
date = Filter(nchar, names(df))[date]
)
gives
category number date blue green
G.1.1 G 1 02.Jul.12 1 0
G.2.1 G 2 02.Jul.12 2 99
G.3.1 G 3 02.Jul.12 1 1
G.4.1 G 4 02.Jul.12 1 1
G.5.1 G 5 02.Jul.12 1 0
G.6.1 G 6 02.Jul.12 1 99
G.7.1 G 7 02.Jul.12 1 0
G.8.1 G 8 02.Jul.12 1 1
G.9.1 G 9 02.Jul.12 1 1
H.1.1 H 1 02.Jul.12 1 1
H.2.1 H 2 02.Jul.12 1 99
H.3.1 H 3 02.Jul.12 1 1
H.4.1 H 4 02.Jul.12 1 99
H.5.1 H 5 02.Jul.12 1 1
H.6.1 H 6 02.Jul.12 1 0
H.7.1 H 7 02.Jul.12 1 1
H.8.1 H 8 02.Jul.12 2 0
H.9.1 H 9 02.Jul.12 2 0
G.1.2 G 1 03.Jul.12 1 0
G.2.2 G 2 03.Jul.12 2 99
G.3.2 G 3 03.Jul.12 1 99
G.4.2 G 4 03.Jul.12 1 1
G.5.2 G 5 03.Jul.12 1 0
G.6.2 G 6 03.Jul.12 1 1
G.7.2 G 7 03.Jul.12 1 0
G.8.2 G 8 03.Jul.12 1 1
G.9.2 G 9 03.Jul.12 1 1
H.1.2 H 1 03.Jul.12 1 1
H.2.2 H 2 03.Jul.12 1 0
H.3.2 H 3 03.Jul.12 1 1
H.4.2 H 4 03.Jul.12 1 2
H.5.2 H 5 03.Jul.12 1 1
H.6.2 H 6 03.Jul.12 1 0
H.7.2 H 7 03.Jul.12 2 1
H.8.2 H 8 03.Jul.12 2 0
H.9.2 H 9 03.Jul.12 2 0
G.1.3 G 1 04.Jul.12 1 0
G.2.3 G 2 04.Jul.12 1 99
G.3.3 G 3 04.Jul.12 1 99
G.4.3 G 4 04.Jul.12 2 99
G.5.3 G 5 04.Jul.12 1 99
G.6.3 G 6 04.Jul.12 1 99
G.7.3 G 7 04.Jul.12 1 0
G.8.3 G 8 04.Jul.12 1 99
G.9.3 G 9 04.Jul.12 1 1
H.1.3 H 1 04.Jul.12 1 1
H.2.3 H 2 04.Jul.12 1 0
H.3.3 H 3 04.Jul.12 1 99
H.4.3 H 4 04.Jul.12 1 99
H.5.3 H 5 04.Jul.12 1 1
H.6.3 H 6 04.Jul.12 1 99
H.7.3 H 7 04.Jul.12 1 99
H.8.3 H 8 04.Jul.12 1 1
H.9.3 H 9 04.Jul.12 1 1
CodePudding user response:
Here is another solution using dplyr and tidyr. We would first combine the headers and subheaders and then pivot the data frame. We will do two pivoting operations: first collect everything into date, name (consisting of either "blue" or "green") and value (consisting of the corresponding values for "blue" and "green"); then, pivot_wider the name and value columns. df comes directly from your excel reprex.
library(dplyr)
library(tidyr)
nms1 <- tidyr:::fillDown(na_if(names(df), ""))
nms2 <- unlist(df[1L, ])
df[-1L, ] %>%
setNames(if_else(is.na(nms1), nms2, paste(nms1, nms2, sep = "_"))) %>%
pivot_longer(-c(category, number), c("date", "name"), names_sep = "_") %>%
pivot_wider()
Output
# A tibble: 54 x 5
category number date blue green
<chr> <chr> <chr> <chr> <chr>
1 G 1 02.Jul.12 1 0
2 G 1 03.Jul.12 1 0
3 G 1 04.Jul.12 1 0
4 G 2 02.Jul.12 2 99
5 G 2 03.Jul.12 2 99
6 G 2 04.Jul.12 1 99
7 G 3 02.Jul.12 1 1
8 G 3 03.Jul.12 1 99
9 G 3 04.Jul.12 1 99
10 G 4 02.Jul.12 1 1
# ... with 44 more rows


