Personal code snippets of @tmasjc

Site powered by Hugo + Blogdown

Image by Mads Schmidt Rasmussen from unsplash.com

Minimal Bootstrap Theme by Zachary Betz

Wrangling Messy Data for D3 Time Series

Feb 25, 2020 #dplyr #json

This is a walkthrough how I transform a dataset to be used in D3. In the process I also export a copy of ‘tidy format’ for EDA purposes. The dataset is about electric car sales in US from year 2005 to 2017. Origianl data format.

X1 X2 X3 X4 X5 X6
Make Honda Toyota Honda Ford Honda
Model Insight Prius Civic Escape Accord
Type HEV HEV HEV HEV HEV
2005-01 7 5566 1169 908 805
2005-02 22 7078 1353 1092 855
2005-03 56 10236 2896 1569 1862

The final data format I would like to feed to D3 is in JSON and looks like the following

[
  {
    "date": "2005-01-01",
    "record": {
      "Honda Insight": {
        "brand": "Honda",
        "model": "Insight",
        "type": "HEV",
        "sales": 7
      },
      "Toyota Prius": {
        "brand": "Toyota",
        "model": "Prius",
        "type": "HEV",
        "sales": 5566
      },      
      ...
    },
    ...
}]

1 Import

library(tidyverse)
library(kableExtra)

# this is our column name
carnames   <- read_csv(filepath, col_names = FALSE, n_max = 3) %>% column_to_rownames("X1")

# this is our data
raw        <- read_csv(filepath, col_names = FALSE, skip = 3) %>% column_to_rownames("X1")

# loop through each column and form full car brand
names(raw) <- apply(carnames, 2, paste0, collapse = "-")
head(raw)[1:3] %>% 
    kable("html") %>% 
    kable_styling(position = "center")
Honda-Insight-HEV Toyota-Prius-HEV Honda-Civic-HEV
2005-01 7 5566 1169
2005-02 22 7078 1353
2005-03 56 10236 2896
2005-04 90 11345 3466
2005-05 52 9461 1895
2005-06 69 9622 1852

2 Transform

# this is our tidy data
df <- raw %>%
    rownames_to_column("date") %>%
    mutate(date = paste0(date, "-01"),
           date = as.Date(date)) %>% 
    gather(fullname, sales, -date) %>% 
    mutate(brand = str_extract(fullname, "^\\w+"), 
           model = str_extract(fullname, "(?<=-)\\w+(?=-)"),
           type  = str_extract(fullname, "\\w+$")) %>% 
    as_tibble() %>% 
    # column arrangement
    select(-sales, everything(), -fullname)

# preview
df %>% 
    head() %>% 
    kable("html") %>% 
    kable_styling(position = "center")
date brand model type sales
2005-01-01 Honda Insight HEV 7
2005-02-01 Honda Insight HEV 22
2005-03-01 Honda Insight HEV 56
2005-04-01 Honda Insight HEV 90
2005-05-01 Honda Insight HEV 52
2005-06-01 Honda Insight HEV 69

3 Loop

library(jsonlite)
library(listviewer)

# split to chunks to wrap in JSON
chunks_by_date <- df %>% 
    arrange(date) %>% 
    filter(!is.na(sales)) %>% 
    group_split(date, keep = TRUE)
dates <- map(chunks_by_date, ~ as.character(unique(.x$date))) # safer approach

# to convert each to a list in recursion
# must pass the dots
from_row_to_list <- function(brand, model, type, sales, ...) {
    list(
        "brand" = brand,
        "model" = model,
        "type"  = type,
        "sales" = sales
    )
}

# to name list in recursion at one go
paste_list_name <- function(brand, model, ...) {
    paste(brand, model)
}

# use map instead of for loop
out <- map2(
    .x = chunks_by_date, .y = dates,
    .f = ~ {
            l <- pmap(.x, from_row_to_list)
            n <- pmap(.x, paste_list_name)
            list(date = .y, record = set_names(l, n))
        }
    )

# final result
head(out, 10) %>% listviewer::jsonedit()

Done