Cleaning Messy Data
Mar 8, 2018 #dplyr
Get some sample data.
library(rvest)
library(dplyr)
# Scrap info from MIT Technology Review
xml <- "https://www.technologyreview.com/lists/companies/2017/intro/" %>% read_html()
info <- xml %>% html_nodes(css = ".company__stats__item , .company__title") %>% html_text()
# Do not need "Valuation"
info <- info[-(info %>% grep(pattern = "^Valuation\\s"))]
head(info, 10)
## [1] "Nvidia"
## [2] "\n\t\t\tHeadquarters\n\t\t\t\n\t\t\tSanta Clara, California\n\t\t"
## [3] "Industry Intelligent machines"
## [4] "Status Public"
## [5] "Years on the List\n\t\t\t\n\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\t2015\n\t\t\t\t\t,\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\t2016\n\t\t\t\t\t,\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\t2017\n\t\t\t\t\t\n\t\t\n\t\t\n\t\t"
## [6] "SpaceX"
## [7] "\n\t\t\tHeadquarters\n\t\t\t\n\t\t\tHawthorne, California\n\t\t"
## [8] "Industry Transportation"
## [9] "Status Private"
## [10] "Years on the List\n\t\t\t\n\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\t2011\n\t\t\t\t\t,\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\t2012\n\t\t\t\t\t,\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\t2013\n\t\t\t\t\t,\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\t2014\n\t\t\t\t\t,\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\t2015\n\t\t\t\t\t,\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\t2016\n\t\t\t\t\t,\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\t2017\n\t\t\t\t\t\n\t\t\n\t\t\n\t\t"
Convert from vector to data frame.
# Convert to data frame
df <- info %>% matrix(ncol = 5, byrow = TRUE) %>% as.data.frame() %>% as_tibble()
# Assign names
names(df) <- c("company", "hq", "industry", "status", "yol")
df
## # A tibble: 50 x 5
## company hq industry status yol
## <fct> <fct> <fct> <fct> <fct>
## 1 Nvidia "\n\t\t\tHeadquarter… Industry Int… Status… "Years on the List\n\…
## 2 SpaceX "\n\t\t\tHeadquarter… Industry Tra… Status… "Years on the List\n\…
## 3 Amazon "\n\t\t\tHeadquarter… Industry Con… Status… "Years on the List\n\…
## 4 23andMe "\n\t\t\tHeadquarter… Industry Bio… Status… "Years on the List\n\…
## 5 Alphabet "\n\t\t\tHeadquarter… Industry Con… Status… "Years on the List\n\…
## 6 iFlytek "\n\t\t\tHeadquarter… Industry Int… Status… "Years on the List\n\…
## 7 Kite Phar… "\n\t\t\tHeadquarter… Industry Bio… Status… "Years on the List\n\…
## 8 Tencent "\n\t\t\tHeadquarter… Industry Con… Status… "Years on the List\n\…
## 9 Regeneron "\n\t\t\tHeadquarter… Industry Bio… Status… "Years on the List\n\…
## 10 Spark The… "\n\t\t\tHeadquarter… Industry Bio… Status… "Years on the List\n\…
## # … with 40 more rows
Clean up texts with stringr
.
library(stringr)
df <- df %>%
mutate(hq = str_extract(hq, "[A-Z]+.*,.*[a-z]+")) %>%
mutate(yol = str_extract_all(yol, "[0-9]\\,?")) %>%
mutate(industry = str_replace(industry, "^\\w+\\s?", ""),
status = str_replace(status, "^\\w+\\s?", ""))
df
## # A tibble: 50 x 5
## company hq industry status yol
## <fct> <chr> <chr> <chr> <list>
## 1 Nvidia Santa Clara, California Intelligent machi… Public <chr [12…
## 2 SpaceX Hawthorne, California Transportation Priva… <chr [28…
## 3 Amazon Seattle, Washington Connectivity Public <chr [20…
## 4 23andMe Mountain View, Califor… Biomedicine Priva… <chr [8]>
## 5 Alphabet Mountain View, Califor… Connectivity Public <chr [32…
## 6 iFlytek Hefei, China Intelligent machi… Public <chr [4]>
## 7 Kite Pharma Santa Monica, Californ… Biomedicine Public <chr [4]>
## 8 Tencent Shenzhen, China Connectivity Public <chr [20…
## 9 Regeneron Tarrytown, New York Biomedicine Public <chr [4]>
## 10 Spark Therapeuti… Philadelphia, Pennsylv… Biomedicine Public <chr [8]>
## # … with 40 more rows
Clean up uneven column (year_on_list
).
df <- df %>% rowwise() %>%
# unlist column
mutate(yol = unlist(yol) %>% paste(collapse = ""))
head(df$yol)
## [1] "201520162017" "2011201220132014201520162017"
## [3] "20132014201520162017" "20162017"
## [5] "20102011201220132014201520162017" "2017"
df <- df %>%
tidyr::separate(yol, into = paste0("year_on_list_", 1:8), sep = ",", convert = TRUE, fill = "right")