Cleaning Messy Data

Mar 8, 2018 #dplyr

Get some sample data.

library(rvest)
library(dplyr)

# Scrap info from MIT Technology Review
xml <- "https://www.technologyreview.com/lists/companies/2017/intro/" %>% read_html()
info <- xml %>% html_nodes(css = ".company__stats__item , .company__title") %>% html_text()
# Do not need "Valuation"
info <- info[-(info %>% grep(pattern = "^Valuation\\s"))]

head(info, 10)

##  [1] "Nvidia"                                                                                                                                                                                                                                                                                                                                                                                                                               
##  [2] "\n\t\t\tHeadquarters\n\t\t\t\n\t\t\tSanta Clara, California\n\t\t"                                                                                                                                                                                                                                                                                                                                                                    
##  [3] "Industry Intelligent machines"                                                                                                                                                                                                                                                                                                                                                                                                        
##  [4] "Status Public"                                                                                                                                                                                                                                                                                                                                                                                                                        
##  [5] "Years on the List\n\t\t\t\n\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\t2015\n\t\t\t\t\t,\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\t2016\n\t\t\t\t\t,\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\t2017\n\t\t\t\t\t\n\t\t\n\t\t\n\t\t"                                                                                                                                                                                                                    
##  [6] "SpaceX"                                                                                                                                                                                                                                                                                                                                                                                                                               
##  [7] "\n\t\t\tHeadquarters\n\t\t\t\n\t\t\tHawthorne, California\n\t\t"                                                                                                                                                                                                                                                                                                                                                                      
##  [8] "Industry Transportation"                                                                                                                                                                                                                                                                                                                                                                                                              
##  [9] "Status Private"                                                                                                                                                                                                                                                                                                                                                                                                                       
## [10] "Years on the List\n\t\t\t\n\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\t2011\n\t\t\t\t\t,\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\t2012\n\t\t\t\t\t,\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\t2013\n\t\t\t\t\t,\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\t2014\n\t\t\t\t\t,\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\t2015\n\t\t\t\t\t,\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\t2016\n\t\t\t\t\t,\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\t2017\n\t\t\t\t\t\n\t\t\n\t\t\n\t\t"

Convert from vector to data frame.

# Convert to data frame
df <- info %>% matrix(ncol = 5, byrow = TRUE) %>% as.data.frame() %>% as_tibble()
# Assign names
names(df) <- c("company", "hq", "industry", "status", "yol")

df

## # A tibble: 50 x 5
##    company    hq                    industry      status  yol                   
##    <fct>      <fct>                 <fct>         <fct>   <fct>                 
##  1 Nvidia     "\n\t\t\tHeadquarter… Industry Int… Status… "Years on the List\n\…
##  2 SpaceX     "\n\t\t\tHeadquarter… Industry Tra… Status… "Years on the List\n\…
##  3 Amazon     "\n\t\t\tHeadquarter… Industry Con… Status… "Years on the List\n\…
##  4 23andMe    "\n\t\t\tHeadquarter… Industry Bio… Status… "Years on the List\n\…
##  5 Alphabet   "\n\t\t\tHeadquarter… Industry Con… Status… "Years on the List\n\…
##  6 iFlytek    "\n\t\t\tHeadquarter… Industry Int… Status… "Years on the List\n\…
##  7 Kite Phar… "\n\t\t\tHeadquarter… Industry Bio… Status… "Years on the List\n\…
##  8 Tencent    "\n\t\t\tHeadquarter… Industry Con… Status… "Years on the List\n\…
##  9 Regeneron  "\n\t\t\tHeadquarter… Industry Bio… Status… "Years on the List\n\…
## 10 Spark The… "\n\t\t\tHeadquarter… Industry Bio… Status… "Years on the List\n\…
## # … with 40 more rows

Clean up texts with stringr.

library(stringr)

df <- df %>%  
    mutate(hq = str_extract(hq, "[A-Z]+.*,.*[a-z]+")) %>% 
    mutate(yol = str_extract_all(yol, "[0-9]\\,?")) %>% 
    mutate(industry = str_replace(industry, "^\\w+\\s?", ""), 
           status = str_replace(status, "^\\w+\\s?", ""))

df

## # A tibble: 50 x 5
##    company           hq                      industry           status yol      
##    <fct>             <chr>                   <chr>              <chr>  <list>   
##  1 Nvidia            Santa Clara, California Intelligent machi… Public <chr [12…
##  2 SpaceX            Hawthorne, California   Transportation     Priva… <chr [28…
##  3 Amazon            Seattle, Washington     Connectivity       Public <chr [20…
##  4 23andMe           Mountain View, Califor… Biomedicine        Priva… <chr [8]>
##  5 Alphabet          Mountain View, Califor… Connectivity       Public <chr [32…
##  6 iFlytek           Hefei, China            Intelligent machi… Public <chr [4]>
##  7 Kite Pharma       Santa Monica, Californ… Biomedicine        Public <chr [4]>
##  8 Tencent           Shenzhen, China         Connectivity       Public <chr [20…
##  9 Regeneron         Tarrytown, New York     Biomedicine        Public <chr [4]>
## 10 Spark Therapeuti… Philadelphia, Pennsylv… Biomedicine        Public <chr [8]>
## # … with 40 more rows

Clean up uneven column (year_on_list).

df <- df %>% rowwise() %>% 
    # unlist column 
    mutate(yol = unlist(yol) %>% paste(collapse = ""))
head(df$yol)

## [1] "201520162017"                     "2011201220132014201520162017"    
## [3] "20132014201520162017"             "20162017"                        
## [5] "20102011201220132014201520162017" "2017"

df <- df %>%
    tidyr::separate(yol, into = paste0("year_on_list_", 1:8), sep = ",", convert = TRUE, fill = "right")