Personal code snippets of @tmasjc

Site powered by Hugo + Blogdown

Image by Mads Schmidt Rasmussen from unsplash.com

Minimal Bootstrap Theme by Zachary Betz

Simulate Customer Retention

Feb 28, 2019 #survival

The code below simulate a scenario where nc customers onboard during a ts timespan. However, none of them managed to retain for more than sp periods. The objective is to compare various customer retention analysis especially periodic and retrospective analysis techniques.

library(tidyverse)
library(charlatan)

# Generate Some Samples  ----------------------------------------------

# seeding
set.seed(123)

# number of customers
nc = 500
# timespan (entire periods)
ts = 1:12
# survival periods
sp = 3

# generate some customers data
cust_info <- data_frame(
  id = paste0("KB", ch_integer(n = nc, min = 100)),
  jobs = sample(ch_job(n = 4, locale = "zh_TW"), 
                size = length(id), replace = TRUE)
)

# generate a sequence of vector with length n 
make_seq <- function(n, x) {
  
  # initiate an empty vector
  vec = rep(0, n)
  
  # generate index with range no more than x
  ind = 1 : (2 + x)
  while(diff(range(ind)) > x) {
    ind = runif(x, 1, n + 1)
  }
  
  # fill vector with 1s 
  vec[ind] <- 1
  
  return(vec)
}
  
# repeats ts times
surv_times <- nc %>% 
  replicate(make_seq(max(ts), sp)) %>% 
  t() %>% as_data_frame()
names(surv_times) <- paste0("M", ts)

# calc initial register period
join <- apply(surv_times, 1, function(x) min(which(x == 1)))

# samples 
dat <- bind_cols(cust_info, join = join, surv_times)
head(dat)  
## # A tibble: 6 x 15
##   id    jobs    join    M1    M2    M3    M4    M5    M6    M7    M8    M9   M10
##   <chr> <chr>  <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 KB514 清潔工     1     1     1     0     1     0     0     0     0     0     0
## 2 KB562 清潔工     1     1     1     0     0     0     0     0     0     0     0
## 3 KB278 不動產/商…     4     0     0     0     1     1     1     0     0     0     0
## 4 KB625 清潔工     7     0     0     0     0     0     0     1     0     1     1
## 5 KB294 產品事業處…     4     0     0     0     1     1     0     1     0     0     0
## 6 KB917 財務或會計…     5     0     0     0     0     1     0     0     1     0     0
## # … with 2 more variables: M11 <dbl>, M12 <dbl>
# Active Customers ---------------------------------------------

calc_active_rate <- function(m) {
  dat %>% 
    gather(period, active, -c(id, jobs, join)) %>% 
    filter(join <= m, period == paste0("M", m)) %>% 
    summarise(rate = mean(active)) %>% 
    `[[`("rate")
}
active_rate <- map_dbl(ts, calc_active_rate)

# make plot
data_frame(ts, active_rate) %>% 
  ggplot(aes(ts, active_rate, group = 1)) +
  geom_point() +
  geom_line(col = "navyblue") + 
  scale_x_continuous(breaks = 1:12) +
  scale_y_continuous(labels = scales::percent) + 
  coord_cartesian(ylim = seq(0, 1, 0.05)) + 
  theme_minimal() +
  labs(x = "Month", y = "Active Customers %")

# Periodic Survival -------------------------------------------------------

# (active at period t + 1) / (active at period t)
rolling_active <- function(t) {
  
  # subset those who active at period t
  s = dat[dat[which(names(dat) == paste0("M", t))] == 1, ]
  
  # remain in the subsequent period
  r = s[which(names(s) == paste0("M", t + 1))] == 1
  
  # return
  sum(r) / nrow(s)
}

# (active at period t + 1) / (register at period t) 
rolling_retain <- function(t) {
  
  # subset those who join at period t
  s = subset(dat, join == t)
  
  # remain in the subsequent period
  r = s[which(names(s) == paste0("M", t + 1))] == 1
  
  # return
  sum(r) / nrow(s)
}

# put result into a data frame
res <- data_frame(
  ind = 1:(max(ts) - 1),
  grp_by_register_period = map_dbl(ind, rolling_retain),
  grp_by_active_period = map_dbl(ind, rolling_active)
) 

# make plot
res %>% 
  gather(var, val, -ind) %>% 
  ggplot(aes(ind, val, col = var)) + 
  geom_line() + 
  scale_color_discrete(labels = c("Group By Active Period", 
                                  "Group By Register Period")) +
  scale_x_continuous(breaks = ts) +
  scale_y_continuous(labels = scales::percent) +
  theme_minimal() +
  theme(legend.position = "bottom") +
  labs(x = "Month", y = "Retention Rate %", col = "")

# Cohort Analysis ---------------------------------------------------------

# export to Excel to do pivot table
file <- dat %>% 
  select(-jobs) %>% 
  gather(month, active, -c(id, join)) %>% 
  mutate(month = factor(str_remove(month, "^M")))
# write_csv(file, "some/path")