The Warner Music Group launched a new company in April, Warner Music South Asia targeting region markets that include Bangladesh, Nepal, Pakistan, and Sri Lanka. The company thought that the region was populated and diverse. I will analyze the reasons behind that decision based on some economic data related to South Asia.
library(tidyverse) library(tidyquant) library(timetk) #Population Ages 15 to 64 for South Asia (percent of total) #(https://fred.stlouisfed.org/series/SPPOP1564TOZSSAS) df_pop_15_64 <- tq_get("SPPOP1564TOZSSAS", get = "economic.data", from = "1960-01-01") %>% rename(value = price) #Ratio of Female to Male Secondary School Enrollment for South Asia #(Ratio of girls to boys) #(https://fred.stlouisfed.org/series/SEENRSECOFMZSSAS) df_female_to_male_secondary_enrollment <- tq_get("SEENRSECOFMZSSAS", get = "economic.data", from = "1960-01-01") %>% rename(value = price) #Constant GDP per capita for South Asia (2010 U.S. Dollars) #(https://fred.stlouisfed.org/series/NYGDPPCAPKDSAS) df_gdp_per_capita <- tq_get("NYGDPPCAPKDSAS", get = "economic.data", from = "1960-01-01") %>% rename(value = price) #Mobile Cellular Subscriptions in South Asia (Number per 100 People) #(https://fred.stlouisfed.org/series/ITCELSETSP2SAS) df_mobile_phone <- tq_get("ITCELSETSP2SAS", get = "economic.data", from = "1975-01-01") %>% rename(value = price) #Youth Unemployment Rate for South Asia (Percent) #(https://fred.stlouisfed.org/series/SLUEM1524ZSSAS) df_youth_unemployment <- tq_get("SLUEM1524ZSSAS", get = "economic.data", from = "1991-01-01") %>% rename(value = price) #Merging all data sets df_merged <- rbind( df_pop_15_64, df_gdp_per_capita, df_youth_unemployment, df_mobile_phone, df_female_to_male_secondary_enrollment )
Now we will draw all the variables above together to compare and find an insight.
#Plot the time series with facets p <- plot_time_series( .data = df_merged, .date_var = date, .value = value, .facet_vars = symbol, .title = "South Asia", .interactive = FALSE ) #Custom titles for each facet custom_labels <- c("SPPOP1564TOZSSAS" = "Population Ages 15 to 64 (Percent of total)", "NYGDPPCAPKDSAS" = "Constant GDP per capita (2010 U.S. Dollars)", "SLUEM1524ZSSAS" = "Youth Unemployment Rate (Percent)", "ITCELSETSP2SAS" = "Mobile Cellular Subscriptions (Number per 100 People)", "SEENRSECOFMZSSAS" = "Ratio of Female to Male Secondary School Enrollment") #Use `facet_wrap` with a custom label p + facet_wrap(~symbol, scales = "free", ncol = 2, labeller = as_labeller(custom_labels)) + theme(text = element_text(family = "Bricolage Grotesque", face = "bold"))
It is seen that mobile phone ownership has risen sharply since 2000 which is critical for a company whose main revenue is mostly streaming. Besides that, all other variables seem plausible to an investable market.
Now, we will model the GDP per capita based on other variables to find which factors impact the most on the development of the region’s economy. In order to model, we will use polynomial support vector machines (SVMs) via kernlab engine.
#Merging data for modeling #By renaming the predictor and target variables df_mod_merged <- df_gdp_per_capita %>% rename(gdp_per_capita = value) %>% select(-symbol) %>% left_join(df_pop_15_64 %>% rename(pop_15_64 = value) %>% select(-symbol)) %>% left_join(df_youth_unemployment %>% rename(youth_unemployment = value) %>% select(-symbol)) %>% left_join(df_mobile_phone %>% rename(mobile_phone = value) %>% select(-symbol)) %>% left_join(df_female_to_male_secondary_enrollment %>% rename(female_to_male_secondary_enrollment = value) %>% select(-symbol)) %>% drop_na() #Bootstraping for tuning library(tidymodels) set.seed(12345) df_folds <- bootstraps(df_mod_merged, times = 1000) #Preprocessing df_rec <- recipe(gdp_per_capita ~ ., data = df_mod_merged) %>% step_date(date, features = c("year")) %>% step_rm(date) %>% step_normalize(all_numeric()) #Processed data df_rec %>% prep() %>% bake(new_data = NULL) #Model: Polynomial support vector machines (SVMs) via kernlab df_spec <- svm_poly( cost = tune(), degree = tune(), scale_factor = tune(), margin = tune() ) %>% set_engine("kernlab") %>% set_mode("regression") #Building workflow sets library(workflowsets) wflow_svm <- workflow_set( preproc = list(formula = df_rec), models = list(SVM_poly = df_spec) ) #Tuning and evaluating all the models grid_ctrl <- control_grid( save_pred = TRUE, parallel_over = "everything", save_workflow = TRUE ) grid_results <- wflow_svm %>% workflow_map( seed = 98765, resamples = df_folds, grid = 10, control = grid_ctrl ) #Accuracy of the grid results grid_results %>% rank_results(select_best = TRUE, rank_metric = "rsq") %>% select(Models = wflow_id, .metric, mean) # A tibble: 2 × 3 # Models .metric mean # <chr> <chr> <dbl> #1 formula_SVM_poly rmse 0.166 #2 formula_SVM_poly rsq 0.989 #Finalizing the model with the best parameters svm_best_param <- grid_results %>% extract_workflow_set_result("formula_SVM_poly") %>% select_best(metric = "rsq") svm_fit_wflw <- grid_results %>% extract_workflow("formula_SVM_poly") %>% finalize_workflow(svm_best_param) %>% fit(df_mod_merged) #Calibrate the fitted workflow to the full data set library(modeltime) calibration_tbl_svm <- svm_fit_wflw %>% modeltime_calibrate(new_data = df_mod_merged) #Accuracy of the finalized workflow calibration_tbl_svm %>% modeltime_accuracy(metric_set = metric_set(rmse,rsq)) # A tibble: 1 × 5 # .model_id .model_desc .type rmse rsq # <int> <chr> <chr> <dbl> <dbl> # 1 1 KERNLAB Test 0.136 0.990
Since our model’s accuracy results look sharp, we can move on to the permutation-based variable importance.
#Variable importance library(DALEXtra) #Fitted workflow for KNN set.seed(98765) knn_wflow_fitted <- workflow() %>% add_recipe(rec_features) %>% add_model(spec_knn) %>% fit(df_train) #Processed data frame for variable importance calculation imp_data <- df_rec %>% prep() %>% bake(new_data = NULL) #Explainer object explainer_svm <- explain_tidymodels( svm_fit_wflw %>% extract_fit_parsnip(), data = imp_data %>% select(-gdp_per_capita), y = imp_data$gdp_per_capita, label = "", verbose = FALSE ) #Calculating permutation-based variable importance set.seed(1983) vip_svm <- model_parts(explainer_svm, loss_function = loss_root_mean_square, type = "difference", B = 100,#the number of permutations label = "") #Variable importance plot vip_svm %>% mutate(variable = case_when( variable == "date_year" ~ "Year", variable == "pop_15_64" ~ "Population Ages 15 to 64", variable == "youth_unemployment" ~ "Youth Unemployment", variable == "mobile_phone" ~ "Mobile Phone Ownership", variable == "female_to_male_secondary_enrollment" ~ "Female to Male Secondary School Enrollment", TRUE ~ variable )) %>% plot() + scale_y_continuous(expand = expansion(mult = c(.1, .1))) + labs(color = "", x = "", y = "", subtitle = "Higher indicates more important", title = "Factors Affecting GDP per capita for South Asia") + theme_minimal(base_family = "Bricolage Grotesque", base_size = 16) + theme(legend.position = "none", plot.title = element_text(hjust = 0.5, size = 14, face = "bold"), plot.subtitle = element_text(hjust = 0.5, size = 12), panel.grid.minor.x = element_blank(), panel.grid.major.y = element_blank(), plot.background = element_rect(fill = "#ffc900"))
The boxplot of the female-to-male ratio in secondary enrollment includes zero, indicating no significant impact of this ratio on GDP per capita. On the other hand, population, youth employment, and mobile phone ownership seem to have a significant impact on GDP per capita in South Asia.