intro-to-data-science-21 · github-classroom · Dec 1, 2021 · Dec 4, 2021 · Dec 5, 2021 · Dec 5, 2021
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,10 @@
+.Rproj.user
+.Rhistory
+.RData
+.Ruserdata
+data/
+*.rds
+02_build_dashboard.R
+03_map.R
+03_plotlymap.R
+99_updates.R
diff --git a/00_load_data.R b/00_load_data.R
@@ -0,0 +1,126 @@
+library(pacman)
+p_load(wbstats, rio, tidyverse, magrittr)
+
+
+### get data from WDI: ----
+from <- 1990
+to <- 2020
+
+wdi <- wb_data(indicator = c(pop = "SP.POP.TOTL",
+                             area = "AG.LND.TOTL.K2",
+                             internet = "IT.NET.USER.ZS",
+                             import = "BM.GSR.GNFS.CD",
+                             export = "BX.GSR.GNFS.CD",
+                             fdi_in = "BX.KLT.DINV.CD.WD",
+                             fdi_out = "BM.KLT.DINV.CD.WD",
+                             tourism_in = "ST.INT.ARVL",
+                             tourism_out = "ST.INT.DPRT"),
+               start_date = from, end_date = to,
+               return_wide = TRUE) %>%
+  # Sum both directions per country-year (vectorised). If only one direction is
+  # reported, the other is assumed negligible (Schröder 2020); NA only if both are.
+  mutate(fdi = ifelse(is.na(fdi_in) & is.na(fdi_out),
+                      NA,
+                      rowSums(cbind(abs(fdi_in), abs(fdi_out)), na.rm = TRUE)),
+         trade = ifelse(is.na(import) & is.na(export),
+                        NA,
+                        rowSums(cbind(import, export), na.rm = TRUE)),
+         tourism = ifelse(is.na(tourism_in) & is.na(tourism_out),
+                          NA,
+                          rowSums(cbind(tourism_in, tourism_out), na.rm = TRUE)))
+
+### Indicators not available via WDI: ----
+#   - International telephone traffic (ITU)
+#   - International Meetings/Conferences (UIA)
+#   - International aircraft passengers (ICAO)
+
+### read the locally stored indicators (ICAO, UIA, phone traffic):
+icao <- rio::import("data/ICAO.xlsx")
+
+# UIA meetings: the columns `1990`..`2018` are reshaped into Year / value rows
+uia <- pivot_longer(
+  rio::import("data/UIA.xlsx", na.strings = c("NA", "..")),
+  cols = `1990`:`2018`,
+  names_to = "Year", names_transform = list(Year = as.integer),
+  values_to = "int_meetings")
+
+# phone traffic: columns `1990_value`..`2017_value`; each name is cut after four
+# characters into the year and a leftover part that is stored in `drop`
+phone <- pivot_longer(
+  rio::import("data/phone.xlsx", which = "total total",
+              na.strings = c("NA", "..")),
+  cols = `1990_value`:`2017_value`,
+  names_to = c("Year", "drop"), names_sep = 4,
+  names_transform = list(Year = as.integer), values_to = "int_phone_minutes")
+
+
+# combine the local sources: phone and ICAO are matched on year and country
+# name, then UIA is right-joined onto that result on year and country code
+other_sources <- phone %>%
+  full_join(icao, by = c("Year", "Country" = "Name")) %>%
+  right_join(x = uia, y = ., by = c("Year", "Code" = "State")) %>%
+  select(Code, Year, Int_Departures, int_phone_minutes, int_meetings)
+
+### join wdi and others: ----
+# WDI rows and local rows are matched on year and three-letter country code
+data_raw <- wdi %>%
+  full_join(other_sources, by = c("date" = "Year", "iso3c" = "Code"))
+
+
+
+### Even more valid indicators (see Schröder 2020): ----
+# Note: Some of this data is not publicly available.
+#   The processed data can be found in folder /data_processed.
+#   Raw data for replication is available upon request.
+
+#   - Replace air passengers with international revenue passenger kilometres (ICAO)
+#     (sheet "Int. RPK clean"; columns `1990`..`2017` are reshaped to long format)
+RPK <- pivot_longer(
+  rio::import("data/ICAO_RPK.xlsx", which = "Int. RPK clean",
+              na.strings = c("NA", "..")),
+  cols = `1990`:`2017`,
+  names_to = "date", names_transform = list(date = as.integer),
+  values_to = "int_rpk")
+
+# attach the RPK values to the panel by country code and year
+data_raw <- left_join(data_raw, RPK, by = c("iso3c" = "Code", "date"))
+
+#   - Replace number of internet users with internationally transferred bandwidth (ITU)
+#     (sheet "int. IT bandwidth"; the `1988` and `1989` columns are not reshaped
+#     and are dropped afterwards)
+international_internet <- rio::import("data/ITU.xlsx",
+                                      which = "int. IT bandwidth",
+                                      na.strings = c("NA", ".."))
+international_internet <- international_internet %>%
+  pivot_longer(cols = `1990`:`2017`, names_to = "date", values_to = "int_mbits",
+               names_transform = list(date = as.integer)) %>%
+  select(-c(`1988`, `1989`))
+
+data_raw <- left_join(data_raw, international_internet, by = c("country" = "Country", "date"))
+
+#   - Extend Trade in goods & services with primary income (WDI)
+data_raw <- wbstats::wb_data(
+  indicator = c(import_g_s_pi = "BM.GSR.TOTL.CD", export_g_s_pi = "BX.GSR.TOTL.CD"),
+  start_date = from, end_date = to, return_wide = TRUE) %>%
+  select(-iso2c, -country) %>%
+  # if only one direction is reported, the other is assumed to be negligible
+  # (see Schröder 2020); the sum is NA only when both directions are missing
+  mutate(trade_g_s_pi = ifelse(
+    is.na(import_g_s_pi) & is.na(export_g_s_pi),
+    NA,
+    rowSums(cbind(import_g_s_pi, export_g_s_pi), na.rm = TRUE))) %>%
+  # keep the result row-wise, as before, so later steps see the same class
+  rowwise() %>%
+  right_join(data_raw, by = c("iso3c", "date"))
+
+
+#   - create communication technology indicator reflecting technological change relevant for globalization:
+#     - until 2005: phone traffic correlates highly with all other globalization indicators while internet does not
+#     - from 2006: the other way round, hence comtech takes:
+#       - telephone traffic prior to 2006
+#       - internet traffic from 2006
+data_raw <- data_raw %>%
+  mutate(comtech = ifelse(date >= 2006, int_mbits, int_phone_minutes))
+
+# however, coverage (esp. in terms of years) is not that good for some
+# indicators, hence we offer both indices.
diff --git a/01_build_index.R b/01_build_index.R
@@ -0,0 +1,159 @@
+source("00_load_data.R")
+p_load(tidyverse, 
+       magrittr, 
+       stats, 
+       magrittr)
+options(scipen = 999)
+
+### normalize data:
+
+# by population: identifier columns come first, then every column from `import`
+# through `int_meetings` is divided by `pop`.
+data_pc <- data_raw %>%
+  select(country, iso3c, date, area, pop,
+         import,
+         int_rpk:comtech,
+         everything(),
+         -iso2c) %>%
+  mutate(across(import:int_meetings,
+                ~ .x / pop)) %>%
+  # excluding rows with no information. The elementwise `&` is required here:
+  # scalar `&&` only works while the data happen to be row-wise and otherwise
+  # errors (R >= 4.3) or looks at the first row only (older R).
+  filter(!is.na(iso3c) & !is.na(country))
+
+
+
+
+### panel normalization:
+
+# flag small states (as defined by Kessler 2016): population below 1,000,000
+# or area below 3000. They are excluded later, when the minimum and maximum
+# used for normalization are determined.
+#
+# The comparison yields NA for some rows:
+  # NAs <- filter(data_pc, is.na(small))
+  # some years for Kosovo, South Sudan, Eritrea, and Kuwait, none of them being a small state according to the definition
+# hence a missing flag is set to FALSE.
+
+data_pc <- data_pc %>%
+  mutate(small = coalesce(pop < 1000000 | area < 3000, FALSE))
+
+# long format: one row per original row and variable
+data_pc_long <- pivot_longer(data_pc,
+                             cols = import:int_meetings,
+                             names_to = "variable", values_to = "value")
+
+# columns from which the quartiles for the outlier rule are derived.
+# For now all years (1990-2020) are used; the year filter is kept so that the
+# window can be adapted later to avoid biases induced by skewed patterns of
+# data availability.
+normal_range <- data_pc %>%
+  filter(date >= 1990, date <= 2020) %>%
+  select(import:int_meetings)
+
+# quartiles and IQR per variable, joined onto every row of the long data
+distribution_step1 <- normal_range %>%
+  apply(2, summary) %>%
+  as.data.frame() %>%
+  t() %>%
+  as.data.frame() %>%
+  mutate(iqr = `3rd Qu.` - `1st Qu.`) %>%
+  select(lower_quartile = `1st Qu.`, upper_quartile = `3rd Qu.`, iqr) %>%
+  rownames_to_column(var = "variable") %>%
+  right_join(data_pc_long, by = "variable")
+
+# exclude small states and extreme outliers before defining max/min:
+# values from small states or beyond 3 * IQR outside the quartiles become NA
+distribution_step2 <- distribution_step1 %>%
+  mutate(value = ifelse(small == TRUE |
+                          value < lower_quartile - 3 * iqr |
+                          value > upper_quartile + 3 * iqr,
+                        NA, value)) %>%
+  select(variable, value) %>%
+  # the row id only makes each row unique for pivot_wider; it is dropped again
+  rownames_to_column(var = "unique_identifier_i_actually_dont_need") %>%
+  pivot_wider(names_from = "variable", values_from = "value") %>%
+  select(-unique_identifier_i_actually_dont_need)
+
+# minimum and maximum per variable (after the exclusions), joined to the long data
+distribution <- distribution_step2 %>%
+  apply(2, summary) %>%
+  as.data.frame() %>%
+  t() %>%
+  as.data.frame() %>%
+  select(minimum = Min., maximum = Max.) %>%
+  rownames_to_column(var = "variable")
+
+data_pc_long <- left_join(data_pc_long, distribution, by = "variable")
+
+# min-max scaling to 0-100 with the range taken from data without extreme outliers
+# (outside 25% quantile - 3 * IQR / 75% quantile + 3 * IQR, see Schröder 2020); outliers are capped at 0 / 100
+data_normalized_long <- data_pc_long %>%
+  mutate(normalized = (value - minimum) / (maximum - minimum) * 100,
+         normalized = case_when(normalized < 0 ~ 0,
+                                normalized > 100 ~ 100,
+                                TRUE ~ normalized))
+
+
+# indicators that each get a normalized column of their own:
+indicators <- c("internet", "fdi", "trade", "tourism", "Int_Departures", "int_phone_minutes", "int_meetings", "comtech", "int_rpk", "trade_g_s_pi")
+
+# start from the identifier columns and the small-state flag
+data_normalized <- data_pc %>%
+  select(country:pop, small)
+
+# Append the normalized values of each indicator as a column that is named
+# after the indicator as soon as it is created. This replaces the positional
+# rename of the auto-generated `normalized...7` to `normalized...16` columns,
+# which failed as soon as the number of leading columns changed.
+# The resulting column names and their order are the same as before.
+# NOTE: rows are matched purely by position, so this relies on
+# data_normalized_long keeping the row order of data_pc within each variable.
+for (indicator_name in indicators) {
+  data_normalized %<>%
+    bind_cols(data_normalized_long %>%
+                filter(variable == indicator_name) %>%
+                select(normalized) %>%
+                rename(!!indicator_name := normalized))
+}
+
+### Combining to index ----
+
+# all variables are theoretically valid, load strongly on a common factor and are highly intercorrelated (Kessler 2016, Schröder 2020)
+# therefore the index can be constructed simply by taking the average of all available normalized variables:
+#
+#   KGI_original / KGI_new   row-wise mean of the listed normalized indicators,
+#                            ignoring missing values
+#   n_vars_original / _new   number of non-missing indicators per row
+#
+# Rows without any available indicator get NaN as mean and 0 as count.
+#
+# The counts are computed as named columns inside the same row-wise mutate()
+# instead of being appended via bind_cols() and renamed from the positional
+# names `...19` / `...20`, which depended on the exact number of columns.
+#
+# NOTE(review): n_vars_original counts Int_Departures (as the former
+# internet:int_meetings column range did), while KGI_original averages int_rpk
+# instead - confirm which of the two is intended.
+
+index <- data_normalized %>%
+  rowwise() %>%
+  mutate(KGI_original = mean(c(internet, fdi, trade, tourism, int_rpk,
+                               int_phone_minutes, int_meetings),
+                             na.rm = TRUE),
+         KGI_new = mean(c(comtech, fdi, trade_g_s_pi, tourism, int_rpk,
+                          int_meetings),
+                        na.rm = TRUE),
+         n_vars_original = sum(!is.na(c(internet, fdi, trade, tourism,
+                                        Int_Departures, int_phone_minutes,
+                                        int_meetings))),
+         # the same six indicators that enter KGI_new
+         n_vars_new = sum(!is.na(c(fdi, tourism, int_meetings, comtech,
+                                   int_rpk, trade_g_s_pi))))
+
+### save processed data ----
+dir.create("data_processed", showWarnings = FALSE) # an existing folder is fine
+save(index, file = "data_processed/KGI.Rdata")
+rio::export(index, file = "data_processed/KGI.csv")
+rio::export(index, file = "data_processed/KGI.xlsx")