Squashed commit of the following:

commit bbe76d0 Author: Erik-Jan van Kesteren <[email protected]> Date: Wed Apr 17 14:01:11 2024 +0200 remove unneeded files commit e0f04d6 Author: Erik-Jan van Kesteren <[email protected]> Date: Wed Apr 17 13:54:05 2024 +0200 add col_select to duckdb solution commit 3a25ddd Merge: 6d778c7 212b0ee Author: Erik-Jan van Kesteren <[email protected]> Date: Wed Apr 17 13:29:40 2024 +0200 Merge branch 'solutions' of https://github.com/sodascience/workshop_efficient_microdata into solutions commit 6d778c7 Author: Erik-Jan van Kesteren <[email protected]> Date: Mon Apr 15 15:39:48 2024 +0200 add solutions commit 212b0ee Merge: 797ceea ee6b801 Author: Erik-Jan van Kesteren <[email protected]> Date: Mon Apr 15 16:09:14 2024 +0200 Merge branch 'main' into solutions commit 797ceea Author: Erik-Jan van Kesteren <[email protected]> Date: Mon Apr 15 15:39:48 2024 +0200 add solutions
sodascience · Apr 17, 2024 · 73bddca · 73bddca
1 parent 51b4323
commit 73bddca
Show file tree

Hide file tree

Showing 4 changed files with 130 additions and 74 deletions.
diff --git a/notes.txt b/notes.txt
diff --git a/questions.txt b/questions.txt
diff --git a/solutions/solution_chunked.R b/solutions/solution_chunked.R
@@ -0,0 +1,66 @@
+# Solution 2: manual chunks & online statistics to compute
+# mean and variance in a streaming way
+
+# Read one chunk of rows from an SPSS file, keeping only the three
+# columns that the wage statistics need.
+#
+# start_pos: number of data rows to skip before reading
+# chunksize: maximum number of rows to read
+# file:      path to the .sav file; defaults to the `spolis_loc` variable,
+#            which must then exist in the calling environment
+#
+# Returns whatever read_spss() returns for that row window. The chunk loop
+# in this script relies on a zero-row result to detect the end of the file.
+get_chunk <- function(start_pos = 0L, chunksize = 1e6, file = spolis_loc) {
+  read_spss(
+    file,
+    n_max = chunksize,
+    skip = start_pos,
+    col_select = c(SBASISLOON, SBASISUREN, SCONTRACTSOORT)
+  )
+}
+
+# Compute the per-group sufficient statistics for one chunk.
+#
+# df: data frame with columns SBASISLOON, SBASISUREN and SCONTRACTSOORT
+#
+# Returns one row per SCONTRACTSOORT value with
+#   sum: sum of the hourly wages
+#   ssq: sum of the squared hourly wages
+#   n:   number of non-missing hourly wages
+#
+# Missing hourly wages are left out of all three statistics. Without this a
+# single NA would turn the sum and ssq of its whole group into NA while still
+# being counted in n.
+compute_stats <- function(df) {
+  df |>
+    # the divisor is floored at 1 so that zero values do not divide by zero
+    mutate(hourlywage = SBASISLOON / pmax(SBASISUREN, 1)) |>
+    summarize(
+      sum = sum(hourlywage, na.rm = TRUE),
+      ssq = sum(hourlywage^2, na.rm = TRUE),
+      n = sum(!is.na(hourlywage)),
+      .by = SCONTRACTSOORT
+    )
+}
+
+# Loop over the file chunk by chunk. The small per-chunk summaries are
+# collected in a list and bound together once at the end, instead of
+# re-copying a growing result on every iteration.
+cur_pos <- 0L
+chunk_stats <- list()
+repeat {
+  chunk <- get_chunk(cur_pos)
+  # the final, empty chunk is summarised too: it contributes zero rows but
+  # keeps the column layout of `result` intact even for an empty input file
+  chunk_stats[[length(chunk_stats) + 1L]] <- compute_stats(chunk)
+  if (nrow(chunk) == 0) {
+    break
+  }
+  cur_pos <- cur_pos + nrow(chunk)
+  cat("Row:", cur_pos, "\r")
+}
+result <- bind_rows(chunk_stats)
+write_rds(result, "processed_data/chunked_result.rds")
+
+# One extra aggregation step: add up the per-chunk statistics so that there
+# is a single row per contract type, then derive the mean, the standard
+# error and an approximate 95% interval (mean +/- 1.96 standard errors).
+output <-
+  result |>
+  summarize(
+    sum = sum(sum),
+    ssq = sum(ssq),
+    # summed as double: an integer sum turns into NA above ~2.1e9 rows
+    n = sum(as.numeric(n)),
+    .by = SCONTRACTSOORT
+  ) |>
+  mutate(
+    mean = sum / n,
+    # Sample variance (n - 1 denominator), the same estimator that sd() uses.
+    # The numerator is clamped at 0 because ssq - sum^2 / n can come out
+    # slightly negative through floating-point cancellation, which would
+    # make sqrt() return NaN.
+    var  = pmax(ssq - sum^2 / n, 0) / (n - 1),
+    sd   = sqrt(var),
+    sem  = sd / sqrt(n),
+    lwr  = mean - 1.96*sem,
+    upr  = mean + 1.96*sem
+  )
+
+# Plot the mean per contract type as a point, with the lwr/upr interval
+# drawn as a vertical range through it.
+output |>
+  ggplot(
+    aes(
+      x = as_factor(SCONTRACTSOORT, levels = "labels"),
+      y = mean,
+      ymin = lwr,
+      ymax = upr
+    )
+  ) +
+  geom_pointrange() +
+  theme_linedraw() +
+  labs(
+    title = "Average wage per unit time for different contract types.",
+    x = "Contract type",
+    y = "Average wage"
+  )
+
diff --git a/solutions/solution_duckdb.R b/solutions/solution_duckdb.R
@@ -0,0 +1,64 @@
+# Solution 1: duckdb to the rescue!
+library(tidyverse)
+library(haven)
+library(duckdb)
+library(dbplyr)
+
+# First, read the whole table into a duckdb database. Do this in chunks of
+# chunk_size rows so that only one chunk is held in RAM at a time.
+spolis_loc <- "fake_cbs_data/Spolis/SPOLISBUS2022V2.sav"
+
+drv <- duckdb("processed_data/spolis.duckdb")
+dbc <- dbConnect(drv)
+
+# The database file persists between runs, so start from an empty table.
+# Otherwise re-running this script would append every row a second time and
+# silently double-count the data.
+if (dbExistsTable(dbc, "income")) {
+  dbRemoveTable(dbc, "income")
+}
+
+cur_pos <- 0L
+chunk_size <- 1e6
+repeat {
+  cur_df <- read_spss(
+    file = spolis_loc, n_max = chunk_size, skip = cur_pos,
+    col_select = c(
+      RINPERSOON, RINPERSOONS, SCONTRACTSOORT, SBASISLOON, SBASISUREN
+    )
+  )
+  # the zero-row chunk at the end of the file is written as well: it adds no
+  # rows, but it makes sure the table exists even for an empty input file
+  dbWriteTable(dbc, "income", cur_df, append = TRUE)
+  if (nrow(cur_df) == 0) {
+    break
+  }
+  cur_pos <- cur_pos + nrow(cur_df)
+  cat("Row:", cur_pos, "\r")
+}
+
+
+
+# Get a reference to the table we just created.
+income_tbl <- tbl(dbc, "income")
+
+# Summarise per contract type: mean, standard deviation and count, then the
+# standard error and an interval of mean +/- 1.96 standard errors. The
+# result goes straight into a point-range plot.
+income_tbl |>
+  summarize(
+    mean  = mean(SBASISLOON / pmax(1, SBASISUREN)),
+    stdev = sd(SBASISLOON / pmax(1, SBASISUREN)),
+    n     = n(),
+    .by   = SCONTRACTSOORT
+  ) |>
+  mutate(
+    sem = stdev / sqrt(n),
+    lwr = mean - 1.96*sem,
+    upr = mean + 1.96*sem
+  ) |>
+  ggplot(
+    aes(x = as_factor(SCONTRACTSOORT), y = mean, ymin = lwr, ymax = upr)
+  ) +
+  geom_pointrange() +
+  theme_linedraw() +
+  labs(
+    title = "Average wage per unit time for different contract types.",
+    x = "Contract type",
+    y = "Average wage"
+  )
+
+# Close the connection, then shut down the database driver.
+dbDisconnect(dbc)
+duckdb_shutdown(drv)