Skip to content

Commit

Permalink
Squashed commit of the following:
Browse files Browse the repository at this point in the history
commit bbe76d0
Author: Erik-Jan van Kesteren <[email protected]>
Date:   Wed Apr 17 14:01:11 2024 +0200

    remove unneeded files

commit e0f04d6
Author: Erik-Jan van Kesteren <[email protected]>
Date:   Wed Apr 17 13:54:05 2024 +0200

    add col_select to duckdb solution

commit 3a25ddd
Merge: 6d778c7 212b0ee
Author: Erik-Jan van Kesteren <[email protected]>
Date:   Wed Apr 17 13:29:40 2024 +0200

    Merge branch 'solutions' of https://github.com/sodascience/workshop_efficient_microdata into solutions

commit 6d778c7
Author: Erik-Jan van Kesteren <[email protected]>
Date:   Mon Apr 15 15:39:48 2024 +0200

    add solutions

commit 212b0ee
Merge: 797ceea ee6b801
Author: Erik-Jan van Kesteren <[email protected]>
Date:   Mon Apr 15 16:09:14 2024 +0200

    Merge branch 'main' into solutions

commit 797ceea
Author: Erik-Jan van Kesteren <[email protected]>
Date:   Mon Apr 15 15:39:48 2024 +0200

    add solutions
  • Loading branch information
vankesteren committed Apr 17, 2024
1 parent 51b4323 commit 73bddca
Show file tree
Hide file tree
Showing 4 changed files with 130 additions and 74 deletions.
34 changes: 0 additions & 34 deletions notes.txt

This file was deleted.

40 changes: 0 additions & 40 deletions questions.txt

This file was deleted.

66 changes: 66 additions & 0 deletions solutions/solution_chunked.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
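# Solution 2: manual chunks & online statistics to compute
# the mean and variance in a streaming way.
# NOTE(review): this script does not attach any packages or define
# `spolis_loc` itself; it appears to assume that tidyverse and haven are
# already attached and that `spolis_loc` points at the SPOLIS .sav file
# (as set up in solution_duckdb.R) -- confirm before running standalone.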

# Read one slice of the SPOLIS file, keeping only the columns required
# for the wage statistics.
#
# start_pos: number of rows to skip before reading (forwarded as `skip`).
# chunksize: maximum number of rows to read (forwarded as `n_max`).
#
# The file path is taken from the global `spolis_loc`. Returns the result
# of read_spss() for the requested slice.
get_chunk <- function(start_pos = 0L, chunksize = 1e6) {
  read_spss(
    file = spolis_loc,
    col_select = c(SBASISLOON, SBASISUREN, SCONTRACTSOORT),
    skip = start_pos,
    n_max = chunksize
  )
}

# Per-contract-type sufficient statistics for one chunk.
#
# df: a data frame with columns SBASISLOON, SBASISUREN and SCONTRACTSOORT.
#
# Returns one row per SCONTRACTSOORT value with the sum of the hourly
# wage (`sum`), the sum of its squares (`ssq`) and the row count (`n`).
compute_stats <- function(df) {
  # the divisor is floored at 1, which also rules out division by zero
  with_wage <- mutate(df, hourlywage = SBASISLOON / pmax(SBASISUREN, 1))

  summarize(
    with_wage,
    sum = sum(hourlywage),
    ssq = sum(hourlywage^2),
    n = n(),
    .by = SCONTRACTSOORT
  )
}

# Loop over the file chunk by chunk and collect the per-chunk statistics.
# The statistics are gathered in a list and bound together once at the
# end, instead of re-binding a growing data frame on every iteration.
cur_pos <- 0L
chunk <- get_chunk(cur_pos)
# the first chunk is always summarised (even if empty) so that `result`
# has the expected columns when the file contains no rows
chunk_stats <- list(compute_stats(chunk))
while (nrow(chunk) != 0) {
  cur_pos <- cur_pos + nrow(chunk)
  cat("Row:", cur_pos, "\r")
  chunk <- get_chunk(cur_pos)
  # an empty chunk marks the end of the file; nothing to aggregate
  if (nrow(chunk) > 0) {
    chunk_stats[[length(chunk_stats) + 1L]] <- compute_stats(chunk)
  }
}
cat("\n") # terminate the carriage-return progress line
result <- bind_rows(chunk_stats)

# make sure the output directory exists before writing the intermediate
# (one row per chunk and contract type) result
dir.create("processed_data", showWarnings = FALSE, recursive = TRUE)
write_rds(result, "processed_data/chunked_result.rds")

# `result` holds one row per chunk and contract type, so one extra
# aggregation step is needed: add up the per-chunk sums, sums of squares
# and counts, then derive the mean, spread and a 95% normal-approximation
# interval per contract type.
output <-
  result |>
  summarize(
    sum = sum(sum),
    ssq = sum(ssq),
    n = sum(n),
    .by = SCONTRACTSOORT
  ) |>
  mutate(
    mean = sum / n,
    # Sample variance (n - 1 denominator), i.e. what sd() computes.
    # The numerator is clamped at 0 because floating-point cancellation
    # can make it slightly negative, which would turn sqrt() into NaN.
    # Groups with a single observation get NaN (variance is undefined).
    var = pmax(ssq - sum^2 / n, 0) / (n - 1),
    sd = sqrt(var),
    sem = sd / sqrt(n),
    lwr = mean - 1.96 * sem,
    upr = mean + 1.96 * sem
  )

# Plot the mean wage per contract type as a point with its interval
# (lwr to upr) as a vertical range.
ggplot(
  output,
  aes(
    x = as_factor(SCONTRACTSOORT, levels = "labels"),
    y = mean,
    ymin = lwr,
    ymax = upr
  )
) +
  geom_pointrange() +
  labs(
    title = "Average wage per unit time for different contract types.",
    x = "Contract type",
    y = "Average wage"
  ) +
  theme_linedraw()

64 changes: 64 additions & 0 deletions solutions/solution_duckdb.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# Solution 1: duckdb to the rescue!
library(tidyverse)
library(haven)
library(duckdb)
library(dbplyr)

# First, read the whole table into a duckdb database. Do this in chunks
# to ensure low RAM usage.
spolis_loc <- "fake_cbs_data/Spolis/SPOLISBUS2022V2.sav"

# the database file lives in processed_data/, so make sure it exists
dir.create("processed_data", showWarnings = FALSE, recursive = TRUE)
drv <- duckdb("processed_data/spolis.duckdb")
dbc <- dbConnect(drv)

chunk_size <- 1e6

# Read up to `chunk_size` rows of the required columns, skipping the
# first `start_pos` rows of the file.
read_income_chunk <- function(start_pos) {
  read_spss(
    file = spolis_loc, n_max = chunk_size, skip = start_pos,
    col_select = c(
      RINPERSOON, RINPERSOONS, SCONTRACTSOORT, SBASISLOON, SBASISUREN
    )
  )
}

cur_pos <- 0L
cur_df <- read_income_chunk(cur_pos)
# The database is file-backed and therefore survives between runs.
# Overwrite the table with the first chunk so that re-running the script
# does not append a second copy of every row.
dbWriteTable(dbc, "income", cur_df, overwrite = TRUE)
while (nrow(cur_df) != 0) {
  cur_pos <- cur_pos + nrow(cur_df)
  cat("Row:", cur_pos, "\r")
  cur_df <- read_income_chunk(cur_pos)
  # an empty chunk marks the end of the file; nothing to append
  if (nrow(cur_df) > 0) {
    dbWriteTable(dbc, "income", cur_df, append = TRUE)
  }
}
cat("\n") # terminate the carriage-return progress line



# connect to the table we just created
income_tbl <- tbl(dbc, "income")

# Compute the mean wage, its standard deviation and a 95%
# normal-approximation interval per contract type inside the database,
# then pull the small summary table into R and plot it.
income_tbl |>
  # compute the wage once; the divisor is floored at 1
  mutate(hourlywage = SBASISLOON / pmax(1, SBASISUREN)) |>
  summarize(
    # SQL aggregates always skip missing values; na.rm = TRUE states
    # that explicitly and avoids the dbplyr warning about it
    mean = mean(hourlywage, na.rm = TRUE),
    stdev = sd(hourlywage, na.rm = TRUE),
    n = n(),
    .by = SCONTRACTSOORT
  ) |>
  mutate(
    stderr = stdev / sqrt(n),
    lower = mean - 1.96 * stderr,
    upper = mean + 1.96 * stderr
  ) |>
  # materialise the query result explicitly before handing it to ggplot
  collect() |>
  ggplot(aes(
    x = as_factor(SCONTRACTSOORT),
    y = mean,
    ymax = upper,
    ymin = lower
  )) +
  geom_pointrange() +
  labs(
    x = "Contract type",
    y = "Average wage",
    title = "Average wage per unit time for different contract types."
  ) +
  theme_linedraw()

# Clean up: close the connection first, then shut down the duckdb
# driver instance that was created for the database file.
dbDisconnect(dbc)
duckdb_shutdown(drv)

0 comments on commit 73bddca

Please sign in to comment.