-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
commit bbe76d0 Author: Erik-Jan van Kesteren <[email protected]> Date: Wed Apr 17 14:01:11 2024 +0200 remove unneeded files commit e0f04d6 Author: Erik-Jan van Kesteren <[email protected]> Date: Wed Apr 17 13:54:05 2024 +0200 add col_select to duckdb solution commit 3a25ddd Merge: 6d778c7 212b0ee Author: Erik-Jan van Kesteren <[email protected]> Date: Wed Apr 17 13:29:40 2024 +0200 Merge branch 'solutions' of https://github.com/sodascience/workshop_efficient_microdata into solutions commit 6d778c7 Author: Erik-Jan van Kesteren <[email protected]> Date: Mon Apr 15 15:39:48 2024 +0200 add solutions commit 212b0ee Merge: 797ceea ee6b801 Author: Erik-Jan van Kesteren <[email protected]> Date: Mon Apr 15 16:09:14 2024 +0200 Merge branch 'main' into solutions commit 797ceea Author: Erik-Jan van Kesteren <[email protected]> Date: Mon Apr 15 15:39:48 2024 +0200 add solutions
- Loading branch information
1 parent
51b4323
commit 73bddca
Showing
4 changed files
with
130 additions
and
74 deletions.
There are no files selected for viewing
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
# Solution 2: manual chunks & online statistics to compute | ||
# mean and variance in a streaming way | ||
|
||
# Read one slice of the SPSS file at `spolis_loc`, keeping only the three
# columns needed for the wage statistics.
#   start_pos: number of rows to skip before reading starts
#   chunksize: maximum number of rows to read
# Returns the data frame produced by read_spss() for that slice.
# NOTE: `spolis_loc` is not an argument; it must already be defined in the
# calling environment.
get_chunk <- function(start_pos = 0L, chunksize = 1e6) {
  read_spss(
    file = spolis_loc,
    col_select = c(SBASISLOON, SBASISUREN, SCONTRACTSOORT),
    skip = start_pos,
    n_max = chunksize
  )
}
|
||
# Per-chunk sufficient statistics for the hourly wage.
#   df: data frame with columns SBASISLOON, SBASISUREN and SCONTRACTSOORT
# The hourly wage is SBASISLOON divided by SBASISUREN, where hours below 1
# are replaced by 1 so the denominator is never smaller than 1.
# Returns one row per SCONTRACTSOORT value with the sum (`sum`), the sum of
# squares (`ssq`) and the row count (`n`) of the hourly wage.
compute_stats <- function(df) {
  with_wage <- mutate(df, hourlywage = SBASISLOON / pmax(SBASISUREN, 1))
  summarize(
    with_wage,
    sum = sum(hourlywage),
    ssq = sum(hourlywage^2),
    n = n(),
    .by = SCONTRACTSOORT
  )
}
|
||
# Stream through the file chunk by chunk, computing the partial statistics
# of every chunk. The partial results are gathered in a list and bound
# together once at the end, instead of re-copying a growing data frame with
# bind_rows() on every iteration.
cur_pos <- 0L
partials <- list()
repeat {
  chunk <- get_chunk(cur_pos)
  # The stats of the terminating (empty) chunk are stored as well, exactly
  # as before: this keeps the expected columns in `result` even when the
  # file holds no rows at all.
  partials[[length(partials) + 1L]] <- compute_stats(chunk)
  if (nrow(chunk) == 0) {
    break
  }
  cur_pos <- cur_pos + nrow(chunk)
  # "\r" rewrites the same console line, giving a simple progress counter
  cat("Row:", cur_pos, "\r")
}
result <- bind_rows(partials)

# Store the per-chunk partial statistics on disk.
write_rds(result, "processed_data/chunked_result.rds")
|
||
# `result` holds one row per (chunk, contract type), so one extra
# aggregation step is needed: add up the partial sums per contract type and
# derive the mean, spread and an approximate 95% interval from them.
output <-
  result |>
  summarize(
    sum = sum(sum),
    ssq = sum(ssq),
    n = sum(n),
    .by = SCONTRACTSOORT
  ) |>
  mutate(
    mean = sum / n,
    # Sample variance (n - 1 denominator), matching what sd() reports.
    # The centred sum of squares is clamped at 0 because floating-point
    # cancellation in `ssq - sum^2 / n` can make it slightly negative,
    # which would turn sqrt() into NaN.
    var = pmax(ssq - sum^2 / n, 0) / (n - 1),
    sd = sqrt(var),
    sem = sd / sqrt(n),
    # normal-approximation 95% interval around the mean
    lwr = mean - 1.96 * sem,
    upr = mean + 1.96 * sem
  )
|
||
# Plot the mean hourly wage per contract type as a point, with the
# lwr/upr interval drawn as a vertical range. The contract type is shown
# via as_factor(..., levels = "labels").
ggplot(
  data = output,
  mapping = aes(
    x = as_factor(SCONTRACTSOORT, levels = "labels"),
    y = mean,
    ymin = lwr,
    ymax = upr
  )
) +
  geom_pointrange() +
  labs(
    title = "Average wage per unit time for different contract types.",
    x = "Contract type",
    y = "Average wage"
  ) +
  theme_linedraw()
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
# Solution 1: duckdb to the rescue! | ||
library(tidyverse) | ||
library(haven) | ||
library(duckdb) | ||
library(dbplyr) | ||
|
||
# The whole SPSS table is loaded into a DuckDB database first. This is
# done in chunks so that RAM usage stays low.
# Path of the SPSS source file:
spolis_loc <- "fake_cbs_data/Spolis/SPOLISBUS2022V2.sav"

# Create the DuckDB driver for the database file and open a connection.
drv <- duckdb(dbdir = "processed_data/spolis.duckdb")
dbc <- dbConnect(drv = drv)
|
||
# Read up to `chunk_size` rows of the SPSS file after skipping `start_pos`
# rows, keeping only the columns that are stored in the database.
read_income_chunk <- function(start_pos) {
  read_spss(
    file = spolis_loc, n_max = chunk_size, skip = start_pos,
    col_select = c(
      RINPERSOON, RINPERSOONS, SCONTRACTSOORT, SBASISLOON, SBASISUREN
    )
  )
}

# Start from a clean table: the database file persists between runs, so
# appending to a leftover "income" table would store every row twice and
# inflate the counts used for the standard errors.
if (dbExistsTable(dbc, "income")) {
  dbRemoveTable(dbc, "income")
}

# Copy the file into the "income" table one chunk at a time.
cur_pos <- 0L
chunk_size <- 1e6
repeat {
  cur_df <- read_income_chunk(cur_pos)
  # The terminating (empty) chunk is written too, as before, so the table
  # is created even when the file holds no rows.
  dbWriteTable(dbc, "income", cur_df, append = TRUE)
  if (nrow(cur_df) == 0) {
    break
  }
  cur_pos <- cur_pos + nrow(cur_df)
  # "\r" rewrites the same console line, giving a simple progress counter
  cat("Row:", cur_pos, "\r")
}
|
||
|
||
|
||
# connect to the table we just created
income_tbl <- tbl(dbc, "income")

# Aggregate inside the database, pull the small per-contract-type summary
# into R, then add the interval bounds and plot.
income_tbl |>
  # hourly wage, with hours below 1 replaced by 1; computed once and reused
  mutate(hourlywage = SBASISLOON / pmax(1, SBASISUREN)) |>
  summarize(
    # SQL aggregates always drop missing values; na.rm = TRUE states that
    # explicitly and silences the dbplyr warning about it
    mean = mean(hourlywage, na.rm = TRUE),
    stdev = sd(hourlywage, na.rm = TRUE),
    n = n(),
    .by = SCONTRACTSOORT
  ) |>
  # fetch the aggregated rows explicitly rather than relying on ggplot()
  # to collect the lazy table
  collect() |>
  mutate(
    stderr = stdev / sqrt(n),
    # normal-approximation 95% interval around the mean
    lower = mean - 1.96 * stderr,
    upper = mean + 1.96 * stderr
  ) |>
  ggplot(aes(
    x = as_factor(SCONTRACTSOORT),
    y = mean,
    ymax = upper,
    ymin = lower
  )) +
  geom_pointrange() +
  labs(
    x = "Contract type",
    y = "Average wage",
    title = "Average wage per unit time for different contract types."
  ) +
  theme_linedraw()

# close the connection and shut down the database driver
dbDisconnect(dbc)
duckdb_shutdown(drv)