Skip to content

Commit

Permalink
Generate date of birth and date of death without retrying
Browse files Browse the repository at this point in the history
Lets get_random_value take extra constraints that are applied only to
the specific value being generated by that call
  • Loading branch information
rebkwok committed Oct 17, 2024
1 parent e9a3250 commit db6a88d
Show file tree
Hide file tree
Showing 2 changed files with 74 additions and 25 deletions.
89 changes: 64 additions & 25 deletions ehrql/dummy_data/generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

import structlog

from ehrql.dummy_data.query_info import QueryInfo
from ehrql.dummy_data.query_info import ColumnInfo, QueryInfo
from ehrql.query_engines.in_memory import InMemoryQueryEngine
from ehrql.query_engines.in_memory_database import InMemoryDatabase
from ehrql.query_model.introspection import all_inline_patient_ids
Expand Down Expand Up @@ -186,34 +186,69 @@ def generate_patient_facts(self, patient_id):
self.rnd.seed(f"{self.random_seed}:{patient_id}")
# TODO: We could obviously generate more realistic age distributions than this

while True:
# Retry until we have a date of birth and date of death that are
# within reasonable ranges
dob_column = self.get_patient_column("date_of_birth")
if dob_column is not None and dob_column.get_constraint(
Constraint.GeneralRange
):
dob_column = self.get_patient_column("date_of_birth")
# if we also have a date of death column with a range constraint,
# we need to ensure date of birth isn't after date of death
dod_column = self.get_patient_column("date_of_death")
dob_max_from_dod_constraint = None
if dod_column is not None and dod_column.get_constraint(
Constraint.GeneralRange
):
dod_constraint = dod_column.get_constraint(Constraint.GeneralRange)
if dod_constraint.minimum:
dob_max_from_dod = dod_constraint.minimum
else:
dob_max_from_dod = dod_constraint.maximum
dob_max_from_dod_constraint = Constraint.GeneralRange(
maximum=dob_max_from_dod
)

extra_dob_constraints = (
(dob_max_from_dod_constraint,) if dob_max_from_dod_constraint else ()
)
if dob_column is not None:
if dob_column.get_constraint(Constraint.GeneralRange):
self.events_start = self.today - timedelta(days=120 * 365)
self.events_end = self.today
date_of_birth = self.get_random_value(dob_column)
date_of_birth = self.get_random_value(dob_column, extra_dob_constraints)
else:
date_of_birth = self.today - timedelta(
days=self.rnd.randrange(0, 120 * 365)
extra_dob_constraints = extra_dob_constraints + (
Constraint.GeneralRange(
minimum=self.today - timedelta(days=120 * 365),
maximum=self.today,
),
)
date_of_birth = self.get_random_value(dob_column, extra_dob_constraints)
else:
date_of_birth = self.today - timedelta(
days=self.rnd.randrange(0, 120 * 365)
)

dod_column = self.get_patient_column("date_of_death")
if dod_column is not None and dod_column.get_constraint(
Constraint.GeneralRange
):
date_of_death = self.get_random_value(dod_column)
else:
age_days = self.rnd.randrange(105 * 365)
date_of_death = date_of_birth + timedelta(days=age_days)

if date_of_death >= date_of_birth and (
date_of_death - date_of_birth < timedelta(105 * 365)
):
break
if dod_column is not None:
minimum = None
maximum = None
includes_minimum = True
if range_constraint := dod_column.get_constraint(Constraint.GeneralRange):
if range_constraint.minimum is None:
minimum = date_of_birth
else:
includes_minimum = range_constraint.include_minimum
if range_constraint.maximum is None:
maximum = date_of_birth + timedelta(days=105 * 365)

date_of_death = self.get_random_value(
dod_column,
extra_constraints=(
Constraint.GeneralRange(
minimum=minimum,
maximum=maximum,
includes_minimum=includes_minimum,
),
),
)
else:
age_days = self.rnd.randrange(105 * 365)
date_of_death = date_of_birth + timedelta(days=age_days)

self.date_of_birth = date_of_birth
self.date_of_death = date_of_death if date_of_death < self.today else None
Expand Down Expand Up @@ -257,8 +292,12 @@ def populate_row(self, table_info, row):
if name not in row:
row[name] = self.get_random_value(column_info)

def get_random_value(self, column_info):
def get_random_value(self, original_column_info, extra_constraints: tuple = ()):
# TODO: This never returns None although for realism it sometimes should
column_info = ColumnInfo.from_column_info(
original_column_info, extra_constraints=extra_constraints
)

if cat_constraint := column_info.get_constraint(Constraint.Categorical):
# TODO: It's obviously not true in general that categories are equiprobable
return self.rnd.choice(cat_constraint.values)
Expand Down
10 changes: 10 additions & 0 deletions ehrql/dummy_data/query_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,16 @@ def from_column(cls, name, column, extra_constraints=()):
),
)

@classmethod
def from_column_info(cls, column_info, extra_constraints=()):
    """Build a new instance from an existing one plus additional constraints.

    The new instance reuses the name and type of ``column_info``. Its
    constraints are the existing constraints followed by
    ``extra_constraints``, passed together through ``normalize_constraints``.
    ``column_info`` itself is not modified.
    """
    # Existing constraints come first, extras second, matching the order
    # in which they are handed to normalize_constraints.
    combined_constraints = (*column_info.constraints, *extra_constraints)
    normalized = normalize_constraints(combined_constraints)
    return cls(column_info.name, column_info.type, constraints=normalized)

def __post_init__(self):
self._constraints_by_type = {type(c): c for c in self.constraints}

Expand Down

0 comments on commit db6a88d

Please sign in to comment.