From 3e37a110cfb721ca9d5b2c498b264255b429ca39 Mon Sep 17 00:00:00 2001
From: Becky Smith
Date: Wed, 13 Apr 2022 14:12:09 +0100
Subject: [PATCH] Tests for stats logging

---
 tests/test_logging.py | 321 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 321 insertions(+)
 create mode 100644 tests/test_logging.py

diff --git a/tests/test_logging.py b/tests/test_logging.py
new file mode 100644
index 00000000..5c3186bb
--- /dev/null
+++ b/tests/test_logging.py
@@ -0,0 +1,321 @@
+import re
+from unittest.mock import patch
+
+import pytest
+import structlog
+
+from cohortextractor import StudyDefinition, codelist, patients
+from cohortextractor.cohortextractor import generate_cohort, generate_measures
+
+# We need to import these for setup for the tests that require a tpp database
+from tests.test_tpp_backend import (  # noqa: F401
+    set_database_url,
+    setup_function,
+    setup_module,
+    teardown_module,
+)
+
+
+@pytest.fixture(name="logger")
+def fixture_logger():
+    """Modify `capture_logs` to keep the reference to the `processors` list intact,
+    so that bound loggers, which hold the list by reference, are also modified.
+
+    See https://github.com/hynek/structlog/issues/408
+    """
+    cap = structlog.testing.LogCapture()
+    # Modify `_Configuration.default_processors` set via `configure`, but always
+    # keep the same list instance so as not to break references held by bound
+    # loggers.
+    processors = structlog.get_config()["processors"]
+    old_processors = processors[:]  # copy the original processors
+    try:
+        processors.clear()  # clear the processors list
+        processors.append(cap)  # append the LogCapture processor for testing
+        structlog.configure(processors=processors)
+        yield cap
+    finally:
+        processors.clear()  # remove LogCapture
+        processors.extend(old_processors)  # restore the original processors
+        structlog.configure(processors=processors)
+
+
+def get_stats_logs(log_output):
+    return [
+        {k: v for k, v in log_item.items() if k not in ["event", "log_level"]}
+        for log_item in log_output
+        if isinstance(log_item["event"], str)
+        and log_item["event"] == "cohortextractor-stats"
+    ]
+
+
+def get_timing_logs(log_output):
+    return [log_entry for log_entry in log_output.entries if "target" in log_entry]
+
+
+def test_study_definition_initial_stats_logging(logger):
+    StudyDefinition(
+        default_expectations={
+            "rate": "exponential_increase",
+            "incidence": 0.2,
+            "date": {"earliest": "1900-01-01", "latest": "today"},
+        },
+        population=patients.all(),
+        event_date_1=patients.with_these_clinical_events(
+            codelist(["A"], system="ctv3"),
+            returning="date",
+            date_format="YYYY-MM-DD",
+        ),
+        event_min_date=patients.minimum_of(
+            "event_date_1",
+            event_date_2=patients.with_these_clinical_events(
+                codelist(["B", "C"], system="ctv3"),
+                returning="date",
+                date_format="YYYY-MM-DD",
+            ),
+        ),
+    )
+    assert get_stats_logs(logger.entries) == [
+        # output columns include patient_id plus the 4 variables defined in the
+        # study definition, including event_date_2, which is defined as a
+        # parameter to event_min_date
+        # tables - Patient, plus one temp event table per codelist
+        {"output_column_count": 5, "table_count": 3, "table_joins_count": 2},
+        # variable_count counts only the top-level variables defined in the
+        # study definition (i.e. not event_date_2)
+        {"variable_count": 4},
+        # 2 variables use a codelist (event_date_1, and the nested event_date_2)
+        {"variables_using_codelist_count": 2},
+        # for each variable using a codelist, we log the size of the codelist
+        {"variable_using_codelist": "event_date_1", "codelist_size": 1},
+        {"variable_using_codelist": "event_date_2", "codelist_size": 2},
+    ]
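+
+
+# A minimal, self-contained sanity check sketching what `get_stats_logs` does:
+# it keeps only "cohortextractor-stats" events and strips the event/log_level
+# metadata. The entry dicts below are invented examples, not output captured
+# from a real extraction run.
+def test_get_stats_logs_filters_and_strips_metadata():
+    entries = [
+        {"event": "cohortextractor-stats", "log_level": "info", "variable_count": 4},
+        {"event": "some other log line", "log_level": "info"},
+    ]
+    assert get_stats_logs(entries) == [{"variable_count": 4}]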
+
+
+def test_stats_logging_tpp_backend(logger):
+    study = StudyDefinition(
+        population=patients.all(),
+        event=patients.with_these_clinical_events(codelist(["A"], "snomed")),
+    )
+    study.to_dicts()
+
+    # initial stats
+    expected_initial_study_def_logs = [
+        # output columns include patient_id plus the 2 variables defined in the
+        # study definition
+        # tables - Patient, plus a temp event table for the codelist
+        {"output_column_count": 3, "table_count": 2, "table_joins_count": 1},
+        {"variable_count": 2},
+        {"variables_using_codelist_count": 1},
+        {"variable_using_codelist": "event", "codelist_size": 1},
+    ]
+
+    # timing stats
+    expected_timing_targets = [
+        "Uploading codelist for event",
+        "INSERT INTO #tmp1_event_codelist (code, category) ...",
+        "Query for event",
+        "Query for population",
+        "Join all columns for final output",
+    ]
+
+    assert_stats_logs(logger, expected_initial_study_def_logs, expected_timing_targets)
+
+
+@patch("cohortextractor.cohortextractor.preflight_generation_check")
+@patch(
+    "cohortextractor.cohortextractor.list_study_definitions",
+    return_value=[("study_definition", "")],
+)
+@patch("cohortextractor.cohortextractor.load_study_definition")
+@pytest.mark.parametrize(
+    "output_format,write_to_file_log",
+    [
+        ("csv", "write_rows_to_csv"),
+        ("dta", "Create df and write dataframe_to_file"),
+    ],
+)
+def test_stats_logging_generate_cohort(
+    mock_load,
+    _mock_list,
+    _mock_check,
+    tmp_path,
+    logger,
+    output_format,
+    write_to_file_log,
+):
+    mock_load.return_value = StudyDefinition(
+        default_expectations={
+            "date": {"earliest": "1900-01-01", "latest": "today"},
+        },
+        population=patients.all(),
+        sex=patients.sex(),
+    )
+
+    generate_cohort(
+        output_dir=tmp_path,
+        expectations_population=None,
+        dummy_data_file=None,
+        output_format=output_format,
+    )
+
+    # initial stats
+    expected_initial_study_def_logs = [
+        # these 3 are logged from StudyDefinition instantiation
+        # patient_id, population, sex - all from the patient table, but we make
+        # one temp table per variable
+        {"output_column_count": 3, "table_count": 2, "table_joins_count": 1},
+        {"variable_count": 2},  # population, sex
+        {"variables_using_codelist_count": 0},
+        # index_date_count is logged from generate_cohort
+        {"index_date_count": 0},
+    ]
+
+    expected_timing_targets = [
+        # logs in tpp_backend during query execution
+        "Query for sex",
+        "Query for population",
+        # logs specifically from study.to_file
+        "Writing results into #final_output",
+        "CREATE INDEX ix_patient_id ON #final_output (patie...",
+        f"{write_to_file_log} {tmp_path}/input.{output_format}",
+        "Deleting '#final_output'",
+        # logging the overall timing for the cohort generation
+        "generate_cohort for study_definition",
+        "generate_cohort for study_definition (all index dates)",
+    ]
+
+    assert_stats_logs(logger, expected_initial_study_def_logs, expected_timing_targets)
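+
+
+# A companion sketch for `get_timing_logs`: unlike `get_stats_logs`, it takes
+# the LogCapture object itself and keeps any entry carrying a "target" key.
+# `FakeCapture` is a hypothetical stand-in showing the minimal shape relied on.
+def test_get_timing_logs_keeps_entries_with_target():
+    class FakeCapture:
+        entries = [
+            {
+                "event": "cohortextractor-stats",
+                "target": "Query for sex",
+                "execution_time": "0:00:01.000000",
+            },
+            {"event": "cohortextractor-stats", "variable_count": 2},
+        ]
+
+    assert get_timing_logs(FakeCapture()) == [FakeCapture.entries[0]]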
+
+
+@patch("cohortextractor.cohortextractor.preflight_generation_check")
+@patch(
+    "cohortextractor.cohortextractor.list_study_definitions",
+    return_value=[("study_definition_test", "_test")],
+)
+@patch("cohortextractor.cohortextractor.load_study_definition")
+def test_stats_logging_generate_cohort_with_index_dates(
+    mock_load, _mock_list, _mock_check, logger, tmp_path
+):
+    mock_load.return_value = StudyDefinition(
+        default_expectations={
+            "date": {"earliest": "1900-01-01", "latest": "today"},
+        },
+        population=patients.all(),
+        sex=patients.sex(),
+    )
+
+    generate_cohort(
+        output_dir=tmp_path,
+        expectations_population=None,
+        dummy_data_file=None,
+        index_date_range="2020-01-01 to 2020-03-01 by month",
+    )
+
+    expected_index_dates = ["2020-03-01", "2020-02-01", "2020-01-01"]
+
+    # initial stats
+    expected_initial_study_def_logs = [
+        # these are logged from StudyDefinition instantiation
+        {"variable_count": 2},  # population, sex
+        {"variables_using_codelist_count": 0},
+        # index_date_count is logged from generate_cohort
+        {"index_date_count": 3},
+        {"min_index_date": "2020-01-01", "max_index_date": "2020-03-01"},
+        # output_column/table/joins_count is logged in tpp_backend on backend
+        # instantiation, so it appears once for the initial study definition and
+        # once more per index date (4 times in total)
+        *[{"output_column_count": 3, "table_count": 2, "table_joins_count": 1}] * 4,
+        *[
+            {"resetting_backend_index_date": ix_date}
+            for ix_date in expected_index_dates
+        ],
+    ]
+
+    index_date_timing_logs = []
+    for index_date in expected_index_dates:
+        index_date_timing_logs.extend(
+            [
+                # logs in tpp_backend during query execution
+                "Query for sex",
+                "Query for population",
+                # logs specifically from study.to_file
+                "Writing results into #final_output",
+                "CREATE INDEX ix_patient_id ON #final_output (patie...",
+                f"write_rows_to_csv {tmp_path}/input_test_{index_date}.csv",
+                "Deleting '#final_output'",
+                # logging the overall timing for the cohort generation
+                f"generate_cohort for study_definition_test at {index_date}",
+            ]
+        )
+
+    expected_timing_targets = [
+        *index_date_timing_logs,
+        "generate_cohort for study_definition_test (all index dates)",
+    ]
+    assert_stats_logs(logger, expected_initial_study_def_logs, expected_timing_targets)
+
+
+@patch("cohortextractor.cohortextractor.preflight_generation_check")
+@patch(
+    "cohortextractor.cohortextractor.list_study_definitions",
+    return_value=[("study_definition", "")],
+)
+@patch("cohortextractor.cohortextractor.load_study_definition")
+def test_stats_logging_generate_measures(
+    mock_load, _mock_list, _mock_check, logger, tmp_path
+):
+    import csv
+
+    from cohortextractor.measure import Measure
+
+    measures = [
+        Measure(
+            id="has_code",
+            numerator="has_code",
+            denominator="population",
+        ),
+        Measure(
+            id="has_code_one_group",
+            numerator="has_code",
+            denominator="population",
+            group_by="population",
+        ),
+    ]
+
+    mock_load.return_value = measures
+
+    # set up an expected input file
+    with open(tmp_path / "input_2020-01-01.csv", "w") as output_file:
+        writer = csv.writer(output_file)
+        writer.writerow(["patient_id", "has_code"])
+        writer.writerow([1, 1])
+        writer.writerow([2, 1])
+        writer.writerow([3, 1])
+        writer.writerow([4, 0])
+
+    generate_measures(output_dir=tmp_path)
+
+    stats_logs = get_stats_logs(logger.entries)
+    assert len(stats_logs) == 2
+    assert stats_logs[0] == {"measures_count": 2}
+    assert "execution_time" in stats_logs[1]
+    assert (
+        stats_logs[1]["target"]
+        == f"generate_measures for {tmp_path}/input_2020-01-01.csv"
+    )
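+
+
+# A sketch of the execution_time format assumed by `assert_stats_logs` below:
+# judging by the regex used there, values look like str(datetime.timedelta)
+# output, e.g. "0:00:01.123456". This is an inference from the pattern, not a
+# documented guarantee of the logging code.
+def test_execution_time_format_assumption():
+    from datetime import timedelta
+
+    execution_time = str(timedelta(seconds=1, microseconds=123456))
+    assert execution_time == "0:00:01.123456"
+    assert re.match(r"\d:\d{2}:\d{2}\.\d{6}", execution_time)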
+
+
+def assert_stats_logs(
+    log_output, expected_initial_study_def_logs, expected_timing_targets
+):
+    all_stats_logs = get_stats_logs(log_output.entries)
+    for log_params in expected_initial_study_def_logs:
+        assert log_params in all_stats_logs
+
+    timing_logs = get_timing_logs(log_output)
+    assert len(timing_logs) == len(expected_timing_targets), timing_logs
+
+    for i, timing_log in enumerate(timing_logs):
+        assert timing_log["target"] == expected_timing_targets[i]
+        # execution times look like "0:00:01.123456"; escape the dot so the
+        # regex matches the literal separator
+        assert re.match(r"\d:\d{2}:\d{2}\.\d{6}", timing_log["execution_time"]).group(
+            0
+        ), timing_log["execution_time"]
+
+    # Make sure we've checked all the stats logs
+    assert len(all_stats_logs) == len(expected_initial_study_def_logs) + len(
+        timing_logs
+    )
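+
+
+# An end-to-end sketch of the shape of data `assert_stats_logs` consumes, using
+# a hypothetical `FakeCapture` with invented entries rather than a real backend
+# run.
+def test_assert_stats_logs_with_fake_capture():
+    class FakeCapture:
+        entries = [
+            {
+                "event": "cohortextractor-stats",
+                "log_level": "info",
+                "variable_count": 2,
+            },
+            {
+                "event": "cohortextractor-stats",
+                "log_level": "info",
+                "target": "Query for population",
+                "execution_time": "0:00:02.500000",
+            },
+        ]
+
+    assert_stats_logs(
+        FakeCapture(),
+        expected_initial_study_def_logs=[{"variable_count": 2}],
+        expected_timing_targets=["Query for population"],
+    )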