Merge pull request #1746 from opensafely-core/cache-csv-data

Cache csv download data and shas
opensafely-core · Nov 16, 2023 · 1bedeec · 1bedeec
2 parents 5749b7a + e397766
commit 1bedeec
Show file tree

Hide file tree

Showing 7 changed files with 227 additions and 43 deletions.
diff --git a/codelists/api.py b/codelists/api.py
@@ -351,7 +351,12 @@ def codelists_check(requests):
                 return JsonResponse(
                     {"status": "error", "data": {"error": f"{line} could not be found"}}
                 )
-        codelist_download_data[line] = codelist_version.csv_data_shas()
+        try:
+            codelist_download_data[line] = codelist_version.csv_data_shas()
+        except AssertionError:
+            return JsonResponse(
+                {"status": "error", "data": {"error": f"{line} is not downloadable"}}
+            )
 
     # Compare with manifest file
     # The manifest file is generated when `opensafely codelists update` is run in a study repo

diff --git a/codelists/migrations/0058_codelistversion_cached_csv_data.py b/codelists/migrations/0058_codelistversion_cached_csv_data.py
@@ -0,0 +1,17 @@
+# Generated by Django 4.2.4 on 2023-11-15 16:08
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+    dependencies = [
+        ("codelists", "0057_codelistversion_compatible_releases"),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name="codelistversion",
+            name="cached_csv_data",
+            field=models.JSONField(default=dict),
+        ),
+    ]
diff --git a/codelists/models.py b/codelists/models.py
@@ -16,7 +16,7 @@
 from opencodelists.hash_utils import hash, unhash
 
 from .codeset import Codeset
-from .coding_systems import CODING_SYSTEMS
+from .coding_systems import CODING_SYSTEMS, most_recent_database_alias
 from .hierarchy import Hierarchy
 from .presenters import present_definition_for_download
 
@@ -273,6 +273,8 @@ class CodelistVersion(models.Model):
     tag = models.CharField(max_length=12, null=True)
     csv_data = models.TextField(verbose_name="CSV data", null=True)
 
+    cached_csv_data = models.JSONField(default=dict)
+
     created_at = models.DateTimeField(auto_now_add=True)
     updated_at = models.DateTimeField(auto_now=True)
 
@@ -542,25 +544,43 @@ def csv_data_for_download(self, fixed_headers=False, include_mapped_vmps=True):
         fixed_headers: if True, uploaded csv_data is converted to two columns, headed "code" and "term".
           This parameter is ignored for dmd downloads that include mapped VMPs
         """
-        if self.csv_data:
-            dmd_with_mapped_vmps = (
-                self.coding_system_id == "dmd" and include_mapped_vmps
+        dmd_version_needs_refreshed = (
+            self.coding_system_id == "dmd"
+            and include_mapped_vmps
+            and self.cached_csv_data.get("release") != most_recent_database_alias("dmd")
+        )
+        cache_key = f"download_data_fixed_headers_{fixed_headers}_include_mapped_vmps_{include_mapped_vmps}"
+        if not self.cached_csv_data.get(cache_key) or dmd_version_needs_refreshed:
+            if self.csv_data:
+                dmd_with_mapped_vmps = (
+                    self.coding_system_id == "dmd" and include_mapped_vmps
+                )
+                if not fixed_headers and not dmd_with_mapped_vmps:
+                    csv_data = self.csv_data
+                else:
+                    # if fixed_headers were not explicitly requested (i.e. by OSI), include a column
+                    # with the original code column header. This ensures that any new downloads continue
+                    # to work with existing study/data definitions that import codelists with a named
+                    # code column.
+                    # Currently this is likely to only apply to dm+d codelists which have often been
+                    # converted from BNF codelists, and previously used "dmd_id" as the code column
+                    csv_data = rows_to_csv_data(
+                        self.table_with_fixed_headers(
+                            include_mapped_vmps=include_mapped_vmps,
+                            include_original_header=not fixed_headers,
+                        )
+                    )
+            else:
+                csv_data = rows_to_csv_data(self.table)
+            relevant_release = (
+                most_recent_database_alias("dmd")
+                if self.coding_system_id == "dmd"
+                else self.coding_system_release.database_alias
             )
-            if not fixed_headers and not dmd_with_mapped_vmps:
-                return self.csv_data
-            # if fixed_headers were not explicitly requested (i.e. by OSI), include a column
-            # with the original code column header. This ensures that any new downloads continue
-            # to work with existing study/data definitions that import codelists with a named
-            # code column.
-            # Currently this is likely to only apply to dm+d codelists which have often been
-            # converted from BNF codelists, and previously used "dmd_id" as the code column
-            table = self.table_with_fixed_headers(
-                include_mapped_vmps=include_mapped_vmps,
-                include_original_header=not fixed_headers,
+            self.cached_csv_data.update(
+                {cache_key: csv_data, "release": relevant_release}
             )
-        else:
-            table = self.table
-        return rows_to_csv_data(table)
+        return self.cached_csv_data[cache_key]
 
     def csv_data_sha(self, csv_data=None):
         """
@@ -573,34 +593,61 @@ def csv_data_sha(self, csv_data=None):
         data_for_download = "\n".join(csv_data.splitlines())
         return hashlib.sha1(data_for_download.encode()).hexdigest()
 
+    def _get_dmd_shas(self, shas, current_csv_data_download):
+        """
+        Return valid shas for a dm+d codelist CSV download
+        """
+        # To try and minimise impact on users, we check if there are mapped
+        # VMPs for this set of dm+d codes. If there are not, then the user's
+        # downloaded CSV data will still be OK, so we can check against the
+        # sha of the old style dm+d download (no mapped VMPs, and uses the
+        # original headers)
+        current_csv_data_in_rows = csv_data_to_rows(current_csv_data_download)
+        if len(current_csv_data_in_rows) == len(self.table):
+            old_dmd_download = rows_to_csv_data(self.table)
+            shas.append(self.csv_data_sha(old_dmd_download))
+        # Also allow CSV downloads that only included the "code" and "term"
+        # columns (i.e. not the original code column)
+        # We only care about checking this if there is an original code column
+        # that would be included (it is included by default, but can be missing
+        # if the original code header was "code" to start with)
+        if len(current_csv_data_in_rows[0]) == 3:
+            fixed_header_data = rows_to_csv_data(
+                [row[:2] for row in current_csv_data_in_rows]
+            )
+            shas.append(self.csv_data_sha(fixed_header_data))
+        return shas
+
     def csv_data_shas(self):
         """
         Return a list of shas that should all be considered valid when
         checked against the downloaded data in a study repo.
+
+        For non-dm+d codelists, we shoud never need to re-compute these one, as codes don't
+        change after a codelist version is moved to under-review (and they can't be downloaded
+        when draft).
+
+        For dm+d codelists, we only need to re-compute the shas if the dmd release has changed
+        since the last time they were computed.
         """
-        current_csv_data_download = self.csv_data_for_download()
-        shas = [self.csv_data_sha(csv_data=current_csv_data_download)]
-        if self.coding_system_id == "dmd":
-            # To try and minimise impact on users, we check if there are mapped
-            # VMPs for this set of dm+d codes. If there are not, then the user's
-            # downloaded CSV data will still be OK, so we can check against the
-            # sha of the old style dm+d download (no mapped VMPs, and uses the
-            # original headers)
-            current_csv_data_in_rows = csv_data_to_rows(current_csv_data_download)
-            if len(current_csv_data_in_rows) == len(self.table):
-                old_dmd_download = rows_to_csv_data(self.table)
-                shas.append(self.csv_data_sha(old_dmd_download))
-            # Also allow CSV downloads that only included the "code" and "term"
-            # columns (i.e. not the original code column)
-            # We only care about checking this if there is an original code column
-            # that would be included (it is included by default, but can be missing
-            # if the original code header was "code" to start with)
-            if len(current_csv_data_in_rows[0]) == 3:
-                fixed_header_data = rows_to_csv_data(
-                    [row[:2] for row in current_csv_data_in_rows]
-                )
-                shas.append(self.csv_data_sha(fixed_header_data))
-        return shas
+        if not self.cached_csv_data or (
+            self.coding_system_id == "dmd"
+            and self.cached_csv_data.get("release") != most_recent_database_alias("dmd")
+        ):
+            # reset the cache so we refresh any stored dmd data
+            self.cached_csv_data = {}
+            current_csv_data_download = self.csv_data_for_download()
+            shas = [self.csv_data_sha(csv_data=current_csv_data_download)]
+            if self.coding_system_id == "dmd":
+                shas = self._get_dmd_shas(shas, current_csv_data_download)
+                release = most_recent_database_alias("dmd")
+            else:
+                release = self.coding_system_release.database_alias
+
+            self.cached_csv_data.update({"shas": shas, "release": release})
+            self.save()
+
+        return self.cached_csv_data["shas"]
 
     def table_with_fixed_headers(
         self, include_mapped_vmps=True, include_original_header=False

diff --git a/codelists/tests/test_api.py b/codelists/tests/test_api.py
@@ -671,6 +671,10 @@ def test_codelists_check_changes(client, dmd_version_asthma_medication):
     data = {"codelists": codelist_id, "manifest": json.dumps(manifest)}
     resp = client.post("/api/v1/check/", data)
     assert resp.json() == {"status": "ok"}
+
+    # mock cache to be related to an older release
+    dmd_version_asthma_medication.cached_csv_data["release"] = "old"
+    dmd_version_asthma_medication.save()
     # Add a VMP mapping which will be added into the CSV download
     VmpPrevMapping.objects.create(id="10514511000001106", vpidprev="999")
     resp = client.post("/api/v1/check/", data)
@@ -817,3 +821,37 @@ def test_codelists_check_sha(version_with_no_searches):
         version_with_no_searches.csv_data_sha()
         == hashlib.sha1(csv_data_clean.encode()).hexdigest()
     )
+
+
+def test_codelists_check_non_downloadable_dmd(client, dmd_version_asthma_medication):
+    dmd_version_asthma_medication.csv_data = (
+        dmd_version_asthma_medication.csv_data.replace("dmd_id", "bad_header")
+    )
+    dmd_version_asthma_medication.save()
+
+    assert not dmd_version_asthma_medication.downloadable
+
+    codelist_id = (
+        f"{dmd_version_asthma_medication.organisation.slug}/"
+        f"{dmd_version_asthma_medication.codelist.slug}/"
+        f"{dmd_version_asthma_medication.hash}"
+    )
+    codelist_csv_id = codelist_id.replace("/", "-") + ".csv"
+
+    # Test the happy path for this dmd version
+    manifest = {
+        "files": {
+            codelist_csv_id: {
+                "id": codelist_id,
+                "url": f"https://opencodelist.org/codelists/{codelist_csv_id}/",
+                "downloaded_at": "2023-10-04 13:55:17.569997Z",
+                "sha": "dummy-sha",
+            },
+        }
+    }
+    data = {"codelists": codelist_id, "manifest": json.dumps(manifest)}
+    resp = client.post("/api/v1/check/", data)
+    assert resp.json() == {
+        "status": "error",
+        "data": {"error": f"{codelist_id} is not downloadable"},
+    }
diff --git a/codelists/tests/test_models.py b/codelists/tests/test_models.py
@@ -305,3 +305,53 @@ def test_draft_is_not_downloadable(draft_with_no_searches):
     assert draft_with_no_searches.downloadable is False
     builder_actions.save(draft=draft_with_no_searches)
     assert draft_with_no_searches.downloadable is True
+
+
+def test_cached_csv_data(old_style_version):
+    assert old_style_version.cached_csv_data == {}
+    old_style_version.csv_data_shas()
+    assert old_style_version.cached_csv_data["release"] == "snomedct_test_20200101"
+    assert (
+        "download_data_fixed_headers_False_include_mapped_vmps_True"
+        in old_style_version.cached_csv_data
+    )
+    assert (
+        old_style_version.cached_csv_data[
+            "download_data_fixed_headers_False_include_mapped_vmps_True"
+        ]
+        == old_style_version.csv_data
+    )
+    assert len(old_style_version.cached_csv_data["shas"]) == 1
+
+
+def test_cached_csv_data_dmd_codelist(dmd_version_asthma_medication):
+    clv = dmd_version_asthma_medication
+    assert clv.cached_csv_data == {}
+    clv.csv_data_shas()
+
+    # take a copy of the cached data
+    cached_csv_data = {**clv.cached_csv_data}
+    assert clv.cached_csv_data["release"] == "dmd_test_20200101"
+    assert (
+        "download_data_fixed_headers_False_include_mapped_vmps_True"
+        in clv.cached_csv_data
+    )
+    # 3 shas
+    # - original
+    # - mapped version, with fixed headers
+    # - mapped version with fixed headers and extra original code column
+    assert len(clv.cached_csv_data["shas"]) == 3
+
+    clv.csv_data = "code,term"
+    clv.save()
+    # csv_data_for_download still returns the cached data
+    assert clv.csv_data_for_download() != "code,term\r\n"
+    assert (
+        clv.csv_data_for_download()
+        == cached_csv_data["download_data_fixed_headers_False_include_mapped_vmps_True"]
+    )
+
+    # make the release not the latest one so the data needs to be re-cached
+    clv.cached_csv_data["release"] = "old"
+    clv.save()
+    assert clv.csv_data_for_download() == "code,term\r\n"
diff --git a/coding_systems/dmd/import_data.py b/coding_systems/dmd/import_data.py
@@ -8,6 +8,7 @@
 from lxml import etree
 from tqdm import tqdm
 
+from codelists.models import CodelistVersion
 from coding_systems.base.import_data_utils import CodingSystemImporter
 from coding_systems.dmd import models
 from mappings.dmdvmpprevmap.models import Mapping
@@ -57,6 +58,7 @@ def import_release(
             import_coding_system(Path(tempdir), database_alias)
 
     update_vmp_prev_mapping(database_alias)
+    update_cached_download_data()
 
 
 def import_coding_system(release_dir, database_alias):
@@ -314,3 +316,18 @@ def update_vmp_prev_mapping(database_alias):
         desc="Update VMP previous mapping",
     ):
         Mapping.objects.get_or_create(id=vmp.id, vpidprev=vmp.vpidprev)
+
+
+def update_cached_download_data():
+    """
+    Refresh the cached_csv_data by calling csv_data_shas for each non-draft dmd
+    CodelistVersion. This will also cache the default download data.
+    """
+    logger.info("Updating cached download data")
+    for codelist_version in tqdm(
+        CodelistVersion.objects.filter(codelist__coding_system_id="dmd").exclude(
+            status="draft"
+        )
+    ):
+        if codelist_version.downloadable:
+            codelist_version.csv_data_shas()
diff --git a/coding_systems/dmd/tests/test_import_data.py b/coding_systems/dmd/tests/test_import_data.py
@@ -18,8 +18,11 @@ def coding_systems_database_tmp_dir(coding_systems_tmp_path):
     yield coding_systems_tmp_path
 
 
-def test_import_data(settings, dmd_data, mock_data_download):
+def test_import_data(
+    settings, dmd_data, dmd_version_asthma_medication, mock_data_download
+):
     cs_release_count = CodingSystemRelease.objects.count()
+    assert dmd_version_asthma_medication.cached_csv_data == {}
 
     # import mock XML data
     # This consists of the AMP 222311000001102 (Ventolin 100micrograms/dose Evohaler)
@@ -72,6 +75,13 @@ def test_import_data(settings, dmd_data, mock_data_download):
     assert mapping.id == "39113611000001102"
     assert mapping.vpidprev == "320139002"
 
+    # download data on existing CodelistVersions has been cached
+    dmd_version_asthma_medication.refresh_from_db()
+    assert (
+        dmd_version_asthma_medication.cached_csv_data["release"]
+        == "dmd_2022-100_20221001"
+    )
+
 
 def test_import_data_unexpected_file(mock_data_download_bad_zip):
     cs_release_count = CodingSystemRelease.objects.count()