From b021ddeb2cae74800fe66fcda0d092976979e425 Mon Sep 17 00:00:00 2001 From: Becky Smith Date: Wed, 15 Nov 2023 14:58:40 +0000 Subject: [PATCH 1/4] Return an error in the check response for non-downloadable dmd codelists --- codelists/api.py | 7 ++++++- codelists/tests/test_api.py | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/codelists/api.py b/codelists/api.py index c1ae3940..b412c37c 100644 --- a/codelists/api.py +++ b/codelists/api.py @@ -351,7 +351,12 @@ def codelists_check(requests): return JsonResponse( {"status": "error", "data": {"error": f"{line} could not be found"}} ) - codelist_download_data[line] = codelist_version.csv_data_shas() + try: + codelist_download_data[line] = codelist_version.csv_data_shas() + except AssertionError: + return JsonResponse( + {"status": "error", "data": {"error": f"{line} is not downloadable"}} + ) # Compare with manifest file # The manifest file is generated when `opensafely codelists update` is run in a study repo diff --git a/codelists/tests/test_api.py b/codelists/tests/test_api.py index fc11b5cb..f14770d3 100644 --- a/codelists/tests/test_api.py +++ b/codelists/tests/test_api.py @@ -671,6 +671,7 @@ def test_codelists_check_changes(client, dmd_version_asthma_medication): data = {"codelists": codelist_id, "manifest": json.dumps(manifest)} resp = client.post("/api/v1/check/", data) assert resp.json() == {"status": "ok"} + # Add a VMP mapping which will be added into the CSV download VmpPrevMapping.objects.create(id="10514511000001106", vpidprev="999") resp = client.post("/api/v1/check/", data) @@ -817,3 +818,37 @@ def test_codelists_check_sha(version_with_no_searches): version_with_no_searches.csv_data_sha() == hashlib.sha1(csv_data_clean.encode()).hexdigest() ) + + +def test_codelists_check_non_downloadable_dmd(client, dmd_version_asthma_medication): + dmd_version_asthma_medication.csv_data = ( + dmd_version_asthma_medication.csv_data.replace("dmd_id", "bad_header") + ) + dmd_version_asthma_medication.save() + + assert not dmd_version_asthma_medication.downloadable + + codelist_id = ( + f"{dmd_version_asthma_medication.organisation.slug}/" + f"{dmd_version_asthma_medication.codelist.slug}/" + f"{dmd_version_asthma_medication.hash}" + ) + codelist_csv_id = codelist_id.replace("/", "-") + ".csv" + + # Test the happy path for this dmd version + manifest = { + "files": { + codelist_csv_id: { + "id": codelist_id, + "url": f"https://opencodelist.org/codelists/{codelist_csv_id}/", + "downloaded_at": "2023-10-04 13:55:17.569997Z", + "sha": "dummy-sha", + }, + } + } + data = {"codelists": codelist_id, "manifest": json.dumps(manifest)} + resp = client.post("/api/v1/check/", data) + assert resp.json() == { + "status": "error", + "data": {"error": f"{codelist_id} is not downloadable"}, + } From e0cc3f72463b2e43b5350457f92b0f2641c50b5d Mon Sep 17 00:00:00 2001 From: Becky Smith Date: Wed, 15 Nov 2023 15:41:57 +0000 Subject: [PATCH 2/4] Cache the codelist download shas --- .../0058_codelistversion_cached_csv_data.py | 17 +++++ codelists/models.py | 75 +++++++++++++------ codelists/tests/test_api.py | 3 + 3 files changed, 71 insertions(+), 24 deletions(-) create mode 100644 codelists/migrations/0058_codelistversion_cached_csv_data.py diff --git a/codelists/migrations/0058_codelistversion_cached_csv_data.py b/codelists/migrations/0058_codelistversion_cached_csv_data.py new file mode 100644 index 00000000..cc48b7a9 --- /dev/null +++ b/codelists/migrations/0058_codelistversion_cached_csv_data.py @@ -0,0 +1,17 @@ +# Generated by Django 4.2.4 on 2023-11-15 16:08 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("codelists", "0057_codelistversion_compatible_releases"), + ] + + operations = [ + migrations.AddField( + model_name="codelistversion", + name="cached_csv_data", + field=models.JSONField(default=dict), + ), + ] diff --git a/codelists/models.py b/codelists/models.py index 2f7e6575..65de4ef4 100644 --- a/codelists/models.py +++ b/codelists/models.py @@ -16,7 +16,7 @@ from opencodelists.hash_utils import hash, unhash from .codeset import Codeset -from .coding_systems import CODING_SYSTEMS +from .coding_systems import CODING_SYSTEMS, most_recent_database_alias from .hierarchy import Hierarchy from .presenters import present_definition_for_download @@ -273,6 +273,8 @@ class CodelistVersion(models.Model): tag = models.CharField(max_length=12, null=True) csv_data = models.TextField(verbose_name="CSV data", null=True) + cached_csv_data = models.JSONField(default=dict) + created_at = models.DateTimeField(auto_now_add=True) updated_at = models.DateTimeField(auto_now=True) @@ -573,34 +575,59 @@ def csv_data_sha(self, csv_data=None): data_for_download = "\n".join(csv_data.splitlines()) return hashlib.sha1(data_for_download.encode()).hexdigest() + def _get_dmd_shas(self, shas, current_csv_data_download): + """ + Return valid shas for a dm+d codelist CSV download + """ + # To try and minimise impact on users, we check if there are mapped + # VMPs for this set of dm+d codes. If there are not, then the user's + # downloaded CSV data will still be OK, so we can check against the + # sha of the old style dm+d download (no mapped VMPs, and uses the + # original headers) + current_csv_data_in_rows = csv_data_to_rows(current_csv_data_download) + if len(current_csv_data_in_rows) == len(self.table): + old_dmd_download = rows_to_csv_data(self.table) + shas.append(self.csv_data_sha(old_dmd_download)) + # Also allow CSV downloads that only included the "code" and "term" + # columns (i.e. not the original code column) + # We only care about checking this if there is an original code column + # that would be included (it is included by default, but can be missing + # if the original code header was "code" to start with) + if len(current_csv_data_in_rows[0]) == 3: + fixed_header_data = rows_to_csv_data( + [row[:2] for row in current_csv_data_in_rows] + ) + shas.append(self.csv_data_sha(fixed_header_data)) + return shas + def csv_data_shas(self): """ Return a list of shas that should all be considered valid when checked against the downloaded data in a study repo. + + For non-dm+d codelists, we shoud never need to re-compute these one, as codes don't + change after a codelist version is moved to under-review (and they can't be downloaded + when draft). + + For dm+d codelists, we only need to re-compute the shas if the dmd release has changed + since the last time they were computed. """ - current_csv_data_download = self.csv_data_for_download() - shas = [self.csv_data_sha(csv_data=current_csv_data_download)] - if self.coding_system_id == "dmd": - # To try and minimise impact on users, we check if there are mapped - # VMPs for this set of dm+d codes. If there are not, then the user's - # downloaded CSV data will still be OK, so we can check against the - # sha of the old style dm+d download (no mapped VMPs, and uses the - # original headers) - current_csv_data_in_rows = csv_data_to_rows(current_csv_data_download) - if len(current_csv_data_in_rows) == len(self.table): - old_dmd_download = rows_to_csv_data(self.table) - shas.append(self.csv_data_sha(old_dmd_download)) - # Also allow CSV downloads that only included the "code" and "term" - # columns (i.e. not the original code column) - # We only care about checking this if there is an original code column - # that would be included (it is included by default, but can be missing - # if the original code header was "code" to start with) - if len(current_csv_data_in_rows[0]) == 3: - fixed_header_data = rows_to_csv_data( - [row[:2] for row in current_csv_data_in_rows] - ) - shas.append(self.csv_data_sha(fixed_header_data)) - return shas + if not self.cached_csv_data or ( + self.coding_system_id == "dmd" + and self.cached_csv_data.get("release") != most_recent_database_alias("dmd") + ): + current_csv_data_download = self.csv_data_for_download() + shas = [self.csv_data_sha(csv_data=current_csv_data_download)] + if self.coding_system_id == "dmd": + shas = self._get_dmd_shas(shas, current_csv_data_download) + release = most_recent_database_alias("dmd") + else: + release = self.coding_system_release.database_alias + + self.cached_csv_data.update({"shas": shas, "release": release}) + self.save() + + return self.cached_csv_data["shas"] def table_with_fixed_headers( self, include_mapped_vmps=True, include_original_header=False diff --git a/codelists/tests/test_api.py b/codelists/tests/test_api.py index f14770d3..e9159310 100644 --- a/codelists/tests/test_api.py +++ b/codelists/tests/test_api.py @@ -672,6 +672,9 @@ def test_codelists_check_changes(client, dmd_version_asthma_medication): resp = client.post("/api/v1/check/", data) assert resp.json() == {"status": "ok"} + # mock cache to be related to an older release + dmd_version_asthma_medication.cached_csv_data["release"] = "old" + dmd_version_asthma_medication.save() # Add a VMP mapping which will be added into the CSV download VmpPrevMapping.objects.create(id="10514511000001106", vpidprev="999") resp = client.post("/api/v1/check/", data) From 27e1d5c7b601ace9abb8396d683a057a0a41ecbb Mon Sep 17 00:00:00 2001 From: Becky Smith Date: Wed, 15 Nov 2023 17:13:31 +0000 Subject: [PATCH 3/4] Cache the csv download data --- codelists/models.py | 54 +++++++++++++++++++++++----------- codelists/tests/test_models.py | 50 +++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+), 17 deletions(-) diff --git a/codelists/models.py b/codelists/models.py index 65de4ef4..b8fb0019 100644 --- a/codelists/models.py +++ b/codelists/models.py @@ -544,25 +544,43 @@ def csv_data_for_download(self, fixed_headers=False, include_mapped_vmps=True): fixed_headers: if True, uploaded csv_data is converted to two columns, headed "code" and "term". This parameter is ignored for dmd downloads that include mapped VMPs """ - if self.csv_data: - dmd_with_mapped_vmps = ( - self.coding_system_id == "dmd" and include_mapped_vmps + dmd_version_needs_refreshed = ( + self.coding_system_id == "dmd" + and include_mapped_vmps + and self.cached_csv_data.get("release") != most_recent_database_alias("dmd") + ) + cache_key = f"download_data_fixed_headers_{fixed_headers}_include_mapped_vmps_{include_mapped_vmps}" + if not self.cached_csv_data.get(cache_key) or dmd_version_needs_refreshed: + if self.csv_data: + dmd_with_mapped_vmps = ( + self.coding_system_id == "dmd" and include_mapped_vmps + ) + if not fixed_headers and not dmd_with_mapped_vmps: + csv_data = self.csv_data + else: + # if fixed_headers were not explicitly requested (i.e. by OSI), include a column + # with the original code column header. This ensures that any new downloads continue + # to work with existing study/data definitions that import codelists with a named + # code column. + # Currently this is likely to only apply to dm+d codelists which have often been + # converted from BNF codelists, and previously used "dmd_id" as the code column + csv_data = rows_to_csv_data( + self.table_with_fixed_headers( + include_mapped_vmps=include_mapped_vmps, + include_original_header=not fixed_headers, + ) + ) + else: + csv_data = rows_to_csv_data(self.table) + relevant_release = ( + most_recent_database_alias("dmd") + if self.coding_system_id == "dmd" + else self.coding_system_release.database_alias ) - if not fixed_headers and not dmd_with_mapped_vmps: - return self.csv_data - # if fixed_headers were not explicitly requested (i.e. by OSI), include a column - # with the original code column header. This ensures that any new downloads continue - # to work with existing study/data definitions that import codelists with a named - # code column. - # Currently this is likely to only apply to dm+d codelists which have often been - # converted from BNF codelists, and previously used "dmd_id" as the code column - table = self.table_with_fixed_headers( - include_mapped_vmps=include_mapped_vmps, - include_original_header=not fixed_headers, + self.cached_csv_data.update( + {cache_key: csv_data, "release": relevant_release} ) - else: - table = self.table - return rows_to_csv_data(table) + return self.cached_csv_data[cache_key] def csv_data_sha(self, csv_data=None): """ @@ -616,6 +634,8 @@ def csv_data_shas(self): self.coding_system_id == "dmd" and self.cached_csv_data.get("release") != most_recent_database_alias("dmd") ): + # reset the cache so we refresh any stored dmd data + self.cached_csv_data = {} current_csv_data_download = self.csv_data_for_download() shas = [self.csv_data_sha(csv_data=current_csv_data_download)] if self.coding_system_id == "dmd": diff --git a/codelists/tests/test_models.py b/codelists/tests/test_models.py index 5f8b47d4..b7720f95 100644 --- a/codelists/tests/test_models.py +++ b/codelists/tests/test_models.py @@ -305,3 +305,53 @@ def test_draft_is_not_downloadable(draft_with_no_searches): assert draft_with_no_searches.downloadable is False builder_actions.save(draft=draft_with_no_searches) assert draft_with_no_searches.downloadable is True + + +def test_cached_csv_data(old_style_version): + assert old_style_version.cached_csv_data == {} + old_style_version.csv_data_shas() + assert old_style_version.cached_csv_data["release"] == "snomedct_test_20200101" + assert ( + "download_data_fixed_headers_False_include_mapped_vmps_True" + in old_style_version.cached_csv_data + ) + assert ( + old_style_version.cached_csv_data[ + "download_data_fixed_headers_False_include_mapped_vmps_True" + ] + == old_style_version.csv_data + ) + assert len(old_style_version.cached_csv_data["shas"]) == 1 + + +def test_cached_csv_data_dmd_codelist(dmd_version_asthma_medication): + clv = dmd_version_asthma_medication + assert clv.cached_csv_data == {} + clv.csv_data_shas() + + # take a copy of the cached data + cached_csv_data = {**clv.cached_csv_data} + assert clv.cached_csv_data["release"] == "dmd_test_20200101" + assert ( + "download_data_fixed_headers_False_include_mapped_vmps_True" + in clv.cached_csv_data + ) + # 3 shas + # - original + # - mapped version, with fixed headers + # - mapped version with fixed headers and extra original code column + assert len(clv.cached_csv_data["shas"]) == 3 + + clv.csv_data = "code,term" + clv.save() + # csv_data_for_download still returns the cached data + assert clv.csv_data_for_download() != "code,term\r\n" + assert ( + clv.csv_data_for_download() + == cached_csv_data["download_data_fixed_headers_False_include_mapped_vmps_True"] + ) + + # make the release not the latest one so the data needs to be re-cached + clv.cached_csv_data["release"] = "old" + clv.save() + assert clv.csv_data_for_download() == "code,term\r\n" From e397766ef0dac62686465ef4ad62bcbb2df3f1e9 Mon Sep 17 00:00:00 2001 From: Becky Smith Date: Wed, 15 Nov 2023 17:35:34 +0000 Subject: [PATCH 4/4] Cache CSV download data after dm+d imports --- coding_systems/dmd/import_data.py | 17 +++++++++++++++++ coding_systems/dmd/tests/test_import_data.py | 12 +++++++++++- 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/coding_systems/dmd/import_data.py b/coding_systems/dmd/import_data.py index 67c4a502..dd9b0a43 100644 --- a/coding_systems/dmd/import_data.py +++ b/coding_systems/dmd/import_data.py @@ -8,6 +8,7 @@ from lxml import etree from tqdm import tqdm +from codelists.models import CodelistVersion from coding_systems.base.import_data_utils import CodingSystemImporter from coding_systems.dmd import models from mappings.dmdvmpprevmap.models import Mapping @@ -57,6 +58,7 @@ def import_release( import_coding_system(Path(tempdir), database_alias) update_vmp_prev_mapping(database_alias) + update_cached_download_data() def import_coding_system(release_dir, database_alias): @@ -314,3 +316,18 @@ def update_vmp_prev_mapping(database_alias): desc="Update VMP previous mapping", ): Mapping.objects.get_or_create(id=vmp.id, vpidprev=vmp.vpidprev) + + +def update_cached_download_data(): + """ + Refresh the cached_csv_data by calling csv_data_shas for each non-draft dmd + CodelistVersion. This will also cache the default download data. + """ + logger.info("Updating cached download data") + for codelist_version in tqdm( + CodelistVersion.objects.filter(codelist__coding_system_id="dmd").exclude( + status="draft" + ) + ): + if codelist_version.downloadable: + codelist_version.csv_data_shas() diff --git a/coding_systems/dmd/tests/test_import_data.py b/coding_systems/dmd/tests/test_import_data.py index 7cd29dae..96ca5e60 100644 --- a/coding_systems/dmd/tests/test_import_data.py +++ b/coding_systems/dmd/tests/test_import_data.py @@ -18,8 +18,11 @@ def coding_systems_database_tmp_dir(coding_systems_tmp_path): yield coding_systems_tmp_path -def test_import_data(settings, dmd_data, mock_data_download): +def test_import_data( + settings, dmd_data, dmd_version_asthma_medication, mock_data_download +): cs_release_count = CodingSystemRelease.objects.count() + assert dmd_version_asthma_medication.cached_csv_data == {} # import mock XML data # This consists of the AMP 222311000001102 (Ventolin 100micrograms/dose Evohaler) @@ -72,6 +75,13 @@ def test_import_data(settings, dmd_data, mock_data_download): assert mapping.id == "39113611000001102" assert mapping.vpidprev == "320139002" + # download data on existing CodelistVersions has been cached + dmd_version_asthma_medication.refresh_from_db() + assert ( + dmd_version_asthma_medication.cached_csv_data["release"] + == "dmd_2022-100_20221001" + ) + def test_import_data_unexpected_file(mock_data_download_bad_zip): cs_release_count = CodingSystemRelease.objects.count()