Skip to content

Commit

Permalink
Merge pull request #1746 from opensafely-core/cache-csv-data
Browse files Browse the repository at this point in the history
Cache csv download data and shas
  • Loading branch information
rebkwok authored Nov 16, 2023
2 parents 5749b7a + e397766 commit 1bedeec
Show file tree
Hide file tree
Showing 7 changed files with 227 additions and 43 deletions.
7 changes: 6 additions & 1 deletion codelists/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -351,7 +351,12 @@ def codelists_check(requests):
return JsonResponse(
{"status": "error", "data": {"error": f"{line} could not be found"}}
)
codelist_download_data[line] = codelist_version.csv_data_shas()
try:
codelist_download_data[line] = codelist_version.csv_data_shas()
except AssertionError:
return JsonResponse(
{"status": "error", "data": {"error": f"{line} is not downloadable"}}
)

# Compare with manifest file
# The manifest file is generated when `opensafely codelists update` is run in a study repo
Expand Down
17 changes: 17 additions & 0 deletions codelists/migrations/0058_codelistversion_cached_csv_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Generated by Django 4.2.4 on 2023-11-15 16:08

from django.db import migrations, models


class Migration(migrations.Migration):
dependencies = [
("codelists", "0057_codelistversion_compatible_releases"),
]

operations = [
migrations.AddField(
model_name="codelistversion",
name="cached_csv_data",
field=models.JSONField(default=dict),
),
]
129 changes: 88 additions & 41 deletions codelists/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from opencodelists.hash_utils import hash, unhash

from .codeset import Codeset
from .coding_systems import CODING_SYSTEMS
from .coding_systems import CODING_SYSTEMS, most_recent_database_alias
from .hierarchy import Hierarchy
from .presenters import present_definition_for_download

Expand Down Expand Up @@ -273,6 +273,8 @@ class CodelistVersion(models.Model):
tag = models.CharField(max_length=12, null=True)
csv_data = models.TextField(verbose_name="CSV data", null=True)

cached_csv_data = models.JSONField(default=dict)

created_at = models.DateTimeField(auto_now_add=True)
updated_at = models.DateTimeField(auto_now=True)

Expand Down Expand Up @@ -542,25 +544,43 @@ def csv_data_for_download(self, fixed_headers=False, include_mapped_vmps=True):
fixed_headers: if True, uploaded csv_data is converted to two columns, headed "code" and "term".
This parameter is ignored for dmd downloads that include mapped VMPs
"""
if self.csv_data:
dmd_with_mapped_vmps = (
self.coding_system_id == "dmd" and include_mapped_vmps
dmd_version_needs_refreshed = (
self.coding_system_id == "dmd"
and include_mapped_vmps
and self.cached_csv_data.get("release") != most_recent_database_alias("dmd")
)
cache_key = f"download_data_fixed_headers_{fixed_headers}_include_mapped_vmps_{include_mapped_vmps}"
if not self.cached_csv_data.get(cache_key) or dmd_version_needs_refreshed:
if self.csv_data:
dmd_with_mapped_vmps = (
self.coding_system_id == "dmd" and include_mapped_vmps
)
if not fixed_headers and not dmd_with_mapped_vmps:
csv_data = self.csv_data
else:
# if fixed_headers were not explicitly requested (i.e. by OSI), include a column
# with the original code column header. This ensures that any new downloads continue
# to work with existing study/data definitions that import codelists with a named
# code column.
# Currently this is likely to only apply to dm+d codelists which have often been
# converted from BNF codelists, and previously used "dmd_id" as the code column
csv_data = rows_to_csv_data(
self.table_with_fixed_headers(
include_mapped_vmps=include_mapped_vmps,
include_original_header=not fixed_headers,
)
)
else:
csv_data = rows_to_csv_data(self.table)
relevant_release = (
most_recent_database_alias("dmd")
if self.coding_system_id == "dmd"
else self.coding_system_release.database_alias
)
if not fixed_headers and not dmd_with_mapped_vmps:
return self.csv_data
# if fixed_headers were not explicitly requested (i.e. by OSI), include a column
# with the original code column header. This ensures that any new downloads continue
# to work with existing study/data definitions that import codelists with a named
# code column.
# Currently this is likely to only apply to dm+d codelists which have often been
# converted from BNF codelists, and previously used "dmd_id" as the code column
table = self.table_with_fixed_headers(
include_mapped_vmps=include_mapped_vmps,
include_original_header=not fixed_headers,
self.cached_csv_data.update(
{cache_key: csv_data, "release": relevant_release}
)
else:
table = self.table
return rows_to_csv_data(table)
return self.cached_csv_data[cache_key]

def csv_data_sha(self, csv_data=None):
"""
Expand All @@ -573,34 +593,61 @@ def csv_data_sha(self, csv_data=None):
data_for_download = "\n".join(csv_data.splitlines())
return hashlib.sha1(data_for_download.encode()).hexdigest()

def _get_dmd_shas(self, shas, current_csv_data_download):
"""
Return valid shas for a dm+d codelist CSV download
"""
# To try and minimise impact on users, we check if there are mapped
# VMPs for this set of dm+d codes. If there are not, then the user's
# downloaded CSV data will still be OK, so we can check against the
# sha of the old style dm+d download (no mapped VMPs, and uses the
# original headers)
current_csv_data_in_rows = csv_data_to_rows(current_csv_data_download)
if len(current_csv_data_in_rows) == len(self.table):
old_dmd_download = rows_to_csv_data(self.table)
shas.append(self.csv_data_sha(old_dmd_download))
# Also allow CSV downloads that only included the "code" and "term"
# columns (i.e. not the original code column)
# We only care about checking this if there is an original code column
# that would be included (it is included by default, but can be missing
# if the original code header was "code" to start with)
if len(current_csv_data_in_rows[0]) == 3:
fixed_header_data = rows_to_csv_data(
[row[:2] for row in current_csv_data_in_rows]
)
shas.append(self.csv_data_sha(fixed_header_data))
return shas

def csv_data_shas(self):
"""
Return a list of shas that should all be considered valid when
checked against the downloaded data in a study repo.
For non-dm+d codelists, we shoud never need to re-compute these one, as codes don't
change after a codelist version is moved to under-review (and they can't be downloaded
when draft).
For dm+d codelists, we only need to re-compute the shas if the dmd release has changed
since the last time they were computed.
"""
current_csv_data_download = self.csv_data_for_download()
shas = [self.csv_data_sha(csv_data=current_csv_data_download)]
if self.coding_system_id == "dmd":
# To try and minimise impact on users, we check if there are mapped
# VMPs for this set of dm+d codes. If there are not, then the user's
# downloaded CSV data will still be OK, so we can check against the
# sha of the old style dm+d download (no mapped VMPs, and uses the
# original headers)
current_csv_data_in_rows = csv_data_to_rows(current_csv_data_download)
if len(current_csv_data_in_rows) == len(self.table):
old_dmd_download = rows_to_csv_data(self.table)
shas.append(self.csv_data_sha(old_dmd_download))
# Also allow CSV downloads that only included the "code" and "term"
# columns (i.e. not the original code column)
# We only care about checking this if there is an original code column
# that would be included (it is included by default, but can be missing
# if the original code header was "code" to start with)
if len(current_csv_data_in_rows[0]) == 3:
fixed_header_data = rows_to_csv_data(
[row[:2] for row in current_csv_data_in_rows]
)
shas.append(self.csv_data_sha(fixed_header_data))
return shas
if not self.cached_csv_data or (
self.coding_system_id == "dmd"
and self.cached_csv_data.get("release") != most_recent_database_alias("dmd")
):
# reset the cache so we refresh any stored dmd data
self.cached_csv_data = {}
current_csv_data_download = self.csv_data_for_download()
shas = [self.csv_data_sha(csv_data=current_csv_data_download)]
if self.coding_system_id == "dmd":
shas = self._get_dmd_shas(shas, current_csv_data_download)
release = most_recent_database_alias("dmd")
else:
release = self.coding_system_release.database_alias

self.cached_csv_data.update({"shas": shas, "release": release})
self.save()

return self.cached_csv_data["shas"]

def table_with_fixed_headers(
self, include_mapped_vmps=True, include_original_header=False
Expand Down
38 changes: 38 additions & 0 deletions codelists/tests/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -671,6 +671,10 @@ def test_codelists_check_changes(client, dmd_version_asthma_medication):
data = {"codelists": codelist_id, "manifest": json.dumps(manifest)}
resp = client.post("/api/v1/check/", data)
assert resp.json() == {"status": "ok"}

# mock cache to be related to an older release
dmd_version_asthma_medication.cached_csv_data["release"] = "old"
dmd_version_asthma_medication.save()
# Add a VMP mapping which will be added into the CSV download
VmpPrevMapping.objects.create(id="10514511000001106", vpidprev="999")
resp = client.post("/api/v1/check/", data)
Expand Down Expand Up @@ -817,3 +821,37 @@ def test_codelists_check_sha(version_with_no_searches):
version_with_no_searches.csv_data_sha()
== hashlib.sha1(csv_data_clean.encode()).hexdigest()
)


def test_codelists_check_non_downloadable_dmd(client, dmd_version_asthma_medication):
dmd_version_asthma_medication.csv_data = (
dmd_version_asthma_medication.csv_data.replace("dmd_id", "bad_header")
)
dmd_version_asthma_medication.save()

assert not dmd_version_asthma_medication.downloadable

codelist_id = (
f"{dmd_version_asthma_medication.organisation.slug}/"
f"{dmd_version_asthma_medication.codelist.slug}/"
f"{dmd_version_asthma_medication.hash}"
)
codelist_csv_id = codelist_id.replace("/", "-") + ".csv"

# Test the happy path for this dmd version
manifest = {
"files": {
codelist_csv_id: {
"id": codelist_id,
"url": f"https://opencodelist.org/codelists/{codelist_csv_id}/",
"downloaded_at": "2023-10-04 13:55:17.569997Z",
"sha": "dummy-sha",
},
}
}
data = {"codelists": codelist_id, "manifest": json.dumps(manifest)}
resp = client.post("/api/v1/check/", data)
assert resp.json() == {
"status": "error",
"data": {"error": f"{codelist_id} is not downloadable"},
}
50 changes: 50 additions & 0 deletions codelists/tests/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,3 +305,53 @@ def test_draft_is_not_downloadable(draft_with_no_searches):
assert draft_with_no_searches.downloadable is False
builder_actions.save(draft=draft_with_no_searches)
assert draft_with_no_searches.downloadable is True


def test_cached_csv_data(old_style_version):
assert old_style_version.cached_csv_data == {}
old_style_version.csv_data_shas()
assert old_style_version.cached_csv_data["release"] == "snomedct_test_20200101"
assert (
"download_data_fixed_headers_False_include_mapped_vmps_True"
in old_style_version.cached_csv_data
)
assert (
old_style_version.cached_csv_data[
"download_data_fixed_headers_False_include_mapped_vmps_True"
]
== old_style_version.csv_data
)
assert len(old_style_version.cached_csv_data["shas"]) == 1


def test_cached_csv_data_dmd_codelist(dmd_version_asthma_medication):
clv = dmd_version_asthma_medication
assert clv.cached_csv_data == {}
clv.csv_data_shas()

# take a copy of the cached data
cached_csv_data = {**clv.cached_csv_data}
assert clv.cached_csv_data["release"] == "dmd_test_20200101"
assert (
"download_data_fixed_headers_False_include_mapped_vmps_True"
in clv.cached_csv_data
)
# 3 shas
# - original
# - mapped version, with fixed headers
# - mapped version with fixed headers and extra original code column
assert len(clv.cached_csv_data["shas"]) == 3

clv.csv_data = "code,term"
clv.save()
# csv_data_for_download still returns the cached data
assert clv.csv_data_for_download() != "code,term\r\n"
assert (
clv.csv_data_for_download()
== cached_csv_data["download_data_fixed_headers_False_include_mapped_vmps_True"]
)

# make the release not the latest one so the data needs to be re-cached
clv.cached_csv_data["release"] = "old"
clv.save()
assert clv.csv_data_for_download() == "code,term\r\n"
17 changes: 17 additions & 0 deletions coding_systems/dmd/import_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from lxml import etree
from tqdm import tqdm

from codelists.models import CodelistVersion
from coding_systems.base.import_data_utils import CodingSystemImporter
from coding_systems.dmd import models
from mappings.dmdvmpprevmap.models import Mapping
Expand Down Expand Up @@ -57,6 +58,7 @@ def import_release(
import_coding_system(Path(tempdir), database_alias)

update_vmp_prev_mapping(database_alias)
update_cached_download_data()


def import_coding_system(release_dir, database_alias):
Expand Down Expand Up @@ -314,3 +316,18 @@ def update_vmp_prev_mapping(database_alias):
desc="Update VMP previous mapping",
):
Mapping.objects.get_or_create(id=vmp.id, vpidprev=vmp.vpidprev)


def update_cached_download_data():
"""
Refresh the cached_csv_data by calling csv_data_shas for each non-draft dmd
CodelistVersion. This will also cache the default download data.
"""
logger.info("Updating cached download data")
for codelist_version in tqdm(
CodelistVersion.objects.filter(codelist__coding_system_id="dmd").exclude(
status="draft"
)
):
if codelist_version.downloadable:
codelist_version.csv_data_shas()
12 changes: 11 additions & 1 deletion coding_systems/dmd/tests/test_import_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,11 @@ def coding_systems_database_tmp_dir(coding_systems_tmp_path):
yield coding_systems_tmp_path


def test_import_data(settings, dmd_data, mock_data_download):
def test_import_data(
settings, dmd_data, dmd_version_asthma_medication, mock_data_download
):
cs_release_count = CodingSystemRelease.objects.count()
assert dmd_version_asthma_medication.cached_csv_data == {}

# import mock XML data
# This consists of the AMP 222311000001102 (Ventolin 100micrograms/dose Evohaler)
Expand Down Expand Up @@ -72,6 +75,13 @@ def test_import_data(settings, dmd_data, mock_data_download):
assert mapping.id == "39113611000001102"
assert mapping.vpidprev == "320139002"

# download data on existing CodelistVersions has been cached
dmd_version_asthma_medication.refresh_from_db()
assert (
dmd_version_asthma_medication.cached_csv_data["release"]
== "dmd_2022-100_20221001"
)


def test_import_data_unexpected_file(mock_data_download_bad_zip):
cs_release_count = CodingSystemRelease.objects.count()
Expand Down

0 comments on commit 1bedeec

Please sign in to comment.