Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cache csv download data and shas #1746

Merged
merged 4 commits into from
Nov 16, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion codelists/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -351,7 +351,12 @@ def codelists_check(requests):
return JsonResponse(
{"status": "error", "data": {"error": f"{line} could not be found"}}
)
codelist_download_data[line] = codelist_version.csv_data_shas()
try:
codelist_download_data[line] = codelist_version.csv_data_shas()
except AssertionError:
return JsonResponse(
{"status": "error", "data": {"error": f"{line} is not downloadable"}}
)

# Compare with manifest file
# The manifest file is generated when `opensafely codelists update` is run in a study repo
Expand Down
17 changes: 17 additions & 0 deletions codelists/migrations/0058_codelistversion_cached_csv_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Generated by Django 4.2.4 on 2023-11-15 16:08

from django.db import migrations, models


class Migration(migrations.Migration):
dependencies = [
("codelists", "0057_codelistversion_compatible_releases"),
]

operations = [
migrations.AddField(
model_name="codelistversion",
name="cached_csv_data",
field=models.JSONField(default=dict),
),
]
129 changes: 88 additions & 41 deletions codelists/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from opencodelists.hash_utils import hash, unhash

from .codeset import Codeset
from .coding_systems import CODING_SYSTEMS
from .coding_systems import CODING_SYSTEMS, most_recent_database_alias
from .hierarchy import Hierarchy
from .presenters import present_definition_for_download

Expand Down Expand Up @@ -273,6 +273,8 @@ class CodelistVersion(models.Model):
tag = models.CharField(max_length=12, null=True)
csv_data = models.TextField(verbose_name="CSV data", null=True)

cached_csv_data = models.JSONField(default=dict)

created_at = models.DateTimeField(auto_now_add=True)
updated_at = models.DateTimeField(auto_now=True)

Expand Down Expand Up @@ -542,25 +544,43 @@ def csv_data_for_download(self, fixed_headers=False, include_mapped_vmps=True):
fixed_headers: if True, uploaded csv_data is converted to two columns, headed "code" and "term".
This parameter is ignored for dmd downloads that include mapped VMPs
"""
if self.csv_data:
dmd_with_mapped_vmps = (
self.coding_system_id == "dmd" and include_mapped_vmps
dmd_version_needs_refreshed = (
self.coding_system_id == "dmd"
and include_mapped_vmps
and self.cached_csv_data.get("release") != most_recent_database_alias("dmd")
)
cache_key = f"download_data_fixed_headers_{fixed_headers}_include_mapped_vmps_{include_mapped_vmps}"
if not self.cached_csv_data.get(cache_key) or dmd_version_needs_refreshed:
if self.csv_data:
dmd_with_mapped_vmps = (
self.coding_system_id == "dmd" and include_mapped_vmps
)
if not fixed_headers and not dmd_with_mapped_vmps:
csv_data = self.csv_data
else:
# if fixed_headers were not explicitly requested (i.e. by OSI), include a column
# with the original code column header. This ensures that any new downloads continue
# to work with existing study/data definitions that import codelists with a named
# code column.
# Currently this is likely to only apply to dm+d codelists which have often been
# converted from BNF codelists, and previously used "dmd_id" as the code column
csv_data = rows_to_csv_data(
self.table_with_fixed_headers(
include_mapped_vmps=include_mapped_vmps,
include_original_header=not fixed_headers,
)
)
else:
csv_data = rows_to_csv_data(self.table)
relevant_release = (
most_recent_database_alias("dmd")
if self.coding_system_id == "dmd"
else self.coding_system_release.database_alias
)
if not fixed_headers and not dmd_with_mapped_vmps:
return self.csv_data
# if fixed_headers were not explicitly requested (i.e. by OSI), include a column
# with the original code column header. This ensures that any new downloads continue
# to work with existing study/data definitions that import codelists with a named
# code column.
# Currently this is likely to only apply to dm+d codelists which have often been
# converted from BNF codelists, and previously used "dmd_id" as the code column
table = self.table_with_fixed_headers(
include_mapped_vmps=include_mapped_vmps,
include_original_header=not fixed_headers,
self.cached_csv_data.update(
{cache_key: csv_data, "release": relevant_release}
)
else:
table = self.table
return rows_to_csv_data(table)
return self.cached_csv_data[cache_key]

def csv_data_sha(self, csv_data=None):
"""
Expand All @@ -573,34 +593,61 @@ def csv_data_sha(self, csv_data=None):
data_for_download = "\n".join(csv_data.splitlines())
return hashlib.sha1(data_for_download.encode()).hexdigest()

def _get_dmd_shas(self, shas, current_csv_data_download):
"""
Return valid shas for a dm+d codelist CSV download
"""
# To try and minimise impact on users, we check if there are mapped
# VMPs for this set of dm+d codes. If there are not, then the user's
# downloaded CSV data will still be OK, so we can check against the
# sha of the old style dm+d download (no mapped VMPs, and uses the
# original headers)
current_csv_data_in_rows = csv_data_to_rows(current_csv_data_download)
if len(current_csv_data_in_rows) == len(self.table):
old_dmd_download = rows_to_csv_data(self.table)
shas.append(self.csv_data_sha(old_dmd_download))
# Also allow CSV downloads that only included the "code" and "term"
# columns (i.e. not the original code column)
# We only care about checking this if there is an original code column
# that would be included (it is included by default, but can be missing
# if the original code header was "code" to start with)
if len(current_csv_data_in_rows[0]) == 3:
fixed_header_data = rows_to_csv_data(
[row[:2] for row in current_csv_data_in_rows]
)
shas.append(self.csv_data_sha(fixed_header_data))
return shas

def csv_data_shas(self):
"""
Return a list of shas that should all be considered valid when
checked against the downloaded data in a study repo.

For non-dm+d codelists, we shoud never need to re-compute these one, as codes don't
change after a codelist version is moved to under-review (and they can't be downloaded
when draft).

For dm+d codelists, we only need to re-compute the shas if the dmd release has changed
since the last time they were computed.
"""
current_csv_data_download = self.csv_data_for_download()
shas = [self.csv_data_sha(csv_data=current_csv_data_download)]
if self.coding_system_id == "dmd":
# To try and minimise impact on users, we check if there are mapped
# VMPs for this set of dm+d codes. If there are not, then the user's
# downloaded CSV data will still be OK, so we can check against the
# sha of the old style dm+d download (no mapped VMPs, and uses the
# original headers)
current_csv_data_in_rows = csv_data_to_rows(current_csv_data_download)
if len(current_csv_data_in_rows) == len(self.table):
old_dmd_download = rows_to_csv_data(self.table)
shas.append(self.csv_data_sha(old_dmd_download))
# Also allow CSV downloads that only included the "code" and "term"
# columns (i.e. not the original code column)
# We only care about checking this if there is an original code column
# that would be included (it is included by default, but can be missing
# if the original code header was "code" to start with)
if len(current_csv_data_in_rows[0]) == 3:
fixed_header_data = rows_to_csv_data(
[row[:2] for row in current_csv_data_in_rows]
)
shas.append(self.csv_data_sha(fixed_header_data))
return shas
if not self.cached_csv_data or (
self.coding_system_id == "dmd"
and self.cached_csv_data.get("release") != most_recent_database_alias("dmd")
):
# reset the cache so we refresh any stored dmd data
self.cached_csv_data = {}
current_csv_data_download = self.csv_data_for_download()
shas = [self.csv_data_sha(csv_data=current_csv_data_download)]
if self.coding_system_id == "dmd":
shas = self._get_dmd_shas(shas, current_csv_data_download)
release = most_recent_database_alias("dmd")
else:
release = self.coding_system_release.database_alias

self.cached_csv_data.update({"shas": shas, "release": release})
self.save()

return self.cached_csv_data["shas"]

def table_with_fixed_headers(
self, include_mapped_vmps=True, include_original_header=False
Expand Down
38 changes: 38 additions & 0 deletions codelists/tests/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -671,6 +671,10 @@ def test_codelists_check_changes(client, dmd_version_asthma_medication):
data = {"codelists": codelist_id, "manifest": json.dumps(manifest)}
resp = client.post("/api/v1/check/", data)
assert resp.json() == {"status": "ok"}

# mock cache to be related to an older release
dmd_version_asthma_medication.cached_csv_data["release"] = "old"
dmd_version_asthma_medication.save()
# Add a VMP mapping which will be added into the CSV download
VmpPrevMapping.objects.create(id="10514511000001106", vpidprev="999")
resp = client.post("/api/v1/check/", data)
Expand Down Expand Up @@ -817,3 +821,37 @@ def test_codelists_check_sha(version_with_no_searches):
version_with_no_searches.csv_data_sha()
== hashlib.sha1(csv_data_clean.encode()).hexdigest()
)


def test_codelists_check_non_downloadable_dmd(client, dmd_version_asthma_medication):
dmd_version_asthma_medication.csv_data = (
dmd_version_asthma_medication.csv_data.replace("dmd_id", "bad_header")
)
dmd_version_asthma_medication.save()

assert not dmd_version_asthma_medication.downloadable

codelist_id = (
f"{dmd_version_asthma_medication.organisation.slug}/"
f"{dmd_version_asthma_medication.codelist.slug}/"
f"{dmd_version_asthma_medication.hash}"
)
codelist_csv_id = codelist_id.replace("/", "-") + ".csv"

# Test the happy path for this dmd version
manifest = {
"files": {
codelist_csv_id: {
"id": codelist_id,
"url": f"https://opencodelist.org/codelists/{codelist_csv_id}/",
"downloaded_at": "2023-10-04 13:55:17.569997Z",
"sha": "dummy-sha",
},
}
}
data = {"codelists": codelist_id, "manifest": json.dumps(manifest)}
resp = client.post("/api/v1/check/", data)
assert resp.json() == {
"status": "error",
"data": {"error": f"{codelist_id} is not downloadable"},
}
50 changes: 50 additions & 0 deletions codelists/tests/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,3 +305,53 @@ def test_draft_is_not_downloadable(draft_with_no_searches):
assert draft_with_no_searches.downloadable is False
builder_actions.save(draft=draft_with_no_searches)
assert draft_with_no_searches.downloadable is True


def test_cached_csv_data(old_style_version):
assert old_style_version.cached_csv_data == {}
old_style_version.csv_data_shas()
assert old_style_version.cached_csv_data["release"] == "snomedct_test_20200101"
assert (
"download_data_fixed_headers_False_include_mapped_vmps_True"
in old_style_version.cached_csv_data
)
assert (
old_style_version.cached_csv_data[
"download_data_fixed_headers_False_include_mapped_vmps_True"
]
== old_style_version.csv_data
)
assert len(old_style_version.cached_csv_data["shas"]) == 1


def test_cached_csv_data_dmd_codelist(dmd_version_asthma_medication):
clv = dmd_version_asthma_medication
assert clv.cached_csv_data == {}
clv.csv_data_shas()

# take a copy of the cached data
cached_csv_data = {**clv.cached_csv_data}
assert clv.cached_csv_data["release"] == "dmd_test_20200101"
assert (
"download_data_fixed_headers_False_include_mapped_vmps_True"
in clv.cached_csv_data
)
# 3 shas
# - original
# - mapped version, with fixed headers
# - mapped version with fixed headers and extra original code column
assert len(clv.cached_csv_data["shas"]) == 3

clv.csv_data = "code,term"
clv.save()
# csv_data_for_download still returns the cached data
assert clv.csv_data_for_download() != "code,term\r\n"
assert (
clv.csv_data_for_download()
== cached_csv_data["download_data_fixed_headers_False_include_mapped_vmps_True"]
)

# make the release not the latest one so the data needs to be re-cached
clv.cached_csv_data["release"] = "old"
clv.save()
assert clv.csv_data_for_download() == "code,term\r\n"
17 changes: 17 additions & 0 deletions coding_systems/dmd/import_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from lxml import etree
from tqdm import tqdm

from codelists.models import CodelistVersion
from coding_systems.base.import_data_utils import CodingSystemImporter
from coding_systems.dmd import models
from mappings.dmdvmpprevmap.models import Mapping
Expand Down Expand Up @@ -57,6 +58,7 @@ def import_release(
import_coding_system(Path(tempdir), database_alias)

update_vmp_prev_mapping(database_alias)
update_cached_download_data()


def import_coding_system(release_dir, database_alias):
Expand Down Expand Up @@ -314,3 +316,18 @@ def update_vmp_prev_mapping(database_alias):
desc="Update VMP previous mapping",
):
Mapping.objects.get_or_create(id=vmp.id, vpidprev=vmp.vpidprev)


def update_cached_download_data():
"""
Refresh the cached_csv_data by calling csv_data_shas for each non-draft dmd
CodelistVersion. This will also cache the default download data.
"""
logger.info("Updating cached download data")
for codelist_version in tqdm(
CodelistVersion.objects.filter(codelist__coding_system_id="dmd").exclude(
status="draft"
)
):
if codelist_version.downloadable:
codelist_version.csv_data_shas()
12 changes: 11 additions & 1 deletion coding_systems/dmd/tests/test_import_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,11 @@ def coding_systems_database_tmp_dir(coding_systems_tmp_path):
yield coding_systems_tmp_path


def test_import_data(settings, dmd_data, mock_data_download):
def test_import_data(
settings, dmd_data, dmd_version_asthma_medication, mock_data_download
):
cs_release_count = CodingSystemRelease.objects.count()
assert dmd_version_asthma_medication.cached_csv_data == {}

# import mock XML data
# This consists of the AMP 222311000001102 (Ventolin 100micrograms/dose Evohaler)
Expand Down Expand Up @@ -72,6 +75,13 @@ def test_import_data(settings, dmd_data, mock_data_download):
assert mapping.id == "39113611000001102"
assert mapping.vpidprev == "320139002"

# download data on existing CodelistVersions has been cached
dmd_version_asthma_medication.refresh_from_db()
assert (
dmd_version_asthma_medication.cached_csv_data["release"]
== "dmd_2022-100_20221001"
)


def test_import_data_unexpected_file(mock_data_download_bad_zip):
cs_release_count = CodingSystemRelease.objects.count()
Expand Down