Skip to content

Commit

Permalink
Merge pull request #1727 from opensafely-core/codelist-download-checks
Browse files Browse the repository at this point in the history
Check dm+d codelists against possible previous versions of downloaded CSVs
  • Loading branch information
rebkwok authored Nov 3, 2023
2 parents 3ace42e + 89419c2 commit 5da8530
Show file tree
Hide file tree
Showing 4 changed files with 271 additions and 29 deletions.
4 changes: 2 additions & 2 deletions codelists/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -351,7 +351,7 @@ def codelists_check(requests):
return JsonResponse(
{"status": "error", "data": {"error": f"{line} could not be found"}}
)
codelist_download_data[line] = codelist_version.csv_data_sha()
codelist_download_data[line] = codelist_version.csv_data_shas()

# Compare with manifest file
# The manifest file is generated when `opensafely codelists update` is run in a study repo
Expand All @@ -373,7 +373,7 @@ def codelists_check(requests):
file_data["id"]
for file_data in manifest["files"].values()
if file_data["id"] in codelist_download_data
and file_data["sha"] != codelist_download_data[file_data["id"]]
and file_data["sha"] not in codelist_download_data[file_data["id"]]
]
if new_files or removed_files or changed:
return JsonResponse(
Expand Down
73 changes: 64 additions & 9 deletions codelists/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -548,22 +548,63 @@ def csv_data_for_download(self, fixed_headers=False, include_mapped_vmps=True):
)
if not fixed_headers and not dmd_with_mapped_vmps:
return self.csv_data
table = self.table_with_fixed_headers(include_mapped_vmps)
# if fixed_headers were not explicitly requested (i.e. by OSI), include a column
# with the original code column header. This ensures that any new downloads continue
# to work with existing study/data definitions that import codelists with a named
# code column.
# Currently this is likely to only apply to dm+d codelists which have often been
# converted from BNF codelists, and previously used "dmd_id" as the code column
table = self.table_with_fixed_headers(
include_mapped_vmps=include_mapped_vmps,
include_original_header=not fixed_headers,
)
else:
table = self.table
return rows_to_csv_data(table)

def csv_data_sha(self):
def csv_data_sha(self, csv_data=None):
"""
sha of CSV data for download with default parameters. This matches the method
used to hash the CSVs downloaded in a study repo.
# In order to avoid different OS messing with line endings, opensafely-cli
# splits the lines and rejoins them before hashing.
"""
data_for_download = "\n".join(self.csv_data_for_download().splitlines())
csv_data = csv_data or self.csv_data_for_download()
data_for_download = "\n".join(csv_data.splitlines())
return hashlib.sha1(data_for_download.encode()).hexdigest()

def table_with_fixed_headers(self, include_mapped_vmps=True):
def csv_data_shas(self):
"""
Return a list of shas that should all be considered valid when
checked against the downloaded data in a study repo.
"""
current_csv_data_download = self.csv_data_for_download()
shas = [self.csv_data_sha(csv_data=current_csv_data_download)]
if self.coding_system_id == "dmd":
# To try and minimise impact on users, we check if there are mapped
# VMPs for this set of dm+d codes. If there are not, then the user's
# downloaded CSV data will still be OK, so we can check against the
# sha of the old style dm+d download (no mapped VMPs, and uses the
# original headers)
current_csv_data_in_rows = csv_data_to_rows(current_csv_data_download)
if len(current_csv_data_in_rows) == len(self.table):
old_dmd_download = rows_to_csv_data(self.table)
shas.append(self.csv_data_sha(old_dmd_download))
# Also allow CSV downloads that only included the "code" and "term"
# columns (i.e. not the original code column)
# We only care about checking this if there is an original code column
# that would be included (it is included by default, but can be missing
# if the original code header was "code" to start with)
if len(current_csv_data_in_rows[0]) == 3:
fixed_header_data = rows_to_csv_data(
[row[:2] for row in current_csv_data_in_rows]
)
shas.append(self.csv_data_sha(fixed_header_data))
return shas

def table_with_fixed_headers(
self, include_mapped_vmps=True, include_original_header=False
):
"""
Find the code and term columns from csv data (which may be labelled with different
headers), and return just those columns with the with the headers "code" and "term".
Expand All @@ -588,6 +629,9 @@ def table_with_fixed_headers(self, include_mapped_vmps=True):
)
# Identify the index for the two columns we want
code_header_index = header_row.index(code_header)
if include_original_header and code_header == "code":
# avoid duplicate columns if the code header is already "code"
include_original_header = False
term_header_index = header_row.index(term_header) if term_header else None

table_rows = self.table[1:]
Expand Down Expand Up @@ -646,13 +690,24 @@ def add_row(vmp, description):
f"VMP subsequent to {', '.join(subsequent_vmps_to_add[subsequent_vmp])}",
)

headers = ["code", "term"]
if include_original_header:
headers += [header_row[code_header_index]]

# re-write the table data with the new headers, and only the code and term columns
# plus a duplicate code column with the original column header if required
def _csv_row(row):
csv_row = [
row[code_header_index],
row[term_header_index] if term_header else "",
]
if include_original_header:
csv_row += [row[code_header_index]]
return csv_row

table_data = [
["code", "term"],
*[
[row[code_header_index], row[term_header_index] if term_header else ""]
for row in table_rows
],
headers,
*[_csv_row(row) for row in table_rows],
]
return table_data

Expand Down
109 changes: 109 additions & 0 deletions codelists/tests/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -681,6 +681,115 @@ def test_codelists_check_changes(client, dmd_version_asthma_medication):
}


@pytest.mark.parametrize(
"downloaded_data",
[
( # default download (as expected from a current `opensafely codelists update`)
"code,term,dmd_id\r\n"
"10514511000001106,Adrenaline (base) 220micrograms/dose inhaler,10514511000001106\r\n"
"10525011000001107,Adrenaline (base) 220micrograms/dose inhaler refill,10525011000001107\r\n"
),
( # old-style download (from before VMPs were mapped into dmd downloads)
"dmd_type,dmd_id,dmd_name,bnf_code\r\n"
"VMP,10514511000001106,Adrenaline (base) 220micrograms/dose inhaler,0301012A0AAABAB\r\n"
"VMP,10525011000001107,Adrenaline (base) 220micrograms/dose inhaler refill,0301012A0AAACAC\r\n"
),
( # download excluding original code column
"code,term\r\n"
"10514511000001106,Adrenaline (base) 220micrograms/dose inhaler\r\n"
"10525011000001107,Adrenaline (base) 220micrograms/dose inhaler refill\r\n"
),
],
)
def test_codelists_check_dmd_alternative_downloads(
client, dmd_version_asthma_medication, downloaded_data
):
# Test that any of these versions of a downloaded CSV are
# still considered OK
codelist_id = (
f"{dmd_version_asthma_medication.organisation.slug}/"
f"{dmd_version_asthma_medication.codelist.slug}/"
f"{dmd_version_asthma_medication.hash}"
)
codelist_csv_id = codelist_id.replace("/", "-") + ".csv"

manifest = {
"files": {
codelist_csv_id: {
"id": codelist_id,
"url": f"https://opencodelist.org/codelists/{codelist_csv_id}/",
"downloaded_at": "2023-10-04 13:55:17.569997Z",
"sha": dmd_version_asthma_medication.csv_data_sha(
csv_data=downloaded_data
),
},
}
}
data = {"codelists": codelist_id, "manifest": json.dumps(manifest)}
resp = client.post("/api/v1/check/", data)
assert resp.json() == {"status": "ok"}


@pytest.mark.parametrize(
"downloaded_data,expected_status",
[
(
( # default download (as expected from a current `opensafely codelists update`)
"code,term,dmd_id\r\n"
"10514511000001106,Adrenaline (base) 220micrograms/dose inhaler,10514511000001106\r\n"
"10525011000001107,Adrenaline (base) 220micrograms/dose inhaler refill,10525011000001107\r\n"
"999,VMP previous to 10514511000001106,999\r\n"
),
"ok",
),
(
( # old-style download (from before VMPs were mapped into dmd downloads)
"dmd_type,dmd_id,dmd_name,bnf_code\r\n"
"VMP,10514511000001106,Adrenaline (base) 220micrograms/dose inhaler,0301012A0AAABAB\r\n"
"VMP,10525011000001107,Adrenaline (base) 220micrograms/dose inhaler refill,0301012A0AAACAC\r\n"
),
"error",
),
(
( # download excluding original code column
"code,term\r\n"
"10514511000001106,Adrenaline (base) 220micrograms/dose inhaler\r\n"
"10525011000001107,Adrenaline (base) 220micrograms/dose inhaler refill\r\n"
"999,VMP previous to 10514511000001106\r\n"
),
"ok",
),
],
)
def test_codelists_check_dmd_alternative_downloads_with_vmp_mappings(
client, dmd_version_asthma_medication, downloaded_data, expected_status
):
# Add a VMP mapping which will be added into the CSV download
VmpPrevMapping.objects.create(id="10514511000001106", vpidprev="999")
codelist_id = (
f"{dmd_version_asthma_medication.organisation.slug}/"
f"{dmd_version_asthma_medication.codelist.slug}/"
f"{dmd_version_asthma_medication.hash}"
)
codelist_csv_id = codelist_id.replace("/", "-") + ".csv"

manifest = {
"files": {
codelist_csv_id: {
"id": codelist_id,
"url": f"https://opencodelist.org/codelists/{codelist_csv_id}/",
"downloaded_at": "2023-10-04 13:55:17.569997Z",
"sha": dmd_version_asthma_medication.csv_data_sha(
csv_data=downloaded_data
),
},
}
}
data = {"codelists": codelist_id, "manifest": json.dumps(manifest)}
resp = client.post("/api/v1/check/", data)
assert resp.json()["status"] == expected_status


def test_codelists_check_sha(version_with_no_searches):
# The CSV data download contains \r\n line endings
assert version_with_no_searches.csv_data_for_download() == (
Expand Down
114 changes: 96 additions & 18 deletions codelists/tests/views/test_version_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,60 @@ def test_get_with_mapped_vmps(client, dmd_version_asthma_medication):

rsp = client.get(dmd_version_asthma_medication.get_download_url())
data = rsp.content.decode("utf8")
# Includes mapped VMPs and uses fixed headers by default
# Includes an additional column with the original code header
assert csv_data_to_rows(data) == [
["code", "term", "dmd_id"],
[
"10514511000001106",
"Adrenaline (base) 220micrograms/dose inhaler",
"10514511000001106",
],
[
"10525011000001107",
"Adrenaline (base) 220micrograms/dose inhaler refill",
"10525011000001107",
],
["999", "VMP previous to 10514511000001106", "999"],
["888", "VMP subsequent to 10514511000001106", "888"],
]


def test_get_with_mapped_vmps_and_original_code_column(
client, dmd_version_asthma_medication
):
dmd_version_asthma_medication.csv_data = (
dmd_version_asthma_medication.csv_data.replace("dmd_id", "code")
)
dmd_version_asthma_medication.save()

# create a previous mapping for one of the dmd codes
Mapping.objects.create(id="10514511000001106", vpidprev="999")

rsp = client.get(dmd_version_asthma_medication.get_download_url())
data = rsp.content.decode("utf8")
# Includes mapped VMPs by default, and uses fixed headers
# No additional column when the original code column was already "code"
assert csv_data_to_rows(data) == [
["code", "term"],
["10514511000001106", "Adrenaline (base) 220micrograms/dose inhaler"],
["10525011000001107", "Adrenaline (base) 220micrograms/dose inhaler refill"],
["999", "VMP previous to 10514511000001106"],
]


def test_get_with_mapped_vmps_and_fixed_headers(client, dmd_version_asthma_medication):
# create a previous mapping for one of the dmd codes
Mapping.objects.create(id="10514511000001106", vpidprev="999")
# create a new mapping for one of the dmd codes
Mapping.objects.create(id="888", vpidprev="10514511000001106")

rsp = client.get(
dmd_version_asthma_medication.get_download_url() + "?fixed-headers"
)
data = rsp.content.decode("utf8")
# Includes mapped VMPs by default, and uses fixed headers
# No additional column with the original code header when fixed headers are explictly requested
assert csv_data_to_rows(data) == [
["code", "term"],
["10514511000001106", "Adrenaline (base) 220micrograms/dose inhaler"],
Expand Down Expand Up @@ -84,12 +137,20 @@ def test_get_with_duplicated_mapped_vmps(client, dmd_version_asthma_medication):
data = rsp.content.decode("utf8")
# Includes mapped VMPs by default, and uses fixed headers
assert csv_data_to_rows(data) == [
["code", "term"],
["10514511000001106", "Adrenaline (base) 220micrograms/dose inhaler"],
["10525011000001107", "Adrenaline (base) 220micrograms/dose inhaler refill"],
["999", "VMP previous to 10514511000001106, 10525011000001107"],
["777", "VMP subsequent to 10514511000001106"],
["888", "VMP subsequent to 10514511000001106"],
["code", "term", "dmd_id"],
[
"10514511000001106",
"Adrenaline (base) 220micrograms/dose inhaler",
"10514511000001106",
],
[
"10525011000001107",
"Adrenaline (base) 220micrograms/dose inhaler refill",
"10525011000001107",
],
["999", "VMP previous to 10514511000001106, 10525011000001107", "999"],
["777", "VMP subsequent to 10514511000001106", "777"],
["888", "VMP subsequent to 10514511000001106", "888"],
]


Expand All @@ -99,10 +160,19 @@ def test_get_with_mapped_vmps_nothing_to_map(client, dmd_version_asthma_medicati
Mapping.objects.create(id="111", vpidprev="999")
rsp = client.get(dmd_version_asthma_medication.get_download_url())
data = rsp.content.decode("utf8")

assert csv_data_to_rows(data) == [
["code", "term"],
["10514511000001106", "Adrenaline (base) 220micrograms/dose inhaler"],
["10525011000001107", "Adrenaline (base) 220micrograms/dose inhaler refill"],
["code", "term", "dmd_id"],
[
"10514511000001106",
"Adrenaline (base) 220micrograms/dose inhaler",
"10514511000001106",
],
[
"10525011000001107",
"Adrenaline (base) 220micrograms/dose inhaler refill",
"10525011000001107",
],
]


Expand Down Expand Up @@ -144,13 +214,21 @@ def test_get_with_mapped_vmps_more_than_one_step_distant(
rsp = client.get(dmd_version_asthma_medication.get_download_url())
data = rsp.content.decode("utf8")
assert csv_data_to_rows(data) == [
["code", "term"],
["10514511000001106", "Adrenaline (base) 220micrograms/dose inhaler"],
["10525011000001107", "Adrenaline (base) 220micrograms/dose inhaler refill"],
["666", "VMP previous to 10514511000001106"],
["777", "VMP previous to 10514511000001106"],
["999", "VMP previous to 10514511000001106"],
["AAA", "VMP subsequent to 10514511000001106"],
["BBB", "VMP subsequent to 10514511000001106"],
["CCC", "VMP subsequent to 10514511000001106"],
["code", "term", "dmd_id"],
[
"10514511000001106",
"Adrenaline (base) 220micrograms/dose inhaler",
"10514511000001106",
],
[
"10525011000001107",
"Adrenaline (base) 220micrograms/dose inhaler refill",
"10525011000001107",
],
["666", "VMP previous to 10514511000001106", "666"],
["777", "VMP previous to 10514511000001106", "777"],
["999", "VMP previous to 10514511000001106", "999"],
["AAA", "VMP subsequent to 10514511000001106", "AAA"],
["BBB", "VMP subsequent to 10514511000001106", "BBB"],
["CCC", "VMP subsequent to 10514511000001106", "CCC"],
]

0 comments on commit 5da8530

Please sign in to comment.