Merge pull request #1727 from opensafely-core/codelist-download-checks

Check dm+d codelists against possible previous versions of downloaded CSVs
opensafely-core · Nov 3, 2023 · 5da8530 · 5da8530
2 parents 3ace42e + 89419c2
commit 5da8530
Show file tree

Hide file tree

Showing 4 changed files with 271 additions and 29 deletions.
diff --git a/codelists/api.py b/codelists/api.py
@@ -351,7 +351,7 @@ def codelists_check(requests):
                 return JsonResponse(
                     {"status": "error", "data": {"error": f"{line} could not be found"}}
                 )
-        codelist_download_data[line] = codelist_version.csv_data_sha()
+        codelist_download_data[line] = codelist_version.csv_data_shas()
 
     # Compare with manifest file
     # The manifest file is generated when `opensafely codelists update` is run in a study repo
@@ -373,7 +373,7 @@ def codelists_check(requests):
         file_data["id"]
         for file_data in manifest["files"].values()
         if file_data["id"] in codelist_download_data
-        and file_data["sha"] != codelist_download_data[file_data["id"]]
+        and file_data["sha"] not in codelist_download_data[file_data["id"]]
     ]
     if new_files or removed_files or changed:
         return JsonResponse(

diff --git a/codelists/models.py b/codelists/models.py
@@ -548,22 +548,63 @@ def csv_data_for_download(self, fixed_headers=False, include_mapped_vmps=True):
             )
             if not fixed_headers and not dmd_with_mapped_vmps:
                 return self.csv_data
-            table = self.table_with_fixed_headers(include_mapped_vmps)
+            # if fixed_headers were not explicitly requested (i.e. by OSI), include a column
+            # with the original code column header. This ensures that any new downloads continue
+            # to work with existing study/data definitions that import codelists with a named
+            # code column.
+            # Currently this is likely to only apply to dm+d codelists which have often been
+            # converted from BNF codelists, and previously used "dmd_id" as the code column
+            table = self.table_with_fixed_headers(
+                include_mapped_vmps=include_mapped_vmps,
+                include_original_header=not fixed_headers,
+            )
         else:
             table = self.table
         return rows_to_csv_data(table)
 
-    def csv_data_sha(self):
+    def csv_data_sha(self, csv_data=None):
         """
         sha of CSV data for download with default parameters. This matches the method
         used to hash the CSVs downloaded in a study repo.
         # In order to avoid different OS messing with line endings, opensafely-cli
         # splits the lines and rejoins them before hashing.
         """
-        data_for_download = "\n".join(self.csv_data_for_download().splitlines())
+        csv_data = csv_data or self.csv_data_for_download()
+        data_for_download = "\n".join(csv_data.splitlines())
         return hashlib.sha1(data_for_download.encode()).hexdigest()
 
-    def table_with_fixed_headers(self, include_mapped_vmps=True):
+    def csv_data_shas(self):
+        """
+        Return a list of shas that should all be considered valid when
+        checked against the downloaded data in a study repo.
+        """
+        current_csv_data_download = self.csv_data_for_download()
+        shas = [self.csv_data_sha(csv_data=current_csv_data_download)]
+        if self.coding_system_id == "dmd":
+            # To try and minimise impact on users, we check if there are mapped
+            # VMPs for this set of dm+d codes. If there are not, then the user's
+            # downloaded CSV data will still be OK, so we can check against the
+            # sha of the old style dm+d download (no mapped VMPs, and uses the
+            # original headers)
+            current_csv_data_in_rows = csv_data_to_rows(current_csv_data_download)
+            if len(current_csv_data_in_rows) == len(self.table):
+                old_dmd_download = rows_to_csv_data(self.table)
+                shas.append(self.csv_data_sha(old_dmd_download))
+            # Also allow CSV downloads that only included the "code" and "term"
+            # columns (i.e. not the original code column)
+            # We only care about checking this if there is an original code column
+            # that would be included (it is included by default, but can be missing
+            # if the original code header was "code" to start with)
+            if len(current_csv_data_in_rows[0]) == 3:
+                fixed_header_data = rows_to_csv_data(
+                    [row[:2] for row in current_csv_data_in_rows]
+                )
+                shas.append(self.csv_data_sha(fixed_header_data))
+        return shas
+
+    def table_with_fixed_headers(
+        self, include_mapped_vmps=True, include_original_header=False
+    ):
         """
         Find the code and term columns from csv data (which may be labelled with different
         headers), and return just those columns with the with the headers "code" and "term".
@@ -588,6 +629,9 @@ def table_with_fixed_headers(self, include_mapped_vmps=True):
         )
         # Identify the index for the two columns we want
         code_header_index = header_row.index(code_header)
+        if include_original_header and code_header == "code":
+            # avoid duplicate columns if the code header is already "code"
+            include_original_header = False
         term_header_index = header_row.index(term_header) if term_header else None
 
         table_rows = self.table[1:]
@@ -646,13 +690,24 @@ def add_row(vmp, description):
                     f"VMP subsequent to {', '.join(subsequent_vmps_to_add[subsequent_vmp])}",
                 )
 
+        headers = ["code", "term"]
+        if include_original_header:
+            headers += [header_row[code_header_index]]
+
         # re-write the table data with the new headers, and only the code and term columns
+        # plus a duplicate code column with the original column header if required
+        def _csv_row(row):
+            csv_row = [
+                row[code_header_index],
+                row[term_header_index] if term_header else "",
+            ]
+            if include_original_header:
+                csv_row += [row[code_header_index]]
+            return csv_row
+
         table_data = [
-            ["code", "term"],
-            *[
-                [row[code_header_index], row[term_header_index] if term_header else ""]
-                for row in table_rows
-            ],
+            headers,
+            *[_csv_row(row) for row in table_rows],
         ]
         return table_data
 

diff --git a/codelists/tests/test_api.py b/codelists/tests/test_api.py
@@ -681,6 +681,115 @@ def test_codelists_check_changes(client, dmd_version_asthma_medication):
     }
 
 
+@pytest.mark.parametrize(
+    "downloaded_data",
+    [
+        (  # default download (as expected from a current `opensafely codelists update`)
+            "code,term,dmd_id\r\n"
+            "10514511000001106,Adrenaline (base) 220micrograms/dose inhaler,10514511000001106\r\n"
+            "10525011000001107,Adrenaline (base) 220micrograms/dose inhaler refill,10525011000001107\r\n"
+        ),
+        (  # old-style download (from before VMPs were mapped into dmd downloads)
+            "dmd_type,dmd_id,dmd_name,bnf_code\r\n"
+            "VMP,10514511000001106,Adrenaline (base) 220micrograms/dose inhaler,0301012A0AAABAB\r\n"
+            "VMP,10525011000001107,Adrenaline (base) 220micrograms/dose inhaler refill,0301012A0AAACAC\r\n"
+        ),
+        (  # download excluding original code column
+            "code,term\r\n"
+            "10514511000001106,Adrenaline (base) 220micrograms/dose inhaler\r\n"
+            "10525011000001107,Adrenaline (base) 220micrograms/dose inhaler refill\r\n"
+        ),
+    ],
+)
+def test_codelists_check_dmd_alternative_downloads(
+    client, dmd_version_asthma_medication, downloaded_data
+):
+    # Test that any of these versions of a downloaded CSV are
+    # still considered OK
+    codelist_id = (
+        f"{dmd_version_asthma_medication.organisation.slug}/"
+        f"{dmd_version_asthma_medication.codelist.slug}/"
+        f"{dmd_version_asthma_medication.hash}"
+    )
+    codelist_csv_id = codelist_id.replace("/", "-") + ".csv"
+
+    manifest = {
+        "files": {
+            codelist_csv_id: {
+                "id": codelist_id,
+                "url": f"https://opencodelist.org/codelists/{codelist_csv_id}/",
+                "downloaded_at": "2023-10-04 13:55:17.569997Z",
+                "sha": dmd_version_asthma_medication.csv_data_sha(
+                    csv_data=downloaded_data
+                ),
+            },
+        }
+    }
+    data = {"codelists": codelist_id, "manifest": json.dumps(manifest)}
+    resp = client.post("/api/v1/check/", data)
+    assert resp.json() == {"status": "ok"}
+
+
+@pytest.mark.parametrize(
+    "downloaded_data,expected_status",
+    [
+        (
+            (  # default download (as expected from a current `opensafely codelists update`)
+                "code,term,dmd_id\r\n"
+                "10514511000001106,Adrenaline (base) 220micrograms/dose inhaler,10514511000001106\r\n"
+                "10525011000001107,Adrenaline (base) 220micrograms/dose inhaler refill,10525011000001107\r\n"
+                "999,VMP previous to 10514511000001106,999\r\n"
+            ),
+            "ok",
+        ),
+        (
+            (  # old-style download (from before VMPs were mapped into dmd downloads)
+                "dmd_type,dmd_id,dmd_name,bnf_code\r\n"
+                "VMP,10514511000001106,Adrenaline (base) 220micrograms/dose inhaler,0301012A0AAABAB\r\n"
+                "VMP,10525011000001107,Adrenaline (base) 220micrograms/dose inhaler refill,0301012A0AAACAC\r\n"
+            ),
+            "error",
+        ),
+        (
+            (  # download excluding original code column
+                "code,term\r\n"
+                "10514511000001106,Adrenaline (base) 220micrograms/dose inhaler\r\n"
+                "10525011000001107,Adrenaline (base) 220micrograms/dose inhaler refill\r\n"
+                "999,VMP previous to 10514511000001106\r\n"
+            ),
+            "ok",
+        ),
+    ],
+)
+def test_codelists_check_dmd_alternative_downloads_with_vmp_mappings(
+    client, dmd_version_asthma_medication, downloaded_data, expected_status
+):
+    # Add a VMP mapping which will be added into the CSV download
+    VmpPrevMapping.objects.create(id="10514511000001106", vpidprev="999")
+    codelist_id = (
+        f"{dmd_version_asthma_medication.organisation.slug}/"
+        f"{dmd_version_asthma_medication.codelist.slug}/"
+        f"{dmd_version_asthma_medication.hash}"
+    )
+    codelist_csv_id = codelist_id.replace("/", "-") + ".csv"
+
+    manifest = {
+        "files": {
+            codelist_csv_id: {
+                "id": codelist_id,
+                "url": f"https://opencodelist.org/codelists/{codelist_csv_id}/",
+                "downloaded_at": "2023-10-04 13:55:17.569997Z",
+                "sha": dmd_version_asthma_medication.csv_data_sha(
+                    csv_data=downloaded_data
+                ),
+            },
+        }
+    }
+    data = {"codelists": codelist_id, "manifest": json.dumps(manifest)}
+    resp = client.post("/api/v1/check/", data)
+    assert resp.json()["status"] == expected_status
+
+
 def test_codelists_check_sha(version_with_no_searches):
     # The CSV data download contains \r\n line endings
     assert version_with_no_searches.csv_data_for_download() == (

diff --git a/codelists/tests/views/test_version_download.py b/codelists/tests/views/test_version_download.py
@@ -55,7 +55,60 @@ def test_get_with_mapped_vmps(client, dmd_version_asthma_medication):
 
     rsp = client.get(dmd_version_asthma_medication.get_download_url())
     data = rsp.content.decode("utf8")
+    # Includes mapped VMPs  and uses fixed headers by default
+    # Includes an additional column with the original code header
+    assert csv_data_to_rows(data) == [
+        ["code", "term", "dmd_id"],
+        [
+            "10514511000001106",
+            "Adrenaline (base) 220micrograms/dose inhaler",
+            "10514511000001106",
+        ],
+        [
+            "10525011000001107",
+            "Adrenaline (base) 220micrograms/dose inhaler refill",
+            "10525011000001107",
+        ],
+        ["999", "VMP previous to 10514511000001106", "999"],
+        ["888", "VMP subsequent to 10514511000001106", "888"],
+    ]
+
+
+def test_get_with_mapped_vmps_and_original_code_column(
+    client, dmd_version_asthma_medication
+):
+    dmd_version_asthma_medication.csv_data = (
+        dmd_version_asthma_medication.csv_data.replace("dmd_id", "code")
+    )
+    dmd_version_asthma_medication.save()
+
+    # create a previous mapping for one of the dmd codes
+    Mapping.objects.create(id="10514511000001106", vpidprev="999")
+
+    rsp = client.get(dmd_version_asthma_medication.get_download_url())
+    data = rsp.content.decode("utf8")
+    # Includes mapped VMPs by default, and uses fixed headers
+    # No additional column when the original code column was already "code"
+    assert csv_data_to_rows(data) == [
+        ["code", "term"],
+        ["10514511000001106", "Adrenaline (base) 220micrograms/dose inhaler"],
+        ["10525011000001107", "Adrenaline (base) 220micrograms/dose inhaler refill"],
+        ["999", "VMP previous to 10514511000001106"],
+    ]
+
+
+def test_get_with_mapped_vmps_and_fixed_headers(client, dmd_version_asthma_medication):
+    # create a previous mapping for one of the dmd codes
+    Mapping.objects.create(id="10514511000001106", vpidprev="999")
+    # create a new mapping for one of the dmd codes
+    Mapping.objects.create(id="888", vpidprev="10514511000001106")
+
+    rsp = client.get(
+        dmd_version_asthma_medication.get_download_url() + "?fixed-headers"
+    )
+    data = rsp.content.decode("utf8")
     # Includes mapped VMPs by default, and uses fixed headers
+    # No additional column with the original code header when fixed headers are explictly requested
     assert csv_data_to_rows(data) == [
         ["code", "term"],
         ["10514511000001106", "Adrenaline (base) 220micrograms/dose inhaler"],
@@ -84,12 +137,20 @@ def test_get_with_duplicated_mapped_vmps(client, dmd_version_asthma_medication):
     data = rsp.content.decode("utf8")
     # Includes mapped VMPs by default, and uses fixed headers
     assert csv_data_to_rows(data) == [
-        ["code", "term"],
-        ["10514511000001106", "Adrenaline (base) 220micrograms/dose inhaler"],
-        ["10525011000001107", "Adrenaline (base) 220micrograms/dose inhaler refill"],
-        ["999", "VMP previous to 10514511000001106, 10525011000001107"],
-        ["777", "VMP subsequent to 10514511000001106"],
-        ["888", "VMP subsequent to 10514511000001106"],
+        ["code", "term", "dmd_id"],
+        [
+            "10514511000001106",
+            "Adrenaline (base) 220micrograms/dose inhaler",
+            "10514511000001106",
+        ],
+        [
+            "10525011000001107",
+            "Adrenaline (base) 220micrograms/dose inhaler refill",
+            "10525011000001107",
+        ],
+        ["999", "VMP previous to 10514511000001106, 10525011000001107", "999"],
+        ["777", "VMP subsequent to 10514511000001106", "777"],
+        ["888", "VMP subsequent to 10514511000001106", "888"],
     ]
 
 
@@ -99,10 +160,19 @@ def test_get_with_mapped_vmps_nothing_to_map(client, dmd_version_asthma_medicati
     Mapping.objects.create(id="111", vpidprev="999")
     rsp = client.get(dmd_version_asthma_medication.get_download_url())
     data = rsp.content.decode("utf8")
+
     assert csv_data_to_rows(data) == [
-        ["code", "term"],
-        ["10514511000001106", "Adrenaline (base) 220micrograms/dose inhaler"],
-        ["10525011000001107", "Adrenaline (base) 220micrograms/dose inhaler refill"],
+        ["code", "term", "dmd_id"],
+        [
+            "10514511000001106",
+            "Adrenaline (base) 220micrograms/dose inhaler",
+            "10514511000001106",
+        ],
+        [
+            "10525011000001107",
+            "Adrenaline (base) 220micrograms/dose inhaler refill",
+            "10525011000001107",
+        ],
     ]
 
 
@@ -144,13 +214,21 @@ def test_get_with_mapped_vmps_more_than_one_step_distant(
     rsp = client.get(dmd_version_asthma_medication.get_download_url())
     data = rsp.content.decode("utf8")
     assert csv_data_to_rows(data) == [
-        ["code", "term"],
-        ["10514511000001106", "Adrenaline (base) 220micrograms/dose inhaler"],
-        ["10525011000001107", "Adrenaline (base) 220micrograms/dose inhaler refill"],
-        ["666", "VMP previous to 10514511000001106"],
-        ["777", "VMP previous to 10514511000001106"],
-        ["999", "VMP previous to 10514511000001106"],
-        ["AAA", "VMP subsequent to 10514511000001106"],
-        ["BBB", "VMP subsequent to 10514511000001106"],
-        ["CCC", "VMP subsequent to 10514511000001106"],
+        ["code", "term", "dmd_id"],
+        [
+            "10514511000001106",
+            "Adrenaline (base) 220micrograms/dose inhaler",
+            "10514511000001106",
+        ],
+        [
+            "10525011000001107",
+            "Adrenaline (base) 220micrograms/dose inhaler refill",
+            "10525011000001107",
+        ],
+        ["666", "VMP previous to 10514511000001106", "666"],
+        ["777", "VMP previous to 10514511000001106", "777"],
+        ["999", "VMP previous to 10514511000001106", "999"],
+        ["AAA", "VMP subsequent to 10514511000001106", "AAA"],
+        ["BBB", "VMP subsequent to 10514511000001106", "BBB"],
+        ["CCC", "VMP subsequent to 10514511000001106", "CCC"],
     ]