Skip to content

Commit

Permalink
ROC tutorial notebook
Browse files Browse the repository at this point in the history
  • Loading branch information
nicholasloveday committed Oct 31, 2023
1 parent c4acfe6 commit 9e50fcb
Show file tree
Hide file tree
Showing 7 changed files with 1,458 additions and 32 deletions.
26 changes: 12 additions & 14 deletions src/scores/categorical/binary_impl.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,25 @@
"""
This module contains methods for binary categories
"""
from typing import Optional, Sequence
from typing import Optional

import numpy as np
import xarray as xr

from scores.functions import apply_weights
from scores.processing import check_binary
from scores.typing import FlexibleDimensionTypes, XarrayLike
from scores.utils import gather_dimensions


def probability_of_detection(
fcst: xr.DataArray,
obs: xr.DataArray,
reduce_dims: Optional[Sequence[str]] = None,
preserve_dims: Optional[Sequence[str]] = None,
fcst: XarrayLike,
obs: XarrayLike,
reduce_dims: FlexibleDimensionTypes = None,
preserve_dims: FlexibleDimensionTypes = None,
weights: Optional[xr.DataArray] = None,
check_args: Optional[bool] = True,
) -> xr.DataArray:
) -> XarrayLike:
"""
Calculates the Probability of Detection (POD), also known as the Hit Rate.
This is the proportion of observed events (obs = 1) that were correctly
Expand Down Expand Up @@ -75,19 +76,17 @@ def probability_of_detection(
hits = hits.sum(dim=dims_to_sum)

pod = hits / (hits + misses)

pod.name = "ctable_probability_of_detection"
return pod


def probability_of_false_detection(
fcst: xr.DataArray,
obs: xr.DataArray,
reduce_dims: Optional[Sequence[str]] = None,
preserve_dims: Optional[Sequence[str]] = None,
fcst: XarrayLike,
obs: XarrayLike,
reduce_dims: FlexibleDimensionTypes = None,
preserve_dims: FlexibleDimensionTypes = None,
weights: Optional[xr.DataArray] = None,
check_args: Optional[bool] = True,
) -> xr.DataArray:
) -> XarrayLike:
"""
Calculates the Probability of False Detection (POFD), also known as False
Alarm Rate (not to be confused with the False Alarm Ratio). The POFD is
Expand Down Expand Up @@ -144,5 +143,4 @@ def probability_of_false_detection(
correct_negatives = correct_negatives.sum(dim=dims_to_sum)

pofd = false_alarms / (false_alarms + correct_negatives)
pofd.name = "ctable_probability_of_false_detection"
return pofd
3 changes: 2 additions & 1 deletion src/scores/probability/roc_impl.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
"""
Implementation of Receiver Operating Characteristic (ROC) calculations
"""
from typing import Iterable, Optional, Sequence
from collections.abc import Iterable, Sequence
from typing import Optional

import numpy as np
import xarray as xr
Expand Down
43 changes: 28 additions & 15 deletions src/scores/processing.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
"""Tools for processing data for verification"""
import operator
from typing import Union
from typing import Optional, Union

import numpy as np
import pandas as pd
import xarray as xr

from scores.typing import FlexibleDimensionTypes, XarrayLike

INEQUALITY_MODES = {
">=": (operator.ge, -1),
">": (operator.gt, 1),
Expand All @@ -17,7 +19,7 @@
EQUALITY_MODES = {"==": (operator.le), "!=": (operator.gt)}


def check_binary(data: Union[xr.DataArray, xr.Dataset], name: str):
def check_binary(data: XarrayLike, name: str):
"""
Checks that data does not have any non-NaN values out of the set {0, 1}
Expand All @@ -27,25 +29,30 @@ def check_binary(data: Union[xr.DataArray, xr.Dataset], name: str):
ValueError: if there are values in `fcst` and `obs` that are not in the
set {0, 1, np.nan} and `check_args` is true.
"""
unique_values = pd.unique(data.values.flatten())
if isinstance(data, xr.DataArray):
unique_values = pd.unique(data.values.flatten())
else:
unique_values = pd.unique(data.to_array().values.flatten())
unique_values = unique_values[~np.isnan(unique_values)]
binary_set = {0, 1}

if not set(unique_values).issubset(binary_set):
raise ValueError(f"`{name}` contains values that are not in the set {{0, 1, np.nan}}")


def comparative_discretise(data, comparison, mode, abs_tolerance=None):
def comparative_discretise(
data: XarrayLike, comparison: Union[xr.DataArray, float, int], mode: str, abs_tolerance: Optional[float] = None
) -> XarrayLike:
"""
Converts the values of `data` to 0 or 1 based on how they relate to the specified
values in `comparison` via the `mode` operator.
Args:
data (xarray.DataArray or xarray.Dataset): The data to convert to
data: The data to convert to
discrete values.
comparison (xarray.DataArray, float or int): The values to which
comparison: The values to which
to compare `data`.
mode (str): Specifies the required relation of `data` to `thresholds`
mode: Specifies the required relation of `data` to `thresholds`
for a value to fall in the 'event' category (i.e. assigned to 1).
Allowed modes are:
- '>=' values in `data` greater than or equal to the
Expand All @@ -60,7 +67,7 @@ def comparative_discretise(data, comparison, mode, abs_tolerance=None):
are assigned as 1
- '!=' values in `data` not equal to the corresponding threshold
are assigned as 1.
abs_tolerance (Optional[float]): If supplied, values in data that are
abs_tolerance: If supplied, values in data that are
within abs_tolerance of a threshold are considered to be equal to
that threshold. This is generally used to correct for floating
point rounding, e.g. we may want to consider 1.0000000000000002 as
Expand Down Expand Up @@ -108,17 +115,23 @@ def comparative_discretise(data, comparison, mode, abs_tolerance=None):
return discrete_data


def binary_discretise(data, thresholds, mode, abs_tolerance=None, autosqueeze=False):
def binary_discretise(
data: XarrayLike,
thresholds: FlexibleDimensionTypes,
mode: str,
abs_tolerance: Optional[float] = None,
autosqueeze: Optional[bool] = False,
):
"""
Converts the values of `data` to 0 or 1 for each threshold in `thresholds`
according to the operation defined by `mode`.
Args:
data (xarray.DataArray or xarray.Dataset): The data to convert to
data: The data to convert to
discrete values.
thresholds (float or iterable): Threshold(s) at which to convert the
thresholds: Threshold(s) at which to convert the
values of `data` to 0 or 1.
mode (str): Specifies the required relation of `data` to `thresholds`
mode: Specifies the required relation of `data` to `thresholds`
for a value to fall in the 'event' category (i.e. assigned to 1).
Allowed modes are:
Expand All @@ -135,13 +148,13 @@ def binary_discretise(data, thresholds, mode, abs_tolerance=None, autosqueeze=Fa
- '!=' values in `data` not equal to the corresponding threshold
are assigned as 1.
abs_tolerance (Optional[float]): If supplied, values in data that are
abs_tolerance: If supplied, values in data that are
within abs_tolerance of a threshold are considered to be equal to
that threshold. This is generally used to correct for floating
point rounding, e.g. we may want to consider 1.0000000000000002 as
equal to 1
autosqueeze (Optional[bool]): If True and only one threshold is
autosqueeze: If True and only one threshold is
supplied, then the dimension 'threshold' is squeezed out of the
output. If `thresholds` is float-like, then this is forced to
True, otherwise defaults to False.
Expand Down Expand Up @@ -184,7 +197,7 @@ def binary_discretise(data, thresholds, mode, abs_tolerance=None, autosqueeze=Fa
return discrete_data


def broadcast_and_match_nan(*args: Union[xr.DataArray, xr.Dataset]):
def broadcast_and_match_nan(*args: XarrayLike) -> XarrayLike:
"""
Input xarray data objects are 'matched' - they are broadcast against each
other (forced to have the same dimensions), and the position of nans are
Expand Down
16 changes: 16 additions & 0 deletions tests/categorical/test_binary.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,14 @@
(fcst_mix, obs1, "a", True, None, expected_poda), # Fcst mix, obs ones, only reduce one dim
(fcst_bad, obs0, None, False, None, expected_pod0), # Don't check for bad data
(fcst_mix, obs1, None, True, weight_array, expected_pod_weighted), # Fcst mixed, obs ones, with weights
(
xr.Dataset({"array1": fcst0, "array2": fcst1}),
xr.Dataset({"array1": obs0, "array2": obs1}),
None,
True,
None,
xr.Dataset({"array1": expected_pod0, "array2": expected_pod1}),
), # Test with Dataset for inputs
],
)
def test_probability_of_detection(fcst, obs, reduce_dims, check_args, weights, expected):
Expand Down Expand Up @@ -90,6 +98,14 @@ def test_probability_of_detection_raises(fcst, obs, error_msg):
(fcst_mix, obs0, "a", True, None, expected_poda), # Fcst mix, obs ones, only reduce one dim
(fcst_bad, obs0, None, False, None, expected_pofd0), # Don't check for bad data
(fcst_mix, obs0, None, True, weight_array, expected_pofd_weighted), # Fcst mixed, obs ones, with weights
(
xr.Dataset({"array1": fcst0, "array2": fcst1}),
xr.Dataset({"array1": obs0, "array2": obs1}),
None,
True,
None,
xr.Dataset({"array1": expected_pofd0, "array2": expected_pofd1}),
), # Test with Dataset for inputs
],
)
def test_probability_of_false_detection(fcst, obs, reduce_dims, check_args, weights, expected):
Expand Down
2 changes: 1 addition & 1 deletion tests/probabilty/test_roc.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@
import dask
import numpy as np
import pytest
import roc_test_data as rtd
import xarray as xr

from scores.probability import roc_curve_data
from tests.probabilty import roc_test_data as rtd


@pytest.mark.parametrize(
Expand Down
3 changes: 2 additions & 1 deletion tests/test_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,14 @@
import pytest
import xarray as xr

from tests import test_processing_data as xtd
from scores.processing import (
binary_discretise,
broadcast_and_match_nan,
check_binary,
comparative_discretise,
)
from tests import test_processing_data as xtd


@pytest.mark.parametrize(
("args", "expected"),
Expand Down
Loading

0 comments on commit 9e50fcb

Please sign in to comment.