Review of Murphy tutorial #129

Merged
merged 9 commits into from Feb 14, 2024
2 changes: 1 addition & 1 deletion .github/workflows/run-pre-commit.yml
@@ -12,7 +12,7 @@ permissions:
jobs:
build:

- runs-on: ubuntu-latest
+ runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v3
1 change: 1 addition & 0 deletions docs/api.md
@@ -18,6 +18,7 @@
.. autofunction:: scores.continuous.murphy_score
.. autofunction:: scores.continuous.murphy.thetas
.. autofunction:: scores.continuous.flip_flop_index
+ .. autofunction:: scores.continuous.flip_flop_index_proportion_exceeding
```

## scores.probability
11 changes: 4 additions & 7 deletions docs/contributing.md
@@ -7,13 +7,10 @@ Types of contributions include bug reports, merge requests, feature requests, an
These guidelines aim to make it clear how to collaborate effectively.

## Roadmap

- 1. Further development of examples and demonstrations for new users
- 2. Set up of readthedocs and travis-ci
- 3. Add further tests for NaNs and missing data
- 4. Addition of more standard metrics
- 5. Addition of more novel metrics
- 6. Performance review and tests for dask
+ 1. Addition of more scores, metrics and statistical techniques
+ 2. Further optimisation and performance improvements
+ 3. Increased support for machine learning library integration
+ 4. Additional notebooks exploring complex use cases in depth

## Bug Reports and Feature Requests

5 changes: 4 additions & 1 deletion src/scores/continuous/__init__.py
@@ -1,7 +1,10 @@
"""
Import the functions from the implementations into the public API
"""
- from scores.continuous.flip_flop_impl import flip_flop_index
+ from scores.continuous.flip_flop_impl import (
+     flip_flop_index,
+     flip_flop_index_proportion_exceeding,
+ )
from scores.continuous.isoreg import isotonic_fit
from scores.continuous.murphy_impl import murphy_score, murphy_thetas
from scores.continuous.quantile_loss_impl import quantile_score
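The re-export pattern above means user code imports from the package, not the implementation module; a minimal sketch (not part of the diff):

```python
# Both functions resolve from the public package after this change,
# so callers never import from flip_flop_impl directly.
from scores.continuous import (
    flip_flop_index,
    flip_flop_index_proportion_exceeding,
)
```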
108 changes: 99 additions & 9 deletions src/scores/continuous/flip_flop_impl.py
@@ -9,7 +9,8 @@
import xarray as xr

from scores.functions import angular_difference
- from scores.typing import XarrayLike
+ from scores.processing import proportion_exceeding
+ from scores.typing import FlexibleDimensionTypes, XarrayLike
from scores.utils import DimensionError, check_dims, dims_complement


@@ -34,8 +35,7 @@ def _flip_flop_index(data: xr.DataArray, sampling_dim: str, is_angular: bool = F
"""
# check that `sampling_dim` is in `data`.
check_dims(data, [sampling_dim], mode="superset")

- # the maximum possible number of discrete flipflops
+ # the maximum possible number of discrete flip_flops
sequence_length = len(data[sampling_dim])
max_possible_flip_flop_count = sequence_length - 2

@@ -50,21 +50,21 @@ def _flip_flop_index(data: xr.DataArray, sampling_dim: str, is_angular: bool = F
# maximum possible angular difference between two forecasts
enc_size = encompassing_sector_size(data=data, dims=dims_to_preserve)
range_val = np.clip(enc_size, a_min=None, a_max=180.0)
- flipflop = angular_difference(data.shift({sampling_dim: 1}), data)
+ flip_flop = angular_difference(data.shift({sampling_dim: 1}), data)
else:
max_val = data.max(dim=sampling_dim, skipna=False)
min_val = data.min(dim=sampling_dim, skipna=False)
range_val = max_val - min_val
# subtract each consecutive 'row' from each other
- flipflop = data.shift({sampling_dim: 1}) - data
+ flip_flop = data.shift({sampling_dim: 1}) - data

# take the absolute value and sum.
# I don't do skipna=False here because .shift makes a row of nan
- flipflop = abs(flipflop).sum(dim=sampling_dim)
+ flip_flop = abs(flip_flop).sum(dim=sampling_dim)
# adjust based on the range. This is where nan will be introduced.
- flipflop = flipflop - range_val
+ flip_flop = flip_flop - range_val
# normalise
- return flipflop / max_possible_flip_flop_count
+ return flip_flop / max_possible_flip_flop_count
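To make the normalisation concrete, here is a minimal NumPy sketch of the non-angular arithmetic on a single 1-D series (the values are station 10001 from the doctest further down; illustrative only, not part of the diff):

```python
import numpy as np

# One 1-D sample along sampling_dim.
x = np.array([50.0, 20.0, 40.0, 80.0])
total_movement = np.abs(np.diff(x)).sum()  # |20-50| + |40-20| + |80-40| = 90
range_val = x.max() - x.min()              # 80 - 20 = 60
max_possible_flip_flops = len(x) - 2       # sequence_length - 2 = 2
print((total_movement - range_val) / max_possible_flip_flops)  # 15.0
```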


# If there are selections, a Dataset is always returned
@@ -98,7 +98,7 @@ def flip_flop_index(
direction).
**selections: Additional keyword arguments specify
subsets to draw from the dimension `sampling_dim` of the supplied `data`
- before calculation of the flipflop index. e.g. days123=[1, 2, 3]
+ before calculation of the flip_flop index. e.g. days123=[1, 2, 3]

Returns:
If `selections` are not supplied: An xarray.DataArray, the Flip-flop
@@ -348,3 +348,93 @@ def _encompassing_sector_size_np(
n_unique_angles = (angular_diffs != 0).sum(axis=0)
result = np.where(n_unique_angles <= 2, np.max(angular_diffs, axis=0), result)
return result
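The angular branch of `_flip_flop_index` leans on `angular_difference` from `scores.functions`; as a hedged illustration of the usual wrap-aware definition (the actual implementation may differ):

```python
import numpy as np

def angular_difference_sketch(a: np.ndarray, b: np.ndarray) -> np.ndarray:
    """Smallest angle between two directions in degrees: 350 vs 10 is 20 apart."""
    diff = np.abs(a - b) % 360.0
    return np.where(diff > 180.0, 360.0 - diff, diff)

print(angular_difference_sketch(np.array([350.0]), np.array([10.0])))  # [20.]
```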


def flip_flop_index_proportion_exceeding(
data: xr.DataArray,
sampling_dim: str,
thresholds: Iterable,
is_angular: bool = False,
preserve_dims: FlexibleDimensionTypes = None,
reduce_dims: FlexibleDimensionTypes = None,
**selections: Iterable[int],
):
"""
Calculates the flip-flop index and returns the proportion exceeding
(or equal to) each of the supplied `thresholds`.

Args:
data: Data from which to draw subsets.
sampling_dim: The name of the dimension along which to calculate
the flip-flop index.
thresholds: The proportion of Flip-Flop Index results
equal to or exceeding these thresholds will be calculated.
is_angular: Specifies whether `data` is directional data (e.g. wind
direction).
reduce_dims: Dimensions to reduce.
preserve_dims: Dimensions to preserve.
**selections: Additional keyword arguments specify
subsets to draw from the dimension `sampling_dim` of the supplied `data`
before calculation of the flip_flop index. e.g. days123=[1, 2, 3]

Returns:
If `selections` are not supplied - An xarray.DataArray with dimensions
`dims` + 'threshold'. The DataArray gives the proportion of Flip-Flop
Index values, calculated by collapsing dimension `sampling_dim`, that
exceed or equal `thresholds`.

If `selections` are supplied - An xarray.Dataset with dimensions `dims`
+ 'threshold'. There is a data variable for each keyword in
`selections`; each corresponds to the Flip-Flop Index proportion
exceeding for the subset of data specified by the keyword values.

Examples:
>>> data = xr.DataArray(
... [[50, 20, 40, 80], [10, 50, 10, 100], [0, 30, 20, 50]],
... dims=['station_number', 'lead_day'],
... coords=[[10001, 10002, 10003], [1, 2, 3, 4]]
... )

>>> flip_flop_index_proportion_exceeding(data, 'lead_day', [20])
<xarray.DataArray (threshold: 1)>
array([ 0.33333333])
Coordinates:
* threshold (threshold) int64 20
Attributes:
sampling_dim: lead_day

>>> flip_flop_index_proportion_exceeding(
... data, 'lead_day', [20], days123=[1, 2, 3], all_days=[1, 2, 3, 4]
... )
<xarray.Dataset>
Dimensions: (threshold: 1)
Coordinates:
* threshold (threshold) int64 20
Data variables:
days123 (threshold) float64 0.6667
all_days (threshold) float64 0.3333
Attributes:
selections: {'days123': [1, 2, 3], 'all_days': [1, 2, 3, 4]}
sampling_dim: lead_day

See also:
`scores.continuous.flip_flop_index`

"""
if preserve_dims is not None and sampling_dim in list(preserve_dims):
raise DimensionError(
f"`sampling_dim`: '{sampling_dim}' must not be in dimensions to preserve "
f"`preserve_dims`: {list(preserve_dims)}"
)
if reduce_dims is not None and sampling_dim in list(reduce_dims):
raise DimensionError(
f"`sampling_dim`: '{sampling_dim}' must not be in dimensions to reduce "
f"`reduce_dims`: {list(reduce_dims)}"
)
# calculate the flip-flop index
flip_flop_data = flip_flop_index(data, sampling_dim, is_angular=is_angular, **selections)
# calculate the proportion exceeding each threshold
flip_flop_exceeding = proportion_exceeding(flip_flop_data, thresholds, reduce_dims, preserve_dims)
# overwrite the attributes
flip_flop_exceeding.attrs = flip_flop_data.attrs

return flip_flop_exceeding
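A short sketch of the guard clauses above, reusing `data` from the docstring examples (illustrative only):

```python
# sampling_dim must be collapsed when the flip-flop index is calculated,
# so asking to preserve (or reduce) it is rejected up front:
flip_flop_index_proportion_exceeding(
    data, "lead_day", [20], preserve_dims=["lead_day"]
)
# DimensionError: `sampling_dim`: 'lead_day' must not be in dimensions to preserve ...
```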
122 changes: 122 additions & 0 deletions src/scores/processing.py
@@ -1,12 +1,14 @@
"""Tools for processing data for verification"""
import operator
from collections.abc import Iterable
from typing import Optional, Union

import numpy as np
import pandas as pd
import xarray as xr

from scores.typing import FlexibleDimensionTypes, XarrayLike
from scores.utils import gather_dimensions

INEQUALITY_MODES = {
">=": (operator.ge, -1),
@@ -260,3 +262,123 @@ def update_mask(mask, data_array):

# return matched data objects
return tuple(arg.where(mask) for arg in args)


def proportion_exceeding(
data: XarrayLike,
thresholds: Iterable,
preserve_dims: FlexibleDimensionTypes = None,
reduce_dims: FlexibleDimensionTypes = None,
):
"""
Calculates the proportion of `data` equal to or exceeding `thresholds`.

Args:
data (xarray.Dataset or xarray.DataArray): The data from which
to calculate the proportion exceeding `thresholds`.
thresholds (iterable): The proportion of `data` values
equal to or exceeding these thresholds will be calculated.
reduce_dims: Dimensions to reduce.
preserve_dims: Dimensions to preserve.

Returns:
An xarray data object with the type of `data` and dimensions
`dims` + 'threshold'. The values are the proportion of `data`
that are greater than or equal to the corresponding threshold.

"""
return _binary_discretise_proportion(data, thresholds, ">=", preserve_dims, reduce_dims)
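A minimal usage sketch, assuming the import path implied by this diff:

```python
import xarray as xr

from scores.processing import proportion_exceeding

data = xr.DataArray([0.1, 0.4, 0.4, 0.9])
# Proportion of values >= 0.0, 0.25 and 0.5 respectively: 4/4, 3/4, 1/4.
print(proportion_exceeding(data, [0.0, 0.25, 0.5]).values)  # [1.   0.75 0.25]
```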


def _binary_discretise_proportion(
data: XarrayLike,
thresholds: Iterable,
mode: str,
preserve_dims: FlexibleDimensionTypes = None,
reduce_dims: FlexibleDimensionTypes = None,
abs_tolerance: Optional[float] = None,
autosqueeze: bool = False,
):
"""
Returns the proportion of `data` in each category. The categories are
defined by the relationship of data to threshold as specified by
the operation `mode`.

Args:
data: The data to convert
into 0 and 1 according to the thresholds before calculating the
proportion.
thresholds: The thresholds to compare `data` against; the proportion
of `data` in each category is calculated for each threshold.
mode: Specifies the required relation of `data` to `thresholds`
for a value to fall in the 'event' category (i.e. assigned to 1).
Allowed modes are:

- '>=' values in `data` greater than or equal to the
corresponding threshold are assigned as 1.
- '>' values in `data` greater than the corresponding threshold
are assigned as 1.
- '<=' values in `data` less than or equal to the corresponding
threshold are assigned as 1.
- '<' values in `data` less than the corresponding threshold
are assigned as 1.
- '==' values in `data` equal to the corresponding threshold
are assigned as 1.
- '!=' values in `data` not equal to the corresponding threshold
are assigned as 1.
reduce_dims: Dimensions to reduce.
preserve_dims: Dimensions to preserve.
abs_tolerance: If supplied, values in data that are
within abs_tolerance of a threshold are considered to be equal to
that threshold. This is generally used to correct for floating
point rounding, e.g. we may want to consider 1.0000000000000002 as
equal to 1.
autosqueeze: If True and only one threshold is
supplied, then the dimension 'threshold' is squeezed out of the
output. If `thresholds` is float-like, then this is forced to
True, otherwise defaults to False.

Returns:
An xarray data object with the type of `data`, dimension `dims` +
'threshold'. The values of the output are the proportion of `data` that
satisfy the relationship to `thresholds` as specified by `mode`.

Examples:

>>> data = xr.DataArray([0, 0.5, 0.5, 1])

>>> _binary_discretise_proportion(data, [0, 0.5, 1], '==')
<xarray.DataArray (threshold: 3)>
array([ 0.25, 0.5 , 0.25])
Coordinates:
* threshold (threshold) float64 0.0 0.5 1.0
Attributes:
discretisation_tolerance: 0
discretisation_mode: ==

>>> _binary_discretise_proportion(data, [0, 0.5, 1], '>=')
<xarray.DataArray (threshold: 3)>
array([ 1. , 0.75, 0.25])
Coordinates:
* threshold (threshold) float64 0.0 0.5 1.0
Attributes:
discretisation_tolerance: 0
discretisation_mode: >=

See also:
`scores.processing.binary_discretise`

"""
# values are 1 when (data {mode} threshold), and 0 when ~(data {mode} threshold).
discrete_data = binary_discretise(data, thresholds, mode, abs_tolerance=abs_tolerance, autosqueeze=autosqueeze)

# The proportion in each category
dims = gather_dimensions(data.dims, data.dims, preserve_dims, reduce_dims)
proportion = discrete_data.mean(dim=dims)

# attach attributes
proportion.attrs = discrete_data.attrs

return proportion
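One detail worth a sketch is `abs_tolerance` under the '==' mode; since the helper is private, this is an illustration of intent rather than public API usage:

```python
import xarray as xr

# A value within abs_tolerance of the threshold counts as equal to it,
# so exactly one of the two values below lands in the '==' category.
data = xr.DataArray([1.0000000000000002, 2.0])
out = _binary_discretise_proportion(data, [1.0], "==", abs_tolerance=1e-9)
print(float(out))  # 0.5
```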