diff --git a/resources/healthsystem/consumables/ResourceFile_Consumables_Items_and_Packages.csv b/resources/healthsystem/consumables/ResourceFile_Consumables_Items_and_Packages.csv
index 0ee403abb0..f3589757c7 100644
--- a/resources/healthsystem/consumables/ResourceFile_Consumables_Items_and_Packages.csv
+++ b/resources/healthsystem/consumables/ResourceFile_Consumables_Items_and_Packages.csv
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4106c2e3ae068d40b115857885b673bec3e1114be5183c0a4ae0366560e2a5c9
-size 249391
+oid sha256:596a1bc8d570f341da180fea6db1836c181f6a2a984a9c7f9b4990b78df8e689
+size 215244
diff --git a/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv b/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv
index 25249531b2..43d8a7b653 100644
--- a/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv
+++ b/resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c358a643e4def0e574b75f89f83d77f9c3366f668422e005150f4d69ebe8d7a7
-size 6169152
+oid sha256:daa5490827d6857323fc837f928b8d983444d35489c8db7512191833a456d483
+size 10086795
diff --git a/resources/healthsystem/consumables/ResourceFile_consumables_matched.csv b/resources/healthsystem/consumables/ResourceFile_consumables_matched.csv
index 7ab675ecba..ce28143182 100644
--- a/resources/healthsystem/consumables/ResourceFile_consumables_matched.csv
+++ b/resources/healthsystem/consumables/ResourceFile_consumables_matched.csv
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b5b0f417681cbdd2489e2f9c6634b2825c32beb9637dc045b56e308c910a102c
-size 90569
+oid sha256:3b1e2cdb4905e48b6ca1340376afb0604a593240cad3d6a2931c28d11fe438b7
+size 58088
diff --git a/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/clean_fac_locations.py b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/clean_fac_locations.py
deleted file mode 100644
index 3dcd4fe56e..0000000000
--- a/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/clean_fac_locations.py
+++ /dev/null
@@ -1,360 +0,0 @@
-"""
-This script generates GIS data on facilities:
-
-Outputs:
-* ResourceFile_Facility_locations.csv
-* facility_distances.csv
-
-The following variables are added to the dataset generated by consumables_availability_estimation.py:
-1. facility GIS coordinates
-2. Distance and drive time to the corresponding District Health Office
-3. Distance and drive time to the corresponding Regional Medical Store (warehouse)
-
-Inputs:
-Dropbox location - ~05 - Resources/Module-healthsystem/consumables raw files/gis_data/LMISFacilityLocations_raw.xlsx
-
-NB. The contents of this file are commented out because the script requires dependencies that are not included in the
-TLO framework at the time of writing.
-"""
-
-
-"""
-import datetime
-from pathlib import Path
-
-import matplotlib.pyplot as plt
-import numpy as np
-import pandas as pd
-import requests
-import googlemaps as gmaps
-from matplotlib.lines import Line2D
-
-# Path to TLO directory
-outputfilepath = Path("./outputs")
-resourcefilepath = Path("./resources")
-path_for_new_resourcefiles = resourcefilepath / "healthsystem/consumables"
-
-# Set local Dropbox source
-path_to_dropbox = Path(  # <-- point to the TLO dropbox locally
-    'C:/Users/sm2511/Dropbox/Thanzi la Onse'
-    # '/Users/tbh03/Dropbox (SPH Imperial College)/Thanzi la Onse Theme 1 SHARE'
-)
-
-path_to_files_in_the_tlo_dropbox = path_to_dropbox / "05 - Resources/Module-healthsystem/consumables raw files/"
-
-# define a timestamp for script outputs
-timestamp = datetime.datetime.now().strftime("_%Y_%m_%d_%H_%M")
-
-# print the start time of the script
-print('Script Start', datetime.datetime.now().strftime('%H:%M'))
-
-# Use the googlemaps package to obtain GIS coordinates from facility names
-GCODE_URL = 'https://maps.googleapis.com/maps/api/geocode/json?'
-GCODE_KEY = ''  # Placeholder for the Google Maps API key
-# gmaps = gmaps.Client(key=GCODE_KEY)
-
-# 1. Clean Master Health Facility Registry (MHFR) data
-######################################################################
-# Clean locations for facilities for which GIS data was not available or incorrect in the MHFR
-# --- 1.1 Load and set up data --- #
-fac_gis = pd.read_excel(open(path_to_files_in_the_tlo_dropbox / 'gis_data/LMISFacilityLocations_raw.xlsx',
-                             'rb'), sheet_name='final_gis_data')
-fac_gis = fac_gis.rename(
-    columns={'LMIS Facility List': 'fac_name', 'OWNERSHIP': 'fac_owner', 'TYPE': 'fac_type', 'STATUS': 'fac_status',
-             'ZONE': 'zone', 'DISTRICT': 'district', 'DATE OPENED': 'open_date', 'LATITUDE': 'lat',
-             'LONGITUDE': 'long'})
-
-# Create a new column providing the source of GIS data
-fac_gis['gis_source'] = ""
-
-# Store unique district names
-districts = fac_gis['district'].unique()
-
-# Preserve rows with missing or incorrect location data in order to derive GIS data using the googlemaps API
-cond1 = fac_gis['lat'] > -8.5
-cond2 = fac_gis['lat'] < -17.5
-cond3 = fac_gis['long'] > 36.5
-cond4 = fac_gis['long'] < 32.5
-conda = cond1 | cond2 | cond3 | cond4  # outside Malawi's boundaries
-fac_gis_noloc = fac_gis[fac_gis.lat.isna() | conda]
-fac_gis_noloc = fac_gis_noloc.reset_index()
-fac_gis_noloc = fac_gis_noloc.drop(columns='index')
-
-# Edit data source
-cond_originalmhfr = fac_gis.lat.notna() & ~conda
-fac_gis.loc[cond_originalmhfr, 'gis_source'] = 'Master Health Facility Registry'
-cond_manual = fac_gis['manual_entry'].notna()
-fac_gis.loc[cond_manual, 'gis_source'] = 'Manual google search'
-
-fac_gis_clean = fac_gis[~conda & fac_gis.lat.notna()]  # save clean portion of raw data to be appended later
-
-
-# --- 1.2 Geocode facilities with missing data --- #
-# Define a function to geocode locations based on names
-def reverse_gcode(location):
-    location = str(location).replace(' ', '+')
-    nav_req = 'address={}&key={}'.format(location, GCODE_KEY)
-    request = GCODE_URL + nav_req
-    result = requests.get(request)
-    data = result.json()
-    status = data['status']
-
-    geo_location = {}
-    if str(status) == "OK":
-        sizeofjson = len(data['results'][0]['address_components'])
-        for i in range(sizeofjson):
-            sizeoftype = len(data['results'][0]['address_components'][i]['types'])
-            if sizeoftype == 3:
-                geo_location[data['results'][0]['address_components'][i]['types'][2]] = \
-                    data['results'][0]['address_components'][i]['long_name']
-
-            else:
-                if data['results'][0]['address_components'][i]['types'][0] == 'administrative_area_level_1':
-                    geo_location['state'] = data['results'][0]['address_components'][i]['long_name']
-
-                elif data['results'][0]['address_components'][i]['types'][0] == 'administrative_area_level_2':
-                    geo_location['city'] = data['results'][0]['address_components'][i]['long_name']
-                    geo_location['town'] = geo_location['city']
-
-                else:
-                    geo_location[data['results'][0]['address_components'][i]['types'][0]] = \
-                        data['results'][0]['address_components'][i]['long_name']
-
-        geo_location['lat'] = data['results'][0]['geometry']['location']['lat']
-        geo_location['lang'] = data['results'][0]['geometry']['location']['lng']
-        geo_location['formatted_address'] = data['results'][0]['formatted_address']
-
-    return geo_location
-
-
-# Extract latitude, longitude and city based on facility name
-for i in range(len(fac_gis_noloc)):
-    try:
-        print("Processing facility", fac_gis_noloc['fac_name'][i])
-        geo_info = reverse_gcode(fac_gis_noloc['fac_name'][i] + ' Malawi')
-        fac_gis_noloc.loc[i, 'lat'] = geo_info['lat']
-        fac_gis_noloc.loc[i, 'long'] = geo_info['lang']
-        fac_gis_noloc.loc[i, 'gis_source'] = 'Google maps geolocation'
-        fac_gis_noloc.loc[i, 'district'] = geo_info['city']
-    except (ValueError, TypeError, KeyError):
-        pass
-
-# Drop incorrect GIS coordinates from the above generated dataset
-conda = fac_gis_noloc.district.isin(districts)  # district is one of the known Malawi districts
-cond1 = fac_gis_noloc['lat'] > -8.5
-cond2 = fac_gis_noloc['lat'] < -17.5
-cond3 = fac_gis_noloc['long'] > 36.5
-cond4 = fac_gis_noloc['long'] < 32.5
-condb = cond1 | cond2 | cond3 | cond4  # outside Malawi's boundaries
-fac_gis_noloc.loc[~conda | condb, 'lat'] = np.nan
-fac_gis_noloc.loc[~conda | condb, 'long'] = np.nan
-fac_gis_noloc.loc[~conda | condb, 'district'] = np.nan
-
-cond = fac_gis_noloc.gis_source.isna()
-fac_gis_noloc.loc[cond, 'lat'] = np.nan
-fac_gis_noloc.loc[cond, 'long'] = np.nan
-
-# Append newly generated GIS information to the raw data
-fac_gis = pd.concat([fac_gis_noloc, fac_gis_clean], ignore_index=True)
-
-# Drop incorrect GIS coordinates based on later comparison with district data from LMIS
-list_of_incorrect_locations = ['Bilal Clinic', 'Biliwiri Health Centre', 'Chilonga Health care Health Centre',
-                               'Diamphwi Health Centre', 'Matope Health Centre (CHAM)', 'Nambazo Health Centre',
-                               'Nkhwayi Health Centre', 'Nsambe Health Centre (CHAM)', 'Padley Pio Health Centre',
-                               'Phanga Health Centre', 'Somba Clinic', "St. Martin's Molere Health Centre CHAM",
-                               'Ngapani Clinic', 'Mulungu Alinafe Clinic', 'Mdeza Health Centre',
-                               'Matandani Health Centre (CHAM)',
-                               'Sunrise Clinic', 'Sucoma Clinic']
-mapped_to_malawi = fac_gis.lat == -13.254308
-cond = fac_gis.fac_name.isin(list_of_incorrect_locations) | mapped_to_malawi
-fac_gis.loc[cond, 'lat'] = np.nan
-fac_gis.loc[cond, 'long'] = np.nan
-fac_gis.loc[cond, 'gis_source'] = np.nan
-fac_gis.loc[cond, 'district'] = np.nan
-
-# 2. Clean data using information from LMIS #
-#####################################################################################################
-# --- 2.1 Load and set up LMIS data --- #
-stkout_df = pd.read_csv(path_for_new_resourcefiles / "ResourceFile_Consumables_availability_and_usage.csv",
-                        low_memory=False)
-
-# Drop rows which can't be used in regression analysis
-regsubset_cond1 = stkout_df['data_source'] == 'original_lmis_data'
-regsubset_cond2 = stkout_df['fac_type_tlo'] == 'Facility_level_0'  # only one facility from Mchinji reported in OpenLMIS
-stkout_df_reg = stkout_df[regsubset_cond1 & ~regsubset_cond2]
-
-# Clean some district names to match the Master Health Facility Registry
-rename_districts = {
-    'Nkhota Kota': 'Nkhotakota',
-    'Nkhata bay': 'Nkhata Bay'
-}
-stkout_df['district'] = stkout_df['district'].replace(rename_districts)
-
-# Keep only relevant columns
-lmis_district = stkout_df[['fac_name', 'fac_type_tlo', 'district']]
-lmis_district = lmis_district.drop_duplicates()
-
-# --- 2.2 Clean district column and assign relevant DHO to each facility --- #
-# Manual fixes before assigning DHO
-# The Master Health Facility Registry did not differentiate between Mzimba North and Mzimba South --> get this data
-# and any other district discrepancies from LMIS
-fac_gis = fac_gis.rename(columns={'district': 'district_mhfr'})
-fac_gis = pd.merge(fac_gis, lmis_district, how='left', on='fac_name')
-
-list_mhfr_district_is_correct = ['Chididi Health Centre', 'Chikowa Health Centre',
-                                 'Chileka Health Centre']
-cond_mhfr_district_is_correct = fac_gis.fac_name.isin(list_mhfr_district_is_correct)
-cond_lmis_district_missing = fac_gis.district.isna()
-fac_gis.loc[cond_mhfr_district_is_correct | cond_lmis_district_missing, 'district'] = fac_gis.district_mhfr
-fac_gis = fac_gis.drop(columns=['zone', 'district_mhfr', 'open_date', 'manual_entry'])
-
-# --- 2.3 Extract final file with GIS locations into .csv --- #
-fac_gis = fac_gis[fac_gis['lat'].notna()]  # Keep rows with GIS locations
-fac_gis.to_csv(path_for_new_resourcefiles / "ResourceFile_Facility_locations.csv")
-
-# Locate the corresponding DHO for each facility
-cond1 = fac_gis['fac_name'].str.contains('DHO')
-cond2 = fac_gis['fac_name'].str.contains('istrict')
-# Create columns indicating the coordinates of the corresponding DHO for each facility
-dho_df = fac_gis[cond1 | cond2].reset_index()
-# Rename columns
-dho_df = dho_df.rename(columns={'lat': 'lat_dh', 'long': 'long_dh'})
-
-# Merge main GIS dataframe with corresponding DHO
-fac_gis = pd.merge(fac_gis, dho_df[['district', 'lat_dh', 'long_dh']], how='left', on='district')
-
-# --- 2.4 Assign relevant CMST Regional Medical Store to each facility --- #
-# Create columns indicating the coordinates of the corresponding CMST warehouse (regional medical store) for each
-# facility
-fac_gis['lat_rms'] = np.nan
-fac_gis['long_rms'] = np.nan
-fac_gis['rms'] = np.nan
-
-# RMS Center (-13.980394, 33.783521)
-cond_center1 = fac_gis['district'].isin(['Kasungu', 'Ntchisi', 'Dowa', 'Mchinji', 'Lilongwe', 'Ntcheu',
-                                         'Dedza', 'Nkhotakota', 'Salima'])
-cond_center2 = fac_gis['fac_name'].str.contains('Kamuzu Central Hospital')
-fac_gis.loc[cond_center1 | cond_center2, 'lat_rms'] = -13.980394
-fac_gis.loc[cond_center1 | cond_center2, 'long_rms'] = 33.783521
-fac_gis.loc[cond_center1 | cond_center2, 'rms'] = 'RMS Center'
-
-# RMS North (-11.425590, 33.997467)
-cond_north1 = fac_gis['district'].isin(['Nkhata Bay', 'Rumphi', 'Chitipa', 'Likoma', 'Karonga',
-                                        'Mzimba North', 'Mzimba South'])
-cond_north2 = fac_gis['fac_name'].str.contains('Mzuzu Central Hospital')
-fac_gis.loc[cond_north1 | cond_north2, 'lat_rms'] = -11.425590
-fac_gis.loc[cond_north1 | cond_north2, 'long_rms'] = 33.997467
-fac_gis.loc[cond_north1 | cond_north2, 'rms'] = 'RMS North'
-
-# RMS South (-15.804544, 35.021192)
-cond_south1 = fac_gis['district'].isin(['Blantyre', 'Balaka', 'Machinga', 'Zomba', 'Mangochi', 'Thyolo', 'Nsanje',
-                                        'Chikwawa', 'Mwanza', 'Neno', 'Mulanje', 'Phalombe', 'Chiradzulu'])
-cond_south2 = fac_gis['fac_name'].str.contains('Queen Elizabeth Central')
-cond_south3 = fac_gis['fac_name'].str.contains('Zomba Central')
-cond_south4 = fac_gis['fac_name'].str.contains('Zomba Mental')
-fac_gis.loc[cond_south1 | cond_south2 | cond_south3 | cond_south4, 'lat_rms'] = -15.804544
-fac_gis.loc[cond_south1 | cond_south2 | cond_south3 | cond_south4, 'long_rms'] = 35.021192
-fac_gis.loc[cond_south1 | cond_south2 | cond_south3 | cond_south4, 'rms'] = 'RMS South'
-
-# 3. Generate data on distance and travel time between facilities and DHO/RMS #
-#####################################################################################################
-# --- 3.1 Distance and travel time of each facility from the corresponding DHO --- #
-fac_gis['dist_todh'] = np.nan
-fac_gis['drivetime_todh'] = np.nan
-for i in range(len(fac_gis)):
-    try:
-        # print("Processing facility", i)
-        latfac = fac_gis['lat'][i]
-        longfac = fac_gis['long'][i]
-        latdho = fac_gis['lat_dh'][i]
-        longdho = fac_gis['long_dh'][i]
-        origin = (latdho, longdho)
-        dest = (latfac, longfac)
-
-        fac_gis.loc[i, 'dist_todh'] = \
-            gmaps.distance_matrix(origin, dest, mode='driving')['rows'][0]['elements'][0]['distance']['value']
-        fac_gis.loc[i, 'drivetime_todh'] = \
-            gmaps.distance_matrix(origin, dest, mode='driving')['rows'][0]['elements'][0]['duration']['value']
-    except Exception:
-        pass
-
-# --- 3.2 Distance and travel time of each facility from the corresponding RMS --- #
-fac_gis['dist_torms'] = np.nan
-fac_gis['drivetime_torms'] = np.nan
-for i in range(len(fac_gis)):
-    try:
-        # print("Processing facility", i)
-        latfac = fac_gis['lat'][i]
-        longfac = fac_gis['long'][i]
-        latrms = fac_gis['lat_rms'][i]
-        longrms = fac_gis['long_rms'][i]
-        origin = (latrms, longrms)
-        dest = (latfac, longfac)
-
-        fac_gis.loc[i, 'dist_torms'] = \
-            gmaps.distance_matrix(origin, dest, mode='driving')['rows'][0]['elements'][0]['distance']['value']
-        fac_gis.loc[i, 'drivetime_torms'] = \
-            gmaps.distance_matrix(origin, dest, mode='driving')['rows'][0]['elements'][0]['duration']['value']
-    except Exception:
-        pass
-
-# Update distance values from DH to 0 for levels 2 and above
-cond1 = fac_gis['fac_type_tlo'] == 'Facility_level_2'
-cond2 = fac_gis['fac_type_tlo'] == 'Facility_level_3'
-cond3 = fac_gis['fac_type_tlo'] == 'Facility_level_4'
-fac_gis.loc[cond1 | cond2 | cond3, 'dist_todh'] = 0
-fac_gis.loc[cond1 | cond2 | cond3, 'drivetime_todh'] = 0
-
-# 4. Save data to be merged into the consumable availability dataset for regression analysis #
-#####################################################################################################
-# Keep only necessary columns and save as .csv
-fac_gis = fac_gis[['district', 'rms', 'lat', 'long', 'lat_dh', 'long_dh', 'lat_rms', 'long_rms',
-                   'dist_torms', 'drivetime_torms', 'dist_todh', 'drivetime_todh', 'fac_name', 'gis_source']]
-
-# Export distances file to Dropbox #
-fac_gis.to_csv(path_to_files_in_the_tlo_dropbox / 'gis_data/facility_distances.csv')
-
-# 5. Descriptive graphs #
-#####################################################################################################
-groups = fac_gis.groupby('district')
-
-# Scatterplot of distance and drive time to DHO
-fig, ax = plt.subplots()
-ax.margins(0.05)  # Optional, just adds 5% padding to the autoscaling
-for name, group in groups:
-    ax.plot(group.dist_todh / 1000, group.drivetime_todh, marker='o', linestyle='', ms=5, label=name)
-# Shrink current axis by 20% to fit legend
-box = ax.get_position()
-ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
-# Put a legend to the right of the current axis
-ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
-plt.xlabel("Distance (kilometers)", fontsize=12)
-plt.ylabel("Drive time (minutes)", fontsize=12)
-plt.savefig('C:/Users/sm2511/OneDrive - University of York/Desktop/faclocation_wrtdh_new.png')
-
-# Scatterplot of distance and drive time to RMS
-groups = fac_gis.groupby('rms')
-fig, ax = plt.subplots()
-ax.margins(0.05)  # Optional, just adds 5% padding to the autoscaling
-for name, group in groups:
-    ax.plot(group.dist_torms / 1000, group.drivetime_torms, marker='o', linestyle='', ms=5, label=name)
-# Shrink current axis by 20% to fit legend
-box = ax.get_position()
-ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
-# Put a legend to the right of the current axis
-ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
-plt.xlabel("Distance (kilometers)", fontsize=12)
-plt.ylabel("Drive time (minutes)", fontsize=12)
-plt.savefig('C:/Users/sm2511/OneDrive - University of York/Desktop/faclocation_wrtrms.png')
-"""
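The deleted script leaned on the googlemaps client for distances and drive times, the kind of dependency its docstring flags as unavailable in the TLO framework. For straight-line distances only, the haversine formula needs nothing beyond the standard library; a minimal sketch (illustrative only, not part of this changeset), reusing two RMS coordinates hard-coded in the script above:

import math

def haversine_km(lat1, lon1, lat2, lon2):
    # Great-circle distance in km, assuming a spherical Earth of radius 6371 km.
    phi1, phi2 = math.radians(lat1), math.radians(lat2)
    dphi = math.radians(lat2 - lat1)
    dlam = math.radians(lon2 - lon1)
    a = math.sin(dphi / 2) ** 2 + math.cos(phi1) * math.cos(phi2) * math.sin(dlam / 2) ** 2
    return 2 * 6371 * math.asin(math.sqrt(a))

# RMS Center to RMS North, per the coordinates used in the deleted script
print(round(haversine_km(-13.980394, 33.783521, -11.425590, 33.997467), 1), 'km')

Unlike the distance-matrix output, this ignores the road network, so it understates real travel distance; it is only a dependency-free fallback.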
diff --git a/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/consumables_availability_estimation.py b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/consumables_availability_estimation.py
index 3615afd400..d3a58ee15a 100644
--- a/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/consumables_availability_estimation.py
+++ b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/consumables_availability_estimation.py
@@ -30,14 +30,12 @@ from tlo.methods.consumables import check_format_of_consumables_file
 
-# Set local Dropbox source
-path_to_dropbox = Path(  # <-- point to the TLO dropbox locally
-    '/Users/sm2511/Dropbox/Thanzi la Onse'
-    # '/Users/sejjj49/Dropbox/Thanzi la Onse'
-    # 'C:/Users/tmangal/Dropbox/Thanzi la Onse'
+# Set local shared folder source
+path_to_share = Path(  # <-- point to the shared folder
+    '/Users/sm2511/Library/CloudStorage/OneDrive-SharedLibraries-ImperialCollegeLondon/TLOModel - WP - Documents/'
 )
 
-path_to_files_in_the_tlo_dropbox = path_to_dropbox / "05 - Resources/Module-healthsystem/consumables raw files/"
+path_to_files_in_the_tlo_shared_drive = path_to_share / "07 - Data/Consumables data/"
 
 # define a timestamp for script outputs
 timestamp = datetime.datetime.now().strftime("_%Y_%m_%d_%H_%M")
@@ -68,7 +66,7 @@ def change_colnames(df, NameChangeList):  # Change column names
 
 #########################################################################################
 # Import 2018 data
-lmis_df = pd.read_csv(path_to_files_in_the_tlo_dropbox / 'ResourceFile_LMIS_2018.csv', low_memory=False)
+lmis_df = pd.read_csv(path_to_files_in_the_tlo_shared_drive / 'OpenLMIS/2018/ResourceFile_LMIS_2018.csv', low_memory=False)
 
 # 1. BASIC CLEANING ##
 # Rename columns
@@ -515,7 +513,7 @@ def custom_agg_stkout(x):
 unmatched_consumables = unmatched_consumables[unmatched_consumables['item_y'].isna()]
 
 # ** Extract stock availability data from HHFA and clean data **
-hhfa_df = pd.read_excel(path_to_files_in_the_tlo_dropbox / 'ResourceFile_hhfa_consumables.xlsx', sheet_name='hhfa_data')
+hhfa_df = pd.read_excel(path_to_files_in_the_tlo_shared_drive / 'ResourceFile_hhfa_consumables.xlsx', sheet_name='hhfa_data')
 
 # Use the ratio of availability rates between levels 1b on one hand and levels 2 and 3 on the other to extrapolate
 # availability rates for levels 2 and 3 from the HHFA data
@@ -541,7 +539,7 @@ hhfa_df.loc[cond, var] = 1
 
 # Add further assumptions on consumable availability from other sources
-assumptions_df = pd.read_excel(open(path_to_files_in_the_tlo_dropbox / 'ResourceFile_hhfa_consumables.xlsx', 'rb'),
+assumptions_df = pd.read_excel(open(path_to_files_in_the_tlo_shared_drive / 'ResourceFile_hhfa_consumables.xlsx', 'rb'),
                                sheet_name='availability_assumptions')
 assumptions_df = assumptions_df[['item_code', 'available_prop_Facility_level_0',
                                  'available_prop_Facility_level_1a', 'available_prop_Facility_level_1b',
@@ -606,35 +604,57 @@ stkout_df = pd.concat([stkout_df, hhfa_fac0], axis=0, ignore_index=True)
 
 # --- 6.4 Generate new category variable for analysis --- #
-stkout_df['category'] = stkout_df['module_name'].str.lower()
-cond_RH = (stkout_df['category'].str.contains('care_of_women_during_pregnancy')) | \
-          (stkout_df['category'].str.contains('labour'))
-cond_newborn = (stkout_df['category'].str.contains('newborn'))
-cond_childhood = (stkout_df['category'] == 'acute lower respiratory infections') | \
-                 (stkout_df['category'] == 'measles') | \
-                 (stkout_df['category'] == 'diarrhoea')
-cond_rti = stkout_df['category'] == 'road traffic injuries'
-cond_cancer = stkout_df['category'].str.contains('cancer')
-cond_ncds = (stkout_df['category'] == 'epilepsy') | \
-            (stkout_df['category'] == 'depression')
-stkout_df.loc[cond_RH, 'category'] = 'reproductive_health'
-stkout_df.loc[cond_cancer, 'category'] = 'cancer'
-stkout_df.loc[cond_newborn, 'category'] = 'neonatal_health'
-stkout_df.loc[cond_childhood, 'category'] = 'other_childhood_illnesses'
-stkout_df.loc[cond_rti, 'category'] = 'road_traffic_injuries'
-stkout_df.loc[cond_ncds, 'category'] = 'ncds'
-
-cond_condom = stkout_df['item_code'] == 2
-stkout_df.loc[cond_condom, 'category'] = 'contraception'
-
-# Create a general consumables category
-general_cons_list = [300, 33, 57, 58, 141, 5, 6, 10, 21, 23, 127, 24, 80, 93, 144, 149, 154, 40, 67, 73, 76,
-                     82, 101, 103, 88, 126, 135, 71, 98, 171, 133, 134, 244, 247]
-diagnostics_cons_list = [41, 50, 128, 216, 2008, 47, 190, 191, 196, 206, 207, 163, 175, 184,
-                         187]  # for now these have not been applied because most diagnostics are program specific
-
-cond_general = stkout_df['item_code'].isin(general_cons_list)
-stkout_df.loc[cond_general, 'category'] = 'general'
+def recategorize_modules_into_consumable_categories(_df):
+    _df['item_category'] = _df['module_name'].str.lower()
+    cond_RH = (_df['item_category'].str.contains('care_of_women_during_pregnancy')) | \
+              (_df['item_category'].str.contains('labour'))
+    cond_newborn = (_df['item_category'].str.contains('newborn'))
+    cond_newborn[cond_newborn.isna()] = False
+    cond_childhood = (_df['item_category'] == 'acute lower respiratory infections') | \
+                     (_df['item_category'] == 'measles') | \
+                     (_df['item_category'] == 'diarrhoea')
+    cond_rti = _df['item_category'] == 'road traffic injuries'
+    cond_cancer = _df['item_category'].str.contains('cancer')
+    cond_cancer[cond_cancer.isna()] = False
+    cond_ncds = (_df['item_category'] == 'epilepsy') | \
+                (_df['item_category'] == 'depression')
+    _df.loc[cond_RH, 'item_category'] = 'reproductive_health'
+    _df.loc[cond_cancer, 'item_category'] = 'cancer'
+    _df.loc[cond_newborn, 'item_category'] = 'neonatal_health'
+    _df.loc[cond_childhood, 'item_category'] = 'other_childhood_illnesses'
+    _df.loc[cond_rti, 'item_category'] = 'road_traffic_injuries'
+    _df.loc[cond_ncds, 'item_category'] = 'ncds'
+    cond_condom = _df['item_code'] == 2
+    _df.loc[cond_condom, 'item_category'] = 'contraception'
+
+    # Create a general consumables category
+    general_cons_list = [300, 33, 57, 58, 141, 5, 6, 10, 21, 23, 127, 24, 80, 93, 144, 149, 154, 40, 67, 73, 76,
+                         82, 101, 103, 88, 126, 135, 71, 98, 171, 133, 134, 244, 247, 49, 112, 1933, 1960, 9, 19,
+                         20, 47, 50, 75, 128]
+    cond_general = _df['item_code'].isin(general_cons_list)
+    _df.loc[cond_general, 'item_category'] = 'general'
+
+    # Fill gaps in categories
+    dict_for_missing_categories = {
+        292: 'acute lower respiratory infections', 293: 'acute lower respiratory infections',
+        307: 'reproductive_health', 2019: 'reproductive_health',
+        2678: 'tb', 1171: 'other_childhood_illnesses', 1237: 'cancer', 1239: 'cancer',
+        10: 'reproductive_health', 39: 'reproductive_health', 41: 'reproductive_health',
+        64: 'neonatal_health', 117: 'reproductive_health', 150: 'epi', 151: 'epi',
+        153: 'epi', 155: 'epi', 157: 'epi', 158: 'epi', 175: 'tb', 184: 'tb', 190: 'hiv',
+        197: 'hiv', 216: 'cardiometabolicdisorders', 234: 'cardiometabolicdisorders',
+        261: 'cancer', 280: 'ncds', 285: 'other_childhood_illnesses',
+        286: 'other_childhood_illnesses', 1197: 'epi', 1221: 'other_childhood_illnesses',
+        2064: 'ncds', 2670: 'reproductive_health'}
+    # Use map to create a new series from item_code, then fill missing values in item_category
+    mapped_categories = _df['item_code'].map(dict_for_missing_categories)
+    _df['item_category'] = _df['item_category'].fillna(mapped_categories)
+
+    return _df
+
+
+stkout_df = recategorize_modules_into_consumable_categories(stkout_df)
+item_code_category_mapping = stkout_df[['item_category', 'item_code']].drop_duplicates()
 
 # --- 6.5 Replace district/fac_name/month entries where missing --- #
 for var in ['district', 'fac_name', 'month']:
@@ -822,12 +842,14 @@ def interpolate_missing_with_mean(_ser):
 # Check that there are not missing values
 assert not pd.isnull(full_set_interpolated).any().any()
 
+full_set_interpolated = full_set_interpolated.reset_index().merge(item_code_category_mapping, on='item_code', how='left', validate='m:1')
+
 # --- Check that the exported file has the properties required of it by the model code. --- #
-check_format_of_consumables_file(df=full_set_interpolated.reset_index(), fac_ids=fac_ids)
+check_format_of_consumables_file(df=full_set_interpolated, fac_ids=fac_ids)
 
 # %%
 # Save
-full_set_interpolated.reset_index().to_csv(
+full_set_interpolated.to_csv(
     path_for_new_resourcefiles / "ResourceFile_Consumables_availability_small.csv",
     index=False
 )
@@ -849,7 +871,7 @@ def interpolate_missing_with_mean(_ser):
 hhfa_comparison_df = hhfa_comparison_df.rename({'fac_type_tlo': 'Facility_Level'}, axis=1)
 
 # ii. Collapse final model availability data by facility level
-final_availability_df = full_set_interpolated.reset_index()
+final_availability_df = full_set_interpolated
 mfl = pd.read_csv(resourcefilepath / "healthsystem" / "organisation" / "ResourceFile_Master_Facilities_List.csv")
 final_availability_df = pd.merge(final_availability_df, mfl[['District', 'Facility_Level', 'Facility_ID']],
                                  how="left", on=['Facility_ID'],
@@ -871,7 +893,6 @@ def interpolate_missing_with_mean(_ser):
 size = 10
 comparison_df['consumable_labels'] = comparison_df['consumable_name_tlo'].str[:10]
 
-
 # Define function to draw calibration plots at different levels of disaggregation
 def comparison_plot(level_of_disaggregation, group_by_var, colour):
     comparison_df_agg = comparison_df.groupby([group_by_var],
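The merge added just before the format check is what carries the new item_category column into ResourceFile_Consumables_availability_small.csv, and validate='m:1' makes pandas assert that every item_code maps to exactly one category. A toy sketch of the same pattern (column names follow the resource file; the values are illustrative only):

import pandas as pd

availability = pd.DataFrame({
    'Facility_ID': [0, 0, 1],
    'month': [1, 2, 1],
    'item_code': [2, 2678, 2],
    'available_prop': [0.8, 0.5, 0.6],
})
mapping = pd.DataFrame({'item_category': ['contraception', 'tb'],
                        'item_code': [2, 2678]})

# validate='m:1' raises pandas.errors.MergeError if an item_code appears twice in
# `mapping`, i.e. if drop_duplicates() had left one item_code with two categories.
merged = availability.merge(mapping, on='item_code', how='left', validate='m:1')
print(merged)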
diff --git a/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/descriptive_stats.py b/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/descriptive_stats.py
deleted file mode 100644
index fc5c775bce..0000000000
--- a/src/scripts/data_file_processing/healthsystem/consumables/consumable_resource_analyses_with_lmis/descriptive_stats.py
+++ /dev/null
@@ -1,63 +0,0 @@
-"""
-This script generates the consumables availability dataset for regression analysis using the outputs of
-
-consumables_availability_estimation.py and clean_fac_locations.py
-
-and generates descriptive figures and tables.
-"""
-import datetime
-from pathlib import Path
-
-import pandas as pd
-
-# import numpy as np
-# import calendar
-# import copy
-# import matplotlib.pyplot as plt
-# from matplotlib.lines import Line2D
-# from matplotlib import pyplot  # for figures
-# import seaborn as sns
-# import math
-
-# Path to TLO directory
-outputfilepath = Path("./outputs")
-resourcefilepath = Path("./resources")
-path_for_new_resourcefiles = resourcefilepath / "healthsystem/consumables"
-
-# Set local Dropbox source
-path_to_dropbox = Path(  # <-- point to the TLO dropbox locally
-    'C:/Users/sm2511/Dropbox/Thanzi la Onse'
-)
-
-path_to_files_in_the_tlo_dropbox = path_to_dropbox / "05 - Resources/Module-healthsystem/consumables raw files/"
-
-# define a timestamp for script outputs
-timestamp = datetime.datetime.now().strftime("_%Y_%m_%d_%H_%M")
-
-# print the start time of the script
-print('Script Start', datetime.datetime.now().strftime('%H:%M'))
-
-# 1. DATA IMPORT AND CLEANING #
-#########################################################################################
-# --- 1.1 Import consumables availability data --- #
-stkout_df = pd.read_csv(path_for_new_resourcefiles / "ResourceFile_Consumables_availability_and_usage.csv",
-                        low_memory=False)
-
-# Drop rows which can't be used in regression analysis
-regsubset_cond1 = stkout_df['data_source'] == 'original_lmis_data'
-regsubset_cond2 = stkout_df['fac_type_tlo'] == 'Facility_level_0'  # only one facility from Mchinji reported in OpenLMIS
-stkout_df_reg = stkout_df[regsubset_cond1 & ~regsubset_cond2]
-
-# Clean some district names to match the Master Health Facility Registry
-rename_districts = {
-    'Nkhota Kota': 'Nkhotakota',
-    'Nkhata bay': 'Nkhata Bay'
-}
-stkout_df['district'] = stkout_df['district'].replace(rename_districts)
-
-# --- 1.2 Import GIS data --- #
-fac_gis = pd.read_csv(path_to_files_in_the_tlo_dropbox / "gis_data/facility_distances.csv")
-
-# --- 1.3 Merge cleaned LMIS data with GIS data --- #
-consumables_df = pd.merge(stkout_df.drop(columns=['district', 'Unnamed: 0']), fac_gis.drop(columns=['Unnamed: 0']),
-                          how='left', on='fac_name')
-consumables_df.to_csv(path_to_files_in_the_tlo_dropbox / 'consumables_df.csv')
diff --git a/src/scripts/data_file_processing/healthsystem/consumables/processing_data_from_one_health/generate_consumables_item_codes_and_packages.py b/src/scripts/data_file_processing/healthsystem/consumables/processing_data_from_one_health/generate_consumables_item_codes_and_packages.py
index 7ca04f763f..db6af01154 100644
--- a/src/scripts/data_file_processing/healthsystem/consumables/processing_data_from_one_health/generate_consumables_item_codes_and_packages.py
+++ b/src/scripts/data_file_processing/healthsystem/consumables/processing_data_from_one_health/generate_consumables_item_codes_and_packages.py
@@ -19,22 +19,20 @@
 import numpy as np
 import pandas as pd
 
-# Set local Dropbox source
-path_to_dropbox = Path(  # <-- point to the TLO dropbox locally
-    # '/Users/tbh03/Dropbox (SPH Imperial College)/Thanzi la Onse Theme 1 SHARE'
-    '/Users/sm2511/Dropbox/Thanzi La Onse')
+# Set local shared folder source
+path_to_share = Path(  # <-- point to the shared folder
+    '/Users/sm2511/Library/CloudStorage/OneDrive-SharedLibraries-ImperialCollegeLondon/TLOModel - WP - Documents/'
+)
 
 resourcefilepath = Path("./resources")
 path_for_new_resourcefiles = resourcefilepath / "healthsystem/consumables"
 
 # EHP Consumables list
-path_to_files_in_the_tlo_dropbox = path_to_dropbox / "05 - Resources/Module-healthsystem/consumables raw files/"
-
-workingfile_ehp_consumables = path_to_dropbox / \
+workingfile_ehp_consumables = path_to_share / \
     "05 - Resources/Module-healthsystem/From Matthias Arnold/ORIGINAL_Intervention input.xlsx"
 
-workingfile_one_health = path_to_dropbox / \
+workingfile_one_health = path_to_share / \
     "07 - Data/OneHealth projection files/OneHealth commodities.xlsx"
 
@@ -157,7 +155,8 @@
 wb = wb.merge(intv_codes, on='Intervention_Pkg', how='left', indicator=True)
 assert (wb['_merge'] == 'both').all()
-wb = wb.drop(columns='_merge')
+wb = wb.drop(columns=['_merge', 'Expected_Units_Per_Case',
+                      'Unit_Cost'])
 
 # Assign a unique code for each individual consumable item
 unique_items = pd.unique(wb['Items'])
@@ -172,9 +171,7 @@
     'Intervention_Pkg',
     'Intervention_Pkg_Code',
     'Items',
-    'Item_Code',
-    'Expected_Units_Per_Case',
-    'Unit_Cost']]
+    'Item_Code']]
 
 assert not pd.isnull(wb).any().any()
@@ -215,7 +212,7 @@ only_in_oh['Intervention_Pkg'] = 'Misc'
 only_in_oh['Intervention_Pkg_Code'] = -99
 only_in_oh['Item_Code'] = np.arange(1000, 1000 + len(only_in_oh))
-only_in_oh['Expected_Units_Per_Case'] = 1.0
+only_in_oh = only_in_oh.drop(columns='Unit_Cost')
 
 assert set(only_in_oh.columns) == set(wb.columns)
 
@@ -249,8 +246,6 @@ def add_record(df: pd.DataFrame, record: Dict):
     assert set(df.columns) == set(record.keys())
     return pd.concat([df, pd.DataFrame.from_records([record])], ignore_index=True)
 
-
-
 cons = add_record(
     cons,
     {
@@ -258,10 +253,7 @@ def add_record(df: pd.DataFrame, record: Dict):
         'Intervention_Pkg': "Misc",
         'Intervention_Pkg_Code': -99,
         'Items': "Forceps, obstetric",
-        'Item_Code': 2669,
-        'Expected_Units_Per_Case': 1.0,
-        'Unit_Cost': 1.0
-    }
+        'Item_Code': 2669}
 )
 
 cons = add_record(
@@ -271,10 +263,7 @@
     {
         'Intervention_Pkg': "Misc",
         'Intervention_Pkg_Code': -99,
         'Items': "Vacuum, obstetric",
-        'Item_Code': 2670,
-        'Expected_Units_Per_Case': 1.0,
-        'Unit_Cost': 1.0
-    },
+        'Item_Code': 2670},
 )
 
 cons = add_record(
@@ -284,10 +273,7 @@
     {
         'Intervention_Pkg': "Misc",
         'Intervention_Pkg_Code': -99,
         'Items': "First-line ART regimen: adult",
-        'Item_Code': 2671,
-        'Expected_Units_Per_Case': 1.0,
-        'Unit_Cost': 1.0
-    },
+        'Item_Code': 2671},
 )
 
 cons = add_record(
@@ -297,10 +283,7 @@
     {
         'Intervention_Pkg': "Misc",
         'Intervention_Pkg_Code': -99,
         'Items': "First line ART regimen: older child",
-        'Item_Code': 2672,
-        'Expected_Units_Per_Case': 1.0,
-        'Unit_Cost': 1.0
-    },
+        'Item_Code': 2672},
 )
 
 cons = add_record(
@@ -310,10 +293,7 @@
     {
         'Intervention_Pkg': "Misc",
         'Intervention_Pkg_Code': -99,
         'Items': "First line ART regimen: young child",
-        'Item_Code': 2673,
-        'Expected_Units_Per_Case': 1.0,
-        'Unit_Cost': 1.0
-    },
+        'Item_Code': 2673},
 )
 
 cons = add_record(
@@ -323,10 +303,7 @@
     {
         'Intervention_Pkg': "Misc",
         'Intervention_Pkg_Code': -99,
         'Items': "Pre-exposure prophlaxis for HIV",
-        'Item_Code': 2674,
-        'Expected_Units_Per_Case': 1.0,
-        'Unit_Cost': 1.0
-    },
+        'Item_Code': 2674},
 )
 
 cons = add_record(
@@ -336,10 +313,7 @@
     {
         'Intervention_Pkg': "Isoniazid preventative therapy for HIV+ no TB",
         'Intervention_Pkg_Code': 82,
         'Items': "Isoniazid/Rifapentine",
-        'Item_Code': 2678,
-        'Expected_Units_Per_Case': 1.0,
-        'Unit_Cost': 1.0
-    },
+        'Item_Code': 2678},
 )
 
 cons = add_record(
@@ -349,9 +323,7 @@
     {
        'Intervention_Pkg': "Misc",
        'Intervention_Pkg_Code': -99,
        'Items': "Cystoscope",
-        'Item_Code': 285,
-        'Expected_Units_Per_Case': 1.0,
-        'Unit_Cost': np.nan},
+        'Item_Code': 285},
 )
 
 cons = add_record(
@@ -360,9 +332,7 @@
     {
        'Intervention_Pkg': "Misc",
        'Intervention_Pkg_Code': -99,
        'Items': "Endoscope",
-        'Item_Code': 280,
-        'Expected_Units_Per_Case': 1.0,
-        'Unit_Cost': np.nan},
+        'Item_Code': 280},
 )
 
 cons = add_record(
@@ -371,9 +341,7 @@
     {
        'Intervention_Pkg': "Misc",
        'Intervention_Pkg_Code': -99,
        'Items': "Prostate specific antigen test",
-        'Item_Code': 281,
-        'Expected_Units_Per_Case': 1.0,
-        'Unit_Cost': np.nan},
+        'Item_Code': 281},
 )
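add_record refuses any record whose keys differ from the table's columns, so once Expected_Units_Per_Case and Unit_Cost are dropped from the frame, every hand-added record above has to shed those keys too, which is what this hunk does. A self-contained sketch of that guard (the helper is copied from the script; the Intervention_Cat value is invented for illustration):

from typing import Dict

import pandas as pd


def add_record(df: pd.DataFrame, record: Dict):
    # Refuse records whose keys do not match the table's columns exactly.
    assert set(df.columns) == set(record.keys())
    return pd.concat([df, pd.DataFrame.from_records([record])], ignore_index=True)


cons = pd.DataFrame(columns=['Intervention_Cat', 'Intervention_Pkg',
                             'Intervention_Pkg_Code', 'Items', 'Item_Code'])
cons = add_record(cons, {'Intervention_Cat': 'Misc',  # hypothetical value
                         'Intervention_Pkg': 'Misc',
                         'Intervention_Pkg_Code': -99,
                         'Items': 'Forceps, obstetric',
                         'Item_Code': 2669})
# A record that still carried 'Unit_Cost' would now fail the assertion.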
diff --git a/src/tlo/methods/consumables.py b/src/tlo/methods/consumables.py
index 674035ad98..1be37c3267 100644
--- a/src/tlo/methods/consumables.py
+++ b/src/tlo/methods/consumables.py
@@ -266,7 +266,7 @@ def _lookup_availability_of_consumables(self,
 
     def on_simulation_end(self):
         """Do tasks at the end of the simulation.
-        
+
         Raise warnings and enter to log about item_codes not recognised.
         """
         if self._not_recognised_item_codes:
@@ -339,7 +339,7 @@ def check_format_of_consumables_file(df, fac_ids):
     months = set(range(1, 13))
     item_codes = set(df.item_code.unique())
 
-    assert set(df.columns) == {'Facility_ID', 'month', 'item_code', 'available_prop'}
+    assert set(df.columns) == {'Facility_ID', 'month', 'item_code', 'item_category', 'available_prop'}
 
     # Check that all permutations of Facility_ID, month and item_code are present
     pd.testing.assert_index_equal(
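With check_format_of_consumables_file now insisting on an item_category column, a regenerated resource file can be spot-checked before committing the new LFS pointer. A minimal sketch (the path is the resource file updated at the top of this diff; reading it requires the git-lfs content, not the pointer):

import pandas as pd

expected = {'Facility_ID', 'month', 'item_code', 'item_category', 'available_prop'}
df = pd.read_csv('resources/healthsystem/consumables/ResourceFile_Consumables_availability_small.csv')
missing, extra = expected - set(df.columns), set(df.columns) - expected
assert not missing and not extra, f'missing={missing}, extra={extra}'
# Mirror the m:1 merge upstream: each item_code should carry a single category.
assert (df.groupby('item_code')['item_category'].nunique(dropna=False) == 1).all()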