Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Convert excel ResourceFiles to csv - Add a method to read converted files #1425

Merged
merged 8 commits into from
Oct 8, 2024
78 changes: 76 additions & 2 deletions src/tlo/util.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
"""This file contains helpful utility functions."""
import hashlib
from collections import defaultdict
from typing import Dict, List, Optional, Set, Union
from pathlib import Path
from typing import Any, Dict, List, Optional, Set, Union

import numpy as np
import pandas as pd
from pandas import DateOffset
from pandas import DataFrame, DateOffset

from tlo import Population, Property, Types

Expand Down Expand Up @@ -436,3 +437,76 @@ def get_person_id_to_inherit_from(child_id, mother_id, population_dataframe, rng
return abs(mother_id)
elif mother_id >= 0:
return mother_id


def convert_excel_files_to_csv(folder: Path, files: Optional[list[str]] = None, *, delete_excel_files: bool = False) -> None:
    """Convert Excel workbooks into one CSV file per worksheet.

    For every workbook a sibling directory named after the workbook (its path without the ``.xlsx`` suffix) is
    used as the container, and each worksheet is written into it as ``<sheet name>.csv``.

    :param folder: Folder containing Excel files.
    :param files: List of Excel file names (relative to ``folder``) to convert to csv files. When `None`, all
        ``*.xlsx`` files in the folder and in any folder nested within it are converted.
    :param delete_excel_files: When true, each Excel file is deleted once its csv files have been written.
    """
    # Resolve the workbooks to convert; both branches yield paths that already include `folder`.
    if files is None:
        excel_file_paths = sorted(folder.rglob("*.xlsx"))
    else:
        excel_file_paths = [folder / file for file in files]

    # Nothing to do when no Excel file is given or found within the path.
    if not excel_file_paths:
        return

    for excel_file_path in excel_file_paths:
        # sheet_name=None makes pandas load every worksheet, keyed by sheet name.
        sheet_dataframes: dict[Any, DataFrame] = pd.read_excel(excel_file_path, sheet_name=None)
        excel_file_directory: Path = excel_file_path.with_suffix("")

        # Create a container directory for the per-sheet CSVs (an existing one is reused).
        if excel_file_directory.exists():
            print(f"Directory {excel_file_directory} already exists")
        else:
            excel_file_directory.mkdir()

        # Write a CSV for each worksheet.
        for sheet_name, dataframe in sheet_dataframes.items():
            dataframe.to_csv(excel_file_directory / f"{sheet_name}.csv", index=False)

        if delete_excel_files:
            # `excel_file_path` already contains `folder`; prefixing it again would point at a
            # non-existent path whenever `folder` is relative.
            excel_file_path.unlink()


def read_csv_files(folder: Path, files: Optional[list[str]] = None) -> DataFrame | dict[str, DataFrame]:
"""
A function to read CSV files in a similar way pandas reads Excel files (:py:func:`pandas.read_excel`).

NB: Converting Excel files to csv files caused all columns that had no relevant data to simulation (i.e.
parameter descriptions or data references) to be named `Unnamed1, Unnamed2, ....., UnnamedN` in the csv files.
We are therefore using :py:func:`pandas.filter` to track all unnamed columns and silently drop them using
:py:func:`pandas.drop`.

:param folder: Path to folder containing CSV files to read.
:param files: preferred csv file name(s). This is the same as sheet names in Excel file. Note that if None(no files
selected) then all files in the containing folder will be loaded

"""
all_data: dict[str, DataFrame] = {} # dataframes dictionary

def clean_dataframe(dataframes_dict: dict[str, DataFrame]) -> None:
""" silently drop all columns that have no relevant data to simulation (all columns with a name starting with
Unnamed
:param dataframes_dict: Dictionary of dataframes to clean
"""
for _key, dataframe in dataframes_dict.items():
all_data[_key] = dataframe.drop(dataframe.filter(like='Unnamed'), axis=1) # filter and drop Unnamed columns

if files is None:
for f_name in folder.rglob("*.csv"):
all_data[f_name.stem] = pd.read_csv(f_name)

else:
for f_name in files:
all_data[f_name] = pd.read_csv((folder / f_name).with_suffix(".csv"))
# clean and return the dataframe dictionary
clean_dataframe(all_data)
# If only one file loaded return dataframe directly rather than dict
return next(iter(all_data.values())) if len(all_data) == 1 else all_data

Git LFS file not shown
125 changes: 124 additions & 1 deletion tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Unit tests for utility functions."""
import os
import pickle
import shutil
import string
import types
from pathlib import Path
Expand All @@ -14,7 +15,7 @@
from tlo import Date, Simulation
from tlo.analysis.utils import parse_log_file
from tlo.methods import demography
from tlo.util import DEFAULT_MOTHER_ID
from tlo.util import DEFAULT_MOTHER_ID, convert_excel_files_to_csv, read_csv_files

path_to_files = Path(os.path.dirname(__file__))

Expand Down Expand Up @@ -317,3 +318,125 @@ def check_hash_is_valid(dfh):
# check hash differs for different dataframes
if not dataframes[i].equals(dataframes[j]):
assert df_hash != tlo.util.hash_dataframe(dataframes[j])

def test_read_csv_files_method(tmpdir):
    """ Test that the read csv files method in util can replicate what :py:func:`pandas.read_excel` was doing
    without changing expected outputs

    Tests
        1) if the method is supplied with one file name, it should return a dataframe
        2) if supplied with two or more file names, it should return a dictionary containing dataframes of the
            supplied files
        3) if supplied with no file names, it should search for all csv files in the
            path (resource file containing folder) and return a dictionary csv file names as keys and file data
            as dataframes
        4) loading an Excel file and loading its converted csv files should give dictionaries with the same keys
            (sheet names)

    """
    # work on a copy of the resource files so that the conversion below never touches the repository
    resourcefilepath = path_to_files / 'resources'
    tmpdir_resourcefilepath = Path(tmpdir / 'resources')
    shutil.copytree(resourcefilepath, tmpdir_resourcefilepath)

    # ----- Test case 1. read csv method when one file name is supplied. should return a dataframe
    df = read_csv_files(tmpdir_resourcefilepath, files=['df_at_healthcareseeking'])
    assert isinstance(df, pd.DataFrame)

    # ----- Test case 2. read csv method when multiple file names are supplied.
    #       should return dictionary.
    #       dictionary keys should match supplied file names
    #       all dictionary values should be dataframes
    file_names = ['df_at_healthcareseeking', 'df_at_init_of_lifestyle']
    df_dict = read_csv_files(tmpdir_resourcefilepath, files=file_names)
    assert isinstance(df_dict, dict)
    assert set(df_dict.keys()) == set(file_names)
    assert all(isinstance(dataframe, pd.DataFrame) for dataframe in df_dict.values())

    # ---- Test case 3 read csv method when no file names are supplied.
    #       should return dictionary.
    #       dictionary keys should match csv file names in resource folder
    #       all dictionary values should be dataframes
    file_names = [csv_file_path.stem for csv_file_path in tmpdir_resourcefilepath.rglob("*.csv")]

    df_none_files = read_csv_files(tmpdir_resourcefilepath)
    assert isinstance(df_none_files, dict)
    assert set(df_none_files.keys()) == set(file_names)
    assert all(isinstance(value, pd.DataFrame) for value in df_none_files.values())

    # ---- Test case 4 check
    excel_file_path = Path(tmpdir_resourcefilepath
                           / 'ResourceFile_test_convert_to_csv/ResourceFile_test_convert_to_csv.xlsx')
    # convert the above Excel file into csv equivalent. we will use the newly converted files to determine if
    # loading from the Excel file gives the same sheets as loading from the converted csv files
    convert_excel_files_to_csv(folder=excel_file_path.parent, files=[excel_file_path.name])

    # load every sheet of the Excel file; the context manager closes the workbook handle afterwards
    with pd.ExcelFile(excel_file_path) as xls:
        df_excel = pd.read_excel(xls, sheet_name=xls.sheet_names)

    # read newly converted csv files using read_csv_files method. `with_suffix("")` strips only the `.xlsx`
    # extension, so a dot elsewhere in the temporary path cannot truncate the directory name
    df_csv = read_csv_files(excel_file_path.with_suffix(""),
                            files=['dummy_parameter_values', 'parameter_values'])

    # dictionary keys from both dataframe dictionaries should match
    assert df_excel.keys() == df_csv.keys()


def test_convert_excel_files_method(tmpdir):
    """ Test converting Excel files to csv equivalent is done as expected

    1) Excel file name should become the name of the folder containing the newly converted csv files
    2) Excel file sheet names should become csv file names
    3) if files are given, the function should convert to csv only those given files in a folder
    4) if no files are given, all Excel files in the parent folder and subsequent folders within the parent folder
       should get converted to csv files

    """

    def check_logic_of_converting_excel_files_to_csv_files(folder: Path, files: list) -> None:
        """ check converting Excel files to csv files is done as expected
            1) check that a new directory to hold the newly created csv files has been created
            2) check that this new directory name matches the Excel file name it has been created from
            3) check csv files are created and that the csv names should match sheet names of an Excel file they
                have been created from

        :param folder: Folder the Excel files were converted in.
        :param files: Excel file names relative to ``folder``, or paths that already include ``folder``.
        """
        for excel_file_path in (folder / file for file in files):
            # read the sheet names and release the workbook handle straight away
            with pd.ExcelFile(excel_file_path) as xl:
                sheet_names = xl.sheet_names

            path_to_new_directory = excel_file_path.with_suffix("")
            # new folder should be created
            assert path_to_new_directory.exists() and path_to_new_directory.is_dir()
            # the new folder name should be the same as the Excel file name
            assert excel_file_path.stem == path_to_new_directory.name

            for sheet_name in sheet_names:
                path_to_new_file = path_to_new_directory / f'{sheet_name}.csv'
                # new csv file(s) should be created with name(s) resembling sheet name(s) in excel file
                assert path_to_new_file.exists() and path_to_new_file.is_file()
                # `stem` drops only the final `.csv`, so sheet names containing dots still compare equal
                assert sheet_name == path_to_new_file.stem

    # work on a copy of the resource files so that the conversion never touches the repository
    resourcefilepath = path_to_files / 'resources'
    tmpdir_resourcefilepath = Path(tmpdir / 'resources')
    shutil.copytree(resourcefilepath, tmpdir_resourcefilepath)

    # ----- Test case 1. supply excel file names to convert Excel files function
    excel_file = ['ResourceFile_load-parameters.xlsx']
    convert_excel_files_to_csv(tmpdir_resourcefilepath, files=excel_file)
    # check new folder containing csv file is created. The folder name and csv file name should resemble the supplied
    # Excel file name and sheet name respectively
    check_logic_of_converting_excel_files_to_csv_files(tmpdir_resourcefilepath, files=excel_file)

    # ------ Test case 2. search and get all Excel files from the tests resources folder
    excel_files = list(tmpdir_resourcefilepath.rglob("*.xlsx"))
    if not excel_files:
        # rglob yields an empty list, never None, when nothing matches; fall back to the known file
        excel_files = excel_file

    convert_excel_files_to_csv(tmpdir_resourcefilepath)
    # check behaviours are as expected. New folders containing csv files should be created with names resembling the
    # Excel file they were created from
    check_logic_of_converting_excel_files_to_csv_files(tmpdir_resourcefilepath, excel_files)