diff --git a/.gitignore b/.gitignore index 9fc74d81..5d6e49b4 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,9 @@ # setuptools-scm generated file that can be ignored _version.py +examples/data/raw +examples/data/synthetic + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/examples/data/preview.py b/examples/data/preview.py new file mode 100755 index 00000000..e5b5bcf3 --- /dev/null +++ b/examples/data/preview.py @@ -0,0 +1,11 @@ +#!/usr/bin/env python + +from metasyn.filehandler import get_file_handler +import sys + + +if __name__ == "__main__": + dataset_fp = sys.argv[1] + handler_class = get_file_handler(dataset_fp) + df, _ = handler_class.from_file(dataset_fp) + print(df.head(6)) \ No newline at end of file diff --git a/examples/data/synthesize.sh b/examples/data/synthesize.sh new file mode 100755 index 00000000..f445f85f --- /dev/null +++ b/examples/data/synthesize.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +INPUT_FILE=$1 +OUTPUT_FILE="synthetic/$(basename "$INPUT_FILE")" + +mkdir -p synthetic +./preview.py "$INPUT_FILE" +metasyn fileformat "$INPUT_FILE" /tmp/ff.json +metasyn create-meta "$INPUT_FILE" -o /tmp/gmf.json +# metasyn synthesize /tmp/gmf.json --preview +metasyn synthesize /tmp/gmf.json --filehandler /tmp/ff.json -o "$OUTPUT_FILE" +./preview.py "$OUTPUT_FILE" diff --git a/metasyn/__main__.py b/metasyn/__main__.py index 235af013..44066002 100644 --- a/metasyn/__main__.py +++ b/metasyn/__main__.py @@ -19,6 +19,7 @@ from metasyn import MetaFrame from metasyn.config import MetaConfig +from metasyn.filehandler import get_file_handler, load_file_handler from metasyn.validation import create_schema EXAMPLE_CREATE_META="metasyn create-meta your_dataset.csv -o your_gmf_file.json --config your_config.toml" # noqa: E501 @@ -77,7 +78,8 @@ def main() -> None: synthesize() elif subcommand == "schema": schema() - + elif subcommand == "fileformat": + create_file_formatter() elif subcommand == "create-meta": create_metadata() else: @@ -131,12 
+133,34 @@ def create_metadata() -> None: raise ValueError("Please supply either an input dataset or a configuration file.") meta_frame = MetaFrame.from_config(meta_config) else: - data_frame = pl.read_csv(args.input, try_parse_dates=True, infer_schema_length=10000, - null_values=["", "na", "NA", "N/A", "Na"], - ignore_errors=True) + handler_class = get_file_handler(args.input) + data_frame, _ = handler_class.from_file(args.input) + # data_frame = pl.read_csv(args.input, try_parse_dates=True, infer_schema_length=10000, + # null_values=["", "na", "NA", "N/A", "Na"], + # ignore_errors=True) meta_frame = MetaFrame.fit_dataframe(data_frame, config=meta_config) meta_frame.save(args.output) +def create_file_formatter() -> None: + parser = argparse.ArgumentParser( + prog="metasyn fileformat", + description="Create file formatter for synthetic output.") + + parser.add_argument( + "dataset", + help="input file; the dataset for which to create a file formatter.", + type=pathlib.Path, + ) + parser.add_argument( + "format_file", + help="JSON file that stores the file format and existing metadata.", + type=pathlib.Path, + ) + args = parser.parse_args() + handler_class = get_file_handler(args.dataset) + _df, handler = handler_class.from_file(args.dataset) + handler.save(args.format_file) + + def synthesize() -> None: """Program to generate synthetic data.""" @@ -179,6 +203,11 @@ def synthesize() -> None: help="preview six-row synthesized data frame in console and exit", action="store_true", ) + parser.add_argument( + "--filehandler", + help="Use file handler to set the format of the output file.", + default=None, + ) # parse the args without the subcommand args, _ = parser.parse_known_args() @@ -205,8 +234,11 @@ def synthesize() -> None: # Generate a data frame data_frame = meta_frame.synthesize(args.num_rows, seed=args.seed) - # Store the dataframe to file - if args.output.suffix == ".csv": + if args.filehandler is not None: + handler = load_file_handler(args.filehandler) + 
handler.write_synthetic(data_frame, args.output) + # Store the dataframe to file + elif args.output.suffix == ".csv": data_frame.write_csv(args.output) elif args.output.suffix == ".feather": data_frame.write_ipc(args.output) diff --git a/metasyn/filehandler.py b/metasyn/filehandler.py new file mode 100644 index 00000000..9ed04305 --- /dev/null +++ b/metasyn/filehandler.py @@ -0,0 +1,156 @@ +import warnings +from pathlib import Path + +import polars as pl +import pyreadstat +import json + +_AVAILABLE_FILE_HANDLERS = {} + + +def filehandler(*args): + """Register a file handler class so that it can be found by name.""" + + def _wrap(cls): + _AVAILABLE_FILE_HANDLERS[cls.name] = cls + return cls + + return _wrap(*args) + +class BaseFileHandler(): + name = "base" + extensions = [] + + def __init__(self, metadata): + self.metadata = metadata + + def to_dict(self): + return { + "file_handler_name": self.name, + "kwargs": self.metadata, + } + + def save(self, handler_file): + with open(handler_file, "w", encoding="utf-8") as handle: + json.dump(self.to_dict(), handle, indent=4) + +@filehandler +class SavFileHandler(BaseFileHandler): + name = "spss-sav" + extensions = [".sav", ".zsav"] + + def read_dataset(self, fp): + df, _prs_metadata = self._get_df_metadata(fp) + return df + # df, prs_metadata = pyreadstat.read_sav(fp, apply_value_formats=True) + # return pl.DataFrame(df) + + @classmethod + def _get_df_metadata(cls, fp): + pandas_df, prs_metadata = pyreadstat.read_sav(fp, apply_value_formats=True) + df = pl.DataFrame(pandas_df) + for col in df.columns: + col_format = prs_metadata.original_variable_types[col] + if (col_format.startswith("F") and col_format.endswith(".0") + and df[col].dtype == pl.Float64): + df = df.with_columns(pl.col(col).cast(pl.Int64)) + return df, prs_metadata + + @classmethod + def from_file(cls, fp): + if Path(fp).suffix not in [".sav", ".zsav"]: + warnings.warn(f"Trying to read file '{fp}' with extension different from .sav or .zsav") + if Path(fp).suffix == 
".zsav": + compress = True + else: + compress = False + df, prs_metadata = cls._get_df_metadata(fp) + # df, prs_metadata = pyreadstat.read_sav(fp, apply_value_formats=True) + + file_label = "This is a synthetic dataset created by metasyn." + if prs_metadata.file_label is not None: + file_label += f" Original file label: {prs_metadata.file_label}" + + metadata = { + "column_labels": prs_metadata.column_labels, + "variable_format": prs_metadata.original_variable_types, + "compress": compress, + "variable_display_width": prs_metadata.variable_display_width, + "file_label": file_label, + "variable_value_labels": prs_metadata.variable_value_labels, + "variable_measure": prs_metadata.variable_measure, + } + return df, cls(metadata) + + def write_synthetic(self, df, out_fp): + for col in df.columns: + col_format = self.metadata["variable_format"][col] + if (col_format.startswith("F") and not col_format.endswith(".0") + and df[col].dtype == pl.Float64): + n_round = int(col_format.split(".")[-1]) + df = df.with_columns(pl.col(col).round(n_round)) + pyreadstat.write_sav(df.to_pandas(), out_fp, **self.metadata) + +@filehandler +class CsvFileHandler(BaseFileHandler): + name = "csv" + extensions = [".csv", ".tsv"] + + def read_dataset(self, fp, **kwargs): + df = pl.read_csv( + fp, try_parse_dates=True, infer_schema_length=10000, + null_values=self.metadata["null_value"], + ignore_errors=True, + separator=self.metadata["separator"], + quote_char=self.metadata["quote_char"], + eol_char=self.metadata["line_terminator"], + **kwargs) + return df + + @classmethod + def from_file(cls, fp, separator=None, eol_char="\n", quote_char='"', null_values=None, **kwargs): + if Path(fp).suffix == ".tsv" and separator is None: + separator = "\t" + if separator is None: + separator = "," + if null_values is None: + null_values = ["", "na", "NA", "N/A", "Na"] + if isinstance(null_values, str): + null_values = [null_values] + + df = pl.read_csv( + fp, try_parse_dates=True, infer_schema_length=10000, + null_values=null_values, + ignore_errors=True, + 
separator=separator, + quote_char=quote_char, + eol_char=eol_char, + **kwargs) + metadata = { + "separator": separator, + "line_terminator": eol_char, + "quote_char": quote_char, + "null_value": null_values[0], + } + return df, cls(metadata) + + def write_synthetic(self, df, out_fp): + df.write_csv(out_fp, **self.metadata) + + +def get_file_handler(fp): + suffix = Path(fp).suffix + + for handler_name, handler in _AVAILABLE_FILE_HANDLERS.items(): + if suffix in handler.extensions: + return handler + raise ValueError(f"Cannot find handler for files with extension '{suffix}'.") + + +def load_file_handler(fp): + with open(fp, "r", encoding="utf-8") as handle: + metadict = json.load(handle) + for handler_name, handler_class in _AVAILABLE_FILE_HANDLERS.items(): + if handler_class.name == metadict["file_handler_name"]: + return handler_class(metadict["kwargs"]) + raise ValueError(f"Cannot find handler with name '{metadict['file_handler_name']}'") diff --git a/pyproject.toml b/pyproject.toml index ae12bd3a..f3544234 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -69,6 +69,9 @@ builtin = "metasyn.provider:BuiltinDistributionProvider" [project.entry-points."metasyn.privacy"] none = "metasyn.privacy:BasicPrivacy" +[project.entry-points."metasyn.filehandler"] +builtin = "metasyn.filehandler:_AVAILABLE_FILE_HANDLERS" + [tool.setuptools] packages = ["metasyn"] obsoletes = ["metasynth"]