Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add rudimentary file handling support. #355

Draft
wants to merge 1 commit into
base: develop
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
# setuptools-scm generated file that can be ignored
_version.py

examples/data/raw
examples/data/synthetic

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
Expand Down
11 changes: 11 additions & 0 deletions examples/data/preview.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/usr/bin/env python
"""Preview a dataset: print the first six rows of the file given on the CLI.

Usage: ./preview.py <dataset-file>
"""

# Fix: stdlib imports come before project imports (PEP 8 grouping).
import sys

from metasyn.filehandler import get_file_handler


if __name__ == "__main__":
    dataset_fp = sys.argv[1]
    # Pick the handler from the file extension, then read the dataset.
    handler_class = get_file_handler(dataset_fp)
    df, _ = handler_class.from_file(dataset_fp)
    print(df.head(6))
12 changes: 12 additions & 0 deletions examples/data/synthesize.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#!/bin/bash
# Create a synthetic version of a dataset with the same file format.
#
# Usage: ./synthesize.sh <input-file>
#
# The synthetic file is written to synthetic/<basename of input>.

# Fail fast: without this, a failed metasyn step would silently continue
# and the script would synthesize from a stale /tmp/gmf.json.
set -euo pipefail

INPUT_FILE=$1
OUTPUT_FILE="synthetic/$(basename "$INPUT_FILE")"

mkdir -p synthetic
./preview.py "$INPUT_FILE"
# Capture the input's file format, then its statistical metadata (GMF).
metasyn fileformat "$INPUT_FILE" /tmp/ff.json
metasyn create-meta "$INPUT_FILE" -o /tmp/gmf.json
metasyn synthesize /tmp/gmf.json --filehandler /tmp/ff.json -o "$OUTPUT_FILE"
./preview.py "$OUTPUT_FILE"
44 changes: 38 additions & 6 deletions metasyn/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

from metasyn import MetaFrame
from metasyn.config import MetaConfig
from metasyn.filehandler import get_file_handler, load_file_handler
from metasyn.validation import create_schema

EXAMPLE_CREATE_META="metasyn create-meta your_dataset.csv -o your_gmf_file.json --config your_config.toml" # noqa: E501
Expand Down Expand Up @@ -77,7 +78,8 @@ def main() -> None:
synthesize()
elif subcommand == "schema":
schema()

elif subcommand == "fileformat":
create_file_formatter()
elif subcommand == "create-meta":
create_metadata()
else:
Expand Down Expand Up @@ -131,12 +133,34 @@ def create_metadata() -> None:
raise ValueError("Please supply either an input dataset or a configuration file.")
meta_frame = MetaFrame.from_config(meta_config)
else:
data_frame = pl.read_csv(args.input, try_parse_dates=True, infer_schema_length=10000,
null_values=["", "na", "NA", "N/A", "Na"],
ignore_errors=True)
handler_class = get_file_handler(args.input)
data_frame, _ = handler_class.from_file(args.input)
# data_frame = pl.read_csv(args.input, try_parse_dates=True, infer_schema_length=10000,
# null_values=["", "na", "NA", "N/A", "Na"],
# ignore_errors=True)
meta_frame = MetaFrame.fit_dataframe(data_frame, config=meta_config)
meta_frame.save(args.output)

def create_file_formatter() -> None:
    """Program to create a file formatter (file handler) for synthetic output.

    Reads the input dataset with the handler matching its extension and
    saves the detected file format as JSON, for later use by
    ``metasyn synthesize --filehandler``.
    """
    parser = argparse.ArgumentParser(
        prog="metasyn fileformat",
        description="Create file formatter for synthetic output.")

    parser.add_argument(
        "dataset",
        help="input file; the dataset for which to create a file formatter.",
        type=pathlib.Path,
    )
    parser.add_argument(
        # Fix: help said "TOML file", but BaseFileHandler.save writes JSON.
        "format_file",
        help="JSON file that stores the file format and existing metadata.",
        type=pathlib.Path,
    )
    # NOTE(review): other subcommands may strip the subcommand from argv
    # before parsing; confirm parse_args() here does not swallow
    # "fileformat" as the dataset argument.
    args = parser.parse_args()
    handler_class = get_file_handler(args.dataset)
    _df, handler = handler_class.from_file(args.dataset)
    handler.save(args.format_file)


def synthesize() -> None:
"""Program to generate synthetic data."""
Expand Down Expand Up @@ -179,6 +203,11 @@ def synthesize() -> None:
help="preview six-row synthesized data frame in console and exit",
action="store_true",
)
parser.add_argument(
"--filehandler",
help="Use file handler to set the format of the output file.",
default=None,
)

# parse the args without the subcommand
args, _ = parser.parse_known_args()
Expand All @@ -205,8 +234,11 @@ def synthesize() -> None:
# Generate a data frame
data_frame = meta_frame.synthesize(args.num_rows, seed=args.seed)

# Store the dataframe to file
if args.output.suffix == ".csv":
if args.filehandler is not None:
handler = load_file_handler(args.filehandler)
handler.write_synthetic(data_frame, args.output)
# Store the dataframe to file
elif args.output.suffix == ".csv":
data_frame.write_csv(args.output)
elif args.output.suffix == ".feather":
data_frame.write_ipc(args.output)
Expand Down
156 changes: 156 additions & 0 deletions metasyn/filehandler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
import warnings
from pathlib import Path

import polars as pl
import pyreadstat
import json

_AVAILABLE_FILE_HANDLERS = {}


def filehandler(*args):
"""Register a dataset so that it can be found by name."""

def _wrap(cls):
_AVAILABLE_FILE_HANDLERS[cls.name] = cls
return cls

return _wrap(*args)

class BaseFileHandler():
    """Common base class for file handlers.

    A handler remembers the format metadata of an input file so that a
    synthetic dataset can later be written in the same format. Subclasses
    set ``name``/``extensions`` and implement the actual reading/writing.
    """

    name = "base"
    extensions = []

    def __init__(self, metadata):
        # Format-specific keyword arguments used when rewriting the file.
        self.metadata = metadata

    def to_dict(self):
        """Return a JSON-serializable description of this handler."""
        return {"file_handler_name": self.name, "kwargs": self.metadata}

    def save(self, handler_file):
        """Store this handler's description as pretty-printed JSON."""
        with open(handler_file, "w", encoding="utf-8") as handle:
            json.dump(self.to_dict(), handle, indent=4)

@filehandler
class SavFileHandler(BaseFileHandler):
    """Read and write SPSS ``.sav``/``.zsav`` files via pyreadstat."""

    name = "spss-sav"
    extensions = [".sav", ".zsav"]

    def read_dataset(self, fp):
        """Read the SPSS file at ``fp`` and return a polars DataFrame."""
        df, _prs_metadata = self._get_df_metadata(fp)
        return df

    @classmethod
    def _get_df_metadata(cls, fp):
        """Read ``fp``; return ``(polars DataFrame, pyreadstat metadata)``.

        Columns whose SPSS format is integer-like (``F…​.0``) but were read
        as Float64 are cast back to Int64.
        """
        pandas_df, prs_metadata = pyreadstat.read_sav(fp, apply_value_formats=True)
        df = pl.DataFrame(pandas_df)
        for col in df.columns:
            col_format = prs_metadata.original_variable_types[col]
            if (col_format.startswith("F") and col_format.endswith(".0")
                    and df[col].dtype == pl.Float64):
                df = df.with_columns(pl.col(col).cast(pl.Int64))
        return df, prs_metadata

    @classmethod
    def from_file(cls, fp):
        """Create a handler (and the DataFrame) from an existing SPSS file.

        Returns
        -------
        tuple
            The dataset as a polars DataFrame and the handler instance
            holding the file's format metadata.
        """
        if Path(fp).suffix not in [".sav", ".zsav"]:
            warnings.warn(f"Trying to read file '{fp}' with extension different from .sav or .zsav")
        # .zsav files are the zlib-compressed variant; remember that so the
        # synthetic output uses the same compression.
        compress = Path(fp).suffix == ".zsav"
        df, prs_metadata = cls._get_df_metadata(fp)

        file_label = "This is a synthetic dataset created by metasyn."
        if prs_metadata.file_label is not None:
            # Bug fix: previously this interpolated file_label into itself,
            # discarding the original file label from the source file.
            file_label += f" Original file label: {prs_metadata.file_label}"

        metadata = {
            "column_labels": prs_metadata.column_labels,
            "variable_format": prs_metadata.original_variable_types,
            "compress": compress,
            "variable_display_width": prs_metadata.variable_display_width,
            "file_label": file_label,
            "variable_value_labels": prs_metadata.variable_value_labels,
            "variable_measure": prs_metadata.variable_measure,
        }
        return df, cls(metadata)

    def write_synthetic(self, df, out_fp):
        """Write ``df`` as SPSS, rounding floats to their display precision."""
        for col in df.columns:
            col_format = self.metadata["variable_format"][col]
            if (col_format.startswith("F") and not col_format.endswith(".0")
                    and df[col].dtype == pl.Float64):
                # "F8.2" -> keep 2 decimals, matching the SPSS column format.
                n_round = int(col_format.split(".")[-1])
                df = df.with_columns(pl.col(col).round(n_round))
        pyreadstat.write_sav(df.to_pandas(), out_fp, **self.metadata)

@filehandler
class CsvFileHandler(BaseFileHandler):
    """Read and write CSV/TSV files, remembering the CSV dialect."""

    name = "csv"
    extensions = [".csv", ".tsv"]

    def read_dataset(self, fp, **kwargs):
        """Read ``fp`` with the CSV dialect stored on this handler.

        Bug fix: the dialect lives in ``self.metadata`` (set in
        ``__init__``), not as individual attributes — the old code
        referenced ``self.null_values`` etc. and raised AttributeError.
        """
        df = pl.read_csv(
            fp, try_parse_dates=True, infer_schema_length=10000,
            null_values=self.metadata["null_value"],
            ignore_errors=True,
            separator=self.metadata["separator"],
            quote_char=self.metadata["quote_char"],
            eol_char=self.metadata["line_terminator"],
            **kwargs)
        return df

    @classmethod
    def from_file(cls, fp, separator=None, eol_char="\n", quote_char='"',
                  null_values=None, **kwargs):
        """Read ``fp`` and build a handler that captures its CSV dialect.

        The separator defaults to a tab for ``.tsv`` files and a comma
        otherwise; ``null_values`` defaults to common NA spellings.
        """
        if Path(fp).suffix == ".tsv" and separator is None:
            separator = "\t"
        if separator is None:
            separator = ","
        if null_values is None:
            null_values = ["", "na", "NA", "N/A", "Na"]
        if isinstance(null_values, str):
            null_values = [null_values]

        df = pl.read_csv(
            fp, try_parse_dates=True, infer_schema_length=10000,
            null_values=null_values,
            ignore_errors=True,
            separator=separator,
            quote_char=quote_char,
            eol_char=eol_char,
            **kwargs)
        # Keys mirror polars' write_csv keyword arguments; only the first
        # null marker can be round-tripped on write.
        metadata = {
            "separator": separator,
            "line_terminator": eol_char,
            "quote_char": quote_char,
            "null_value": null_values[0],
        }
        return df, cls(metadata)

    def write_synthetic(self, df, out_fp):
        """Write ``df`` to ``out_fp`` using the stored CSV dialect."""
        df.write_csv(out_fp, **self.metadata)


def get_file_handler(fp):
    """Return the handler class registered for ``fp``'s file extension.

    Raises
    ------
    ValueError
        If no registered handler supports the extension.
    """
    suffix = Path(fp).suffix

    # Iterate values only: the registry key (the handler name) is unused here.
    for handler in _AVAILABLE_FILE_HANDLERS.values():
        if suffix in handler.extensions:
            return handler
    raise ValueError(f"Cannot find handler for files with extension '{suffix}'.")


def load_file_handler(fp):
    """Instantiate a file handler from a JSON description written by ``save``.

    Raises
    ------
    ValueError
        If the stored handler name matches no registered handler.
    """
    with open(fp, "r", encoding="utf-8") as handle:
        metadict = json.load(handle)
    for handler_class in _AVAILABLE_FILE_HANDLERS.values():
        if handler_class.name == metadict["file_handler_name"]:
            return handler_class(metadict["kwargs"])
    # Bug fix: the old message indexed metadict['filehandler'], a key that
    # never exists, so the intended ValueError was masked by a KeyError.
    raise ValueError(f"Cannot find handler with name '{metadict['file_handler_name']}'")
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,9 @@ builtin = "metasyn.provider:BuiltinDistributionProvider"
[project.entry-points."metasyn.privacy"]
none = "metasyn.privacy:BasicPrivacy"

[project.entry-points."metasyn.filehandler"]
builtin = "metasyn.filehandler:_AVAILABLE_FILE_HANDLERS"

[tool.setuptools]
packages = ["metasyn"]
obsoletes = ["metasynth"]
Expand Down
Loading