Work in progress on tabular adapters (#142)

CategoricalData · Dec 21, 2024 · 4d42df8 · 4d42df8
1 parent f509245
commit 4d42df8
Show file tree

Hide file tree

Showing 2 changed files with 145 additions and 0 deletions.
diff --git a/hydra-ext/src/main/haskell/Hydra/Tools/Csv.hs b/hydra-ext/src/main/haskell/Hydra/Tools/Csv.hs
@@ -0,0 +1,90 @@
+-- | A collection of tools for working with comma-separated values (CSVs).
+--   Note: no error handling yet; e.g.
+
+module Hydra.Tools.Csv where
+
+import Hydra.Kernel
+import Hydra.Ext.Tabular
+import Hydra.Tools.Tabular
+
+import qualified Control.Monad as CM
+import qualified Data.List as L
+import qualified Data.Maybe as Y
+
+
+-- | Encodes a single DataRow as a line of a CSV, and vice versa.
+--   No support for optional cells; only empty strings. No attempt is made to check that rows are of a consistent width.
+dataRowCsvCoder :: Coder s1 s2 (DataRow String) String
+dataRowCsvCoder = Coder encode decode
+  where
+    encode (DataRow cells) = pure $ encodeCsvLine $ Y.catMaybes cells
+    decode line = pure $ DataRow $ fmap Just $ decodeCsvLine line
+
+decodeCsvCell :: String -> String
+decodeCsvCell ('"':xs) = decodeCsvQuotes (init xs)  -- Remove surrounding quotes
+decodeCsvCell cell = cell  -- If not quoted, return as is
+
+decodeCsvLine :: String -> [String]
+decodeCsvLine xs = fmap decodeCsvCell (splitCsvLine xs)
+
+decodeCsvQuotes :: String -> String
+decodeCsvQuotes [] = []
+decodeCsvQuotes ('"':'"':ys) = '"' : decodeCsvQuotes ys  -- Handle escaped quotes
+decodeCsvQuotes (y:ys) = y : decodeCsvQuotes ys
+
+encodeCsvCell :: String -> String
+encodeCsvCell cell
+  | any (`elem` cell) [',', '\n', '"'] = '"' : escapeCsvQuotes cell ++ "\""
+  | otherwise = cell
+
+encodeCsvLine :: [String] -> String
+encodeCsvLine = L.intercalate "," . fmap encodeCsvCell
+
+escapeCsvQuotes :: String -> String
+escapeCsvQuotes [] = []
+escapeCsvQuotes (x:xs)
+  | x == '"'  = '"' : '"' : escapeCsvQuotes xs
+  | otherwise = x : escapeCsvQuotes xs
+
+splitCsvLine :: String -> [String]
+splitCsvLine [] = []
+splitCsvLine ('"':xs) = ('"':quoted ++ "\"") : splitCsvLine next
+  where
+    (quoted, rest) = span (/= '"') xs
+    afterQuote = drop 1 rest  -- Skip closing quote
+    next = dropWhile (== ',') afterQuote  -- Skip comma
+splitCsvLine xs = cell : splitCsvLine (drop 1 rest)  -- Skip comma
+  where
+    (cell, rest) = break (== ',') xs
+
+tableCsvCoder :: Bool -> Coder s1 s2 (Table String) [String]
+tableCsvCoder hasHeader = Coder encode decode
+  where
+    encode (Table mheader rows) = do
+        hrows <- headerRows
+        drows <- CM.mapM (coderEncode dataRowCsvCoder) rows
+        return $ hrows ++ drows
+      where
+        headerRows = if hasHeader
+          then case mheader of
+            Just (HeaderRow names) -> pure [encodeCsvLine names]
+            Nothing -> fail "missing header"
+          else pure []
+    decode rows = do
+        (mheader, rest) <- if hasHeader
+          then if L.null rows
+            then fail "missing header"
+            else pure (Just $ HeaderRow $ decodeCsvLine $ head rows, tail rows)
+          else pure (Nothing, rows)
+        drows <- CM.mapM (coderDecode dataRowCsvCoder) rest
+        return $ Table mheader drows
+
+
+
+-- TODO: temporary
+tryCsvEncoding :: String -> String
+tryCsvEncoding = decodeCsvCell . encodeCsvCell
+
+-- TODO: temporary
+tryCsvLineEncoding :: String -> String
+tryCsvLineEncoding = encodeCsvLine . decodeCsvLine
diff --git a/hydra-ext/src/main/haskell/Hydra/Tools/Tabular.hs b/hydra-ext/src/main/haskell/Hydra/Tools/Tabular.hs
@@ -0,0 +1,55 @@
+-- | A loose collection of tools for working with tabular data, including CSVs.
+
+module Hydra.Tools.Tabular (
+  referenceCoder,
+  tabularAdapter,
+) where
+
+import Hydra.Kernel
+import Hydra.Ext.Tabular
+import Hydra.Lib.Io
+
+import qualified Control.Monad as CM
+import qualified Data.List as L
+import qualified Data.Map as M
+
+
+-- | Consumes a map of field names to value coders, producing a record coder.
+--   The record coder maps records with concrete values on the left, to records with references on the right, and vice versa.
+--   Fields not designated in the map are unaffected; they are simply copied as-is in either direction.
+--   For fields designated in the map, terms on the right *must* be variables.
+referenceCoder :: (M.Map Name (Coder s1 s2 Term Name)) -> Coder s1 s2 Record Record
+referenceCoder fieldCoders = Coder encode decode
+  where
+    encode (Record tname fields) = Record tname <$> CM.sequence (encodeField <$> fields)
+      where
+        encodeField field@(Field fname fterm) = case M.lookup fname fieldCoders of
+          Just coder -> Field fname <$> (TermVariable <$> coderEncode coder fterm)
+          Nothing -> return field
+    decode (Record tname fields) = Record tname <$> CM.sequence (decodeField <$> fields)
+      where
+        decodeField field@(Field fname fterm) = case M.lookup fname fieldCoders of
+          Just coder -> case fterm of
+            TermVariable v -> Field fname <$> coderDecode coder v
+            _ -> unexpected "variable" (showTerm fterm)
+          Nothing -> return field
+
+-- | Consumes a row type and a cell-level coder, producing a record coder.
+--   The record coder maps data rows on the left to records on the right, and vice versa.
+tabularAdapter :: RowType -> (Type -> Coder s1 s2 (Maybe v) Term) -> Coder s1 s2 (DataRow v) Record
+tabularAdapter (RowType typeName fieldTypes) cellAdapter = Coder encode decode
+  where
+    cellCoders = cellAdapter <$> (fieldTypeType <$> fieldTypes)
+    encode (DataRow cells) = do
+      values <- CM.zipWithM coderEncode cellCoders cells
+      let fields = L.zipWith Field (fieldTypeName <$> fieldTypes) values
+      return $ Record typeName fields
+    decode (Record tname fields) = do
+      if (tname /= typeName)
+        then unexpected ("record of type " ++ unName typeName) ("record of type " ++ unName tname)
+        else DataRow <$> CM.sequence (fieldDecoders <*> fields)
+    fieldDecoders = L.zipWith decodeField fieldTypes cellCoders
+      where
+        decodeField fieldType coder field = if (fieldName field /= fieldTypeName fieldType)
+          then unexpected ("field " ++ unName (fieldTypeName fieldType)) ("field " ++ unName (fieldName field))
+          else coderDecode coder (fieldTerm field)