OpenPecha · kaldan007 · Dec 27, 2024 · Dec 20, 2024 · Dec 20, 2024 · Dec 24, 2024
diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
@@ -27,3 +27,9 @@ jobs:
         pip install -U pip
         pip install .
         pip install .[dev]
+
+    - name: Test with pytest
+      run: PYTHONPATH=src pytest
+
+    - name: Test Coverage
+      run: PYTHONPATH=src pytest --cov stt_data_with_llm
diff --git a/.gitignore b/.gitignore
@@ -128,6 +128,6 @@ dmypy.json
 # Pyre type checker
 .pyre/
 #Data
-data
+.data
 #DS_STORE
 .DS_STORE
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -24,6 +24,13 @@ repos:
     hooks:
       - id: isort
 
+  - repo: https://github.com/PyCQA/flake8
+    rev: 5.0.4
+    hooks:
+      - id: flake8
+        args: ["--config=setup.cfg"]
+        additional_dependencies: [flake8-isort]
+
 
 # sets up .pre-commit-ci.yaml to ensure pre-commit dependencies stay up to date
 ci:

diff --git a/data/split_audio/inference.csv b/data/split_audio/inference.csv
diff --git a/pyproject.toml b/pyproject.toml
@@ -3,7 +3,7 @@ requires = ["setuptools>=61.0"]
 build-backend = "setuptools.build_meta"
 
 [project]
-name = "Stt_data_with_llm"
+name = "stt_data_with_llm"
 version = "0.0.1"
 authors = [
   { name="OpenPecha", email="[email protected]" },
@@ -16,14 +16,14 @@ classifiers = [
     "License :: OSI Approved :: MIT License",
     "Operating System :: OS Independent",
 ]
-
 dependencies = [
-    "git+https://github.com/OpenPecha/fast-antx.git",
+  "pandas<=2.2.3"
 ]
 
-
 [project.optional-dependencies]
 dev = [
+    "pytest",
+    "pytest-cov",
     "pre-commit",
 ]
 

diff --git a/src/stt_data_with_llm/audio_parser.py b/src/stt_data_with_llm/audio_parser.py
@@ -1,5 +1,8 @@
 def get_audio(audio_url):
     pass
 
-def get_split_audio(audio_data, AUDIO_SEG_LOWER_LIMIT, AUDIO_SEG_UPPER_LIMIT, full_audio_id):
-    pass
+
+def get_split_audio(
+    audio_data, AUDIO_SEG_LOWER_LIMIT, AUDIO_SEG_UPPER_LIMIT, full_audio_id
+):
+    pass
diff --git a/src/stt_data_with_llm/catalog_parser.py b/src/stt_data_with_llm/catalog_parser.py
@@ -1,2 +1,73 @@
-def parse_catalog(catalog):
-    pass
+import logging
+
+import pandas as pd
+
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+
+
+def read_spreadsheet(sheet_id):
+    """
+    Reads a Google Spreadsheet's CSV export into a Pandas DataFrame.
+
+    Args:
+        sheet_id (str): The ID of the Google Spreadsheet.
+
+    Returns:
+        pd.DataFrame: The sheet's data, with the first CSV row used as the column headers,
+            or an empty DataFrame if the spreadsheet could not be fetched or parsed.
+    """
+    url = (
+        f"https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv"  # noqa
+    )
+    try:
+        df = pd.read_csv(url, header=0)
+    except Exception as e:
+        # Best-effort read: report the failure and hand back an empty DataFrame.
+        logging.error(f"Error reading spreadsheet: {e}")
+        return pd.DataFrame()
+
+    # Summarise through logging only (no print), so output follows the logging config.
+    logging.info("Spreadsheet successfully read.")
+    logging.info(f"Headers: {df.columns.tolist()}")
+    logging.info(f"First few rows:\n{df.head()}")  # noqa
+    return df
+
+
+def catalog_parser(audio_url):
+    """
+    Parses an audio transcription catalog from a Google Spreadsheet.
+
+    Args:
+        audio_url (str): The Google Spreadsheet ID of the catalog (passed to read_spreadsheet as its sheet_id).
+
+    Returns:
+        dict: Maps each non-blank "ID" to a dictionary of that row's audio data; empty if no rows could be read.
+    """
+    catalog_df = read_spreadsheet(audio_url)
+
+    if catalog_df.empty:
+        logging.warning("Catalog DataFrame is empty.")
+        return {}
+
+    audio_transcription_datas = {}
+    for _, row in catalog_df.iterrows():
+        try:
+            full_audio_id = row.get("ID", "")
+            # Blank cells are read as NaN, which is truthy, so test for it explicitly.
+            if pd.isna(full_audio_id) or not full_audio_id:
+                logging.warning(f"Row missing 'ID': {row}")
+                continue
+
+            audio_transcription_datas[full_audio_id] = {
+                "full_audio_id": full_audio_id,
+                "sr_no": row.get("Sr. no", ""),
+                "audio_url": row.get("Audio LInk", ""),
+                "reference_transcript": row.get("Audio text link", ""),
+                "speaker_id": row.get("Speaker ID", ""),
+            }
+        except Exception as e:
+            logging.error(f"Error processing row: {row}. Error: {e}")
+
+    logging.info(f"Parsed {len(audio_transcription_datas)} entries from the catalog.")
+    return audio_transcription_datas
diff --git a/tests/__init__.py b/tests/__init__.py