OpenPecha · kaldan007 · Dec 27, 2024 · Dec 20, 2024 · Dec 20, 2024 · Dec 24, 2024
diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
@@ -27,3 +27,9 @@ jobs:
         pip install -U pip
         pip install .
         pip install .[dev]
+
+    - name: Test with pytest
+      run: PYTHONPATH=src pytest
+
+    - name: Test Coverage
+      run: PYTHONPATH=src pytest --cov stt_data_with_llm
diff --git a/.gitignore b/.gitignore
@@ -128,6 +128,6 @@ dmypy.json
 # Pyre type checker
 .pyre/
 #Data
-data
+.data
 #DS_STORE
 .DS_STORE
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -24,6 +24,13 @@ repos:
     hooks:
       - id: isort
 
+  - repo: https://github.com/PyCQA/flake8
+    rev: 5.0.4
+    hooks:
+      - id: flake8
+        args: ["--config=setup.cfg"]
+        additional_dependencies: [flake8-isort]
+
 
 # sets up .pre-commit-ci.yaml to ensure pre-commit dependencies stay up to date
 ci:

diff --git a/pyproject.toml b/pyproject.toml
@@ -3,7 +3,7 @@ requires = ["setuptools>=61.0"]
 build-backend = "setuptools.build_meta"
 
 [project]
-name = "Stt_data_with_llm"
+name = "stt_data_with_llm"
 version = "0.0.1"
 authors = [
   { name="OpenPecha", email="[email protected]" },
@@ -16,14 +16,15 @@ classifiers = [
     "License :: OSI Approved :: MIT License",
     "Operating System :: OS Independent",
 ]
-
 dependencies = [
-    "git+https://github.com/OpenPecha/fast-antx.git",
+  "fast-antx @ git+https://github.com/OpenPecha/fast-antx.git",
+  "pandas<2.2.3"
 ]
 
-
 [project.optional-dependencies]
 dev = [
+    "pytest",
+    "pytest-cov",
     "pre-commit",
 ]
 

diff --git a/src/stt_data_with_llm/audio_parser.py b/src/stt_data_with_llm/audio_parser.py
@@ -1,5 +1,6 @@
 def get_audio(audio_url):
     pass
 
-def get_split_audio(audio_data, AUDIO_SEG_LOWER_LIMIT, AUDIO_SEG_UPPER_LIMIT, full_audio_id):
-    pass
+
+def get_split_audio(audio_data, lower_limit, upper_limit, full_audio_id):
+    pass  # stub: not implemented yet, so calling it returns None
diff --git a/src/stt_data_with_llm/catalog_parser.py b/src/stt_data_with_llm/catalog_parser.py
@@ -1,2 +1,91 @@
-def parse_catalog(catalog):
-    pass
+import logging
+
+import pandas as pd
+
+from stt_data_with_llm.util import setup_logging
+
+# Runs at import time: configures logging to write to catalog_parse.log
+setup_logging("catalog_parse.log")
+
+
+def read_spreadsheet(sheet_id):
+    """
+    Download a Google Spreadsheet through its CSV export URL into a DataFrame.
+
+    Args:
+        sheet_id (str): The ID of the Google Spreadsheet.
+
+    Returns:
+        pd.DataFrame: The sheet contents, with the first CSV row taken as the
+            column headers, or an empty DataFrame if anything goes wrong.
+    """
+    csv_export_url = f"https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv"  # noqa
+    try:
+        # header=0 makes the first row of the export the column names
+        sheet_df = pd.read_csv(csv_export_url, header=0, encoding="utf-8")
+
+        # Record the outcome and the column names that came back
+        logging.info("Spreadsheet successfully read.")
+        column_names = sheet_df.columns.tolist()
+        logging.info(f"Headers: {column_names}")
+    except Exception as e:
+        logging.error(f"Error reading spreadsheet: {e}")
+        return pd.DataFrame()
+    return sheet_df
+
+
+def catalog_parser(google_sheet_id):
+    """
+    Parses an audio transcription catalog from a Google Spreadsheet.
+
+    Args:
+        google_sheet_id (str): The ID of the Google Spreadsheet containing the
+            audio transcription catalog.
+
+    Returns:
+        dict: A dictionary keyed by the stringified DataFrame row index. Each
+            value is a dictionary of audio data for that row; missing or NaN
+            cells are stored as "". Returns {} if the catalog is empty or
+            could not be read.
+    """
+    # Output field name -> spreadsheet column header, in output order.
+    field_to_column = {
+        "full_audio_id": "ID",
+        "sr_no": "Sr.no",
+        "audio_url": "Audio URL",
+        "reference_transcript": "Audio Text",
+        "speaker_name": "Speaker Name",
+        "speaker_gender": "Speaker Gender",
+        "news_channel": "News Channel",
+        "publishing_year": "Publishing Year",
+    }
+
+    catalog_df = read_spreadsheet(google_sheet_id)
+
+    # Check if the catalog DataFrame is empty
+    if catalog_df.empty:
+        logging.warning("Catalog DataFrame is empty.")
+        return {}
+
+    audio_transcription_datas = {}
+
+    for index, row in catalog_df.iterrows():
+        try:
+            entry = {}
+            for field, column in field_to_column.items():
+                value = row.get(column, "")
+                # pandas reads empty cells as NaN; normalise them to "".
+                entry[field] = "" if pd.isna(value) else value
+
+            # Test the normalised value: a raw NaN is truthy, so checking the
+            # cell directly would never warn about an empty ID cell.
+            if not entry["full_audio_id"]:
+                logging.warning(f"Row missing 'ID': {row.to_dict()}")
+
+            audio_transcription_datas[str(index)] = entry
+
+        except Exception as e:
+            logging.error(f"Error processing row: {row.to_dict()}. Error: {e}")
+
+    logging.info(f"Parsed {len(audio_transcription_datas)} entries from the catalog.")
+    return audio_transcription_datas
diff --git a/src/stt_data_with_llm/config.py b/src/stt_data_with_llm/config.py
@@ -1,2 +1,38 @@
+# Logging: size limit and backup count passed to RotatingFileHandler in util.py
+MAX_BYTES = 1024 * 1024
+BACKUP_COUNT = 5
+
+
+# Audio Segmentation
 AUDIO_SEG_UPPER_LIMIT = 8
-AUDIO_SEG_LOWER_LIMIT = 2
+AUDIO_SEG_LOWER_LIMIT = 2
+
+
+HYPER_PARAMETERS = {  # NOTE(review): looks like speech-detection tuning; confirm the consumer
+    # onset/offset activation thresholds
+    "onset": 0.5,
+    "offset": 0.5,
+    # drop speech regions shorter than this many seconds
+    "min_duration_on": 2.0,
+    # fill non-speech gaps shorter than this many seconds
+    "min_duration_off": 0.0,
+}
+
+# Browser-like request headers, as captured. NOTE(review): cookie may be stale.
+HEADERS = {
+    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",  # noqa: E501
+    "accept-encoding": "gzip, deflate, br, zstd",
+    "accept-language": "en-US,en;q=0.9,en-IN;q=0.8",
+    "cache-control": "max-age=0",
+    "cookie": "AMCVS_518ABC7455E462B97F000101%40AdobeOrg=1; s_cc=true; utag_main=v_id:019169eade56002296e6ea4a443c0507d001b075008f7$_sn:11$_se:5$_ss:0$_st:1727788387180$vapi_domain:rfa.org$ses_id:1727793787%3Bexp-session$_pn:5%3Bexp-session; AMCV_518ABC7455E462B97F000101%40AdobeOrg=1176715910%7CMCIDTS%7C19997%7CMCMID%7C92058839809215745258174654077801968713%7CMCAID%7CNONE%7CMCOPTOUT-1727793787s%7CNONE%7CvVersion%7C5.4.0; s_sq=%5B%5BB%5D%5D",  # noqa: E501
+    "priority": "u=0, i",
+    "sec-ch-ua": '"Microsoft Edge";v="129", "Not=A?Brand";v="8", "Chromium";v="129"',
+    "sec-ch-ua-mobile": "?0",
+    "sec-ch-ua-platform": '"Windows"',
+    "sec-fetch-dest": "document",
+    "sec-fetch-mode": "navigate",
+    "sec-fetch-site": "none",
+    "sec-fetch-user": "?1",
+    "upgrade-insecure-requests": "1",
+    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36 Edg/129.0.0.0",  # noqa: E501
+}
diff --git a/src/stt_data_with_llm/util.py b/src/stt_data_with_llm/util.py
@@ -0,0 +1,27 @@
+import logging
+from logging.handlers import RotatingFileHandler
+
+from stt_data_with_llm.config import BACKUP_COUNT, MAX_BYTES
+
+
+def setup_logging(filename):
+    """
+    Attach a rotating UTF-8 file handler and a console handler to the root logger.
+
+    Args:
+        filename (str): Path of the log file to write and rotate.
+    """
+    logger = logging.getLogger()
+    logger.setLevel(logging.INFO)
+    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
+
+    # Keyword arguments are required here: the second positional parameter of
+    # RotatingFileHandler is ``mode``, so passing the limits positionally
+    # would bind MAX_BYTES to ``mode`` and BACKUP_COUNT to ``maxBytes``.
+    file_handler = RotatingFileHandler(
+        filename, maxBytes=MAX_BYTES, backupCount=BACKUP_COUNT, encoding="utf-8"
+    )
+    console_handler = logging.StreamHandler()
+    for handler in (file_handler, console_handler):
+        handler.setFormatter(formatter)
+        logger.addHandler(handler)
diff --git a/tests/__init__.py b/tests/__init__.py
diff --git a/tests/data/expected_catalog_data.json b/tests/data/expected_catalog_data.json
@@ -0,0 +1,32 @@
+{
+    "0":{
+            "full_audio_id": "STT_NW0802",
+            "sr_no": 1,
+            "audio_url": "https://www.rfa.org/tibetan/sargyur/golok-china-religious-restriction-08202024054225.html/@@stream",
+            "reference_transcript": "A",
+            "speaker_name": "བདེ་སྐྱིད་ཀུན་སྒྲོལ།",
+            "speaker_gender": "",
+            "news_channel": "RFA",
+            "publishing_year": "2024.08.20"
+    },
+    "1":{
+        "full_audio_id": "STT_NW0805",
+        "sr_no": 2,
+        "audio_url": "https://www.rfa.org/tibetan/sargyur/vpn-china-restriction-08152024081404.html/@@stream",
+        "reference_transcript": "B",
+        "speaker_name": "བདེ་སྐྱིད་ཀུན་སྒྲོལ།",
+        "speaker_gender": "",
+        "news_channel": "RFA",
+        "publishing_year": "2024.08.15"
+    },
+    "2":{
+    "full_audio_id": "",
+    "sr_no": 3,
+    "audio_url": "",
+    "reference_transcript": "D",
+    "speaker_name": "",
+    "speaker_gender": "",
+    "news_channel": "RFA",
+    "publishing_year": "2021.01.28"
+}
+}
diff --git a/tests/test_catalog_parser.py b/tests/test_catalog_parser.py
@@ -0,0 +1,26 @@
+import json
+
+from stt_data_with_llm.catalog_parser import catalog_parser
+
+
+def test_catalog_parser():
+    """
+    Check that the live catalog spreadsheet parses to the expected JSON fixture.
+
+    NOTE: this test downloads the spreadsheet, so it needs network access.
+    """
+    from pathlib import Path
+
+    google_spread_sheet_id = "14pCi8pxD_Ms3i3RAcBWNrCT9MocnRKD49jTLxDHzDe0"
+    audio_transcription_datas = catalog_parser(google_spread_sheet_id)
+
+    # Resolve the fixture relative to this file, not the current directory.
+    fixture_path = Path(__file__).parent / "data" / "expected_catalog_data.json"
+    with open(fixture_path, encoding="utf-8") as file:
+        expected_output_json = json.load(file)
+
+    assert audio_transcription_datas == expected_output_json
+
+
+if __name__ == "__main__":
+    test_catalog_parser()