Merge pull request #11 from IbLahlou/ild02

Model training
IbLahlou · Sep 7, 2023 · 1e1e9e7 · 1e1e9e7
2 parents f2cddd3 + 077284f
commit 1e1e9e7
Show file tree

Hide file tree

Showing 11 changed files with 334 additions and 40 deletions.
diff --git a/logs/running_logs.log b/logs/running_logs.log
@@ -546,3 +546,173 @@ x==========x]
 [2023-09-07 23:53:06,371: INFO: symspell: length of longest word in corpus: 18]
 [2023-09-07 23:53:06,371: DEBUG: model: Phoneme training started ...]
 [2023-09-07 23:53:06,953: DEBUG: model: Spello training completed successfully ...]
+[2023-09-08 00:19:06,806: INFO: main: >>>>>> stage Data Ingestion stage started <<<<<<]
+[2023-09-08 00:19:06,806: INFO: common: yaml file: config\config.yaml loaded successfully]
+[2023-09-08 00:19:06,822: INFO: common: yaml file: params.yaml loaded successfully]
+[2023-09-08 00:19:06,822: INFO: common: created directory at: artifacts]
+[2023-09-08 00:19:06,822: INFO: common: created directory at: artifacts/data_ingestion]
+[2023-09-08 00:19:06,822: INFO: data_ingestion: File already exists of size: ~ 2473 KB]
+[2023-09-08 00:19:07,069: INFO: main: >>>>>> stage Data Ingestion stage completed <<<<<<
+
+x==========x]
+[2023-09-08 00:19:07,069: INFO: main: >>>>>> stage Data Validation stage started <<<<<<]
+[2023-09-08 00:19:07,085: INFO: common: yaml file: config\config.yaml loaded successfully]
+[2023-09-08 00:19:07,088: INFO: common: yaml file: params.yaml loaded successfully]
+[2023-09-08 00:19:07,088: INFO: common: created directory at: artifacts]
+[2023-09-08 00:19:07,093: INFO: common: created directory at: artifacts/data_validation]
+[2023-09-08 00:19:07,093: INFO: main: >>>>>> stage Data Validation stage completed <<<<<<
+
+x==========x]
+[2023-09-08 00:19:07,101: INFO: main: >>>>>> stage Model Training stage started <<<<<<]
+[2023-09-08 00:19:07,109: INFO: common: yaml file: config\config.yaml loaded successfully]
+[2023-09-08 00:19:07,116: INFO: common: yaml file: params.yaml loaded successfully]
+[2023-09-08 00:19:07,116: INFO: common: created directory at: artifacts]
+[2023-09-08 00:19:07,116: ERROR: main: 'ConfigurationManager' object has no attribute 'get_model_trainer_config']
+Traceback (most recent call last):
+  File ".\main.py", line 31, in <module>
+    model_training.main()
+  File "c:\users\p52s\documents\python project\spellx\src\spellX\pipeline\stage_03_model_training.py", line 12, in main
+    model_trainer_config = config.get_model_trainer_config()
+AttributeError: 'ConfigurationManager' object has no attribute 'get_model_trainer_config'
+[2023-09-08 00:21:18,631: INFO: main: >>>>>> stage Data Ingestion stage started <<<<<<]
+[2023-09-08 00:21:18,637: INFO: common: yaml file: config\config.yaml loaded successfully]
+[2023-09-08 00:21:18,646: INFO: common: yaml file: params.yaml loaded successfully]
+[2023-09-08 00:21:18,648: INFO: common: created directory at: artifacts]
+[2023-09-08 00:21:18,648: INFO: common: created directory at: artifacts/data_ingestion]
+[2023-09-08 00:21:18,648: INFO: data_ingestion: File already exists of size: ~ 2473 KB]
+[2023-09-08 00:21:18,848: INFO: main: >>>>>> stage Data Ingestion stage completed <<<<<<
+
+x==========x]
+[2023-09-08 00:21:18,848: INFO: main: >>>>>> stage Data Validation stage started <<<<<<]
+[2023-09-08 00:21:18,857: INFO: common: yaml file: config\config.yaml loaded successfully]
+[2023-09-08 00:21:18,857: INFO: common: yaml file: params.yaml loaded successfully]
+[2023-09-08 00:21:18,857: INFO: common: created directory at: artifacts]
+[2023-09-08 00:21:18,857: INFO: common: created directory at: artifacts/data_validation]
+[2023-09-08 00:21:18,857: INFO: main: >>>>>> stage Data Validation stage completed <<<<<<
+
+x==========x]
+[2023-09-08 00:21:18,857: INFO: main: >>>>>> stage Model Training stage started <<<<<<]
+[2023-09-08 00:21:18,873: INFO: common: yaml file: config\config.yaml loaded successfully]
+[2023-09-08 00:21:18,873: INFO: common: yaml file: params.yaml loaded successfully]
+[2023-09-08 00:21:18,873: INFO: common: created directory at: artifacts]
+[2023-09-08 00:21:18,873: INFO: common: created directory at: artifacts/model_trainer]
+[2023-09-08 00:21:30,592: ERROR: main: [Errno 2] No such file or directory: 'artifacts/data_ingestion/data/train\\w']
+Traceback (most recent call last):
+  File ".\main.py", line 31, in <module>
+    model_training.main()
+  File "c:\users\p52s\documents\python project\spellx\src\spellX\pipeline\stage_03_model_training.py", line 14, in main
+    model_trainer_config.train()
+  File "c:\users\p52s\documents\python project\spellx\src\spellX\components\model_trainer.py", line 52, in train
+    raise e
+  File "c:\users\p52s\documents\python project\spellx\src\spellX\components\model_trainer.py", line 46, in train
+    data = read_text(self.config.data_path, self.config.data_file)
+  File "c:\users\p52s\documents\python project\spellx\src\spellX\utils\trainer.py", line 110, in read_text
+    strings = unidecode.unidecode(open(file_path).read())
+FileNotFoundError: [Errno 2] No such file or directory: 'artifacts/data_ingestion/data/train\\w'
+[2023-09-08 00:22:27,677: INFO: main: >>>>>> stage Data Ingestion stage started <<<<<<]
+[2023-09-08 00:22:27,707: INFO: common: yaml file: config\config.yaml loaded successfully]
+[2023-09-08 00:22:27,718: INFO: common: yaml file: params.yaml loaded successfully]
+[2023-09-08 00:22:27,725: INFO: common: created directory at: artifacts]
+[2023-09-08 00:22:27,725: INFO: common: created directory at: artifacts/data_ingestion]
+[2023-09-08 00:22:27,725: INFO: data_ingestion: File already exists of size: ~ 2473 KB]
+[2023-09-08 00:22:27,962: INFO: main: >>>>>> stage Data Ingestion stage completed <<<<<<
+
+x==========x]
+[2023-09-08 00:22:27,962: INFO: main: >>>>>> stage Data Validation stage started <<<<<<]
+[2023-09-08 00:22:27,962: INFO: common: yaml file: config\config.yaml loaded successfully]
+[2023-09-08 00:22:27,979: INFO: common: yaml file: params.yaml loaded successfully]
+[2023-09-08 00:22:27,979: INFO: common: created directory at: artifacts]
+[2023-09-08 00:22:27,979: INFO: common: created directory at: artifacts/data_validation]
+[2023-09-08 00:22:27,985: INFO: main: >>>>>> stage Data Validation stage completed <<<<<<
+
+x==========x]
+[2023-09-08 00:22:27,985: INFO: main: >>>>>> stage Model Training stage started <<<<<<]
+[2023-09-08 00:22:27,993: INFO: common: yaml file: config\config.yaml loaded successfully]
+[2023-09-08 00:22:28,001: INFO: common: yaml file: params.yaml loaded successfully]
+[2023-09-08 00:22:28,008: INFO: common: created directory at: artifacts]
+[2023-09-08 00:22:28,008: INFO: common: created directory at: artifacts/model_trainer]
+[2023-09-08 00:22:38,007: ERROR: main: [Errno 2] No such file or directory: 'artifacts/data_ingestion/data/train/w']
+Traceback (most recent call last):
+  File ".\main.py", line 31, in <module>
+    model_training.main()
+  File "c:\users\p52s\documents\python project\spellx\src\spellX\pipeline\stage_03_model_training.py", line 14, in main
+    model_trainer_config.train()
+  File "c:\users\p52s\documents\python project\spellx\src\spellX\components\model_trainer.py", line 52, in train
+    raise e
+  File "c:\users\p52s\documents\python project\spellx\src\spellX\components\model_trainer.py", line 46, in train
+    data = read_text(self.config.data_path, self.config.data_file)
+  File "c:\users\p52s\documents\python project\spellx\src\spellX\utils\trainer.py", line 110, in read_text
+    strings = unidecode.unidecode(open(file_path).read())
+FileNotFoundError: [Errno 2] No such file or directory: 'artifacts/data_ingestion/data/train/w'
+[2023-09-08 00:33:54,025: INFO: main: >>>>>> stage Data Ingestion stage started <<<<<<]
+[2023-09-08 00:33:54,065: INFO: common: yaml file: config\config.yaml loaded successfully]
+[2023-09-08 00:33:54,068: INFO: common: yaml file: params.yaml loaded successfully]
+[2023-09-08 00:33:54,082: INFO: common: created directory at: artifacts]
+[2023-09-08 00:33:54,085: INFO: common: created directory at: artifacts/data_ingestion]
+[2023-09-08 00:33:54,085: INFO: data_ingestion: File already exists of size: ~ 2473 KB]
+[2023-09-08 00:33:54,342: INFO: main: >>>>>> stage Data Ingestion stage completed <<<<<<
+
+x==========x]
+[2023-09-08 00:33:54,358: INFO: main: >>>>>> stage Data Validation stage started <<<<<<]
+[2023-09-08 00:33:54,358: INFO: common: yaml file: config\config.yaml loaded successfully]
+[2023-09-08 00:33:54,374: INFO: common: yaml file: params.yaml loaded successfully]
+[2023-09-08 00:33:54,382: INFO: common: created directory at: artifacts]
+[2023-09-08 00:33:54,382: INFO: common: created directory at: artifacts/data_validation]
+[2023-09-08 00:33:54,389: INFO: main: >>>>>> stage Data Validation stage completed <<<<<<
+
+x==========x]
+[2023-09-08 00:33:54,397: INFO: main: >>>>>> stage Model Training stage started <<<<<<]
+[2023-09-08 00:33:54,406: INFO: common: yaml file: config\config.yaml loaded successfully]
+[2023-09-08 00:33:54,413: INFO: common: yaml file: params.yaml loaded successfully]
+[2023-09-08 00:33:54,413: INFO: common: created directory at: artifacts]
+[2023-09-08 00:33:54,426: INFO: common: created directory at: artifacts/model_trainer]
+[2023-09-08 00:34:06,693: ERROR: main: 'ModelTrainer' object has no attribute 'data_file']
+Traceback (most recent call last):
+  File ".\main.py", line 31, in <module>
+    model_training.main()
+  File "c:\users\p52s\documents\python project\spellx\src\spellX\pipeline\stage_03_model_training.py", line 14, in main
+    model_trainer_config.train()
+  File "c:\users\p52s\documents\python project\spellx\src\spellX\components\model_trainer.py", line 61, in train
+    raise e
+  File "c:\users\p52s\documents\python project\spellx\src\spellX\components\model_trainer.py", line 53, in train
+    path = self.get_data_filename()
+  File "c:\users\p52s\documents\python project\spellx\src\spellX\components\model_trainer.py", line 20, in get_data_filename
+    return f"{self.config.data_path}/{self.data_file}"
+AttributeError: 'ModelTrainer' object has no attribute 'data_file'
+[2023-09-08 00:34:50,605: INFO: main: >>>>>> stage Data Ingestion stage started <<<<<<]
+[2023-09-08 00:34:50,618: INFO: common: yaml file: config\config.yaml loaded successfully]
+[2023-09-08 00:34:50,627: INFO: common: yaml file: params.yaml loaded successfully]
+[2023-09-08 00:34:50,627: INFO: common: created directory at: artifacts]
+[2023-09-08 00:34:50,627: INFO: common: created directory at: artifacts/data_ingestion]
+[2023-09-08 00:34:50,627: INFO: data_ingestion: File already exists of size: ~ 2473 KB]
+[2023-09-08 00:34:50,831: INFO: main: >>>>>> stage Data Ingestion stage completed <<<<<<
+
+x==========x]
+[2023-09-08 00:34:50,846: INFO: main: >>>>>> stage Data Validation stage started <<<<<<]
+[2023-09-08 00:34:50,846: INFO: common: yaml file: config\config.yaml loaded successfully]
+[2023-09-08 00:34:50,858: INFO: common: yaml file: params.yaml loaded successfully]
+[2023-09-08 00:34:50,864: INFO: common: created directory at: artifacts]
+[2023-09-08 00:34:50,864: INFO: common: created directory at: artifacts/data_validation]
+[2023-09-08 00:34:50,867: INFO: main: >>>>>> stage Data Validation stage completed <<<<<<
+
+x==========x]
+[2023-09-08 00:34:50,867: INFO: main: >>>>>> stage Model Training stage started <<<<<<]
+[2023-09-08 00:34:50,883: INFO: common: yaml file: config\config.yaml loaded successfully]
+[2023-09-08 00:34:50,889: INFO: common: yaml file: params.yaml loaded successfully]
+[2023-09-08 00:34:50,898: INFO: common: created directory at: artifacts]
+[2023-09-08 00:34:50,900: INFO: common: created directory at: artifacts/model_trainer]
+[2023-09-08 00:35:02,257: DEBUG: model: Spello training started..]
+[2023-09-08 00:35:02,276: DEBUG: model: Context model training started ...]
+[2023-09-08 00:35:02,581: DEBUG: model: Symspell training started ...]
+[2023-09-08 00:35:03,055: INFO: symspell: Creating spell check dictionary...]
+[2023-09-08 00:35:17,727: INFO: symspell: 14.66 seconds to run]
+[2023-09-08 00:35:17,727: INFO: symspell: total words processed: 4153]
+[2023-09-08 00:35:17,727: INFO: symspell: total unique words in corpus: 4153]
+[2023-09-08 00:35:17,727: INFO: symspell: total items in dictionary (corpus words & deletions): 383935]
+[2023-09-08 00:35:17,727: INFO: symspell: edit distance for deletions: 3]
+[2023-09-08 00:35:17,727: INFO: symspell: length of longest word in corpus: 19]
+[2023-09-08 00:35:17,727: DEBUG: model: Phoneme training started ...]
+[2023-09-08 00:35:17,834: DEBUG: model: Spello training completed successfully ...]
+[2023-09-08 00:35:18,009: INFO: main: >>>>>> stage Model Training stage completed <<<<<<
+
+x==========x]
diff --git a/main.py b/main.py
@@ -1,7 +1,7 @@
 from spellX import logger
 from spellX.pipeline.stage_01_data_ingestion import DataIngestionTrainingPipeline
 from spellX.pipeline.stage_02_data_validation import DataValidationTrainingPipeline
-
+from spellX.pipeline.stage_03_model_training import ModelTrainingPipeline
 
 STAGE_NAME = "Data Ingestion stage"
 try:
@@ -19,6 +19,17 @@
    data_validation = DataValidationTrainingPipeline()
    data_validation.main()
    logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x")
+except Exception as e:
+         logger.exception(e)
+         raise e
+
+
+STAGE_NAME = "Model Training stage"  # label interpolated into this stage's start/completion log lines
+try:
+   logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<") 
+   model_training = ModelTrainingPipeline()
+   model_training.main()  # runs the stage; any exception falls through to the handler below
+   logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x")
 except Exception as e:
          logger.exception(e)  # records the error together with its traceback in the run log
          raise e  # re-raise so the script stops at the failing stage
diff --git a/research/03_model_training.ipynb b/research/03_model_training.ipynb
@@ -25,7 +25,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -37,13 +37,12 @@
     "class ModelTrainerConfig:\n",
     "    root_dir : Path\n",
     "    data_path : Path\n",
-    "    data_file : Path\n",
-    "    model_ckpt : Path"
+    "    data_file : Path"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -53,7 +52,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 13,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -71,15 +70,14 @@
     "\n",
     "    \n",
     "    def get_model_trainer_config(self) -> ModelTrainerConfig:\n",
-    "        config = self.config.model_traine\n",
+    "        config = self.config.model_trainer\n",
     "\n",
     "        create_directories([config.root_dir])\n",
     "\n",
     "        model_trainer_config = ModelTrainerConfig(\n",
     "            root_dir=config.root_dir,\n",
     "            data_path=config.data_path,\n",
     "            data_file = config.data_file,\n",
-    "            model_ckpt = config.model_ckpt,\n",
     "            \n",
     "        )\n",
     "\n",
@@ -88,7 +86,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 14,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -104,7 +102,7 @@
     "\n",
     "    def get_model_filename(self):\n",
     "        # Generate the filename for the current model\n",
-    "        return f\"{self.config.model_ckpt}/model{self.model_count}.pkl\"\n",
+    "        return f\"{self.config.root_dir}/model{self.model_count}.pkl\"\n",
     "    \n",
     "    def train(self):\n",
     "\n",
@@ -147,9 +145,43 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 15,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[2023-09-07 23:51:27,590: INFO: common: yaml file: config\\config.yaml loaded successfully]\n",
+      "[2023-09-07 23:51:27,599: INFO: common: yaml file: params.yaml loaded successfully]\n",
+      "[2023-09-07 23:51:27,599: INFO: common: created directory at: artifacts]\n",
+      "[2023-09-07 23:51:27,607: INFO: common: created directory at: artifacts/model_trainer]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Spello training started..\n",
+      "[2023-09-07 23:51:30,191: DEBUG: model: Spello training started..]\n",
+      "Context model training started ...\n",
+      "[2023-09-07 23:51:30,947: DEBUG: model: Context model training started ...]\n",
+      "Symspell training started ...\n",
+      "[2023-09-07 23:52:04,197: DEBUG: model: Symspell training started ...]\n",
+      "[2023-09-07 23:52:04,205: INFO: symspell: Creating spell check dictionary...]\n",
+      "[2023-09-07 23:53:06,359: INFO: symspell: 62.16 seconds to run]\n",
+      "[2023-09-07 23:53:06,367: INFO: symspell: total words processed: 29624]\n",
+      "[2023-09-07 23:53:06,368: INFO: symspell: total unique words in corpus: 29624]\n",
+      "[2023-09-07 23:53:06,371: INFO: symspell: total items in dictionary (corpus words & deletions): 2112195]\n",
+      "[2023-09-07 23:53:06,371: INFO: symspell: edit distance for deletions: 3]\n",
+      "[2023-09-07 23:53:06,371: INFO: symspell: length of longest word in corpus: 18]\n",
+      "Phoneme training started ...\n",
+      "[2023-09-07 23:53:06,371: DEBUG: model: Phoneme training started ...]\n",
+      "Spello training completed successfully ...\n",
+      "[2023-09-07 23:53:06,953: DEBUG: model: Spello training completed successfully ...]\n"
+     ]
+    }
+   ],
    "source": [
     "try:\n",
     "    config = ConfigurationManager()\n",
@@ -159,6 +191,13 @@
     "except Exception as e:\n",
     "    raise e"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {

diff --git a/src/spellX/components/model_trainer.py b/src/spellX/components/model_trainer.py
@@ -0,0 +1,66 @@
+from spello.model import SpellCorrectionModel
+import re
+import pickle
+from spellX.utils.trainer  import read_text
+from spellX import logger
+from spellX.entity.config_entity import ModelTrainerConfig
+import os
+
+class ModelTrainer :
+    def __init__(self, config: ModelTrainerConfig):
+        self.config = config
+        self.model_count = 0 
+
+    def get_model_filename(self):
+        # Generate the filename for the current model
+        return f"{self.config.root_dir}/model{self.model_count}.pkl"
+
+    def get_data_filename(self):
+        # Generate the filename for the current model
+        return f"{self.config.data_path}/{self.config.data_file}"
+
+    def train(self):
+        try :
+            model_path = self.get_model_filename()
+
+            if not os.path.exists(model_path):
+                with open("./artifacts/data_ingestion/data/train/big.txt", "r") as f:
+                    big = f.readlines()
+                big  = [i.strip() for i in big]
+                #Remove \t - tab
+                big_t = [re.sub('\\t', ' ', text) for text in big]
+                #Remove \\
+                big_ = [re.sub("\\'", "", text) for text in big_t]
+                #Remove
+                big_r = [text for text in big_ if text != '']
+                #Remove Special characters
+                big_star = [re.sub(r'[^a-zA-Z]+', ' ', text) for text in big_r]
+                #Remove leading and trailing spaces
+                big_stripped = [text.strip() for text in big_star]
+                sp = SpellCorrectionModel(language='en')
+                sp.train(big_stripped)
+                self.sp = sp
+
+                with open(model_path, 'wb') as file:
+                    pickle.dump(self.sp, file)
+
+            else:
+                # Model loading
+                with open(model_path, 'rb') as file:
+                    sp = pickle.load(file)
+
+                # New Data gathering
+                path = self.get_data_filename()
+                with open(path, "r") as f:
+                    data = f.readlines()
+                sp.train(data)
+
+                # Increment the model count for the next model
+                self.model_count += 1
+        except Exception as e:
+            raise e
+
+
+
+
+
diff --git a/src/spellX/config/__pycache__/configuration.cpython-36.pyc b/src/spellX/config/__pycache__/configuration.cpython-36.pyc