Skip to content

Commit

Permalink
Model training
Browse files Browse the repository at this point in the history
  • Loading branch information
IbLahlou committed Sep 7, 2023
1 parent 470e113 commit 077284f
Show file tree
Hide file tree
Showing 11 changed files with 334 additions and 40 deletions.
170 changes: 170 additions & 0 deletions logs/running_logs.log
Original file line number Diff line number Diff line change
Expand Up @@ -546,3 +546,173 @@ x==========x]
[2023-09-07 23:53:06,371: INFO: symspell: length of longest word in corpus: 18]
[2023-09-07 23:53:06,371: DEBUG: model: Phoneme training started ...]
[2023-09-07 23:53:06,953: DEBUG: model: Spello training completed successfully ...]
[2023-09-08 00:19:06,806: INFO: main: >>>>>> stage Data Ingestion stage started <<<<<<]
[2023-09-08 00:19:06,806: INFO: common: yaml file: config\config.yaml loaded successfully]
[2023-09-08 00:19:06,822: INFO: common: yaml file: params.yaml loaded successfully]
[2023-09-08 00:19:06,822: INFO: common: created directory at: artifacts]
[2023-09-08 00:19:06,822: INFO: common: created directory at: artifacts/data_ingestion]
[2023-09-08 00:19:06,822: INFO: data_ingestion: File already exists of size: ~ 2473 KB]
[2023-09-08 00:19:07,069: INFO: main: >>>>>> stage Data Ingestion stage completed <<<<<<

x==========x]
[2023-09-08 00:19:07,069: INFO: main: >>>>>> stage Data Validation stage started <<<<<<]
[2023-09-08 00:19:07,085: INFO: common: yaml file: config\config.yaml loaded successfully]
[2023-09-08 00:19:07,088: INFO: common: yaml file: params.yaml loaded successfully]
[2023-09-08 00:19:07,088: INFO: common: created directory at: artifacts]
[2023-09-08 00:19:07,093: INFO: common: created directory at: artifacts/data_validation]
[2023-09-08 00:19:07,093: INFO: main: >>>>>> stage Data Validation stage completed <<<<<<

x==========x]
[2023-09-08 00:19:07,101: INFO: main: >>>>>> stage Model Training stage started <<<<<<]
[2023-09-08 00:19:07,109: INFO: common: yaml file: config\config.yaml loaded successfully]
[2023-09-08 00:19:07,116: INFO: common: yaml file: params.yaml loaded successfully]
[2023-09-08 00:19:07,116: INFO: common: created directory at: artifacts]
[2023-09-08 00:19:07,116: ERROR: main: 'ConfigurationManager' object has no attribute 'get_model_trainer_config']
Traceback (most recent call last):
File ".\main.py", line 31, in <module>
model_training.main()
File "c:\users\p52s\documents\python project\spellx\src\spellX\pipeline\stage_03_model_training.py", line 12, in main
model_trainer_config = config.get_model_trainer_config()
AttributeError: 'ConfigurationManager' object has no attribute 'get_model_trainer_config'
[2023-09-08 00:21:18,631: INFO: main: >>>>>> stage Data Ingestion stage started <<<<<<]
[2023-09-08 00:21:18,637: INFO: common: yaml file: config\config.yaml loaded successfully]
[2023-09-08 00:21:18,646: INFO: common: yaml file: params.yaml loaded successfully]
[2023-09-08 00:21:18,648: INFO: common: created directory at: artifacts]
[2023-09-08 00:21:18,648: INFO: common: created directory at: artifacts/data_ingestion]
[2023-09-08 00:21:18,648: INFO: data_ingestion: File already exists of size: ~ 2473 KB]
[2023-09-08 00:21:18,848: INFO: main: >>>>>> stage Data Ingestion stage completed <<<<<<

x==========x]
[2023-09-08 00:21:18,848: INFO: main: >>>>>> stage Data Validation stage started <<<<<<]
[2023-09-08 00:21:18,857: INFO: common: yaml file: config\config.yaml loaded successfully]
[2023-09-08 00:21:18,857: INFO: common: yaml file: params.yaml loaded successfully]
[2023-09-08 00:21:18,857: INFO: common: created directory at: artifacts]
[2023-09-08 00:21:18,857: INFO: common: created directory at: artifacts/data_validation]
[2023-09-08 00:21:18,857: INFO: main: >>>>>> stage Data Validation stage completed <<<<<<

x==========x]
[2023-09-08 00:21:18,857: INFO: main: >>>>>> stage Model Training stage started <<<<<<]
[2023-09-08 00:21:18,873: INFO: common: yaml file: config\config.yaml loaded successfully]
[2023-09-08 00:21:18,873: INFO: common: yaml file: params.yaml loaded successfully]
[2023-09-08 00:21:18,873: INFO: common: created directory at: artifacts]
[2023-09-08 00:21:18,873: INFO: common: created directory at: artifacts/model_trainer]
[2023-09-08 00:21:30,592: ERROR: main: [Errno 2] No such file or directory: 'artifacts/data_ingestion/data/train\\w']
Traceback (most recent call last):
File ".\main.py", line 31, in <module>
model_training.main()
File "c:\users\p52s\documents\python project\spellx\src\spellX\pipeline\stage_03_model_training.py", line 14, in main
model_trainer_config.train()
File "c:\users\p52s\documents\python project\spellx\src\spellX\components\model_trainer.py", line 52, in train
raise e
File "c:\users\p52s\documents\python project\spellx\src\spellX\components\model_trainer.py", line 46, in train
data = read_text(self.config.data_path, self.config.data_file)
File "c:\users\p52s\documents\python project\spellx\src\spellX\utils\trainer.py", line 110, in read_text
strings = unidecode.unidecode(open(file_path).read())
FileNotFoundError: [Errno 2] No such file or directory: 'artifacts/data_ingestion/data/train\\w'
[2023-09-08 00:22:27,677: INFO: main: >>>>>> stage Data Ingestion stage started <<<<<<]
[2023-09-08 00:22:27,707: INFO: common: yaml file: config\config.yaml loaded successfully]
[2023-09-08 00:22:27,718: INFO: common: yaml file: params.yaml loaded successfully]
[2023-09-08 00:22:27,725: INFO: common: created directory at: artifacts]
[2023-09-08 00:22:27,725: INFO: common: created directory at: artifacts/data_ingestion]
[2023-09-08 00:22:27,725: INFO: data_ingestion: File already exists of size: ~ 2473 KB]
[2023-09-08 00:22:27,962: INFO: main: >>>>>> stage Data Ingestion stage completed <<<<<<

x==========x]
[2023-09-08 00:22:27,962: INFO: main: >>>>>> stage Data Validation stage started <<<<<<]
[2023-09-08 00:22:27,962: INFO: common: yaml file: config\config.yaml loaded successfully]
[2023-09-08 00:22:27,979: INFO: common: yaml file: params.yaml loaded successfully]
[2023-09-08 00:22:27,979: INFO: common: created directory at: artifacts]
[2023-09-08 00:22:27,979: INFO: common: created directory at: artifacts/data_validation]
[2023-09-08 00:22:27,985: INFO: main: >>>>>> stage Data Validation stage completed <<<<<<

x==========x]
[2023-09-08 00:22:27,985: INFO: main: >>>>>> stage Model Training stage started <<<<<<]
[2023-09-08 00:22:27,993: INFO: common: yaml file: config\config.yaml loaded successfully]
[2023-09-08 00:22:28,001: INFO: common: yaml file: params.yaml loaded successfully]
[2023-09-08 00:22:28,008: INFO: common: created directory at: artifacts]
[2023-09-08 00:22:28,008: INFO: common: created directory at: artifacts/model_trainer]
[2023-09-08 00:22:38,007: ERROR: main: [Errno 2] No such file or directory: 'artifacts/data_ingestion/data/train/w']
Traceback (most recent call last):
File ".\main.py", line 31, in <module>
model_training.main()
File "c:\users\p52s\documents\python project\spellx\src\spellX\pipeline\stage_03_model_training.py", line 14, in main
model_trainer_config.train()
File "c:\users\p52s\documents\python project\spellx\src\spellX\components\model_trainer.py", line 52, in train
raise e
File "c:\users\p52s\documents\python project\spellx\src\spellX\components\model_trainer.py", line 46, in train
data = read_text(self.config.data_path, self.config.data_file)
File "c:\users\p52s\documents\python project\spellx\src\spellX\utils\trainer.py", line 110, in read_text
strings = unidecode.unidecode(open(file_path).read())
FileNotFoundError: [Errno 2] No such file or directory: 'artifacts/data_ingestion/data/train/w'
[2023-09-08 00:33:54,025: INFO: main: >>>>>> stage Data Ingestion stage started <<<<<<]
[2023-09-08 00:33:54,065: INFO: common: yaml file: config\config.yaml loaded successfully]
[2023-09-08 00:33:54,068: INFO: common: yaml file: params.yaml loaded successfully]
[2023-09-08 00:33:54,082: INFO: common: created directory at: artifacts]
[2023-09-08 00:33:54,085: INFO: common: created directory at: artifacts/data_ingestion]
[2023-09-08 00:33:54,085: INFO: data_ingestion: File already exists of size: ~ 2473 KB]
[2023-09-08 00:33:54,342: INFO: main: >>>>>> stage Data Ingestion stage completed <<<<<<

x==========x]
[2023-09-08 00:33:54,358: INFO: main: >>>>>> stage Data Validation stage started <<<<<<]
[2023-09-08 00:33:54,358: INFO: common: yaml file: config\config.yaml loaded successfully]
[2023-09-08 00:33:54,374: INFO: common: yaml file: params.yaml loaded successfully]
[2023-09-08 00:33:54,382: INFO: common: created directory at: artifacts]
[2023-09-08 00:33:54,382: INFO: common: created directory at: artifacts/data_validation]
[2023-09-08 00:33:54,389: INFO: main: >>>>>> stage Data Validation stage completed <<<<<<

x==========x]
[2023-09-08 00:33:54,397: INFO: main: >>>>>> stage Model Training stage started <<<<<<]
[2023-09-08 00:33:54,406: INFO: common: yaml file: config\config.yaml loaded successfully]
[2023-09-08 00:33:54,413: INFO: common: yaml file: params.yaml loaded successfully]
[2023-09-08 00:33:54,413: INFO: common: created directory at: artifacts]
[2023-09-08 00:33:54,426: INFO: common: created directory at: artifacts/model_trainer]
[2023-09-08 00:34:06,693: ERROR: main: 'ModelTrainer' object has no attribute 'data_file']
Traceback (most recent call last):
File ".\main.py", line 31, in <module>
model_training.main()
File "c:\users\p52s\documents\python project\spellx\src\spellX\pipeline\stage_03_model_training.py", line 14, in main
model_trainer_config.train()
File "c:\users\p52s\documents\python project\spellx\src\spellX\components\model_trainer.py", line 61, in train
raise e
File "c:\users\p52s\documents\python project\spellx\src\spellX\components\model_trainer.py", line 53, in train
path = self.get_data_filename()
File "c:\users\p52s\documents\python project\spellx\src\spellX\components\model_trainer.py", line 20, in get_data_filename
return f"{self.config.data_path}/{self.data_file}"
AttributeError: 'ModelTrainer' object has no attribute 'data_file'
[2023-09-08 00:34:50,605: INFO: main: >>>>>> stage Data Ingestion stage started <<<<<<]
[2023-09-08 00:34:50,618: INFO: common: yaml file: config\config.yaml loaded successfully]
[2023-09-08 00:34:50,627: INFO: common: yaml file: params.yaml loaded successfully]
[2023-09-08 00:34:50,627: INFO: common: created directory at: artifacts]
[2023-09-08 00:34:50,627: INFO: common: created directory at: artifacts/data_ingestion]
[2023-09-08 00:34:50,627: INFO: data_ingestion: File already exists of size: ~ 2473 KB]
[2023-09-08 00:34:50,831: INFO: main: >>>>>> stage Data Ingestion stage completed <<<<<<

x==========x]
[2023-09-08 00:34:50,846: INFO: main: >>>>>> stage Data Validation stage started <<<<<<]
[2023-09-08 00:34:50,846: INFO: common: yaml file: config\config.yaml loaded successfully]
[2023-09-08 00:34:50,858: INFO: common: yaml file: params.yaml loaded successfully]
[2023-09-08 00:34:50,864: INFO: common: created directory at: artifacts]
[2023-09-08 00:34:50,864: INFO: common: created directory at: artifacts/data_validation]
[2023-09-08 00:34:50,867: INFO: main: >>>>>> stage Data Validation stage completed <<<<<<

x==========x]
[2023-09-08 00:34:50,867: INFO: main: >>>>>> stage Model Training stage started <<<<<<]
[2023-09-08 00:34:50,883: INFO: common: yaml file: config\config.yaml loaded successfully]
[2023-09-08 00:34:50,889: INFO: common: yaml file: params.yaml loaded successfully]
[2023-09-08 00:34:50,898: INFO: common: created directory at: artifacts]
[2023-09-08 00:34:50,900: INFO: common: created directory at: artifacts/model_trainer]
[2023-09-08 00:35:02,257: DEBUG: model: Spello training started..]
[2023-09-08 00:35:02,276: DEBUG: model: Context model training started ...]
[2023-09-08 00:35:02,581: DEBUG: model: Symspell training started ...]
[2023-09-08 00:35:03,055: INFO: symspell: Creating spell check dictionary...]
[2023-09-08 00:35:17,727: INFO: symspell: 14.66 seconds to run]
[2023-09-08 00:35:17,727: INFO: symspell: total words processed: 4153]
[2023-09-08 00:35:17,727: INFO: symspell: total unique words in corpus: 4153]
[2023-09-08 00:35:17,727: INFO: symspell: total items in dictionary (corpus words & deletions): 383935]
[2023-09-08 00:35:17,727: INFO: symspell: edit distance for deletions: 3]
[2023-09-08 00:35:17,727: INFO: symspell: length of longest word in corpus: 19]
[2023-09-08 00:35:17,727: DEBUG: model: Phoneme training started ...]
[2023-09-08 00:35:17,834: DEBUG: model: Spello training completed successfully ...]
[2023-09-08 00:35:18,009: INFO: main: >>>>>> stage Model Training stage completed <<<<<<

x==========x]
13 changes: 12 additions & 1 deletion main.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from spellX import logger
from spellX.pipeline.stage_01_data_ingestion import DataIngestionTrainingPipeline
from spellX.pipeline.stage_02_data_validation import DataValidationTrainingPipeline

from spellX.pipeline.stage_03_model_training import ModelTrainingPipeline

STAGE_NAME = "Data Ingestion stage"
try:
Expand All @@ -19,6 +19,17 @@
data_validation = DataValidationTrainingPipeline()
data_validation.main()
logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x")
except Exception as e:
logger.exception(e)
raise e


# --- Stage 3: model training -------------------------------------------------
# Runs the model-training pipeline and brackets it with start/completion log
# lines. Any failure is logged with its traceback and then propagated so the
# process stops instead of continuing after a failed stage.
STAGE_NAME = "Model Training stage"
try:
    logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<")
    model_training = ModelTrainingPipeline()
    model_training.main()
    logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x")
except Exception as e:
    logger.exception(e)
    # Bare ``raise`` re-raises the active exception unchanged, without adding
    # an extra traceback entry for this handler.
    raise
61 changes: 50 additions & 11 deletions research/03_model_training.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -37,13 +37,12 @@
"class ModelTrainerConfig:\n",
" root_dir : Path\n",
" data_path : Path\n",
" data_file : Path\n",
" model_ckpt : Path"
" data_file : Path"
]
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -53,7 +52,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -71,15 +70,14 @@
"\n",
" \n",
" def get_model_trainer_config(self) -> ModelTrainerConfig:\n",
" config = self.config.model_traine\n",
" config = self.config.model_trainer\n",
"\n",
" create_directories([config.root_dir])\n",
"\n",
" model_trainer_config = ModelTrainerConfig(\n",
" root_dir=config.root_dir,\n",
" data_path=config.data_path,\n",
" data_file = config.data_file,\n",
" model_ckpt = config.model_ckpt,\n",
" \n",
" )\n",
"\n",
Expand All @@ -88,7 +86,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -104,7 +102,7 @@
"\n",
" def get_model_filename(self):\n",
" # Generate the filename for the current model\n",
" return f\"{self.config.model_ckpt}/model{self.model_count}.pkl\"\n",
" return f\"{self.config.root_dir}/model{self.model_count}.pkl\"\n",
" \n",
" def train(self):\n",
"\n",
Expand Down Expand Up @@ -147,9 +145,43 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 15,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2023-09-07 23:51:27,590: INFO: common: yaml file: config\\config.yaml loaded successfully]\n",
"[2023-09-07 23:51:27,599: INFO: common: yaml file: params.yaml loaded successfully]\n",
"[2023-09-07 23:51:27,599: INFO: common: created directory at: artifacts]\n",
"[2023-09-07 23:51:27,607: INFO: common: created directory at: artifacts/model_trainer]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Spello training started..\n",
"[2023-09-07 23:51:30,191: DEBUG: model: Spello training started..]\n",
"Context model training started ...\n",
"[2023-09-07 23:51:30,947: DEBUG: model: Context model training started ...]\n",
"Symspell training started ...\n",
"[2023-09-07 23:52:04,197: DEBUG: model: Symspell training started ...]\n",
"[2023-09-07 23:52:04,205: INFO: symspell: Creating spell check dictionary...]\n",
"[2023-09-07 23:53:06,359: INFO: symspell: 62.16 seconds to run]\n",
"[2023-09-07 23:53:06,367: INFO: symspell: total words processed: 29624]\n",
"[2023-09-07 23:53:06,368: INFO: symspell: total unique words in corpus: 29624]\n",
"[2023-09-07 23:53:06,371: INFO: symspell: total items in dictionary (corpus words & deletions): 2112195]\n",
"[2023-09-07 23:53:06,371: INFO: symspell: edit distance for deletions: 3]\n",
"[2023-09-07 23:53:06,371: INFO: symspell: length of longest word in corpus: 18]\n",
"Phoneme training started ...\n",
"[2023-09-07 23:53:06,371: DEBUG: model: Phoneme training started ...]\n",
"Spello training completed successfully ...\n",
"[2023-09-07 23:53:06,953: DEBUG: model: Spello training completed successfully ...]\n"
]
}
],
"source": [
"try:\n",
" config = ConfigurationManager()\n",
Expand All @@ -159,6 +191,13 @@
"except Exception as e:\n",
" raise e"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand Down
66 changes: 66 additions & 0 deletions src/spellX/components/model_trainer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
from spello.model import SpellCorrectionModel
import re
import pickle
from spellX.utils.trainer import read_text
from spellX import logger
from spellX.entity.config_entity import ModelTrainerConfig
import os

class ModelTrainer:
    """Train a spello ``SpellCorrectionModel`` and pickle it under ``root_dir``.

    The config object must expose ``root_dir``, ``data_path`` and
    ``data_file``; they are only ever interpolated into path strings here.

    Attributes:
        config: The configuration object passed to the constructor.
        model_count: Index used in the model filename; incremented after
            every successful ``train()`` call.
        sp: The most recently trained model, or ``None`` before the first
            successful ``train()`` call.
    """

    # Corpus used when no pickled model exists yet.
    # NOTE(review): this path is hard-coded and ignores ``config.data_path`` /
    # ``config.data_file`` -- confirm that is intended.
    INITIAL_CORPUS_PATH = "./artifacts/data_ingestion/data/train/big.txt"

    # One or more consecutive characters that are not ASCII letters.
    _NON_LETTERS = re.compile(r'[^a-zA-Z]+')

    def __init__(self, config: "ModelTrainerConfig"):
        self.config = config
        self.model_count = 0
        self.sp = None

    def get_model_filename(self):
        """Return the pickle path for the current ``model_count``."""
        return f"{self.config.root_dir}/model{self.model_count}.pkl"

    def get_data_filename(self):
        """Return the path of the data file used to update an existing model."""
        return f"{self.config.data_path}/{self.config.data_file}"

    @classmethod
    def _clean_lines(cls, lines):
        """Normalise raw corpus lines into letter-only training sentences.

        Per line, in this order: strip surrounding whitespace, turn tabs into
        spaces, delete apostrophes, drop the line if it is now empty, collapse
        every run of non-letter characters into one space, strip again.

        The empty-line filter runs before the non-letter step, so a line made
        only of digits/punctuation is kept as an empty string.

        Args:
            lines: Iterable of raw text lines.

        Returns:
            A list of cleaned strings.
        """
        cleaned = []
        for raw in lines:
            text = raw.strip().replace("\t", " ").replace("'", "")
            if text == '':
                continue
            cleaned.append(cls._NON_LETTERS.sub(' ', text).strip())
        return cleaned

    def train(self):
        """Train a new model, or further train the one already on disk.

        If no pickle exists at ``get_model_filename()``, a fresh English model
        is trained on the cleaned initial corpus and pickled to that path.
        Otherwise the pickled model is loaded and trained on the raw lines of
        ``get_data_filename()``.

        In both cases the resulting model is stored on ``self.sp`` and
        ``model_count`` is incremented. Exceptions from file access,
        unpickling or training propagate to the caller unchanged.
        """
        model_path = self.get_model_filename()

        if not os.path.exists(model_path):
            with open(self.INITIAL_CORPUS_PATH, "r") as f:
                raw_lines = f.readlines()
            sentences = self._clean_lines(raw_lines)

            sp = SpellCorrectionModel(language='en')
            sp.train(sentences)
            self.sp = sp

            with open(model_path, 'wb') as file:
                pickle.dump(self.sp, file)
        else:
            # SECURITY: pickle.load executes arbitrary code from the file; it
            # is only safe because the file is one this class wrote itself.
            with open(model_path, 'rb') as file:
                sp = pickle.load(file)

            # NOTE(review): the new data is passed to the model as raw lines,
            # without the cleaning applied to the initial corpus -- confirm.
            with open(self.get_data_filename(), "r") as f:
                data = f.readlines()
            sp.train(data)

            # Keep the updated model reachable; previously it was discarded
            # when this method returned.
            # NOTE(review): the updated model is not written back to disk --
            # confirm whether it should be persisted.
            self.sp = sp

        # Increment the model count for the next model
        self.model_count += 1





Binary file modified src/spellX/config/__pycache__/configuration.cpython-36.pyc
Binary file not shown.
Loading

0 comments on commit 077284f

Please sign in to comment.