Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ros_speech_recognition] Add vosk engine #462

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 5 additions & 8 deletions ros_speech_recognition/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,17 +12,14 @@ generate_dynamic_reconfigure_options(
cfg/SpeechRecognition.cfg
)

add_custom_target(${PROJECT_NAME}_install_trained_data ALL COMMAND python$ENV{ROS_PYTHON_VERSION} ${PROJECT_SOURCE_DIR}/scripts/install_trained_data.py)

catkin_package()

if($ENV{ROS_DISTRO} STRGREATER "melodic")
catkin_generate_virtualenv(
PYTHON_INTERPRETER python3
)
else()
catkin_generate_virtualenv(
PYTHON_INTERPRETER python2
catkin_generate_virtualenv(
PYTHON_INTERPRETER python3
CHECK_VENV FALSE
)
endif()

file(GLOB PYTHON_SCRIPT_FILES scripts/*.py test/*.py)
catkin_install_python(
Expand Down
8 changes: 8 additions & 0 deletions ros_speech_recognition/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,14 @@ roslaunch ros_speech_recognition parrotry.launch language:=ja-JP

Auth key for Bing API.
This is valid only if `~engine` is `bing`.

* `~vosk_model_path` (`String`, default: `None`)

Path to the trained model for the Vosk API.
This is valid only if `~engine` is `Vosk`.

If `en-US` or `ja` is selected as `~language`, you do not need to specify the path.
To load other models, please download them from [Model list](https://alphacephei.com/vosk/models).

## Author

Expand Down
3 changes: 2 additions & 1 deletion ros_speech_recognition/cfg/SpeechRecognition.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@ engine_enum = gen.enum([gen.const("Google", str_t, "Google", "Google Speech Reco
gen.const("Wit", str_t, "Wit", "Wit.ai API"),
gen.const("Bing", str_t, "Bing", "Microsoft Bing Speech API"),
gen.const("Houndify", str_t, "Houndify", "Houndify API"),
gen.const("IBM", str_t, "IBM", "IBM Speech to Text API")],
gen.const("IBM", str_t, "IBM", "IBM Speech to Text API"),
gen.const("Vosk", str_t, "Vosk", "Vosk API")],
"engine")

# name type level description default min max
Expand Down
2 changes: 2 additions & 0 deletions ros_speech_recognition/package.xml
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,15 @@

<build_depend>catkin_virtualenv</build_depend>
<build_depend>dynamic_reconfigure</build_depend>
<build_depend>jsk_data</build_depend>
<build_depend>speech_recognition_msgs</build_depend>
<build_depend>g++-static</build_depend>

<run_depend>audio_capture</run_depend>
<run_depend>audio_common_msgs</run_depend>
<run_depend>dynamic_reconfigure</run_depend>
<run_depend>flac</run_depend>
<run_depend>jsk_data</run_depend>
<run_depend>sound_play</run_depend>
<run_depend>speech_recognition_msgs</run_depend>
<run_depend>ubuntu-sounds</run_depend>
Expand Down
3 changes: 2 additions & 1 deletion ros_speech_recognition/requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
SpeechRecognition==3.8.1
SpeechRecognition==3.9.0
vosk==0.3.45
Comment on lines +1 to +2
Copy link
Member

@tkmtnt7000 tkmtnt7000 May 31, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am very sorry for too late comment.
I just found that both SpeechRecognition==3.9.0 and vosk are not compatible with python 3.4, so indigo test will surely fail.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  1. Drop indigo support like 2c9e010
  2. Support indigo like e8742b0

I think it may be ok to drop indigo support dependent on #471 discussion.

44 changes: 44 additions & 0 deletions ros_speech_recognition/scripts/install_trained_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#!/usr/bin/env python

import argparse
import multiprocessing

import jsk_data


def download_data(*args, **kwargs):
    """Run ``jsk_data.download_data`` in a separate process.

    All positional and keyword arguments are forwarded unchanged. The
    process is started but not joined here, and nothing is returned.
    """
    worker = multiprocessing.Process(
        target=jsk_data.download_data, args=args, kwargs=kwargs)
    worker.start()


def main():
    """Request the Vosk model archives used by ros_speech_recognition.

    Parses the command line and calls ``download_data`` once for the small
    Japanese model and once for the small US-English model, passing
    ``extract=True`` for both.
    """
    cli = argparse.ArgumentParser()
    # The flag is stored inverted: quiet stays True unless -v/--verbose
    # is given on the command line.
    cli.add_argument('-v', '--verbose', dest='quiet', action='store_false')
    quiet = cli.parse_args().quiet

    package = 'ros_speech_recognition'

    # One (path, url, md5) entry per archive, in download order.
    archives = (
        (
            'trained_data/vosk-model-small-ja-0.22.zip',
            'https://alphacephei.com/vosk/models/vosk-model-small-ja-0.22.zip',  # NOQA
            '0e3163dd62dfb0d823353718ac3cbf79',
        ),
        (
            'trained_data/vosk-model-small-en-us-0.15.zip',
            'https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip',  # NOQA
            '09ab50ccd62b674cbaa231b825f9c1cb',
        ),
    )
    for path, url, md5 in archives:
        download_data(
            pkg_name=package,
            path=path,
            url=url,
            md5=md5,
            extract=True,
            quiet=quiet,
        )


if __name__ == '__main__':
    main()
5 changes: 5 additions & 0 deletions ros_speech_recognition/scripts/speech_recognition_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import rospy
import speech_recognition as SR
from ros_speech_recognition.recognize_google_cloud import RecognizerEx
import ros_speech_recognition.recognize_vosk
import json
import array
import sys
Expand Down Expand Up @@ -275,6 +276,10 @@ def recognize(self, audio):
recog_func = self.recognizer.recognize_houndify
elif self.engine == Config.SpeechRecognition_IBM:
recog_func = self.recognizer.recognize_ibm
elif self.engine == Config.SpeechRecognition_Vosk:
if not self.args:
self.args = {'model_path': rospy.get_param('~vosk_model_path', None)}
recog_func = self.recognizer.recognize_vosk

return recog_func(audio_data=audio, language=self.language, **self.args)

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Override of recognize_vosk.
# We need this so that a Vosk model can be loaded from any location.

from speech_recognition import AudioData
from ros_speech_recognition.recognize_google_cloud import RecognizerEx
from vosk import Model, KaldiRecognizer
import json
import os.path as osp
import rospkg
import rospy

def recognize_vosk(self, audio_data, model_path=None, language='en-US'):
    """Recognize speech in ``audio_data`` with Vosk and return the text.

    The Vosk model is loaded on the first call and stored on the instance
    as ``self.vosk_model``; later calls reuse that model.

    Args:
        self: recognizer instance (this function is attached to
            ``RecognizerEx`` at the bottom of this file).
        audio_data: ``speech_recognition.AudioData`` to transcribe.
        model_path: path handed to ``vosk.Model``. When ``None``, a model
            under the ``trained_data`` directory of the
            ``ros_speech_recognition`` package is chosen from ``language``.
        language: language code; only ``'en-US'`` and ``'ja'`` have a
            default model.

    Returns:
        The ``'text'`` field of the JSON returned by Vosk's final result.

    Raises:
        AssertionError: if ``audio_data`` is not an ``AudioData``.
        SystemExit: if ``model_path`` is ``None`` and ``language`` has no
            default model.
    """
    assert isinstance(audio_data, AudioData), "Data must be audio data"

    # Sample rate (Hz) used both for the recognizer and for converting the
    # incoming audio, so the two always agree.
    sample_rate = 16000

    # NOTE(review): the model is only loaded when the instance has no
    # ``vosk_model`` yet, so a different ``language`` or ``model_path`` on a
    # later call is ignored -- confirm this is intended.
    if not hasattr(self, 'vosk_model'):
        if model_path is None:
            data_path = osp.join(
                rospkg.RosPack().get_path('ros_speech_recognition'),
                'trained_data')
            if language == 'en-US':
                model_path = osp.join(data_path, 'vosk-model-small-en-us-0.15')
            elif language == 'ja':
                model_path = osp.join(data_path, 'vosk-model-small-ja-0.22')
            else:
                rospy.logerr("Unsupported language: {0}.\n Please download the model from https://alphacephei.com/vosk/models and specify its path as 'vosk_model_path'.".format(language))
                # Raise SystemExit directly: the ``exit`` helper is injected
                # by the ``site`` module for interactive use and is not
                # guaranteed to exist in every interpreter setup.
                raise SystemExit(1)
        rospy.loginfo("Loading model from {}".format(model_path))
        self.vosk_model = Model(model_path)

    rec = KaldiRecognizer(self.vosk_model, sample_rate)
    # Feed the whole utterance at once, converted to the recognizer's
    # sample rate with convert_width=2.
    rec.AcceptWaveform(
        audio_data.get_raw_data(convert_rate=sample_rate, convert_width=2))
    return json.loads(rec.FinalResult())['text']


# Attach the function as a method so it is available as
# ``RecognizerEx.recognize_vosk``.
RecognizerEx.recognize_vosk = recognize_vosk
2 changes: 2 additions & 0 deletions ros_speech_recognition/trained_data/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
*
!.gitignore