MPNetEmbeddings Integration #218

Merged · 1 commit · Nov 8, 2023

18 changes: 18 additions & 0 deletions nlu/components/embeddings/sentence_mpnet/MPNetSentenceEmbedding.py
@@ -0,0 +1,18 @@
from sparknlp.annotator import MPNetEmbeddings


class MPNetSentence:
    @staticmethod
    def get_default_model():
        # Load the default pretrained MPNet sentence-embedding model.
        return MPNetEmbeddings.pretrained() \
            .setInputCols(["documents"]) \
            .setOutputCol("mpnet_embeddings")

    @staticmethod
    def get_pretrained_model(name, language, bucket=None):
        # Load a specific pretrained model, addressed by name, language and bucket.
        return MPNetEmbeddings.pretrained(name, language, bucket) \
            .setInputCols(["documents"]) \
            .setOutputCol("mpnet_embeddings")

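For orientation, a minimal usage sketch of how this component surfaces through NLU's public API. The model reference comes from the test added in this PR; running it assumes a working Spark NLP / NLU environment:

import nlu

# nlu.load resolves the reference below to MPNetSentence.get_default_model /
# get_pretrained_model and wires documents -> mpnet_embeddings.
pipe = nlu.load('en.embed_sentence.mpnet.all_mpnet_base_v2')
df = pipe.predict('This is an example sentence', output_level='document')
print(df.columns)  # document text plus a sentence-embedding column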
Empty file.
@@ -22,6 +22,7 @@
BertSentenceEmbeddings,
UniversalSentenceEncoder,
SentenceEmbeddings,
MPNetEmbeddings,
ContextSpellCheckerModel ,
SymmetricDeleteModel ,
NorvigSweetingModel ,
@@ -86,6 +87,7 @@
BertSentenceEmbeddings,
UniversalSentenceEncoder,
SentenceEmbeddings,
MPNetEmbeddings,
MultiClassifierDLModel,
ClassifierDLModel ,
ChunkEmbeddings ,
3 changes: 3 additions & 0 deletions nlu/pipe/col_substitution/substitution_map_OS.py
@@ -64,6 +64,9 @@
SentenceEmbeddings: {
'default': substitute_sent_embed_cols,
},
MPNetEmbeddings: {
'default': substitute_sent_embed_cols,
},
Tokenizer: {
'default': substitute_tokenizer_cols,
},
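As a side note, here is a rough sketch of how a substitution map like this might be consulted when NLU renames raw Spark NLP output columns. The resolver below is hypothetical (the real lookup lives elsewhere in nlu/pipe/col_substitution), but it shows why MPNetEmbeddings needs an entry here:

def resolve_substitutor(anno_class, substitution_map):
    # Return the column-renaming function registered for an annotator class;
    # with this PR, MPNetEmbeddings resolves to substitute_sent_embed_cols.
    entry = substitution_map.get(anno_class)
    return entry['default'] if entry else None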
712 changes: 711 additions & 1 deletion nlu/spellbook.py

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions nlu/universe/annotator_class_universe.py
@@ -103,6 +103,7 @@ class AnnoClassRef:
A_N.ROBERTA_FOR_ZERO_SHOT_CLASSIFICATION: 'RoBertaForZeroShotClassification',
A_N.LONGFORMER_FOR_TOKEN_CLASSIFICATION: 'LongformerForTokenClassification',
A_N.MARIAN_TRANSFORMER: 'MarianTransformer',
A_N.MPNET_SENTENCE_EMBEDDINGS: 'MPNetEmbeddings',
A_N.ROBERTA_EMBEDDINGS: 'RoBertaEmbeddings',
A_N.ROBERTA_FOR_TOKEN_CLASSIFICATION: 'RoBertaForTokenClassification',
A_N.ROBERTA_SENTENCE_EMBEDDINGS: 'RoBertaSentenceEmbeddings',
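This registry is what lets NLU translate an internal annotator ID into the Spark NLP class it should instantiate. A small illustration, assuming the dict shown above is AnnoClassRef.JSL_anno2_py_class (the name under which component_universes.py references it):

from nlu.universe.annotator_class_universe import AnnoClassRef
from nlu.universe.feature_node_ids import NLP_NODE_IDS

# The new entry maps the MPNet node ID to the Spark NLP annotator class name.
assert AnnoClassRef.JSL_anno2_py_class[NLP_NODE_IDS.MPNET_SENTENCE_EMBEDDINGS] == 'MPNetEmbeddings'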
21 changes: 21 additions & 0 deletions nlu/universe/component_universes.py
@@ -74,6 +74,7 @@
from nlu.components.embeddings.longformer.longformer import Longformer
from nlu.components.embeddings.roberta.roberta import Roberta
from nlu.components.embeddings.sentence_bert.BertSentenceEmbedding import BertSentence
from nlu.components.embeddings.sentence_mpnet.MPNetSentenceEmbedding import MPNetSentence
from nlu.components.embeddings.sentence_xlm.sentence_xlm import Sentence_XLM
from nlu.components.embeddings.use.spark_nlp_use import SparkNLPUse
from nlu.components.embeddings.word2vec.word2vec import Word2Vec
@@ -2551,6 +2552,26 @@ class ComponentUniverse:
jsl_anno_class_id=A.MARIAN_TRANSFORMER,
jsl_anno_py_class=ACR.JSL_anno2_py_class[A.MARIAN_TRANSFORMER],
),
A.MPNET_SENTENCE_EMBEDDINGS: partial(NluComponent,
name=A.MPNET_SENTENCE_EMBEDDINGS,
type=T.DOCUMENT_EMBEDDING,
get_default_model=MPNetSentence.get_default_model,
get_pretrained_model=MPNetSentence.get_pretrained_model,
pdf_extractor_methods={'default': default_sentence_embedding_config,
'default_full': default_full_config, },
pdf_col_name_substitutor=substitute_sent_embed_cols,
output_level=L.INPUT_DEPENDENT_DOCUMENT_EMBEDDING,
node=NLP_FEATURE_NODES.nodes[A.MPNET_SENTENCE_EMBEDDINGS],
description='Sentence-level embeddings using MPNet. MPNet (Masked and Permuted Pre-training for Language Understanding) provides dense vector representations for natural language via a Transformer network pre-trained with a combination of masked and permuted language modeling.',
provider=ComponentBackends.open_source,
license=Licenses.open_source,
computation_context=ComputeContexts.spark,
output_context=ComputeContexts.spark,
jsl_anno_class_id=A.MPNET_SENTENCE_EMBEDDINGS,
jsl_anno_py_class=ACR.JSL_anno2_py_class[A.MPNET_SENTENCE_EMBEDDINGS],
has_storage_ref=True,
is_storage_ref_producer=True,
),
A.ROBERTA_EMBEDDINGS: partial(NluComponent,
name=A.ROBERTA_EMBEDDINGS,
type=T.TOKEN_EMBEDDING,
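Two details in the registration above are worth a note. First, components are stored as functools.partial objects: the static metadata is fixed at import time, and the remaining NluComponent fields are filled in when nlu.load() resolves a reference. A self-contained toy illustration of that pattern (the class below is a stand-in, not NLU's actual NluComponent):

from functools import partial

class Component:
    # Stand-in for NluComponent, for illustration only.
    def __init__(self, name, model=None):
        self.name, self.model = name, model

registered = partial(Component, name='mpnet_sentence_embeddings')  # import time
instance = registered(model='pretrained mpnet model')              # load time

Second, has_storage_ref=True and is_storage_ref_producer=True mark the embedder as producing a storage reference, which downstream annotators trained on these embeddings must match.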
2 changes: 1 addition & 1 deletion nlu/universe/feature_node_ids.py
@@ -105,7 +105,7 @@ class NLP_NODE_IDS:
CONVNEXT_IMAGE_CLASSIFICATION = JslAnnoId("convnext_image_classification")
SWIN_IMAGE_CLASSIFICATION = JslAnnoId("swin_image_classification")
BART_TRANSFORMER = JslAnnoId("bart_transformer")

MPNET_SENTENCE_EMBEDDINGS = JslAnnoId('mpnet_sentence_embeddings')



1 change: 1 addition & 0 deletions nlu/universe/feature_node_universes.py
@@ -73,6 +73,7 @@ class NLP_FEATURE_NODES: # or Mode Node?
A = NLP_NODE_IDS
F = NLP_FEATURES
nodes = {
A.MPNET_SENTENCE_EMBEDDINGS: NlpFeatureNode(A.MPNET_SENTENCE_EMBEDDINGS, [F.DOCUMENT], [F.SENTENCE_EMBEDDINGS]),

A.PARTIALLY_IMPLEMENTED: NlpFeatureNode(A.PARTIALLY_IMPLEMENTED, [F.UNKOWN], [F.UNKOWN]),

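The node registered above declares that MPNet consumes F.DOCUMENT and produces F.SENTENCE_EMBEDDINGS; this graph metadata is how NLU knows to prepend a document-producing component when a user supplies only raw text. A self-contained sketch of that style of dependency check (plain strings stand in for NLU's feature constants):

def missing_inputs(node_inputs, provided_features):
    # Features that must still be satisfied before this node can run.
    return [f for f in node_inputs if f not in provided_features]

# MPNet node: needs 'document'; raw text alone does not provide it,
# so a document assembler gets inserted first.
print(missing_inputs(['document'], {'raw_text'}))  # -> ['document']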
@@ -0,0 +1,22 @@
import unittest

from nlu import *


class TestMPNetSentenceEmbeddings(unittest.TestCase):
    def test_mpnet_embeds(self):
        # Load the default MPNet sentence-embedding model and embed one document.
        res = nlu.load('en.embed_sentence.mpnet.all_mpnet_base_v2').predict(
            'This is an example sentence', output_level='document')
        for c in res:
            print(res[c])

        # Load a second pretrained MPNet variant to verify reference resolution.
        res = nlu.load('en.embed_sentence.mpnet.all_mpnet_base_questions_clustering_english').predict(
            'Each sentence is converted', output_level='document')
        for c in res:
            print(res[c])


if __name__ == "__main__":
    unittest.main()
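When this test runs, each predict() call returns a pandas DataFrame (NLU's default output), so the loops print the document text alongside the new embedding column; the exact column names follow the substitution rules registered in substitution_map_OS.py above.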