MPNetEmbeddings Integration #218

Merged · 1 commit · Nov 8, 2023

18 changes: 18 additions & 0 deletions nlu/components/embeddings/sentence_mpnet/MPNetSentenceEmbedding.py
@@ -0,0 +1,18 @@
from sparknlp.annotator import MPNetEmbeddings


class MPNetSentence:
    @staticmethod
    def get_default_model():
        # Load the default pretrained MPNet sentence-embedding model.
        return MPNetEmbeddings.pretrained() \
            .setInputCols(["documents"]) \
            .setOutputCol("mpnet_embeddings")

    @staticmethod
    def get_pretrained_model(name, language, bucket=None):
        # Load a specific pretrained model, addressed by name, language and bucket.
        return MPNetEmbeddings.pretrained(name, language, bucket) \
            .setInputCols(["documents"]) \
            .setOutputCol("mpnet_embeddings")

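For orientation, a minimal usage sketch of how this component surfaces through NLU's public API. The model reference comes from the test added in this PR; running it assumes a working Spark NLP / NLU environment:

import nlu

# nlu.load resolves the reference below to MPNetSentence.get_default_model /
# get_pretrained_model and wires documents -> mpnet_embeddings.
pipe = nlu.load('en.embed_sentence.mpnet.all_mpnet_base_v2')
df = pipe.predict('This is an example sentence', output_level='document')
print(df.columns)  # document text plus a sentence-embedding column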
Empty file.
@@ -22,6 +22,7 @@
BertSentenceEmbeddings,
UniversalSentenceEncoder,
SentenceEmbeddings,
MPNetEmbeddings,
ContextSpellCheckerModel ,
SymmetricDeleteModel ,
NorvigSweetingModel ,
@@ -86,6 +87,7 @@
BertSentenceEmbeddings,
UniversalSentenceEncoder,
SentenceEmbeddings,
MPNetEmbeddings,
MultiClassifierDLModel,
ClassifierDLModel ,
ChunkEmbeddings ,
3 changes: 3 additions & 0 deletions nlu/pipe/col_substitution/substitution_map_OS.py
@@ -64,6 +64,9 @@
SentenceEmbeddings: {
'default': substitute_sent_embed_cols,
},
MPNetEmbeddings: {
'default': substitute_sent_embed_cols,
},
Tokenizer: {
'default': substitute_tokenizer_cols,
},
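As a side note, here is a rough sketch of how a substitution map like this might be consulted when NLU renames raw Spark NLP output columns. The resolver below is hypothetical (the real lookup lives elsewhere in nlu/pipe/col_substitution), but it shows why MPNetEmbeddings needs an entry here:

def resolve_substitutor(anno_class, substitution_map):
    # Return the column-renaming function registered for an annotator class;
    # with this PR, MPNetEmbeddings resolves to substitute_sent_embed_cols.
    entry = substitution_map.get(anno_class)
    return entry['default'] if entry else None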
712 changes: 711 additions & 1 deletion nlu/spellbook.py

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions nlu/universe/annotator_class_universe.py
@@ -103,6 +103,7 @@ class AnnoClassRef:
A_N.ROBERTA_FOR_ZERO_SHOT_CLASSIFICATION: 'RoBertaForZeroShotClassification',
A_N.LONGFORMER_FOR_TOKEN_CLASSIFICATION: 'LongformerForTokenClassification',
A_N.MARIAN_TRANSFORMER: 'MarianTransformer',
A_N.MPNET_SENTENCE_EMBEDDINGS: 'MPNetEmbeddings',
A_N.ROBERTA_EMBEDDINGS: 'RoBertaEmbeddings',
A_N.ROBERTA_FOR_TOKEN_CLASSIFICATION: 'RoBertaForTokenClassification',
A_N.ROBERTA_SENTENCE_EMBEDDINGS: 'RoBertaSentenceEmbeddings',
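This registry is what lets NLU translate an internal annotator ID into the Spark NLP class it should instantiate. A small illustration, assuming the dict shown above is AnnoClassRef.JSL_anno2_py_class (the name under which component_universes.py references it):

from nlu.universe.annotator_class_universe import AnnoClassRef
from nlu.universe.feature_node_ids import NLP_NODE_IDS

# The new entry maps the MPNet node ID to the Spark NLP annotator class name.
assert AnnoClassRef.JSL_anno2_py_class[NLP_NODE_IDS.MPNET_SENTENCE_EMBEDDINGS] == 'MPNetEmbeddings'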
21 changes: 21 additions & 0 deletions nlu/universe/component_universes.py
@@ -74,6 +74,7 @@
from nlu.components.embeddings.longformer.longformer import Longformer
from nlu.components.embeddings.roberta.roberta import Roberta
from nlu.components.embeddings.sentence_bert.BertSentenceEmbedding import BertSentence
from nlu.components.embeddings.sentence_mpnet.MPNetSentenceEmbedding import MPNetSentence
from nlu.components.embeddings.sentence_xlm.sentence_xlm import Sentence_XLM
from nlu.components.embeddings.use.spark_nlp_use import SparkNLPUse
from nlu.components.embeddings.word2vec.word2vec import Word2Vec
@@ -2551,6 +2552,26 @@ class ComponentUniverse:
jsl_anno_class_id=A.MARIAN_TRANSFORMER,
jsl_anno_py_class=ACR.JSL_anno2_py_class[A.MARIAN_TRANSFORMER],
),
A.MPNET_SENTENCE_EMBEDDINGS: partial(NluComponent,
name=A.MPNET_SENTENCE_EMBEDDINGS,
type=T.DOCUMENT_EMBEDDING,
get_default_model=MPNetSentence.get_default_model,
get_pretrained_model=MPNetSentence.get_pretrained_model,
pdf_extractor_methods={'default': default_sentence_embedding_config,
'default_full': default_full_config, },
pdf_col_name_substitutor=substitute_sent_embed_cols,
output_level=L.INPUT_DEPENDENT_DOCUMENT_EMBEDDING,
node=NLP_FEATURE_NODES.nodes[A.MPNET_SENTENCE_EMBEDDINGS],
description='Sentence-level embeddings using MPNet. MPNet (Masked and Permuted Pre-training for Language Understanding) provides dense vector representations for natural language via a Transformer network pre-trained with a combination of masked and permuted language modeling.',
provider=ComponentBackends.open_source,
license=Licenses.open_source,
computation_context=ComputeContexts.spark,
output_context=ComputeContexts.spark,
jsl_anno_class_id=A.MPNET_SENTENCE_EMBEDDINGS,
jsl_anno_py_class=ACR.JSL_anno2_py_class[A.MPNET_SENTENCE_EMBEDDINGS],
has_storage_ref=True,
is_storage_ref_producer=True,
),
A.ROBERTA_EMBEDDINGS: partial(NluComponent,
name=A.ROBERTA_EMBEDDINGS,
type=T.TOKEN_EMBEDDING,
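Two details in the registration above are worth a note. First, components are stored as functools.partial objects: the static metadata is fixed at import time, and the remaining NluComponent fields are filled in when nlu.load() resolves a reference. A self-contained toy illustration of that pattern (the class below is a stand-in, not NLU's actual NluComponent):

from functools import partial

class Component:
    # Stand-in for NluComponent, for illustration only.
    def __init__(self, name, model=None):
        self.name, self.model = name, model

registered = partial(Component, name='mpnet_sentence_embeddings')  # import time
instance = registered(model='pretrained mpnet model')              # load time

Second, has_storage_ref=True and is_storage_ref_producer=True mark the embedder as producing a storage reference, which downstream annotators trained on these embeddings must match.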
2 changes: 1 addition & 1 deletion nlu/universe/feature_node_ids.py
@@ -105,7 +105,7 @@ class NLP_NODE_IDS:
CONVNEXT_IMAGE_CLASSIFICATION = JslAnnoId("convnext_image_classification")
SWIN_IMAGE_CLASSIFICATION = JslAnnoId("swin_image_classification")
BART_TRANSFORMER = JslAnnoId("bart_transformer")

MPNET_SENTENCE_EMBEDDINGS = JslAnnoId('mpnet_sentence_embeddings')



1 change: 1 addition & 0 deletions nlu/universe/feature_node_universes.py
@@ -73,6 +73,7 @@ class NLP_FEATURE_NODES: # or Mode Node?
A = NLP_NODE_IDS
F = NLP_FEATURES
nodes = {
A.MPNET_SENTENCE_EMBEDDINGS: NlpFeatureNode(A.MPNET_SENTENCE_EMBEDDINGS, [F.DOCUMENT], [F.SENTENCE_EMBEDDINGS]),

A.PARTIALLY_IMPLEMENTED: NlpFeatureNode(A.PARTIALLY_IMPLEMENTED, [F.UNKOWN], [F.UNKOWN]),

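The node registered above declares that MPNet consumes F.DOCUMENT and produces F.SENTENCE_EMBEDDINGS; this graph metadata is how NLU knows to prepend a document-producing component when a user supplies only raw text. A self-contained sketch of that style of dependency check (plain strings stand in for NLU's feature constants):

def missing_inputs(node_inputs, provided_features):
    # Features that must still be satisfied before this node can run.
    return [f for f in node_inputs if f not in provided_features]

# MPNet node: needs 'document'; raw text alone does not provide it,
# so a document assembler gets inserted first.
print(missing_inputs(['document'], {'raw_text'}))  # -> ['document']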
@@ -0,0 +1,22 @@
import unittest

from nlu import *


class TestMPNetSentenceEmbeddings(unittest.TestCase):
    def test_mpnet_embeds(self):
        # Load the default MPNet sentence-embedding model and embed one document.
        res = nlu.load('en.embed_sentence.mpnet.all_mpnet_base_v2').predict(
            'This is an example sentence', output_level='document')
        for c in res:
            print(res[c])

        # Load a second pretrained MPNet variant to verify reference resolution.
        res = nlu.load('en.embed_sentence.mpnet.all_mpnet_base_questions_clustering_english').predict(
            'Each sentence is converted', output_level='document')
        for c in res:
            print(res[c])


if __name__ == "__main__":
    unittest.main()
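When this test runs, each predict() call returns a pandas DataFrame (NLU's default output), so the loops print the document text alongside the new embedding column; the exact column names follow the substitution rules registered in substitution_map_OS.py above.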