From 3eabb1a5e87a24a9976c3826ca82b8090af26fc2 Mon Sep 17 00:00:00 2001 From: liheng Date: Tue, 27 Aug 2019 15:01:43 +0800 Subject: [PATCH] added entities detection for a list --- flashtext/keyword.py | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/flashtext/keyword.py b/flashtext/keyword.py index f358c77..c78a3b4 100644 --- a/flashtext/keyword.py +++ b/flashtext/keyword.py @@ -121,7 +121,7 @@ def __getitem__(self, word): if self._keyword in current_dict and len_covered == len(word): return current_dict[self._keyword] - def __setitem__(self, keyword, clean_name=None): + def __setitem__(self, keyword, clean_name=None, multi_match=False, ): """To add keyword to the dictionary pass the keyword and the clean name it maps to. @@ -149,7 +149,14 @@ def __setitem__(self, keyword, clean_name=None): if self._keyword not in current_dict: status = True self._terms_in_trie += 1 - current_dict[self._keyword] = clean_name + + if current_dict.get(self._keyword) and multi_match: + # if multi_match and is str, convert to list and append, else append directly + if isinstance(current_dict[self._keyword], str): + current_dict[self._keyword] = [current_dict[self._keyword]] + current_dict[self._keyword].append(clean_name) + else: + current_dict[self._keyword] = clean_name return status def __delitem__(self, keyword): @@ -222,7 +229,7 @@ def add_non_word_boundary(self, character): """ self.non_word_boundaries.add(character) - def add_keyword(self, keyword, clean_name=None): + def add_keyword(self, keyword, multi_match = False, clean_name=None): """To add one or more keywords to the dictionary pass the keyword and the clean name it maps to. @@ -245,7 +252,7 @@ def add_keyword(self, keyword, clean_name=None): >>> keyword_processor.add_keyword('Big Apple') >>> # This case 'Big Apple' will return 'Big Apple' """ - return self.__setitem__(keyword, clean_name) + return self.__setitem__(keyword, clean_name=clean_name, multi_match=multi_match) def remove_keyword(self, keyword): """To remove one or more keywords from the dictionary @@ -321,12 +328,12 @@ def add_keyword_from_file(self, keyword_file, encoding="utf-8"): for line in f: if '=>' in line: keyword, clean_name = line.split('=>') - self.add_keyword(keyword, clean_name.strip()) + self.add_keyword(keyword, multi_match=False, clean_name=clean_name.strip()) else: keyword = line.strip() self.add_keyword(keyword) - def add_keywords_from_dict(self, keyword_dict): + def add_keywords_from_dict(self, keyword_dict, multi_match=False): """To add keywords from a dictionary Args: @@ -348,7 +355,7 @@ def add_keywords_from_dict(self, keyword_dict): raise AttributeError("Value of key {} should be a list".format(clean_name)) for keyword in keywords: - self.add_keyword(keyword, clean_name) + self.add_keyword(keyword, multi_match=multi_match, clean_name=clean_name) def remove_keywords_from_dict(self, keyword_dict): """To remove keywords from a dictionary @@ -580,7 +587,7 @@ def replace_keywords(self, sentence): if not sentence: # if sentence is empty or none just return the same. return sentence - new_sentence = [] + new_sentence = '' orig_sentence = sentence if not self.case_sensitive: sentence = sentence.lower() @@ -639,17 +646,17 @@ def replace_keywords(self, sentence): current_word = current_word_continued current_dict = self.keyword_trie_dict if longest_sequence_found: - new_sentence.append(longest_sequence_found + current_white_space) + new_sentence += longest_sequence_found + current_white_space current_word = '' current_white_space = '' else: - new_sentence.append(current_word) + new_sentence += current_word current_word = '' current_white_space = '' else: # we reset current_dict current_dict = self.keyword_trie_dict - new_sentence.append(current_word) + new_sentence += current_word current_word = '' current_white_space = '' elif char in current_dict: @@ -667,15 +674,15 @@ def replace_keywords(self, sentence): break idy += 1 idx = idy - new_sentence.append(current_word) + new_sentence += current_word current_word = '' current_white_space = '' # if we are end of sentence and have a sequence discovered if idx + 1 >= sentence_len: if self._keyword in current_dict: sequence_found = current_dict[self._keyword] - new_sentence.append(sequence_found) + new_sentence += sequence_found else: - new_sentence.append(current_word) + new_sentence += current_word idx += 1 - return "".join(new_sentence) + return new_sentence