Skip to content

Commit

Permalink
forked from flashtext to allow returning multiple matching keywords
Browse files Browse the repository at this point in the history
  • Loading branch information
brlala committed Aug 27, 2019
1 parent 3eabb1a commit fe876af
Showing 1 changed file with 76 additions and 75 deletions.
151 changes: 76 additions & 75 deletions flashtext/keyword.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,18 +17,18 @@ class KeywordProcessor(object):
Defaults to False
Examples:
>>> # import module
>>> from flashtext import KeywordProcessor
>>> # Create an object of KeywordProcessor
>>> keyword_processor = KeywordProcessor()
>>> # add keywords
>>> keyword_names = ['NY', 'new-york', 'SF']
>>> clean_names = ['new york', 'new york', 'san francisco']
>>> for keyword_name, clean_name in zip(keyword_names, clean_names):
>>> keyword_processor.add_keyword(keyword_name, clean_name)
>>> keywords_found = keyword_processor.extract_keywords('I love SF and NY. new-york is the best.')
>>> keywords_found
>>> ['san francisco', 'new york', 'new york']
# >>> # import module
# >>> from flashtext import KeywordProcessor
# >>> # Create an object of KeywordProcessor
# >>> keyword_processor = KeywordProcessor()
# >>> # add keywords
# >>> keyword_names = ['NY', 'new-york', 'SF']
# >>> clean_names = ['new york', 'new york', 'san francisco']
# >>> for keyword_name, clean_name in zip(keyword_names, clean_names):
# >>> keyword_processor.add_keyword(keyword_name, clean_name)
# >>> keywords_found = keyword_processor.extract_keywords('I love SF and NY. new-york is the best.')
# >>> keywords_found
# >>> ['san francisco', 'new york', 'new york']
Note:
* loosely based on `Aho-Corasick algorithm <https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm>`_.
Expand All @@ -42,7 +42,7 @@ def __init__(self, case_sensitive=False):
Defaults to False
"""
self._keyword = '_keyword_'
self._white_space_chars = set(['.', '\t', '\n', '\a', ' ', ','])
self._white_space_chars = {'.', '\t', '\n', '\a', ' ', ','}
try:
# python 2.x
self.non_word_boundaries = set(string.digits + string.letters + '_')
Expand Down Expand Up @@ -75,9 +75,9 @@ def __contains__(self, word):
If word is present as it is in keyword_trie_dict then we return True, else False
Examples:
>>> keyword_processor.add_keyword('Big Apple')
>>> 'Big Apple' in keyword_processor
>>> # True
# >>> keyword_processor.add_keyword('Big Apple')
# >>> 'Big Apple' in keyword_processor
# >>> # True
"""
if not self.case_sensitive:
Expand All @@ -104,9 +104,9 @@ def __getitem__(self, word):
If word is present as it is in keyword_trie_dict then we return keyword mapped to it.
Examples:
>>> keyword_processor.add_keyword('Big Apple', 'New York')
>>> keyword_processor['Big Apple']
>>> # New York
# >>> keyword_processor.add_keyword('Big Apple', 'New York')
# >>> keyword_processor['Big Apple']
# >>> # New York
"""
if not self.case_sensitive:
word = word.lower()
Expand Down Expand Up @@ -134,7 +134,7 @@ def __setitem__(self, keyword, clean_name=None, multi_match=False, ):
if not provided, keyword will be used as the clean name also.
Examples:
>>> keyword_processor['Big Apple'] = 'New York'
# >>> keyword_processor['Big Apple'] = 'New York'
"""
status = False
if not clean_name and keyword:
Expand Down Expand Up @@ -168,8 +168,8 @@ def __delitem__(self, keyword):
keyword that you want to remove if it's present
Examples:
>>> keyword_processor.add_keyword('Big Apple')
>>> del keyword_processor['Big Apple']
# >>> keyword_processor.add_keyword('Big Apple')
# >>> del keyword_processor['Big Apple']
"""
status = False
if keyword:
Expand Down Expand Up @@ -229,7 +229,7 @@ def add_non_word_boundary(self, character):
"""
self.non_word_boundaries.add(character)

def add_keyword(self, keyword, multi_match = False, clean_name=None):
def add_keyword(self, keyword, multi_match=False, clean_name=None):
"""To add one or more keywords to the dictionary
pass the keyword and the clean name it maps to.
Expand All @@ -246,11 +246,11 @@ def add_keyword(self, keyword, multi_match = False, clean_name=None):
The return value. True for success, False otherwise.
Examples:
>>> keyword_processor.add_keyword('Big Apple', clean_name='New York')
>>> # This case 'Big Apple' will return 'New York'
>>> # OR
>>> keyword_processor.add_keyword('Big Apple')
>>> # This case 'Big Apple' will return 'Big Apple'
# >>> keyword_processor.add_keyword('Big Apple', clean_name='New York')
# >>> # This case 'Big Apple' will return 'New York'
# >>> # OR
# >>> keyword_processor.add_keyword('Big Apple')
# >>> # This case 'Big Apple' will return 'Big Apple'
"""
return self.__setitem__(keyword, clean_name=clean_name, multi_match=multi_match)

Expand All @@ -267,12 +267,12 @@ def remove_keyword(self, keyword):
The return value. True for success, False otherwise.
Examples:
>>> keyword_processor.add_keyword('Big Apple')
>>> keyword_processor.remove_keyword('Big Apple')
>>> # Returns True
>>> # This case 'Big Apple' will no longer be a recognized keyword
>>> keyword_processor.remove_keyword('Big Apple')
>>> # Returns False
# >>> keyword_processor.add_keyword('Big Apple')
# >>> keyword_processor.remove_keyword('Big Apple')
# >>> # Returns True
# >>> # This case 'Big Apple' will no longer be a recognized keyword
# >>> keyword_processor.remove_keyword('Big Apple')
# >>> # Returns False
"""
return self.__delitem__(keyword)
Expand All @@ -289,9 +289,9 @@ def get_keyword(self, word):
If word is present as it is in keyword_trie_dict then we return keyword mapped to it.
Examples:
>>> keyword_processor.add_keyword('Big Apple', clean_name='New York')
>>> keyword_processor.get_keyword('Big Apple')
>>> # New York
# >>> keyword_processor.add_keyword('Big Apple', clean_name='New York')
# >>> keyword_processor.get_keyword('Big Apple')
# >>> # New York
"""
return self.__getitem__(word)

Expand All @@ -305,18 +305,18 @@ def add_keyword_from_file(self, keyword_file, encoding="utf-8"):
Examples:
keywords file format can be like:
>>> # Option 1: keywords.txt content
>>> # java_2e=>java
>>> # java programing=>java
>>> # product management=>product management
>>> # product management techniques=>product management
>>> # Option 2: keywords.txt content
>>> # java
>>> # python
>>> # c++
>>> keyword_processor.add_keyword_from_file('keywords.txt')
# >>> # Option 1: keywords.txt content
# >>> # java_2e=>java
# >>> # java programing=>java
# >>> # product management=>product management
# >>> # product management techniques=>product management
#
# >>> # Option 2: keywords.txt content
# >>> # java
# >>> # python
# >>> # c++
#
# >>> keyword_processor.add_keyword_from_file('keywords.txt')
Raises:
IOError: If `keyword_file` path is not valid
Expand All @@ -340,14 +340,15 @@ def add_keywords_from_dict(self, keyword_dict, multi_match=False):
keyword_dict (dict): A dictionary with `str` key and (list `str`) as value
Examples:
>>> keyword_dict = {
# >>> keyword_dict = {
"java": ["java_2e", "java programing"],
"product management": ["PM", "product manager"]
}
>>> keyword_processor.add_keywords_from_dict(keyword_dict)
# >>> keyword_processor.add_keywords_from_dict(keyword_dict)
Raises:
AttributeError: If value for a key in `keyword_dict` is not a list.
:param multi_match: Will return a list of matching keywords instead of replacing it
"""
for clean_name, keywords in keyword_dict.items():
Expand All @@ -364,11 +365,11 @@ def remove_keywords_from_dict(self, keyword_dict):
keyword_dict (dict): A dictionary with `str` key and (list `str`) as value
Examples:
>>> keyword_dict = {
# >>> keyword_dict = {
"java": ["java_2e", "java programing"],
"product management": ["PM", "product manager"]
}
>>> keyword_processor.remove_keywords_from_dict(keyword_dict)
# >>> keyword_processor.remove_keywords_from_dict(keyword_dict)
Raises:
AttributeError: If value for a key in `keyword_dict` is not a list.
Expand All @@ -388,7 +389,7 @@ def add_keywords_from_list(self, keyword_list):
keyword_list (list(str)): List of keywords to add
Examples:
>>> keyword_processor.add_keywords_from_list(["java", "python"])
# >>> keyword_processor.add_keywords_from_list(["java", "python"])
Raises:
AttributeError: If `keyword_list` is not a list.
Expand All @@ -406,7 +407,7 @@ def remove_keywords_from_list(self, keyword_list):
keyword_list (list(str)): List of keywords to remove
Examples:
>>> keyword_processor.remove_keywords_from_list(["java", "python"])
# >>> keyword_processor.remove_keywords_from_list(["java", "python"])
Raises:
AttributeError: If `keyword_list` is not a list.
Expand All @@ -433,12 +434,12 @@ def get_all_keywords(self, term_so_far='', current_dict=None):
And value mapped to it is the clean name mapped to it.
Examples:
>>> keyword_processor = KeywordProcessor()
>>> keyword_processor.add_keyword('j2ee', 'Java')
>>> keyword_processor.add_keyword('Python', 'Python')
>>> keyword_processor.get_all_keywords()
>>> {'j2ee': 'Java', 'python': 'Python'}
>>> # NOTE: for case_insensitive all keys will be lowercased.
# >>> keyword_processor = KeywordProcessor()
# >>> keyword_processor.add_keyword('j2ee', 'Java')
# >>> keyword_processor.add_keyword('Python', 'Python')
# >>> keyword_processor.get_all_keywords()
# >>> {'j2ee': 'Java', 'python': 'Python'}
# >>> # NOTE: for case_insensitive all keys will be lowercased.
"""
terms_present = {}
if not term_so_far:
Expand All @@ -465,13 +466,13 @@ def extract_keywords(self, sentence, span_info=False):
keywords_extracted (list(str)): List of terms/keywords found in sentence that match our corpus
Examples:
>>> from flashtext import KeywordProcessor
>>> keyword_processor = KeywordProcessor()
>>> keyword_processor.add_keyword('Big Apple', 'New York')
>>> keyword_processor.add_keyword('Bay Area')
>>> keywords_found = keyword_processor.extract_keywords('I love Big Apple and Bay Area.')
>>> keywords_found
>>> ['New York', 'Bay Area']
# >>> from flashtext import KeywordProcessor
# >>> keyword_processor = KeywordProcessor()
# >>> keyword_processor.add_keyword('Big Apple', 'New York')
# >>> keyword_processor.add_keyword('Bay Area')
# >>> keywords_found = keyword_processor.extract_keywords('I love Big Apple and Bay Area.')
# >>> keywords_found
# >>> ['New York', 'Bay Area']
"""
keywords_extracted = []
Expand Down Expand Up @@ -575,13 +576,13 @@ def replace_keywords(self, sentence):
new_sentence (str): Line of text with replaced keywords
Examples:
>>> from flashtext import KeywordProcessor
>>> keyword_processor = KeywordProcessor()
>>> keyword_processor.add_keyword('Big Apple', 'New York')
>>> keyword_processor.add_keyword('Bay Area')
>>> new_sentence = keyword_processor.replace_keywords('I love Big Apple and bay area.')
>>> new_sentence
>>> 'I love New York and Bay Area.'
# >>> from flashtext import KeywordProcessor
# >>> keyword_processor = KeywordProcessor()
# >>> keyword_processor.add_keyword('Big Apple', 'New York')
# >>> keyword_processor.add_keyword('Bay Area')
# >>> new_sentence = keyword_processor.replace_keywords('I love Big Apple and bay area.')
# >>> new_sentence
# >>> 'I love New York and Bay Area.'
"""
if not sentence:
Expand Down

0 comments on commit fe876af

Please sign in to comment.