diff --git a/GetOldTweets3/manager/TweetCriteria.py b/GetOldTweets3/manager/TweetCriteria.py index f5955f2e..0ce75417 100644 --- a/GetOldTweets3/manager/TweetCriteria.py +++ b/GetOldTweets3/manager/TweetCriteria.py @@ -26,6 +26,17 @@ def setUsername(self, username): self.username = username return self + def setExcludeWords(self, excludeWords): + """Set word(s) to exclude from tweets + Parameters + ---------- + excludeWords : list or iterable + + Example: ["red", "blue", "yellow", "green"] + """ + self.excludeWords = excludeWords + return self + def setSince(self, since): """Set a lower bound date in UTC Parameters @@ -46,6 +57,36 @@ def setUntil(self, until): self.until = until return self + def setMinReplies(self, minReplies): + """Set the minimum number of replies of tweets to search + Parameters + ---------- + minReplies : str, + for example: 42 + """ + self.minReplies = minReplies + return self + + def setMinFaves(self, minFaves): + """Set the minimum number of favorites of tweets to search + Parameters + ---------- + minFaves : str, + for example: 42 + """ + self.minFaves = minFaves + return self + + def setMinRetweets(self, minRetweets): + """Set the minimum number of retweets of tweets to search + Parameters + ---------- + minRetweets : str, + for example: 42 + """ + self.minRetweets = minRetweets + return self + def setNear(self, near): """Set location to search nearby Parameters diff --git a/GetOldTweets3/manager/TweetManager.py b/GetOldTweets3/manager/TweetManager.py index ac0d7013..056c6c2a 100644 --- a/GetOldTweets3/manager/TweetManager.py +++ b/GetOldTweets3/manager/TweetManager.py @@ -289,6 +289,9 @@ def getJsonResponse(tweetCriteria, refreshCursor, cookieJar, proxy, useragent=No if hasattr(tweetCriteria, 'querySearch'): urlGetData += tweetCriteria.querySearch + if hasattr(tweetCriteria, 'excludeWords'): + urlGetData += ' -'.join([''] + tweetCriteria.excludeWords) + if hasattr(tweetCriteria, 'username'): if not hasattr(tweetCriteria.username, 
'__iter__'): tweetCriteria.username = [tweetCriteria.username] @@ -312,6 +315,15 @@ def getJsonResponse(tweetCriteria, refreshCursor, cookieJar, proxy, useragent=No if hasattr(tweetCriteria, 'until'): urlGetData += ' until:' + tweetCriteria.until + if hasattr(tweetCriteria, 'minReplies'): + urlGetData += ' min_replies:' + tweetCriteria.minReplies + + if hasattr(tweetCriteria, 'minFaves'): + urlGetData += ' min_faves:' + tweetCriteria.minFaves + + if hasattr(tweetCriteria, 'minRetweets'): + urlGetData += ' min_retweets:' + tweetCriteria.minRetweets + if hasattr(tweetCriteria, 'lang'): urlLang = 'l=' + tweetCriteria.lang + '&' else: diff --git a/README.md b/README.md index d19308e3..7f897ea9 100644 --- a/README.md +++ b/README.md @@ -61,6 +61,28 @@ GetOldTweets3 --querysearch "bitcoin" --near "Berlin, Germany" --within 25km --m GetOldTweets3 --querysearch "museum" --near "55.75, 37.61" --within 40km --maxtweets 10 ``` +**Example 8 - Get tweets by minimum number of replies:** +```bash +GetOldTweets3 --querysearch "bitcoin" --minreplies 10 --maxtweets 10 +``` + +**Example 9 - Get tweets by minimum number of favorites:** +```bash +GetOldTweets3 --querysearch "bitcoin" --minfaves 10 --maxtweets 10 +``` + +**Example 10 - Get tweets by minimum number of retweets:** +```bash +GetOldTweets3 --querysearch "bitcoin" --minretweets 10 --maxtweets 10 +``` + +**Example 11 - Get tweets by excluding tweets with any word of a list:** +```bash +GetOldTweets3 --querysearch "bitcoin" --exclude-words-from-file excludewords.txt --maxtweets 10 +``` + +where words to exclude are separated by a whitespace character in the `excludewords.txt` file. + ## Python classes - **Tweet:** Model class that describes a specific tweet. 
- id (str) diff --git a/bin/GetOldTweets3 b/bin/GetOldTweets3 index 69a8bf22..45e29800 100755 --- a/bin/GetOldTweets3 +++ b/bin/GetOldTweets3 @@ -1,22 +1,26 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """To use this script you can pass the following attributes: - --querysearch: a query text to be matched - --username: a username or a list of usernames (comma or space separated) - of a specific twitter account(s) (with or without @) ---username-from-file: a file with a list of usernames, - --since: a lower bound date in UTC (yyyy-mm-dd) - --until: an upper bound date in UTC (yyyy-mm-dd) (not included) - --near: a reference location area from where tweets were generated - --within: a distance radius from "near" location (e.g. 15mi) - --toptweets: only the tweets provided as top tweets by Twitter (no parameters required) - --maxtweets: the maximum number of tweets to retrieve - --lang: the language of tweets - --emoji: "ignore" (the default, discards emojis), "unicode" - or "named" (replaces with "Emoji[Name of emoji]") - --output: a filename to export the results (default is "output_got.csv"). - Pass a '.gz' suffix to have it gzipped. - --debug: outputs debug information to standard error stream + --querysearch: a query text to be matched +--exclude-words-from-file: a file with a list of words to exclude + --username: a username or a list of usernames (comma or space separated) + of a specific twitter account(s) (with or without @) + --usernames-from-file: a file with a list of usernames + --since: a lower bound date in UTC (yyyy-mm-dd) + --until: an upper bound date in UTC (yyyy-mm-dd) (not included) + --minreplies: a minimum number of replies for a tweet + --minfaves: a minimum number of favorites for a tweet + --minretweets: a minimum number of retweets for a tweet + --near: a reference location area from where tweets were generated + --within: a distance radius from "near" location (e.g. 
15mi) + --toptweets: only the tweets provided as top tweets by Twitter (no parameters required) + --maxtweets: the maximum number of tweets to retrieve + --lang: the language of tweets + --emoji: "ignore" (the default, discards emojis), "unicode" + or "named" (replaces with "Emoji[Name of emoji]") + --output: a filename to export the results (default is "output_got.csv") + Pass a '.gz' suffix to have it gzipped. + --debug: outputs debug information to standard error stream Examples: @@ -39,7 +43,19 @@ GetOldTweets3 --querysearch "bitcoin" --lang cn --maxtweets 10 GetOldTweets3 --querysearch "bitcoin" --near "Berlin, Germany" --within 25km --maxtweets 10 # Example 7 - Get tweets by geo coordinates: -GetOldTweets3 --querysearch "museum" --near "55.75, 37.61" --within 40km --maxtweets 10 +GetOldTweets3 --querysearch "museum" --near "55.75, 37.61" --within 40km --maxtweets 10 + +# Example 8 - Get tweets by minimum number of replies: +GetOldTweets3 --querysearch "bitcoin" --minreplies 10 --maxtweets 10 + +# Example 9 - Get tweets by minimum number of favorites: +GetOldTweets3 --querysearch "bitcoin" --minfaves 10 --maxtweets 10 + +# Example 10 - Get tweets by minimum number of retweets: +GetOldTweets3 --querysearch "bitcoin" --minretweets 10 --maxtweets 10 + +# Example 11 - Get tweets by excluding tweets with any word of a list: +GetOldTweets3 --querysearch "bitcoin" --exclude-words-from-file excludewords.txt --maxtweets 10 """ from functools import partial @@ -64,10 +80,14 @@ def main(argv): try: opts, args = getopt.getopt(argv, "", ("querysearch=", + "exclude-words-from-file=", "username=", "usernames-from-file=", "since=", "until=", + "minreplies=", + "minfaves=", + "minretweets=", "near=", "within=", "toptweets", @@ -83,6 +103,7 @@ def main(argv): debug = False usernames = set() username_files = set() + exclude_words_files = set() for opt, arg in opts: if opt == '--querysearch': tweetCriteria.querySearch = arg @@ -95,12 +116,24 @@ def main(argv): elif opt == 
'--usernames-from-file': username_files.add(arg) + elif opt == '--exclude-words-from-file': + exclude_words_files.add(arg) + elif opt == '--since': tweetCriteria.since = arg elif opt == '--until': tweetCriteria.until = arg + elif opt == '--minreplies': + tweetCriteria.minReplies = arg + + elif opt == '--minfaves': + tweetCriteria.minFaves = arg + + elif opt == '--minretweets': + tweetCriteria.minRetweets = arg + elif opt == '--near': geocode = arg.split(',') try: @@ -169,6 +202,10 @@ def main(argv): usernames |= set(usernames_) print("Found %i usernames in %s" % (len(usernames_), uf)) + if exclude_words_files: + exclude_words = sum([open(ewf).read().split() for ewf in exclude_words_files], []) + tweetCriteria.excludeWords = exclude_words + if usernames: if len(usernames) > 1: tweetCriteria.username = usernames