diff --git a/Dockerfile b/Dockerfile
index e7195a9..5df41af 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,38 @@
 FROM python:3.7-alpine
+
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends wget libgtk-3-dev libdbus-glib-1-2 \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
+
+ARG FIREFOX_VERSION=latest
+RUN FIREFOX_DOWNLOAD_URL=$(if [ $FIREFOX_VERSION = "latest" ] || [ $FIREFOX_VERSION = "nightly-latest" ] || [ $FIREFOX_VERSION = "devedition-latest" ] || [ $FIREFOX_VERSION = "esr-latest" ]; then echo "https://download.mozilla.org/?product=firefox-$FIREFOX_VERSION-ssl&os=linux64&lang=en-US"; else echo "https://download-installer.cdn.mozilla.net/pub/firefox/releases/$FIREFOX_VERSION/linux-x86_64/en-US/firefox-$FIREFOX_VERSION.tar.bz2"; fi) \
+    && apt-get update -qqy \
+    && apt-get -qqy --no-install-recommends install libavcodec-extra \
+    && rm -rf /var/lib/apt/lists/* /var/cache/apt/* \
+    && wget --no-verbose -O /tmp/firefox.tar.bz2 $FIREFOX_DOWNLOAD_URL \
+    && tar -C /opt -xjf /tmp/firefox.tar.bz2 \
+    && rm /tmp/firefox.tar.bz2 \
+    && mv /opt/firefox /opt/firefox-$FIREFOX_VERSION \
+    && ln -fs /opt/firefox-$FIREFOX_VERSION/firefox /usr/bin/firefox
+
+#============
+# GeckoDriver
+#============
+ARG GECKODRIVER_VERSION=latest
+RUN GK_VERSION=$(if [ ${GECKODRIVER_VERSION:-latest} = "latest" ]; then echo "0.27.0"; else echo $GECKODRIVER_VERSION; fi) \
+    && echo "Using GeckoDriver version: "$GK_VERSION \
+    && wget --no-verbose -O /tmp/geckodriver.tar.gz https://github.com/mozilla/geckodriver/releases/download/v$GK_VERSION/geckodriver-v$GK_VERSION-linux64.tar.gz \
+    && rm -rf /opt/geckodriver \
+    && tar -C /opt -zxf /tmp/geckodriver.tar.gz \
+    && rm /tmp/geckodriver.tar.gz \
+    && mv /opt/geckodriver /opt/geckodriver-$GK_VERSION \
+    && cp /opt/geckodriver-$GK_VERSION /bin \
+    && chmod 755 /opt/geckodriver-$GK_VERSION \
+    && ln -fs /opt/geckodriver-$GK_VERSION /usr/bin/geckodriver \
+    && ln -fs /opt/geckodriver-$GK_VERSION /usr/bin/wires
+# twitterscraper
 RUN apk add --update --no-cache g++ gcc libxslt-dev
 COPY . /app
 WORKDIR /app
diff --git a/requirements.txt b/requirements.txt
index ec44956..31a1ff7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,3 +3,4 @@ bs4
 lxml
 requests
 billiard
+selenium-wire
diff --git a/twitterscraper/__init__.py b/twitterscraper/__init__.py
index f0dfc0a..450e520 100644
--- a/twitterscraper/__init__.py
+++ b/twitterscraper/__init__.py
@@ -10,8 +10,6 @@
 __license__ = 'MIT'
 
-from twitterscraper.query import query_tweets
-from twitterscraper.query import query_tweets_from_user
-from twitterscraper.query import query_user_info
-from twitterscraper.tweet import Tweet
+from twitterscraper.query import query_tweets, query_tweets_from_user, query_user_info
+from twitterscraper.query_js import get_user_data, get_query_data
 
 from twitterscraper.user import User
diff --git a/twitterscraper/browser.py b/twitterscraper/browser.py
new file mode 100644
index 0000000..064debf
--- /dev/null
+++ b/twitterscraper/browser.py
@@ -0,0 +1,112 @@
+import requests
+from functools import lru_cache
+from itertools import cycle
+from bs4 import BeautifulSoup
+from threading import Thread
+from random import shuffle
+
+from seleniumwire import webdriver
+from selenium.webdriver.firefox.options import Options
+
+import logging
+logger = logging.getLogger('twitterscraper')
+
+
+PROXY_URL = 'https://free-proxy-list.net/'
+NYT_LOGO_URL = 'https://pbs.twimg.com/profile_images/1098244578472280064/gjkVMelR_normal.png'
+
+
+def get_proxy_delay(proxy, result, max_time=10):
+    try:
+        response = requests.post(
+            NYT_LOGO_URL,
+            proxies={'https': f'https://{proxy}/'},
+            timeout=max_time
+        )
+
+    except Exception:
+        result[proxy] = None
+    else:
+        result[proxy] = response.elapsed.total_seconds()
+
+
+@lru_cache(1)
+def get_best_proxies(proxies):
+    logger.info('Pinging twitter to find best proxies')
+    threads = []
+    result = {}
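+    # ping every proxy in its own thread; get_proxy_delay() records each proxy's
+    # response time in `result`, or None if it failed to answer within max_time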
+    for proxy in proxies:
+        process = Thread(target=get_proxy_delay, args=[proxy, result])
+        process.start()
+        threads.append(process)
+    for process in threads:
+        process.join()
+
+    assert len(set(result.values())) > 1  # ensure at least one proxy responded within max_time
+
+    result = {k: v for k, v in result.items() if v}
+    best_proxies = [x[0] for x in sorted(result.items(), key=lambda x: x[1])]
+    return best_proxies[:int(len(best_proxies)**0.5)]  # best sqrt(N) of N working proxies
+
+
+@lru_cache(1)
+def get_proxies():
+    response = requests.get(PROXY_URL)
+    soup = BeautifulSoup(response.text, 'lxml')
+    table = soup.find('table', id='proxylisttable')
+    list_tr = table.find_all('tr')
+    list_td = [elem.find_all('td') for elem in list_tr]
+    list_td = list(filter(None, list_td))
+    list_ip = [elem[0].text for elem in list_td]
+    list_ports = [elem[1].text for elem in list_td]
+    list_proxies = [':'.join(elem) for elem in list(zip(list_ip, list_ports))]
+    return list_proxies
+
+
+def get_proxy_pool():
+    # TODO: cache this on disk so reruns aren't required
+    best_proxies = get_best_proxies(
+        tuple(get_proxies())
+    )
+    shuffle(best_proxies)
+    return cycle(best_proxies)
+
+
+@lru_cache(1)
+def get_ublock():
+    # TODO: download ublock here
+    pass
+
+
+def get_driver(proxy=None, timeout=30):
+    profile = webdriver.FirefoxProfile()
+    profile.set_preference("http.response.timeout", 5)
+
+    seleniumwire_options = {'verify_ssl': False}
+    if proxy:
+        seleniumwire_options['suppress_connection_errors'] = False
+        seleniumwire_options['proxy'] = {
+            'https': f'https://{proxy}',
+            'http': f'http://{proxy}',
+        }
+
+    opt = Options()
+    opt.headless = True
+
+    driver = webdriver.Firefox(
+        firefox_profile=profile,
+        options=opt,
+        seleniumwire_options=seleniumwire_options
+    )
+
+    """
+    TODO: install ublock here
+    get_ublock()
+    extensions.ublock0.adminSettings = best settings for twitter here
+    browser.install_addon(extension_dir + extension, temporary=True)
+    """
+
+    driver.set_page_load_timeout(timeout)
+
+    return driver
diff --git a/twitterscraper/main.py b/twitterscraper/main.py
index 24f93ef..db7ffbe 100644
--- a/twitterscraper/main.py
+++ b/twitterscraper/main.py
@@ -10,6 +10,9 @@
 from os.path import isfile
 from pprint import pprint
 
+from twitterscraper import query_js, query
+
+
 from twitterscraper.query import (query_tweets,
                                   query_tweets_from_user,
                                   query_user_info)
@@ -65,12 +68,14 @@ def main():
                              "This may take a while. You can increase the number of parallel"
                              "processes depending on the computational power you have.")
     parser.add_argument("-c", "--csv", action='store_true',
-                        help="Set this flag if you want to save the results to a CSV format.")
+                        help="Set this flag if you want to save the results to a CSV format.")
+    parser.add_argument("-j", "--javascript", action='store_true',
+                        help="Set this flag if you want to scrape tweets with a JavaScript-enabled browser via Selenium.")
     parser.add_argument("-u", "--user", action='store_true',
                         help="Set this flag to if you want to scrape tweets from a specific user"
                              "The query should then consist of the profilename you want to scrape without @")
     parser.add_argument("--profiles", action='store_true',
-                        help="Set this flag to if you want to scrape profile info of all the users where you"
+                        help="Set this flag if you want to scrape profile info of all the users where you"
                             "have previously scraped from. After all of the tweets have been scraped it will start"
                             "a new process of scraping profile pages.")
     parser.add_argument("--lang", type=str, default=None,
@@ -113,14 +118,33 @@ def main():
             exit(-1)
 
         if args.all:
-            args.begindate = dt.date(2006,3,1)
+            args.begindate = dt.date(2006, 3, 1)
 
         if args.user:
-            tweets = query_tweets_from_user(user = args.query, limit = args.limit, use_proxy = not args.disableproxy)
+            if args.javascript:
+                tweets = query_js.get_user_data(
+                    from_user=args.query, limit=args.limit,
+                    begindate=args.begindate, enddate=args.enddate,
+                    poolsize=args.poolsize, lang=args.lang, use_proxy=not args.disableproxy
+                )
+            else:
+                tweets = query.query_tweets_from_user(user=args.query, limit=args.limit, use_proxy=not args.disableproxy)
+
         else:
-            tweets = query_tweets(query = args.query, limit = args.limit,
-                                  begindate = args.begindate, enddate = args.enddate,
-                                  poolsize = args.poolsize, lang = args.lang, use_proxy = not args.disableproxy)
+            if args.javascript:
+                tweets = query_js.get_query_data(
+                    query=args.query, limit=args.limit,
+                    begindate=args.begindate, enddate=args.enddate,
+                    poolsize=args.poolsize, lang=args.lang,
+                    use_proxy=not args.disableproxy
+                )
+            else:
+                tweets = query.query_tweets(
+                    query=args.query, limit=args.limit,
+                    begindate=args.begindate, enddate=args.enddate,
+                    poolsize=args.poolsize, lang=args.lang,
+                    use_proxy=not args.disableproxy
+                )
 
         if args.dump:
             pprint([tweet.__dict__ for tweet in tweets])
@@ -151,8 +175,11 @@
                 json.dump(tweets, output, cls=JSONEncoder)
             if args.profiles and tweets:
                 list_users = list(set([tweet.username for tweet in tweets]))
-                list_users_info = [query_user_info(elem, not args.disableproxy) for elem in list_users]
-                filename = 'userprofiles_' + args.output
+
+                # Note: this has no query_js equivalent!
+                list_users_info = [query.query_user_info(elem, not args.disableproxy) for elem in list_users]
+
+                filename = 'userprofiles_' + args.output
                 with open(filename, "w", encoding="utf-8") as output:
                     json.dump(list_users_info, output, cls=JSONEncoder)
     except KeyboardInterrupt:
diff --git a/twitterscraper/query.py b/twitterscraper/query.py
index 1002b9c..9d1309d 100644
--- a/twitterscraper/query.py
+++ b/twitterscraper/query.py
@@ -52,8 +52,8 @@ def get_proxies():
     list_ip = [elem[0].text for elem in list_td]
     list_ports = [elem[1].text for elem in list_td]
     list_proxies = [':'.join(elem) for elem in list(zip(list_ip, list_ports))]
-    return list_proxies
-
+    return list_proxies
+
 def get_query_url(query, lang, pos, from_user = False):
     if from_user:
         if pos is None:
@@ -116,7 +116,7 @@ def query_single_page(query, lang, pos, retry=50, from_user=False, timeout=60, u
                 pos = json_resp['min_position']
                 has_more_items = json_resp['has_more_items']
                 if not has_more_items:
-                    logger.info("Twitter returned : 'has_more_items' ")
+                    logger.info("Twitter response: 'has_more_items' == False")
                     return [], None
             else:
                 pos = None
@@ -217,10 +217,10 @@ def query_tweets_once(*args, **kwargs):
 
 def query_tweets(query, limit=None, begindate=dt.date(2006, 3, 21), enddate=dt.date.today(), poolsize=20, lang='', use_proxy=True):
     no_days = (enddate - begindate).days
-
+
     if(no_days < 0):
         sys.exit('Begin date must occur before end date.')
-
+
     if poolsize > no_days:
         # Since we are assigning each pool a range of dates to query,
         # the number of pools should not exceed the number of dates.
@@ -329,8 +329,6 @@ def query_user_info(user, use_proxy=True):
 
     :param user: the twitter user to web scrape its twitter page info
     """
-
-
     try:
         user_info = query_user_page(INIT_URL_USER.format(u=user), use_proxy=use_proxy)
         if user_info:
@@ -343,4 +341,4 @@ def query_user_info(user, use_proxy=True):
         logger.exception("An unknown error occurred! Returning user information gathered so far...")
 
     logger.info("Got user information from username {}".format(user))
-    return user_info
+    return None
diff --git a/twitterscraper/query_js.py b/twitterscraper/query_js.py
new file mode 100644
index 0000000..c37d699
--- /dev/null
+++ b/twitterscraper/query_js.py
@@ -0,0 +1,245 @@
+from collections import defaultdict
+import datetime as dt
+import sys
+import json
+from itertools import cycle
+from functools import partial
+from billiard.pool import Pool
+import time
+
+from selenium.webdriver.common.action_chains import ActionChains
+from selenium.webdriver.common.keys import Keys
+
+from twitterscraper.tweet import Tweet
+from twitterscraper.browser import get_driver, get_proxy_pool
+
+import logging
+logger = logging.getLogger('twitterscraper')
+
+
+INIT_URL = 'https://twitter.com/search?f=live&vertical=default&q={q}&l={lang}'
+INIT_URL_USER = 'https://twitter.com/{u}'
+
+
+def linspace(start, stop, n):
+    if n == 1:
+        yield stop
+        return
+    h = (stop - start) / (n - 1)
+    for i in range(n):
+        yield start + h * i
+
+
+def decode_body(body):
+    try:
+        return json.loads(body)
+    except (UnicodeDecodeError, json.decoder.JSONDecodeError):
+        return None
+
+
+def scroll_down(driver, num_press=1, pause=1):
+    actions = ActionChains(driver)
+    for _ in range(num_press):
+        actions.send_keys(Keys.PAGE_DOWN)
+        actions.pause(pause)
+    actions.perform()
+
+
+def get_proxied_driver(use_proxy):
+    # get a proxied driver if use_proxy, else an unproxied driver
+    proxy_pool = get_proxy_pool() if use_proxy else cycle([None])
+    proxy = next(proxy_pool)
+    logger.info('Using proxy {}'.format(proxy))
+    return get_driver(proxy)
+
+
+def retrieve_twitter_response_data(url, limit, use_proxy, retry, wait_seconds=40):
+    logger.info('Scraping tweets from {}'.format(url))
+
+    driver = get_proxied_driver(use_proxy)
+
+    try:
+        driver.get(url)
+        relevant_responses = {}
+
+        # page down, recording the results, until there isn't anything new or the limit has been reached
+        start_time = dt.datetime.now()
+        tweet_count = 0
+        while dt.datetime.now() < start_time + dt.timedelta(seconds=wait_seconds):
+            logger.info(f'found {tweet_count} tweets')
+            scroll_down(driver)
+
+            # relevant requests have complete responses, json in their path (but not guide.json), and a globalObjects key
+            new_relevant_responses = {
+                i: decode_body(r.response.body) for i, r in enumerate(driver.requests)
+                if 'json' in r.path and 'guide.json' not in r.path and
+                r.response is not None and isinstance(decode_body(r.response.body), dict) and
+                'globalObjects' in decode_body(r.response.body) and i not in relevant_responses
+            }
+
+            # ensure we don't surpass the tweet limit
+            new_tweet_count = 0
+            for response in new_relevant_responses.values():
+                new_tweet_count += len(response['globalObjects']['tweets'])
+
+            # if there are no new relevant requests, or the latest one isn't done loading, wait and check again
+            if not new_relevant_responses or not new_tweet_count:
+                time.sleep(1)
+                continue
+
+            tweet_count += new_tweet_count
+            if tweet_count >= limit:
+                break
+
+            # merge into relevant responses
+            relevant_responses.update(new_relevant_responses)
+
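+            # each successful pass pushes the deadline forward, so the loop only ends once
+            # wait_seconds elapse with no new data (or the tweet limit above is reached)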
start_time + start_time = dt.datetime.now() + + except Exception as e: + driver.quit() + logger.exception('Exception {} while requesting "{}"'.format( + e, url)) + if retry > 0: + logger.debug('Retrying with fresh browser... (Attempts left: {})'.format(retry)) + return retrieve_twitter_response_data(url, limit, use_proxy, retry) + else: + return None + + driver.quit() + return relevant_responses + + +def query_single_page(url, retry=50, from_user=False, timeout=60, use_proxy=True, limit=None, npasses=3): + """ + Returns tweets from the given URL. + :param query: The query url + :param retry: Number of retries if something goes wrong. + :param use_proxy: Determines whether to fetch tweets with proxy + :param limit: Max number of tweets to get + :return: Twitter dict containing tweets users, locations, and other metadata + """ + limit = limit or float('inf') + + data = defaultdict(dict) + for _ in range(npasses): + for response_body in retrieve_twitter_response_data(url, limit, use_proxy, retry).values(): + for key, value in response_body['globalObjects'].items(): + data[key].update(value) + + return data + + +def get_query_data(query, limit=None, begindate=None, enddate=None, poolsize=None, lang='', use_proxy=True): + begindate = begindate or dt.date.today() - dt.timedelta(days=1) + enddate = enddate or dt.date.today() + poolsize = poolsize or 5 + + num_days = (enddate - begindate).days + + if(num_days < 0): + sys.exit('Begin date must occur before end date.') + + if poolsize > num_days: + # Since we are assigning each pool a range of dates to query, + # the number of pools should not exceed the number of dates. + poolsize = num_days + # query one day at a time so driver doesn't use too much memory + dateranges = list(reversed([begindate + dt.timedelta(days=elem) for elem in linspace(0, num_days, num_days + 1)])) + + urls = [] + for until, since in zip(dateranges[:-1], dateranges[1:]): + query_str = '{} since:{} until:{}'.format(query, since, until) + urls.append(INIT_URL.format(q=query_str, lang=lang)) + logger.info('query: {}'.format(query_str)) + + data = retrieve_data_from_urls(urls, limit=limit, poolsize=poolsize, use_proxy=use_proxy) + tweets = get_tweets_in_daterange(data['tweets'], begindate, enddate) + return get_tweet_objects(tweets, data['users']) + + +def get_tweet_objects(tweets_dict, users): + tweets = [] + for tid, tweet_item in sorted(tweets_dict.items(), reverse=True): + user = users[str(tweet_item['user_id'])] + tweets.append(Tweet( + screen_name=user['screen_name'], + username=user['name'], + user_id=tweet_item['user_id'], + tweet_id=tid, + tweet_url=f'https://twitter.com/{user["screen_name"]}/status/{tid}', # hack? + timestamp=timestamp_of_tweet(tweet_item), # hack? + timestamp_epochs=timestamp_of_tweet(tweet_item), # hack? + text=tweet_item['full_text'], + text_html=None, # hack? + links=tweet_item['entities']['urls'], + hashtags=tweet_item['entities']['hashtags'], + has_media=None, # hack? + img_urls=None, # hack? + parent_tweet_id=tweet_item['in_reply_to_status_id'], + reply_to_users=tweet_item['in_reply_to_user_id'], # hack? + video_url=None, #hack? + likes=None, #hack?, + retweets=None, #hack? + replies=None, #hack? + is_replied=None, #hack? + is_reply_to=None, #hack? 
+def get_tweet_objects(tweets_dict, users):
+    tweets = []
+    for tid, tweet_item in sorted(tweets_dict.items(), reverse=True):
+        user = users[str(tweet_item['user_id'])]
+        tweets.append(Tweet(
+            screen_name=user['screen_name'],
+            username=user['name'],
+            user_id=tweet_item['user_id'],
+            tweet_id=tid,
+            tweet_url=f'https://twitter.com/{user["screen_name"]}/status/{tid}',  # hack?
+            timestamp=dt.datetime.utcfromtimestamp(timestamp_of_tweet(tweet_item)),
+            timestamp_epochs=int(timestamp_of_tweet(tweet_item)),
+            text=tweet_item['full_text'],
+            text_html=None,  # hack?
+            links=tweet_item['entities']['urls'],
+            hashtags=tweet_item['entities']['hashtags'],
+            has_media=None,  # hack?
+            img_urls=None,  # hack?
+            parent_tweet_id=tweet_item['in_reply_to_status_id'],
+            reply_to_users=tweet_item['in_reply_to_user_id'],  # hack?
+            video_url=None,  # hack?
+            likes=None,  # hack?
+            retweets=None,  # hack?
+            replies=None,  # hack?
+            is_replied=None,  # hack?
+            is_reply_to=None,  # hack?
+        ))
+    return tweets
+
+
+def date_of_tweet(tweet):
+    return dt.datetime.strptime(
+        tweet['created_at'], '%a %b %d %H:%M:%S %z %Y'
+    ).replace(tzinfo=None).date()
+
+
+def timestamp_of_tweet(tweet):
+    return dt.datetime.strptime(
+        tweet['created_at'], '%a %b %d %H:%M:%S %z %Y'
+    ).timestamp()
+
+
+def get_tweets_in_daterange(tweets, begindate=None, enddate=None):
+    begindate = begindate or dt.date(1990, 1, 1)
+    enddate = enddate or dt.date(2100, 1, 1)
+    return {
+        tid: tweet for tid, tweet in tweets.items()
+        if begindate <= date_of_tweet(tweet) <= enddate
+    }
+
+
+def get_user_data(from_user, *args, **kwargs):
+    # include retweets
+    retweet_query = f'filter:nativeretweets from:{from_user}'
+    no_retweet_query = f'from:{from_user}'
+    return (
+        get_query_data(retweet_query, *args, **kwargs)
+        + get_query_data(no_retweet_query, *args, **kwargs)
+    )
+
+
+def retrieve_data_from_urls(urls, limit, poolsize, use_proxy=True):
+    # send query urls to the multiprocessing pool, and aggregate the results
+    if limit and poolsize:
+        limit_per_pool = (limit // poolsize) + 1
+    else:
+        limit_per_pool = None
+
+    all_data = defaultdict(dict)
+    try:
+        pool = Pool(poolsize)
+        try:
+            for new_data in pool.imap_unordered(partial(query_single_page, limit=limit_per_pool, use_proxy=use_proxy), urls):
+                for key, value in new_data.items():
+                    all_data[key].update(value)
+                logger.info('Got {} data ({} new).'.format(
+                    len(all_data['tweets']), len(new_data['tweets'])))
+        except KeyboardInterrupt:
+            logger.debug('Program interrupted by user. Returning all tweets gathered so far.')
+    finally:
+        pool.close()
+        pool.join()
+
+    return all_data
diff --git a/twitterscraper/tests/test_integration.py b/twitterscraper/tests/test_integration.py
new file mode 100644
index 0000000..0ba48b4
--- /dev/null
+++ b/twitterscraper/tests/test_integration.py
@@ -0,0 +1,71 @@
+import datetime as dt
+import pytest
+from twitterscraper import query, query_js
+import logging
+
+
+@pytest.mark.parametrize(
+    'query_fn, is_data_request',
+    [
+        (query_js.get_query_data, True),
+        (query.query_tweets, False),
+    ]
+)
+def test_get_multiple_correct_count(query_fn, is_data_request):
+    # expect
+    expected_counts_by_date = {
+        dt.date(2017, 11, 10): 0,
+        dt.date(2017, 11, 11): 29,
+        dt.date(2017, 11, 12): 32,
+        dt.date(2017, 11, 13): 78,
+        dt.date(2017, 11, 14): 55,
+        dt.date(2017, 11, 15): 51,
+        dt.date(2017, 11, 16): 45,
+        dt.date(2017, 11, 17): 60,
+        dt.date(2017, 11, 18): 40,
+        dt.date(2017, 11, 19): 32,
+        dt.date(2017, 11, 20): 44,
+        dt.date(2017, 11, 21): 74,
+        dt.date(2017, 11, 22): 0,
+    }
+
+    # retrieve
+    call_dict = dict(begindate=dt.date(2017, 11, 11), enddate=dt.date(2017, 11, 22),
+                     poolsize=3, lang='en')
+    call_dict['query'] = 'alphabet soup'
+
+    res = query_fn(**call_dict)
+
+    # validate
+    actual_counts_by_date = {
+        d: len([r for r in res if r.timestamp.date() == d])
+        for d in expected_counts_by_date.keys()
+    }
+
+    for k in expected_counts_by_date:
+        print(k, expected_counts_by_date.get(k), actual_counts_by_date.get(k))
+    assert expected_counts_by_date == actual_counts_by_date
+
+
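+# A repeated identical query should return the same number of tweets every run; flaky
+# proxies or scrolling that stops too early would make these lengths diverge.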
+@pytest.mark.parametrize(
+    'query_fn, is_data_request',
+    [
+        (query_js.get_query_data, True),
+        (query.query_tweets, False),
+    ]
+)
+def test_same_count_multiple_tries(query_fn, is_data_request):
+    lengths = []
+    for _ in range(10):
+        call_dict = dict(begindate=dt.date(2019, 2, 2), enddate=dt.date(2019, 2, 3),
+                         poolsize=1, lang='en')
+        call_dict['query'] = 'lapp'
+
+        res = query_fn(**call_dict)
+        # both query functions return a list of Tweet objects
+        lengths.append(len(res))
+
+    assert len(set(lengths)) == 1  # all runs returned the same number of tweets
diff --git a/twitterscraper/tests/test_simple.py b/twitterscraper/tests/test_simple.py
new file mode 100644
index 0000000..97b7c78
--- /dev/null
+++ b/twitterscraper/tests/test_simple.py
@@ -0,0 +1,14 @@
+import datetime as dt
+from twitterscraper import query_js
+
+# TODO: fix logging
+import logging
+logger = logging.getLogger('twitterscraper')
+logger.setLevel(logging.DEBUG)
+
+
+def test_simple_js():
+    call_dict = dict(begindate=dt.date(2018, 5, 5), enddate=dt.date(2018, 5, 7),
+                     poolsize=2, lang='en', query='donald john trump', use_proxy=True)
+    res = query_js.get_query_data(**call_dict)
+    assert len(res) == 78
diff --git a/twitterscraper/user.py b/twitterscraper/user.py
index e7bf4ff..05bcfbd 100644
--- a/twitterscraper/user.py
+++ b/twitterscraper/user.py
@@ -2,7 +2,7 @@
 class User:
 
-    def __init__(self, user="", full_name="", location="", blog="", date_joined="", id="", tweets=0, 
+    def __init__(self, user="", full_name="", location="", blog="", date_joined="", id="", tweets=0,
                  following=0, followers=0, likes=0, lists=0, is_verified=0):
         self.user = user
         self.full_name = full_name
@@ -16,7 +16,7 @@ def __init__(self, user="", full_name="", location="", blog="", date_joined="",
         self.likes = likes
         self.lists = lists
         self.is_verified = is_verified
-    
+
     @classmethod
     def from_soup(self, tag_prof_header, tag_prof_nav):
         """
@@ -27,31 +27,31 @@ def from_soup(self, tag_prof_header, tag_prof_nav):
 
         :return: Returns a User object with captured data via beautifulsoup
         """
-        self.user= tag_prof_header.find('a', {'class':'ProfileHeaderCard-nameLink u-textInheritColor js-nav'})['href'].strip("/")
+        self.user= tag_prof_header.find('a', {'class':'ProfileHeaderCard-nameLink u-textInheritColor js-nav'})['href'].strip("/")
         self.full_name = tag_prof_header.find('a', {'class':'ProfileHeaderCard-nameLink u-textInheritColor js-nav'}).text
-
-        location = tag_prof_header.find('span', {'class':'ProfileHeaderCard-locationText u-dir'})
+
+        location = tag_prof_header.find('span', {'class':'ProfileHeaderCard-locationText u-dir'})
         if location is None:
             self.location = "None"
-        else:
+        else:
             self.location = location.text.strip()
 
         blog = tag_prof_header.find('span', {'class':"ProfileHeaderCard-urlText u-dir"})
         if blog is None:
             blog = "None"
         else:
-            self.blog = blog.text.strip()
+            self.blog = blog.text.strip()
 
         date_joined = tag_prof_header.find('div', {'class':"ProfileHeaderCard-joinDate"}).find('span', {'class':'ProfileHeaderCard-joinDateText js-tooltip u-dir'})['title']
         if date_joined is None:
             self.data_joined = "Unknown"
-        else:
+        else:
             self.date_joined = date_joined.strip()
 
         tag_verified = tag_prof_header.find('span', {'class': "ProfileHeaderCard-badges"})
         if tag_verified is not None:
             self.is_verified = 1
-
+
         self.id = tag_prof_nav.find('div',{'class':'ProfileNav'})['data-user-id']
         tweets = tag_prof_nav.find('span', {'class':"ProfileNav-value"})['data-count']
         if tweets is None:
@@ -59,46 +59,47 @@
         else:
             self.tweets = int(tweets)
 
-        following = tag_prof_nav.find('li', {'class':"ProfileNav-item ProfileNav-item--following"}).\
-            find('span', {'class':"ProfileNav-value"})['data-count']
+        following = tag_prof_nav.find('li', {'class':"ProfileNav-item ProfileNav-item--following"})
         if following is None:
-            following = 0
+            self.following = 0
         else:
+            following = following.find('span', {'class':"ProfileNav-value"})['data-count']
            self.following = int(following)
 
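+        # the remaining counters use the same pattern: the ProfileNav item can be missing
+        # entirely, so only read its data-count after the None check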
-        followers = tag_prof_nav.find('li', {'class':"ProfileNav-item ProfileNav-item--followers"}).\
-            find('span', {'class':"ProfileNav-value"})['data-count']
+        followers = tag_prof_nav.find('li', {'class':"ProfileNav-item ProfileNav-item--followers"})
         if followers is None:
             self.followers = 0
         else:
-            self.followers = int(followers)
-
-        likes = tag_prof_nav.find('li', {'class':"ProfileNav-item ProfileNav-item--favorites"}).\
-            find('span', {'class':"ProfileNav-value"})['data-count']
+            followers = followers.find('span', {'class':"ProfileNav-value"})['data-count']
+            self.followers = int(followers)
+
+        likes = tag_prof_nav.find('li', {'class':"ProfileNav-item ProfileNav-item--favorites"})
         if likes is None:
             self.likes = 0
         else:
-            self.likes = int(likes)
-
+            likes = likes.find('span', {'class':"ProfileNav-value"})['data-count']
+            self.likes = int(likes)
+
         lists = tag_prof_nav.find('li', {'class':"ProfileNav-item ProfileNav-item--lists"})
         if lists is None:
             self.lists = 0
-        elif lists.find('span', {'class':"ProfileNav-value"}) is None:
+        elif lists.find('span', {'class':"ProfileNav-value"}) is None:
             self.lists = 0
-        else:
-            lists = lists.find('span', {'class':"ProfileNav-value"}).text
+        else:
+            lists = lists.find('span', {'class':"ProfileNav-value"}).text
             self.lists = int(lists)
 
         return(self)
 
     @classmethod
     def from_html(self, html):
         soup = BeautifulSoup(html, "lxml")
-        user_profile_header = soup.find("div", {"class":'ProfileHeaderCard'})
-        user_profile_canopy = soup.find("div", {"class":'ProfileCanopy-nav'})
+        user_profile_header = soup.find("div", {"class": 'ProfileHeaderCard'})
+        user_profile_canopy = soup.find("div", {"class": 'ProfileCanopy-nav'})
+
         if user_profile_header and user_profile_canopy:
             try:
                 return self.from_soup(user_profile_header, user_profile_canopy)
-            except AttributeError:
+            except AttributeError as e:
                 pass  # Incomplete info? Discard!
-            except TypeError:
+            except TypeError as e:
                 pass  # Incomplete info? Discard!