From 2d6f678e2889d94d60bc14d3360c30da7303f9c6 Mon Sep 17 00:00:00 2001
From: J Hancock <123655328+J-A-Ha@users.noreply.github.com>
Date: Thu, 19 Sep 2024 16:55:51 +0100
Subject: [PATCH] adding docstrings

---
 art/classes/activitylog.py      |  40 +++++++--
 art/classes/citation_crawler.py | 147 ++++++++++++++++++++++----------
 2 files changed, 135 insertions(+), 52 deletions(-)

diff --git a/art/classes/activitylog.py b/art/classes/activitylog.py
index 450787b..47bd5cc 100644
--- a/art/classes/activitylog.py
+++ b/art/classes/activitylog.py
@@ -8,21 +8,22 @@ class ActivityLog(pd.DataFrame):
 
     """
     This is an ActivityLog object. It is a modified Pandas Dataframe object designed to store metadata about an academic review.
 
-    Parameters
-    ----------
-
-
-    Attributes
-    ----------
+    Columns
+    -------
+    * **timestamp**: date-time the activity occurred.
+    * **type**: type of activity.
+    * **activity**: details of activity.
+    * **location**: location in the Review where the activity occurred.
+    * **database**: name of database/repository accessed (if relevant).
+    * **url**: web address accessed (if relevant).
+    * **query**: search query used (if relevant).
+    * **changes**: number of changes made to the Review results.
     """
 
     def __init__(self):
 
         """
         Initialises ActivityLog instance.
-
-        Parameters
-        ----------
         """
 
@@ -44,6 +45,27 @@ def __init__(self):
 
     def add_activity(self, type: str, activity: str, location: list, database = None, query = None, url = None, changes_dict = None):
 
+        """
+        Adds a new activity to the ActivityLog DataFrame.
+
+        Parameters
+        ----------
+        type : str
+            type of activity.
+        activity : str
+            details of activity.
+        location : list
+            location in the Review where the activity occurred.
+        database : str
+            name of database/repository accessed (if relevant). Defaults to None.
+        query : str
+            search query used (if relevant). Defaults to None.
+        url : str
+            web address accessed (if relevant). Defaults to None.
+        changes_dict : dict
+            dictionary of changes made to the Review. Defaults to None.
+        """
+
         new_index = len(self)
         self.loc[new_index, 'timestamp'] = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
         self.loc[new_index, 'type'] = type
diff --git a/art/classes/citation_crawler.py b/art/classes/citation_crawler.py
index a984e08..4788b05 100644
--- a/art/classes/citation_crawler.py
+++ b/art/classes/citation_crawler.py
@@ -13,9 +13,12 @@
 import pandas as pd
 import numpy as np
 
-
 def crawler_scrape_url(url) -> pd.DataFrame:
 
+    """
+    Core functionality for the citation crawler's web scraper. Takes a URL and returns a Pandas DataFrame.
+    """
+
     scrape_res = scrape_url(url=url)
 
     global results_cols
@@ -111,6 +114,10 @@ def crawler_scrape_url(url) -> pd.DataFrame:
 
 def citation_crawler_site_test(url: str):
 
+    """
+    Checks whether the citation crawler can crawl a given URL. Returns True if it can; False if not.
+    """
+
     global can_scrape
 
     for i in can_scrape:
@@ -121,6 +128,22 @@ def citation_crawler_site_test(url: str):
 
 def academic_scraper(url, be_polite = False):
 
+    """
+    Bespoke web scraper for academic repository websites.
+
+    Parameters
+    ----------
+    url : str
+        a URL to scrape.
+    be_polite : bool
+        whether to respect scraping permissions contained in websites' robots.txt files.
+
+    Returns
+    -------
+    res_df : pandas.DataFrame
+        a Pandas DataFrame containing scraped web data.
+    """
+
     # Checking if URL is bad. If True, tries to correct it.
     url = correct_seed_url_errors(url)
     domain = get_domain(url)
 
@@ -160,6 +183,22 @@ def academic_scraper(url, be_polite = False):
 
 def citation_crawler_scraper(entry: pd.Series, be_polite = True):
 
+    """
+    Bespoke web scraper for use by the citation crawler.
+
+    Parameters
+    ----------
+    entry : pandas.Series
+        citation crawler entry.
+    be_polite : bool
+        whether to respect scraping permissions contained in websites' robots.txt files.
+
+    Returns
+    -------
+    entry : pandas.Series
+        citation crawler entry.
+    """
+
     url = entry['link']
     res_df = academic_scraper(url=url, be_polite=be_polite)
 
@@ -173,6 +212,24 @@ def citation_crawler_scraper(entry: pd.Series, be_polite = True):
 
 def citation_crawler_doi_retriver(entry: pd.Series, be_polite = True, timeout = 60):
 
+    """
+    Takes a citation crawler entry. If it contains a DOI, looks up the record using the CrossRef API. If not, scrapes the URL.
+
+    Parameters
+    ----------
+    entry : pandas.Series
+        citation crawler entry.
+    be_polite : bool
+        whether to respect scraping permissions contained in websites' robots.txt files.
+    timeout : int
+        maximum time in seconds to wait for a response before aborting the CrossRef API call. Defaults to 60 seconds.
+
+    Returns
+    -------
+    entry : pandas.Series
+        citation crawler entry.
+    """
+
     doi = entry['doi']
     link = entry['link']
 
@@ -221,6 +278,24 @@ def citation_crawler_doi_retriver(entry: pd.Series, be_polite = True, timeout =
 
 def update_citation_crawler_data(entry: pd.Series, be_polite = True, timeout = 60):
 
+    """
+    Takes a citation crawler entry and updates its data using the CrossRef API if a record is available.
+
+    Parameters
+    ----------
+    entry : pandas.Series
+        citation crawler entry.
+    be_polite : bool
+        whether to respect scraping permissions contained in websites' robots.txt files.
+    timeout : int
+        maximum time in seconds to wait for a response before aborting the CrossRef API call. Defaults to 60 seconds.
+
+    Returns
+    -------
+    entry : pandas.Series
+        citation crawler entry.
+    """
+
     doi = entry['doi']
     link = entry['link']
 
@@ -266,32 +341,27 @@ def citation_crawler_engine(
 
     Parameters
     ----------
-    urls : queue
-        ordered queue of URLs to be crawled.
-    required_keywords : list
-        list of keywords which sites must contain to be crawled.
-    excluded_keywords : list
-        list of keywords which sites must *not* contain to be crawled.
-    excluded_url_terms : list
-        list of strings; link will be ignored if it contains any string in list.
-    case_sensitive : bool
-        whether or not to ignore string characters' case.
+    to_crawl : queue
+        records to crawl.
+    data : pandas.DataFrame
+        a dataframe of data gathered by the crawler.
+    use_api : bool
+        whether to look up entries and update their data using APIs. Required for the crawler to find and add new data. Defaults to True.
     crawl_limit : int
         how many URLs the crawler should visit before it stops.
-    ignore_urls : list
-        list of URLs to ignore.
-    ignore_domains : list
-        list of domains to ignore.
+    depth_limit : int
+        maximum number of crawler iterations to perform.
     be_polite : bool
         whether to respect websites' permissions for crawlers.
-    full : bool
-        whether to run a full scrape on each site. This takes longer.
-
+    rate_limit : float
+        time delay in seconds per result. Used to limit impact on CrossRef servers. Defaults to 0.05 seconds.
+    timeout : int
+        maximum time in seconds to wait for a response before aborting the CrossRef API call. Defaults to 60 seconds.
 
     Returns
     -------
-    output_dict : dict
-        a dictionary containing results from each crawled site.
+    data : pandas.DataFrame
+        a Pandas DataFrame containing results from the crawl.
     """
 
     # Intiailising variables to store the pages already visited
@@ -411,34 +481,25 @@ def citation_crawler(
 
     Parameters
    ----------
-    seeds : str or list
-        one or more URLs from which to crawl.
+    data : pandas.DataFrame
+        a dataframe of data gathered by the crawler.
+    use_api : bool
+        whether to look up entries and update their data using APIs. Required for the crawler to find and add new data. Defaults to True.
     crawl_limit : int
-        how many URLs the crawler should visit before it stops.
-    excluded_url_terms : list
-        list of strings; link will be ignored if it contains any string in list.
-    required_keywords : list
-        list of keywords which sites must contain to be crawled.
-    excluded_keywords : list
-        list of keywords which sites must *not* contain to be crawled.
-    case_sensitive : bool
-        whether or not to ignore string characters' case.
-    ignore_urls : list
-        list of URLs to ignore.
-    ignore_domains : list
-        list of domains to ignore.
+        how many records the crawler should visit before it stops.
+    depth_limit : int
+        maximum number of crawler iterations to perform.
     be_polite : bool
-        whether respect websites' permissions for crawlers.
-    full : bool
-        whether to run a full scrape on each site. This takes longer.
-    output_as : str
-        the format to output results in. Defaults to a pandas.DataFrame.
-
+        whether to respect websites' permissions for crawlers.
+    rate_limit : float
+        time delay in seconds per result. Used to limit impact on CrossRef servers. Defaults to 0.05 seconds.
+    timeout : int
+        maximum time in seconds to wait for a response before aborting the CrossRef API call. Defaults to 60 seconds.
 
     Returns
     -------
-    result : pd.DataFrame
-        an object containing the results of a crawl.
+    output : pd.DataFrame
+        an object containing the results from the crawl.
     """
 
     # See https://www.zenrows.com/blog/web-crawler-python#transitioning-to-a-real-world-web-crawler
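
For reference, a minimal usage sketch of the ActivityLog API documented above (not part of the patch). The import path is assumed from the file layout (art/classes/activitylog.py); the add_activity signature is taken from the diff, and the argument values are purely illustrative.

    from art.classes.activitylog import ActivityLog  # assumed import path

    log = ActivityLog()

    # Record a search activity; keyword arguments mirror the documented parameters
    log.add_activity(
        type='search',                            # type of activity
        activity='ran CrossRef keyword search',   # details of activity
        location=['results'],                     # location(s) in the Review affected
        database='CrossRef',                      # database/repository accessed
        query='citation analysis',                # search query used
        url='https://api.crossref.org/works',     # web address accessed
        changes_dict={'results': 20}              # changes made to the Review
    )

    # ActivityLog is a modified DataFrame, so normal pandas operations apply
    print(log[['timestamp', 'type', 'activity', 'database']])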
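
Similarly, a hypothetical sketch of driving the functions documented in citation_crawler.py. The academic_scraper signature appears in the diff; the citation_crawler keyword arguments are inferred from its docstring parameters (its full signature is not shown in the patch), and the seed columns ('doi', 'link') are assumed from the entry fields accessed by citation_crawler_doi_retriver.

    import pandas as pd

    from art.classes.citation_crawler import academic_scraper, citation_crawler  # assumed import path

    # Scrape a single repository page; be_polite=True respects robots.txt permissions
    res_df = academic_scraper(url='https://example.org/articles/12345', be_polite=True)

    # Hypothetical seed record(s) for the crawler; 'doi' and 'link' columns are
    # assumed from the entry fields used elsewhere in the module
    seeds = pd.DataFrame({
        'doi': ['10.1000/xyz123'],                       # placeholder DOI
        'link': ['https://doi.org/10.1000/xyz123']
    })

    # Keyword arguments mirror the documented parameters; treat as illustrative only
    output = citation_crawler(
        data=seeds,
        use_api=True,     # look up and update entries via the CrossRef API
        crawl_limit=100,  # stop after visiting 100 records
        depth_limit=2,    # at most two crawler iterations
        be_polite=True,   # respect websites' permissions for crawlers
        rate_limit=0.05,  # seconds of delay per result
        timeout=60        # seconds before aborting a CrossRef API call
    )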