Commit
adding docstrings
J-A-Ha committed Sep 19, 2024
1 parent 68ac9be commit 2d6f678
Showing 2 changed files with 135 additions and 52 deletions.
40 changes: 31 additions & 9 deletions art/classes/activitylog.py
@@ -8,21 +8,22 @@ class ActivityLog(pd.DataFrame):
"""
This is an ActivityLog object. It is a modified Pandas DataFrame designed to store metadata about an academic review.
Columns
-------
* **timestamp**: date-time the activity occurred.
* **type**: type of activity.
* **activity**: details of activity.
* **location**: location in Review that activity occurred.
* **database**: name of database/repository accessed (if relevant).
* **url**: web address accessed (if relevant).
* **query**: search query used (if relevant).
* **changes**: number of changes made to the Review results.
"""

def __init__(self):

"""
Initialises ActivityLog instance.
"""


@@ -44,6 +45,27 @@ def __init__(self):

def add_activity(self, type: str, activity: str, location: list, database = None, query = None, url = None, changes_dict = None):

"""
Adds a new activity to the ActivityLog DataFrame.
Parameters
----------
type : str
type of activity.
activity : str
details of activity.
location : list
location(s) in the Review where the activity occurred.
database : str
name of database/repository accessed (if relevant). Defaults to None.
query : str
search query used (if relevant). Defaults to None.
url : str
web address accessed (if relevant). Defaults to None.
changes_dict : dict
dictionary of changes made to Review. Defaults to None.
"""

new_index = len(self)
self.loc[new_index, 'timestamp'] = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
self.loc[new_index, 'type'] = type
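A minimal usage sketch based on the docstrings above. The import path mirrors the file's location in the repository and the field values are illustrative assumptions, not part of this commit.

from art.classes.activitylog import ActivityLog

log = ActivityLog()
log.add_activity(
    type='search',                          # type of activity
    activity='ran a keyword search',        # details of activity
    location=['results'],                   # location(s) in the Review
    database='CrossRef',                    # database accessed (optional)
    query='systematic review automation'    # search query used (optional)
)
print(log[['timestamp', 'type', 'activity']])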
147 changes: 104 additions & 43 deletions art/classes/citation_crawler.py
@@ -13,9 +13,12 @@
import pandas as pd
import numpy as np


def crawler_scrape_url(url) -> pd.DataFrame:

"""
Core functionality for the citation crawler's web scraper. Takes a URL and returns a Pandas DataFrame.
"""

scrape_res = scrape_url(url=url)

global results_cols
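A hedged usage sketch of crawler_scrape_url; the URL is illustrative and the import path is assumed from the repository layout.

from art.classes.citation_crawler import crawler_scrape_url

# Scrape a single page into a DataFrame of result fields.
res_df = crawler_scrape_url('https://example.com/article')
print(res_df.columns.tolist())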
@@ -111,6 +114,10 @@ def crawler_scrape_url(url) -> pd.DataFrame:

def citation_crawler_site_test(url: str):

"""
Checks whether the citation crawler can crawl a given URL. Returns True if yes; False if no.
"""

global can_scrape

for i in can_scrape:
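The loop above is truncated in this view. As a rough illustration, a check of this kind might look like the sketch below, assuming can_scrape is a list of crawlable domain strings; the exact matching logic used by the repository is not shown here.

def site_test_sketch(url: str, can_scrape: list) -> bool:
    # Returns True if the URL contains any domain listed in can_scrape.
    for domain in can_scrape:
        if domain in url:
            return True
    return False

site_test_sketch('https://www.sciencedirect.com/science/article/pii/EXAMPLE', ['sciencedirect.com', 'arxiv.org'])  # True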
@@ -121,6 +128,22 @@ def citation_crawler_site_test(url: str):

def academic_scraper(url, be_polite = False):

"""
Bespoke web scraper for academic repository websites.
Parameters
----------
url : str
a URL to scrape.
be_polite : bool
whether to respect scraping permissions contained in websites' robots.txt files.
Returns
-------
res_df : pandas.DataFrame
a Pandas DataFrame containing scraped web data.
"""

# Checking whether the URL is malformed. If it is, tries to correct it.
url = correct_seed_url_errors(url)
domain = get_domain(url)
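A hedged usage sketch of academic_scraper; the URL is illustrative and the import path is assumed.

from art.classes.citation_crawler import academic_scraper

# Scrape an academic repository page, respecting robots.txt permissions.
res_df = academic_scraper('https://www.example-repository.org/record/123', be_polite=True)
print(res_df.head())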
@@ -160,6 +183,22 @@ def citation_crawler_scraper(entry: pd.Series, be_polite = True):

def citation_crawler_scraper(entry: pd.Series, be_polite = True):

"""
Bespoke web scraper for use by citation crawler.
Parameters
----------
entry : pandas.Series
citation crawler entry.
be_polite : bool
whether to respect scraping permissions contained in websites' robots.txt files.
Returns
-------
entry : pandas.Series
citation crawler entry.
"""

url = entry['link']

res_df = academic_scraper(url=url, be_polite=be_polite)
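A hedged sketch of passing an entry to citation_crawler_scraper. Only the 'link' field is confirmed by the code shown above; the other values are illustrative.

import pandas as pd
from art.classes.citation_crawler import citation_crawler_scraper

entry = pd.Series({'link': 'https://www.example-repository.org/record/123', 'doi': None})
entry = citation_crawler_scraper(entry, be_polite=True)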
Expand All @@ -173,6 +212,24 @@ def citation_crawler_scraper(entry: pd.Series, be_polite = True):

def citation_crawler_doi_retriver(entry: pd.Series, be_polite = True, timeout = 60):

"""
Takes a citation crawler entry. If it contains a DOI, looks up the record using the CrossRef API; if not, scrapes the URL.
Parameters
----------
entry : pandas.Series
citation crawler entry.
be_polite : bool
whether to respect scraping permissions contained in websites' robots.txt files.
timeout : int
maximum time in seconds to wait for a response before aborting the CrossRef API call. Defaults to 60 seconds.
Returns
-------
entry : pandas.Series
citation crawler entry.
"""

doi = entry['doi']
link = entry['link']
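A rough sketch of the branching the docstring describes, written against the public CrossRef REST API rather than the repository's own helpers; it is illustrative only and not the function's actual implementation.

import requests
import pandas as pd
from art.classes.citation_crawler import citation_crawler_scraper  # assumed import path

def doi_retrieval_sketch(entry: pd.Series, be_polite: bool = True, timeout: int = 60) -> pd.Series:
    doi = entry.get('doi')
    if isinstance(doi, str) and doi.strip():
        # DOI present: look the record up via the CrossRef REST API.
        resp = requests.get(f'https://api.crossref.org/works/{doi}', timeout=timeout)
        if resp.ok:
            work = resp.json()['message']
            entry['title'] = '; '.join(work.get('title', []))
            entry['link'] = work.get('URL', entry.get('link'))
    else:
        # No DOI: fall back to scraping the stored link.
        entry = citation_crawler_scraper(entry, be_polite=be_polite)
    return entry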

@@ -221,6 +278,24 @@ def citation_crawler_doi_retriver(entry: pd.Series, be_polite = True, timeout =

def update_citation_crawler_data(entry: pd.Series, be_polite = True, timeout = 60):

"""
Takes a citation crawler entry and updates its data using the CrossRef API if a record is available.
Parameters
----------
entry : pandas.Series
citation crawler entry.
be_polite : bool
whether to respect scraping permissions contained in websites' robots.txt files.
timeout : int
maximum time in seconds to wait for a response before aborting the CrossRef API call. Defaults to 60 seconds.
Returns
-------
entry : pandas.Series
citation crawler entry.
"""

doi = entry['doi']
link = entry['link']
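A hedged usage sketch; the DOI value is a placeholder, not a real record, and the import path is assumed.

import pandas as pd
from art.classes.citation_crawler import update_citation_crawler_data

entry = pd.Series({'doi': '10.1000/xyz123', 'link': None})
entry = update_citation_crawler_data(entry, be_polite=True, timeout=60)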

@@ -266,32 +341,27 @@ def citation_crawler_engine(
Parameters
----------
urls : queue
ordered queue of URLs to be crawled.
required_keywords : list
list of keywords which sites must contain to be crawled.
excluded_keywords : list
list of keywords which sites must *not* contain to be crawled.
excluded_url_terms : list
list of strings; link will be ignored if it contains any string in list.
case_sensitive : bool
whether to treat keyword matching as case-sensitive.
to_crawl : queue
records to crawl.
data : pandas.DataFrame
a dataframe of data gathered by the crawler.
use_api : bool
whether to look up entries and update their data using APIs. Required for the crawler to find and add new data. Defaults to True.
crawl_limit : int
how many URLs the crawler should visit before it stops.
ignore_urls : list
list of URLs to ignore.
ignore_domains : list
list of domains to ignore.
depth_limit : int
maximum number of crawler iterations to perform.
be_polite : bool
whether to respect websites' permissions for crawlers.
full : bool
whether to run a full scrape on each site. This takes longer.
rate_limit : float
time delay in seconds per result. Used to limit impact on CrossRef servers. Defaults to 0.05 seconds.
timeout : int
maximum time in seconds to wait for a response before aborting the CrossRef API call. Defaults to 60 seconds.
Returns
-------
data : pandas.DataFrame
a Pandas DataFrame containing results from the crawl.
"""

# Initialising variables to store the pages already visited
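As a schematic illustration of the loop the docstring describes (not the repository's implementation), a breadth-first crawl with crawl, depth, and rate limits could be structured like the sketch below; process_entry and enqueue_citations stand in for the module's update and citation-extraction steps.

import time
from queue import Queue

def crawl_loop_sketch(to_crawl: Queue, process_entry, enqueue_citations,
                      crawl_limit: int = 100, depth_limit: int = 3,
                      rate_limit: float = 0.05) -> int:
    # Schematic only: process_entry updates an entry (e.g. via the CrossRef API);
    # enqueue_citations returns the citations found in that entry.
    visited = 0
    depth = 0
    while not to_crawl.empty() and visited < crawl_limit and depth < depth_limit:
        next_level = Queue()
        while not to_crawl.empty() and visited < crawl_limit:
            entry = to_crawl.get()
            entry = process_entry(entry)
            for citation in enqueue_citations(entry):
                next_level.put(citation)
            visited += 1
            time.sleep(rate_limit)  # limit impact on CrossRef servers
        to_crawl = next_level
        depth += 1
    return visited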
@@ -411,34 +481,25 @@ def citation_crawler(
Parameters
----------
seeds : str or list
one or more URLs from which to crawl.
data : pandas.DataFrame
a dataframe of data gathered by the crawler.
use_api : bool
whether to look up entries and update their data using APIs. Required for the crawler to find and add new data. Defaults to True.
crawl_limit : int
how many records the crawler should visit before it stops.
excluded_url_terms : list
list of strings; link will be ignored if it contains any string in list.
required_keywords : list
list of keywords which sites must contain to be crawled.
excluded_keywords : list
list of keywords which sites must *not* contain to be crawled.
case_sensitive : bool
whether to treat keyword matching as case-sensitive.
ignore_urls : list
list of URLs to ignore.
ignore_domains : list
list of domains to ignore.
depth_limit : int
maximum number of crawler iterations to perform.
be_polite : bool
whether to respect websites' permissions for crawlers.
full : bool
whether to run a full scrape on each site. This takes longer.
output_as : str
the format to output results in. Defaults to a pandas.DataFrame.
rate_limit : float
time delay in seconds per result. Used to limit impact on CrossRef servers. Defaults to 0.05 seconds.
timeout : int
maximum time in seconds to wait for a response before aborting the CrossRef API call. Defaults to 60 seconds.
Returns
-------
output : pd.DataFrame
an object containing the results from the crawl.
"""

# See https://www.zenrows.com/blog/web-crawler-python#transitioning-to-a-real-world-web-crawler
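A hedged usage sketch of the top-level crawler; the seed URL and parameter values are illustrative and the import path is assumed.

from art.classes.citation_crawler import citation_crawler

results = citation_crawler(
    seeds='https://doi.org/10.1000/xyz123',  # illustrative seed URL
    use_api=True,
    crawl_limit=50,
    depth_limit=2,
    be_polite=True,
    rate_limit=0.05,
    timeout=60
)
print(len(results))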
