diff --git a/utils/sort_yaml.py b/utils/sort_yaml.py
index db06ac1c7d5..85f513072e2 100644
--- a/utils/sort_yaml.py
+++ b/utils/sort_yaml.py
@@ -16,6 +16,7 @@
 from tidy_conf.date import clean_dates
 from tidy_conf.latlon import add_latlon
 from tidy_conf.links import check_link_availability
+from tidy_conf.links import get_cache
 from tidy_conf.schema import Conference
 from tidy_conf.schema import get_schema
 from tidy_conf.titles import tidy_titles
@@ -140,13 +141,14 @@ def split_data(data):
 
 def check_links(data):
     """Check the links in the data iteratively."""
+    cache, cache_archived = get_cache()
     for i, q in tqdm(enumerate(sorted(data, key=operator.itemgetter("year"), reverse=True)), total=len(data)):
         for key in ("link", "cfp_link", "sponsor", "finaid"):
             if key in q:
-                new_link = check_link_availability(q[key], q["start"])
-                if "https://web.archive.org" not in new_link:
+                new_link = check_link_availability(q[key], q["start"], cache=cache, cache_archived=cache_archived)
+                if q[key] != new_link and "archive.org" in new_link:
                     time.sleep(0.5)
                 q[key] = new_link
         data[i] = q
     return data
diff --git a/utils/tidy_conf/links.py b/utils/tidy_conf/links.py
index fae9aa7e143..08a1ff18657 100644
--- a/utils/tidy_conf/links.py
+++ b/utils/tidy_conf/links.py
@@ -8,7 +8,28 @@
 from tqdm import tqdm
 
 
-def check_link_availability(url, start):
+def get_cache_location():
+    # Locations of the "no archive" and "already archived" cache files
+    cache_file = Path("utils", "tidy_conf", "data", ".tmp", "no_archive.txt")
+    cache_file_archived = Path("utils", "tidy_conf", "data", ".tmp", "archived_links.txt")
+    return cache_file, cache_file_archived
+
+
+def get_cache():
+    cache_file, cache_file_archived = get_cache_location()
+
+    # Create the cache files if they don't exist
+    cache_file.touch()
+    cache_file_archived.touch()
+
+    # Read the cache files into sets for fast membership checks
+    cache = set(cache_file.read_text(encoding="utf-8").split("\n")[:-1])
+    cache_archived = set(cache_file_archived.read_text(encoding="utf-8").split("\n")[:-1])
+
+    return cache, cache_archived
+
+
+def check_link_availability(url, start, cache=None, cache_archived=None):
     """Checks if a URL is available.
 
     If not, tries to retrieve an archived version from the Wayback Machine.
@@ -24,19 +45,11 @@ def check_link_availability(url, start):
     if url.startswith(("https://web.archive.org", "http://web.archive.org")):
         return url
 
-    # Check if the URL is cached
-    cache_file = Path("utils", "tidy_conf", "data", ".tmp", "no_archive.txt")
-    cache_file_archived = Path("utils", "tidy_conf", "data", ".tmp", "archived_links.txt")
-
-    # Create the cache file if it doesn't exist
-    cache_file.touch()
-    cache_file_archived.touch()
+    # Load the caches unless the caller passed them in
+    if cache is None or cache_archived is None:
+        cache, cache_archived = get_cache()
 
-    # Read the cache file
-    with cache_file.open(encoding="utf-8") as f:
-        cache = f.read().split("\n")[:-1]
-    with cache_file_archived.open(encoding="utf-8") as f:
-        cache_archived = f.read().split("\n")[:-1]
+    cache_file, _ = get_cache_location()
 
     # Check if the URL is in the cache
     if url in cache and url not in cache_archived:
@@ -65,7 +78,7 @@ def check_link_availability(url, start):
                 )
             else:
                 if start > datetime.now(tz=timezone.utc).date():
-                    attempt_archive_url(url, cache_file_archived)
+                    attempt_archive_url(url, cache_archived)
                 return url
         except requests.RequestException as e:
             tqdm.write(f"An error occurred: {e}. Trying to find an archived version...")
@@ -89,7 +102,7 @@ def check_link_availability(url, start):
                 tqdm.write(f"Found archived version: {archived_url}")
                 return archived_url
             tqdm.write("No archived version found.")
-            attempt_archive_url(url, cache_file_archived)
+            attempt_archive_url(url, cache_archived)
             with cache_file.open("a") as f:
                 f.write(url + "\n")
             return url
@@ -101,25 +114,26 @@ def check_link_availability(url, start):
     return url
 
 
-def attempt_archive_url(url, cache_file):
+def attempt_archive_url(url, cache=None):
    """Attempts to archive a URL using the Wayback Machine."""
     # Read the cache file
-    cache_file = Path(cache_file)
-
-    with cache_file.open(encoding="utf-8") as f:
-        cache = f.read().split("\n")[:-1]
+    if cache is None:
+        _, cache = get_cache()
 
     # Check if the URL is in the cache
     if url in cache:
         tqdm.write(f"URL {url} was already archived.")
         return
-    with cache_file.open("a") as f:
-        f.write(url + "\n")
 
     try:
         tqdm.write(f"Attempting archive of {url}.")
-        archive_response = requests.get("https://web.archive.org/save/" + url, timeout=7)
+        headers = {"User-Agent": "Pythondeadlin.es Archival Attempt/0.1 (https://pythondeadlin.es)"}
+        archive_response = requests.get("https://web.archive.org/save/" + url, timeout=30, headers=headers)
         if archive_response.status_code == 200:
+            _, cache_file = get_cache_location()
+            with cache_file.open("a") as f:
+                f.write(url + "\n")
+            cache.add(url)  # keep the in-memory cache in sync within this run
             tqdm.write(f"Successfully archived {url}.")
     except requests.RequestException as e:
         tqdm.write(f"An error occurred while attempting to archive: {e}")
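
For review context, a minimal sketch of how the reworked caching API is meant to be called after this change, with both caches loaded once and shared across checks. The conference record below is a hypothetical example (real entries come from the YAML data), and `start` is assumed to be a `datetime.date`, matching the `start > datetime.now(tz=timezone.utc).date()` comparison above; the relative paths in `get_cache_location()` also assume the script runs from the repository root.

```python
from datetime import date

from tidy_conf.links import check_link_availability
from tidy_conf.links import get_cache

# Load both caches once; later calls reuse the in-memory sets instead of
# re-reading the two cache files from disk on every link check.
cache, cache_archived = get_cache()

# Hypothetical record for illustration; not taken from the real data.
conference = {"link": "https://example.com/pycon", "start": date(2026, 5, 1)}

new_link = check_link_availability(
    conference["link"],
    conference["start"],
    cache=cache,
    cache_archived=cache_archived,
)
```

Callers that omit the keyword arguments keep working, since `check_link_availability` falls back to `get_cache()` itself; passing the sets in from `check_links` simply avoids re-reading both cache files for every link in every conference entry.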