Merge pull request #361 from flairNLP/module_logging
Implement module scoped logging
MaxDall authored Apr 24, 2024
2 parents eca7420 + cd406bd commit c931e2e
Showing 10 changed files with 74 additions and 52 deletions.
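The recurring change across these files is the same: the shared basic_logger imported from fundus.logging is replaced by a per-module logger created with create_logger(__name__). A minimal sketch of the new pattern as it appears throughout the diff (the log message is illustrative):

from fundus.logging import create_logger

# One logger per module, named after the module itself (e.g. "fundus.scraping.html").
logger = create_logger(__name__)

logger.debug("this message is now attributed to the emitting module")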
10 changes: 6 additions & 4 deletions scripts/generate_parser_test_files.py
@@ -6,15 +6,17 @@
from tqdm import tqdm

from fundus import Crawler, PublisherCollection
from fundus.logging import basic_logger
from fundus.logging import create_logger
from fundus.publishers.base_objects import PublisherEnum
from fundus.scraping.article import Article
from fundus.scraping.filter import RequiresAll
from fundus.scraping.html import WebSource
from fundus.scraping.scraper import BaseScraper, WebScraper
from fundus.scraping.scraper import BaseScraper
from tests.test_parser import attributes_required_to_cover
from tests.utility import HTMLTestFile, get_test_case_json, load_html_test_file_mapping

logger = create_logger(__name__)


def get_test_article(enum: PublisherEnum, url: Optional[str] = None) -> Optional[Article]:
if url is not None:
@@ -84,7 +86,7 @@ def main() -> None:
# sort args.attributes for consistency
arguments.attributes = sorted(set(arguments.attributes) or attributes_required_to_cover)

basic_logger.setLevel(WARN)
logger.setLevel(WARN)

publishers: List[PublisherEnum] = (
list(PublisherCollection)
@@ -107,7 +109,7 @@ def main() -> None:

if arguments.overwrite or not html_mapping.get(publisher.parser.latest_version):
if not (article := get_test_article(publisher, url)):
basic_logger.error(f"Couldn't get article for {publisher.name}. Skipping")
logger.error(f"Couldn't get article for {publisher.name}. Skipping")
continue
html = HTMLTestFile(
url=article.html.responded_url,
22 changes: 22 additions & 0 deletions src/fundus/logging.py
@@ -0,0 +1,22 @@
import logging

__all__ = ["set_log_level", "create_logger"]

_loggers = []

_stream_handler = logging.StreamHandler()
_formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
_stream_handler.setFormatter(_formatter)


def create_logger(name: str) -> logging.Logger:
logger = logging.getLogger(name)
logger.setLevel(logging.ERROR)
logger.addHandler(_stream_handler)
_loggers.append(logger)
return logger


def set_log_level(level: int):
for logger in _loggers:
logger.setLevel(level)
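create_logger registers each logger it creates in the module-level _loggers list, attaches the shared stream handler, and defaults the level to ERROR; set_log_level then walks that registry. A short usage sketch under those assumptions:

import logging

from fundus.logging import create_logger, set_log_level

logger = create_logger(__name__)   # registered, handler attached, level defaults to ERROR
logger.info("suppressed")          # below the default ERROR threshold

set_log_level(logging.INFO)        # lowers the threshold for every registered fundus logger
logger.info("now emitted through the shared stream handler")

Because only the registered loggers are adjusted, the root logger and any application-level logging configuration remain untouched.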
3 changes: 0 additions & 3 deletions src/fundus/logging/__init__.py

This file was deleted.

9 changes: 0 additions & 9 deletions src/fundus/logging/logger.py

This file was deleted.

6 changes: 4 additions & 2 deletions src/fundus/scraping/article.py
@@ -8,10 +8,12 @@
import more_itertools
from colorama import Fore, Style

from fundus.logging.logger import basic_logger
from fundus.logging import create_logger
from fundus.parser import ArticleBody
from fundus.scraping.html import HTML

logger = create_logger(__name__)


@dataclass(frozen=True)
class Article:
@@ -54,7 +56,7 @@ def lang(self) -> Optional[str]:
try:
language = langdetect.detect(self.plaintext)
except langdetect.LangDetectException:
basic_logger.debug(f"Unable to detect language for article '{self.html.responded_url}'")
logger.debug(f"Unable to detect language for article '{self.html.responded_url}'")

# use @lang attribute of <html> tag as fallback
if not language or language == langdetect.detector_factory.Detector.UNKNOWN_LANG:
8 changes: 5 additions & 3 deletions src/fundus/scraping/crawler.py
@@ -35,7 +35,7 @@
from tqdm import tqdm
from typing_extensions import ParamSpec, TypeAlias

from fundus.logging import basic_logger
from fundus.logging import create_logger
from fundus.publishers.base_objects import PublisherCollectionMeta, PublisherEnum
from fundus.scraping.article import Article
from fundus.scraping.delay import Delay
@@ -45,6 +45,8 @@
from fundus.scraping.session import session_handler
from fundus.scraping.url import URLSource

logger = create_logger(__name__)

_T = TypeVar("_T")
_P = ParamSpec("_P")

@@ -190,15 +192,15 @@ def build_extraction_filter() -> Optional[ExtractionFilter]:
)
)
if missing_attributes := extraction_filter.required_attributes - supported_attributes:
basic_logger.warning(
logger.warning(
f"The required attribute(s) `{', '.join(missing_attributes)}` "
f"is(are) not supported by {publisher.publisher_name}. Skipping publisher"
)
else:
fitting_publishers.append(publisher)

if not fitting_publishers:
basic_logger.error(
logger.error(
f"Could not find any fitting publishers for required attributes "
f"`{', '.join(extraction_filter.required_attributes)}`"
)
28 changes: 15 additions & 13 deletions src/fundus/scraping/html.py
@@ -10,7 +10,7 @@
from fastwarc import ArchiveIterator, WarcRecord, WarcRecordType
from requests import ConnectionError, HTTPError

from fundus.logging import basic_logger
from fundus.logging import create_logger
from fundus.publishers.base_objects import PublisherEnum
from fundus.scraping.delay import Delay
from fundus.scraping.filter import URLFilter
@@ -29,6 +29,8 @@
from fundus.scraping.session import session_handler
from fundus.scraping.url import URLSource

logger = create_logger(__name__)


@dataclass(frozen=True)
class HTML:
@@ -97,11 +99,11 @@ def filter_url(u: str) -> bool:
timestamp = time.time()

if not validators.url(url):
basic_logger.debug(f"Skipped requested URL '{url}' because the URL is malformed")
logger.debug(f"Skipped requested URL '{url}' because the URL is malformed")
continue

if filter_url(url):
basic_logger.debug(f"Skipped requested URL '{url}' because of URL filter")
logger.debug(f"Skipped requested URL '{url}' because of URL filter")
continue

session = session_handler.get_session()
@@ -114,23 +116,23 @@ def filter_url(u: str) -> bool:
response = session.get(url, headers=self.request_header)

except (HTTPError, ConnectionError) as error:
basic_logger.info(f"Skipped requested URL '{url}' because of '{error}'")
logger.info(f"Skipped requested URL '{url}' because of '{error}'")
if isinstance(error, HTTPError) and error.response.status_code >= 500:
basic_logger.info(f"Skipped {self.publisher} due to server errors: '{error}'")
logger.info(f"Skipped {self.publisher} due to server errors: '{error}'")
continue

except Exception as error:
basic_logger.warning(f"Warning! Skipped requested URL '{url}' because of an unexpected error {error}")
logger.warning(f"Warning! Skipped requested URL '{url}' because of an unexpected error {error}")
continue

else:
if filter_url(str(response.url)):
basic_logger.debug(f"Skipped responded URL '{str(response.url)}' because of URL filter")
logger.debug(f"Skipped responded URL '{str(response.url)}' because of URL filter")
continue
html = response.text

if response.history:
basic_logger.info(f"Got redirected {len(response.history)} time(s) from {url} -> {response.url}")
logger.info(f"Got redirected {len(response.history)} time(s) from {url} -> {response.url}")

source_info = (
WebSourceInfo(self.publisher, type(self.url_source).__name__, self.url_source.url)
@@ -167,20 +169,20 @@ def extract_content(record: WarcRecord) -> Optional[str]:
encoding: Optional[str] = chardet.detect(warc_body)["encoding"]

if encoding is not None:
basic_logger.debug(
logger.debug(
f"Trying to decode record {record.record_id!r} from {target_url!r} "
f"using detected encoding {encoding}."
)

try:
return str(warc_body, encoding=encoding)
except UnicodeDecodeError:
basic_logger.warning(
logger.warning(
f"Couldn't decode record {record.record_id!r} from {target_url!r} with "
f"original charset {record.http_charset!r} using detected charset {encoding!r}."
)
else:
basic_logger.warning(
logger.warning(
f"Couldn't detect charset for record {record.record_id!r} from {target_url!r} "
f"with invalid original charset {record.http_charset!r}."
)
@@ -194,7 +196,7 @@ def extract_content(record: WarcRecord) -> Optional[str]:
target_url = str(warc_record.headers["WARC-Target-URI"])

if url_filter is not None and url_filter(target_url):
basic_logger.debug(f"Skipped WARC record with target URI {target_url!r} because of URL filter")
logger.debug(f"Skipped WARC record with target URI {target_url!r} because of URL filter")
continue

publisher_domain: str = urlparse(target_url).netloc
@@ -205,7 +207,7 @@
publisher = self._publisher_mapping[publisher_domain]

if publisher.url_filter is not None and publisher.url_filter(target_url):
basic_logger.debug(
logger.debug(
f"Skipped WARC record with target URI {target_url!r} because of "
f"publisher specific URL filter"
)
14 changes: 7 additions & 7 deletions src/fundus/scraping/scraper.py
@@ -2,7 +2,7 @@

import more_itertools

from fundus.logging import basic_logger
from fundus.logging import create_logger
from fundus.parser import ParserProxy
from fundus.publishers.base_objects import PublisherEnum
from fundus.scraping.article import Article
@@ -15,6 +15,8 @@
from fundus.scraping.html import CCNewsSource, HTMLSource, WebSource
from fundus.scraping.url import URLSource

logger = create_logger(__name__)


class BaseScraper:
def __init__(self, *sources: HTMLSource, parser_mapping: Dict[str, ParserProxy]):
@@ -37,27 +39,25 @@ def scrape(
except Exception as err:
if error_handling == "raise":
error_message = f"Run into an error processing article '{html.requested_url}'"
basic_logger.error(error_message)
logger.error(error_message)
err.args = (str(err) + "\n\n" + error_message,)
raise err
elif error_handling == "catch":
yield Article(html=html, exception=err)
elif error_handling == "suppress":
basic_logger.info(f"Skipped article at '{html.requested_url}' because of: {err!r}")
logger.info(f"Skipped article at '{html.requested_url}' because of: {err!r}")
else:
raise ValueError(f"Unknown value '{error_handling}' for parameter <error_handling>'")

else:
if extraction_filter and (filter_result := extraction_filter(extraction)):
if isinstance(filter_result, FilterResultWithMissingAttributes):
basic_logger.debug(
logger.debug(
f"Skipped article at '{html.requested_url}' because attribute(s) "
f"{', '.join(filter_result.missing_attributes)!r} is(are) missing"
)
else:
basic_logger.debug(
f"Skipped article at '{html.requested_url}' because of extraction filter"
)
logger.debug(f"Skipped article at '{html.requested_url}' because of extraction filter")
else:
article = Article.from_extracted(html=html, extracted=extraction)
yield article
10 changes: 6 additions & 4 deletions src/fundus/scraping/session.py
@@ -5,7 +5,9 @@
import requests.adapters
from typing_extensions import Self

from fundus.logging import basic_logger
from fundus.logging import create_logger

logger = create_logger(__name__)

_default_header = {"user-agent": "FundusBot"}

@@ -40,14 +42,14 @@ def _session_factory(self) -> requests.Session:
A new requests.Session
"""

basic_logger.debug("Creating new session")
logger.debug("Creating new session")
session = requests.Session()

def _response_log(response: requests.Response, *args, **kwargs) -> None:
history = response.history
previous_status_codes = [f"({response.status_code})" for response in history] if history else []
status_code_chain = " -> ".join(previous_status_codes + [f"({response.status_code})"])
basic_logger.debug(
logger.debug(
f"{status_code_chain} <{response.request.method} {response.url!r}> "
f"took {response.elapsed.total_seconds()} second(s)"
)
@@ -92,7 +94,7 @@ def close_current_session(self) -> None:
"""
if self.session is not None:
session = self.get_session()
basic_logger.debug(f"Close session {session}")
logger.debug(f"Close session {session}")
session.close()
self.session = None

16 changes: 9 additions & 7 deletions src/fundus/scraping/url.py
@@ -12,10 +12,12 @@
from lxml.etree import XPath
from requests import ConnectionError, HTTPError

from fundus.logging import basic_logger
from fundus.logging import create_logger
from fundus.scraping.filter import URLFilter, inverse
from fundus.scraping.session import _default_header, session_handler

logger = create_logger(__name__)


class _ArchiveDecompressor:
def __init__(self):
@@ -45,7 +47,7 @@ def __post_init__(self):
if not self._request_header:
self._request_header = _default_header
if not validators.url(self.url, strict_query=False):
basic_logger.error(f"{type(self).__name__} initialized with invalid URL {self.url}")
logger.error(f"{type(self).__name__} initialized with invalid URL {self.url}")

def set_header(self, request_header: Dict[str, str]) -> None:
self._request_header = request_header
@@ -77,12 +79,12 @@ def __iter__(self) -> Iterator[str]:
try:
response = session.get(self.url, headers=self._request_header)
except HTTPError as err:
basic_logger.warning(f"Warning! Couldn't parse rss feed '{self.url}' because of {err}")
logger.warning(f"Warning! Couldn't parse rss feed '{self.url}' because of {err}")
return
html = response.text
rss_feed = feedparser.parse(html)
if exception := rss_feed.get("bozo_exception"):
basic_logger.warning(f"Warning! Couldn't parse rss feed '{self.url}' because of {exception}")
logger.warning(f"Warning! Couldn't parse rss feed '{self.url}' because of {exception}")
return
else:
for url in (entry["link"] for entry in rss_feed["entries"]):
@@ -103,17 +105,17 @@
def yield_recursive(sitemap_url: str) -> Iterator[str]:
session = session_handler.get_session()
if not validators.url(sitemap_url):
basic_logger.info(f"Skipped sitemap '{sitemap_url}' because the URL is malformed")
logger.info(f"Skipped sitemap '{sitemap_url}' because the URL is malformed")
try:
response = session.get(url=sitemap_url, headers=self._request_header)
except (HTTPError, ConnectionError) as error:
basic_logger.warning(f"Warning! Couldn't reach sitemap '{sitemap_url}' because of {error}")
logger.warning(f"Warning! Couldn't reach sitemap '{sitemap_url}' because of {error}")
return
content = response.content
if (content_type := response.headers.get("content-type")) in self._decompressor.supported_file_formats:
content = self._decompressor.decompress(content, content_type)
if not content:
basic_logger.warning(f"Warning! Empty sitemap at '{sitemap_url}'")
logger.warning(f"Warning! Empty sitemap at '{sitemap_url}'")
return
tree = lxml.html.fromstring(content)
urls = [node.text_content() for node in self._url_selector(tree)]
