diff --git a/setup.py b/setup.py index d33bc82..5730379 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # flake8: noqa import io -from contextlib import suppress +from contextlib import suppress, contextmanager from os import fspath from pathlib import Path from typing import Optional, List, Dict @@ -52,21 +52,6 @@ def run(self) -> None: f"Unable to find regexes.yaml, should be at {yaml_src!r}" ) - def write_matcher(f, typ: str, fields: List[Optional[object]]): - f.write(f" {typ}(".encode()) - while len(fields) > 1 and fields[-1] is None: - fields = fields[:-1] - f.write(", ".join(map(repr, fields)).encode()) - f.write(b"),\n") - - def write_params(fields): - # strip trailing None values - while len(fields) > 1 and fields[-1] is None: - fields.pop() - - for field in fields: - fp.write((f" {field!r},\n").encode()) - with yaml_src.open("rb") as f: regexes = yaml.safe_load(f) @@ -79,96 +64,150 @@ def write_params(fields): outdir.mkdir(parents=True, exist_ok=True) dest = outdir / "_matchers.py" + dest_lazy = outdir / "_lazy.py" dest_legacy = outdir / "_regexes.py" - with dest.open("wb") as f, dest_legacy.open("wb") as fp: - # fmt: off - f.write(b"""\ + with dest.open("wb") as eager, dest_lazy.open("wb") as lazy, dest_legacy.open( + "wb" + ) as legacy: + eager = EagerWriter(eager) + lazy = LazyWriter(lazy) + legacy = LegacyWriter(legacy) + + for section in ["user_agent_parsers", "os_parsers", "device_parsers"]: + with eager.section(section), lazy.section(section), legacy.section( + section + ): + extract = EXTRACTORS[section] + for p in regexes[section]: + el = trim(extract(p)) + eager.item(el) + lazy.item(el) + legacy.item(el) + eager.end() + lazy.end() + legacy.end() + + +def trim(l): + while len(l) > 1 and l[-1] is None: + l.pop() + return l + + +EXTRACTORS = { + "user_agent_parsers": lambda p: [ + p["regex"], + p.get("family_replacement"), + p.get("v1_replacement"), + p.get("v2_replacement"), + ], + "os_parsers": lambda p: [ + p["regex"], + p.get("os_replacement"), + p.get("os_v1_replacement"), + p.get("os_v2_replacement"), + p.get("os_v3_replacement"), + p.get("os_v4_replacement"), + ], + "device_parsers": lambda p: [ + p["regex"], + p.get("regex_flag"), + p.get("device_replacement"), + p.get("brand_replacement"), + p.get("model_replacement"), + ], +} + + +class Writer: + section_end = b"" + + def __init__(self, fp): + self.fp = fp + self.fp.write( + b"""\ ######################################################## # NOTICE: this file is autogenerated from regexes.yaml # ######################################################## +""" + ) + self.fp.write(self.prefix) + self._section = None + + @contextmanager + def section(self, id): + self._section = id + self.fp.write(self.sections[id]) + yield + self.fp.write(self.section_end) + + def item(self, elements): + # DeviceMatcher(re, flag, repl1), + self.fp.write(self.items[self._section]) + self.fp.write(", ".join(map(repr, elements)).encode()) + self.fp.write(b"),\n") + + def end(self): + self.fp.write(self.suffix) + + +class LegacyWriter(Writer): + prefix = b"""\ +__all__ = [ + "USER_AGENT_PARSERS", + "DEVICE_PARSERS", + "OS_PARSERS", +] + +from .user_agent_parser import UserAgentParser, DeviceParser, OSParser + +""" + sections = { + "user_agent_parsers": b"USER_AGENT_PARSERS = [\n", + "os_parsers": b"\n\nOS_PARSERS = [\n", + "device_parsers": b"\n\nDEVICE_PARSERS = [\n", + } + section_end = b"]" + items = { + "user_agent_parsers": b" UserAgentParser(", + "os_parsers": b" OSParser(", + "device_parsers": b" DeviceParser(", + } + suffix = b"\n" + + +class EagerWriter(Writer): + prefix = b"""\ +__all__ = ["MATCHERS"] + +from typing import Tuple, List +from .core import UserAgentMatcher, OSMatcher, DeviceMatcher + +MATCHERS: Tuple[List[UserAgentMatcher], List[OSMatcher], List[DeviceMatcher]] = ([ +""" + sections = { + "user_agent_parsers": b"", + "os_parsers": b"], [\n", + "device_parsers": b"], [\n", + } + items = { + "user_agent_parsers": b" UserAgentMatcher(", + "os_parsers": b" OSMatcher(", + "device_parsers": b" DeviceMatcher(", + } + suffix = b"])\n" + + +class LazyWriter(EagerWriter): + prefix = b"""\ +__all__ = ["MATCHERS"] + +from typing import Tuple, List +from .lazy import UserAgentMatcher, OSMatcher, DeviceMatcher -from .core import Matchers, UserAgentMatcher, OSMatcher, DeviceMatcher - -MATCHERS: Matchers = ([ -""") - fp.write(b"# -*- coding: utf-8 -*-\n") - fp.write(b"########################################################\n") - fp.write(b"# NOTICE: This file is autogenerated from regexes.yaml #\n") - fp.write(b"########################################################\n") - fp.write(b"\n") - fp.write(b"from .user_agent_parser import (\n") - fp.write(b" UserAgentParser, DeviceParser, OSParser,\n") - fp.write(b")\n") - fp.write(b"\n") - fp.write(b"__all__ = ('USER_AGENT_PARSERS', 'DEVICE_PARSERS', 'OS_PARSERS')\n") - fp.write(b"\n") - fp.write(b"USER_AGENT_PARSERS = [\n") - for device_parser in regexes["user_agent_parsers"]: - write_matcher(f, "UserAgentMatcher", [ - device_parser["regex"], - device_parser.get("family_replacement"), - device_parser.get("v1_replacement"), - device_parser.get("v2_replacement"), - ]) - - fp.write(b" UserAgentParser(\n") - write_params([ - device_parser["regex"], - device_parser.get("family_replacement"), - device_parser.get("v1_replacement"), - device_parser.get("v2_replacement"), - ]) - fp.write(b" ),\n") - f.write(b" ], [\n") - fp.write(b"]\n\n") - - fp.write(b"OS_PARSERS = [\n") - for device_parser in regexes["os_parsers"]: - write_matcher(f, "OSMatcher", [ - device_parser["regex"], - device_parser.get("os_replacement"), - device_parser.get("os_v1_replacement"), - device_parser.get("os_v2_replacement"), - device_parser.get("os_v3_replacement"), - device_parser.get("os_v4_replacement"), - ]) - - fp.write(b" OSParser(\n") - write_params([ - device_parser["regex"], - device_parser.get("os_replacement"), - device_parser.get("os_v1_replacement"), - device_parser.get("os_v2_replacement"), - device_parser.get("os_v3_replacement"), - device_parser.get("os_v4_replacement"), - ]) - fp.write(b" ),\n") - f.write(b" ], [\n") - fp.write(b"]\n\n") - - fp.write(b"DEVICE_PARSERS = [\n") - for device_parser in regexes["device_parsers"]: - write_matcher(f, "DeviceMatcher", [ - device_parser["regex"], - device_parser.get("regex_flag"), - device_parser.get("device_replacement"), - device_parser.get("brand_replacement"), - device_parser.get("model_replacement"), - ]) - - fp.write(b" DeviceParser(\n") - write_params([ - device_parser["regex"], - device_parser.get("regex_flag"), - device_parser.get("device_replacement"), - device_parser.get("brand_replacement"), - device_parser.get("model_replacement"), - ]) - fp.write(b" ),\n") - f.write(b"])\n") - fp.write(b"]\n") - # fmt: on +MATCHERS: Tuple[List[UserAgentMatcher], List[OSMatcher], List[DeviceMatcher]] = ([ +""" setup( diff --git a/src/ua_parser/__init__.py b/src/ua_parser/__init__.py index 01b73ef..46cae5b 100644 --- a/src/ua_parser/__init__.py +++ b/src/ua_parser/__init__.py @@ -36,6 +36,7 @@ "UserAgent", "UserAgentMatcher", "load_builtins", + "load_lazy_builtins", "load_data", "load_yaml", "parse", @@ -65,7 +66,7 @@ ) from .basic import Parser as BasicParser from .caching import CachingParser, Clearing, LRU, Locking -from .loaders import load_builtins, load_data, load_yaml +from .loaders import load_builtins, load_lazy_builtins, load_data, load_yaml Re2Parser: Optional[Callable[[Matchers], Parser]] = None with contextlib.suppress(ImportError): @@ -79,7 +80,7 @@ def __getattr__(name: str) -> Parser: global parser if name == "parser": if Re2Parser is not None: - parser = Re2Parser(load_builtins()) + parser = Re2Parser(load_lazy_builtins()) else: parser = CachingParser( BasicParser(load_builtins()), diff --git a/src/ua_parser/_lazy.pyi b/src/ua_parser/_lazy.pyi new file mode 100644 index 0000000..aa67478 --- /dev/null +++ b/src/ua_parser/_lazy.pyi @@ -0,0 +1,10 @@ +__all__ = ["MATCHERS"] + +from typing import Tuple, List +from .lazy import UserAgentMatcher, OSMatcher, DeviceMatcher + +MATCHERS: Tuple[ + List[UserAgentMatcher], + List[OSMatcher], + List[DeviceMatcher], +] diff --git a/src/ua_parser/_matchers.pyi b/src/ua_parser/_matchers.pyi index a27227f..da0b023 100644 --- a/src/ua_parser/_matchers.pyi +++ b/src/ua_parser/_matchers.pyi @@ -1,3 +1,10 @@ -from .core import Matchers +__all__ = ["MATCHERS"] -MATCHERS: Matchers +from typing import Tuple, List +from .core import UserAgentMatcher, OSMatcher, DeviceMatcher + +MATCHERS: Tuple[ + List[UserAgentMatcher], + List[OSMatcher], + List[DeviceMatcher], +] diff --git a/src/ua_parser/basic.py b/src/ua_parser/basic.py index bd16746..828a6cb 100644 --- a/src/ua_parser/basic.py +++ b/src/ua_parser/basic.py @@ -7,6 +7,7 @@ Device, DeviceMatcher, Domain, + Matcher, Matchers, OS, OSMatcher, @@ -23,9 +24,9 @@ class Parser(AbstractParser): when one matches. """ - user_agent_matchers: List[UserAgentMatcher] - os_matchers: List[OSMatcher] - device_matchers: List[DeviceMatcher] + user_agent_matchers: List[Matcher[UserAgent]] + os_matchers: List[Matcher[OS]] + device_matchers: List[Matcher[Device]] def __init__( self, diff --git a/src/ua_parser/core.py b/src/ua_parser/core.py index cb9947e..15d86e9 100644 --- a/src/ua_parser/core.py +++ b/src/ua_parser/core.py @@ -1,9 +1,8 @@ import abc import re -from collections.abc import Callable, Sequence -from dataclasses import dataclass, fields +from dataclasses import dataclass from enum import Flag, auto -from typing import Literal, Optional, Tuple, List, TypeVar, Match, Pattern +from typing import Generic, Literal, Optional, Tuple, List, TypeVar, Match, Pattern __all__ = [ "DefaultedParseResult", @@ -221,7 +220,23 @@ def _replacer(repl: str, m: Match[str]) -> Optional[str]: return re.sub(r"\$(\d)", lambda n: _get(m, int(n[1])) or "", repl).strip() or None -class UserAgentMatcher: +T = TypeVar("T") + + +class Matcher(abc.ABC, Generic[T]): + @abc.abstractmethod + def __call__(self, ua: str) -> Optional[T]: ... + + @property + @abc.abstractmethod + def pattern(self) -> str: ... + + @property + def flags(self) -> int: + return 0 + + +class UserAgentMatcher(Matcher[UserAgent]): regex: Pattern[str] family: str major: Optional[str] @@ -260,6 +275,10 @@ def __call__(self, ua: str) -> Optional[UserAgent]: ) return None + @property + def pattern(self) -> str: + return self.regex.pattern + def __repr__(self) -> str: fields = [ ("family", self.family if self.family != "$1" else None), @@ -270,10 +289,10 @@ def __repr__(self) -> str: ] args = "".join(f", {k}={v!r}" for k, v in fields if v is not None) - return f"UserAgentMatcher({self.regex.pattern!r}{args})" + return f"UserAgentMatcher({self.pattern!r}{args})" -class OSMatcher: +class OSMatcher(Matcher[OS]): regex: Pattern[str] family: str major: str @@ -311,6 +330,10 @@ def __call__(self, ua: str) -> Optional[OS]: ) return None + @property + def pattern(self) -> str: + return self.regex.pattern + def __repr__(self) -> str: fields = [ ("family", self.family if self.family != "$1" else None), @@ -321,10 +344,10 @@ def __repr__(self) -> str: ] args = "".join(f", {k}={v!r}" for k, v in fields if v is not None) - return f"OSMatcher({self.regex.pattern!r}{args})" + return f"OSMatcher({self.pattern!r}{args})" -class DeviceMatcher: +class DeviceMatcher(Matcher[Device]): regex: Pattern[str] family: str brand: str @@ -355,20 +378,28 @@ def __call__(self, ua: str) -> Optional[Device]: ) return None + @property + def pattern(self) -> str: + return self.regex.pattern + + @property + def flags(self) -> int: + return self.regex.flags + def __repr__(self) -> str: fields = [ ("family", self.family if self.family != "$1" else None), ("brand", self.brand or None), ("model", self.model if self.model != "$1" else None), ] - iflag = ', "i"' if self.regex.flags & re.IGNORECASE else "" + iflag = ', "i"' if self.flags & re.IGNORECASE else "" args = iflag + "".join(f", {k}={v!r}" for k, v in fields if v is not None) - return f"DeviceMatcher({self.regex.pattern!r}{args})" + return f"DeviceMatcher({self.pattern!r}{args})" Matchers = Tuple[ - List[UserAgentMatcher], - List[OSMatcher], - List[DeviceMatcher], + List[Matcher[UserAgent]], + List[Matcher[OS]], + List[Matcher[Device]], ] diff --git a/src/ua_parser/lazy.py b/src/ua_parser/lazy.py new file mode 100644 index 0000000..fb24735 --- /dev/null +++ b/src/ua_parser/lazy.py @@ -0,0 +1,167 @@ +__all__ = ["UserAgentMatcher", "OSMatcher", "DeviceMatcher"] + +import re +from functools import cached_property +from typing import Literal, Optional, Pattern + +from .core import Matcher, UserAgent, OS, Device, _replacer, _get + + +class UserAgentMatcher(Matcher[UserAgent]): + pattern: str = "" + family: str + major: Optional[str] + minor: Optional[str] + patch: Optional[str] + patch_minor: Optional[str] + + def __init__( + self, + regex: str, + family: Optional[str] = None, + major: Optional[str] = None, + minor: Optional[str] = None, + patch: Optional[str] = None, + patch_minor: Optional[str] = None, + ) -> None: + self.pattern = regex + self.family = family or "$1" + self.major = major + self.minor = minor + self.patch = patch + self.patch_minor = patch_minor + + def __call__(self, ua: str) -> Optional[UserAgent]: + if m := self.regex.search(ua): + return UserAgent( + family=( + self.family.replace("$1", m[1]) + if "$1" in self.family + else self.family + ), + major=self.major or _get(m, 2), + minor=self.minor or _get(m, 3), + patch=self.patch or _get(m, 4), + patch_minor=self.patch_minor or _get(m, 5), + ) + return None + + @cached_property + def regex(self) -> Pattern[str]: + return re.compile(self.pattern) + + def __repr__(self) -> str: + fields = [ + ("family", self.family if self.family != "$1" else None), + ("major", self.major), + ("minor", self.minor), + ("patch", self.patch), + ("patch_minor", self.patch_minor), + ] + args = "".join(f", {k}={v!r}" for k, v in fields if v is not None) + + return f"UserAgentMatcher({self.pattern!r}{args})" + + +class OSMatcher(Matcher[OS]): + pattern: str = "" + family: str + major: str + minor: str + patch: str + patch_minor: str + + def __init__( + self, + regex: str, + family: Optional[str] = None, + major: Optional[str] = None, + minor: Optional[str] = None, + patch: Optional[str] = None, + patch_minor: Optional[str] = None, + ) -> None: + self.pattern = regex + self.family = family or "$1" + self.major = major or "$2" + self.minor = minor or "$3" + self.patch = patch or "$4" + self.patch_minor = patch_minor or "$5" + + def __call__(self, ua: str) -> Optional[OS]: + if m := self.regex.search(ua): + family = _replacer(self.family, m) + if family is None: + raise ValueError(f"Unable to find OS family in {ua}") + return OS( + family=family, + major=_replacer(self.major, m), + minor=_replacer(self.minor, m), + patch=_replacer(self.patch, m), + patch_minor=_replacer(self.patch_minor, m), + ) + return None + + @cached_property + def regex(self) -> Pattern[str]: + return re.compile(self.pattern) + + def __repr__(self) -> str: + fields = [ + ("family", self.family if self.family != "$1" else None), + ("major", self.major if self.major != "$2" else None), + ("minor", self.minor if self.minor != "$3" else None), + ("patch", self.patch if self.patch != "$4" else None), + ("patch_minor", self.patch_minor if self.patch_minor != "$5" else None), + ] + args = "".join(f", {k}={v!r}" for k, v in fields if v is not None) + + return f"OSMatcher({self.pattern!r}{args})" + + +class DeviceMatcher(Matcher[Device]): + pattern: str = "" + flags: int = 0 + family: str + brand: str + model: str + + def __init__( + self, + regex: str, + regex_flag: Optional[Literal["i"]] = None, + family: Optional[str] = None, + brand: Optional[str] = None, + model: Optional[str] = None, + ) -> None: + self.pattern = regex + self.flags = re.IGNORECASE if regex_flag == "i" else 0 + self.family = family or "$1" + self.brand = brand or "" + self.model = model or "$1" + + def __call__(self, ua: str) -> Optional[Device]: + if m := self.regex.search(ua): + family = _replacer(self.family, m) + if family is None: + raise ValueError(f"Unable to find device family in {ua}") + return Device( + family=family, + brand=_replacer(self.brand, m), + model=_replacer(self.model, m), + ) + return None + + @cached_property + def regex(self) -> Pattern[str]: + return re.compile(self.pattern, flags=self.flags) + + def __repr__(self) -> str: + fields = [ + ("family", self.family if self.family != "$1" else None), + ("brand", self.brand or None), + ("model", self.model if self.model != "$1" else None), + ] + iflag = ', "i"' if self.flags & re.IGNORECASE else "" + args = iflag + "".join(f", {k}={v!r}" for k, v in fields if v is not None) + + return f"DeviceMatcher({self.pattern!r}{args})" diff --git a/src/ua_parser/loaders.py b/src/ua_parser/loaders.py index 2e9718b..7743117 100644 --- a/src/ua_parser/loaders.py +++ b/src/ua_parser/loaders.py @@ -2,6 +2,7 @@ __all__ = [ "load_builtins", + "load_lazy_builtins", "load_data", "load_yaml", "MatchersData", @@ -14,19 +15,20 @@ import json import os from typing import ( - Any, Callable, - Dict, List, + Literal, Optional, + Protocol, Tuple, Type, - Union, TypedDict, - Literal, + Union, TYPE_CHECKING, + cast, ) +from . import lazy from .core import Matchers, UserAgentMatcher, OSMatcher, DeviceMatcher if TYPE_CHECKING: @@ -44,7 +46,15 @@ def load_builtins() -> Matchers: from ._matchers import MATCHERS - return MATCHERS + # typing and mypy don't have safe upcast (#5756) and mypy is + # unhappy about returning concrete matchers for a mixed type + return cast(Matchers, MATCHERS) + + +def load_lazy_builtins() -> Matchers: + from ._lazy import MATCHERS + + return cast(Matchers, MATCHERS) # superclass needed to mix required & optional typed dict entries @@ -77,6 +87,7 @@ class DeviceDict(_RegexDict, total=False): MatchersData = Tuple[List[UserAgentDict], List[OSDict], List[DeviceDict]] +DataLoader = Callable[[MatchersData], Matchers] def load_data(d: MatchersData) -> Matchers: @@ -116,14 +127,57 @@ def load_data(d: MatchersData) -> Matchers: ) -def load_json(f: PathOrFile) -> Matchers: +def load_lazy(d: MatchersData) -> Matchers: + return ( + [ + lazy.UserAgentMatcher( + p["regex"], + p.get("family_replacement"), + p.get("v1_replacement"), + p.get("v2_replacement"), + p.get("v3_replacement"), + p.get("v4_replacement"), + ) + for p in d[0] + ], + [ + lazy.OSMatcher( + p["regex"], + p.get("os_replacement"), + p.get("os_v1_replacement"), + p.get("os_v2_replacement"), + p.get("os_v3_replacement"), + p.get("os_v4_replacement"), + ) + for p in d[1] + ], + [ + lazy.DeviceMatcher( + p["regex"], + p.get("regex_flag"), + p.get("device_replacement"), + p.get("brand_replacement"), + p.get("model_replacement"), + ) + for p in d[2] + ], + ) + + +class FileLoader(Protocol): + def __call__( + self, path: PathOrFile, loader: DataLoader = load_data + ) -> Matchers: ... + + +def load_json(f: PathOrFile, loader: DataLoader = load_data) -> Matchers: if isinstance(f, (str, os.PathLike)): with open(f) as fp: regexes = json.load(fp) else: regexes = json.load(f) - return load_data( + return loader( ( regexes["user_agent_parsers"], regexes["os_parsers"], @@ -132,12 +186,12 @@ def load_json(f: PathOrFile) -> Matchers: ) -load_yaml: Optional[Callable[[PathOrFile], Matchers]] +load_yaml: Optional[FileLoader] if load is None: load_yaml = None else: - def load_yaml(path: PathOrFile) -> Matchers: + def load_yaml(path: PathOrFile, loader: DataLoader = load_data) -> Matchers: if isinstance(path, (str, os.PathLike)): with open(path) as fp: regexes = load(fp, Loader=SafeLoader) # type: ignore diff --git a/src/ua_parser/re2.py b/src/ua_parser/re2.py index f9a92c4..867edfa 100644 --- a/src/ua_parser/re2.py +++ b/src/ua_parser/re2.py @@ -12,6 +12,8 @@ PartialParseResult, Device, Domain, + Matcher, + Matchers, OS, UserAgent, UserAgentMatcher, @@ -22,26 +24,26 @@ class Parser(AbstractParser): ua: re2.Filter - user_agent_matchers: List[UserAgentMatcher] + user_agent_matchers: List[Matcher[UserAgent]] os: re2.Filter - os_matchers: List[OSMatcher] + os_matchers: List[Matcher[OS]] devices: re2.Filter - device_matchers: List[DeviceMatcher] + device_matchers: List[Matcher[Device]] def __init__( self, - matchers: Tuple[List[UserAgentMatcher], List[OSMatcher], List[DeviceMatcher]], + matchers: Matchers, ) -> None: self.user_agent_matchers, self.os_matchers, self.device_matchers = matchers self.ua = re2.Filter() for u in self.user_agent_matchers: - self.ua.Add(u.regex.pattern) + self.ua.Add(u.pattern) self.ua.Compile() self.os = re2.Filter() for o in self.os_matchers: - self.os.Add(o.regex.pattern) + self.os.Add(o.pattern) self.os.Compile() self.devices = re2.Filter() @@ -49,10 +51,10 @@ def __init__( # Prepend the i global flag if IGNORECASE is set. Assumes # no pattern uses global flags, but since they're not # supported in JS that seems safe. - if d.regex.flags & re.IGNORECASE: - self.devices.Add("(?i)" + d.regex.pattern) + if d.flags & re.IGNORECASE: + self.devices.Add("(?i)" + d.pattern) else: - self.devices.Add(d.regex.pattern) + self.devices.Add(d.pattern) self.devices.Compile() def __call__(self, ua: str, domains: Domain, /) -> PartialParseResult: diff --git a/tests/test_core.py b/tests/test_core.py index 6abc06a..f92c1a2 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -30,6 +30,7 @@ ParseResult, UserAgentMatcher, load_builtins, + load_lazy_builtins, caching, ) @@ -38,6 +39,7 @@ PARSERS = [ pytest.param(BasicParser(load_builtins()), id="basic"), + pytest.param(BasicParser(load_lazy_builtins()), id="lazy"), pytest.param( caching.CachingParser( BasicParser(load_builtins()),