Move the new API off of the legacy parsers, extract inits to helper functions

The bridging of the legacy parsers and the new results turns out to be pretty mediocre. Create similar but better-typed matcher classes, with a slightly different API: they return `None` on a match failure instead of a triplet, which makes them compose better in iterations (e.g. one can just `filter` them out). Also add a `Matchers` alias to carry them around (a tuple of lists of matchers) for convenience. The UA matcher now supports `patch_minor`, though that requires excluding that field from the tests as there are apparently broken test cases around that item. Also clarify the replacer rules, and hopefully implement them more clearly. Rename the old "types" module to "core" to better clarify what it's about, and put the matchers there, since even non-basic parsers would likely need them both to generate the regex groups and to do the final extraction. Finally, move the loaders off of the basic parser and into their own module, hopefully cleaner and more reusable than having them embedded a bit ad hoc as classmethods. Instead, have the basic parser just take a `Matchers` directly. This does require updating the tests as the parser now takes a triple (instead of 3 lists), but I think the regularity is a gain.
ua-parser · Nov 2, 2023 · dbcee8c · dbcee8c
1 parent 10a9ed3
commit dbcee8c
Show file tree

Hide file tree

Showing 9 changed files with 339 additions and 123 deletions.
diff --git a/README.rst b/README.rst
@@ -35,7 +35,8 @@ Retrieve all data on a user-agent string
     ParseResult(user_agent=UserAgent(family='Chrome',
                                      major='41',
                                      minor='0',
-                                     patch='2272'),
+                                     patch='2272',
+                                     patch_minor='104'),
                 os=OS(family='Mac OS X',
                       major='10',
                       minor='9',
@@ -59,7 +60,7 @@ Extract only browser data from user-agent string
     >>> from ua_parser import parse_user_agent
     >>> ua_string = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.104 Safari/537.36'
     >>> parse_user_agent(ua_string)
-    UserAgent(family='Chrome', major='41', minor='0', patch='2272')
+    UserAgent(family='Chrome', major='41', minor='0', patch='2272', patch_minor='104')
 
 For specific domains, a match failure just returns ``None``::
 

diff --git a/src/ua_parser/__init__.py b/src/ua_parser/__init__.py
@@ -19,9 +19,10 @@
 VERSION = (1, 0, 0)
 
 from typing import Optional
-from .types import *
+from .core import *
 from .basic import Parser as BasicParser
 from .caching import CachingParser, Clearing, LRU
+from .loaders import load_builtins, load_data, load_yaml
 
 
 parser: Parser
@@ -31,7 +32,7 @@ def __getattr__(name):
     global parser
     if name == "parser":
         parser = CachingParser(
-            BasicParser.from_regexes(),
+            BasicParser(load_builtins()),
             LRU(200),
         )
         return parser

diff --git a/src/ua_parser/basic.py b/src/ua_parser/basic.py
@@ -3,133 +3,60 @@
 import io
 import os
 from itertools import starmap
-from typing import *
 from operator import methodcaller
+from typing import List
 
-from dataclasses import dataclass
-from .types import (
+from .core import (
     Parser as BaseParser,
     PartialParseResult,
     Domain,
     UserAgent,
     OS,
     Device,
+    Matchers,
+    UserAgentMatcher,
+    OSMatcher,
+    DeviceMatcher,
 )
-from ._legacy import UserAgentParser, OSParser, DeviceParser
 
-load: Optional[Callable]
-SafeLoader: Optional[Type]
-try:
-    from yaml import load, CSafeLoader as SafeLoader
-except ImportError:
-    try:
-        from yaml import load, SafeLoader
-    except ImportError:
-        load = SafeLoader = None
 
-
-@dataclass
 class Parser(BaseParser):
     """A simple pure-python parser based around trying a numer of regular
     expressions in sequence for each domain, and returning a result
     when one matches.
-
-    Can be initialised by passing in custom parsing rules directly,
-    though should usually be instantiated using :meth:`~.from_yaml`
-    instead.
     """
 
-    user_agent_parsers: List[UserAgentParser]
-    os_parsers: List[OSParser]
-    device_parsers: List[DeviceParser]
-
-    @classmethod
-    def from_regexes(cls) -> Parser:
-        """Instantiates a parser from the pre-compiled regex set. Currently
-        not a singleton, but essentially free anyway after the initial
-        call (which loads the pre-compiled code).
-        """
-        from ._regexes import USER_AGENT_PARSERS, DEVICE_PARSERS, OS_PARSERS
-
-        return cls(
-            user_agent_parsers=USER_AGENT_PARSERS,
-            os_parsers=OS_PARSERS,
-            device_parsers=DEVICE_PARSERS,
-        )
-
-    if load:
+    user_agent_parsers: List[UserAgentMatcher]
+    os_parsers: List[OSMatcher]
+    device_parsers: List[DeviceMatcher]
 
-        @classmethod
-        def from_yaml(cls, path: Union[str, os.PathLike | io.IOBase]) -> Parser:
-            """Instantiates a parser from a YAML file-like object or path.
-
-            The data should follow the `regexes.yaml
-            <https://github.com/ua-parser/uap-core/blob/master/docs/specification.md#regexesyaml>`_
-            format. Note that because yaml is a superset of json, the
-            rules can just be in json.
-
-            Requires ``pyyaml``, and every call will read and reload
-            the data anew.
-
-            """
-            if isinstance(path, (str, os.PathLike)):
-                with open(path) as fp:
-                    regexes = load(fp, Loader=SafeLoader)  # type: ignore
-            else:
-                regexes = load(path, Loader=SafeLoader)  # type: ignore
-
-            return cls(
-                user_agent_parsers=[
-                    UserAgentParser(
-                        p["regex"],
-                        p.get("family_replacement"),
-                        p.get("v1_replacement"),
-                        p.get("v2_replacement"),
-                    )
-                    for p in regexes["user_agent_parsers"]
-                ],
-                os_parsers=[
-                    OSParser(
-                        p["regex"],
-                        p.get("os_replacement"),
-                        p.get("os_v1_replacement"),
-                        p.get("os_v2_replacement"),
-                        p.get("os_v3_replacement"),
-                        p.get("os_v4_replacement"),
-                    )
-                    for p in regexes["os_parsers"]
-                ],
-                device_parsers=[
-                    DeviceParser(
-                        p["regex"],
-                        p.get("regex_flag"),
-                        p.get("device_replacement"),
-                        p.get("brand_replacement"),
-                        p.get("model_replacement"),
-                    )
-                    for p in regexes["device_parsers"]
-                ],
-            )
+    def __init__(
+        self,
+        matchers: Matchers,
+    ) -> None:
+        self.user_agent_parsers = matchers[0]
+        self.os_parsers = matchers[1]
+        self.device_parsers = matchers[2]
 
     def __call__(self, ua: str, domains: Domain, /) -> PartialParseResult:
-        parse = methodcaller("Parse", ua)
+        parse = methodcaller("__call__", ua)
         return PartialParseResult(
             domains=domains,
             string=ua,
             user_agent=next(
-                (UserAgent(*m) for m in map(parse, self.user_agent_parsers) if m[0]),
+                filter(None, map(parse, self.user_agent_parsers)),
                 None,
             )
             if Domain.USER_AGENT in domains
             else None,
             os=next(
-                (OS(*m) for m in map(parse, self.os_parsers) if m[0]),
+                filter(None, map(parse, self.os_parsers)),
                 None,
             )
             if Domain.OS in domains
             else None,
             device=next(
-                (Device(*m) for m in map(parse, self.device_parsers) if m[0]),
+                filter(None, map(parse, self.device_parsers)),
                 None,
             )
             if Domain.DEVICE in domains

diff --git a/src/ua_parser/caching.py b/src/ua_parser/caching.py
@@ -1,7 +1,8 @@
 import abc
 from collections import OrderedDict
 from typing import Dict, Optional, MutableMapping
-from .types import Parser, Domain, PartialParseResult
+
+from .core import Parser, Domain, PartialParseResult
 
 
 __all__ = [

diff --git a/src/ua_parser/types.py → src/ua_parser/core.py b/src/ua_parser/types.py → src/ua_parser/core.py
@@ -1,8 +1,8 @@
 import abc
-
-from dataclasses import dataclass
+import re
+from dataclasses import dataclass, fields
 from enum import Flag, auto
-from typing import *
+from typing import Literal, Optional, Tuple, List
 
 __all__ = [
     "UserAgent",
@@ -13,6 +13,10 @@
     "Parser",
     "Domain",
     "PartialParseResult",
+    "Matchers",
+    "UserAgentMatcher",
+    "OSMatcher",
+    "DeviceMatcher",
 ]
 
 
@@ -26,6 +30,7 @@ class UserAgent:
     major: Optional[str] = None
     minor: Optional[str] = None
     patch: Optional[str] = None
+    patch_minor: Optional[str] = None
 
 
 @dataclass(frozen=True)
@@ -177,3 +182,147 @@ def parse_device(self, ua: str) -> Optional[Device]:
         back to the default value in case of failure.
         """
         return self(ua, Domain.DEVICE).device
+
+
+def _get(m: re.Match, idx: int) -> Optional[str]:
+    return (m[idx] or None) if 0 < idx <= m.re.groups else None
+
+
+def _replacer(repl: str, m: re.Match) -> Optional[str]:
+    """The replacement rules are frustratingly subtle and innimical to
+    standard fallbacks:
+
+    - if there is a non-null replacement pattern, then it must be used with
+      match groups as template parameters (at indices 1+)
+      - the result is stripped
+      - if it is an empty string, then it's replaced by a null
+    - otherwise fallback to a (possibly optional) match group
+    - or null (device brand has no fallback)
+
+    Replacement rules only apply to OS and Device matchers, the UA
+    matcher has bespoke replacement semantics for the family (just
+    $1), and no replacement for the other fields, either there is a
+    static replacement or it falls back to the corresponding
+    (optional) match group.
+
+    """
+    if not repl:
+        return None
+
+    return re.sub(r"\$(\d)", lambda n: _get(m, int(n[1])) or "", repl).strip() or None
+
+
+class UserAgentMatcher:
+    regex: re.Pattern
+    family: str
+    major: Optional[str]
+    minor: Optional[str]
+    patch: Optional[str]
+    patch_minor: Optional[str]
+
+    def __init__(
+        self,
+        regex: str,
+        family: Optional[str] = None,
+        major: Optional[str] = None,
+        minor: Optional[str] = None,
+        patch: Optional[str] = None,
+        patch_minor: Optional[str] = None,
+    ) -> None:
+        self.regex = re.compile(regex)
+        self.family = family or "$1"
+        self.major = major
+        self.minor = minor
+        self.patch = patch
+        self.patch_minor = patch_minor
+
+    def __call__(self, ua: str) -> Optional[UserAgent]:
+        if m := self.regex.search(ua):
+            return UserAgent(
+                family=self.family.replace("$1", m[1])
+                if "$1" in self.family
+                else self.family,
+                major=self.major or _get(m, 2),
+                minor=self.minor or _get(m, 3),
+                patch=self.patch or _get(m, 4),
+                patch_minor=self.patch_minor or _get(m, 5),
+            )
+        return None
+
+
+class OSMatcher:
+    regex: re.Pattern
+    family: str
+    major: str
+    minor: str
+    patch: str
+    patch_minor: str
+
+    def __init__(
+        self,
+        regex: str,
+        family: Optional[str] = None,
+        major: Optional[str] = None,
+        minor: Optional[str] = None,
+        patch: Optional[str] = None,
+        patch_minor: Optional[str] = None,
+    ) -> None:
+        self.regex = re.compile(regex)
+        self.family = family or "$1"
+        self.major = major or "$2"
+        self.minor = minor or "$3"
+        self.patch = patch or "$4"
+        self.patch_minor = patch_minor or "$5"
+
+    def __call__(self, ua: str) -> Optional[OS]:
+        if m := self.regex.search(ua):
+            family = _replacer(self.family, m)
+            if family is None:
+                raise ValueError(f"Unable to find OS family in {ua}")
+            return OS(
+                family=family,
+                major=_replacer(self.major, m),
+                minor=_replacer(self.minor, m),
+                patch=_replacer(self.patch, m),
+                patch_minor=_replacer(self.patch_minor, m),
+            )
+        return None
+
+
+class DeviceMatcher:
+    regex: re.Pattern
+    family: str
+    brand: str
+    model: str
+
+    def __init__(
+        self,
+        regex: str,
+        regex_flag: Optional[Literal["i"]] = None,
+        family: Optional[str] = None,
+        brand: Optional[str] = None,
+        model: Optional[str] = None,
+    ) -> None:
+        self.regex = re.compile(regex, flags=re.IGNORECASE if regex_flag == "i" else 0)
+        self.family = family or "$1"
+        self.brand = brand or ""
+        self.model = model or "$1"
+
+    def __call__(self, ua: str) -> Optional[Device]:
+        if m := self.regex.search(ua):
+            family = _replacer(self.family, m)
+            if family is None:
+                raise ValueError(f"Unable to find device family in {ua}")
+            return Device(
+                family=family,
+                brand=_replacer(self.brand, m),
+                model=_replacer(self.model, m),
+            )
+        return None
+
+
+Matchers = Tuple[
+    List[UserAgentMatcher],
+    List[OSMatcher],
+    List[DeviceMatcher],
+]