Add support for lazy matchers
Add lazy builtin matchers (as a separately generated file), as well as
support for loading json or yaml files using lazy matchers.

Lazy matchers are very much a tradeoff: they improve import speed (and
memory consumption until triggered), but slow down run speed, possibly
dramatically:

- importing the package itself takes ~36ms
- importing the lazy matchers takes ~36ms (including the package, so
  ~0 extra) and ~70kB RSS
- importing the eager matchers takes ~97ms and ~780kB RSS
- triggering the instantiation of the lazy matchers adds ~800kB RSS
- running the bench on the sample file with the lazy matchers adds a
  700~800ms overhead compared to the eager matchers

While the lazy matchers are less costly across the board until they're
used, benching the sample file causes *every* regex to be loaded --
likely due to matching failures -- which adds a 700~800ms overhead
over the eager matchers and increases the RSS by ~800kB (on top of the
original ~70kB).
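
What "lazy" means here, as a minimal sketch (the actual implementation
added in src/ua_parser/lazy.py is not part of this excerpt, so the
class below only illustrates the idea):

    import re
    from typing import Optional, Pattern

    class LazyMatcher:
        """Compiles its regex only when first asked to match."""

        def __init__(self, regex: str) -> None:
            self.regex = regex
            self._pattern: Optional[Pattern[str]] = None

        @property
        def pattern(self) -> Pattern[str]:
            # the compilation cost (and the compiled pattern's memory)
            # is only paid if this matcher actually gets exercised
            if self._pattern is None:
                self._pattern = re.compile(self.regex)
            return self._pattern

        def matches(self, ua: str) -> bool:
            return self.pattern.search(ua) is not None

This is why a workload that ends up exercising every matcher (like the
bench above) pays the full compilation cost anyway, plus a small
per-call indirection.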

Thus lazy matchers are not a great default for the basic parser,
though they might be a good opt-in if the user only ever uses one of
the domains (especially if it's not the devices one, as that's by far
the largest).
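
As a sketch of what that opt-in could look like (BasicParser and
load_lazy_builtins come from the __init__.py changes below; the
parse() method name is assumed here):

    from ua_parser import BasicParser, load_lazy_builtins

    # only the matchers that actually get exercised compile their regex
    parser = BasicParser(load_lazy_builtins())
    result = parser.parse(
        "Mozilla/5.0 (X11; Linux x86_64; rv:122.0) Gecko/20100101 Firefox/122.0"
    )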

With the re2 parser, however, only 156 of the 1162 regexes get
evaluated, leading to a minor CPU overhead of 20~30ms (1% of bench
time) and a more reasonable memory overhead. Thus the lazy matchers
are used for the re2 parser.

On the more net-negative but relatively minor side of things, the
pregenerated lazy matchers file adds ~120k to the on-disk requirements
of the library, and ~25k to the wheel archive -- about the same as
each of the pregenerated _regexes and _matchers files. pyc files seem
to be even bigger (~130k), so shipping those instead would be a
dubious tradeoff even if they are slightly faster to load.

Fixes #171, fixes #173
masklinn committed Feb 18, 2024
1 parent 04d0b7d commit 16c1324
Showing 10 changed files with 452 additions and 138 deletions.
239 changes: 139 additions & 100 deletions setup.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python
# flake8: noqa
import io
from contextlib import suppress
from contextlib import suppress, contextmanager
from os import fspath
from pathlib import Path
from typing import Optional, List, Dict
@@ -52,21 +52,6 @@ def run(self) -> None:
f"Unable to find regexes.yaml, should be at {yaml_src!r}"
)

def write_matcher(f, typ: str, fields: List[Optional[object]]):
f.write(f" {typ}(".encode())
while len(fields) > 1 and fields[-1] is None:
fields = fields[:-1]
f.write(", ".join(map(repr, fields)).encode())
f.write(b"),\n")

def write_params(fields):
# strip trailing None values
while len(fields) > 1 and fields[-1] is None:
fields.pop()

for field in fields:
fp.write((f" {field!r},\n").encode())

with yaml_src.open("rb") as f:
regexes = yaml.safe_load(f)

@@ -79,96 +64,150 @@ def write_params(fields):
outdir.mkdir(parents=True, exist_ok=True)

dest = outdir / "_matchers.py"
dest_lazy = outdir / "_lazy.py"
dest_legacy = outdir / "_regexes.py"

with dest.open("wb") as f, dest_legacy.open("wb") as fp:
# fmt: off
f.write(b"""\
with dest.open("wb") as eager, dest_lazy.open("wb") as lazy, dest_legacy.open(
"wb"
) as legacy:
eager = EagerWriter(eager)
lazy = LazyWriter(lazy)
legacy = LegacyWriter(legacy)

for section in ["user_agent_parsers", "os_parsers", "device_parsers"]:
with eager.section(section), lazy.section(section), legacy.section(
section
):
extract = EXTRACTORS[section]
for p in regexes[section]:
el = trim(extract(p))
eager.item(el)
lazy.item(el)
legacy.item(el)
eager.end()
lazy.end()
legacy.end()


def trim(l):
while len(l) > 1 and l[-1] is None:
l.pop()
return l


EXTRACTORS = {
"user_agent_parsers": lambda p: [
p["regex"],
p.get("family_replacement"),
p.get("v1_replacement"),
p.get("v2_replacement"),
],
"os_parsers": lambda p: [
p["regex"],
p.get("os_replacement"),
p.get("os_v1_replacement"),
p.get("os_v2_replacement"),
p.get("os_v3_replacement"),
p.get("os_v4_replacement"),
],
"device_parsers": lambda p: [
p["regex"],
p.get("regex_flag"),
p.get("device_replacement"),
p.get("brand_replacement"),
p.get("model_replacement"),
],
}


class Writer:
section_end = b""

def __init__(self, fp):
self.fp = fp
self.fp.write(
b"""\
########################################################
# NOTICE: this file is autogenerated from regexes.yaml #
########################################################
"""
)
self.fp.write(self.prefix)
self._section = None

@contextmanager
def section(self, id):
self._section = id
self.fp.write(self.sections[id])
yield
self.fp.write(self.section_end)

def item(self, elements):
# DeviceMatcher(re, flag, repl1),
self.fp.write(self.items[self._section])
self.fp.write(", ".join(map(repr, elements)).encode())
self.fp.write(b"),\n")

def end(self):
self.fp.write(self.suffix)


class LegacyWriter(Writer):
prefix = b"""\
__all__ = [
"USER_AGENT_PARSERS",
"DEVICE_PARSERS",
"OS_PARSERS",
]
from .user_agent_parser import UserAgentParser, DeviceParser, OSParser
"""
sections = {
"user_agent_parsers": b"USER_AGENT_PARSERS = [\n",
"os_parsers": b"\n\nOS_PARSERS = [\n",
"device_parsers": b"\n\nDEVICE_PARSERS = [\n",
}
section_end = b"]"
items = {
"user_agent_parsers": b" UserAgentParser(",
"os_parsers": b" OSParser(",
"device_parsers": b" DeviceParser(",
}
suffix = b"\n"


class EagerWriter(Writer):
prefix = b"""\
__all__ = ["MATCHERS"]
from typing import Tuple, List
from .core import UserAgentMatcher, OSMatcher, DeviceMatcher
MATCHERS: Tuple[List[UserAgentMatcher], List[OSMatcher], List[DeviceMatcher]] = ([
"""
sections = {
"user_agent_parsers": b"",
"os_parsers": b"], [\n",
"device_parsers": b"], [\n",
}
items = {
"user_agent_parsers": b" UserAgentMatcher(",
"os_parsers": b" OSMatcher(",
"device_parsers": b" DeviceMatcher(",
}
suffix = b"])\n"


class LazyWriter(EagerWriter):
prefix = b"""\
__all__ = ["MATCHERS"]
from typing import Tuple, List
from .lazy import UserAgentMatcher, OSMatcher, DeviceMatcher
from .core import Matchers, UserAgentMatcher, OSMatcher, DeviceMatcher
MATCHERS: Matchers = ([
""")
fp.write(b"# -*- coding: utf-8 -*-\n")
fp.write(b"########################################################\n")
fp.write(b"# NOTICE: This file is autogenerated from regexes.yaml #\n")
fp.write(b"########################################################\n")
fp.write(b"\n")
fp.write(b"from .user_agent_parser import (\n")
fp.write(b" UserAgentParser, DeviceParser, OSParser,\n")
fp.write(b")\n")
fp.write(b"\n")
fp.write(b"__all__ = ('USER_AGENT_PARSERS', 'DEVICE_PARSERS', 'OS_PARSERS')\n")
fp.write(b"\n")
fp.write(b"USER_AGENT_PARSERS = [\n")
for device_parser in regexes["user_agent_parsers"]:
write_matcher(f, "UserAgentMatcher", [
device_parser["regex"],
device_parser.get("family_replacement"),
device_parser.get("v1_replacement"),
device_parser.get("v2_replacement"),
])

fp.write(b" UserAgentParser(\n")
write_params([
device_parser["regex"],
device_parser.get("family_replacement"),
device_parser.get("v1_replacement"),
device_parser.get("v2_replacement"),
])
fp.write(b" ),\n")
f.write(b" ], [\n")
fp.write(b"]\n\n")

fp.write(b"OS_PARSERS = [\n")
for device_parser in regexes["os_parsers"]:
write_matcher(f, "OSMatcher", [
device_parser["regex"],
device_parser.get("os_replacement"),
device_parser.get("os_v1_replacement"),
device_parser.get("os_v2_replacement"),
device_parser.get("os_v3_replacement"),
device_parser.get("os_v4_replacement"),
])

fp.write(b" OSParser(\n")
write_params([
device_parser["regex"],
device_parser.get("os_replacement"),
device_parser.get("os_v1_replacement"),
device_parser.get("os_v2_replacement"),
device_parser.get("os_v3_replacement"),
device_parser.get("os_v4_replacement"),
])
fp.write(b" ),\n")
f.write(b" ], [\n")
fp.write(b"]\n\n")

fp.write(b"DEVICE_PARSERS = [\n")
for device_parser in regexes["device_parsers"]:
write_matcher(f, "DeviceMatcher", [
device_parser["regex"],
device_parser.get("regex_flag"),
device_parser.get("device_replacement"),
device_parser.get("brand_replacement"),
device_parser.get("model_replacement"),
])

fp.write(b" DeviceParser(\n")
write_params([
device_parser["regex"],
device_parser.get("regex_flag"),
device_parser.get("device_replacement"),
device_parser.get("brand_replacement"),
device_parser.get("model_replacement"),
])
fp.write(b" ),\n")
f.write(b"])\n")
fp.write(b"]\n")
# fmt: on
MATCHERS: Tuple[List[UserAgentMatcher], List[OSMatcher], List[DeviceMatcher]] = ([
"""


setup(
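
For reference, both the eager and the lazy writers above emit a module
of the following shape; the entries are made up for illustration, and
the only difference between the generated _matchers.py and _lazy.py is
whether the matcher classes are imported from .core or from .lazy:

    ########################################################
    # NOTICE: this file is autogenerated from regexes.yaml #
    ########################################################
    __all__ = ["MATCHERS"]
    from typing import Tuple, List
    from .core import UserAgentMatcher, OSMatcher, DeviceMatcher
    MATCHERS: Tuple[List[UserAgentMatcher], List[OSMatcher], List[DeviceMatcher]] = ([
        UserAgentMatcher('(Firefox)/(\\d+)\\.(\\d+)', 'Firefox'),
    ], [
        OSMatcher('(Windows NT) (\\d+)\\.(\\d+)', 'Windows'),
    ], [
        DeviceMatcher('(iPhone)', None, 'iPhone', 'Apple'),
    ])
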
5 changes: 3 additions & 2 deletions src/ua_parser/__init__.py
@@ -36,6 +36,7 @@
"UserAgent",
"UserAgentMatcher",
"load_builtins",
"load_lazy_builtins",
"load_data",
"load_yaml",
"parse",
@@ -65,7 +66,7 @@
)
from .basic import Parser as BasicParser
from .caching import CachingParser, Clearing, LRU, Locking
from .loaders import load_builtins, load_data, load_yaml
from .loaders import load_builtins, load_lazy_builtins, load_data, load_yaml

Re2Parser: Optional[Callable[[Matchers], Parser]] = None
with contextlib.suppress(ImportError):
@@ -79,7 +80,7 @@ def __getattr__(name: str) -> Parser:
global parser
if name == "parser":
if Re2Parser is not None:
parser = Re2Parser(load_builtins())
parser = Re2Parser(load_lazy_builtins())
else:
parser = CachingParser(
BasicParser(load_builtins()),
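
The upshot of the __getattr__ change is that the default parser is
only selected and built on first access; a sketch of triggering it
(assuming the Parser objects expose a parse() method):

    import ua_parser

    # first attribute access goes through __getattr__("parser"): if the
    # re2-based parser is importable this now builds
    # Re2Parser(load_lazy_builtins()), otherwise it falls back to the
    # eager CachingParser(BasicParser(load_builtins()), ...) shown above
    result = ua_parser.parser.parse(
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    )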
10 changes: 10 additions & 0 deletions src/ua_parser/_lazy.pyi
@@ -0,0 +1,10 @@
__all__ = ["MATCHERS"]

from typing import Tuple, List
from .lazy import UserAgentMatcher, OSMatcher, DeviceMatcher

MATCHERS: Tuple[
List[UserAgentMatcher],
List[OSMatcher],
List[DeviceMatcher],
]
11 changes: 9 additions & 2 deletions src/ua_parser/_matchers.pyi
@@ -1,3 +1,10 @@
from .core import Matchers
__all__ = ["MATCHERS"]

MATCHERS: Matchers
from typing import Tuple, List
from .core import UserAgentMatcher, OSMatcher, DeviceMatcher

MATCHERS: Tuple[
List[UserAgentMatcher],
List[OSMatcher],
List[DeviceMatcher],
]
7 changes: 4 additions & 3 deletions src/ua_parser/basic.py
@@ -7,6 +7,7 @@
Device,
DeviceMatcher,
Domain,
Matcher,
Matchers,
OS,
OSMatcher,
@@ -23,9 +24,9 @@ class Parser(AbstractParser):
when one matches.
"""

user_agent_matchers: List[UserAgentMatcher]
os_matchers: List[OSMatcher]
device_matchers: List[DeviceMatcher]
user_agent_matchers: List[Matcher[UserAgent]]
os_matchers: List[Matcher[OS]]
device_matchers: List[Matcher[Device]]

def __init__(
self,
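
The annotation change above means basic.Parser no longer cares whether
it is given eager or lazy matchers, only that they fit the Matcher
shape. A sketch of that shape as a protocol (the real definition lives
in core.py, which is not part of this excerpt):

    from typing import Optional, Protocol, TypeVar

    T_co = TypeVar("T_co", covariant=True)

    class Matcher(Protocol[T_co]):
        # a matcher is anything that, applied to a user agent string,
        # returns its domain object (UserAgent, OS or Device) or None
        def __call__(self, ua: str) -> Optional[T_co]:
            ...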