Add support for lazy matchers
Add lazy builtin matchers (as a separately generated file), as well as
support for loading json or yaml files using lazy matchers.

Lazy matchers are very much a tradeoff: they improve import speed (and
memory consumption until triggered), but slow down run speed, possibly
dramatically:

- importing the package itself takes ~36ms
- importing the lazy matchers takes ~36ms (including the package, so
  ~0 extra) and ~70kB RSS
- importing the eager matchers takes ~97ms and ~780kB RSS
- triggering the instantiation of the lazy matchers adds ~800kB RSS
- running the bench on the sample file with the lazy matchers adds a
  700~800ms overhead compared to the eager matchers

While the lazy matchers are less costly across the board until they're
used, benching the sample file causes *every* regex to be loaded --
likely due to matching failures -- which adds a 700~800ms overhead
over the eager matchers and increases the RSS by ~800kB (on top of the
original ~70kB).
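
What "lazy" means here, as a minimal sketch (the actual implementation
added in src/ua_parser/lazy.py is not part of this excerpt, so the
class below only illustrates the idea):

    import re
    from typing import Optional, Pattern

    class LazyMatcher:
        """Compiles its regex only when first asked to match."""

        def __init__(self, regex: str) -> None:
            self.regex = regex
            self._pattern: Optional[Pattern[str]] = None

        @property
        def pattern(self) -> Pattern[str]:
            # the compilation cost (and the compiled pattern's memory)
            # is only paid if this matcher actually gets exercised
            if self._pattern is None:
                self._pattern = re.compile(self.regex)
            return self._pattern

        def matches(self, ua: str) -> bool:
            return self.pattern.search(ua) is not None

This is why a workload that ends up exercising every matcher (like the
bench above) pays the full compilation cost anyway, plus a small
per-call indirection.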

Thus lazy matchers are not a great default for the basic parser,
though they might be a good opt-in if the user only ever uses one of
the domains (especially if it's not the devices one, as that's by far
the largest).
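
As a sketch of what that opt-in could look like (BasicParser and
load_lazy_builtins come from the __init__.py changes below; the
parse() method name is assumed here):

    from ua_parser import BasicParser, load_lazy_builtins

    # only the matchers that actually get exercised compile their regex
    parser = BasicParser(load_lazy_builtins())
    result = parser.parse(
        "Mozilla/5.0 (X11; Linux x86_64; rv:122.0) Gecko/20100101 Firefox/122.0"
    )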

With the re2 parser, however, only 156 of the 1162 regexes get
evaluated, leading to a minor CPU overhead of 20~30ms (1% of bench
time) and a more reasonable memory overhead. Thus the lazy matchers
are used for the re2 parser.

On the more net-negative but relatively minor side of things, the
pregenerated lazy matchers file adds ~120k to the on-disk requirements
of the library, and ~25k to the wheel archive -- about the same as
each of the pregenerated _regexes and _matchers files. pyc files seem
to be even bigger (~130k), so shipping those instead would be a
dubious tradeoff even if they are slightly faster to load.

Fixes #171, fixes #173
masklinn committed Feb 18, 2024
1 parent 04d0b7d commit 16c1324
Showing 10 changed files with 452 additions and 138 deletions.
239 changes: 139 additions & 100 deletions setup.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python
# flake8: noqa
import io
from contextlib import suppress
from contextlib import suppress, contextmanager
from os import fspath
from pathlib import Path
from typing import Optional, List, Dict
@@ -52,21 +52,6 @@ def run(self) -> None:
f"Unable to find regexes.yaml, should be at {yaml_src!r}"
)

def write_matcher(f, typ: str, fields: List[Optional[object]]):
f.write(f" {typ}(".encode())
while len(fields) > 1 and fields[-1] is None:
fields = fields[:-1]
f.write(", ".join(map(repr, fields)).encode())
f.write(b"),\n")

def write_params(fields):
# strip trailing None values
while len(fields) > 1 and fields[-1] is None:
fields.pop()

for field in fields:
fp.write((f" {field!r},\n").encode())

with yaml_src.open("rb") as f:
regexes = yaml.safe_load(f)

@@ -79,96 +64,150 @@ def write_params(fields):
outdir.mkdir(parents=True, exist_ok=True)

dest = outdir / "_matchers.py"
dest_lazy = outdir / "_lazy.py"
dest_legacy = outdir / "_regexes.py"

with dest.open("wb") as f, dest_legacy.open("wb") as fp:
# fmt: off
f.write(b"""\
with dest.open("wb") as eager, dest_lazy.open("wb") as lazy, dest_legacy.open(
"wb"
) as legacy:
eager = EagerWriter(eager)
lazy = LazyWriter(lazy)
legacy = LegacyWriter(legacy)

for section in ["user_agent_parsers", "os_parsers", "device_parsers"]:
with eager.section(section), lazy.section(section), legacy.section(
section
):
extract = EXTRACTORS[section]
for p in regexes[section]:
el = trim(extract(p))
eager.item(el)
lazy.item(el)
legacy.item(el)
eager.end()
lazy.end()
legacy.end()


def trim(l):
while len(l) > 1 and l[-1] is None:
l.pop()
return l


EXTRACTORS = {
"user_agent_parsers": lambda p: [
p["regex"],
p.get("family_replacement"),
p.get("v1_replacement"),
p.get("v2_replacement"),
],
"os_parsers": lambda p: [
p["regex"],
p.get("os_replacement"),
p.get("os_v1_replacement"),
p.get("os_v2_replacement"),
p.get("os_v3_replacement"),
p.get("os_v4_replacement"),
],
"device_parsers": lambda p: [
p["regex"],
p.get("regex_flag"),
p.get("device_replacement"),
p.get("brand_replacement"),
p.get("model_replacement"),
],
}


class Writer:
section_end = b""

def __init__(self, fp):
self.fp = fp
self.fp.write(
b"""\
########################################################
# NOTICE: this file is autogenerated from regexes.yaml #
########################################################
"""
)
self.fp.write(self.prefix)
self._section = None

@contextmanager
def section(self, id):
self._section = id
self.fp.write(self.sections[id])
yield
self.fp.write(self.section_end)

def item(self, elements):
# DeviceMatcher(re, flag, repl1),
self.fp.write(self.items[self._section])
self.fp.write(", ".join(map(repr, elements)).encode())
self.fp.write(b"),\n")

def end(self):
self.fp.write(self.suffix)


class LegacyWriter(Writer):
prefix = b"""\
__all__ = [
"USER_AGENT_PARSERS",
"DEVICE_PARSERS",
"OS_PARSERS",
]
from .user_agent_parser import UserAgentParser, DeviceParser, OSParser
"""
sections = {
"user_agent_parsers": b"USER_AGENT_PARSERS = [\n",
"os_parsers": b"\n\nOS_PARSERS = [\n",
"device_parsers": b"\n\nDEVICE_PARSERS = [\n",
}
section_end = b"]"
items = {
"user_agent_parsers": b" UserAgentParser(",
"os_parsers": b" OSParser(",
"device_parsers": b" DeviceParser(",
}
suffix = b"\n"


class EagerWriter(Writer):
prefix = b"""\
__all__ = ["MATCHERS"]
from typing import Tuple, List
from .core import UserAgentMatcher, OSMatcher, DeviceMatcher
MATCHERS: Tuple[List[UserAgentMatcher], List[OSMatcher], List[DeviceMatcher]] = ([
"""
sections = {
"user_agent_parsers": b"",
"os_parsers": b"], [\n",
"device_parsers": b"], [\n",
}
items = {
"user_agent_parsers": b" UserAgentMatcher(",
"os_parsers": b" OSMatcher(",
"device_parsers": b" DeviceMatcher(",
}
suffix = b"])\n"


class LazyWriter(EagerWriter):
prefix = b"""\
__all__ = ["MATCHERS"]
from typing import Tuple, List
from .lazy import UserAgentMatcher, OSMatcher, DeviceMatcher
from .core import Matchers, UserAgentMatcher, OSMatcher, DeviceMatcher
MATCHERS: Matchers = ([
""")
fp.write(b"# -*- coding: utf-8 -*-\n")
fp.write(b"########################################################\n")
fp.write(b"# NOTICE: This file is autogenerated from regexes.yaml #\n")
fp.write(b"########################################################\n")
fp.write(b"\n")
fp.write(b"from .user_agent_parser import (\n")
fp.write(b" UserAgentParser, DeviceParser, OSParser,\n")
fp.write(b")\n")
fp.write(b"\n")
fp.write(b"__all__ = ('USER_AGENT_PARSERS', 'DEVICE_PARSERS', 'OS_PARSERS')\n")
fp.write(b"\n")
fp.write(b"USER_AGENT_PARSERS = [\n")
for device_parser in regexes["user_agent_parsers"]:
write_matcher(f, "UserAgentMatcher", [
device_parser["regex"],
device_parser.get("family_replacement"),
device_parser.get("v1_replacement"),
device_parser.get("v2_replacement"),
])

fp.write(b" UserAgentParser(\n")
write_params([
device_parser["regex"],
device_parser.get("family_replacement"),
device_parser.get("v1_replacement"),
device_parser.get("v2_replacement"),
])
fp.write(b" ),\n")
f.write(b" ], [\n")
fp.write(b"]\n\n")

fp.write(b"OS_PARSERS = [\n")
for device_parser in regexes["os_parsers"]:
write_matcher(f, "OSMatcher", [
device_parser["regex"],
device_parser.get("os_replacement"),
device_parser.get("os_v1_replacement"),
device_parser.get("os_v2_replacement"),
device_parser.get("os_v3_replacement"),
device_parser.get("os_v4_replacement"),
])

fp.write(b" OSParser(\n")
write_params([
device_parser["regex"],
device_parser.get("os_replacement"),
device_parser.get("os_v1_replacement"),
device_parser.get("os_v2_replacement"),
device_parser.get("os_v3_replacement"),
device_parser.get("os_v4_replacement"),
])
fp.write(b" ),\n")
f.write(b" ], [\n")
fp.write(b"]\n\n")

fp.write(b"DEVICE_PARSERS = [\n")
for device_parser in regexes["device_parsers"]:
write_matcher(f, "DeviceMatcher", [
device_parser["regex"],
device_parser.get("regex_flag"),
device_parser.get("device_replacement"),
device_parser.get("brand_replacement"),
device_parser.get("model_replacement"),
])

fp.write(b" DeviceParser(\n")
write_params([
device_parser["regex"],
device_parser.get("regex_flag"),
device_parser.get("device_replacement"),
device_parser.get("brand_replacement"),
device_parser.get("model_replacement"),
])
fp.write(b" ),\n")
f.write(b"])\n")
fp.write(b"]\n")
# fmt: on
MATCHERS: Tuple[List[UserAgentMatcher], List[OSMatcher], List[DeviceMatcher]] = ([
"""


setup(
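
For reference, both the eager and the lazy writers above emit a module
of the following shape; the entries are made up for illustration, and
the only difference between the generated _matchers.py and _lazy.py is
whether the matcher classes are imported from .core or from .lazy:

    ########################################################
    # NOTICE: this file is autogenerated from regexes.yaml #
    ########################################################
    __all__ = ["MATCHERS"]
    from typing import Tuple, List
    from .core import UserAgentMatcher, OSMatcher, DeviceMatcher
    MATCHERS: Tuple[List[UserAgentMatcher], List[OSMatcher], List[DeviceMatcher]] = ([
        UserAgentMatcher('(Firefox)/(\\d+)\\.(\\d+)', 'Firefox'),
    ], [
        OSMatcher('(Windows NT) (\\d+)\\.(\\d+)', 'Windows'),
    ], [
        DeviceMatcher('(iPhone)', None, 'iPhone', 'Apple'),
    ])
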
5 changes: 3 additions & 2 deletions src/ua_parser/__init__.py
@@ -36,6 +36,7 @@
"UserAgent",
"UserAgentMatcher",
"load_builtins",
"load_lazy_builtins",
"load_data",
"load_yaml",
"parse",
@@ -65,7 +66,7 @@
)
from .basic import Parser as BasicParser
from .caching import CachingParser, Clearing, LRU, Locking
from .loaders import load_builtins, load_data, load_yaml
from .loaders import load_builtins, load_lazy_builtins, load_data, load_yaml

Re2Parser: Optional[Callable[[Matchers], Parser]] = None
with contextlib.suppress(ImportError):
@@ -79,7 +80,7 @@ def __getattr__(name: str) -> Parser:
global parser
if name == "parser":
if Re2Parser is not None:
parser = Re2Parser(load_builtins())
parser = Re2Parser(load_lazy_builtins())
else:
parser = CachingParser(
BasicParser(load_builtins()),
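
The upshot of the __getattr__ change is that the default parser is
only selected and built on first access; a sketch of triggering it
(assuming the Parser objects expose a parse() method):

    import ua_parser

    # first attribute access goes through __getattr__("parser"): if the
    # re2-based parser is importable this now builds
    # Re2Parser(load_lazy_builtins()), otherwise it falls back to the
    # eager CachingParser(BasicParser(load_builtins()), ...) shown above
    result = ua_parser.parser.parse(
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    )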
10 changes: 10 additions & 0 deletions src/ua_parser/_lazy.pyi
@@ -0,0 +1,10 @@
__all__ = ["MATCHERS"]

from typing import Tuple, List
from .lazy import UserAgentMatcher, OSMatcher, DeviceMatcher

MATCHERS: Tuple[
List[UserAgentMatcher],
List[OSMatcher],
List[DeviceMatcher],
]
11 changes: 9 additions & 2 deletions src/ua_parser/_matchers.pyi
@@ -1,3 +1,10 @@
from .core import Matchers
__all__ = ["MATCHERS"]

MATCHERS: Matchers
from typing import Tuple, List
from .core import UserAgentMatcher, OSMatcher, DeviceMatcher

MATCHERS: Tuple[
List[UserAgentMatcher],
List[OSMatcher],
List[DeviceMatcher],
]
7 changes: 4 additions & 3 deletions src/ua_parser/basic.py
@@ -7,6 +7,7 @@
Device,
DeviceMatcher,
Domain,
Matcher,
Matchers,
OS,
OSMatcher,
@@ -23,9 +24,9 @@ class Parser(AbstractParser):
when one matches.
"""

user_agent_matchers: List[UserAgentMatcher]
os_matchers: List[OSMatcher]
device_matchers: List[DeviceMatcher]
user_agent_matchers: List[Matcher[UserAgent]]
os_matchers: List[Matcher[OS]]
device_matchers: List[Matcher[Device]]

def __init__(
self,
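
The annotation change above means basic.Parser no longer cares whether
it is given eager or lazy matchers, only that they fit the Matcher
shape. A sketch of that shape as a protocol (the real definition lives
in core.py, which is not part of this excerpt):

    from typing import Optional, Protocol, TypeVar

    T_co = TypeVar("T_co", covariant=True)

    class Matcher(Protocol[T_co]):
        # a matcher is anything that, applied to a user agent string,
        # returns its domain object (UserAgent, OS or Device) or None
        def __call__(self, ua: str) -> Optional[T_co]:
            ...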