-
Notifications
You must be signed in to change notification settings - Fork 154
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Requires splitting out some of the testenvs, as re2 is not available for pypy at all, and not yet for 3.12. Uses `re2.Filter`, which unlike the C++ `FilteredRE2` bundles prefiltering, using an `re2.Set` so likely less efficient than providing one's own e.g. aho-corasick, but avoids having to do that. At first glance according to pytest's `--durations 0` this is quite successful (unlike using `re2.Set` which was more of a mixed bag): ``` 2.54s call tests/test_core.py::test_devices[test_device.yaml-basic] 2.51s call tests/test_core.py::test_ua[pgts_browser_list.yaml-basic] 2.48s call tests/test_legacy.py::TestParse::testPGTSStrings 2.43s call tests/test_legacy.py::TestParse::testStringsDevice 0.95s call tests/test_core.py::test_devices[test_device.yaml-re2] 0.55s call tests/test_core.py::test_ua[pgts_browser_list.yaml-re2] 0.18s call tests/test_core.py::test_ua[test_ua.yaml-basic] 0.16s call tests/test_legacy.py::TestParse::testBrowserscopeStrings 0.10s call tests/test_core.py::test_ua[test_ua.yaml-re2] ``` While the "basic" parser for the new API is slightly slower than the legacy API (browserscope does use test_ua.yaml so that matches) the re2 parser is significantly faster than both: - 60% faster on test_device.yaml (~2.5s -> 1s) - 80% faster on pgts (2.5s -> 0.5s) - 40% faster on browserscope (0.16 -> 0.1) This is very encouraging, although the memory consumption has not been checked (yet). Fixes #149, kind-of
- Loading branch information
Showing
4 changed files
with
86 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
from __future__ import annotations | ||
|
||
import io | ||
import os | ||
import re | ||
from typing import List, Tuple, Union | ||
|
||
import re2 # type: ignore | ||
|
||
from .core import ( | ||
Parser as AbstractParser, | ||
PartialParseResult, | ||
Device, | ||
Domain, | ||
OS, | ||
UserAgent, | ||
Matchers, | ||
UserAgentMatcher, | ||
OSMatcher, | ||
DeviceMatcher, | ||
) | ||
|
||
|
||
class Parser(AbstractParser):
    """Parser which prefilters the matchers of each domain with ``re2.Filter``.

    For every domain (user agent, OS, device) the patterns of all
    matchers are added to one ``re2.Filter``, in list order. On lookup the
    filter reports the indices of the patterns which match the input,
    and only the matcher with the lowest index is actually run.
    """

    # One prefilter per domain, or ``None`` when the domain has no
    # matchers. Each ``*_parsers`` list holds the matchers whose
    # patterns were added to the corresponding filter, in the same order.
    ua: Union[re2.Filter, None]
    user_agent_parsers: List[UserAgentMatcher]
    os: Union[re2.Filter, None]
    os_parsers: List[OSMatcher]
    devices: Union[re2.Filter, None]
    device_parsers: List[DeviceMatcher]

    def __init__(
        self,
        matchers: Matchers,
    ) -> None:
        """Store the three matcher lists and compile one prefilter for each.

        :param matchers: a triple of (user agent, OS, device) matcher lists.
        """
        self.user_agent_parsers, self.os_parsers, self.device_parsers = matchers

        self.ua = self._build_filter(self.user_agent_parsers)
        self.os = self._build_filter(self.os_parsers)
        self.devices = self._build_filter(self.device_parsers)

    @staticmethod
    def _build_filter(
        matchers: Union[
            List[UserAgentMatcher], List[OSMatcher], List[DeviceMatcher]
        ],
    ) -> Union[re2.Filter, None]:
        """Compile the patterns of ``matchers`` into a single ``re2.Filter``.

        Patterns are added in list order so that the indices reported by
        the filter can be used to index ``matchers``.

        :returns: the compiled filter, or ``None`` if ``matchers`` is empty.
        """
        if not matchers:
            # NOTE(review): how re2 behaves when compiling / matching a
            # Filter with zero patterns is not verified here; avoid
            # relying on it and let the caller skip the lookup instead.
            return None

        prefilter = re2.Filter()
        for matcher in matchers:
            pattern = matcher.regex.pattern
            # Prepend the i global flag if IGNORECASE is set. Assumes
            # no pattern uses global flags, but since they're not
            # supported in JS that seems safe.
            if matcher.regex.flags & re.IGNORECASE:
                pattern = "(?i)" + pattern
            prefilter.Add(pattern)
        prefilter.Compile()
        return prefilter

    def __call__(self, ua: str, domains: Domain, /) -> PartialParseResult:
        """Parse ``ua`` for each of the requested ``domains``.

        :param ua: the user agent string to parse.
        :param domains: the domains to resolve; others are left as ``None``.
        :returns: a ``PartialParseResult`` carrying ``domains``, the input
                  string, and the per-domain results (``None`` for any
                  domain not requested or for which no pattern matched).
        """
        # ``os_result`` rather than ``os`` so the ``os`` module imported
        # by this file is not shadowed.
        user_agent = os_result = device = None

        if Domain.USER_AGENT in domains and self.ua is not None:
            if matches := self.ua.Match(ua):
                # Set/Filter does not return the match in index order
                # (position order?) so to fit UAP semantics we need to
                # extract the first matching regex (lowest index).
                user_agent = self.user_agent_parsers[min(matches)](ua)

        if Domain.OS in domains and self.os is not None:
            if matches := self.os.Match(ua):
                os_result = self.os_parsers[min(matches)](ua)

        if Domain.DEVICE in domains and self.devices is not None:
            if matches := self.devices.Match(ua):
                device = self.device_parsers[min(matches)](ua)

        return PartialParseResult(
            domains=domains,
            string=ua,
            user_agent=user_agent,
            os=os_result,
            device=device,
        )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters