-
Notifications
You must be signed in to change notification settings - Fork 154
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Requires splitting out some of the testenvs, as re2 is not available for pypy at all, and not yet for 3.12.

Only uses `re2.Set`, which turns out to be not great, at least according to `pytest --durations` on 3.11:

- re2 is sometimes faster for UA tests
  - `pgts_browser_list.yaml` goes from 2.5s to 1.5s
  - `firefox_user_agent_strings.yaml` goes from 0.05s to 0.04s (not really significant)
  - though `test_ua.yaml` goes from 0.18s to 0.65s
- re2 is *way* slower for devices tests
  - `test_device.yaml` goes from 2.5s to 8s

Obviously tests might not be representative at all; implementing a proper benchmark on a real-life test-set (#163) would likely provide better information.

It's possible that `FilteredRE2` would offer better performance, *but* it requires additional memory and, more importantly, it requires a fast literal string matcher, e.g. a fast implementation of Aho-Corasick, or possibly Hyperscan's Teddy (via [python-hyperscan][5]?). [According to burntsushi, Commentz-Walter is not great in practice][1], at least as you increase the number of patterns, so that one looks like a dead end.

Either way this would likely be an *additional* dependency to make it usable, although there seems to be [a well-maintained Python version with impressive performance (for pure Python)][2], [a native module][3], and [a wrapper for burntsushi's Rust implementation][4] which claims even better performance than the native module.

Linked to (but probably can't be argued to fix) #149.

[1]: https://news.ycombinator.com/item?id=26913349
[2]: https://github.com/abusix/ahocorapy
[3]: https://github.com/WojciechMula/pyahocorasick/
[4]: https://github.com/G-Research/ahocorasick_rs/
[5]: https://python-hyperscan.readthedocs.io
- Loading branch information
Showing
4 changed files
with
91 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
from __future__ import annotations | ||
|
||
import io | ||
import os | ||
import re | ||
from typing import List, Tuple, Union | ||
|
||
import re2 # type: ignore | ||
|
||
from .core import ( | ||
Parser as BaseParser, | ||
PartialParseResult, | ||
Device, | ||
Domain, | ||
OS, | ||
UserAgent, | ||
Matchers, | ||
UserAgentMatcher, | ||
OSMatcher, | ||
DeviceMatcher, | ||
) | ||
|
||
|
||
# Options shared by every pattern set built in this module.
RE_OPTS = re2.Options()
# As of uap-core 0.18, the devices set needs at least 28MB (up from
# the default 8), so raise the memory budget to 32MiB.
RE_OPTS.max_mem = 32 << 20
# Error logging might write directly to stdout, which is not great:
# suppress it.
RE_OPTS.log_errors = False
|
||
|
||
class Parser(BaseParser):
    """Parser backed by ``re2.Set`` pre-filters, one per domain.

    For each domain the patterns of all matchers are added, in order,
    to a single compiled set. A parse asks the set which patterns
    match the input and then runs only the matcher with the lowest
    matching index.
    """

    ua: re2.Set
    user_agent_parsers: List[UserAgentMatcher]
    os: re2.Set
    os_parsers: List[OSMatcher]
    devices: re2.Set
    device_parsers: List[DeviceMatcher]

    def __init__(
        self,
        matchers: Matchers,
    ) -> None:
        """Store the matcher lists and compile their pattern sets.

        :param matchers: the user agent, OS and device matcher lists,
            in that order.
        """
        ua_matchers, os_matchers, device_matchers = matchers
        self.user_agent_parsers = ua_matchers
        self.os_parsers = os_matchers
        self.device_parsers = device_matchers

        self.ua = self._compile_set([m.regex.pattern for m in ua_matchers])
        self.os = self._compile_set([m.regex.pattern for m in os_matchers])
        # Prepend the i global flag if IGNORECASE is set. Assumes
        # no pattern uses global flags, but since they're not
        # supported in JS that seems safe.
        self.devices = self._compile_set(
            [
                "(?i)" + m.regex.pattern
                if m.regex.flags & re.IGNORECASE
                else m.regex.pattern
                for m in device_matchers
            ]
        )

    @staticmethod
    def _compile_set(patterns: List[str]) -> re2.Set:
        """Return a compiled search set holding ``patterns`` in order."""
        compiled = re2.Set.SearchSet(RE_OPTS)
        for pattern in patterns:
            compiled.Add(pattern)
        compiled.Compile()
        return compiled

    def __call__(self, ua: str, domains: Domain, /) -> PartialParseResult:
        """Parse ``ua`` for each of the requested ``domains``.

        A domain which was not requested, or for which no pattern of
        the corresponding set matches, is left as ``None`` in the
        result.
        """
        user_agent = None
        if Domain.USER_AGENT in domains:
            hits = self.ua.Match(ua)
            if hits:
                # Lowest index = first matcher, in list order, that matches.
                user_agent = self.user_agent_parsers[min(hits)](ua)

        os_result = None
        if Domain.OS in domains:
            hits = self.os.Match(ua)
            if hits:
                os_result = self.os_parsers[min(hits)](ua)

        device = None
        if Domain.DEVICE in domains:
            hits = self.devices.Match(ua)
            if hits:
                device = self.device_parsers[min(hits)](ua)

        return PartialParseResult(
            domains=domains,
            string=ua,
            user_agent=user_agent,
            os=os_result,
            device=device,
        )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters