Skip to content

Commit

Permalink
Move the new API off of the legacy parsers, extract inits to helper f…
Browse files Browse the repository at this point in the history
…unctions

The bridging of the legacy parsers and the new results turns out to be
pretty mid. Create similar but better typed matcher classes, with a
slightly different API: they return `None` on a match failure instead
of a triplet, which makes them compose better in iterations (e.g. can
just `filter` them out). And add a `Matchers` alias to carry them
around (a tuple of lists of matchers) for convenience. And the UA
matcher now supports patch_minor, though that requires excluding that
bit from the tests as there are apparently broken test cases around
that item.

Also clarify the replacer rules, and hopefully implement the thing
more clearly.

Rename the old "types" to "core" to better clarify what it's about and
put the matchers there as even for non-basic parsers they would likely
be needed both to generate the regex groups and to do the final
extraction.

Finally move the loaders off of the basic parser and into their own
module, hopefully cleaner and more reusable than having the things be
embedded a bit ad-hoc as classmethods. Instead have the basic parser
just take a `Matchers` directly. This does require updating the tests
as the parser now takes a triple (instead of 3 lists), but I think the
regularity is a gain.
  • Loading branch information
masklinn committed Nov 2, 2023
1 parent 10a9ed3 commit dbcee8c
Show file tree
Hide file tree
Showing 9 changed files with 339 additions and 123 deletions.
5 changes: 3 additions & 2 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@ Retrieve all data on a user-agent string
ParseResult(user_agent=UserAgent(family='Chrome',
major='41',
minor='0',
patch='2272'),
patch='2272',
patch_minor='104'),
os=OS(family='Mac OS X',
major='10',
minor='9',
Expand All @@ -59,7 +60,7 @@ Extract only browser data from user-agent string
>>> from ua_parser import parse_user_agent
>>> ua_string = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.104 Safari/537.36'
>>> parse_user_agent(ua_string)
UserAgent(family='Chrome', major='41', minor='0', patch='2272')
UserAgent(family='Chrome', major='41', minor='0', patch='2272', patch_minor='104')
For specific domains, a match failure just returns ``None``::
Expand Down
5 changes: 3 additions & 2 deletions src/ua_parser/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,10 @@
VERSION = (1, 0, 0)

from typing import Optional
from .types import *
from .core import *
from .basic import Parser as BasicParser
from .caching import CachingParser, Clearing, LRU
from .loaders import load_builtins, load_data, load_yaml


parser: Parser
Expand All @@ -31,7 +32,7 @@ def __getattr__(name):
global parser
if name == "parser":
parser = CachingParser(
BasicParser.from_regexes(),
BasicParser(load_builtins()),
LRU(200),
)
return parser
Expand Down
113 changes: 20 additions & 93 deletions src/ua_parser/basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,133 +3,60 @@
import io
import os
from itertools import starmap
from typing import *
from operator import methodcaller
from typing import List

from dataclasses import dataclass
from .types import (
from .core import (
Parser as BaseParser,
PartialParseResult,
Domain,
UserAgent,
OS,
Device,
Matchers,
UserAgentMatcher,
OSMatcher,
DeviceMatcher,
)
from ._legacy import UserAgentParser, OSParser, DeviceParser

load: Optional[Callable]
SafeLoader: Optional[Type]
try:
from yaml import load, CSafeLoader as SafeLoader
except ImportError:
try:
from yaml import load, SafeLoader
except ImportError:
load = SafeLoader = None


@dataclass
class Parser(BaseParser):
"""A simple pure-python parser based around trying a numer of regular
expressions in sequence for each domain, and returning a result
when one matches.
Can be initialised by passing in custom parsing rules directly,
though should usually be instantiated using :meth:`~.from_yaml`
instead.
"""

user_agent_parsers: List[UserAgentParser]
os_parsers: List[OSParser]
device_parsers: List[DeviceParser]

@classmethod
def from_regexes(cls) -> Parser:
"""Instantiates a parser from the pre-compiled regex set. Currently
not a singleton, but essentially free anyway after the initial
call (which loads the pre-compiled code).
"""
from ._regexes import USER_AGENT_PARSERS, DEVICE_PARSERS, OS_PARSERS

return cls(
user_agent_parsers=USER_AGENT_PARSERS,
os_parsers=OS_PARSERS,
device_parsers=DEVICE_PARSERS,
)

if load:
user_agent_parsers: List[UserAgentMatcher]
os_parsers: List[OSMatcher]
device_parsers: List[DeviceMatcher]

@classmethod
def from_yaml(cls, path: Union[str, os.PathLike | io.IOBase]) -> Parser:
"""Instantiates a parser from a YAML file-like object or path.
The data should follow the `regexes.yaml
<https://github.com/ua-parser/uap-core/blob/master/docs/specification.md#regexesyaml>`_
format. Note that because yaml is a superset of json, the
rules can just be in json.
Requires ``pyyaml``, and every call will read and reload
the data anew.
"""
if isinstance(path, (str, os.PathLike)):
with open(path) as fp:
regexes = load(fp, Loader=SafeLoader) # type: ignore
else:
regexes = load(path, Loader=SafeLoader) # type: ignore

return cls(
user_agent_parsers=[
UserAgentParser(
p["regex"],
p.get("family_replacement"),
p.get("v1_replacement"),
p.get("v2_replacement"),
)
for p in regexes["user_agent_parsers"]
],
os_parsers=[
OSParser(
p["regex"],
p.get("os_replacement"),
p.get("os_v1_replacement"),
p.get("os_v2_replacement"),
p.get("os_v3_replacement"),
p.get("os_v4_replacement"),
)
for p in regexes["os_parsers"]
],
device_parsers=[
DeviceParser(
p["regex"],
p.get("regex_flag"),
p.get("device_replacement"),
p.get("brand_replacement"),
p.get("model_replacement"),
)
for p in regexes["device_parsers"]
],
)
def __init__(
self,
matchers: Matchers,
) -> None:
self.user_agent_parsers = matchers[0]
self.os_parsers = matchers[1]
self.device_parsers = matchers[2]

def __call__(self, ua: str, domains: Domain, /) -> PartialParseResult:
parse = methodcaller("Parse", ua)
parse = methodcaller("__call__", ua)
return PartialParseResult(
domains=domains,
string=ua,
user_agent=next(
(UserAgent(*m) for m in map(parse, self.user_agent_parsers) if m[0]),
filter(None, map(parse, self.user_agent_parsers)),
None,
)
if Domain.USER_AGENT in domains
else None,
os=next(
(OS(*m) for m in map(parse, self.os_parsers) if m[0]),
filter(None, map(parse, self.os_parsers)),
None,
)
if Domain.OS in domains
else None,
device=next(
(Device(*m) for m in map(parse, self.device_parsers) if m[0]),
filter(None, map(parse, self.device_parsers)),
None,
)
if Domain.DEVICE in domains
Expand Down
3 changes: 2 additions & 1 deletion src/ua_parser/caching.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import abc
from collections import OrderedDict
from typing import Dict, Optional, MutableMapping
from .types import Parser, Domain, PartialParseResult

from .core import Parser, Domain, PartialParseResult


__all__ = [
Expand Down
155 changes: 152 additions & 3 deletions src/ua_parser/types.py → src/ua_parser/core.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import abc

from dataclasses import dataclass
import re
from dataclasses import dataclass, fields
from enum import Flag, auto
from typing import *
from typing import Literal, Optional, Tuple, List

__all__ = [
"UserAgent",
Expand All @@ -13,6 +13,10 @@
"Parser",
"Domain",
"PartialParseResult",
"Matchers",
"UserAgentMatcher",
"OSMatcher",
"DeviceMatcher",
]


Expand All @@ -26,6 +30,7 @@ class UserAgent:
major: Optional[str] = None
minor: Optional[str] = None
patch: Optional[str] = None
patch_minor: Optional[str] = None


@dataclass(frozen=True)
Expand Down Expand Up @@ -177,3 +182,147 @@ def parse_device(self, ua: str) -> Optional[Device]:
back to the default value in case of failure.
"""
return self(ua, Domain.DEVICE).device


def _get(m: re.Match, idx: int) -> Optional[str]:
    """Return capture group ``idx`` of ``m``, or ``None`` if there is nothing.

    ``None`` is returned when ``idx`` is not a valid group number for the
    pattern (zero, negative, or larger than the pattern's group count),
    when the group did not take part in the match, or when it matched
    the empty string.
    """
    # Group 0 (the whole match) is deliberately excluded: only explicit
    # capture groups are addressable.
    if idx <= 0 or idx > m.re.groups:
        return None

    captured = m[idx]
    # Both a non-participating group (None) and an empty capture collapse
    # to None.
    return captured if captured else None


def _replacer(repl: str, m: re.Match) -> Optional[str]:
    """Expand the replacement template ``repl`` using the groups of ``m``.

    The replacement rules are frustratingly subtle and inimical to
    standard fallbacks:

    - if there is a non-null replacement pattern, then it must be used
      with match groups as template parameters (at indices 1+)

      - the result is stripped
      - if it is an empty string, then it's replaced by a null

    - otherwise fallback to a (possibly optional) match group
    - or null (device brand has no fallback)

    Replacement rules only apply to OS and Device matchers, the UA
    matcher has bespoke replacement semantics for the family (just
    ``$1``), and no replacement for the other fields: either there is a
    static replacement or it falls back to the corresponding (optional)
    match group.

    Returns the expanded, stripped string, or ``None`` when ``repl`` is
    empty or the expansion ends up empty.
    """
    if not repl:
        return None

    def _expand(ref: re.Match) -> str:
        # ``ref[1]`` is the single digit following ``$``; groups which are
        # missing, unmatched or empty are substituted by nothing.
        return _get(m, int(ref[1])) or ""

    expanded = re.sub(r"\$(\d)", _expand, repl)
    return expanded.strip() or None


class UserAgentMatcher:
    """Matcher for the user agent domain.

    Wraps a compiled regex and optional static replacements. Calling
    the matcher with a user agent string returns a ``UserAgent`` on a
    match and ``None`` otherwise.

    - ``family`` is a template in which only the literal ``$1`` is
      substituted (by the first match group); it defaults to ``"$1"``.
    - ``major``, ``minor``, ``patch`` and ``patch_minor`` are used
      verbatim when provided, otherwise the value falls back to match
      groups 2 to 5 respectively (``None`` if the group is absent or
      empty).
    """

    regex: re.Pattern
    family: str
    major: Optional[str]
    minor: Optional[str]
    patch: Optional[str]
    patch_minor: Optional[str]

    def __init__(
        self,
        regex: str,
        family: Optional[str] = None,
        major: Optional[str] = None,
        minor: Optional[str] = None,
        patch: Optional[str] = None,
        patch_minor: Optional[str] = None,
    ) -> None:
        self.regex = re.compile(regex)
        # A missing (or empty) family replacement means "use group 1".
        self.family = family or "$1"
        self.major = major
        self.minor = minor
        self.patch = patch
        self.patch_minor = patch_minor

    def __call__(self, ua: str) -> Optional[UserAgent]:
        """Match ``ua``, returning a ``UserAgent`` or ``None`` on failure."""
        m = self.regex.search(ua)
        if m is None:
            return None

        family = self.family
        if "$1" in family:
            # NOTE: group 1 is read directly, so the pattern is expected
            # to capture it whenever the family template references it.
            family = family.replace("$1", m[1])

        return UserAgent(
            family=family,
            major=self.major or _get(m, 2),
            minor=self.minor or _get(m, 3),
            patch=self.patch or _get(m, 4),
            patch_minor=self.patch_minor or _get(m, 5),
        )


class OSMatcher:
    """Matcher for the OS domain.

    Wraps a compiled regex and one replacement template per field.
    Every template defaults to the corresponding match group (``"$1"``
    for the family through ``"$5"`` for ``patch_minor``) and is expanded
    with ``_replacer`` on a match.

    Calling the matcher with a user agent string returns an ``OS`` on a
    match and ``None`` otherwise.
    """

    regex: re.Pattern
    family: str
    major: str
    minor: str
    patch: str
    patch_minor: str

    def __init__(
        self,
        regex: str,
        family: Optional[str] = None,
        major: Optional[str] = None,
        minor: Optional[str] = None,
        patch: Optional[str] = None,
        patch_minor: Optional[str] = None,
    ) -> None:
        self.regex = re.compile(regex)
        # Missing (or empty) replacements fall back to the positional
        # match group, expressed as a template.
        self.family = family or "$1"
        self.major = major or "$2"
        self.minor = minor or "$3"
        self.patch = patch or "$4"
        self.patch_minor = patch_minor or "$5"

    def __call__(self, ua: str) -> Optional[OS]:
        """Match ``ua``, returning an ``OS`` or ``None`` on failure.

        Raises ``ValueError`` if the regex matches but the family
        template expands to nothing.
        """
        m = self.regex.search(ua)
        if m is None:
            return None

        family = _replacer(self.family, m)
        if family is None:
            raise ValueError(f"Unable to find OS family in {ua}")

        return OS(
            family=family,
            major=_replacer(self.major, m),
            minor=_replacer(self.minor, m),
            patch=_replacer(self.patch, m),
            patch_minor=_replacer(self.patch_minor, m),
        )


class DeviceMatcher:
    """Matcher for the device domain.

    Wraps a compiled regex (case-insensitive when ``regex_flag`` is
    ``"i"``) and one replacement template per field, expanded with
    ``_replacer`` on a match:

    - ``family`` and ``model`` default to the first match group
      (``"$1"``);
    - ``brand`` defaults to the empty template, i.e. it has no fallback.

    Calling the matcher with a user agent string returns a ``Device`` on
    a match and ``None`` otherwise.
    """

    regex: re.Pattern
    family: str
    brand: str
    model: str

    def __init__(
        self,
        regex: str,
        regex_flag: Optional[Literal["i"]] = None,
        family: Optional[str] = None,
        brand: Optional[str] = None,
        model: Optional[str] = None,
    ) -> None:
        # "i" is the only flag acted upon; anything else compiles the
        # pattern without flags.
        flags = re.IGNORECASE if regex_flag == "i" else 0
        self.regex = re.compile(regex, flags=flags)
        self.family = family or "$1"
        self.brand = brand or ""
        self.model = model or "$1"

    def __call__(self, ua: str) -> Optional[Device]:
        """Match ``ua``, returning a ``Device`` or ``None`` on failure.

        Raises ``ValueError`` if the regex matches but the family
        template expands to nothing.
        """
        m = self.regex.search(ua)
        if m is None:
            return None

        family = _replacer(self.family, m)
        if family is None:
            raise ValueError(f"Unable to find device family in {ua}")

        return Device(
            family=family,
            brand=_replacer(self.brand, m),
            model=_replacer(self.model, m),
        )


# Convenience alias for carrying a full rule set around: a triple of
# matcher lists, in (user agent, OS, device) order.
Matchers = Tuple[List[UserAgentMatcher], List[OSMatcher], List[DeviceMatcher]]
Loading

0 comments on commit dbcee8c

Please sign in to comment.