Skip to content

Commit

Permalink
internal/parser: add PublicSuffix and RegisteredDomain methods to List (
Browse files Browse the repository at this point in the history
#2228)

These calculate the public suffix or registered/registerable domain of an
input domain, according to the strict PSL algorithm. Aside from serving as a
reference implementation of the algorithm, they come in handy in validations
that need to, for example, figure out which registry to query for domain info.
  • Loading branch information
danderson authored Oct 22, 2024
1 parent 7fd9e2f commit 47e25e6
Show file tree
Hide file tree
Showing 3 changed files with 271 additions and 4 deletions.
30 changes: 26 additions & 4 deletions tools/internal/domain/domain.go
Original file line number Diff line number Diff line change
Expand Up @@ -156,18 +156,33 @@ func (d Name) CutSuffix(suffix Name) (rest []Label, found bool) {
return ret, true
}

// AddPrefix returns d prefixed with label.
// AddPrefix returns d prefixed with labels.
//
// For example, AddPrefix of "bar" to "foo.com" is "bar.foo.com".
func (d Name) AddPrefix(label Label) (Name, error) {
// For example, AddPrefix("qux", "bar") to "foo.com" is "qux.bar.foo.com".
func (d Name) AddPrefix(labels ...Label) (Name, error) {
// Due to total name length restrictions, we have to fully
// re-check the shape of the extended domain name. The simplest
// way to do that is to round-trip through a string and leverage
// Parse again.
retStr := label.String() + "." + d.String()
parts := make([]string, 0, len(labels)+1)
for _, l := range labels {
parts = append(parts, l.String())
}
parts = append(parts, d.String())
retStr := strings.Join(parts, ".")
return Parse(retStr)
}

// MustAddPrefix prepends labels to d exactly as AddPrefix does, but
// panics when AddPrefix reports an error instead of returning it.
func (d Name) MustAddPrefix(labels ...Label) Name {
	prefixed, err := d.AddPrefix(labels...)
	if err == nil {
		return prefixed
	}
	panic(fmt.Sprintf("failed to add prefix %v to domain %q: %v", labels, d, err))
}

// Label is a domain name label.
type Label struct {
label string
Expand Down Expand Up @@ -203,6 +218,13 @@ func (l Label) ASCIIString() string {
return ret
}

// AsTLD wraps l in a Name whose only label is l, i.e. it treats the
// label as a top-level domain.
func (l Label) AsTLD() Name {
	return Name{labels: []Label{l}}
}

// Compare compares domain labels. It returns -1 if l < m, +1 if l > m,
// and 0 if l == m.
//
Expand Down
150 changes: 150 additions & 0 deletions tools/internal/parser/file.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package parser

import (
"cmp"
"fmt"
"net/mail"
"net/url"
"slices"
Expand Down Expand Up @@ -78,6 +79,98 @@ type List struct {

// Children returns the blocks held directly by the list (l.Blocks).
func (l *List) Children() []Block {
	return l.Blocks
}

// PublicSuffix returns the public suffix of d.
//
// The PSL algorithm is applied literally. In particular, a rule
// "*.foo.com" does not implicitly create a "foo.com" rule, and a
// hardcoded implicit "*" rule makes every unknown TLD a public
// suffix.
//
// The zero domain.Name is returned unchanged.
func (l *List) PublicSuffix(d domain.Name) domain.Name {
	if d.NumLabels() == 0 {
		// Zero-value input: there is nothing to match against.
		return d
	}

	var (
		best         domain.Name
		bestLen      int
		sawException bool
	)

	// Wildcards are examined before plain suffixes: per the PSL
	// algorithm a wildcard exception outranks every other rule, so
	// once one matches, the plain suffixes need not be consulted.
	for _, wild := range BlocksOfType[*Wildcard](l) {
		cand, exception, matched := wild.PublicSuffix(d)
		if !matched {
			continue
		}

		if exception {
			// The format only defines 0 or 1 matching exceptions.
			// To stay well-defined if several match anyway, the
			// first one is always taken and later ones replace it
			// only when they are longer.
			if n := cand.NumLabels(); !sawException || n > bestLen {
				sawException = true
				best, bestLen = cand, n
			}
			continue
		}

		if sawException {
			// Ordinary wildcard matches can no longer win.
			continue
		}
		if n := cand.NumLabels(); n > bestLen {
			best, bestLen = cand, n
		}
	}
	if sawException {
		return best
	}

	// No exception applied; let the plain suffix rules compete with
	// the best wildcard match, longest match winning.
	for _, rule := range BlocksOfType[*Suffix](l) {
		cand, matched := rule.PublicSuffix(d)
		if !matched {
			continue
		}
		if n := cand.NumLabels(); n > bestLen {
			best, bestLen = cand, n
		}
	}
	if bestLen > 0 {
		return best
	}

	// Nothing matched explicitly: fall back to the implicit "*" rule
	// of the PSL algorithm, which makes the TLD the public suffix.
	labels := d.Labels()
	return labels[len(labels)-1].AsTLD()
}

// RegisteredDomain returns the registered/registerable domain of d:
// its public suffix plus the one label directly to the left of it.
// The result is (domain, true) when d is a child of a public suffix,
// and (zero, false) when d is itself a public suffix.
//
// The PSL algorithm is applied literally. In particular, a rule
// "*.foo.com" does not implicitly create a "foo.com" rule, and a
// hardcoded implicit "*" rule makes every unknown TLD a public
// suffix.
func (l *List) RegisteredDomain(d domain.Name) (domain.Name, bool) {
	suffix := l.PublicSuffix(d)
	if suffix.Equal(d) {
		return domain.Name{}, false
	}

	rest, found := d.CutSuffix(suffix)
	if !found {
		// PublicSuffix should only ever hand back a suffix of d.
		panic(fmt.Sprintf("public suffix %q is not a suffix of domain %q", suffix, d))
	}

	// The last element of rest is the label adjacent to the suffix.
	adjacent := rest[len(rest)-1]
	return suffix.MustAddPrefix(adjacent), true
}

// Comment is a comment block, consisting of one or more contiguous
// lines of commented text.
type Comment struct {
Expand Down Expand Up @@ -223,6 +316,31 @@ type Suffix struct {

// Children always returns nil for a Suffix.
func (s *Suffix) Children() []Block {
	return nil
}

// PublicSuffix returns the public suffix of n according to this
// Suffix rule taken in isolation. The rule applies when n equals
// s.Domain or when s.Domain can be cut from the end of n; in both
// cases the result is (s.Domain, true). Otherwise PublicSuffix
// returns (zeroValue, false).
func (s *Suffix) PublicSuffix(n domain.Name) (suffix domain.Name, ok bool) {
	applies := n.Equal(s.Domain)
	if !applies {
		_, applies = n.CutSuffix(s.Domain)
	}
	if !applies {
		return domain.Name{}, false
	}
	return s.Domain, true
}

// RegisteredDomain returns the registered/registerable domain of n
// according to this Suffix rule taken in isolation, i.e. s.Domain
// plus the one label of n directly to its left. If s.Domain cannot be
// cut from n, RegisteredDomain returns (zeroValue, false).
func (s *Suffix) RegisteredDomain(n domain.Name) (regDomain domain.Name, ok bool) {
	rest, found := n.CutSuffix(s.Domain)
	if !found {
		return domain.Name{}, false
	}
	// The last element of rest is the label adjacent to s.Domain.
	return s.Domain.MustAddPrefix(rest[len(rest)-1]), true
}

// Wildcard is a wildcard public suffix, along with any exceptions to
// that wildcard.
type Wildcard struct {
Expand All @@ -240,3 +358,35 @@ type Wildcard struct {
}

// Children always returns nil for a Wildcard.
func (w *Wildcard) Children() []Block {
	return nil
}

// PublicSuffix returns the public suffix of n according to this
// Wildcard rule taken in isolation.
//
// When w.Domain can be cut from n, the label of n adjacent to
// w.Domain decides the outcome: if it is listed in w.Exceptions the
// result is (w.Domain, true, true); otherwise the result is that
// label prefixed onto w.Domain, with isException false. When
// w.Domain cannot be cut from n, PublicSuffix returns
// (zeroValue, false, false).
func (w *Wildcard) PublicSuffix(n domain.Name) (suffix domain.Name, isException, ok bool) {
	rest, found := n.CutSuffix(w.Domain)
	if !found {
		return domain.Name{}, false, false
	}

	// The last element of rest is the label adjacent to w.Domain.
	child := rest[len(rest)-1]
	if !slices.Contains(w.Exceptions, child) {
		return w.Domain.MustAddPrefix(child), false, true
	}
	return w.Domain, true, true
}

// RegisteredDomain returns the registered/registerable domain of n
// according to this Wildcard rule taken in isolation. The registered
// domain is defined as n's public suffix plus one more child label.
//
// When the label of n adjacent to w.Domain is listed in
// w.Exceptions, the public suffix is w.Domain itself, so the
// registered domain is that label prefixed onto w.Domain and
// isException is true. This holds even when n is exactly the
// exception domain (e.g. "except.baz.net" for "*.baz.net" with
// exception "except").
//
// Otherwise the wildcard consumes one label, so n needs at least two
// labels in front of w.Domain; the registered domain is then those
// two labels prefixed onto w.Domain.
//
// If w.Domain cannot be cut from n, or n is itself a public suffix
// under this rule, RegisteredDomain returns (zeroValue, false, false).
func (w *Wildcard) RegisteredDomain(n domain.Name) (regDomain domain.Name, isException, ok bool) {
	prefix, found := n.CutSuffix(w.Domain)
	if !found {
		return domain.Name{}, false, false
	}

	// The last element of prefix is the label adjacent to w.Domain.
	next := prefix[len(prefix)-1]

	// Exceptions must be checked before the length guard: an
	// exception only needs a single label in front of w.Domain to
	// form a registered domain.
	if slices.Contains(w.Exceptions, next) {
		return w.Domain.MustAddPrefix(next), true, true
	}

	if len(prefix) < 2 {
		// n is "<label>.<w.Domain>", which the wildcard makes a
		// public suffix, so there is no registered domain.
		return domain.Name{}, false, false
	}
	return w.Domain.MustAddPrefix(prefix[len(prefix)-2:]...), false, true
}
95 changes: 95 additions & 0 deletions tools/internal/parser/file_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
package parser

import (
"testing"

"github.com/publicsuffix/list/tools/internal/domain"
)

// TestPublicSuffix checks List.PublicSuffix and List.RegisteredDomain
// against a small hand-built list containing plain suffixes,
// wildcards with exceptions, and nested wildcards/exceptions.
//
// In the test table an empty regDomain means the input is itself a
// public suffix, so RegisteredDomain must report no result.
func TestPublicSuffix(t *testing.T) {
	lst := list(
		section(1, 1, "PRIVATE DOMAINS",
			suffixes(1, 1, noInfo,
				suffix(1, "example.com"),
				wildcard(2, 3, "baz.net", "except", "other"),
				suffix(4, "com"),

				// Wildcards and exceptions nested inside each
				// other. This doesn't appear in the PSL in practice,
				// and is implicitly forbidden by the format spec, but
				// the parser/validator does not currently reject such
				// files, so we want PublicSuffix/RegisteredDomain to
				// be well-defined for such inputs.
				wildcard(5, 6, "nested.org", "except"),
				wildcard(7, 8, "in.except.nested.org", "other-except"),
			),
		),
	)

	tests := []struct {
		in        string
		pubSuffix string
		regDomain string
	}{
		{"www.example.com", "example.com", "www.example.com"},
		{"www.public.example.com", "example.com", "public.example.com"},
		{"example.com", "example.com", ""},

		{"www.other.com", "com", "other.com"},
		{"other.com", "com", "other.com"},
		{"com", "com", ""},

		{"qux.bar.baz.net", "bar.baz.net", "qux.bar.baz.net"},
		{"bar.baz.net", "bar.baz.net", ""},
		{"baz.net", "net", "baz.net"}, // Implicit * rule
		{"qux.except.baz.net", "baz.net", "except.baz.net"},
		{"except.baz.net", "baz.net", "except.baz.net"},
		{"other.other.baz.net", "baz.net", "other.baz.net"},

		// Tests for nested wildcards+exceptions. Does not appear in
		// the real PSL, and implicitly disallowed by the format spec,
		// but necessary to make PublicSuffix and RegisteredDomain's
		// outputs well defined for all inputs.
		{"qux.bar.foo.nested.org", "foo.nested.org", "bar.foo.nested.org"},
		{"bar.foo.nested.org", "foo.nested.org", "bar.foo.nested.org"},
		{"foo.nested.org", "foo.nested.org", ""},
		{"nested.org", "org", "nested.org"},
		{"bar.except.nested.org", "nested.org", "except.nested.org"},
		{"except.nested.org", "nested.org", "except.nested.org"},
		{"in.except.nested.org", "nested.org", "except.nested.org"},
		// Matches both nested wildcard and also outer exception,
		// outer exception wins.
		{"other.in.except.nested.org", "nested.org", "except.nested.org"},
		// Matches both outer and inner exceptions, inner exception
		// wins.
		{"qux.other-except.in.except.nested.org", "in.except.nested.org", "other-except.in.except.nested.org"},
	}

	for _, tc := range tests {
		in := mustParseDomain(tc.in)
		wantSuffix := mustParseDomain(tc.pubSuffix)

		gotSuffix := lst.PublicSuffix(in)
		if !gotSuffix.Equal(wantSuffix) {
			t.Errorf("PublicSuffix(%q) = %q, want %q", in, gotSuffix, wantSuffix)
		}

		gotReg, ok := lst.RegisteredDomain(in)
		switch {
		case tc.regDomain == "":
			if ok {
				t.Errorf("RegisteredDomain(%q) = %q, want none", in, gotReg)
			}
		case !ok:
			// A registered domain was expected but none was
			// returned; without this case the mismatch would go
			// unreported.
			t.Errorf("RegisteredDomain(%q) = none, want %q", in, tc.regDomain)
		default:
			wantReg := mustParseDomain(tc.regDomain)
			if !gotReg.Equal(wantReg) {
				t.Errorf("RegisteredDomain(%q) = %q, want %q", in, gotReg, wantReg)
			}
		}
	}
}

// mustParseDomain parses s with domain.Parse and panics with the
// returned error if parsing fails.
func mustParseDomain(s string) domain.Name {
	parsed, err := domain.Parse(s)
	if err == nil {
		return parsed
	}
	panic(err)
}

0 comments on commit 47e25e6

Please sign in to comment.