diff --git a/tools/internal/domain/domain.go b/tools/internal/domain/domain.go index 466356057..802c16592 100644 --- a/tools/internal/domain/domain.go +++ b/tools/internal/domain/domain.go @@ -156,18 +156,33 @@ func (d Name) CutSuffix(suffix Name) (rest []Label, found bool) { return ret, true } -// AddPrefix returns d prefixed with label. +// AddPrefix returns d with labels prepended, in the order given. // -// For example, AddPrefix of "bar" to "foo.com" is "bar.foo.com". -func (d Name) AddPrefix(label Label) (Name, error) { +// For example, AddPrefix("qux", "bar") on "foo.com" returns "qux.bar.foo.com". +func (d Name) AddPrefix(labels ...Label) (Name, error) { // Due to total name length restrictions, we have to fully // re-check the shape of the extended domain name. The simplest // way to do that is to round-trip through a string and leverage // Parse again. - retStr := label.String() + "." + d.String() + parts := make([]string, 0, len(labels)+1) + for _, l := range labels { + parts = append(parts, l.String()) + } + parts = append(parts, d.String()) + retStr := strings.Join(parts, ".") return Parse(retStr) } +// MustAddPrefix is like AddPrefix, but panics if the resulting name +// fails to parse instead of returning an error. +func (d Name) MustAddPrefix(labels ...Label) Name { + ret, err := d.AddPrefix(labels...) + if err != nil { + panic(fmt.Sprintf("failed to add prefix %v to domain %q: %v", labels, d, err)) + } + return ret +} + // Label is a domain name label. type Label struct { label string @@ -203,6 +218,13 @@ func (l Label) ASCIIString() string { return ret } +// AsTLD returns a Name consisting solely of the label l. +func (l Label) AsTLD() Name { + return Name{ + labels: []Label{l}, + } +} + // Compare compares domain labels. It returns -1 if l < m, +1 if l > m, // and 0 if l == m. 
// diff --git a/tools/internal/parser/file.go b/tools/internal/parser/file.go index 4578a38a0..9b11aa086 100644 --- a/tools/internal/parser/file.go +++ b/tools/internal/parser/file.go @@ -2,6 +2,7 @@ package parser import ( "cmp" + "fmt" "net/mail" "net/url" "slices" @@ -78,6 +79,98 @@ type List struct { func (l *List) Children() []Block { return l.Blocks } +// PublicSuffix returns the public suffix of n. +// +// This follows the PSL algorithm to the letter. Notably: a rule +// "*.foo.com" does not implicitly create a "foo.com" rule, and there +// is a hardcoded implicit "*" rule so that unknown TLDs are all +// public suffixes. +func (l *List) PublicSuffix(d domain.Name) domain.Name { + if d.NumLabels() == 0 { + // Edge case: zero domain.Name value + return d + } + + // Look at wildcards first, because the PSL algorithm says that + // exceptions to wildcards take priority over all other rules. So, + // if we find a wildcard exception, we can halt early. + var ( + ret domain.Name + matchLen int + gotException bool + ) + for _, w := range BlocksOfType[*Wildcard](l) { + suf, isException, ok := w.PublicSuffix(d) + switch { + case !ok: + continue + case isException && !gotException: + // First matching exception encountered. + gotException = true + matchLen = suf.NumLabels() + ret = suf + case isException: + // Second or later exception match. According to the + // format, only 0 or 1 exceptions can match, + // multi-exception matches are undefined and unused. But + // just to be safe, handle the N exception case by + // accepting the longest matching exception. + if nl := suf.NumLabels(); nl > matchLen { + matchLen = nl + ret = suf + } + case !gotException: + // Non-exception match. + if nl := suf.NumLabels(); nl > matchLen { + matchLen = nl + ret = suf + } + } + } + if gotException { + return ret + } + + // Otherwise, keep scanning through the regular suffixes. 
+ for _, s := range BlocksOfType[*Suffix](l) { + if suf, ok := s.PublicSuffix(d); ok && suf.NumLabels() > matchLen { + matchLen = suf.NumLabels() + ret = suf + } + } + + if matchLen == 0 { + // The PSL algorithm includes an implicit "*" to match every + // TLD, in the absence of any matching explicit rule. + labels := d.Labels() + tld := labels[len(labels)-1].AsTLD() + return tld + } + + return ret +} + +// RegisteredDomain returns the registered/registerable domain of +// n. Returns (domain, true) when the input is a child of a public +// suffix, and (zero, false) when the input is itself a public suffix. +// +// RegisteredDomain follows the PSL algorithm to the letter. Notably: +// a rule "*.foo.com" does not implicitly create a "foo.com" rule, and +// there is a hardcoded implicit "*" rule so that unknown TLDs are all +// public suffixes. +func (l *List) RegisteredDomain(d domain.Name) (domain.Name, bool) { + suf := l.PublicSuffix(d) + if suf.Equal(d) { + return domain.Name{}, false + } + + next, ok := d.CutSuffix(suf) + if !ok { + panic(fmt.Sprintf("public suffix %q is not a suffix of domain %q", suf, d)) + } + return suf.MustAddPrefix(next[len(next)-1]), true +} + // Comment is a comment block, consisting of one or more contiguous // lines of commented text. type Comment struct { @@ -223,6 +316,31 @@ type Suffix struct { func (s *Suffix) Children() []Block { return nil } +// PublicSuffix returns the public suffix of n according to this +// Suffix rule taken in isolation. If n is not a child domain of s +// PublicSuffix returns (zeroValue, false). +func (s *Suffix) PublicSuffix(n domain.Name) (suffix domain.Name, ok bool) { + if n.Equal(s.Domain) { + return s.Domain, true + } + if _, ok := n.CutSuffix(s.Domain); ok { + return s.Domain, true + } + return domain.Name{}, false +} + +// RegisteredDomain returns the registered/registerable domain of n +// according to this Suffix rule taken in isolation. 
The registered +// domain is defined as n's public suffix plus one more child +// label. If n is not a child domain of s, RegisteredDomain returns +// (zeroValue, false). +func (s *Suffix) RegisteredDomain(n domain.Name) (regDomain domain.Name, ok bool) { + if prefix, ok := n.CutSuffix(s.Domain); ok { + return s.Domain.MustAddPrefix(prefix[len(prefix)-1]), true + } + return domain.Name{}, false +} + // Wildcard is a wildcard public suffix, along with any exceptions to // that wildcard. type Wildcard struct { @@ -240,3 +358,35 @@ type Wildcard struct { } func (w *Wildcard) Children() []Block { return nil } + +// PublicSuffix returns the public suffix of n according to this +// Wildcard rule taken in isolation. If n is not a child domain of w +// PublicSuffix returns (zeroValue, false). +func (w *Wildcard) PublicSuffix(n domain.Name) (suffix domain.Name, isException, ok bool) { + if prefix, ok := n.CutSuffix(w.Domain); ok { + next := prefix[len(prefix)-1] + if slices.Contains(w.Exceptions, next) { + return w.Domain, true, true + } + + return w.Domain.MustAddPrefix(next), false, true + } + return domain.Name{}, false, false +} + +// RegisteredDomain returns the registered/registerable domain of n +// according to this Suffix rule taken in isolation. The registered +// domain is defined as n's public suffix plus one more child +// label. If n is not a child domain of s, RegisteredDomain returns +// (zeroValue, false). 
+func (w *Wildcard) RegisteredDomain(n domain.Name) (regDomain domain.Name, isException, ok bool) { + if prefix, ok := n.CutSuffix(w.Domain); ok && len(prefix) >= 2 { + next := prefix[len(prefix)-1] + if slices.Contains(w.Exceptions, next) { + return w.Domain.MustAddPrefix(next), true, true + } + + return w.Domain.MustAddPrefix(prefix[len(prefix)-2:]...), false, true + } + return domain.Name{}, false, false +} diff --git a/tools/internal/parser/file_test.go b/tools/internal/parser/file_test.go new file mode 100644 index 000000000..0f0f9514e --- /dev/null +++ b/tools/internal/parser/file_test.go @@ -0,0 +1,95 @@ +package parser + +import ( + "testing" + + "github.com/publicsuffix/list/tools/internal/domain" +) + +func TestPublicSuffix(t *testing.T) { + lst := list( + section(1, 1, "PRIVATE DOMAINS", + suffixes(1, 1, noInfo, + suffix(1, "example.com"), + wildcard(2, 3, "baz.net", "except", "other"), + suffix(4, "com"), + + // Wildcards and exceptions nested inside each + // other. This doesn't appear in the PSL in practice, + // and is implicitly forbidden by the format spec, but + // the parser/validator does not currently reject such + // files, so we want PublicSuffix/RegisteredDomain to + // be well-defined for such inputs. 
+ wildcard(5, 6, "nested.org", "except"), + wildcard(7, 8, "in.except.nested.org", "other-except"), + ), + ), + ) + + tests := []struct { + in string + pubSuffix string + regDomain string + }{ + {"www.example.com", "example.com", "www.example.com"}, + {"www.public.example.com", "example.com", "public.example.com"}, + {"example.com", "example.com", ""}, + + {"www.other.com", "com", "other.com"}, + {"other.com", "com", "other.com"}, + {"com", "com", ""}, + + {"qux.bar.baz.net", "bar.baz.net", "qux.bar.baz.net"}, + {"bar.baz.net", "bar.baz.net", ""}, + {"baz.net", "net", "baz.net"}, // Implicit * rule + {"qux.except.baz.net", "baz.net", "except.baz.net"}, + {"except.baz.net", "baz.net", "except.baz.net"}, + {"other.other.baz.net", "baz.net", "other.baz.net"}, + + // Tests for nested wildcards+exceptions. Does not appear in + // the real PSL, and implicitly disallowed by the format spec, + // but necessary to make PublicSuffix and RegisteredDomain's + // outputs well defined for all inputs. + {"qux.bar.foo.nested.org", "foo.nested.org", "bar.foo.nested.org"}, + {"bar.foo.nested.org", "foo.nested.org", "bar.foo.nested.org"}, + {"foo.nested.org", "foo.nested.org", ""}, + {"nested.org", "org", "nested.org"}, + {"bar.except.nested.org", "nested.org", "except.nested.org"}, + {"except.nested.org", "nested.org", "except.nested.org"}, + {"in.except.nested.org", "nested.org", "except.nested.org"}, + // Matches both nested wildcard and also outer exception, + // outer exception wins. + {"other.in.except.nested.org", "nested.org", "except.nested.org"}, + // Matches both outer and inner exceptions, inner exception + // wins. 
+ {"qux.other-except.in.except.nested.org", "in.except.nested.org", "other-except.in.except.nested.org"}, + } + + for _, tc := range tests { + in := mustParseDomain(tc.in) + wantSuffix := mustParseDomain(tc.pubSuffix) + + gotSuffix := lst.PublicSuffix(in) + if !gotSuffix.Equal(wantSuffix) { + t.Errorf("PublicSuffix(%q) = %q, want %q", in, gotSuffix, wantSuffix) + } + + gotReg, ok := lst.RegisteredDomain(in) + if ok && tc.regDomain == "" { + t.Errorf("RegisteredDomain(%q) = %q, want none", in, gotReg) + } else if ok { + wantReg := mustParseDomain(tc.regDomain) + if !gotReg.Equal(wantReg) { + t.Errorf("RegisteredDomain(%q) = %q, want %q", in, gotReg, wantReg) + } + } + } +} + +func mustParseDomain(s string) domain.Name { + d, err := domain.Parse(s) + if err != nil { + panic(err) + } + return d +}