Skip to content

Commit

Permalink
core(entity-classification): integrate public-suffix-list into LH
Browse files Browse the repository at this point in the history
Experiments integrating npm:tldts (MIT licensed) to bring in
public suffix list based root domain classification.
Splits Util.getRootDomain into UrlUtils.getRootDomain, which depends
on the PSL, and replaces the report-side usage with entity-based recognition.
Preserves existing rootDomains with an explicit `Legacy` prefix
to be used for rendering pre-10.0 LHRs.
  • Loading branch information
alexnj committed Nov 29, 2023
1 parent 4949ffb commit f43fd32
Show file tree
Hide file tree
Showing 8 changed files with 72 additions and 29 deletions.
2 changes: 1 addition & 1 deletion core/computed/entity-classification.js
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ class EntityClassification {
// Make up an entity only for valid http/https URLs.
if (!parsedUrl.protocol.startsWith('http')) return;

const rootDomain = Util.getRootDomain(url);
const rootDomain = UrlUtils.getRootDomain(url);
if (!rootDomain) return;
if (entityCache.has(rootDomain)) return entityCache.get(rootDomain);

Expand Down
4 changes: 2 additions & 2 deletions core/computed/resource-summary.js
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ import {makeComputedArtifact} from './computed-artifact.js';
import {NetworkRecords} from './network-records.js';
import {NetworkRequest} from '../lib/network-request.js';
import {Budget} from '../config/budget.js';
import {Util} from '../../shared/util.js';
import UrlUtils from '../lib/url-utils.js';

/** @typedef {{count: number, resourceSize: number, transferSize: number}} ResourceEntry */

Expand Down Expand Up @@ -59,7 +59,7 @@ class ResourceSummary {
firstPartyHosts = budget.options.firstPartyHostnames;
} else {
firstPartyHosts = classifiedEntities.firstParty?.domains.map(domain => `*.${domain}`) ||
[`*.${Util.getRootDomain(URLArtifact.finalDisplayedUrl)}`];
[`*.${UrlUtils.getRootDomain(URLArtifact.finalDisplayedUrl)}`];
}

networkRecords.filter(record => {
Expand Down
16 changes: 14 additions & 2 deletions core/lib/url-utils.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
* SPDX-License-Identifier: Apache-2.0
*/

import {getDomain} from 'tldts';

import {Util} from '../../shared/util.js';
import {LighthouseError} from './lh-error.js';

Expand Down Expand Up @@ -99,6 +101,16 @@ class UrlUtils {
}
}

/**
* Returns a primary domain for provided hostname (e.g. www.example.com -> example.com).
* @param {string|URL} url hostname or URL object
* @return {string}
*/
static getRootDomain(url) {
const parsedUrl = Util.createOrReturnURL(url);
return getDomain(parsedUrl.href) || parsedUrl.hostname;
}

/**
* Check if rootDomains matches
*
Expand All @@ -120,8 +132,8 @@ class UrlUtils {
}

// get the string before the tld
const urlARootDomain = Util.getRootDomain(urlAInfo);
const urlBRootDomain = Util.getRootDomain(urlBInfo);
const urlARootDomain = UrlUtils.getRootDomain(urlAInfo);
const urlBRootDomain = UrlUtils.getRootDomain(urlBInfo);

return urlARootDomain === urlBRootDomain;
}
Expand Down
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,7 @@
"semver": "^5.3.0",
"speedline-core": "^1.4.3",
"third-party-web": "^0.24.0",
"tldts": "^6.0.22",
"ws": "^7.0.0",
"yargs": "^17.3.1",
"yargs-parser": "^21.0.0"
Expand Down
5 changes: 3 additions & 2 deletions report/renderer/report-ui-features.js
Original file line number Diff line number Diff line change
Expand Up @@ -322,7 +322,7 @@ export class ReportUIFeatures {
* @return {Array<HTMLElement>}
*/
_getThirdPartyRows(rowEls, finalDisplayedUrl) {
const finalDisplayedUrlRootDomain = Util.getRootDomain(finalDisplayedUrl);
const finalDisplayedUrlEntity = Util.getEntityFromUrl(finalDisplayedUrl, this.json.entities);
const firstPartyEntityName = this.json.entities?.find(e => e.isFirstParty === true)?.name;

/** @type {Array<HTMLElement>} */
Expand All @@ -337,7 +337,8 @@ export class ReportUIFeatures {
if (!urlItem) continue;
const datasetUrl = urlItem.dataset.url;
if (!datasetUrl) continue;
const isThirdParty = Util.getRootDomain(datasetUrl) !== finalDisplayedUrlRootDomain;
const isThirdParty =
Util.getEntityFromUrl(datasetUrl, this.json.entities) !== finalDisplayedUrlEntity;
if (!isThirdParty) continue;
}

Expand Down
38 changes: 19 additions & 19 deletions shared/test/util-test.js
Original file line number Diff line number Diff line change
Expand Up @@ -18,29 +18,29 @@ describe('util helpers', () => {
});
});

describe('getRootDomain', () => {
describe('getLegacyRootDomain', () => {
it('returns the correct rootDomain from a string', () => {
assert.equal(Util.getRootDomain('https://www.example.com/index.html'), 'example.com');
assert.equal(Util.getRootDomain('https://example.com'), 'example.com');
assert.equal(Util.getRootDomain('https://www.example.co.uk'), 'example.co.uk');
assert.equal(Util.getRootDomain('https://example.com.br/app/'), 'example.com.br');
assert.equal(Util.getRootDomain('https://example.tokyo.jp'), 'tokyo.jp');
assert.equal(Util.getRootDomain('https://sub.example.com'), 'example.com');
assert.equal(Util.getRootDomain('https://sub.example.tokyo.jp'), 'tokyo.jp');
assert.equal(Util.getRootDomain('http://localhost'), 'localhost');
assert.equal(Util.getRootDomain('http://localhost:8080'), 'localhost');
assert.equal(Util.getLegacyRootDomain('https://www.example.com/index.html'), 'example.com');
assert.equal(Util.getLegacyRootDomain('https://example.com'), 'example.com');
assert.equal(Util.getLegacyRootDomain('https://www.example.co.uk'), 'example.co.uk');
assert.equal(Util.getLegacyRootDomain('https://example.com.br/app/'), 'example.com.br');
assert.equal(Util.getLegacyRootDomain('https://example.tokyo.jp'), 'tokyo.jp');
assert.equal(Util.getLegacyRootDomain('https://sub.example.com'), 'example.com');
assert.equal(Util.getLegacyRootDomain('https://sub.example.tokyo.jp'), 'tokyo.jp');
assert.equal(Util.getLegacyRootDomain('http://localhost'), 'localhost');
assert.equal(Util.getLegacyRootDomain('http://localhost:8080'), 'localhost');
});

it('returns the correct rootDomain from an URL object', () => {
assert.equal(Util.getRootDomain(new URL('https://www.example.com/index.html')), 'example.com');
assert.equal(Util.getRootDomain(new URL('https://example.com')), 'example.com');
assert.equal(Util.getRootDomain(new URL('https://www.example.co.uk')), 'example.co.uk');
assert.equal(Util.getRootDomain(new URL('https://example.com.br/app/')), 'example.com.br');
assert.equal(Util.getRootDomain(new URL('https://example.tokyo.jp')), 'tokyo.jp');
assert.equal(Util.getRootDomain(new URL('https://sub.example.com')), 'example.com');
assert.equal(Util.getRootDomain(new URL('https://sub.example.tokyo.jp')), 'tokyo.jp');
assert.equal(Util.getRootDomain(new URL('http://localhost')), 'localhost');
assert.equal(Util.getRootDomain(new URL('http://localhost:8080')), 'localhost');
assert.equal(Util.getLegacyRootDomain(new URL('https://www.example.com/index.html')), 'example.com');
assert.equal(Util.getLegacyRootDomain(new URL('https://example.com')), 'example.com');
assert.equal(Util.getLegacyRootDomain(new URL('https://www.example.co.uk')), 'example.co.uk');
assert.equal(Util.getLegacyRootDomain(new URL('https://example.com.br/app/')), 'example.com.br');
assert.equal(Util.getLegacyRootDomain(new URL('https://example.tokyo.jp')), 'tokyo.jp');
assert.equal(Util.getLegacyRootDomain(new URL('https://sub.example.com')), 'example.com');
assert.equal(Util.getLegacyRootDomain(new URL('https://sub.example.tokyo.jp')), 'tokyo.jp');
assert.equal(Util.getLegacyRootDomain(new URL('http://localhost')), 'localhost');
assert.equal(Util.getLegacyRootDomain(new URL('http://localhost:8080')), 'localhost');
});
});

Expand Down
23 changes: 20 additions & 3 deletions shared/util.js
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,21 @@ class Util {
return details;
}

/**
* Given the entity classification dataset and a URL, identify the entity.
* @param {string} url
* @param {LH.Result.Entities=} entities
* @return {LH.Result.LhrEntity|string}
*/
static getEntityFromUrl(url, entities) {
if (!entities) {
return Util.getLegacyRootDomain(url);
}

const entity = entities.find(e => e.origins.find(origin => url.startsWith(origin)));
return entity || Util.getLegacyRootDomain(url);
}

/**
* Split a string by markdown code spans (enclosed in `backticks`), splitting
* into segments that were enclosed in backticks (marked as `isCode === true`)
Expand Down Expand Up @@ -292,11 +307,12 @@ class Util {

/**
* Gets the tld of a domain
* This function is only used while rendering pre-10.0 LHRs.
*
* @param {string} hostname
* @return {string} tld
*/
static getTld(hostname) {
static getLegacyTld(hostname) {
const tlds = hostname.split('.').slice(-2);

if (!listOfTlds.includes(tlds[0])) {
Expand All @@ -308,12 +324,13 @@ class Util {

/**
* Returns a primary domain for provided hostname (e.g. www.example.com -> example.com).
* This function is only used while rendering pre-10.0 LHRs.
* @param {string|URL} url hostname or URL object
* @return {string}
*/
static getRootDomain(url) {
static getLegacyRootDomain(url) {
const hostname = Util.createOrReturnURL(url).hostname;
const tld = Util.getTld(hostname);
const tld = Util.getLegacyTld(hostname);

// tld is .com or .co.uk, which means that length is 1 too big
// .com => 2 & .co.uk => 3
Expand Down
12 changes: 12 additions & 0 deletions yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -6938,6 +6938,18 @@ through@2, "through@>=2.2.7 <3", through@^2.3.8:
resolved "https://registry.yarnpkg.com/through/-/through-2.3.8.tgz#0dd4c9ffaabc357960b1b724115d7e0e86a2e1f5"
integrity sha1-DdTJ/6q8NXlgsbckEV1+Doai4fU=

tldts-core@^6.0.22:
version "6.0.22"
resolved "https://registry.yarnpkg.com/tldts-core/-/tldts-core-6.0.22.tgz#1f4d43eb75f1f2e89e488776128abd7b3bd3f1b6"
integrity sha512-5m5+f69JzLj+QP+5DVgBv0fKjAE0zJaU8kBWx6dN+Tm9cm+OHNDIVNf2dmy3WL+ujECROIPJZHNAr+74hm8ujA==

tldts@^6.0.22:
version "6.0.22"
resolved "https://registry.yarnpkg.com/tldts/-/tldts-6.0.22.tgz#9a2833b196ebb6704085b0cd07fdfc205eb4d3bd"
integrity sha512-dBxlzF/sbr8DBCI6To3gMUzTgoz7P8qrnZsfF+nYGkjEfcPaOUkwtJMjLzde4dN7xyjDLMIS5+uxChhYaFzRKw==
dependencies:
tldts-core "^6.0.22"

tmp@^0.2.1:
version "0.2.1"
resolved "https://registry.yarnpkg.com/tmp/-/tmp-0.2.1.tgz#8457fc3037dcf4719c251367a1af6500ee1ccf14"
Expand Down

0 comments on commit f43fd32

Please sign in to comment.