diff --git a/CHANGELOG.md b/CHANGELOG.md index c74611f..4195d7e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,12 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] -Implemented this release automation: https://superface.ai/blog/npm-publish-gh-actions-changelog +## [0.4.0] - 2023-08-23 ### Added -- GitHub action to publish new versions to npm. -- Automatically move unreleased changed to a new release section in the changelog. +- Middleware to recognise organic search and social traffic. ## [0.3.2] - 2023-08-02 diff --git a/README.md b/README.md index e07c14b..6dd5cfb 100644 --- a/README.md +++ b/README.md @@ -150,6 +150,43 @@ A few middlewares have been provided: Please see the source of these middlewares for further details on their behavior. +### Mapping referrals from search engines and social networks + +Two middlewares have been provided to automatically turn referrals from well-known search engines or social networks +into organic search or organic social traffic, respectively. 
+ +For example: +```javascript +import { InteractionLogger, ReferralMapper } from '@jeroen.bakker/just-attribute'; + +const logger = new InteractionLogger(); + +// By not passing in a list of search engines or social networks we use the default lists +logger.registerInteractionMiddleware(ReferralMapper.newSearchEngineMiddleware()); +logger.registerInteractionMiddleware(ReferralMapper.newSocialNetworkMiddleware()); + +// Usually you would just call logger.pageview() +// but calling processInteraction() directly makes it easier to demonstrate the transformation +logger.processInteraction({source: 'www.google.com', medium: 'referral'}); +logger.lastInteraction(); // Returns {source: 'Google', medium: 'organic'} + +logger.processInteraction({source: 'www.reddit.com', medium: 'referral'}); +logger.lastInteraction(); // Returns {source: 'reddit', medium: 'social'} +``` + +This is done by comparing referring domains to a small list of large [search engines](data/search-engines-basic.json) +and [social networks](data/social-networks-basic.json). +These lists have been manually compiled by using the [searchengine-and-social-list](https://github.com/matomo-org/searchengine-and-social-list) +from Matomo as a starting point. + +The compiled lists have been kept intentionally small so that they don't add too much overhead +if you decide to bundle them with the logger. +But if you want to save even more on data or need to include a larger list you could also run the middlewares +on the interaction log, server-side. + +If you notice any obvious services missing from these lists feel free to open an issue or pull request. +Additionally, you can pass in your own list if you need more niche or regional services to be recognised. 
+ ### Asynchronous attribution This package is split into two main components: @@ -169,8 +206,6 @@ This is purely intended to be used on the web, mobile apps have not been taken i There is currently no planned support for tracking redeemed discount codes or other promotions which could be used to attribute orders. Planned: -- Add out of the box implementation for recognising organic search based on a list of domains -- Add out of the box implementation for recognising organic social media based on a list of domains - Add out of the box implementation for running attribution models in BigQuery using javascript UDFs - Describe how to contribute - Add a code style linter/fixer to make contributing easier @@ -178,6 +213,6 @@ Planned: - Figure out browser support (should be pretty good if you don't need to support IE or Opera mini) Undecided: -- Whether to log the page URL as part of the interaction, this would allow users to get information about landing pages and how they perform. +- Whether to log the page URL as part of the interaction, this would allow you to get information about landing pages and how they perform. This is not intended as a full analytics tool, but this would take almost no effort to add and could provide a lot of value. This can always be implemented as a middleware, but if it adds enough value it might make more sense to just enable it out of the box. 
diff --git a/data/search-engines-basic.json b/data/search-engines-basic.json new file mode 100644 index 0000000..9088fff --- /dev/null +++ b/data/search-engines-basic.json @@ -0,0 +1,9 @@ +{ + "AOL": ["aol.com"], + "Baidu": ["baidu.com"], + "Bing": ["bing.com"], + "DuckDuckGo": ["duckduckgo.com"], + "Google": ["google.*", "google.co.*", "google.com.*"], + "Yahoo": ["*.yahoo.*"], + "Yandex": ["yandex.com"] +} diff --git a/data/social-networks-basic.json b/data/social-networks-basic.json new file mode 100644 index 0000000..e7bb1d7 --- /dev/null +++ b/data/social-networks-basic.json @@ -0,0 +1,15 @@ +{ + "Facebook": ["facebook.com", "fb.me", "m.facebook.com", "l.facebook.com"], + "Hacker News": ["news.ycombinator.com"], + "Instagram": ["instagram.com", "l.instagram.com"], + "LinkedIn": ["linkedin.com", "lnkd.in"], + "Pinterest": ["pinterest.*", "pinterest.co.*", "pinterest.com.*"], + "reddit": ["reddit.com", "*.reddit.com"], + "Snapchat": ["snapchat.com"], + "TikTok": ["tiktok.com"], + "tumblr": ["tumblr.com", "t.umblr.com"], + "Twitter": ["twitter.com", "t.co", "x.com"], + "YouTube": ["youtube.com", "youtu.be"], + "Vimeo": ["vimeo.com"], + "Weibo": ["weibo.com"] +} \ No newline at end of file diff --git a/package.json b/package.json index ffa197d..3d745dd 100644 --- a/package.json +++ b/package.json @@ -5,10 +5,11 @@ "author": "Jeroen Bakker", "license": "MIT", "scripts": { - "build": "esbuild src/*.ts src/*/*.ts --outdir=dist --bundle --format=esm --minify", + "build": "npm run esbuild && npm run tsc", + "esbuild": "esbuild src/*.ts src/*/*.ts --outdir=dist --bundle --format=esm --minify", "tsc": "tsc", "test": "jest", - "prepublishOnly": "npm run build && npm run tsc" + "prepublishOnly": "npm run build" }, "type": "module", "main": "dist/index.js", diff --git a/src/InteractionMiddlewares/ReferralMapper.ts b/src/InteractionMiddlewares/ReferralMapper.ts new file mode 100644 index 0000000..4b84ad5 --- /dev/null +++ b/src/InteractionMiddlewares/ReferralMapper.ts @@ 
-0,0 +1,59 @@
+import { InteractionMiddleware, Interaction } from '../types';
+import { domainToService, ServiceDatabase } from '../domainToService';
+
+/**
+ * Provides middlewares that turn referrals from known services into a more specific medium.
+ *
+ * Note: this requires the automatic referral detection to be enabled.
+ */
+export default class ReferralMapper {
+    /**
+     * Turns referrals from known domains into organic search traffic.
+     *
+     * @param searchEngines Search engine names mapped to their domains, defaults to data/search-engines-basic.json.
+     */
+    public static newSearchEngineMiddleware(searchEngines?: ServiceDatabase): InteractionMiddleware {
+        searchEngines ??= require('../../data/search-engines-basic.json') as ServiceDatabase;
+
+        return ReferralMapper.newReferralMapper('organic', searchEngines);
+    }
+
+    /**
+     * Turns referrals from known domains into organic social traffic.
+     *
+     * @param socialNetworks Social network names mapped to their domains, defaults to data/social-networks-basic.json.
+     */
+    public static newSocialNetworkMiddleware(socialNetworks?: ServiceDatabase): InteractionMiddleware {
+        socialNetworks ??= require('../../data/social-networks-basic.json') as ServiceDatabase;
+
+        return ReferralMapper.newReferralMapper('social', socialNetworks);
+    }
+
+    /**
+     * Creates a middleware that rewrites referrals coming from one of the given services:
+     * the source becomes the name of the service and the medium becomes mappedMedium.
+     */
+    private static newReferralMapper(mappedMedium: string, services: ServiceDatabase): InteractionMiddleware {
+        return (interaction: Interaction): Interaction => {
+            // If it is not a referral nothing needs to be done
+            if (interaction.medium !== 'referral' || ! interaction.source) {
+                return interaction;
+            }
+
+            // The automatic referral detection puts the referrer domain as the source,
+            // so we compare the source to the domains of the network
+            const sourceDomain = interaction.source.toLowerCase();
+            const serviceName = domainToService(sourceDomain, services);
+
+            if (serviceName) {
+                return {
+                    ...interaction,
+                    source: serviceName,
+                    medium: mappedMedium,
+                };
+            }
+
+            return interaction;
+        };
+    }
+}
diff --git a/src/domainToService.ts b/src/domainToService.ts
new file mode 100644
index 0000000..34d4f12
--- /dev/null
+++ b/src/domainToService.ts
@@ -0,0 +1,43 @@
+export type ServiceDatabase = Record<string, string[]>;
+
+/**
+ * Attempts to match a domain to a service in the ServiceDatabase.
+ * Supports wildcard lookup and will ignore www subdomains.
+ *
+ * @param domain The domain to look up.
+ * @param services Service names mapped to the domain patterns that belong to them.
+ * @returns The name of the first service with a matching domain pattern, or null if nothing matches.
+ */
+export function domainToService(domain: string, services: ServiceDatabase): string | null {
+    // strip the www subdomain if it's at the start, so we can ignore it
+    const lookupDomain = domain.replace(/^www\./i, '');
+
+    for (const [serviceName, serviceDomains] of Object.entries(services)) {
+        // Skip anything that is not a list of domains,
+        // a plain string would otherwise be iterated character by character
+        if (! Array.isArray(serviceDomains)) {
+            continue;
+        }
+
+        for (const serviceDomain of serviceDomains) {
+            const domainPattern = serviceDomain
+                // Strip any leading www subdomain from the service domain because we also removed it from the lookup domain
+                .replace(/^www\./i, '')
+                // Escape . to not accidentally match any character when using the domain as a regex
+                .replace(/\./g, '\\.')
+                // The * wildcard maps to a pattern allowing any characters valid in a domain (not .)
+                // which is a naive subdomain regex but works well enough for most cases
+                // Replace *. by an optional subdomain, meaning it also matches a missing subdomain
+                // The . has to be escaped twice here, '\.' in a string literal is just '.' which would match any character
+                .replace(/\*\\\./g, '([\\w\\d-]+\\.)?')
+                // Replace * by any characters valid in a domain (not .)
+                .replace(/\*/g, '[\\w\\d-]+');
+
+            if (new RegExp(`^${domainPattern}$`, 'i').test(lookupDomain)) {
+                return serviceName;
+            }
+        }
+    }
+
+    return null;
+}
diff --git a/src/index.ts b/src/index.ts
index bb699af..0e93594 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -3,6 +3,7 @@ export { default as distributeValue } from './distributeValue';
 export { default as googleAds } from './InteractionMiddlewares/GoogleAds';
 export { default as facebookAds } from './InteractionMiddlewares/FacebookAds';
 export { default as ref } from './InteractionMiddlewares/Ref';
+export { default as ReferralMapper } from './InteractionMiddlewares/ReferralMapper';
 export { default as lastInteraction } from './AttributionModels/LastInteraction';
 export { default as firstInteraction } from './AttributionModels/FirstInteraction';
 export { default as lastNonDirectInteraction } from './AttributionModels/LastNonDirectInteraction';
diff --git a/tests/InteractionMiddlewares/ReferralMapper.test.ts b/tests/InteractionMiddlewares/ReferralMapper.test.ts
new file mode 100644
index 0000000..bcb6613
--- /dev/null
+++ b/tests/InteractionMiddlewares/ReferralMapper.test.ts
@@ -0,0 +1,99 @@
+import {expect, test} from '@jest/globals';
+import ReferralMapper from '../../src/InteractionMiddlewares/ReferralMapper';
+
+test('it ignores non-referral interactions', async () => {
+    const searchEngines = {google: ['www.google.com']};
+    const searchEngineMiddleware = ReferralMapper.newSearchEngineMiddleware(searchEngines);
+
+    const initialInteraction = {medium: 'foo', source: 'www.google.com'};
+
+    expect(searchEngineMiddleware(initialInteraction)).toEqual(initialInteraction);
+});
+
+test('it returns initial interaction on no match', async () => {
+    const searchEngines = {google: ['www.google.com'], bing: ['www.bing.com']};
+    const searchEngineMiddleware = ReferralMapper.newSearchEngineMiddleware(searchEngines);
+
+    const initialInteraction = {medium: 'referral', source: 'www.reddit.com'};
+
+    expect(searchEngineMiddleware(initialInteraction)).toEqual(initialInteraction);
+});
+
+test('it attributes search engine referrals to organic search', async () => {
+    const searchEngines = {google: ['www.google.com']};
+    const searchEngineMiddleware = ReferralMapper.newSearchEngineMiddleware(searchEngines);
+
+    const initialInteraction = {medium: 'referral', source: 'www.google.com'};
+    const expectedInteraction = {medium: 'organic', source: 'google'};
+
+    expect(searchEngineMiddleware(initialInteraction)).toEqual(expectedInteraction);
+});
+
+test('it attributes social network referrals to social', async () => {
+    const socialNetworks = {facebook: ['www.facebook.com']};
+    const socialNetworkMiddleware = ReferralMapper.newSocialNetworkMiddleware(socialNetworks);
+
+    const initialInteraction = {medium: 'referral', source: 'www.facebook.com'};
+    const expectedInteraction = {medium: 'social', source: 'facebook'};
+
+    expect(socialNetworkMiddleware(initialInteraction)).toEqual(expectedInteraction);
+});
+
+
+test.each([
+    ['www.aol.com', 'AOL'],
+    ['www.baidu.com', 'Baidu'],
+    ['www.bing.com', 'Bing'],
+    ['duckduckgo.com', 'DuckDuckGo'],
+    ['google.com', 'Google'],
+    ['www.google.com', 'Google'],
+    ['www.google.nl', 'Google'],
+    ['www.google.co.uk', 'Google'],
+    ['www.google.com.au', 'Google'],
+    ['www.yahoo.com', 'Yahoo'],
+    ['yandex.com', 'Yandex'],
+])('it uses the basic list of search engines by default', async (domain, expectedService) => {
+    const searchEngineMiddleware = ReferralMapper.newSearchEngineMiddleware();
+
+    const initialInteraction = {medium: 'referral', source: domain};
+    const expectedInteraction = {medium: 'organic', source: expectedService};
+
+    expect(searchEngineMiddleware(initialInteraction)).toEqual(expectedInteraction);
+});
+
+test.each([
+    ['www.facebook.com', 'Facebook'],
+    ['fb.me', 'Facebook'],
+    ['m.facebook.com', 'Facebook'],
+    ['l.facebook.com', 'Facebook'],
+    ['news.ycombinator.com', 'Hacker News'],
+    ['www.instagram.com', 'Instagram'],
+    ['l.instagram.com', 'Instagram'],
+    ['www.linkedin.com', 'LinkedIn'],
+    ['lnkd.in', 'LinkedIn'],
+    ['www.pinterest.com', 'Pinterest'],
+    ['www.pinterest.nl', 'Pinterest'],
+    ['www.pinterest.co.uk', 'Pinterest'],
+    ['www.pinterest.com.au', 'Pinterest'],
+    ['www.reddit.com', 'reddit'],
+    ['old.reddit.com', 'reddit'],
+    ['np.reddit.com', 'reddit'],
+    ['www.snapchat.com', 'Snapchat'],
+    ['www.tiktok.com', 'TikTok'],
+    ['www.tumblr.com', 'tumblr'],
+    ['t.umblr.com', 'tumblr'],
+    ['twitter.com', 'Twitter'],
+    ['t.co', 'Twitter'],
+    ['x.com', 'Twitter'],
+    ['www.youtube.com', 'YouTube'],
+    ['youtu.be', 'YouTube'],
+    ['vimeo.com', 'Vimeo'],
+    ['weibo.com', 'Weibo'],
+])('it uses the basic list of social networks by default', async (domain, expectedService) => {
+    const socialNetworkMiddleware = ReferralMapper.newSocialNetworkMiddleware();
+
+    const initialInteraction = {medium: 'referral', source: domain};
+    const expectedInteraction = {medium: 'social', source: expectedService};
+
+    expect(socialNetworkMiddleware(initialInteraction)).toEqual(expectedInteraction);
+});
\ No newline at end of file
diff --git a/tests/domainToService.test.ts b/tests/domainToService.test.ts
new file mode 100644
index 0000000..11d2226
--- /dev/null
+++ b/tests/domainToService.test.ts
@@ -0,0 +1,95 @@
+import { expect, test } from '@jest/globals';
+import {domainToService} from "../src/domainToService";
+
+test('it matches service', async () => {
+    const services = {google: ['www.google.com']};
+
+    expect(domainToService('www.google.com', services)).toEqual('google');
+});
+
+test('it ignores www subdomain', async () => {
+    const services = {google: ['google.com']};
+
+    expect(domainToService('www.google.com', services)).toEqual('google');
+});
+
+test('it allows www subdomain', async () => {
+    // const services = {google: ['www.google.*']};
+    const services = {"google": ["google.*", "google.co.*", "google.com.*"],};
+
+    expect(domainToService('www.google.com', services)).toEqual('google');
+});
+
+test('it ignores capitals', async () => {
+    const services = {google: ['www.GOOGLE.com']};
+
+    expect(domainToService('www.Google.com', services)).toEqual('google');
+});
+
+test('it skips incorrect domain lists', async () => {
+    const services = {google: 'www.google.com', test: ['www.google.com']};
+
+    // @ts-ignore
+    expect(domainToService('www.google.com', services)).toEqual('test');
+});
+
+test.each([
+    [
+        {google: ['www.google.*']},
+        'www.google.nl',
+        'google',
+    ],
+    [
+        {google: ['*.google.com']},
+        'www.google.com',
+        'google',
+    ],
+    [
+        {google: ['*.google.com']},
+        'mail.google.com',
+        'google',
+    ],
+    [
+        {google: ['*.google.*']},
+        'www.google.de',
+        'google',
+    ],
+    [
+        // We need special cases to handle the more common two part eTLDs
+        {google: ['www.google.*', '*.google.com', '*.google.*', '*.google.co.*', '*.google.com.*']},
+        'www.google.co.uk',
+        'google',
+    ],
+    [
+        // We need special cases to handle the more common two part eTLDs
+        {google: ['www.google.*', '*.google.com', '*.google.*', '*.google.co.*', '*.google.com.*']},
+        'www.google.com.au',
+        'google',
+    ],
+])('it supports wildcards in the domains', async (services, domain, expectedServiceName) => {
+    expect(domainToService(domain, services)).toEqual(expectedServiceName);
+});
+
+test('it returns null on no match', async () => {
+    const services = {google: ['www.google.com']};
+
+    expect(domainToService('www.example.com', services)).toBeNull();
+});
+
+/**
+ * . is a wildcard in regex, and we use regex to compare domains
+ */
+test('it does not use the . in domain names as wildcards', async () => {
+    const services = {google: ['www.google.com']};
+
+    expect(domainToService('wwwxgoogle.com', services)).toBeNull();
+});
+
+/**
+ * The optional subdomain generated for a leading *. has to end in a literal dot
+ */
+test('it does not match lookalike domains for subdomain wildcards', async () => {
+    const services = {google: ['*.google.com']};
+
+    expect(domainToService('notgoogle.com', services)).toBeNull();
+});