From 0f09b4a352e7928f0361d345f284509dbdcd3117 Mon Sep 17 00:00:00 2001 From: Francis McKenzie Date: Mon, 19 Aug 2024 08:13:41 +0800 Subject: [PATCH 1/7] Add Public Suffix API proposal --- proposals/public-suffix.md | 227 +++++++++++++++++++++++++++++++++++++ 1 file changed, 227 insertions(+) create mode 100644 proposals/public-suffix.md diff --git a/proposals/public-suffix.md b/proposals/public-suffix.md new file mode 100644 index 00000000..0cf19300 --- /dev/null +++ b/proposals/public-suffix.md @@ -0,0 +1,227 @@ +# Proposal: Public Suffix API + +**Summary** + +API to obtain the topmost *registrable domain / eTLD+1 (effective Top Level Domain+1)* +from a domain name or URL. + +**Document Metadata** + +**Author:** [Francis McKenzie](https://github.com/mckenfra) + +**Sponsoring Browser:** Mozilla Firefox + +**Contributors:** N/A + +**Created:** 2024-08-19 + +**Related Issues:** [#231](https://github.com/w3c/webextensions/issues/231) + +## Motivation + +### Objective + +This API enables developers to obtain the topmost *registrable domain / eTLD+1* +from a domain name or URL. This functionality is already implemented for internal +use by all the major browsers. Therefore the effect of this API is to expose +existing built-in browser functionality to extensions developers. + +The primary objective of this API is to eliminate the possibility of inconsistencies +between the host browser and hosted extensions when deriving topmost +*registrable domain / eTLD+1*s from domain names / URLs. + +Secondary objectives of this API are to: + +1. Improve extension developer experience by reducing complexity and maintenance overhead, +since developers will no longer need to roll their own solutions for obtaining and parsing +the [Public Suffix List (PSL)](https://publicsuffix.org/list/). +2. Reduce extensions' resource usage (CPU, memory, disk space), since extensions +will no longer be duplicating work already done by the host browser. 
+ +#### Use Case #1: Wildcard Domains + +This API is relevant to any extension that gives the user control over which +domain names / URLs the extension's functionality should apply to. For example, +Mozilla's [multi-account-containers](https://github.com/mozilla/multi-account-containers) +extension allows users to create containers for web pages as they visit them. + +Users may want the ability to apply the extension's functionality to a +*wildcard domain*, such that all subdomains of some base domain are +automatically included. + +This API allows the extension to automatically propose the topmost *registrable domain / eTLD+1* +as a possible *wildcard domain* for the user to choose. It also allows the +extension to prevent the user from accidentally choosing a *public suffix / eTLD* +as a wildcard domain, e.g. `*.com`. + +#### Use Case #2: Grouping Domains in UI + +Where extensions present lists of domain names / URLs to users, it can be beneficial +from a UX perspective to group them by their topmost *registrable domain / eTLD+1*s. + +##### Ungrouped domains + +| | +| --------------------- | +| example.co.uk | +| example2.com | +| foo.bar.example.co.uk | +| foo.bar.example2.com | +| www.example.co.uk | +| www.example2.com | + +##### Grouped domains + +| example.co.uk | | +| -------------- | --------------------- | +| | example.co.uk | +| | foo.bar.example.co.uk | +| | www.example.co.uk | + +| example2.com | | +| -------------- | --------------------- | +| | example2.com | +| | foo.bar.example2.com | +| | www.example2.com | + +### Known Consumers + +Mozilla intends to make use of this API in its [multi-account-containers](https://github.com/mozilla/multi-account-containers) extension to allow users to create containers +that include wildcard domain names: [PR #2352](https://github.com/mozilla/multi-account-containers/pull/2352). 
Currently, users must manually curate their containers to capture all +anticipated subdomains of a base domain, and this manual approach is cumbersome and +error-prone. + +Apart from this, any other extension that already rolls its own solution in order to parse +the PSL could potentially benefit from this API, for example [uBlock Origin](https://github.com/gorhill/uBlock). + +## Specification + +### Schema + +A new API `publicSuffix` is added as follows: + +```ts +// +// Example: +// +// let domain = await browser.publicSuffix.getRegistrableDomain("www.example.co.uk"); +// ==> 'example.co.uk' +// +Promise browser.publicSuffix.getRegistrableDomain( + // The domain name or URL whose registrable domain we want to find + domainNameOrUrl: string, +) +``` + +### Behavior #1: Returned Promise + +The promise returned by `getRegistrableDomain()` method will either: + +1. Resolve with the topmost *registrable domain / eTLD+1* if it can be determined from +the `domainNameOrUrl`. +2. Reject with an `Error` / populate `browser.runtime.lastError`. + +### Behavior #2: Private Domains + +By default, the lookup performed by `getRegistrableDomain()` should **exclude** private domains +contained in the PSL dataset. + +See [Future work](#1-extend-the-api). + +### New Permissions + +| Permission Added | Suggested Warning | +| ---------------- | ----------------- | +| publicSuffix | Read the public suffix list | + +### Manifest File Changes + +There are no changes to the manifest. + +## Security and Privacy + +### Exposed Sensitive Data + +The only data exposed by this API is the [public suffix list](https://publicsuffix.org/list/). + +### Abuse Mitigations + +This does not expose any new non-public data so there are no new abuse vectors. 
+ +### Additional Security Considerations + +N/A + +## Alternatives + +### Existing Workarounds + +Developers can download the PSL dataset, bundle it with their extensions, and implement +logic that parses and interprets the dataset in order to determine the topmost +*registrable domain / eTLD+1* for a domain name. There are several drawbacks to +this approach: + +1. Potential for inconsistencies in the determination of topmost *registrable domain / eTLD+1* +by the extension and the host browser, due to differences in the version +of the PSL dataset used, and differences in the implementations that the host +browser and the extension use in order to interpret this dataset. +2. Increased complexity and maintenance costs of extensions. +3. Increased performance overhead of extensions due to bundling of bulky +PSL dataset, leading to increase in memory, disk usage and increase in CPU usage +due to possibly suboptimal extension implementations and repeating work already +done by the host browser. + +### Open Web API + +The purpose of this API is to eliminate the potential for inconsistency between +the host browser and its hosted extensions. The simplest way of achieving this +is for extensions to access this functionality via the host browser itself rather +than via some external source, such as an Open Web API. + +It is then a determination for the host browser itself as to whether +the functionality (used by both the host browser and its extensions) +should ultimately be obtained by means of an Open Web API. 
+ +## Implementation Notes + +Since the major browsers all already implement internal methods for determining +topmost *registrable domain / eTLD+1*s, it is hoped that the implementation will +involve little more than providing the relevant mechanism for exposing these same methods +to extensions: + +| Browser | Registrable Domain Method | +| ------- | ------------------------- | +| Chrome | [GetDomainAndRegistry](https://source.chromium.org/chromium/chromium/src/+/main:net/base/registry_controlled_domains/registry_controlled_domain.h;l=182;drc=4f516be19f2c1d35dc3240d050d84d10f0d6f726) | +| Firefox | [getBaseDomain](https://searchfox.org/mozilla-central/rev/e9f9bf31d1c0057a1cd339b5a853a75d1b16db39/netwerk/dns/nsIEffectiveTLDService.idl#94) | +| Safari | [topPrivatelyControlledDomain](https://github.com/WebKit/WebKit/blob/01eba7c416725cfd4eec57ab16daffa25b8124b4/Source/WebCore/platform/PublicSuffixStore.h#L43) | + +## Future Work + +### 1. Extend the API + +The major browsers provide additional methods/parameters internally for getting +information related to the *registable domain / eTLD+1*. The API could be extended +to expose more of these internal methods/parameters, for example: + +1. Provide an option to include private domains when getting the *registrable domain / eTLD+1*. +2. Provide method `getPublicSuffix()` to get the *public suffix / eTLD*. +3. Provide methods `isRegistrableDomain()` and/or `isPublicSuffix()` for possibly improved +efficiency in certain use cases. + +### 2. Batching + +Extensions may want to obtain *registrable domain / eTLD+1*s for large numbers of +domain names / URLs at once. A possible enhancement to the API would be to +provide a method that performed a lookup on multiple domain names / URLs with a +single method call. + +### 3. Change Notifications + +The PSL dataset, used by the browsers to determine *registrable domain / eTLD+1*s, +is a dynamic dataset that can change at any time. 
This API proposal currently provides no +mechanism for notifying extensions when the host browser's PSL dataset changes. +It is understood that such changes are only made currently when a new browser version +is released, however this may not always be the case. + +It may be useful to implement a notification mechanism so that extensions can take +appropriate action when the host browser's PSL dataset changes. From 3301526fcb150706aa59e9f20839d646e4920649 Mon Sep 17 00:00:00 2001 From: Francis McKenzie Date: Fri, 6 Sep 2024 11:06:18 +0800 Subject: [PATCH 2/7] Update Public Suffix API proposal --- proposals/public-suffix.md | 506 ++++++++++++++++++++++++++++++------- 1 file changed, 417 insertions(+), 89 deletions(-) diff --git a/proposals/public-suffix.md b/proposals/public-suffix.md index 0cf19300..cdd39c19 100644 --- a/proposals/public-suffix.md +++ b/proposals/public-suffix.md @@ -2,8 +2,8 @@ **Summary** -API to obtain the topmost *registrable domain / eTLD+1 (effective Top Level Domain+1)* -from a domain name or URL. +API to obtain the *registrable domain / eTLD+1 (effective Top Level Domain+1)* +from a domain name. **Document Metadata** @@ -21,14 +21,14 @@ from a domain name or URL. ### Objective -This API enables developers to obtain the topmost *registrable domain / eTLD+1* -from a domain name or URL. This functionality is already implemented for internal +This API enables developers to obtain the *registrable domain / eTLD+1* +from a domain name. This functionality is already implemented for internal use by all the major browsers. Therefore the effect of this API is to expose existing built-in browser functionality to extensions developers. The primary objective of this API is to eliminate the possibility of inconsistencies -between the host browser and hosted extensions when deriving topmost -*registrable domain / eTLD+1*s from domain names / URLs. 
+between the host browser and hosted extensions when deriving +*registrable domain / eTLD+1*s from domain names. Secondary objectives of this API are to: @@ -38,61 +38,195 @@ the [Public Suffix List (PSL)](https://publicsuffix.org/list/). 2. Reduce extensions' resource usage (CPU, memory, disk space), since extensions will no longer be duplicating work already done by the host browser. -#### Use Case #1: Wildcard Domains +### Features of the PSL -This API is relevant to any extension that gives the user control over which -domain names / URLs the extension's functionality should apply to. For example, -Mozilla's [multi-account-containers](https://github.com/mozilla/multi-account-containers) -extension allows users to create containers for web pages as they visit them. +In order to set out the use cases fully, it is first necessary to describe certain +relevant features of the PSL. -Users may want the ability to apply the extension's functionality to a -*wildcard domain*, such that all subdomains of some base domain are -automatically included. +#### 1. ICANN vs Private -This API allows the extension to automatically propose the topmost *registrable domain / eTLD+1* -as a possible *wildcard domain* for the user to choose. It also allows the -extension to prevent the user from accidentally choosing a *public suffix / eTLD* -as a wildcard domain, e.g. `*.com`. +The PSL is divided into two sections: ICANN suffixes and Private (i.e. non-ICANN) +suffixes. -#### Use Case #2: Grouping Domains in UI +##### Examples -Where extensions present lists of domain names / URLs to users, it can be beneficial -from a UX perspective to group them by their topmost *registrable domain / eTLD+1*s. 
+| ICANN Suffixes | Private Suffixes (Non-ICANN) | +|-------------------------------|------------------------------ | +| com | blogspot.com | +| | members.linode.com | +| | simplesite.com | +| | | +| co.uk | blogspot.co.uk | +| | wellbeingzone.co.uk | +| | vm.bytemark.co.uk | -##### Ungrouped domains +The suffixes in the PSL represent boundaries between registrar-type organizations +and their clients. The "owner" of the suffix is one organization, and the "owner" +of a domain label immediately preceding the suffix is a different organization. -| | -| --------------------- | -| example.co.uk | -| example2.com | -| foo.bar.example.co.uk | -| foo.bar.example2.com | -| www.example.co.uk | -| www.example2.com | +In the case of suffixes in the ICANN section, the "owner" of a suffix is ICANN / +an affiliated registrar. In the case of suffixes in the Private section, the +"owner" of a suffix is a private organization offering a service for clients +to take ownership of subdomains underneath its own ICANN-suffixed domain. -##### Grouped domains +#### 2. Multiple Suffixes per Domain -| example.co.uk | | -| -------------- | --------------------- | -| | example.co.uk | -| | foo.bar.example.co.uk | -| | www.example.co.uk | +If a domain name ends with a suffix listed in the Private section of the PSL, +then it must also end with a shorter suffix in the ICANN section. The question +of which one should be taken to be the domain's public suffix depends on the +specific use case. -| example2.com | | -| -------------- | --------------------- | -| | example2.com | -| | foo.bar.example2.com | -| | www.example2.com | +##### Example -### Known Consumers +| Domain name | Private suffix | ICANN suffix | +|:-------------------:|:---------------:|:------------:| +| foo.bar.wixsite.com | wixsite.com | com | + +#### 3. Known vs Unknown + +If a domain name does not match any suffix in the PSL, it is considered to have +an unknown suffix. 
Depending on the use case, the public suffix could be taken +by default to be the last domain label of the domain name, or alternatively +the domain name could be considered invalid. + +**Note:** it may be more performant to allow unknown suffixes and assume a single-label +suffix by default, because it allows the following optimisation of the lookup algorithm: +all single-label suffixes in the PSL can be excluded from the lookup, since they do not +need to be matched specifically. + +#### 4. IDN + +The PSL uses Unicode, and contains International Domain Name (IDN) suffixes. Depending +on the use case, when returning the *registrable domain / eTLD+1* for a domain name, +either Unicode or Punycode may be preferred. + +**Note:** the [PSL algorithm](https://github.com/publicsuffix/list/wiki/Format#formal-algorithm) +requires Punycode for the matching logic. Therefore a requirement to convert Punycode +back to Unicode involves extra work. It may be preferable to avoid this in +performance-sensitive use cases. + +#### Use Cases + +#### 1. Filter Requests by Organization + +This API is relevant to any extension that does both of the following: + +* acts on web requests automatically while the user browses +* allows the user to choose which domain names the extension should act on, +and which should be ignored by the extension + +In effect, such extensions allow users to create request filtering rules that +restrict the extension to acting on only web requests having specific domain names. + +When choosing domain names for the request filter, users may want to be able to choose +all domain names for an organization without having to enter them each in individually. +The user may want to specify request filtering rules as follows: + +| # | Filtering rule type | User value | Examples of domains affected | +|:-:|---------------------|:----------:|-----------------------------:| +| 1 | organizational base domain and all possible subdomains | myorg.co.uk | myorg.co.uk
subdomain1.myorg.co.uk
subdomain2.myorg.co.uk | +| 2 | organization name and all possible suffixes | myorg | myorg.co.uk
myorg.com
myorg.net | -Mozilla intends to make use of this API in its [multi-account-containers](https://github.com/mozilla/multi-account-containers) extension to allow users to create containers -that include wildcard domain names: [PR #2352](https://github.com/mozilla/multi-account-containers/pull/2352). Currently, users must manually curate their containers to capture all -anticipated subdomains of a base domain, and this manual approach is cumbersome and -error-prone. +To support this use case, such extensions may: -Apart from this, any other extension that already rolls its own solution in order to parse -the PSL could potentially benefit from this API, for example [uBlock Origin](https://github.com/gorhill/uBlock). +* automatically calculate the set of organization names and *registrable domain / eTLD+1*s +from a user-specified set of domain names, and propose these as filtering rules for +the user to choose +* prevent users from mistakenly specifying a *public suffix / eTLD* as a filtering rule, +believing it to be a *registrable domain / eTLD+1*, e.g. `co.uk` instead of `myorg.co.uk` +* apply the users' filtering rules automatically while the user browses by determining +the *registrable domain / eTLD+1* of each web request, and testing this using appropriate +regexes to decide whether or not to act on the request + +##### PSL Requirements + +| PSL Feature | Requirement | Discussion | +|------------------------|-------------|------------| +| Allow Private Suffixes | Yes & No | some filters may require private suffixes, others ICANN-only | +| | | preventing user mistake may require both types of suffix | +| Allow Unknown Suffixes | Yes | provides better performance | +| Preserve IDN Unicode | Yes & No | Unicode when viewing filters in UI, Punycode when implementing filtering logic for better performance | + +#### 2. 
Group Domains in UI + +Where extensions present lists of domain names to users, it can be beneficial +from a UX perspective to group them by their *registrable domain / eTLD+1*s. + +##### Example + +| No grouping | ---------> | With grouping | | +|-----------------------|------------|-------------------|-----------------------| +| example.co.uk | | **example.co.uk** | | +| example2.com | | | example.co.uk | +| foo.bar.example.co.uk | | | foo.bar.example.co.uk | +| foo.bar.example2.com | | | www.example.co.uk | +| www.example.co.uk | | | | +| www.example2.com | | **example2.com** | | +| | | | example2.com | +| | | | foo.bar.example2.com | +| | | | www.example2.com | + +##### PSL Requirements + +| PSL Feature | Requirement | Discussion | +|------------------------|-------------|------------| +| Allow Private Suffixes | Yes & No | optimal grouping may require either private (more groups, fewer domains per group) or ICANN-only (fewer groups, more domains per group) | +| Allow Unknown Suffixes | Yes | grouping behaviour is the same regardless of whether suffixes are known/unknown | +| Preserve IDN Unicode | Yes | better UX, because users may be unfamiliar with Punycode | + +#### 3. Detect Third-Party Requests + +Some ad-blocker extensions allow users to create request-blocking rules that only +apply to *[third-party requests](https://help.adblockplus.org/hc/en-us/articles/360062733293-How-to-write-filters#party-requests)*. Additionally, tracker-blocking extensions +typically exclude requests from tracking detection unless they are third-party requests. + +In order to determine if a request is a third-party request, such extensions lookup +the *registrable domain / eTLD+1* of the request and of the parent document using the PSL. +If the *registrable domain / eTLD+1* of the request is different to that of the parent +document, then the request is considered third-party. + +##### Example of Determining if Requests are Third-Party + +| Req. 
| Domain | Registrable Domain | PSL Section | Third-party? | +|:----:|----------------------------:|----------------------------:|:-----------:|:------------:| +| #1 | foo.com | foo.com | ICANN | | +| #2 | bar.com | bar.com | ICANN | Yes | +| | | | | | +| #1 | foo.amazonaws.com | amazonaws.com | ICANN | | +| #2 | bar.amazonaws.com | amazonaws.com | ICANN | No | +| | | | | | +| #1 | foo.amazonaws.com | amazonaws.com | ICANN | | +| #2 | bar.us-east-1.amazonaws.com | bar.us-east-1.amazonaws.com | Private | Yes | + +##### PSL Requirements + +| PSL Feature | Requirement | Discussion | +|------------------------|-------------|------------| +| Allow Private Suffixes | Yes | including all suffixes in PSL means more information about third-party boundaries | +| Allow Unknown Suffixes | Yes | provides better performance | +| Preserve IDN Unicode | No | provides better performance | + +### Known Consumers + +Mozilla intends to make use of this API in its [multi-account-containers](https://addons.mozilla.org/en-US/firefox/addon/multi-account-containers/) +extension to allow users to create containers that isolate organizational base domains: +[PR #2352](https://github.com/mozilla/multi-account-containers/pull/2352). +Currently, users must manually curate their containers to capture all anticipated subdomains +of an organization's base domain, and this manual approach is cumbersome and error-prone. + +In addition to this, an analysis has been done of the relevant use cases of a selection of +other extensions that currently roll their own solutions in order to parse the PSL, and the +use cases are set out in the [Use Cases](#use-cases) section. Although the extension +authors have not been consulted directly, it is assumed that they would stand to benefit +from this API due to the reasons stated in the [Objective](#objective) section. 
+ +| Extension | Relevant use cases | Description | +|-----------|--------------------|-------------| +| [multi-account-containers](https://addons.mozilla.org/en-US/firefox/addon/multi-account-containers/) | #1, #2 | isolate requests into containers by organizational base domain, view all domains grouped by organization | +| [uBlock Origin](https://addons.mozilla.org/en-US/firefox/addon/ublock-origin/) | #1, #2, #3 | create organization/3rd-party request-blocking rules, view request blocking status by organization, create blocking rules for specifically 3rd-party requests | +| [adblock-plus](https://addons.mozilla.org/en-US/firefox/addon/adblock-plus/) | #1, #3 | create organization/3rd-party request-blocking rules | +| [duckduckgo-for-firefox](https://addons.mozilla.org/en-US/firefox/addon/duckduckgo-for-firefox/) | #3 | exclude requests from tracking detection unless they are 3rd-party | +| [violentmonkey](https://addons.mozilla.org/en-US/firefox/addon/violentmonkey/) | #1 | run scripts automatically for requests having a specific organization name plus any suffix | ## Specification @@ -101,38 +235,231 @@ the PSL could potentially benefit from this API, for example [uBlock Origin](htt A new API `publicSuffix` is added as follows: ```ts -// -// Example: -// -// let domain = await browser.publicSuffix.getRegistrableDomain("www.example.co.uk"); -// ==> 'example.co.uk' -// -Promise browser.publicSuffix.getRegistrableDomain( - // The domain name or URL whose registrable domain we want to find - domainNameOrUrl: string, -) +namespace publicSuffix { + // + // Object containing both registrable domains (i.e. ICANN/unknown, Private) + // for a domain. + // + interface RegistrableDomains { + // The original domain whose registrable domain we want + domain: string, + // The ICANN-suffixed or unknown-suffixed registrable domain. + // Null if an error occurred. + base?: string, + // The Private-suffixed registrable domain. 
+ // Null if an error occurred, or if the domain has no matching Private suffix. + private?: string, + // Error thrown during lookup, if any. + error?: Error, + } + + // + // Options that may be passed to getRegistrableDomain() to control its behaviour. + // + interface RegistrableDomainOptions { + // If true, exclude private (non-ICANN) suffixes from the lookup algorithm + excludePrivateSuffixes: boolean, + // If true, use Punycode instead of Unicode when returning the registrable domain + punycode: boolean, + } + + // + // Gets the longest registrable domain for a specified domain. + // + // Example: + // + // let domain = await browser.publicSuffix.getRegistrableDomain("www.example.co.uk"); + // ==> "example.co.uk" + // + export function getRegistrableDomain( + // The domain name whose registrable domain we want to find + domain: string, + // Options that control the behaviour of the lookup algorithm + options?: RegistrableDomainOptions, + ) + // Resolves to the longest registrable domain of the input domain name + : Promise; + + // + // Gets both registrable domains (i.e. ICANN/unknown, Private) for each domain name + // in an array of domain names. 
+ // + // Example: + // + // let domains = await browser.publicSuffix.getRegistrableDomains([ + // "foo.bar.wixsite.com", + // "www.example.net", + // "a..b", + // ]); + // ==> [ + // { domain: "foo.bar.wixsite.com", base: "wixsite.com", private: "bar.wixsite.com" }, + // { domain: "www.example.net", base: "example.net", }, + // { domain: "a..b", error: "Invalid domain name", }, + // ] + // + export function getRegistrableDomains( + // The domain names whose registrable domains we want to find + domains: Array, + // Options that control the behaviour of the lookup algorithm + options?: RegistrableDomainOptions, + ) + // Resolves to the registrable domains of each input domain name + : Promise>; + + // + // Gets the PSL dataset version if available + // + export function getVersion(): string?; +} ``` -### Behavior #1: Returned Promise +### Behaviours + +#### 1. Private Suffixes + +By default, the lookup performed by `getRegistrableDomain()` should **include** all +suffixes in the PSL dataset, i.e. both ICANN and Private (non-ICANN) suffixes. + +However, if an `options` object is passed to `getRegistrableDomain()` with key +`excludePrivateSuffixes` set to `true`, then Private (non-ICANN) suffixes should be +**excluded** from the lookup algorithm. -The promise returned by `getRegistrableDomain()` method will either: +##### Example -1. Resolve with the topmost *registrable domain / eTLD+1* if it can be determined from -the `domainNameOrUrl`. -2. Reject with an `Error` / populate `browser.runtime.lastError`. +`domain` = foo.bar.wixsite.com -### Behavior #2: Private Domains +| Option | Registrable Domain | PSL Section | +|------------------------------------------|:--------------------:|:-----------:| +| excludePrivateSuffixes = false (default) | bar.wixsite.com | Private | +| excludePrivateSuffixes = true | wixsite.com | ICANN | -By default, the lookup performed by `getRegistrableDomain()` should **exclude** private domains -contained in the PSL dataset. +#### 2. 
Multiple Suffixes per Domain -See [Future work](#1-extend-the-api). +The lookup performed by `getRegistrableDomain()` should select the **longest** matching +suffix (unless specifically excluded using the `excludePrivateSuffixes` option). + +##### Example + +`domain` = foo.bar.lib.de.us + +| Candidate Suffix | PSL Section | +|-----------------:|:-----------:| +| de.us | ICANN | +| lib.de.us | Private | + +The longest is lib.de.us, so `getRegistrableDomain()` resolves to bar.lib.de.us + +#### 3. PSL Special Rules + +The lookup performed by `getRegistrableDomain()` should adhere to the +[PSL algorithm](https://github.com/publicsuffix/list/wiki/Format#formal-algorithm). +In particular, it should apply the 'wildcard' and 'exception' rules in the PSL. + +##### Examples + +| Domain | Public suffix | Matched PSL rule | Explanation | Registrable Domain | +|-------------------------:|--------------:|-----------------:|:--------------:|-------------------:| +| sub.domain.com | com | com | Simple rule | domain.com | +| sub.domain.co.uk | co.uk | co.uk | Simple rule | domain.co.uk | +| sub.domain.gov.ck | gov.ck | *.ck | Wildcard rule | domain.gov.ck | +| sub.domain.any.ck | any.ck | *.ck | Wildcard rule | domain.any.ck | +| sub.sub.domain.any.ck | any.ck | *.ck | Wildcard rule | domain.any.ck | +| www.ck | ck | !www.ck | Exception rule | www.ck | +| sub.www.ck | ck | !www.ck | Exception rule | www.ck | +| sub.sub.www.ck | ck | !www.ck | Exception rule | www.ck | + +#### 4. Unknown Suffixes + +If no matching suffix is found in the PSL for a `domain` parameter, then unless it is determined +to be specifically [invalid](#6-invalid-domain-parameter), it should be assumed the domain has a +single-label suffix. + +##### Example + +| Domain parameter | Registrable domain | PSL Section | +|-----------------------:|-------------------:|:-----------:| +| www.example.foobar | example.foobar | n/a | +| www.example.co.foobar | co.foobar | n/a | + +#### 5. 
IDN + +The `domain` parameter passed to `getRegistrableDomain()` may be either Unicode +or Punycode. + +When settling the promise returned by `getRegistrableDomain()`, the resulting +domain name should be converted to Unicode from Punycode by default. + +However, if an `options` object is passed to `getRegistrableDomain()` with key +`punycode` set to `true`, then Punycode should be used instead. + +##### Example + +`domain` = foo.bar.example.مليسيا + +| Option | Registrable Domain | +|----------------------------|-----------------------:| +| punycode = false (default) | example.مليسيا | +| punycode = true | example.xn--mgbx4cd0ab | + +#### 6. Invalid domain parameter + +The promise returned by `getRegistrableDomain()` should reject if the `domain` parameter +meets any of the following criteria: +* Contains a character that is invalid in an Internationalized Domain Name (IDN) - e.g. symbols, whitespace +* Is an IP address - IPv4 or IPv6 +* Is a public suffix itself - including the case of it being a single-label suffix not explicitly matched in the PSL +* Is an empty string +* Is equal to `'.'` +* Contains empty domain labels (i.e. any occurrences of `'..'`) + +#### 7. Summary of behaviours + +The following table sets out the eventual settled state of the promise returned by +`getRegistrableDomain()` for different classes of input `domain` parameter: + +| Domain parameter | Description | Registrable domain | +|:-------------------|:-------------------------------------------------|:-----------------------| +| example.net | eTLD+1 | example.net | +| www.example.net | eTLD+2 | example.net | +| net | is a public suffix itself | Error | +| foobar | no matching suffix in PSL, assume 1-label suffix | Error | +| net.foobar | no matching suffix in PSL, assume 1-label suffix | net.foobar | +| 127.0.0.1 | IP address, IPv4 | Error | +| [::1] | IPv6 address | Error | +| EXAMPLE.NET | uppercase | example.net | +| .example.net | dot in front | example.net | +| example.net. 
| dot in the end, this is an FQDN | example.net. | +| *.com | contains invalid character `'*'` | Error | +| github.io | is a public suffix in the Private section | Error | +| github.io | as above, with `excludePrivateSuffixes = true` | github.io | +| foobar.github.io | has a public suffix in the Private section | foobar.github.io | +| foobar.github.io | as above, with `excludePrivateSuffixes = true` | github.io | +| مليسيا | this is an IDN that is also a public suffix | Error | +| xn--mgbx4cd0ab | as above, but Punycode | Error | +| foo.مليسيا | this is an IDN | foo.مليسيا | +| foo.مليسيا | as above, with `punycode = true` | foo.xn--mgbx4cd0ab | +| foo.xn--mgbx4cd0ab | this is an IDN, but Punycode | foo.مليسيا | +| foo.xn--mgbx4cd0ab | as above, with `punycode = true` | foo.xn--mgbx4cd0ab | +| | empty string | Error | +| . | no domain labels | Error | +| example..com | contains an empty domain label | Error | + +#### 8. Batching + +For [Use Case #2](#2-group-domains-in-ui), function `getRegistrableDomains()` enables +extensions to get multiple registrable domains with a single API call. + +The returned promise's resolved array should contain a `RegistrableDomains` object +for each item in the input `domains` parameter, with the same ordering. + +An error during the lookup of any of the domain names does not cause the returned promise +to be rejected. Instead, each such error is stored in the `error` key of the corresponding `RegistrableDomains` object for the domain name in question. 
### New Permissions | Permission Added | Suggested Warning | | ---------------- | ----------------- | -| publicSuffix | Read the public suffix list | +| publicSuffix | N/A | ### Manifest File Changes @@ -157,11 +484,11 @@ N/A ### Existing Workarounds Developers can download the PSL dataset, bundle it with their extensions, and implement -logic that parses and interprets the dataset in order to determine the topmost +logic that parses and interprets the dataset in order to determine the *registrable domain / eTLD+1* for a domain name. There are several drawbacks to this approach: -1. Potential for inconsistencies in the determination of topmost *registrable domain / eTLD+1* +1. Potential for inconsistencies in the determination of *registrable domain / eTLD+1* by the extension and the host browser, due to differences in the version of the PSL dataset used, and differences in the implementations that the host browser and the extension use in order to interpret this dataset. @@ -185,8 +512,8 @@ should ultimately be obtained by means of an Open Web API. 
## Implementation Notes Since the major browsers all already implement internal methods for determining -topmost *registrable domain / eTLD+1*s, it is hoped that the implementation will -involve little more than providing the relevant mechanism for exposing these same methods +*registrable domain / eTLD+1*s, it is hoped that the implementation will +largely involve providing the relevant mechanism for exposing these same methods to extensions: | Browser | Registrable Domain Method | @@ -195,33 +522,34 @@ to extensions: | Firefox | [getBaseDomain](https://searchfox.org/mozilla-central/rev/e9f9bf31d1c0057a1cd339b5a853a75d1b16db39/netwerk/dns/nsIEffectiveTLDService.idl#94) | | Safari | [topPrivatelyControlledDomain](https://github.com/WebKit/WebKit/blob/01eba7c416725cfd4eec57ab16daffa25b8124b4/Source/WebCore/platform/PublicSuffixStore.h#L43) | +However, differences may exist in the implementations of these internal methods. Therefore +there may be additional effort involved in testing that the existing internal methods +conform to the expected behaviours of the API in this proposal. + ## Future Work ### 1. Extend the API The major browsers provide additional methods/parameters internally for getting -information related to the *registable domain / eTLD+1*. The API could be extended -to expose more of these internal methods/parameters, for example: +information related to the *registrable domain / eTLD+1*. The API could be extended +to expose more of these internal methods/parameters, if relevant use cases for such +additional functionality are identified. For example: -1. Provide an option to include private domains when getting the *registrable domain / eTLD+1*. -2. Provide method `getPublicSuffix()` to get the *public suffix / eTLD*. -3. Provide methods `isRegistrableDomain()` and/or `isPublicSuffix()` for possibly improved +1. Provide method `getPublicSuffix()` to get the *public suffix / eTLD*. +2. 
Provide methods `isRegistrableDomain()` and/or `isPublicSuffix()` for possibly improved efficiency in certain use cases. +3. Provide an option to require that `getRegistrableDomain()` must explicitly match a +public suffix in the PSL (i.e. the domain must have a "known" suffix). -### 2. Batching - -Extensions may want to obtain *registrable domain / eTLD+1*s for large numbers of -domain names / URLs at once. A possible enhancement to the API would be to -provide a method that performed a lookup on multiple domain names / URLs with a -single method call. - -### 3. Change Notifications +### 2. Change Notifications The PSL dataset, used by the browsers to determine *registrable domain / eTLD+1*s, -is a dynamic dataset that can change at any time. This API proposal currently provides no -mechanism for notifying extensions when the host browser's PSL dataset changes. -It is understood that such changes are only made currently when a new browser version +is a dynamic dataset that can change at any time. Although this API provides a function +for retrieving the current version of the PSL dataset used by the browser, no mechanism +is provided for notifying extensions when the host browser's PSL dataset changes. It is +understood that such changes are only made currently when a new browser version is released, however this may not always be the case. It may be useful to implement a notification mechanism so that extensions can take -appropriate action when the host browser's PSL dataset changes. +appropriate action when the host browser's PSL dataset changes, to avoid having to +poll the `getVersion()` function provided by this API. 
From c2171564bebfbd15314e2e147a613129044a4753 Mon Sep 17 00:00:00 2001 From: Francis McKenzie Date: Sun, 2 Feb 2025 20:34:58 +0800 Subject: [PATCH 3/7] Update Public Suffix API proposal --- proposals/public-suffix.md | 607 +++++++++++++++++++++++++------------ 1 file changed, 417 insertions(+), 190 deletions(-) diff --git a/proposals/public-suffix.md b/proposals/public-suffix.md index cdd39c19..1e77bca8 100644 --- a/proposals/public-suffix.md +++ b/proposals/public-suffix.md @@ -22,36 +22,41 @@ from a domain name. ### Objective This API enables developers to obtain the *registrable domain / eTLD+1* +(henceforth *registrable domain*) from a domain name. This functionality is already implemented for internal use by all the major browsers. Therefore the effect of this API is to expose existing built-in browser functionality to extensions developers. The primary objective of this API is to eliminate the possibility of inconsistencies between the host browser and hosted extensions when deriving -*registrable domain / eTLD+1*s from domain names. +registrable domains from domain names. Secondary objectives of this API are to: 1. Improve extension developer experience by reducing complexity and maintenance overhead, since developers will no longer need to roll their own solutions for obtaining and parsing -the [Public Suffix List (PSL)](https://publicsuffix.org/list/). +the [Public Suffix List](https://publicsuffix.org/list/) (henceforth *PSL*). 2. Reduce extensions' resource usage (CPU, memory, disk space), since extensions will no longer be duplicating work already done by the host browser. ### Features of the PSL -In order to set out the use cases fully, it is first necessary to describe certain -relevant features of the PSL. +When determining the registrable domain of a candidate domain name, the major browsers +all make use of the PSL, which is a dataset containing known *public suffix / eTLD* +(henceforth *eTLD*) values. 
+ +The following sections set out the PSL's main characteristics and discuss how +they may be handled in the eventual API. #### 1. ICANN vs Private -The PSL it is divided into two sections: ICANN suffixes and Private (i.e. non-ICANN) -suffixes. +The eTLDs in the PSL are divided into two sections: an ICANN section and a Private +(i.e. non-ICANN) section. ##### Examples -| ICANN Suffixes | Private Suffixes (Non-ICANN) | -|-------------------------------|------------------------------ | +| ICANN Section | Private Section | +|-------------------------------|-------------------------------| | com | blogspot.com | | | members.linode.com | | | simplesite.com | @@ -60,50 +65,190 @@ suffixes. | | wellbeingzone.co.uk | | | vm.bytemark.co.uk | -The suffixes in the PSL represent boundaries between registrar-type organizations -and their clients. The "owner" of the suffix is one organization, and the "owner" -of a domain label immediately preceeding the suffix is a different organization. - -In the case of suffixes in the ICANN section, the "owner" of a suffix is ICANN / -an affiliated registrar. In the case of suffixes in the Private section, the -"owner" of a suffix is a private organization offering a service for clients +In the case of eTLDs in the ICANN section, the "owner" of an eTLD is ICANN / +an affiliated registrar. In the case of eTLDs in the Private section, the +"owner" of an eTLD is a private organization offering a service for clients to take ownership of subdomains underneath its own ICANN-suffixed domain. -#### 2. Multiple Suffixes per Domain +At first glance, it might seem that knowing which section an eTLD is from +could give useful information as to whether or not it is an "official" +ICANN-designated eTLD. In practice, however, the distinction +between the two sections is somewhat arbitrary and not well-defined. +This is due to the inherent challenges in maintaining and curating the +PSL dataset, which relies on the cooperation of registrars.
+ +##### 1.1 Issues + +In order to construct the eTLD list in the ICANN section, the maintainers of +the PSL start by taking the "official" ICANN/IANA TLDs, which are only the +very last single-label suffixes. These are set out in the following lists: + +* [IANA](https://www.iana.org/domains/root/db) +* [gTLD](https://www.icann.org/resources/registries/gtlds/v2/gtlds.json) + +In addition to these two sources, the PSL maintainers try to get in touch with +the ICANN-affiliated registrars to get them to update their "official" subdomains +or to ask questions, but this is often unsuccessful. + +Examples of issues with the supposedly "official" ICANN-section eTLDs are: + +* while the `.au` entries in the PSL are in the ICANN section, the relevant parties +cannot agree what their "official" eTLDs are. +* `bd` has a wildcard `*.bd` entry in the ICANN section, meaning any domain label +can be prepended to `.bd` and it is automatically an "official" ICANN eTLD. +* `.co.uk` is in the ICANN section but the actual IANA TLD is still only `.uk`. + +##### 1.2 Recommendation + +Due to the ad hoc nature of the PSL sections, it is recommended that consumers of +the PSL should: + +* treat the PSL dataset as a single eTLD list (i.e. both sections combined) +* not derive any significant meaning from the section in which an eTLD is located + +For this reason, this proposal's API should use all of the PSL's eTLDs and +should not expose information about the specific section (ICANN vs Private) +eTLDs are assigned to. -If a domain name ends with a suffix listed in the Private section of the PSL, -then it must also end with a shorter suffix in the ICANN section. The question -of which one should be taken to be the domain's public suffix depends on the -specific use case. +#### 2. Multiple eTLDs per Domain + +For any given domain name, there may be multiple matching eTLDs in the PSL. 
##### Example -| Domain name | Private suffix | ICANN suffix | -|:-------------------:|:---------------:|:------------:| -| foo.bar.wixsite.com | wixsite.com | com | +| Domain name | Matching eTLDs in PSL | Implied registrable domain | +|----------------------|----------------------:|---------------------------:| +| foo.bar.wixsite.com | wixsite.com | bar.wixsite.com | +| | com | wixsite.com | +| | | | +| foo.bar.paris.eu.org | paris.eu.org | bar.paris.eu.org | +| | eu.org | paris.eu.org | +| | org | eu.org | + +All of the identified use cases in this proposal make use of the +registrable domain in order to carry out some action (e.g. malware-blocking) +based on the organization responsible for the domain's content, not the +associated registrar(s). Of the possible registrable domains for a given +domain, the only one that designates the content-owning organization +is the *longest one*. All of the other registrable domains designate +the registrars, who play a passive role and are not responsible for the +full domain's content. + +Therefore, in the case of a candidate domain having multiple eTLDs and consequently +multiple possible registrable domains, the API in this proposal will only calculate +the longest registrable domain, which designates the candidate domain's content owner. +The API will not provide a means of determining the registrable domain(s) +of the associated registrars. #### 3. Known vs Unknown -If a domain name does not match any suffix in the PSL, it is considered to have -an unknown suffix. Depending on the use case, the public suffix could be taken -by default to be the last domain label of the domain name, or alternatively -the domain name could be considered invalid. +If a domain name does not match any eTLD in the PSL, it is considered to have +an "unknown" eTLD. + +##### 3.1 Non-public eTLDs + +A specific situation where a valid domain name may legitimately have an +unknown eTLD is when an intranet has custom non-public hostnames. +E.g. 
`printer.homenet` and `backup.homenet` would not match any entry in the PSL, +but are likely part of the same internal structure having the non-public eTLD `homenet`. + +##### 3.2 Incomplete / old PSL datasets + +A candidate domain may have a known eTLD, but may still be incorrectly deemed +by the algorithm to have an unknown eTLD if the PSL dataset used by the algorithm +has not yet been updated to include that particular eTLD. As of the date of this +proposal, such an eventuality is somewhat likely because the major browsers only update +their PSL datasets when releasing new browser versions. This introduces a certain delay +between the time when new eTLDs are offered by registrars, and the time when +those same eTLDs are present in the PSL datasets stored in users' browsers. + +##### 3.3 Algorithm performance + +It may be more performant to allow unknown eTLDs and assume a single-label +eTLD by default. The PSL algorithm starts with a candidate domain name and +removes labels in turn until a matching eTLD is found. For any unknown-suffixed +candidate domain (regardless of how many labels it has), the algorithm will always +reach the final label without finding any matches in the PSL. At this point, if it +is allowable to assume that all unknown single-label eTLDs are valid, then +certain optimisations to the algorithm are possible as follows: + +###### 3.3.1 Fewer PSL searches + +The algorithm can avoid doing the final search against the PSL using the final label. +This may save a few CPU cycles for every candidate domain lookup. + +Example candidate domain: `foo.bar.baz` + +| Step | Domain | Search in PSL? | +|:----:|:------:|:------:| +| 1 | `foo.bar.baz` | yes | +| 2 | `bar.baz` | yes | +| 3 | `baz` | no | + +It is unclear how much of a performance benefit such an optimization would give +in practice. 
-**Note:** it may be more performant to allow unknown suffixes and assume a single-label -suffix by default, because it allows the following optimisation of the lookup algorithm: -all single-label suffixes in the PSL can be excluded from the lookup, since they do not -need to be matched specifically. +###### 3.3.2 Smaller browser footprint + +The browser can avoid storing *any* single-label eTLDs on disk or in memory. +This allows a possible reduction in browser startup time, since it is loading fewer +PSL eTLDs into memory from disk, and thereafter lower browser memory usage due to +holding fewer PSL eTLDs in memory. + +However, this is a moot point if browsers already need to distinguish known from +unknown eTLDs for the purposes of other browser functionality unrelated to this +proposal; in that case, the full PSL dataset (including all single-label eTLDs) +is already available and therefore this proposal would not be adding any overhead +in terms of browser footprint due to storing the entire PSL. + +For example, Firefox *does* already need to distinguish known vs unknown eTLDs +in order to determine whether to issue a search query or whether to try a navigation, +when a user enters a domain-like string in the navigation bar. In such instances, +a PSL lookup is made and: + +* If the domain has a known eTLD, attempt to navigate. +* If the domain has an unknown eTLD, use a search engine. + +##### 3.4 Recommendation + +It is recommended that this API should only expose known eTLDs, and leave it up +to extensions to handle unknown eTLDs because: + +* none of the use cases identified later in this proposal are relevant to non-public +(intranet) hostnames +* unknown eTLDs due to out-of-date PSL datasets, although possible, are likely to be +somewhat infrequent and temporary +* even if it were possible to make certain performance optimisations by assuming +all single-label eTLDs are valid, any associated performance benefits +may not be significant #### 4. 
IDN -The PSL uses Unicode, and contains International Domain Name (IDN) suffixes. Depending -on the use case, when returning the *registrable domain / eTLD+1* for a domain name, +The PSL uses Unicode, and contains Internationalized Domain Name (IDN) eTLDs. Depending +on the use case, when returning the registrable domain for a domain name, either Unicode or Punycode may be preferred. -**Note:** the [PSL algorithm](https://github.com/publicsuffix/list/wiki/Format#formal-algorithm) -requires Punycode for the matching logic. Therefore a requirement to convert Punycode -back to Unicode involves extra work. It may be preferable to avoid this in -performance-sensitive use cases. +According to a possible [PSL algorithm](https://github.com/publicsuffix/list/wiki/Format#formal-algorithm) +for interpreting the PSL dataset, candidate domains should be converted to Punycode +before matching against the PSL. Therefore, a requirement to convert Punycode +back to Unicode would involve extra work, and it may be preferable to avoid this in +performance-sensitive use cases. However, it has been suggested that this PSL algorithm +is not authoritative. Indeed, any algorithm used to interpret the PSL dataset is likely +to be fairly trivial, since it essentially involves comparing labels for equality. + +Further points to consider in this context are: + +* Punycode may be the most appropriate encoding to use, since a Punycode hostname is +valid in a URL whereas a Unicode hostname is not. Therefore Punycode may be the most sane default. + +* When this API is used to obtain registrable domains intended for display to the user, +it is likely that the end result will at some point need to be converted to Unicode, +since users may be less familiar with Punycode. + +##### 4.1 Recommendation + +It is recommended that registrable domains should be returned as Punycode by default, +but the API should also provide an option to convert these to Unicode.
#### Use Cases @@ -125,32 +270,25 @@ The user may want to specify request filtering rules as follows: | # | Filtering rule type | User value | Examples of domains affected | |:-:|---------------------|:----------:|-----------------------------:| | 1 | organizational base domain and all possible subdomains | myorg.co.uk | myorg.co.uk
subdomain1.myorg.co.uk
subdomain2.myorg.co.uk | -| 2 | organization name and all possible suffixes | myorg | myorg.co.uk
myorg.com
myorg.net | +| 2 | organization name and all possible eTLDs | myorg | myorg.co.uk
myorg.com
myorg.net | To support this use case, such extensions may: -* automatically calculate the set of organization names and *registrable domain / eTLD+1*s +* automatically calculate the set of organization names and registrable domains from a user-specified set of domain names, and propose these as filtering rules for the user to choose -* prevent users from mistakenly specifying a *public suffix / eTLD* as a filtering rule, -believing it to be a *registrable domain / eTLD+1*, e.g. `co.uk` instead of `myorg.co.uk` +* provide a warning to users if they specify an eTLD as a filtering rule, +believing it to be a registrable domain, e.g. `co.uk` instead of `myorg.co.uk` +(however eTLDs *do* in fact sometimes have websites, therefore it would be a mistake +to entirely prevent a user from using an eTLD as a filter) * apply the users' filtering rules automatically while the user browses by determining -the *registrable domain / eTLD+1* of each web request, and testing this using appropriate +the registrable domain of each web request, and testing this using appropriate regexes to decide whether or not to act on the request -##### PSL Requirements - -| PSL Feature | Requirement | Discussion | -|------------------------|-------------|------------| -| Allow Private Suffixes | Yes & No | some filters may require private suffixes, others ICANN-only | -| | | preventing user mistake may require both types of suffix | -| Allow Unknown Suffixes | Yes | provides better performance | -| Preserve IDN Unicode | Yes & No | Unicode when viewing filters in UI, Punycode when implementing filtering logic for better performance | - #### 2. Group Domains in UI Where extensions present lists of domain names to users, it can be beneficial -from a UX perspective to group them by their *registrable domain / eTLD+1*s. +from a UX perspective to group them by their registrable domains. ##### Example @@ -166,14 +304,6 @@ from a UX perspective to group them by their *registrable domain / eTLD+1*s. 
| | | | foo.bar.example2.com | | | | | www.example2.com | -##### PSL Requirements - -| PSL Feature | Requirement | Discussion | -|------------------------|-------------|------------| -| Allow Private Suffixes | Yes & No | optimal grouping may require either private (more groups, fewer domains per group) or ICANN-only (fewer groups, more domains per group) | -| Allow Unknown Suffixes | Yes | grouping behaviour is the same regardless of whether suffixes are known/unknown | -| Preserve IDN Unicode | Yes | better UX, because users may be unfamiliar with Punycode | - #### 3. Detect Third-Party Requests Some ad-blocker extensions allow users to create request-blocking rules that only @@ -181,30 +311,22 @@ apply to *[third-party requests](https://help.adblockplus.org/hc/en-us/articles/ typically exclude requests from tracking detection unless they are third-party requests. In order to determine if a request is a third-party request, such extensions lookup -the *registrable domain / eTLD+1* of the request and of the parent document using the PSL. -If the *registrable domain / eTLD+1* of the request is different to that of the parent +the registrable domain of the request and of the parent document using the PSL. +If the registrable domain of the request is different to that of the parent document, then the request is considered third-party. ##### Example of Determining if Requests are Third-Party -| Req. | Domain | Registrable Domain | PSL Section | Third-party? 
| -|:----:|----------------------------:|----------------------------:|:-----------:|:------------:| -| #1 | foo.com | foo.com | ICANN | | -| #2 | bar.com | bar.com | ICANN | Yes | -| | | | | | -| #1 | foo.amazonaws.com | amazonaws.com | ICANN | | -| #2 | bar.amazonaws.com | amazonaws.com | ICANN | No | -| | | | | | -| #1 | foo.amazonaws.com | amazonaws.com | ICANN | | -| #2 | bar.us-east-1.amazonaws.com | bar.us-east-1.amazonaws.com | Private | Yes | - -##### PSL Requirements - -| PSL Feature | Requirement | Discussion | -|------------------------|-------------|------------| -| Allow Private Suffixes | Yes | including all suffixes in PSL means more information about third-party boundaries | -| Allow Unknown Suffixes | Yes | provides better performance | -| Preserve IDN Unicode | No | provides better performance | +| Req. | Domain | Registrable Domain | Third-party? | +|:----:|----------------------------:|----------------------------:|:------------:| +| #1 | foo.com | foo.com | | +| #2 | bar.com | bar.com | Yes | +| | | | | +| #1 | foo.amazonaws.com | amazonaws.com | | +| #2 | bar.amazonaws.com | amazonaws.com | No | +| | | | | +| #1 | foo.amazonaws.com | amazonaws.com | | +| #2 | bar.us-east-1.amazonaws.com | bar.us-east-1.amazonaws.com | Yes | ### Known Consumers @@ -226,7 +348,7 @@ from this API due to the reasons stated in the [Objective](#objective) section. 
| [uBlock Origin](https://addons.mozilla.org/en-US/firefox/addon/ublock-origin/) | #1, #2, #3 | create organization/3rd-party request-blocking rules, view request blocking status by organization, create blocking rules for specifically 3rd-party requests | | [adblock-plus](https://addons.mozilla.org/en-US/firefox/addon/adblock-plus/) | #1, #3 | create organization/3rd-party request-blocking rules | | [duckduckgo-for-firefox](https://addons.mozilla.org/en-US/firefox/addon/duckduckgo-for-firefox/) | #3 | exclude requests from tracking detection unless they are 3rd-party | -| [violentmonkey](https://addons.mozilla.org/en-US/firefox/addon/violentmonkey/) | #1 | run scripts automatically for requests having a specific organization name plus any suffix | +| [violentmonkey](https://addons.mozilla.org/en-US/firefox/addon/violentmonkey/) | #1 | run scripts automatically for requests having a specific organization name plus any eTLD | ## Specification @@ -237,35 +359,27 @@ A new API `publicSuffix` is added as follows: ```ts namespace publicSuffix { // - // Object containing both registrable domains (i.e. ICANN/unknown, Private) - // for a domain. + // Determines if all specified domains have the same registrable domain. // - interface RegistrableDomains { - // The original domain whose registrable domain we want - domain: string, - // The ICANN-suffixed or unknown-suffixed registrable domain. - // Null if an error occurred. - base?: string, - // The Private-suffixed registrable domain. - // Null if an error occurred, or if the domain has no matching Private suffix. - private?: string, - // Error thrown during lookup, if any. - error?: Error, - } - + // Note: rejects with an error if any domain name has an unknown eTLD + // (i.e. not in the PSL). // - // Options that may be passed to getRegistrableDomain() to control its behaviour. 
+ // Example: // - interface RegistrableDomainOptions { - // If true, exclude private (non-ICANN) suffixes from the lookup algorithm - excludePrivateSuffixes: boolean, - // If true, use Punycode instead of Unicode when returning the registrable domain - punycode: boolean, - } + // let result = await browser.publicSuffix.hasSameRegistrableDomain( + // "www.example.co.uk", + // "xyz.example.co.uk", + // "foo.bar.baz.example.co.uk", + // ); + // ==> true + // + export function hasSameRegistrableDomain(...domains: string[]) : Promise<boolean>; // // Gets the longest registrable domain for a specified domain. // + // Note: fulfils with null if the input domain name has an unknown eTLD. + // // Example: // // let domain = await browser.publicSuffix.getRegistrableDomain("www.example.co.uk"); @@ -277,83 +391,105 @@ namespace publicSuffix { // Options that control the behaviour of the lookup algorithm options?: RegistrableDomainOptions, ) - // Resolves to the longest registrable domain of the input domain name - : Promise<string>; + // Fulfils with the longest registrable domain of the input domain name. + : Promise<string | null>; // - // Gets both registrable domains (i.e. ICANN/unknown, Private) for each domain name - // in an array of domain names. + // Gets the longest registrable domain for each domain in a specified + // list of domains.
+ // + // Note: modelled on `Promise.allSettled()` // // Example: // // let domains = await browser.publicSuffix.getRegistrableDomains([ // "foo.bar.wixsite.com", // "www.example.net", + // "printer.homenet", // "a..b", // ]); // ==> [ - // { domain: "foo.bar.wixsite.com", base: "wixsite.com", private: "bar.wixsite.com" }, - // { domain: "www.example.net", base: "example.net", }, - // { domain: "a..b", error: "Invalid domain name", }, + // { status: "fulfilled", value: "bar.wixsite.com" }, + // { status: "fulfilled", value: "example.net" }, + // { status: "fulfilled", value: null }, + // { status: "rejected", reason: "Invalid domain name" }, // ] // export function getRegistrableDomains( // The domain names whose registrable domains we want to find - domains: Array<string>, + domains: Iterable<string>, // Options that control the behaviour of the lookup algorithm options?: RegistrableDomainOptions, ) - // Resolves to the registrable domains of each input domain name - : Promise<Array<RegistrableDomains>>; + // Fulfils with a registrable domain result corresponding to each input domain name + : Promise<Array<RegistrableDomainResult>>; + + // + // Gets the value of the VERSION metadata field in the PSL dataset if available + // + export function getVersion(): string | null; + + // + // Options that may be passed to getRegistrableDomain() and getRegistrableDomains() + // to control their behaviour. + // + interface RegistrableDomainOptions { + // If true, the returned registrable domain(s) should be encoded as Unicode + unicode?: boolean, + } // - // Gets the PSL dataset version if available + // Object containing the result of calculating the registrable domain for one of the + // domains in the array passed to getRegistrableDomains(). + // + // Note: modelled on `Promise.allSettled()` // - export function getVersion(): string?; + interface RegistrableDomainResult { + // A string, either "fulfilled" or "rejected", indicating the eventual state of the promise. + status: string, + // Only present if status is "fulfilled".
The calculated registrable domain, or null + // if the corresponding input domain has an unknown eTLD. + value?: string | null, + // Only present if status is "rejected". The reason that the promise was rejected with. + reason?: string, + } } ``` ### Behaviours -#### 1. Private Suffixes +#### 1. PSL Sections -By default, the lookup performed by `getRegistrableDomain()` should **include** all -suffixes in the PSL dataset, i.e. both ICANN and Private (non-ICANN) suffixes. +Lookups of the PSL dataset should **include** all eTLDs from both PSL sections, +i.e. both ICANN-section and Private-section eTLDs. -However, if an `options` object is passed to `getRegistrableDomain()` with key -`excludePrivateSuffixes` set to `true`, then Private (non-ICANN) suffixes should be -**excluded** from the lookup algorithm. +#### 2. Multiple eTLDs per Domain -##### Example - -`domain` = foo.bar.wixsite.com - -| Option | Registrable Domain | PSL Section | -|------------------------------------------|:--------------------:|:-----------:| -| excludePrivateSuffixes = false (default) | bar.wixsite.com | Private | -| excludePrivateSuffixes = true | wixsite.com | ICANN | - -#### 2. Multiple Suffixes per Domain - -The lookup performed by `getRegistrableDomain()` should select the **longest** matching -suffix (unless specifically excluded using the `excludePrivateSuffixes` option). +Lookups of the PSL dataset should always select the **longest** matching eTLD +when determining the registrable domain. ##### Example `domain` = foo.bar.lib.de.us -| Candidate Suffix | PSL Section | -|-----------------:|:-----------:| -| de.us | ICANN | -| lib.de.us | Private | +| Matching eTLDs | +|----------------:| +| lib.de.us | +| de.us | The longest is lib.de.us, so `getRegistrableDomain()` resolves to bar.lib.de.us #### 3. 
PSL Special Rules -The lookup performed by `getRegistrableDomain()` should adhere to the +It is noted that the major browsers currently have their own implementations +of the PSL-handling logic, and attempts to standardise the various implementations +across browsers have thus far been unsuccessful. Therefore it is proposed that the +browsers should follow their existing PSL-handling rules when implementing this +proposal's API. + +For informational purposes, examples of the 'wildcard' and 'exception' rules +in the PSL are given below. These are described in more detail in a possible [PSL algorithm](https://github.com/publicsuffix/list/wiki/Format#formal-algorithm). -In particular, it should apply the 'wildcard' and 'exception' rules in the PSL. ##### Examples @@ -370,27 +506,25 @@ In particular, it should apply the 'wildcard' and 'exception' rules in the PSL. #### 4. Unknown Suffixes -If no matching suffix is found in the PSL for a `domain` parameter, then unless it is determined -to be specifically [invalid](#6-invalid-domain-parameter), it should be assumed the domain has a -single-label suffix. +A `null` registrable domain should be returned for a candidate domain that is otherwise +valid but has an unknown eTLD (i.e. one that is not found in the PSL). ##### Example -| Domain parameter | Registrable domain | PSL Section | -|-----------------------:|-------------------:|:-----------:| -| www.example.foobar | example.foobar | n/a | -| www.example.co.foobar | co.foobar | n/a | +| Domain parameter | Registrable Domain | +|------------------------|-------------------:| +| www.example.com | example.com | +| www.example.foobar | `null` | #### 5. IDN -The `domain` parameter passed to `getRegistrableDomain()` may be either Unicode -or Punycode. +All API methods should accept domains passed as input parameters using either +Unicode or Punycode encoding.
-When settling the promise returned by `getRegistrableDomain()`, the resulting -domain name should be converted to Unicode from Punycode by default. - -However, if an `options` object is passed to `getRegistrableDomain()` with key -`punycode` set to `true`, then Punycode should be used instead. +Methods that return registrable domains should encode them using Punycode +encoding by default, unless an `options` object is passed as an input parameter +with key `unicode` set to `true`, in which case they should be encoded +using Unicode encoding. ##### Example @@ -398,16 +532,18 @@ However, if an `options` object is passed to `getRegistrableDomain()` with key | Option | Registrable Domain | |----------------------------|-----------------------:| -| punycode = false (default) | example.مليسيا | -| punycode = true | example.xn--mgbx4cd0ab | +| unicode !== true (default) | example.xn--mgbx4cd0ab | +| unicode === true | example.مليسيا | #### 6. Invalid domain parameter -The promise returned by `getRegistrableDomain()` should reject if the `domain` parameter -meets any of the following criteria: +The promises returned by this API's methods should reject with an error if a domain +passed as an input parameter meets any of the following criteria: + * Contains a character that is invalid in an Internationalized Domain Name (IDN) - e.g. symbols, whitespace -* Is an IP address - IPv4 or IPv6 -* Is a public suffix itself - including the case of it being a single-label suffix not explicitly matched in the PSL +* Is an IP address - IPv4 or IPv6. (Avoids `100.200.30.2` and `100.200.31.2` being interpreted as belonging to the same organization.) +* Is an eTLD itself +* Is a single domain label, regardless of whether or not it exists in the PSL * Is an empty string * Is equal to `'.'` * Contains empty domain labels (i.e. 
any occurrences of `'..'`) @@ -418,42 +554,112 @@ The following table sets out the eventual settled state of the promise returned `getRegistrableDomain()` for different classes of input `domain` parameter: | Domain parameter | Description | Registrable domain | -|:-------------------|:-------------------------------------------------|:-----------------------| +|:-------------------|:-------------------------------------------------|-----------------------:| | example.net | eTLD+1 | example.net | | www.example.net | eTLD+2 | example.net | -| net | is a public suffix itself | Error | -| foobar | no matching suffix in PSL, assume 1-label suffix | Error | -| net.foobar | no matching suffix in PSL, assume 1-label suffix | net.foobar | +| net | is an eTLD itself, single-label | Error | +| github.io | is an eTLD itself, multi-label | Error | +| foobar | no matching eTLD in PSL, single-label | Error | +| net.foobar | no matching eTLD in PSL, multi-label | `null` | +| foobar.net | has an eTLD in the ICANN section | foobar.net | +| foobar.github.io | has an eTLD in the Private section | foobar.github.io | | 127.0.0.1 | IP address, IPv4 | Error | | [::1] | IPv6 address | Error | | EXAMPLE.NET | uppercase | example.net | | .example.net | dot in front | example.net | | example.net. | dot in the end, this is an FQDN | example.net. 
| | *.com | contains invalid character `'*'` | Error | -| github.io | is a public suffix in the Private section | Error | -| github.io | as above, with `excludePrivateSuffixes = true` | github.io | -| foobar.github.io | has a public suffix in the Private section | foobar.github.io | -| foobar.github.io | as above, with `excludePrivateSuffixes = true` | github.io | -| مليسيا | this is an IDN that is also a public suffix | Error | +| مليسيا | this is an IDN that is also an eTLD | Error | | xn--mgbx4cd0ab | as above, but Punycode | Error | -| foo.مليسيا | this is an IDN | foo.مليسيا | -| foo.مليسيا | as above, with `punycode = true` | foo.xn--mgbx4cd0ab | -| foo.xn--mgbx4cd0ab | this is an IDN, but Punycode | foo.مليسيا | -| foo.xn--mgbx4cd0ab | as above, with `punycode = true` | foo.xn--mgbx4cd0ab | +| foo.مليسيا | this is an IDN | foo.xn--mgbx4cd0ab | +| foo.مليسيا | as above, with `unicode === true` | foo.مليسيا | +| foo.xn--mgbx4cd0ab | this is an IDN, but Punycode | foo.xn--mgbx4cd0ab | +| foo.xn--mgbx4cd0ab | as above, with `unicode === true` | foo.مليسيا | | | empty string | Error | | . | no domain labels | Error | | example..com | contains an empty domain label | Error | -#### 8. Batching +#### 8. Comparing registrable domains + +Method `hasSameRegistrableDomain(domain1, domain2)` is suited in particular to +[Use Case #3: Detect Third-Party Requests](#3-detect-third-party-requests), +since it provides a simple way of comparing registrable domains. -For [Use Case #2](#2-group-domains-in-ui), function `getRegistrableDomains()` enables -extensions to get multiple registrable domains with a single API call. +This method is *almost but not quite* equivalent to: + +``` +await publicSuffix.getRegistrableDomain(domain1) === publicSuffix.getRegistrableDomain(domain2) +``` -The returned promises's resolved array should contain a `RegistrableDomains` object -for each item in the input `domains` parameter, with the same ordering. 
+The key difference is in the case of domains with unknown eTLDs (i.e. not in the PSL). +In such cases, `getRegistrableDomain()` returns `null`, which would mean +`hasSameRegistrableDomain()`, if implemented using the above code, would return `true` +for all domains with unknown eTLDs. This would likely be a surprising result for users +of this API. + +Therefore if any domain passed to `hasSameRegistrableDomain()` has an unknown eTLD, +the returned promise should reject with an error. + +##### Examples + +Given `homenet` and `mywork` are not eTLDs in the PSL: + +| Domain1 | Domain2 | hasSameRegistrableDomain() | +|---------------------|------------------|---------------------------:| +| foo.example.com | bar.example.com | true | +| foo.example.com | bar.example2.com | false | +| foo.example.com | backup.homenet | Error | +| printer.homenet | backup.homenet | Error | +| printer.homenet | backup.mywork | Error | + +#### 9. Batching + +Method `getRegistrableDomains()` is suited in particular to +[Use Case #2: Group Domains in UI](#2-group-domains-in-ui), since it enables +extensions to get the registrable domains of multiple domains with a single API call. + +This method is equivalent to calling `Promise.allSettled()` using the results of multiple +individual calls to `getRegistrableDomain()`. E.g. for three domains: + +``` +Promise.allSettled([ + publicSuffix.getRegistrableDomain(domain1), + publicSuffix.getRegistrableDomain(domain2), + publicSuffix.getRegistrableDomain(domain3), +]) +``` + +The promise returned by `getRegistrableDomains()` mirrors that returned by `Promise.allSettled()`: +the fulfilment value is an array of `RegistrableDomainResult` objects, in the order of the +input domains passed, with the same fields as those of `Promise.allSettled()`.
For each valid +input domain, the `status` field of the corresponding `RegistrableDomainResult` object is `fulfilled` +and the `value` field is the registrable domain, or `null` if the input domain has an unknown eTLD. An error during the lookup of any of the domain names does not cause the returned promise -to be rejected. Instead, each such error is stored in the `error` key of the corresponding `RegistrableDomains` object for the domain name in question. +to be rejected. Instead, the corresponding `RegistrableDomainResult` object has the `status` field +set to `rejected` and the `reason` field holds the error description. + +##### 9.1 Justification for batching + +As stated, the same information provided by `getRegistrableDomains()` could be obtained +by simply calling `getRegistrableDomain()` multiple times. + +The problem with this approach is that there is overhead associated with an extension +calling an async function on the parent browser. For example, obtaining the registrable domain +for a list of 50 domains would involve making 50 async calls to the parent browser. +However, with the batching approach afforded by `getRegistrableDomains()`, only +a single async call to the parent browser would be made, passing all 50 domains at once. + +A quick mockup of the two approaches was built using a simplified implementation +of this proposal's API in a modified Firefox, and the batching approach was +about 2-3 times faster for 50 domains. + +#### 10. PSL Version + +Versioning metadata was introduced into the PSL with [this commit](https://github.com/publicsuffix/list/issues/1808#issuecomment-2455793503). + +Method `getVersion()` of this API should return the value of the +VERSION metadata field contained in the specific PSL dataset used by the browser. ### New Permissions @@ -461,6 +667,10 @@ to be rejected. 
Instead, each such error is stored in the `error` key of the cor | ---------------- | ----------------- | | publicSuffix | N/A | +Method `publicSuffix.getVersion()` may theoretically contribute to making the browser +fingerprintable. However, it is already possible for malicious sites to get the browser's +version, therefore this API does not introduce significant additional fingerprintability. + ### Manifest File Changes There are no changes to the manifest. @@ -473,7 +683,8 @@ The only data exposed by this API is the [public suffix list](https://publicsuff ### Abuse Mitigations -This does not expose any new non-public data so there are no new abuse vectors. +This does not expose any new non-public data, nor does it significantly increase +browser fingerprintability, so there are no new abuse vectors. ### Additional Security Considerations @@ -485,10 +696,10 @@ N/A Developers can download the PSL dataset, bundle it with their extensions, and implement logic that parses and interprets the dataset in order to determine the -*registrable domain / eTLD+1* for a domain name. There are several drawbacks to +registrable domain for a domain name. There are several drawbacks to this approach: -1. Potential for inconsistencies in the determination of *registrable domain / eTLD+1* +1. Potential for inconsistencies in the determination of registrable domains by the extension and the host browser, due to differences in the version of the PSL dataset used, and differences in the implementations that the host browser and the extension use in order to interpret this dataset. @@ -512,7 +723,7 @@ should ultimately be obtained by means of an Open Web API. 
## Implementation Notes Since the major browsers all already implement internal methods for determining -*registrable domain / eTLD+1*s, it is hoped that the implementation will +registrable domains, it is hoped that the implementation will largely involve providing the relevant mechanism for exposing these same methods to extensions: @@ -531,19 +742,35 @@ conform to the expected behaviours of the API in this proposal. ### 1. Extend the API The major browsers provide additional methods/parameters internally for getting -information related to the *registrable domain / eTLD+1*. The API could be extended +information related to the registrable domain. The API could be extended to expose more of these internal methods/parameters, if relevant use cases for such additional functionality are identified. For example: -1. Provide method `getPublicSuffix()` to get the *public suffix / eTLD*. -2. Provide methods `isRegistrableDomain()` and/or `isPublicSuffix()` for possibly improved +1. Provide method `getPublicSuffix()` to get the eTLD as opposed to the registrable domain. +2. Provide method `isPublicSuffix()` and/or `isRegistrableDomain()` for possibly improved efficiency in certain use cases. -3. Provide an option to require that `getRegistrableDomain()` must explicitly match a -public suffix in the PSL (i.e. the domain must have a "known" suffix). +3. Provide an option to handle domains with unknown eTLDs differently: instead of +returning a `null` registrable domain in these cases, the API would assume a single-label +eTLD and calculate the registrable domain accordingly. + +### 2. Registrable Domains of Registrars + +In the case where a domain has multiple matching eTLDs in the PSL and therefore +more than one possible registrable domain, this proposal's API always selects +the longest one. In other words, the API only provides the registrable domain +of the domain's content-owning organization, and not that of any +associated registrars. 
+ +Theoretically, there may be cases where all organizations using a certain +registrar are behaving similarly (e.g. distributing malware), and therefore +extensions may want to provide support for carrying out some action +(e.g. malware blocking) against the shared registrar itself, rather than against +each individual content-owning organization. Support for any such use case +could be handled in a future update to this API. -### 2. Change Notifications +### 3. Change Notifications -The PSL dataset, used by the browsers to determine *registrable domain / eTLD+1*s, +The PSL dataset, used by the browsers to determine registrable domains, is a dynamic dataset that can change at any time. Although this API provides a function for retrieving the current version of the PSL dataset used by the browser, no mechanism is provided for notifying extensions when the host browser's PSL dataset changes. It is From b874592521b2ce45662554295f58f9c784a45efa Mon Sep 17 00:00:00 2001 From: Francis McKenzie Date: Wed, 7 May 2025 23:18:39 +0800 Subject: [PATCH 4/7] Update Public Suffix API proposal --- proposals/public-suffix.md | 316 ++++++++++++++++++++----------------- 1 file changed, 174 insertions(+), 142 deletions(-) diff --git a/proposals/public-suffix.md b/proposals/public-suffix.md index 1e77bca8..1dad3e81 100644 --- a/proposals/public-suffix.md +++ b/proposals/public-suffix.md @@ -172,7 +172,7 @@ reach the final label without finding any matches in the PSL. At this point, if is allowable to assume that all unknown single-label eTLDs are valid, then certain optimisations to the algorithm are possible as follows: -###### 3.3.1 Fewer PSL searches +###### 3.3.1 Optimisation: fewer PSL searches The algorithm can avoid doing the final search against the PSL using the final label. This may save a few CPU cycles for every candidate domain lookup. 
@@ -188,7 +188,7 @@ Example candidate domain: `foo.bar.baz` It is unclear how much of a performance benefit such an optimization would give in practice. -###### 3.3.2 Smaller browser footprint +###### 3.3.2 Optimisation: smaller browser footprint The browser can avoid storing *any* single-label eTLDs on disk or in memory. This allows a possible reduction in browser startup time, since it is loading fewer @@ -201,26 +201,14 @@ proposal; in that case, the full PSL dataset (including all single-label eTLDs) is already available and therefore this proposal would not be adding any overhead in terms of browser footprint due to storing the entire PSL. -For example, Firefox *does* already need to distinguish known vs unknown eTLDs -in order to determine whether to issue a search query or whether to try a navigation, -when a user enters a domain-like string in the navigation bar. In such instances, -a PSL lookup is made and: - -* If the domain has a known eTLD, attempt to navigate. -* If the domain has an unknown eTLD, use a search engine. +For example, Firefox *does* already need to distinguish known vs unknown eTLDs, +when determining whether to search or navigate upon receiving input into the url bar +(described in the [Use Cases](#use-cases) section). ##### 3.4 Recommendation -It is recommended that this API should only expose known eTLDs, and leave it up -to extensions to handle unknown eTLDs because: - -* none of the use cases identified later in this proposal are relevant to non-public -(intranet) hostnames -* unknown eTLDs due to out-of-date PSL datasets, although possible, are likely to be -somewhat infrequent and temporary -* even if it were possible to make certain performance optimisations by assuming -all single-label eTLDs are valid, any associated performance benefits -may not be significant +It is recommended that this API should provide the ability to determine whether +or not a candidate domain has a known eTLD. #### 4. 
IDN @@ -328,6 +316,25 @@ document, then the request is considered third-party. | #1 | foo.amazonaws.com | amazonaws.com | | | #2 | bar.us-east-1.amazonaws.com | bar.us-east-1.amazonaws.com | Yes | +#### 4. Search vs Navigate + +Firefox makes use of the PSL in order to determine whether to issue a search query +or whether to try a navigation, when a user enters a domain-like string in the +url bar. In such instances, a PSL lookup is made and: + +* If the domain has a known eTLD, attempt to navigate. +* If the domain has an unknown eTLD, use a search engine. + +#### 5. Site-specific data + +Extensions sometimes need to associate data with a hostname's "site", which may be: + +* a registrable domain (i.e. with a known eTLD) +* an IP address +* an intranet hostname with a non-public (i.e. unknown) eTLD, or without any suffix + +Examples of this kind of data include cookies and password autofill. + ### Known Consumers Mozilla intends to make use of this API in its [multi-account-containers](https://addons.mozilla.org/en-US/firefox/addon/multi-account-containers/) @@ -358,101 +365,83 @@ A new API `publicSuffix` is added as follows: ```ts namespace publicSuffix { - // - // Determines if all specified domains have the same registrable domain. - // - // Note: rejects with an error if any domain name has an unknown eTLD - // (i.e. not in the PSL). - // - // Example: - // - // let result = await browser.publicSuffix.hasSameRegistrableDomain( - // "www.example.co.uk", - // "xyz.example.co.uk", - // "foo.bar.baz.example.co.uk", - // ); - // ==> true - // - export function hasSameRegistrableDomain(...domains: string[]) : Promise; - - // - // Gets the longest registrable domain for a specified domain. - // - // Note: fulfils with null if the input domain name has an unknown eTLD. 
- // - // Example: - // - // let domain = await browser.publicSuffix.getRegistrableDomain("www.example.co.uk"); - // ==> "example.co.uk" - // + + + // METHODS + + // Determines if the given hostnames have the same registrable domain. + export function hasSameRegistrableDomain( + hostname1: string, + hostname2: string, + options?: RegistrableDomainOptions, + ) : Promise; + + // Gets the registrable domain of a given hostname. export function getRegistrableDomain( - // The domain name whose registrable domain we want to find - domain: string, - // Options that control the behaviour of the lookup algorithm + hostname: string, options?: RegistrableDomainOptions, ) - // Fulfils with the longest registrable domain of the input domain name. : Promise; - // - // Gets the longest registrable domain for each domain in a specified - // list of domains. - // - // Note: modelled on `Promise.allSettled()` - // - // Example: - // - // let domains = await browser.publicSuffix.getRegistrableDomains([ - // "foo.bar.wixsite.com", - // "www.example.net", - // "printer.homenet", - // "a..b", - // ]); - // ==> [ - // { status: "fulfilled", value: "bar.wixsite.com" }, - // { status: "fulfilled", value: "www.example.net" }, - // { status: "fulfilled", value: null }, - // { status: "rejected", reason: "Invalid domain name" }, - // ] - // + // Gets the registrable domain of each hostname in a given list of hostnames. 
export function getRegistrableDomains( - // The domain names whose registrable domains we want to find - domains: Iterable, - // Options that control the behaviour of the lookup algorithm + hostnames: Iterable, options?: RegistrableDomainOptions, ) - // Fulfils with a registrable domain result corresponding to each input domain name : Promise>; - // + // Determines which one of the following kinds of value applies to each hostname + // in a list of hostnames: + // RegistrableDomain + // PublicSuffix + // IPAddress + // Unknown + // Invalid + export function parse( + hostnames: Iterable, + ) + : Promise> + // Gets the value of the VERSION metadata field in the PSL dataset if available - // export function getVersion(): string | null; - // - // Options that may be passed to getRegistrableDomain() and getRegistrableDomains() - // to control their behaviour. - // + + // INTERFACES + + // Options that may be passed to the API's methods to control their behaviour. interface RegistrableDomainOptions { - // If true, the returned registrable domain(s) should be encoded as Unicode + // If true, each resulting registrable domain should be encoded as Unicode. + // Default = false (Punycode) unicode?: boolean, + // If false, IP addresses and hostnames lacking a known eTLD are + // treated as having registrable domains. + // Default = true + strict?: boolean, } - // - // Object containing the result of calculating the registrable domain for one of the - // domains in the array passed to getRegistrableDomains(). - // - // Note: modelled on `Promise.allSettled()` - // + // Object containing the result of calculating the registrable domain of one of + // the hostnames in the input list. interface RegistrableDomainResult { // A string, either "fulfilled" or "rejected", indicating the eventual state of the promise. status: string, - // Only present if status is "fulfilled". The calculated registrable domain, or null - // if the corresponding input domain has an unknown eTLD. 
+ // Only present if status is "fulfilled". The resulting registrable domain. value?: string | null, // Only present if status is "rejected". The reason that the promise was rejected with. reason?: string, } + + // Object containing the result of parsing a hostname. + interface ParseResult { + // The value obtained from the hostname. + value: string | null, + // The kind of value obtained, which must be one of the following: + // RegistrableDomain + // PublicSuffix + // IPAddress + // Unknown + // Invalid + kind: string, + } } ``` @@ -504,17 +493,34 @@ in the PSL are given below. These are described in more detail in a possible | sub.www.ck | ck | !www.ck | Exception rule | www.ck | | sub.sub.www.ck | ck | !www.ck | Exception rule | www.ck | -#### 4. Unknown Suffixes +#### 4. Strict -A `null` registrable domain should be returned for a candidate domain that is otherwise -valid but has an unknown eTLD (i.e. one that is not found in the PSL). +By default, if a hostname lacks a known eTLD (i.e. in the PSL), its registrable domain +is `null`. -##### Example +The same applies if the hostname is an IP address - IPv4 or IPv6. This avoids any +possibility of `100.200.30.2` and `100.200.31.2` being interpreted as belonging to the +same organization (i.e. with eTLD `.2`). + +In order to support use cases that need to determine a hostname's "site", +a `strict` option is provided, allowing a more general-purpose interpretation of +what constitutes a registrable domain that includes IP addresses and unknown eTLDs. 
+ +If a hostname lacks a known eTLD, and the option `strict` is set to `false`, +then the registrable domain is determined by the type of hostname as follows: -| Domain parameter | Registrable Domain | -|------------------------|-------------------:| -| www.example.com | example.com | -| www.example.foobar | `null` | +| Input hostname | Registrable domain | +|--------------------|-----------------------------------------------------| +| IP address | the input hostname itself (i.e. an IP address) | +| Non IP-address | the last domain label (i.e. an unknown TLD) | + +##### Examples: Registrable domains + +| Input hostname | strict = true | strict = false | +|------------------------|---------------:|---------------:| +| 1.2.3.4 | null | 1.2.3.4 | +| printer.homenet | null | homenet | +| com | null | com | #### 5. IDN @@ -535,15 +541,12 @@ using Unicode encoding. | unicode !== true (default) | example.xn--mgbx4cd0ab | | unicode === true | example.مليسيا | -#### 6. Invalid domain parameter +#### 6. Invalid hostname -The promises returned by this API's methods should reject with an error if a domain +The promises returned by this API's methods should reject with an error if a hostname passed as an input parameter meets any of the following criteria: * Contains a character that is invalid in an Internationalized Domain Name (IDN) - e.g. symbols, whitespace -* Is an IP address - IPv4 or IPv6. (Avoids `100.200.30.2` and `100.200.31.2` being interpreted as belonging to the same organization.) -* Is an eTLD itself -* Is a single domain label, regardless of whether or not it exists in the PSL * Is an empty string * Is equal to `'.'` * Contains empty domain labels (i.e. any occurrences of `'..'`) @@ -551,68 +554,100 @@ passed as an input parameter meets any of the following criteria: #### 7. 
Summary of behaviours The following table sets out the eventual settled state of the promise returned by -`getRegistrableDomain()` for different classes of input `domain` parameter: +`getRegistrableDomain()` for different classes of input `hostname` parameter: -| Domain parameter | Description | Registrable domain | +| Input hostname | Description | Registrable domain | |:-------------------|:-------------------------------------------------|-----------------------:| | example.net | eTLD+1 | example.net | | www.example.net | eTLD+2 | example.net | -| net | is an eTLD itself, single-label | Error | -| github.io | is an eTLD itself, multi-label | Error | -| foobar | no matching eTLD in PSL, single-label | Error | -| net.foobar | no matching eTLD in PSL, multi-label | `null` | +| net | is an eTLD itself, single-label | null | +| net | as above, with `strict === false` | net | +| github.io | is an eTLD itself, multi-label | github.io | +| foobar | no matching eTLD in PSL, single-label | null | +| foobar | as above, with `strict === false` | foobar | +| net.foobar | no matching eTLD in PSL, multi-label | null | +| net.foobar | as above, with `strict === false` | foobar | | foobar.net | has an eTLD in the ICANN section | foobar.net | | foobar.github.io | has an eTLD in the Private section | foobar.github.io | -| 127.0.0.1 | IP address, IPv4 | Error | -| [::1] | IPv6 address | Error | +| 127.0.0.1 | IP address, IPv4 | null | +| 127.0.0.1 | as above, with `strict === false` | 127.0.0.1 | +| [::1] | IPv6 address | null | +| [::1] | as above, with `strict === false` | [::1] | | EXAMPLE.NET | uppercase | example.net | | .example.net | dot in front | example.net | | example.net. | dot in the end, this is an FQDN | example.net. 
| -| *.com | contains invalid character `'*'` | Error | -| مليسيا | this is an IDN that is also an eTLD | Error | -| xn--mgbx4cd0ab | as above, but Punycode | Error | +| مليسيا | this is an IDN that is also an eTLD | null | +| xn--mgbx4cd0ab | as above, but Punycode | null | | foo.مليسيا | this is an IDN | foo.xn--mgbx4cd0ab | | foo.مليسيا | as above, with `unicode === true` | foo.مليسيا | | foo.xn--mgbx4cd0ab | this is an IDN, but Punycode | foo.xn--mgbx4cd0ab | | foo.xn--mgbx4cd0ab | as above, with `unicode === true` | foo.مليسيا | +| *.com | contains invalid character `'*'` | Error | | | empty string | Error | | . | no domain labels | Error | | example..com | contains an empty domain label | Error | #### 8. Comparing registrable domains -Method `hasSameRegistrableDomain(domain1, domain2)` is suited in particular to +Method `hasSameRegistrableDomain(hostname1, hostname2)` is suited in particular to [Use Case #3: Detect Third-Party Requests](#3-detect-third-party-requests), since it provides a simple way of comparing registrable domains. -This method is *almost but not quite* equivalent to: +This method should resolve to `true` if and only if the computed registrable domains +are **equal and nonnull**. -``` -await publicSuffix.getRegistrableDomain(domain1) === publicSuffix.getRegistrableDomain(domain2) -``` +The calculation of the registrable domains should apply the `strict` option if specified, +allowing IP addresses and unknown registrable domains to be compared. -The key difference is in the case of domains with unknown eTLDs (i.e. not in the PSL). -In such cases, `getRegistrableDomain()` returns `null`, which would mean -`hasSameRegistrableDomain()`, if implemented using the above code, would return `true` -for all domains with unknown eTLDs. This would likely be a surprising result for users -of this API. 
+##### Examples: Has same registrable domain -Therefore if any domain passed to `hasSameRegistrableDomain()` has an unknown eTLD, -the returned promise should reject with an error. +Given `homenet` and `mywork` are not eTLDs in the PSL: -##### Examples +| hostname1 | hostname2 | strict === true | strict === false | +|---------------------|------------------|-----------------:|-----------------:| +| foo.example.com | bar.example.com | true | true | +| foo.example.com | bar.example2.com | false | false | +| printer.homenet | printer.mywork | false | false | +| printer.homenet | printer.homenet | false | true | +| printer.homenet | backup.homenet | false | true | +| 1.2.3.4 | 1.2.3.4 | false | true | +| 1.2.3.4 | 2.2.3.4 | false | false | -Given `homenet` and `mywork` are not eTLDs in the PSL: +#### 9. Parsing + +Method `parse(hostnames)` is suited to use cases that need to determine the "site" +of a hostname. It is the counterpart to calling `getRegistrableDomain()` with the +`strict = false` option, in that it gets a hostname's registrable domain if possible, +or otherwise the most appropriate alternative value. + +The key difference is that the `parse()` method also returns an indicator of the +kind of value that was returned, not just the value itself. In addition, +the `parse()` method does not throw an error if the hostname is invalid. + +The `kind` value of the returned object must be one of the following: +`RegistrableDomain`, `PublicSuffix`, `Unknown`, `IPAddress`, `Invalid`. 
+ +##### Examples: parse result + +| Input hostname | Parse: value | Parse: kind | Explanation | +|-----------------|----------------|-------------------|-----------------------------------| +| foo.example.com | example.com | RegistrableDomain | eTLD+1 with a known eTLD | +| com | com | PublicSuffix | Known eTLD lacking a +1 label | +| printer.homenet | homenet | Unknown | Lacking a known eTLD | +| 1.2.3.4 | 1.2.3.4 | IPAddress | An IP address | +| *.com | null | Invalid | Contains invalid character '*' | + +##### 9.1 Justification for parsing -| Domain1 | Domain2 | hasSameRegistrableDomain() | -|---------------------|------------------|---------------------------:| -| foo.example.com | bar.example.com | true | -| foo.example.com | bar.example2.com | false | -| foo.example.com | backup.homenet | Error | -| printer.homenet | backup.homenet | Error | -| printer.homenet | backup.mywork | Error | +Some use cases may have need for the more fine-grained functionality offered by `parse()` +than that of the other API methods in this proposal. For example, if Firefox's +[Search vs Navigate](#4-search-vs-navigate) functionality was based purely on the return +value of `getRegistrableDomain()`, i.e. navigate if nonnull or search if null, +then IP addresses would incorrectly cause a search. However, it would be possible using +this `parse()` method to group IP addresses and known registrable domains together, +by checking the `kind` value of each parse result object. -#### 9. Batching +#### 10. Batching Method `getRegistrableDomains()` is suited in particular to [Use Case #2: Group Domains in UI](#2-group-domains-in-ui), since it enables @@ -636,10 +671,10 @@ input domain, the `status` field of the corresponding `RegistrableDomainResult` and the `value` field is the registrable domain, or `null` if the input domain has an unknown eTLD. An error during the lookup of any of the domain names does not cause the returned promise -to be rejected. 
Instead, the corresponding `RegistrableDomainResult` object has the `status` field +to be rejected. Instead, the corresponding fulfilment object has the `status` field set to `rejected` and the `reason` field holds the error description. -##### 9.1 Justification for batching +##### 10.1 Justification for batching As stated, the same information provided by `getRegistrableDomains()` could be obtained by simply calling `getRegistrableDomain()` multiple times. @@ -654,7 +689,7 @@ A quick mockup of the two approaches was built using a simplified implementation of this proposal's API in a modified Firefox, and the batching approach was about 2-3 times faster for 50 domains. -#### 10. PSL Version +#### 11. PSL Version Versioning metadata was introduced into the PSL with [this commit](https://github.com/publicsuffix/list/issues/1808#issuecomment-2455793503). @@ -749,9 +784,6 @@ additional functionality are identified. For example: 1. Provide method `getPublicSuffix()` to get the eTLD as opposed to the registrable domain. 2. Provide method `isPublicSuffix()` and/or `isRegistrableDomain()` for possibly improved efficiency in certain use cases. -3. Provide an option to handle domains with unknown eTLDs differently: instead of -returning a `null` registrable domain in these cases, the API would assume a single-label -eTLD and calculate the registrable domain accordingly. ### 2. 
Registrable Domains of Registrars From 5ba391fd8441e6754317e74216dad919e4ec9021 Mon Sep 17 00:00:00 2001 From: Francis McKenzie Date: Mon, 12 May 2025 12:31:27 +0800 Subject: [PATCH 5/7] Update Public Suffix API proposal --- proposals/public-suffix.md | 444 +++++++++++++++++++------------------ 1 file changed, 225 insertions(+), 219 deletions(-) diff --git a/proposals/public-suffix.md b/proposals/public-suffix.md index 1dad3e81..ecddc1c8 100644 --- a/proposals/public-suffix.md +++ b/proposals/public-suffix.md @@ -273,6 +273,14 @@ to entirely prevent a user from using an eTLD as a filter) the registrable domain of each web request, and testing this using appropriate regexes to decide whether or not to act on the request +It is noted that filtering rule type #2 should only be supported by this proposal +as it applies to calculating the registrable domain of an incoming web request, +and using that registrable domain to compare against filtering rules to find a match. +Other than this, there is also a declarative use case that entails auto-generating +all filtering rules for every possible known eTLD that could be matched by +an input filtering rule such as `myorg.*`. This kind of declarative use case is +out of scope, and will not be enabled by this proposal. + #### 2. Group Domains in UI Where extensions present lists of domain names to users, it can be beneficial @@ -369,38 +377,24 @@ namespace publicSuffix { // METHODS - // Determines if the given hostnames have the same registrable domain. - export function hasSameRegistrableDomain( - hostname1: string, - hostname2: string, - options?: RegistrableDomainOptions, - ) : Promise; + // Determines if the given hostname is itself a known eTLD (i.e. in the PSL). + export function isKnownPublicSuffix( + hostname: string, + ) + : boolean; - // Gets the registrable domain of a given hostname. - export function getRegistrableDomain( + // Gets the known eTLD, if any, of a given hostname. 
+ export function getKnownPublicSuffix( hostname: string, - options?: RegistrableDomainOptions, ) - : Promise; + : string | null; - // Gets the registrable domain of each hostname in a given list of hostnames. - export function getRegistrableDomains( - hostnames: Iterable, + // Gets the registrable domain of a given hostname. + export function getRegistrableDomain( + hostname: string, options?: RegistrableDomainOptions, ) - : Promise>; - - // Determines which one of the following kinds of value applies to each hostname - // in a list of hostnames: - // RegistrableDomain - // PublicSuffix - // IPAddress - // Unknown - // Invalid - export function parse( - hostnames: Iterable, - ) - : Promise> + : string | null; // Gets the value of the VERSION metadata field in the PSL dataset if available export function getVersion(): string | null; @@ -408,56 +402,47 @@ namespace publicSuffix { // INTERFACES - // Options that may be passed to the API's methods to control their behaviour. + // Options that may be passed to the API method to control its behaviour. interface RegistrableDomainOptions { - // If true, each resulting registrable domain should be encoded as Unicode. + // If true, the resulting registrable domain should be encoded as Unicode. // Default = false (Punycode) unicode?: boolean, - // If false, IP addresses and hostnames lacking a known eTLD are - // treated as having registrable domains. - // Default = true - strict?: boolean, - } - - // Object containing the result of calculating the registrable domain of one of - // the hostnames in the input list. - interface RegistrableDomainResult { - // A string, either "fulfilled" or "rejected", indicating the eventual state of the promise. - status: string, - // Only present if status is "fulfilled". The resulting registrable domain. - value?: string | null, - // Only present if status is "rejected". The reason that the promise was rejected with. - reason?: string, + // If true, an IP address is a registrable domain. 
+ // Default = false + allowIP?: boolean, + // If true, a known eTLD is a registrable domain. + // Default = false + allowPlainSuffix?: boolean, + // If true, a hostname that lacks a known eTLD is a registrable domain. + // Default = false + allowUnknownSuffix?: boolean, } - // Object containing the result of parsing a hostname. - interface ParseResult { - // The value obtained from the hostname. - value: string | null, - // The kind of value obtained, which must be one of the following: - // RegistrableDomain - // PublicSuffix - // IPAddress - // Unknown - // Invalid - kind: string, - } } ``` ### Behaviours -#### 1. PSL Sections +#### 1. PSL Algorithm + +The major browsers currently have their own implementations of the PSL-handling logic, +and attempts to standardise the various implementations across browsers have thus-far been unsuccessful. A possible [PSL algorithm](https://github.com/publicsuffix/list/wiki/Format#formal-algorithm) +is available, but this may not be authoritative. Therefore it is proposed +that the browsers should follow their existing PSL-handling logic when implementing +this proposal's API. Key features of the algorithm are highlighted here +for informational purposes. + +##### 1.1 PSL Sections Lookups of the PSL dataset should **include** all eTLDs from both PSL sections, i.e. both ICANN-section and Private-section eTLDs. -#### 2. Multiple eTLDs per Domain +##### 1.2 Multiple eTLDs per Domain Lookups of the PSL dataset should always select the **longest** matching eTLD when determining the registrable domain. -##### Example +###### Example `domain` = foo.bar.lib.de.us @@ -466,21 +451,30 @@ when determining the registrable domain. | lib.de.us | | de.us | -The longest is lib.de.us, so `getRegistrableDomain()` resolves to bar.lib.de.us +The longest is lib.de.us, so the API's methods yield the following: -#### 3. 
PSL Special Rules +| eTLD | Registrable Domain | +|------------:|-------------------:| +| lib.de.us | bar.lib.de.us | -It is noted that the major browsers currently have their own implementations -of the PSL-handling logic, and attempts to standardise the various implementations -across browsers have thus-far been unsuccessful. Therefore it is proposed that the -browsers should follow their existing PSL-handling rules when implementing this -proposal's API. +##### 1.3 eTLD vs Registrable Domain -For informational purposes, examples of the 'wildcard' and 'exception' rules -in the PSL are given below. These are described in more detail in a possible -[PSL algorithm](https://github.com/publicsuffix/list/wiki/Format#formal-algorithm). +According to the [URL specification](https://url.spec.whatwg.org/#host-registrable-domain), +any domain that is itself an eTLD in the PSL cannot have a registrable domain. +However, reviewers of this proposal have noted that some PSL eTLDs do have their own websites, +e.g. github.io and blogspot.com. Therefore it may be worthwhile updating the PSL algorithm +to allow registrable domains to be obtained from known eTLDs, and browsers would subsequently +need to update their implementations. -##### Examples +Given this proposal's API exposes the host browser's existing PSL-handling logic, +by default this API should treat known eTLDs as not having registrable domains. + +##### 1.4 PSL Special Rules + +Entries in the PSL dataset include instances of 'wildcard' and 'exception' rules, +whose effects are demonstrated using the following examples. + +###### Examples | Domain | Public suffix | Matched PSL rule | Explanation | Registrable Domain | |-------------------------:|--------------:|-----------------:|:--------------:|-------------------:| @@ -493,41 +487,136 @@ in the PSL are given below. 
These are described in more detail in a possible | sub.www.ck | ck | !www.ck | Exception rule | www.ck | | sub.sub.www.ck | ck | !www.ck | Exception rule | www.ck | -#### 4. Strict +#### 2. API Methods + +##### 2.1 Public Suffix + +Method `getKnownPublicSuffix()` returns the input hostname's known eTLD (i.e. in the PSL) +if it has one, otherwise `null`. + +Method `isKnownPublicSuffix()` returns `true` if and only if the input hostname is itself +a known eTLD. In other words, this method returns `true` if calling `getKnownPublicSuffix()` +with the input hostname returns the input hostname itself. + +These methods are included in the API because the PSL algorithm returns the longest eTLD, +but sometimes one may be interested in knowing whether there is any other shorter eTLD +that might have matched the input hostname in theory. E.g. 'github.io' is a +public suffix itself, but could also be interpreted as a registrable domain +whose public suffix is 'io'. + +###### Examples + +| Input hostname | Public Suffix | +|----------------|--------------:| +| github.io | github.io | +| foo.github.io | github.io | +| facebook.co.uk | co.uk | +| 192.168.2.1 | null | +| green.banana | null | + +##### 2.2 Registrable Domain + +Method `getRegistrableDomain()` returns the input hostname's registrable domain, +as determined by running the PSL algorithm, otherwise `null`. + +By default, this method returns `null` if the input hostname: + +* lacks a known eTLD (i.e. in the PSL) +* is itself a known eTLD +* is an IP address - IPv4 or IPv6 + +##### 2.2.1 Options: Registrable Domain + +In order to support different use cases including those that need to determine +a hostname's "site", additional options are provided, allowing a more +general-purpose interpretation of what constitutes a registrable domain +that includes IP addresses and unknown eTLDs. -By default, if a hostname lacks a known eTLD (i.e. in the PSL), its registrable domain -is `null`. 
+Options `allowIP`, `allowPlainSuffix` and `allowUnknownSuffix` each target +a specific kind of input hostname lacking a registrable domain +in the strictest sense (i.e. having a known eTLD as stipulated by +the PSL algorithm), as follows: -The same applies if the hostname is an IP address - IPv4 or IPv6. This avoids any -possibility of `100.200.30.2` and `100.200.31.2` being interpreted as belonging to the -same organization (i.e. with eTLD `.2`). +| Option | Kind of Input Hostname Targetted | +|--------------------|---------------------------------:| +| allowIP | IP Address (IPv4 of IPv6) | +| allowPlainSuffix | is itself a known eTLD | +| allowUnknownSuffix | lacks a known eTLD | -In order to support use cases that need to determine a hostname's "site", -a `strict` option is provided, allowing a more general-purpose interpretation of -what constitutes a registrable domain that includes IP addresses and unknown eTLDs. +The effect of each option when applied to an input hostname of the +kind targetted by the option is to change the registrable domain +from being `null` to being instead *the full input hostname itself*. -If a hostname lacks a known eTLD, and the option `strict` is set to `false`, -then the registrable domain is determined by the type of hostname as follows: +###### Examples -| Input hostname | Registrable domain | -|--------------------|-----------------------------------------------------| -| IP address | the input hostname itself (i.e. an IP address) | -| Non IP-address | the last domain label (i.e. 
an unknown TLD) | +| Input hostname | Option = true | Registrable domain | +|-------------------|--------------------|-------------------:| +| 192.168.2.1 | allowIP | 192.168.2.1 | +| github.io | allowPlainSuffix | github.io | +| apple.pear.banana | allowUnknownSuffix | apple.pear.banana | -##### Examples: Registrable domains +##### 2.2.2 Options: Justification -| Input hostname | strict = true | strict = false | -|------------------------|---------------:|---------------:| -| 1.2.3.4 | null | 1.2.3.4 | -| printer.homenet | null | homenet | -| com | null | com | +Option `allowUnknownSuffix` supports use cases that need to target +not only domains on the internet having known eTLDs, but also +intranet hostnames having non-public (i.e. unknown) suffixes, or no suffix. -#### 5. IDN +Reviewers of this proposal note that if it were the case that non-domains +were included by default, `getRegistrableDomain()` would effectively +return a string for almost every input. -All API methods should accept domains passed as input parameters using either +As a result of the inclusion of unknown suffixes, the API implementation must +take care to ensure input hostnames that are IP addresses (IPv4 or IPv6) are detected +*before* running the algorithm. IP addresses must be treated differently, +otherwise it would be possible for `100.200.30.2` and `100.200.31.2` to be +mistakenly interpreted as belonging to the same organization (i.e. with unknown eTLD `.2`). + +Option `allowIP` supports use cases needing to obtain a hostname's site, +which may be an IP address or a domain name. + +An example of such a use case is Firefox's [Search vs Navigate](#4-search-vs-navigate), +which involves determining if an entry in the URL bar is a navigable site, +or a search term. If this functionality was based purely on the return value +of `getRegistrableDomain()`, i.e. navigate if nonnull or search if null, +then IP addresses would incorrectly cause a search. 
By using the `allowIP` option, +the return value for an input IP address would be the IP address itself instead of null, +thereby causing the desired result of navigating instead of searching. + +Option `allowPlainSuffix` only exists because there are domains that do not have +a registrable domain, due to themselves being PSL eTLDs, but can still be +navigated to, such as github.io and blogspot.com. + +##### 2.2.3 Options: Discussion + +The effect of the options is that `getRegistrableDomain()` may return values +that are not registrable domains in the strictest sense, e.g. they may +be IP addresses. + +The author of this proposal is of the view that: + +1. Any method named `getXYZ()` should return a value of type `XYZ`. Therefore +`getRegistrableDomain()` may not be the most suitable name, since it does +not always return true registrable domains. Reviewers of this proposal +feel this is not a significant enough issue to warrant alternative naming. + +2. This API should provide a way not just to get a hostname's +registrable-domain-like value, but also to know what kind of value that is, +be it an IP address, a domain name, or an intranet hostname lacking a known eTLD. +Reviewers of this proposal are of the view that no compelling use case has been +identified to support the need for such additional functionality. However, +reviewers have conceded that IP addresses have to be special-cased, because for +most domain inputs, one could split at dots to try and get a different domain level, +but that logic does not make sense for IP addresses. By not providing a way of +knowing whether the return value of `getRegistrableDomain()` is an IP address +or a domain name, it is more difficult for users of this API to implement +the special-casing that the reviewers have identified. + +#### 3. IDN + +All API methods should accept hostnames passed as input parameters using either Unicode or Punycode encoding. 
-Methods that return registrable domains should encode them using Punycode +Methods that return registrable domains or eTLDs should encode them using Punycode encoding by default, unless an `options` object is passed as an input parameter with key `unicode` set to `true`, in which case they should be encoded using Unicode encoding. @@ -538,10 +627,10 @@ using Unicode encoding. | Option | Registrable Domain | |----------------------------|-----------------------:| -| unicode !== true (default) | example.xn--mgbx4cd0ab | -| unicode === true | example.مليسيا | +| unicode == false (default) | example.xn--mgbx4cd0ab | +| unicode == true | example.مليسيا | -#### 6. Invalid hostname +#### 4. Invalid hostname The promises returned by this API's methods should reject with an error if a hostname passed as an input parameter meets any of the following criteria: @@ -551,7 +640,7 @@ passed as an input parameter meets any of the following criteria: * Is equal to `'.'` * Contains empty domain labels (i.e. any occurrences of `'..'`) -#### 7. Summary of behaviours +#### 5. 
Summary of behaviours The following table sets out the eventual settled state of the promise returned by `getRegistrableDomain()` for different classes of input `hostname` parameter: @@ -561,133 +650,61 @@ The following table sets out the eventual settled state of the promise returned | example.net | eTLD+1 | example.net | | www.example.net | eTLD+2 | example.net | | net | is an eTLD itself, single-label | null | -| net | as above, with `strict === false` | net | -| github.io | is an eTLD itself, multi-label | github.io | +| net | as above, with `allowPlainSuffix = true` | net | +| github.io | is an eTLD itself, multi-label | null | +| github.io | as above, with `allowPlainSuffix = true` | github.io | | foobar | no matching eTLD in PSL, single-label | null | -| foobar | as above, with `strict === false` | foobar | -| net.foobar | no matching eTLD in PSL, multi-label | null | -| net.foobar | as above, with `strict === false` | foobar | +| foobar | as above, with `allowUnknownSuffix = true` | foobar | +| my.net.foobar | no matching eTLD in PSL, multi-label | null | +| my.net.foobar | as above, with `allowUnknownSuffix = true` | my.net.foobar | | foobar.net | has an eTLD in the ICANN section | foobar.net | | foobar.github.io | has an eTLD in the Private section | foobar.github.io | | 127.0.0.1 | IP address, IPv4 | null | -| 127.0.0.1 | as above, with `strict === false` | 127.0.0.1 | +| 127.0.0.1 | as above, with `allowIP = true` | 127.0.0.1 | | [::1] | IPv6 address | null | -| [::1] | as above, with `strict === false` | [::1] | +| [::1] | as above, with `allowIP = true` | [::1] | | EXAMPLE.NET | uppercase | example.net | | .example.net | dot in front | example.net | | example.net. | dot in the end, this is an FQDN | example.net. 
| | مليسيا | this is an IDN that is also an eTLD | null | | xn--mgbx4cd0ab | as above, but Punycode | null | | foo.مليسيا | this is an IDN | foo.xn--mgbx4cd0ab | -| foo.مليسيا | as above, with `unicode === true` | foo.مليسيا | +| foo.مليسيا | as above, with `unicode = true` | foo.مليسيا | | foo.xn--mgbx4cd0ab | this is an IDN, but Punycode | foo.xn--mgbx4cd0ab | -| foo.xn--mgbx4cd0ab | as above, with `unicode === true` | foo.مليسيا | +| foo.xn--mgbx4cd0ab | as above, with `unicode = true` | foo.مليسيا | | *.com | contains invalid character `'*'` | Error | | | empty string | Error | | . | no domain labels | Error | | example..com | contains an empty domain label | Error | -#### 8. Comparing registrable domains - -Method `hasSameRegistrableDomain(hostname1, hostname2)` is suited in particular to -[Use Case #3: Detect Third-Party Requests](#3-detect-third-party-requests), -since it provides a simple way of comparing registrable domains. - -This method should resolve to `true` if and only if the computed registrable domains -are **equal and nonnull**. - -The calculation of the registrable domains should apply the `strict` option if specified, -allowing IP addresses and unknown registrable domains to be compared. - -##### Examples: Has same registrable domain - -Given `homenet` and `mywork` are not eTLDs in the PSL: - -| hostname1 | hostname2 | strict === true | strict === false | -|---------------------|------------------|-----------------:|-----------------:| -| foo.example.com | bar.example.com | true | true | -| foo.example.com | bar.example2.com | false | false | -| printer.homenet | printer.mywork | false | false | -| printer.homenet | printer.homenet | false | true | -| printer.homenet | backup.homenet | false | true | -| 1.2.3.4 | 1.2.3.4 | false | true | -| 1.2.3.4 | 2.2.3.4 | false | false | - -#### 9. Parsing - -Method `parse(hostnames)` is suited to use cases that need to determine the "site" -of a hostname. 
It is the counterpart to calling `getRegistrableDomain()` with the -`strict = false` option, in that it gets a hostname's registrable domain if possible, -or otherwise the most appropriate alternative value. - -The key difference is that the `parse()` method also returns an indicator of the -kind of value that was returned, not just the value itself. In addition, -the `parse()` method does not throw an error if the hostname is invalid. - -The `kind` value of the returned object must be one of the following: -`RegistrableDomain`, `PublicSuffix`, `Unknown`, `IPAddress`, `Invalid`. - -##### Examples: parse result +#### 6. Sync vs Async -| Input hostname | Parse: value | Parse: kind | Explanation | -|-----------------|----------------|-------------------|-----------------------------------| -| foo.example.com | example.com | RegistrableDomain | eTLD+1 with a known eTLD | -| com | com | PublicSuffix | Known eTLD lacking a +1 label | -| printer.homenet | homenet | Unknown | Lacking a known eTLD | -| 1.2.3.4 | 1.2.3.4 | IPAddress | An IP address | -| *.com | null | Invalid | Contains invalid character '*' | - -##### 9.1 Justification for parsing - -Some use cases may have need for the more fine-grained functionality offered by `parse()` -than that of the other API methods in this proposal. For example, if Firefox's -[Search vs Navigate](#4-search-vs-navigate) functionality was based purely on the return -value of `getRegistrableDomain()`, i.e. navigate if nonnull or search if null, -then IP addresses would incorrectly cause a search. However, it would be possible using -this `parse()` method to group IP addresses and known registrable domains together, -by checking the `kind` value of each parse result object. - -#### 10. Batching - -Method `getRegistrableDomains()` is suited in particular to -[Use Case #2: Group Domains in UI](#2-group-domains-in-ui), since it enables -extensions to get the registrable domains of multiple domains with a single API call. 
- -This method is equivalent to calling `Promise.allSettled()` using the results of multiple -individual calls to `getRegistrableDomain()`. E.g. for three domains: - -``` -Promise.allSettled( - publicSuffix.getRegistrableDomain(domain1), - publicSuffix.getRegistrableDomain(domain2), - publicSuffix.getRegistrableDomain(domain3), -) -``` - -The promise returned by `getRegistrableDomains()` mirrors that returned by `Promise.allSettled()`: -the fulfilment value is an array of `RegistrableDomainResult` objects, in the order of the -input domains passed, with the same fields as those of `Promise.allSettled()`. For each valid -input domain, the `status` field of the corresponding `RegistrableDomainResult` object is `fulfilled` -and the `value` field is the registrable domain, or `null` if the input domain has an unknown eTLD. - -An error during the lookup of any of the domain names does not cause the returned promise -to be rejected. Instead, the corresponding fulfilment object has the `status` field -set to `rejected` and the `reason` field holds the error description. - -##### 10.1 Justification for batching - -As stated, the same information provided by `getRegistrableDomains()` could be obtained -by simply calling `getRegistrableDomain()` multiple times. +Browser extension APIs are most commonly async, with API methods returning Promises. +Earlier versions of this proposal set out an async API, with `getRegistrableDomain()` +returning a `Promise`. However, some use cases require getting lists of +registrable domains all in one go. In theory, this could be achieved by simply calling +`getRegistrableDomain()` multiple times. The problem with this approach is that there is overhead associated with an extension -calling an async function on the parent browser. For example, obtaining the registrable domain -for a list of 50 domains would involve making 50 async calls to the parent browser. 
-However, with the batching approach afforded by `getRegistrableDomains()`, only -a single async call to the parent browser would be made, passing all 50 domains at once. - -A quick mockup of the two approaches was built using a simplified implementation -of this proposal's API in a modified Firefox, and the batching approach was -about 2-3 times faster for 50 domains. +calling an async function on the parent browser. For example, obtaining the registrable domains +of a list of 50 domains would involve making 50 async calls to the parent browser. +A batching method would allow the same result to be obtained with a single async call. + +For this reason, batching method `getRegistrableDomains()` was added to this API. +The method accepted an array of hostnames as input and returned a promise resolving to +an array of registrable domains. A quick mockup of the two approaches was built using +a simplified implementation of this proposal's API in a modified Firefox, and the +batching approach was about 2-3 times faster for 50 domains. + +Unfortunately, while this offered a solution to the performance problem, +it added additional complexity to the API. To resolve this issue, the API +has now been changed to being synchronous, which has allowed the batching method +to be removed, thereby making the API more ergonomic. + +A high level analysis of the implementations of the major browser engines +(Firefox, Chromium, Webkit) indicates that the synchronous approach is feasible, +since the required functionality is already available in each browser engine's +content/render process. #### 11. PSL Version @@ -774,18 +791,7 @@ conform to the expected behaviours of the API in this proposal. ## Future Work -### 1. Extend the API - -The major browsers provide additional methods/parameters internally for getting -information related to the registrable domain. 
The API could be extended -to expose more of these internal methods/parameters, if relevant use cases for such -additional functionality are identified. For example: - -1. Provide method `getPublicSuffix()` to get the eTLD as opposed to the registrable domain. -2. Provide method `isPublicSuffix()` and/or `isRegistrableDomain()` for possibly improved -efficiency in certain use cases. - -### 2. Registrable Domains of Registrars +### 1. Registrable Domains of Registrars In the case where a domain has multiple matching eTLDs in the PSL and therefore more than one possible registrable domain, this proposal's API always selects @@ -797,10 +803,10 @@ Theoretically, there may be cases where all organizations using a certain registrar are behaving similarly (e.g. distributing malware), and therefore extensions may want to provide support for carrying out some action (e.g. malware blocking) against the shared registrar itself, rather than against -each individual content-owning organization. Support for any such use case -could be handled in a future update to this API. +each individual content-owning organization. More explicit support for such +use cases could be handled in a future update to this API. -### 3. Change Notifications +### 2. Change Notifications The PSL dataset, used by the browsers to determine registrable domains, is a dynamic dataset that can change at any time. Although this API provides a function From f1d81e5c10e93748e49c7a018ac9351b404fdd6d Mon Sep 17 00:00:00 2001 From: Francis McKenzie Date: Wed, 14 May 2025 23:27:09 +0800 Subject: [PATCH 6/7] Update Public Suffix API proposal --- proposals/public-suffix.md | 156 +++++++++++++++++++------------------ 1 file changed, 80 insertions(+), 76 deletions(-) diff --git a/proposals/public-suffix.md b/proposals/public-suffix.md index ecddc1c8..9dee851c 100644 --- a/proposals/public-suffix.md +++ b/proposals/public-suffix.md @@ -179,11 +179,11 @@ This may save a few CPU cycles for every candidate domain lookup. 
Example candidate domain: `foo.bar.baz` -| Step | Domain | Search in PSL? | -|:----:|:------:|:------:| -| 1 | `foo.bar.baz` | yes | -| 2 | `bar.baz` | yes | -| 3 | `baz` | no | +| Step | Domain | Search in PSL? | +|:----:|:-------------:|:--------------:| +| 1 | `foo.bar.baz` | yes | +| 2 | `bar.baz` | yes | +| 3 | `baz` | no | It is unclear how much of a performance benefit such an optimization would give in practice. @@ -378,21 +378,21 @@ namespace publicSuffix { // METHODS // Determines if the given hostname is itself a known eTLD (i.e. in the PSL). - export function isKnownPublicSuffix( + export function isKnownSuffix( hostname: string, ) : boolean; // Gets the known eTLD, if any, of a given hostname. - export function getKnownPublicSuffix( + export function getKnownSuffix( hostname: string, ) : string | null; // Gets the registrable domain of a given hostname. - export function getRegistrableDomain( + export function getDomain( hostname: string, - options?: RegistrableDomainOptions, + options?: DomainOptions, ) : string | null; @@ -403,17 +403,17 @@ namespace publicSuffix { // INTERFACES // Options that may be passed to the API method to control its behaviour. - interface RegistrableDomainOptions { - // If true, the resulting registrable domain should be encoded as Unicode. + interface DomainOptions { + // If true, the returned domain should be encoded as Unicode. // Default = false (Punycode) unicode?: boolean, - // If true, an IP address is a registrable domain. + // If true, the returned domain may be an IP address. // Default = false allowIP?: boolean, - // If true, a known eTLD is a registrable domain. + // If true, the returned domain may be a known eTLD. // Default = false allowPlainSuffix?: boolean, - // If true, a hostname that lacks a known eTLD is a registrable domain. + // If true, the returned domain may lack a known eTLD. 
// Default = false allowUnknownSuffix?: boolean, } @@ -489,13 +489,13 @@ whose effects are demonstrated using the following examples. #### 2. API Methods -##### 2.1 Public Suffix +##### 2.1 Known Suffix -Method `getKnownPublicSuffix()` returns the input hostname's known eTLD (i.e. in the PSL) +Method `getKnownSuffix()` returns the input hostname's known eTLD (i.e. in the PSL) if it has one, otherwise `null`. -Method `isKnownPublicSuffix()` returns `true` if and only if the input hostname is itself -a known eTLD. In other words, this method returns `true` if calling `getKnownPublicSuffix()` +Method `isKnownSuffix()` returns `true` if and only if the input hostname is itself +a known eTLD. In other words, this method returns `true` if calling `getKnownSuffix()` with the input hostname returns the input hostname itself. These methods are included in the API because the PSL algorithm returns the longest eTLD, @@ -506,7 +506,7 @@ whose public suffix is 'io'. ###### Examples -| Input hostname | Public Suffix | +| Input hostname | Known Suffix | |----------------|--------------:| | github.io | github.io | | foo.github.io | github.io | @@ -514,9 +514,9 @@ whose public suffix is 'io'. | 192.168.2.1 | null | | green.banana | null | -##### 2.2 Registrable Domain +##### 2.2 Domain -Method `getRegistrableDomain()` returns the input hostname's registrable domain, +Method `getDomain()` returns the input hostname's registrable domain, as determined by running the PSL algorithm, otherwise `null`. 
By default, this method returns `null` if the input hostname: @@ -525,35 +525,42 @@ By default, this method returns `null` if the input hostname: * is itself a known eTLD * is an IP address - IPv4 or IPv6 -##### 2.2.1 Options: Registrable Domain +##### 2.2.1 Options: Domain In order to support different use cases including those that need to determine a hostname's "site", additional options are provided, allowing a more -general-purpose interpretation of what constitutes a registrable domain -that includes IP addresses and unknown eTLDs. +general-purpose interpretation of a domain to include not only registrable domains +but also IP addresses and domains with unknown (non-registrable) eTLDs. Options `allowIP`, `allowPlainSuffix` and `allowUnknownSuffix` each target a specific kind of input hostname lacking a registrable domain in the strictest sense (i.e. having a known eTLD as stipulated by the PSL algorithm), as follows: -| Option | Kind of Input Hostname Targetted | +| Option | Kind of Input Hostname Targeted | |--------------------|---------------------------------:| | allowIP | IP Address (IPv4 of IPv6) | | allowPlainSuffix | is itself a known eTLD | | allowUnknownSuffix | lacks a known eTLD | The effect of each option when applied to an input hostname of the -kind targetted by the option is to change the registrable domain -from being `null` to being instead *the full input hostname itself*. 
+kind targeted by the option is to change the returned domain +from being `null` to being the following: + +| Option | Returned Domain | Returned Domain Kind | +|--------------------|:------------------------------------------------:|:--------------------:| +| allowIP | input hostname | IP address | +| allowPlainSuffix | input hostname | eTLD | +| allowUnknownSuffix | last 2 labels, or input hostname if single label | eTLD+1 or eTLD | ###### Examples -| Input hostname | Option = true | Registrable domain | +| Input hostname | Option = true | Returned domain | |-------------------|--------------------|-------------------:| | 192.168.2.1 | allowIP | 192.168.2.1 | | github.io | allowPlainSuffix | github.io | -| apple.pear.banana | allowUnknownSuffix | apple.pear.banana | +| apple.pear.banana | allowUnknownSuffix | pear.banana | +| banana | allowUnknownSuffix | banana | ##### 2.2.2 Options: Justification @@ -562,7 +569,7 @@ not only domains on the internet having known eTLDs, but also intranet hostnames having non-public (i.e. unknown) suffixes, or no suffix. Reviewers of this proposal note that if it were the case that non-domains -were included by default, `getRegistrableDomain()` would effectively +were included by default, `getDomain()` would effectively return a string for almost every input. As a result of the inclusion of unknown suffixes, the API implementation must @@ -577,7 +584,7 @@ which may be an IP address or a domain name. An example of such a use case is Firefox's [Search vs Navigate](#4-search-vs-navigate), which involves determining if an entry in the URL bar is a navigable site, or a search term. If this functionality was based purely on the return value -of `getRegistrableDomain()`, i.e. navigate if nonnull or search if null, +of `getDomain()`, i.e. navigate if nonnull or search if null, then IP addresses would incorrectly cause a search. 
By using the `allowIP` option, the return value for an input IP address would be the IP address itself instead of null, thereby causing the desired result of navigating instead of searching. @@ -586,31 +593,6 @@ Option `allowPlainSuffix` only exists because there are domains that do not have a registrable domain, due to themselves being PSL eTLDs, but can still be navigated to, such as github.io and blogspot.com. -##### 2.2.3 Options: Discussion - -The effect of the options is that `getRegistrableDomain()` may return values -that are not registrable domains in the strictest sense, e.g. they may -be IP addresses. - -The author of this proposal is of the view that: - -1. Any method named `getXYZ()` should return a value of type `XYZ`. Therefore -`getRegistrableDomain()` may not be the most suitable name, since it does -not always return true registrable domains. Reviewers of this proposal -feel this is not a significant enough issue to warrant alternative naming. - -2. This API should provide a way not just to get a hostname's -registrable-domain-like value, but also to know what kind of value that is, -be it an IP address, a domain name, or an intranet hostname lacking a known eTLD. -Reviewers of this proposal are of the view that no compelling use case has been -identified to support the need for such additional functionality. However, -reviewers have conceded that IP addresses have to be special-cased, because for -most domain inputs, one could split at dots to try and get a different domain level, -but that logic does not make sense for IP addresses. By not providing a way of -knowing whether the return value of `getRegistrableDomain()` is an IP address -or a domain name, it is more difficult for users of this API to implement -the special-casing that the reviewers have identified. - #### 3. IDN All API methods should accept hostnames passed as input parameters using either @@ -625,15 +607,15 @@ using Unicode encoding. 
`domain` = foo.bar.example.مليسيا -| Option | Registrable Domain | +| Option | Returned Domain | |----------------------------|-----------------------:| | unicode == false (default) | example.xn--mgbx4cd0ab | | unicode == true | example.مليسيا | #### 4. Invalid hostname -The promises returned by this API's methods should reject with an error if a hostname -passed as an input parameter meets any of the following criteria: +This API's methods should throw an error if a hostname passed as an input parameter +meets any of the following criteria: * Contains a character that is invalid in an Internationalized Domain Name (IDN) - e.g. symbols, whitespace * Is an empty string @@ -642,10 +624,10 @@ passed as an input parameter meets any of the following criteria: #### 5. Summary of behaviours -The following table sets out the eventual settled state of the promise returned by -`getRegistrableDomain()` for different classes of input `hostname` parameter: +The following table sets out the value returned by `getDomain()` for different +classes of input `hostname` parameter: -| Input hostname | Description | Registrable domain | +| Input hostname | Description | Returned domain | |:-------------------|:-------------------------------------------------|-----------------------:| | example.net | eTLD+1 | example.net | | www.example.net | eTLD+2 | example.net | @@ -656,7 +638,7 @@ The following table sets out the eventual settled state of the promise returned | foobar | no matching eTLD in PSL, single-label | null | | foobar | as above, with `allowUnknownSuffix = true` | foobar | | my.net.foobar | no matching eTLD in PSL, multi-label | null | -| my.net.foobar | as above, with `allowUnknownSuffix = true` | my.net.foobar | +| my.net.foobar | as above, with `allowUnknownSuffix = true` | net.foobar | | foobar.net | has an eTLD in the ICANN section | foobar.net | | foobar.github.io | has an eTLD in the Private section | foobar.github.io | | 127.0.0.1 | IP address, IPv4 | null | @@ 
-680,21 +662,21 @@ The following table sets out the eventual settled state of the promise returned #### 6. Sync vs Async Browser extension APIs are most commonly async, with API methods returning Promises. -Earlier versions of this proposal set out an async API, with `getRegistrableDomain()` -returning a `Promise`. However, some use cases require getting lists of +Earlier versions of this proposal set out an async API, with `getDomain()` +returning a `Promise`. However, some use cases require getting lists of registrable domains all in one go. In theory, this could be achieved by simply calling -`getRegistrableDomain()` multiple times. +`getDomain()` multiple times. The problem with this approach is that there is overhead associated with an extension calling an async function on the parent browser. For example, obtaining the registrable domains -of a list of 50 domains would involve making 50 async calls to the parent browser. +of a list of 50 hostnames would involve making 50 async calls to the parent browser. A batching method would allow the same result to be obtained with a single async call. -For this reason, batching method `getRegistrableDomains()` was added to this API. +For this reason, batching method `getDomains()` was added to this API. The method accepted an array of hostnames as input and returning a promise resolving to an array of registrable domains. A quick mockup of the two approaches was built using a simplified implementation of this proposal's API in a modified Firefox, and the -batching approach was about 2-3 times faster for 50 domains. +batching approach was about 2-3 times faster for 50 hostnames. Unfortunately, while this offered a solution to the performance problem, it added additional complexity to the API. To resolve this issue, the API @@ -763,14 +745,17 @@ done by the host browser. ### Open Web API -The purpose of this API is to eliminate the potential for inconsistency between -the host browser and its hosted extensions. 
The simplest way of achieving this -is for extensions to access this functionality via the host browser itself rather -than via some external source, such as an Open Web API. +Implementing this proposal as an open web API is not realistic at this time because: -It is then a determination for the host browser itself as to whether -the functionality (used by both the host browser and its extensions) -should ultimately be obtained by means of an Open Web API. +* Compared to web extension APIs, there is a higher bar for introducing web APIs, +and in the past there has not been sufficient interest in moving forward a proposal +like this one. Therefore the preferred approach is to start with extensions, +and it will always be possible to propose a web API later if this work proves +useful and there is appetite. + +* The PSL is not appropriate for use in all circumstances. Extensions have a +very compelling set of use cases that match browser use cases, but there +is no universal agreement that this is the case more generally. ## Implementation Notes @@ -818,3 +803,22 @@ is released, however this may not always be the case. It may be useful to implement a notification mechanism so that extensions can take appropriate action when the host browser's PSL dataset changes, to avoid having to poll the `getVersion()` function provided by this API. + +### 3. Get Domain and Kind + +While API method `getDomain()` by default returns registrable domains, +with additional options this method may return other types of domain: +IP addresses, intranet hostnames lacking known suffixes, and public suffixes themselves. +There is currently no straightforward way for the method caller to determine +which of these kinds of value was returned from an invocation such as: +`getDomain(hostname, { allowIP, allowUnknownSuffix, allowPlainSuffix })`. 
+ +It may be beneficial to provide an additional API method that would +return not only the domain value as returned by `getDomain()`, +but also a designation of the kind of value returned: +`RegistrableDomain`, `UnknownDomain`, `KnownSuffix`, `IPAddress`. + +An example use case would be if extension developers wanted to prepend +additional labels to the domain returned by `getDomain()`. This would +not make sense for returned IP addresses, so developers would need a +way of separating returned IP addresses from returned domain names. From e0f283531330fb48febfad686795793a41dc630f Mon Sep 17 00:00:00 2001 From: Francis McKenzie Date: Thu, 15 May 2025 22:23:39 +0800 Subject: [PATCH 7/7] Update Public Suffix API proposal --- proposals/public-suffix.md | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/proposals/public-suffix.md b/proposals/public-suffix.md index 9dee851c..df4f18b9 100644 --- a/proposals/public-suffix.md +++ b/proposals/public-suffix.md @@ -404,16 +404,23 @@ namespace publicSuffix { // Options that may be passed to the API method to control its behaviour. interface DomainOptions { + // If true, the returned domain should be encoded as Unicode. // Default = false (Punycode) unicode?: boolean, - // If true, the returned domain may be an IP address. + + // If true, and the input hostname is an IP address, then this is returned as-is. // Default = false allowIP?: boolean, - // If true, the returned domain may be a known eTLD. + + // If true, and the input hostname is itself a known eTLD (without a preceding label) + // then this is returned as-is. // Default = false allowPlainSuffix?: boolean, - // If true, the returned domain may lack a known eTLD. + + // If true, and the input hostname lacks a known eTLD, and is neither itself + // a known eTLD nor an IP address, then the returned domain consists of + // the last two domain labels of the input (or the whole input if single-label). 
// Default = false allowUnknownSuffix?: boolean, } @@ -614,10 +621,15 @@ using Unicode encoding. #### 4. Invalid hostname -This API's methods should throw an error if a hostname passed as an input parameter -meets any of the following criteria: +For each method in this API that takes a hostname as an input parameter, +the hostname may be an IP address or a domain name. Unless the hostname +is a correctly-formatted IP address (IPv4 or IPv6), the hostname +should be validated as an Internationalized Domain Name (IDN), and +an error should be thrown if such validation fails. + +For example, an error should be thrown if the hostname is not an IP address and it: -* Contains a character that is invalid in an Internationalized Domain Name (IDN) - e.g. symbols, whitespace +* Contains a character that is invalid in an IDN - e.g. symbols, whitespace * Is an empty string * Is equal to `'.'` * Contains empty domain labels (i.e. any occurrences of `'..'`)