feat: add suffix lookup functionality with trie structure
- Implemented a new suffix-trie.ts file for efficient domain suffix lookups. - Introduced a lookupInTrie function to search for public suffixes in a trie. - Added suffixLookup function to check if a hostname has a valid public suffix. - Created package.json and package-lock.json to manage dependencies, including tldts and tldts-core.
This commit is contained in:
+170
@@ -0,0 +1,170 @@
|
||||
/**
 * Extract the hostname portion of `url` using a single forward scan, without
 * allocating intermediate strings or fully parsing the URL.
 *
 * When `urlIsValidHostname` is `false` the function:
 *  1. skips leading/trailing characters with a code <= 32 (spaces, controls),
 *  2. rejects `data:` URLs,
 *  3. skips a leading `//` or a `<scheme>:/` prefix (rejecting invalid schemes),
 *  4. cuts the authority at the first `/`, `?` or `#`,
 *  5. drops `userinfo@` and `:port`,
 *  6. returns the content between `[` and `]` for bracketed IPv6 literals.
 *
 * In every case trailing dots are removed from the result. The result is
 * lower-cased only when an upper-case ASCII letter was seen during the scan,
 * which means a value passed with `urlIsValidHostname === true` is returned
 * with its original casing.
 *
 * @param url - URL we want to extract a hostname from.
 * @param urlIsValidHostname - hint from caller; true if `url` is already a valid hostname.
 * @returns the extracted hostname, or `null` when `url` is a `data:` URL, has
 * an invalid scheme, or starts an IPv6 literal that is never closed.
 */
export default function extractHostname(
  url: string,
  urlIsValidHostname: boolean,
): string | null {
  let start = 0;
  let end: number = url.length;
  let hasUpper = false;

  // If url is not already a valid hostname, then try to extract hostname.
  if (!urlIsValidHostname) {
    // Trim leading spaces
    while (start < url.length && url.charCodeAt(start) <= 32) {
      start += 1;
    }

    // Special handling of data URLs. Checked after trimming so that leading
    // whitespace does not hide the `data:` prefix.
    if (url.startsWith('data:', start)) {
      return null;
    }

    // Trim trailing spaces
    while (end > start + 1 && url.charCodeAt(end - 1) <= 32) {
      end -= 1;
    }

    // Skip scheme.
    if (
      url.charCodeAt(start) === 47 /* '/' */ &&
      url.charCodeAt(start + 1) === 47 /* '/' */
    ) {
      // Protocol-relative URL: '//host/...'
      start += 2;
    } else {
      const indexOfProtocol = url.indexOf(':/', start);
      if (indexOfProtocol !== -1) {
        // Implement fast-path for common protocols. We expect most protocols
        // should be one of these 4 and thus we will not need to perform the
        // more expensive validity check most of the time.
        const protocolSize = indexOfProtocol - start;
        const c0 = url.charCodeAt(start);
        const c1 = url.charCodeAt(start + 1);
        const c2 = url.charCodeAt(start + 2);
        const c3 = url.charCodeAt(start + 3);
        const c4 = url.charCodeAt(start + 4);

        const isHttp =
          c0 === 104 /* 'h' */ &&
          c1 === 116 /* 't' */ &&
          c2 === 116 /* 't' */ &&
          c3 === 112; /* 'p' */
        const isWs = c0 === 119 /* 'w' */ && c1 === 115; /* 's' */

        const isCommonProtocol =
          (protocolSize === 5 && isHttp && c4 === 115) /* https */ ||
          (protocolSize === 4 && isHttp) /* http */ ||
          (protocolSize === 3 && isWs && c2 === 115) /* wss */ ||
          (protocolSize === 2 && isWs); /* ws */

        if (!isCommonProtocol) {
          // Check that scheme is valid: only [a-zA-Z0-9.+-] is allowed.
          for (let i = start; i < indexOfProtocol; i += 1) {
            const code = url.charCodeAt(i);
            // Case-fold ONLY for the letter test. Folding before the digit and
            // punctuation tests would let control characters through (for
            // example 0x10 | 32 === 48, which is '0').
            const lowerCaseCode = code | 32;
            const isSchemeChar =
              (lowerCaseCode >= 97 && lowerCaseCode <= 122) || // [a-zA-Z]
              (code >= 48 && code <= 57) || // [0-9]
              code === 46 || // '.'
              code === 45 || // '-'
              code === 43; // '+'

            if (!isSchemeChar) {
              return null;
            }
          }
        }

        // Skip 0, 1 or more '/' after ':/'
        start = indexOfProtocol + 2;
        while (url.charCodeAt(start) === 47 /* '/' */) {
          start += 1;
        }
      }
    }

    // Detect first occurrence of '/', '?' or '#'. We also keep track of the
    // last occurrence of '@', ']' or ':' to speed-up subsequent parsing of
    // (respectively), identifier, ipv6 or port.
    let indexOfIdentifier = -1;
    let indexOfClosingBracket = -1;
    let indexOfPort = -1;
    for (let i = start; i < end; i += 1) {
      const code: number = url.charCodeAt(i);
      if (
        code === 35 || // '#'
        code === 47 || // '/'
        code === 63 // '?'
      ) {
        end = i;
        break;
      } else if (code === 64) {
        // '@'
        indexOfIdentifier = i;
      } else if (code === 93) {
        // ']'
        indexOfClosingBracket = i;
      } else if (code === 58) {
        // ':'
        indexOfPort = i;
      } else if (code >= 65 && code <= 90) {
        // [A-Z]: remember that the result needs lower-casing.
        hasUpper = true;
      }
    }

    // Detect identifier: '@'
    if (
      indexOfIdentifier !== -1 &&
      indexOfIdentifier > start &&
      indexOfIdentifier < end
    ) {
      start = indexOfIdentifier + 1;
    }

    // Handle ipv6 addresses
    if (url.charCodeAt(start) === 91 /* '[' */) {
      // The closing bracket must come after the opening one; a ']' seen only
      // in the userinfo part does not terminate the address.
      if (indexOfClosingBracket > start) {
        return url.slice(start + 1, indexOfClosingBracket).toLowerCase();
      }
      return null;
    } else if (indexOfPort !== -1 && indexOfPort > start && indexOfPort < end) {
      // Detect port: ':'
      end = indexOfPort;
    }
  }

  // Trim trailing dots
  while (end > start + 1 && url.charCodeAt(end - 1) === 46 /* '.' */) {
    end -= 1;
  }

  // Avoid allocating a new string when the whole input is the hostname.
  const hostname: string =
    start !== 0 || end !== url.length ? url.slice(start, end) : url;

  if (hasUpper) {
    return hostname.toLowerCase();
  }

  return hostname;
}
|
||||
Reference in New Issue
Block a user