feat: add suffix lookup functionality with trie structure

- Implemented a new suffix-trie.ts file for efficient domain suffix lookups.
- Introduced a lookupInTrie function to search for public suffixes in a trie.
- Added a suffixLookup function to check whether a hostname has a valid public suffix.
- Created package.json and package-lock.json to manage dependencies, including tldts and tldts-core.
2025-11-21 17:43:43 -05:00
parent 1ba719366b
commit b3c56529a6
106 changed files with 4421 additions and 4 deletions
@@ -0,0 +1,170 @@
+/**
+ * Extracts the hostname part of `url` with a hand-written char-code scan,
+ * without building a full URL object.
+ *
+ * When `urlIsValidHostname` is false, the input is processed as follows:
+ *  1. Inputs starting with 'data:' (checked before any trimming) yield `null`.
+ *  2. Leading and trailing characters with a char code <= 32 (space and
+ *     control characters) are skipped.
+ *  3. A leading '//' is skipped. Otherwise, if ':/' occurs anywhere after the
+ *     trimmed start, everything before it is treated as the scheme. A scheme
+ *     other than the lowercase 'https', 'http', 'wss' or 'ws' is validated
+ *     character by character (letters, digits, '.', '-', '+'); `null` is
+ *     returned if the validation fails. Any further '/' after ':/' is skipped.
+ *  4. The remainder is scanned up to the first '/', '?' or '#'. If an '@' was
+ *     seen after the first scanned character, everything up to and including
+ *     the last '@' is dropped.
+ *  5. If what remains starts with '[', the text between that '[' and the last
+ *     ']' seen by the scan is returned lowercased, or `null` if no ']' was
+ *     seen. Otherwise everything from the last ':' on is dropped.
+ *
+ * In both modes trailing dots are then trimmed (at least one character is
+ * always kept). Lowercasing only happens when the scan in step 4 saw an ASCII
+ * uppercase letter, so with `urlIsValidHostname` set to true the case of the
+ * input is returned unchanged.
+ *
+ * NOTE(review): because step 3 looks for ':/' anywhere in the input, a ':/'
+ * that first appears in a path or query (e.g. 'example.com/a?u=http://x')
+ * makes the whole prefix go through scheme validation, which fails on '/' and
+ * returns `null` - confirm this is acceptable for callers.
+ *
+ * @param url - URL we want to extract a hostname from.
+ * @param urlIsValidHostname - hint from caller; true if `url` is already a valid hostname.
+ * @returns The extracted hostname (bracketed hosts are returned without the
+ * brackets), or `null` for 'data:' inputs, for a scheme that fails validation
+ * and for a '[' host with no ']'.
+ */
+export default function extractHostname(
+  url: string,
+  urlIsValidHostname: boolean,
+): string | null {
+  let start = 0; // index of the first character of the candidate hostname
+  let end: number = url.length; // index one past its last character
+  let hasUpper = false; // only ever set by the scan inside the branch below
+
+  // If url is not already a valid hostname, then try to extract hostname.
+  if (!urlIsValidHostname) {
+    // Special handling of data URLs: they never yield a hostname.
+    if (url.startsWith('data:')) {
+      return null;
+    }
+
+    // Trim leading spaces (any char code <= 32, i.e. space and control characters)
+    while (start < url.length && url.charCodeAt(start) <= 32) {
+      start += 1;
+    }
+
+    // Trim trailing spaces (same char-code test), always keeping at least one character
+    while (end > start + 1 && url.charCodeAt(end - 1) <= 32) {
+      end -= 1;
+    }
+
+    // Skip scheme. A leading '//' is skipped directly, without looking for a scheme.
+    if (
+      url.charCodeAt(start) === 47 /* '/' */ &&
+      url.charCodeAt(start + 1) === 47 /* '/' */
+    ) {
+      start += 2;
+    } else {
+      const indexOfProtocol = url.indexOf(':/', start);
+      if (indexOfProtocol !== -1) {
+        // Implement fast-path for common protocols. We expect most protocols
+        // should be one of these 4 and thus we will not need to perform the
+        // more expansive validity check most of the time.
+        const protocolSize = indexOfProtocol - start;
+        const c0 = url.charCodeAt(start);
+        const c1 = url.charCodeAt(start + 1);
+        const c2 = url.charCodeAt(start + 2);
+        const c3 = url.charCodeAt(start + 3);
+        const c4 = url.charCodeAt(start + 4);
+
+        if (
+          protocolSize === 5 &&
+          c0 === 104 /* 'h' */ &&
+          c1 === 116 /* 't' */ &&
+          c2 === 116 /* 't' */ &&
+          c3 === 112 /* 'p' */ &&
+          c4 === 115 /* 's' */
+        ) {
+          // https
+        } else if (
+          protocolSize === 4 &&
+          c0 === 104 /* 'h' */ &&
+          c1 === 116 /* 't' */ &&
+          c2 === 116 /* 't' */ &&
+          c3 === 112 /* 'p' */
+        ) {
+          // http
+        } else if (
+          protocolSize === 3 &&
+          c0 === 119 /* 'w' */ &&
+          c1 === 115 /* 's' */ &&
+          c2 === 115 /* 's' */
+        ) {
+          // wss
+        } else if (
+          protocolSize === 2 &&
+          c0 === 119 /* 'w' */ &&
+          c1 === 115 /* 's' */
+        ) {
+          // ws
+        } else {
+          // Check that scheme is valid (this is also the path taken by upper-case 'HTTP' etc.)
+          for (let i = start; i < indexOfProtocol; i += 1) {
+            const lowerCaseCode = url.charCodeAt(i) | 32; // sets bit 5: 'A'-'Z' fold onto 'a'-'z'
+            // NOTE(review): `| 32` also maps codes 11, 13, 14 and 16-25 onto '+', '-', '.' and digits, so they pass - confirm intended.
+            if (
+              !(
+                (
+                  (lowerCaseCode >= 97 && lowerCaseCode <= 122) || // [a, z]
+                  (lowerCaseCode >= 48 && lowerCaseCode <= 57) || // [0, 9]
+                  lowerCaseCode === 46 || // '.'
+                  lowerCaseCode === 45 || // '-'
+                  lowerCaseCode === 43
+                ) // '+'
+              )
+            ) {
+              return null;
+            }
+          }
+        }
+
+        // Skip 0, 1 or more '/' after ':/'
+        start = indexOfProtocol + 2;
+        while (url.charCodeAt(start) === 47 /* '/' */) {
+          start += 1;
+        }
+      }
+    }
+
+    // Detect first occurrence of '/', '?' or '#'. We also keep track of the
+    // last occurrence of '@', ']' or ':' to speed-up subsequent parsing of
+    // (respectively), identifier, ipv6 or port.
+    let indexOfIdentifier = -1;
+    let indexOfClosingBracket = -1;
+    let indexOfPort = -1;
+    for (let i = start; i < end; i += 1) {
+      const code: number = url.charCodeAt(i);
+      if (
+        code === 35 || // '#'
+        code === 47 || // '/'
+        code === 63 // '?'
+      ) {
+        end = i; // the host ends where the path, query or fragment begins
+        break;
+      } else if (code === 64) {
+        // '@'
+        indexOfIdentifier = i;
+      } else if (code === 93) {
+        // ']'
+        indexOfClosingBracket = i;
+      } else if (code === 58) {
+        // ':'
+        indexOfPort = i;
+      } else if (code >= 65 && code <= 90) {
+        hasUpper = true; // ASCII 'A'-'Z' seen: result is lowercased before returning
+      }
+    }
+
+    // Detect identifier: '@' (everything up to and including the last '@' is skipped)
+    if (
+      indexOfIdentifier !== -1 &&
+      indexOfIdentifier > start &&
+      indexOfIdentifier < end
+    ) {
+      start = indexOfIdentifier + 1;
+    }
+
+    // Handle ipv6 addresses: returned without brackets; trailing-dot trimming below is not applied
+    if (url.charCodeAt(start) === 91 /* '[' */) {
+      if (indexOfClosingBracket !== -1) { // NOTE(review): the ']' is not checked to lie after `start`
+        return url.slice(start + 1, indexOfClosingBracket).toLowerCase();
+      }
+      return null;
+    } else if (indexOfPort !== -1 && indexOfPort > start && indexOfPort < end) {
+      // Detect port: ':' (the last ':' seen by the scan; the host ends just before it)
+      end = indexOfPort;
+    }
+  }
+
+  // Trim trailing dots (applies in both modes; at least one character is kept)
+  while (end > start + 1 && url.charCodeAt(end - 1) === 46 /* '.' */) {
+    end -= 1;
+  }
+
+  const hostname: string =
+    start !== 0 || end !== url.length ? url.slice(start, end) : url; // reuse `url` when nothing was trimmed
+
+  if (hasUpper) {
+    return hostname.toLowerCase();
+  }
+
+  return hostname;
+}