aboutsummaryrefslogtreecommitdiffstats
path: root/vanilla/node_modules/html-encoding-sniffer/lib
diff options
context:
space:
mode:
authorAdam Mathes <adam@adammathes.com>2026-02-13 21:34:48 -0800
committerAdam Mathes <adam@adammathes.com>2026-02-13 21:34:48 -0800
commit76cb9c2a39d477a64824a985ade40507e3bbade1 (patch)
tree41e997aa9c6f538d3a136af61dae9424db2005a9 /vanilla/node_modules/html-encoding-sniffer/lib
parent819a39a21ac992b1393244a4c283bbb125208c69 (diff)
downloadneko-76cb9c2a39d477a64824a985ade40507e3bbade1.tar.gz
neko-76cb9c2a39d477a64824a985ade40507e3bbade1.tar.bz2
neko-76cb9c2a39d477a64824a985ade40507e3bbade1.zip
feat(vanilla): add testing infrastructure and tests (NK-wjnczv)
Diffstat (limited to 'vanilla/node_modules/html-encoding-sniffer/lib')
-rw-r--r--vanilla/node_modules/html-encoding-sniffer/lib/html-encoding-sniffer.js299
1 files changed, 299 insertions, 0 deletions
diff --git a/vanilla/node_modules/html-encoding-sniffer/lib/html-encoding-sniffer.js b/vanilla/node_modules/html-encoding-sniffer/lib/html-encoding-sniffer.js
new file mode 100644
index 0000000..08a92d3
--- /dev/null
+++ b/vanilla/node_modules/html-encoding-sniffer/lib/html-encoding-sniffer.js
@@ -0,0 +1,299 @@
+"use strict";
+const { getBOMEncoding, labelToName } = require("@exodus/bytes/encoding-lite.js");
+
+// https://html.spec.whatwg.org/#encoding-sniffing-algorithm
+//
+// Picks an encoding for the bytes in `uint8Array`. Sources are tried in this order and the first
+// one that does not come back as null wins:
+//   1. labelToName() applied to whatever getBOMEncoding() reports for the bytes,
+//   2. labelToName() applied to `transportLayerEncodingLabel`, when that option is given,
+//   3. the <meta> prescan (skipped when `xml` is true),
+//   4. `defaultEncoding`, which itself defaults to "UTF-8" for XML and "windows-1252" otherwise.
+module.exports = (uint8Array, { xml = false, transportLayerEncodingLabel, defaultEncoding } = {}) => {
+  // Resolve the fallback up front; an explicitly passed defaultEncoding is used untouched.
+  const xmlAwareDefault = xml ? "UTF-8" : "windows-1252";
+  const fallback = defaultEncoding === undefined ? xmlAwareDefault : defaultEncoding;
+
+  const fromBOM = labelToName(getBOMEncoding(uint8Array));
+  if (fromBOM !== null) {
+    return fromBOM;
+  }
+
+  if (transportLayerEncodingLabel !== undefined) {
+    const fromTransport = labelToName(transportLayerEncodingLabel);
+    if (fromTransport !== null) {
+      return fromTransport;
+    }
+  }
+
+  if (!xml) {
+    const fromMeta = prescanMetaCharset(uint8Array);
+    if (fromMeta !== null) {
+      return fromMeta;
+    }
+  }
+
+  return fallback;
+};
+
+// https://html.spec.whatwg.org/multipage/syntax.html#prescan-a-byte-stream-to-determine-its-encoding
+//
+// Scans the leading bytes of `uint8Array` (at most the first 1024) for a <meta> tag that
+// declares a character encoding.
+//
+// Returns the charset obtained from the first <meta> tag that passes the checks below (after the
+// UTF-16 and x-user-defined substitutions), or null when the scanned range contains none.
+function prescanMetaCharset(uint8Array) {
+ // `l` is the exclusive upper bound for every scanning loop in this function.
+ const l = Math.min(uint8Array.byteLength, 1024);
+ for (let i = 0; i < l; i++) {
+ let c = uint8Array[i];
+ if (c === 0x3C) {
+ // "<"
+ // Look-ahead bytes. NOTE(review): these indices are not clamped to `l`, so they can read
+ // beyond the 1024-byte window (or past the end of the array).
+ const c1 = uint8Array[i + 1];
+ const c2 = uint8Array[i + 2];
+ const c3 = uint8Array[i + 3];
+ const c4 = uint8Array[i + 4];
+ const c5 = uint8Array[i + 5];
+ // !-- (comment start)
+ if (c1 === 0x21 && c2 === 0x2D && c3 === 0x2D) {
+ // Jump to the byte right after "<!--". Because the end check looks two bytes back, the
+ // opening dashes can also serve as the closing ones, so "<!-->" ends the comment.
+ i += 4;
+ for (; i < l; i++) {
+ c = uint8Array[i];
+ const cMinus1 = uint8Array[i - 1];
+ const cMinus2 = uint8Array[i - 2];
+ // --> (comment end)
+ if (c === 0x3E && cMinus1 === 0x2D && cMinus2 === 0x2D) {
+ break;
+ }
+ }
+ } else if ((c1 === 0x4D || c1 === 0x6D) &&
+ (c2 === 0x45 || c2 === 0x65) &&
+ (c3 === 0x54 || c3 === 0x74) &&
+ (c4 === 0x41 || c4 === 0x61) &&
+ (isSpaceCharacter(c5) || c5 === 0x2F)) {
+ // "meta" + space or /
+ // Skip "<meta" plus the space or slash that follows it.
+ i += 6;
+ // Attribute names already seen in this tag; only the first occurrence of a name is used.
+ const attributeList = new Set();
+ // True when the http-equiv attribute's value is "content-type".
+ let gotPragma = false;
+ // null: neither a charset attribute nor a usable content attribute was seen.
+ // true: charset came from the content attribute. false: a charset attribute was seen.
+ let needPragma = null;
+ let charset = null;
+
+ let attrRes;
+ // Read attributes one at a time until getAttribute() reports that there are no more.
+ do {
+ attrRes = getAttribute(uint8Array, i, l);
+ if (attrRes.attr && !attributeList.has(attrRes.attr.name)) {
+ attributeList.add(attrRes.attr.name);
+ if (attrRes.attr.name === "http-equiv") {
+ gotPragma = attrRes.attr.value === "content-type";
+ } else if (attrRes.attr.name === "content" && !charset) {
+ charset = extractCharacterEncodingFromMeta(attrRes.attr.value);
+ if (charset !== null) {
+ needPragma = true;
+ }
+ } else if (attrRes.attr.name === "charset") {
+ // A charset attribute overrides anything taken from content, even when labelToName()
+ // does not recognise the label.
+ charset = labelToName(attrRes.attr.value);
+ needPragma = false;
+ }
+ }
+ // Resume from wherever getAttribute() stopped.
+ i = attrRes.i;
+ } while (attrRes.attr);
+
+ // Each `continue` below rejects this <meta> tag and resumes the outer byte scan.
+ if (needPragma === null) {
+ continue;
+ }
+ if (needPragma === true && gotPragma === false) {
+ continue;
+ }
+ if (charset === null) {
+ continue;
+ }
+
+ // A declared UTF-16 encoding is reported as UTF-8.
+ if (charset === "UTF-16LE" || charset === "UTF-16BE") {
+ charset = "UTF-8";
+ }
+ // A declared x-user-defined encoding is reported as windows-1252.
+ if (charset === "x-user-defined") {
+ charset = "windows-1252";
+ }
+
+ return charset;
+ } else if ((c1 >= 0x41 && c1 <= 0x5A) || (c1 >= 0x61 && c1 <= 0x7A)) {
+ // a-z or A-Z
+ // Any other start tag: skip the rest of the tag name...
+ for (i += 2; i < l; i++) {
+ c = uint8Array[i];
+ // space or >
+ if (isSpaceCharacter(c) || c === 0x3E) {
+ break;
+ }
+ }
+ // ...then consume and discard its attributes so their contents are not scanned as markup.
+ let attrRes;
+ do {
+ attrRes = getAttribute(uint8Array, i, l);
+ i = attrRes.i;
+ } while (attrRes.attr);
+ } else if (c1 === 0x21 || c1 === 0x2F || c1 === 0x3F) {
+ // ! or / or ?
+ // Skip ahead to the next ">" (or to the end of the scanned range).
+ for (i += 2; i < l; i++) {
+ c = uint8Array[i];
+ // >
+ if (c === 0x3E) {
+ break;
+ }
+ }
+ }
+ }
+ }
+ // No usable <meta> declaration within the scanned range.
+ return null;
+}
+
+// https://html.spec.whatwg.org/multipage/syntax.html#concept-get-attributes-when-sniffing
+//
+// Reads a single attribute from `uint8Array`, starting at index `i` and never scanning at or
+// beyond index `l`.
+//
+// Returns `{ attr: { name, value }, i }` when an attribute was read, or `{ i }` (no `attr`
+// property) when a ">" was reached first or the bytes ran out. In both shapes `i` is the index
+// at which scanning stopped. Bytes A-Z are lowercased in both `name` and `value`; every byte is
+// turned into a character with String.fromCharCode().
+function getAttribute(uint8Array, i, l) {
+ for (; i < l; i++) {
+ let c = uint8Array[i];
+ // space or /
+ if (isSpaceCharacter(c) || c === 0x2F) {
+ continue;
+ }
+ // ">"
+ // End of the tag: leave the loop and report "no attribute" with `i` still on the ">".
+ if (c === 0x3E) {
+ break;
+ }
+ let name = "";
+ let value = "";
+ // Accumulate the attribute name until "=", whitespace, "/" or ">".
+ nameLoop:for (; i < l; i++) {
+ c = uint8Array[i];
+ // "="
+ // An "=" only ends the name once the name is non-empty; a leading "=" becomes part of it.
+ if (c === 0x3D && name !== "") {
+ i++;
+ break;
+ }
+ // space
+ if (isSpaceCharacter(c)) {
+ // Whitespace after the name: skip it and see whether an "=" follows.
+ for (i++; i < l; i++) {
+ c = uint8Array[i];
+ // space
+ if (isSpaceCharacter(c)) {
+ continue;
+ }
+ // not "="
+ // Anything other than "=" means the attribute has no value; `i` stays on that byte.
+ if (c !== 0x3D) {
+ return { attr: { name, value }, i };
+ }
+
+ // Step over the "=" and move on to the value.
+ i++;
+ break nameLoop;
+ }
+ // Only reached when the bytes ran out while skipping whitespace.
+ break;
+ }
+ // / or >
+ // The attribute ends here with an empty value; `i` stays on the "/" or ">".
+ if (c === 0x2F || c === 0x3E) {
+ return { attr: { name, value }, i };
+ }
+ // A-Z
+ if (c >= 0x41 && c <= 0x5A) {
+ name += String.fromCharCode(c + 0x20); // lowercase
+ } else {
+ name += String.fromCharCode(c);
+ }
+ }
+ // `i` is now at the first byte after the "=".
+ c = uint8Array[i];
+ // space
+ // Skip any whitespace between the "=" and the value.
+ if (isSpaceCharacter(c)) {
+ for (i++; i < l; i++) {
+ c = uint8Array[i];
+ // space
+ if (isSpaceCharacter(c)) {
+ continue;
+ } else {
+ break;
+ }
+ }
+ }
+ // " or '
+ // Quoted value: collect bytes up to the matching quote.
+ if (c === 0x22 || c === 0x27) {
+ const quote = c;
+ for (i++; i < l; i++) {
+ c = uint8Array[i];
+
+ if (c === quote) {
+ // Step past the closing quote before returning.
+ i++;
+ return { attr: { name, value }, i };
+ }
+
+ // A-Z
+ if (c >= 0x41 && c <= 0x5A) {
+ value += String.fromCharCode(c + 0x20); // lowercase
+ } else {
+ value += String.fromCharCode(c);
+ }
+ }
+ // NOTE(review): if no closing quote is found before `l`, execution falls through to the
+ // unquoted-value code below with `c` still holding the last byte read.
+ }
+
+ // >
+ // The tag ends where the value would start: empty value, `i` stays on the ">".
+ if (c === 0x3E) {
+ return { attr: { name, value }, i };
+ }
+
+ // A-Z
+ // Unquoted value: the current byte is its first character.
+ if (c >= 0x41 && c <= 0x5A) {
+ value += String.fromCharCode(c + 0x20); // lowercase
+ } else {
+ value += String.fromCharCode(c);
+ }
+
+ // Collect the remaining bytes of the unquoted value until whitespace or ">".
+ for (i++; i < l; i++) {
+ c = uint8Array[i];
+
+ // space or >
+ if (isSpaceCharacter(c) || c === 0x3E) {
+ return { attr: { name, value }, i };
+ }
+
+ // A-Z
+ if (c >= 0x41 && c <= 0x5A) {
+ value += String.fromCharCode(c + 0x20); // lowercase
+ } else {
+ value += String.fromCharCode(c);
+ }
+ }
+ }
+ // No attribute: a ">" was hit before any name byte, or the bytes ran out.
+ return { i };
+}
+
+// https://html.spec.whatwg.org/multipage/urls-and-fetching.html#algorithm-for-extracting-a-character-encoding-from-a-meta-element
+//
+// Extracts an encoding from the value of a <meta> `content` attribute such as
+// "text/html; charset=utf-8".
+//
+// `string` is the attribute value. The result is whatever labelToName() returns for the label
+// that follows the first `charset` + optional whitespace + `=` sequence, or null when there is
+// no such sequence, the label is missing, or a quoted label has no closing quote.
+//
+// The value may end anywhere (e.g. "charset", "charset=", "charset =  "); all of those return
+// null rather than throwing.
+function extractCharacterEncodingFromMeta(string) {
+  let position = 0;
+
+  // Find the first "charset" that is followed, after optional whitespace, by "=".
+  while (true) {
+    const indexOfCharset = string.substring(position).search(/charset/ui);
+
+    if (indexOfCharset === -1) {
+      return null;
+    }
+    let subPosition = position + indexOfCharset + "charset".length;
+
+    // charCodeAt() yields NaN past the end of the string, which is not a space character, so
+    // these loops stop cleanly at the end instead of dereferencing `undefined`.
+    while (isSpaceCharacter(string.charCodeAt(subPosition))) {
+      ++subPosition;
+    }
+
+    if (string[subPosition] !== "=") {
+      // Not a declaration; resume the search just before the byte that was not "=".
+      // This always lands beyond the start of the current match, so the loop makes progress.
+      position = subPosition - 1;
+      continue;
+    }
+
+    ++subPosition;
+
+    while (isSpaceCharacter(string.charCodeAt(subPosition))) {
+      ++subPosition;
+    }
+
+    position = subPosition;
+    break;
+  }
+
+  // Nothing follows the "=" (and any whitespace after it): there is no label to extract.
+  if (position >= string.length) {
+    return null;
+  }
+
+  if (string[position] === "\"" || string[position] === "'") {
+    const nextIndex = string.indexOf(string[position], position + 1);
+
+    if (nextIndex !== -1) {
+      return labelToName(string.substring(position + 1, nextIndex));
+    }
+
+    // It is an unmatched quotation mark
+    return null;
+  }
+
+  if (string.length === position + 1) {
+    return null;
+  }
+
+  // Unquoted label: it runs up to the next ASCII whitespace or semicolon, or to the end.
+  const indexOfASCIIWhitespaceOrSemicolon = string.substring(position + 1).search(/\x09|\x0A|\x0C|\x0D|\x20|;/u);
+  const end = indexOfASCIIWhitespaceOrSemicolon === -1 ?
+    string.length :
+    position + indexOfASCIIWhitespaceOrSemicolon + 1;
+
+  return labelToName(string.substring(position, end));
+}
+
+// Reports whether the byte value `c` is one of the five whitespace bytes this sniffer skips:
+// tab (0x09), line feed (0x0A), form feed (0x0C), carriage return (0x0D) or space (0x20).
+function isSpaceCharacter(c) {
+  switch (c) {
+    case 0x09:
+    case 0x0A:
+    case 0x0C:
+    case 0x0D:
+    case 0x20:
+      return true;
+    default:
+      return false;
+  }
+}