feat(vanilla): add testing infrastructure and tests (NK-wjnczv)

author: Adam Mathes <adam@adammathes.com> 2026-02-13 21:34:48 -0800
committer: Adam Mathes <adam@adammathes.com> 2026-02-13 21:34:48 -0800
commit: 76cb9c2a39d477a64824a985ade40507e3bbade1 (patch)
tree: 41e997aa9c6f538d3a136af61dae9424db2005a9 /vanilla/node_modules/@exodus/bytes/fallback/encoding.js
parent: 819a39a21ac992b1393244a4c283bbb125208c69 (diff)
download: neko-76cb9c2a39d477a64824a985ade40507e3bbade1.tar.gz
neko-76cb9c2a39d477a64824a985ade40507e3bbade1.tar.bz2
neko-76cb9c2a39d477a64824a985ade40507e3bbade1.zip
1 files changed, 359 insertions, 0 deletions
diff --git a/vanilla/node_modules/@exodus/bytes/fallback/encoding.js b/vanilla/node_modules/@exodus/bytes/fallback/encoding.js
new file mode 100644
index 0000000..fab1080
--- /dev/null
+++ b/vanilla/node_modules/@exodus/bytes/fallback/encoding.js
@@ -0,0 +1,359 @@
+// We can't return native TextDecoder if it's present, as Node.js one is broken on windows-1252 and we fix that
+// We are also faster than Node.js built-in on both TextEncoder and TextDecoder
+
+import { utf16toString, utf16toStringLoose } from '@exodus/bytes/utf16.js'
+import { utf8fromStringLoose, utf8toString, utf8toStringLoose } from '@exodus/bytes/utf8.js'
+import { createSinglebyteDecoder } from '@exodus/bytes/single-byte.js'
+import labels from './encoding.labels.js'
+import { fromSource, getBOMEncoding } from './encoding.api.js'
+import { unfinishedBytes, mergePrefix } from './encoding.util.js'
+
+export { getBOMEncoding } from './encoding.api.js'
+
+export const E_ENCODING = 'Unknown encoding' // message of the RangeError thrown for unusable encoding labels
+const E_MULTI = "import '@exodus/bytes/encoding.js' for legacy multi-byte encodings support"
+const E_OPTIONS = 'The "options" argument must be of type object' // message of the TypeError for bad options
+const replacementChar = '\uFFFD' // U+FFFD REPLACEMENT CHARACTER
+const multibyteSet = new Set(['big5', 'euc-kr', 'euc-jp', 'iso-2022-jp', 'shift_jis', 'gbk', 'gb18030']) // prettier-ignore
+let createMultibyteDecoder, multibyteEncoder // assigned by setMultibyte(); undefined until then
+
+let labelsMap // alias -> canonical label index, built on the first lookup that needs it
+// Resolves an encoding label to its canonical label, or null when the label is not recognised
+// Warning: unlike whatwg-encoding, returns lowercased labels
+// Those are case-insensitive and that's how TextDecoder encoding getter normalizes them
+// https://encoding.spec.whatwg.org/#names-and-labels
+export function normalizeEncoding(label) {
+  switch (label) {
+    // fast path: exact spellings that need no trimming or case folding
+    case 'utf-8': case 'utf8': case 'UTF-8': case 'UTF8': return 'utf-8' // prettier-ignore
+    case 'windows-1252': case 'ascii': case 'latin1': return 'windows-1252' // prettier-ignore
+  }
+  if (/[^\w\t\n\f\r .:-]/i.test(label)) return null // must be ASCII (with ASCII whitespace)
+  const key = `${label}`.trim().toLowerCase()
+  if (Object.hasOwn(labels, key)) return key
+  if (labelsMap === undefined) {
+    labelsMap = new Map()
+    for (const canonical of Object.keys(labels)) {
+      for (const alias of labels[canonical]) labelsMap.set(alias, canonical)
+    }
+  }
+
+  return labelsMap.get(key) || null
+}
+
+// Canonical labels starting with one of these three-letter prefixes are spelled in upper case
+const uppercasePrefixes = new Set(['utf', 'iso', 'koi', 'euc', 'ibm', 'gbk'])
+
+// Maps a label to the spec-cased encoding name (e.g. "UTF-8", "Big5"); case-sensitive output,
+// unlike normalizeEncoding. Returns the falsy normalizeEncoding() result for unknown labels.
+// https://encoding.spec.whatwg.org/#names-and-labels
+export function labelToName(label) {
+  const enc = normalizeEncoding(label)
+  if (!enc) return enc
+  if (enc === 'big5') return 'Big5'
+  if (enc === 'shift_jis') return 'Shift_JIS'
+  return uppercasePrefixes.has(enc.slice(0, 3)) ? enc.toUpperCase() : enc
+}
+
+export const isMultibyte = (enc) => multibyteSet.has(enc) // membership test against multibyteSet
+export function setMultibyte(createDecoder, createEncoder) {
+  multibyteEncoder = createEncoder // handed out later by getMultibyteEncoder()
+  createMultibyteDecoder = createDecoder // used when decoding encodings listed in multibyteSet
+}
+// Returns the encoder stored by setMultibyte(); throws Error(E_MULTI) while none is stored
+export function getMultibyteEncoder() {
+  if (multibyteEncoder) return multibyteEncoder
+  throw new Error(E_MULTI)
+}
+
+// Defines `key` on `obj` as a non-writable data property and returns `obj`
+const define = (obj, key, value) => Object.defineProperty(obj, key, { writable: false, value })
+// True for Uint8Array instances; views failing instanceof are accepted by tag (presumably cross-realm arrays)
+function isAnyUint8Array(x) {
+  if (x instanceof Uint8Array) return true
+  return !!x && ArrayBuffer.isView(x) && x.BYTES_PER_ELEMENT === 1 && Object.prototype.toString.call(x) === '[object Uint8Array]' // prettier-ignore
+}
+
+function unicodeDecoder(encoding, loose) {
+  if (encoding === 'utf-8') return loose ? utf8toStringLoose : utf8toString // most common case
+  const [utf16, form] = [loose ? utf16toStringLoose : utf16toString, encoding === 'utf-16le' ? 'uint8-le' : 'uint8-be'] // prettier-ignore
+  return (u) => utf16(u, form) // the byte-order form is bound here so callers pass bytes only
+}
+
+export class TextDecoder { // WHATWG-style text decoder with streaming and BOM handling for utf-8 / utf-16
+  #decode // lazily created decode function for this.encoding
+  #unicode // true for utf-8 / utf-16le / utf-16be
+  #multibyte // true when the encoding is listed in multibyteSet
+  #chunk // copy of incomplete trailing bytes kept between streaming calls; null/undefined when none
+  #canBOM // whether a leading BOM may still be stripped by the next decode
+  // Throws TypeError for non-object options, RangeError for unknown or 'replacement' labels; null options act as {}
+  constructor(encoding = 'utf-8', options = {}) {
+    if (typeof options !== 'object') throw new TypeError(E_OPTIONS)
+    const enc = normalizeEncoding(encoding)
+    if (!enc || enc === 'replacement') throw new RangeError(E_ENCODING)
+    define(this, 'encoding', enc)
+    define(this, 'fatal', !!options?.fatal)
+    define(this, 'ignoreBOM', !!options?.ignoreBOM)
+    this.#unicode = enc === 'utf-8' || enc === 'utf-16le' || enc === 'utf-16be'
+    this.#multibyte = !this.#unicode && isMultibyte(enc)
+    this.#canBOM = this.#unicode && !this.ignoreBOM
+  }
+
+  get [Symbol.toStringTag]() {
+    return 'TextDecoder'
+  }
+  // Decodes `input` (undefined = no bytes) to a string; with { stream: true } unicode encodings keep incomplete tails
+  decode(input, options = {}) {
+    if (typeof options !== 'object') throw new TypeError(E_OPTIONS)
+    const stream = !!options?.stream // null options pass the typeof check above and must act like {}
+    let u = input === undefined ? new Uint8Array() : fromSource(input)
+    const empty = u.length === 0 // also can't be streaming after next line
+    if (empty && stream) return '' // no state change
+
+    if (this.#unicode) {
+      let prefix
+      if (this.#chunk) {
+        const merged = mergePrefix(u, this.#chunk, this.encoding)
+        if (u.length < 3) {
+          u = merged // might be unfinished, but fully consumed old u
+        } else {
+          prefix = merged // stops at complete chunk
+          const add = prefix.length - this.#chunk.length
+          if (add > 0) u = u.subarray(add)
+        }
+
+        this.#chunk = null
+      } else if (empty) {
+        this.#canBOM = !this.ignoreBOM // not streaming
+        return ''
+      }
+
+      // For non-stream utf-8 we don't have to do this as it matches utf8toStringLoose already
+      // For non-stream loose utf-16 we still have to do this as this API supports uneven byteLength unlike utf16toStringLoose
+      let suffix = ''
+      if (stream || (!this.fatal && this.encoding !== 'utf-8')) {
+        const trail = unfinishedBytes(u, u.byteLength, this.encoding)
+        if (trail > 0) {
+          if (stream) {
+            this.#chunk = Uint8Array.from(u.subarray(-trail)) // copy
+          } else {
+            // non-fatal mode as already checked
+            suffix = replacementChar
+          }
+
+          u = u.subarray(0, -trail)
+        }
+      }
+
+      let seenBOM = false
+      if (this.#canBOM) {
+        const bom = this.#findBom(prefix ?? u)
+        if (bom) {
+          seenBOM = true
+          if (prefix) {
+            prefix = prefix.subarray(bom)
+          } else {
+            u = u.subarray(bom)
+          }
+        }
+      } else if (!stream && !this.ignoreBOM) {
+        this.#canBOM = true
+      }
+
+      if (!this.#decode) this.#decode = unicodeDecoder(this.encoding, !this.fatal)
+      try {
+        const res = (prefix ? this.#decode(prefix) : '') + this.#decode(u) + suffix
+        // "BOM seen" is set on the current decode call only if it did not error, in "serialize I/O queue" after decoding
+        if (stream && (seenBOM || res.length > 0)) this.#canBOM = false
+        return res
+      } catch (err) {
+        this.#chunk = null // reset unfinished chunk on errors
+        // The correct way per spec seems to be not destroying the decoder state (aka BOM here) in stream mode
+        // See also multi-byte.js
+        throw err
+      }
+
+      // eslint-disable-next-line no-else-return
+    } else if (this.#multibyte) {
+      if (!createMultibyteDecoder) throw new Error(E_MULTI)
+      if (!this.#decode) this.#decode = createMultibyteDecoder(this.encoding, !this.fatal) // can contain state!
+      return this.#decode(u, stream)
+    } else {
+      if (!this.#decode) this.#decode = createSinglebyteDecoder(this.encoding, !this.fatal)
+      return this.#decode(u)
+    }
+  }
+  // Returns the byte length of the BOM at the start of `u` for this.encoding, or 0 when there is none
+  #findBom(u) {
+    switch (this.encoding) {
+      case 'utf-8':
+        return u.byteLength >= 3 && u[0] === 0xef && u[1] === 0xbb && u[2] === 0xbf ? 3 : 0
+      case 'utf-16le':
+        return u.byteLength >= 2 && u[0] === 0xff && u[1] === 0xfe ? 2 : 0
+      case 'utf-16be':
+        return u.byteLength >= 2 && u[0] === 0xfe && u[1] === 0xff ? 2 : 0
+    }
+
+    /* c8 ignore next */
+    throw new Error('Unreachable')
+  }
+}
+
+export class TextEncoder { // UTF-8 only encoder built on utf8fromStringLoose
+  constructor() {
+    define(this, 'encoding', 'utf-8')
+  }
+
+  get [Symbol.toStringTag]() {
+    return 'TextEncoder'
+  }
+  // Encodes `str` (stringified when not already a string, '' by default) and returns the bytes
+  encode(str = '') {
+    const bytes = utf8fromStringLoose(typeof str === 'string' ? str : `${str}`)
+    // match new Uint8Array (per spec), which is non-pooled: copy unless the view spans its whole buffer
+    const spansWholeBuffer = bytes.byteOffset === 0 && bytes.length === bytes.buffer.byteLength
+    return spansWholeBuffer ? bytes : bytes.slice(0)
+  }
+  // Stores as much of `str` as fits in `target`; returns { read: UTF-16 units consumed, written: bytes stored }
+  encodeInto(str, target) {
+    let text = typeof str === 'string' ? str : `${str}`
+    if (!isAnyUint8Array(target)) throw new TypeError('Target must be an Uint8Array')
+    if (target.buffer.detached) return { read: 0, written: 0 } // Until https://github.com/whatwg/encoding/issues/324 is resolved
+
+    const capacity = target.length
+    if (text.length > capacity) text = text.slice(0, capacity) // units beyond the byte capacity can never fit
+    let bytes = utf8fromStringLoose(text)
+    let read = text.length // final answer when all encoded bytes fit
+    if (bytes.length > capacity) {
+      if (bytes.length === text.length) {
+        bytes = bytes.subarray(0, capacity) // ascii can be truncated
+        read = bytes.length
+      } else {
+        bytes = bytes.subarray(0, capacity)
+        const unfinished = unfinishedBytes(bytes, bytes.length, 'utf-8')
+        if (unfinished > 0) bytes = bytes.subarray(0, bytes.length - unfinished)
+
+        // We can do this because loose str -> u8 -> str preserves length, unlike loose u8 -> str -> u8
+        // Each unpaired surrogate (1 charcode) is replaced with a single charcode
+        read = utf8toStringLoose(bytes).length // FIXME: Converting back is very inefficient
+      }
+    }
+
+    try {
+      target.set(bytes)
+    } catch {
+      return { read: 0, written: 0 } // see above, likely detached but no .detached property support
+    }
+
+    return { read, written: bytes.length }
+  }
+}
+
+const E_NO_STREAMS = 'TransformStream global not present in the environment'
+
+// https://encoding.spec.whatwg.org/#interface-textdecoderstream
+// Pipes chunks through a streaming TextDecoder; throws Error(E_NO_STREAMS) without a TransformStream global
+export class TextDecoderStream {
+  constructor(encoding = 'utf-8', options = {}) {
+    if (!globalThis.TransformStream) throw new Error(E_NO_STREAMS)
+    const decoder = new TextDecoder(encoding, options)
+    // Forwards decoded text downstream
+    const emit = (text, controller) => {
+      if (text) controller.enqueue(text) // empty strings are never enqueued
+    }
+
+    const { readable, writable } = new TransformStream({
+      transform: (chunk, controller) => emit(decoder.decode(fromSource(chunk), { stream: true }), controller),
+      // https://streams.spec.whatwg.org/#dom-transformer-flush
+      // No need to call .terminate() (Node.js is wrong)
+      flush: (controller) => emit(decoder.decode(), controller),
+    })
+
+    define(this, 'encoding', decoder.encoding)
+    define(this, 'fatal', decoder.fatal)
+    define(this, 'ignoreBOM', decoder.ignoreBOM)
+    define(this, 'readable', readable)
+    define(this, 'writable', writable)
+  }
+
+  get [Symbol.toStringTag]() {
+    return 'TextDecoderStream'
+  }
+}
+
+// https://encoding.spec.whatwg.org/#interface-textencoderstream
+// Only UTF-8 per spec
+export class TextEncoderStream {
+  constructor() {
+    if (!globalThis.TransformStream) throw new Error(E_NO_STREAMS)
+    let pendingLead = null // trailing lead surrogate held back until the next chunk or flush
+    const { readable, writable } = new TransformStream({
+      // https://encoding.spec.whatwg.org/#encode-and-enqueue-a-chunk
+      // Not identical in code, but reuses loose mode to have identical behavior
+      transform: (chunk, controller) => {
+        const incoming = String(chunk) // DOMString, might contain unpaired surrogates
+        if (incoming.length === 0) return
+        let text = pendingLead ? pendingLead + incoming : incoming
+        pendingLead = null
+
+        // The last unit always comes from `incoming` (non-empty), never from the previous lead
+        const tail = text.charCodeAt(text.length - 1)
+        const endsWithLead = tail >= 0xd8_00 && tail <= 0xdb_ff
+        if (endsWithLead) {
+          pendingLead = text[text.length - 1]
+          text = text.slice(0, -1)
+        }
+
+        if (text) controller.enqueue(utf8fromStringLoose(text))
+      },
+      // https://encoding.spec.whatwg.org/#encode-and-flush
+      flush: (controller) => {
+        if (pendingLead) controller.enqueue(Uint8Array.of(0xef, 0xbf, 0xbd))
+      },
+    })
+
+    define(this, 'encoding', 'utf-8')
+    define(this, 'readable', readable)
+    define(this, 'writable', writable)
+  }
+
+  get [Symbol.toStringTag]() {
+    return 'TextEncoderStream'
+  }
+}
+
+// https://encoding.spec.whatwg.org/#decode
+// Warning: encoding sniffed from BOM takes preference over the supplied one
+// Warning: lossy, performs replacement, no option of throwing
+// Completely ignores encoding and even skips validation when BOM is found
+// Unlike TextDecoder public API, additionally supports 'replacement' encoding
+// Throws RangeError(E_ENCODING) for unknown labels and Error(E_MULTI) when no multi-byte decoder is registered
+export function legacyHookDecode(input, fallbackEncoding = 'utf-8') {
+  const source = fromSource(input)
+  const bomEncoding = getBOMEncoding(source)
+  // "the byte order mark is more authoritative than anything else"
+  const enc = bomEncoding ?? normalizeEncoding(fallbackEncoding)
+  const bytes = bomEncoding ? source.subarray(bomEncoding === 'utf-8' ? 3 : 2) : source
+  switch (enc) {
+    case 'utf-8':
+      return utf8toStringLoose(bytes)
+    case 'utf-16le':
+    case 'utf-16be': {
+      const form = enc === 'utf-16le' ? 'uint8-le' : 'uint8-be'
+      if (bytes.byteLength % 2 === 0) return utf16toStringLoose(bytes, form)
+      // odd length: decode the complete units, then stand in U+FFFD for the dangling tail
+      const whole = bytes.subarray(0, -unfinishedBytes(bytes, bytes.byteLength, enc))
+      return utf16toStringLoose(whole, form) + replacementChar
+    }
+  }
+  if (!Object.hasOwn(labels, enc)) throw new RangeError(E_ENCODING)
+  if (isMultibyte(enc)) {
+    if (!createMultibyteDecoder) throw new Error(E_MULTI)
+    return createMultibyteDecoder(enc, true)(bytes)
+  }
+
+  // https://encoding.spec.whatwg.org/#replacement-decoder
+  // On non-streaming non-fatal case, it just replaces any non-empty input with a single replacement char
+  if (enc === 'replacement') return input.byteLength > 0 ? replacementChar : ''
+  return createSinglebyteDecoder(enc, true)(bytes)
+}
author	Adam Mathes <adam@adammathes.com>	2026-02-13 21:34:48 -0800
committer	Adam Mathes <adam@adammathes.com>	2026-02-13 21:34:48 -0800
commit	76cb9c2a39d477a64824a985ade40507e3bbade1 (patch)
tree	41e997aa9c6f538d3a136af61dae9424db2005a9 /vanilla/node_modules/@exodus/bytes/fallback/encoding.js
parent	819a39a21ac992b1393244a4c283bbb125208c69 (diff)
download	neko-76cb9c2a39d477a64824a985ade40507e3bbade1.tar.gz neko-76cb9c2a39d477a64824a985ade40507e3bbade1.tar.bz2 neko-76cb9c2a39d477a64824a985ade40507e3bbade1.zip