From 76cb9c2a39d477a64824a985ade40507e3bbade1 Mon Sep 17 00:00:00 2001 From: Adam Mathes Date: Fri, 13 Feb 2026 21:34:48 -0800 Subject: feat(vanilla): add testing infrastructure and tests (NK-wjnczv) --- .../node_modules/@exodus/bytes/fallback/utf8.js | 270 +++++++++++++++++++++ 1 file changed, 270 insertions(+) create mode 100644 vanilla/node_modules/@exodus/bytes/fallback/utf8.js (limited to 'vanilla/node_modules/@exodus/bytes/fallback/utf8.js') diff --git a/vanilla/node_modules/@exodus/bytes/fallback/utf8.js b/vanilla/node_modules/@exodus/bytes/fallback/utf8.js new file mode 100644 index 0000000..d1467bc --- /dev/null +++ b/vanilla/node_modules/@exodus/bytes/fallback/utf8.js @@ -0,0 +1,270 @@ +import { E_STRICT_UNICODE } from './_utils.js' +import { isHermes } from './platform.js' +import { asciiPrefix, decodeLatin1, encodeAsciiPrefix } from './latin1.js' + +export const E_STRICT = 'Input is not well-formed utf8' + +const replacementPoint = 0xff_fd +const shouldUseEscapePath = isHermes // faster only on Hermes, js path beats it on normal engines +const { decodeURIComponent, escape } = globalThis + +export function decodeFast(arr, loose) { + // Fast path for ASCII prefix, this is faster than all alternatives below + const prefix = decodeLatin1(arr, 0, asciiPrefix(arr)) // No native decoder to use, so decodeAscii is useless here + if (prefix.length === arr.length) return prefix + + // This codepath gives a ~3x perf boost on Hermes + if (shouldUseEscapePath && escape && decodeURIComponent) { + const o = escape(decodeLatin1(arr, prefix.length, arr.length)) + try { + return prefix + decodeURIComponent(o) // Latin1 to utf8 + } catch { + if (!loose) throw new TypeError(E_STRICT) + // Ok, we have to use manual implementation for loose decoder + } + } + + return prefix + decode(arr, loose, prefix.length) +} + +// https://encoding.spec.whatwg.org/#utf-8-decoder +// We are most likely in loose mode, for non-loose escape & decodeURIComponent solved everything +export function decode(arr, loose, start = 0) { + start |= 0 + const end = arr.length + let out = '' + const chunkSize = 0x2_00 // far below MAX_ARGUMENTS_LENGTH in npmjs.com/buffer, we use smaller chunks + const tmpSize = Math.min(end - start, chunkSize + 1) // need 1 extra slot for last codepoint, which can be 2 charcodes + const tmp = new Array(tmpSize).fill(0) + let ti = 0 + + for (let i = start; i < end; i++) { + if (ti >= chunkSize) { + tmp.length = ti // can be larger by 1 if last codepoint is two charcodes + out += String.fromCharCode.apply(String, tmp) + if (tmp.length <= chunkSize) tmp.push(0) // restore 1 extra slot for last codepoint + ti = 0 + } + + const byte = arr[i] + if (byte < 0x80) { + tmp[ti++] = byte + // ascii fast path is in decodeFast(), this is called only on non-ascii input + // so we don't unroll this anymore + } else if (byte < 0xc2) { + if (!loose) throw new TypeError(E_STRICT) + tmp[ti++] = replacementPoint + } else if (byte < 0xe0) { + // need 1 more + if (i + 1 >= end) { + if (!loose) throw new TypeError(E_STRICT) + tmp[ti++] = replacementPoint + break + } + + const byte1 = arr[i + 1] + if (byte1 < 0x80 || byte1 > 0xbf) { + if (!loose) throw new TypeError(E_STRICT) + tmp[ti++] = replacementPoint + continue + } + + i++ + tmp[ti++] = ((byte & 0x1f) << 6) | (byte1 & 0x3f) + } else if (byte < 0xf0) { + // need 2 more + if (i + 1 >= end) { + if (!loose) throw new TypeError(E_STRICT) + tmp[ti++] = replacementPoint + break + } + + const lower = byte === 0xe0 ? 0xa0 : 0x80 + const upper = byte === 0xed ? 0x9f : 0xbf + const byte1 = arr[i + 1] + if (byte1 < lower || byte1 > upper) { + if (!loose) throw new TypeError(E_STRICT) + tmp[ti++] = replacementPoint + continue + } + + i++ + if (i + 1 >= end) { + if (!loose) throw new TypeError(E_STRICT) + tmp[ti++] = replacementPoint + break + } + + const byte2 = arr[i + 1] + if (byte2 < 0x80 || byte2 > 0xbf) { + if (!loose) throw new TypeError(E_STRICT) + tmp[ti++] = replacementPoint + continue + } + + i++ + tmp[ti++] = ((byte & 0xf) << 12) | ((byte1 & 0x3f) << 6) | (byte2 & 0x3f) + } else if (byte <= 0xf4) { + // need 3 more + if (i + 1 >= end) { + if (!loose) throw new TypeError(E_STRICT) + tmp[ti++] = replacementPoint + break + } + + const lower = byte === 0xf0 ? 0x90 : 0x80 + const upper = byte === 0xf4 ? 0x8f : 0xbf + const byte1 = arr[i + 1] + if (byte1 < lower || byte1 > upper) { + if (!loose) throw new TypeError(E_STRICT) + tmp[ti++] = replacementPoint + continue + } + + i++ + if (i + 1 >= end) { + if (!loose) throw new TypeError(E_STRICT) + tmp[ti++] = replacementPoint + break + } + + const byte2 = arr[i + 1] + if (byte2 < 0x80 || byte2 > 0xbf) { + if (!loose) throw new TypeError(E_STRICT) + tmp[ti++] = replacementPoint + continue + } + + i++ + if (i + 1 >= end) { + if (!loose) throw new TypeError(E_STRICT) + tmp[ti++] = replacementPoint + break + } + + const byte3 = arr[i + 1] + if (byte3 < 0x80 || byte3 > 0xbf) { + if (!loose) throw new TypeError(E_STRICT) + tmp[ti++] = replacementPoint + continue + } + + i++ + const codePoint = + ((byte & 0xf) << 18) | ((byte1 & 0x3f) << 12) | ((byte2 & 0x3f) << 6) | (byte3 & 0x3f) + if (codePoint > 0xff_ff) { + // split into char codes as String.fromCharCode is faster than String.fromCodePoint + const u = codePoint - 0x1_00_00 + tmp[ti++] = 0xd8_00 + ((u >> 10) & 0x3_ff) + tmp[ti++] = 0xdc_00 + (u & 0x3_ff) + } else { + tmp[ti++] = codePoint + } + // eslint-disable-next-line sonarjs/no-duplicated-branches + } else { + if (!loose) throw new TypeError(E_STRICT) + tmp[ti++] = replacementPoint + } + } + + if (ti === 0) return out + tmp.length = ti + return out + String.fromCharCode.apply(String, tmp) +} + +export function encode(string, loose) { + const length = string.length + let small = true + let bytes = new Uint8Array(length) // assume ascii + + let i = encodeAsciiPrefix(bytes, string) + let p = i + for (; i < length; i++) { + let code = string.charCodeAt(i) + if (code < 0x80) { + bytes[p++] = code + // Unroll the loop a bit for faster ops + while (true) { + i++ + if (i >= length) break + code = string.charCodeAt(i) + if (code >= 0x80) break + bytes[p++] = code + i++ + if (i >= length) break + code = string.charCodeAt(i) + if (code >= 0x80) break + bytes[p++] = code + i++ + if (i >= length) break + code = string.charCodeAt(i) + if (code >= 0x80) break + bytes[p++] = code + i++ + if (i >= length) break + code = string.charCodeAt(i) + if (code >= 0x80) break + bytes[p++] = code + } + + if (i >= length) break + // now, code is present and >= 0x80 + } + + if (small) { + // TODO: use resizable array buffers? will have to return a non-resizeable one + if (p !== i) /* c8 ignore next */ throw new Error('Unreachable') // Here, p === i (only when small is still true) + const bytesNew = new Uint8Array(p + (length - i) * 3) // maximium can be 3x of the string length in charcodes + bytesNew.set(bytes) + bytes = bytesNew + small = false + } + + // surrogate, charcodes = [d800 + a & 3ff, dc00 + b & 3ff]; codePoint = 0x1_00_00 | (a << 10) | b + // lead: d800 - dbff + // trail: dc00 - dfff + if (code >= 0xd8_00 && code < 0xe0_00) { + // Can't be a valid trail as we already processed that below + + if (code > 0xdb_ff || i + 1 >= length) { + // An unexpected trail or a lead at the very end of input + if (!loose) throw new TypeError(E_STRICT_UNICODE) + bytes[p++] = 0xef + bytes[p++] = 0xbf + bytes[p++] = 0xbd + continue + } + + const next = string.charCodeAt(i + 1) // Process valid pairs immediately + if (next >= 0xdc_00 && next < 0xe0_00) { + // here, codePoint is always between 0x1_00_00 and 0x11_00_00, we encode as 4 bytes + const codePoint = (((code - 0xd8_00) << 10) | (next - 0xdc_00)) + 0x1_00_00 + bytes[p++] = (codePoint >> 18) | 0xf0 + bytes[p++] = ((codePoint >> 12) & 0x3f) | 0x80 + bytes[p++] = ((codePoint >> 6) & 0x3f) | 0x80 + bytes[p++] = (codePoint & 0x3f) | 0x80 + i++ // consume next + } else { + // Next is not a trail, leave next unconsumed but process unmatched lead error + if (!loose) throw new TypeError(E_STRICT_UNICODE) + bytes[p++] = 0xef + bytes[p++] = 0xbf + bytes[p++] = 0xbd + } + + continue + } + + // We are left with a non-pair char code above ascii, it gets encoded to 2 or 3 bytes + if (code < 0x8_00) { + bytes[p++] = (code >> 6) | 0xc0 + bytes[p++] = (code & 0x3f) | 0x80 + } else { + bytes[p++] = (code >> 12) | 0xe0 + bytes[p++] = ((code >> 6) & 0x3f) | 0x80 + bytes[p++] = (code & 0x3f) | 0x80 + } + } + + return bytes.length === p ? bytes : bytes.slice(0, p) +} -- cgit v1.2.3