aboutsummaryrefslogtreecommitdiffstats
path: root/vanilla/node_modules/@exodus/bytes/fallback/encoding.js
blob: fab108015fa3762d51d3e23a83a277d8e374fd39 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
// We can't return native TextDecoder if it's present, as Node.js one is broken on windows-1252 and we fix that
// We are also faster than Node.js built-in on both TextEncoder and TextDecoder

import { utf16toString, utf16toStringLoose } from '@exodus/bytes/utf16.js'
import { utf8fromStringLoose, utf8toString, utf8toStringLoose } from '@exodus/bytes/utf8.js'
import { createSinglebyteDecoder } from '@exodus/bytes/single-byte.js'
import labels from './encoding.labels.js'
import { fromSource, getBOMEncoding } from './encoding.api.js'
import { unfinishedBytes, mergePrefix } from './encoding.util.js'

export { getBOMEncoding } from './encoding.api.js'

// Error messages thrown by the constructors and decode helpers in this module
export const E_ENCODING = 'Unknown encoding'
const E_MULTI = "import '@exodus/bytes/encoding.js' for legacy multi-byte encodings support"
const E_OPTIONS = 'The "options" argument must be of type object'

// U+FFFD, appended when an unfinished trailing sequence is dropped in non-fatal decoding
const replacementChar = '\uFFFD'

// Encoding names served by the separately registered multi-byte implementation
// prettier-ignore
const multibyteSet = new Set([
  'big5', 'euc-kr', 'euc-jp', 'iso-2022-jp',
  'shift_jis', 'gbk', 'gb18030',
])

// Multi-byte hooks; both stay undefined until setMultibyte() assigns them
let createMultibyteDecoder
let multibyteEncoder

// Reverse index (alias -> encoding name), built on the first lookup that needs it
let labelsMap

/**
 * Resolve an encoding label to its lowercased encoding name.
 *
 * Warning: unlike whatwg-encoding, the returned names are lowercased. Labels are
 * case-insensitive and this matches how the TextDecoder `encoding` getter normalizes them.
 * https://encoding.spec.whatwg.org/#names-and-labels
 *
 * @param {*} label candidate label; non-strings are stringified on the slow path
 * @returns {string|null} the encoding name, or null when the label is not recognized
 */
export function normalizeEncoding(label) {
  // Fast path: exact matches on the most common spellings, no trimming or case folding
  switch (label) {
    case 'utf-8':
    case 'utf8':
    case 'UTF-8':
    case 'UTF8':
      return 'utf-8'
    case 'windows-1252':
    case 'ascii':
    case 'latin1':
      return 'windows-1252'
  }

  // Reject anything that is not ASCII word characters, ASCII whitespace or one of ". : -"
  if (/[^\w\t\n\f\r .:-]/i.test(label)) return null

  const key = `${label}`.trim().toLowerCase()
  // The label is already an encoding name
  if (Object.hasOwn(labels, key)) return key

  if (!labelsMap) {
    const index = new Map()
    for (const name of Object.keys(labels)) {
      for (const alias of labels[name]) index.set(alias, name)
    }

    labelsMap = index
  }

  return labelsMap.get(key) || null
}

// Encoding names that begin with one of these three characters are spelled in uppercase
const uppercasePrefixes = new Set(['utf', 'iso', 'koi', 'euc', 'ibm', 'gbk'])

/**
 * Resolve a label to the case-sensitive encoding name (unlike normalizeEncoding,
 * which always returns lowercase).
 * https://encoding.spec.whatwg.org/#names-and-labels
 *
 * @param {*} label candidate label, resolved through normalizeEncoding
 * @returns {string|null} the cased name, or the falsy result of normalizeEncoding
 */
export function labelToName(label) {
  const enc = normalizeEncoding(label)
  if (enc === 'utf-8') return 'UTF-8' // fast path
  if (!enc) return enc

  // The two mixed-case names
  switch (enc) {
    case 'big5':
      return 'Big5'
    case 'shift_jis':
      return 'Shift_JIS'
  }

  const prefix = enc.slice(0, 3)
  return uppercasePrefixes.has(prefix) ? enc.toUpperCase() : enc
}

/** True when `enc` is one of the names listed in multibyteSet. */
export const isMultibyte = (enc) => {
  return multibyteSet.has(enc)
}

/**
 * Register the multi-byte hooks used by this module.
 *
 * @param {Function} createDecoder stored as the multi-byte decoder factory
 * @param {*} createEncoder stored as-is and handed back by getMultibyteEncoder()
 */
export function setMultibyte(createDecoder, createEncoder) {
  multibyteEncoder = createEncoder
  createMultibyteDecoder = createDecoder
}

/**
 * Return the multi-byte encoder registered through setMultibyte().
 *
 * @returns {*} the registered encoder value
 * @throws {Error} with the E_MULTI message when nothing has been registered yet
 */
export function getMultibyteEncoder() {
  if (multibyteEncoder) return multibyteEncoder
  throw new Error(E_MULTI)
}

// Define `key` on `obj` as a non-writable data property holding `value`; the remaining
// descriptor flags are left to Object.defineProperty. Returns `obj`.
const define = (obj, key, value) => {
  const descriptor = { value, writable: false }
  return Object.defineProperty(obj, key, descriptor)
}

/**
 * Check whether `x` is a Uint8Array, including instances for which `instanceof` fails
 * but whose Object.prototype.toString tag is still '[object Uint8Array]'.
 *
 * @param {*} x value to test
 * @returns {boolean}
 */
function isAnyUint8Array(x) {
  if (x instanceof Uint8Array) return true
  // Cheap structural filter before the toString tag comparison
  const isByteView = Boolean(x) && ArrayBuffer.isView(x) && x.BYTES_PER_ELEMENT === 1
  return isByteView && Object.prototype.toString.call(x) === '[object Uint8Array]'
}

/**
 * Pick the bytes -> string function for one of the Unicode encodings.
 *
 * @param {string} encoding 'utf-8' selects the UTF-8 functions; any other value is treated
 *   as UTF-16, little-endian for 'utf-16le' and big-endian otherwise
 * @param {boolean} loose select the "Loose" variant instead of the strict one
 * @returns {Function} decoder taking the byte array as its only argument
 */
function unicodeDecoder(encoding, loose) {
  if (encoding !== 'utf-8') {
    const form = encoding === 'utf-16le' ? 'uint8-le' : 'uint8-be'
    const convert = loose ? utf16toStringLoose : utf16toString
    return (u) => convert(u, form)
  }

  // likely
  return loose ? utf8toStringLoose : utf8toString
}

/**
 * TextDecoder implemented on top of this package's own decoders.
 *
 * `encoding`, `fatal` and `ignoreBOM` are installed as non-writable own properties.
 * UTF-8 and UTF-16 input is handled here, including BOM stripping and carrying unfinished
 * trailing bytes from one streaming call to the next. Every other encoding is delegated
 * to the single-byte decoder factory or to the registered multi-byte decoder factory.
 */
export class TextDecoder {
  #decode // decoding function for this.encoding, created on first use
  #unicode // true for utf-8 / utf-16le / utf-16be
  #multibyte // true when the encoding is listed in multibyteSet
  #chunk // copy of the unfinished trailing bytes from the previous streaming call, if any
  #canBOM // whether a leading BOM may still be stripped by the next decode call

  /**
   * @param {string} [encoding='utf-8'] label, resolved through normalizeEncoding
   * @param {object|null} [options] `fatal` and `ignoreBOM` flags (coerced to booleans);
   *   null is accepted and treated like an empty options object
   * @throws {TypeError} when options is neither an object nor null
   * @throws {RangeError} when the label is unknown or resolves to 'replacement'
   */
  constructor(encoding = 'utf-8', options = {}) {
    if (typeof options !== 'object') throw new TypeError(E_OPTIONS)
    const enc = normalizeEncoding(encoding)
    if (!enc || enc === 'replacement') throw new RangeError(E_ENCODING)
    define(this, 'encoding', enc)
    // typeof null === 'object', so null gets past the guard above; Web IDL converts a null
    // dictionary argument to an empty dictionary, hence the optional chaining
    define(this, 'fatal', !!options?.fatal)
    define(this, 'ignoreBOM', !!options?.ignoreBOM)
    this.#unicode = enc === 'utf-8' || enc === 'utf-16le' || enc === 'utf-16be'
    this.#multibyte = !this.#unicode && isMultibyte(enc)
    this.#canBOM = this.#unicode && !this.ignoreBOM
  }

  get [Symbol.toStringTag]() {
    return 'TextDecoder'
  }

  /**
   * Decode `input` into a string.
   *
   * @param {*} [input] byte source converted with fromSource; undefined means no bytes
   * @param {object|null} [options] `stream: true` keeps unfinished trailing bytes for the
   *   next call; null is accepted and treated like an empty options object
   * @returns {string}
   * @throws {TypeError} when options is neither an object nor null
   * @throws {Error} with the E_MULTI message for a multi-byte encoding when no multi-byte
   *   decoder factory has been registered; errors from the decoders are rethrown
   */
  decode(input, options = {}) {
    if (typeof options !== 'object') throw new TypeError(E_OPTIONS)
    const stream = !!options?.stream // null options behave like {}
    let u = input === undefined ? new Uint8Array() : fromSource(input)
    const empty = u.length === 0 // also can't be streaming after next line
    if (empty && stream) return '' // no state change

    if (this.#unicode) {
      let prefix
      if (this.#chunk) {
        const merged = mergePrefix(u, this.#chunk, this.encoding)
        if (u.length < 3) {
          u = merged // might be unfinished, but fully consumed old u
        } else {
          prefix = merged // stops at complete chunk
          const add = prefix.length - this.#chunk.length
          if (add > 0) u = u.subarray(add)
        }

        this.#chunk = null
      } else if (empty) {
        this.#canBOM = !this.ignoreBOM // not streaming
        return ''
      }

      // For non-stream utf-8 we don't have to do this as it matches utf8toStringLoose already
      // For non-stream loose utf-16 we still have to do this as this API supports uneven byteLength unlike utf16toStringLoose
      let suffix = ''
      if (stream || (!this.fatal && this.encoding !== 'utf-8')) {
        const trail = unfinishedBytes(u, u.byteLength, this.encoding)
        if (trail > 0) {
          if (stream) {
            this.#chunk = Uint8Array.from(u.subarray(-trail)) // copy
          } else {
            // non-fatal mode as already checked
            suffix = replacementChar
          }

          u = u.subarray(0, -trail)
        }
      }

      let seenBOM = false
      if (this.#canBOM) {
        // The BOM, if any, is at the start of the carried-over prefix when there is one
        const bom = this.#findBom(prefix ?? u)
        if (bom) {
          seenBOM = true
          if (prefix) {
            prefix = prefix.subarray(bom)
          } else {
            u = u.subarray(bom)
          }
        }
      } else if (!stream && !this.ignoreBOM) {
        // A non-streaming call ends the current stream, so the next call may see a BOM again
        this.#canBOM = true
      }

      if (!this.#decode) this.#decode = unicodeDecoder(this.encoding, !this.fatal)
      try {
        const res = (prefix ? this.#decode(prefix) : '') + this.#decode(u) + suffix
        // "BOM seen" is set on the current decode call only if it did not error, in "serialize I/O queue" after decoding
        if (stream && (seenBOM || res.length > 0)) this.#canBOM = false
        return res
      } catch (err) {
        this.#chunk = null // reset unfinished chunk on errors
        // The correct way per spec seems to be not destroying the decoder state (aka BOM here) in stream mode
        // See also multi-byte.js
        throw err
      }

      // eslint-disable-next-line no-else-return
    } else if (this.#multibyte) {
      if (!createMultibyteDecoder) throw new Error(E_MULTI)
      if (!this.#decode) this.#decode = createMultibyteDecoder(this.encoding, !this.fatal) // can contain state!
      return this.#decode(u, stream)
    } else {
      if (!this.#decode) this.#decode = createSinglebyteDecoder(this.encoding, !this.fatal)
      return this.#decode(u)
    }
  }

  /**
   * Length in bytes of the BOM for this.encoding at the start of `u`, or 0 when absent.
   * Only called for the three Unicode encodings.
   */
  #findBom(u) {
    switch (this.encoding) {
      case 'utf-8':
        return u.byteLength >= 3 && u[0] === 0xef && u[1] === 0xbb && u[2] === 0xbf ? 3 : 0
      case 'utf-16le':
        return u.byteLength >= 2 && u[0] === 0xff && u[1] === 0xfe ? 2 : 0
      case 'utf-16be':
        return u.byteLength >= 2 && u[0] === 0xfe && u[1] === 0xff ? 2 : 0
    }

    /* c8 ignore next */
    throw new Error('Unreachable')
  }
}

/**
 * UTF-8 TextEncoder built on utf8fromStringLoose.
 * `encoding` is installed as a non-writable own property with the value 'utf-8'.
 */
export class TextEncoder {
  constructor() {
    define(this, 'encoding', 'utf-8')
  }

  get [Symbol.toStringTag]() {
    return 'TextEncoder'
  }

  /**
   * Encode a string to UTF-8 bytes.
   *
   * @param {*} [str=''] input; non-strings are stringified
   * @returns {Uint8Array} array covering its whole underlying buffer
   */
  encode(str = '') {
    const text = typeof str === 'string' ? str : `${str}`
    const bytes = utf8fromStringLoose(text)
    // match new Uint8Array (per spec), which is non-pooled: copy unless it spans its buffer
    const spansBuffer = bytes.byteOffset === 0 && bytes.length === bytes.buffer.byteLength
    return spansBuffer ? bytes : bytes.slice(0)
  }

  /**
   * Encode as much of `str` as fits into `target`.
   *
   * @param {*} str input; non-strings are stringified
   * @param {Uint8Array} target destination bytes
   * @returns {{read: number, written: number}} UTF-16 code units consumed and bytes stored;
   *   both are 0 when the target buffer is detached or `target.set` throws
   * @throws {TypeError} when target does not pass isAnyUint8Array
   */
  encodeInto(str, target) {
    let text = typeof str === 'string' ? str : `${str}`
    if (!isAnyUint8Array(target)) throw new TypeError('Target must be an Uint8Array')
    if (target.buffer.detached) return { read: 0, written: 0 } // Until https://github.com/whatwg/encoding/issues/324 is resolved

    const capacity = target.length
    // Every code unit takes at least one byte, so anything past `capacity` units cannot fit
    if (capacity < text.length) text = text.slice(0, capacity)
    let bytes = utf8fromStringLoose(text)
    let read
    if (capacity >= bytes.length) {
      read = text.length // everything fits
    } else if (bytes.length === text.length) {
      // NOTE(review): after the slice above text.length <= capacity for ordinary targets,
      // so this branch looks unreachable there; confirm before removing
      if (bytes.length > capacity) bytes = bytes.subarray(0, capacity) // ascii can be truncated
      read = bytes.length
    } else {
      bytes = bytes.subarray(0, capacity)
      const dangling = unfinishedBytes(bytes, bytes.length, 'utf-8')
      if (dangling > 0) bytes = bytes.subarray(0, bytes.length - dangling)

      // We can do this because loose str -> u8 -> str preserves length, unlike loose u8 -> str -> u8
      // Each unpaired surrogate (1 charcode) is replaced with a single charcode
      read = utf8toStringLoose(bytes).length // FIXME: Converting back is very inefficient
    }

    try {
      target.set(bytes)
    } catch {
      // likely a detached buffer on an engine without .detached support
      return { read: 0, written: 0 }
    }

    const written = bytes.length
    return { read, written }
  }
}

const E_NO_STREAMS = 'TransformStream global not present in the environment'

// https://encoding.spec.whatwg.org/#interface-textdecoderstream
/**
 * TransformStream wrapper around TextDecoder: byte chunks in, decoded strings out.
 * `encoding`, `fatal`, `ignoreBOM`, `readable` and `writable` are installed as
 * non-writable own properties.
 */
export class TextDecoderStream {
  /**
   * @param {string} [encoding='utf-8'] forwarded to the TextDecoder constructor
   * @param {object} [options] forwarded to the TextDecoder constructor
   * @throws {Error} when globalThis.TransformStream is missing
   */
  constructor(encoding = 'utf-8', options = {}) {
    if (!globalThis.TransformStream) throw new Error(E_NO_STREAMS)
    const decoder = new TextDecoder(encoding, options)
    // Empty strings are never enqueued
    const emit = (controller, text) => {
      if (text) controller.enqueue(text)
    }

    const { readable, writable } = new TransformStream({
      transform(chunk, controller) {
        emit(controller, decoder.decode(fromSource(chunk), { stream: true }))
      },
      // https://streams.spec.whatwg.org/#dom-transformer-flush
      flush(controller) {
        // A final non-streaming decode emits whatever the decoder still holds
        emit(controller, decoder.decode())
        // No need to call .terminate() (Node.js is wrong)
      },
    })

    for (const key of ['encoding', 'fatal', 'ignoreBOM']) define(this, key, decoder[key])
    define(this, 'readable', readable)
    define(this, 'writable', writable)
  }

  get [Symbol.toStringTag]() {
    return 'TextDecoderStream'
  }
}

// https://encoding.spec.whatwg.org/#interface-textencoderstream
// Only UTF-8 per spec
/**
 * TransformStream that turns string chunks into UTF-8 byte chunks.
 * A lead surrogate at the very end of a chunk is held back and joined with the next
 * chunk; if the stream ends while one is held, U+FFFD (EF BF BD) is emitted for it.
 * `encoding`, `readable` and `writable` are installed as non-writable own properties.
 */
export class TextEncoderStream {
  /** @throws {Error} when globalThis.TransformStream is missing */
  constructor() {
    if (!globalThis.TransformStream) throw new Error(E_NO_STREAMS)
    let pending = null // lead surrogate held back from the previous chunk, if any
    const { readable, writable } = new TransformStream({
      // https://encoding.spec.whatwg.org/#encode-and-enqueue-a-chunk
      // Not identical in code, but reuses loose mode to have identical behavior
      transform(chunk, controller) {
        const incoming = String(chunk) // DOMString, might contain unpaired surrogates
        if (incoming.length === 0) return

        let text = incoming
        if (pending) {
          text = pending + incoming
          pending = null
        }

        // The final code unit always comes from `incoming`, which is non-empty here
        const tail = text.length - 1
        const endsWithLead = (text.charCodeAt(tail) & 0xfc_00) === 0xd8_00
        if (endsWithLead) {
          pending = text[tail]
          text = text.slice(0, tail)
        }

        if (text) controller.enqueue(utf8fromStringLoose(text))
      },
      // https://encoding.spec.whatwg.org/#encode-and-flush
      flush(controller) {
        if (pending) controller.enqueue(Uint8Array.of(0xef, 0xbf, 0xbd))
      },
    })

    define(this, 'encoding', 'utf-8')
    define(this, 'readable', readable)
    define(this, 'writable', writable)
  }

  get [Symbol.toStringTag]() {
    return 'TextEncoderStream'
  }
}

// https://encoding.spec.whatwg.org/#decode
/**
 * One-shot decode where a BOM, when present, decides the encoding.
 *
 * Warning: the encoding sniffed from the BOM takes preference over the supplied one; the
 * supplied label is then ignored completely and not even validated.
 * Warning: lossy, performs replacement, there is no option of throwing on bad input.
 * Unlike the TextDecoder public API, this additionally supports the 'replacement' encoding.
 *
 * @param {*} input byte source, converted with fromSource
 * @param {string} [fallbackEncoding='utf-8'] label used only when no BOM is found
 * @returns {string}
 * @throws {RangeError} when no BOM is found and the label is not a known encoding
 * @throws {Error} with the E_MULTI message for a multi-byte encoding when no multi-byte
 *   decoder factory has been registered
 */
export function legacyHookDecode(input, fallbackEncoding = 'utf-8') {
  let u8 = fromSource(input)
  const bomEncoding = getBOMEncoding(u8)
  if (bomEncoding) {
    const bomLength = bomEncoding === 'utf-8' ? 3 : 2
    u8 = u8.subarray(bomLength)
  }

  // "the byte order mark is more authoritative than anything else"
  const enc = bomEncoding ?? normalizeEncoding(fallbackEncoding)

  switch (enc) {
    case 'utf-8':
      return utf8toStringLoose(u8)
    case 'utf-16le':
    case 'utf-16be': {
      const form = enc === 'utf-16le' ? 'uint8-le' : 'uint8-be'
      if (u8.byteLength % 2 === 0) return utf16toStringLoose(u8, form)
      // Odd length: drop the unfinished tail and stand in a single replacement char for it
      const complete = u8.subarray(0, -unfinishedBytes(u8, u8.byteLength, enc))
      return utf16toStringLoose(complete, form) + replacementChar
    }
  }

  if (!Object.hasOwn(labels, enc)) throw new RangeError(E_ENCODING)

  if (isMultibyte(enc)) {
    if (!createMultibyteDecoder) throw new Error(E_MULTI)
    const decodeMultibyte = createMultibyteDecoder(enc, true)
    return decodeMultibyte(u8)
  }

  // https://encoding.spec.whatwg.org/#replacement-decoder
  // On non-streaming non-fatal case, it just replaces any non-empty input with a single replacement char
  if (enc === 'replacement') return input.byteLength > 0 ? replacementChar : ''

  const decodeSinglebyte = createSinglebyteDecoder(enc, true)
  return decodeSinglebyte(u8)
}