// We can't return native TextDecoder if it's present, as Node.js one is broken on windows-1252 and we fix that
// We are also faster than Node.js built-in on both TextEncoder and TextDecoder
import { utf16toString, utf16toStringLoose } from '@exodus/bytes/utf16.js'
import { utf8fromStringLoose, utf8toString, utf8toStringLoose } from '@exodus/bytes/utf8.js'
import { createSinglebyteDecoder } from '@exodus/bytes/single-byte.js'
import labels from './encoding.labels.js'
import { fromSource, getBOMEncoding } from './encoding.api.js'
import { unfinishedBytes, mergePrefix } from './encoding.util.js'
export { getBOMEncoding } from './encoding.api.js'
export const E_ENCODING = 'Unknown encoding'
// Thrown when a legacy multi-byte encoding is used before the addon registers codecs via setMultibyte()
const E_MULTI = "import '@exodus/bytes/encoding.js' for legacy multi-byte encodings support"
const E_OPTIONS = 'The "options" argument must be of type object'
// U+FFFD REPLACEMENT CHARACTER, emitted in non-fatal (loose) mode
const replacementChar = '\uFFFD'
// Encodings that are only decodable/encodable after setMultibyte() is called
const multibyteSet = new Set(['big5', 'euc-kr', 'euc-jp', 'iso-2022-jp', 'shift_jis', 'gbk', 'gb18030']) // prettier-ignore
// Codec factories; undefined until setMultibyte() is called
let createMultibyteDecoder, multibyteEncoder
// Lazily-built alias -> canonical name map, see normalizeEncoding()
let labelsMap
// Warning: unlike whatwg-encoding, returns lowercased labels
// Those are case-insensitive and that's how TextDecoder encoding getter normalizes them
// https://encoding.spec.whatwg.org/#names-and-labels
export function normalizeEncoding(label) {
  // Fast path for the most common labels, skipping the regex and map lookups
  switch (label) {
    case 'utf-8':
    case 'utf8':
    case 'UTF-8':
    case 'UTF8':
      return 'utf-8'
    case 'windows-1252':
    case 'ascii':
    case 'latin1':
      return 'windows-1252'
  }
  // Labels must be ASCII (with ASCII whitespace); anything else can never match
  if (/[^\w\t\n\f\r .:-]/i.test(label)) return null
  const low = `${label}`.trim().toLowerCase()
  // A canonical encoding name is also a valid label for itself
  if (Object.hasOwn(labels, low)) return low
  // Build the alias -> canonical name map once, on first non-trivial lookup
  if (!labelsMap) {
    labelsMap = new Map()
    for (const [name, aliases] of Object.entries(labels)) {
      for (const alias of aliases) labelsMap.set(alias, name)
    }
  }
  return labelsMap.get(low) ?? null
}
const uppercasePrefixes = new Set(['utf', 'iso', 'koi', 'euc', 'ibm', 'gbk'])
// Maps a label to the canonical spec-cased encoding name
// Unlike normalizeEncoding, case-sensitive
// https://encoding.spec.whatwg.org/#names-and-labels
export function labelToName(label) {
  const name = normalizeEncoding(label)
  if (name === 'utf-8') return 'UTF-8' // fast path
  if (!name) return name
  switch (name) {
    case 'big5':
      return 'Big5'
    case 'shift_jis':
      return 'Shift_JIS'
    default:
      // Families whose spec names are fully uppercased (UTF-*, ISO-*, KOI8-*, EUC-*, IBM866, GBK)
      return uppercasePrefixes.has(name.slice(0, 3)) ? name.toUpperCase() : name
  }
}
// True for legacy multi-byte encodings that need the optional encoding.js addon
export function isMultibyte(enc) {
  return multibyteSet.has(enc)
}
// Registers the legacy multi-byte codec factories (called by the encoding.js addon)
export function setMultibyte(decoderFactory, encoderFactory) {
  multibyteEncoder = encoderFactory
  createMultibyteDecoder = decoderFactory
}
// Returns the registered multi-byte encoder factory, or throws if the addon wasn't loaded
export function getMultibyteEncoder() {
  if (multibyteEncoder) return multibyteEncoder
  throw new Error(E_MULTI)
}
// Defines a read-only (non-writable, non-configurable, non-enumerable) property and returns obj
function define(obj, key, value) {
  return Object.defineProperty(obj, key, { value, writable: false })
}
// Detects Uint8Array instances, including ones from other realms (where instanceof fails)
function isAnyUint8Array(x) {
  if (x instanceof Uint8Array) return true // same-realm fast path
  if (!x) return false
  if (!ArrayBuffer.isView(x)) return false
  if (x.BYTES_PER_ELEMENT !== 1) return false // excludes DataView (undefined) and wider typed arrays
  // Cross-realm check: brand via Object.prototype.toString
  return Object.prototype.toString.call(x) === '[object Uint8Array]'
}
// Picks the bytes -> string function for a Unicode encoding (loose = U+FFFD replacement mode)
function unicodeDecoder(encoding, loose) {
  if (encoding === 'utf-8') return loose ? utf8toStringLoose : utf8toString // likely
  const form = encoding === 'utf-16le' ? 'uint8-le' : 'uint8-be'
  if (loose) return (u) => utf16toStringLoose(u, form)
  return (u) => utf16toString(u, form)
}
export class TextDecoder {
#decode
#unicode
#multibyte
#chunk
#canBOM
constructor(encoding = 'utf-8', options = {}) {
if (typeof options !== 'object') throw new TypeError(E_OPTIONS)
const enc = normalizeEncoding(encoding)
if (!enc || enc === 'replacement') throw new RangeError(E_ENCODING)
define(this, 'encoding', enc)
define(this, 'fatal', !!options.fatal)
define(this, 'ignoreBOM', !!options.ignoreBOM)
this.#unicode = enc === 'utf-8' || enc === 'utf-16le' || enc === 'utf-16be'
this.#multibyte = !this.#unicode && isMultibyte(enc)
this.#canBOM = this.#unicode && !this.ignoreBOM
}
get [Symbol.toStringTag]() {
return 'TextDecoder'
}
decode(input, options = {}) {
if (typeof options !== 'object') throw new TypeError(E_OPTIONS)
const stream = !!options.stream
let u = input === undefined ? new Uint8Array() : fromSource(input)
const empty = u.length === 0 // also can't be streaming after next line
if (empty && stream) return '' // no state change
if (this.#unicode) {
let prefix
if (this.#chunk) {
const merged = mergePrefix(u, this.#chunk, this.encoding)
if (u.length < 3) {
u = merged // might be unfinished, but fully consumed old u
} else {
prefix = merged // stops at complete chunk
const add = prefix.length - this.#chunk.length
if (add > 0) u = u.subarray(add)
}
this.#chunk = null
} else if (empty) {
this.#canBOM = !this.ignoreBOM // not streaming
return ''
}
// For non-stream utf-8 we don't have to do this as it matches utf8toStringLoose already
// For non-stream loose utf-16 we still have to do this as this API supports uneven byteLength unlike utf16toStringLoose
let suffix = ''
if (stream || (!this.fatal && this.encoding !== 'utf-8')) {
const trail = unfinishedBytes(u, u.byteLength, this.encoding)
if (trail > 0) {
if (stream) {
this.#chunk = Uint8Array.from(u.subarray(-trail)) // copy
} else {
// non-fatal mode as already checked
suffix = replacementChar
}
u = u.subarray(0, -trail)
}
}
let seenBOM = false
if (this.#canBOM) {
const bom = this.#findBom(prefix ?? u)
if (bom) {
seenBOM = true
if (prefix) {
prefix = prefix.subarray(bom)
} else {
u = u.subarray(bom)
}
}
} else if (!stream && !this.ignoreBOM) {
this.#canBOM = true
}
if (!this.#decode) this.#decode = unicodeDecoder(this.encoding, !this.fatal)
try {
const res = (prefix ? this.#decode(prefix) : '') + this.#decode(u) + suffix
// "BOM seen" is set on the current decode call only if it did not error, in "serialize I/O queue" after decoding
if (stream && (seenBOM || res.length > 0)) this.#canBOM = false
return res
} catch (err) {
this.#chunk = null // reset unfinished chunk on errors
// The correct way per spec seems to be not destroying the decoder state (aka BOM here) in stream mode
// See also multi-byte.js
throw err
}
// eslint-disable-next-line no-else-return
} else if (this.#multibyte) {
if (!createMultibyteDecoder) throw new Error(E_MULTI)
if (!this.#decode) this.#decode = createMultibyteDecoder(this.encoding, !this.fatal) // can contain state!
return this.#decode(u, stream)
} else {
if (!this.#decode) this.#decode = createSinglebyteDecoder(this.encoding, !this.fatal)
return this.#decode(u)
}
}
#findBom(u) {
switch (this.encoding) {
case 'utf-8':
return u.byteLength >= 3 && u[0] === 0xef && u[1] === 0xbb && u[2] === 0xbf ? 3 : 0
case 'utf-16le':
return u.byteLength >= 2 && u[0] === 0xff && u[1] === 0xfe ? 2 : 0
case 'utf-16be':
return u.byteLength >= 2 && u[0] === 0xfe && u[1] === 0xff ? 2 : 0
}
/* c8 ignore next */
throw new Error('Unreachable')
}
}
// https://encoding.spec.whatwg.org/#interface-textencoder
export class TextEncoder {
  constructor() {
    // Read-only per spec; TextEncoder supports only UTF-8
    define(this, 'encoding', 'utf-8')
  }
  get [Symbol.toStringTag]() {
    return 'TextEncoder'
  }
  // Encodes str to UTF-8 (lossy: unpaired surrogates become U+FFFD), returns a fresh Uint8Array
  encode(str = '') {
    if (typeof str !== 'string') str = `${str}` // ToString, matching DOMString coercion
    const res = utf8fromStringLoose(str)
    // match new Uint8Array (per spec), which is non-pooled
    return res.byteOffset === 0 && res.length === res.buffer.byteLength ? res : res.slice(0)
  }
  // https://encoding.spec.whatwg.org/#dom-textencoder-encodeinto
  // Writes as much of str as fits into target; returns { read } in UTF-16 code units, { written } in bytes
  encodeInto(str, target) {
    if (typeof str !== 'string') str = `${str}`
    if (!isAnyUint8Array(target)) throw new TypeError('Target must be an Uint8Array')
    if (target.buffer.detached) return { read: 0, written: 0 } // Until https://github.com/whatwg/encoding/issues/324 is resolved
    const tlen = target.length
    // UTF-8 byte count is never below the UTF-16 code-unit count, so charcodes past tlen can't fit
    if (tlen < str.length) str = str.slice(0, tlen)
    let u8 = utf8fromStringLoose(str)
    let read
    if (tlen >= u8.length) {
      // Everything (after the optional pre-truncation above) fits
      read = str.length
    } else if (u8.length === str.length) {
      // 1 byte per charcode means pure ASCII, so any cut point is a valid boundary
      if (u8.length > tlen) u8 = u8.subarray(0, tlen) // ascii can be truncated
      read = u8.length
    } else {
      // Overflow with multi-byte sequences: cut at tlen, then drop a partial trailing sequence
      u8 = u8.subarray(0, tlen)
      const unfinished = unfinishedBytes(u8, u8.length, 'utf-8')
      if (unfinished > 0) u8 = u8.subarray(0, u8.length - unfinished)
      // We can do this because loose str -> u8 -> str preserves length, unlike loose u8 -> str -> u8
      // Each unpaired surrogate (1 charcode) is replaced with a single charcode
      read = utf8toStringLoose(u8).length // FIXME: Converting back is very inefficient
    }
    try {
      target.set(u8)
    } catch {
      return { read: 0, written: 0 } // see above, likely detached but no .detached property support
    }
    return { read, written: u8.length }
  }
}
const E_NO_STREAMS = 'TransformStream global not present in the environment'
// https://encoding.spec.whatwg.org/#interface-textdecoderstream
// Thin TransformStream wrapper around a TextDecoder running in streaming mode
export class TextDecoderStream {
  constructor(encoding = 'utf-8', options = {}) {
    if (!globalThis.TransformStream) throw new Error(E_NO_STREAMS)
    const inner = new TextDecoder(encoding, options) // also validates encoding/options
    const pair = new TransformStream({
      transform(chunk, controller) {
        const text = inner.decode(fromSource(chunk), { stream: true })
        if (text) controller.enqueue(text)
      },
      // https://streams.spec.whatwg.org/#dom-transformer-flush
      flush(controller) {
        // Final non-streaming decode flushes any buffered unfinished bytes
        const text = inner.decode()
        if (text) controller.enqueue(text)
        // No need to call .terminate() (Node.js is wrong)
      },
    })
    define(this, 'encoding', inner.encoding)
    define(this, 'fatal', inner.fatal)
    define(this, 'ignoreBOM', inner.ignoreBOM)
    define(this, 'readable', pair.readable)
    define(this, 'writable', pair.writable)
  }
  get [Symbol.toStringTag]() {
    return 'TextDecoderStream'
  }
}
// https://encoding.spec.whatwg.org/#interface-textencoderstream
// Only UTF-8 per spec
export class TextEncoderStream {
  constructor() {
    if (!globalThis.TransformStream) throw new Error(E_NO_STREAMS)
    let pendingLead // trailing high surrogate held back until the next chunk arrives
    const pair = new TransformStream({
      // https://encoding.spec.whatwg.org/#encode-and-enqueue-a-chunk
      // Not identical in code, but reuses loose mode to have identical behavior
      transform(chunk, controller) {
        let text = String(chunk) // DOMString, might contain unpaired surrogates
        if (text.length === 0) return
        if (pendingLead) {
          text = pendingLead + text
          pendingLead = null
        }
        // Empty chunks returned above, so the last charcode is always from the new chunk,
        // never the held-back lead itself
        const tail = text.charCodeAt(text.length - 1)
        if ((tail & 0xfc_00) === 0xd8_00) {
          // Ends in a high surrogate — defer it, the low half may come in the next chunk
          pendingLead = text[text.length - 1]
          text = text.slice(0, -1)
        }
        if (text) controller.enqueue(utf8fromStringLoose(text))
      },
      // https://encoding.spec.whatwg.org/#encode-and-flush
      flush(controller) {
        // A dangling lead surrogate at end of stream encodes as U+FFFD (EF BF BD)
        if (pendingLead) controller.enqueue(Uint8Array.of(0xef, 0xbf, 0xbd))
      },
    })
    define(this, 'encoding', 'utf-8')
    define(this, 'readable', pair.readable)
    define(this, 'writable', pair.writable)
  }
  get [Symbol.toStringTag]() {
    return 'TextEncoderStream'
  }
}
// https://encoding.spec.whatwg.org/#decode
// Warning: encoding sniffed from BOM takes preference over the supplied one
// Warning: lossy, performs replacement, no option of throwing
// Completely ignores encoding and even skips validation when BOM is found
// Unlike TextDecoder public API, additionally supports 'replacement' encoding
export function legacyHookDecode(input, fallbackEncoding = 'utf-8') {
  let u8 = fromSource(input)
  const bomEncoding = getBOMEncoding(u8)
  if (bomEncoding) u8 = u8.subarray(bomEncoding === 'utf-8' ? 3 : 2) // strip the BOM
  const enc = bomEncoding ?? normalizeEncoding(fallbackEncoding) // "the byte order mark is more authoritative than anything else"
  if (enc === 'utf-8') return utf8toStringLoose(u8)
  if (enc === 'utf-16le' || enc === 'utf-16be') {
    let suffix = ''
    if (u8.byteLength % 2 !== 0) {
      // Odd length: decode the complete code units, append a single replacement char for the dangling byte
      suffix = replacementChar
      u8 = u8.subarray(0, -unfinishedBytes(u8, u8.byteLength, enc))
    }
    return utf16toStringLoose(u8, enc === 'utf-16le' ? 'uint8-le' : 'uint8-be') + suffix
  }
  if (!Object.hasOwn(labels, enc)) throw new RangeError(E_ENCODING)
  if (isMultibyte(enc)) {
    if (!createMultibyteDecoder) throw new Error(E_MULTI)
    return createMultibyteDecoder(enc, true)(u8)
  }
  // https://encoding.spec.whatwg.org/#replacement-decoder
  // On non-streaming non-fatal case, it just replaces any non-empty input with a single replacement char
  // Check u8 (the normalized Uint8Array), not the raw input, which may be a fromSource-accepted
  // type without a .byteLength property (undefined > 0 would wrongly yield '').
  // 'replacement' can't be BOM-sniffed, so u8 here is never a BOM-stripped subarray.
  if (enc === 'replacement') return u8.byteLength > 0 ? replacementChar : ''
  return createSinglebyteDecoder(enc, true)(u8)
}