1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
|
import { E_STRICT_UNICODE } from './_utils.js'
import { isHermes } from './platform.js'
import { asciiPrefix, decodeLatin1, encodeAsciiPrefix } from './latin1.js'
// Message of the TypeError thrown by the decoders below when input is malformed and `loose` is falsy
export const E_STRICT = 'Input is not well-formed utf8'
// U+FFFD REPLACEMENT CHARACTER: the char code written in place of each malformed sequence in loose mode
const replacementPoint = 0xff_fd
const shouldUseEscapePath = isHermes // faster only on Hermes, js path beats it on normal engines
// Captured once from globalThis; decodeFast() checks both for truthiness before taking the escape path
const { decodeURIComponent, escape } = globalThis
/**
 * Decodes UTF-8 bytes to a string, trying the cheap routes first.
 *
 * Order of attempts:
 *  1. the leading ASCII run is decoded directly; if it covers the whole input we are done;
 *  2. when the escape path is enabled (Hermes) and both globals exist, the remainder goes through
 *     escape() + decodeURIComponent();
 *  3. otherwise, or when step 2 rejects the input in loose mode, the manual decode() handles the remainder.
 *
 * @param {ArrayLike<number>} arr - bytes to decode
 * @param {boolean} loose - when falsy, malformed input raises a TypeError instead of being repaired
 * @returns {string} the decoded text
 * @throws {TypeError} with message E_STRICT when the escape path rejects the input and `loose` is falsy
 */
export function decodeFast(arr, loose) {
  // No native decoder to lean on here, so the ASCII run is decoded as Latin1
  const asciiLength = asciiPrefix(arr)
  const head = decodeLatin1(arr, 0, asciiLength)
  if (head.length === arr.length) return head

  const escapePathAvailable = shouldUseEscapePath && escape && decodeURIComponent
  if (escapePathAvailable) {
    // Roughly a 3x win on Hermes: percent-escape the Latin1 view of the bytes, then let the engine
    // interpret those escapes as UTF-8
    const tail = decodeLatin1(arr, head.length, arr.length)
    const escaped = escape(tail)
    try {
      return head + decodeURIComponent(escaped)
    } catch {
      if (!loose) throw new TypeError(E_STRICT)
      // Loose mode needs replacement characters, which only the manual decoder below produces
    }
  }

  return head + decode(arr, loose, head.length)
}
// https://encoding.spec.whatwg.org/#utf-8-decoder
// Most likely called in loose mode: when decodeFast() takes the escape/decodeURIComponent path, strict input is fully handled there
/**
 * Manual UTF-8 decoder producing a JS string from bytes.
 *
 * Char codes are collected into a scratch array and flushed through String.fromCharCode in chunks,
 * so the argument list passed to apply() stays small.
 *
 * @param {ArrayLike<number>} arr - bytes to decode; only `length` and indexed reads are used
 * @param {boolean} loose - truthy: every malformed sequence yields one U+FFFD; falsy: malformed input throws
 * @param {number} [start=0] - index of the first byte to decode (coerced with `| 0`)
 * @returns {string} the decoded text
 * @throws {TypeError} with message E_STRICT on malformed input when `loose` is falsy
 */
export function decode(arr, loose, start = 0) {
  const CHUNK = 0x2_00 // flush threshold, far below MAX_ARGUMENTS_LENGTH in npmjs.com/buffer
  const first = start | 0
  const end = arr.length
  // One slot more than CHUNK: the code point that crosses the threshold may need two char codes
  const units = new Array(Math.min(end - first, CHUNK + 1)).fill(0)
  let text = ''
  let count = 0 // number of char codes currently buffered in `units`
  let pos = first // index of the lead byte being examined

  while (pos < end) {
    if (count >= CHUNK) {
      units.length = count // CHUNK, or CHUNK + 1 when the last code point was a surrogate pair
      text += String.fromCharCode.apply(String, units)
      if (units.length <= CHUNK) units.push(0) // hand the spare slot back
      count = 0
    }

    const lead = arr[pos]
    if (lead < 0x80) {
      // Plain ascii; long ascii runs are handled by decodeFast(), so no unrolling here
      units[count++] = lead
      pos += 1
    } else if (lead < 0xc2) {
      // Stray continuation byte, or 0xc0 / 0xc1 which can only begin an overlong form
      if (!loose) throw new TypeError(E_STRICT)
      units[count++] = replacementPoint
      pos += 1
    } else if (lead < 0xe0) {
      // Two-byte sequence: lead + 1 continuation
      if (pos + 1 >= end) {
        if (!loose) throw new TypeError(E_STRICT)
        units[count++] = replacementPoint
        break
      }
      const c1 = arr[pos + 1]
      if (c1 < 0x80 || c1 > 0xbf) {
        if (!loose) throw new TypeError(E_STRICT)
        units[count++] = replacementPoint
        pos += 1 // the offending byte is re-examined as a lead
      } else {
        units[count++] = ((lead & 0x1f) << 6) | (c1 & 0x3f)
        pos += 2
      }
    } else if (lead < 0xf0) {
      // Three-byte sequence: lead + 2 continuations
      if (pos + 1 >= end) {
        if (!loose) throw new TypeError(E_STRICT)
        units[count++] = replacementPoint
        break
      }
      // The first continuation has a narrowed range: 0xe0 excludes overlong forms, 0xed excludes surrogates
      const min1 = lead === 0xe0 ? 0xa0 : 0x80
      const max1 = lead === 0xed ? 0x9f : 0xbf
      const c1 = arr[pos + 1]
      if (c1 < min1 || c1 > max1) {
        if (!loose) throw new TypeError(E_STRICT)
        units[count++] = replacementPoint
        pos += 1
        continue
      }
      if (pos + 2 >= end) {
        if (!loose) throw new TypeError(E_STRICT)
        units[count++] = replacementPoint
        break
      }
      const c2 = arr[pos + 2]
      if (c2 < 0x80 || c2 > 0xbf) {
        if (!loose) throw new TypeError(E_STRICT)
        units[count++] = replacementPoint
        pos += 2 // lead and c1 are consumed, c2 is re-examined as a lead
        continue
      }
      units[count++] = ((lead & 0xf) << 12) | ((c1 & 0x3f) << 6) | (c2 & 0x3f)
      pos += 3
    } else if (lead <= 0xf4) {
      // Four-byte sequence: lead + 3 continuations
      if (pos + 1 >= end) {
        if (!loose) throw new TypeError(E_STRICT)
        units[count++] = replacementPoint
        break
      }
      // 0xf0 excludes overlong forms, 0xf4 caps the result at U+10FFFF
      const min1 = lead === 0xf0 ? 0x90 : 0x80
      const max1 = lead === 0xf4 ? 0x8f : 0xbf
      const c1 = arr[pos + 1]
      if (c1 < min1 || c1 > max1) {
        if (!loose) throw new TypeError(E_STRICT)
        units[count++] = replacementPoint
        pos += 1
        continue
      }
      if (pos + 2 >= end) {
        if (!loose) throw new TypeError(E_STRICT)
        units[count++] = replacementPoint
        break
      }
      const c2 = arr[pos + 2]
      if (c2 < 0x80 || c2 > 0xbf) {
        if (!loose) throw new TypeError(E_STRICT)
        units[count++] = replacementPoint
        pos += 2
        continue
      }
      if (pos + 3 >= end) {
        if (!loose) throw new TypeError(E_STRICT)
        units[count++] = replacementPoint
        break
      }
      const c3 = arr[pos + 3]
      if (c3 < 0x80 || c3 > 0xbf) {
        if (!loose) throw new TypeError(E_STRICT)
        units[count++] = replacementPoint
        pos += 3
        continue
      }
      const codePoint =
        ((lead & 0xf) << 18) | ((c1 & 0x3f) << 12) | ((c2 & 0x3f) << 6) | (c3 & 0x3f)
      if (codePoint > 0xff_ff) {
        // Emit a surrogate pair by hand: String.fromCharCode is faster than String.fromCodePoint
        const offset = codePoint - 0x1_00_00
        units[count++] = 0xd8_00 + ((offset >> 10) & 0x3_ff)
        units[count++] = 0xdc_00 + (offset & 0x3_ff)
      } else {
        units[count++] = codePoint
      }
      pos += 4
      // eslint-disable-next-line sonarjs/no-duplicated-branches
    } else {
      // Anything else can never start a sequence
      if (!loose) throw new TypeError(E_STRICT)
      units[count++] = replacementPoint
      pos += 1
    }
  }

  if (count === 0) return text
  units.length = count
  return text + String.fromCharCode.apply(String, units)
}
/**
 * Encodes a JS string (a sequence of UTF-16 char codes) as UTF-8 bytes.
 *
 * The output starts as one byte per char code, which is exact for pure-ascii input. On the first
 * non-ascii char code it is swapped once for a buffer big enough for the worst case, and trimmed
 * at the end if needed.
 *
 * @param {string} string - text to encode
 * @param {boolean} loose - truthy: each unpaired surrogate is encoded as U+FFFD (EF BF BD);
 *   falsy: an unpaired surrogate throws
 * @returns {Uint8Array} the UTF-8 bytes
 * @throws {TypeError} with message E_STRICT_UNICODE on an unpaired surrogate when `loose` is falsy
 */
export function encode(string, loose) {
  const length = string.length
  let out = new Uint8Array(length) // optimistic: assume ascii
  let grown = false
  // NOTE(review): encodeAsciiPrefix lives in ./latin1.js; its result is used below as both the read
  // and the write position, so it presumably copies the leading ascii run into `out` and returns its length
  let read = encodeAsciiPrefix(out, string)
  let write = read

  while (read < length) {
    let unit = string.charCodeAt(read)

    if (unit < 0x80) {
      // Copy the whole ascii run in a tight loop
      do {
        out[write++] = unit
        read += 1
        if (read >= length) break
        unit = string.charCodeAt(read)
      } while (unit < 0x80)
      if (read >= length) break
      // From here on `unit` is the first non-ascii char code after the run
    }

    if (!grown) {
      // TODO: use resizable array buffers? will have to return a non-resizeable one
      // Only ascii has been written so far, hence one byte per char code and write === read
      if (write !== read) /* c8 ignore next */ throw new Error('Unreachable')
      // Each remaining char code needs at most 3 bytes
      const wide = new Uint8Array(write + (length - read) * 3)
      wide.set(out)
      out = wide
      grown = true
    }

    // Surrogates: lead d800 - dbff, trail dc00 - dfff
    // codePoint = 0x1_00_00 + ((lead - d800) << 10) + (trail - dc00)
    if (unit >= 0xd8_00 && unit < 0xe0_00) {
      // Only a lead with a following char code can open a pair; 0 stands for "no trail candidate"
      const trail = (unit <= 0xdb_ff && read + 1 < length) ? string.charCodeAt(read + 1) : 0
      if (trail >= 0xdc_00 && trail < 0xe0_00) {
        // Valid pair: always a code point in 0x1_00_00..0x10_ff_ff, which takes 4 bytes
        const codePoint = (((unit - 0xd8_00) << 10) | (trail - 0xdc_00)) + 0x1_00_00
        out[write++] = (codePoint >> 18) | 0xf0
        out[write++] = ((codePoint >> 12) & 0x3f) | 0x80
        out[write++] = ((codePoint >> 6) & 0x3f) | 0x80
        out[write++] = (codePoint & 0x3f) | 0x80
        read += 2 // both halves consumed
      } else {
        // Unexpected trail, a lead at the very end, or a lead not followed by a trail;
        // whatever follows is left unconsumed
        if (!loose) throw new TypeError(E_STRICT_UNICODE)
        out[write++] = 0xef
        out[write++] = 0xbf
        out[write++] = 0xbd
        read += 1
      }
      continue
    }

    // A lone non-surrogate char code above ascii takes 2 or 3 bytes
    if (unit < 0x8_00) {
      out[write++] = (unit >> 6) | 0xc0
      out[write++] = (unit & 0x3f) | 0x80
    } else {
      out[write++] = (unit >> 12) | 0xe0
      out[write++] = ((unit >> 6) & 0x3f) | 0x80
      out[write++] = (unit & 0x3f) | 0x80
    }
    read += 1
  }

  return write === out.length ? out : out.slice(0, write)
}
|