aboutsummaryrefslogtreecommitdiffstats
path: root/vanilla/node_modules/@exodus/bytes/fallback/utf8.js
blob: d1467bc226a6ea73db172fd77ffc6c5223f219f0 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
import { E_STRICT_UNICODE } from './_utils.js'
import { isHermes } from './platform.js'
import { asciiPrefix, decodeLatin1, encodeAsciiPrefix } from './latin1.js'

// Message of the TypeError thrown by the decoders in this file when strict decoding hits malformed input
export const E_STRICT = 'Input is not well-formed utf8'

// U+FFFD, written by the loose decoder in place of each malformed sequence
const replacementPoint = 0xff_fd
const shouldUseEscapePath = isHermes // faster only on Hermes, js path beats it on normal engines
// Picked up from globalThis once; decodeFast() checks that both are truthy before relying on them
const { decodeURIComponent, escape } = globalThis

// Decodes utf8 bytes into a string
// The leading ascii run is decoded as latin1, the remainder goes through escape + decodeURIComponent
// where that path is enabled, and through decode() otherwise or when that path fails in loose mode
// On the escape path, malformed input throws TypeError(E_STRICT) unless `loose` is set
export function decodeFast(arr, loose) {
  // No native decoder to use, so decodeAscii is useless here and the ascii prefix is decoded as latin1
  // This is faster than all alternatives below
  const head = decodeLatin1(arr, 0, asciiPrefix(arr))
  const consumed = head.length
  if (consumed === arr.length) return head // everything was ascii

  // This codepath gives a ~3x perf boost on Hermes
  const viaEscape = shouldUseEscapePath && escape && decodeURIComponent
  if (viaEscape) {
    // Bytes viewed as latin1 chars get percent-escaped, decodeURIComponent then reads those as utf8
    const escaped = escape(decodeLatin1(arr, consumed, arr.length))
    try {
      return head + decodeURIComponent(escaped)
    } catch {
      // Strict callers get the error, loose ones fall through to the manual decoder
      if (!loose) throw new TypeError(E_STRICT)
    }
  }

  return head + decode(arr, loose, consumed)
}

// Manual utf8 decoder, see https://encoding.spec.whatwg.org/#utf-8-decoder
// Strict mode (`loose` is falsy) throws TypeError(E_STRICT) on the first malformed sequence
// Loose mode writes U+FFFD for it and resumes at the byte which broke the sequence
// Decoding begins at `start`, decodeFast() uses that to skip the ascii prefix it has already handled
export function decode(arr, loose, start = 0) {
  start |= 0
  const end = arr.length
  const chunkSize = 0x2_00 // far below MAX_ARGUMENTS_LENGTH in npmjs.com/buffer, we use smaller chunks
  // One slot above chunkSize, as the code point filling up a chunk can take two char codes
  const tmp = new Array(Math.min(end - start, chunkSize + 1)).fill(0)
  let out = ''
  let ti = 0
  let pos = start // index of the next unread byte

  scan: while (pos < end) {
    if (ti >= chunkSize) {
      // Flush the collected char codes, ti is either chunkSize or chunkSize + 1 here
      tmp.length = ti
      out += String.fromCharCode.apply(String, tmp)
      if (tmp.length <= chunkSize) tmp.push(0) // bring the spare slot back
      ti = 0
    }

    const lead = arr[pos++]
    if (lead < 0x80) {
      // The long ascii runs are handled in decodeFast(), so no unrolling here
      tmp[ti++] = lead
      continue
    }

    // How many continuation bytes follow, which range the first of them must be in, and the lead payload
    let needed = 0
    let lower = 0x80
    let upper = 0xbf
    let codePoint = 0
    if (lead >= 0xc2 && lead < 0xe0) {
      needed = 1
      codePoint = lead & 0x1f
    } else if (lead >= 0xe0 && lead < 0xf0) {
      needed = 2
      if (lead === 0xe0) lower = 0xa0
      if (lead === 0xed) upper = 0x9f
      codePoint = lead & 0xf
    } else if (lead >= 0xf0 && lead <= 0xf4) {
      needed = 3
      if (lead === 0xf0) lower = 0x90
      if (lead === 0xf4) upper = 0x8f
      codePoint = lead & 0xf
    }

    if (needed === 0) {
      // Anything else can't start a sequence
      if (!loose) throw new TypeError(E_STRICT)
      tmp[ti++] = replacementPoint
      continue
    }

    while (needed > 0) {
      if (pos >= end) {
        // Input ended in the middle of a sequence
        if (!loose) throw new TypeError(E_STRICT)
        tmp[ti++] = replacementPoint
        break scan
      }

      const next = arr[pos]
      if (next < lower || next > upper) {
        // pos is left pointing at the offending byte, so that it is decoded on its own next
        if (!loose) throw new TypeError(E_STRICT)
        tmp[ti++] = replacementPoint
        continue scan
      }

      pos++
      codePoint = (codePoint << 6) | (next & 0x3f)
      // The narrowed range applies only to the first continuation byte
      lower = 0x80
      upper = 0xbf
      needed--
    }

    if (codePoint > 0xff_ff) {
      // split into char codes as String.fromCharCode is faster than String.fromCodePoint
      const u = codePoint - 0x1_00_00
      tmp[ti++] = 0xd8_00 + ((u >> 10) & 0x3_ff)
      tmp[ti++] = 0xdc_00 + (u & 0x3_ff)
    } else {
      tmp[ti++] = codePoint
    }
  }

  if (ti === 0) return out
  tmp.length = ti
  return out + String.fromCharCode.apply(String, tmp)
}

// Encodes a string into utf8 bytes, returned as a Uint8Array
// Unpaired surrogates throw TypeError(E_STRICT_UNICODE), unless `loose` is set,
// in which case each of them is encoded as U+FFFD (ef bf bd)
export function encode(string, loose) {
  const length = string.length
  let bytes = new Uint8Array(length) // sized for pure ascii, replaced on the first non-ascii char code
  let grown = false

  // The result of encodeAsciiPrefix is used as both the read and the write position
  let at = encodeAsciiPrefix(bytes, string)
  let w = at
  for (; at < length; at++) {
    let unit = string.charCodeAt(at)

    // Copy an ascii run as is
    while (unit < 0x80) {
      bytes[w++] = unit
      at++
      if (at >= length) break
      unit = string.charCodeAt(at)
    }

    if (at >= length) break
    // From here on, unit is present and >= 0x80

    if (!grown) {
      // TODO: use resizable array buffers? will have to return a non-resizeable one
      // Nothing has been expanded yet, so the read and the write positions must still agree
      if (w !== at) /* c8 ignore next */ throw new Error('Unreachable')
      const wide = new Uint8Array(w + (length - at) * 3) // a char code takes 3 bytes at most
      wide.set(bytes)
      bytes = wide
      grown = true
    }

    // Surrogates: lead is d800 - dbff, trail is dc00 - dfff
    // For char codes [d800 + a, dc00 + b] the code point is 0x1_00_00 + ((a << 10) | b)
    if (unit >= 0xd8_00 && unit < 0xe0_00) {
      // Only a lead which is not the last char code can be part of a pair
      const mayPair = unit <= 0xdb_ff && at + 1 < length
      const next = mayPair ? string.charCodeAt(at + 1) : 0
      if (next >= 0xdc_00 && next < 0xe0_00) {
        // A valid pair is between 0x1_00_00 and 0x11_00_00 and takes 4 bytes
        const codePoint = (((unit - 0xd8_00) << 10) | (next - 0xdc_00)) + 0x1_00_00
        bytes[w++] = (codePoint >> 18) | 0xf0
        bytes[w++] = ((codePoint >> 12) & 0x3f) | 0x80
        bytes[w++] = ((codePoint >> 6) & 0x3f) | 0x80
        bytes[w++] = (codePoint & 0x3f) | 0x80
        at++ // the trail is consumed as well
      } else {
        // Unexpected trail, lead at the very end, or lead not followed by a trail (which stays unconsumed)
        if (!loose) throw new TypeError(E_STRICT_UNICODE)
        bytes[w++] = 0xef
        bytes[w++] = 0xbf
        bytes[w++] = 0xbd
      }

      continue
    }

    // A non-surrogate char code above ascii takes 2 or 3 bytes
    if (unit >= 0x8_00) {
      bytes[w++] = (unit >> 12) | 0xe0
      bytes[w++] = ((unit >> 6) & 0x3f) | 0x80
      bytes[w++] = (unit & 0x3f) | 0x80
    } else {
      bytes[w++] = (unit >> 6) | 0xc0
      bytes[w++] = (unit & 0x3f) | 0x80
    }
  }

  // Cut off the unused tail of the buffer, if there is any
  if (bytes.length === w) return bytes
  return bytes.slice(0, w)
}