aboutsummaryrefslogtreecommitdiffstats
path: root/vanilla/node_modules/@exodus/bytes/fallback/utf16.js
blob: a6f906f0bcfde17589e84935a197d752aa5c72ed (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
import { decodeUCS2 } from './latin1.js'
import { assertU8, E_STRING, E_STRICT_UNICODE } from './_utils.js'
import { nativeDecoder, isLE, encodeCharcodes } from './platform.js'

// Message of the TypeError thrown by the decoding paths in this file when strict mode meets ill-formed UTF-16
export const E_STRICT = 'Input is not well-formed utf16'
// Native String.prototype.isWellFormed / toWellFormed, captured once; undefined on engines that lack them.
// Always invoked through .call(str). Wrapped in IIFEs annotated with @__PURE__ — presumably so that bundlers
// can drop them when unused (TODO confirm).
const isWellFormedStr = /* @__PURE__ */ (() => String.prototype.isWellFormed)()
const toWellFormedStr = /* @__PURE__ */ (() => String.prototype.toWellFormed)()

// U+FFFD as a single 16-bit code unit, written in place of unpaired surrogates in loose mode
const replacementCodepoint = 0xff_fd
// Same unit with its two bytes exchanged, used by the byte-swapped encoder
const replacementCodepointSwapped = 0xfd_ff

// Reinterprets the bytes behind a typed array as 16-bit units without copying (the result shares a.buffer)
const to16 = (a) => new Uint16Array(a.buffer, a.byteOffset, a.byteLength / 2) // Requires checked length and alignment!

// Validates the arguments and encodes a string into UTF-16 code units.
//   str    - must be a string, otherwise TypeError(E_STRING)
//   loose  - when truthy, unpaired surrogates are replaced instead of rejected
//   format - 'uint16' returns the Uint16Array itself; 'uint8-le' / 'uint8-be' return a Uint8Array view
//            over the same buffer, with units byte-swapped whenever the requested order differs from isLE
// Throws TypeError('Unknown format') for any other format and TypeError(E_STRICT_UNICODE) for
// ill-formed input in strict mode.
export function encodeApi(str, loose, format) {
  if (typeof str !== 'string') throw new TypeError(E_STRING)

  const wantsBE = format === 'uint8-be'
  const wantsLE = format === 'uint8-le'
  if (!(format === 'uint16' || wantsLE || wantsBE)) throw new TypeError('Unknown format')

  // On v8 and SpiderMonkey, the native isWellFormed check is faster than checking in js
  // On JSC, checking inside the encoding loop is faster than isWellFormed
  // When isWellFormed exists, strict input is validated here up front and the encoding loop skips its own check
  // When isWellFormed is missing, the encoding loop validates in js
  const strict = !loose
  if (strict && isWellFormedStr && !isWellFormedStr.call(str)) throw new TypeError(E_STRICT_UNICODE)

  const shouldSwap = isLE ? wantsBE : wantsLE
  const units = encode(str, loose, strict && isWellFormedStr, shouldSwap)
  if (format === 'uint16') return units

  // Units already have the requested byte order, so exposing them as bytes is just a view cast
  return new Uint8Array(units.buffer, units.byteOffset, units.byteLength)
}

// Shared TextDecoder instances for decodeApiDecoders, one per (byte order, fatal or not) combination.
// They are constructed only when nativeDecoder (from ./platform.js) is truthy and are null otherwise.
// decodeApiDecoders dereferences them unconditionally, so it presumably must only be selected
// when nativeDecoder is set (TODO confirm against the caller).
// ignoreBOM: true asks TextDecoder to keep a leading byte order mark in the output instead of stripping it.
const fatalLE = nativeDecoder ? new TextDecoder('utf-16le', { ignoreBOM: true, fatal: true }) : null
const looseLE = nativeDecoder ? new TextDecoder('utf-16le', { ignoreBOM: true }) : null
const fatalBE = nativeDecoder ? new TextDecoder('utf-16be', { ignoreBOM: true, fatal: true }) : null
const looseBE = nativeDecoder ? new TextDecoder('utf-16be', { ignoreBOM: true }) : null

// Decodes UTF-16 input with one of the shared TextDecoder instances.
//   input  - a Uint16Array for 'uint16'; for 'uint8-le' / 'uint8-be' a value accepted by assertU8
//            with an even byteLength
//   loose  - truthy selects the non-fatal decoder, falsy the fatal one
//   format - 'uint16' | 'uint8-le' | 'uint8-be'; anything else throws TypeError('Unknown format')
// 'uint16' input is decoded as little-endian when isLE is truthy and as big-endian otherwise.
export function decodeApiDecoders(input, loose, format) {
  switch (format) {
    case 'uint16':
      if (!(input instanceof Uint16Array)) throw new TypeError('Expected an Uint16Array')
      break
    case 'uint8-le':
    case 'uint8-be':
      assertU8(input)
      if (input.byteLength % 2 !== 0) throw new TypeError('Expected even number of bytes')
      break
    default:
      throw new TypeError('Unknown format')
  }

  const littleEndian = format === 'uint8-le' || (format === 'uint16' && isLE)

  let decoder
  if (littleEndian) {
    decoder = loose ? looseLE : fatalLE
  } else {
    decoder = loose ? looseBE : fatalBE
  }

  return decoder.decode(input)
}

// Decodes UTF-16 input without TextDecoder.
//   input  - a Uint16Array for 'uint16'; for 'uint8-le' / 'uint8-be' a value accepted by assertU8
//            with an even byteLength
//   loose  - truthy repairs ill-formed input, falsy rejects it with TypeError(E_STRICT)
//   format - 'uint16' | 'uint8-le' | 'uint8-be'; anything else throws TypeError('Unknown format')
// When the matching native string method exists (isWellFormed for strict, toWellFormed for loose),
// the unit-level check inside decode() is skipped and the resulting string is checked / repaired instead.
export function decodeApiJS(input, loose, format) {
  let units
  if (format === 'uint16') {
    if (!(input instanceof Uint16Array)) throw new TypeError('Expected an Uint16Array')
    units = input
  } else if (format === 'uint8-le' || format === 'uint8-be') {
    assertU8(input)
    if (input.byteLength % 2 !== 0) throw new TypeError('Expected even number of bytes')
    units = to16input(input, format === 'uint8-le')
  } else {
    throw new TypeError('Unknown format')
  }

  const handledNatively = (!loose && isWellFormedStr) || (loose && toWellFormedStr)
  const str = decode(units, loose, handledNatively)

  if (loose) {
    return toWellFormedStr ? toWellFormedStr.call(str) : str
  }

  if (isWellFormedStr && !isWellFormedStr.call(str)) throw new TypeError(E_STRICT)
  return str
}

// Converts a byte array holding UTF-16 in the given byte order (le: true = little-endian) into
// 16-bit units in the order indicated by isLE. The caller must guarantee an even number of bytes.
// The input is never modified: it is either viewed in place (matching order and 2-byte aligned offset)
// or copied first (misaligned offset, or bytes that have to be swapped).
export function to16input(u8, le) {
  if (le !== isLE) {
    // Opposite byte order: swap inside a private copy
    return to16(swap16(Uint8Array.from(u8)))
  }

  const aligned = u8.byteOffset % 2 === 0
  return to16(aligned ? u8 : Uint8Array.from(u8))
}

// Turns 16-bit code units into a string.
//   checked - truthy means validity is handled elsewhere: units are decoded as-is, without inspection
//   loose   - for unchecked ill-formed input: truthy decodes a repaired copy, falsy throws TypeError(E_STRICT)
// The caller's array is never modified.
export const decode = (u16, loose = false, checked = false) => {
  const usableAsIs = checked || isWellFormed(u16)
  if (usableAsIs) return decodeUCS2(u16)

  if (loose) {
    const repaired = toWellFormed(Uint16Array.from(u16)) // toWellFormed writes in place, so give it a clone
    return decodeUCS2(repaired)
  }

  throw new TypeError(E_STRICT)
}

// Encodes a string into a new Uint16Array with one element per UTF-16 code unit of str.
//   checked - truthy when str is already known to be well-formed, which selects the loops without validation
//   swapped - truthy to store every unit with its two bytes exchanged
//   loose   - only used by the validating loops: replace unpaired surrogates instead of throwing
export function encode(str, loose = false, checked = false, swapped = false) {
  const out = new Uint16Array(str.length)

  if (swapped) {
    return checked ? encodeCheckedSwapped(str, out) : encodeUncheckedSwapped(str, out, loose)
  }

  return checked ? encodeChecked(str, out) : encodeUnchecked(str, out, loose)
}

/* eslint-disable @exodus/mutable/no-param-reassign-prop-only */

// Exchanges the two bytes of every 16-bit pair, in place, and returns the same array.
// Expects an even length; with an odd length the last byte is left where it is.
function swap16(u8) {
  const size = u8.length
  let pos = 0

  // Two pairs per iteration while at least four bytes remain
  const quadLimit = size - 3
  while (pos < quadLimit) {
    const b0 = u8[pos]
    const b1 = u8[pos + 1]
    const b2 = u8[pos + 2]
    const b3 = u8[pos + 3]
    u8[pos] = b1
    u8[pos + 1] = b0
    u8[pos + 2] = b3
    u8[pos + 3] = b2
    pos += 4
  }

  // At most one complete pair can be left over
  const pairLimit = size - 1
  while (pos < pairLimit) {
    const first = u8[pos]
    u8[pos] = u8[pos + 1]
    u8[pos + 1] = first
    pos += 2
  }

  return u8
}

// Splitting paths into small functions helps (at least on SpiderMonkey)

// Pre-validated input, native unit order: delegates to encodeCharcodes (same as encodeLatin1, but with Uint16Array)
const encodeChecked = (str, arr) => {
  return encodeCharcodes(str, arr)
}

// Pre-validated input, opposite unit order: stores each code unit of str into arr with its
// low and high bytes exchanged, then returns arr. No surrogate validation happens here.
function encodeCheckedSwapped(str, arr) {
  // TODO: faster path for Hermes? See encodeCharcodes
  const total = str.length
  let pos = 0
  while (pos < total) {
    const unit = str.charCodeAt(pos)
    const lowToHigh = (unit << 8) & 0xff_00
    const highToLow = unit >>> 8
    arr[pos] = lowToHigh | highToLow
    pos++
  }

  return arr
}

// lead: d800 - dbff, trail: dc00 - dfff

// Validating encoder, native unit order. Copies the code units of str into arr and returns arr.
// An unpaired surrogate (a trail without a lead, a lead at the end of input, or a lead that is not
// followed by a trail) throws TypeError(E_STRICT_UNICODE), or is replaced when loose is truthy.
function encodeUnchecked(str, arr, loose = false) {
  // TODO: faster path for Hermes? See encodeCharcodes
  const total = str.length
  for (let pos = 0; pos < total; pos++) {
    const unit = str.charCodeAt(pos)
    arr[pos] = unit
    if (unit < 0xd8_00 || unit >= 0xe0_00) continue // not a surrogate

    // Only a lead that has a successor can begin a pair; -1 stands for "no candidate trail"
    const canLead = unit <= 0xdb_ff && pos + 1 < total
    const trail = canLead ? str.charCodeAt(pos + 1) : -1
    if (trail >= 0xdc_00 && trail < 0xe0_00) {
      arr[++pos] = trail // valid pair: store the trail right away and step over it
      continue
    }

    if (!loose) throw new TypeError(E_STRICT_UNICODE)
    arr[pos] = replacementCodepoint
  }

  return arr
}

// Validating encoder, opposite unit order. Same rules as the non-swapped validating encoder, except that
// every stored unit (including the replacement) has its two bytes exchanged.
// An unpaired surrogate throws TypeError(E_STRICT_UNICODE), or is replaced when loose is truthy.
function encodeUncheckedSwapped(str, arr, loose = false) {
  // TODO: faster path for Hermes? See encodeCharcodes
  const total = str.length
  for (let pos = 0; pos < total; pos++) {
    const unit = str.charCodeAt(pos)
    arr[pos] = ((unit << 8) & 0xff_00) | (unit >>> 8)
    if (unit < 0xd8_00 || unit >= 0xe0_00) continue // not a surrogate

    // Only a lead that has a successor can begin a pair; -1 stands for "no candidate trail"
    const canLead = unit <= 0xdb_ff && pos + 1 < total
    const trail = canLead ? str.charCodeAt(pos + 1) : -1
    if (trail >= 0xdc_00 && trail < 0xe0_00) {
      arr[++pos] = ((trail << 8) & 0xff_00) | (trail >>> 8) // valid pair: store the trail and step over it
      continue
    }

    if (!loose) throw new TypeError(E_STRICT_UNICODE)
    arr[pos] = replacementCodepointSwapped
  }

  return arr
}

// Only needed on Hermes, everything else has native impl
// Overwrites every unpaired surrogate in u16 with the replacement unit, in place, and returns u16.
// Unpaired means: a trail without a preceding lead, a lead at the very end, or a lead not followed by a trail.
export function toWellFormed(u16) {
  const total = u16.length
  let pos = 0
  while (pos < total) {
    const unit = u16[pos]
    if (unit >= 0xd8_00 && unit < 0xe0_00) {
      // Only a lead that has a successor can begin a pair; -1 stands for "no candidate trail"
      const canLead = unit <= 0xdb_ff && pos + 1 < total
      const trail = canLead ? u16[pos + 1] : -1
      if (trail >= 0xdc_00 && trail < 0xe0_00) {
        pos++ // valid pair: step over the trail as well
      } else {
        u16[pos] = replacementCodepoint
      }
    }

    pos++
  }

  return u16
}

// Only needed on Hermes, everything else has native impl
// Returns true when u16 contains no unpaired surrogates, false otherwise. Does not modify u16.
// Works in three stages that share the index i: an optional u32 scan, a 4-at-a-time u16 scan,
// and a final unit-by-unit validation starting at wherever the scans stopped.
export function isWellFormed(u16) {
  const length = u16.length
  let i = 0

  // m selects bit 15 of both 16-bit halves of a u32, i.e. it is non-zero for any unit >= 0x8000
  // (a superset of surrogates); [l, h) is the surrogate range
  const m = 0x80_00_80_00
  const l = 0xd8_00
  const h = 0xe0_00

  // Speedup with u32, by skipping to the first surrogate
  // Only implemented for aligned input for now, but almost all input is aligned (pooled Buffer or 0 offset)
  if (length > 32 && u16.byteOffset % 4 === 0) {
    const u32length = (u16.byteLength / 4) | 0 // a trailing odd unit is left for the u16 loops below
    const u32 = new Uint32Array(u16.buffer, u16.byteOffset, u32length)
    // Here i counts u32 words, four words per iteration
    for (const last3 = u32length - 3; ; i += 4) {
      if (i >= last3) break // loop is fast enough for moving this here to be _very_ useful, likely due to array access checks
      const a = u32[i]
      const b = u32[i + 1]
      const c = u32[i + 2]
      const d = u32[i + 3]
      if (a & m || b & m || c & m || d & m) break // bitwise OR does not make this faster on Hermes
    }

    // Narrow down to the exact word that has a high bit set (or run to the end of the u32 view)
    for (; i < u32length; i++) if (u32[i] & m) break
    i *= 2 // convert the word index back to a u16 index
  }

  // An extra loop gives ~30-40% speedup e.g. on English text without surrogates but with other symbols above 0x80_00
  // Stops, without advancing, at the first group of four units that contains a surrogate
  for (const last3 = length - 3; ; i += 4) {
    if (i >= last3) break
    const a = u16[i]
    const b = u16[i + 1]
    const c = u16[i + 2]
    const d = u16[i + 3]
    if ((a >= l && a < h) || (b >= l && b < h) || (c >= l && c < h) || (d >= l && d < h)) break
  }

  // Full validation of whatever remains
  for (; i < length; i++) {
    const code = u16[i]
    if (code >= l && code < h) {
      // An unexpected trail or a lead at the very end of input
      if (code >= 0xdc_00 || i + 1 >= length) return false
      i++ // consume next
      const next = u16[i] // Process valid pairs immediately
      if (next < 0xdc_00 || next >= h) return false // a lead must be followed by a trail
    }
  }

  return true
}