// Reconstructed from the patch that adds
// vanilla/node_modules/parse5/dist/tokenizer/preprocessor.js
// (commit 76cb9c2a39d477a64824a985ade40507e3bbade1,
// "feat(vanilla): add testing infrastructure and tests (NK-wjnczv)").
import { CODE_POINTS as $, getSurrogatePairCodePoint, isControlCodePoint, isSurrogate, isSurrogatePair, isUndefinedCodePoint, } from '../common/unicode.js';
import { ERR } from '../common/error-codes.js';
//Const
// Default value of `bufferWaterline`: once `pos` exceeds it, `dropParsedChunk()`
// is allowed to discard the already-consumed part of the buffer.
const DEFAULT_BUFFER_WATERLINE = 1 << 16;
//Preprocessor
//NOTE: HTML input preprocessing
//(see: http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#preprocessing-the-input-stream)
/**
 * Cursor over a chunked HTML input buffer.
 *
 * It hands out one code point at a time (`advance`), normalises CR and CRLF
 * to a single LF, combines surrogate pairs, tracks line/column/offset and
 * reports problematic code points through `handler.onParseError`.
 */
export class Preprocessor {
    /**
     * @param {object} handler - Object whose optional `onParseError` callback
     *   receives the location objects built by `getError`.
     */
    constructor(handler) {
        this.handler = handler;
        // Buffered input that has not been dropped yet.
        this.html = '';
        // Index into `html` of the current code unit; -1 until the first `advance()`.
        this.pos = -1;
        // NOTE: Initial `lastGapPos` is -2, to ensure `col` on initialisation is 0
        this.lastGapPos = -2;
        // Previous values of `lastGapPos`, restored by `retreat()`.
        this.gapStack = [];
        // Set after a CR so that an immediately following LF is swallowed.
        this.skipNextNewLine = false;
        // Whether the most recent `write()` was flagged as the last chunk.
        this.lastChunkWritten = false;
        // Set when a read ran past the buffer while more chunks may still arrive.
        this.endOfChunkHit = false;
        this.bufferWaterline = DEFAULT_BUFFER_WATERLINE;
        // True when the code point just returned ended a line; the line counter
        // is bumped on the *next* `advance()`.
        this.isEol = false;
        // Buffer index at which the current line starts.
        this.lineStartPos = 0;
        // Number of code units discarded by `dropParsedChunk()` so far.
        this.droppedBufferSize = 0;
        this.line = 1;
        //NOTE: avoid reporting errors twice on advance/retreat
        this.lastErrOffset = -1;
    }
    /** The column on the current line. If we just saw a gap (eg. a surrogate pair), return the index before. */
    get col() {
        return this.pos - this.lineStartPos + Number(this.lastGapPos !== this.pos);
    }
    /** Position of the cursor counted from the start of the whole input, including dropped chunks. */
    get offset() {
        return this.droppedBufferSize + this.pos;
    }
    /**
     * Builds a zero-width error location (start and end are equal) relative
     * to the current cursor.
     *
     * @param {*} code - Error code stored verbatim in the result.
     * @param {number} cpOffset - Delta added to the current column and offset.
     * @returns {{code: *, startLine: number, endLine: number, startCol: number,
     *   endCol: number, startOffset: number, endOffset: number}}
     */
    getError(code, cpOffset) {
        const { line, col, offset } = this;
        const startCol = col + cpOffset;
        const startOffset = offset + cpOffset;
        return {
            code,
            startLine: line,
            endLine: line,
            startCol,
            endCol: startCol,
            startOffset,
            endOffset: startOffset,
        };
    }
    /**
     * Reports `code` at the current position through `handler.onParseError`,
     * at most once per offset (see `lastErrOffset`).
     *
     * @param {*} code - Error code to report.
     */
    _err(code) {
        if (this.handler.onParseError && this.lastErrOffset !== this.offset) {
            this.lastErrOffset = this.offset;
            this.handler.onParseError(this.getError(code, 0));
        }
    }
    /** Marks the current position as a gap, remembering the previous gap so `retreat()` can restore it. */
    _addGap() {
        this.gapStack.push(this.lastGapPos);
        this.lastGapPos = this.pos;
    }
    /**
     * Handles a surrogate code unit found at `pos`.
     *
     * @param {number} cp - The surrogate code unit at the current position.
     * @returns {number} The combined code point when the next code unit
     *   completes a pair, `$.EOF` when the buffer ends here and more chunks
     *   may follow, otherwise `cp` itself (after reporting an error).
     */
    _processSurrogate(cp) {
        //NOTE: try to peek a surrogate pair
        if (this.pos !== this.html.length - 1) {
            const nextCp = this.html.charCodeAt(this.pos + 1);
            if (isSurrogatePair(nextCp)) {
                //NOTE: we have a surrogate pair. Peek pair character and recalculate code point.
                this.pos++;
                //NOTE: add a gap that should be avoided during retreat
                this._addGap();
                return getSurrogatePairCodePoint(cp, nextCp);
            }
        }
        //NOTE: we are at the end of a chunk, therefore we can't infer the surrogate pair yet.
        else if (!this.lastChunkWritten) {
            this.endOfChunkHit = true;
            return $.EOF;
        }
        //NOTE: isolated surrogate
        this._err(ERR.surrogateInInputStream);
        return cp;
    }
    /** @returns {boolean} Whether the cursor has moved past `bufferWaterline`. */
    willDropParsedChunk() {
        return this.pos > this.bufferWaterline;
    }
    /**
     * Discards everything before `pos` once the waterline is exceeded and
     * rebases the bookkeeping (`lineStartPos`, `droppedBufferSize`, gaps) so
     * that `col` and `offset` keep their values.
     */
    dropParsedChunk() {
        if (this.willDropParsedChunk()) {
            this.html = this.html.substring(this.pos);
            this.lineStartPos -= this.pos;
            this.droppedBufferSize += this.pos;
            this.pos = 0;
            this.lastGapPos = -2;
            this.gapStack.length = 0;
        }
    }
    /**
     * Appends `chunk` to the buffer.
     *
     * @param {string} chunk - Text to append.
     * @param {boolean} isLastChunk - Stored as `lastChunkWritten`.
     */
    write(chunk, isLastChunk) {
        if (this.html.length > 0) {
            this.html += chunk;
        }
        else {
            this.html = chunk;
        }
        this.endOfChunkHit = false;
        this.lastChunkWritten = isLastChunk;
    }
    /**
     * Splices `chunk` into the buffer directly after the current position.
     *
     * @param {string} chunk - Text to insert.
     */
    insertHtmlAtCurrentPos(chunk) {
        this.html = this.html.substring(0, this.pos + 1) + chunk + this.html.substring(this.pos + 1);
        this.endOfChunkHit = false;
    }
    /**
     * Tests whether the buffer, starting at the current position, begins
     * with `pattern`. The cursor is not moved.
     *
     * @param {string} pattern - Text to look for.
     * @param {boolean} caseSensitive - When false, each buffered code unit is
     *   OR-ed with 0x20 before comparison, so the pattern only matches if its
     *   own code units already have that bit set (e.g. lower-case ASCII letters).
     * @returns {boolean} `false` as well when the buffer is too short; in that
     *   case `endOfChunkHit` is set unless the last chunk was written.
     */
    startsWith(pattern, caseSensitive) {
        // Check if our buffer has enough characters
        if (this.pos + pattern.length > this.html.length) {
            this.endOfChunkHit = !this.lastChunkWritten;
            return false;
        }
        if (caseSensitive) {
            return this.html.startsWith(pattern, this.pos);
        }
        for (let i = 0; i < pattern.length; i++) {
            const cp = this.html.charCodeAt(this.pos + i) | 0x20;
            if (cp !== pattern.charCodeAt(i)) {
                return false;
            }
        }
        return true;
    }
    /**
     * Reads the code unit `offset` positions away from the cursor without
     * moving it.
     *
     * @param {number} offset - Distance from the current position.
     * @returns {number} The code unit (CR is reported as LF), or `$.EOF` when
     *   the position lies beyond the buffer.
     */
    peek(offset) {
        const pos = this.pos + offset;
        if (pos >= this.html.length) {
            this.endOfChunkHit = !this.lastChunkWritten;
            return $.EOF;
        }
        const code = this.html.charCodeAt(pos);
        return code === $.CARRIAGE_RETURN ? $.LINE_FEED : code;
    }
    /**
     * Moves the cursor forward by one code point and returns it.
     *
     * @returns {number} The next code point, or `$.EOF` when the buffer is
     *   exhausted (or ends in a surrogate that may still be completed by a
     *   later chunk).
     */
    advance() {
        this.pos++;
        //NOTE: LF should be in the last column of the line
        if (this.isEol) {
            this.isEol = false;
            this.line++;
            this.lineStartPos = this.pos;
        }
        if (this.pos >= this.html.length) {
            this.endOfChunkHit = !this.lastChunkWritten;
            return $.EOF;
        }
        let cp = this.html.charCodeAt(this.pos);
        //NOTE: all U+000D CARRIAGE RETURN (CR) characters must be converted to U+000A LINE FEED (LF) characters
        if (cp === $.CARRIAGE_RETURN) {
            this.isEol = true;
            this.skipNextNewLine = true;
            return $.LINE_FEED;
        }
        //NOTE: any U+000A LINE FEED (LF) characters that immediately follow a U+000D CARRIAGE RETURN (CR) character
        //must be ignored.
        if (cp === $.LINE_FEED) {
            this.isEol = true;
            if (this.skipNextNewLine) {
                // `line` will be bumped again in the recursive call.
                this.line--;
                this.skipNextNewLine = false;
                this._addGap();
                return this.advance();
            }
        }
        this.skipNextNewLine = false;
        if (isSurrogate(cp)) {
            cp = this._processSurrogate(cp);
        }
        //OPTIMIZATION: first check if code point is in the common allowed
        //range (ASCII alphanumeric, whitespaces, big chunk of BMP)
        //before going into detailed performance cost validation.
        //NOTE: `== null` also covers handlers that omit `onParseError`
        //altogether; `_err` would discard the result for them anyway.
        const isCommonValidRange = this.handler.onParseError == null ||
            (cp > 0x1f && cp < 0x7f) ||
            cp === $.LINE_FEED ||
            cp === $.CARRIAGE_RETURN ||
            (cp > 0x9f && cp < 64976);
        if (!isCommonValidRange) {
            this._checkForProblematicCharacters(cp);
        }
        return cp;
    }
    /**
     * Reports control characters and noncharacters in the input stream.
     *
     * @param {number} cp - Code point to classify.
     */
    _checkForProblematicCharacters(cp) {
        if (isControlCodePoint(cp)) {
            this._err(ERR.controlCharacterInInputStream);
        }
        else if (isUndefinedCodePoint(cp)) {
            this._err(ERR.noncharacterInInputStream);
        }
    }
    /**
     * Moves the cursor back by `count` code points, stepping over one extra
     * position for every gap recorded by `_addGap()` that is crossed.
     *
     * @param {number} count - Number of code points to go back.
     */
    retreat(count) {
        this.pos -= count;
        while (this.pos < this.lastGapPos) {
            this.lastGapPos = this.gapStack.pop();
            this.pos--;
        }
        this.isEol = false;
    }
}