diff options
| author | Adam Mathes <adam@adammathes.com> | 2026-02-13 21:34:48 -0800 |
|---|---|---|
| committer | Adam Mathes <adam@adammathes.com> | 2026-02-13 21:34:48 -0800 |
| commit | 76cb9c2a39d477a64824a985ade40507e3bbade1 (patch) | |
| tree | 41e997aa9c6f538d3a136af61dae9424db2005a9 /vanilla/node_modules/css-tree/lib/tokenizer | |
| parent | 819a39a21ac992b1393244a4c283bbb125208c69 (diff) | |
| download | neko-76cb9c2a39d477a64824a985ade40507e3bbade1.tar.gz neko-76cb9c2a39d477a64824a985ade40507e3bbade1.tar.bz2 neko-76cb9c2a39d477a64824a985ade40507e3bbade1.zip | |
feat(vanilla): add testing infrastructure and tests (NK-wjnczv)
Diffstat (limited to 'vanilla/node_modules/css-tree/lib/tokenizer')
8 files changed, 1447 insertions, 0 deletions
diff --git a/vanilla/node_modules/css-tree/lib/tokenizer/OffsetToLocation.js b/vanilla/node_modules/css-tree/lib/tokenizer/OffsetToLocation.js new file mode 100644 index 0000000..cc584c0 --- /dev/null +++ b/vanilla/node_modules/css-tree/lib/tokenizer/OffsetToLocation.js @@ -0,0 +1,87 @@ +import { adoptBuffer } from './adopt-buffer.js'; +import { isBOM } from './char-code-definitions.js'; + +const N = 10; +const F = 12; +const R = 13; + +function computeLinesAndColumns(host) { + const source = host.source; + const sourceLength = source.length; + const startOffset = source.length > 0 ? isBOM(source.charCodeAt(0)) : 0; + const lines = adoptBuffer(host.lines, sourceLength); + const columns = adoptBuffer(host.columns, sourceLength); + let line = host.startLine; + let column = host.startColumn; + + for (let i = startOffset; i < sourceLength; i++) { + const code = source.charCodeAt(i); + + lines[i] = line; + columns[i] = column++; + + if (code === N || code === R || code === F) { + if (code === R && i + 1 < sourceLength && source.charCodeAt(i + 1) === N) { + i++; + lines[i] = line; + columns[i] = column; + } + + line++; + column = 1; + } + } + + lines[sourceLength] = line; + columns[sourceLength] = column; + + host.lines = lines; + host.columns = columns; + host.computed = true; +} + +export class OffsetToLocation { + constructor(source, startOffset, startLine, startColumn) { + this.setSource(source, startOffset, startLine, startColumn); + this.lines = null; + this.columns = null; + } + setSource(source = '', startOffset = 0, startLine = 1, startColumn = 1) { + this.source = source; + this.startOffset = startOffset; + this.startLine = startLine; + this.startColumn = startColumn; + this.computed = false; + } + getLocation(offset, filename) { + if (!this.computed) { + computeLinesAndColumns(this); + } + + return { + source: filename, + offset: this.startOffset + offset, + line: this.lines[offset], + column: this.columns[offset] + }; + } + getLocationRange(start, end, 
filename) { + if (!this.computed) { + computeLinesAndColumns(this); + } + + return { + source: filename, + start: { + offset: this.startOffset + start, + line: this.lines[start], + column: this.columns[start] + }, + end: { + offset: this.startOffset + end, + line: this.lines[end], + column: this.columns[end] + } + }; + } +}; diff --git a/vanilla/node_modules/css-tree/lib/tokenizer/TokenStream.js b/vanilla/node_modules/css-tree/lib/tokenizer/TokenStream.js new file mode 100644 index 0000000..96d48b7 --- /dev/null +++ b/vanilla/node_modules/css-tree/lib/tokenizer/TokenStream.js @@ -0,0 +1,316 @@ +import { adoptBuffer } from './adopt-buffer.js'; +import { cmpStr } from './utils.js'; +import tokenNames from './names.js'; +import { + WhiteSpace, + Comment, + Delim, + EOF, + Function as FunctionToken, + LeftParenthesis, + RightParenthesis, + LeftSquareBracket, + RightSquareBracket, + LeftCurlyBracket, + RightCurlyBracket +} from './types.js'; + +const OFFSET_MASK = 0x00FFFFFF; +const TYPE_SHIFT = 24; +const balancePair = new Uint8Array(32); // 32b of memory ought to be enough for anyone (any number of tokens) +balancePair[FunctionToken] = RightParenthesis; +balancePair[LeftParenthesis] = RightParenthesis; +balancePair[LeftSquareBracket] = RightSquareBracket; +balancePair[LeftCurlyBracket] = RightCurlyBracket; + +function isBlockOpenerToken(tokenType) { + return balancePair[tokenType] !== 0; +} + +export class TokenStream { + constructor(source, tokenize) { + this.setSource(source, tokenize); + } + reset() { + this.eof = false; + this.tokenIndex = -1; + this.tokenType = 0; + this.tokenStart = this.firstCharOffset; + this.tokenEnd = this.firstCharOffset; + } + setSource(source = '', tokenize = () => {}) { + source = String(source || ''); + + const sourceLength = source.length; + const offsetAndType = adoptBuffer(this.offsetAndType, source.length + 1); // +1 because of eof-token + const balance = adoptBuffer(this.balance, source.length + 1); + let tokenCount = 0; + let 
firstCharOffset = -1; + let balanceCloseType = 0; + let balanceStart = source.length; + + // capture buffers + this.offsetAndType = null; + this.balance = null; + balance.fill(0); + + tokenize(source, (type, start, end) => { + const index = tokenCount++; + + // type & offset + offsetAndType[index] = (type << TYPE_SHIFT) | end; + + if (firstCharOffset === -1) { + firstCharOffset = start; + } + + // balance + balance[index] = balanceStart; + + if (type === balanceCloseType) { + const prevBalanceStart = balance[balanceStart]; + + // set reference to balance end for a block opener + balance[balanceStart] = index; + + // pop state + balanceStart = prevBalanceStart; + balanceCloseType = balancePair[offsetAndType[prevBalanceStart] >> TYPE_SHIFT]; + } else if (isBlockOpenerToken(type)) { // check for FunctionToken, <(-token>, <[-token> and <{-token> + // push state + balanceStart = index; + balanceCloseType = balancePair[type]; + } + }); + + // finalize buffers + offsetAndType[tokenCount] = (EOF << TYPE_SHIFT) | sourceLength; // <EOF-token> + balance[tokenCount] = tokenCount; // prevents false positive balance match with any token + + // reverse references from balance start to end + // tokens + // token: a ( [ b c ] d e ) { + // index: 0 1 2 3 4 5 6 7 8 9 + // before + // balance: 0 8 5 2 2 2 1 1 1 0 + // - > > < < < < < < - + // after + // balance: 9 8 5 5 5 2 8 8 1 9 + // > > > > > < > > < > + for (let i = 0; i < tokenCount; i++) { + const balanceStart = balance[i]; + + if (balanceStart <= i) { + const balanceEnd = balance[balanceStart]; + + if (balanceEnd !== i) { + balance[i] = balanceEnd; + } + } else if (balanceStart > tokenCount) { + balance[i] = tokenCount; + } + } + + // balance[0] = tokenCount; + + this.source = source; + this.firstCharOffset = firstCharOffset === -1 ? 
0 : firstCharOffset; + this.tokenCount = tokenCount; + this.offsetAndType = offsetAndType; + this.balance = balance; + + this.reset(); + this.next(); + } + + lookupType(offset) { + offset += this.tokenIndex; + + if (offset < this.tokenCount) { + return this.offsetAndType[offset] >> TYPE_SHIFT; + } + + return EOF; + } + lookupTypeNonSC(idx) { + for (let offset = this.tokenIndex; offset < this.tokenCount; offset++) { + const tokenType = this.offsetAndType[offset] >> TYPE_SHIFT; + + if (tokenType !== WhiteSpace && tokenType !== Comment) { + if (idx-- === 0) { + return tokenType; + } + } + } + + return EOF; + } + lookupOffset(offset) { + offset += this.tokenIndex; + + if (offset < this.tokenCount) { + return this.offsetAndType[offset - 1] & OFFSET_MASK; + } + + return this.source.length; + } + lookupOffsetNonSC(idx) { + for (let offset = this.tokenIndex; offset < this.tokenCount; offset++) { + const tokenType = this.offsetAndType[offset] >> TYPE_SHIFT; + + if (tokenType !== WhiteSpace && tokenType !== Comment) { + if (idx-- === 0) { + return offset - this.tokenIndex; + } + } + } + + return EOF; + } + lookupValue(offset, referenceStr) { + offset += this.tokenIndex; + + if (offset < this.tokenCount) { + return cmpStr( + this.source, + this.offsetAndType[offset - 1] & OFFSET_MASK, + this.offsetAndType[offset] & OFFSET_MASK, + referenceStr + ); + } + + return false; + } + getTokenStart(tokenIndex) { + if (tokenIndex === this.tokenIndex) { + return this.tokenStart; + } + + if (tokenIndex > 0) { + return tokenIndex < this.tokenCount + ? 
this.offsetAndType[tokenIndex - 1] & OFFSET_MASK + : this.offsetAndType[this.tokenCount] & OFFSET_MASK; + } + + return this.firstCharOffset; + } + substrToCursor(start) { + return this.source.substring(start, this.tokenStart); + } + + isBalanceEdge(pos) { + return this.balance[this.tokenIndex] < pos; + // return this.balance[this.balance[pos]] !== this.tokenIndex; + } + isDelim(code, offset) { + if (offset) { + return ( + this.lookupType(offset) === Delim && + this.source.charCodeAt(this.lookupOffset(offset)) === code + ); + } + + return ( + this.tokenType === Delim && + this.source.charCodeAt(this.tokenStart) === code + ); + } + + skip(tokenCount) { + let next = this.tokenIndex + tokenCount; + + if (next < this.tokenCount) { + this.tokenIndex = next; + this.tokenStart = this.offsetAndType[next - 1] & OFFSET_MASK; + next = this.offsetAndType[next]; + this.tokenType = next >> TYPE_SHIFT; + this.tokenEnd = next & OFFSET_MASK; + } else { + this.tokenIndex = this.tokenCount; + this.next(); + } + } + next() { + let next = this.tokenIndex + 1; + + if (next < this.tokenCount) { + this.tokenIndex = next; + this.tokenStart = this.tokenEnd; + next = this.offsetAndType[next]; + this.tokenType = next >> TYPE_SHIFT; + this.tokenEnd = next & OFFSET_MASK; + } else { + this.eof = true; + this.tokenIndex = this.tokenCount; + this.tokenType = EOF; + this.tokenStart = this.tokenEnd = this.source.length; + } + } + skipSC() { + while (this.tokenType === WhiteSpace || this.tokenType === Comment) { + this.next(); + } + } + skipUntilBalanced(startToken, stopConsume) { + let cursor = startToken; + let balanceEnd = 0; + let offset = 0; + + loop: + for (; cursor < this.tokenCount; cursor++) { + balanceEnd = this.balance[cursor]; + + // stop scanning on balance edge that points to offset before start token + if (balanceEnd < startToken) { + break loop; + } + + offset = cursor > 0 ? 
this.offsetAndType[cursor - 1] & OFFSET_MASK : this.firstCharOffset; + + // check stop condition + switch (stopConsume(this.source.charCodeAt(offset))) { + case 1: // just stop + break loop; + + case 2: // stop & include the current token + cursor++; + break loop; + + default: + // fast-forward to the end of the balanced block for block opener tokens + if (isBlockOpenerToken(this.offsetAndType[cursor] >> TYPE_SHIFT)) { + cursor = balanceEnd; + } + } + } + + this.skip(cursor - this.tokenIndex); + } + + forEachToken(fn) { + for (let i = 0, offset = this.firstCharOffset; i < this.tokenCount; i++) { + const start = offset; + const item = this.offsetAndType[i]; + const end = item & OFFSET_MASK; + const type = item >> TYPE_SHIFT; + + offset = end; + + fn(type, start, end, i); + } + } + dump() { + const tokens = new Array(this.tokenCount); + + this.forEachToken((type, start, end, index) => { + tokens[index] = { + idx: index, + type: tokenNames[type], + chunk: this.source.substring(start, end), + balance: this.balance[index] + }; + }); + + return tokens; + } +}; diff --git a/vanilla/node_modules/css-tree/lib/tokenizer/adopt-buffer.js b/vanilla/node_modules/css-tree/lib/tokenizer/adopt-buffer.js new file mode 100644 index 0000000..ab4566d --- /dev/null +++ b/vanilla/node_modules/css-tree/lib/tokenizer/adopt-buffer.js @@ -0,0 +1,9 @@ +const MIN_SIZE = 16 * 1024; + +export function adoptBuffer(buffer = null, size) { + if (buffer === null || buffer.length < size) { + return new Uint32Array(Math.max(size + 1024, MIN_SIZE)); + } + + return buffer; +}; diff --git a/vanilla/node_modules/css-tree/lib/tokenizer/char-code-definitions.js b/vanilla/node_modules/css-tree/lib/tokenizer/char-code-definitions.js new file mode 100644 index 0000000..715572a --- /dev/null +++ b/vanilla/node_modules/css-tree/lib/tokenizer/char-code-definitions.js @@ -0,0 +1,212 @@ +const EOF = 0; + +// https://drafts.csswg.org/css-syntax-3/ +// § 4.2. 
Definitions + +// digit +// A code point between U+0030 DIGIT ZERO (0) and U+0039 DIGIT NINE (9). +export function isDigit(code) { + return code >= 0x0030 && code <= 0x0039; +} + +// hex digit +// A digit, or a code point between U+0041 LATIN CAPITAL LETTER A (A) and U+0046 LATIN CAPITAL LETTER F (F), +// or a code point between U+0061 LATIN SMALL LETTER A (a) and U+0066 LATIN SMALL LETTER F (f). +export function isHexDigit(code) { + return ( + isDigit(code) || // 0 .. 9 + (code >= 0x0041 && code <= 0x0046) || // A .. F + (code >= 0x0061 && code <= 0x0066) // a .. f + ); +} + +// uppercase letter +// A code point between U+0041 LATIN CAPITAL LETTER A (A) and U+005A LATIN CAPITAL LETTER Z (Z). +export function isUppercaseLetter(code) { + return code >= 0x0041 && code <= 0x005A; +} + +// lowercase letter +// A code point between U+0061 LATIN SMALL LETTER A (a) and U+007A LATIN SMALL LETTER Z (z). +export function isLowercaseLetter(code) { + return code >= 0x0061 && code <= 0x007A; +} + +// letter +// An uppercase letter or a lowercase letter. +export function isLetter(code) { + return isUppercaseLetter(code) || isLowercaseLetter(code); +} + +// non-ASCII code point +// A code point with a value equal to or greater than U+0080 <control>. +// +// 2024-09-02: The latest spec narrows the range for non-ASCII characters (see https://github.com/csstree/csstree/issues/188). +// However, all modern browsers support a wider range, and strictly following the latest spec could result +// in some CSS being parsed incorrectly, even though it works in the browser. Therefore, this function adheres +// to the previous, broader definition of non-ASCII characters. +export function isNonAscii(code) { + return code >= 0x0080; +} + +// name-start code point +// A letter, a non-ASCII code point, or U+005F LOW LINE (_). 
+export function isNameStart(code) { + return isLetter(code) || isNonAscii(code) || code === 0x005F; +} + +// name code point +// A name-start code point, a digit, or U+002D HYPHEN-MINUS (-). +export function isName(code) { + return isNameStart(code) || isDigit(code) || code === 0x002D; +} + +// non-printable code point +// A code point between U+0000 NULL and U+0008 BACKSPACE, or U+000B LINE TABULATION, +// or a code point between U+000E SHIFT OUT and U+001F INFORMATION SEPARATOR ONE, or U+007F DELETE. +export function isNonPrintable(code) { + return ( + (code >= 0x0000 && code <= 0x0008) || + (code === 0x000B) || + (code >= 0x000E && code <= 0x001F) || + (code === 0x007F) + ); +} + +// newline +// U+000A LINE FEED. Note that U+000D CARRIAGE RETURN and U+000C FORM FEED are not included in this definition, +// as they are converted to U+000A LINE FEED during preprocessing. +// TODO: we don't do any preprocessing, so also check the code point for U+000D CARRIAGE RETURN and U+000C FORM FEED +export function isNewline(code) { + return code === 0x000A || code === 0x000D || code === 0x000C; +} + +// whitespace +// A newline, U+0009 CHARACTER TABULATION, or U+0020 SPACE. +export function isWhiteSpace(code) { + return isNewline(code) || code === 0x0020 || code === 0x0009; +} + +// § 4.3.8. Check if two code points are a valid escape +export function isValidEscape(first, second) { + // If the first code point is not U+005C REVERSE SOLIDUS (\), return false. + if (first !== 0x005C) { + return false; + } + + // Otherwise, if the second code point is a newline or EOF, return false. + if (isNewline(second) || second === EOF) { + return false; + } + + // Otherwise, return true. + return true; +} + +// § 4.3.9. 
Check if three code points would start an identifier +export function isIdentifierStart(first, second, third) { + // Look at the first code point: + + // U+002D HYPHEN-MINUS + if (first === 0x002D) { + // If the second code point is a name-start code point or a U+002D HYPHEN-MINUS, + // or the second and third code points are a valid escape, return true. Otherwise, return false. + return ( + isNameStart(second) || + second === 0x002D || + isValidEscape(second, third) + ); + } + + // name-start code point + if (isNameStart(first)) { + // Return true. + return true; + } + + // U+005C REVERSE SOLIDUS (\) + if (first === 0x005C) { + // If the first and second code points are a valid escape, return true. Otherwise, return false. + return isValidEscape(first, second); + } + + // anything else + // Return false. + return false; +} + +// § 4.3.10. Check if three code points would start a number +export function isNumberStart(first, second, third) { + // Look at the first code point: + + // U+002B PLUS SIGN (+) + // U+002D HYPHEN-MINUS (-) + if (first === 0x002B || first === 0x002D) { + // If the second code point is a digit, return true. + if (isDigit(second)) { + return 2; + } + + // Otherwise, if the second code point is a U+002E FULL STOP (.) + // and the third code point is a digit, return true. + // Otherwise, return false. + return second === 0x002E && isDigit(third) ? 3 : 0; + } + + // U+002E FULL STOP (.) + if (first === 0x002E) { + // If the second code point is a digit, return true. Otherwise, return false. + return isDigit(second) ? 2 : 0; + } + + // digit + if (isDigit(first)) { + // Return true. + return 1; + } + + // anything else + // Return false. 
+ return 0; +} + +// +// Misc +// + +// detect BOM (https://en.wikipedia.org/wiki/Byte_order_mark) +export function isBOM(code) { + // UTF-16BE + if (code === 0xFEFF) { + return 1; + } + + // UTF-16LE + if (code === 0xFFFE) { + return 1; + } + + return 0; +} + +// Fast code category +// Only ASCII code points have a special meaning, which is why we define a map for codes 0..127 only +const CATEGORY = new Array(0x80); +export const EofCategory = 0x80; +export const WhiteSpaceCategory = 0x82; +export const DigitCategory = 0x83; +export const NameStartCategory = 0x84; +export const NonPrintableCategory = 0x85; + +for (let i = 0; i < CATEGORY.length; i++) { + CATEGORY[i] = + isWhiteSpace(i) && WhiteSpaceCategory || + isDigit(i) && DigitCategory || + isNameStart(i) && NameStartCategory || + isNonPrintable(i) && NonPrintableCategory || + i || EofCategory; +} + +export function charCodeCategory(code) { + return code < 0x80 ? CATEGORY[code] : NameStartCategory; +} diff --git a/vanilla/node_modules/css-tree/lib/tokenizer/index.js b/vanilla/node_modules/css-tree/lib/tokenizer/index.js new file mode 100644 index 0000000..16df44c --- /dev/null +++ b/vanilla/node_modules/css-tree/lib/tokenizer/index.js @@ -0,0 +1,513 @@ +import * as TYPE from './types.js'; +import { + isNewline, + isName, + isValidEscape, + isNumberStart, + isIdentifierStart, + isBOM, + charCodeCategory, + WhiteSpaceCategory, + DigitCategory, + NameStartCategory, + NonPrintableCategory +} from './char-code-definitions.js'; +import { + cmpStr, + getNewlineLength, + findWhiteSpaceEnd, + consumeEscaped, + consumeName, + consumeNumber, + consumeBadUrlRemnants +} from './utils.js'; + +export function tokenize(source, onToken) { + function getCharCode(offset) { + return offset < sourceLength ? source.charCodeAt(offset) : 0; + } + + // § 4.3.3. Consume a numeric token + function consumeNumericToken() { + // Consume a number and let number be the result. 
+ offset = consumeNumber(source, offset); + + // If the next 3 input code points would start an identifier, then: + if (isIdentifierStart(getCharCode(offset), getCharCode(offset + 1), getCharCode(offset + 2))) { + // Create a <dimension-token> with the same value and type flag as number, and a unit set initially to the empty string. + // Consume a name. Set the <dimension-token>’s unit to the returned value. + // Return the <dimension-token>. + type = TYPE.Dimension; + offset = consumeName(source, offset); + return; + } + + // Otherwise, if the next input code point is U+0025 PERCENTAGE SIGN (%), consume it. + if (getCharCode(offset) === 0x0025) { + // Create a <percentage-token> with the same value as number, and return it. + type = TYPE.Percentage; + offset++; + return; + } + + // Otherwise, create a <number-token> with the same value and type flag as number, and return it. + type = TYPE.Number; + } + + // § 4.3.4. Consume an ident-like token + function consumeIdentLikeToken() { + const nameStartOffset = offset; + + // Consume a name, and let string be the result. + offset = consumeName(source, offset); + + // If string’s value is an ASCII case-insensitive match for "url", + // and the next input code point is U+0028 LEFT PARENTHESIS ((), consume it. + if (cmpStr(source, nameStartOffset, offset, 'url') && getCharCode(offset) === 0x0028) { + // While the next two input code points are whitespace, consume the next input code point. + offset = findWhiteSpaceEnd(source, offset + 1); + + // If the next one or two input code points are U+0022 QUOTATION MARK ("), U+0027 APOSTROPHE ('), + // or whitespace followed by U+0022 QUOTATION MARK (") or U+0027 APOSTROPHE ('), + // then create a <function-token> with its value set to string and return it. + if (getCharCode(offset) === 0x0022 || + getCharCode(offset) === 0x0027) { + type = TYPE.Function; + offset = nameStartOffset + 4; + return; + } + + // Otherwise, consume a url token, and return it. 
+ consumeUrlToken(); + return; + } + + // Otherwise, if the next input code point is U+0028 LEFT PARENTHESIS ((), consume it. + // Create a <function-token> with its value set to string and return it. + if (getCharCode(offset) === 0x0028) { + type = TYPE.Function; + offset++; + return; + } + + // Otherwise, create an <ident-token> with its value set to string and return it. + type = TYPE.Ident; + } + + // § 4.3.5. Consume a string token + function consumeStringToken(endingCodePoint) { + // This algorithm may be called with an ending code point, which denotes the code point + // that ends the string. If an ending code point is not specified, + // the current input code point is used. + if (!endingCodePoint) { + endingCodePoint = getCharCode(offset++); + } + + // Initially create a <string-token> with its value set to the empty string. + type = TYPE.String; + + // Repeatedly consume the next input code point from the stream: + for (; offset < source.length; offset++) { + const code = source.charCodeAt(offset); + + switch (charCodeCategory(code)) { + // ending code point + case endingCodePoint: + // Return the <string-token>. + offset++; + return; + + // EOF + // case EofCategory: + // This is a parse error. Return the <string-token>. + // return; + + // newline + case WhiteSpaceCategory: + if (isNewline(code)) { + // This is a parse error. Reconsume the current input code point, + // create a <bad-string-token>, and return it. + offset += getNewlineLength(source, offset, code); + type = TYPE.BadString; + return; + } + break; + + // U+005C REVERSE SOLIDUS (\) + case 0x005C: + // If the next input code point is EOF, do nothing. + if (offset === source.length - 1) { + break; + } + + const nextCode = getCharCode(offset + 1); + + // Otherwise, if the next input code point is a newline, consume it. 
+ if (isNewline(nextCode)) { + offset += getNewlineLength(source, offset + 1, nextCode); + } else if (isValidEscape(code, nextCode)) { + // Otherwise, (the stream starts with a valid escape) consume + // an escaped code point and append the returned code point to + // the <string-token>’s value. + offset = consumeEscaped(source, offset) - 1; + } + break; + + // anything else + // Append the current input code point to the <string-token>’s value. + } + } + } + + // § 4.3.6. Consume a url token + // Note: This algorithm assumes that the initial "url(" has already been consumed. + // This algorithm also assumes that it’s being called to consume an "unquoted" value, like url(foo). + // A quoted value, like url("foo"), is parsed as a <function-token>. Consume an ident-like token + // automatically handles this distinction; this algorithm shouldn’t be called directly otherwise. + function consumeUrlToken() { + // Initially create a <url-token> with its value set to the empty string. + type = TYPE.Url; + + // Consume as much whitespace as possible. + offset = findWhiteSpaceEnd(source, offset); + + // Repeatedly consume the next input code point from the stream: + for (; offset < source.length; offset++) { + const code = source.charCodeAt(offset); + + switch (charCodeCategory(code)) { + // U+0029 RIGHT PARENTHESIS ()) + case 0x0029: + // Return the <url-token>. + offset++; + return; + + // EOF + // case EofCategory: + // This is a parse error. Return the <url-token>. + // return; + + // whitespace + case WhiteSpaceCategory: + // Consume as much whitespace as possible. 
+ offset = findWhiteSpaceEnd(source, offset); + + // If the next input code point is U+0029 RIGHT PARENTHESIS ()) or EOF, + // consume it and return the <url-token> + // (if EOF was encountered, this is a parse error); + if (getCharCode(offset) === 0x0029 || offset >= source.length) { + if (offset < source.length) { + offset++; + } + return; + } + + // otherwise, consume the remnants of a bad url, create a <bad-url-token>, + // and return it. + offset = consumeBadUrlRemnants(source, offset); + type = TYPE.BadUrl; + return; + + // U+0022 QUOTATION MARK (") + // U+0027 APOSTROPHE (') + // U+0028 LEFT PARENTHESIS (() + // non-printable code point + case 0x0022: + case 0x0027: + case 0x0028: + case NonPrintableCategory: + // This is a parse error. Consume the remnants of a bad url, + // create a <bad-url-token>, and return it. + offset = consumeBadUrlRemnants(source, offset); + type = TYPE.BadUrl; + return; + + // U+005C REVERSE SOLIDUS (\) + case 0x005C: + // If the stream starts with a valid escape, consume an escaped code point and + // append the returned code point to the <url-token>’s value. + if (isValidEscape(code, getCharCode(offset + 1))) { + offset = consumeEscaped(source, offset) - 1; + break; + } + + // Otherwise, this is a parse error. Consume the remnants of a bad url, + // create a <bad-url-token>, and return it. + offset = consumeBadUrlRemnants(source, offset); + type = TYPE.BadUrl; + return; + + // anything else + // Append the current input code point to the <url-token>’s value. + } + } + } + + // ensure source is a string + source = String(source || ''); + + const sourceLength = source.length; + let start = isBOM(getCharCode(0)); + let offset = start; + let type; + + // https://drafts.csswg.org/css-syntax-3/#consume-token + // § 4.3.1. Consume a token + while (offset < sourceLength) { + const code = source.charCodeAt(offset); + + switch (charCodeCategory(code)) { + // whitespace + case WhiteSpaceCategory: + // Consume as much whitespace as possible. 
Return a <whitespace-token>. + type = TYPE.WhiteSpace; + offset = findWhiteSpaceEnd(source, offset + 1); + break; + + // U+0022 QUOTATION MARK (") + case 0x0022: + // Consume a string token and return it. + consumeStringToken(); + break; + + // U+0023 NUMBER SIGN (#) + case 0x0023: + // If the next input code point is a name code point or the next two input code points are a valid escape, then: + if (isName(getCharCode(offset + 1)) || isValidEscape(getCharCode(offset + 1), getCharCode(offset + 2))) { + // Create a <hash-token>. + type = TYPE.Hash; + + // If the next 3 input code points would start an identifier, set the <hash-token>’s type flag to "id". + // if (isIdentifierStart(getCharCode(offset + 1), getCharCode(offset + 2), getCharCode(offset + 3))) { + // // TODO: set id flag + // } + + // Consume a name, and set the <hash-token>’s value to the returned string. + offset = consumeName(source, offset + 1); + + // Return the <hash-token>. + } else { + // Otherwise, return a <delim-token> with its value set to the current input code point. + type = TYPE.Delim; + offset++; + } + + break; + + // U+0027 APOSTROPHE (') + case 0x0027: + // Consume a string token and return it. + consumeStringToken(); + break; + + // U+0028 LEFT PARENTHESIS (() + case 0x0028: + // Return a <(-token>. + type = TYPE.LeftParenthesis; + offset++; + break; + + // U+0029 RIGHT PARENTHESIS ()) + case 0x0029: + // Return a <)-token>. + type = TYPE.RightParenthesis; + offset++; + break; + + // U+002B PLUS SIGN (+) + case 0x002B: + // If the input stream starts with a number, ... + if (isNumberStart(code, getCharCode(offset + 1), getCharCode(offset + 2))) { + // ... reconsume the current input code point, consume a numeric token, and return it. + consumeNumericToken(); + } else { + // Otherwise, return a <delim-token> with its value set to the current input code point. + type = TYPE.Delim; + offset++; + } + break; + + // U+002C COMMA (,) + case 0x002C: + // Return a <comma-token>. 
+ type = TYPE.Comma; + offset++; + break; + + // U+002D HYPHEN-MINUS (-) + case 0x002D: + // If the input stream starts with a number, reconsume the current input code point, consume a numeric token, and return it. + if (isNumberStart(code, getCharCode(offset + 1), getCharCode(offset + 2))) { + consumeNumericToken(); + } else { + // Otherwise, if the next 2 input code points are U+002D HYPHEN-MINUS U+003E GREATER-THAN SIGN (->), consume them and return a <CDC-token>. + if (getCharCode(offset + 1) === 0x002D && + getCharCode(offset + 2) === 0x003E) { + type = TYPE.CDC; + offset = offset + 3; + } else { + // Otherwise, if the input stream starts with an identifier, ... + if (isIdentifierStart(code, getCharCode(offset + 1), getCharCode(offset + 2))) { + // ... reconsume the current input code point, consume an ident-like token, and return it. + consumeIdentLikeToken(); + } else { + // Otherwise, return a <delim-token> with its value set to the current input code point. + type = TYPE.Delim; + offset++; + } + } + } + break; + + // U+002E FULL STOP (.) + case 0x002E: + // If the input stream starts with a number, ... + if (isNumberStart(code, getCharCode(offset + 1), getCharCode(offset + 2))) { + // ... reconsume the current input code point, consume a numeric token, and return it. + consumeNumericToken(); + } else { + // Otherwise, return a <delim-token> with its value set to the current input code point. + type = TYPE.Delim; + offset++; + } + + break; + + // U+002F SOLIDUS (/) + case 0x002F: + // If the next two input code point are U+002F SOLIDUS (/) followed by a U+002A ASTERISK (*), + if (getCharCode(offset + 1) === 0x002A) { + // ... consume them and all following code points up to and including the first U+002A ASTERISK (*) + // followed by a U+002F SOLIDUS (/), or up to an EOF code point. + type = TYPE.Comment; + offset = source.indexOf('*/', offset + 2); + offset = offset === -1 ? 
source.length : offset + 2; + } else { + type = TYPE.Delim; + offset++; + } + break; + + // U+003A COLON (:) + case 0x003A: + // Return a <colon-token>. + type = TYPE.Colon; + offset++; + break; + + // U+003B SEMICOLON (;) + case 0x003B: + // Return a <semicolon-token>. + type = TYPE.Semicolon; + offset++; + break; + + // U+003C LESS-THAN SIGN (<) + case 0x003C: + // If the next 3 input code points are U+0021 EXCLAMATION MARK U+002D HYPHEN-MINUS U+002D HYPHEN-MINUS (!--), ... + if (getCharCode(offset + 1) === 0x0021 && + getCharCode(offset + 2) === 0x002D && + getCharCode(offset + 3) === 0x002D) { + // ... consume them and return a <CDO-token>. + type = TYPE.CDO; + offset = offset + 4; + } else { + // Otherwise, return a <delim-token> with its value set to the current input code point. + type = TYPE.Delim; + offset++; + } + + break; + + // U+0040 COMMERCIAL AT (@) + case 0x0040: + // If the next 3 input code points would start an identifier, ... + if (isIdentifierStart(getCharCode(offset + 1), getCharCode(offset + 2), getCharCode(offset + 3))) { + // ... consume a name, create an <at-keyword-token> with its value set to the returned value, and return it. + type = TYPE.AtKeyword; + offset = consumeName(source, offset + 1); + } else { + // Otherwise, return a <delim-token> with its value set to the current input code point. + type = TYPE.Delim; + offset++; + } + + break; + + // U+005B LEFT SQUARE BRACKET ([) + case 0x005B: + // Return a <[-token>. + type = TYPE.LeftSquareBracket; + offset++; + break; + + // U+005C REVERSE SOLIDUS (\) + case 0x005C: + // If the input stream starts with a valid escape, ... + if (isValidEscape(code, getCharCode(offset + 1))) { + // ... reconsume the current input code point, consume an ident-like token, and return it. + consumeIdentLikeToken(); + } else { + // Otherwise, this is a parse error. Return a <delim-token> with its value set to the current input code point. 
+ type = TYPE.Delim; + offset++; + } + break; + + // U+005D RIGHT SQUARE BRACKET (]) + case 0x005D: + // Return a <]-token>. + type = TYPE.RightSquareBracket; + offset++; + break; + + // U+007B LEFT CURLY BRACKET ({) + case 0x007B: + // Return a <{-token>. + type = TYPE.LeftCurlyBracket; + offset++; + break; + + // U+007D RIGHT CURLY BRACKET (}) + case 0x007D: + // Return a <}-token>. + type = TYPE.RightCurlyBracket; + offset++; + break; + + // digit + case DigitCategory: + // Reconsume the current input code point, consume a numeric token, and return it. + consumeNumericToken(); + break; + + // name-start code point + case NameStartCategory: + // Reconsume the current input code point, consume an ident-like token, and return it. + consumeIdentLikeToken(); + break; + + // EOF + // case EofCategory: + // Return an <EOF-token>. + // break; + + // anything else + default: + // Return a <delim-token> with its value set to the current input code point. + type = TYPE.Delim; + offset++; + } + + // put token to stream + onToken(type, start, start = offset); + } +} + +export * from './types.js'; +export * as tokenTypes from './types.js'; +export { default as tokenNames } from './names.js'; +export * from './char-code-definitions.js'; +export * from './utils.js'; +export * from './OffsetToLocation.js'; +export * from './TokenStream.js'; diff --git a/vanilla/node_modules/css-tree/lib/tokenizer/names.js b/vanilla/node_modules/css-tree/lib/tokenizer/names.js new file mode 100644 index 0000000..54831bd --- /dev/null +++ b/vanilla/node_modules/css-tree/lib/tokenizer/names.js @@ -0,0 +1,28 @@ +export default [ + 'EOF-token', + 'ident-token', + 'function-token', + 'at-keyword-token', + 'hash-token', + 'string-token', + 'bad-string-token', + 'url-token', + 'bad-url-token', + 'delim-token', + 'number-token', + 'percentage-token', + 'dimension-token', + 'whitespace-token', + 'CDO-token', + 'CDC-token', + 'colon-token', + 'semicolon-token', + 'comma-token', + '[-token', + ']-token', + 
'(-token', + ')-token', + '{-token', + '}-token', + 'comment-token' +]; diff --git a/vanilla/node_modules/css-tree/lib/tokenizer/types.js b/vanilla/node_modules/css-tree/lib/tokenizer/types.js new file mode 100644 index 0000000..5018569 --- /dev/null +++ b/vanilla/node_modules/css-tree/lib/tokenizer/types.js @@ -0,0 +1,28 @@ +// CSS Syntax Module Level 3 +// https://www.w3.org/TR/css-syntax-3/ +export const EOF = 0; // <EOF-token> +export const Ident = 1; // <ident-token> +export const Function = 2; // <function-token> +export const AtKeyword = 3; // <at-keyword-token> +export const Hash = 4; // <hash-token> +export const String = 5; // <string-token> +export const BadString = 6; // <bad-string-token> +export const Url = 7; // <url-token> +export const BadUrl = 8; // <bad-url-token> +export const Delim = 9; // <delim-token> +export const Number = 10; // <number-token> +export const Percentage = 11; // <percentage-token> +export const Dimension = 12; // <dimension-token> +export const WhiteSpace = 13; // <whitespace-token> +export const CDO = 14; // <CDO-token> +export const CDC = 15; // <CDC-token> +export const Colon = 16; // <colon-token> : +export const Semicolon = 17; // <semicolon-token> ; +export const Comma = 18; // <comma-token> , +export const LeftSquareBracket = 19; // <[-token> +export const RightSquareBracket = 20; // <]-token> +export const LeftParenthesis = 21; // <(-token> +export const RightParenthesis = 22; // <)-token> +export const LeftCurlyBracket = 23; // <{-token> +export const RightCurlyBracket = 24; // <}-token> +export const Comment = 25; diff --git a/vanilla/node_modules/css-tree/lib/tokenizer/utils.js b/vanilla/node_modules/css-tree/lib/tokenizer/utils.js new file mode 100644 index 0000000..c131ec5 --- /dev/null +++ b/vanilla/node_modules/css-tree/lib/tokenizer/utils.js @@ -0,0 +1,254 @@ +import { + isDigit, + isHexDigit, + isUppercaseLetter, + isName, + isWhiteSpace, + isValidEscape +} from './char-code-definitions.js'; + +function 
getCharCode(source, offset) { + return offset < source.length ? source.charCodeAt(offset) : 0; +} + +export function getNewlineLength(source, offset, code) { + if (code === 13 /* \r */ && getCharCode(source, offset + 1) === 10 /* \n */) { + return 2; + } + + return 1; +} + +export function cmpChar(testStr, offset, referenceCode) { + let code = testStr.charCodeAt(offset); + + // code.toLowerCase() for A..Z + if (isUppercaseLetter(code)) { + code = code | 32; + } + + return code === referenceCode; +} + +export function cmpStr(testStr, start, end, referenceStr) { + if (end - start !== referenceStr.length) { + return false; + } + + if (start < 0 || end > testStr.length) { + return false; + } + + for (let i = start; i < end; i++) { + const referenceCode = referenceStr.charCodeAt(i - start); + let testCode = testStr.charCodeAt(i); + + // testCode.toLowerCase() for A..Z + if (isUppercaseLetter(testCode)) { + testCode = testCode | 32; + } + + if (testCode !== referenceCode) { + return false; + } + } + + return true; +} + +export function findWhiteSpaceStart(source, offset) { + for (; offset >= 0; offset--) { + if (!isWhiteSpace(source.charCodeAt(offset))) { + break; + } + } + + return offset + 1; +} + +export function findWhiteSpaceEnd(source, offset) { + for (; offset < source.length; offset++) { + if (!isWhiteSpace(source.charCodeAt(offset))) { + break; + } + } + + return offset; +} + +export function findDecimalNumberEnd(source, offset) { + for (; offset < source.length; offset++) { + if (!isDigit(source.charCodeAt(offset))) { + break; + } + } + + return offset; +} + +// § 4.3.7. Consume an escaped code point +export function consumeEscaped(source, offset) { + // It assumes that the U+005C REVERSE SOLIDUS (\) has already been consumed and + // that the next input code point has already been verified to be part of a valid escape. 
+ offset += 2; + + // hex digit + if (isHexDigit(getCharCode(source, offset - 1))) { + // Consume as many hex digits as possible, but no more than 5. + // Note that this means 1-6 hex digits have been consumed in total. + for (const maxOffset = Math.min(source.length, offset + 5); offset < maxOffset; offset++) { + if (!isHexDigit(getCharCode(source, offset))) { + break; + } + } + + // If the next input code point is whitespace, consume it as well. + const code = getCharCode(source, offset); + if (isWhiteSpace(code)) { + offset += getNewlineLength(source, offset, code); + } + } + + return offset; +} + +// §4.3.11. Consume a name +// Note: This algorithm does not do the verification of the first few code points that are necessary +// to ensure the returned code points would constitute an <ident-token>. If that is the intended use, +// ensure that the stream starts with an identifier before calling this algorithm. +export function consumeName(source, offset) { + // Let result initially be an empty string. + // Repeatedly consume the next input code point from the stream: + for (; offset < source.length; offset++) { + const code = source.charCodeAt(offset); + + // name code point + if (isName(code)) { + // Append the code point to result. + continue; + } + + // the stream starts with a valid escape + if (isValidEscape(code, getCharCode(source, offset + 1))) { + // Consume an escaped code point. Append the returned code point to result. + offset = consumeEscaped(source, offset) - 1; + continue; + } + + // anything else + // Reconsume the current input code point. Return result. + break; + } + + return offset; +} + +// §4.3.12. Consume a number +export function consumeNumber(source, offset) { + let code = source.charCodeAt(offset); + + // 2. If the next input code point is U+002B PLUS SIGN (+) or U+002D HYPHEN-MINUS (-), + // consume it and append it to repr. + if (code === 0x002B || code === 0x002D) { + code = source.charCodeAt(offset += 1); + } + + // 3. 
While the next input code point is a digit, consume it and append it to repr. + if (isDigit(code)) { + offset = findDecimalNumberEnd(source, offset + 1); + code = source.charCodeAt(offset); + } + + // 4. If the next 2 input code points are U+002E FULL STOP (.) followed by a digit, then: + if (code === 0x002E && isDigit(source.charCodeAt(offset + 1))) { + // 4.1 Consume them. + // 4.2 Append them to repr. + offset += 2; + + // 4.3 Set type to "number". + // TODO + + // 4.4 While the next input code point is a digit, consume it and append it to repr. + + offset = findDecimalNumberEnd(source, offset); + } + + // 5. If the next 2 or 3 input code points are U+0045 LATIN CAPITAL LETTER E (E) + // or U+0065 LATIN SMALL LETTER E (e), ... , followed by a digit, then: + if (cmpChar(source, offset, 101 /* e */)) { + let sign = 0; + code = source.charCodeAt(offset + 1); + + // ... optionally followed by U+002D HYPHEN-MINUS (-) or U+002B PLUS SIGN (+) ... + if (code === 0x002D || code === 0x002B) { + sign = 1; + code = source.charCodeAt(offset + 2); + } + + // ... followed by a digit + if (isDigit(code)) { + // 5.1 Consume them. + // 5.2 Append them to repr. + + // 5.3 Set type to "number". + // TODO + + // 5.4 While the next input code point is a digit, consume it and append it to repr. + offset = findDecimalNumberEnd(source, offset + 1 + sign + 1); + } + } + + return offset; +} + +// § 4.3.14. Consume the remnants of a bad url +// ... its sole use is to consume enough of the input stream to reach a recovery point +// where normal tokenizing can resume. +export function consumeBadUrlRemnants(source, offset) { + // Repeatedly consume the next input code point from the stream: + for (; offset < source.length; offset++) { + const code = source.charCodeAt(offset); + + // U+0029 RIGHT PARENTHESIS ()) + // EOF + if (code === 0x0029) { + // Return. + offset++; + break; + } + + if (isValidEscape(code, getCharCode(source, offset + 1))) { + // Consume an escaped code point. 
+ // Note: This allows an escaped right parenthesis ("\)") to be encountered + // without ending the <bad-url-token>. This is otherwise identical to + // the "anything else" clause. + offset = consumeEscaped(source, offset); + } + } + + return offset; +} + +// § 4.3.7. Consume an escaped code point +// Note: This algorithm assumes that escaped is valid without leading U+005C REVERSE SOLIDUS (\) +export function decodeEscaped(escaped) { + // Single char escaped that's not a hex digit + if (escaped.length === 1 && !isHexDigit(escaped.charCodeAt(0))) { + return escaped[0]; + } + + // Interpret the hex digits as a hexadecimal number. + let code = parseInt(escaped, 16); + + if ( + (code === 0) || // If this number is zero, + (code >= 0xD800 && code <= 0xDFFF) || // or is for a surrogate, + (code > 0x10FFFF) // or is greater than the maximum allowed code point + ) { + // ... return U+FFFD REPLACEMENT CHARACTER + code = 0xFFFD; + } + + // Otherwise, return the code point with that value. + return String.fromCodePoint(code); +} |
