import { DOMParser } from "@xmldom/xmldom";
import { isBoolean, isNumber, isPlainObject, isString } from "es-toolkit";
import * as xpath from "xpath";
import type { ConfigValidationIssue } from "../schema/issues";
import type { JsonValue } from "../types";
import { issue, joinPath } from "../schema/issues";
import { CONTENT_EXTRACTOR_KEY_SET, MATCHER_KEY_SET } from "./keys";
import { isUnsafeRegex } from "./redos";
import { isValueMatcherPrimitive } from "./value";

/** Matches a path segment of the form `<name>[<digits>]`; group 1 is the (non-empty) name part. */
const BRACKET_SEGMENT_PATTERN = /^(.+?)\[(\d+)\]$/;

/** Fields of a css extractor object that are not matchers. */
const CSS_EXTRACTOR_FIELDS: ReadonlySet<string> = new Set(["attr", "selector"]);

/** Fields of a json / xpath extractor object that are not matchers. */
const PATH_EXTRACTOR_FIELDS: ReadonlySet<string> = new Set(["path"]);

/**
 * Minimal well-formed document used only to dry-run XPath expressions.
 * An empty source string is not a valid XML document, so a real root element is required.
 */
const XPATH_PROBE_XML = "<root/>";

/**
 * Type guard for JSON-representable values.
 *
 * Accepts `null`, strings, booleans, finite numbers, arrays whose elements are all JSON values,
 * and plain objects whose property values are all JSON values. Everything else is rejected.
 */
export function isJsonValue(value: unknown): value is JsonValue {
  if (value === null) return true;
  if (isString(value) || isBoolean(value)) return true;
  // NaN and +/-Infinity are numbers but have no JSON representation.
  if (isNumber(value)) return Number.isFinite(value);
  if (Array.isArray(value)) return value.every(isJsonValue);
  if (isPlainObject(value)) return Object.values(value).every(isJsonValue);
  return false;
}

/** Type guard that narrows `value` to a string-keyed record when it is a plain object. */
export function isPlainRecord(value: unknown): value is Record<string, unknown> {
  return isPlainObject(value);
}

/**
 * Validates the restricted JSONPath syntax used by json expectations.
 *
 * The path must start with `$.` and contain at least one character after it. It is then split on
 * `.`; every empty segment is reported, as is every `name[index]` segment whose name part is
 * whitespace-only.
 *
 * @param path - The JSONPath string to check.
 * @param expectationPath - Location of the owning expectation; issues are reported at its `path` field.
 * @param targetName - Optional target name forwarded to each issue.
 * @returns The issues found; empty when the path is accepted.
 */
export function validateJsonPath(path: string, expectationPath: string, targetName?: string): ConfigValidationIssue[] {
  const issuePath = joinPath(expectationPath, "path");
  if (!path.startsWith("$.") || path.length <= 2) {
    return [issue("invalid-jsonpath", issuePath, '必须为以 "$." 开头的有效 JSONPath', targetName)];
  }

  const issues: ConfigValidationIssue[] = [];
  for (const segment of path.slice(2).split(".")) {
    if (segment === "") {
      issues.push(issue("invalid-jsonpath", issuePath, "包含空段", targetName));
    }
    // NOTE(review): the pattern needs at least one character before "[", so a bare "[0]" segment
    // is never reported here; only whitespace-only names are. Confirm this is intended.
    const bracketMatch = BRACKET_SEGMENT_PATTERN.exec(segment);
    if (bracketMatch?.[1]?.trim() === "") {
      issues.push(issue("invalid-jsonpath", issuePath, "数组访问缺少属性名", targetName));
    }
  }
  return issues;
}

/**
 * Validates a list of content expectations.
 *
 * @param expectations - Raw config value; must be an array.
 * @param path - Location of the list, used as the prefix for each entry's `path[index]`.
 * @param targetName - Optional target name forwarded to each issue.
 * @returns One `invalid-type` issue when the value is not an array, otherwise the concatenated
 *   issues of every entry.
 */
export function validateRawContentExpectations(
  expectations: unknown,
  path: string,
  targetName?: string,
): ConfigValidationIssue[] {
  if (!Array.isArray(expectations)) return [issue("invalid-type", path, "必须为数组", targetName)];
  return expectations.flatMap((entry, index) => validateRawContentExpectation(entry, `${path}[${index}]`, targetName));
}

/**
 * Validates keyed expectations given either as an object map (`{ key: matcher }`) or as an
 * array of `{ key, matcher }` entries.
 *
 * @param value - Raw config value.
 * @param path - Location of the value, used as the prefix for issue paths.
 * @param targetName - Optional target name forwarded to each issue.
 * @param options - When `caseInsensitive` is set, keys that collide after lower-casing are
 *   reported as `duplicate-key`.
 * @returns The issues found; empty when everything is accepted.
 */
export function validateRawKeyedExpectations(
  value: unknown,
  path: string,
  targetName?: string,
  options?: { caseInsensitive?: boolean },
): ConfigValidationIssue[] {
  if (Array.isArray(value)) return validateNormalizedKeyedExpectations(value, path, targetName, options);
  if (!isPlainRecord(value)) return [issue("invalid-type", path, "必须为对象", targetName)];

  const issues: ConfigValidationIssue[] = [];
  if (options?.caseInsensitive) {
    // Maps each lower-cased key to the first original spelling that produced it.
    const seen = new Map<string, string>();
    for (const key of Object.keys(value)) {
      const lower = key.toLowerCase();
      const prev = seen.get(lower);
      if (prev !== undefined) {
        issues.push(issue("duplicate-key", joinPath(path, key), `与 "${prev}" 大小写归一化后重复`, targetName));
      } else {
        seen.set(lower, key);
      }
    }
  }
  for (const [key, item] of Object.entries(value)) {
    issues.push(...validateRawValueExpectation(item, joinPath(path, key), targetName));
  }
  return issues;
}

/**
 * Validates a single value expectation: either a primitive accepted by
 * `isValueMatcherPrimitive`, or an object whose keys are matcher names.
 *
 * Entries whose value is `undefined` are skipped and do not count as a present matcher.
 *
 * @param matcher - Raw config value.
 * @param path - Location of the value, used as the prefix for issue paths.
 * @param targetName - Optional target name forwarded to each issue.
 * @param options - `requireAtLeastOne` (default `true`) reports `empty-matcher` when the object
 *   contains no known matcher with a defined value.
 * @returns The issues found; empty when the expectation is accepted.
 */
export function validateRawValueExpectation(
  matcher: unknown,
  path: string,
  targetName?: string,
  options: { requireAtLeastOne?: boolean } = {},
): ConfigValidationIssue[] {
  const requireAtLeastOne = options.requireAtLeastOne ?? true;
  if (isValueMatcherPrimitive(matcher)) return [];
  if (Array.isArray(matcher)) {
    return [
      issue(
        "invalid-type",
        path,
        "必须为 primitive 原始值或 matcher 对象;如需数组 equals 匹配应写成 {equals: [...]}",
        targetName,
      ),
    ];
  }
  if (!isPlainRecord(matcher)) {
    return [issue("invalid-type", path, "必须为 primitive 原始值或 matcher 对象", targetName)];
  }

  const issues: ConfigValidationIssue[] = [];
  // Number of known matcher keys that carry a defined value.
  let found = 0;
  for (const [key, value] of Object.entries(matcher)) {
    const keyPath = joinPath(path, key);
    if (!MATCHER_KEY_SET.has(key)) {
      issues.push(issue("unknown-matcher", keyPath, "是未知 matcher", targetName));
      continue;
    }
    if (value === undefined) continue;
    found++;
    issues.push(...validateMatcherValue(key, value, keyPath, targetName));
  }
  if (requireAtLeastOne && found === 0) {
    issues.push(issue("empty-matcher", path, "必须包含至少一个合法 matcher", targetName));
  }
  if (matcher["exists"] === false && found > 1) {
    issues.push(issue("invalid-value", joinPath(path, "exists"), "exists:false 不能与其他 matcher 组合", targetName));
  }
  return issues;
}

/**
 * Checks the `selector` (required, non-blank string) and `attr` (optional string) fields shared
 * by both css expectation shapes.
 */
function validateCssFields(
  expectation: Record<string, unknown>,
  path: string,
  targetName?: string,
): ConfigValidationIssue[] {
  const issues: ConfigValidationIssue[] = [];
  const selector = expectation["selector"];
  if (!isString(selector) || selector.trim() === "") {
    issues.push(issue("invalid-type", joinPath(path, "selector"), "必须为非空字符串", targetName));
  }
  if ("attr" in expectation && !isString(expectation["attr"])) {
    issues.push(issue("invalid-type", joinPath(path, "attr"), "必须为字符串", targetName));
  }
  return issues;
}

/**
 * Validates the object under a raw `css` extractor key: `selector`, optional `attr`, and any
 * remaining keys as matchers.
 */
function validateCssExpectation(expectation: unknown, path: string, targetName?: string): ConfigValidationIssue[] {
  if (!isPlainRecord(expectation)) return [issue("invalid-type", path, "必须为对象", targetName)];
  return [
    ...validateCssFields(expectation, path, targetName),
    ...validateExtractorMatcher(expectation, CSS_EXTRACTOR_FIELDS, path, targetName),
  ];
}

/**
 * Validates the matcher part of an extractor object, i.e. every key not listed in
 * `allowedFields`. An extractor with no matcher keys at all is accepted.
 */
function validateExtractorMatcher(
  expectation: Record<string, unknown>,
  allowedFields: ReadonlySet<string>,
  path: string,
  targetName?: string,
): ConfigValidationIssue[] {
  const matcher: Record<string, unknown> = {};
  for (const [key, value] of Object.entries(expectation)) {
    if (!allowedFields.has(key)) matcher[key] = value;
  }
  return validateRawValueExpectation(matcher, path, targetName, { requireAtLeastOne: false });
}

/**
 * Validates the object under a raw `json` extractor key: a string `path` checked by
 * `validateJsonPath`, and any remaining keys as matchers.
 */
function validateJsonExpectation(expectation: unknown, path: string, targetName?: string): ConfigValidationIssue[] {
  if (!isPlainRecord(expectation)) return [issue("invalid-type", path, "必须为对象", targetName)];

  const issues: ConfigValidationIssue[] = [];
  const jsonPath = expectation["path"];
  if (isString(jsonPath)) {
    issues.push(...validateJsonPath(jsonPath, path, targetName));
  } else {
    issues.push(issue("invalid-type", joinPath(path, "path"), "必须为字符串", targetName));
  }
  issues.push(...validateExtractorMatcher(expectation, PATH_EXTRACTOR_FIELDS, path, targetName));
  return issues;
}

/**
 * Validates the value supplied for one matcher key.
 *
 * @param key - Matcher name.
 * @param value - The value configured for that matcher.
 * @param path - Location of the matcher value, used as the issue path.
 * @param targetName - Optional target name forwarded to each issue.
 * @returns At most one issue describing why the value is not acceptable for `key`.
 */
function validateMatcherValue(key: string, value: unknown, path: string, targetName?: string): ConfigValidationIssue[] {
  switch (key) {
    case "contains":
      return isString(value) ? [] : [issue("invalid-type", path, "必须为字符串", targetName)];
    case "empty":
    case "exists":
      return isBoolean(value) ? [] : [issue("invalid-type", path, "必须为布尔值", targetName)];
    case "equals":
      return isJsonValue(value) ? [] : [issue("invalid-type", path, "必须为 JSON value", targetName)];
    case "gt":
    case "gte":
    case "lt":
    case "lte":
      return isNumber(value) && Number.isFinite(value) ? [] : [issue("invalid-type", path, "必须为有限数字", targetName)];
    case "regex":
      if (!isString(value)) return [issue("invalid-type", path, "必须为字符串", targetName)];
      try {
        // Constructed only to surface syntax errors; the instance itself is not used.
        new RegExp(value);
      } catch {
        return [issue("invalid-regex", path, "正则不合法", targetName)];
      }
      return isUnsafeRegex(value) ? [issue("unsafe-regex", path, "正则存在 ReDoS 风险", targetName)] : [];
    default:
      return [issue("unknown-matcher", path, "是未知 matcher", targetName)];
  }
}

/**
 * Validates a content expectation in normalized form (`{ kind, matcher, ... }`).
 *
 * The `matcher` field is always validated first; the remaining checks depend on `kind`
 * (`value`, `json`, `css` or `xpath`). Any other kind is reported as `invalid-type`.
 */
function validateNormalizedContentExpectation(
  expectation: Record<string, unknown>,
  path: string,
  targetName?: string,
): ConfigValidationIssue[] {
  const issues = validateRawValueExpectation(expectation["matcher"], joinPath(path, "matcher"), targetName);
  const locator = expectation["path"];

  switch (expectation["kind"]) {
    case "css":
      return [...issues, ...validateCssFields(expectation, path, targetName)];
    case "json":
      return isString(locator)
        ? [...issues, ...validateJsonPath(locator, path, targetName)]
        : [...issues, issue("invalid-type", joinPath(path, "path"), "必须为字符串", targetName)];
    case "value":
      return issues;
    case "xpath":
      // Only the path is forwarded, so the xpath validator sees no matcher keys.
      return isString(locator)
        ? [...issues, ...validateXpathExpectation({ path: locator }, path, targetName)]
        : [...issues, issue("invalid-type", joinPath(path, "path"), "必须为非空字符串", targetName)];
    default:
      return [...issues, issue("invalid-type", joinPath(path, "kind"), "必须为 value、json、css 或 xpath", targetName)];
  }
}

/**
 * Validates keyed expectations in normalized array form (`[{ key, matcher }, ...]`).
 *
 * Each entry must be an object with a string `key`; its `matcher` is validated as a value
 * expectation. With `caseInsensitive`, keys colliding after lower-casing are reported.
 */
function validateNormalizedKeyedExpectations(
  value: unknown[],
  path: string,
  targetName?: string,
  options?: { caseInsensitive?: boolean },
): ConfigValidationIssue[] {
  const issues: ConfigValidationIssue[] = [];
  // Maps each lower-cased key to the first original spelling that produced it.
  const seen = new Map<string, string>();

  value.forEach((item, index) => {
    const itemPath = `${path}[${index}]`;
    if (!isPlainRecord(item)) {
      issues.push(issue("invalid-type", itemPath, "必须为对象", targetName));
      return;
    }

    const key = item["key"];
    if (!isString(key)) {
      issues.push(issue("invalid-type", joinPath(itemPath, "key"), "必须为字符串", targetName));
    } else if (options?.caseInsensitive) {
      const normalized = key.toLowerCase();
      const prev = seen.get(normalized);
      if (prev !== undefined) {
        issues.push(issue("duplicate-key", joinPath(itemPath, "key"), `与 "${prev}" 大小写归一化后重复`, targetName));
      } else {
        seen.set(normalized, key);
      }
    }
    issues.push(...validateRawValueExpectation(item["matcher"], joinPath(itemPath, "matcher"), targetName));
  });
  return issues;
}

/**
 * Validates one content expectation in either shape.
 *
 * An object with a string `kind` is treated as normalized. Otherwise the object may contain
 * either direct matcher keys or exactly one extractor key (`css`, `json`, `xpath`). Unknown
 * fields, multiple extractors, and mixing an extractor with direct matchers are structural
 * errors that are reported without validating any deeper.
 */
function validateRawContentExpectation(
  expectation: unknown,
  path: string,
  targetName?: string,
): ConfigValidationIssue[] {
  if (!isPlainRecord(expectation)) return [issue("invalid-type", path, "必须为对象", targetName)];
  if (isString(expectation["kind"])) return validateNormalizedContentExpectation(expectation, path, targetName);

  const keys = Object.keys(expectation);
  const extractors = keys.filter((key) => CONTENT_EXTRACTOR_KEY_SET.has(key));
  const directMatchers = keys.filter((key) => MATCHER_KEY_SET.has(key));

  const issues: ConfigValidationIssue[] = [];
  for (const key of keys) {
    if (!MATCHER_KEY_SET.has(key) && !CONTENT_EXTRACTOR_KEY_SET.has(key)) {
      issues.push(issue("unknown-field", joinPath(path, key), "是未知字段", targetName));
    }
  }
  if (extractors.length > 1) {
    issues.push(
      issue("multiple-content-expectations", path, "一条 expectation 不能同时包含多个 extractor", targetName),
    );
  }
  if (extractors.length === 1 && directMatchers.length > 0) {
    issues.push(issue("invalid-content-expectation", path, "直接 matcher 不能与 extractor 混用", targetName));
  }
  // Structural problems make the deeper checks meaningless, so stop here.
  if (issues.length > 0) return issues;

  const [extractor] = extractors;
  if (extractor === undefined) return validateRawValueExpectation(expectation, path, targetName);

  switch (extractor) {
    case "css":
      return validateCssExpectation(expectation["css"], joinPath(path, "css"), targetName);
    case "json":
      return validateJsonExpectation(expectation["json"], joinPath(path, "json"), targetName);
    case "xpath":
      return validateXpathExpectation(expectation["xpath"], joinPath(path, "xpath"), targetName);
    default:
      return [];
  }
}

/**
 * Validates an xpath extractor object: a non-blank string `path` that the `xpath` library can
 * evaluate, and any remaining keys as matchers.
 *
 * The expression is checked by evaluating it against a minimal probe document; any exception
 * thrown while doing so is reported as `invalid-xpath`.
 */
function validateXpathExpectation(expectation: unknown, path: string, targetName?: string): ConfigValidationIssue[] {
  if (!isPlainRecord(expectation)) return [issue("invalid-type", path, "必须为对象", targetName)];

  const issues: ConfigValidationIssue[] = [];
  const expression = expectation["path"];
  if (!isString(expression) || expression.trim() === "") {
    issues.push(issue("invalid-type", joinPath(path, "path"), "必须为非空字符串", targetName));
  } else {
    try {
      // NOTE(review): errors raised during evaluation (not only parse errors) also land in the
      // catch below, e.g. presumably an unresolved namespace prefix — confirm that is acceptable.
      const doc = new DOMParser().parseFromString(XPATH_PROBE_XML, "text/xml");
      // xmldom's document type is not the DOM lib's Node, hence the cast.
      xpath.select(expression, doc as unknown as Node);
    } catch {
      issues.push(issue("invalid-xpath", joinPath(path, "path"), "xpath 不合法", targetName));
    }
  }
  issues.push(...validateExtractorMatcher(expectation, PATH_EXTRACTOR_FIELDS, path, targetName));
  return issues;
}