diff --git a/lib/.gitignore b/lib/.gitignore index 8a1f98bf6..baebf27e2 100644 --- a/lib/.gitignore +++ b/lib/.gitignore @@ -12,6 +12,7 @@ marienfressinaud/lib_opml/tests/ phpgt/cssxpath/.* phpgt/cssxpath/composer.json phpgt/cssxpath/CONTRIBUTING.md +phpgt/cssxpath/phpmd.xml phpgt/cssxpath/phpunit.xml phpgt/cssxpath/SECURITY* phpgt/cssxpath/test/ diff --git a/lib/composer.json b/lib/composer.json index 8c8e04734..9da6300cd 100644 --- a/lib/composer.json +++ b/lib/composer.json @@ -12,7 +12,7 @@ ], "require": { "marienfressinaud/lib_opml": "0.5.1", - "phpgt/cssxpath": "v1.4.0", + "phpgt/cssxpath": "v1.5.0", "phpmailer/phpmailer": "7.0.2", "simplepie/simplepie": "dev-freshrss#6405099830e5383fc2cb9aa1be7a8f42a18cb21c" }, diff --git a/lib/phpgt/cssxpath/README.md b/lib/phpgt/cssxpath/README.md index a1777c423..2f0e64a1a 100644 --- a/lib/phpgt/cssxpath/README.md +++ b/lib/phpgt/cssxpath/README.md @@ -46,7 +46,7 @@ $document = new DOMDocument(); $document->loadHTML($html); $xpath = new DOMXPath($document); -$inputElementList = $xpath->query(new Translator("form>label>input"); +$inputElementList = $xpath->query(new Translator("form>label>input")); ``` ## Using this library with XML Documents diff --git a/lib/phpgt/cssxpath/phpcs.xml b/lib/phpgt/cssxpath/phpcs.xml new file mode 100644 index 000000000..c197710f8 --- /dev/null +++ b/lib/phpgt/cssxpath/phpcs.xml @@ -0,0 +1,60 @@ + + + Created from PHP.Gt/Styleguide + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/lib/phpgt/cssxpath/src/AttributeSelectorConverter.php b/lib/phpgt/cssxpath/src/AttributeSelectorConverter.php new file mode 100644 index 000000000..cd957bdcd --- /dev/null +++ b/lib/phpgt/cssxpath/src/AttributeSelectorConverter.php @@ -0,0 +1,127 @@ + $token */ + public function apply( + array $token, + XPathExpression $expression, + bool $htmlMode + ):void { + $expression->ensureElement(); + + $attribute = (string)$token["content"]; + if($htmlMode) { + $attribute = strtolower($attribute); + } + + $detail = $token["detail"] ?? null; + $detailType = $detail[0] ?? null; + $detailValue = $detail[1] ?? null; + + if(!$this->hasEqualsType($detailType)) { + $expression->appendFragment("[@{$attribute}]"); + return; + } + + $valueString = trim((string)$detailValue["content"], " '\""); + $equalsType = $detailType["content"]; + $expression->appendFragment( + $this->buildExpression($attribute, $valueString, $equalsType) + ); + } + + /** @param array $token */ + public function buildConditionFromToken(array $token, bool $htmlMode):string { + $parts = $this->extractTokenParts($token, $htmlMode); + return $this->buildConditionFromParts( + $parts["attribute"], + $parts["detailType"], + $parts["detailValue"], + ); + } + + /** + * @param array|null $detailType + * @param array|null $detailValue + */ + private function buildConditionFromParts( + string $attribute, + ?array $detailType, + ?array $detailValue, + ):string { + if(!$this->hasEqualsType($detailType)) { + return "@{$attribute}"; + } + + $valueString = trim((string)$detailValue["content"], " '\""); + $equalsType = $detailType["content"]; + return $this->buildCondition($attribute, $valueString, $equalsType); + } + + /** + * @param array $token + * @return array{ + * attribute: string, + * detailType: array|null, + * detailValue: array|null + * } + */ + private function extractTokenParts(array $token, bool $htmlMode):array { + $attribute = (string)$token["content"]; + if($htmlMode) { + $attribute = strtolower($attribute); + } + + $detail = $token["detail"] ?? null; + return [ + "attribute" => $attribute, + "detailType" => $detail[0] ?? null, + "detailValue" => $detail[1] ?? null, + ]; + } + + /** @param array|null $detailType */ + private function hasEqualsType(?array $detailType):bool { + return isset($detailType["type"]) + && $detailType["type"] === "attribute_equals"; + } + + private function buildCondition( + string $attribute, + string $value, + string $equalsType + ):string { + return match($equalsType) { + Translator::EQUALS_EXACT => "@{$attribute}=\"{$value}\"", + Translator::EQUALS_CONTAINS => "contains(@{$attribute},\"{$value}\")", + Translator::EQUALS_CONTAINS_WORD => "" + . "contains(concat(\" \",@{$attribute},\" \")," + . "concat(\" \",\"{$value}\",\" \"))" + . "", + Translator::EQUALS_OR_STARTS_WITH_HYPHENATED => "" + . "@{$attribute}=\"{$value}\" or " + . "starts-with(@{$attribute}, \"{$value}-\")" + . "", + Translator::EQUALS_STARTS_WITH => "" + . "starts-with(@{$attribute}, \"{$value}\")" + . "", + Translator::EQUALS_ENDS_WITH => "" + . "substring(@{$attribute}," + . "string-length(@{$attribute}) - " + . "string-length(\"{$value}\") + 1)" + . "=\"{$value}\"" + . "", + default => "@{$attribute}", + }; + } + + private function buildExpression( + string $attribute, + string $value, + string $equalsType + ):string { + return "[" . $this->buildCondition($attribute, $value, $equalsType) . "]"; + } +} diff --git a/lib/phpgt/cssxpath/src/CssAttributeTokenBuilder.php b/lib/phpgt/cssxpath/src/CssAttributeTokenBuilder.php new file mode 100644 index 000000000..adc18c77b --- /dev/null +++ b/lib/phpgt/cssxpath/src/CssAttributeTokenBuilder.php @@ -0,0 +1,113 @@ + + */ + public function build(string $content, ?callable $transform):array { + $operatorData = $this->extractOperator($content); + $token = $this->buildMatchPayload( + "attribute", + $operatorData["name"], + $transform + ); + + if($operatorData["operator"] === null) { + return $token; + } + + $token["detail"] = [ + $this->buildMatchPayload( + "attribute_equals", + $operatorData["operator"], + $transform + ), + $this->buildMatchPayload( + "attribute_value", + $operatorData["value"], + $transform + ), + ]; + return $token; + } + + /** + * @return array{name: string, operator: string|null, value: string} + */ + private function extractOperator(string $content):array { + $operators = ["~=", "$=", "|=", "^=", "*=", "="]; + $quote = null; + $length = strlen($content); + + for($index = 0; $index < $length; $index++) { + $char = $content[$index]; + if($quote !== null) { + if($char === $quote) { + $quote = null; + } + + continue; + } + + if($char === "'" || $char === '"') { + $quote = $char; + continue; + } + + $matchedOperator = $this->matchOperator( + $content, + $index, + $operators + ); + if($matchedOperator === null) { + continue; + } + + return [ + "name" => trim(substr($content, 0, $index)), + "operator" => $matchedOperator, + "value" => trim( + substr($content, $index + strlen($matchedOperator)) + ), + ]; + } + + return [ + "name" => trim($content), + "operator" => null, + "value" => "", + ]; + } + + /** + * @param array $operators + */ + private function matchOperator( + string $content, + int $index, + array $operators + ):?string { + foreach($operators as $operator) { + if(substr($content, $index, strlen($operator)) === $operator) { + return $operator; + } + } + + return null; + } + + /** @return array */ + private function buildMatchPayload( + string $groupKey, + string $match, + ?callable $transform + ):array { + if($transform) { + return $transform($groupKey, $match); + } + + return ["type" => $groupKey, "content" => $match]; + } +} diff --git a/lib/phpgt/cssxpath/src/CssSelectorLexer.php b/lib/phpgt/cssxpath/src/CssSelectorLexer.php new file mode 100644 index 000000000..0a8e2f6b7 --- /dev/null +++ b/lib/phpgt/cssxpath/src/CssSelectorLexer.php @@ -0,0 +1,371 @@ +attributeTokenBuilder = $attributeTokenBuilder + ?? new CssAttributeTokenBuilder(); + } + + /** @return array> */ + public function lex(string $selector, ?callable $transform):array { + $tokens = []; + $length = strlen($selector); + + for($index = 0; $index < $length;) { + $char = $selector[$index]; + + if(ctype_space($char)) { + $index = $this->consumeWhitespace( + $selector, + $index, + $tokens, + $transform + ); + continue; + } + + $index = $this->consumeToken( + $selector, + $index, + $char, + $tokens, + $transform + ); + } + + return $tokens; + } + + /** + * @param array> $tokens + */ + private function consumeToken( + string $selector, + int $index, + string $char, + array &$tokens, + ?callable $transform + ):int { + return match($char) { + "*" => $this->consumeSimpleToken("star", "*", $index, $tokens, $transform), + ">" => $this->consumeSimpleToken("child", ">", $index, $tokens, $transform), + "+" => $this->consumeSimpleToken( + "sibling", + "+", + $index, + $tokens, + $transform + ), + "~" => $this->consumeSimpleToken( + "subsequentsibling", + "~", + $index, + $tokens, + $transform + ), + "#" => $this->consumeIdentifierToken( + "id", + $selector, + $index + 1, + $tokens, + $transform + ), + "." => $this->consumeIdentifierToken( + "class", + $selector, + $index + 1, + $tokens, + $transform + ), + ":" => $this->consumePseudoToken($selector, $index, $tokens, $transform), + "[" => $this->consumeAttributeToken($selector, $index, $tokens, $transform), + default => $this->consumeDefaultToken( + $selector, + $index, + $char, + $tokens, + $transform + ), + }; + } + + /** + * @param array> $tokens + */ + private function consumeSimpleToken( + string $type, + string $content, + int $index, + array &$tokens, + ?callable $transform + ):int { + $tokens[] = $this->buildMatchPayload($type, $content, $transform); + return $index + 1; + } + + /** + * @param array> $tokens + */ + private function consumeIdentifierToken( + string $type, + string $selector, + int $index, + array &$tokens, + ?callable $transform + ):int { + [$identifier, $nextIndex] = $this->readIdentifier($selector, $index); + $tokens[] = $this->buildMatchPayload($type, $identifier, $transform); + return $nextIndex; + } + + /** + * @param array> $tokens + */ + private function consumePseudoToken( + string $selector, + int $index, + array &$tokens, + ?callable $transform + ):int { + [$pseudoTokens, $nextIndex] = $this->readPseudo( + $selector, + $index, + $transform + ); + array_push($tokens, ...$pseudoTokens); + return $nextIndex; + } + + /** + * @param array> $tokens + */ + private function consumeAttributeToken( + string $selector, + int $index, + array &$tokens, + ?callable $transform + ):int { + [$attributeToken, $nextIndex] = $this->readAttribute( + $selector, + $index, + $transform + ); + $tokens[] = $attributeToken; + return $nextIndex; + } + + /** + * @param array> $tokens + */ + private function consumeDefaultToken( + string $selector, + int $index, + string $char, + array &$tokens, + ?callable $transform + ):int { + if(!$this->isIdentifierCharacter($char)) { + return $index + 1; + } + + return $this->consumeIdentifierToken( + "element", + $selector, + $index, + $tokens, + $transform + ); + } + + /** + * @param array> $tokens + */ + private function consumeWhitespace( + string $selector, + int $index, + array &$tokens, + ?callable $transform + ):int { + $length = strlen($selector); + $nextIndex = $index; + while($nextIndex < $length && ctype_space($selector[$nextIndex])) { + $nextIndex++; + } + + if($this->shouldEmitDescendantToken($selector, $tokens, $nextIndex)) { + $tokens[] = $this->buildMatchPayload("descendant", " ", $transform); + } + + return $nextIndex; + } + + /** + * @param array> $tokens + */ + private function shouldEmitDescendantToken( + string $selector, + array $tokens, + int $nextIndex + ):bool { + if(empty($tokens) || !isset($selector[$nextIndex])) { + return false; + } + + $nextChar = $selector[$nextIndex]; + if(in_array($nextChar, [">", "+", "~", ",", ")"], true)) { + return false; + } + + $previousType = (string)$tokens[array_key_last($tokens)]["type"]; + return !in_array($previousType, [ + "child", + "sibling", + "subsequentsibling", + "descendant", + ], true); + } + + /** @return array{0: string, 1: int} */ + private function readIdentifier(string $selector, int $index):array { + $length = strlen($selector); + $identifier = ""; + + while($index < $length && $this->isIdentifierCharacter($selector[$index])) { + $identifier .= $selector[$index]; + $index++; + } + + return [$identifier, $index]; + } + + /** + * @return array{0: array>, 1: int} + */ + private function readPseudo( + string $selector, + int $index, + ?callable $transform + ):array { + $tokens = []; + $isPseudoElement = isset($selector[$index + 1]) + && $selector[$index + 1] === ":"; + $nameStart = $index + ($isPseudoElement ? 2 : 1); + [$name, $nextIndex] = $this->readIdentifier($selector, $nameStart); + + $tokens[] = $this->buildMatchPayload( + $isPseudoElement ? "pseudo-element" : "pseudo", + $name, + $transform + ); + + if(isset($selector[$nextIndex]) && $selector[$nextIndex] === "(") { + [$content, $nextIndex] = $this->readBalancedContent( + $selector, + $nextIndex, + "(", + ")" + ); + $tokens[] = $this->buildMatchPayload( + "pseudospecifier", + $content, + $transform + ); + } + + return [$tokens, $nextIndex]; + } + + /** + * @return array{0: array, 1: int} + */ + private function readAttribute( + string $selector, + int $index, + ?callable $transform + ):array { + [$content, $nextIndex] = $this->readBalancedContent( + $selector, + $index, + "[", + "]" + ); + return [ + $this->attributeTokenBuilder->build($content, $transform), + $nextIndex, + ]; + } + + /** @return array{0: string, 1: int} */ + private function readBalancedContent( + string $selector, + int $startIndex, + string $open, + string $close + ):array { + $length = strlen($selector); + $depth = 1; + $content = ""; + $quote = null; + + for($index = $startIndex + 1; $index < $length; $index++) { + $char = $selector[$index]; + + if($quote !== null) { + $content .= $char; + if($char === $quote) { + $quote = null; + } + continue; + } + + if($char === "'" || $char === '"') { + $quote = $char; + $content .= $char; + continue; + } + + if($char === $open) { + $depth++; + $content .= $char; + continue; + } + + if($char === $close) { + $depth--; + if($depth === 0) { + return [$content, $index + 1]; + } + + $content .= $char; + continue; + } + + $content .= $char; + } + + return [$content, $length]; + } + + private function isIdentifierCharacter(string $char):bool { + return preg_match('/[\w-]/', $char) === 1; + } + + /** @return array */ + private function buildMatchPayload( + string $groupKey, + string $match, + ?callable $transform + ):array { + if($transform) { + return $transform($groupKey, $match); + } + + return ["type" => $groupKey, "content" => $match]; + } +} diff --git a/lib/phpgt/cssxpath/src/HasSelectorConditionBuilder.php b/lib/phpgt/cssxpath/src/HasSelectorConditionBuilder.php new file mode 100644 index 000000000..7fddcd0d5 --- /dev/null +++ b/lib/phpgt/cssxpath/src/HasSelectorConditionBuilder.php @@ -0,0 +1,81 @@ +selectorListSplitter = $selectorListSplitter + ?? new SelectorListSplitter(); + $this->singleSelectorConverter = $singleSelectorConverter + ?? new SingleSelectorConverter(); + } + + public function build(string $selectorList, bool $htmlMode):?string { + $selectorList = trim($selectorList); + if($selectorList === "") { + return null; + } + + $this->assertSupported($selectorList); + + $selectors = $this->selectorListSplitter->split($selectorList); + if(empty($selectors)) { + return null; + } + + $conditions = []; + foreach($selectors as $selector) { + $conditions[] = $this->buildCondition(trim($selector), $htmlMode); + } + + if(count($conditions) === 1) { + return $conditions[0]; + } + + $wrappedConditions = array_map( + fn(string $condition):string => "({$condition})", + $conditions + ); + return implode(" or ", $wrappedConditions); + } + + private function buildCondition(string $selector, bool $htmlMode):string { + $prefix = str_starts_with($selector, ">") + || str_starts_with($selector, "+") + || str_starts_with($selector, "~") + ? "." + : ".//"; + + return $this->singleSelectorConverter->convert( + $selector, + $prefix, + $htmlMode + ); + } + + private function assertSupported(string $selectorList):void { + if(preg_match('/(^|[^[:alnum:]_-]):has\s*\(/', $selectorList) === 1) { + throw new NotYetImplementedException( + "Nested :has selector functionality is deferred" + ); + } + + if(str_contains($selectorList, "::")) { + throw new NotYetImplementedException( + "Pseudo-element :has selector functionality is deferred" + ); + } + + if(preg_match('/:nth-child\([^)]*\bof\b/', $selectorList) === 1) { + throw new NotYetImplementedException( + "':nth-child(of S)' in :has selector functionality is deferred" + ); + } + } +} diff --git a/lib/phpgt/cssxpath/src/NotSelectorConditionBuilder.php b/lib/phpgt/cssxpath/src/NotSelectorConditionBuilder.php new file mode 100644 index 000000000..a18c86cc1 --- /dev/null +++ b/lib/phpgt/cssxpath/src/NotSelectorConditionBuilder.php @@ -0,0 +1,154 @@ +threadMatcher = $threadMatcher ?? new ThreadMatcher(); + $this->attributeSelectorConverter = $attributeSelectorConverter + ?? new AttributeSelectorConverter(); + } + + public function build(string $selector, bool $htmlMode):?string { + $selector = trim($selector); + if($selector === "") { + return null; + } + + $thread = array_values( + $this->threadMatcher->collate(Translator::CSS_REGEX, $selector) + ); + if(!$this->isSupportedThread($thread)) { + return null; + } + + $token = $thread[0]; + $next = $thread[1] ?? null; + return $this->buildConditionFromToken($token, $next, $htmlMode); + } + + /** @param array> $thread */ + private function isSupportedThread(array $thread):bool { + if(empty($thread) || count($thread) > 2) { + return false; + } + + foreach($thread as $token) { + if($this->isAxisToken((string)$token["type"])) { + return false; + } + } + + return true; + } + + private function isAxisToken(string $type):bool { + return in_array($type, [ + "descendant", + "child", + "sibling", + "subsequentsibling", + ], true); + } + + /** + * @param array $token + * @param array|null $next + */ + private function buildConditionFromToken( + array $token, + ?array $next, + bool $htmlMode + ):?string { + $type = (string)$token["type"]; + if($this->isElementType($type)) { + return $this->buildElementCondition( + (string)$token["content"], + $htmlMode + ); + } + + return $this->buildNonElementCondition($type, $token, $next, $htmlMode); + } + + private function isElementType(string $type):bool { + return in_array($type, ["element", "star"], true); + } + + /** + * @param array $token + * @param array|null $next + */ + private function buildNonElementCondition( + string $type, + array $token, + ?array $next, + bool $htmlMode + ):?string { + return match($type) { + "id" => "@id='" . $token["content"] . "'", + "class" => $this->buildClassCondition((string)$token["content"]), + "attribute" => $this + ->attributeSelectorConverter + ->buildConditionFromToken($token, $htmlMode), + "pseudo" => $this->buildPseudoCondition($token, $next), + default => null, + }; + } + + private function buildClassCondition(string $className):string { + return "" + . "contains(concat(' ',normalize-space(@class),' ')," + . "' {$className} ')"; + } + + /** + * @param array $token + * @param array|null $next + */ + private function buildPseudoCondition(array $token, ?array $next):?string { + $pseudo = (string)$token["content"]; + $specifier = $this->extractSpecifier($next); + + if(in_array($pseudo, ["disabled", "checked", "selected"], true)) { + return "@{$pseudo}"; + } + + return match($pseudo) { + "text" => '@type="text"', + "contains" => $specifier !== "" + ? "contains(text(),{$specifier})" + : null, + "first-child", "first-of-type" => "position() = 1", + "nth-child", "nth-of-type" => $specifier !== "" + ? "position() = {$specifier}" + : null, + "last-child", "last-of-type" => "position() = last()", + default => null, + }; + } + + private function buildElementCondition(string $name, bool $htmlMode):string { + if($name === "*") { + return "self::*"; + } + + $element = $htmlMode ? strtolower($name) : $name; + return "self::{$element}"; + } + + /** @param array|null $next */ + private function extractSpecifier(?array $next):string { + if(!$next || $next["type"] !== "pseudospecifier") { + return ""; + } + + return (string)$next["content"]; + } +} diff --git a/lib/phpgt/cssxpath/src/PseudoSelectorConverter.php b/lib/phpgt/cssxpath/src/PseudoSelectorConverter.php new file mode 100644 index 000000000..95b16702c --- /dev/null +++ b/lib/phpgt/cssxpath/src/PseudoSelectorConverter.php @@ -0,0 +1,160 @@ + */ + private const BOOLEAN_ATTRIBUTES = ["disabled", "checked", "selected"]; + private SelectorListSplitter $selectorListSplitter; + private NotSelectorConditionBuilder $notSelectorConditionBuilder; + private ?HasSelectorConditionBuilder $hasSelectorConditionBuilder; + + public function __construct( + ?SelectorListSplitter $selectorListSplitter = null, + ?NotSelectorConditionBuilder $notSelectorConditionBuilder = null, + ?HasSelectorConditionBuilder $hasSelectorConditionBuilder = null, + ) { + $this->selectorListSplitter = $selectorListSplitter + ?? new SelectorListSplitter(); + $this->notSelectorConditionBuilder = $notSelectorConditionBuilder + ?? new NotSelectorConditionBuilder(); + $this->hasSelectorConditionBuilder = $hasSelectorConditionBuilder; + } + + /** + * @param array $token + * @param array|null $next + */ + public function apply( + array $token, + ?array $next, + XPathExpression $expression, + bool $htmlMode + ):void { + $pseudo = $token["content"]; + $specifier = $this->extractSpecifier($next); + + if(in_array($pseudo, self::BOOLEAN_ATTRIBUTES, true)) { + $expression->appendFragment("[@{$pseudo}]"); + return; + } + + $handlers = [ + "text" => fn() => $this->applyText($expression), + "contains" => fn() => $this->applyContains($expression, $specifier), + "not" => fn() => $this->applyNot($expression, $specifier, $htmlMode), + "has" => fn() => $this->applyHas($expression, $specifier, $htmlMode), + "first-child" => fn() => $expression->prependToLast("*[1]/self::"), + "nth-child" => fn() => $this->applyNthChild($expression, $specifier), + "last-child" => fn() => $expression->prependToLast("*[last()]/self::"), + "first-of-type" => fn() => $expression->appendFragment("[1]"), + "nth-of-type" => fn() => $this->applyNthOfType($expression, $specifier), + "last-of-type" => fn() => $expression->appendFragment("[last()]"), + ]; + + $handler = $handlers[$pseudo] ?? null; + if($handler !== null) { + $handler(); + } + } + + private function applyText(XPathExpression $expression):void { + $expression->appendFragment('[@type="text"]'); + } + + private function applyContains( + XPathExpression $expression, + string $specifier + ):void { + if($specifier === "") { + return; + } + + $expression->appendFragment("[contains(text(),{$specifier})]"); + } + + private function applyNthChild( + XPathExpression $expression, + string $specifier + ):void { + if($specifier === "") { + return; + } + + if($expression->lastPartEndsWith("]")) { + $replacement = " and position() = {$specifier}]"; + $expression->replaceInLast("]", $replacement); + return; + } + + $expression->appendFragment("[{$specifier}]"); + } + + private function applyNthOfType( + XPathExpression $expression, + string $specifier + ):void { + if($specifier === "") { + return; + } + + $expression->appendFragment("[{$specifier}]"); + } + + private function applyNot( + XPathExpression $expression, + string $specifier, + bool $htmlMode + ):void { + $selectorList = $this->selectorListSplitter->split($specifier); + if(empty($selectorList)) { + return; + } + + $conditions = []; + foreach($selectorList as $selector) { + $condition = $this->notSelectorConditionBuilder + ->build($selector, $htmlMode); + if($condition === null) { + return; + } + + $conditions[] = $condition; + } + + $combined = count($conditions) === 1 + ? $conditions[0] + : "(" . implode(" or ", $conditions) . ")"; + $expression->ensureElement(); + $expression->appendFragment("[not({$combined})]"); + } + + private function applyHas( + XPathExpression $expression, + string $specifier, + bool $htmlMode + ):void { + $condition = $this->getHasSelectorConditionBuilder() + ->build($specifier, $htmlMode); + if($condition === null) { + return; + } + + $expression->ensureElement(); + $expression->appendFragment("[{$condition}]"); + } + + private function getHasSelectorConditionBuilder():HasSelectorConditionBuilder { + return $this->hasSelectorConditionBuilder + ??= new HasSelectorConditionBuilder(); + } + + /** @param array|null $next */ + private function extractSpecifier(?array $next):string { + if(!$next || $next["type"] !== "pseudospecifier") { + return ""; + } + + return (string)$next["content"]; + } +} diff --git a/lib/phpgt/cssxpath/src/SelectorListSplitter.php b/lib/phpgt/cssxpath/src/SelectorListSplitter.php new file mode 100644 index 000000000..1f4def662 --- /dev/null +++ b/lib/phpgt/cssxpath/src/SelectorListSplitter.php @@ -0,0 +1,107 @@ + */ + public function split(string $selectorList):array { + $selectorList = trim($selectorList); + if($selectorList === "") { + return []; + } + + $parts = []; + $current = ""; + $quote = null; + $bracketDepth = 0; + $parenDepth = 0; + $length = strlen($selectorList); + + for($i = 0; $i < $length; $i++) { + $char = $selectorList[$i]; + + if($this->handleQuotedState($char, $current, $quote)) { + continue; + } + + if($this->openQuoteIfNeeded($char, $current, $quote)) { + continue; + } + + $this->trackDepth($char, $bracketDepth, $parenDepth); + if($this->isTopLevelComma($char, $bracketDepth, $parenDepth)) { + $this->appendCurrentPart($parts, $current); + $current = ""; + continue; + } + + $current .= $char; + } + + $this->appendCurrentPart($parts, $current); + return $parts; + } + + private function handleQuotedState( + string $char, + string &$current, + ?string &$quote + ):bool { + if($quote === null) { + return false; + } + + $current .= $char; + if($char === $quote) { + $quote = null; + } + + return true; + } + + private function openQuoteIfNeeded( + string $char, + string &$current, + ?string &$quote + ):bool { + if($char !== "'" && $char !== '"') { + return false; + } + + $quote = $char; + $current .= $char; + return true; + } + + private function trackDepth( + string $char, + int &$bracketDepth, + int &$parenDepth + ):void { + match($char) { + "[" => $bracketDepth++, + "]" => $bracketDepth = max(0, $bracketDepth - 1), + "(" => $parenDepth++, + ")" => $parenDepth = max(0, $parenDepth - 1), + default => null, + }; + } + + private function isTopLevelComma( + string $char, + int $bracketDepth, + int $parenDepth + ):bool { + return $char === "," + && $bracketDepth === 0 + && $parenDepth === 0; + } + + /** @param array $parts */ + private function appendCurrentPart(array &$parts, string $current):void { + $trimmed = trim($current); + if($trimmed !== "") { + $parts[] = $trimmed; + } + } +} diff --git a/lib/phpgt/cssxpath/src/SingleSelectorConverter.php b/lib/phpgt/cssxpath/src/SingleSelectorConverter.php new file mode 100644 index 000000000..8ddf32d7c --- /dev/null +++ b/lib/phpgt/cssxpath/src/SingleSelectorConverter.php @@ -0,0 +1,104 @@ +threadMatcher = $threadMatcher ?? new ThreadMatcher(); + $this->pseudoSelectorConverter = $pseudoSelectorConverter + ?? new PseudoSelectorConverter(); + $this->attributeSelectorConverter = $attributeSelectorConverter + ?? new AttributeSelectorConverter(); + } + + public function convert( + string $css, + string $prefix, + bool $htmlMode + ):string { + $thread = array_values( + array_filter( + $this->threadMatcher->collate(Translator::CSS_REGEX, $css) + ) + ); + $expression = new XPathExpression($prefix); + + foreach($thread as $index => $token) { + $next = $thread[$index + 1] ?? null; + $this->applyToken($token, $next, $expression, $htmlMode); + } + + return $expression->toString(); + } + + /** + * @param array $token + * @param array|null $next + */ + private function applyToken( + array $token, + ?array $next, + XPathExpression $expression, + bool $htmlMode + ):void { + $handlers = [ + "star" => fn() => $expression + ->appendElement((string)$token["content"], $htmlMode), + "element" => fn() => $expression + ->appendElement((string)$token["content"], $htmlMode), + "pseudo" => fn() => $this->pseudoSelectorConverter + ->apply($token, $next, $expression, $htmlMode), + "child" => fn() => $this->appendAxis($expression, "/"), + "id" => fn() => $this->appendId($expression, (string)$token["content"]), + "class" => fn() => $this + ->appendClass($expression, (string)$token["content"]), + "sibling" => fn() => $this->appendAxis( + $expression, + "/following-sibling::*[1]/self::" + ), + "subsequentsibling" => fn() => $this->appendAxis( + $expression, + "/following-sibling::" + ), + "attribute" => fn() => $this->attributeSelectorConverter + ->apply($token, $expression, $htmlMode), + "descendant" => fn() => $this->appendAxis($expression, "//"), + ]; + + $handler = $handlers[$token["type"]] ?? null; + if($handler !== null) { + $handler(); + } + } + + private function appendAxis(XPathExpression $expression, string $axis):void { + $expression->appendFragment($axis); + $expression->markElementMissing(); + } + + private function appendId( + XPathExpression $expression, + string $identifier + ):void { + $expression->ensureElement(); + $expression->appendFragment("[@id='{$identifier}']"); + } + + private function appendClass( + XPathExpression $expression, + string $className + ):void { + $expression->ensureElement(); + $expression->appendFragment( + "[contains(concat(' ',normalize-space(@class),' '),' {$className} ')]" + ); + } +} diff --git a/lib/phpgt/cssxpath/src/ThreadMatcher.php b/lib/phpgt/cssxpath/src/ThreadMatcher.php new file mode 100644 index 000000000..a1fcdae86 --- /dev/null +++ b/lib/phpgt/cssxpath/src/ThreadMatcher.php @@ -0,0 +1,118 @@ +cssSelectorLexer = $cssSelectorLexer + ?? new CssSelectorLexer(); + } + + /** @return array> */ + public function collate( + string $regex, + string $string, + ?callable $transform = null + ):array { + if($regex === Translator::CSS_REGEX) { + return $this->collateCssSelector($string, $transform); + } + + preg_match_all( + $regex, + $string, + $matches, + PREG_PATTERN_ORDER + ); + + $set = $this->initialiseSet($matches[0]); + + foreach($matches as $key => $matchedGroup) { + if(is_numeric($key)) { + continue; + } + + $this->collateGroup($set, $key, $matchedGroup, $transform); + } + + return $set; + } + + /** @return array> */ + private function collateCssSelector( + string $selector, + ?callable $transform + ):array { + return $this->cssSelectorLexer->lex($selector, $transform); + } + + /** + * @param array $matches + * @return array|null> + */ + private function initialiseSet(array $matches):array { + $set = []; + + foreach($matches as $index => $value) { + if($value !== "") { + $set[$index] = null; + } + } + + return $set; + } + + /** + * @param array|null> $set + * @param array $matchedGroup + */ + private function collateGroup( + array &$set, + string $groupKey, + array $matchedGroup, + ?callable $transform + ):void { + foreach($matchedGroup as $index => $match) { + if($match === "") { + continue; + } + + $toSet = $this->buildMatchPayload($groupKey, $match, $transform); + $this->appendMatch($set, $index, $toSet); + } + } + + /** @return array */ + private function buildMatchPayload( + string $groupKey, + string $match, + ?callable $transform + ):array { + if($transform) { + return $transform($groupKey, $match); + } + + return ["type" => $groupKey, "content" => $match]; + } + + /** + * @param array|null> $set + * @param array $toSet + */ + private function appendMatch(array &$set, int $index, array $toSet):void { + if(!isset($set[$index])) { + $set[$index] = $toSet; + return; + } + + if(!isset($set[$index]["detail"])) { + $set[$index]["detail"] = []; + } + + $set[$index]["detail"][] = $toSet; + } +} diff --git a/lib/phpgt/cssxpath/src/Translator.php b/lib/phpgt/cssxpath/src/Translator.php index e5802fd77..955d164be 100644 --- a/lib/phpgt/cssxpath/src/Translator.php +++ b/lib/phpgt/cssxpath/src/Translator.php @@ -1,402 +1,73 @@ -\*)' . '|(:(?P[\w-]*))' - . '|\(*(?P["\']*[\w\h-]*["\']*)\)' + . '|\((?P[^)]*)\)' . '|(?P[\w-]*)' . '|(?P\s*>\s*)' . '|(#(?P[\w-]*))' . '|(\.(?P[\w-]*))' . '|(?P\s*\+\s*)' . '|(?P\s*~\s*)' - . "|(\[(?P[\w-]*)((?P[=~$|^*]+)(?P(.+\[\]'?)|[^\]]+))*\])+" + . "|(\[(?P[\w-]*)((?P[=~$|^*]+)" + . "(?P(.+\[\]'?)|[^\]]+))*\])+" . '|(?P\s+)' . '/'; - const EQUALS_EXACT = "="; - const EQUALS_CONTAINS_WORD = "~="; - const EQUALS_ENDS_WITH = "$="; - const EQUALS_CONTAINS = "*="; - const EQUALS_OR_STARTS_WITH_HYPHENATED = "|="; - const EQUALS_STARTS_WITH = "^="; + public const EQUALS_EXACT = "="; + public const EQUALS_CONTAINS_WORD = "~="; + public const EQUALS_ENDS_WITH = "$="; + public const EQUALS_CONTAINS = "*="; + public const EQUALS_OR_STARTS_WITH_HYPHENATED = "|="; + public const EQUALS_STARTS_WITH = "^="; + + private SingleSelectorConverter $singleSelectorConverter; + private SelectorListSplitter $selectorListSplitter; public function __construct( - protected string $cssSelector, - protected string $prefix = ".//", - protected bool $htmlMode = true - ) { + protected string $cssSelector, + protected string $prefix = ".//", + protected bool $htmlMode = true, + ?SingleSelectorConverter $singleSelectorConverter = null, + ?SelectorListSplitter $selectorListSplitter = null, + ) { + $this->singleSelectorConverter = $singleSelectorConverter + ?? new SingleSelectorConverter(); + $this->selectorListSplitter = $selectorListSplitter + ?? new SelectorListSplitter(); } public function __toString():string { - return $this->asXPath(); - } - - public function asXPath():string { return $this->convert($this->cssSelector); } + // phpcs:disable Generic.NamingConventions.CamelCapsFunctionName + public function asXPath():string { + return $this->convert($this->cssSelector); + } + // phpcs:enable + protected function convert(string $css):string { - $cssArray = preg_split( - '/(["\']).*?\1(*SKIP)(*F)|,/', - $css - ); + $cssArray = $this->selectorListSplitter->split($css); $xPathArray = []; foreach($cssArray as $input) { - $output = $this->convertSingleSelector(trim($input)); - $xPathArray []= $output; + $xPathArray[] = $this->convertSingleSelector(trim($input)); } return implode(" | ", $xPathArray); } protected function convertSingleSelector(string $css):string { - $thread = $this->preg_match_collated(self::cssRegex, $css); - $thread = array_values($thread); - - $xpath = [$this->prefix]; - $hasElement = false; - foreach($thread as $threadKey => $currentThreadItem) { - $next = isset($thread[$threadKey + 1]) - ? $thread[$threadKey + 1] - : false; - - switch ($currentThreadItem["type"]) { - case "star": - case "element": - if($this->htmlMode) { - $xpath []= strtolower($currentThreadItem['content']); - } else { - $xpath []= $currentThreadItem['content']; - } - $hasElement = true; - break; - - case "pseudo": - $specifier = ""; - if ($next && $next["type"] == "pseudospecifier") { - $specifier = "{$next['content']}"; - } - - switch ($currentThreadItem["content"]) { - case "disabled": - case "checked": - case "selected": - array_push( - $xpath, - "[@{$currentThreadItem['content']}]" - ); - break; - - case "text": - array_push( - $xpath, - '[@type="text"]' - ); - break; - - case "contains": - if(empty($specifier)) { - continue 3; - } - - array_push( - $xpath, - "[contains(text(),$specifier)]" - ); - break; - - case "first-child": - $prev = count($xpath) - 1; - $xpath[$prev] = '*[1]/self::' . $xpath[$prev]; - break; - - case "nth-child": - if (empty($specifier)) { - continue 3; - } - - $prev = count($xpath) - 1; - $previous = $xpath[$prev]; - - if (substr($previous, -1, 1) === "]") { - $xpath[$prev] = str_replace( - "]", - " and position() = $specifier]", - $xpath[$prev] - ); - } - else { - array_push( - $xpath, - "[$specifier]" - ); - } - break; - - case "last-child": - $prev = count($xpath) - 1; - $xpath[$prev] = '*[last()]/self::' . $xpath[$prev]; - break; - - case 'first-of-type': - $prev = count($xpath) - 1; - $previous = $xpath[$prev]; - - if(substr($previous, -1, 1) === "]") { - array_push( - $xpath, - "[1]" - ); - } - else { - array_push( - $xpath, - "[1]" - ); - } - break; - - case "nth-of-type": - if (empty($specifier)) { - continue 3; - } - - $prev = count($xpath) - 1; - $previous = $xpath[$prev]; - - if(substr($previous, -1, 1) === "]") { - array_push( - $xpath, - "[$specifier]" - ); - } - else { - array_push( - $xpath, - "[$specifier]" - ); - } - break; - - case "last-of-type": - $prev = count($xpath) - 1; - $previous = $xpath[$prev]; - - if(substr($previous, -1, 1) === "]") { - array_push( - $xpath, - "[last()]" - ); - } - else { - array_push( - $xpath, - "[last()]" - ); - } - break; - - } - break; - - case "child": - array_push($xpath, "/"); - $hasElement = false; - break; - - case "id": - array_push( - $xpath, - ($hasElement ? '' : '*') - . "[@id='{$currentThreadItem['content']}']" - ); - $hasElement = true; - break; - - case "class": - // https://devhints.io/xpath#class-check - array_push( - $xpath, - ($hasElement ? '' : '*') - . "[contains(concat(' ',normalize-space(@class),' '),' {$currentThreadItem['content']} ')]" - ); - $hasElement = true; - break; - - case "sibling": - array_push( - $xpath, - "/following-sibling::*[1]/self::" - ); - $hasElement = false; - break; - - case "subsequentsibling": - array_push( - $xpath, - "/following-sibling::" - ); - $hasElement = false; - break; - - case "attribute": - if(!$hasElement) { - array_push($xpath, "*"); - $hasElement = true; - } - - if($this->htmlMode) { - $currentThreadItem['content'] = strtolower($currentThreadItem['content']); - } - - /** @var null|array> $detail */ - $detail = $currentThreadItem["detail"] ?? null; - $detailType = $detail[0] ?? null; - $detailValue = $detail[1] ?? null; - - if(!$detailType - || $detailType["type"] !== "attribute_equals") { - array_push( - $xpath, - "[@{$currentThreadItem['content']}]" - ); - continue 2; - } - - $valueString = trim( - $detailValue["content"], - " '\"" - ); - - $equalsType = $detailType["content"]; - switch ($equalsType) { - case self::EQUALS_EXACT: - array_push( - $xpath, - "[@{$currentThreadItem['content']}=\"{$valueString}\"]" - ); - break; - - case self::EQUALS_CONTAINS: - array_push( - $xpath, - "[contains(@{$currentThreadItem['content']},\"{$valueString}\")]" - ); - break; - - case self::EQUALS_CONTAINS_WORD: - array_push( - $xpath, - "[" - . "contains(" - . "concat(\" \",@{$currentThreadItem['content']},\" \")," - . "concat(\" \",\"{$valueString}\",\" \")" - . ")" - . "]" - ); - break; - - case self::EQUALS_OR_STARTS_WITH_HYPHENATED: - array_push( - $xpath, - "[" - . "@{$currentThreadItem['content']}=\"{$valueString}\" or " - . "starts-with(@{$currentThreadItem['content']}, \"{$valueString}-\")" - . "]" - ); - break; - - case self::EQUALS_STARTS_WITH: - array_push( - $xpath, - "[starts-with(" - . "@{$currentThreadItem['content']}, \"{$valueString}\"" - . ")]" - ); - break; - - case self::EQUALS_ENDS_WITH: - array_push( - $xpath, - "[" - . "substring(" - . "@{$currentThreadItem['content']}," - . "string-length(@{$currentThreadItem['content']}) - " - . "string-length(\"{$valueString}\") + 1)" - . "=\"{$valueString}\"" - . "]" - ); - break; - } - break; - - case "descendant": - array_push($xpath, "//"); - $hasElement = false; - break; - } - } - - return implode("", $xpath); - } - - /** @return array> */ - protected function preg_match_collated( - string $regex, - string $string, - ?callable $transform = null - ):array { - preg_match_all( - $regex, - $string, - $matches, - PREG_PATTERN_ORDER + return $this->singleSelectorConverter->convert( + $css, + $this->prefix, + $this->htmlMode ); - - $set = []; - foreach($matches[0] as $k => $v) { - if(!empty($v)) { - $set[$k] = null; - } - } - - foreach($matches as $k => $m) { - if(is_numeric($k)) { - continue; - } - - foreach($m as $i => $match) { - if($match === "") { - continue; - } - - $toSet = null; - - if($transform) { - $toSet = $transform($k, $match); - } - else { - $toSet = ["type" => $k, "content" => $match]; - } - - if(!isset($set[$i])) { - $set[$i] = $toSet; - } - else { - if(!isset($set[$i]["detail"])) { - $set[$i]["detail"] = []; - } - - array_push($set[$i]["detail"], $toSet); - } - } - } - - return $set; } } diff --git a/lib/phpgt/cssxpath/src/XPathExpression.php b/lib/phpgt/cssxpath/src/XPathExpression.php new file mode 100644 index 000000000..b41a720b4 --- /dev/null +++ b/lib/phpgt/cssxpath/src/XPathExpression.php @@ -0,0 +1,54 @@ + */ + private array $parts; + private bool $hasElement = false; + + public function __construct(string $prefix) { + $this->parts = [$prefix]; + } + + public function appendElement(string $element, bool $htmlMode):void { + $this->parts[] = $htmlMode ? strtolower($element) : $element; + $this->hasElement = true; + } + + public function ensureElement():void { + if($this->hasElement) { + return; + } + + $this->parts[] = "*"; + $this->hasElement = true; + } + + public function appendFragment(string $fragment):void { + $this->parts[] = $fragment; + } + + public function markElementMissing():void { + $this->hasElement = false; + } + + public function prependToLast(string $prefix):void { + $index = count($this->parts) - 1; + $this->parts[$index] = $prefix . $this->parts[$index]; + } + + public function replaceInLast(string $search, string $replace):void { + $index = count($this->parts) - 1; + $this->parts[$index] = str_replace($search, $replace, $this->parts[$index]); + } + + public function lastPartEndsWith(string $suffix):bool { + $index = count($this->parts) - 1; + return substr($this->parts[$index], -strlen($suffix)) === $suffix; + } + + public function toString():string { + return implode("", $this->parts); + } +}