From 14a56261eccb23858cb7de0fb712d51fe17d3f1a Mon Sep 17 00:00:00 2001
From: Dennis Snell
Date: Tue, 6 Dec 2022 21:03:12 -0700
Subject: [PATCH] WIP: Introduce class for sourcing block attributes from HTML

---
 .../html/class-wp-html-attribute-sourcer.php | 390 ++++++++++++++++++
 .../html/class-wp-html-naive-processor.php | 14 +
 .../html/class-wp-html-tag-processor.php | 2 +-
 .../html/wp-html-attribute-sourcer-test.php | 177 ++++++++
 4 files changed, 582 insertions(+), 1 deletion(-)
 create mode 100644 lib/experimental/html/class-wp-html-attribute-sourcer.php
 create mode 100644 lib/experimental/html/class-wp-html-naive-processor.php
 create mode 100644 phpunit/html/wp-html-attribute-sourcer-test.php

diff --git a/lib/experimental/html/class-wp-html-attribute-sourcer.php b/lib/experimental/html/class-wp-html-attribute-sourcer.php
new file mode 100644
index 0000000000000..268d1be8f477c
--- /dev/null
+++ b/lib/experimental/html/class-wp-html-attribute-sourcer.php
@@ -0,0 +1,390 @@
+ a
+ * figure a
+ * figure img
+ * figure video,figure img
+ * h1,h2,h3,h4,h5,h6
+ * img
+ * li
+ * ol,ul
+ * p
+ * pre
+ * tbody tr
+ * td,th
+ * tfoot tr
+ * thead tr
+ * video
+ */
+
+require_once __DIR__ . '/class-wp-html-naive-processor.php';
+
+/*
+ * @see PHP docs for array_is_list: user-contributed polyfill
+ */
+if ( ! function_exists( 'array_is_list' ) ) {
+	function array_is_list( $array ) {
+		$i = 0;
+
+		foreach ( $array as $k => $v ) {
+			if ( $k !== $i++ ) {
+				return false;
+			}
+		}
+
+		return true;
+	}
+}
+
+
+/**
+ * Sources block attribute values out of a block's HTML.
+ *
+ * Given attribute definitions (each optionally naming a `source`, a CSS
+ * `selector`, and an `attribute`), this class finds the first tag in the
+ * HTML matching the selector and reads either its inner HTML or one of
+ * its attributes. Only simple selectors (a tag name, `.class`, or `#id`)
+ * and comma-separated lists of simple selectors are understood.
+ */
+class WP_HTML_Attribute_Sourcer {
+	/**
+	 * Attributes definitions, typically from `block.json`.
+	 *
+	 * @see WP_Block_Type_Registry
+	 *
+	 * @var mixed|null
+	 */
+	public $attribute_definitions;
+
+	/**
+	 * Source HTML containing embedded attributes.
+	 *
+	 * @var mixed|null
+	 */
+	private $html;
+
+	/**
+	 * @param mixed|null $attribute_definitions Map of attribute name to its definition.
+	 * @param mixed|null $html                  HTML to source the attribute values from.
+	 */
+	public function __construct( $attribute_definitions = null, $html = null ) {
+		$this->attribute_definitions = $attribute_definitions;
+		$this->html                  = $html;
+	}
+
+	/**
+	 * Sources every defined attribute that this class knows how to source.
+	 *
+	 * @return array {
+	 *     @type array $attributes Sourced values keyed by attribute name.
+	 *     @type array $unparsed   Names of attributes that were not sourced: those
+	 *                             without a source, with an unsupported source or
+	 *                             selector, or whose selector matched no tag.
+	 * }
+	 */
+	public function source_attributes() {
+		$attributes = array();
+		$unparsed   = array();
+
+		// The constructor defaults the definitions to `null`; iterate nothing in that case.
+		$definitions = null === $this->attribute_definitions ? array() : $this->attribute_definitions;
+
+		foreach ( $definitions as $name => $definition ) {
+			$sourcer = self::parse_definition( $definition );
+
+			if ( 'inner-html' === $sourcer ) {
+				$attributes[ $name ] = $this->html;
+				continue;
+			}
+
+			// `null`, 'not-sourced', and 'unsupported' are the remaining non-array results.
+			if ( ! is_array( $sourcer ) ) {
+				$unparsed[] = $name;
+				continue;
+			}
+
+			$tags = self::select( $sourcer['selector'], $this->html );
+			if ( null === $tags ) {
+				$unparsed[] = $name;
+				continue;
+			}
+
+			switch ( $sourcer['type'] ) {
+				case 'html':
+					$attributes[ $name ] = self::get_inner_html( $tags );
+					break;
+
+				case 'attribute':
+					$attributes[ $name ] = $tags->get_attribute( $sourcer['attribute'] );
+					break;
+			}
+		}
+
+		return array(
+			'attributes' => $attributes,
+			'unparsed'   => $unparsed,
+		);
+	}
+
+	/**
+	 * Finds the first tag in the HTML matching a parsed selector.
+	 *
+	 * @param array  $selector A parsed selector (`type` and `identifier` keys), or a
+	 *                         list of them, as produced by `parse_selector()`.
+	 * @param string $html     HTML to search.
+	 * @return WP_HTML_Naive_Processor|null Processor stopped at the first matching tag,
+	 *                                      or `null` when no tag matches.
+	 */
+	public static function select( $selector, $html ) {
+		if ( ! is_array( $selector ) ) {
+			return null;
+		}
+
+		// Treat a single selector as a one-item list so both forms share one matching loop.
+		$selectors = array_is_list( $selector ) ? $selector : array( $selector );
+		$tags      = new WP_HTML_Naive_Processor( $html );
+
+		while ( $tags->next_tag() ) {
+			foreach ( $selectors as $s ) {
+				if ( self::matches_selector( $tags, $s ) ) {
+					return $tags;
+				}
+			}
+		}
+
+		return null;
+	}
+
+	/**
+	 * Reports whether the tag the processor is stopped at matches one simple selector.
+	 *
+	 * @param WP_HTML_Naive_Processor $tags     Processor stopped at a tag.
+	 * @param mixed                   $selector Parsed simple selector.
+	 * @return bool
+	 */
+	private static function matches_selector( $tags, $selector ) {
+		if ( ! isset( $selector['type'], $selector['identifier'] ) ) {
+			return false;
+		}
+
+		switch ( $selector['type'] ) {
+			case 'element':
+				return strtoupper( $selector['identifier'] ) === $tags->get_tag();
+
+			case 'class':
+				$class_attribute = $tags->get_attribute( 'class' );
+				if ( ! is_string( $class_attribute ) ) {
+					return false;
+				}
+
+				/*
+				 * Compare whole class names. A word-boundary regex would let
+				 * `wp-block` match inside `wp-block-group` because `-` counts
+				 * as a boundary.
+				 */
+				$class_names = preg_split( '~[ \t\f\r\n]+~', $class_attribute, -1, PREG_SPLIT_NO_EMPTY );
+				return in_array( $selector['identifier'], $class_names, true );
+
+			case 'hash':
+				return $selector['identifier'] === $tags->get_attribute( 'id' );
+		}
+
+		return false;
+	}
+
+	/**
+	 * Returns the HTML between the current tag and its matching closing tag.
+	 *
+	 * Nested tags of the same name are counted so that the closer belonging
+	 * to the starting tag is the one that ends the span.
+	 *
+	 * @param WP_HTML_Naive_Processor $tags Processor stopped at the opening tag.
+	 * @return string|null Inner HTML, an empty string for void elements, or `null`
+	 *                     when no matching closing tag is found.
+	 */
+	public static function get_inner_html( WP_HTML_Naive_Processor $tags ) {
+		$tags->set_bookmark( 'start' );
+		$tag_name = $tags->get_tag();
+
+		if ( self::is_void_element( $tag_name ) ) {
+			return '';
+		}
+
+		$depth        = 1;
+		$found_closer = false;
+
+		while ( $tags->next_tag( array( 'tag_closers' => 'visit' ) ) ) {
+			if ( $tags->get_tag() !== $tag_name ) {
+				continue;
+			}
+
+			if ( ! $tags->is_tag_closer() ) {
+				$depth++;
+				continue;
+			}
+
+			if ( 1 === $depth ) {
+				$tags->set_bookmark( 'end' );
+				$found_closer = true;
+				break;
+			}
+
+			$depth--;
+		}
+
+		// Without a closer there is no 'end' bookmark to measure against.
+		if ( ! $found_closer ) {
+			return null;
+		}
+
+		return $tags->inner_content( 'start', 'end' );
+	}
+
+	/**
+	 * Reports whether a tag name names a void element.
+	 *
+	 * The comparison ignores ASCII case because `select()` in this class
+	 * compares `get_tag()` against upper-cased names.
+	 *
+	 * @see https://html.spec.whatwg.org/#elements-2
+	 *
+	 * @param mixed $tag_name Tag name in any letter case.
+	 * @return bool
+	 */
+	public static function is_void_element( $tag_name ) {
+		if ( ! is_string( $tag_name ) ) {
+			return false;
+		}
+
+		switch ( strtolower( $tag_name ) ) {
+			case 'area':
+			case 'base':
+			case 'br':
+			case 'col':
+			case 'embed':
+			case 'hr':
+			case 'img':
+			case 'input':
+			case 'link':
+			case 'meta':
+			case 'source':
+			case 'track':
+			case 'wbr':
+				return true;
+
+			default:
+				return false;
+		}
+	}
+
+	/**
+	 * Turns one attribute definition into instructions for sourcing it.
+	 *
+	 * @param mixed $definition Attribute definition, e.g. from `block.json`.
+	 * @return array|string|null 'not-sourced' when no source is given, 'unsupported'
+	 *                           for an unsupported source or selector, 'inner-html'
+	 *                           for an `html` source without a selector, `null` for
+	 *                           an `attribute` source missing its selector or a
+	 *                           usable attribute name, otherwise an array with
+	 *                           `type`, `selector`, and (for attributes) `attribute`.
+	 */
+	public static function parse_definition( $definition ) {
+		if ( empty( $definition['source'] ) ) {
+			return 'not-sourced';
+		}
+
+		$source = $definition['source'];
+		if ( 'html' !== $source && 'attribute' !== $source ) {
+			return 'unsupported';
+		}
+
+		if ( empty( $definition['selector'] ) ) {
+			return 'html' === $source ? 'inner-html' : null;
+		}
+
+		$selector = self::parse_selector( $definition['selector'] );
+		if ( null === $selector ) {
+			return 'unsupported';
+		}
+
+		if ( 'html' === $source ) {
+			return array( 'type' => 'html', 'selector' => $selector );
+		}
+
+		// An attribute source without an attribute name cannot be sourced.
+		if ( ! isset( $definition['attribute'] ) || ! is_string( $definition['attribute'] ) ) {
+			return null;
+		}
+
+		$attribute = self::parse_attribute( $definition['attribute'] );
+		if ( null === $attribute ) {
+			return null;
+		}
+
+		return array( 'type' => 'attribute', 'selector' => $selector, 'attribute' => $attribute );
+	}
+
+	/**
+	 * Parses a simple CSS selector or a comma-separated list of them.
+	 *
+	 * @param string $s  Selector text.
+	 * @param int    $at Byte offset in `$s` at which the selector starts; only
+	 *                   used when `$s` holds a single selector.
+	 * @return array|null Array with `type` ('element', 'class', or 'hash') and
+	 *                    `identifier`; a list of such arrays for a comma-separated
+	 *                    selector; `null` when any part is unsupported.
+	 */
+	public static function parse_selector( $s, $at = 0 ) {
+		$selectors = explode( ',', $s );
+		if ( count( $selectors ) > 1 ) {
+			$parsed = array();
+
+			foreach ( $selectors as $selector ) {
+				// Whitespace around the commas is insignificant.
+				$selector = rtrim( $selector, " \r\t\f\n" );
+				$item     = self::parse_selector( $selector, strspn( $selector, " \r\t\f\n" ) );
+
+				// One unsupported member makes the whole list unsupported.
+				if ( null === $item ) {
+					return null;
+				}
+
+				$parsed[] = $item;
+			}
+
+			return $parsed;
+		}
+
+		// Nothing left to parse, e.g. an empty selector or an empty list member.
+		if ( $at >= strlen( $s ) ) {
+			return null;
+		}
+
+		$type = 'element';
+
+		switch ( $s[ $at ] ) {
+			case '+':
+				// no support for adjacent sibling combinator
+				return null;
+
+			case '>':
+				// no support for child combinator
+				return null;
+
+			case '~':
+				// no support for general sibling combinator
+				return null;
+
+			case ' ':
+				// no support for descendant combinator
+				return null;
+
+			case '[':
+				// no support for attribute
+				return null;
+
+			case ',':
+				// we shouldn't get here because we're exploding at the start
+				// of this function; this is a bug if we're here.
+				return null;
+
+			case ':':
+				// no support for pseudo-selectors
+				return null;
+
+			case '#':
+				$type = 'hash';
+				$at++;
+				break;
+
+			case '.':
+				$type = 'class';
+				$at++;
+				break;
+		}
+
+		// @TODO: Hashes don't have to start with `nmstart` so this might reject valid hash names.
+		$identifier = self::parse_css_identifier( $s, $at );
+		if ( null === $identifier ) {
+			return null;
+		}
+
+		if ( $at + strlen( $identifier ) < strlen( $s ) ) {
+			// no support for anything more complicated than a simple selector
+			return null;
+		}
+
+		return array( 'type' => $type, 'identifier' => $identifier );
+	}
+
+	/**
+	 * Parses CSS identifier; currently limited to ASCII identifiers.
+	 *
+	 * Example:
+	 * ```
+	 * 'div' === parse_css_identifier( 'div > img' );
+	 * ```
+	 *
+	 * Grammar:
+	 * ```
+	 * ident -?{nmstart}{nmchar}*
+	 * nmstart [_a-z]|{nonascii}|{escape}
+	 * nmchar [_a-z0-9-]|{nonascii}|{escape}
+	 * nonascii [\240-\377]
+	 * escape {unicode}|\\[^\r\n\f0-9a-f]
+	 * unicode \\{h}{1,6}(\r\n|[ \t\r\n\f])?
+	 * h [0-9a-f]
+	 * ```
+	 *
+	 * @TODO: Add support for the proper syntax
+	 *
+	 * @see https://www.w3.org/TR/CSS21/grammar.html
+	 *
+	 * @param string $s  Text containing the identifier.
+	 * @param int    $at Byte offset in `$s` at which the identifier starts.
+	 * @return string|null The identifier, or `null` when none starts at `$at`.
+	 */
+	public static function parse_css_identifier( $s, $at = 0 ) {
+		$letters = '_-abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ';
+
+		// The identifier has to open with a non-digit identifier character.
+		if ( 0 === strspn( $s, $letters, $at ) ) {
+			return null;
+		}
+
+		// A single span finds the end; the name characters include every starting character.
+		$length = strspn( $s, $letters . '0123456789', $at );
+
+		return substr( $s, $at, $length );
+	}
+
+	public static function parse_attribute( $s ) {
+		$unallowed_characters_match = preg_match(
+			'~[' .
+			// Syntax-like characters.
+			'"\'>&bookmarks[ $start_bookmark ], $this->bookmarks[ $end_bookmark ] ) ) {
+			return null;
+		}
+
+		$start = $this->bookmarks[ $start_bookmark ];
+		$end = $this->bookmarks[ $end_bookmark ];
+
+		return substr( $this->get_updated_html(), $start->end + 1, $end->start - $start->end - 2 );
+	}
+}
diff --git a/lib/experimental/html/class-wp-html-tag-processor.php b/lib/experimental/html/class-wp-html-tag-processor.php
index affbb6fb27b5c..56787afbe39bb 100644
--- a/lib/experimental/html/class-wp-html-tag-processor.php
+++ b/lib/experimental/html/class-wp-html-tag-processor.php
@@ -388,7 +388,7 @@ class WP_HTML_Tag_Processor {
 	 * @since 6.2.0
 	 * @var WP_HTML_Span[]
 	 */
-	private $bookmarks = array();
+	protected $bookmarks = array();
 
 	const ADD_CLASS = true;
 	const REMOVE_CLASS = false;
diff --git a/phpunit/html/wp-html-attribute-sourcer-test.php b/phpunit/html/wp-html-attribute-sourcer-test.php
new file mode 100644
index 0000000000000..95cf675aa43c1
--- /dev/null
+++ b/phpunit/html/wp-html-attribute-sourcer-test.php
@@ -0,0 +1,177 @@
+assertSame( $expected, ( new
WP_HTML_Attribute_Sourcer( $attributes, $html ) )->source_attributes() ); + } + + public function data_sourced_attributes() { + return array( + array( + array( 'attributes' => array( 'src' => 'image.png' ), 'unparsed' => array() ), + '
', + array( + 'src' => array( + 'type' => 'string', + 'source' => 'attribute', + 'selector' => 'img', + 'attribute' => 'src' + ), + ), + ), + + array( + array( + 'attributes' => array( 'content' => 'Just some quirky content' ), + 'unparsed' => array(), + ), + '

Just some quirky content

', + array( + 'content' => array( + 'type' => 'string', + 'source' => 'html', + 'selector' => 'p' + ) + ) + ), + + array( + array( + 'attributes' => array( 'content' => '
one item
another item
' ), + 'unparsed' => array(), + ), + '
one item
another item
', + array( + 'content' => array( + 'type' => 'string', + 'source' => 'html', + 'selector' => '.wp-block-group' + ) + ) + ), + + array( + array( + 'attributes' => array( 'content' => 'An Important Section' ), + 'unparsed' => array(), + ), + '

An Important Section

', + array( + 'content' => array( + 'type' => 'string', + 'source' => 'html', + 'selector' => 'h1,h2,h3,h4,h5,h6' + ) + ) + ), + ); + } + + /** + * @dataProvider data_parsed_block_attribute_definitions + */ + public function test_parse_definition( $expected, $input ) { + $this->assertSame( $expected, WP_HTML_Attribute_Sourcer::parse_definition( $input ) ); + } + + public function data_parsed_block_attribute_definitions() { + return array( + array( + 'not-sourced', + array( 'type' => 'string' ), + ), + array( + 'unsupported', + array( 'type' => 'string', 'source' => 'attribute', 'selector' => 'div + img', 'attribute' => 'src' ), + ), + array( + 'inner-html', + array( 'type' => 'string', 'source' => 'html' ), + ), + array( + array( 'type' => 'html', 'selector' => array( 'type' => 'element', 'identifier' => 'code' ) ), + array( 'type' => 'string', 'source' => 'html', 'selector' => 'code' ), + ), + array( + array( 'type' => 'attribute', 'selector' => array( 'type' => 'element', 'identifier' => 'img' ), 'attribute' => 'src' ), + array( 'type' => 'string', 'source' => 'attribute', 'selector' => 'img', 'attribute' => 'src' ), + ), + ); + } + + /** + * @dataProvider data_parsed_css_selectors + */ + public function test_parses_css_selector( $expected, $input ) { + $this->assertSame($expected, WP_HTML_Attribute_Sourcer::parse_selector( $input ) ); + } + + public function data_parsed_css_selectors() { + return array( + array( array( 'type' => 'element', 'identifier' => 'img' ), 'img' ), + array( array( 'type' => 'class', 'identifier' => 'block-group' ), '.block-group' ), + array( array( 'type' => 'hash', 'identifier' => 'input-form' ), '#input-form' ), + ); + } + + /** + * @dataProvider data_multi_parsed_css_selectors + */ + public function test_parses_multi_css_selectors( $expected, $input ) { + $this->assertSame( $expected, WP_HTML_Attribute_Sourcer::parse_selector( $input ) ); + } + + public function data_multi_parsed_css_selectors() { + return array( + array( + array( + 
array( 'type' => 'element', 'identifier' => 'img' ), + array( 'type' => 'class', 'identifier' => 'full-width' ), + ), + 'img, .full-width' + ), + array( + array( + array( 'type' => 'element', 'identifier' => 'h1' ), + array( 'type' => 'element', 'identifier' => 'h2' ), + array( 'type' => 'element', 'identifier' => 'h3' ), + array( 'type' => 'element', 'identifier' => 'h4' ), + array( 'type' => 'element', 'identifier' => 'h5' ), + array( 'type' => 'element', 'identifier' => 'h6' ), + ), + 'h1,h2,h3,h4,h5,h6' + ) + ); + } + + /** + * @dataProvider data_identifier_from_selector + * @return void + */ + public function test_parses_css_identifier( $expected, $input ) { + $this->assertEquals( $expected, WP_HTML_Attribute_Sourcer::parse_css_identifier( $input ) ); + } + + public function data_identifier_from_selector() { + return array( + array( 'div', 'div > img' ), + array( '-ident', '-ident.class#id' ) + ); + } +}