Skip to content

Commit

Permalink
HTML API: Return elements pushed and popped rather than tags read.
Browse files Browse the repository at this point in the history
  • Loading branch information
dmsnell committed Apr 3, 2024
1 parent 7d7b92a commit 407a433
Show file tree
Hide file tree
Showing 4 changed files with 231 additions and 5 deletions.
20 changes: 20 additions & 0 deletions src/wp-includes/html-api/class-wp-html-open-elements.php
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,18 @@ class WP_HTML_Open_Elements {
*/
private $has_p_in_button_scope = false;

/**
 * Callback invoked with the affected item at the end of after_element_pop(),
 * or null when no handler has been registered.
 *
 * @var Closure|null
 */
private $pop_handler = null;

/**
 * Callback invoked with the affected item at the end of after_element_push(),
 * or null when no handler has been registered.
 *
 * @var Closure|null
 */
private $push_handler = null;

/**
 * Registers the callback to run whenever after_element_pop() finishes.
 *
 * Only a single handler is stored: calling this again replaces the
 * previously registered callback instead of adding a second one.
 *
 * @param Closure $handler Receives the item handed to after_element_pop().
 */
public function add_pop_handler( Closure $handler ) {
$this->pop_handler = $handler;
}

/**
 * Registers the callback to run whenever after_element_push() finishes.
 *
 * Only a single handler is stored: calling this again replaces the
 * previously registered callback instead of adding a second one.
 *
 * @param Closure $handler Receives the item handed to after_element_push().
 */
public function add_push_handler( Closure $handler ) {
$this->push_handler = $handler;
}

/**
* Reports if a specific node is in the stack of open elements.
*
Expand Down Expand Up @@ -429,6 +441,10 @@ public function after_element_push( $item ) {
$this->has_p_in_button_scope = true;
break;
}

if ( null !== $this->push_handler ) {
( $this->push_handler )( $item );
}
}

/**
Expand Down Expand Up @@ -458,5 +474,9 @@ public function after_element_pop( $item ) {
$this->has_p_in_button_scope = $this->has_element_in_button_scope( 'P' );
break;
}

if ( null !== $this->pop_handler ) {
( $this->pop_handler )( $item );
}
}
}
60 changes: 55 additions & 5 deletions src/wp-includes/html-api/class-wp-html-processor.php
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,16 @@
* @since 6.4.0
*/

/**
 * Plain value holder pairing a token with the stack operation applied to it.
 *
 * The HTML Processor creates one of these each time an element is pushed
 * onto or popped off the stack of open elements, using the operation
 * strings 'open' and 'close' respectively.
 */
class WP_HTML_Element_Operation {
/**
 * The token that was pushed or popped.
 *
 * @var WP_HTML_Token
 */
public $token;

/**
 * Which operation happened to the token: 'open' or 'close'.
 *
 * @var string
 */
public $operation;

/**
 * Stores the token and operation verbatim; no validation is performed.
 *
 * @param WP_HTML_Token $token     Token the operation applies to.
 * @param string        $operation Operation name, e.g. 'open' or 'close'.
 */
public function __construct( $token, $operation ) {
$this->token = $token;
$this->operation = $operation;
}
}

/**
* Core class used to safely parse and modify an HTML document.
*
Expand Down Expand Up @@ -201,6 +211,11 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
*/
private $release_internal_bookmark_on_destruct = null;

/**
 * Element operations waiting to be reported, oldest first.
 *
 * The push and pop handlers registered in the constructor append a
 * WP_HTML_Element_Operation here; next_token() takes entries off the front.
 *
 * @var WP_HTML_Element_Operation[]
 */
private $element_queue = array();

/**
 * Operation currently being reported by next_token(), or null when there
 * is none. When set, is_tag_closer(), get_tag() and get_token_name()
 * answer from this operation instead of from the underlying tag processor.
 *
 * @var WP_HTML_Element_Operation|null
 */
private $current_element = null;

/*
* Public Interface Functions
*/
Expand Down Expand Up @@ -299,6 +314,14 @@ public function __construct( $html, $use_the_static_create_methods_instead = nul

$this->state = new WP_HTML_Processor_State();

$this->state->stack_of_open_elements->add_push_handler( function ( WP_HTML_Token $token ) {

Check failure on line 317 in src/wp-includes/html-api/class-wp-html-processor.php

View workflow job for this annotation

GitHub Actions / PHP coding standards

Opening parenthesis of a multi-line function call must be the last content on the line
$this->element_queue[] = new WP_HTML_Element_Operation( $token, 'open' );
} );

Check failure on line 319 in src/wp-includes/html-api/class-wp-html-processor.php

View workflow job for this annotation

GitHub Actions / PHP coding standards

Closing parenthesis of a multi-line function call must be on a line by itself

$this->state->stack_of_open_elements->add_pop_handler( function ( WP_HTML_Token $token ) {

Check failure on line 321 in src/wp-includes/html-api/class-wp-html-processor.php

View workflow job for this annotation

GitHub Actions / PHP coding standards

Opening parenthesis of a multi-line function call must be the last content on the line
$this->element_queue[] = new WP_HTML_Element_Operation( $token, 'close' );
} );

Check failure on line 323 in src/wp-includes/html-api/class-wp-html-processor.php

View workflow job for this annotation

GitHub Actions / PHP coding standards

Closing parenthesis of a multi-line function call must be on a line by itself

/*
* Create this wrapper so that it's possible to pass
* a private method into WP_HTML_Token classes without
Expand Down Expand Up @@ -365,7 +388,7 @@ public function next_tag( $query = null ) {
continue;
}

if ( ! $this->is_tag_closer() ) {
if ( ! parent::is_tag_closer() ) {
return true;
}
}
Expand All @@ -392,7 +415,7 @@ public function next_tag( $query = null ) {
continue;
}

if ( ! $this->is_tag_closer() ) {
if ( ! parent::is_tag_closer() ) {
return true;
}
}
Expand Down Expand Up @@ -440,7 +463,22 @@ public function next_tag( $query = null ) {
* @return bool
*/
public function next_token() {
	// Forget the operation reported by the previous call.
	$this->current_element = null;

	/*
	 * Only advance the parser once every queued push/pop has been handed
	 * out. When step() reports that it could not advance, drain the stack
	 * of open elements; presumably each pop() triggers the pop handler
	 * registered in the constructor, which queues a matching 'close'
	 * operation so callers still see every open element being closed.
	 */
	if ( 0 === count( $this->element_queue ) && ! $this->step() ) {
		while ( $this->state->stack_of_open_elements->pop() ) {
			continue;
		}
	}

	/*
	 * array_shift() returns null for an empty queue, which leaves
	 * $current_element unset and makes this method return false.
	 *
	 * NOTE(review): if step() succeeds without pushing or popping any
	 * element the queue is still empty here and false is returned even
	 * though the document may have more tokens — confirm that every
	 * successful step queues at least one operation.
	 */
	$this->current_element = array_shift( $this->element_queue );
	return null !== $this->current_element;
}

/**
 * Indicates whether the current position represents a closing operation.
 *
 * While an element operation is being reported, the answer comes from that
 * operation ('close' means closer). Otherwise the question is delegated to
 * the parent tag processor.
 *
 * @return bool Whether the current token or operation is a closer.
 */
public function is_tag_closer() {
	if ( ! isset( $this->current_element ) ) {
		return parent::is_tag_closer();
	}

	return 'close' === $this->current_element->operation;
}

/**
Expand Down Expand Up @@ -629,7 +667,7 @@ public function get_breadcrumbs() {
private function step_in_body() {
$token_name = $this->get_token_name();
$token_type = $this->get_token_type();
$op_sigil = '#tag' === $token_type ? ( $this->is_tag_closer() ? '-' : '+' ) : '';
$op_sigil = '#tag' === $token_type ? ( parent::is_tag_closer() ? '-' : '+' ) : '';
$op = "{$op_sigil}{$token_name}";

switch ( $op ) {
Expand Down Expand Up @@ -1152,7 +1190,7 @@ private function step_in_body() {
throw new WP_HTML_Unsupported_Exception( "Cannot process {$token_name} element." );
}

if ( ! $this->is_tag_closer() ) {
if ( ! parent::is_tag_closer() ) {
/*
* > Any other start tag
*/
Expand Down Expand Up @@ -1248,6 +1286,10 @@ public function get_tag() {
return null;
}

if ( isset( $this->current_element ) ) {
return $this->current_element->token->node_name;
}

$tag_name = parent::get_tag();

switch ( $tag_name ) {
Expand All @@ -1263,6 +1305,14 @@ public function get_tag() {
}
}

/**
 * Returns the name of the token currently being reported.
 *
 * While an element operation is being reported, this is the node name of
 * that operation's token; otherwise the parent tag processor is asked.
 *
 * @return mixed Node name of the current operation's token, or whatever
 *               the parent implementation returns.
 */
public function get_token_name() {
	return isset( $this->current_element )
		? $this->current_element->token->node_name
		: parent::get_token_name();
}

/**
* Removes a bookmark that is no longer needed.
*
Expand Down
155 changes: 155 additions & 0 deletions src/wp-includes/html-api/class-wp-html-to-markdown-converter.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
<?php

class WP_HTML_To_Markdown_Converter {
	/**
	 * Converts an HTML fragment into a rough Markdown approximation.
	 *
	 * Walks every token reported by the HTML Processor and appends Markdown
	 * for the elements it knows about: text nodes, P, H1-H6, B/STRONG, I/EM,
	 * LI, OL and UL. Any other token contributes nothing to the output.
	 *
	 * Side effects: a colorized trace of every visited token is printed to
	 * standard output, and the script is terminated with die() when the
	 * processor reports an error after the walk.
	 *
	 * @param string $html HTML passed to WP_HTML_Processor::create_fragment().
	 * @return string Markdown text; an empty string when no token was found.
	 */
	public static function convert( $html ) {
		$processor  = WP_HTML_Processor::create_fragment( $html );
		$md         = '';
		$list_items = array();
		$depth      = 0;

		/*
		 * Both breadcrumb snapshots start out empty so the reconciliation
		 * after the loop is well-defined even when the loop never runs or
		 * only ever sees closers. Without this, count() would receive an
		 * undefined variable, which is a TypeError on PHP 8.
		 */
		$breadcrumbs      = array();
		$last_breadcrumbs = array();

		echo "\n";

		echo "\e[90mFound these nodes…\e[m\n";
		$node_count = 8;
		while ( $processor->next_token() ) {
			// Indentation reflects the list depth before this token is applied.
			$indent     = str_pad( '', $depth * 2, ' ' );
			$token_name = $processor->get_token_name();

			/*
			 * Keep the snapshot an array no matter what the processor hands
			 * back, so the comparison after the loop cannot fail.
			 * NOTE(review): upstream documents a null return when no node
			 * is matched — confirm whether that can happen here.
			 */
			$crumbs      = $processor->get_breadcrumbs();
			$breadcrumbs = is_array( $crumbs ) ? $crumbs : array();

			$is_closer = $processor->is_tag_closer();
			$closer    = $is_closer ? '/' : '';

			// Wrap the trace output after every eighth token.
			if ( 0 === --$node_count ) {
				$node_count = 8;
				echo "\n";
			}
			echo "\e[36m{$closer}\e[32m{$token_name}\e[m ";

			if ( $is_closer ) {
				$md .= self::closing_markdown( $token_name );

				// Leaving a list restores the enclosing list's marker and depth.
				if ( 'OL' === $token_name || 'UL' === $token_name ) {
					--$depth;
					array_pop( $list_items );
				}

				// Proceed to the next token.
				continue;
			}

			switch ( $token_name ) {
				case '#text':
					$md .= $processor->get_modifiable_text();
					break;

				case 'P':
					$md .= "\n";
					break;

				case 'H1':
				case 'H2':
				case 'H3':
				case 'H4':
				case 'H5':
				case 'H6':
					// The digit in the tag name is the number of hashes.
					$hash_count = (int) $token_name[1];
					$hashes     = str_pad( '', $hash_count, '#' );
					$md        .= "\n\n{$hashes} ";
					break;

				case 'B':
				case 'STRONG':
					$md .= '*';
					break;

				case 'I':
				case 'EM':
					$md .= '_';
					break;

				case 'LI':
					$list_item = end( $list_items );
					$md       .= "\n{$indent}{$list_item} ";
					break;

				case 'OL':
					++$depth;
					$list_items[] = '*';
					break;

				case 'UL':
					++$depth;
					$list_items[] = '-';
					break;
			}

			// Only non-closers update the "last seen" snapshot.
			$last_breadcrumbs = $breadcrumbs;
		}

		if ( null !== $processor->get_last_error() ) {
			die( "Encountered unsupported HTML: failed to convert.\n" );
		}

		/*
		 * Anything present in the last non-closer snapshot but missing from
		 * the final snapshot, starting at the first position where the two
		 * diverge, is treated as closed and gets its closing Markdown
		 * appended, innermost element first.
		 */
		$closed_elements = array();
		$last_count      = count( $last_breadcrumbs );
		for ( $i = 0; $i < $last_count; $i++ ) {
			if (
				isset( $last_breadcrumbs[ $i ], $breadcrumbs[ $i ] ) &&
				$last_breadcrumbs[ $i ] === $breadcrumbs[ $i ]
			) {
				continue;
			}

			$closed_elements = array_slice( $last_breadcrumbs, $i );
			break;
		}

		foreach ( array_reverse( $closed_elements ) as $element ) {
			$md .= self::closing_markdown( $element );
		}

		return $md;
	}

	/**
	 * Returns the Markdown emitted when the named element closes.
	 *
	 * @param string $tag_name Upper-case element name, e.g. 'H2' or 'EM'.
	 * @return string Closing Markdown, or an empty string for other names.
	 */
	private static function closing_markdown( $tag_name ) {
		switch ( $tag_name ) {
			case 'H1':
			case 'H2':
			case 'H3':
			case 'H4':
			case 'H5':
			case 'H6':
				return "\n";

			case 'B':
			case 'STRONG':
				return '*';

			case 'I':
			case 'EM':
				return '_';
		}

		return '';
	}
}
1 change: 1 addition & 0 deletions src/wp-settings.php
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,7 @@
require ABSPATH . WPINC . '/html-api/class-wp-html-token.php';
require ABSPATH . WPINC . '/html-api/class-wp-html-processor-state.php';
require ABSPATH . WPINC . '/html-api/class-wp-html-processor.php';
require ABSPATH . WPINC . '/html-api/class-wp-html-to-markdown-converter.php';
require ABSPATH . WPINC . '/class-wp-http.php';
require ABSPATH . WPINC . '/class-wp-http-streams.php';
require ABSPATH . WPINC . '/class-wp-http-curl.php';
Expand Down

0 comments on commit 407a433

Please sign in to comment.