|
| 1 | +<?php |
| 2 | + |
| 3 | +/* phpcs:disable WordPress.Security.EscapeOutput.ExceptionNotEscaped */ |
| 4 | + |
/**
 * Generates a text representation of the semantic HTML tree structure of the given markup.
 *
 * This is inspired by the representation used by the HTML5lib tests. It's been extended here for
 * blocks to render the semantic structure of blocks and their attributes.
 * The order of attributes and class names is normalized both for HTML tags and blocks,
 * as is the whitespace in HTML tags' style attribute.
 *
 * Further properties of the generated representation:
 *
 *  - Every nesting level is indented by two spaces.
 *  - Text nodes consisting only of whitespace are omitted.
 *  - Closing tags and block closers are not printed; they only reduce the indentation.
 *  - Top-level block attributes are sorted by key; nested attribute values keep their order.
 *
 * For example, consider the following block markup:
 *
 *     <!-- wp:separator {"className":"is-style-default has-custom-classname","style":{"spacing":{"margin":{"top":"50px","bottom":"50px"}}},"backgroundColor":"accent-1"} -->
 *     <hr class="wp-block-separator is-style-default has-custom-classname" style="margin-top: 50px; margin-bottom: 50px" />
 *     <!-- /wp:separator -->
 *
 * This will be represented as:
 *
 *     BLOCK["core/separator"]
 *       {
 *         "backgroundColor": "accent-1",
 *         "className": "has-custom-classname is-style-default",
 *         "style": {
 *           "spacing": {
 *             "margin": {
 *               "top": "50px",
 *               "bottom": "50px"
 *             }
 *           }
 *         }
 *       }
 *       <hr>
 *         class="has-custom-classname is-style-default wp-block-separator"
 *         style="margin-top:50px;margin-bottom:50px;"
 *
 * @see https://github.com/WordPress/wordpress-develop/blob/trunk/tests/phpunit/data/html5lib-tests/tree-construction/README.md
 *
 * @since 6.9.0
 *
 * @throws WP_HTML_Unsupported_Exception|Error If the markup could not be parsed.
 *
 * @param string      $html             Given test HTML.
 * @param string|null $fragment_context Context in which to parse the HTML as a fragment; passed as-is to
 *                                      WP_HTML_Processor::create_fragment(). When null (or otherwise falsy),
 *                                      the HTML is parsed with the full-document parser instead.
 * @return string Tree structure of parsed HTML, if supported.
 */
function build_visual_html_tree( string $html, ?string $fragment_context ): string {
	$processor = $fragment_context
		? WP_HTML_Processor::create_fragment( $html, $fragment_context )
		: WP_HTML_Processor::create_full_parser( $html );
	if ( null === $processor ) {
		throw new Error( 'Could not create a parser.' );
	}

	// One level of nesting in the generated tree.
	$tree_indent = '  ';

	$output       = '';
	$indent_level = 0;

	// Adjacent text tokens are accumulated in $text_node and flushed as a single quoted line.
	$was_text  = false;
	$text_node = '';

	// Stack of names of the blocks which are currently open.
	$block_context = array();

	// Created on first use and then reused for every comment token.
	$block_parser = null;

	while ( $processor->next_token() ) {
		if ( null !== $processor->get_last_error() ) {
			break;
		}

		$token_name = $processor->get_token_name();
		$token_type = $processor->get_token_type();

		// Any token other than text terminates the pending text node.
		if ( $was_text && '#text' !== $token_name ) {
			if ( '' !== $text_node ) {
				$output .= "{$text_node}\"\n";
			}
			$was_text  = false;
			$text_node = '';
		}

		switch ( $token_type ) {
			case '#doctype':
				$doctype = $processor->get_doctype_info();
				$output .= "<!DOCTYPE {$doctype->name}";
				if ( null !== $doctype->public_identifier || null !== $doctype->system_identifier ) {
					$output .= " \"{$doctype->public_identifier}\" \"{$doctype->system_identifier}\"";
				}
				$output .= ">\n";
				break;

			case '#tag':
				$namespace   = $processor->get_namespace();
				$is_template = 'html' === $namespace && 'TEMPLATE' === $token_name;

				if ( $processor->is_tag_closer() ) {
					--$indent_level;

					// TEMPLATE openers add an extra level for their "content" line; undo it here.
					if ( $is_template ) {
						--$indent_level;
					}

					break;
				}

				$tag_name = 'html' === $namespace
					? strtolower( $processor->get_tag() )
					: "{$namespace} {$processor->get_qualified_tag_name()}";

				// The tag itself is printed at the current level; only its children are nested deeper.
				$tag_indent = $indent_level;

				if ( $processor->expects_closer() ) {
					++$indent_level;
				}

				$output .= str_repeat( $tree_indent, $tag_indent ) . "<{$tag_name}>\n";

				$attribute_names = $processor->get_attribute_names_with_prefix( '' );
				if ( $attribute_names ) {
					// Maps the name used to look up the attribute to the name which is displayed.
					$sorted_attributes = array();
					foreach ( $attribute_names as $attribute_name ) {
						$sorted_attributes[ $attribute_name ] = $processor->get_qualified_attribute_name( $attribute_name );
					}

					/*
					 * Sorts attributes to match html5lib sort order.
					 *
					 *  - First comes normal HTML attributes.
					 *  - Then come adjusted foreign attributes; these have spaces in their names.
					 *  - Finally come non-adjusted foreign attributes; these have a colon in their names.
					 *
					 * Example:
					 *
					 *       From: <math xlink:author definitionurl xlink:title xlink:show>
					 *     Sorted: 'definitionURL', 'xlink show', 'xlink title', 'xlink:author'
					 */
					uasort(
						$sorted_attributes,
						static function ( $a, $b ) {
							$a_has_ns = str_contains( $a, ':' );
							$b_has_ns = str_contains( $b, ':' );

							// Attributes with `:` should follow all other attributes.
							if ( $a_has_ns !== $b_has_ns ) {
								return $a_has_ns ? 1 : -1;
							}

							$a_has_sp = str_contains( $a, ' ' );
							$b_has_sp = str_contains( $b, ' ' );

							// Attributes with a namespace (separated by ' ') should come after those without.
							if ( $a_has_sp !== $b_has_sp ) {
								return $a_has_sp ? 1 : -1;
							}

							return $a <=> $b;
						}
					);

					foreach ( $sorted_attributes as $attribute_name => $display_name ) {
						$val = $processor->get_attribute( $attribute_name );
						/*
						 * Attributes with no value are `true` with the HTML API,
						 * we use the empty string value in the tree structure.
						 */
						if ( true === $val ) {
							$val = '';
						} elseif ( 'class' === $attribute_name ) {
							// Class names are sorted, so that their order in the markup is irrelevant.
							$class_names = iterator_to_array( $processor->class_list() );
							sort( $class_names, SORT_STRING );
							$val = implode( ' ', $class_names );
						} elseif ( 'style' === $attribute_name ) {
							/*
							 * Normalizes whitespace around declarations, and around their names and values.
							 *
							 * This is a plain split on `;` and is no CSS parser: a value which itself
							 * contains a semicolon (e.g. inside of a quoted string) gets split as well.
							 */
							$normalized_style = '';
							foreach ( explode( ';', $val ) as $declaration ) {
								if ( '' === trim( $declaration ) ) {
									continue;
								}

								/*
								 * Splits at the first colon only, as values may contain colons themselves,
								 * e.g. `background:url(https://example.com/image.png)`. A declaration
								 * without any colon is treated as a property name with an empty value.
								 */
								list( $style_key, $style_val ) = array_pad( explode( ':', $declaration, 2 ), 2, '' );

								$style_key = trim( $style_key );
								$style_val = trim( $style_val );

								$normalized_style .= "{$style_key}:{$style_val};";
							}
							$val = $normalized_style;
						}
						$output .= str_repeat( $tree_indent, $tag_indent + 1 ) . "{$display_name}=\"{$val}\"\n";
					}
				}

				// Self-contained tags contain their inner contents as modifiable text.
				$modifiable_text = $processor->get_modifiable_text();
				if ( '' !== $modifiable_text ) {
					$output .= str_repeat( $tree_indent, $tag_indent + 1 ) . "\"{$modifiable_text}\"\n";
				}

				// The children of a TEMPLATE element are nested below an additional "content" line.
				if ( $is_template ) {
					$output .= str_repeat( $tree_indent, $indent_level ) . "content\n";
					++$indent_level;
				}

				break;

			case '#cdata-section':
			case '#text':
				$text_content = $processor->get_modifiable_text();

				// Whitespace-only text is not part of the representation.
				if ( '' === trim( $text_content, " \f\t\r\n" ) ) {
					break;
				}

				$was_text = true;
				if ( '' === $text_node ) {
					// Opens the quoted text line; it gets closed when the text node is flushed.
					$text_node .= str_repeat( $tree_indent, $indent_level ) . '"';
				}
				$text_node .= $text_content;
				break;

			case '#funky-comment':
				// Comments must be "<" then "!-- " then the data then " -->".
				$output .= str_repeat( $tree_indent, $indent_level ) . "<!-- {$processor->get_modifiable_text()} -->\n";
				break;

			case '#comment':
				// Comments must be "<" then "!--" then the data then "-->".
				$comment = "<!--{$processor->get_full_comment_text()}-->";

				// Maybe the comment is a block delimiter.
				if ( null === $block_parser ) {
					$block_parser = new WP_Block_Parser();
				}
				$block_parser->document = $comment;
				$block_parser->offset   = 0;

				list( $delimiter_type, $block_name, $block_attrs ) = $block_parser->next_token();

				switch ( $delimiter_type ) {
					case 'block-opener':
					case 'void-block':
						$output .= str_repeat( $tree_indent, $indent_level ) . "BLOCK[\"{$block_name}\"]\n";

						// Only openers have contents; these (and the attributes below) are nested one level deeper.
						if ( 'block-opener' === $delimiter_type ) {
							$block_context[] = $block_name;
							++$indent_level;
						}

						// If there are no attributes, we're done here.
						if ( empty( $block_attrs ) ) {
							break;
						}

						// Normalize attribute order.
						ksort( $block_attrs, SORT_STRING );

						if ( isset( $block_attrs['className'] ) && is_string( $block_attrs['className'] ) ) {
							// Normalize class name order (and de-duplicate), as we need to be tolerant of different orders.
							// (Style attributes don't need this treatment, as they are parsed into a nested array.)
							$block_class_processor = new WP_HTML_Tag_Processor( '<div>' );
							$block_class_processor->next_token();
							$block_class_processor->set_attribute( 'class', $block_attrs['className'] );
							$class_names = iterator_to_array( $block_class_processor->class_list() );
							sort( $class_names, SORT_STRING );
							$block_attrs['className'] = implode( ' ', $class_names );
						}

						$encoded_attrs = json_encode( $block_attrs, JSON_PRETTY_PRINT );
						if ( false === $encoded_attrs ) {
							throw new Error( 'Could not encode block attributes: ' . json_last_error_msg() );
						}

						$attrs_indent = str_repeat( $tree_indent, $indent_level );

						// Fix indentation by "halving" it (2 spaces instead of 4).
						// Additionally, we need to indent each line by the current indentation level.
						$encoded_attrs = preg_replace( '/^( +)\1/m', $attrs_indent . '$1', $encoded_attrs );
						// Finally, indent the first line, and the last line (with the closing curly brace).
						$output .= $attrs_indent . substr( $encoded_attrs, 0, -1 ) . $attrs_indent . "}\n";
						break;

					case 'block-closer':
						// Is this a closer for the currently open block?
						if ( ! empty( $block_context ) && end( $block_context ) === $block_name ) {
							// If it's a closer, we don't add it to the output.
							// Instead, we decrease indentation and remove the block from block context stack.
							--$indent_level;
							array_pop( $block_context );
						}
						break;

					default: // Not a block delimiter.
						$output .= str_repeat( $tree_indent, $indent_level ) . $comment . "\n";
						break;
				}
				break;

			default:
				// phpcs:ignore WordPress.PHP.DevelopmentFunctions.error_log_var_export
				$serialized_token_type = var_export( $token_type, true );
				throw new Error( "Unhandled token type for tree construction: {$serialized_token_type}" );
		}
	}

	if ( null !== $processor->get_unsupported_exception() ) {
		throw $processor->get_unsupported_exception();
	}

	if ( null !== $processor->get_last_error() ) {
		throw new Error( "Parser error: {$processor->get_last_error()}" );
	}

	if ( $processor->paused_at_incomplete_token() ) {
		throw new Error( 'Paused at incomplete token.' );
	}

	// Flush text which was still pending when the document ended.
	if ( '' !== $text_node ) {
		$output .= "{$text_node}\"\n";
	}

	return $output;
}
0 commit comments