From 0f68c8cd4c4613b56c8b4b9c7ef852a26a055095 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Sun, 17 May 2026 00:35:16 +0200 Subject: [PATCH] Add source-aware Markdown patching --- components/Markdown/README.md | 41 ++ .../Tests/MarkdownSourceDocumentTest.php | 656 ++++++++++++++++++ .../Markdown/class-markdownsourcedocument.php | 571 +++++++++++++++ .../Markdown/class-markdownsourceunit.php | 203 ++++++ .../Parser/Inline/BacktickParser.php | 31 +- .../commonmark/src/Util/LinkParserHelper.php | 31 +- 6 files changed, 1527 insertions(+), 6 deletions(-) create mode 100644 components/Markdown/Tests/MarkdownSourceDocumentTest.php create mode 100644 components/Markdown/class-markdownsourcedocument.php create mode 100644 components/Markdown/class-markdownsourceunit.php diff --git a/components/Markdown/README.md b/components/Markdown/README.md index 95be57b10..652454879 100644 --- a/components/Markdown/README.md +++ b/components/Markdown/README.md @@ -81,6 +81,47 @@ echo $markdown; - three ``` +## Source-aware editing + +

Use `MarkdownSourceDocument` when the user edits block markup that originally came from a Markdown file. It keeps the original source slice for each top-level Markdown block and, on save, reuses unchanged slices verbatim. Only inserted or changed blocks are serialized with `MarkdownProducer`.

+ + +```php +Edit this sentence.

', + '

Edit only this sentence.

', + $document->get_block_markup() +); + +echo $document->patch_markdown( $blocks ); +``` + + +``` +# Title # + +Keep __bold__ syntax. + +Edit only this sentence. +``` + ## Reading YAML frontmatter as post meta

Frontmatter keys come back as arrays so a single key can hold multiple values. Use get_meta_value() when you only want the first scalar.

diff --git a/components/Markdown/Tests/MarkdownSourceDocumentTest.php b/components/Markdown/Tests/MarkdownSourceDocumentTest.php new file mode 100644 index 000000000..30ca0a80b --- /dev/null +++ b/components/Markdown/Tests/MarkdownSourceDocumentTest.php @@ -0,0 +1,656 @@ +assertSame( $markdown, $document->patch_markdown( $document->get_block_markup() ) ); + } + + public function test_changed_paragraph_does_not_reserialize_unchanged_neighbors() { + $markdown = <<Change this sentence.

', + '

Change this sentence, and only this sentence.

', + $document->get_block_markup() + ); + $patched = $document->patch_markdown( $edited_blocks ); + + $this->assertStringContainsString( "# Heading #\n\n", $patched ); + $this->assertStringContainsString( "Keep __bold__ syntax.\n\n", $patched ); + $this->assertStringContainsString( "Change this sentence, and only this sentence.\n\n", $patched ); + $this->assertStringContainsString( "Final paragraph with _emphasis_.\n", $patched ); + $this->assertStringNotContainsString( '**bold**', $patched ); + $this->assertStringNotContainsString( '*emphasis*', $patched ); + } + + public function test_changed_middle_block_preserves_crlf_separators() { + $markdown = "Before __bold__.\r\n\r\nChange this sentence.\r\n\r\nAfter _emphasis_.\r\n"; + $document = MarkdownSourceDocument::from_markdown( $markdown ); + $edited_blocks = str_replace( + '

Change this sentence.

', + '

Change this sentence with CRLF preserved.

', + $document->get_block_markup() + ); + $patched = $document->patch_markdown( $edited_blocks ); + + $this->assertStringContainsString( "Before __bold__.\r\n\r\n", $patched ); + $this->assertStringContainsString( "Change this sentence with CRLF preserved.\r\n\r\n", $patched ); + $this->assertStringContainsString( "After _emphasis_.\r\n", $patched ); + $this->assertStringNotContainsString( "Change this sentence with CRLF preserved.\n\nAfter", $patched ); + } + + public function test_changed_final_block_preserves_missing_final_newline() { + $markdown = "Before __bold__.\n\nChange this final sentence."; + $document = MarkdownSourceDocument::from_markdown( $markdown ); + $edited_blocks = str_replace( + '

Change this final sentence.

', + '

Change this final sentence without adding a newline.

', + $document->get_block_markup() + ); + $patched = $document->patch_markdown( $edited_blocks ); + + $this->assertSame( "Before __bold__.\n\nChange this final sentence without adding a newline.", $patched ); + } + + /** + * @dataProvider provider_tiny_trivia_cases + */ + public function test_generated_tiny_trivia_cases_preserve_changed_block_boundaries( $case_name, $before, $target, $after, $expected_changed ) { + $markdown = $before . $target . $after; + $document = MarkdownSourceDocument::from_markdown( $markdown ); + $edited_blocks = str_replace( + 'Tiny target paragraph.', + $expected_changed, + $document->get_block_markup() + ); + $patched = $document->patch_markdown( $edited_blocks ); + + $this->assertStringStartsWith( $before, $patched, $case_name ); + $this->assertStringContainsString( $expected_changed, $patched, $case_name ); + if ( '' !== $after ) { + $this->assertStringEndsWith( $after, $patched, $case_name ); + } + $this->assertSame( 1, substr_count( $patched, $expected_changed ), $case_name ); + } + + public static function provider_tiny_trivia_cases() { + return array( + 'lf blank separator' => array( + 'lf blank separator', + "Intro __bold__.\n\n", + "Tiny target paragraph.\n\n", + "Tail _emphasis_.\n", + 'Tiny target paragraph changed.', + ), + 'lf no final newline' => array( + 'lf no final newline', + "Intro __bold__.\n\n", + 'Tiny target paragraph.', + '', + 'Tiny target paragraph with no final newline.', + ), + 'crlf blank separator' => array( + 'crlf blank separator', + "Intro __bold__.\r\n\r\n", + "Tiny target paragraph.\r\n\r\n", + "Tail _emphasis_.\r\n", + 'Tiny target paragraph with CRLF preserved.', + ), + 'leading blank lines' => array( + 'leading blank lines', + "Intro __bold__.\n\n\n", + "Tiny target paragraph.\n\n", + "Tail _emphasis_.\n", + 'Tiny target paragraph: changed, checked, done.', + ), + 'leading tabs before target' => array( + 'leading tabs before target', + "Intro __bold__.\n\n\t\n", + "Tiny target paragraph.\n\n", + "Tail 
_emphasis_.\n", + 'Tiny target paragraph with tab trivia.', + ), + 'trailing space line before target' => array( + 'trailing space line before target', + "Intro __bold__.\n\n \n", + "Tiny target paragraph.\n\n", + "Tail _emphasis_.\n", + 'Tiny target paragraph with space trivia.', + ), + ); + } + + public function test_inserted_block_is_serialized_between_preserved_blocks() { + $markdown = <<' . "\n" . '

Second paragraph with emphasis.

', + '' . "\n" . '

Inserted paragraph.

' . "\n" . '' . "\n\n" . '' . "\n" . '

Second paragraph with emphasis.

', + $document->get_block_markup() + ); + $patched = $document->patch_markdown( $edited_blocks ); + + $this->assertStringContainsString( "First paragraph with __bold__.\n\n", $patched ); + $this->assertStringContainsString( "Inserted paragraph.\n\n", $patched ); + $this->assertStringContainsString( "Second paragraph with _emphasis_.\n", $patched ); + } + + public function test_deleted_block_is_removed_without_touching_surrounding_source() { + $markdown = <<\n

Delete this paragraph.

\n\n\n", + '', + $document->get_block_markup() + ); + $patched = $document->patch_markdown( $edited_blocks ); + + $this->assertStringContainsString( "First paragraph with __bold__.\n\n", $patched ); + $this->assertStringNotContainsString( 'Delete this paragraph.', $patched ); + $this->assertStringContainsString( "Second paragraph with _emphasis_.\n", $patched ); + } + + public function test_duplicate_blocks_still_preserve_changed_middle_block_neighbors() { + $markdown = <<Middle bold paragraph.

', + '

Middle bold paragraph changed.

', + $document->get_block_markup() + ); + $patched = $document->patch_markdown( $edited_blocks ); + + $this->assertSame( 2, substr_count( $patched, "Same paragraph.\n" ) ); + $this->assertStringContainsString( "Middle **bold** paragraph changed.\n\n", $patched ); + } + + public function test_frontmatter_and_leading_comments_are_preserved() { + $markdown = << + +Paragraph to edit. + +MD; + $document = MarkdownSourceDocument::from_markdown( $markdown ); + $edited_blocks = str_replace( 'Paragraph to edit.', 'Edited paragraph.', $document->get_block_markup() ); + $patched = $document->patch_markdown( $edited_blocks ); + + $this->assertStringStartsWith( "---\ntitle: Frontmatter\n---\n\n\n\n", $patched ); + $this->assertStringContainsString( "Edited paragraph.\n", $patched ); + } + + public function test_crlf_frontmatter_is_preserved_when_body_changes() { + $markdown = "---\r\ntitle: CRLF Frontmatter\r\n---\r\n\r\nParagraph to edit.\r\n"; + $document = MarkdownSourceDocument::from_markdown( $markdown ); + $edited_blocks = str_replace( 'Paragraph to edit.', 'Edited paragraph.', $document->get_block_markup() ); + $patched = $document->patch_markdown( $edited_blocks ); + + $this->assertStringStartsWith( "---\r\ntitle: CRLF Frontmatter\r\n---\r\n\r\n", $patched ); + $this->assertStringContainsString( "Edited paragraph.\r\n", $patched ); + } + + public function test_setext_heading_is_preserved_when_following_block_changes() { + $markdown = <<get_block_markup() ); + $patched = $document->patch_markdown( $edited_blocks ); + + $this->assertStringStartsWith( "Heading with _style_\n====================\n\n", $patched ); + $this->assertStringNotContainsString( '# Heading with', $patched ); + $this->assertStringContainsString( "Edited paragraph.\n", $patched ); + } + + public function test_reference_style_links_and_definitions_are_preserved_when_neighbor_changes() { + $markdown = <<get_block_markup() ); + $patched = $document->patch_markdown( $edited_blocks ); + + 
$this->assertStringContainsString( "Paragraph with [a reference][docs] and __bold__.\n\n", $patched ); + $this->assertStringContainsString( "[docs]: https://developer.wordpress.org \"Developer docs\"\n\n", $patched ); + $this->assertStringContainsString( "Edited paragraph.\n", $patched ); + $this->assertStringNotContainsString( '[a reference](https://developer.wordpress.org', $patched ); + } + + public function test_unchanged_code_fence_is_preserved_when_later_block_changes() { + $markdown = <<get_block_markup() ); + $patched = $document->patch_markdown( $edited_blocks ); + + $this->assertStringContainsString( "````php\necho `code`;\n```\n````\n\n", $patched ); + $this->assertStringContainsString( "Edited paragraph.\n", $patched ); + } + + public function test_indented_code_block_is_preserved_when_later_block_changes() { + $markdown = <<get_block_markup() ); + $patched = $document->patch_markdown( $edited_blocks ); + + $this->assertStringContainsString( " const keep = \"__syntax__\";\n console.log(keep);\n\n", $patched ); + $this->assertStringContainsString( "Edited paragraph.\n", $patched ); + } + + public function test_nested_blockquote_source_is_preserved_when_neighbor_changes() { + $markdown = << Quote with __bold__. +> +> - first +> - second +> +> Final quote line. + +Paragraph to edit. + +MD; + $document = MarkdownSourceDocument::from_markdown( $markdown ); + $edited_blocks = str_replace( 'Paragraph to edit.', 'Edited paragraph.', $document->get_block_markup() ); + $patched = $document->patch_markdown( $edited_blocks ); + + $this->assertStringStartsWith( "> Quote with __bold__.\n>\n> - first\n> - second\n>\n> Final quote line.\n\n", $patched ); + $this->assertStringContainsString( "Edited paragraph.\n", $patched ); + } + + public function test_changed_list_rewrites_only_the_list_unit() { + $markdown = <<Second item', '
  • Second item changed
  • ', $document->get_block_markup() ); + $patched = $document->patch_markdown( $edited_blocks ); + + $this->assertStringContainsString( "Before __bold__.\n\n", $patched ); + $this->assertStringContainsString( "- First item\n- Second item changed\n\n", $patched ); + $this->assertStringContainsString( "After _emphasis_.\n", $patched ); + } + + public function test_unchanged_ordered_list_start_and_marker_spacing_are_preserved() { + $markdown = <<get_block_markup() ); + $patched = $document->patch_markdown( $edited_blocks ); + + $this->assertStringContainsString( "7. First item\n8. Second item\n9. Third item\n\n", $patched ); + $this->assertStringNotContainsString( "1. First item\n2. Second item", $patched ); + $this->assertStringContainsString( "Edited paragraph.\n", $patched ); + } + + public function test_changed_table_rewrites_only_the_table_unit() { + $markdown = <<ok', 'done', $document->get_block_markup() ); + $patched = $document->patch_markdown( $edited_blocks ); + + $this->assertStringContainsString( "Before __bold__.\n\n", $patched ); + $this->assertStringContainsString( '| Feature | State |', $patched ); + $this->assertStringContainsString( '| One | done |', $patched ); + $this->assertStringContainsString( "After _emphasis_.\n", $patched ); + } + + public function test_raw_html_block_is_preserved_when_neighbor_changes() { + $markdown = << + Keep raw HTML formatting. + + +Paragraph to edit. + +MD; + $document = MarkdownSourceDocument::from_markdown( $markdown ); + $edited_blocks = str_replace( 'Paragraph to edit.', 'Edited paragraph.', $document->get_block_markup() ); + $patched = $document->patch_markdown( $edited_blocks ); + + $this->assertStringStartsWith( "
    \n\tKeep raw HTML formatting.\n
    \n\n", $patched ); + $this->assertStringContainsString( "Edited paragraph.\n", $patched ); + } + + public function test_thematic_break_marker_is_preserved_when_neighbor_changes() { + $markdown = <<get_block_markup() ); + $patched = $document->patch_markdown( $edited_blocks ); + + $this->assertStringContainsString( "Before __bold__.\n\n___\n\n", $patched ); + $this->assertStringNotContainsString( "\n---\n", $patched ); + $this->assertStringContainsString( "Edited paragraph.\n", $patched ); + } + + public function test_table_alignment_and_padding_are_preserved_when_neighbor_changes() { + $markdown = <<get_block_markup() ); + $patched = $document->patch_markdown( $edited_blocks ); + + $this->assertStringContainsString( "| Feature | State |\n| :------ | ----: |\n| One | ok |\n\n", $patched ); + $this->assertStringContainsString( "Edited paragraph.\n", $patched ); + } + + public function test_repeated_blocks_preserve_the_unedited_repetitions_around_a_changed_block() { + $markdown = <<replace_first( + '

    Repeat me.

    ', + '

    Repeat me with a change.

    ', + $document->get_block_markup() + ); + $patched = $document->patch_markdown( $edited_blocks ); + + $this->assertStringContainsString( "Alpha __one__.\n\n", $patched ); + $this->assertStringContainsString( "Repeat *me* with a change.\n\nRepeat _me_.\n\n", $patched ); + $this->assertStringContainsString( "Omega __two__.\n", $patched ); + } + + /** + * @dataProvider provider_medium_neighbor_preservation_cases + */ + public function test_generated_medium_neighbor_cases_preserve_surrounding_source( $case_name, $before, $after ) { + $markdown = $before . "Paragraph to edit.\n\n" . $after; + $document = MarkdownSourceDocument::from_markdown( $markdown ); + $edited_blocks = str_replace( 'Paragraph to edit.', 'Edited paragraph.', $document->get_block_markup() ); + $patched = $document->patch_markdown( $edited_blocks ); + + $this->assertStringStartsWith( $before, $patched, $case_name ); + $this->assertStringContainsString( "Edited paragraph.\n\n", $patched, $case_name ); + $this->assertStringEndsWith( $after, $patched, $case_name ); + $this->assertSame( 1, substr_count( $patched, 'Edited paragraph.' ), $case_name ); + } + + public static function provider_medium_neighbor_preservation_cases() { + $snippets = self::source_snippets(); + $snippet_names = array_keys( $snippets ); + $snippet_count = count( $snippet_names ); + $cases = array(); + + for ( $index = 0; $index < $snippet_count; $index++ ) { + $before_name = $snippet_names[ $index ]; + $after_name = $snippet_names[ ( $index + 7 ) % $snippet_count ]; + $case_name = $before_name . ' before / ' . $after_name . ' after'; + $cases[ $case_name ] = array( $case_name, $snippets[ $before_name ], $snippets[ $after_name ] ); + + $before_name = $snippet_names[ ( $index + 11 ) % $snippet_count ]; + $after_name = $snippet_names[ $index ]; + $case_name = $before_name . ' before / ' . $after_name . 
' after'; + $cases[ $case_name ] = array( $case_name, $snippets[ $before_name ], $snippets[ $after_name ] ); + } + + return $cases; + } + + /** + * @dataProvider provider_large_document_cases + */ + public function test_generated_large_document_cases_preserve_every_unedited_slice( $case_name, $before_parts, $after_parts ) { + $before = implode( '', $before_parts ); + $after = implode( '', $after_parts ); + $markdown = $before . "Paragraph to edit.\n\n" . $after; + $document = MarkdownSourceDocument::from_markdown( $markdown ); + $edited_blocks = str_replace( 'Paragraph to edit.', 'Edited paragraph in a large document.', $document->get_block_markup() ); + $patched = $document->patch_markdown( $edited_blocks ); + + $this->assertStringStartsWith( $before, $patched, $case_name ); + $this->assertStringContainsString( "Edited paragraph in a large document.\n\n", $patched, $case_name ); + $this->assertStringEndsWith( $after, $patched, $case_name ); + $this->assertSame( 1, substr_count( $patched, 'Edited paragraph in a large document.' 
), $case_name ); + + foreach ( array_merge( $before_parts, $after_parts ) as $source_part ) { + $this->assertStringContainsString( $source_part, $patched, $case_name ); + } + } + + /** + * @dataProvider provider_unmapped_unchanged_documents + */ + public function test_generated_unmapped_documents_preserve_original_when_unchanged( $case_name, $markdown ) { + $document = MarkdownSourceDocument::from_markdown( $markdown ); + + $this->assertSame( $markdown, $document->patch_markdown( $document->get_block_markup() ), $case_name ); + } + + public static function provider_unmapped_unchanged_documents() { + $unsupported_snippets = array( + 'task list before' => "- [x] Checked item\n- [ ] Open item\n\nParagraph to edit.\n", + 'task list after' => "Paragraph to edit.\n\n- [x] Checked item\n- [ ] Open item\n", + 'duplicate reference definitions' => "A [link][same].\n\n[same]: https://example.com/a\n\nParagraph to edit.\n\nAnother [link][same].\n\n[same]: https://example.com/b\n", + 'html followed by markdown without blank' => "
    \nRaw HTML\n
    \nParagraph to edit.\n", + 'unclosed html block' => "
    \n\nParagraph to edit.\n", + 'mixed task list document' => "Before __bold__.\n\n- [x] Checked item\n- [ ] Open item\n\nParagraph to edit.\n\nAfter _emphasis_.\n", + 'task list nested in quote' => "> - [x] Quoted checked item\n> - [ ] Quoted open item\n\nParagraph to edit.\n", + 'raw markdown inside html' => "
    \n# Not a Markdown heading here\n
    \n\nParagraph to edit.\n", + ); + $cases = array(); + + foreach ( $unsupported_snippets as $case_name => $markdown ) { + $cases[ $case_name ] = array( $case_name, $markdown ); + } + + return $cases; + } + + public static function provider_large_document_cases() { + $snippets = array_values( self::source_snippets() ); + $cases = array(); + $snippet_count = count( $snippets ); + + for ( $case_index = 0; $case_index < 10; $case_index++ ) { + $before_parts = array(); + $after_parts = array(); + for ( $offset = 0; $offset < 8; $offset++ ) { + $before_parts[] = $snippets[ ( $case_index + $offset * 3 ) % $snippet_count ]; + $after_parts[] = $snippets[ ( $case_index * 2 + $offset * 5 + 1 ) % $snippet_count ]; + } + + $case_name = 'large mixed document ' . $case_index; + $cases[ $case_name ] = array( $case_name, $before_parts, $after_parts ); + } + + return $cases; + } + + public function test_reordered_blocks_preserve_any_unchanged_source_units_the_matcher_can_keep() { + $markdown = <<get_block_markup() ), + function ( $block ) { + return isset( $block['blockName'] ) && null !== $block['blockName']; + } + ) + ); + $edited_blocks = serialize_block( $blocks[1] ) . "\n\n" . serialize_block( $blocks[0] ); + $patched = $document->patch_markdown( $edited_blocks ); + + $this->assertStringContainsString( 'Second _emphasis_.', $patched ); + $this->assertStringContainsString( 'First **bold**.', $patched ); + } + + public function test_unmapped_documents_still_preserve_original_source_when_blocks_are_unchanged() { + $markdown = << + +Nested Markdown paragraph. + +
    + +Paragraph after unsupported markup. + +MD; + $document = MarkdownSourceDocument::from_markdown( $markdown ); + + $this->assertSame( $markdown, $document->patch_markdown( $document->get_block_markup() ) ); + } + + private function replace_first( $search, $replace, $subject ) { + $position = strpos( $subject, $search ); + $this->assertNotFalse( $position ); + + return substr( $subject, 0, $position ) . $replace . substr( $subject, $position + strlen( $search ) ); + } + + private static function source_snippets() { + return array( + 'paragraph inline emphasis variants' => "Paragraph with __bold__, _emphasis_, and `inline code`.\n\n", + 'paragraph escaped punctuation' => "Escaped \\*literal asterisks\\* and \\[brackets\\].\n\n", + 'paragraph hard break' => "Line with a hard break \nthen the next line.\n\n", + 'paragraph raw inline html' => "Inline HTML with __markdown__.\n\n", + 'reference link paragraph' => "A [reference link][docs] with __bold__ text.\n\n[docs]: https://developer.wordpress.org \"Docs\"\n\n", + 'reference image paragraph' => "A reference image ![logo][logo] stays indirect.\n\n[logo]: https://example.com/logo.png \"Logo\"\n\n", + 'atx h1 closing marker' => "# Heading one #\n\n", + 'atx h4 no closing marker' => "#### Heading four with _style_\n\n", + 'setext h1 heading' => "Setext primary heading\n======================\n\n", + 'setext h2 heading' => "Setext secondary heading\n------------------------\n\n", + 'fenced code backticks' => "````php\necho `nested`;\n```\n````\n\n", + 'fenced code tildes' => "~~~js\nconst value = \"__keep__\";\n~~~\n\n", + 'indented code block' => " const keep = \"_syntax_\";\n console.log(keep);\n\n", + 'unordered star list' => "* First star item\n* Second star item\n\n", + 'unordered plus list' => "+ First plus item\n+ Second plus item\n\n", + 'ordered list offset' => "7. First ordered item\n8. 
Second ordered item\n\n", + 'nested unordered list' => "- Parent item\n - Child item A\n - Child item B\n- Sibling item\n\n", + 'blockquote paragraph' => "> Quote with __bold__ and _emphasis_.\n\n", + 'blockquote nested list' => "> Quote intro.\n>\n> - First\n> - Second\n\n", + 'blockquote nested quote' => "> Outer quote\n>\n> > Inner quote with `code`\n\n", + 'table aligned columns' => "| Feature | State |\n| :------ | ----: |\n| One | ok |\n\n", + 'table escaped pipes' => "| Name | Value |\n| ---- | ----- |\n| Pipe | a \\| b |\n\n", + 'thematic break underscores' => "___\n\n", + 'thematic break stars' => "***\n\n", + 'html comment block' => "\n\n", + 'raw html section' => "
    \n\tKeep raw HTML formatting.\n
    \n\n", + 'raw html table' => "\n\n
    Raw table
    \n\n", + ); + } +} diff --git a/components/Markdown/class-markdownsourcedocument.php b/components/Markdown/class-markdownsourcedocument.php new file mode 100644 index 000000000..c7ec9c600 --- /dev/null +++ b/components/Markdown/class-markdownsourcedocument.php @@ -0,0 +1,571 @@ +markdown = $markdown; + $this->block_markup = $block_markup; + $this->metadata = $metadata; + $this->prefix = $prefix; + $this->units = $units; + } + + /** + * Creates a source-aware document from Markdown source. + * + * The source is parsed twice: once by MarkdownConsumer to obtain WordPress + * block markup, and once by CommonMark to obtain top-level block source + * positions. When both views contain the same number of top-level blocks, + * each block becomes a MarkdownSourceUnit. Otherwise the full document is + * kept as one conservative fallback unit. + * + * @param string $markdown The Markdown source to parse. + * @return self Source-aware document containing block markup and source units. + */ + public static function from_markdown( $markdown ) { + $markdown = (string) $markdown; + $consumer = new MarkdownConsumer( $markdown ); + $blocks_with_metadata = $consumer->consume(); + $block_markup = $blocks_with_metadata->get_block_markup(); + $blocks = self::named_blocks( parse_blocks( $block_markup ) ); + $source_blocks = self::source_blocks( $markdown ); + $line_offsets = self::line_offsets( $markdown ); + $source_line_offset = self::frontmatter_line_offset( $markdown ); + $units = array(); + + // Some Markdown constructs do not map one-to-one to named WordPress + // blocks. Preserve the whole source for unchanged saves in those cases. 
+ if ( count( $source_blocks ) !== count( $blocks ) ) { + return new self( + $markdown, + $block_markup, + $blocks_with_metadata->get_all_metadata(), + '', + array( + new MarkdownSourceUnit( + substr( $markdown, 0 ), + 0, + strlen( $markdown ), + $block_markup, + self::semantic_hash_for_markup( $block_markup ) + ), + ) + ); + } + + $prefix_end = count( $source_blocks ) > 0 ? $line_offsets[ $source_line_offset + $source_blocks[0]->getStartLine() - 1 ] : strlen( $markdown ); + $prefix = substr( $markdown, 0, $prefix_end ); + $count = count( $source_blocks ); + + for ( $index = 0; $index < $count; $index++ ) { + $source_block = $source_blocks[ $index ]; + $start = $line_offsets[ $source_line_offset + $source_block->getStartLine() - 1 ]; + $end = $index + 1 < $count + ? $line_offsets[ $source_line_offset + $source_blocks[ $index + 1 ]->getStartLine() - 1 ] + : strlen( $markdown ); + $block_markup_for_unit = serialize_block( $blocks[ $index ] ); + $units[] = new MarkdownSourceUnit( + substr( $markdown, $start, $end - $start ), + $start, + $end, + $block_markup_for_unit, + self::semantic_hash_for_block( $blocks[ $index ] ) + ); + } + + return new self( + $markdown, + $block_markup, + $blocks_with_metadata->get_all_metadata(), + $prefix, + $units + ); + } + + /** + * Returns the WordPress block markup generated from the original Markdown. + * + * @return string Generated block markup. + */ + public function get_block_markup() { + return $this->block_markup; + } + + /** + * Returns metadata extracted from the Markdown document. + * + * @return array Metadata keyed by field name. + */ + public function get_all_metadata() { + return $this->metadata; + } + + /** + * Returns the source units mapped to top-level WordPress blocks. + * + * @return MarkdownSourceUnit[] Source units in document order. + */ + public function get_source_units() { + return $this->units; + } + + /** + * Applies edited block markup to the original Markdown source. 
+	 *
+	 * Unchanged blocks are matched by semantic hash and copied from the original
+	 * Markdown source. Changed and inserted blocks are serialized with
+	 * MarkdownProducer. For changed blocks, surrounding line-oriented trivia is
+	 * reused from the replaced source unit so CRLF separators, blank lines, and
+	 * missing final newlines are not normalized.
+	 *
+	 * When the document holds exactly one source unit and that unit's hash equals
+	 * the hash of the whole edited markup, the original Markdown is returned
+	 * untouched.
+	 *
+	 * @param string $edited_block_markup The edited WordPress block markup.
+	 * @return string Patched Markdown source.
+	 */
+	public function patch_markdown( $edited_block_markup ) {
+		// Normalize once so the fast path and the diff below hash the same string.
+		$edited_block_markup = (string) $edited_block_markup;
+
+		// Fast path: a single unit whose hash matches the whole edited document.
+		if ( 1 === count( $this->units ) && $this->units[0]->get_semantic_hash() === self::semantic_hash_for_markup( $edited_block_markup ) ) {
+			return $this->markdown;
+		}
+
+		$edited_blocks   = self::named_blocks( parse_blocks( $edited_block_markup ) );
+		$original_hashes = array_map(
+			function ( MarkdownSourceUnit $unit ) {
+				return $unit->get_semantic_hash();
+			},
+			$this->units
+		);
+		$edited_hashes   = array_map( array( __CLASS__, 'semantic_hash_for_block' ), $edited_blocks );
+
+		// Each match carries an 'original' unit index and an 'edited' block index.
+		$matches        = self::longest_common_subsequence( $original_hashes, $edited_hashes );
+		$markdown       = $this->prefix;
+		$original_index = 0;
+		$edited_index   = 0;
+
+		foreach ( $matches as $match ) {
+			// Serialize whatever was changed or inserted before this unchanged block.
+			$markdown .= $this->markdown_for_changed_blocks(
+				$edited_blocks,
+				$edited_index,
+				$match['edited'],
+				$original_index,
+				$match['original']
+			);
+
+			// Then copy the unchanged block's original source verbatim.
+			$markdown      .= $this->units[ $match['original'] ]->get_source();
+			$original_index = $match['original'] + 1;
+			$edited_index   = $match['edited'] + 1;
+		}
+
+		// Trailing gap: edited blocks that follow the last unchanged match.
+		$markdown .= $this->markdown_for_changed_blocks(
+			$edited_blocks,
+			$edited_index,
+			count( $edited_blocks ),
+			$original_index,
+			count( $this->units )
+		);
+
+		return $markdown;
+	}
+
+	/**
+	 * Returns the original Markdown source.
+	 *
+	 * @return string The Markdown string this document was created with.
+	 */
+	public function get_original_markdown() {
+		return $this->markdown;
+	}
+
+	/**
+	 * Serializes edited blocks that appear between two unchanged matches.
+	 *
+	 * The original range may be shorter than the edited range when blocks were
+	 * inserted. Only replacements can borrow source trivia from original units;
+	 * pure insertions use MarkdownProducer's normal block separators.
+	 *
+	 * Edited blocks are paired with original units by position inside the gap:
+	 * the first edited block borrows trivia from the first original unit, and so
+	 * on, until the original range is exhausted.
+	 *
+	 * @param array[] $edited_blocks Edited block objects from parse_blocks().
+	 * @param int $edited_start First edited block index to serialize.
+	 * @param int $edited_end One past the last edited block index.
+	 * @param int $original_start First original source unit index in the gap.
+	 * @param int $original_end One past the last original source unit index.
+	 * @return string Markdown for changed or inserted blocks.
+	 */
+	private function markdown_for_changed_blocks( array $edited_blocks, $edited_start, $edited_end, $original_start, $original_end ) {
+		$markdown           = '';
+		$original_available = $original_end - $original_start;
+
+		for ( $edited_index = $edited_start; $edited_index < $edited_end; $edited_index++ ) {
+			$relative_index = $edited_index - $edited_start;
+			$original_index = $original_start + $relative_index;
+			$block_markdown = self::markdown_for_block( $edited_blocks[ $edited_index ] );
+
+			// Replacement: keep the replaced unit's leading and trailing trivia.
+			if ( $relative_index < $original_available && isset( $this->units[ $original_index ] ) ) {
+				$unit      = $this->units[ $original_index ];
+				$markdown .= $unit->get_leading_trivia();
+				$markdown .= self::with_trailing_trivia( $block_markdown, $unit->get_trailing_trivia() );
+				continue;
+			}
+
+			// Pure insertion: no original unit to borrow trivia from.
+			$markdown .= $block_markdown;
+		}
+
+		return $markdown;
+	}
+
+	/**
+	 * Replaces MarkdownProducer's trailing line endings with source trivia.
+	 *
+	 * @param string $markdown Serialized Markdown for a changed block.
+	 * @param string $trailing_trivia Original trailing trivia to preserve.
+	 * @return string Serialized Markdown with original trailing trivia.
+	 */
+	private static function with_trailing_trivia( $markdown, $trailing_trivia ) {
+		return self::trim_trailing_line_endings( $markdown ) . $trailing_trivia;
+	}
+
+	/**
+	 * Removes trailing CR and LF bytes from a Markdown fragment.
+	 *
+	 * Only "\r" and "\n" are stripped; trailing spaces and tabs are kept.
+	 *
+	 * @param string $text Markdown text.
+	 * @return string Markdown text without trailing line endings.
+	 */
+	private static function trim_trailing_line_endings( $text ) {
+		return rtrim( $text, "\r\n" );
+	}
+
+	/**
+	 * Serializes a single WordPress block to Markdown.
+	 *
+	 * The block is re-serialized to block markup and passed to a fresh
+	 * MarkdownProducer with an empty metadata array.
+	 *
+	 * @param array $block Parsed block object.
+	 * @return string Markdown representation of the block.
+	 */
+	private static function markdown_for_block( array $block ) {
+		$producer = new MarkdownProducer(
+			new BlocksWithMetadata(
+				serialize_block( $block ),
+				array()
+			)
+		);
+		return $producer->produce();
+	}
+
+	/**
+	 * Returns the top-level CommonMark source blocks for a Markdown document.
+	 *
+	 * A new CommonMark environment is built on every call with the core, GitHub
+	 * Flavored Markdown, and front matter extensions registered.
+	 *
+	 * @param string $markdown Markdown source.
+	 * @return AbstractBlock[] Top-level CommonMark blocks.
+	 */
+	private static function source_blocks( $markdown ) {
+		$environment = new Environment( array() );
+		$environment->addExtension( new CommonMarkCoreExtension() );
+		$environment->addExtension( new GithubFlavoredMarkdownExtension() );
+		$environment->addExtension(
+			new \Webuni\FrontMatter\Markdown\FrontMatterLeagueCommonMarkExtension(
+				new \Webuni\FrontMatter\FrontMatter()
+			)
+		);
+		$parser   = new MarkdownParser( $environment );
+		$document = $parser->parse( (string) $markdown );
+		$blocks   = array();
+
+		// Keep only direct children of the document that are block nodes.
+		foreach ( $document->children() as $child ) {
+			if ( $child instanceof AbstractBlock ) {
+				$blocks[] = $child;
+			}
+		}
+
+		return $blocks;
+	}
+
+	/**
+	 * Returns only named WordPress blocks from a parsed block list.
+	 *
+	 * @param array[] $blocks Parsed block objects.
+	 * @return array[] Named WordPress block objects.
+	 */
+	private static function named_blocks( array $blocks ) {
+		$named = array();
+		foreach ( $blocks as $block ) {
+			// isset() is false for both a missing and a null blockName.
+			if ( isset( $block['blockName'] ) ) {
+				$named[] = $block;
+			}
+		}
+		return $named;
+	}
+
+	/**
+	 * Returns byte offsets for the start of each source line.
+	 *
+	 * LF, CRLF and lone CR all end a line, matching lines_with_endings(), so
+	 * line numbers and byte offsets stay in step for every line-ending style.
+	 * A CRLF pair counts as a single line ending.
+	 *
+	 * @param string $text Source text.
+	 * @return int[] Byte offsets, starting with 0.
+	 */
+	private static function line_offsets( $text ) {
+		$offsets = array( 0 );
+		$length  = strlen( $text );
+		for ( $i = 0; $i < $length; $i++ ) {
+			$byte = $text[ $i ];
+			if ( "\n" === $byte ) {
+				$offsets[] = $i + 1;
+				continue;
+			}
+			// A CR followed by LF is handled when the LF is reached.
+			if ( "\r" === $byte && ( $i + 1 >= $length || "\n" !== $text[ $i + 1 ] ) ) {
+				$offsets[] = $i + 1;
+			}
+		}
+		return $offsets;
+	}
+
+	/**
+	 * Returns the number of frontmatter lines before Markdown body content.
+	 *
+	 * CommonMark source positions are relative to the Markdown body when the
+	 * frontmatter extension consumes metadata. This offset maps those line
+	 * numbers back to byte offsets in the original source.
+	 *
+	 * The first line must be exactly '---' or '+++' and a later line must
+	 * repeat it; the count includes both delimiter lines.
+	 *
+	 * NOTE(review): '+++' is accepted here, but source_blocks() builds the
+	 * front matter extension with its default configuration. Confirm that it
+	 * also consumes '+++' blocks; otherwise this offset would be applied to a
+	 * document whose source positions were never shifted.
+	 *
+	 * @param string $markdown Markdown source.
+	 * @return int Number of leading frontmatter lines, or 0 when there are none.
+	 */
+	private static function frontmatter_line_offset( $markdown ) {
+		$lines      = self::lines_with_endings( $markdown );
+		$line_count = count( $lines );
+		if ( 0 === $line_count ) {
+			return 0;
+		}
+
+		$delimiter = self::trim_line_ending( $lines[0] );
+		if ( '---' !== $delimiter && '+++' !== $delimiter ) {
+			return 0;
+		}
+
+		for ( $index = 1; $index < $line_count; $index++ ) {
+			if ( self::trim_line_ending( $lines[ $index ] ) === $delimiter ) {
+				return $index + 1;
+			}
+		}
+
+		// An opening delimiter without a closing one is not frontmatter.
+		return 0;
+	}
+
+	/**
+	 * Splits text into lines while retaining each line ending.
+	 *
+	 * @param string $text Source text.
+	 * @return string[] Lines, each including its original line ending.
+	 */
+	private static function lines_with_endings( $text ) {
+		$lines  = array();
+		$offset = 0;
+		$length = strlen( $text );
+
+		while ( $offset < $length ) {
+			// Jump to the next CR or LF, or to the end of the text.
+			$end = $offset + strcspn( $text, "\r\n", $offset );
+			if ( $end < $length ) {
+				// CRLF is consumed as a single two-byte line ending.
+				$is_crlf = "\r" === $text[ $end ] && $end + 1 < $length && "\n" === $text[ $end + 1 ];
+				$end    += $is_crlf ? 2 : 1;
+			}
+			$lines[] = substr( $text, $offset, $end - $offset );
+			$offset  = $end;
+		}
+
+		return $lines;
+	}
+
+	/**
+	 * Removes one line's trailing CR and LF bytes.
+	 *
+	 * @param string $line Source line.
+	 * @return string Line without its trailing line ending.
+	 */
+	private static function trim_line_ending( $line ) {
+		return rtrim( (string) $line, "\r\n" );
+	}
+
+	/**
+	 * Returns a semantic hash for block markup.
+	 *
+	 * The markup is parsed with parse_blocks(), reduced to named blocks and
+	 * canonicalized before hashing.
+	 *
+	 * @param string $block_markup WordPress block markup.
+	 * @return string Hash of the canonical block structure.
+	 */
+	private static function semantic_hash_for_markup( $block_markup ) {
+		$blocks = self::named_blocks( parse_blocks( $block_markup ) );
+		return self::hash_canonical_structure( self::canonical_blocks( $blocks ) );
+	}
+
+	/**
+	 * Returns a semantic hash for one block.
+	 *
+	 * @param array $block Parsed block object.
+	 * @return string Hash of the canonical block structure.
+	 */
+	private static function semantic_hash_for_block( array $block ) {
+		return self::hash_canonical_structure( self::canonical_block( $block ) );
+	}
+
+	/**
+	 * Hashes a canonical block structure with SHA-256.
+	 *
+	 * json_encode() returns false when it cannot encode the structure, for
+	 * example on malformed UTF-8. Hashing that failure value would give every
+	 * unencodable block the same hash, so an edited block could be mistaken
+	 * for an unchanged one. serialize() is used as a fallback to keep the hash
+	 * content-dependent. Hashes of encodable structures are unaffected.
+	 *
+	 * @param array $canonical Canonical block structure or list of structures.
+	 * @return string SHA-256 hash of the structure.
+	 */
+	private static function hash_canonical_structure( array $canonical ) {
+		$encoded = json_encode( $canonical );
+		if ( false === $encoded ) {
+			$encoded = serialize( $canonical );
+		}
+
+		return hash( 'sha256', $encoded );
+	}
+
+	/**
+	 * Returns canonical representations for a list of blocks.
+	 *
+	 * @param array[] $blocks Parsed block objects.
+	 * @return array[] Canonical block structures.
+	 */
+	private static function canonical_blocks( array $blocks ) {
+		// array_values() drops the input keys so the result is always a list.
+		return array_values( array_map( array( __CLASS__, 'canonical_block' ), $blocks ) );
+	}
+
+	/**
+	 * Builds the canonical form of one block for semantic comparison.
+	 *
+	 * Only the block name, attributes, inner HTML and inner blocks take part.
+	 * Attribute keys are sorted so that two blocks differing only in attribute
+	 * order produce the same structure. Inner blocks are canonicalized
+	 * recursively.
+	 *
+	 * @param array $block Parsed block object.
+	 * @return array Canonical block structure.
+	 */
+	private static function canonical_block( array $block ) {
+		$attrs = array();
+		if ( isset( $block['attrs'] ) && is_array( $block['attrs'] ) ) {
+			$attrs = $block['attrs'];
+		}
+		self::sort_recursive( $attrs );
+
+		$inner_blocks = array();
+		if ( isset( $block['innerBlocks'] ) && is_array( $block['innerBlocks'] ) ) {
+			$inner_blocks = self::canonical_blocks( $block['innerBlocks'] );
+		}
+
+		return array(
+			'blockName'   => isset( $block['blockName'] ) ? $block['blockName'] : null,
+			'attrs'       => $attrs,
+			'innerHTML'   => isset( $block['innerHTML'] ) ? $block['innerHTML'] : '',
+			'innerBlocks' => $inner_blocks,
+		);
+	}
+
+	/**
+	 * Recursively key-sorts associative arrays in place; lists keep their order.
+	 *
+	 * Non-array values are left untouched.
+	 *
+	 * @param mixed $value Value to normalize (modified by reference).
+	 */
+	private static function sort_recursive( &$value ) {
+		if ( ! is_array( $value ) ) {
+			return;
+		}
+
+		// Normalize the children first, addressing them by key so the
+		// changes land in the original array.
+		foreach ( array_keys( $value ) as $key ) {
+			self::sort_recursive( $value[ $key ] );
+		}
+
+		if ( self::is_associative_array( $value ) ) {
+			ksort( $value );
+		}
+	}
+
+	/**
+	 * Tells lists apart from associative arrays.
+	 *
+	 * An array is a list when its keys are exactly 0, 1, 2, ... in that order.
+	 * The empty array counts as a list.
+	 *
+	 * @param array $value Array to inspect.
+	 * @return bool True for associative arrays, false for lists.
+	 */
+	private static function is_associative_array( array $value ) {
+		$keys = array_keys( $value );
+		if ( array() === $keys ) {
+			return false;
+		}
+
+		return range( 0, count( $keys ) - 1 ) !== $keys;
+	}
+
+	/**
+	 * Finds matching unchanged blocks between original and edited sequences.
+	 *
+	 * The result is used to splice original source around changed gaps. LCS is
+	 * intentionally used instead of a greedy scan so repeated identical blocks
+	 * still leave the longest possible set of source units untouched.
+	 *
+	 * Each match is an array with the keys 'original' and 'edited'. Matches are
+	 * returned in ascending order of both indexes. Time and memory grow with
+	 * count( $left ) * count( $right ) because a full length table is built.
+	 *
+	 * @param string[] $left  Original semantic hashes.
+	 * @param string[] $right Edited semantic hashes.
+	 * @return array[] Matches with original and edited indexes.
+	 */
+	private static function longest_common_subsequence( array $left, array $right ) {
+		$left_count  = count( $left );
+		$right_count = count( $right );
+		// $lengths[ $i ][ $j ] is the LCS length of $left[ $i.. ] and $right[ $j.. ].
+		// The extra row and column stay 0 and act as the base case.
+		$lengths     = array_fill( 0, $left_count + 1, array_fill( 0, $right_count + 1, 0 ) );
+
+		// Fill the table from the end of both sequences towards the start.
+		for ( $i = $left_count - 1; $i >= 0; $i-- ) {
+			for ( $j = $right_count - 1; $j >= 0; $j-- ) {
+				if ( $left[ $i ] === $right[ $j ] ) {
+					$lengths[ $i ][ $j ] = $lengths[ $i + 1 ][ $j + 1 ] + 1;
+				} else {
+					$lengths[ $i ][ $j ] = max( $lengths[ $i + 1 ][ $j ], $lengths[ $i ][ $j + 1 ] );
+				}
+			}
+		}
+
+		// Walk forward through the table and collect the matched index pairs.
+		$matches = array();
+		$i       = 0;
+		$j       = 0;
+		while ( $i < $left_count && $j < $right_count ) {
+			if ( $left[ $i ] === $right[ $j ] ) {
+				$matches[] = array(
+					'original' => $i,
+					'edited'   => $j,
+				);
+				$i++;
+				$j++;
+			} elseif ( $lengths[ $i + 1 ][ $j ] >= $lengths[ $i ][ $j + 1 ] ) {
+				// On a tie the original side advances first.
+				$i++;
+			} else {
+				$j++;
+			}
+		}
+
+		return $matches;
+	}
+}
diff --git a/components/Markdown/class-markdownsourceunit.php b/components/Markdown/class-markdownsourceunit.php
new file mode 100644
index 000000000..10f1daf28
--- /dev/null
+++ b/components/Markdown/class-markdownsourceunit.php
@@ -0,0 +1,203 @@
+source = (string) $source;
+		$this->start_offset  = (int) $start_offset;
+		$this->end_offset    = (int) $end_offset;
+		$this->block_markup  = (string) $block_markup;
+		$this->semantic_hash = (string) $semantic_hash;
+	}
+
+	/**
+	 * Returns the original Markdown source slice.
+	 *
+	 * @return string Original Markdown bytes for this unit.
+	 */
+	public function get_source() {
+		return $this->source;
+	}
+
+	/**
+	 * Returns whitespace before the first non-whitespace byte in this unit.
+	 *
+	 * This is reused when a source unit is replaced by changed block markup, so
+	 * indentation or blank-line trivia before the edited block is not lost.
+	 *
+	 * Whitespace is a fixed ASCII set: space, tab, LF, CR, vertical tab and
+	 * form feed. A locale-dependent check such as ctype_space() can classify
+	 * bytes above 0x7F as whitespace under some locales, which would cut a
+	 * multi-byte UTF-8 character in half. When the whole slice is whitespace,
+	 * the whole slice is returned.
+	 *
+	 * @return string Leading whitespace from the source slice.
+	 */
+	public function get_leading_trivia() {
+		$whitespace_length = strspn( $this->source, " \t\n\r\x0B\x0C" );
+
+		return (string) substr( $this->source, 0, $whitespace_length );
+	}
+
+	/**
+	 * Returns line-oriented trivia after the final content line in this unit.
+	 *
+	 * The returned trivia includes the final content line's line ending plus any
+	 * following blank lines. This allows changed blocks to preserve LF/CRLF
+	 * separators and the absence of a final newline. When every line is blank,
+	 * the whole slice is returned.
+	 *
+	 * @return string Trailing line ending and blank-line trivia.
+	 */
+	public function get_trailing_trivia() {
+		$trivia = '';
+
+		// Walk backwards, collecting blank lines until a content line is found.
+		foreach ( array_reverse( self::lines_with_endings( $this->source ) ) as $line ) {
+			if ( ! self::is_blank( self::trim_line_ending( $line ) ) ) {
+				return self::line_ending( $line ) . $trivia;
+			}
+			$trivia = $line . $trivia;
+		}
+
+		return $trivia;
+	}
+
+	/**
+	 * Returns the start byte offset of this unit in the original document.
+	 *
+	 * @return int Start byte offset.
+	 */
+	public function get_start_offset() {
+		return $this->start_offset;
+	}
+
+	/**
+	 * Returns the end byte offset of this unit in the original document.
+	 *
+	 * @return int End byte offset.
+	 */
+	public function get_end_offset() {
+		return $this->end_offset;
+	}
+
+	/**
+	 * Returns the WordPress block markup generated from this unit.
+	 *
+	 * @return string Block markup for this source unit.
+	 */
+	public function get_block_markup() {
+		return $this->block_markup;
+	}
+
+	/**
+	 * Returns the hash that identifies this unit's block content after edits.
+	 *
+	 * @return string Semantic block hash.
+	 */
+	public function get_semantic_hash() {
+		return $this->semantic_hash;
+	}
+
+	/**
+	 * Breaks text into lines, keeping the line ending attached to each line.
+	 *
+	 * LF, CRLF and lone CR are all recognized; CRLF stays together as one
+	 * ending. A final line without a line ending is returned as-is, and empty
+	 * text yields an empty list.
+	 *
+	 * @param string $text Source text.
+	 * @return string[] Lines, each including its original line ending.
+	 */
+	private static function lines_with_endings( $text ) {
+		$lines  = array();
+		$offset = 0;
+		$length = strlen( $text );
+
+		while ( $offset < $length ) {
+			// Jump to the next CR or LF, or to the end of the text.
+			$end = $offset + strcspn( $text, "\r\n", $offset );
+			if ( $end < $length ) {
+				$is_crlf = "\r" === $text[ $end ] && $end + 1 < $length && "\n" === $text[ $end + 1 ];
+				$end    += $is_crlf ? 2 : 1;
+			}
+			$lines[] = substr( $text, $offset, $end - $offset );
+			$offset  = $end;
+		}
+
+		return $lines;
+	}
+
+	/**
+	 * Strips every trailing CR and LF byte from a line.
+	 *
+	 * @param string $line Source line.
+	 * @return string Line without its trailing line ending.
+	 */
+	private static function trim_line_ending( $line ) {
+		return rtrim( $line, "\r\n" );
+	}
+
+	/**
+	 * Extracts the trailing CR/LF bytes of a source line.
+	 *
+	 * @param string $line Source line.
+	 * @return string Line ending, or an empty string when none exists.
+	 */
+	private static function line_ending( $line ) {
+		$content_length = strlen( self::trim_line_ending( $line ) );
+
+		return substr( $line, $content_length );
+	}
+
+	/**
+	 * Indicates whether text contains only whitespace bytes.
+	 *
+	 * @param string $text Text to inspect.
+	 * @return bool True when the text is blank, false otherwise.
+	 */
+	private static function is_blank( $text ) {
+		$length = strlen( $text );
+		for ( $i = 0; $i < $length; $i++ ) {
+			// A single non-whitespace byte is enough to rule the text out.
+			if ( ! ctype_space( $text[ $i ] ) ) {
+				return false;
+			}
+		}
+		// Also reached for the empty string, which therefore counts as blank.
+		return true;
+	}
+}
diff --git a/components/Markdown/vendor-patched/league/commonmark/src/Extension/CommonMark/Parser/Inline/BacktickParser.php b/components/Markdown/vendor-patched/league/commonmark/src/Extension/CommonMark/Parser/Inline/BacktickParser.php
index 5f8040fdd..38aa34573 100644
--- a/components/Markdown/vendor-patched/league/commonmark/src/Extension/CommonMark/Parser/Inline/BacktickParser.php
+++ b/components/Markdown/vendor-patched/league/commonmark/src/Extension/CommonMark/Parser/Inline/BacktickParser.php
@@ -32,7 +32,7 @@ final class BacktickParser implements InlineParserInterface
      */
     private const MAX_BACKTICKS = 1000;
 
-    /** @var \WeakReference|null */
+    /** @var \WeakReference|Cursor|null Last cursor seen; the cursor itself when WeakReference is unavailable */
     private $lastCursor;
     /**
      * @var bool
@@ -98,9 +98,9 @@ public function parse(InlineParserContext $inlineContext): bool
     private function findMatchingTicks(int $openTickLength, Cursor $cursor): bool
     {
         // Reset the seenBackticks cache if this is a new cursor
-        if ($this->lastCursor === null || $this->lastCursor->get() !== $cursor) {
+        if (! $this->isSameCursor($cursor)) {
             $this->seenBackticks = [];
-            $this->lastCursor = \WeakReference::create($cursor);
+            $this->lastCursor = $this->createCursorReference($cursor);
             $this->lastCursorScanned = false;
         }
 
@@ -132,4 +132,29 @@ private function findMatchingTicks(int $openTickLength, Cursor $cursor): bool
 
         return false;
     }
+
+    /**
+     * @return \WeakReference|Cursor Weak reference when the WeakReference class exists, otherwise the cursor itself
+     */
+    private function createCursorReference(Cursor $cursor)
+    {
+        if (\class_exists('WeakReference')) {
+            return \WeakReference::create($cursor);
+        }
+
+        return $cursor;
+    }
+
+    private function isSameCursor(Cursor $cursor): bool
+    {
+        if ($this->lastCursor === null) {
+            return false;
+        }
+
+        if ($this->lastCursor instanceof Cursor) {
+            return $this->lastCursor === $cursor;
+        }
+
+        return $this->lastCursor->get() === $cursor;
+    }
 }
diff --git a/components/Markdown/vendor-patched/league/commonmark/src/Util/LinkParserHelper.php b/components/Markdown/vendor-patched/league/commonmark/src/Util/LinkParserHelper.php
index 656942462..917a7c6b0 100644
--- a/components/Markdown/vendor-patched/league/commonmark/src/Util/LinkParserHelper.php
+++ b/components/Markdown/vendor-patched/league/commonmark/src/Util/LinkParserHelper.php
@@ -131,7 +131,7 @@ private static function manuallyParseLinkDestination(Cursor $cursor): ?string
         return $destination;
     }
 
-    /** @var \WeakReference|null */
+    /** @var \WeakReference|Cursor|null Last cursor seen; the cursor itself when WeakReference is unavailable */
     private static $lastCursor;
     /**
      * @var bool
@@ -144,12 +144,12 @@ private static function parseDestinationBraces(Cursor $cursor): ?string
         // that no closing brace exists, so we can skip the regex entirely. This helps avoid
         // certain pathological cases where the regex engine can take a very long time to
         // determine that no match exists.
-        if (self::$lastCursor !== null && self::$lastCursor->get() === $cursor) {
+        if (self::isSameCursor($cursor)) {
             if (self::$lastCursorLacksClosingBrace) {
                 return null;
             }
         } else {
-            self::$lastCursor = \WeakReference::create($cursor);
+            self::$lastCursor = self::createCursorReference($cursor);
         }
 
         if ($res = $cursor->match(RegexHelper::REGEX_LINK_DESTINATION_BRACES)) {
@@ -165,4 +165,29 @@ private static function parseDestinationBraces(Cursor $cursor): ?string
 
         return null;
     }
+
+    /**
+     * @return \WeakReference|Cursor Weak reference when the WeakReference class exists, otherwise the cursor itself
+     */
+    private static function createCursorReference(Cursor $cursor)
+    {
+        if (\class_exists('WeakReference')) {
+            return \WeakReference::create($cursor);
+        }
+
+        return $cursor;
+    }
+
+    private static function isSameCursor(Cursor $cursor): bool
+    {
+        if (self::$lastCursor === null) {
+            return false;
+        }
+
+        if (self::$lastCursor instanceof Cursor) {
+            return self::$lastCursor === $cursor;
+        }
+
+        return self::$lastCursor->get() === $cursor;
+    }
 }