From db098a6cd519d3592a54b00f824eb7be0eec5313 Mon Sep 17 00:00:00 2001 From: Peter Bagnall Date: Mon, 15 Jul 2013 15:06:23 +0100 Subject: [PATCH 1/4] Update finediff.php Fixed bug where "abc def" and "abc def ghi" when using the wordDelimiters would result in "c4d3i7:def ghi" rather than "c7i4: ghi". The issue was that whitespace was being treated as being part of a word, so the lack of trailing whitespace after "def" meant that it didn't match "def " and therefore wasn't recognised as being a valid match. The solution is to create separate word fragments and whitespace fragments. --- finediff.php | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/finediff.php b/finediff.php index 0fad956..452d224 100644 --- a/finediff.php +++ b/finediff.php @@ -655,6 +655,12 @@ private static function extractFragments($text, $delimiters) { $start = $end = 0; for (;;) { $end += self::mb_strcspn($text, $delimiters, $end); + if ( $end === $start ) { + break; + } + $fragments[$start] = mb_substr($text, $start, $end - $start); + $start = $end; + $end += self::mb_strspn($text, $delimiters, $end); if ( $end === $start ) { break; From 3845e4a88a307aa6fe27e6ac8ada2ce1e89c73b3 Mon Sep 17 00:00:00 2001 From: Peter Bagnall Date: Mon, 15 Jul 2013 15:10:39 +0100 Subject: [PATCH 2/4] Update finediff.php updated edit history --- finediff.php | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/finediff.php b/finediff.php index 452d224..ee63f6a 100644 --- a/finediff.php +++ b/finediff.php @@ -34,6 +34,10 @@ * * 10-Dec-2011 (Christoph Mewes): * - added UTF-8 support, fixed strange usage of htmlentities + * + * 15-Jul-2013 (Peter Bagnall): + * - fixed bug where getting the diff of "abc def" and "abc def ghi" would fail + * to recognise that def was a match, because whitespace was being included in fragments. 
*/ mb_internal_encoding('UTF-8'); From adedee7cb2133247129d14e6478dd9590cb5654f Mon Sep 17 00:00:00 2001 From: Peter Bagnall Date: Wed, 17 Jul 2013 16:40:33 +0100 Subject: [PATCH 3/4] Update finediff.php prevents whitespaces from being counted as matches which would otherwise prevent long runs of text being simply replaced and instead each word being individually replaced and the whitespace between words being copied. Also removed an unnecessary call to mb_strlen. --- finediff.php | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/finediff.php b/finediff.php index ee63f6a..0f34e62 100644 --- a/finediff.php +++ b/finediff.php @@ -524,12 +524,17 @@ private static function doFragmentDiff($from_text, $to_text, $delimiters) { $fragment_index_offset += $fragment_length; } if ( $fragment_index_offset > $best_copy_length ) { - $best_copy_length = $fragment_index_offset; - $best_from_start = $from_base_fragment_index; - $best_to_start = $to_base_fragment_index; + // if the matching string is just made up of delimiters then don't count it as a match. This prevents an + // excessive number of whitespaces being seen as matches and therefore breaking up a long replace segment + // to no useful purpose. + if ($fragment_index_offset > $from_base_fragment_length || self::mb_strspn($from_base_fragment, $delimiters, 0)===0) { + $best_copy_length = $fragment_index_offset; + $best_from_start = $from_base_fragment_index; + $best_to_start = $to_base_fragment_index; + } } } - $from_base_fragment_index += mb_strlen($from_base_fragment); + $from_base_fragment_index += $from_base_fragment_length; // If match is larger than half segment size, no point trying to find better // TODO: Really? if ( $best_copy_length >= $from_segment_length / 2) { From 052f4f7aa041de7d4586526df064f35f97b1b467 Mon Sep 17 00:00:00 2001 From: Peter Bagnall Date: Wed, 17 Jul 2013 16:47:18 +0100 Subject: [PATCH 4/4] Update README it now works with UTF-8, thanks to Christoph Mewes. 
--- README | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/README b/README index c0f9f4a..adbcca3 100644 --- a/README +++ b/README @@ -73,21 +73,7 @@ FAQ --- * Does it work with UTF-8? - As of now, the code assume single-byte characters. To use UTF-8 text, you can - always convert the encoding using mb_convert_encoding(): - ... - $from_text = mb_convert_encoding($from_text_utf8, 'HTML-ENTITIES', 'UTF-8'); - $to_text = mb_convert_encoding($to_text_utf8, 'HTML-ENTITIES', 'UTF-8'); - $diff_opcodes = FineDiff::getDiffOpcodes($from_text, $to_text); - ... - - If ever you want to re-generate the $to_text_utf8 from the $from_text_utf8: - ... - $from_text = mb_convert_encoding($from_text_utf8, 'HTML-ENTITIES', 'UTF-8'); - $to_text = FineDiff::renderToTextFromOpcodes($from_text, $diff_opcodes); - $to_text_utf8 = mb_convert_encoding($to_text, 'UTF-8', 'HTML-ENTITIES'); - .... - + Yes! License -------