diff --git a/README b/README index c0f9f4a..adbcca3 100644 --- a/README +++ b/README @@ -73,21 +73,7 @@ FAQ --- * Does it work with UTF-8? - As of now, the code assume single-byte characters. To use UTF-8 text, you can - always convert the encoding using mb_convert_encoding(): - ... - $from_text = mb_convert_encoding($from_text_utf8, 'HTML-ENTITIES', 'UTF-8'); - $to_text = mb_convert_encoding($to_text_utf8, 'HTML-ENTITIES', 'UTF-8'); - $diff_opcodes = FineDiff::getDiffOpcodes($from_text, $to_text); - ... - - If ever you want to re-generate the $to_text_utf8 from the $from_text_utf8: - ... - $from_text = mb_convert_encoding($from_text_utf8, 'HTML-ENTITIES', 'UTF-8'); - $to_text = FineDiff::renderToTextFromOpcodes($from_text, $diff_opcodes); - $to_text_utf8 = mb_convert_encoding($to_text, 'UTF-8', 'HTML-ENTITIES'); - .... - + Yes! License ------- diff --git a/finediff.php b/finediff.php index 0fad956..0f34e62 100644 --- a/finediff.php +++ b/finediff.php @@ -34,6 +34,10 @@ * * 10-Dec-2011 (Christoph Mewes): * - added UTF-8 support, fixed strange usage of htmlentities + * + * 15-Jul-2013 (Peter Bagnall): + * - fixed bug where getting the diff of "abc def" and "abc def ghi" would fail + * to recognise that def was a match, because whitespace was being included in fragments. */ mb_internal_encoding('UTF-8'); @@ -520,12 +524,17 @@ private static function doFragmentDiff($from_text, $to_text, $delimiters) { $fragment_index_offset += $fragment_length; } if ( $fragment_index_offset > $best_copy_length ) { - $best_copy_length = $fragment_index_offset; - $best_from_start = $from_base_fragment_index; - $best_to_start = $to_base_fragment_index; + // if the matching string is just made up of delimiters then don't count it as a match. This prevents an + // excessive number of whitespaces being seen as matches and therefore breaking up a long replace segment + // to no useful purpose. + if ($fragment_index_offset > $from_base_fragment_length || self::mb_strspn($from_base_fragment, $delimiters, 0)===0) { + $best_copy_length = $fragment_index_offset; + $best_from_start = $from_base_fragment_index; + $best_to_start = $to_base_fragment_index; + } } } - $from_base_fragment_index += mb_strlen($from_base_fragment); + $from_base_fragment_index += $from_base_fragment_length; // If match is larger than half segment size, no point trying to find better // TODO: Really? if ( $best_copy_length >= $from_segment_length / 2) { @@ -655,6 +664,12 @@ private static function extractFragments($text, $delimiters) { $start = $end = 0; for (;;) { $end += self::mb_strcspn($text, $delimiters, $end); + if ( $end === $start ) { + break; + } + $fragments[$start] = mb_substr($text, $start, $end - $start); + $start = $end; + $end += self::mb_strspn($text, $delimiters, $end); if ( $end === $start ) { break;