skrap · pbagnall · Jul 15, 2013 · Jul 15, 2013 · Jul 17, 2013 · Jul 17, 2013
diff --git a/README b/README
@@ -73,21 +73,7 @@ FAQ
 ---
 
 * Does it work with UTF-8?
-  As of now, the code assume single-byte characters. To use UTF-8 text, you can
-  always convert the encoding using mb_convert_encoding():
-    ...
-    $from_text = mb_convert_encoding($from_text_utf8, 'HTML-ENTITIES', 'UTF-8');
-    $to_text = mb_convert_encoding($to_text_utf8, 'HTML-ENTITIES', 'UTF-8');
-    $diff_opcodes = FineDiff::getDiffOpcodes($from_text, $to_text);
-    ...
-
-  If ever you want to re-generate the $to_text_utf8 from the $from_text_utf8:
-    ...
-    $from_text = mb_convert_encoding($from_text_utf8, 'HTML-ENTITIES', 'UTF-8');
-    $to_text = FineDiff::renderToTextFromOpcodes($from_text, $diff_opcodes);
-    $to_text_utf8 = mb_convert_encoding($to_text, 'UTF-8', 'HTML-ENTITIES');
-    ....	
-
+     Yes!
 
 License
 -------

diff --git a/finediff.php b/finediff.php
@@ -34,6 +34,10 @@
  *
  * 10-Dec-2011 (Christoph Mewes):
  *   - added UTF-8 support, fixed strange usage of htmlentities
+ * 
+ * 15-Jul-2013 (Peter Bagnall):
+ *   - fixed bug where getting the diff of "abc def" and "abc def ghi" would fail
+ *     to recognise that def was a match, because whitespace was being included in fragments.
 */
 
 mb_internal_encoding('UTF-8');
@@ -520,12 +524,17 @@ private static function doFragmentDiff($from_text, $to_text, $delimiters) {
 						$fragment_index_offset += $fragment_length;
 						}
 					if ( $fragment_index_offset > $best_copy_length ) {
-						$best_copy_length = $fragment_index_offset;
-						$best_from_start = $from_base_fragment_index;
-						$best_to_start = $to_base_fragment_index;
+						// if the matching string is just made up of delimiters then don't count it as a match. This prevents an
+						// excessive number of whitespaces being seen as matches and therefore breaking up a long replace segment
+						// to no useful purpose.
+						if ($fragment_index_offset > $from_base_fragment_length || self::mb_strspn($from_base_fragment, $delimiters, 0)===0) {
+							$best_copy_length = $fragment_index_offset;
+							$best_from_start = $from_base_fragment_index;
+							$best_to_start = $to_base_fragment_index;
+							}
 						}
 					}
-				$from_base_fragment_index += mb_strlen($from_base_fragment);
+				$from_base_fragment_index += $from_base_fragment_length;
 				// If match is larger than half segment size, no point trying to find better
 				// TODO: Really?
 				if ( $best_copy_length >= $from_segment_length / 2) {
@@ -655,6 +664,12 @@ private static function extractFragments($text, $delimiters) {
 		$start = $end = 0;
 		for (;;) {
 			$end += self::mb_strcspn($text, $delimiters, $end);
+			if ( $end === $start ) {
+				break;
+				}
+			$fragments[$start] = mb_substr($text, $start, $end - $start);
+			$start = $end;
+
 			$end += self::mb_strspn($text, $delimiters, $end);
 			if ( $end === $start ) {
 				break;