From db098a6cd519d3592a54b00f824eb7be0eec5313 Mon Sep 17 00:00:00 2001
From: Peter Bagnall <pete@surfaceeffect.com>
Date: Mon, 15 Jul 2013 15:06:23 +0100
Subject: [PATCH 1/4] Update finediff.php

Fixed a bug where diffing "abc def" against "abc def ghi" with wordDelimiters produced "c4d3i7:def ghi" rather than "c7i4: ghi". Whitespace was being treated as part of the preceding word, so "def" (with no trailing whitespace) did not match "def " and was not recognised as a valid match. The solution is to create separate word fragments and whitespace fragments.
---
 finediff.php | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/finediff.php b/finediff.php
index 0fad956..452d224 100644
--- a/finediff.php
+++ b/finediff.php
@@ -655,6 +655,12 @@ private static function extractFragments($text, $delimiters) {
 		$start = $end = 0;
 		for (;;) {
 			$end += self::mb_strcspn($text, $delimiters, $end);
+			if ( $end === $start ) {
+				break;
+				}
+			$fragments[$start] = mb_substr($text, $start, $end - $start);
+			$start = $end;
+
 			$end += self::mb_strspn($text, $delimiters, $end);
 			if ( $end === $start ) {
 				break;

From 3845e4a88a307aa6fe27e6ac8ada2ce1e89c73b3 Mon Sep 17 00:00:00 2001
From: Peter Bagnall <pete@surfaceeffect.com>
Date: Mon, 15 Jul 2013 15:10:39 +0100
Subject: [PATCH 2/4] Update finediff.php

updated edit history
---
 finediff.php | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/finediff.php b/finediff.php
index 452d224..ee63f6a 100644
--- a/finediff.php
+++ b/finediff.php
@@ -34,6 +34,10 @@
  *
  * 10-Dec-2011 (Christoph Mewes):
  *   - added UTF-8 support, fixed strange usage of htmlentities
+ * 
+ * 15-Jul-2013 (Peter Bagnall):
+ *   - fixed bug where getting the diff of "abc def" and "abc def ghi" would fail
+ *     to recognise that def was a match, because whitespace was being included in fragments.
 */
 
 mb_internal_encoding('UTF-8');

From adedee7cb2133247129d14e6478dd9590cb5654f Mon Sep 17 00:00:00 2001
From: Peter Bagnall <pete@surfaceeffect.com>
Date: Wed, 17 Jul 2013 16:40:33 +0100
Subject: [PATCH 3/4] Update finediff.php

Prevents whitespace-only matches from being counted as matches. Otherwise a long run of changed text is not simply replaced as a whole; instead each word is replaced individually and the whitespace between the words is copied. Also removed an unnecessary call to mb_strlen.
---
 finediff.php | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/finediff.php b/finediff.php
index ee63f6a..0f34e62 100644
--- a/finediff.php
+++ b/finediff.php
@@ -524,12 +524,17 @@ private static function doFragmentDiff($from_text, $to_text, $delimiters) {
 						$fragment_index_offset += $fragment_length;
 						}
 					if ( $fragment_index_offset > $best_copy_length ) {
-						$best_copy_length = $fragment_index_offset;
-						$best_from_start = $from_base_fragment_index;
-						$best_to_start = $to_base_fragment_index;
+						// if the matching string is just made up of delimiters then don't count it as a match. This prevents an
+						// excessive number of whitespaces being seen as matches and therefore breaking up a long replace segment
+						// to no useful purpose.
+						if ($fragment_index_offset > $from_base_fragment_length || self::mb_strspn($from_base_fragment, $delimiters, 0)===0) {
+							$best_copy_length = $fragment_index_offset;
+							$best_from_start = $from_base_fragment_index;
+							$best_to_start = $to_base_fragment_index;
+							}
 						}
 					}
-				$from_base_fragment_index += mb_strlen($from_base_fragment);
+				$from_base_fragment_index += $from_base_fragment_length;
 				// If match is larger than half segment size, no point trying to find better
 				// TODO: Really?
 				if ( $best_copy_length >= $from_segment_length / 2) {

From 052f4f7aa041de7d4586526df064f35f97b1b467 Mon Sep 17 00:00:00 2001
From: Peter Bagnall <pete@surfaceeffect.com>
Date: Wed, 17 Jul 2013 16:47:18 +0100
Subject: [PATCH 4/4] Update README

Updated the FAQ: FineDiff now works with UTF-8, thanks to Christoph Mewes.
---
 README | 16 +---------------
 1 file changed, 1 insertion(+), 15 deletions(-)

diff --git a/README b/README
index c0f9f4a..adbcca3 100644
--- a/README
+++ b/README
@@ -73,21 +73,7 @@ FAQ
 ---
 
 * Does it work with UTF-8?
-  As of now, the code assume single-byte characters. To use UTF-8 text, you can
-  always convert the encoding using mb_convert_encoding():
-    ...
-    $from_text = mb_convert_encoding($from_text_utf8, 'HTML-ENTITIES', 'UTF-8');
-    $to_text = mb_convert_encoding($to_text_utf8, 'HTML-ENTITIES', 'UTF-8');
-    $diff_opcodes = FineDiff::getDiffOpcodes($from_text, $to_text);
-    ...
-
-  If ever you want to re-generate the $to_text_utf8 from the $from_text_utf8:
-    ...
-    $from_text = mb_convert_encoding($from_text_utf8, 'HTML-ENTITIES', 'UTF-8');
-    $to_text = FineDiff::renderToTextFromOpcodes($from_text, $diff_opcodes);
-    $to_text_utf8 = mb_convert_encoding($to_text, 'UTF-8', 'HTML-ENTITIES');
-    ....	
-
+     Yes!
 
 License
 -------