Skip to content

Improve speed of hypdiff for large text #5

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
103 changes: 93 additions & 10 deletions lib/hyp_diff.rb
Original file line number Diff line number Diff line change
Expand Up @@ -42,14 +42,14 @@ def compare(before, after, options = {})

# @api private
class NodeMap
def self.for(change_node_tuples, &block)
def self.for(change_node_tuples)
new.build(change_node_tuples).map
end

attr_reader :map

def initialize
@map = {}
@map = Hash.new {|h, k| h[k] = [] }
@stashed = []
end

Expand Down Expand Up @@ -79,8 +79,7 @@ def build(change_node_tuples)
end

def append_to_node(node, change)
list = (@map[node] ||= [])
list << change
@map[node] << change
end
end

Expand All @@ -103,15 +102,63 @@ def render(changes)
changes.each do |change|
case change.action
when "!" then
deletions << change.old_element.text
insertions << change.new_element.text
old_fulltext = change.old_element.fulltext
new_fulltext = change.new_element.fulltext
if old_fulltext.include?(new_fulltext)
if old_fulltext.start_with?(new_fulltext)
apply_insertions_and_deletions
new_text << new_fulltext
deletions << old_fulltext[new_fulltext.length..-1]
next
end
if old_fulltext.end_with?(new_fulltext)
deletions << old_fulltext[0, old_fulltext.length - new_fulltext.length]
apply_insertions_and_deletions
new_text << new_fulltext
next
end
end
if new_fulltext.include?(old_fulltext)
if new_fulltext.start_with?(old_fulltext)
apply_insertions_and_deletions
new_text << old_fulltext
insertions << new_fulltext[old_fulltext.length..-1]
next
end
if new_fulltext.end_with?(old_fulltext)
insertions << new_fulltext[0, new_fulltext.length - old_fulltext.length]
apply_insertions_and_deletions
new_text << old_fulltext
next
end
end
if insertions.empty? && deletions.empty? && change.old_element.before_whitespace && change.new_element.before_whitespace
new_text << " "
deletions << change.old_element.text
insertions << change.new_element.text
next
end
deletions << change.old_element.fulltext
insertions << change.new_element.fulltext
when "=" then
if change.old_element.before_whitespace && !change.new_element.before_whitespace
deletions << " "
apply_insertions_and_deletions
new_text << change.new_element.text
next
end
if change.new_element.before_whitespace && !change.old_element.before_whitespace
insertions << " "
apply_insertions_and_deletions
new_text << change.new_element.text
next
end
apply_insertions_and_deletions
new_text << escape_html(change.new_element.text)
new_text << escape_html(change.new_element.fulltext)
when "+" then
insertions << change.new_element.text
insertions << change.new_element.fulltext
when "-" then
deletions << change.old_element.text
deletions << change.old_element.fulltext
else
raise "unexpected change.action #{change.action}"
end
Expand All @@ -131,6 +178,13 @@ def rendered_text
attr_reader :insertions, :deletions, :new_text

def apply_insertions_and_deletions
while !deletions.empty? && !insertions.empty?
break unless deletions.first == insertions.first

deletions.shift
new_text << insertions.shift
end

if deletions.length > 0
new_text << deletion_tag(deletions.join)
end
Expand Down Expand Up @@ -163,7 +217,7 @@ def parse(text)
end

def extract_text(node)
filter_whitespace(text_fragments(node))
merge_whitespace(filter_whitespace(text_fragments(node)))
end

def text_fragments(node)
Expand All @@ -187,5 +241,34 @@ def filter_whitespace(node_list)
result
end

def merge_whitespace(node_list)
result = []

last_whitespace_node = nil
node_list.each do |node|
if node.whitespace?
last_whitespace_node = node
next
end

unless last_whitespace_node
result << node
next
end

if last_whitespace_node.node.equal?(node.node)
node.before_whitespace = last_whitespace_node
else
result << last_whitespace_node
end
last_whitespace_node = nil
result << node
end

result << last_whitespace_node if last_whitespace_node

result
end

end; end

8 changes: 7 additions & 1 deletion lib/hyp_diff/text_from_node.rb
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
module HypDiff

class TextFromNode
attr_accessor :before_whitespace

def initialize(raw_text, node)
@text = raw_text.strip == "" ? " " : raw_text
@node = node
end

def ==(other)
text == other.text
eql?(other)
end

def eql?(other)
Expand All @@ -22,6 +24,10 @@ def whitespace?
@text == " "
end

def fulltext
before_whitespace ? " #{text}" : text
end

def text
@text
end
Expand Down
33 changes: 22 additions & 11 deletions spec/hyp_diff_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def expect_diff(old, new, expected)
end

it "merges consecutive deletions into a single tag" do
expect_diff("hello beautiful world", "hello world", "hello <del>beautiful </del>world")
expect_diff("hello beautiful world", "hello world", "hello<del> beautiful</del> world")
end

it "merge consecutive additions and edits into single tags" do
Expand Down Expand Up @@ -131,14 +131,19 @@ def expect_diff(old, new, expected)
"hello world",
"hello world"
)
expect_diff(
"<span>hello </span>world",
"hello<span> world</span>",
"hello<span> world</span>"
)
end

it "considers trailing and leading whitespace for insertions and deletions" do
it "treats consecutive whitespace as a single whitespace across tags (best effort for special cases)" do
expect(
HypDiff.compare(
"<span>hello </span>world",
"hello<span> world</span>",
)
).to eq("hello<span> world</span>")
.or eq("hello<del> </del><span><ins> </ins>world</span>")
end

it "considers trailing and leading whitespace for insertions and deletions", :aggregate_failures do
expect_diff("hello", "hello world", "hello<ins> world</ins>")
expect_diff("hello world", "hello", "hello<del> world</del>")
expect_diff("world", "hello world", "<ins>hello </ins>world")
Expand All @@ -149,14 +154,14 @@ def expect_diff(old, new, expected)
expect_diff("hello world", "hello ", "hello <del>world</del>")
end

it "considers trailing and leading whitespace changes" do
it "considers trailing and leading whitespace changes", :aggregate_failures do
expect_diff("hello ", "hello", "hello<del> </del>")
expect_diff("hello", "hello ", "hello<ins> </ins>")
expect_diff(" hello", "hello", "<del> </del>hello")
expect_diff("hello", " hello", "<ins> </ins>hello")
end

it "considers changes of text and whitespace" do
it "considers changes of text and whitespace", :aggregate_failures do
expect_diff("hello world ", "hello friend", "hello <del>world </del><ins>friend</ins>")
expect_diff(" bye world", "hello world", "<del> bye</del><ins>hello</ins> world")
expect_diff("hello friend", "hello world ", "hello <del>friend</del><ins>world </ins>")
Expand All @@ -168,7 +173,7 @@ def expect_diff(old, new, expected)
expect_diff("hello world", "hello, world", "hello<ins>,</ins> world")
end

it "diffs changes of punctuation to words" do
it "diffs changes of punctuation to words", :aggregate_failures do
expect_diff(
"hello, world",
"hello beautiful world",
Expand All @@ -181,7 +186,7 @@ def expect_diff(old, new, expected)
)
end

it "diffs changes of punctuation to leading and trailing spaces" do
it "diffs changes of punctuation to leading and trailing spaces", :aggregate_failures do
expect_diff("hello.", "hello ", "hello<del>.</del><ins> </ins>")
expect_diff("hello ", "hello.", "hello<del> </del><ins>.</ins>")
expect_diff(" hello", ".hello", "<del> </del><ins>.</ins>hello")
Expand All @@ -192,4 +197,10 @@ def expect_diff(old, new, expected)
expect_diff("hello world", "hello world.", "hello world<ins>.</ins>")
end

it "converts newlines to spaces" do
content =
"<h3>Office philosophy</h3>\n<h4> No set working places</h4>\n<p> bla bla </p>"
expect_diff(content, content,
"<h3>Office philosophy</h3> <h4>No set working places</h4> <p>bla bla </p>")
end
end
24 changes: 24 additions & 0 deletions spec/large_text_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# encoding: utf-8
require "hyp_diff"
require "securerandom"

describe HypDiff do
context "for large text" do
it "performs reasonably fast" do
words = []
300.times do
8.times do |i|
words << SecureRandom.hex([i + 2, 6].min)
end
words << "replace"
words << "me."
end
text = words.join(" ")
modified_text = text.gsub("replace me", "better text")
start = Time.now

HypDiff.compare(text, modified_text)
expect(Time.now - start).to be < 1
end
end
end