diff --git a/src/htmldiff.coffee b/src/htmldiff.coffee
index dd5c658..46cb5a1 100644
--- a/src/htmldiff.coffee
+++ b/src/htmldiff.coffee
@@ -1,23 +1,121 @@
+###
+ * htmldiff.js is a library that compares HTML content. It creates a diff between two
+ * HTML documents by combining the two documents and wrapping the differences with
+ * <ins> and <del> tags. Here is a high-level overview of how the diff works.
+ *
+ * 1. Tokenize the before and after HTML with html_to_tokens.
+ * 2. Generate a list of operations that convert the before list of tokens to the after
+ * list of tokens with calculate_operations, which does the following:
+ * a. Find all the matching blocks of tokens between the before and after lists of
+ * tokens with find_matching_blocks. This is done by finding the single longest
+ * matching block with find_match, then recursively finding the next longest
+ * matching block that precede and follow the longest matching block with
+ * recursively_find_matching_blocks.
+ * b. Determine insertions, deletions, and replacements from the matching blocks.
+ * This is done in calculate_operations.
+ * 3. Render the list of operations by wrapping tokens with <ins> and <del> tags where
+ * appropriate with render_operations.
+ *
+ * Example usage:
+ *
+ * htmldiff = require 'htmldiff.js'
+ *
+ * htmldiff '<p>this is some text</p>', '<p>this is some more text</p>'
+ * == '<p>this is some <ins>more </ins>text</p>'
+ *
+ * htmldiff '<p>this is some text</p>', '<p>this is some more text</p>', 'diff-class'
+ * == '<p>this is some <ins class="diff-class">more </ins>text</p>'
+###
+
# Predicates used by the tokenizer: the first three classify a single character,
# the last two classify a whole token.
is_end_of_tag = (char)-> char == '>'
is_start_of_tag = (char)-> char == '<'
is_whitespace = (char)-> /^\s+$/.test(char)
# A token is a tag when it is exactly one <...> element, optionally padded by whitespace.
is_tag = (token)-> /^\s*<[^>]+>\s*$/.test(token)
isnt_tag = (token)-> !is_tag(token)
+###
+ * Checks if the current word is the beginning of an atomic tag. An atomic tag is one whose
+ * child nodes should not be compared - the entire tag should be treated as one token. This
+ * is useful for tags where it does not make sense to insert <ins> and <del> tags.
+ *
+ * @param {string} word The characters of the current token read so far.
+ *
+ * @return {string|null} The name of the atomic tag if the word will be an atomic tag,
+ * null otherwise
+###
+is_start_of_atomic_tag = (word)->
+  # The capture group holds the tag name; exec yields null when the word does not
+  # begin with one of the atomic tags.
+  found = /^<(iframe|object|math|svg|script)/.exec word
+  return if found then found[1] else null
+
+###
+ * Checks if the current word is the end of an atomic tag (i.e. it has all the characters,
+ * except for the end bracket of the closing tag, such as "<iframe></iframe").
+ *
+ * @param {string} word The characters of the current token read so far.
+ * @param {string} tag The name of the atomic tag whose closing tag is expected.
+ *
+ * @return {boolean} True if the word ends with "</" followed by the tag name,
+ *    false otherwise.
+###
+is_end_of_atomic_tag = (word, tag)->
+  # The trailing tag.length + 2 characters must be "</" followed by the tag name.
+  (word.substring word.length - tag.length - 2) is "</#{tag}"
+
+###
+ * Checks if a tag is a void tag.
+ *
+ * @param {string} token The token to check.
+ *
+ * @return {boolean} True if the token is a void tag, false otherwise.
+###
+is_void_tag = (token) ->
+  # Only the self-closing form (a tag ending in "/>") is recognised, optionally
+  # surrounded by whitespace.
+  self_closing = /^\s*<[^>]+\/>\s*$/
+  self_closing.test token
+
+###
+ * Checks if a token can be wrapped inside a tag.
+ *
+ * @param {string} token The token to check.
+ *
+ * @return {boolean} True if the token can be wrapped inside a tag, false otherwise.
+###
+is_wrappable = (token) ->
+  # Plain text is always wrappable.
+  return true if isnt_tag token
+  # Among tags, only atomic tags and void tags qualify. Note that for an atomic tag
+  # the value returned is the (truthy) tag name rather than a boolean.
+  (is_start_of_atomic_tag token) or (is_void_tag token)
+
+###
+ * A Match stores the information of a matching block. A matching block is a list of
+ * consecutive tokens that appear in both the before and after lists of tokens.
+ *
+ * @param {number} start_in_before The index of the first token in the list of before tokens.
+ * @param {number} start_in_after The index of the first token in the list of after tokens.
+ * @param {number} length The number of consecutive matching tokens in this block.
+###
class Match
  constructor: (start_in_before, start_in_after, length)->
    # Keep the block's start offsets and size as instance properties.
    @start_in_before = start_in_before
    @start_in_after = start_in_after
    @length = length
    # Inclusive index of the last token of the block in each list.
    @end_in_before = start_in_before + length - 1
    @end_in_after = start_in_after + length - 1
+###
+ * Tokenizes a string of HTML.
+ *
+ * @param {string} html The string to tokenize.
+ *
+ * @return {Array.<string>} The list of tokens.
+###
html_to_tokens = (html)->
mode = 'char'
current_word = ''
+ current_atomic_tag = ''
words = []
for char in html
switch mode
when 'tag'
- if is_end_of_tag char
+ atomic_tag = is_start_of_atomic_tag current_word
+ if atomic_tag
+ mode = 'atomic_tag'
+ current_atomic_tag = atomic_tag
+ current_word += char
+ else if is_end_of_tag char
current_word += '>'
words.push current_word
current_word = ''
@@ -27,6 +125,16 @@ html_to_tokens = (html)->
mode = 'char'
else
current_word += char
+ when 'atomic_tag'
+ if (is_end_of_tag char) \
+ and (is_end_of_atomic_tag current_word, current_atomic_tag)
+ current_word += '>'
+ words.push current_word
+ current_word = ''
+ current_atomic_tag = ''
+ mode = 'char'
+ else
+ current_word += char
when 'char'
if is_start_of_tag char
words.push current_word if current_word
@@ -36,11 +144,19 @@ html_to_tokens = (html)->
words.push current_word if current_word
current_word = char
mode = 'whitespace'
- else if /[\w\#@]+/i.test char
+ else if /[\w\d\#@]/.test char
+ # Consider '#' as part of the same word, since it might be part of an HTML escaped
+ # character (e.g. '&#160;').
current_word += char
- else
+ else if /&/.test char
+ # Consider '&' as the start of a new word, since it might be the start of an HTML
+ # escaped character (e.g. '&nbsp;').
words.push current_word if current_word
current_word = char
+ else
+ current_word += char
+ words.push current_word
+ current_word = ''
when 'whitespace'
if is_start_of_tag char
words.push current_word if current_word
@@ -57,6 +173,43 @@ html_to_tokens = (html)->
words.push current_word if current_word
return words
+###
+ * Creates a key that should be used to match tokens. This is useful, for example, if we want
+ * to consider two open tag tokens as equal, even if they don't have the same attributes. We
+ * use a key instead of overwriting the token because we may want to render the original string
+ * without losing the attributes.
+ *
+ * @param {string} token The token to create the key for.
+ *
+ * @return {string} The identifying key that should be used to match before and after tokens.
+###
+get_key_for_token = (token)->
+  # If the token is a tag, return just the lower-cased tag name with no attributes, since
+  # we do not compare attributes yet.
+  tag_name = /<([^\s>]+)[\s>]/.exec token
+  return "<#{tag_name[1].toLowerCase()}>" if tag_name
+
+  # If the token is text, collapse adjacent whitespace and replace non-breaking space
+  # entities (named and numeric) with a regular space, so they all share one key.
+  return token.replace /(\s+|&nbsp;|&#160;)/g, ' ' if token
+
+  # Empty or missing tokens are returned unchanged.
+  return token
+
+###
+ * Finds the matching block with the most consecutive tokens within the given range in the
+ * before and after lists of tokens.
+ *
+ * @param {Array.<string>} before_tokens The before list of tokens.
+ * @param {Array.<string>} after_tokens The after list of tokens.
+ * @param {Object} index_of_before_locations_in_after_tokens The index that is used to search
+ * for tokens in the after list.
+ * @param {number} start_in_before The beginning of the range in the list of before tokens.
+ * @param {number} end_in_before The end of the range in the list of before tokens.
+ * @param {number} start_in_after The beginning of the range in the list of after tokens.
+ * @param {number} end_in_after The end of the range in the list of after tokens.
+ *
+ * @return {Match} A Match that describes the best matching block in the given range.
+###
find_match = (before_tokens, after_tokens,
index_of_before_locations_in_after_tokens,
start_in_before, end_in_before,
@@ -70,7 +223,7 @@ find_match = (before_tokens, after_tokens,
for index_in_before in [start_in_before...end_in_before]
new_match_length_at = {}
- looking_for = before_tokens[index_in_before]
+ looking_for = get_key_for_token before_tokens[index_in_before]
locations_in_after =
index_of_before_locations_in_after_tokens[looking_for]
@@ -96,6 +249,23 @@ find_match = (before_tokens, after_tokens,
return match
+###
+ * Finds all the matching blocks within the given range in the before and after lists of
+ * tokens. This function is called recursively to find the next best matches that precede
+ * and follow the first best match.
+ *
+ * @param {Array.<string>} before_tokens The before list of tokens.
+ * @param {Array.<string>} after_tokens The after list of tokens.
+ * @param {Object} index_of_before_locations_in_after_tokens The index that is used to search
+ * for tokens in the after list.
+ * @param {number} start_in_before The beginning of the range in the list of before tokens.
+ * @param {number} end_in_before The end of the range in the list of before tokens.
+ * @param {number} start_in_after The beginning of the range in the list of after tokens.
+ * @param {number} end_in_after The end of the range in the list of after tokens.
+ * @param {Array.<Match>} matching_blocks The list of matching blocks found so far.
+ *
+ * @return {Array.<Match>} The list of matching blocks in this range.
+###
recursively_find_matching_blocks = (before_tokens, after_tokens,
index_of_before_locations_in_after_tokens,
start_in_before, end_in_before,
@@ -128,20 +298,44 @@ recursively_find_matching_blocks = (before_tokens, after_tokens,
return matching_blocks
-create_index = (p)->
- throw new Error 'params must have find_these key' unless p.find_these?
- throw new Error 'params must have in_these key' unless p.in_these?
+###
+ * Creates an index (A.K.A. hash table) that will be used to match the list of before
+ * tokens with the list of after tokens.
+ *
+ * @param {Object} options An object with the following:
+ * - {Array.<string>} find_these The list of tokens that will be used to search.
+ * - {Array.<string>} in_these The list of tokens that will be returned.
+ *
+ * @return {Object} An index that can be used to search for tokens.
+###
+create_index = (options)->
+ # Both token lists are required; fail early if either is missing.
+ throw new Error 'params must have find_these key' unless options.find_these?
+ throw new Error 'params must have in_these key' unless options.in_these?
+
+ # Tokens are compared by their key (get_key_for_token), not by their raw string.
+ queries = options.find_these.map (token)->
+ return get_key_for_token token
+ results = options.in_these.map (token)->
+ return get_key_for_token token
index = {}
- for token in p.find_these
- index[token] = []
- idx = p.in_these.indexOf token
+ for query in queries
+ # Collect every position in results whose key equals this query. A key that occurs
+ # more than once in queries is reset and recomputed each time.
+ index[query] = []
+ idx = results.indexOf query
while idx isnt -1
- index[token].push idx
- idx = p.in_these.indexOf token, idx+1
+ index[query].push idx
+ idx = results.indexOf query, idx+1
return index
+###
+ * Finds all the matching blocks in the before and after lists of tokens. This function
+ * is a wrapper for the recursive function recursively_find_matching_blocks.
+ *
+ * @param {Array.<string>} before_tokens The before list of tokens.
+ * @param {Array.<string>} after_tokens The after list of tokens.
+ *
+ * @return {Array.<Match>} The list of matching blocks.
+###
find_matching_blocks = (before_tokens, after_tokens)->
matching_blocks = []
index_of_before_locations_in_after_tokens =
@@ -155,6 +349,23 @@ find_matching_blocks = (before_tokens, after_tokens)->
0, after_tokens.length,
matching_blocks
+###
+ * Gets a list of operations required to transform the before list of tokens into the
+ * after list of tokens. An operation describes whether a particular list of consecutive
+ * tokens are equal, replaced, inserted, or deleted.
+ *
+ * @param {Array.<string>} before_tokens The before list of tokens.
+ * @param {Array.<string>} after_tokens The after list of tokens.
+ *
+ * @return {Array.