From 2d0faa2b743e2b90e9a0e2f5b5fe5da147c247e6 Mon Sep 17 00:00:00 2001 From: Jamie Peabody Date: Sat, 5 Jan 2013 20:06:33 +0000 Subject: [PATCH] Tidy-up. --- lib/diff.py | 690 ------------------------------------------------ lib/mergely.css | 1 - 2 files changed, 691 deletions(-) delete mode 100755 lib/diff.py diff --git a/lib/diff.py b/lib/diff.py deleted file mode 100755 index 6f5bc4d..0000000 --- a/lib/diff.py +++ /dev/null @@ -1,690 +0,0 @@ -class MyersDiffer: - """ - An implementation of Eugene Myers's O(ND) Diff algorithm based on GNU diff. - """ - SNAKE_LIMIT = 20 - - DISCARD_NONE = 0 - DISCARD_FOUND = 1 - DISCARD_CANCEL = 2 - - # The Myers diff algorithm effectively turns the diff problem into a graph - # search. It works by finding the "shortest middle snake," which - - class DiffData: - def __init__(self, data): - self.data = data - self.length = len(data) - self.modified = {} - self.undiscarded = [] - self.undiscarded_lines = 0 - self.real_indexes = [] - - def is_modified(self, line): - return self.modified.has_key(line) and self.modified[line] - - - def __init__(self, a, b, ignore_space=False): - if type(a) != type(b): - raise TypeError - - self.a = a - self.b = b - self.code_table = {} - self.last_code = 0 - self.a_data = self.b_data = None - self.ignore_space = ignore_space - self.minimal_diff = False - - # SMS State - self.max_lines = 0 - self.fdiag = None - self.bdiag = None - - def ratio(self): - self._gen_diff_data() - a_equals = self.a_data.length - len(self.a_data.modified) - b_equals = self.b_data.length - len(self.b_data.modified) - - return 1.0 * (a_equals + b_equals) / \ - (self.a_data.length + self.b_data.length) - - def get_opcodes(self): - """ - Generator that returns opcodes representing the contents of the - diff. - - The resulting opcodes are in the format of - (tag, i1, i2, j1, j2) - """ - self._gen_diff_data() - - a_line = b_line = 0 - last_group = None - - # Go through the entire set of lines on both the old and new files - while a_line < self.a_data.length or b_line < self.b_data.length: - a_start = a_line - b_start = b_line - - if a_line < self.a_data.length and \ - not self.a_data.is_modified(a_line) and \ - b_line < self.b_data.length and \ - not self.b_data.is_modified(b_line): - # Equal - a_changed = b_changed = 1 - tag = "equal" - a_line += 1 - b_line += 1 - else: - # Deleted, inserted or replaced - - # Count every old line that's been modified, and the - # remainder of old lines if we've reached the end of the new - # file. - while a_line < self.a_data.length and \ - (b_line >= self.b_data.length or \ - self.a_data.is_modified(a_line)): - a_line += 1 - - # Count every new line that's been modified, and the - # remainder of new lines if we've reached the end of the old - # file. - while b_line < self.b_data.length and \ - (a_line >= self.a_data.length or \ - self.b_data.is_modified(b_line)): - b_line += 1 - - a_changed = a_line - a_start - b_changed = b_line - b_start - - assert a_start < a_line or b_start < b_line - assert a_changed != 0 or b_changed != 0 - - if a_changed == 0 and b_changed > 0: - tag = "insert" - elif a_changed > 0 and b_changed == 0: - tag = "delete" - elif a_changed > 0 and b_changed > 0: - tag = "replace" - - if a_changed != b_changed: - if a_changed > b_changed: - a_line -= a_changed - b_changed - elif a_changed < b_changed: - b_line -= b_changed - a_changed - - a_changed = b_changed = min(a_changed, b_changed) - - if last_group and last_group[0] == tag: - last_group = (tag, - last_group[1], last_group[2] + a_changed, - last_group[3], last_group[4] + b_changed) - else: - if last_group: - yield last_group - - last_group = (tag, a_start, a_start + a_changed, - b_start, b_start + b_changed) - - - if not last_group: - last_group = ("equal", 0, self.a_data.length, 0, self.b_data.length) - - yield last_group - - def _gen_diff_data(self): - """ - Generate all the diff data needed to return opcodes or the diff ratio. - This is only called once during the liftime of a MyersDiffer instance. - """ - if self.a_data and self.b_data: - return - - self.a_data = self.DiffData(self._gen_diff_codes(self.a)) - self.b_data = self.DiffData(self._gen_diff_codes(self.b)) - - self._discard_confusing_lines() - - self.max_lines = self.a_data.undiscarded_lines + \ - self.b_data.undiscarded_lines + 3 - - vector_size = self.a_data.undiscarded_lines + \ - self.b_data.undiscarded_lines + 3 - self.fdiag = [0] * vector_size - self.bdiag = [0] * vector_size - self.downoff = self.upoff = self.b_data.undiscarded_lines + 1 - - self._lcs(0, self.a_data.undiscarded_lines, - 0, self.b_data.undiscarded_lines, - self.minimal_diff) - self._shift_chunks(self.a_data, self.b_data) - self._shift_chunks(self.b_data, self.a_data) - - def _gen_diff_codes(self, lines): - """ - Converts all unique lines of text into unique numbers. Comparing - lists of numbers is faster than comparing lists of strings. - """ - codes = [] - - for line in lines: - # TODO: Handle ignoring/triming spaces, ignoring casing, and - # special hooks - - if self.ignore_space: - temp = line.lstrip() - - # We still want to show lines that contain only whitespace. - if temp != "": - line = temp - - if self.code_table.has_key(line): - code = self.code_table[line] - else: - # This is a new, unrecorded line, so mark it and store it. - self.last_code += 1 - code = self.last_code - self.code_table[line] = code - - codes.append(code) - - return codes - - def _find_sms(self, a_lower, a_upper, b_lower, b_upper, find_minimal): - """ - Finds the Shortest Middle Snake. - """ - down_vector = self.fdiag # The vector for the (0, 0) to (x, y) search - up_vector = self.bdiag # The vector for the (u, v) to (N, M) search - - down_k = a_lower - b_lower # The k-line to start the forward search - up_k = a_upper - b_upper # The k-line to start the reverse search - odd_delta = (down_k - up_k) % 2 != 0 - - down_vector[self.downoff + down_k] = a_lower - up_vector[self.upoff + up_k] = a_upper - - dmin = a_lower - b_upper - dmax = a_upper - b_lower - - down_min = down_max = down_k - up_min = up_max = up_k - - cost = 0 - max_cost = max(256, self._very_approx_sqrt(self.max_lines * 4)) - - while True: - cost += 1 - big_snake = False - - if down_min > dmin: - down_min -= 1 - down_vector[self.downoff + down_min - 1] = -1 - else: - down_min += 1 - - if down_max < dmax: - down_max += 1 - down_vector[self.downoff + down_max + 1] = -1 - else: - down_max -= 1 - - # Extend the forward path - for k in xrange(down_max, down_min - 1, -2): - tlo = down_vector[self.downoff + k - 1] - thi = down_vector[self.downoff + k + 1] - - if tlo >= thi: - x = tlo + 1 - else: - x = thi - - y = x - k - old_x = x - - # Find the end of the furthest reaching forward D-path in - # diagonal k - while x < a_upper and y < b_upper and \ - self.a_data.undiscarded[x] == self.b_data.undiscarded[y]: - x += 1 - y += 1 - - if x - old_x > self.SNAKE_LIMIT: - big_snake = True - - down_vector[self.downoff + k] = x - - if odd_delta and up_min <= k <= up_max and \ - up_vector[self.upoff + k] <= x: - return x, y, True, True - - # Extend the reverse path - if up_min > dmin: - up_min -= 1 - up_vector[self.upoff + up_min - 1] = self.max_lines - else: - up_min += 1 - - if up_max < dmax: - up_max += 1 - up_vector[self.upoff + up_max + 1] = self.max_lines - else: - up_max -= 1 - - for k in xrange(up_max, up_min - 1, -2): - tlo = up_vector[self.upoff + k - 1] - thi = up_vector[self.upoff + k + 1] - - if tlo < thi: - x = tlo - else: - x = thi - 1 - - y = x - k - old_x = x - - while x > a_lower and y > b_lower and \ - self.a_data.undiscarded[x - 1] == \ - self.b_data.undiscarded[y - 1]: - x -= 1 - y -= 1 - - if old_x - x > self.SNAKE_LIMIT: - big_snake = True - - up_vector[self.upoff + k] = x - - if not odd_delta and down_min <= k <= down_max and \ - x <= down_vector[self.downoff + k]: - return x, y, True, True - - if find_minimal: - continue - - # Heuristics courtesy of GNU diff. - # - # We check occasionally for a diagonal that made lots of progress - # compared with the edit distance. If we have one, find the one - # that made the most progress and return it. - # - # This gives us better, more dense chunks, instead of lots of - # small ones often starting with replaces. It also makes the output - # closer to that of GNU diff, which more people would expect. - - if cost > 200 and big_snake: - def find_diagonal(minimum, maximum, k, best, diagoff, vector, - vdiff_func, check_x_range, check_y_range, - discard_index, k_offset): - for d in xrange(maximum, minimum - 1, -2): - dd = d - k - x = vector[diagoff + d] - y = x - d - v = vdiff_func(x) * 2 + dd - - if v > 12 * (cost + abs(dd)): - if v > best and \ - check_x_range(x) and check_y_range(y): - # We found a sufficient diagonal. - k = k_offset - x_index = discard_index(x, k) - y_index = discard_index(y, k) - - while self.a_data.undiscarded[x_index] == \ - self.b_data.undiscarded[y_index]: - if k == self.SNAKE_LIMIT - 1 + k_offset: - return x, y, v - - k += 1 - return 0, 0, 0 - - ret_x, ret_y, best = \ - find_diagonal(down_min, down_max, down_k, 0, self.downoff, - down_vector, - lambda x: x - a_lower, - lambda x: a_lower + self.SNAKE_LIMIT <= - x < a_upper, - lambda y: b_lower + self.SNAKE_LIMIT <= - y < b_upper, - lambda i,k: i - k, - 1) - - if best > 0: - return ret_x, ret_y, True, False - - ret_x, ret_y, best = \ - find_diagonal(up_min, up_max, up_k, best, self.upoff, - up_vector, - lambda x: a_upper - x, - lambda x: a_lower < x <= a_upper - - self.SNAKE_LIMIT, - lambda y: b_lower < y <= b_upper - - self.SNAKE_LIMIT, - lambda i,k: i + k, - 0) - - if best > 0: - return ret_x, ret_y, False, True - - continue # XXX - - # If we've reached or gone past the max cost, just give up now - # and report the halfway point between our best results. - if cost >= max_cost: - fx_best = bx_best = 0 - - # Find the forward diagonal that maximized x + y - fxy_best = -1 - for d in xrange(down_max, down_min - 1, -2): - x = min(down_vector[self.downoff + d], a_upper) - y = x - d - - if b_upper < y: - x = b_upper + d - y = b_upper - - if fxy_best < x + y: - fxy_best = x + y - fx_best = x - - # Find the backward diagonal that minimizes x + y - bxy_best = self.max_lines - for d in xrange(up_max, up_min - 1, -2): - x = max(a_lower, up_vector[self.upoff + d]) - y = x - d - - if y < b_lower: - x = b_lower + d - y = b_lower - - if x + y < bxy_best: - bxy_best = x + y - bx_bet = x - - # Use the better of the two diagonals - if a_upper + b_upper - bxy_best < \ - fxy_best - (a_lower + b_lower): - return fx_best, fxy_best - fx_best, True, False - else: - return bx_best, bxy_best - bx_best, False, True - - - raise Exception("The function should not have reached here.") - - def _lcs(self, a_lower, a_upper, b_lower, b_upper, find_minimal): - """ - The divide-and-conquer implementation of the Longest Common - Subsequence (LCS) algorithm. - """ - # Fast walkthrough equal lines at the start - while a_lower < a_upper and b_lower < b_upper and \ - self.a_data.undiscarded[a_lower] == \ - self.b_data.undiscarded[b_lower]: - a_lower += 1 - b_lower += 1 - - while a_upper > a_lower and b_upper > b_lower and \ - self.a_data.undiscarded[a_upper - 1] == \ - self.b_data.undiscarded[b_upper - 1]: - a_upper -= 1 - b_upper -= 1 - - if a_lower == a_upper: - # Inserted lines. - while b_lower < b_upper: - self.b_data.modified[self.b_data.real_indexes[b_lower]] = True - b_lower += 1 - elif b_lower == b_upper: - # Deleted lines - while a_lower < a_upper: - self.a_data.modified[self.a_data.real_indexes[a_lower]] = True - a_lower += 1 - else: - # Find the middle snake and length of an optimal path for A and B - x, y, low_minimal, high_minimal = \ - self._find_sms(a_lower, a_upper, b_lower, b_upper, - find_minimal) - - self._lcs(a_lower, x, b_lower, y, low_minimal) - self._lcs(x, a_upper, y, b_upper, high_minimal) - - def _shift_chunks(self, data, other_data): - """ - Shifts the inserts/deletes of identical lines in order to join - the changes together a bit more. This has the effect of cleaning - up the diff. - - Often times, a generated diff will have two identical lines before - and after a chunk (say, a blank line). The default algorithm will - insert at the front of that range and include two blank lines at the - end, but that does not always produce the best looking diff. Since - the two lines are identical, we can shift the chunk so that the line - appears both before and after the line, rather than only after. - """ - i = j = 0 - i_end = data.length - - while True: - # Scan forward in order to find the start of a run of changes. - while i < i_end and not data.is_modified(i): - i += 1 - - while other_data.is_modified(j): - j += 1 - - if i == i_end: - return; - - start = i - - # Find the end of these changes - i += 1 - while data.is_modified(i): - i += 1 - - while other_data.is_modified(j): - j += 1 - - while True: - run_length = i - start - - # Move the changed chunks back as long as the previous - # unchanged line matches the last changed line. - # This merges with the previous changed chunks. - while start != 0 and data.data[start - 1] == data.data[i - 1]: - start -= 1 - i -= 1 - - data.modified[start] = True - data.modified[i] = False - - while data.is_modified(start - 1): - start -= 1 - - j -= 1 - while other_data.is_modified(j): - j -= 1 - - # The end of the changed run at the last point where it - # corresponds to the changed run in the other data set. - # If it's equal to i_end, then we didn't find a corresponding - # point. - if other_data.is_modified(j - 1): - corresponding = i - else: - corresponding = i_end - - # Move the changed region forward as long as the first - # changed line is the same as the following unchanged line. - while i != i_end and data.data[start] == data.data[i]: - data.modified[start] = False - data.modified[i] = True - - start += 1 - i += 1 - - while data.is_modified(i): - i += 1 - - j += 1 - while other_data.is_modified(j): - j += 1 - corresponding = i - - if run_length == i - start: - break - - # Move the fully-merged run back to a corresponding run in the - # other data set, if we can. - while corresponding < i: - start -= 1 - i -= 1 - - data.modified[start] = True - data.modified[i] = False - - j -= 1 - while other_data.is_modified(j): - j -= 1 - - def _discard_confusing_lines(self): - def build_discard_list(data, discards, counts): - many = 5 * self._very_approx_sqrt(data.length / 64) - - for i, item in enumerate(data.data): - if item != 0: - num_matches = counts[item] - - if num_matches == 0: - discards[i] = self.DISCARD_FOUND - elif num_matches > many: - discards[i] = self.DISCARD_CANCEL - - def scan_run(discards, i, length, index_func): - consec = 0 - - for j in xrange(length): - index = index_func(i, j) - discard = discards[index] - - if j >= 8 and discard == self.DISCARD_FOUND: - break - - if discard == self.DISCARD_FOUND: - consec += 1 - else: - consec = 0 - - if discard == self.DISCARD_CANCEL: - discards[index] = self.DISCARD_NONE - - if consec == 3: - break - - def check_discard_runs(data, discards): - i = 0 - while i < data.length: - # Cancel the provisional discards that are not in the middle - # of a run of discards - if discards[i] == self.DISCARD_CANCEL: - discards[i] = self.DISCARD_NONE - elif discards[i] == self.DISCARD_FOUND: - # We found a provisional discard - provisional = 0 - - # Find the end of this run of discardable lines and count - # how many are provisionally discardable. - #for j in xrange(i, data.length): - j = i - while j < data.length: - if discards[j] == self.DISCARD_NONE: - break - elif discards[j] == self.DISCARD_CANCEL: - provisional += 1 - j += 1 - - # Cancel the provisional discards at the end and shrink - # the run. - while j > i and discards[j - 1] == self.DISCARD_CANCEL: - j -= 1 - discards[j] = 0 - provisional -= 1 - - length = j - i - - # If 1/4 of the lines are provisional, cancel discarding - # all the provisional lines in the run. - if provisional * 4 > length: - while j > i: - j -= 1 - if discards[j] == self.DISCARD_CANCEL: - discards[j] = self.DISCARD_NONE - else: - minimum = 1 + self._very_approx_sqrt(length / 4) - j = 0 - consec = 0 - while j < length: - if discards[i + j] != self.DISCARD_CANCEL: - consec = 0 - else: - consec += 1 - if minimum == consec: - j -= consec - elif minimum < consec: - discards[i + j] = self.DISCARD_NONE - - j += 1 - - scan_run(discards, i, length, lambda x,y: x + y) - i += length - 1 - scan_run(discards, i, length, lambda x,y: x - y) - - i += 1 - - def discard_lines(data, discards): - j = 0 - for i, item in enumerate(data.data): - if self.minimal_diff or discards[i] == self.DISCARD_NONE: - data.undiscarded[j] = item - data.real_indexes[j] = i - j += 1 - else: - data.modified[i] = True - - data.undiscarded_lines = j - - - self.a_data.undiscarded = [0] * self.a_data.length - self.b_data.undiscarded = [0] * self.b_data.length - self.a_data.real_indexes = [0] * self.a_data.length - self.b_data.real_indexes = [0] * self.b_data.length - a_discarded = [0] * self.a_data.length - b_discarded = [0] * self.b_data.length - a_code_counts = [0] * (1 + self.last_code) - b_code_counts = [0] * (1 + self.last_code) - - for item in self.a_data.data: - a_code_counts[item] += 1 - - for item in self.b_data.data: - b_code_counts[item] += 1 - - build_discard_list(self.a_data, a_discarded, b_code_counts) - build_discard_list(self.b_data, b_discarded, a_code_counts) - - check_discard_runs(self.a_data, a_discarded) - check_discard_runs(self.b_data, b_discarded) - - discard_lines(self.a_data, a_discarded) - discard_lines(self.b_data, b_discarded) - - - def _very_approx_sqrt(self, i): - result = 1 - i /= 4 - while i > 0: - i /= 4 - result *= 2 - - return result - \ No newline at end of file diff --git a/lib/mergely.css b/lib/mergely.css index fc713fe..3da21cf 100644 --- a/lib/mergely.css +++ b/lib/mergely.css @@ -11,7 +11,6 @@ .mergely-column { border: 1px solid #ccc; } .mergely-active { border: 1px solid #4ba3fa; } - .mergely.a.rhs.start { border-top: 1px solid #4ba3fa; } .mergely.a.lhs.start.end, .mergely.a.rhs.end { border-bottom: 1px solid #4ba3fa; }