Newer
Older
catch
committed
<?php
namespace Drupal\Component\Diff\Engine;
use Drupal\Component\Utility\Unicode;
/**
* Class used internally by Diff to actually compute the diffs.
*
* The algorithm used here is mostly lifted from the perl module
* Algorithm::Diff (version 1.06) by Ned Konz, which is available at:
* http://www.perl.com/CPAN/authors/id/N/NE/NEDKONZ/Algorithm-Diff-1.06.zip
*
* More ideas are taken from:
* http://www.ics.uci.edu/~eppstein/161/960229.html
*
Alex Pott
committed
* Some ideas (and a bit of code) are from analyze.c, from GNU
catch
committed
* diffutils-2.7, which can be found at:
* ftp://gnudist.gnu.org/pub/gnu/diffutils/diffutils-2.7.tar.gz
*
* closingly, some ideas (subdivision by NCHUNKS > 2, and some optimizations)
* are my own.
*
* Line length limits for robustness added by Tim Starling, 2005-08-31
*
* @author Geoffrey T. Dairiki, Tim Starling
* @private
* @subpackage DifferenceEngine
*/
class DiffEngine {
const USE_ASSERTS = FALSE;
const MAX_XREF_LENGTH = 10000;
public function diff($from_lines, $to_lines) {
$n_from = sizeof($from_lines);
$n_to = sizeof($to_lines);
$this->xchanged = $this->ychanged = [];
$this->xv = $this->yv = [];
$this->xind = $this->yind = [];
catch
committed
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
unset($this->seq);
unset($this->in_seq);
unset($this->lcs);
// Skip leading common lines.
for ($skip = 0; $skip < $n_from && $skip < $n_to; $skip++) {
if ($from_lines[$skip] !== $to_lines[$skip]) {
break;
}
$this->xchanged[$skip] = $this->ychanged[$skip] = FALSE;
}
// Skip trailing common lines.
$xi = $n_from;
$yi = $n_to;
for ($endskip = 0; --$xi > $skip && --$yi > $skip; $endskip++) {
if ($from_lines[$xi] !== $to_lines[$yi]) {
break;
}
$this->xchanged[$xi] = $this->ychanged[$yi] = FALSE;
}
// Ignore lines which do not exist in both files.
for ($xi = $skip; $xi < $n_from - $endskip; $xi++) {
$xhash[$this->_line_hash($from_lines[$xi])] = 1;
}
for ($yi = $skip; $yi < $n_to - $endskip; $yi++) {
$line = $to_lines[$yi];
if ($this->ychanged[$yi] = empty($xhash[$this->_line_hash($line)])) {
continue;
}
$yhash[$this->_line_hash($line)] = 1;
$this->yv[] = $line;
$this->yind[] = $yi;
}
for ($xi = $skip; $xi < $n_from - $endskip; $xi++) {
$line = $from_lines[$xi];
if ($this->xchanged[$xi] = empty($yhash[$this->_line_hash($line)])) {
continue;
}
$this->xv[] = $line;
$this->xind[] = $xi;
}
// Find the LCS.
$this->_compareseq(0, sizeof($this->xv), 0, sizeof($this->yv));
// Merge edits when possible
$this->_shift_boundaries($from_lines, $this->xchanged, $this->ychanged);
$this->_shift_boundaries($to_lines, $this->ychanged, $this->xchanged);
// Compute the edit operations.
$edits = [];
catch
committed
$xi = $yi = 0;
while ($xi < $n_from || $yi < $n_to) {
$this::USE_ASSERTS && assert($yi < $n_to || $this->xchanged[$xi]);
$this::USE_ASSERTS && assert($xi < $n_from || $this->ychanged[$yi]);
// Skip matching "snake".
$copy = [];
catch
committed
while ( $xi < $n_from && $yi < $n_to && !$this->xchanged[$xi] && !$this->ychanged[$yi]) {
$copy[] = $from_lines[$xi++];
++$yi;
}
if ($copy) {
$edits[] = new DiffOpCopy($copy);
}
// Find deletes & adds.
$delete = [];
catch
committed
while ($xi < $n_from && $this->xchanged[$xi]) {
$delete[] = $from_lines[$xi++];
}
$add = [];
catch
committed
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
while ($yi < $n_to && $this->ychanged[$yi]) {
$add[] = $to_lines[$yi++];
}
if ($delete && $add) {
$edits[] = new DiffOpChange($delete, $add);
}
elseif ($delete) {
$edits[] = new DiffOpDelete($delete);
}
elseif ($add) {
$edits[] = new DiffOpAdd($add);
}
}
return $edits;
}
/**
* Returns the whole line if it's small enough, or the MD5 hash otherwise.
*/
protected function _line_hash($line) {
if (Unicode::strlen($line) > $this::MAX_XREF_LENGTH) {
return md5($line);
}
else {
return $line;
}
}
/**
* Divide the Largest Common Subsequence (LCS) of the sequences
* [XOFF, XLIM) and [YOFF, YLIM) into NCHUNKS approximately equally
* sized segments.
*
* Returns (LCS, PTS). LCS is the length of the LCS. PTS is an
* array of NCHUNKS+1 (X, Y) indexes giving the diving points between
* sub sequences. The first sub-sequence is contained in [X0, X1),
* [Y0, Y1), the second in [X1, X2), [Y1, Y2) and so on. Note
* that (X0, Y0) == (XOFF, YOFF) and
* (X[NCHUNKS], Y[NCHUNKS]) == (XLIM, YLIM).
*
* This function assumes that the first lines of the specified portions
* of the two files do not match, and likewise that the last lines do not
* match. The caller must trim matching lines from the beginning and end
* of the portions it is going to specify.
*/
protected function _diag($xoff, $xlim, $yoff, $ylim, $nchunks) {
$flip = FALSE;
if ($xlim - $xoff > $ylim - $yoff) {
// Things seems faster (I'm not sure I understand why)
// when the shortest sequence in X.
$flip = TRUE;
list($xoff, $xlim, $yoff, $ylim) = [$yoff, $ylim, $xoff, $xlim];
catch
committed
}
if ($flip) {
for ($i = $ylim - 1; $i >= $yoff; $i--) {
$ymatches[$this->xv[$i]][] = $i;
}
}
else {
for ($i = $ylim - 1; $i >= $yoff; $i--) {
$ymatches[$this->yv[$i]][] = $i;
}
}
$this->lcs = 0;
Alex Pott
committed
$this->seq[0] = $yoff - 1;
$this->in_seq = [];
$ymids[0] = [];
catch
committed
$numer = $xlim - $xoff + $nchunks - 1;
$x = $xoff;
for ($chunk = 0; $chunk < $nchunks; $chunk++) {
if ($chunk > 0) {
for ($i = 0; $i <= $this->lcs; $i++) {
Alex Pott
committed
$ymids[$i][$chunk - 1] = $this->seq[$i];
catch
committed
}
}
Alex Pott
committed
$x1 = $xoff + (int)(($numer + ($xlim - $xoff) * $chunk) / $nchunks);
catch
committed
for ( ; $x < $x1; $x++) {
$line = $flip ? $this->yv[$x] : $this->xv[$x];
if (empty($ymatches[$line])) {
continue;
}
$matches = $ymatches[$line];
reset($matches);
while (list ($junk, $y) = each($matches)) {
if (empty($this->in_seq[$y])) {
$k = $this->_lcs_pos($y);
$this::USE_ASSERTS && assert($k > 0);
Alex Pott
committed
$ymids[$k] = $ymids[$k - 1];
catch
committed
break;
}
}
while (list ($junk, $y) = each($matches)) {
Alex Pott
committed
if ($y > $this->seq[$k - 1]) {
catch
committed
$this::USE_ASSERTS && assert($y < $this->seq[$k]);
// Optimization: this is a common case:
// next match is just replacing previous match.
$this->in_seq[$this->seq[$k]] = FALSE;
$this->seq[$k] = $y;
$this->in_seq[$y] = 1;
}
elseif (empty($this->in_seq[$y])) {
$k = $this->_lcs_pos($y);
$this::USE_ASSERTS && assert($k > 0);
Alex Pott
committed
$ymids[$k] = $ymids[$k - 1];
catch
committed
}
}
}
}
$seps[] = $flip ? [$yoff, $xoff] : [$xoff, $yoff];
catch
committed
$ymid = $ymids[$this->lcs];
for ($n = 0; $n < $nchunks - 1; $n++) {
$x1 = $xoff + (int)(($numer + ($xlim - $xoff) * $n) / $nchunks);
$y1 = $ymid[$n] + 1;
$seps[] = $flip ? [$y1, $x1] : [$x1, $y1];
catch
committed
}
$seps[] = $flip ? [$ylim, $xlim] : [$xlim, $ylim];
catch
committed
return [$this->lcs, $seps];
catch
committed
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
}
protected function _lcs_pos($ypos) {
$end = $this->lcs;
if ($end == 0 || $ypos > $this->seq[$end]) {
$this->seq[++$this->lcs] = $ypos;
$this->in_seq[$ypos] = 1;
return $this->lcs;
}
$beg = 1;
while ($beg < $end) {
$mid = (int)(($beg + $end) / 2);
if ($ypos > $this->seq[$mid]) {
$beg = $mid + 1;
}
else {
$end = $mid;
}
}
$this::USE_ASSERTS && assert($ypos != $this->seq[$end]);
$this->in_seq[$this->seq[$end]] = FALSE;
$this->seq[$end] = $ypos;
$this->in_seq[$ypos] = 1;
return $end;
}
/**
* Find LCS of two sequences.
*
* The results are recorded in the vectors $this->{x,y}changed[], by
* storing a 1 in the element for each line that is an insertion
* or deletion (ie. is not in the LCS).
*
* The subsequence of file 0 is [XOFF, XLIM) and likewise for file 1.
*
* Note that XLIM, YLIM are exclusive bounds.
* All line numbers are origin-0 and discarded lines are not counted.
*/
protected function _compareseq($xoff, $xlim, $yoff, $ylim) {
// Slide down the bottom initial diagonal.
while ($xoff < $xlim && $yoff < $ylim && $this->xv[$xoff] == $this->yv[$yoff]) {
++$xoff;
++$yoff;
}
// Slide up the top initial diagonal.
while ($xlim > $xoff && $ylim > $yoff && $this->xv[$xlim - 1] == $this->yv[$ylim - 1]) {
--$xlim;
--$ylim;
}
if ($xoff == $xlim || $yoff == $ylim) {
$lcs = 0;
}
else {
// This is ad hoc but seems to work well.
//$nchunks = sqrt(min($xlim - $xoff, $ylim - $yoff) / 2.5);
//$nchunks = max(2, min(8, (int)$nchunks));
$nchunks = min(7, $xlim - $xoff, $ylim - $yoff) + 1;
list($lcs, $seps)
= $this->_diag($xoff, $xlim, $yoff, $ylim, $nchunks);
}
if ($lcs == 0) {
// X and Y sequences have no common subsequence:
// mark all changed.
while ($yoff < $ylim) {
$this->ychanged[$this->yind[$yoff++]] = 1;
}
while ($xoff < $xlim) {
$this->xchanged[$this->xind[$xoff++]] = 1;
}
}
else {
// Use the partitions to split this problem into subproblems.
reset($seps);
$pt1 = $seps[0];
while ($pt2 = next($seps)) {
$this->_compareseq ($pt1[0], $pt2[0], $pt1[1], $pt2[1]);
$pt1 = $pt2;
}
}
}
/**
* Adjust inserts/deletes of identical lines to join changes
* as much as possible.
*
* We do something when a run of changed lines include a
* line at one end and has an excluded, identical line at the other.
* We are free to choose which identical line is included.
* `compareseq' usually chooses the one at the beginning,
* but usually it is cleaner to consider the following identical line
* to be the "change".
*
* This is extracted verbatim from analyze.c (GNU diffutils-2.7).
*/
protected function _shift_boundaries($lines, &$changed, $other_changed) {
$i = 0;
$j = 0;
$this::USE_ASSERTS && assert('sizeof($lines) == sizeof($changed)');
$len = sizeof($lines);
$other_len = sizeof($other_changed);
while (1) {
/*
* Scan forwards to find beginning of another run of changes.
* Also keep track of the corresponding point in the other file.
*
* Throughout this code, $i and $j are adjusted together so that
* the first $i elements of $changed and the first $j elements
* of $other_changed both contain the same number of zeros
* (unchanged lines).
* Furthermore, $j is always kept so that $j == $other_len or
* $other_changed[$j] == FALSE.
*/
while ($j < $other_len && $other_changed[$j]) {
$j++;
}
Alex Pott
committed
while ($i < $len && !$changed[$i]) {
catch
committed
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
$this::USE_ASSERTS && assert('$j < $other_len && ! $other_changed[$j]');
$i++;
$j++;
while ($j < $other_len && $other_changed[$j]) {
$j++;
}
}
if ($i == $len) {
break;
}
$start = $i;
// Find the end of this run of changes.
while (++$i < $len && $changed[$i]) {
continue;
}
do {
/*
* Record the length of this run of changes, so that
* we can later determine whether the run has grown.
*/
$runlength = $i - $start;
/*
* Move the changed region back, so long as the
* previous unchanged line matches the last changed one.
* This merges with previous changed regions.
*/
while ($start > 0 && $lines[$start - 1] == $lines[$i - 1]) {
$changed[--$start] = 1;
$changed[--$i] = FALSE;
while ($start > 0 && $changed[$start - 1]) {
$start--;
}
$this::USE_ASSERTS && assert('$j > 0');
while ($other_changed[--$j]) {
continue;
}
$this::USE_ASSERTS && assert('$j >= 0 && !$other_changed[$j]');
}
/*
* Set CORRESPONDING to the end of the changed run, at the last
* point where it corresponds to a changed run in the other file.
* CORRESPONDING == LEN means no such point has been found.
*/
$corresponding = $j < $other_len ? $i : $len;
/*
* Move the changed region forward, so long as the
* first changed line matches the following unchanged one.
* This merges with following changed regions.
* Do this second, so that if there are no merges,
* the changed region is moved forward as far as possible.
*/
while ($i < $len && $lines[$start] == $lines[$i]) {
$changed[$start++] = FALSE;
$changed[$i++] = 1;
while ($i < $len && $changed[$i]) {
$i++;
}
$this::USE_ASSERTS && assert('$j < $other_len && ! $other_changed[$j]');
$j++;
if ($j < $other_len && $other_changed[$j]) {
$corresponding = $i;
while ($j < $other_len && $other_changed[$j]) {
$j++;
}
}
}
} while ($runlength != $i - $start);
/*
* If possible, move the fully-merged run of changes
* back to a corresponding run in the other file.
*/
while ($corresponding < $i) {
$changed[--$start] = 1;
$changed[--$i] = 0;
$this::USE_ASSERTS && assert('$j > 0');
while ($other_changed[--$j]) {
continue;
}
$this::USE_ASSERTS && assert('$j >= 0 && !$other_changed[$j]');
}
}
}