/* * Copyright 1997-2000 Perforce Software. All rights reserved. * * This file is part of Perforce - the FAST SCM System. * * Diff code written by James Strickland, May 1997. */ /* * This file implements the algorithm described in * "An O(ND) Difference Algorithm and Its Variations", Eugene W Myers, * Algorithmica (1986) 1: 251-266 * * Most of the notation used in the paper is used here; sequences are numbered * 1..N, 1..M, Delta=N-M, furthest reaching D-paths are stored in * SymmetricVector types fV and rV, diagonal indices are k. * * The main deviation in notation is that I use the same indices * in calculating the reverse furthest reaching paths as I do for * the forward paths. Thus the k forward diagonal is the k-Delta * reverse diagonal. * * The paper uses the term "snake", which likely originated from the game * Snakes & Ladders. A snake is a diagonal in the edit graph; this corresponds * to a common subsequence of the two sequences (i.e. a bunch of contiguous * matching lines). Note that on p263 of the paper a snake is taken to be * from (x,y) to (u,v) where x<=u, but p261 states x>=u. This is confusing * at best. :-) Here, I take snakes to be from (x,y) to (u,v), x<=u. * * The overall structure of the algorithm is based on divide and conquer; * the longest common subsequence (LCS) of the whole is composed of * the LCS of the first part plus the "snake in the middle" * (found using FindSnake) plus the LCS of the last part. * * The FindSnake algorithm exhibits the following properties: * - the furthest reaching D-path is calculated for incrementing D, * starting at D=0 * - for each value of D information needs to be stored for * "diagonals" k=-D,..,D in steps of 2 * (well, not quite -- from k=mink,..,maxk where mink>=-D and maxk<=D) * - the value for a given D and k is based on the value for D-1 * and k-1, k+1 * * Visually: * k = -3 -2 -1 0 1 2 3 * D=0 * * D=1 * ^ * * D=2 * ^ * ^ * * D=3 * ^ * ^ * ^ * * * and so on. * denotes a value computed at that iteration, ^ a value * computed at the previous iteration which is consulted in the calculation * of values on either side. * * Storing the whole table requires O(D^2) space, which is untenable * for large D (consider two completely different 100 000 line files!). * * The algorithm guarantees optimal output (largest "longest common * subsequence" (LCS) or shortest edit script) EXCEPT when a heuristic is * used to avoid the huge cost of optimal output. (Remember those two * completely different 100 000 line files? Well, what if I told you there * was only one line in common, but I didn't tell you where? You'd have to * make 5 billion comparisons (10^10/2) to find it in the average case.) * * There may be more than one output which is optimal! * Since the three way merge algorithm compares the output of two diffs, * it's important that we produce a canonical form of output. This is * done in ApplyForwardBias(). * * For example, to change AbBbC to AbC you can either delete bB or Bb. * This corresponds to either matching Ab and C or matching A and bC. * If the algorithm chooses A and bC then ApplyForwardBias() will * extend A to Ab and shorten bC to C, thus matching Ab and bC, giving * the answer "delete Bb" rather than "delete bB". In practical terms * "b" is usually a blank line, but it can in fact be any line * or block of lines. * */ #ifndef DEBUGLEVEL #define DEBUGLEVEL 0 #endif #if DEBUGLEVEL > 0 #include <iostream.h> #endif #define NEED_TYPES #include <stdhdrs.h> #include <error.h> #include <strbuf.h> #include <readfile.h> #include <debug.h> #include <tunable.h> #include "diffsp.h" #include "diffan.h" #if DEBUGLEVEL > 2 void SymmetricVector::Dump(LineNo D) { p4debug.printf("index: "); LineNo i; for(i=-D;i<=D;i+=2) p4debug.printf("%3d ",i); p4debug.printf("\nline: "); for(i=-D;i<=D;i+=2) { p4debug.printf("%3d ",this->operator[](i)); } p4debug.printf("\n"); }; #endif static inline LineNo mapxtoy(LineNo x, LineNo diagonal, LineNo originx, LineNo originy) { return x-(diagonal+(originx-originy)); } inline void DiffAnalyze::FollowDiagonal( LineNo &x, LineNo &y, LineNo endx, LineNo endy) { while( x<endx && y<endy && A->ProbablyEqual(x,B,y) ) ++x,++y; } inline void DiffAnalyze::FollowReverseDiagonal( LineNo &x, LineNo &y, LineNo startx, LineNo starty) { while( x>startx && y>starty && A->ProbablyEqual(x-1,B,y-1) ) --x,--y; } // FindSnake normally returns the middle snake (potentially zero length) // of the longest common subsequence, but is free to return *any* snake // (other than a zero length one at endx,endy) if there is a heuristic // which suggests doing so would be a Good Idea. // // The correctness of the algorithm does not depend on the middle snake // being returned, only the optimality of the output is potentially affected. // // Note that this function should not be called with maxD==0. We could add // a test here, but the only time maxD should be zero is when one of the inputs // is zero length, and there's a short-circuit test for that at a higher level. inline void DiffAnalyze::FindSnake( Snake &s, LineNo startx, LineNo starty, LineNo endx, LineNo endy) { const LineNo N = endx-startx; const LineNo M = endy-starty; const LineNo Delta = N - M; // note that ==1 wouldn't work (-1%2==-1) const int DeltaEven = ((Delta%2)==0); #if DEBUGLEVEL > 1 p4debug.printf("FindSnake(%d,%d,%d,%d)\n",startx,starty,endx,endy); if(N<=0||M<=0||maxD<=0) { p4debug.printf("N<=0 or M<=0 or maxD<=0 - that's not good!\n"); fflush(stdout); abort(); } #endif // initialize values for D=0 fV[0]=s.x=s.u=startx; s.y=s.v=starty; // advances s.u,s.v to end of snake FollowDiagonal(s.u,s.v,endx,endy); // Heuristic: return immediately with initial prefix if(s.u>s.x) { #if DEBUGLEVEL > 2 p4debug.printf("FindSnake returning with initial prefix %d to %d\n",s.x,s.u); #endif return; } rV[0]=s.x=s.u=endx; s.y=s.v=endy; // s.x,s.y set to start of snake FollowReverseDiagonal(s.x,s.y,startx,starty); // Heuristic: return immediately with initial suffix if(s.u>s.x) { #if DEBUGLEVEL > 2 p4debug.printf("FindSnake returning with initial suffix %d to %d\n",s.x,s.u); #endif return; } for(LineNo D=1; D<=maxD; D++) { const LineNo minkF = (D<=M) ? -D : -(2*M-D); const LineNo maxkF = (D<=N) ? D : 2*N-D; const LineNo minkR = -maxkF; const LineNo maxkR = -minkF; LineNo k; #if DEBUGLEVEL > 3 p4debug.printf("D=%d Forward min,max (%d,%d) Reverse min,max (%d,%d)\n",D,minkF,maxkF,minkR,maxkR); #endif // search from top left of edit graph ("forward") for(k=minkF;k<=maxkF;k+=2) { if(k==minkF || (k!=maxkF && fV[k+1] > fV[k-1] ) ) s.x=fV[k+1]; // down in edit graph else s.x=fV[k-1]+1; // to the right in edit graph // Follow the (possibly 0 length) diagonal in the // edit graph (aka snake) s.u=s.x; s.v=mapxtoy(s.u,k,startx,starty); // advances s.u,s.v to end of snake FollowDiagonal(s.u,s.v,endx,endy); if( !DeltaEven ) { // Delta odd // check if path overlaps end of furthest reaching // reverse (D-1)-path const LineNo DR=D-1; const LineNo pminkR = (DR<=N) ? -DR : -(2*N-DR); const LineNo pmaxkR = (DR<=M) ? DR : 2*M-DR; #if DEBUGLEVEL > 3 p4debug.printf("DR=%d Reverse min,max (%d,%d)\n",DR,pminkR,pmaxkR); #endif if(k-Delta>=pminkR && k-Delta<=pmaxkR) { #if DEBUGLEVEL > 3 p4debug.printf("s.u %d rv[%d] %d\n",s.u,k-Delta,rV[k-Delta]); #endif if( s.u >= rV[k-Delta] ) { // fill in the fields of s which haven't // already been filled in s.y=mapxtoy(s.x,k,startx,starty); #if DEBUGLEVEL > 2 p4debug.printf("FindSnake returning during forward search (%d,%d) to (%d,%d)\n",s.x,s.y,s.u,s.v); #endif return; // finished! } } } fV[k]=s.u; // ok, now set it to the end of the snake } #if DEBUGLEVEL > 2 p4debug.printf("Forward D=%d\n",D); fV.Dump(D); #endif // search from bottom right of edit graph ("reverse") for(k=minkR;k<=maxkR;k+=2) { if(k==maxkR || (k!=minkR && rV[k-1] < rV[k+1] ) ) s.u=rV[k-1]; // up else s.u=rV[k+1]-1; // to the left s.x=s.u; s.y=mapxtoy(s.x,k,endx,endy); FollowReverseDiagonal(s.x,s.y,startx,starty); // check if path overlaps end of furthest reaching // forward D-path if( DeltaEven ) { if( k+Delta>=minkF && k+Delta<=maxkF ) { #if DEBUGLEVEL > 3 p4debug.printf("s.x %d fV[%d] %d\n",s.x,k+Delta,fV[k+Delta]); #endif if( s.x <= fV[k+Delta]) { // fill in the fields of s which haven't // already been filled in s.v=mapxtoy(s.u,k,endx,endy); #if DEBUGLEVEL > 2 p4debug.printf("FindSnake returning during reverse search (%d,%d) to (%d,%d)\n",s.x,s.y,s.u,s.v); #endif return; // finished! } } } rV[k]=s.x; // ok, now set it to the end of the snake } #if DEBUGLEVEL > 2 p4debug.printf("Reverse D=%d\n",D); rV.Dump(D); #endif } // we only get to this point if we've failed to make ends meet within // maxD iterations. Just return the midpoint so that we're sure we're // doing log N rather than N. If we're fortunate enough to have // stumbled upon a snake in the middle, then extend it to its // maximum length. s.x=s.u=startx+(endx-startx)/2; s.y=s.v=starty+(endy-starty)/2; #if DEBUGLEVEL > 2 p4debug.printf("exceeded maxD midpt (%d,%d)\n",s.x,s.y); #endif FollowReverseDiagonal(s.x,s.y,startx,starty); FollowDiagonal(s.u,s.v,endx,endy); #if DEBUGLEVEL > 2 p4debug.printf("after extending: (%d,%d) to (%d,%d)\n",s.x,s.y,s.u,s.v); #endif } void DiffAnalyze::LCS( LineNo startx, LineNo starty, LineNo endx, LineNo endy) { Snake s; FindSnake(s,startx,starty,endx,endy); // s is a reference parameter for speed if( s.x > startx && s.y > starty) { #if DEBUGLEVEL > 0 p4debug.printf("LCS %d , %d , %d , %d\n",startx,starty,s.x,s.y); if(s.x==endx&&s.y==endy) { p4debug.printf("INFINITE RECURSION!\n"); fflush(stdout); abort(); } #endif LCS(startx,starty,s.x,s.y); } if(s.u>s.x) { // snake has nonzero length #if DEBUGLEVEL > 1 int snakes_added=0; p4debug.printf("SNAKE (%d,%d) to (%d,%d)\n",s.x,s.y,s.u,s.v); #endif // verify actual sequence contents, splitting snake up if there // are any pairs of lines which are ProbablyEqual() but not Equal() LineNo cx,cy; for(cx=s.x, cy=s.y; cx < s.u ; cx++,cy++) { for(s.x=cx,s.y=cy;cx<s.u && A->Equal(cx,B,cy); cx++,cy++) ; if(cx>s.x) { // nonzero length snake to add Snake *toadd = new Snake; toadd->next=0; toadd->x=s.x; toadd->y=s.y; toadd->u=cx; toadd->v=cy; #if DEBUGLEVEL > 1 snakes_added++; p4debug.printf("Adding snake (%d,%d) to (%d,%d)\n",toadd->x,toadd->y,toadd->u,toadd->v); #endif // add snake to end of linked list if(FirstSnake) { // already found first one LastSnake->next=toadd; LastSnake=toadd; } else { FirstSnake=LastSnake=toadd; } } } #if DEBUGLEVEL > 1 if(snakes_added==0) p4debug.printf("SNAKE WAS COMPLETELY BOGUS!!!\n"); if(snakes_added>1) p4debug.printf("Snake was broken into %d parts!\n",snakes_added); #endif } if( endx > s.u && endy > s.v) { #if DEBUGLEVEL > 0 p4debug.printf("LCS %d , %d , %d , %d\n",s.u,s.v,endx,endy); if(s.u==startx&&s.v==starty) { p4debug.printf("INFINITE RECURSION!\n"); abort(); } #endif LCS(s.u,s.v,endx,endy); } } /* * DiffAnalyze::BracketSnake() - put on leading and trailing Snakes */ void DiffAnalyze::BracketSnake() { Snake *s; // If the first snake doesn't include the first lines of // both files, make a "null" snake for the beginning. // Note that if there is no snake (files completely unmatched) // we'll add a start snake anyhow. s = FirstSnake; if( !s || s->x || s->y ) { Snake *toadd = new Snake; toadd->x = toadd->u = toadd->y = toadd->v = 0; toadd->next = s; if( !s ) LastSnake = toadd; FirstSnake = toadd; } // If the last snake doesn't include the last lines of // both files, make a "null" snake for the end. // If we added a start snake for completely unmatched files // above, then we'll only wind up adding a tailing snake // if the files are not both empty. s = LastSnake; if( s->u < A->Lines() || s->v < B->Lines() ) { Snake *toadd = new Snake; toadd->x = toadd->u = A->Lines(); toadd->y = toadd->v = B->Lines(); toadd->next = 0; s->next = toadd; LastSnake = toadd; } } void DiffAnalyze::ApplyForwardBias() { Snake *s,*next; LineNo endx = A->Lines(); LineNo endy = B->Lines(); // Traverse the list of snakes, extending any which can be extended. // If the current snake extends into the next snake, then that snake // is shortened. If the second snake is totally consumed then it is // removed. for( s = FirstSnake, next = s->next; next; s = next, next = next->next ) { while( s->u < endx && s->v < endy && A->Equal(s->u,B,s->v) ) { // start stretching current chunk ++s->u, ++s->v; // if we are in an adjacent chunk then start compacting it if( s->u > next->x || s->v > next->y ) { ++next->x, ++next->y; // if its totally consumed remove it (unless its the last). if( next->x == next->u && next != LastSnake ) { // handle special case of snake completely eating up the // next snake - but don't delete the last snake! s->next = next->next; delete next; next = s->next; } } } } } DiffAnalyze::DiffAnalyze( Sequence *fromFile, Sequence *toFile, int fastMaxD ) { A = fromFile; B = toFile; // Calculate a limit on the amount of searching FindSnake does, so as to have // a reasonable upper bound on compute time (and space, although that's less relevant; // it's only 2*maxD elements of length LineNo, i.e. typically 8*maxD bytes). // FindSnake does a two-sided search, -D to +D, hence the variable name maxD // and its default value of half the sum of the number of lines in the two files. // With that value maxD does not limit the search in FindSnake. // If this value is beyond what results in a "reasonable" compute time // on modern hardware, then maxD is adjusted downwards, trading off potentially // poorer diff output for a reasonable running time. The way to determine // the constants used here is to determine the compute time given the // worst case inputs, which are two files which are completely different. // maxD calculation change // // This new calculation reduces maxD for bigger files. // // Added sLimit: // // "sLimit" is used to control the value of maxD so that as avgLines // becomes large maxD begins to tail off (reducing search time). // For RCS files we need maxD to tail off more rapidly as its important // that submit/checkin happens as quick as possible. // // An upper limit of 50000 lines of code has also been factored in to // cause maxD to significantly reduce the searching that FindSnake // will do. LineNo avgLines = (A->Lines() + B->Lines())/2; const int sThresh = p4tunable.Get( P4TUNE_DIFF_STHRESH ); const int sLimit1 = p4tunable.Get( P4TUNE_DIFF_SLIMIT1 ); const int sLimit2 = p4tunable.Get( P4TUNE_DIFF_SLIMIT2 ); const int sLimit = (!fastMaxD && avgLines < sThresh) ? sLimit2 : sLimit1; maxD = sLimit / (avgLines ? avgLines : 1); if( maxD > avgLines ) maxD = avgLines; if( maxD < 42 ) maxD = 42; // fV and rV are shared by all invocations of FindSnake() // This is required in order for the algorithm to take O(D) space // rather than O(D^2). (See Myers p264.) Thus we just make fV and // rV large enough to handle the worst case, then leave 'em alone. fV.Resize( maxD ); rV.Resize( maxD ); FirstSnake=LastSnake=0; #if DEBUGLEVEL > 0 p4debug.printf("N %d M %d maxD %d\n",A->Lines(),B->Lines(),maxD); #endif // find the longest common subsequence - note that there will // never be a common subsequence if one of the files is empty, // so don't call LCS in this case (this is not just an optimization, // this is necessary for correctness!) if(A->Lines() > 0 && B->Lines() > 0) LCS(0, 0, A->Lines(), B->Lines()); // Free vectors now that we will not need them anymore fV.Resize( 0 ); rV.Resize( 0 ); // Ensure there's a snake at the beginning by adding a zero-length // snake if necessary - this makes life easier for the code // which processes the list of snakes. It also is used by // ApplyForwardBias() BracketSnake(); // Select optimal output which is of a canonical form so that // the 3 way merge algorithm, which compares diff outputs, // does not see any bogus conflicts. // The term "forward bias" is intended to mean that given a choice // between two optimal outputs the result will be the one which // you would find by a forward search, rather than a reverse search. ApplyForwardBias(); } DiffAnalyze::~DiffAnalyze() { while( FirstSnake ) { Snake *next = FirstSnake->next; delete FirstSnake; FirstSnake = next; } }