/* ** Copyright (c) 2007 D. Richard Hipp ** ** This program is free software; you can redistribute it and/or ** modify it under the terms of the Simplified BSD License (also ** known as the "2-Clause License" or "FreeBSD License".) ** This program is distributed in the hope that it will be useful, ** but without any warranty; without even the implied warranty of ** merchantability or fitness for a particular purpose. ** ** Author contact information: ** drh@hwaci.com ** http://www.hwaci.com/drh/ ** ******************************************************************************* ** ** This file contains code used to compute a "diff" between two ** text files. */ #include "config.h" #include "diff.h" #include <assert.h> /* ** Maximum length of a line in a text file. (8192) */ #define LENGTH_MASK_SZ 13 #define LENGTH_MASK ((1<<LENGTH_MASK_SZ)-1) /* ** Information about each line of a file being diffed. ** ** The lower LENGTH_MASK_SZ bits of the hash (DLine.h) are the length ** of the line. If any line is longer than LENGTH_MASK characters, ** the file is considered binary. */ typedef struct DLine DLine; struct DLine { const char *z; /* The text of the line */ unsigned int h; /* Hash of the line */ unsigned int iNext; /* 1+(Index of next line with same the same hash) */ /* an array of DLine elements services two purposes. The fields ** above are one per line of input text. But each entry is also ** a bucket in a hash table, as follows: */ unsigned int iHash; /* 1+(first entry in the hash chain) */ }; /* ** A context for running a diff. */ typedef struct DContext DContext; struct DContext { int *aEdit; /* Array of copy/delete/insert triples */ int nEdit; /* Number of integers (3x num of triples) in aEdit[] */ int nEditAlloc; /* Space allocated for aEdit[] */ DLine *aFrom; /* File on left side of the diff */ int nFrom; /* Number of lines in aFrom[] */ DLine *aTo; /* File on right side of the diff */ int nTo; /* Number of lines in aTo[] */ }; /* ** Return an array of DLine objects containing a pointer to the ** start of each line and a hash of that line. The lower ** bits of the hash store the length of each line. ** ** Trailing whitespace is removed from each line. ** ** Return 0 if the file is binary or contains a line that is ** too long. */ static DLine *break_into_lines(const char *z, int n, int *pnLine){ int nLine, i, j, k, x; unsigned int h, h2; DLine *a; /* Count the number of lines. Allocate space to hold ** the returned array. */ for(i=j=0, nLine=1; i<n; i++, j++){ int c = z[i]; if( c==0 ){ return 0; } if( c=='\n' && z[i+1]!=0 ){ nLine++; if( j>LENGTH_MASK ){ return 0; } j = 0; } } if( j>LENGTH_MASK ){ return 0; } a = malloc( nLine*sizeof(a[0]) ); if( a==0 ) fossil_panic("out of memory"); memset(a, 0, nLine*sizeof(a[0]) ); /* Fill in the array */ for(i=0; i<nLine; i++){ a[i].z = z; for(j=0; z[j] && z[j]!='\n'; j++){} for(k=j; k>0 && isspace(z[k-1]); k--){} for(h=0, x=0; x<k; x++){ h = h ^ (h<<2) ^ z[x]; } a[i].h = h = (h<<LENGTH_MASK_SZ) | k;; h2 = h % nLine; a[i].iNext = a[h2].iHash; a[h2].iHash = i+1; z += j+1; } /* Return results */ *pnLine = nLine; return a; } /* ** Return true if two DLine elements are identical. */ static int same_dline(DLine *pA, DLine *pB){ return pA->h==pB->h && memcmp(pA->z,pB->z,pA->h & LENGTH_MASK)==0; } /* ** Append a single line of "diff" output to pOut. */ static void appendDiffLine(Blob *pOut, char *zPrefix, DLine *pLine){ blob_append(pOut, zPrefix, 1); blob_append(pOut, pLine->z, pLine->h & LENGTH_MASK); blob_append(pOut, "\n", 1); } /* ** Expand the size of aEdit[] array to hold nEdit elements. */ static void expandEdit(DContext *p, int nEdit){ int *a; a = realloc(p->aEdit, nEdit*sizeof(int)); if( a==0 ){ free( p->aEdit ); p->nEdit = 0; nEdit = 0; } p->aEdit = a; p->nEditAlloc = nEdit; } /* ** Append a new COPY/DELETE/INSERT triple. */ static void appendTriple(DContext *p, int nCopy, int nDel, int nIns){ /* printf("APPEND %d/%d/%d\n", nCopy, nDel, nIns); */ if( p->nEdit>=3 ){ if( p->aEdit[p->nEdit-1]==0 ){ if( p->aEdit[p->nEdit-2]==0 ){ p->aEdit[p->nEdit-3] += nCopy; p->aEdit[p->nEdit-2] += nDel; p->aEdit[p->nEdit-1] += nIns; return; } if( nCopy==0 ){ p->aEdit[p->nEdit-2] += nDel; p->aEdit[p->nEdit-1] += nIns; return; } } if( nCopy==0 && nDel==0 ){ p->aEdit[p->nEdit-1] += nIns; return; } } if( p->nEdit+3>p->nEditAlloc ){ expandEdit(p, p->nEdit*2 + 15); if( p->aEdit==0 ) return; } p->aEdit[p->nEdit++] = nCopy; p->aEdit[p->nEdit++] = nDel; p->aEdit[p->nEdit++] = nIns; } /* ** Given a diff context in which the aEdit[] array has been filled ** in, compute a context diff into pOut. */ static void contextDiff(DContext *p, Blob *pOut, int nContext){ DLine *A; /* Left side of the diff */ DLine *B; /* Right side of the diff */ int a = 0; /* Index of next line in A[] */ int b = 0; /* Index of next line in B[] */ int *R; /* Array of COPY/DELETE/INSERT triples */ int r; /* Index into R[] */ int nr; /* Number of COPY/DELETE/INSERT triples to process */ int mxr; /* Maximum value for r */ int na, nb; /* Number of lines shown from A and B */ int i, j; /* Loop counters */ int m; /* Number of lines to output */ int skip; /* Number of lines to skip */ A = p->aFrom; B = p->aTo; R = p->aEdit; mxr = p->nEdit; while( mxr>2 && R[mxr-1]==0 && R[mxr-2]==0 ){ mxr -= 3; } for(r=0; r<mxr; r += 3*nr){ /* Figure out how many triples to show in a single block */ for(nr=1; R[r+nr*3]>0 && R[r+nr*3]<nContext*2; nr++){} /* printf("r=%d nr=%d\n", r, nr); */ /* For the current block comprising nr triples, figure out ** how many lines of A and B are to be displayed */ if( R[r]>nContext ){ na = nb = nContext; skip = R[r] - nContext; }else{ na = nb = R[r]; skip = 0; } for(i=0; i<nr; i++){ na += R[r+i*3+1]; nb += R[r+i*3+2]; } if( R[r+nr*3]>nContext ){ na += nContext; nb += nContext; }else{ na += R[r+nr*3]; nb += R[r+nr*3]; } for(i=1; i<nr; i++){ na += R[r+i*3]; nb += R[r+i*3]; } blob_appendf(pOut,"@@ -%d,%d +%d,%d @@\n", a+skip+1, na, b+skip+1, nb); /* Show the initial common area */ a += skip; b += skip; m = R[r] - skip; for(j=0; j<m; j++){ appendDiffLine(pOut, " ", &A[a+j]); } a += m; b += m; /* Show the differences */ for(i=0; i<nr; i++){ m = R[r+i*3+1]; for(j=0; j<m; j++){ appendDiffLine(pOut, "-", &A[a+j]); } a += m; m = R[r+i*3+2]; for(j=0; j<m; j++){ appendDiffLine(pOut, "+", &B[b+j]); } b += m; if( i<nr-1 ){ m = R[r+i*3+3]; for(j=0; j<m; j++){ appendDiffLine(pOut, " ", &B[b+j]); } b += m; a += m; } } /* Show the final common area */ assert( nr==i ); m = R[r+nr*3]; if( m>nContext ) m = nContext; for(j=0; j<m; j++){ appendDiffLine(pOut, " ", &B[b+j]); } } } /* ** Compare two blocks of text on lines iS1 through iE1-1 of the aFrom[] ** file and lines iS2 through iE2-1 of the aTo[] file. Locate a sequence ** of lines in these two blocks that are exactly the same. Return ** the bounds of the matching sequence. */ static void longestCommonSequence( DContext *p, /* Two files being compared */ int iS1, int iE1, /* Range of lines in p->aFrom[] */ int iS2, int iE2, /* Range of lines in p->aTo[] */ int *piSX, int *piEX, /* Write p->aFrom[] common segment here */ int *piSY, int *piEY /* Write p->aTo[] common segment here */ ){ double bestScore = -1e30; /* Best score seen so far */ int i, j; /* Loop counters */ int iSX, iSY, iEX, iEY; /* Current match */ double score; /* Current score */ int skew; /* How lopsided is the match */ int dist; /* Distance of match from center */ int mid; /* Center of the span */ int iSXb, iSYb, iEXb, iEYb; /* Best match so far */ int iSXp, iSYp, iEXp, iEYp; /* Previous match */ iSXb = iSXp = iS1; iEXb = iEXp = iS1; iSYb = iSYp = iS2; iEYb = iEYp = iS2; mid = (iE1 + iS1)/2; for(i=iS1; i<iE1; i++){ int limit = 0; j = p->aTo[p->aFrom[i].h % p->nTo].iHash; while( j>0 && (j-1<iS2 || j>=iE2 || !same_dline(&p->aFrom[i], &p->aTo[j-1])) ){ if( limit++ > 10 ){ j = 0; break; } j = p->aTo[j-1].iNext; } if( j==0 ) continue; assert( i>=iSXb && i>=iSXp ); if( i<iEXb && j>=iSYb && j<iEYb ) continue; if( i<iEXp && j>=iSYp && j<iEYp ) continue; iSX = i; iSY = j-1; while( iSX>iS1 && iSY>iS2 && same_dline(&p->aFrom[iSX-1],&p->aTo[iSY-1]) ){ iSX--; iSY--; } iEX = i+1; iEY = j; while( iEX<iE1 && iEY<iE2 && same_dline(&p->aFrom[iEX],&p->aTo[iEY]) ){ iEX++; iEY++; } skew = (iSX-iS1) - (iSY-iS2); if( skew<0 ) skew = -skew; dist = (iSX+iEX)/2 - mid; if( dist<0 ) dist = -dist; score = (iEX - iSX) - 0.05*skew - 0.05*dist; if( score>bestScore ){ bestScore = score; iSXb = iSX; iSYb = iSY; iEXb = iEX; iEYb = iEY; }else{ iSXp = iSX; iSYp = iSY; iEXp = iEX; iEYp = iEY; } } *piSX = iSXb; *piSY = iSYb; *piEX = iEXb; *piEY = iEYb; /* printf("LCS(%d..%d/%d..%d) = %d..%d/%d..%d\n", iS1, iE1, iS2, iE2, *piSX, *piEX, *piSY, *piEY); */ } /* ** Do a single step in the difference. Compute a sequence of ** copy/delete/insert steps that will convert lines iS1 through iE1-1 of ** the input into lines iS2 through iE2-1 of the output and write ** that sequence into the difference context. ** ** The algorithm is to find a block of common text near the middle ** of the two segments being diffed. Then recursively compute ** differences on the blocks before and after that common segment. ** Special cases apply if either input segment is empty or if the ** two segments have no text in common. */ static void diff_step(DContext *p, int iS1, int iE1, int iS2, int iE2){ int iSX, iEX, iSY, iEY; if( iE1<=iS1 ){ /* The first segment is empty */ if( iE2>iS2 ){ appendTriple(p, 0, 0, iE2-iS2); } return; } if( iE2<=iS2 ){ /* The second segment is empty */ appendTriple(p, 0, iE1-iS1, 0); return; } /* Find the longest matching segment between the two sequences */ longestCommonSequence(p, iS1, iE1, iS2, iE2, &iSX, &iEX, &iSY, &iEY); if( iEX>iSX ){ /* A common segement has been found. ** Recursively diff either side of the matching segment */ diff_step(p, iS1, iSX, iS2, iSY); if( iEX>iSX ){ appendTriple(p, iEX - iSX, 0, 0); } diff_step(p, iEX, iE1, iEY, iE2); }else{ /* The two segments have nothing in common. Delete the first then ** insert the second. */ appendTriple(p, 0, iE1-iS1, iE2-iS2); } } /* ** Compute the differences between two files already loaded into ** the DContext structure. ** ** A divide and conquer technique is used. We look for a large ** block of common text that is in the middle of both files. Then ** compute the difference on those parts of the file before and ** after the common block. This technique is fast, but it does ** not necessarily generate the minimum difference set. On the ** other hand, we do not need a minimum difference set, only one ** that makes sense to human readers, which this algorithm does. ** ** Any common text at the beginning and end of the two files is ** removed before starting the divide-and-conquer algorithm. */ static void diff_all(DContext *p){ int mnE, iS, iE1, iE2; /* Carve off the common header and footer */ iE1 = p->nFrom; iE2 = p->nTo; while( iE1>0 && iE2>0 && same_dline(&p->aFrom[iE1-1], &p->aTo[iE2-1]) ){ iE1--; iE2--; } mnE = iE1<iE2 ? iE1 : iE2; for(iS=0; iS<mnE && same_dline(&p->aFrom[iS],&p->aTo[iS]); iS++){} /* do the difference */ if( iS>0 ){ appendTriple(p, iS, 0, 0); } diff_step(p, iS, iE1, iS, iE2); if( iE1<p->nFrom ){ appendTriple(p, p->nFrom - iE1, 0, 0); } /* Terminate the COPY/DELETE/INSERT triples with three zeros */ expandEdit(p, p->nEdit+3); if( p->aEdit ){ p->aEdit[p->nEdit++] = 0; p->aEdit[p->nEdit++] = 0; p->aEdit[p->nEdit++] = 0; } } /* ** Generate a report of the differences between files pA and pB. ** If pOut is not NULL then a unified diff is appended there. It ** is assumed that pOut has already been initialized. If pOut is ** NULL, then a pointer to an array of integers is returned. ** The integers come in triples. For each triple, ** the elements are the number of lines copied, the number of ** lines deleted, and the number of lines inserted. The vector ** is terminated by a triple of all zeros. ** ** This diff utility does not work on binary files. If a binary ** file is encountered, 0 is returned and pOut is written with ** text "cannot compute difference between binary files". */ int *text_diff( Blob *pA_Blob, /* FROM file */ Blob *pB_Blob, /* TO file */ Blob *pOut, /* Write unified diff here if not NULL */ int nContext /* Amount of context to unified diff */ ){ DContext c; /* Prepare the input files */ memset(&c, 0, sizeof(c)); c.aFrom = break_into_lines(blob_str(pA_Blob), blob_size(pA_Blob), &c.nFrom); c.aTo = break_into_lines(blob_str(pB_Blob), blob_size(pB_Blob), &c.nTo); if( c.aFrom==0 || c.aTo==0 ){ free(c.aFrom); free(c.aTo); if( pOut ){ blob_appendf(pOut, "cannot compute difference between binary files\n"); } return 0; } /* Compute the difference */ diff_all(&c); if( pOut ){ /* Compute a context diff if requested */ contextDiff(&c, pOut, nContext); free(c.aFrom); free(c.aTo); free(c.aEdit); return 0; }else{ /* If a context diff is not requested, then return the ** array of COPY/DELETE/INSERT triples. */ free(c.aFrom); free(c.aTo); return c.aEdit; } } /* ** COMMAND: test-rawdiff */ void test_rawdiff_cmd(void){ Blob a, b; int r; int i; int *R; if( g.argc<4 ) usage("FILE1 FILE2 ..."); blob_read_from_file(&a, g.argv[2]); for(i=3; i<g.argc; i++){ if( i>3 ) printf("-------------------------------\n"); blob_read_from_file(&b, g.argv[i]); R = text_diff(&a, &b, 0, 0); for(r=0; R[r] || R[r+1] || R[r+2]; r += 3){ printf(" copy %4d delete %4d insert %4d\n", R[r], R[r+1], R[r+2]); } /* free(R); */ blob_reset(&b); } } /* ** COMMAND: test-udiff */ void test_udiff_cmd(void){ Blob a, b, out; if( g.argc!=4 ) usage("FILE1 FILE2"); blob_read_from_file(&a, g.argv[2]); blob_read_from_file(&b, g.argv[3]); blob_zero(&out); text_diff(&a, &b, &out, 3); blob_write_to_file(&out, "-"); } /************************************************************************** ** The basic difference engine is above. What follows is the annotation ** engine. Both are in the same file since they share many components. */ /* ** The status of an annotation operation is recorded by an instance ** of the following structure. */ typedef struct Annotator Annotator; struct Annotator { DContext c; /* The diff-engine context */ struct { /* Lines of the original files... */ const char *z; /* The text of the line */ int n; /* Number of bytes (omitting trailing space and \n) */ const char *zSrc; /* Tag showing origin of this line */ } *aOrig; int nOrig; /* Number of elements in aOrig[] */ int nNoSrc; /* Number of entries where aOrig[].zSrc==NULL */ }; /* ** Initialize the annotation process by specifying the file that is ** to be annotated. The annotator takes control of the input Blob and ** will release it when it is finished with it. */ static int annotation_start(Annotator *p, Blob *pInput){ int i; memset(p, 0, sizeof(*p)); p->c.aTo = break_into_lines(blob_str(pInput), blob_size(pInput), &p->c.nTo); if( p->c.aTo==0 ){ return 1; } p->aOrig = malloc( sizeof(p->aOrig[0])*p->c.nTo ); if( p->aOrig==0 ) fossil_panic("out of memory"); for(i=0; i<p->c.nTo; i++){ p->aOrig[i].z = p->c.aTo[i].z; p->aOrig[i].n = p->c.aTo[i].h & LENGTH_MASK; p->aOrig[i].zSrc = 0; } p->nOrig = p->c.nTo; return 0; } /* ** The input pParent is the next most recent ancestor of the file ** being annotated. Do another step of the annotation. Return true ** if additional annotation is required. zPName is the tag to insert ** on each line of the file being annotated that was contributed by ** pParent. Memory to hold zPName is leaked. */ static int annotation_step(Annotator *p, Blob *pParent, char *zPName){ int i, j; int lnTo; /* Prepare the parent file to be diffed */ p->c.aFrom = break_into_lines(blob_str(pParent), blob_size(pParent), &p->c.nFrom); if( p->c.aFrom==0 ){ return 1; } /* Compute the differences going from pParent to the file being ** annotated. */ diff_all(&p->c); /* Where new lines are inserted on this difference, record the ** zPName as the source of the new line. */ for(i=lnTo=0; i<p->c.nEdit; i+=3){ for(j=0; j<p->c.aEdit[i]; j++, lnTo++){ p->aOrig[lnTo].zSrc = zPName; } lnTo += p->c.aEdit[i+2]; } /* Clear out the diff results */ free(p->c.aEdit); p->c.aEdit = 0; p->c.nEdit = 0; p->c.nEditAlloc = 0; /* Clear out the from file */ free(p->c.aFrom); blob_zero(pParent); /* Return no errors */ return 0; } /* ** COMMAND: test-annotate-step */ void test_annotate_step_cmd(void){ Blob orig, b; Annotator x; int i; if( g.argc<4 ) usage("RID1 RID2 ..."); db_must_be_within_tree(); blob_zero(&b); content_get(name_to_rid(g.argv[2]), &orig); if( annotation_start(&x, &orig) ){ fossil_fatal("binary file"); } for(i=3; i<g.argc; i++){ blob_zero(&b); content_get(name_to_rid(g.argv[i]), &b); if( annotation_step(&x, &b, g.argv[i-1]) ){ fossil_fatal("binary file"); } } for(i=0; i<x.nOrig; i++){ const char *zSrc = x.aOrig[i].zSrc; if( zSrc==0 ) zSrc = g.argv[g.argc-1]; printf("%10s: %.*s\n", zSrc, x.aOrig[i].n, x.aOrig[i].z); } } /* ** Compute a complete annotation on a file. The file is identified ** by its filename number (filename.fnid) and the baseline in which ** it was checked in (mlink.mid). */ static void annotate_file(Annotator *p, int fnid, int mid, int webLabel){ Blob toAnnotate; /* Text of the final version of the file */ Blob step; /* Text of previous revision */ int rid; /* Artifact ID of the file being annotated */ char *zLabel; /* Label to apply to a line */ Stmt q; /* Query returning all ancestor versions */ /* Initialize the annotation */ rid = db_int(0, "SELECT fid FROM mlink WHERE mid=%d AND fnid=%d",mid,fnid); if( rid==0 ){ fossil_panic("file #%d is unchanged in manifest #%d", fnid, mid); } if( !content_get(rid, &toAnnotate) ){ fossil_panic("unable to retrieve content of artifact #%d", rid); } db_multi_exec("CREATE TEMP TABLE ok(rid INTEGER PRIMARY KEY)"); compute_ancestors(mid, 1000000000); annotation_start(p, &toAnnotate); db_prepare(&q, "SELECT mlink.fid, blob.uuid, date(event.mtime), " " coalesce(event.euser,event.user) " " FROM mlink, blob, event" " WHERE mlink.fnid=%d" " AND mlink.mid IN ok" " AND blob.rid=mlink.mid" " AND event.objid=mlink.mid" " ORDER BY event.mtime DESC", fnid ); while( db_step(&q)==SQLITE_ROW ){ int pid = db_column_int(&q, 0); const char *zUuid = db_column_text(&q, 1); const char *zDate = db_column_text(&q, 2); const char *zUser = db_column_text(&q, 3); if( webLabel ){ zLabel = mprintf("<a href='%s/info/%s'>%.10s</a> %s %9.9s", g.zBaseURL, zUuid, zUuid, zDate, zUser); }else{ zLabel = mprintf("%.10s %s %9.9s", zUuid, zDate, zUser); } content_get(pid, &step); annotation_step(p, &step, zLabel); blob_reset(&step); } db_finalize(&q); } /* ** WEBPAGE: annotate ** ** Query parameters: ** ** checkin=ID The manifest ID at which to start the annotation ** filename=FILENAME The filename. */ void annotation_page(void){ int mid; int fnid; int i; Annotator ann; login_check_credentials(); if( !g.okRead ){ login_needed(); return; } mid = name_to_rid(PD("checkin","0")); fnid = db_int(0, "SELECT fnid FROM filename WHERE name=%Q", P("filename")); if( mid==0 || fnid==0 ){ fossil_redirect_home(); } if( !db_exists("SELECT 1 FROM mlink WHERE mid=%d AND fnid=%d",mid,fnid) ){ fossil_redirect_home(); } style_header("File Annotation"); annotate_file(&ann, fnid, mid, g.okHistory); @ <pre> for(i=0; i<ann.nOrig; i++){ ((char*)ann.aOrig[i].z)[ann.aOrig[i].n] = 0; @ %s(ann.aOrig[i].zSrc): %h(ann.aOrig[i].z) } @ </pre> style_footer(); } /* ** COMMAND: annotate ** ** %fossil annotate FILENAME ** ** Output the text of a file with markings to show when each line of ** the file was introduced. */ void annotate_cmd(void){ int fnid; /* Filename ID */ int fid; /* File instance ID */ int mid; /* Manifest where file was checked in */ Blob treename; /* FILENAME translated to canonical form */ char *zFilename; /* Cannonical filename */ Annotator ann; /* The annotation of the file */ int i; /* Loop counter */ db_must_be_within_tree(); if (g.argc<3) { usage("FILENAME"); } file_tree_name(g.argv[2], &treename, 1); zFilename = blob_str(&treename); fnid = db_int(0, "SELECT fnid FROM filename WHERE name=%Q", zFilename); if( fnid==0 ){ fossil_fatal("no such file: %s", zFilename); } fid = db_int(0, "SELECT rid FROM vfile WHERE pathname=%Q", zFilename); if( fid==0 ){ fossil_fatal("not part of current checkout: %s", zFilename); } mid = db_int(0, "SELECT mid FROM mlink WHERE fid=%d AND fnid=%d", fid, fnid); if( mid==0 ){ fossil_panic("unable to find manifest"); } annotate_file(&ann, fnid, mid, 0); for(i=0; i<ann.nOrig; i++){ printf("%s: %.*s\n", ann.aOrig[i].zSrc, ann.aOrig[i].n, ann.aOrig[i].z); } }