/* ** Copyright (c) 2007 D. Richard Hipp ** ** This program is free software; you can redistribute it and/or ** modify it under the terms of the GNU General Public ** License version 2 as published by the Free Software Foundation. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ** General Public License for more details. ** ** You should have received a copy of the GNU General Public ** License along with this library; if not, write to the ** Free Software Foundation, Inc., 59 Temple Place - Suite 330, ** Boston, MA 02111-1307, USA. ** ** Author contact information: ** drh@hwaci.com ** http://www.hwaci.com/drh/ ** ******************************************************************************* ** ** This module implements a 3-way merge */ #include "config.h" #include "merge3.h" #if 0 # define DEBUG1(X) X #else # define DEBUG1(X) #endif #if 0 #define DEBUG2(X) X /* ** For debugging: ** Print 16 characters of text from zBuf */ static const char *print16(const char *z){ int i; static char zBuf[20]; for(i=0; i<16; i++){ if( z[i]>=0x20 && z[i]<=0x7e ){ zBuf[i] = z[i]; }else{ zBuf[i] = '.'; } } zBuf[i] = 0; return zBuf; } #else # define DEBUG2(X) #endif /* ** Must be a 32-bit integer */ typedef unsigned int u32; /* ** Must be a 16-bit value */ typedef unsigned short int u16; /* ** The width of a hash window in bytes. The algorithm only works if this ** is a power of 2. */ #define NHASH 16 /* ** The current state of the rolling hash. ** ** z[] holds the values that have been hashed. z[] is a circular buffer. ** z[i] is the first entry and z[(i+NHASH-1)%NHASH] is the last entry of ** the window. ** ** Hash.a is the sum of all elements of hash.z[]. Hash.b is a weighted ** sum. Hash.b is z[i]*NHASH + z[i+1]*(NHASH-1) + ... + z[i+NHASH-1]*1. ** (Each index for z[] should be module NHASH, of course. The %NHASH operator ** is omitted in the prior expression for brevity.) */ typedef struct hash hash; struct hash { u16 a, b; /* Hash values */ u16 i; /* Start of the hash window */ char z[NHASH]; /* The values that have been hashed */ }; /* ** Initialize the rolling hash using the first NHASH characters of z[] */ static void hash_init(hash *pHash, const char *z){ u16 a, b, i; a = b = 0; for(i=0; iz[i] = z[i]; } pHash->a = a & 0xffff; pHash->b = b & 0xffff; pHash->i = 0; } /* ** Advance the rolling hash by a single character "c" */ static void hash_next(hash *pHash, int c){ u16 old = pHash->z[pHash->i]; pHash->z[pHash->i] = c; pHash->i = (pHash->i+1)&(NHASH-1); pHash->a = pHash->a - old + c; pHash->b = pHash->b - NHASH*old + pHash->a; } /* ** Return a 32-bit hash value */ static u32 hash_32bit(hash *pHash){ return (pHash->a & 0xffff) | (((u32)(pHash->b & 0xffff))<<16); } /* ** Maximum number of landmarks to set in the source file. */ #define MX_LANDMARK (1024*128) /* ** A mapping structure is used to record which parts of two ** files contain the same text. There are zero or more mapping ** entries in a mapping. Each entry maps a segment of text in ** the source file into a segment of the output file. ** ** fromFirst...fromLast -> toFirst...toLast ** ** Extra text might be inserted in the output file after a ** mapping. The nExtra parameter records the number of bytes ** of extra text to insert. */ typedef struct Mapping Mapping; struct Mapping { int nMap; int nUsed; struct Mapping_entry { int fromFirst, fromLast; int toFirst, toLast; int nExtra; } *aMap; }; /* ** Free malloced memory associated with a mapping. */ static void MappingClear(Mapping *p){ free(p->aMap); memset(p, 0, sizeof(*p)); } /* ** Add an entry to a mapping structure. The mapping is: ** ** a...b -> c...d ** ** The nExtra parameter is initially zero. It will be changed ** later if necessary. */ static void MappingInsert(Mapping *p, int a, int b, int c, int d){ struct Mapping_entry *pEntry; int i; for(i=0, pEntry=p->aMap; inUsed; i++, pEntry++){ if( pEntry->fromFirst==a && pEntry->fromLast==b && pEntry->toFirst==c ){ DEBUG2( printf("DUPLICATE: %6d..%-6d %6d..%d\n", a, b, c, d); ) return; } } if( p->nUsed>=p->nMap ){ p->nMap = p->nMap * 2 + 10; p->aMap = realloc(p->aMap, p->nMap*sizeof(p->aMap[0]) ); if( p->aMap==0 ) exit(1); } pEntry = &p->aMap[p->nUsed]; pEntry->fromFirst = a; pEntry->fromLast = b; pEntry->toFirst = c; pEntry->toLast = d; pEntry->nExtra = 0; p->nUsed++; } DEBUG1( /* ** For debugging purposes: ** Print the content of a mapping. */ static void MappingPrint(Mapping *pMap){ int i; struct Mapping_entry *p; for(i=0, p=pMap->aMap; inUsed; i++, p++){ printf("%6d..%-6d %6d..%-6d %d\n", p->fromFirst, p->fromLast, p->toFirst, p->toLast, p->nExtra); } } ) /* ** Remove deleted entries from a mapping. Deleted enties have ** an fromFirst of less than 0. */ static void MappingPurgeDeletedEntries(Mapping *p){ int i, j; for(i=j=0; inUsed; i++){ if( p->aMap[i].fromFirst<0 ) continue; if( jaMap[j] = p->aMap[i]; } j++; } p->nUsed = j; } /* ** Comparisons functions used for sorting elements of a Mapping */ static int intAbs(int x){ return x<0 ? -x : x; } static int compareSize(const void *a, const void *b){ const struct Mapping_entry *A = (const struct Mapping_entry*)a; const struct Mapping_entry *B = (const struct Mapping_entry*)b; int rc; rc = (B->fromLast - B->fromFirst) - (A->fromLast - A->fromFirst); if( rc==0 ){ rc = intAbs(A->toFirst - A->fromFirst) - intAbs(B->toFirst - B->fromFirst); } return rc; } static int compareFrom(const void *a, const void *b){ const struct Mapping_entry *A = (const struct Mapping_entry*)a; const struct Mapping_entry *B = (const struct Mapping_entry*)b; return A->fromFirst - B->fromFirst; } static int compareTo(const void *a, const void *b){ const struct Mapping_entry *A = (const struct Mapping_entry*)a; const struct Mapping_entry *B = (const struct Mapping_entry*)b; return A->toFirst - B->toFirst; } /* ** Routines for sorting the entries of a mapping. SortSize sorts ** the entries in order of decreasing size (largest first.) ** SortFrom and SortTo sort the entries in order of increasing ** fromFirst and toFirst. */ static void MappingSortSize(Mapping *p){ qsort(p->aMap, p->nUsed, sizeof(p->aMap[0]), compareSize); } static void MappingSortFrom(Mapping *p){ qsort(p->aMap, p->nUsed, sizeof(p->aMap[0]), compareFrom); } static void MappingSortTo(Mapping *p){ qsort(p->aMap, p->nUsed, sizeof(p->aMap[0]), compareTo); } /* ** Initialize pMap to contain a set of similarities between two files. */ static void MappingInit( const char *zSrc, /* The source or pattern file */ int lenSrc, /* Length of the source file */ const char *zOut, /* The target file */ int lenOut, /* Length of the target file */ Mapping *pMap /* Write the map of similaries here */ ){ int i, j, base, prefix; hash h; int *collide; int origLenOut = lenOut; struct Mapping_entry *aMap; int landmark[MX_LANDMARK]; /* ** Initialize the map */ memset(pMap, 0, sizeof(*pMap)); /* ** Find common prefix and suffix */ if( lenSrc<=NHASH || lenOut<=NHASH ){ MappingInsert(pMap, 0, 0, 0, 0); goto add_nextra; } for(i=0; i=NHASH ){ MappingInsert(pMap, 0, i-1, 0, i-1); lenSrc -= i; zSrc += i; lenOut -= i; zOut += i; if( lenSrc<=0 || lenOut<=0 ) goto add_nextra; prefix = i; }else{ prefix = 0; } for(i=1; i<=lenSrc && i<=lenOut && zSrc[lenSrc-i]==zOut[lenOut-i]; i++){} if( i>NHASH ){ MappingInsert(pMap, prefix+lenSrc-i+1, prefix+lenSrc-1, prefix+lenOut-i+1, prefix+lenOut-1); lenSrc -= i; lenOut -= i; } /* If the source file is very small, it means that we have no ** chance of ever finding any matches. We can leave early. */ if( lenSrc<=NHASH ) goto add_nextra; /* Compute the hash table used to locate matching sections in the ** source file. */ collide = malloc( lenSrc*sizeof(int)/NHASH ); if( collide==0 ) return; memset(landmark, -1, sizeof(landmark)); memset(collide, -1, lenSrc*sizeof(int)/NHASH ); for(i=0; i=0 ){ /* ** The hash window has identified a potential match against ** landmark block iBlock. But we need to investigate further. ** ** Look for a region in zOut that matches zSrc. Anchor the search ** at zSrc[iSrc] and zOut[base+i]. ** ** Set cnt equal to the length of the match and set ofst so that ** zSrc[ofst] is the first element of the match. */ int cnt, ofstSrc; int j, k, x, y; /* Beginning at iSrc, match forwards as far as we can. j counts ** the number of characters that match */ iSrc = iBlock*NHASH; for(j=0, x=iSrc, y=base+i; xNHASH ){ int ofstOut = base+i-k; DEBUG2( printf("COPY %6d..%-6d %6d..%d\n", prefix+ofstSrc, prefix+ofstSrc+cnt-1, prefix+ofstOut, prefix+ofstOut+cnt-1); ) MappingInsert(pMap, prefix+ofstSrc, prefix+ofstSrc+cnt-1, prefix+ofstOut, prefix+ofstOut+cnt-1); if( nextBase < ofstOut+cnt-1 ){ nextBase = ofstOut+cnt-1; nextBaseI = i+NHASH; } } /* Check the next matching block */ iBlock = collide[iBlock]; } /* If we found a match, then jump out to the outer loop and begin ** a new cycle. */ if( nextBase>base && i>=nextBaseI ){ base = nextBase; break; } /* Advance the hash by one character. Keep looking for a match */ if( base+i+NHASH>=lenOut ){ base = lenOut; break; } hash_next(&h, zOut[base+i+NHASH]); i++; } } free(collide); DEBUG1( printf("after creation:\n"); MappingPrint(pMap); ) /* In this step, we will remove overlapping entries from the mapping. ** ** We use a greedy algorithm. Select the largest mapping first and ** remove all overlapping mappings. Then take the next largest ** mapping and remove others that overlap with it. Keep going until ** all mappings have been processed. */ MappingSortSize(pMap); for(i=0; inUsed; i++){ int sortNeeded = 0; int purgeNeeded = 0; struct Mapping_entry *pA; pA = &pMap->aMap[i]; for(j=i+1; jnUsed; j++){ int diff; struct Mapping_entry *pB; pB = &pMap->aMap[j]; if( pB->fromLastfromFirst || pB->fromFirst>pA->fromLast ){ /* No overlap. Do nothing */ }else if( pB->fromFirst>=pA->fromFirst && pB->fromLast<=pA->fromLast ){ /* B is contained entirely within A. Drop B */ pB->fromFirst = -1; purgeNeeded = 1; continue; }else if( pB->fromFirstfromFirst ){ /* The tail B overlaps the head of A */ assert( pB->fromLast>=pA->fromFirst && pB->fromLast<=pA->fromLast ); diff = pB->fromLast + 1 - pA->fromFirst; pB->fromLast -= diff; pB->toLast -= diff; sortNeeded = 1; }else{ /* The head of B overlaps the tail of A */ assert( pB->fromFirst<=pA->fromLast && pB->fromLast>pA->fromLast ); diff = pA->fromLast + 1 - pB->fromFirst; pB->fromFirst += diff; pB->toFirst += diff; sortNeeded = 1; } if( pB->toLasttoFirst || pB->toFirst>pA->toLast ){ /* No overlap. Do nothing */ }else if( pB->toFirst>=pA->toFirst && pB->toLast<=pA->toLast ){ /* B is contained entirely within A. Drop B */ pB->fromFirst = -1; purgeNeeded = 1; }else if( pB->toFirsttoFirst ){ /* The tail of B overlaps the head of A */ assert( pB->toLast>=pA->toFirst && pB->toLast<=pA->toLast ); diff = pB->toLast + 1 - pA->toFirst; pB->fromLast -= diff; pB->toLast -= diff; sortNeeded = 1; }else{ /* The head of B overlaps the tail of A */ assert( pB->toFirst<=pA->toLast && pB->toLast>pA->toLast ); diff = pA->toLast + 1 - pB->toFirst; pB->fromFirst += diff; pB->toFirst += diff; sortNeeded = 1; } } if( purgeNeeded ){ MappingPurgeDeletedEntries(pMap); } if( sortNeeded && inUsed-2 ){ MappingSortSize(pMap); } } /* Final step: Arrange the mapping entires so that they are in the ** order of the output file. Then fill in the nExtra values. */ add_nextra: MappingSortTo(pMap); aMap = pMap->aMap; for(i=0; inUsed-1; i++){ aMap[i].nExtra = aMap[i+1].toFirst - aMap[i].toLast - 1; } if( pMap->nUsed>0 && origLenOut > aMap[i].toLast+1 ){ aMap[i].nExtra = origLenOut - aMap[i].toLast - 1; } } /* ** Translate an index into a file using a mapping. ** ** The mapping "p" shows how blocks in the input file map into blocks ** of the output file. The index iFrom is an index into the input file. ** This routine returns the index into the output file of the corresponding ** character. ** ** If pInserted!=0 and iFrom points to the last character before a ** insert in the output file, then the return value is adjusted forward ** so that it points to the end of the insertion and the number of ** bytes inserted is written into *pInserted. If pInserted==0 then ** iFrom always maps directly in the corresponding output file ** index regardless of whether or not it points to the last character ** before an insertion. */ static int MappingTranslate(Mapping *p, int iFrom, int *pInserted){ int i; for(i=0; inUsed; i++){ if( iFrom>p->aMap[i].fromLast ) continue; if( iFrom<=p->aMap[i].fromFirst ){ return p->aMap[i].toFirst; } if( pInserted && iFrom==p->aMap[i].fromLast ){ int n = p->aMap[i].nExtra; *pInserted = n; return p->aMap[i].toLast + n; }else{ return p->aMap[i].toFirst + iFrom - p->aMap[i].fromFirst; } } i--; return p->aMap[i].toLast + p->aMap[i].nExtra; } /* ** Do a three-way merge. Initialize pOut to contain the result. */ void blob_merge(Blob *pPivot, Blob *pV1, Blob *pV2, Blob *pOut){ Mapping map1, map2; int i; const char *zV1, *zV2; blob_zero(pOut); DEBUG1( printf("map1:\n"); ) MappingInit( blob_buffer(pPivot), blob_size(pPivot), blob_buffer(pV1), blob_size(pV1), &map1); MappingSortFrom(&map1); DEBUG1( printf("map1-final:\n"); MappingPrint(&map1); printf("map2:\n"); ) MappingInit( blob_buffer(pPivot), blob_size(pPivot), blob_buffer(pV2), blob_size(pV2), &map2); DEBUG1( printf("map2-final:\n"); MappingPrint(&map2); ) zV1 = blob_buffer(pV1); zV2 = blob_buffer(pV2); if( map2.nUsed==0 ) return; if( map1.aMap[0].toFirst>0 ){ blob_append(pOut, zV1, map1.aMap[0].toFirst); DEBUG1( printf("INSERT %d bytes from V1[0..%d]\n", map1.aMap[0].toFirst, map1.aMap[0].toFirst-1); ) } if( map2.aMap[0].toFirst>0 ){ blob_append(pOut, zV2, map2.aMap[0].toFirst); DEBUG1( printf("INSERT %d bytes from V2[0..%d]\n", map2.aMap[0].toFirst, map2.aMap[0].toFirst-1); ) } for(i=0; ifromFirst, 0); iLast = MappingTranslate(&map1, p->fromLast, &nInsert); blob_append(pOut, &zV1[iFirst], iLast - iFirst + 1); DEBUG1( printf("COPY %d bytes from V1[%d..%d]\n", iLast-iFirst+1, iFirst, iLast); ) if( p->nExtra>0 ){ if( p->nExtra==nInsert && memcmp(&zV2[p->toLast+1], &zV1[iLast-nInsert+1], nInsert)==0 ){ /* Omit a duplicate insert */ DEBUG1( printf("OMIT duplicate insert\n"); ) }else{ blob_append(pOut, &zV2[p->toLast+1], p->nExtra); DEBUG1( printf("INSERT %d bytes from V2[%d..%d]\n", p->nExtra, p->toLast+1, p->toLast+p->nExtra); ) } } } MappingClear(&map1); MappingClear(&map2); } /* ** COMMAND: test-3-way-merge ** ** Combine change in going from PIVOT->VERSION1 with the change going ** from PIVOT->VERSION2 and write the combined changes into MERGED. */ void delta_3waymerge_cmd(void){ Blob pivot, v1, v2, merged; if( g.argc!=6 ){ fprintf(stderr,"Usage: %s %s PIVOT V1 V2 MERGED\n", g.argv[0], g.argv[1]); exit(1); } if( blob_read_from_file(&pivot, g.argv[2])<0 ){ fprintf(stderr,"cannot read %s\n", g.argv[2]); exit(1); } if( blob_read_from_file(&v1, g.argv[3])<0 ){ fprintf(stderr,"cannot read %s\n", g.argv[3]); exit(1); } if( blob_read_from_file(&v2, g.argv[4])<0 ){ fprintf(stderr,"cannot read %s\n", g.argv[4]); exit(1); } blob_merge(&pivot, &v1, &v2, &merged); if( blob_write_to_file(&merged, g.argv[5])