Fossil

Diff
Login

Diff

Differences From Artifact [b7665e7c83]:

To Artifact [7c4d3451fa]:


55
56
57
58
59
60
61
62

63
64
65
66
67
68
69
55
56
57
58
59
60
61

62
63
64
65
66
67
68
69







-
+








#define DIFF_TOO_MANY_CHANGES_TXT \
    "more than 10,000 changes\n"

#define DIFF_TOO_MANY_CHANGES_HTML \
    "<p class='generalError'>More than 10,000 changes</p>\n"

#define looks_like_binary(blob) ((looks_like_utf8((blob))&3) == 0)
#define looks_like_binary(blob) (looks_like_utf8((blob)) == 0)
#endif /* INTERFACE */

/*
** Maximum length of a line in a text file, in bytes.  (2**13 = 8192 bytes)
*/
#define LENGTH_MASK_SZ  13
#define LENGTH_MASK     ((1<<LENGTH_MASK_SZ)-1)
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224

225
226
227
228

229
230
231
232
233
234
235

236
237


238
239
240

241
242
243
244
245
246


247
248

249
250
251
252
253
254
255
197
198
199
200
201
202
203




204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219

220
221
222
223

224
225
226
227
228
229
230
231
232


233
234
235
236

237
238
239
240
241


242
243
244

245
246
247
248
249
250
251
252







-
-
-
-
















-
+



-
+







+
-
-
+
+


-
+




-
-
+
+

-
+







**         does not understand UTF-16, it may falsely consider UTF-16 text
**         to be binary.
**
** (-1) -- The content appears to consist entirely of text, with lines
**         delimited by carriage-return, line-feed pairs; however, the
**         encoding may not be UTF-8.
**
** (-4) -- The same as 0, but the determination is based on the fact that
**         the blob might be text (any encoding) but it has a line length
**         bigger than the diff logic in fossil can handle.
**
************************************ WARNING **********************************
**
** This function does not validate that the blob content is properly formed
** UTF-8.  It assumes that all code points are the same size.  It does not
** validate any code points.  It makes no attempt to detect if any [invalid]
** switches between UTF-8 and other encodings occur.
**
** The only code points that this function cares about are the NUL character,
** carriage-return, and line-feed.
**
************************************ WARNING **********************************
*/
int looks_like_utf8(const Blob *pContent){
  const char *z = blob_buffer(pContent);
  unsigned int n = blob_size(pContent);
  int j, c;
  int flags = 0;  /* bit 0 = long lines found, 1 = CR/NL found. */
  int result = 1;  /* Assume UTF-8 text with no CR/NL */

  /* Check individual lines.
  */
  if( n==0 ) return 1;  /* Empty file -> text */
  if( n==0 ) return result;  /* Empty file -> text */
  c = *z;
  if( c==0 ) return 0;  /* Zero byte in a file -> binary */
  j = (c!='\n');
  while( --n>0 ){
    c = *++z; ++j;
    if( c==0 ) return 0;  /* Zero byte in a file -> binary */
    if( c=='\n' ){
      int c2 = z[-1];
      if( z[-1]=='\r' ){
        flags |= 2;  /* Contains CR/NL, continue */
      if( c2=='\r' ){
        result = -1;  /* Contains CR/NL, continue */
      }
      if( j>LENGTH_MASK ){
        flags |= 1;  /* Very long line, continue */
        return 0;  /* Very long line -> binary */
      }
      j = 0;
    }
  }
  if( (flags&1) || (j>LENGTH_MASK) ){
    return -4;  /* Very long line -> binary */
  if( j>LENGTH_MASK ){
    return 0;  /* Very long line -> binary */
  }
  return 1-flags;  /* No problems seen -> not binary */
  return result;  /* No problems seen -> not binary */
}

/*
** Define the type needed to represent a Unicode (UTF-16) character.
*/
#ifndef WCHAR_T
#  ifdef _WIN32
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317

318
319
320
321

322
323
324
325
326
327
328
329
330
331
332

333
334
335

336
337
338
339
340
341


342
343

344
345
346
347
348
349
350
287
288
289
290
291
292
293




294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309

310
311
312
313

314
315
316
317
318
319
320
321
322
323
324

325
326
327

328
329
330
331
332


333
334
335

336
337
338
339
340
341
342
343







-
-
-
-
















-
+



-
+










-
+


-
+




-
-
+
+

-
+







**         does not understand UTF-8, it may falsely consider UTF-8 text
**         to be binary.
**
** (-1) -- The content appears to consist entirely of text, with lines
**         delimited by carriage-return, line-feed pairs; however, the
**         encoding may not be UTF-16.
**
** (-4) -- The same as 0, but the determination is based on the fact that
**         the blob might be text (any encoding) but it has a line length
**         bigger than the diff logic in fossil can handle.
**
************************************ WARNING **********************************
**
** This function does not validate that the blob content is properly formed
** UTF-16.  It assumes that all code points are the same size.  It does not
** validate any code points.  It makes no attempt to detect if any [invalid]
** switches between the UTF-16be and UTF-16le encodings occur.
**
** The only code points that this function cares about are the NUL character,
** carriage-return, and line-feed.
**
************************************ WARNING **********************************
*/
int looks_like_utf16(const Blob *pContent){
  const WCHAR_T *z = (WCHAR_T *)blob_buffer(pContent);
  unsigned int n = blob_size(pContent);
  int j, c;
  int flags = 0;  /* bit 0 = long lines found, 1 = CR/NL found. */
  int result = 1;  /* Assume UTF-16 text with no CR/NL */

  /* Check individual lines.
  */
  if( n==0 ) return 1;  /* Empty file -> text */
  if( n==0 ) return result;  /* Empty file -> text */
  if( n%2 ) return 0;  /* Odd number of bytes -> binary (or UTF-8) */
  c = *z;
  if( c==0 ) return 0;  /* NUL character in a file -> binary */
  j = ((c!=UTF16BE_LF) && (c!=UTF16LE_LF));
  while( (n-=2)>0 ){
    c = *++z; ++j;
    if( c==0 ) return 0;  /* NUL character in a file -> binary */
    if( c==UTF16BE_LF || c==UTF16LE_LF ){
      int c2 = z[-1];
      if( c2==UTF16BE_CR || c2==UTF16LE_CR ){
        flags |= 2;  /* Contains CR/NL, continue */
        result = -1;  /* Contains CR/NL, continue */
      }
      if( j>UTF16_LENGTH_MASK ){
        flags |= 1;  /* Very long line, continue */
        return 0;  /* Very long line -> binary */
      }
      j = 0;
    }
  }
  if( (flags&1) || (j>UTF16_LENGTH_MASK) ){
    return -4;  /* Very long line -> binary */
  if( j>UTF16_LENGTH_MASK ){
    return 0;  /* Very long line -> binary */
  }
  return 1-flags;  /* No problems seen -> not binary */
  return result;  /* No problems seen -> not binary */
}

/*
** This function returns an array of bytes representing the byte-order-mark
** for UTF-8.
*/
const unsigned char *get_utf8_bom(int *pnByte){