commit e51ebd83fa731d197ee4074ee2e94dbc0581078c from: Stefan Sperling date: Sat Nov 21 10:47:30 2020 UTC handle binary files like diff(1) does; new -a option forces text commit - 29916bb6c0c248ca6fa5486cb9e081d92112e86c commit + e51ebd83fa731d197ee4074ee2e94dbc0581078c blob - e14ccd2a03dbf0250534d698c445e42262fa3e2f blob + eded4163df8dc21ec5cf7ed44b09b3c7a994e331 --- diff/diff.c +++ diff/diff.c @@ -41,7 +41,7 @@ enum diffreg_algo { }; __dead void usage(void); -int diffreg(char *, char *, enum diffreg_algo, bool, bool, +int diffreg(char *, char *, enum diffreg_algo, bool, bool, bool, int, bool); FILE * openfile(const char *, char **, struct stat *); @@ -49,8 +49,9 @@ __dead void usage(void) { fprintf(stderr, - "usage: %s [-pPQTwe] [-U n] file1 file2\n" + "usage: %s [-apPQTwe] [-U n] file1 file2\n" "\n" + " -a Treat input as ASCII even if binary data is detected\n" " -p Show function prototypes in hunk headers\n" " -P Use Patience Diff (slower but often nicer)\n" " -Q Use forward-Myers for small files, otherwise Patience\n" @@ -66,14 +67,18 @@ int main(int argc, char *argv[]) { int ch, rc; + bool force_text = false; bool ignore_whitespace = false; bool show_function_prototypes = false; bool edscript = false; int context_lines = 3; enum diffreg_algo algo = DIFFREG_ALGO_MYERS_THEN_MYERS_DIVIDE; - while ((ch = getopt(argc, argv, "pPQTwU:e")) != -1) { + while ((ch = getopt(argc, argv, "apPQTwU:e")) != -1) { switch (ch) { + case 'a': + force_text = true; + break; case 'p': show_function_prototypes = true; break; @@ -106,7 +111,7 @@ main(int argc, char *argv[]) if (argc != 2) usage(); - rc = diffreg(argv[0], argv[1], algo, ignore_whitespace, + rc = diffreg(argv[0], argv[1], algo, force_text, ignore_whitespace, show_function_prototypes, context_lines, edscript); if (rc != DIFF_RC_OK) { fprintf(stderr, "diff: %s\n", strerror(rc)); @@ -178,8 +183,9 @@ const struct diff_config diff_config_no_algo = { }; int -diffreg(char *file1, char *file2, enum diffreg_algo algo, bool 
ignore_whitespace, - bool show_function_prototypes, int context_lines, bool edscript) +diffreg(char *file1, char *file2, enum diffreg_algo algo, bool force_text, + bool ignore_whitespace, bool show_function_prototypes, int context_lines, + bool edscript) { char *str1, *str2; FILE *f1, *f2; @@ -213,6 +219,8 @@ diffreg(char *file1, char *file2, enum diffreg_algo al f1 = openfile(file1, &str1, &st1); f2 = openfile(file2, &str2, &st2); + if (force_text) + diff_flags |= DIFF_FLAG_FORCE_TEXT_DATA; if (ignore_whitespace) diff_flags |= DIFF_FLAG_IGNORE_WHITESPACE; if (show_function_prototypes) blob - bd94a9109c7f29432f6af3217b4a5775f3f0493c blob + 5e816ae10b5948c0aeacd27365a91f3147b96baf --- include/diff_main.h +++ include/diff_main.h @@ -105,6 +105,7 @@ struct diff_data { const uint8_t *data; /* if memory-mapped */ off_t len; + int atomizer_flags; ARRAYLIST(struct diff_atom) atoms; struct diff_data *root; struct diff_data *current; @@ -115,8 +116,13 @@ struct diff_data { int err; }; +/* Flags set by file atomizer. */ +#define DIFF_ATOMIZER_FOUND_BINARY_DATA 0x00000001 + +/* Flags set by caller of diff_main(). */ #define DIFF_FLAG_IGNORE_WHITESPACE 0x00000001 #define DIFF_FLAG_SHOW_PROTOTYPES 0x00000002 +#define DIFF_FLAG_FORCE_TEXT_DATA 0x00000004 void diff_data_free(struct diff_data *diff_data); @@ -143,7 +149,7 @@ struct diff_state; * * func_data: context pointer (free to be used by implementation). * d: struct diff_data with d->data and d->len already set up, and - * d->atoms to be created. + * d->atoms to be created and d->atomizer_flags to be set up. 
*/ typedef int (*diff_atomize_func_t)(void *func_data, struct diff_data *d); blob - 1da34c64d7271fbd198a3a0cfc8d0414641d22d8 blob + 0531fabe30530664069034f1c3166fe9d18a6e54 --- lib/diff_atomize_text.c +++ lib/diff_atomize_text.c @@ -43,6 +43,7 @@ diff_data_atomize_text_lines_fd(struct diff_data *d) unsigned int array_size_estimate = d->len / 50; unsigned int pow2 = 1; bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE); + bool embedded_nul = false; while (array_size_estimate >>= 1) pow2++; @@ -71,6 +72,8 @@ diff_data_atomize_text_lines_fd(struct diff_data *d) || !isspace(buf[i])) hash = diff_atom_hash_update( hash, buf[i]); + if (buf[i] == '\0') + embedded_nul = true; line_end++; } else eol = buf[i]; @@ -112,6 +115,10 @@ diff_data_atomize_text_lines_fd(struct diff_data *d) return errno; } + /* Files are considered binary if they contain embedded '\0' bytes. */ + if (embedded_nul) + d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA; + return DIFF_RC_OK; } @@ -121,7 +128,7 @@ diff_data_atomize_text_lines_mmap(struct diff_data *d) const uint8_t *pos = d->data; const uint8_t *end = pos + d->len; bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE); - + bool embedded_nul = false; unsigned int array_size_estimate = d->len / 50; unsigned int pow2 = 1; while (array_size_estimate >>= 1) @@ -137,6 +144,8 @@ diff_data_atomize_text_lines_mmap(struct diff_data *d) if (!ignore_whitespace || !isspace(*line_end)) hash = hash * 23 + *line_end; + if (*line_end == '\0') + embedded_nul = true; line_end++; } @@ -167,6 +176,10 @@ diff_data_atomize_text_lines_mmap(struct diff_data *d) pos = line_end; } + /* Files are considered binary if they contain embedded '\0' bytes. 
*/ + if (embedded_nul) + d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA; + return DIFF_RC_OK; } blob - 116c0d8c3b0d2a06a61a9f516b6b89dfe3f82f8d blob + 677f5baf44d4fa746fc192cb95326307b7f2bf0c --- lib/diff_output_edscript.c +++ lib/diff_output_edscript.c @@ -110,6 +110,12 @@ diff_output_edscript(struct diff_output_info **output_ { struct diff_output_info *outinfo = NULL; struct diff_chunk_context cc = {}; + int atomizer_flags = (result->left->atomizer_flags| + result->right->atomizer_flags); + int flags = (result->left->root->diff_flags | + result->right->root->diff_flags); + bool force_text = (flags & DIFF_FLAG_FORCE_TEXT_DATA); + bool have_binary = (atomizer_flags & DIFF_ATOMIZER_FOUND_BINARY_DATA); int i, rc; if (!result) @@ -124,6 +130,23 @@ diff_output_edscript(struct diff_output_info **output_ outinfo = *output_info; } + if (have_binary && !force_text) { + for (i = 0; i < result->chunks.len; i++) { + struct diff_chunk *c = &result->chunks.head[i]; + enum diff_chunk_type t = diff_chunk_type(c); + + if (t != CHUNK_MINUS && t != CHUNK_PLUS) + continue; + + fprintf(dest, "Binary files %s and %s differ\n", + info->left_path ? : "a", + info->right_path ? 
: "b"); + break; + } + + return DIFF_RC_OK; + } + for (i = 0; i < result->chunks.len; i++) { struct diff_chunk *chunk = &result->chunks.head[i]; enum diff_chunk_type t = diff_chunk_type(chunk); blob - 2f178e703ce61ba77413863c0b71d0f96e256909 blob + 520dc91d99494e5f4a3c9be8c238a6defd1ff48a --- lib/diff_output_unidiff.c +++ lib/diff_output_unidiff.c @@ -412,9 +412,13 @@ diff_output_unidiff(struct diff_output_info **output_i struct diff_output_unidiff_state *state; struct diff_chunk_context cc = {}; struct diff_output_info *outinfo = NULL; + int atomizer_flags = (result->left->atomizer_flags| + result->right->atomizer_flags); int flags = (result->left->root->diff_flags | result->right->root->diff_flags); bool show_function_prototypes = (flags & DIFF_FLAG_SHOW_PROTOTYPES); + bool force_text = (flags & DIFF_FLAG_FORCE_TEXT_DATA); + bool have_binary = (atomizer_flags & DIFF_ATOMIZER_FOUND_BINARY_DATA); int i; if (!result) @@ -428,7 +432,24 @@ diff_output_unidiff(struct diff_output_info **output_i return ENOMEM; outinfo = *output_info; } + + if (have_binary && !force_text) { + for (i = 0; i < result->chunks.len; i++) { + struct diff_chunk *c = &result->chunks.head[i]; + enum diff_chunk_type t = diff_chunk_type(c); + if (t != CHUNK_MINUS && t != CHUNK_PLUS) + continue; + + fprintf(dest, "Binary files %s and %s differ\n", + info->left_path ? : "a", + info->right_path ? : "b"); + break; + } + + return DIFF_RC_OK; + } + state = diff_output_unidiff_state_alloc(); if (state == NULL) { if (output_info) {