commit e51ebd83fa731d197ee4074ee2e94dbc0581078c
from: Stefan Sperling <stsp@stsp.name>
date: Sat Nov 21 10:47:30 2020 UTC

handle binary files like diff(1) does; new -a option forces text

commit - 29916bb6c0c248ca6fa5486cb9e081d92112e86c
commit + e51ebd83fa731d197ee4074ee2e94dbc0581078c
blob - e14ccd2a03dbf0250534d698c445e42262fa3e2f
blob + eded4163df8dc21ec5cf7ed44b09b3c7a994e331
--- diff/diff.c
+++ diff/diff.c
@@ -41,7 +41,7 @@ enum diffreg_algo {
 };
 
 __dead void	 usage(void);
-int		 diffreg(char *, char *, enum diffreg_algo, bool, bool,
+int		 diffreg(char *, char *, enum diffreg_algo, bool, bool, bool,
 			 int, bool);
 FILE *		 openfile(const char *, char **, struct stat *);
 
@@ -49,8 +49,9 @@ __dead void
 usage(void)
 {
 	fprintf(stderr,
-		"usage: %s [-pPQTwe] [-U n] file1 file2\n"
+		"usage: %s [-apPQTwe] [-U n] file1 file2\n"
 		"\n"
+		"  -a   Treat input as ASCII even if binary data is detected\n"
 		"  -p   Show function prototypes in hunk headers\n"
 		"  -P   Use Patience Diff (slower but often nicer)\n"
 		"  -Q   Use forward-Myers for small files, otherwise Patience\n"
@@ -66,14 +67,18 @@ int
 main(int argc, char *argv[])
 {
 	int ch, rc;
+	bool force_text = false;
 	bool ignore_whitespace = false;
 	bool show_function_prototypes = false;
 	bool edscript = false;
 	int context_lines = 3;
 	enum diffreg_algo algo = DIFFREG_ALGO_MYERS_THEN_MYERS_DIVIDE;
 
-	while ((ch = getopt(argc, argv, "pPQTwU:e")) != -1) {
+	while ((ch = getopt(argc, argv, "apPQTwU:e")) != -1) {
 		switch (ch) {
+		case 'a':
+			force_text = true;
+			break;
 		case 'p':
 			show_function_prototypes = true;
 			break;
@@ -106,7 +111,7 @@ main(int argc, char *argv[])
 	if (argc != 2)
 		usage();
 
-	rc = diffreg(argv[0], argv[1], algo, ignore_whitespace,
+	rc = diffreg(argv[0], argv[1], algo, force_text, ignore_whitespace,
 	    show_function_prototypes, context_lines, edscript);
 	if (rc != DIFF_RC_OK) {
 		fprintf(stderr, "diff: %s\n", strerror(rc));
@@ -178,8 +183,9 @@ const struct diff_config diff_config_no_algo = {
 };
 
 int
-diffreg(char *file1, char *file2, enum diffreg_algo algo, bool ignore_whitespace,
-    bool show_function_prototypes, int context_lines, bool edscript)
+diffreg(char *file1, char *file2, enum diffreg_algo algo, bool force_text,
+    bool ignore_whitespace, bool show_function_prototypes, int context_lines,
+    bool edscript)
 {
 	char *str1, *str2;
 	FILE *f1, *f2;
@@ -213,6 +219,8 @@ diffreg(char *file1, char *file2, enum diffreg_algo al
 	f1 = openfile(file1, &str1, &st1);
 	f2 = openfile(file2, &str2, &st2);
 
+	if (force_text)
+		diff_flags |= DIFF_FLAG_FORCE_TEXT_DATA;
 	if (ignore_whitespace)
 		diff_flags |= DIFF_FLAG_IGNORE_WHITESPACE;
 	if (show_function_prototypes)
blob - bd94a9109c7f29432f6af3217b4a5775f3f0493c
blob + 5e816ae10b5948c0aeacd27365a91f3147b96baf
--- include/diff_main.h
+++ include/diff_main.h
@@ -105,6 +105,7 @@ struct diff_data {
 	const uint8_t *data;	/* if memory-mapped */
 	off_t len;
 
+	int atomizer_flags;
 	ARRAYLIST(struct diff_atom) atoms;
 	struct diff_data *root;
 	struct diff_data *current;
@@ -115,8 +116,13 @@ struct diff_data {
 	int err;
 };
 
+/* Flags set by file atomizer. */
+#define DIFF_ATOMIZER_FOUND_BINARY_DATA	0x00000001
+
+/* Flags set by caller of diff_main(). */
 #define DIFF_FLAG_IGNORE_WHITESPACE	0x00000001
 #define DIFF_FLAG_SHOW_PROTOTYPES	0x00000002
+#define DIFF_FLAG_FORCE_TEXT_DATA	0x00000004
 
 void diff_data_free(struct diff_data *diff_data);
 
@@ -143,7 +149,7 @@ struct diff_state;
  *
  * func_data: context pointer (free to be used by implementation).
  * d: struct diff_data with d->data and d->len already set up, and
- * d->atoms to be created.
+ * d->atoms to be created and d->atomizer_flags to be set up.
  */
 typedef int (*diff_atomize_func_t)(void *func_data, struct diff_data *d);
 
blob - 1da34c64d7271fbd198a3a0cfc8d0414641d22d8
blob + 0531fabe30530664069034f1c3166fe9d18a6e54
--- lib/diff_atomize_text.c
+++ lib/diff_atomize_text.c
@@ -43,6 +43,7 @@ diff_data_atomize_text_lines_fd(struct diff_data *d)
 	unsigned int array_size_estimate = d->len / 50;
 	unsigned int pow2 = 1;
 	bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE);
+	bool embedded_nul = false;
 
 	while (array_size_estimate >>= 1)
 		pow2++;
@@ -71,6 +72,8 @@ diff_data_atomize_text_lines_fd(struct diff_data *d)
 					    || !isspace(buf[i]))
 						hash = diff_atom_hash_update(
 						    hash, buf[i]);
+					if (buf[i] == '\0')
+						embedded_nul = true;
 					line_end++;
 				} else
 					eol = buf[i];
@@ -112,6 +115,10 @@ diff_data_atomize_text_lines_fd(struct diff_data *d)
 			return errno;
 	}
 
+	/* Files are considered binary if they contain embedded '\0' bytes. */
+	if (embedded_nul)
+		d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA;
+
 	return DIFF_RC_OK;
 }
 
@@ -121,7 +128,7 @@ diff_data_atomize_text_lines_mmap(struct diff_data *d)
 	const uint8_t *pos = d->data;
 	const uint8_t *end = pos + d->len;
 	bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE);
-
+	bool embedded_nul = false;
 	unsigned int array_size_estimate = d->len / 50;
 	unsigned int pow2 = 1;
 	while (array_size_estimate >>= 1)
@@ -137,6 +144,8 @@ diff_data_atomize_text_lines_mmap(struct diff_data *d)
 			if (!ignore_whitespace
 			    || !isspace(*line_end))
 				hash = hash * 23 + *line_end;
+			if (*line_end == '\0')
+				embedded_nul = true;
 			line_end++;
 		}
 
@@ -167,6 +176,10 @@ diff_data_atomize_text_lines_mmap(struct diff_data *d)
 		pos = line_end;
 	}
 
+	/* Files are considered binary if they contain embedded '\0' bytes. */
+	if (embedded_nul)
+		d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA;
+
 	return DIFF_RC_OK;
 }
 
blob - 116c0d8c3b0d2a06a61a9f516b6b89dfe3f82f8d
blob + 677f5baf44d4fa746fc192cb95326307b7f2bf0c
--- lib/diff_output_edscript.c
+++ lib/diff_output_edscript.c
@@ -110,6 +110,12 @@ diff_output_edscript(struct diff_output_info **output_
 {
 	struct diff_output_info *outinfo = NULL;
 	struct diff_chunk_context cc = {};
+	int atomizer_flags = (result->left->atomizer_flags|
+	    result->right->atomizer_flags);
+	int flags = (result->left->root->diff_flags |
+	    result->right->root->diff_flags);
+	bool force_text = (flags & DIFF_FLAG_FORCE_TEXT_DATA);
+	bool have_binary = (atomizer_flags & DIFF_ATOMIZER_FOUND_BINARY_DATA);
 	int i, rc;
 
 	if (!result)
@@ -124,6 +130,23 @@ diff_output_edscript(struct diff_output_info **output_
 		outinfo = *output_info;
 	}
 
+	if (have_binary && !force_text) {
+		for (i = 0; i < result->chunks.len; i++) {
+			struct diff_chunk *c = &result->chunks.head[i];
+			enum diff_chunk_type t = diff_chunk_type(c);
+
+			if (t != CHUNK_MINUS && t != CHUNK_PLUS)
+				continue;
+
+			fprintf(dest, "Binary files %s and %s differ\n",
+			    info->left_path ? : "a",
+			    info->right_path ? : "b");
+			break;
+		}
+
+		return DIFF_RC_OK;
+	}
+
 	for (i = 0; i < result->chunks.len; i++) {
 		struct diff_chunk *chunk = &result->chunks.head[i];
 		enum diff_chunk_type t = diff_chunk_type(chunk);
blob - 2f178e703ce61ba77413863c0b71d0f96e256909
blob + 520dc91d99494e5f4a3c9be8c238a6defd1ff48a
--- lib/diff_output_unidiff.c
+++ lib/diff_output_unidiff.c
@@ -412,9 +412,13 @@ diff_output_unidiff(struct diff_output_info **output_i
 	struct diff_output_unidiff_state *state;
 	struct diff_chunk_context cc = {};
 	struct diff_output_info *outinfo = NULL;
+	int atomizer_flags = (result->left->atomizer_flags|
+	    result->right->atomizer_flags);
 	int flags = (result->left->root->diff_flags |
 	    result->right->root->diff_flags);
 	bool show_function_prototypes = (flags & DIFF_FLAG_SHOW_PROTOTYPES);
+	bool force_text = (flags & DIFF_FLAG_FORCE_TEXT_DATA);
+	bool have_binary = (atomizer_flags & DIFF_ATOMIZER_FOUND_BINARY_DATA);
 	int i;
 
 	if (!result)
@@ -428,7 +432,24 @@ diff_output_unidiff(struct diff_output_info **output_i
 			return ENOMEM;
 		outinfo = *output_info;
 	}
+
+	if (have_binary && !force_text) {
+		for (i = 0; i < result->chunks.len; i++) {
+			struct diff_chunk *c = &result->chunks.head[i];
+			enum diff_chunk_type t = diff_chunk_type(c);
 
+			if (t != CHUNK_MINUS && t != CHUNK_PLUS)
+				continue;
+
+			fprintf(dest, "Binary files %s and %s differ\n",
+			    info->left_path ? : "a",
+			    info->right_path ? : "b");
+			break;
+		}
+
+		return DIFF_RC_OK;
+	}
+
 	state = diff_output_unidiff_state_alloc();
 	if (state == NULL) {
 		if (output_info) {