commit 60b6e08bf7113461bbe0ce511caf96a31121b143
from: Neels Hofmeyr <neels@hofmeyr.de>
date: Mon Oct 12 04:01:33 2020 UTC

patience: use qsort to optimize finding unique lines

commit - ad5b3f855591bc548f15e09ae4b7fdf674f16245
commit + 60b6e08bf7113461bbe0ce511caf96a31121b143
blob - 375e1b98cb5964fd4d7c3d4c3cd4e1497d56fd89
blob + 9dc641cf345f7bda61a2e56012332d68e357bddb
--- include/diff_main.h
+++ include/diff_main.h
@@ -44,6 +44,8 @@ struct diff_data {
 	struct diff_data *root;
 
 	int diff_flags;
+
+	int err;
 };
 
 #define DIFF_FLAG_IGNORE_WHITESPACE	0x00000001
blob - bb2d2b2222005a0962cc5edc2a2573fbaa3ab2a8
blob + e0580aa5e5f76cdaee48726754ed1af7595dfbe3
--- lib/diff_debug.h
+++ lib/diff_debug.h
@@ -52,8 +52,7 @@ dump_atom(const struct diff_data *left, const struct d
 		      diff_atom_root_idx(right, atom->patience.pos_in_other));
 
 	print(" %s%s '",
-	      atom->patience.unique_here ? "u" : " ",
-	      atom->patience.unique_in_both ? "c" : " ");
+	      atom->patience.unique_in_both ? "u" : " ");
 	if (atom->at == NULL) {
 		off_t remain = atom->len;
 		if (fseek(atom->d->root->f, atom->pos, SEEK_SET) == -1)
blob - d8950f01de74482510174dd49af87a8d3f8b13f6
blob + e9088e91a37fcda61b2cf9cd6487f3abe30adee4
--- lib/diff_internal.h
+++ lib/diff_internal.h
@@ -73,7 +73,6 @@ struct diff_atom {
 	/* State for the Patience Diff algorithm */
 	/* TODO: keep a separate array for the patience state */
 	struct {
-		bool unique_here;
 		bool unique_in_both;
 		struct diff_atom *pos_in_other;
 		struct diff_atom *prev_stack;
blob - cccfafd597fa145d26f94990a65acb0a25287205
blob + e7f728f1729206b8262ad23227da9f0d7d0dff66
--- lib/diff_patience.c
+++ lib/diff_patience.c
@@ -29,127 +29,121 @@
 #include "diff_internal.h"
 #include "diff_debug.h"
 
-/* Set unique_here = true for all atoms that exist exactly once in this list. */
-static int
-diff_atoms_mark_unique(struct diff_data *d, unsigned int *unique_count)
+int diff_atoms_qsort_compar(const void *_a, const void *_b)
 {
-	struct diff_atom *i;
-	unsigned int count = 0;
-	diff_data_foreach_atom(i, d) {
-		i->patience.unique_here = true;
-		i->patience.unique_in_both = true;
-		count++;
+	const struct diff_atom *a = *(struct diff_atom**)_a;
+	const struct diff_atom *b = *(struct diff_atom**)_b;
+	int cmp;
+	int rc;
+
+	if (a->root->err || b->root->err) {
+		/* If atoms are from more than one diff_data, make sure the
+		 * error, if any, spreads to all of them. */
+		a->root->err = rc;
+		b->root->err = rc;
+		return 0;
 	}
-	diff_data_foreach_atom(i, d) {
-		struct diff_atom *j;
 
-		if (!i->patience.unique_here)
-			continue;
+	/* Sort by the simplistic hash */
+	if (a->hash < b->hash)
+		return -1;
+	if (a->hash > b->hash)
+		return 1;
 
-		diff_data_foreach_atom_from(i + 1, j, d) {
-			bool same;
-			int r = diff_atom_same(&same, i, j);
-			if (r)
-				return r;
-			if (!same)
-				continue;
-			if (i->patience.unique_here) {
-				i->patience.unique_here = false;
-				i->patience.unique_in_both = false;
-				count--;
-			}
-			j->patience.unique_here = false;
-			j->patience.unique_in_both = false;
-			count--;
-		}
+	/* If hashes are the same, the lines may still differ. Do a full cmp. */
+	rc = diff_atom_cmp(&cmp, a, b);
+
+	if (rc) {
+		/* Mark the I/O error so that the caller can find out about it.
+		 * For the case atoms are from more than one diff_data, mark in
+		 * both. */
+		a->root->err = rc;
+		if (a->root != b->root)
+			b->root->err = rc;
+		return 0;
 	}
-	if (unique_count)
-		*unique_count = count;
-	return 0;
+
+	return cmp;
 }
 
-/* Mark those lines as atom->patience.unique_in_both = true that appear exactly
- * once in each side. */
-static int
-diff_atoms_mark_unique_in_both(struct diff_data *left, struct diff_data *right,
-			       unsigned int *unique_in_both_count)
+/* Sort an array of struct diff_atom* in-place. */
+static int diff_atoms_qsort(struct diff_atom *atoms[],
+			    size_t atoms_count)
 {
-	/* Derive the final unique_in_both count without needing an explicit
-	 * iteration. So this is just some optimiziation to save one iteration
-	 * in the end. */
-	unsigned int unique_in_both;
-	int r;
+	qsort(atoms, atoms_count, sizeof(struct diff_atom*),
+	      diff_atoms_qsort_compar);
+	return atoms[0]->root->err;
+}
 
-	r = diff_atoms_mark_unique(left, &unique_in_both);
-	if (r)
-		return r;
-	r = diff_atoms_mark_unique(right, NULL);
-	if (r)
-		return r;
-
-	debug("unique_in_both %u\n", unique_in_both);
-
-	struct diff_atom *i;
-	diff_data_foreach_atom(i, left) {
-		if (!i->patience.unique_here)
-			continue;
-		struct diff_atom *j;
-		int found_in_b = 0;
-		diff_data_foreach_atom(j, right) {
-			bool same;
-			int r = diff_atom_same(&same, i, j);
-			if (r)
-				return r;
-			if (!same)
-				continue;
-			if (!j->patience.unique_here) {
-				found_in_b = 2; /* or more */
+static int
+diff_atoms_mark_unique_in_both_by_qsort(struct diff_data *left,
+					struct diff_data *right,
+					unsigned int *unique_in_both_count_p)
+{
+	struct diff_atom *a;
+	struct diff_atom *b;
+	struct diff_atom **all_atoms =
+		malloc((left->atoms.len + right->atoms.len)
+		       * sizeof(struct diff_atom*));
+	unsigned int len = 0;
+	unsigned int i;
+	unsigned int unique_in_both_count = 0;
+	int rc;
+	left->root->err = 0;
+	right->root->err = 0;
+	diff_data_foreach_atom(a, left) {
+		all_atoms[len++] = a;
+	}
+	diff_data_foreach_atom(b, right) {
+		all_atoms[len++] = b;
+	}
+
+	rc = diff_atoms_qsort(all_atoms, len);
+	if (rc)
+		goto free_and_exit;
+
+	/* Now we have a sorted array of atom pointers. All similar lines are
+	 * adjacent. Walk through the array and mark those that are unique on
+	 * each side, but exist once in both sources. */
+	for (i = 0; i < len; i++) {
+		bool same;
+		unsigned int j;
+		unsigned int count_first_side = 1;
+		unsigned int count_other_side = 0;
+		a = all_atoms[i];
+
+		for (j = i+1; j < len; j++) {
+			b = all_atoms[j];
+			rc = diff_atom_same(&same, a, b);
+			if (rc)
+				goto free_and_exit;
+			if (!same)
 				break;
-			} else {
-				found_in_b = 1;
-				j->patience.pos_in_other = i;
-				i->patience.pos_in_other = j;
-			}
+			/* A following atom is the same. See on which side the
+			 * repetition counts. */
+			if (a->root == b->root)
+				count_first_side ++;
+			else
+				count_other_side ++;
 		}
 
-		if (found_in_b == 0 || found_in_b > 1) {
-			i->patience.unique_in_both = false;
-			unique_in_both--;
-			debug("unique_in_both %u  (%d) ", unique_in_both,
-			      found_in_b);
-			debug_dump_atom(left, NULL, i);
+		/* Counted a section of similar atoms, put the results back to
+		 * the atoms. */
+		if ((count_first_side == 1)
+		    && (count_other_side == 1)) {
+			b = all_atoms[i+1];
+			a->patience.unique_in_both = true;
+			a->patience.pos_in_other = b;
+			b->patience.unique_in_both = true;
+			b->patience.pos_in_other = a;
+			unique_in_both_count++;
 		}
 	}
-
-	/* Still need to unmark right[*]->patience.unique_in_both for atoms that
-	 * don't exist in left */
-	diff_data_foreach_atom(i, right) {
-		if (!i->patience.unique_here
-		    || !i->patience.unique_in_both)
-			continue;
-		struct diff_atom *j;
-		bool found_in_a = false;
-		diff_data_foreach_atom(j, left) {
-			bool same;
-			int r;
-			if (!j->patience.unique_in_both)
-				continue;
-			r = diff_atom_same(&same, i, j);
-			if (r)
-				return r;
-			if (!same)
-				continue;
-			found_in_a = true;
-			break;
-		}
-
-		if (!found_in_a)
-			i->patience.unique_in_both = false;
-	}
-
-	if (unique_in_both_count)
-		*unique_in_both_count = unique_in_both;
-	return 0;
+	*unique_in_both_count_p = unique_in_both_count;
+	rc = 0;
+free_and_exit:
+	free(all_atoms);
+	return rc;
 }
 
 static int
@@ -288,7 +282,8 @@ diff_algo_patience(const struct diff_algo_config *algo
 
 	/* Find those lines that appear exactly once in 'left' and exactly once
 	 * in 'right'. */
-	rc = diff_atoms_mark_unique_in_both(left, right, &unique_in_both_count);
+	rc = diff_atoms_mark_unique_in_both_by_qsort(left, right,
+						     &unique_in_both_count);
 	if (rc)
 		return rc;