commit 60b6e08bf7113461bbe0ce511caf96a31121b143 from: Neels Hofmeyr date: Mon Oct 12 04:01:33 2020 UTC patience: use qsort to optimize finding unique lines commit - ad5b3f855591bc548f15e09ae4b7fdf674f16245 commit + 60b6e08bf7113461bbe0ce511caf96a31121b143 blob - 375e1b98cb5964fd4d7c3d4c3cd4e1497d56fd89 blob + 9dc641cf345f7bda61a2e56012332d68e357bddb --- include/diff_main.h +++ include/diff_main.h @@ -44,6 +44,8 @@ struct diff_data { struct diff_data *root; int diff_flags; + + int err; }; #define DIFF_FLAG_IGNORE_WHITESPACE 0x00000001 blob - bb2d2b2222005a0962cc5edc2a2573fbaa3ab2a8 blob + e0580aa5e5f76cdaee48726754ed1af7595dfbe3 --- lib/diff_debug.h +++ lib/diff_debug.h @@ -52,8 +52,7 @@ dump_atom(const struct diff_data *left, const struct d diff_atom_root_idx(right, atom->patience.pos_in_other)); print(" %s%s '", - atom->patience.unique_here ? "u" : " ", - atom->patience.unique_in_both ? "c" : " "); + atom->patience.unique_in_both ? "u" : " "); if (atom->at == NULL) { off_t remain = atom->len; if (fseek(atom->d->root->f, atom->pos, SEEK_SET) == -1) blob - d8950f01de74482510174dd49af87a8d3f8b13f6 blob + e9088e91a37fcda61b2cf9cd6487f3abe30adee4 --- lib/diff_internal.h +++ lib/diff_internal.h @@ -73,7 +73,6 @@ struct diff_atom { /* State for the Patience Diff algorithm */ /* TODO: keep a separate array for the patience state */ struct { - bool unique_here; bool unique_in_both; struct diff_atom *pos_in_other; struct diff_atom *prev_stack; blob - cccfafd597fa145d26f94990a65acb0a25287205 blob + e7f728f1729206b8262ad23227da9f0d7d0dff66 --- lib/diff_patience.c +++ lib/diff_patience.c @@ -29,127 +29,121 @@ #include "diff_internal.h" #include "diff_debug.h" -/* Set unique_here = true for all atoms that exist exactly once in this list. */ -static int -diff_atoms_mark_unique(struct diff_data *d, unsigned int *unique_count) +int diff_atoms_qsort_compar(const void *_a, const void *_b) { - struct diff_atom *i; - unsigned int count = 0; - diff_data_foreach_atom(i, d) { - i->patience.unique_here = true; - i->patience.unique_in_both = true; - count++; + const struct diff_atom *a = *(struct diff_atom**)_a; + const struct diff_atom *b = *(struct diff_atom**)_b; + int cmp; + int rc; + + if (a->root->err || b->root->err) { + /* If atoms are from more than one diff_data, make sure the + * error, if any, spreads to all of them. */ + a->root->err = rc; + b->root->err = rc; + return 0; } - diff_data_foreach_atom(i, d) { - struct diff_atom *j; - if (!i->patience.unique_here) - continue; + /* Sort by the simplistic hash */ + if (a->hash < b->hash) + return -1; + if (a->hash > b->hash) + return 1; - diff_data_foreach_atom_from(i + 1, j, d) { - bool same; - int r = diff_atom_same(&same, i, j); - if (r) - return r; - if (!same) - continue; - if (i->patience.unique_here) { - i->patience.unique_here = false; - i->patience.unique_in_both = false; - count--; - } - j->patience.unique_here = false; - j->patience.unique_in_both = false; - count--; - } + /* If hashes are the same, the lines may still differ. Do a full cmp. */ + rc = diff_atom_cmp(&cmp, a, b); + + if (rc) { + /* Mark the I/O error so that the caller can find out about it. + * For the case atoms are from more than one diff_data, mark in + * both. */ + a->root->err = rc; + if (a->root != b->root) + b->root->err = rc; + return 0; } - if (unique_count) - *unique_count = count; - return 0; + + return cmp; } -/* Mark those lines as atom->patience.unique_in_both = true that appear exactly - * once in each side. */ -static int -diff_atoms_mark_unique_in_both(struct diff_data *left, struct diff_data *right, - unsigned int *unique_in_both_count) +/* Sort an array of struct diff_atom* in-place. */ +static int diff_atoms_qsort(struct diff_atom *atoms[], + size_t atoms_count) { - /* Derive the final unique_in_both count without needing an explicit - * iteration. So this is just some optimiziation to save one iteration - * in the end. */ - unsigned int unique_in_both; - int r; + qsort(atoms, atoms_count, sizeof(struct diff_atom*), + diff_atoms_qsort_compar); + return atoms[0]->root->err; +} - r = diff_atoms_mark_unique(left, &unique_in_both); - if (r) - return r; - r = diff_atoms_mark_unique(right, NULL); - if (r) - return r; - - debug("unique_in_both %u\n", unique_in_both); - - struct diff_atom *i; - diff_data_foreach_atom(i, left) { - if (!i->patience.unique_here) - continue; - struct diff_atom *j; - int found_in_b = 0; - diff_data_foreach_atom(j, right) { - bool same; - int r = diff_atom_same(&same, i, j); - if (r) - return r; - if (!same) - continue; - if (!j->patience.unique_here) { - found_in_b = 2; /* or more */ +static int +diff_atoms_mark_unique_in_both_by_qsort(struct diff_data *left, + struct diff_data *right, + unsigned int *unique_in_both_count_p) +{ + struct diff_atom *a; + struct diff_atom *b; + struct diff_atom **all_atoms = + malloc((left->atoms.len + right->atoms.len) + * sizeof(struct diff_atom*)); + unsigned int len = 0; + unsigned int i; + unsigned int unique_in_both_count = 0; + int rc; + left->root->err = 0; + right->root->err = 0; + diff_data_foreach_atom(a, left) { + all_atoms[len++] = a; + } + diff_data_foreach_atom(b, right) { + all_atoms[len++] = b; + } + + rc = diff_atoms_qsort(all_atoms, len); + if (rc) + goto free_and_exit; + + /* Now we have a sorted array of atom pointers. All similar lines are + * adjacent. Walk through the array and mark those that are unique on + * each side, but exist once in both sources. */ + for (i = 0; i < len; i++) { + bool same; + unsigned int j; + unsigned int count_first_side = 1; + unsigned int count_other_side = 0; + a = all_atoms[i]; + + for (j = i+1; j < len; j++) { + b = all_atoms[j]; + rc = diff_atom_same(&same, a, b); + if (rc) + goto free_and_exit; + if (!same) break; - } else { - found_in_b = 1; - j->patience.pos_in_other = i; - i->patience.pos_in_other = j; - } + /* A following atom is the same. See on which side the + * repetition counts. */ + if (a->root == b->root) + count_first_side ++; + else + count_other_side ++; } - if (found_in_b == 0 || found_in_b > 1) { - i->patience.unique_in_both = false; - unique_in_both--; - debug("unique_in_both %u (%d) ", unique_in_both, - found_in_b); - debug_dump_atom(left, NULL, i); + /* Counted a section of similar atoms, put the results back to + * the atoms. */ + if ((count_first_side == 1) + && (count_other_side == 1)) { + b = all_atoms[i+1]; + a->patience.unique_in_both = true; + a->patience.pos_in_other = b; + b->patience.unique_in_both = true; + b->patience.pos_in_other = a; + unique_in_both_count++; } } - - /* Still need to unmark right[*]->patience.unique_in_both for atoms that - * don't exist in left */ - diff_data_foreach_atom(i, right) { - if (!i->patience.unique_here - || !i->patience.unique_in_both) - continue; - struct diff_atom *j; - bool found_in_a = false; - diff_data_foreach_atom(j, left) { - bool same; - int r; - if (!j->patience.unique_in_both) - continue; - r = diff_atom_same(&same, i, j); - if (r) - return r; - if (!same) - continue; - found_in_a = true; - break; - } - - if (!found_in_a) - i->patience.unique_in_both = false; - } - - if (unique_in_both_count) - *unique_in_both_count = unique_in_both; - return 0; + *unique_in_both_count_p = unique_in_both_count; + rc = 0; +free_and_exit: + free(all_atoms); + return rc; } static int @@ -288,7 +282,8 @@ diff_algo_patience(const struct diff_algo_config *algo /* Find those lines that appear exactly once in 'left' and exactly once * in 'right'. */ - rc = diff_atoms_mark_unique_in_both(left, right, &unique_in_both_count); + rc = diff_atoms_mark_unique_in_both_by_qsort(left, right, + &unique_in_both_count); if (rc) return rc;