commit 3b0f3d6191103b52a0619ed00752f7f5e6fa754c from: Neels Hofmeyr date: Wed Jan 22 03:02:58 2020 UTC initial commit commit - /dev/null commit + 3b0f3d6191103b52a0619ed00752f7f5e6fa754c blob - /dev/null blob + 4ce1b320e06779fbf641c890bdb8a0d6dcea3ed9 (mode 644) --- /dev/null +++ .gitignore @@ -0,0 +1,8 @@ +.*.sw? +diff/diff +*.o +*.d +*.a +tags +test/got*.diff +test/verify.* blob - /dev/null blob + d908088b90fc58865c6f9352047c7d52f37c3a48 (mode 644) --- /dev/null +++ LICENCE @@ -0,0 +1,13 @@ + Copyright (c) 2020 Neels Hofmeyr + + Permission to use, copy, modify, and distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. blob - /dev/null blob + 2ff357ceee0eed252b191258de3fea36fa4ae92b (mode 644) --- /dev/null +++ README @@ -0,0 +1,29 @@ +This is a collection of diff algorithms, to test various combinations. + +The initial aim was to provide a faster diff implementation for got +(gameoftrees.org) with a BSD license, at the u2k20 OpenBSD hackathon. +A side effect could be improving OpenBSD's /usr/bin/diff utility. + +At the time of writing, this is little more than a playground / benchmark basis +/ diff algorithm analysis platform. What could be done: +- add profiling and test series to rate diff algorithm combinations. +- interface with / merge into got. + +The Myers and Patience Diff algorithm implementations found here are based on +the explanations found in these blog post series: + https://blog.jcoglan.com/2017/02/12/the-myers-diff-algorithm-part-1/ ff. +and + https://blog.jcoglan.com/2017/09/19/the-patience-diff-algorithm/ ff. +-- possibly the single most comprehensive explanations of these algorithms. +Many thanks for this valuable door opener! +The source code itself is not based on the code found in those blogs, but +written from scratch with the knowledge gained. + +Compile: + OpenBSD: + make -C diff + Linux: + make -f linux_Makefile -C diff + +Test: + make -C test/ blob - /dev/null blob + b93701332fb1013c6f09d620e138075dc268925c (mode 644) --- /dev/null +++ diff/Makefile @@ -0,0 +1,38 @@ +.PATH:${.CURDIR}/../lib + +.include "../diff-version.mk" + +PROG= diff +SRCS= \ + diff.c \ + diff_atomize_text.c \ + diff_main.c \ + diff_myers.c \ + diff_patience.c \ + diff_output.c \ + diff_output_plain.c \ + diff_output_unidiff.c \ + ${END} +MAN = ${PROG}.1 + +CPPFLAGS = -I${.CURDIR}/../include -I${.CURDIR}/../lib + +.if defined(PROFILE) +LDADD = -lutil_p -lz_p -lc_p +.else +LDADD = -lutil -lz +.endif + +.if ${DIFF_RELEASE} != "Yes" +NOMAN = Yes +.endif + +realinstall: + ${INSTALL} ${INSTALL_COPY} -o ${BINOWN} -g ${BINGRP} \ + -m ${BINMODE} ${PROG} ${BINDIR}/${PROG} + +dist: + mkdir ../diff-${DIFF_VERSION}/diff + cp ${SRCS} ${MAN} ../diff-${DIFF_VERSION}/diff + +.include blob - /dev/null blob + be68fea20a6bbf1f0e109ccd5ea2042919055f31 (mode 644) --- /dev/null +++ diff/diff.c @@ -0,0 +1,143 @@ +/* Commandline diff utility to test diff implementations. */ +/* + * Copyright (c) 2018 Martin Pieuchot + * Copyright (c) 2020 Neels Hofmeyr + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include + +#include +#include +#include +#include +#include +#include + +#ifdef __linux__ +/* stupid shims to compile and test on linux */ +#define __dead + +static const char *getprogname() +{ + return "diff"; +} +#endif + +__dead void usage(void); +int diffreg(char *, char *, int); +char *mmapfile(const char *, struct stat *); + +__dead void +usage(void) +{ + fprintf(stderr, "usage: %s file1 file2\n", getprogname()); + exit(1); +} + +int +main(int argc, char *argv[]) +{ + int ch; + + while ((ch = getopt(argc, argv, "")) != -1) { + switch (ch) { + default: + usage(); + } + } + + argc -= optind; + argv += optind; + + if (argc != 2) + usage(); + + return diffreg(argv[0], argv[1], 0); +} + +#include +#include + +const struct diff_algo_config myers, patience, myers_divide; + +const struct diff_algo_config myers = (struct diff_algo_config){ + .impl = diff_algo_myers, + .permitted_state_size = 104 * sizeof(int), + .fallback_algo = &patience, +}; + +const struct diff_algo_config patience = (struct diff_algo_config){ + .impl = diff_algo_patience, + .inner_algo = &patience, // After subdivision, do Patience again. + .fallback_algo = &myers_divide, // If subdivision failed, do Myers Divide et Impera. +}; + +const struct diff_algo_config myers_divide = (struct diff_algo_config){ + .impl = diff_algo_myers_divide, + .inner_algo = &myers, // When division succeeded, start from the top. + // (fallback_algo = NULL implies diff_algo_none). +}; + +const struct diff_config diff_config = { + .atomize_func = diff_atomize_text_by_line, + .algo = &myers, +}; + +int +diffreg(char *file1, char *file2, int flags) +{ + char *str1, *str2; + struct stat st1, st2; + struct diff_input_info info = { + .left_path = file1, + .right_path = file2, + }; + + str1 = mmapfile(file1, &st1); + str2 = mmapfile(file2, &st2); + + diff_plain(stdout, &diff_config, &info, str1, st1.st_size, str2, st2.st_size); + + munmap(str1, st1.st_size); + munmap(str2, st2.st_size); + + return 0; +} + +char * +mmapfile(const char *path, struct stat *st) +{ + int fd; + char *p; + + fd = open(path, O_RDONLY); + if (fd == -1) + err(2, "%s", path); + + if (fstat(fd, st) == -1) + err(2, "%s", path); + + if ((uintmax_t)st->st_size > SIZE_MAX) + errx(2, "%s: file too big to fit memory", path); + + p = mmap(NULL, st->st_size, PROT_READ, MAP_PRIVATE, fd, 0); + if (p == MAP_FAILED) + err(2, "mmap"); + + close(fd); + + return p; +} blob - /dev/null blob + 62c7103d70ac32f292cc05f62c782305dd8798f6 (mode 644) --- /dev/null +++ diff/linux_Makefile @@ -0,0 +1,13 @@ +CFLAGS = -fsanitize=address -fsanitize=undefined -g -O3 + +diff: diff.c ../lib/libdiff.a + gcc $(CFLAGS) -I../include -o $@ $^ + + +../lib/libdiff.a: ../lib/*.[hc] ../include/diff/*.h + $(MAKE) -f linux_Makefile -C ../lib + +.PHONY: clean +clean: + rm diff + $(MAKE) -f linux_Makefile -C ../lib clean blob - /dev/null blob + 1d14cd7ecfcc0a39e1935164b1480f4dffbf0b28 (mode 644) --- /dev/null +++ diff-version.mk @@ -0,0 +1,8 @@ +DIFF_RELEASE=No +DIFF_VERSION_NUMBER=0.1 + +.if ${DIFF_RELEASE} == Yes +DIFF_VERSION=${DIFF_VERSION_NUMBER} +.else +DIFF_VERSION=${DIFF_VERSION_NUMBER}-current +.endif blob - /dev/null blob + 395468a8c29ce7c6d5ca5283a1a1aa247acb3d87 (mode 644) --- /dev/null +++ include/diff/arraylist.h @@ -0,0 +1,96 @@ +/* Auto-reallocating array for arbitrary member types. */ +/* + * Copyright (c) 2020 Neels Hofmeyr + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#pragma once + +#ifdef __linux__ +/* stupid shims to compile and test on linux */ +#include +static inline void *reallocarray(void *buf, size_t n, size_t member_size) +{ + return realloc(buf, n * member_size); +} +static inline void *recallocarray(void *buf, size_t oldn, size_t n, size_t member_size) +{ + buf = reallocarray(buf, n, member_size); + bzero(((char*)buf) + (oldn * member_size), (n - oldn) * member_size); + return buf; +} +#endif + + +/* Usage: + * + * ARRAYLIST(any_type_t) list; + * // OR + * typedef ARRAYLIST(any_type_t) any_type_list_t; + * any_type_list_t list; + * + * ARRAYLIST_INIT(list, 128); // < number of (at first unused) members to add on each realloc + * any_type_t *x; + * while (bar) { + * ARRAYLIST_ADD(x, list); // < enlarges the allocated array as needed; list.head may change due to realloc + * if (!x) + * abort(); + * *x = random_foo_value; + * } + * for (i = 0; i < list.len; i++) + * printf("%s", foo_to_str(list.head[i])); + * ARRAYLIST_FREE(list); + */ +#define ARRAYLIST(MEMBER_TYPE) \ + struct { \ + MEMBER_TYPE *head; \ + unsigned int len; \ + unsigned int allocated; \ + unsigned int alloc_blocksize; \ + } + +#define ARRAYLIST_INIT(ARRAY_LIST, ALLOC_BLOCKSIZE) do { \ + (ARRAY_LIST).head = NULL; \ + (ARRAY_LIST).len = 0; \ + (ARRAY_LIST).allocated = 0; \ + (ARRAY_LIST).alloc_blocksize = ALLOC_BLOCKSIZE; \ + } while(0) + +#define ARRAYLIST_ADD(NEW_ITEM_P, ARRAY_LIST) do { \ + if ((ARRAY_LIST).len && !(ARRAY_LIST).allocated) { \ + NEW_ITEM_P = NULL; \ + break; \ + } \ + if ((ARRAY_LIST).head == NULL || (ARRAY_LIST).allocated < (ARRAY_LIST).len + 1) { \ + (ARRAY_LIST).allocated += (ARRAY_LIST).alloc_blocksize ? : 8; \ + (ARRAY_LIST).head = recallocarray((ARRAY_LIST).head, (ARRAY_LIST).len, \ + (ARRAY_LIST).allocated, sizeof(*(ARRAY_LIST).head)); \ + }; \ + if (!(ARRAY_LIST).head || (ARRAY_LIST).allocated < (ARRAY_LIST).len + 1) { \ + NEW_ITEM_P = NULL; \ + break; \ + } \ + (NEW_ITEM_P) = &(ARRAY_LIST).head[(ARRAY_LIST).len]; \ + (ARRAY_LIST).len++; \ + } while (0) + +#define ARRAYLIST_CLEAR(ARRAY_LIST) \ + (ARRAY_LIST).len = 0 + +#define ARRAYLIST_FREE(ARRAY_LIST) \ + do { \ + if ((ARRAY_LIST).head && (ARRAY_LIST).allocated) \ + free((ARRAY_LIST).head); \ + ARRAYLIST_INIT(ARRAY_LIST, (ARRAY_LIST).alloc_blocksize); \ + } while(0) blob - /dev/null blob + 073e0a21c0a312a7c09f13092e23da0decdec43a (mode 644) --- /dev/null +++ include/diff/diff_main.h @@ -0,0 +1,249 @@ +/* Generic infrastructure to implement various diff algorithms. */ +/* + * Copyright (c) 2020 Neels Hofmeyr + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#pragma once + +#include +#include +#include +#include +#include + +#include + +/* List of all possible return codes of a diff invocation. */ +enum diff_rc { + DIFF_RC_USE_DIFF_ALGO_FALLBACK = -1, + DIFF_RC_OK = 0, + DIFF_RC_ENOTSUP = ENOTSUP, + DIFF_RC_ENOMEM = ENOMEM, + DIFF_RC_EINVAL = EINVAL, +}; + +struct diff_atom { + const uint8_t *at; + size_t len; + + /* This hash is just a very cheap speed up for finding *mismatching* atoms. When hashes match, we still need to + * compare entire atoms to find out whether they are indeed identical or not. */ + unsigned int hash; + + /* State for the Patience Diff algorithm */ + /* TODO: keep a separate array for the patience state */ + struct { + bool unique_here; + bool unique_in_both; + struct diff_atom *pos_in_other; + struct diff_atom *prev_stack; + } patience; +}; + +static inline bool diff_atom_same(const struct diff_atom *left, const struct diff_atom *right) +{ + return left->hash == right->hash + && left->len == right->len + && memcmp(left->at, right->at, left->len) == 0; +} + +/* For each file, there is a "root" struct diff_data referencing the entire file, which the atoms are parsed from. In + * recursion of diff algorithm, there may be "child" struct diff_data only referencing a subsection of the file, + * re-using the atoms parsing. For "root" structs, atoms_allocated will be nonzero, indicating that the array of atoms + * is owned by that struct. For "child" structs, atoms_allocated == 0, to indicate that the struct is referencing a + * subset of atoms. */ +struct diff_data { + const uint8_t *data; + size_t len; + ARRAYLIST(struct diff_atom) atoms; + struct diff_data *root; +}; + +void diff_data_free(struct diff_data *diff_data); + +/* The atom's index in the entire file. For atoms divided by lines of text, this yields the line number (starting with + * 0). Also works for diff_data that reference only a subsection of a file, always reflecting the global position in + * the file (and not the relative position within the subsection). */ +#define diff_atom_root_idx(DIFF_DATA, ATOM) \ + ((ATOM) ? (ATOM) - ((DIFF_DATA)->root->atoms.head) : (DIFF_DATA)->root->atoms.len) + +/* The atom's index within DIFF_DATA. For atoms divided by lines of text, this yields the line number (starting with + * 0). */ +#define diff_atom_idx(DIFF_DATA, ATOM) \ + ((ATOM) ? (ATOM) - ((DIFF_DATA)->atoms.head) : (DIFF_DATA)->atoms.len) + +#define foreach_diff_atom(ATOM, FIRST_ATOM, COUNT) \ + for ((ATOM) = (FIRST_ATOM); \ + (ATOM) && ((ATOM) >= (FIRST_ATOM)) && ((ATOM) - (FIRST_ATOM) < (COUNT)); \ + (ATOM)++) + +#define diff_data_foreach_atom(ATOM, DIFF_DATA) \ + foreach_diff_atom(ATOM, (DIFF_DATA)->atoms.head, (DIFF_DATA)->atoms.len) + +#define diff_data_foreach_atom_from(FROM, ATOM, DIFF_DATA) \ + for ((ATOM) = (FROM); \ + (ATOM) && ((ATOM) >= (DIFF_DATA)->atoms.head) && ((ATOM) - (DIFF_DATA)->atoms.head < (DIFF_DATA)->atoms.len); \ + (ATOM)++) + +/* A diff chunk represents a set of atoms on the left and/or a set of atoms on the right. + * + * If solved == false: + * The diff algorithm has divided the source file, and this is a chunk that the inner_algo should run on next. + * The lines on the left should be diffed against the lines on the right. + * (If there are no left lines or no right lines, it implies solved == true, because there is nothing to diff.) + * + * If solved == true: + * If there are only left atoms, it is a chunk removing atoms from the left ("a minus chunk"). + * If there are only right atoms, it is a chunk adding atoms from the right ("a plus chunk"). + * If there are both left and right lines, it is a chunk of equal content on both sides, + * and left_count == right_count: + * + * - foo } + * - bar }-- diff_chunk{ left_start = &left.atoms.head[0], left_count = 3, + * - baz } right_start = NULL, right_count = 0 } + * moo } + * goo }-- diff_chunk{ left_start = &left.atoms.head[3], left_count = 3, + * zoo } right_start = &right.atoms.head[0], right_count = 3 } + * +loo } + * +roo }-- diff_chunk{ left_start = NULL, left_count = 0, + * +too } right_start = &right.atoms.head[3], right_count = 3 } + * + */ +struct diff_chunk { + bool solved; + struct diff_atom *left_start; + unsigned int left_count; + struct diff_atom *right_start; + unsigned int right_count; +}; + +typedef ARRAYLIST(struct diff_chunk) diff_chunk_arraylist_t; +#define DIFF_RESULT_ALLOC_BLOCKSIZE 128 + +struct diff_result { + enum diff_rc rc; + struct diff_data left; + struct diff_data right; + diff_chunk_arraylist_t chunks; +}; + +struct diff_state { + /* The final result passed to the original diff caller. */ + struct diff_result *result; + + /* The root diff_data is in result->left,right, these are (possibly) subsections of the root data. */ + struct diff_data left; + struct diff_data right; + + unsigned int recursion_depth_left; + + /* Remaining chunks from one diff algorithm pass, if any solved == false chunks came up. */ + diff_chunk_arraylist_t temp_result; +}; + +struct diff_chunk *diff_state_add_chunk(struct diff_state *state, bool solved, + struct diff_atom *left_start, unsigned int left_count, + struct diff_atom *right_start, unsigned int right_count); + +/* Signature of a utility function to divide both source files into diff atoms. + * It is possible that a (future) algorithm requires both source files to decide on atom split points, hence this gets + * both left and right to atomize at the same time. + * An example is diff_atomize_text_by_line() in diff_atomize_text.c. + * + * func_data: context pointer (free to be used by implementation). + * left: struct diff_data with left->data and left->len already set up, and left->atoms to be created. + * right: struct diff_data with right->data and right->len already set up, and right->atoms to be created. + */ +typedef enum diff_rc (*diff_atomize_func_t)(void *func_data, struct diff_data *left, struct diff_data *right); + +extern enum diff_rc diff_atomize_text_by_line(void *func_data, struct diff_data *left, struct diff_data *right); + +struct diff_algo_config; +typedef enum diff_rc (*diff_algo_impl_t)(const struct diff_algo_config *algo_config, struct diff_state *state); + +/* Form a result with all left-side removed and all right-side added, i.e. no actual diff algorithm involved. */ +enum diff_rc diff_algo_none(const struct diff_algo_config *algo_config, struct diff_state *state); + +/* Myers Diff tracing from the start all the way through to the end, requiring quadratic amounts of memory. This can + * fail if the required space surpasses algo_config->permitted_state_size. */ +extern enum diff_rc diff_algo_myers(const struct diff_algo_config *algo_config, struct diff_state *state); + +/* Myers "Divide et Impera": tracing forwards from the start and backwards from the end to find a midpoint that divides + * the problem into smaller chunks. Requires only linear amounts of memory. */ +extern enum diff_rc diff_algo_myers_divide(const struct diff_algo_config *algo_config, struct diff_state *state); + +/* Patience Diff algorithm, which divides a larger diff into smaller chunks. For very specific scenarios, it may lead to + * a complete diff result by itself, but needs a fallback algo to solve chunks that don't have common-unique atoms. */ +extern enum diff_rc diff_algo_patience(const struct diff_algo_config *algo_config, struct diff_state *state); + +/* Diff algorithms to use, possibly nested. For example: + * + * struct diff_algo_config myers, patience, myers_divide; + * + * myers = (struct diff_algo_config){ + * .impl = diff_algo_myers, + * .permitted_state_size = 32 * 1024 * 1024, + * .fallback_algo = &patience, // when too large, do diff_algo_patience + * }; + * + * patience = (struct diff_algo_config){ + * .impl = diff_algo_patience, + * .inner_algo = &patience, // After subdivision, do Patience again. + * .fallback_algo = &myers_divide, // If subdivision failed, do Myers Divide et Impera. + * }; + * + * myers_divide = (struct diff_algo_config){ + * .impl = diff_algo_myers_divide, + * .inner_algo = &myers, // When division succeeded, start from the top. + * // (fallback_algo = NULL implies diff_algo_none). + * }; + * + * struct diff_config config = { + * .algo = &myers, + * ... + * }; + * diff_main(&config, ...); + */ +struct diff_algo_config { + diff_algo_impl_t impl; + + /* Fail this algo if it would use more than this amount of memory, and instead use fallback_algo + * (diff_algo_myers). permitted_state_size == 0 means no limitation. */ + size_t permitted_state_size; + + /* For algorithms that divide into smaller chunks, use this algorithm to solve the divided chunks. */ + const struct diff_algo_config *inner_algo; + + /* If the algorithm fails (e.g. diff_algo_myers_if_small needs too large state, or diff_algo_patience can't find + * any common-unique atoms), then use this algorithm instead. */ + const struct diff_algo_config *fallback_algo; +}; + +struct diff_config { + diff_atomize_func_t atomize_func; + void *atomize_func_data; + + const struct diff_algo_config *algo; + + /* How deep to step into subdivisions of a source file, a paranoia / safety measure to guard against infinite + * loops through diff algorithms. When the maximum recursion is reached, employ diff_algo_none (i.e. remove all + * left atoms and add all right atoms). */ + unsigned int max_recursion_depth; +}; + +struct diff_result *diff_main(const struct diff_config *config, + const uint8_t *left_data, size_t left_len, + const uint8_t *right_data, size_t right_len); +void diff_result_free(struct diff_result *result); blob - /dev/null blob + 5d07e74d868098aab6819c41c9499f0f1b30040c (mode 644) --- /dev/null +++ include/diff/diff_output.h @@ -0,0 +1,43 @@ +/* Diff output generators and invocation shims. */ +/* + * Copyright (c) 2020 Neels Hofmeyr + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#pragma once + +#include +#include + +struct diff_input_info { + const char *arbitrary_info; + const char *left_path; + const char *right_path; +}; + +enum diff_rc diff_output_plain(FILE *dest, const struct diff_input_info *info, + const struct diff_result *result); +enum diff_rc diff_plain(FILE *dest, const struct diff_config *diff_config, + const struct diff_input_info *info, + const char *left, int left_len, const char *right, int right_len); + +enum diff_rc diff_output_unidiff(FILE *dest, const struct diff_input_info *info, + const struct diff_result *result, unsigned int context_lines); +enum diff_rc diff_unidiff(FILE *dest, const struct diff_config *diff_config, + const struct diff_input_info *info, + const char *left, int left_len, const char *right, int right_len, + unsigned int context_lines); + +enum diff_rc diff_output_info(FILE *dest, const struct diff_input_info *info); +void diff_output_lines(FILE *dest, const char *prefix, struct diff_atom *start_atom, unsigned int count); blob - /dev/null blob + 72a597bf05c06f25fd94c1421712e178888a0ca3 (mode 644) --- /dev/null +++ lib/debug.h @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2020 Neels Hofmeyr + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#pragma once + +#define DEBUG 0 + +#if DEBUG +#include +#define print(args...) fprintf(stderr, ##args) +#define debug print +#define debug_dump dump +#define debug_dump_atom dump_atom +#define debug_dump_atoms dump_atoms +#define debug_dump_myers_graph dump_myers_graph + +static inline void dump_atom(const struct diff_data *left, const struct diff_data *right, const struct diff_atom *atom) +{ + if (!atom) { + print("NULL atom\n"); + return; + } + if (left) + print(" %3ld", diff_atom_root_idx(left, atom)); + if (right && atom->patience.pos_in_other) + print(" %3ld", diff_atom_root_idx(right, atom->patience.pos_in_other)); + + print(" %s%s '", atom->patience.unique_here ? "u" : " ", atom->patience.unique_in_both ? "c" : " "); + const char *s; + for (s = atom->at; s < (const char*)(atom->at + atom->len); s++) { + if (*s == '\r') + print("\\r"); + else if (*s == '\n') + print("\\n"); + else if (*s < 32 || *s >= 127) + print("\\x%02x", *s); + else + print("%c", *s); + } + print("'\n"); +} + +static inline void dump_atoms(const struct diff_data *d, struct diff_atom *atom, unsigned int count) +{ + struct diff_atom *i; + foreach_diff_atom(i, atom, count) { + dump_atom(d, NULL, i); + } +} + +static inline void dump(struct diff_data *d) +{ + dump_atoms(d, d->atoms.head, d->atoms.len); +} + +static inline void dump_myers_graph(const struct diff_data *l, const struct diff_data *r, + int *kd) +{ + int x; + int y; + print(" "); + for (x = 0; x <= l->atoms.len; x++) + print("%2d", x); + print("\n"); + + for (y = 0; y <= r->atoms.len; y++) { + print("%2d ", y); + for (x = 0; x <= l->atoms.len; x++) { + + /* print d advancements from kd, if any. */ + int d = -1; + if (kd) { + int max = l->atoms.len + r->atoms.len; + size_t kd_len = max + 1 + max; + int *kd_pos = kd; + int di; +#define xk_to_y(X, K) ((X) - (K)) + for (di = 0; di < max; di++) { + int ki; + for (ki = di; ki >= -di; ki -= 2) { + if (x == kd_pos[ki] + && y == xk_to_y(x, ki)) { + d = di; + break; + } + } + if (d >= 0) + break; + kd_pos += kd_len; + } + } + if (d >= 0) + print("%d", d); + else + print("o"); + if (x < l->atoms.len && d < 10) + print("-"); + } + print("\n"); + if (y == r->atoms.len) + break; + + print(" "); + for (x = 0; x < l->atoms.len; x++) { + if (diff_atom_same(&l->atoms.head[x], &r->atoms.head[y])) + print("|\\"); + else + print("| "); + } + print("|\n"); + } +} + +#else +#define debug(args...) +#define debug_dump(args...) +#define debug_dump_atom(args...) +#define debug_dump_atoms(args...) +#define debug_dump_myers_graph(args...) +#endif blob - /dev/null blob + 964c1353da1b055ae761ae6961c45ea19e16d032 (mode 644) --- /dev/null +++ lib/diff_atomize_text.c @@ -0,0 +1,75 @@ +/* Split source by line breaks, and calculate a simplistic checksum. */ +/* + * Copyright (c) 2020 Neels Hofmeyr + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include + +static int diff_data_atomize_text_lines(struct diff_data *d) +{ + const uint8_t *pos = d->data; + const uint8_t *end = pos + d->len; + enum diff_rc rc; + + unsigned int array_size_estimate = d->len / 50; + unsigned int pow2 = 1; + while (array_size_estimate >>= 1) + pow2++; + + ARRAYLIST_INIT(d->atoms, 1 << pow2); + + while (pos < end) { + const uint8_t *line_end = pos; + unsigned int hash = 0; + + while (line_end < end && *line_end != '\r' && *line_end != '\n') { + hash = hash * 23 + *line_end; + line_end++; + } + + /* When not at the end of data, the line ending char ('\r' or '\n') must follow */ + if (line_end < end) + line_end++; + /* If that was an '\r', also pull in any following '\n' */ + if (line_end[0] == '\r' && line_end < end && line_end[1] == '\n') + line_end++; + + /* Record the found line as diff atom */ + struct diff_atom *atom; + ARRAYLIST_ADD(atom, d->atoms); + if (!atom) + return DIFF_RC_ENOMEM; + + *atom = (struct diff_atom){ + .at = pos, + .len = line_end - pos, + .hash = hash, + }; + + /* Starting point for next line: */ + pos = line_end; + } + + return DIFF_RC_OK; +} + +enum diff_rc diff_atomize_text_by_line(void *func_data, struct diff_data *left, struct diff_data *right) +{ + enum diff_rc rc; + rc = diff_data_atomize_text_lines(left); + if (rc != DIFF_RC_OK) + return rc; + return diff_data_atomize_text_lines(right); +} blob - /dev/null blob + 285bd40d5941fe84c5d9a9bd437a4a9f981ead13 (mode 644) --- /dev/null +++ lib/diff_main.c @@ -0,0 +1,246 @@ +/* Generic infrastructure to implement various diff algorithms (implementation). */ +/* + * Copyright (c) 2020 Neels Hofmeyr + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include "debug.h" + +/* Even if a left or right side is empty, diff output may need to know the position in that file. + * So left_start or right_start must never be NULL -- pass left_count or right_count as zero to indicate staying at that + * position without consuming any lines. */ +struct diff_chunk *diff_state_add_chunk(struct diff_state *state, bool solved, + struct diff_atom *left_start, unsigned int left_count, + struct diff_atom *right_start, unsigned int right_count) +{ + struct diff_chunk *chunk; + diff_chunk_arraylist_t *result; + + if (solved && !state->temp_result.len) + result = &state->result->chunks; + else + result = &state->temp_result; + + ARRAYLIST_ADD(chunk, *result); + if (!chunk) + return NULL; + *chunk = (struct diff_chunk){ + .solved = solved, + .left_start = left_start, + .left_count = left_count, + .right_start = right_start, + .right_count = right_count, + }; + + debug("Add %s chunk:\n", chunk->solved ? "solved" : "UNSOLVED"); + debug("L\n"); + debug_dump_atoms(&state->left, chunk->left_start, chunk->left_count); + debug("R\n"); + debug_dump_atoms(&state->right, chunk->right_start, chunk->right_count); + return chunk; +} + +void diff_data_init_root(struct diff_data *d, const uint8_t *data, size_t len) +{ + *d = (struct diff_data){ + .data = data, + .len = len, + .root = d, + }; +} + +void diff_data_init_subsection(struct diff_data *d, struct diff_data *parent, + struct diff_atom *from_atom, unsigned int atoms_count) +{ + struct diff_atom *last_atom = from_atom + atoms_count - 1; + *d = (struct diff_data){ + .data = from_atom->at, + .len = (last_atom->at + last_atom->len) - from_atom->at, + .root = parent->root, + .atoms.head = from_atom, + .atoms.len = atoms_count, + }; + + debug("subsection:\n"); + debug_dump(d); +} + +void diff_data_free(struct diff_data *diff_data) +{ + if (!diff_data) + return; + if (diff_data->atoms.allocated) + ARRAYLIST_FREE(diff_data->atoms); +} + +enum diff_rc diff_algo_none(const struct diff_algo_config *algo_config, struct diff_state *state) +{ + debug("\n** %s\n", __func__); + debug("left:\n"); + debug_dump(&state->left); + debug("right:\n"); + debug_dump(&state->right); + debug_dump_myers_graph(&state->left, &state->right, NULL); + + /* Add a chunk of equal lines, if any */ + unsigned int equal_atoms = 0; + while (equal_atoms < state->left.atoms.len && equal_atoms < state->right.atoms.len + && diff_atom_same(&state->left.atoms.head[equal_atoms], &state->right.atoms.head[equal_atoms])) + equal_atoms++; + if (equal_atoms) { + if (!diff_state_add_chunk(state, true, + &state->left.atoms.head[0], equal_atoms, + &state->right.atoms.head[0], equal_atoms)) + return DIFF_RC_ENOMEM; + } + + /* Add a "minus" chunk with all lines from the left. */ + if (equal_atoms < state->left.atoms.len) { + if (!diff_state_add_chunk(state, true, + &state->left.atoms.head[equal_atoms], + state->left.atoms.len - equal_atoms, + NULL, 0)) + return DIFF_RC_ENOMEM; + } + + /* Add a "plus" chunk with all lines from the right. */ + if (equal_atoms < state->right.atoms.len) { + if (!diff_state_add_chunk(state, true, + NULL, 0, + &state->right.atoms.head[equal_atoms], + state->right.atoms.len - equal_atoms)) + return DIFF_RC_ENOMEM; + } + return DIFF_RC_OK; +} + +enum diff_rc diff_run_algo(const struct diff_algo_config *algo_config, struct diff_state *state) +{ + enum diff_rc rc; + ARRAYLIST_FREE(state->temp_result); + + if (!algo_config || !algo_config->impl || !state->recursion_depth_left) { + debug("MAX RECURSION REACHED, just dumping diff chunks\n"); + return diff_algo_none(algo_config, state); + } + + ARRAYLIST_INIT(state->temp_result, DIFF_RESULT_ALLOC_BLOCKSIZE); + rc = algo_config->impl(algo_config, state); + switch (rc) { + case DIFF_RC_USE_DIFF_ALGO_FALLBACK: + debug("Got DIFF_RC_USE_DIFF_ALGO_FALLBACK (%p)\n", algo_config->fallback_algo); + rc = diff_run_algo(algo_config->fallback_algo, state); + goto return_rc; + + case DIFF_RC_OK: + /* continue below */ + break; + + default: + /* some error happened */ + goto return_rc; + } + + /* Pick up any diff chunks that are still unsolved and feed to inner_algo. + * inner_algo will solve unsolved chunks and append to result, + * and subsequent solved chunks on this level are then appended to result afterwards. */ + int i; + for (i = 0; i < state->temp_result.len; i++) { + struct diff_chunk *c = &state->temp_result.head[i]; + if (c->solved) { + struct diff_chunk *final_c; + ARRAYLIST_ADD(final_c, state->result->chunks); + if (!final_c) { + rc = DIFF_RC_ENOMEM; + goto return_rc; + } + *final_c = *c; + continue; + } + + /* c is an unsolved chunk, feed to inner_algo */ + struct diff_state inner_state = { + .result = state->result, + .recursion_depth_left = state->recursion_depth_left - 1, + }; + diff_data_init_subsection(&inner_state.left, &state->left, c->left_start, c->left_count); + diff_data_init_subsection(&inner_state.right, &state->right, c->right_start, c->right_count); + + rc = diff_run_algo(algo_config->inner_algo, &inner_state); + + if (rc != DIFF_RC_OK) + goto return_rc; + } + + rc = DIFF_RC_OK; +return_rc: + ARRAYLIST_FREE(state->temp_result); + return rc; +} + +struct diff_result *diff_main(const struct diff_config *config, + const uint8_t *left_data, size_t left_len, + const uint8_t *right_data, size_t right_len) +{ + struct diff_result *result = malloc(sizeof(struct diff_result)); + if (!result) + return NULL; + + *result = (struct diff_result){}; + diff_data_init_root(&result->left, left_data, left_len); + diff_data_init_root(&result->right, right_data, right_len); + + if (!config->atomize_func) { + result->rc = DIFF_RC_EINVAL; + return result; + } + + result->rc = config->atomize_func(config->atomize_func_data, &result->left, &result->right); + if (result->rc != DIFF_RC_OK) + return result; + + struct diff_state state = { + .result = result, + .recursion_depth_left = config->max_recursion_depth ? : 1024, + }; + diff_data_init_subsection(&state.left, &result->left, result->left.atoms.head, result->left.atoms.len); + diff_data_init_subsection(&state.right, &result->right, result->right.atoms.head, result->right.atoms.len); + + result->rc = diff_run_algo(config->algo, &state); + + return result; +} + +void diff_result_free(struct diff_result *result) +{ + if (!result) + return; + diff_data_free(&result->left); + diff_data_free(&result->right); + ARRAYLIST_FREE(result->chunks); + free(result); +} blob - /dev/null blob + cba927a4a5813ed3a1afa1247227a9637271e176 (mode 644) --- /dev/null +++ lib/diff_myers.c @@ -0,0 +1,1043 @@ +/* Myers diff algorithm implementation, invented by Eugene W. Myers [1]. + * Implementations of both the Myers Divide Et Impera (using linear space) + * and the canonical Myers algorithm (using quadratic space). */ +/* + * Copyright (c) 2020 Neels Hofmeyr + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include + +#include "debug.h" + +/* Myers' diff algorithm [1] is nicely explained in [2]. + * [1] http://www.xmailserver.org/diff2.pdf + * [2] https://blog.jcoglan.com/2017/02/12/the-myers-diff-algorithm-part-1/ ff. + * + * Myers approaches finding the smallest diff as a graph problem. + * The crux is that the original algorithm requires quadratic amount of memory: + * both sides' lengths added, and that squared. So if we're diffing lines of text, two files with 1000 lines each would + * blow up to a matrix of about 2000 * 2000 ints of state, about 16 Mb of RAM to figure out 2 kb of text. + * The solution is using Myers' "divide and conquer" extension algorithm, which does the original traversal from both + * ends of the files to reach a middle where these "snakes" touch, hence does not need to backtrace the traversal, and + * so gets away with only keeping a single column of that huge state matrix in memory. + * + * Todo: the divide and conquer requires linear *space*, not necessarily linear *time*. It recurses, apparently doing + * multiple Myers passes, and also it apparently favors fragmented diffs in cases where chunks of text were moved to a + * different place. Up to a given count of diff atoms (text lines), it might be desirable to accept the quadratic memory + * usage, get nicer diffs and less re-iteration of the same data? + */ + +struct diff_box { + unsigned int left_start; + unsigned int left_end; + unsigned int right_start; + unsigned int right_end; +}; + +#define diff_box_empty(DIFF_SNAKE) ((DIFF_SNAKE)->left_end == 0) + + +/* If the two contents of a file are A B C D E and X B C Y, + * the Myers diff graph looks like: + * + * k0 k1 + * \ \ + * k-1 0 1 2 3 4 5 + * \ A B C D E + * 0 o-o-o-o-o-o + * X | | | | | | + * 1 o-o-o-o-o-o + * B | |\| | | | + * 2 o-o-o-o-o-o + * C | | |\| | | + * 3 o-o-o-o-o-o + * Y | | | | | |\ + * 4 o-o-o-o-o-o c1 + * \ \ + * c-1 c0 + * + * Moving right means delete an atom from the left-hand-side, + * Moving down means add an atom from the right-hand-side. + * Diagonals indicate identical atoms on both sides, the challenge is to use as many diagonals as possible. + * + * The original Myers algorithm walks all the way from the top left to the bottom right, remembers all steps, and then + * backtraces to find the shortest path. However, that requires keeping the entire graph in memory, which needs + * quadratic space. + * + * Myers adds a variant that uses linear space -- note, not linear time, only linear space: walk forward and backward, + * find a meeting point in the middle, and recurse on the two separate sections. This is called "divide and conquer". + * + * d: the step number, starting with 0, a.k.a. the distance from the starting point. + * k: relative index in the state array for the forward scan, indicating on which diagonal through the diff graph we + * currently are. + * c: relative index in the state array for the backward scan, indicating the diagonal number from the bottom up. + * + * The "divide and conquer" traversal through the Myers graph looks like this: + * + * | d= 0 1 2 3 2 1 0 + * ----+-------------------------------------------- + * k= | c= + * 4 | 3 + * | + * 3 | 3,0 5,2 2 + * | / \ + * 2 | 2,0 5,3 1 + * | / \ + * 1 | 1,0 4,3 >= 4,3 5,4<-- 0 + * | / / \ / + * 0 | -->0,0 3,3 4,4 -1 + * | \ / / + * -1 | 0,1 1,2 3,4 -2 + * | \ / + * -2 | 0,2 -3 + * | \ + * | 0,3 + * | forward-> <-backward + * + * x,y pairs here are the coordinates in the Myers graph: + * x = atom index in left-side source, y = atom index in the right-side source. + * + * Only one forward column and one backward column are kept in mem, each need at most left.len + 1 + right.len items. + * Note that each d step occupies either the even or the odd items of a column: if e.g. the previous column is in the + * odd items, the next column is formed in the even items, without overwriting the previous column's results. + * + * Also note that from the diagonal index k and the x coordinate, the y coordinate can be derived: + * y = x - k + * Hence the state array only needs to keep the x coordinate, i.e. the position in the left-hand file, and the y + * coordinate, i.e. position in the right-hand file, is derived from the index in the state array. + * + * The two traces meet at 4,3, the first step (here found in the forward traversal) where a forward position is on or + * past a backward traced position on the same diagonal. + * + * This divides the problem space into: + * + * 0 1 2 3 4 5 + * A B C D E + * 0 o-o-o-o-o + * X | | | | | + * 1 o-o-o-o-o + * B | |\| | | + * 2 o-o-o-o-o + * C | | |\| | + * 3 o-o-o-o-*-o *: forward and backward meet here + * Y | | + * 4 o-o + * + * Doing the same on each section lead to: + * + * 0 1 2 3 4 5 + * A B C D E + * 0 o-o + * X | | + * 1 o-b b: backward d=1 first reaches here (sliding up the snake) + * B \ f: then forward d=2 reaches here (sliding down the snake) + * 2 o As result, the box from b to f is found to be identical; + * C \ leaving a top box from 0,0 to 1,1 and a bottom trivial tail 3,3 to 4,3. + * 3 f-o + * + * 3 o-* + * Y | + * 4 o *: forward and backward meet here + * + * and solving the last top left box gives: + * + * 0 1 2 3 4 5 + * A B C D E -A + * 0 o-o +X + * X | B + * 1 o C + * B \ -D + * 2 o -E + * C \ +Y + * 3 o-o-o + * Y | + * 4 o + * + */ + +#define xk_to_y(X, K) ((X) - (K)) +#define xc_to_y(X, C, DELTA) ((X) - (C) + (DELTA)) +#define k_to_c(K, DELTA) ((K) + (DELTA)) +#define c_to_k(C, DELTA) ((C) - (DELTA)) + +/* Do one forwards step in the "divide and conquer" graph traversal. + * left: the left side to diff. + * right: the right side to diff against. + * kd_forward: the traversal state for forwards traversal, modified by this function. + * This is carried over between invocations with increasing d. + * kd_forward points at the center of the state array, allowing negative indexes. + * kd_backward: the traversal state for backwards traversal, to find a meeting point. + * Since forwards is done first, kd_backward will be valid for d - 1, not d. + * kd_backward points at the center of the state array, allowing negative indexes. + * d: Step or distance counter, indicating for what value of d the kd_forward should be populated. + * For d == 0, kd_forward[0] is initialized, i.e. the first invocation should be for d == 0. + * meeting_snake: resulting meeting point, if any. + */ +static void diff_divide_myers_forward(struct diff_data *left, struct diff_data *right, + int *kd_forward, int *kd_backward, int d, + struct diff_box *meeting_snake) +{ + int delta = (int)right->atoms.len - (int)left->atoms.len; + int prev_x; + int prev_y; + int k; + int x; + + debug("-- %s d=%d\n", __func__, d); + debug_dump_myers_graph(left, right, NULL); + + for (k = d; k >= -d; k -= 2) { + if (k < -(int)right->atoms.len || k > (int)left->atoms.len) { + /* This diagonal is completely outside of the Myers graph, don't calculate it. */ + if (k < -(int)right->atoms.len) + debug(" %d k < -(int)right->atoms.len %d\n", k, -(int)right->atoms.len); + else + debug(" %d k > left->atoms.len %d\n", k, left->atoms.len); + if (k < 0) { + /* We are traversing negatively, and already below the entire graph, nothing will come + * of this. */ + debug(" break"); + break; + } + debug(" continue"); + continue; + } + debug("- k = %d\n", k); + if (d == 0) { + /* This is the initializing step. There is no prev_k yet, get the initial x from the top left of + * the Myers graph. */ + x = 0; + } + /* Favoring "-" lines first means favoring moving rightwards in the Myers graph. + * For this, all k should derive from k - 1, only the bottom most k derive from k + 1: + * + * | d= 0 1 2 + * ----+---------------- + * k= | + * 2 | 2,0 <-- from prev_k = 2 - 1 = 1 + * | / + * 1 | 1,0 + * | / + * 0 | -->0,0 3,3 + * | \\ / + * -1 | 0,1 <-- bottom most for d=1 from prev_k = -1 + 1 = 0 + * | \\ + * -2 | 0,2 <-- bottom most for d=2 from prev_k = -2 + 1 = -1 + * + * Except when a k + 1 from a previous run already means a further advancement in the graph. + * If k == d, there is no k + 1 and k - 1 is the only option. + * If k < d, use k + 1 in case that yields a larger x. Also use k + 1 if k - 1 is outside the graph. + */ + else if (k > -d && (k == d + || (k - 1 >= -(int)right->atoms.len + && kd_forward[k - 1] >= kd_forward[k + 1]))) { + /* Advance from k - 1. + * From position prev_k, step to the right in the Myers graph: x += 1. + */ + int prev_k = k - 1; + prev_x = kd_forward[prev_k]; + prev_y = xk_to_y(prev_x, prev_k); + x = prev_x + 1; + } else { + /* The bottom most one. + * From position prev_k, step to the bottom in the Myers graph: y += 1. + * Incrementing y is achieved by decrementing k while keeping the same x. + * (since we're deriving y from y = x - k). + */ + int prev_k = k + 1; + prev_x = kd_forward[prev_k]; + prev_y = xk_to_y(prev_x, prev_k); + x = prev_x; + } + + /* Slide down any snake that we might find here. */ + while (x < left->atoms.len && xk_to_y(x, k) < right->atoms.len + && diff_atom_same(&left->atoms.head[x], &right->atoms.head[xk_to_y(x, k)])) + x++; + kd_forward[k] = x; + + if (DEBUG) { + int fi; + for (fi = d; fi >= k; fi--) { + debug("kd_forward[%d] = (%d, %d)\n", fi, kd_forward[fi], kd_forward[fi] - fi); + /* + if (kd_forward[fi] >= 0 && kd_forward[fi] < left->atoms.len) + debug_dump_atom(left, right, &left->atoms.head[kd_forward[fi]]); + else + debug("\n"); + if (kd_forward[fi]-fi >= 0 && kd_forward[fi]-fi < right->atoms.len) + debug_dump_atom(right, left, &right->atoms.head[kd_forward[fi]-fi]); + else + debug("\n"); + */ + } + } + + if (x < 0 || x > left->atoms.len + || xk_to_y(x, k) < 0 || xk_to_y(x, k) > right->atoms.len) + continue; + + /* Figured out a new forwards traversal, see if this has gone onto or even past a preceding backwards + * traversal. + * + * If the delta in length is odd, then d and backwards_d hit the same state indexes: + * | d= 0 1 2 1 0 + * ----+---------------- ---------------- + * k= | c= + * 4 | 3 + * | + * 3 | 2 + * | same + * 2 | 2,0====5,3 1 + * | / \ + * 1 | 1,0 5,4<-- 0 + * | / / + * 0 | -->0,0 3,3====4,4 -1 + * | \ / + * -1 | 0,1 -2 + * | \ + * -2 | 0,2 -3 + * | + * + * If the delta is even, they end up off-by-one, i.e. on different diagonals: + * + * | d= 0 1 2 1 0 + * ----+---------------- ---------------- + * | c= + * 3 | 3 + * | + * 2 | 2,0 off 2 + * | / \\ + * 1 | 1,0 4,3 1 + * | / // \ + * 0 | -->0,0 3,3 4,4<-- 0 + * | \ / / + * -1 | 0,1 3,4 -1 + * | \ // + * -2 | 0,2 -2 + * | + * + * So in the forward path, we can only match up diagonals when the delta is odd. + */ + /* Forwards is done first, so the backwards one was still at d - 1. Can't do this for d == 0. */ + int backwards_d = d - 1; + if ((delta & 1) && (backwards_d >= 0)) { + debug("backwards_d = %d\n", backwards_d); + + /* If both sides have the same length, forward and backward start on the same diagonal, meaning the + * backwards state index c == k. + * As soon as the lengths are not the same, the backwards traversal starts on a different diagonal, and + * c = k shifted by the difference in length. + */ + int c = k_to_c(k, delta); + + /* When the file sizes are very different, the traversal trees start on far distant diagonals. + * They don't necessarily meet straight on. See whether this forward value is on a diagonal that + * is also valid in kd_backward[], and match them if so. */ + if (c >= -backwards_d && c <= backwards_d) { + /* Current k is on a diagonal that exists in kd_backward[]. If the two x positions have + * met or passed (forward walked onto or past backward), then we've found a midpoint / a + * mid-box. + * + * But we need to avoid matching a situation like this: + * 0 1 + * x y + * 0 o-o-o + * x |\| | + * 1 o-o-o + * y | |\| + * 2 (B)o-o <--(B) backwards traversal reached here + * a | | | + * 3 o-o-o<-- prev_x, prev_y + * b | | | + * 4 o-o(F) <--(F) forwards traversal reached here + * x |\| | Now both are on the same diagonal and look like they passed, + * 5 o-o-o but actually they have sneaked past each other and have not met. + * y | |\| + * 6 o-o-o + * + * The solution is to notice that prev_x,prev_y were also already past (B). + */ + int backward_x = kd_backward[c]; + int backward_y = xc_to_y(backward_x, c, delta); + debug(" prev_x,y = (%d,%d) c%d:backward_x,y = (%d,%d) k%d:x,y = (%d,%d)\n", + prev_x, prev_y, c, backward_x, backward_y, k, x, xk_to_y(x, k)); + if (prev_x <= backward_x && prev_y <= backward_y + && x >= backward_x) { + *meeting_snake = (struct diff_box){ + .left_start = backward_x, + .left_end = x, + .right_start = xc_to_y(backward_x, c, delta), + .right_end = xk_to_y(x, k), + }; + debug("HIT x=(%u,%u) - y=(%u,%u)\n", + meeting_snake->left_start, + meeting_snake->right_start, + meeting_snake->left_end, + meeting_snake->right_end); + return; + } + } + } + } +} + +/* Do one backwards step in the "divide and conquer" graph traversal. + * left: the left side to diff. + * right: the right side to diff against. + * kd_forward: the traversal state for forwards traversal, to find a meeting point. + * Since forwards is done first, after this, both kd_forward and kd_backward will be valid for d. + * kd_forward points at the center of the state array, allowing negative indexes. + * kd_backward: the traversal state for backwards traversal, to find a meeting point. + * This is carried over between invocations with increasing d. + * kd_backward points at the center of the state array, allowing negative indexes. + * d: Step or distance counter, indicating for what value of d the kd_backward should be populated. + * Before the first invocation, kd_backward[0] shall point at the bottom right of the Myers graph + * (left.len, right.len). + * The first invocation will be for d == 1. + * meeting_snake: resulting meeting point, if any. + */ +static void diff_divide_myers_backward(struct diff_data *left, struct diff_data *right, + int *kd_forward, int *kd_backward, int d, + struct diff_box *meeting_snake) +{ + int delta = (int)right->atoms.len - (int)left->atoms.len; + int prev_x; + int prev_y; + int c; + int x; + + debug("-- %s d=%d\n", __func__, d); + debug_dump_myers_graph(left, right, NULL); + + for (c = d; c >= -d; c -= 2) { + if (c < -(int)left->atoms.len || c > (int)right->atoms.len) { + /* This diagonal is completely outside of the Myers graph, don't calculate it. */ + if (c < -(int)left->atoms.len) + debug(" %d c < -(int)left->atoms.len %d\n", c, -(int)left->atoms.len); + else + debug(" %d c > right->atoms.len %d\n", c, right->atoms.len); + if (c < 0) { + /* We are traversing negatively, and already below the entire graph, nothing will come + * of this. */ + debug(" break"); + break; + } + debug(" continue"); + continue; + } + debug("- c = %d\n", c); + if (d == 0) { + /* This is the initializing step. There is no prev_c yet, get the initial x from the bottom + * right of the Myers graph. */ + x = left->atoms.len; + } + /* Favoring "-" lines first means favoring moving rightwards in the Myers graph. + * For this, all c should derive from c - 1, only the bottom most c derive from c + 1: + * + * 2 1 0 + * --------------------------------------------------- + * c= + * 3 + * + * from prev_c = c - 1 --> 5,2 2 + * \ + * 5,3 1 + * \ + * 4,3 5,4<-- 0 + * \ / + * bottom most for d=1 from c + 1 --> 4,4 -1 + * / + * bottom most for d=2 --> 3,4 -2 + * + * Except when a c + 1 from a previous run already means a further advancement in the graph. + * If c == d, there is no c + 1 and c - 1 is the only option. + * If c < d, use c + 1 in case that yields a larger x. Also use c + 1 if c - 1 is outside the graph. + */ + else if (c > -d && (c == d + || (c - 1 >= -(int)right->atoms.len + && kd_backward[c - 1] <= kd_backward[c + 1]))) { + /* A top one. + * From position prev_c, step upwards in the Myers graph: y -= 1. + * Decrementing y is achieved by incrementing c while keeping the same x. + * (since we're deriving y from y = x - c + delta). + */ + int prev_c = c - 1; + prev_x = kd_backward[prev_c]; + prev_y = xc_to_y(prev_x, prev_c, delta); + x = prev_x; + } else { + /* The bottom most one. + * From position prev_c, step to the left in the Myers graph: x -= 1. + */ + int prev_c = c + 1; + prev_x = kd_backward[prev_c]; + prev_y = xc_to_y(prev_x, prev_c, delta); + x = prev_x - 1; + } + + /* Slide up any snake that we might find here. */ + debug("c=%d x-1=%d Yb-1=%d-1=%d\n", c, x-1, xc_to_y(x, c, delta), xc_to_y(x, c, delta)-1); + if (x > 0) { + debug(" l="); debug_dump_atom(left, right, &left->atoms.head[x-1]); + } + if (xc_to_y(x, c, delta) > 0) { + debug(" r="); debug_dump_atom(right, left, &right->atoms.head[xc_to_y(x, c, delta)-1]); + } + while (x > 0 && xc_to_y(x, c, delta) > 0 + && diff_atom_same(&left->atoms.head[x-1], &right->atoms.head[xc_to_y(x, c, delta)-1])) + x--; + kd_backward[c] = x; + + if (DEBUG) { + int fi; + for (fi = d; fi >= c; fi--) { + debug("kd_backward[%d] = (%d, %d)\n", fi, kd_backward[fi], + kd_backward[fi] - fi + delta); + /* + if (kd_backward[fi] >= 0 && kd_backward[fi] < left->atoms.len) + debug_dump_atom(left, right, &left->atoms.head[kd_backward[fi]]); + else + debug("\n"); + if (kd_backward[fi]-fi+delta >= 0 && kd_backward[fi]-fi+delta < right->atoms.len) + debug_dump_atom(right, left, &right->atoms.head[kd_backward[fi]-fi+delta]); + else + debug("\n"); + */ + } + } + + if (x < 0 || x > left->atoms.len + || xc_to_y(x, c, delta) < 0 || xc_to_y(x, c, delta) > right->atoms.len) + continue; + + /* Figured out a new backwards traversal, see if this has gone onto or even past a preceding forwards + * traversal. + * + * If the delta in length is even, then d and backwards_d hit the same state indexes -- note how this is + * different from in the forwards traversal, because now both d are the same: + * + * | d= 0 1 2 2 1 0 + * ----+---------------- -------------------- + * k= | c= + * 4 | + * | + * 3 | 3 + * | same + * 2 | 2,0====5,2 2 + * | / \ + * 1 | 1,0 5,3 1 + * | / / \ + * 0 | -->0,0 3,3====4,3 5,4<-- 0 + * | \ / / + * -1 | 0,1 4,4 -1 + * | \ + * -2 | 0,2 -2 + * | + * -3 + * If the delta is odd, they end up off-by-one, i.e. on different diagonals. + * So in the backward path, we can only match up diagonals when the delta is even. + */ + if ((delta & 1) == 0) { + /* Forwards was done first, now both d are the same. */ + int forwards_d = d; + + /* As soon as the lengths are not the same, the backwards traversal starts on a different diagonal, and + * c = k shifted by the difference in length. + */ + int k = c_to_k(c, delta); + + /* When the file sizes are very different, the traversal trees start on far distant diagonals. + * They don't necessarily meet straight on. See whether this backward value is also on a valid + * diagonal in kd_forward[], and match them if so. */ + if (k >= -forwards_d && k <= forwards_d) { + /* Current c is on a diagonal that exists in kd_forward[]. If the two x positions have + * met or passed (backward walked onto or past forward), then we've found a midpoint / a + * mid-box. */ + int forward_x = kd_forward[k]; + int forward_y = xk_to_y(forward_x, k); + debug("Compare %d to %d k=%d (x=%d,y=%d) to (x=%d,y=%d)\n", + forward_x, x, k, + forward_x, xk_to_y(forward_x, k), x, xc_to_y(x, c, delta)); + if (forward_x <= prev_x && forward_y <= prev_y + && forward_x >= x) { + *meeting_snake = (struct diff_box){ + .left_start = x, + .left_end = forward_x, + .right_start = xc_to_y(x, c, delta), + .right_end = xk_to_y(forward_x, k), + }; + debug("HIT x=%u,%u - y=%u,%u\n", + meeting_snake->left_start, + meeting_snake->right_start, + meeting_snake->left_end, + meeting_snake->right_end); + return; + } + } + } + } +} + +/* Myers "Divide et Impera": tracing forwards from the start and backwards from the end to find a midpoint that divides + * the problem into smaller chunks. Requires only linear amounts of memory. */ +enum diff_rc diff_algo_myers_divide(const struct diff_algo_config *algo_config, struct diff_state *state) +{ + enum diff_rc rc = DIFF_RC_ENOMEM; + struct diff_data *left = &state->left; + struct diff_data *right = &state->right; + + debug("\n** %s\n", __func__); + debug("left:\n"); + debug_dump(left); + debug("right:\n"); + debug_dump(right); + debug_dump_myers_graph(left, right, NULL); + + /* Allocate two columns of a Myers graph, one for the forward and one for the backward traversal. */ + unsigned int max = left->atoms.len + right->atoms.len; + size_t kd_len = max + 1; + size_t kd_buf_size = kd_len << 1; + int *kd_buf = reallocarray(NULL, kd_buf_size, sizeof(int)); + if (!kd_buf) + return DIFF_RC_ENOMEM; + int i; + for (i = 0; i < kd_buf_size; i++) + kd_buf[i] = -1; + int *kd_forward = kd_buf; + int *kd_backward = kd_buf + kd_len; + + /* The 'k' axis in Myers spans positive and negative indexes, so point the kd to the middle. + * It is then possible to index from -max/2 .. max/2. */ + kd_forward += max/2; + kd_backward += max/2; + + int d; + struct diff_box mid_snake = {}; + for (d = 0; d <= (max/2); d++) { + debug("-- d=%d\n", d); + diff_divide_myers_forward(left, right, kd_forward, kd_backward, d, &mid_snake); + if (!diff_box_empty(&mid_snake)) + break; + diff_divide_myers_backward(left, right, kd_forward, kd_backward, d, &mid_snake); + if (!diff_box_empty(&mid_snake)) + break; + } + + if (diff_box_empty(&mid_snake)) { + /* Divide and conquer failed to find a meeting point. Use the fallback_algo defined in the algo_config + * (leave this to the caller). This is just paranoia/sanity, we normally should always find a midpoint. + */ + debug(" no midpoint \n"); + rc = DIFF_RC_USE_DIFF_ALGO_FALLBACK; + goto return_rc; + } else { + debug(" mid snake L: %u to %u of %u R: %u to %u of %u\n", + mid_snake.left_start, mid_snake.left_end, left->atoms.len, + mid_snake.right_start, mid_snake.right_end, right->atoms.len); + + /* Section before the mid-snake. */ + debug("Section before the mid-snake\n"); + + struct diff_atom *left_atom = &left->atoms.head[0]; + unsigned int left_section_len = mid_snake.left_start; + struct diff_atom *right_atom = &right->atoms.head[0]; + unsigned int right_section_len = mid_snake.right_start; + + if (left_section_len && right_section_len) { + /* Record an unsolved chunk, the caller will apply inner_algo() on this chunk. */ + if (!diff_state_add_chunk(state, false, + left_atom, left_section_len, + right_atom, right_section_len)) + goto return_rc; + } else if (left_section_len && !right_section_len) { + /* Only left atoms and none on the right, they form a "minus" chunk, then. */ + if (!diff_state_add_chunk(state, true, + left_atom, left_section_len, + right_atom, 0)) + goto return_rc; + } else if (!left_section_len && right_section_len) { + /* No left atoms, only atoms on the right, they form a "plus" chunk, then. */ + if (!diff_state_add_chunk(state, true, + left_atom, 0, + right_atom, right_section_len)) + goto return_rc; + } + /* else: left_section_len == 0 and right_section_len == 0, i.e. nothing before the mid-snake. */ + + /* the mid-snake, identical data on both sides: */ + debug("the mid-snake\n"); + if (!diff_state_add_chunk(state, true, + &left->atoms.head[mid_snake.left_start], + mid_snake.left_end - mid_snake.left_start, + &right->atoms.head[mid_snake.right_start], + mid_snake.right_end - mid_snake.right_start)) + goto return_rc; + + /* Section after the mid-snake. */ + debug("Section after the mid-snake\n"); + debug(" left_end %u right_end %u\n", mid_snake.left_end, mid_snake.right_end); + debug(" left_count %u right_count %u\n", left->atoms.len, right->atoms.len); + left_atom = &left->atoms.head[mid_snake.left_end]; + left_section_len = left->atoms.len - mid_snake.left_end; + right_atom = &right->atoms.head[mid_snake.right_end]; + right_section_len = right->atoms.len - mid_snake.right_end; + + if (left_section_len && right_section_len) { + /* Record an unsolved chunk, the caller will apply inner_algo() on this chunk. */ + if (!diff_state_add_chunk(state, false, + left_atom, left_section_len, + right_atom, right_section_len)) + goto return_rc; + } else if (left_section_len && !right_section_len) { + /* Only left atoms and none on the right, they form a "minus" chunk, then. */ + if (!diff_state_add_chunk(state, true, + left_atom, left_section_len, + right_atom, 0)) + goto return_rc; + } else if (!left_section_len && right_section_len) { + /* No left atoms, only atoms on the right, they form a "plus" chunk, then. */ + if (!diff_state_add_chunk(state, true, + left_atom, 0, + right_atom, right_section_len)) + goto return_rc; + } + /* else: left_section_len == 0 and right_section_len == 0, i.e. nothing after the mid-snake. */ + } + + rc = DIFF_RC_OK; + +return_rc: + free(kd_buf); + debug("** END %s\n", __func__); + return rc; +} + +/* Myers Diff tracing from the start all the way through to the end, requiring quadratic amounts of memory. This can + * fail if the required space surpasses algo_config->permitted_state_size. */ +enum diff_rc diff_algo_myers(const struct diff_algo_config *algo_config, struct diff_state *state) +{ + /* do a diff_divide_myers_forward() without a _backward(), so that it walks forward across the entire + * files to reach the end. Keep each run's state, and do a final backtrace. */ + enum diff_rc rc = DIFF_RC_ENOMEM; + struct diff_data *left = &state->left; + struct diff_data *right = &state->right; + + debug("\n** %s\n", __func__); + debug("left:\n"); + debug_dump(left); + debug("right:\n"); + debug_dump(right); + debug_dump_myers_graph(left, right, NULL); + + /* Allocate two columns of a Myers graph, one for the forward and one for the backward traversal. */ + unsigned int max = left->atoms.len + right->atoms.len; + size_t kd_len = max + 1 + max; + size_t kd_buf_size = kd_len * kd_len; + debug("state size: %zu\n", kd_buf_size); + if (kd_buf_size < kd_len /* overflow? */ + || kd_buf_size * sizeof(int) > algo_config->permitted_state_size) { + debug("state size %zu > permitted_state_size %zu, use fallback_algo\n", + kd_buf_size, algo_config->permitted_state_size); + return DIFF_RC_USE_DIFF_ALGO_FALLBACK; + } + + int *kd_buf = reallocarray(NULL, kd_buf_size, sizeof(int)); + if (!kd_buf) + return DIFF_RC_ENOMEM; + int i; + for (i = 0; i < kd_buf_size; i++) + kd_buf[i] = -1; + + /* The 'k' axis in Myers spans positive and negative indexes, so point the kd to the middle. + * It is then possible to index from -max .. max. */ + int *kd_origin = kd_buf + max; + int *kd_column = kd_origin; + + int d; + int backtrack_d = -1; + int backtrack_k = 0; + int k; + int x, y; + for (d = 0; d <= max; d++, kd_column += kd_len) { + debug("-- d=%d\n", d); + + debug("-- %s d=%d\n", __func__, d); + + for (k = d; k >= -d; k -= 2) { + if (k < -(int)right->atoms.len || k > (int)left->atoms.len) { + /* This diagonal is completely outside of the Myers graph, don't calculate it. */ + if (k < -(int)right->atoms.len) + debug(" %d k < -(int)right->atoms.len %d\n", k, -(int)right->atoms.len); + else + debug(" %d k > left->atoms.len %d\n", k, left->atoms.len); + if (k < 0) { + /* We are traversing negatively, and already below the entire graph, nothing will come + * of this. */ + debug(" break"); + break; + } + debug(" continue"); + continue; + } + + debug("- k = %d\n", k); + if (d == 0) { + /* This is the initializing step. There is no prev_k yet, get the initial x from the top left of + * the Myers graph. */ + x = 0; + } else { + int *kd_prev_column = kd_column - kd_len; + + /* Favoring "-" lines first means favoring moving rightwards in the Myers graph. + * For this, all k should derive from k - 1, only the bottom most k derive from k + 1: + * + * | d= 0 1 2 + * ----+---------------- + * k= | + * 2 | 2,0 <-- from prev_k = 2 - 1 = 1 + * | / + * 1 | 1,0 + * | / + * 0 | -->0,0 3,3 + * | \\ / + * -1 | 0,1 <-- bottom most for d=1 from prev_k = -1 + 1 = 0 + * | \\ + * -2 | 0,2 <-- bottom most for d=2 from prev_k = -2 + 1 = -1 + * + * Except when a k + 1 from a previous run already means a further advancement in the graph. + * If k == d, there is no k + 1 and k - 1 is the only option. + * If k < d, use k + 1 in case that yields a larger x. Also use k + 1 if k - 1 is outside the graph. + */ + if (k > -d && (k == d + || (k - 1 >= -(int)right->atoms.len + && kd_prev_column[k - 1] >= kd_prev_column[k + 1]))) { + /* Advance from k - 1. + * From position prev_k, step to the right in the Myers graph: x += 1. + */ + int prev_k = k - 1; + int prev_x = kd_prev_column[prev_k]; + x = prev_x + 1; + } else { + /* The bottom most one. + * From position prev_k, step to the bottom in the Myers graph: y += 1. + * Incrementing y is achieved by decrementing k while keeping the same x. + * (since we're deriving y from y = x - k). + */ + int prev_k = k + 1; + int prev_x = kd_prev_column[prev_k]; + x = prev_x; + } + } + + /* Slide down any snake that we might find here. */ + while (x < left->atoms.len && xk_to_y(x, k) < right->atoms.len + && diff_atom_same(&left->atoms.head[x], &right->atoms.head[xk_to_y(x, k)])) + x++; + kd_column[k] = x; + + if (DEBUG) { + int fi; + for (fi = d; fi >= k; fi-=2) { + debug("kd_column[%d] = (%d, %d)\n", fi, kd_column[fi], kd_column[fi] - fi); +#if 0 + if (kd_column[fi] >= 0 && kd_column[fi] < left->atoms.len) + debug_dump_atom(left, right, &left->atoms.head[kd_column[fi]]); + else + debug("\n"); + if (kd_column[fi]-fi >= 0 && kd_column[fi]-fi < right->atoms.len) + debug_dump_atom(right, left, &right->atoms.head[kd_column[fi]-fi]); + else + debug("\n"); +#endif + } + } + + if (x == left->atoms.len && xk_to_y(x, k) == right->atoms.len) { + /* Found a path */ + backtrack_d = d; + backtrack_k = k; + debug("Reached the end at d = %d, k = %d\n", + backtrack_d, backtrack_k); + break; + } + } + + if (backtrack_d >= 0) + break; + } + + debug_dump_myers_graph(left, right, kd_origin); + + /* backtrack. A matrix spanning from start to end of the file is ready: + * + * | d= 0 1 2 3 4 + * ----+--------------------------------- + * k= | + * 3 | + * | + * 2 | 2,0 + * | / + * 1 | 1,0 4,3 + * | / / \ + * 0 | -->0,0 3,3 4,4 --> backtrack_d = 4, backtrack_k = 0 + * | \ / \ + * -1 | 0,1 3,4 + * | \ + * -2 | 0,2 + * | + * + * From (4,4) backwards, find the previous position that is the largest, and remember it. + * + */ + for (d = backtrack_d, k = backtrack_k; d >= 0; d--) { + x = kd_column[k]; + y = xk_to_y(x, k); + + /* When the best position is identified, remember it for that kd_column. + * That kd_column is no longer needed otherwise, so just re-purpose kd_column[0] = x and kd_column[1] = y, + * so that there is no need to allocate more memory. + */ + kd_column[0] = x; + kd_column[1] = y; + debug("Backtrack d=%d: xy=(%d, %d)\n", + d, kd_column[0], kd_column[1]); + + /* Don't access memory before kd_buf */ + if (d == 0) + break; + int *kd_prev_column = kd_column - kd_len; + + /* When y == 0, backtracking downwards (k-1) is the only way. + * When x == 0, backtracking upwards (k+1) is the only way. + * + * | d= 0 1 2 3 4 + * ----+--------------------------------- + * k= | + * 3 | + * | ..y == 0 + * 2 | 2,0 + * | / + * 1 | 1,0 4,3 + * | / / \ + * 0 | -->0,0 3,3 4,4 --> backtrack_d = 4, backtrack_k = 0 + * | \ / \ + * -1 | 0,1 3,4 + * | \ + * -2 | 0,2__ + * | x == 0 + */ + debug("prev[k-1] = %d,%d prev[k+1] = %d,%d\n", + kd_prev_column[k-1], xk_to_y(kd_prev_column[k-1],k-1), + kd_prev_column[k+1], xk_to_y(kd_prev_column[k+1],k+1)); + if (y == 0 + || (x > 0 && kd_prev_column[k - 1] >= kd_prev_column[k + 1])) { + k = k - 1; + debug("prev k=k-1=%d x=%d y=%d\n", + k, kd_prev_column[k], xk_to_y(kd_prev_column[k], k)); + } else { + k = k + 1; + debug("prev k=k+1=%d x=%d y=%d\n", + k, kd_prev_column[k], xk_to_y(kd_prev_column[k], k)); + } + kd_column = kd_prev_column; + } + + /* Forwards again, this time recording the diff chunks. + * Definitely start from 0,0. kd_column[0] may actually point to the bottom of a snake starting at 0,0 */ + x = 0; + y = 0; + + kd_column = kd_origin; + for (d = 0; d <= backtrack_d; d++, kd_column += kd_len) { + int next_x = kd_column[0]; + int next_y = kd_column[1]; + debug("Forward track from xy(%d,%d) to xy(%d,%d)\n", + x, y, next_x, next_y); + + struct diff_atom *left_atom = &left->atoms.head[x]; + int left_section_len = next_x - x; + struct diff_atom *right_atom = &right->atoms.head[y]; + int right_section_len = next_y - y; + + rc = DIFF_RC_ENOMEM; + if (left_section_len && right_section_len) { + /* This must be a snake slide. + * Snake slides have a straight line leading into them (except when starting at (0,0)). Find + * out whether the lead-in is horizontal or vertical: + * + * left + * ----------> + * | + * r| o-o o + * i| \ | + * g| o o + * h| \ \ + * t| o o + * v + * + * If left_section_len > right_section_len, the lead-in is horizontal, meaning first + * remove one atom from the left before sliding down the snake. + * If right_section_len > left_section_len, the lead-in is vetical, so add one atom from + * the right before sliding down the snake. */ + if (left_section_len == right_section_len + 1) { + if (!diff_state_add_chunk(state, true, + left_atom, 1, + right_atom, 0)) + goto return_rc; + left_atom++; + left_section_len--; + } else if (right_section_len == left_section_len + 1) { + if (!diff_state_add_chunk(state, true, + left_atom, 0, + right_atom, 1)) + goto return_rc; + right_atom++; + right_section_len--; + } else if (left_section_len != right_section_len) { + /* The numbers are making no sense. Should never happen. */ + rc = DIFF_RC_USE_DIFF_ALGO_FALLBACK; + goto return_rc; + } + + if (!diff_state_add_chunk(state, true, + left_atom, left_section_len, + right_atom, right_section_len)) + goto return_rc; + } else if (left_section_len && !right_section_len) { + /* Only left atoms and none on the right, they form a "minus" chunk, then. */ + if (!diff_state_add_chunk(state, true, + left_atom, left_section_len, + right_atom, 0)) + goto return_rc; + } else if (!left_section_len && right_section_len) { + /* No left atoms, only atoms on the right, they form a "plus" chunk, then. */ + if (!diff_state_add_chunk(state, true, + left_atom, 0, + right_atom, right_section_len)) + goto return_rc; + } + + x = next_x; + y = next_y; + } + + rc = DIFF_RC_OK; + +return_rc: + free(kd_buf); + debug("** END %s rc=%d\n", __func__, rc); + return rc; +} blob - /dev/null blob + 20d1f9a79ec84281efede279f51bd2292a99466a (mode 644) --- /dev/null +++ lib/diff_output.c @@ -0,0 +1,52 @@ +/* Common parts for printing diff output */ +/* + * Copyright (c) 2020 Neels Hofmeyr + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include + +void diff_output_lines(FILE *dest, const char *prefix, struct diff_atom *start_atom, unsigned int count) +{ + struct diff_atom *atom; + foreach_diff_atom(atom, start_atom, count) { + fprintf(dest, "%s", prefix); + int i; + unsigned int len = atom->len; + if (len && atom->at[len - 1] == '\n') { + len--; + if (len && atom->at[len - 1] == '\r') + len--; + } + + for (i = 0; i < len; i++) { + char c = atom->at[i]; + if (c < 0x20 || c >= 0x7f) + fprintf(dest, "\\x%02x", (unsigned char)c); + else + fprintf(dest, "%c", c); + } + fprintf(dest, "\n"); + } +} + +enum diff_rc diff_output_info(FILE *dest, const struct diff_input_info *info) +{ + if (info->arbitrary_info && *info->arbitrary_info) + fprintf(dest, "%s", info->arbitrary_info); + fprintf(dest, "--- %s\n+++ %s\n", + info->left_path ? : "a", + info->right_path ? : "b"); + return DIFF_RC_OK; +} blob - /dev/null blob + 796c3ee69f83bf338c67c8b310b27c2dea7501f1 (mode 644) --- /dev/null +++ lib/diff_output_plain.c @@ -0,0 +1,54 @@ +/* Output all lines of a diff_result. */ +/* + * Copyright (c) 2020 Neels Hofmeyr + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include + +enum diff_rc diff_output_plain(FILE *dest, const struct diff_input_info *info, + const struct diff_result *result) +{ + if (!result) + return DIFF_RC_EINVAL; + if (result->rc != DIFF_RC_OK) + return result->rc; + + diff_output_info(dest, info); + + int i; + for (i = 0; i < result->chunks.len; i++) { + struct diff_chunk *c = &result->chunks.head[i]; + if (c->left_count && c->right_count) + diff_output_lines(dest, c->solved ? " " : "?", c->left_start, c->left_count); + else if (c->left_count && !c->right_count) + diff_output_lines(dest, c->solved ? "-" : "?", c->left_start, c->left_count); + else if (c->right_count && !c->left_count) + diff_output_lines(dest, c->solved ? "+" : "?", c->right_start, c->right_count); + } + return DIFF_RC_OK; +} + +enum diff_rc diff_plain(FILE *dest, const struct diff_config *diff_config, + const struct diff_input_info *info, + const char *left, int left_len, const char *right, int right_len) +{ + enum diff_rc rc; + left_len = left_len < 0 ? strlen(left) : left_len; + right_len = right_len < 0 ? strlen(right) : right_len; + struct diff_result *result = diff_main(diff_config, left, left_len, right, right_len); + rc = diff_output_plain(dest, info, result); + diff_result_free(result); + return rc; +} blob - /dev/null blob + d12e4c0b5d67538e6f0dbd89b84c22447ec44d25 (mode 644) --- /dev/null +++ lib/diff_output_unidiff.c @@ -0,0 +1,233 @@ +/* Produce a unidiff output from a diff_result. */ +/* + * Copyright (c) 2020 Neels Hofmeyr + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include + +#include "debug.h" + +enum chunk_type { + CHUNK_EMPTY, + CHUNK_PLUS, + CHUNK_MINUS, + CHUNK_SAME, + CHUNK_WEIRD, +}; + +static inline enum chunk_type chunk_type(const struct diff_chunk *chunk) +{ + if (!chunk->left_count && !chunk->right_count) + return CHUNK_EMPTY; + if (!chunk->solved) + return CHUNK_WEIRD; + if (!chunk->right_count) + return CHUNK_MINUS; + if (!chunk->left_count) + return CHUNK_PLUS; + if (chunk->left_count != chunk->right_count) + return CHUNK_WEIRD; + return CHUNK_SAME; +} + +#define MAX(A,B) ((A)>(B)?(A):(B)) +#define MIN(A,B) ((A)<(B)?(A):(B)) + +struct range { + int start; + int end; +}; + +static bool range_empty(const struct range *r) +{ + return r->start == r->end; +} + +static bool ranges_touch(const struct range *a, const struct range *b) +{ + return (a->end >= b->start) && (a->start <= b->end); +} + +static void ranges_merge(struct range *a, const struct range *b) +{ + *a = (struct range){ + .start = MIN(a->start, b->start), + .end = MAX(a->end, b->end), + }; +} + +struct chunk_context { + struct range chunk; + struct range left, right; +}; + +static bool chunk_context_empty(const struct chunk_context *cc) +{ + return range_empty(&cc->chunk); +} + +static void chunk_context_get(struct chunk_context *cc, const struct diff_result *r, int chunk_idx, + int context_lines) +{ + const struct diff_chunk *c = &r->chunks.head[chunk_idx]; + int left_start = diff_atom_root_idx(&r->left, c->left_start); + int right_start = diff_atom_root_idx(&r->right, c->right_start); + + *cc = (struct chunk_context){ + .chunk = { + .start = chunk_idx, + .end = chunk_idx + 1, + }, + .left = { + .start = MAX(0, left_start - context_lines), + .end = MIN(r->left.atoms.len, left_start + c->left_count + context_lines), + }, + .right = { + .start = MAX(0, right_start - context_lines), + .end = MIN(r->right.atoms.len, right_start + c->right_count + context_lines), + }, + }; +} + +static bool chunk_contexts_touch(const struct chunk_context *cc, const struct chunk_context *other) +{ + return ranges_touch(&cc->chunk, &other->chunk) + || ranges_touch(&cc->left, &other->left) + || ranges_touch(&cc->right, &other->right); +} + +static void chunk_contexts_merge(struct chunk_context *cc, const struct chunk_context *other) +{ + ranges_merge(&cc->chunk, &other->chunk); + ranges_merge(&cc->left, &other->left); + ranges_merge(&cc->right, &other->right); +} + +static void diff_output_unidiff_chunk(FILE *dest, bool *info_printed, const struct diff_input_info *info, + const struct diff_result *result, const struct chunk_context *cc) +{ + if (range_empty(&cc->left) && range_empty(&cc->right)) + return; + + if (!(*info_printed)) { + diff_output_info(dest, info); + *info_printed = true; + } + + fprintf(dest, "@@ -%d,%d +%d,%d @@\n", + cc->left.start + 1, cc->left.end - cc->left.start, + cc->right.start + 1, cc->right.end - cc->right.start); + + /* Got the absolute line numbers where to start printing, and the index of the interesting (non-context) chunk. + * To print context lines above the interesting chunk, nipping on the previous chunk index may be necessary. + * It is guaranteed to be only context lines where left == right, so it suffices to look on the left. */ + const struct diff_chunk *first_chunk = &result->chunks.head[cc->chunk.start]; + int chunk_start_line = diff_atom_root_idx(&result->left, first_chunk->left_start); + if (cc->left.start < chunk_start_line) + diff_output_lines(dest, " ", &result->left.atoms.head[cc->left.start], + chunk_start_line - cc->left.start); + + /* Now write out all the joined chunks and contexts between them */ + int c_idx; + for (c_idx = cc->chunk.start; c_idx < cc->chunk.end; c_idx++) { + const struct diff_chunk *c = &result->chunks.head[c_idx]; + + if (c->left_count && c->right_count) + diff_output_lines(dest, c->solved ? " " : "?", c->left_start, c->left_count); + else if (c->left_count && !c->right_count) + diff_output_lines(dest, c->solved ? "-" : "?", c->left_start, c->left_count); + else if (c->right_count && !c->left_count) + diff_output_lines(dest, c->solved ? "+" : "?", c->right_start, c->right_count); + } + + /* Trailing context? */ + const struct diff_chunk *last_chunk = &result->chunks.head[cc->chunk.end - 1]; + int chunk_end_line = diff_atom_root_idx(&result->left, last_chunk->left_start + last_chunk->left_count); + if (cc->left.end > chunk_end_line) + diff_output_lines(dest, " ", &result->left.atoms.head[chunk_end_line], + cc->left.end - chunk_end_line); +} + +enum diff_rc diff_output_unidiff(FILE *dest, const struct diff_input_info *info, + const struct diff_result *result, unsigned int context_lines) +{ + if (!result) + return DIFF_RC_EINVAL; + if (result->rc != DIFF_RC_OK) + return result->rc; + + struct chunk_context cc = {}; + bool info_printed = false; + + int i; + for (i = 0; i < result->chunks.len; i++) { + struct diff_chunk *c = &result->chunks.head[i]; + enum chunk_type t = chunk_type(c); + + if (t == CHUNK_MINUS || t == CHUNK_PLUS) { + if (chunk_context_empty(&cc)) { + /* These are the first lines being printed. + * Note down the start point, any number of subsequent chunks may be joined up to this + * unidiff chunk by context lines or by being directly adjacent. */ + chunk_context_get(&cc, result, i, context_lines); + debug("new chunk to be printed: chunk %d-%d left %d-%d right %d-%d\n", + cc.chunk.start, cc.chunk.end, + cc.left.start, cc.left.end, cc.right.start, cc.right.end); + } else { + /* There already is a previous chunk noted down for being printed. + * Does it join up with this one? */ + struct chunk_context next; + chunk_context_get(&next, result, i, context_lines); + debug("new chunk to be printed: chunk %d-%d left %d-%d right %d-%d\n", + next.chunk.start, next.chunk.end, + next.left.start, next.left.end, next.right.start, next.right.end); + if (chunk_contexts_touch(&cc, &next)) { + /* This next context touches or overlaps the previous one, join. */ + chunk_contexts_merge(&cc, &next); + debug("new chunk to be printed touches previous chunk, now: left %d-%d right %d-%d\n", + cc.left.start, cc.left.end, cc.right.start, cc.right.end); + } else { + /* No touching, so the previous context is complete with a gap between it and + * this next one. Print the previous one and start fresh here. */ + debug("new chunk to be printed does not touch previous chunk; print left %d-%d right %d-%d\n", + cc.left.start, cc.left.end, cc.right.start, cc.right.end); + diff_output_unidiff_chunk(dest, &info_printed, info, result, &cc); + cc = next; + debug("new unprinted chunk is left %d-%d right %d-%d\n", + cc.left.start, cc.left.end, cc.right.start, cc.right.end); + } + } + } + + } + + if (!chunk_context_empty(&cc)) + diff_output_unidiff_chunk(dest, &info_printed, info, result, &cc); + return DIFF_RC_OK; +} + +enum diff_rc diff_unidiff(FILE *dest, const struct diff_config *diff_config, + const struct diff_input_info *info, + const char *left, int left_len, const char *right, int right_len, + unsigned int context_lines) +{ + enum diff_rc rc; + left_len = left_len < 0 ? strlen(left) : left_len; + right_len = right_len < 0 ? strlen(right) : right_len; + struct diff_result *result = diff_main(diff_config, left, left_len, right, right_len); + rc = diff_output_unidiff(dest, info, result, context_lines); + diff_result_free(result); + return rc; +} blob - /dev/null blob + 9fcf8196417c1df4fda8effb7cdbb57995974764 (mode 644) --- /dev/null +++ lib/diff_patience.c @@ -0,0 +1,331 @@ +/* Implementation of the Patience Diff algorithm invented by Bram Cohen: + * Divide a diff problem into smaller chunks by an LCS of common-unique lines. */ +/* + * Copyright (c) 2020 Neels Hofmeyr + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include + +#include "debug.h" + +/* Set unique_here = true for all atoms that exist exactly once in this list. */ +static void diff_atoms_mark_unique(struct diff_data *d, unsigned int *unique_count) +{ + struct diff_atom *i; + unsigned int count = 0; + diff_data_foreach_atom(i, d) { + i->patience.unique_here = true; + i->patience.unique_in_both = true; + count++; + } + diff_data_foreach_atom(i, d) { + struct diff_atom *j; + + if (!i->patience.unique_here) + continue; + + diff_data_foreach_atom_from(i + 1, j, d) { + if (diff_atom_same(i, j)) { + if (i->patience.unique_here) { + i->patience.unique_here = false; + i->patience.unique_in_both = false; + count--; + } + j->patience.unique_here = false; + j->patience.unique_in_both = false; + count--; + } + } + } + if (unique_count) + *unique_count = count; +} + +/* Mark those lines as atom->patience.unique_in_both = true that appear exactly once in each side. */ +static void diff_atoms_mark_unique_in_both(struct diff_data *left, struct diff_data *right, + unsigned int *unique_in_both_count) +{ + /* Derive the final unique_in_both count without needing an explicit iteration. So this is just some + * optimiziation to save one iteration in the end. */ + unsigned int unique_in_both; + + diff_atoms_mark_unique(left, &unique_in_both); + diff_atoms_mark_unique(right, NULL); + + debug("unique_in_both %u\n", unique_in_both); + + struct diff_atom *i; + diff_data_foreach_atom(i, left) { + if (!i->patience.unique_here) + continue; + struct diff_atom *j; + int found_in_b = 0; + diff_data_foreach_atom(j, right) { + if (!diff_atom_same(i, j)) + continue; + if (!j->patience.unique_here) { + found_in_b = 2; /* or more */ + break; + } else { + found_in_b = 1; + j->patience.pos_in_other = i; + i->patience.pos_in_other = j; + } + } + + if (found_in_b == 0 || found_in_b > 1) { + i->patience.unique_in_both = false; + unique_in_both--; + debug("unique_in_both %u (%d) ", unique_in_both, found_in_b); + debug_dump_atom(left, NULL, i); + } + } + + /* Still need to unmark right[*]->patience.unique_in_both for atoms that don't exist in left */ + diff_data_foreach_atom(i, right) { + if (!i->patience.unique_here + || !i->patience.unique_in_both) + continue; + struct diff_atom *j; + bool found_in_a = false; + diff_data_foreach_atom(j, left) { + if (!j->patience.unique_in_both) + continue; + if (!diff_atom_same(i, j)) + continue; + found_in_a = true; + break; + } + + if (!found_in_a) + i->patience.unique_in_both = false; + } + + if (unique_in_both_count) + *unique_in_both_count = unique_in_both; +} + + +/* Among the lines that appear exactly once in each side, find the longest streak that appear in both files in the same + * order (with other stuff allowed to interleave). Use patience sort for that, as in the Patience Diff algorithm. + * See https://bramcohen.livejournal.com/73318.html and, for a much more detailed explanation, + * https://blog.jcoglan.com/2017/09/19/the-patience-diff-algorithm/ */ +enum diff_rc diff_algo_patience(const struct diff_algo_config *algo_config, struct diff_state *state) +{ + enum diff_rc rc = DIFF_RC_ENOMEM; + + struct diff_data *left = &state->left; + struct diff_data *right = &state->right; + + unsigned int unique_in_both_count; + + debug("\n** %s\n", __func__); + + /* Find those lines that appear exactly once in 'left' and exactly once in 'right'. */ + diff_atoms_mark_unique_in_both(left, right, &unique_in_both_count); + + debug("unique_in_both_count %u\n", unique_in_both_count); + debug("left:\n"); + debug_dump(left); + debug("right:\n"); + debug_dump(right); + + if (!unique_in_both_count) { + /* Cannot apply Patience, tell the caller to use fallback_algo instead. */ + return DIFF_RC_USE_DIFF_ALGO_FALLBACK; + } + + /* An array of Longest Common Sequence is the result of the below subscope: */ + unsigned int lcs_count = 0; + struct diff_atom **lcs = NULL; + struct diff_atom *lcs_tail = NULL; + + { + /* This subscope marks the lifetime of the atom_pointers allocation */ + + /* One chunk of storage for atom pointers */ + struct diff_atom **atom_pointers = recallocarray(NULL, 0, unique_in_both_count * 2, sizeof(struct diff_atom*)); + + /* Half for the list of atoms that still need to be put on stacks */ + struct diff_atom **uniques = atom_pointers; + + /* Half for the patience sort state's "card stacks" -- we remember only each stack's topmost "card" */ + struct diff_atom **patience_stacks = atom_pointers + unique_in_both_count; + unsigned int patience_stacks_count = 0; + + /* Take all common, unique items from 'left' ... */ + + struct diff_atom *atom; + struct diff_atom **uniques_end = uniques; + diff_data_foreach_atom(atom, left) { + if (!atom->patience.unique_in_both) + continue; + *uniques_end = atom; + uniques_end++; + } + + /* ...and sort them to the order found in 'right'. + * The idea is to find the leftmost stack that has a higher line number and add it to the stack's top. + * If there is no such stack, open a new one on the right. The line number is derived from the atom*, + * which are array items and hence reflect the relative position in the source file. So we got the + * common-uniques from 'left' and sort them according to atom->patience.pos_in_other. */ + unsigned int i; + for (i = 0; i < unique_in_both_count; i++) { + atom = uniques[i]; + unsigned int target_stack; + + if (!patience_stacks_count) + target_stack = 0; + else { + /* binary search to find the stack to put this atom "card" on. */ + unsigned int lo = 0; + unsigned int hi = patience_stacks_count; + while (lo < hi) { + unsigned int mid = (lo + hi) >> 1; + if (patience_stacks[mid]->patience.pos_in_other < atom->patience.pos_in_other) + lo = mid + 1; + else + hi = mid; + } + + target_stack = lo; + } + + assert(target_stack <= patience_stacks_count); + patience_stacks[target_stack] = atom; + if (target_stack == patience_stacks_count) + patience_stacks_count++; + + /* Record a back reference to the next stack on the left, which will form the final longest sequence + * later. */ + atom->patience.prev_stack = target_stack ? patience_stacks[target_stack - 1] : NULL; + + } + + /* backtrace through prev_stack references to form the final longest common sequence */ + lcs_tail = patience_stacks[patience_stacks_count - 1]; + lcs_count = patience_stacks_count; + + /* uniques and patience_stacks are no longer needed. Backpointers are in atom->patience.prev_stack */ + free(atom_pointers); + } + + lcs = recallocarray(NULL, 0, lcs_count, sizeof(struct diff_atom*)); + struct diff_atom **lcs_backtrace_pos = &lcs[lcs_count - 1]; + struct diff_atom *atom; + for (atom = lcs_tail; atom; atom = atom->patience.prev_stack, lcs_backtrace_pos--) { + assert(lcs_backtrace_pos >= lcs); + *lcs_backtrace_pos = atom; + } + + unsigned int i; + if (DEBUG) { + debug("\npatience LCS:\n"); + for (i = 0; i < lcs_count; i++) { + debug_dump_atom(left, right, lcs[i]); + } + } + + + /* TODO: For each common-unique line found (now listed in lcs), swallow lines upwards and downwards that are + * identical on each side. Requires a way to represent atoms being glued to adjacent atoms. */ + + debug("\ntraverse LCS, possibly recursing:\n"); + + /* Now we have pinned positions in both files at which it makes sense to divide the diff problem into smaller + * chunks. Go into the next round: look at each section in turn, trying to again find common-unique lines in + * those smaller sections. As soon as no more are found, the remaining smaller sections are solved by Myers. */ + unsigned int left_pos = 0; + unsigned int right_pos = 0; + for (i = 0; i <= lcs_count; i++) { + struct diff_atom *atom; + unsigned int left_idx; + unsigned int right_idx; + + if (i < lcs_count) { + atom = lcs[i]; + left_idx = diff_atom_idx(left, atom); + right_idx = diff_atom_idx(right, atom->patience.pos_in_other); + } else { + atom = NULL; + left_idx = left->atoms.len; + right_idx = right->atoms.len; + } + + /* 'atom' now marks an atom that matches on both sides according to patience-diff + * (a common-unique identical atom in both files). + * Handle the section before and the atom itself; the section after will be handled by the next loop + * iteration -- note that i loops to last element + 1 ("i <= lcs_count"), so that there will be another + * final iteration to pick up the last remaining items after the last LCS atom. + * The sections before might also be empty on left and/or right. + * left_pos and right_pos mark the indexes of the first atoms that have not yet been handled in the + * previous loop iteration. + * left_idx and right_idx mark the indexes of the matching atom on left and right, respectively. */ + + debug("iteration %u left_pos %u left_idx %u right_pos %u right_idx %u\n", + i, left_pos, left_idx, right_pos, right_idx); + + struct diff_chunk *chunk; + + /* Section before the matching atom */ + struct diff_atom *left_atom = &left->atoms.head[left_pos]; + unsigned int left_section_len = left_idx - left_pos; + + struct diff_atom *right_atom = &(right->atoms.head[right_pos]); + unsigned int right_section_len = right_idx - right_pos; + + if (left_section_len && right_section_len) { + /* Record an unsolved chunk, the caller will apply inner_algo() on this chunk. */ + if (!diff_state_add_chunk(state, false, + left_atom, left_section_len, + right_atom, right_section_len)) + goto return_rc; + } else if (left_section_len && !right_section_len) { + /* Only left atoms and none on the right, they form a "minus" chunk, then. */ + if (!diff_state_add_chunk(state, true, + left_atom, left_section_len, + right_atom, 0)) + goto return_rc; + } else if (!left_section_len && right_section_len) { + /* No left atoms, only atoms on the right, they form a "plus" chunk, then. */ + if (!diff_state_add_chunk(state, true, + left_atom, 0, + right_atom, right_section_len)) + goto return_rc; + } + /* else: left_section_len == 0 and right_section_len == 0, i.e. nothing here. */ + + /* The atom found to match on both sides forms a chunk of equals on each side. In the very last + * iteration of this loop, there is no matching atom, we were just cleaning out the remaining lines. */ + if (atom) { + if (!diff_state_add_chunk(state, true, + atom, 1, + atom->patience.pos_in_other, 1)) + goto return_rc; + } + + left_pos = left_idx + 1; + right_pos = right_idx + 1; + } + debug("** END %s\n", __func__); + + rc = DIFF_RC_OK; + +return_rc: + free(lcs); + return rc; +} + blob - /dev/null blob + e8e14be5621f0f4536f2e0e42b7e9fb59d758cf4 (mode 644) --- /dev/null +++ lib/linux_Makefile @@ -0,0 +1,24 @@ +srcs = \ + diff_atomize_text.c \ + diff_main.c \ + diff_myers.c \ + diff_patience.c \ + diff_output.c \ + diff_output_plain.c \ + diff_output_unidiff.c \ + $(END) + +objs = $(srcs:.c=.o) + +libdiff.a: $(objs) + ar rcs $@ $^ + +CFLAGS = -fsanitize=address -fsanitize=undefined -g + +%.o: %.c ./*.h ../include/diff/*.h + gcc $(CFLAGS) -I../include -o $@ -c $< + +.PHONY: clean +clean: + -rm $(objs) + -rm libdiff.a blob - /dev/null blob + b4a9acb0cdc71e2d714c76d3f510873470cb8236 (mode 644) --- /dev/null +++ man/diff.1 @@ -0,0 +1,47 @@ +.\" $OpenBSD$ +.\" +.\" Copyright (c) 2018 Martin Pieuchot +.\" Copyright (c) 2020 Neels Hofmeyr +.\" +.\" Permission to use, copy, modify, and distribute this software for any +.\" purpose with or without fee is hereby granted, provided that the above +.\" copyright notice and this permission notice appear in all copies. +.\" +.\" THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +.\" WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +.\" MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +.\" ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +.\" WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +.\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +.\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +.\" +.Dd $Mdocdate: August 28 2017 $ +.Dt DIFF 1 +.Os +.Sh NAME +.Nm diff +.Nd compare files +.Sh SYNOPSIS +.Nm diff +.Ar file1 file2 +.Sh DESCRIPTION +The +.Nm +utility compares the contents of +.Ar file1 +and +.Ar file2 +line by line. +.Sh EXIT STATUS +The +.Nm +utility exits with one of the following values: +.Pp +.Bl -tag -width Ds -offset indent -compact +.It 0 +No differences were found. +.It 1 +Differences were found. +.It >1 +An error occurred. +.El blob - /dev/null blob + ad6ffbd9d12fc4f81e7e59ea3b4b7801c0b53887 (mode 644) --- /dev/null +++ test/Makefile @@ -0,0 +1,7 @@ +.PHONY: test verify clean +test: verify clean + +verify: + ./verify_all.sh +clean: + -rm verify.* blob - /dev/null blob + 60764f10614644471b9cae1fdf7787246ef1ed92 (mode 644) --- /dev/null +++ test/README @@ -0,0 +1,7 @@ +The test produces a diff, which is successful if it is able to reconstruct the +original source files from it. It is not tested whether diff output is optimal +or beautiful. + +Since different diff algorithms can produce different diff outputs, the +expect*.diff files are merely provided for reference and are not part of the +tests. blob - /dev/null blob + 48768c491c8dc456f94d9bd13e9a2f27590bbb0d (mode 644) --- /dev/null +++ test/expect001.diff @@ -0,0 +1,11 @@ +--- test001.left.txt ++++ test001.right.txt +-A +-B + C +-A + B ++A + B + A ++C blob - /dev/null blob + 5a409a01b7a7269424e11fd6a07a054e997dd3c5 (mode 644) --- /dev/null +++ test/expect002.diff @@ -0,0 +1,15 @@ +--- test002.left.txt ++++ test002.right.txt +-A +-B + C +-A + B ++A + B + A ++C + X +-Y + Z ++Q blob - /dev/null blob + ff1c8b5c8d3c587b15fe9d9ec83ba6ceb516436e (mode 644) --- /dev/null +++ test/expect003.diff @@ -0,0 +1,9 @@ +--- test003.left.txt ++++ test003.right.txt +-a ++x + b + c +-d +-e ++y blob - /dev/null blob + 5989a134be2ab91b93f757600becacc8065b8f94 (mode 644) --- /dev/null +++ test/expect004.diff @@ -0,0 +1,23 @@ +--- test004.left.txt ++++ test004.right.txt ++int Chunk_bounds_check(Chunk *chunk, size_t start, size_t n) ++{ ++ if (chunk == NULL) return 0; ++ ++ return start <= chunk->length && n <= chunk->length - start; ++} ++ + void Chunk_copy(Chunk *src, size_t src_start, Chunk *dst, size_t dst_start, size_t n) + { + if (!Chunk_bounds_check(src, src_start, n)) return; + if (!Chunk_bounds_check(dst, dst_start, n)) return; + + memcpy(dst->data + dst_start, src->data + src_start, n); + } +- +-int Chunk_bounds_check(Chunk *chunk, size_t start, size_t n) +-{ +- if (chunk == NULL) return 0; +- +- return start <= chunk->length && n <= chunk->length - start; +-} blob - /dev/null blob + 25e2e0bc35f05d8383e52fad77872be96de5dca0 (mode 644) --- /dev/null +++ test/expect005.diff @@ -0,0 +1,11 @@ +--- test005.left.txt ++++ test005.right.txt ++The Slits ++Gil Scott Heron + David Axelrod + Electric Prunes +-Gil Scott Heron +-The Slits + Faust + The Sonics + The Sonics blob - /dev/null blob + ea0a5d1c20b159e7dad0631318ef5690875da3ba (mode 644) --- /dev/null +++ test/expect006.diff @@ -0,0 +1,29 @@ +--- test006.left.txt ++++ test006.right.txt + Below is an example license to be used for new code in OpenBSD, + modeled after the ISC license. + + It is important to specify the year of the copyright. Additional years + should be separated by a comma, e.g. +- Copyright (c) 2003, 2004 ++ Copyright (c) 2003, 2004, 2005 + + If you add extra text to the body of the license, be careful not to + add further restrictions. + + /* + * Copyright (c) CCYY YOUR NAME HERE + * +- * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ ++An extra line blob - /dev/null blob + 996246e7974d7b25bc76ee14c63114a241be4704 (mode 644) --- /dev/null +++ test/expect007.diff @@ -0,0 +1,4 @@ +--- test007.left.txt ++++ test007.right.txt +-x ++abcdx blob - /dev/null blob + c4f9c82c25819151690ffa3bcd44eea1b6e43159 (mode 644) --- /dev/null +++ test/expect008.diff @@ -0,0 +1,8 @@ +--- test008.left.txt ++++ test008.right.txt + x ++a ++b ++c ++d ++x blob - /dev/null blob + 8f75e2ee8130368c1ef12eb5b35b99b786f49ec1 (mode 644) --- /dev/null +++ test/expect009.diff @@ -0,0 +1,12 @@ +--- test009.left.txt ++++ test009.right.txt + x + a + b ++c ++d ++e ++f ++x ++a ++b blob - /dev/null blob + fd113b0f71509c767b9a4e7d4cc2f70f9d41c459 (mode 644) --- /dev/null +++ test/test001.left.txt @@ -0,0 +1,7 @@ +A +B +C +A +B +B +A blob - /dev/null blob + 0075e6d23de0d19b7f870a3e82b9b310001b3db9 (mode 644) --- /dev/null +++ test/test001.right.txt @@ -0,0 +1,6 @@ +C +B +A +B +A +C blob - /dev/null blob + de77f56f2d47f324d97bd0e8968589acd31a54d6 (mode 644) --- /dev/null +++ test/test002.left.txt @@ -0,0 +1,10 @@ +A +B +C +A +B +B +A +X +Y +Z blob - /dev/null blob + a9b4b8b851ddd7eba0955da5c7cd1bf316b7da83 (mode 644) --- /dev/null +++ test/test002.right.txt @@ -0,0 +1,9 @@ +C +B +A +B +A +C +X +Z +Q blob - /dev/null blob + 940532533944dd159bfd11136fac2ee35872de38 (mode 644) --- /dev/null +++ test/test003.left.txt @@ -0,0 +1,5 @@ +a +b +c +d +e blob - /dev/null blob + d9f266e145742427cb8635edb7343b2d7cd33b92 (mode 644) --- /dev/null +++ test/test003.right.txt @@ -0,0 +1,4 @@ +x +b +c +y blob - /dev/null blob + 72d95cf6331a2a7f35baa19863e6dd0250112e23 (mode 644) --- /dev/null +++ test/test004.left.txt @@ -0,0 +1,14 @@ +void Chunk_copy(Chunk *src, size_t src_start, Chunk *dst, size_t dst_start, size_t n) +{ + if (!Chunk_bounds_check(src, src_start, n)) return; + if (!Chunk_bounds_check(dst, dst_start, n)) return; + + memcpy(dst->data + dst_start, src->data + src_start, n); +} + +int Chunk_bounds_check(Chunk *chunk, size_t start, size_t n) +{ + if (chunk == NULL) return 0; + + return start <= chunk->length && n <= chunk->length - start; +} blob - /dev/null blob + cfe8f2ed6be03213c89877245b9ce3da3a4802d8 (mode 644) --- /dev/null +++ test/test004.right.txt @@ -0,0 +1,14 @@ +int Chunk_bounds_check(Chunk *chunk, size_t start, size_t n) +{ + if (chunk == NULL) return 0; + + return start <= chunk->length && n <= chunk->length - start; +} + +void Chunk_copy(Chunk *src, size_t src_start, Chunk *dst, size_t dst_start, size_t n) +{ + if (!Chunk_bounds_check(src, src_start, n)) return; + if (!Chunk_bounds_check(dst, dst_start, n)) return; + + memcpy(dst->data + dst_start, src->data + src_start, n); +} blob - /dev/null blob + b77fc41c4c871564f41f74ebc9f4a07710681b3e (mode 644) --- /dev/null +++ test/test005.left.txt @@ -0,0 +1,7 @@ +David Axelrod +Electric Prunes +Gil Scott Heron +The Slits +Faust +The Sonics +The Sonics blob - /dev/null blob + 2480cecfba48d082c6c9c52ba386499542d3e2aa (mode 644) --- /dev/null +++ test/test005.right.txt @@ -0,0 +1,7 @@ +The Slits +Gil Scott Heron +David Axelrod +Electric Prunes +Faust +The Sonics +The Sonics blob - /dev/null blob + e01fce04d5ef4776a326a510558cfe85af41a22a (mode 644) --- /dev/null +++ test/test006.left.txt @@ -0,0 +1,25 @@ +Below is an example license to be used for new code in OpenBSD, +modeled after the ISC license. + +It is important to specify the year of the copyright. Additional years +should be separated by a comma, e.g. + Copyright (c) 2003, 2004 + +If you add extra text to the body of the license, be careful not to +add further restrictions. + +/* + * Copyright (c) CCYY YOUR NAME HERE + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ blob - /dev/null blob + 40a0f253dd2e3503d295711fcfe09427f52219b5 (mode 644) --- /dev/null +++ test/test006.right.txt @@ -0,0 +1,25 @@ +Below is an example license to be used for new code in OpenBSD, +modeled after the ISC license. + +It is important to specify the year of the copyright. Additional years +should be separated by a comma, e.g. + Copyright (c) 2003, 2004, 2005 + +If you add extra text to the body of the license, be careful not to +add further restrictions. + +/* + * Copyright (c) CCYY YOUR NAME HERE + * + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +An extra line blob - /dev/null blob + 587be6b4c3f93f93c489c0111bba5596147a26cb (mode 644) --- /dev/null +++ test/test007.left.txt @@ -0,0 +1 @@ +x blob - /dev/null blob + 47b0ccb04734aa02fca7a524d6fbc171756a6a5b (mode 644) --- /dev/null +++ test/test007.right.txt @@ -0,0 +1 @@ +abcdx blob - /dev/null blob + 587be6b4c3f93f93c489c0111bba5596147a26cb (mode 644) --- /dev/null +++ test/test008.left.txt @@ -0,0 +1 @@ +x blob - /dev/null blob + 70b67eef4500e1d108c59c18653e56159e5c60ad (mode 644) --- /dev/null +++ test/test008.right.txt @@ -0,0 +1,6 @@ +x +a +b +c +d +x blob - /dev/null blob + e4e5238f8e5d3a5deb0169f1795d18a9a1eebe68 (mode 644) --- /dev/null +++ test/test009.left.txt @@ -0,0 +1,3 @@ +x +a +b blob - /dev/null blob + 3a4386800dba971814434a508e6de9867ebc7abc (mode 644) --- /dev/null +++ test/test009.right.txt @@ -0,0 +1,10 @@ +x +a +b +c +d +e +f +x +a +b blob - /dev/null blob + 4a0ec1a64add5fccad7d01d0856e38be06629717 (mode 755) --- /dev/null +++ test/verify_all.sh @@ -0,0 +1,56 @@ +#!/bin/sh + +diff_prog="../diff/diff" + +diff_type=plain + +verify_diff_script() { + orig_left="$1" + orig_right="$2" + the_diff="$3" + + verify_left="verify.$orig_left" + verify_right="verify.$orig_right" + + if [ "x$diff_type" = "xunidiff" ]; then + cp "$orig_left" "$verify_right" + patch --quiet -u "$verify_right" "$the_diff" + if ! cmp "$orig_right" "$verify_right" ; then + echo "FAIL: $orig_right != $verify_right" + return 1 + fi + + cp "$orig_right" "$verify_left" + patch --quiet -u -R "$verify_left" "$the_diff" + if ! cmp "$orig_left" "$verify_left" ; then + echo "FAIL: $orig_left != $verify_left" + return 1 + fi + else + tail -n +3 "$the_diff" | grep -v "^+" | sed 's/^.//' > "$verify_left" + tail -n +3 "$the_diff" | grep -v "^-" | sed 's/^.//' > "$verify_right" + + if ! cmp "$orig_left" "$verify_left" ; then + echo "FAIL: $orig_left != $verify_left" + return 1 + fi + if ! cmp "$orig_right" "$verify_right" ; then + echo "FAIL: $orig_right != $verify_right" + return 1 + fi + fi + echo "OK: $diff_prog $orig_left $orig_right" + return 0 +} + +for left in test*.left.* ; do + right="$(echo "$left" | sed 's/\.left\./.right./')" + expected_diff="$(echo "$left" | sed 's/test\([0-9]*\)\..*/expect\1.diff/')" + got_diff="verify.$expected_diff" + + "$diff_prog" "$left" "$right" > "$got_diff" + + set -e + verify_diff_script "$left" "$right" "$got_diff" + set +e +done