/* Generic infrastructure to implement various diff algorithms (implementation). */ /* * Copyright (c) 2020 Neels Hofmeyr * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include "diff_internal.h" #include "diff_debug.h" static int read_at(FILE *f, off_t at_pos, unsigned char *buf, size_t len) { int r; if (fseeko(f, at_pos, SEEK_SET) == -1) return errno; r = fread(buf, sizeof(char), len, f); if ((r == 0 || r < len) && ferror(f)) return errno; if (r != len) return EIO; return 0; } static int buf_cmp(const unsigned char *left, size_t left_len, const unsigned char *right, size_t right_len, bool ignore_whitespace) { int cmp; if (ignore_whitespace) { int il = 0, ir = 0; while (il < left_len && ir < right_len) { unsigned char cl = left[il]; unsigned char cr = right[ir]; if (isspace(cl) && il < left_len) { il++; continue; } if (isspace(cr) && ir < right_len) { ir++; continue; } if (cl > cr) return 1; if (cr > cl) return -1; il++; ir++; } while (il < left_len) { unsigned char cl = left[il++]; if (!isspace(cl)) return 1; } while (ir < right_len) { unsigned char cr = right[ir++]; if (!isspace(cr)) return -1; } return 0; } cmp = memcmp(left, right, MIN(left_len, right_len)); if (cmp) return cmp; if (left_len == right_len) return 0; return (left_len > right_len) ? 1 : -1; } int diff_atom_cmp(int *cmp, const struct diff_atom *left, const struct diff_atom *right) { off_t remain_left, remain_right; int flags = (left->d->root->diff_flags | right->d->root->diff_flags); bool ignore_whitespace = (flags & DIFF_FLAG_IGNORE_WHITESPACE); if (!ignore_whitespace) { if (!left->len && !right->len) { *cmp = 0; return 0; } if (!right->len) { *cmp = 1; return 0; } if (!left->len) { *cmp = -1; return 0; } } if (left->at != NULL && right->at != NULL) { *cmp = buf_cmp(left->at, left->len, right->at, right->len, ignore_whitespace); return 0; } remain_left = left->len; remain_right = right->len; while (remain_left > 0 || remain_right > 0) { const size_t chunksz = 8192; unsigned char buf_left[chunksz], buf_right[chunksz]; const uint8_t *p_left, *p_right; off_t n_left, n_right; ssize_t r; if (!remain_right) { *cmp = 1; return 0; } if (!remain_left) { *cmp = -1; return 0; } n_left = MIN(chunksz, remain_left); n_right = MIN(chunksz, remain_right); if (left->at == NULL) { r = read_at(left->d->root->f, left->pos + (left->len - remain_left), buf_left, n_left); if (r) { *cmp = 0; return r; } p_left = buf_left; } else { p_left = left->at + (left->len - remain_left); } if (right->at == NULL) { r = read_at(right->d->root->f, right->pos + (right->len - remain_right), buf_right, n_right); if (r) { *cmp = 0; return r; } p_right = buf_right; } else { p_right = right->at + (right->len - remain_right); } r = buf_cmp(p_left, n_left, p_right, n_right, ignore_whitespace); if (r) { *cmp = r; return 0; } remain_left -= n_left; remain_right -= n_right; } *cmp = 0; return 0; } int diff_atom_same(bool *same, const struct diff_atom *left, const struct diff_atom *right) { int cmp; int r = diff_atom_cmp(&cmp, left, right); if (r) { *same = true; return r; } *same = (cmp == 0); return 0; } /* Even if a left or right side is empty, diff output may need to know the * position in that file. * So left_start or right_start must never be NULL -- pass left_count or * right_count as zero to indicate staying at that position without consuming * any lines. */ struct diff_chunk * diff_state_add_chunk(struct diff_state *state, bool solved, struct diff_atom *left_start, unsigned int left_count, struct diff_atom *right_start, unsigned int right_count) { diff_chunk_arraylist_t *result = NULL; struct diff_chunk *new_chunk; struct diff_chunk chunk = { .solved = solved, .left_start = left_start, .left_count = left_count, .right_start = right_start, .right_count = right_count, }; enum diff_chunk_type last_t; enum diff_chunk_type prev_last_t; enum diff_chunk_type new_t; if (!solved || state->temp_result.len) { /* Append to temp_result */ result = &state->temp_result; } else if (!state->result->chunks.len) { /* Append to final result */ result = &state->result->chunks; } if (result) { ARRAYLIST_ADD(new_chunk, *result); if (!new_chunk) return NULL; *new_chunk = chunk; goto chunk_added; } /* Append to solved chunks; make sure that adjacent chunks of same type are combined, and that a minus chunk * never directly follows a plus chunk. */ result = &state->result->chunks; prev_last_t = result->len > 1 ? diff_chunk_type(&result->head[result->len - 2]) : CHUNK_EMPTY; last_t = diff_chunk_type(&result->head[result->len - 1]); new_t = diff_chunk_type(&chunk); if (new_t == last_t) { new_chunk = &result->head[result->len - 1]; new_chunk->left_count += chunk.left_count; new_chunk->right_count += chunk.right_count; } else if (last_t == CHUNK_PLUS && new_t == CHUNK_MINUS) { /* If a minus-chunk follows a plus-chunk, place it above the plus-chunk. * Is the one before that also a minus? combine. */ if (prev_last_t == CHUNK_MINUS) { new_chunk = &result->head[result->len - 2]; new_chunk->left_count += chunk.left_count; new_chunk->right_count += chunk.right_count; } else { ARRAYLIST_INSERT(new_chunk, *result, result->len - 2); if (!new_chunk) return NULL; *new_chunk = chunk; } } else { ARRAYLIST_ADD(new_chunk, *result); if (!new_chunk) return NULL; *new_chunk = chunk; goto chunk_added; } chunk_added: debug("Add %s chunk:\n", new_chunk->solved ? "solved" : "UNSOLVED"); debug("L\n"); debug_dump_atoms(&state->left, new_chunk->left_start, new_chunk->left_count); debug("R\n"); debug_dump_atoms(&state->right, new_chunk->right_start, new_chunk->right_count); return new_chunk; } void diff_data_init_root(struct diff_data *d, FILE *f, const uint8_t *data, unsigned long long len, int diff_flags) { *d = (struct diff_data){ .f = f, .pos = 0, .data = data, .len = len, .root = d, .diff_flags = diff_flags, }; } void diff_data_init_subsection(struct diff_data *d, struct diff_data *parent, struct diff_atom *from_atom, unsigned int atoms_count) { struct diff_atom *last_atom; if (atoms_count == 0) { *d = (struct diff_data){ .f = NULL, .pos = 0, .data = "", .len = 0, .root = parent->root, .atoms.head = NULL, .atoms.len = atoms_count, }; return; } last_atom = from_atom + atoms_count - 1; *d = (struct diff_data){ .f = NULL, .pos = from_atom->pos, .data = from_atom->at, .len = (last_atom->pos + last_atom->len) - from_atom->pos, .root = parent->root, .atoms.head = from_atom, .atoms.len = atoms_count, }; debug("subsection:\n"); debug_dump(d); } void diff_data_free(struct diff_data *diff_data) { if (!diff_data) return; if (diff_data->atoms.allocated) ARRAYLIST_FREE(diff_data->atoms); } int diff_algo_none(const struct diff_algo_config *algo_config, struct diff_state *state) { debug("\n** %s\n", __func__); debug("left:\n"); debug_dump(&state->left); debug("right:\n"); debug_dump(&state->right); debug_dump_myers_graph(&state->left, &state->right, NULL, NULL, 0, NULL, 0); /* Add a chunk of equal lines, if any */ unsigned int equal_atoms = 0; while (equal_atoms < state->left.atoms.len && equal_atoms < state->right.atoms.len) { int r; bool same; r = diff_atom_same(&same, &state->left.atoms.head[equal_atoms], &state->right.atoms.head[equal_atoms]); if (r) return r; if (!same) break; equal_atoms++; } if (equal_atoms) { if (!diff_state_add_chunk(state, true, &state->left.atoms.head[0], equal_atoms, &state->right.atoms.head[0], equal_atoms)) return ENOMEM; } /* Add a "minus" chunk with all lines from the left. */ if (equal_atoms < state->left.atoms.len) { if (!diff_state_add_chunk(state, true, &state->left.atoms.head[equal_atoms], state->left.atoms.len - equal_atoms, NULL, 0)) return ENOMEM; } /* Add a "plus" chunk with all lines from the right. */ if (equal_atoms < state->right.atoms.len) { if (!diff_state_add_chunk(state, true, NULL, 0, &state->right.atoms.head[equal_atoms], state->right.atoms.len - equal_atoms)) return ENOMEM; } return DIFF_RC_OK; } int diff_run_algo(const struct diff_algo_config *algo_config, struct diff_state *state) { int rc; ARRAYLIST_FREE(state->temp_result); if (!algo_config || !algo_config->impl || !state->recursion_depth_left) { debug("MAX RECURSION REACHED, just dumping diff chunks\n"); return diff_algo_none(algo_config, state); } ARRAYLIST_INIT(state->temp_result, DIFF_RESULT_ALLOC_BLOCKSIZE); rc = algo_config->impl(algo_config, state); switch (rc) { case DIFF_RC_USE_DIFF_ALGO_FALLBACK: debug("Got DIFF_RC_USE_DIFF_ALGO_FALLBACK (%p)\n", algo_config->fallback_algo); rc = diff_run_algo(algo_config->fallback_algo, state); goto return_rc; case DIFF_RC_OK: /* continue below */ break; default: /* some error happened */ goto return_rc; } /* Pick up any diff chunks that are still unsolved and feed to * inner_algo. inner_algo will solve unsolved chunks and append to * result, and subsequent solved chunks on this level are then appended * to result afterwards. */ int i; for (i = 0; i < state->temp_result.len; i++) { struct diff_chunk *c = &state->temp_result.head[i]; if (c->solved) { struct diff_chunk *final_c; ARRAYLIST_ADD(final_c, state->result->chunks); if (!final_c) { rc = ENOMEM; goto return_rc; } *final_c = *c; continue; } /* c is an unsolved chunk, feed to inner_algo */ struct diff_state inner_state = { .result = state->result, .recursion_depth_left = state->recursion_depth_left - 1, }; diff_data_init_subsection(&inner_state.left, &state->left, c->left_start, c->left_count); diff_data_init_subsection(&inner_state.right, &state->right, c->right_start, c->right_count); rc = diff_run_algo(algo_config->inner_algo, &inner_state); if (rc != DIFF_RC_OK) goto return_rc; } rc = DIFF_RC_OK; return_rc: ARRAYLIST_FREE(state->temp_result); return rc; } struct diff_result * diff_main(const struct diff_config *config, FILE *left_f, const uint8_t *left_data, off_t left_len, FILE *right_f, const uint8_t *right_data, off_t right_len, int diff_flags) { struct diff_result *result = malloc(sizeof(struct diff_result)); if (!result) return NULL; *result = (struct diff_result){}; diff_data_init_root(&result->left, left_f, left_data, left_len, diff_flags); diff_data_init_root(&result->right, right_f, right_data, right_len, diff_flags); if (!config->atomize_func) { result->rc = EINVAL; return result; } result->rc = config->atomize_func(config->atomize_func_data, &result->left, &result->right); if (result->rc != DIFF_RC_OK) return result; struct diff_state state = { .result = result, .recursion_depth_left = config->max_recursion_depth ? : 32, }; diff_data_init_subsection(&state.left, &result->left, result->left.atoms.head, result->left.atoms.len); diff_data_init_subsection(&state.right, &result->right, result->right.atoms.head, result->right.atoms.len); result->rc = diff_run_algo(config->algo, &state); return result; } void diff_result_free(struct diff_result *result) { if (!result) return; diff_data_free(&result->left); diff_data_free(&result->right); ARRAYLIST_FREE(result->chunks); free(result); }