

/* Generic infrastructure to implement various diff algorithms. */
/*
 * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */
#pragma once

#include <stdint.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#include <errno.h>

#include <diff/arraylist.h>
#ifndef MAX
#define MAX(A,B) ((A)>(B)?(A):(B))
#endif
#ifndef MIN
#define MIN(A,B) ((A)<(B)?(A):(B))
#endif

/* A pair of atom/line indices delimiting a span, [start, end). */
struct range {
	int start;
	int end;
};

/* True when the range spans no elements at all. */
static inline bool range_empty(const struct range *r)
{
	return r->end == r->start;
}

/* True when a and b overlap or are directly adjacent, i.e. when merging
 * them would yield one contiguous range. */
static inline bool ranges_touch(const struct range *a, const struct range *b)
{
	/* Same as (a->end >= b->start) && (a->start <= b->end), via De Morgan. */
	return !((a->end < b->start) || (b->end < a->start));
}

/* Widen a in place so that it covers both a and b. */
static inline void ranges_merge(struct range *a, const struct range *b)
{
	int merged_start = MIN(a->start, b->start);
	int merged_end = MAX(a->end, b->end);

	a->start = merged_start;
	a->end = merged_end;
}

/* Number of elements spanned by r; a NULL range counts as empty. */
static inline int range_len(const struct range *r)
{
	return r ? r->end - r->start : 0;
}
/* List of all possible return codes of a diff invocation. */
enum diff_rc {
	/* The algorithm gave up; the caller should retry with algo_config->fallback_algo. */
	DIFF_RC_USE_DIFF_ALGO_FALLBACK = -1,
	DIFF_RC_OK = 0,
	/* Error values are aliased to the matching errno constants. */
	DIFF_RC_ENOTSUP = ENOTSUP,
	DIFF_RC_ENOMEM = ENOMEM,
	DIFF_RC_EINVAL = EINVAL,
};
/* A single indivisible unit of comparison, e.g. one line of text.
 * 'at' points into raw file data held by the enclosing diff_data; atoms do not own bytes. */
struct diff_atom {
	const uint8_t *at;	/* Start of this atom's bytes (not NUL-terminated). */
	size_t len;		/* Number of bytes at 'at'. */

	/* This hash is just a very cheap speed up for finding *mismatching* atoms. When hashes match, we still need to
	 * compare entire atoms to find out whether they are indeed identical or not. */
	unsigned int hash;

	/* State for the Patience Diff algorithm */
	/* TODO: keep a separate array for the patience state */
	struct {
		bool unique_here;	/* Atom occurs exactly once within its own file. */
		bool unique_in_both;	/* Atom occurs exactly once in each of both files. */
		struct diff_atom *pos_in_other;	/* NOTE(review): presumably the matching unique atom on the other side -- confirm in the patience implementation. */
		struct diff_atom *prev_stack;	/* NOTE(review): presumably a back-link used by the patience sorting stacks -- confirm in the patience implementation. */
		struct range identical_lines;	/* NOTE(review): presumably the surrounding run of identical atoms -- confirm in the patience implementation. */
	} patience;
};
93 static inline bool diff_atom_same(const struct diff_atom *left, const struct diff_atom *right)
94 {
95 return left->hash == right->hash
96 && left->len == right->len
97 && memcmp(left->at, right->at, left->len) == 0;
98 }
/* For each file, there is a "root" struct diff_data referencing the entire file, which the atoms are parsed from. In
 * recursion of diff algorithm, there may be "child" struct diff_data only referencing a subsection of the file,
 * re-using the atoms parsing. For "root" structs, atoms_allocated will be nonzero, indicating that the array of atoms
 * is owned by that struct. For "child" structs, atoms_allocated == 0, to indicate that the struct is referencing a
 * subset of atoms.
 * NOTE(review): no 'atoms_allocated' member is visible in this struct -- the above appears to describe state kept
 * inside the ARRAYLIST; confirm against diff/arraylist.h. */
struct diff_data {
	const uint8_t *data;	/* Raw file contents (not owned here). */
	size_t len;		/* Number of bytes at 'data'. */
	ARRAYLIST(struct diff_atom) atoms;	/* Atoms parsed from 'data'; owned by the root struct only. */
	struct diff_data *root;	/* The root diff_data this (sub)section belongs to. */
};

/* Release resources held by diff_data (the atoms array, when this is a root struct). */
void diff_data_free(struct diff_data *diff_data);
/* The atom's index in the entire file. For atoms divided by lines of text, this yields the line number (starting with
 * 0). Also works for diff_data that reference only a subsection of a file, always reflecting the global position in
 * the file (and not the relative position within the subsection).
 * A NULL ATOM, or one below the start of the root atoms array, yields atoms.len, i.e. one past the last valid index. */
#define diff_atom_root_idx(DIFF_DATA, ATOM) \
	((ATOM) && ((ATOM) >= (DIFF_DATA)->root->atoms.head) \
	 ? (unsigned int)((ATOM) - ((DIFF_DATA)->root->atoms.head)) \
	 : (DIFF_DATA)->root->atoms.len)

/* The atom's index within DIFF_DATA. For atoms divided by lines of text, this yields the line number (starting with
 * 0). A NULL or out-of-range ATOM yields atoms.len, as above. */
#define diff_atom_idx(DIFF_DATA, ATOM) \
	((ATOM) && ((ATOM) >= (DIFF_DATA)->atoms.head) \
	 ? (unsigned int)((ATOM) - ((DIFF_DATA)->atoms.head)) \
	 : (DIFF_DATA)->atoms.len)

/* Iterate ATOM over COUNT atoms starting at FIRST_ATOM; iterates zero times when FIRST_ATOM is NULL. */
#define foreach_diff_atom(ATOM, FIRST_ATOM, COUNT) \
	for ((ATOM) = (FIRST_ATOM); \
	     (ATOM) && ((ATOM) >= (FIRST_ATOM)) && ((ATOM) - (FIRST_ATOM) < (COUNT)); \
	     (ATOM)++)

/* Iterate ATOM over all atoms of DIFF_DATA. */
#define diff_data_foreach_atom(ATOM, DIFF_DATA) \
	foreach_diff_atom(ATOM, (DIFF_DATA)->atoms.head, (DIFF_DATA)->atoms.len)

/* Iterate ATOM over the atoms of DIFF_DATA, starting at FROM (which should point into DIFF_DATA's atoms array). */
#define diff_data_foreach_atom_from(FROM, ATOM, DIFF_DATA) \
	for ((ATOM) = (FROM); \
	     (ATOM) && ((ATOM) >= (DIFF_DATA)->atoms.head) && ((ATOM) - (DIFF_DATA)->atoms.head < (DIFF_DATA)->atoms.len); \
	     (ATOM)++)
/* A diff chunk represents a set of atoms on the left and/or a set of atoms on the right.
 *
 * If solved == false:
 * The diff algorithm has divided the source file, and this is a chunk that the inner_algo should run on next.
 * The lines on the left should be diffed against the lines on the right.
 * (If there are no left lines or no right lines, it implies solved == true, because there is nothing to diff.)
 *
 * If solved == true:
 * If there are only left atoms, it is a chunk removing atoms from the left ("a minus chunk").
 * If there are only right atoms, it is a chunk adding atoms from the right ("a plus chunk").
 * If there are both left and right lines, it is a chunk of equal content on both sides,
 * and left_count == right_count:
 *
 * - foo }
 * - bar }-- diff_chunk{ left_start = &left.atoms.head[0], left_count = 3,
 * - baz }              right_start = NULL, right_count = 0 }
 *   moo }
 *   goo }-- diff_chunk{ left_start = &left.atoms.head[3], left_count = 3,
 *   zoo }              right_start = &right.atoms.head[0], right_count = 3 }
 * + loo }
 * + roo }-- diff_chunk{ left_start = NULL, left_count = 0,
 * + too }              right_start = &right.atoms.head[3], right_count = 3 }
 */
struct diff_chunk {
	bool solved;	/* See the semantics described above. */
	struct diff_atom *left_start;	/* First left-side atom, or NULL for a pure plus chunk. */
	unsigned int left_count;	/* Number of atoms at left_start. */
	struct diff_atom *right_start;	/* First right-side atom, or NULL for a pure minus chunk. */
	unsigned int right_count;	/* Number of atoms at right_start. */
};

typedef ARRAYLIST(struct diff_chunk) diff_chunk_arraylist_t;
/* Allocation granularity (in chunks) for growing a diff result's chunk list. */
#define DIFF_RESULT_ALLOC_BLOCKSIZE 128
/* The complete outcome of a diff invocation, handed back to the caller of diff_main(). */
struct diff_result {
	enum diff_rc rc;	/* Overall outcome; inspect before using 'chunks'. */
	struct diff_data left;	/* Root diff_data for the left input. */
	struct diff_data right;	/* Root diff_data for the right input. */
	diff_chunk_arraylist_t chunks;	/* The resulting minus/plus/equal chunks. */
};
/* Mutable state of one diff algorithm pass, carried through recursive subdivision of the inputs. */
struct diff_state {
	/* The final result passed to the original diff caller. */
	struct diff_result *result;

	/* The root diff_data is in result->left,right, these are (possibly) subsections of the root data. */
	struct diff_data left;
	struct diff_data right;

	/* NOTE(review): the name suggests the remaining recursion budget, counting down toward
	 * diff_config.max_recursion_depth exhaustion -- confirm in diff_main.c. */
	unsigned int recursion_depth_left;

	/* Remaining chunks from one diff algorithm pass, if any solved == false chunks came up. */
	diff_chunk_arraylist_t temp_result;
};

/* Append a chunk with the given atom spans to the state; see struct diff_chunk for the
 * solved/left/right semantics. Returns the chunk added, presumably NULL on allocation
 * failure -- confirm in diff_main.c. */
struct diff_chunk *diff_state_add_chunk(struct diff_state *state, bool solved,
					struct diff_atom *left_start, unsigned int left_count,
					struct diff_atom *right_start, unsigned int right_count);
/* Signature of a utility function to divide both source files into diff atoms.
 * It is possible that a (future) algorithm requires both source files to decide on atom split points, hence this gets
 * both left and right to atomize at the same time.
 * An example is diff_atomize_text_by_line() in diff_atomize_text.c.
 *
 * func_data: context pointer (free to be used by implementation).
 * left: struct diff_data with left->data and left->len already set up, and left->atoms to be created.
 * right: struct diff_data with right->data and right->len already set up, and right->atoms to be created.
 */
typedef enum diff_rc (*diff_atomize_func_t)(void *func_data, struct diff_data *left, struct diff_data *right);

/* Stock atomizer: split both inputs into one atom per line of text. */
extern enum diff_rc diff_atomize_text_by_line(void *func_data, struct diff_data *left, struct diff_data *right);
struct diff_algo_config;
/* Signature of a diff algorithm implementation; algo_config carries nesting/fallback parameters
 * (see struct diff_algo_config below). */
typedef enum diff_rc (*diff_algo_impl_t)(const struct diff_algo_config *algo_config, struct diff_state *state);

/* Form a result with all left-side removed and all right-side added, i.e. no actual diff algorithm involved. */
enum diff_rc diff_algo_none(const struct diff_algo_config *algo_config, struct diff_state *state);

/* Myers Diff tracing from the start all the way through to the end, requiring quadratic amounts of memory. This can
 * fail if the required space surpasses algo_config->permitted_state_size. */
extern enum diff_rc diff_algo_myers(const struct diff_algo_config *algo_config, struct diff_state *state);

/* Myers "Divide et Impera": tracing forwards from the start and backwards from the end to find a midpoint that divides
 * the problem into smaller chunks. Requires only linear amounts of memory. */
extern enum diff_rc diff_algo_myers_divide(const struct diff_algo_config *algo_config, struct diff_state *state);

/* Patience Diff algorithm, which divides a larger diff into smaller chunks. For very specific scenarios, it may lead to
 * a complete diff result by itself, but needs a fallback algo to solve chunks that don't have common-unique atoms. */
extern enum diff_rc diff_algo_patience(const struct diff_algo_config *algo_config, struct diff_state *state);
/* Diff algorithms to use, possibly nested. For example:
 *
 * struct diff_algo_config myers, patience, myers_divide;
 *
 * myers = (struct diff_algo_config){
 *         .impl = diff_algo_myers,
 *         .permitted_state_size = 32 * 1024 * 1024,
 *         .fallback_algo = &patience, // when too large, do diff_algo_patience
 * };
 *
 * patience = (struct diff_algo_config){
 *         .impl = diff_algo_patience,
 *         .inner_algo = &patience,        // After subdivision, do Patience again.
 *         .fallback_algo = &myers_divide, // If subdivision failed, do Myers Divide et Impera.
 * };
 *
 * myers_divide = (struct diff_algo_config){
 *         .impl = diff_algo_myers_divide,
 *         .inner_algo = &myers, // When division succeeded, start from the top.
 *         // (fallback_algo = NULL implies diff_algo_none).
 * };
 *
 * struct diff_config config = {
 *         .algo = &myers,
 *         ...
 * };
 * diff_main(&config, ...);
 */
struct diff_algo_config {
	diff_algo_impl_t impl;

	/* Fail this algo if it would use more than this amount of memory, and instead use fallback_algo
	 * (diff_algo_myers). permitted_state_size == 0 means no limitation. */
	size_t permitted_state_size;

	/* For algorithms that divide into smaller chunks, use this algorithm to solve the divided chunks. */
	const struct diff_algo_config *inner_algo;

	/* If the algorithm fails (e.g. diff_algo_myers_if_small needs too large state, or diff_algo_patience can't find
	 * any common-unique atoms), then use this algorithm instead. */
	const struct diff_algo_config *fallback_algo;
};
/* Top-level configuration for diff_main(). */
struct diff_config {
	diff_atomize_func_t atomize_func;	/* How to split the raw inputs into atoms. */
	void *atomize_func_data;		/* Opaque context passed through to atomize_func. */

	const struct diff_algo_config *algo;	/* The (possibly nested) algorithm configuration to run. */

	/* How deep to step into subdivisions of a source file, a paranoia / safety measure to guard against infinite
	 * loops through diff algorithms. When the maximum recursion is reached, employ diff_algo_none (i.e. remove all
	 * left atoms and add all right atoms). */
	unsigned int max_recursion_depth;
};
/* Run the configured diff on the given left and right buffers.
 * Returns a heap-allocated result that the caller must release with diff_result_free();
 * inspect result->rc for the outcome. NOTE(review): behavior on allocation failure
 * (NULL return vs. rc) is defined in diff_main.c -- confirm there. */
struct diff_result *diff_main(const struct diff_config *config,
			      const uint8_t *left_data, size_t left_len,
			      const uint8_t *right_data, size_t right_len);
/* Release a result obtained from diff_main(). */
void diff_result_free(struct diff_result *result);