Blame


1 3b0f3d61 2020-01-22 neels /* Split source by line breaks, and calculate a simplistic checksum. */
2 3b0f3d61 2020-01-22 neels /*
3 3b0f3d61 2020-01-22 neels * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
4 3b0f3d61 2020-01-22 neels *
5 3b0f3d61 2020-01-22 neels * Permission to use, copy, modify, and distribute this software for any
6 3b0f3d61 2020-01-22 neels * purpose with or without fee is hereby granted, provided that the above
7 3b0f3d61 2020-01-22 neels * copyright notice and this permission notice appear in all copies.
8 3b0f3d61 2020-01-22 neels *
9 3b0f3d61 2020-01-22 neels * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 3b0f3d61 2020-01-22 neels * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 3b0f3d61 2020-01-22 neels * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 3b0f3d61 2020-01-22 neels * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 3b0f3d61 2020-01-22 neels * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 3b0f3d61 2020-01-22 neels * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 3b0f3d61 2020-01-22 neels * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16 3b0f3d61 2020-01-22 neels */
17 3b0f3d61 2020-01-22 neels
18 e10a628a 2020-09-16 stsp #include <errno.h>
19 e10a628a 2020-09-16 stsp #include <stdbool.h>
20 fe6d58fb 2020-11-14 naddy #include <stdint.h>
21 c6eecea3 2020-07-26 stsp #include <stdio.h>
22 e10a628a 2020-09-16 stsp #include <stdlib.h>
23 e10a628a 2020-09-16 stsp #include <unistd.h>
24 845f3575 2020-10-22 neels #include <ctype.h>
25 c6eecea3 2020-07-26 stsp
26 1dfba055 2020-10-07 stsp #include <arraylist.h>
27 1dfba055 2020-10-07 stsp #include <diff_main.h>
28 1dfba055 2020-10-07 stsp
29 85ab4559 2020-09-22 stsp #include "diff_internal.h"
30 2a1b94d0 2020-09-26 stsp #include "diff_debug.h"
31 3b0f3d61 2020-01-22 neels
/*
 * Fold one more byte into a running atom hash.
 *
 * The previous hash value is multiplied by 23 and the byte is added; the
 * arithmetic is unsigned, so overflow wraps around. Returns the new hash.
 */
unsigned int
diff_atom_hash_update(unsigned int hash, unsigned char atom_byte)
{
	const unsigned int multiplier = 23;
	unsigned int updated = hash;

	updated *= multiplier;
	updated += atom_byte;
	return updated;
}
37 29916bb6 2020-11-18 stsp
38 61a7b578 2020-05-06 neels static int
39 c6eecea3 2020-07-26 stsp diff_data_atomize_text_lines_fd(struct diff_data *d)
40 3b0f3d61 2020-01-22 neels {
41 7a54ad3a 2020-09-20 stsp off_t pos = 0;
42 c6eecea3 2020-07-26 stsp const off_t end = pos + d->len;
43 c6eecea3 2020-07-26 stsp unsigned int array_size_estimate = d->len / 50;
44 c6eecea3 2020-07-26 stsp unsigned int pow2 = 1;
45 845f3575 2020-10-22 neels bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE);
46 e51ebd83 2020-11-21 stsp bool embedded_nul = false;
47 7a54ad3a 2020-09-20 stsp
48 c6eecea3 2020-07-26 stsp while (array_size_estimate >>= 1)
49 c6eecea3 2020-07-26 stsp pow2++;
50 c6eecea3 2020-07-26 stsp
51 c6eecea3 2020-07-26 stsp ARRAYLIST_INIT(d->atoms, 1 << pow2);
52 c6eecea3 2020-07-26 stsp
53 7a54ad3a 2020-09-20 stsp if (fseek(d->root->f, 0L, SEEK_SET) == -1)
54 7a54ad3a 2020-09-20 stsp return errno;
55 7a54ad3a 2020-09-20 stsp
56 c6eecea3 2020-07-26 stsp while (pos < end) {
57 c6eecea3 2020-07-26 stsp off_t line_end = pos;
58 c6eecea3 2020-07-26 stsp unsigned int hash = 0;
59 c6eecea3 2020-07-26 stsp unsigned char buf[512];
60 7a54ad3a 2020-09-20 stsp size_t r, i;
61 c6eecea3 2020-07-26 stsp struct diff_atom *atom;
62 c6eecea3 2020-07-26 stsp int eol = 0;
63 c6eecea3 2020-07-26 stsp
64 c6eecea3 2020-07-26 stsp while (eol == 0 && line_end < end) {
65 7a54ad3a 2020-09-20 stsp r = fread(buf, sizeof(char), sizeof(buf), d->root->f);
66 7a54ad3a 2020-09-20 stsp if (r == 0 && ferror(d->root->f))
67 674563ab 2022-10-11 tj return EIO;
68 c6eecea3 2020-07-26 stsp i = 0;
69 c6eecea3 2020-07-26 stsp while (eol == 0 && i < r) {
70 c6eecea3 2020-07-26 stsp if (buf[i] != '\r' && buf[i] != '\n') {
71 845f3575 2020-10-22 neels if (!ignore_whitespace
72 1dce05e8 2022-11-17 op || !isspace((unsigned char)buf[i]))
73 29916bb6 2020-11-18 stsp hash = diff_atom_hash_update(
74 29916bb6 2020-11-18 stsp hash, buf[i]);
75 e51ebd83 2020-11-21 stsp if (buf[i] == '\0')
76 e51ebd83 2020-11-21 stsp embedded_nul = true;
77 c6eecea3 2020-07-26 stsp line_end++;
78 c6eecea3 2020-07-26 stsp } else
79 c6eecea3 2020-07-26 stsp eol = buf[i];
80 c6eecea3 2020-07-26 stsp i++;
81 c6eecea3 2020-07-26 stsp }
82 c6eecea3 2020-07-26 stsp }
83 c6eecea3 2020-07-26 stsp
84 c6eecea3 2020-07-26 stsp /* When not at the end of data, the line ending char ('\r' or
85 c6eecea3 2020-07-26 stsp * '\n') must follow */
86 c6eecea3 2020-07-26 stsp if (line_end < end)
87 c6eecea3 2020-07-26 stsp line_end++;
88 c6eecea3 2020-07-26 stsp /* If that was an '\r', also pull in any following '\n' */
89 c6eecea3 2020-07-26 stsp if (line_end < end && eol == '\r') {
90 7a54ad3a 2020-09-20 stsp if (fseeko(d->root->f, line_end, SEEK_SET) == -1)
91 c6eecea3 2020-07-26 stsp return errno;
92 7a54ad3a 2020-09-20 stsp r = fread(buf, sizeof(char), sizeof(buf), d->root->f);
93 7a54ad3a 2020-09-20 stsp if (r == 0 && ferror(d->root->f))
94 674563ab 2022-10-11 tj return EIO;
95 ed9312f0 2022-07-26 op if (r > 0 && buf[0] == '\n')
96 c6eecea3 2020-07-26 stsp line_end++;
97 c6eecea3 2020-07-26 stsp }
98 c6eecea3 2020-07-26 stsp
99 c6eecea3 2020-07-26 stsp /* Record the found line as diff atom */
100 c6eecea3 2020-07-26 stsp ARRAYLIST_ADD(atom, d->atoms);
101 c6eecea3 2020-07-26 stsp if (!atom)
102 3e6cba3a 2020-08-13 stsp return ENOMEM;
103 c6eecea3 2020-07-26 stsp
104 c6eecea3 2020-07-26 stsp *atom = (struct diff_atom){
105 ad5b3f85 2020-10-12 neels .root = d,
106 c6eecea3 2020-07-26 stsp .pos = pos,
107 c6eecea3 2020-07-26 stsp .at = NULL, /* atom data is not memory-mapped */
108 c6eecea3 2020-07-26 stsp .len = line_end - pos,
109 c6eecea3 2020-07-26 stsp .hash = hash,
110 c6eecea3 2020-07-26 stsp };
111 c6eecea3 2020-07-26 stsp
112 c6eecea3 2020-07-26 stsp /* Starting point for next line: */
113 c6eecea3 2020-07-26 stsp pos = line_end;
114 7a54ad3a 2020-09-20 stsp if (fseeko(d->root->f, pos, SEEK_SET) == -1)
115 03f49727 2020-09-20 stsp return errno;
116 c6eecea3 2020-07-26 stsp }
117 c6eecea3 2020-07-26 stsp
118 e51ebd83 2020-11-21 stsp /* File are considered binary if they contain embedded '\0' bytes. */
119 e51ebd83 2020-11-21 stsp if (embedded_nul)
120 e51ebd83 2020-11-21 stsp d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA;
121 e51ebd83 2020-11-21 stsp
122 c6eecea3 2020-07-26 stsp return DIFF_RC_OK;
123 c6eecea3 2020-07-26 stsp }
124 c6eecea3 2020-07-26 stsp
125 c6eecea3 2020-07-26 stsp static int
126 c6eecea3 2020-07-26 stsp diff_data_atomize_text_lines_mmap(struct diff_data *d)
127 c6eecea3 2020-07-26 stsp {
128 3b0f3d61 2020-01-22 neels const uint8_t *pos = d->data;
129 3b0f3d61 2020-01-22 neels const uint8_t *end = pos + d->len;
130 845f3575 2020-10-22 neels bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE);
131 e51ebd83 2020-11-21 stsp bool embedded_nul = false;
132 3b0f3d61 2020-01-22 neels unsigned int array_size_estimate = d->len / 50;
133 3b0f3d61 2020-01-22 neels unsigned int pow2 = 1;
134 3b0f3d61 2020-01-22 neels while (array_size_estimate >>= 1)
135 3b0f3d61 2020-01-22 neels pow2++;
136 3b0f3d61 2020-01-22 neels
137 3b0f3d61 2020-01-22 neels ARRAYLIST_INIT(d->atoms, 1 << pow2);
138 3b0f3d61 2020-01-22 neels
139 3b0f3d61 2020-01-22 neels while (pos < end) {
140 3b0f3d61 2020-01-22 neels const uint8_t *line_end = pos;
141 3b0f3d61 2020-01-22 neels unsigned int hash = 0;
142 3b0f3d61 2020-01-22 neels
143 3b0f3d61 2020-01-22 neels while (line_end < end && *line_end != '\r' && *line_end != '\n') {
144 845f3575 2020-10-22 neels if (!ignore_whitespace
145 1dce05e8 2022-11-17 op || !isspace((unsigned char)*line_end))
146 cd9ef01a 2022-08-03 op hash = diff_atom_hash_update(hash, *line_end);
147 e51ebd83 2020-11-21 stsp if (*line_end == '\0')
148 e51ebd83 2020-11-21 stsp embedded_nul = true;
149 3b0f3d61 2020-01-22 neels line_end++;
150 3b0f3d61 2020-01-22 neels }
151 3b0f3d61 2020-01-22 neels
152 0d27172a 2020-05-06 neels /* When not at the end of data, the line ending char ('\r' or
153 0d27172a 2020-05-06 neels * '\n') must follow */
154 ed9312f0 2022-07-26 op if (line_end < end && *line_end == '\r')
155 3b0f3d61 2020-01-22 neels line_end++;
156 ed9312f0 2022-07-26 op if (line_end < end && *line_end == '\n')
157 3b0f3d61 2020-01-22 neels line_end++;
158 3b0f3d61 2020-01-22 neels
159 3b0f3d61 2020-01-22 neels /* Record the found line as diff atom */
160 3b0f3d61 2020-01-22 neels struct diff_atom *atom;
161 3b0f3d61 2020-01-22 neels ARRAYLIST_ADD(atom, d->atoms);
162 3b0f3d61 2020-01-22 neels if (!atom)
163 3e6cba3a 2020-08-13 stsp return ENOMEM;
164 3b0f3d61 2020-01-22 neels
165 3b0f3d61 2020-01-22 neels *atom = (struct diff_atom){
166 ad5b3f85 2020-10-12 neels .root = d,
167 c6eecea3 2020-07-26 stsp .pos = (off_t)(pos - d->data),
168 3b0f3d61 2020-01-22 neels .at = pos,
169 3b0f3d61 2020-01-22 neels .len = line_end - pos,
170 3b0f3d61 2020-01-22 neels .hash = hash,
171 3b0f3d61 2020-01-22 neels };
172 3b0f3d61 2020-01-22 neels
173 3b0f3d61 2020-01-22 neels /* Starting point for next line: */
174 3b0f3d61 2020-01-22 neels pos = line_end;
175 3b0f3d61 2020-01-22 neels }
176 3b0f3d61 2020-01-22 neels
177 e51ebd83 2020-11-21 stsp /* File are considered binary if they contain embedded '\0' bytes. */
178 e51ebd83 2020-11-21 stsp if (embedded_nul)
179 e51ebd83 2020-11-21 stsp d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA;
180 e51ebd83 2020-11-21 stsp
181 3b0f3d61 2020-01-22 neels return DIFF_RC_OK;
182 3b0f3d61 2020-01-22 neels }
183 3b0f3d61 2020-01-22 neels
184 c6eecea3 2020-07-26 stsp static int
185 c6eecea3 2020-07-26 stsp diff_data_atomize_text_lines(struct diff_data *d)
186 c6eecea3 2020-07-26 stsp {
187 40dba3d8 2020-10-16 stsp if (d->data == NULL)
188 1ea18522 2020-10-16 stsp return diff_data_atomize_text_lines_fd(d);
189 40dba3d8 2020-10-16 stsp else
190 40dba3d8 2020-10-16 stsp return diff_data_atomize_text_lines_mmap(d);
191 c6eecea3 2020-07-26 stsp }
192 c6eecea3 2020-07-26 stsp
/*
 * Atomize d line by line. func_data is not used; the work is delegated to
 * diff_data_atomize_text_lines() and its return value is passed through.
 */
int
diff_atomize_text_by_line(void *func_data, struct diff_data *d)
{
	int rc;

	(void)func_data;	/* unused */
	rc = diff_data_atomize_text_lines(d);
	return rc;
}