Blame


1 3b0f3d61 2020-01-22 neels /* Split source by line breaks, and calculate a simplistic checksum. */
2 3b0f3d61 2020-01-22 neels /*
3 3b0f3d61 2020-01-22 neels * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
4 3b0f3d61 2020-01-22 neels *
5 3b0f3d61 2020-01-22 neels * Permission to use, copy, modify, and distribute this software for any
6 3b0f3d61 2020-01-22 neels * purpose with or without fee is hereby granted, provided that the above
7 3b0f3d61 2020-01-22 neels * copyright notice and this permission notice appear in all copies.
8 3b0f3d61 2020-01-22 neels *
9 3b0f3d61 2020-01-22 neels * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 3b0f3d61 2020-01-22 neels * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 3b0f3d61 2020-01-22 neels * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 3b0f3d61 2020-01-22 neels * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 3b0f3d61 2020-01-22 neels * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 3b0f3d61 2020-01-22 neels * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 3b0f3d61 2020-01-22 neels * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16 3b0f3d61 2020-01-22 neels */
17 3b0f3d61 2020-01-22 neels
18 e10a628a 2020-09-16 stsp #include <errno.h>
19 e10a628a 2020-09-16 stsp #include <inttypes.h>
20 e10a628a 2020-09-16 stsp #include <stdbool.h>
21 c6eecea3 2020-07-26 stsp #include <stdio.h>
22 e10a628a 2020-09-16 stsp #include <stdlib.h>
23 e10a628a 2020-09-16 stsp #include <unistd.h>
24 c6eecea3 2020-07-26 stsp
25 1dfba055 2020-10-07 stsp #include <arraylist.h>
26 1dfba055 2020-10-07 stsp #include <diff_main.h>
27 1dfba055 2020-10-07 stsp
28 85ab4559 2020-09-22 stsp #include "diff_internal.h"
29 2a1b94d0 2020-09-26 stsp #include "diff_debug.h"
30 3b0f3d61 2020-01-22 neels
31 61a7b578 2020-05-06 neels static int
32 c6eecea3 2020-07-26 stsp diff_data_atomize_text_lines_fd(struct diff_data *d)
33 3b0f3d61 2020-01-22 neels {
34 7a54ad3a 2020-09-20 stsp off_t pos = 0;
35 c6eecea3 2020-07-26 stsp const off_t end = pos + d->len;
36 c6eecea3 2020-07-26 stsp unsigned int array_size_estimate = d->len / 50;
37 c6eecea3 2020-07-26 stsp unsigned int pow2 = 1;
38 7a54ad3a 2020-09-20 stsp
39 c6eecea3 2020-07-26 stsp while (array_size_estimate >>= 1)
40 c6eecea3 2020-07-26 stsp pow2++;
41 c6eecea3 2020-07-26 stsp
42 c6eecea3 2020-07-26 stsp ARRAYLIST_INIT(d->atoms, 1 << pow2);
43 c6eecea3 2020-07-26 stsp
44 7a54ad3a 2020-09-20 stsp if (fseek(d->root->f, 0L, SEEK_SET) == -1)
45 7a54ad3a 2020-09-20 stsp return errno;
46 7a54ad3a 2020-09-20 stsp
47 c6eecea3 2020-07-26 stsp while (pos < end) {
48 c6eecea3 2020-07-26 stsp off_t line_end = pos;
49 c6eecea3 2020-07-26 stsp unsigned int hash = 0;
50 c6eecea3 2020-07-26 stsp unsigned char buf[512];
51 7a54ad3a 2020-09-20 stsp size_t r, i;
52 c6eecea3 2020-07-26 stsp struct diff_atom *atom;
53 c6eecea3 2020-07-26 stsp int eol = 0;
54 c6eecea3 2020-07-26 stsp
55 c6eecea3 2020-07-26 stsp while (eol == 0 && line_end < end) {
56 7a54ad3a 2020-09-20 stsp r = fread(buf, sizeof(char), sizeof(buf), d->root->f);
57 7a54ad3a 2020-09-20 stsp if (r == 0 && ferror(d->root->f))
58 c6eecea3 2020-07-26 stsp return errno;
59 c6eecea3 2020-07-26 stsp i = 0;
60 c6eecea3 2020-07-26 stsp while (eol == 0 && i < r) {
61 c6eecea3 2020-07-26 stsp if (buf[i] != '\r' && buf[i] != '\n') {
62 c6eecea3 2020-07-26 stsp hash = hash * 23 + buf[i];
63 c6eecea3 2020-07-26 stsp line_end++;
64 c6eecea3 2020-07-26 stsp } else
65 c6eecea3 2020-07-26 stsp eol = buf[i];
66 c6eecea3 2020-07-26 stsp i++;
67 c6eecea3 2020-07-26 stsp }
68 c6eecea3 2020-07-26 stsp }
69 c6eecea3 2020-07-26 stsp
70 c6eecea3 2020-07-26 stsp /* When not at the end of data, the line ending char ('\r' or
71 c6eecea3 2020-07-26 stsp * '\n') must follow */
72 c6eecea3 2020-07-26 stsp if (line_end < end)
73 c6eecea3 2020-07-26 stsp line_end++;
74 c6eecea3 2020-07-26 stsp /* If that was an '\r', also pull in any following '\n' */
75 c6eecea3 2020-07-26 stsp if (line_end < end && eol == '\r') {
76 7a54ad3a 2020-09-20 stsp if (fseeko(d->root->f, line_end, SEEK_SET) == -1)
77 c6eecea3 2020-07-26 stsp return errno;
78 7a54ad3a 2020-09-20 stsp r = fread(buf, sizeof(char), sizeof(buf), d->root->f);
79 7a54ad3a 2020-09-20 stsp if (r == 0 && ferror(d->root->f))
80 c6eecea3 2020-07-26 stsp return errno;
81 c6eecea3 2020-07-26 stsp if (r == 1 && buf[0] == '\n' )
82 c6eecea3 2020-07-26 stsp line_end++;
83 c6eecea3 2020-07-26 stsp }
84 c6eecea3 2020-07-26 stsp
85 c6eecea3 2020-07-26 stsp /* Record the found line as diff atom */
86 c6eecea3 2020-07-26 stsp ARRAYLIST_ADD(atom, d->atoms);
87 c6eecea3 2020-07-26 stsp if (!atom)
88 3e6cba3a 2020-08-13 stsp return ENOMEM;
89 c6eecea3 2020-07-26 stsp
90 c6eecea3 2020-07-26 stsp *atom = (struct diff_atom){
91 c6eecea3 2020-07-26 stsp .d = d,
92 c6eecea3 2020-07-26 stsp .pos = pos,
93 c6eecea3 2020-07-26 stsp .at = NULL, /* atom data is not memory-mapped */
94 c6eecea3 2020-07-26 stsp .len = line_end - pos,
95 c6eecea3 2020-07-26 stsp .hash = hash,
96 c6eecea3 2020-07-26 stsp };
97 c6eecea3 2020-07-26 stsp
98 c6eecea3 2020-07-26 stsp /* Starting point for next line: */
99 c6eecea3 2020-07-26 stsp pos = line_end;
100 7a54ad3a 2020-09-20 stsp if (fseeko(d->root->f, pos, SEEK_SET) == -1)
101 03f49727 2020-09-20 stsp return errno;
102 c6eecea3 2020-07-26 stsp }
103 c6eecea3 2020-07-26 stsp
104 c6eecea3 2020-07-26 stsp return DIFF_RC_OK;
105 c6eecea3 2020-07-26 stsp }
106 c6eecea3 2020-07-26 stsp
107 c6eecea3 2020-07-26 stsp static int
108 c6eecea3 2020-07-26 stsp diff_data_atomize_text_lines_mmap(struct diff_data *d)
109 c6eecea3 2020-07-26 stsp {
110 3b0f3d61 2020-01-22 neels const uint8_t *pos = d->data;
111 3b0f3d61 2020-01-22 neels const uint8_t *end = pos + d->len;
112 3b0f3d61 2020-01-22 neels
113 3b0f3d61 2020-01-22 neels unsigned int array_size_estimate = d->len / 50;
114 3b0f3d61 2020-01-22 neels unsigned int pow2 = 1;
115 3b0f3d61 2020-01-22 neels while (array_size_estimate >>= 1)
116 3b0f3d61 2020-01-22 neels pow2++;
117 3b0f3d61 2020-01-22 neels
118 3b0f3d61 2020-01-22 neels ARRAYLIST_INIT(d->atoms, 1 << pow2);
119 3b0f3d61 2020-01-22 neels
120 3b0f3d61 2020-01-22 neels while (pos < end) {
121 3b0f3d61 2020-01-22 neels const uint8_t *line_end = pos;
122 3b0f3d61 2020-01-22 neels unsigned int hash = 0;
123 3b0f3d61 2020-01-22 neels
124 3b0f3d61 2020-01-22 neels while (line_end < end && *line_end != '\r' && *line_end != '\n') {
125 3b0f3d61 2020-01-22 neels hash = hash * 23 + *line_end;
126 3b0f3d61 2020-01-22 neels line_end++;
127 3b0f3d61 2020-01-22 neels }
128 3b0f3d61 2020-01-22 neels
129 0d27172a 2020-05-06 neels /* When not at the end of data, the line ending char ('\r' or
130 0d27172a 2020-05-06 neels * '\n') must follow */
131 3b0f3d61 2020-01-22 neels if (line_end < end)
132 3b0f3d61 2020-01-22 neels line_end++;
133 3b0f3d61 2020-01-22 neels /* If that was an '\r', also pull in any following '\n' */
134 0d27172a 2020-05-06 neels if (line_end[0] == '\r'
135 0d27172a 2020-05-06 neels && line_end < end && line_end[1] == '\n')
136 3b0f3d61 2020-01-22 neels line_end++;
137 3b0f3d61 2020-01-22 neels
138 3b0f3d61 2020-01-22 neels /* Record the found line as diff atom */
139 3b0f3d61 2020-01-22 neels struct diff_atom *atom;
140 3b0f3d61 2020-01-22 neels ARRAYLIST_ADD(atom, d->atoms);
141 3b0f3d61 2020-01-22 neels if (!atom)
142 3e6cba3a 2020-08-13 stsp return ENOMEM;
143 3b0f3d61 2020-01-22 neels
144 3b0f3d61 2020-01-22 neels *atom = (struct diff_atom){
145 c6eecea3 2020-07-26 stsp .d = d,
146 c6eecea3 2020-07-26 stsp .pos = (off_t)(pos - d->data),
147 3b0f3d61 2020-01-22 neels .at = pos,
148 3b0f3d61 2020-01-22 neels .len = line_end - pos,
149 3b0f3d61 2020-01-22 neels .hash = hash,
150 3b0f3d61 2020-01-22 neels };
151 3b0f3d61 2020-01-22 neels
152 3b0f3d61 2020-01-22 neels /* Starting point for next line: */
153 3b0f3d61 2020-01-22 neels pos = line_end;
154 3b0f3d61 2020-01-22 neels }
155 3b0f3d61 2020-01-22 neels
156 3b0f3d61 2020-01-22 neels return DIFF_RC_OK;
157 3b0f3d61 2020-01-22 neels }
158 3b0f3d61 2020-01-22 neels
159 c6eecea3 2020-07-26 stsp static int
160 c6eecea3 2020-07-26 stsp diff_data_atomize_text_lines(struct diff_data *d)
161 c6eecea3 2020-07-26 stsp {
162 c6eecea3 2020-07-26 stsp if (d->data == NULL)
163 c6eecea3 2020-07-26 stsp return diff_data_atomize_text_lines_fd(d);
164 c6eecea3 2020-07-26 stsp else
165 c6eecea3 2020-07-26 stsp return diff_data_atomize_text_lines_mmap(d);
166 c6eecea3 2020-07-26 stsp }
167 c6eecea3 2020-07-26 stsp
168 3e6cba3a 2020-08-13 stsp int
169 0d27172a 2020-05-06 neels diff_atomize_text_by_line(void *func_data, struct diff_data *left,
170 0d27172a 2020-05-06 neels struct diff_data *right)
171 3b0f3d61 2020-01-22 neels {
172 3e6cba3a 2020-08-13 stsp int rc;
173 3b0f3d61 2020-01-22 neels rc = diff_data_atomize_text_lines(left);
174 3b0f3d61 2020-01-22 neels if (rc != DIFF_RC_OK)
175 3b0f3d61 2020-01-22 neels return rc;
176 3b0f3d61 2020-01-22 neels return diff_data_atomize_text_lines(right);
177 3b0f3d61 2020-01-22 neels }