##// END OF EJS Templates
pathencode: improve error messages slightly...
Augie Fackler -
r38067:9aaa74f9 default
parent child Browse files
Show More
@@ -1,764 +1,765 b''
/*
 pathencode.c - efficient path name encoding

 Copyright 2012 Facebook

 This software may be used and distributed according to the terms of
 the GNU General Public License, incorporated herein by reference.
*/
9
9
/*
 * An implementation of the name encoding scheme used by the fncache
 * store. The common case is of a path < 120 bytes long, which is
 * handled either in a single pass with no allocations or two passes
 * with a single allocation. For longer paths, multiple passes are
 * required.
 */
17
17
18 #define PY_SSIZE_T_CLEAN
18 #define PY_SSIZE_T_CLEAN
19 #include <Python.h>
19 #include <Python.h>
20 #include <assert.h>
20 #include <assert.h>
21 #include <ctype.h>
21 #include <ctype.h>
22 #include <stdlib.h>
22 #include <stdlib.h>
23 #include <string.h>
23 #include <string.h>
24
24
25 #include "util.h"
25 #include "util.h"
26
26
/*
 * State machine for the fast path (_encode).
 *
 * Most states record how much of one of the specially treated names
 * ("aux", "con", "comN", "lptN", "nul", "prn") or of a ".hg" / ".d" / ".i"
 * suffix has been matched within the current path component.
 */
enum path_state {
    START,   /* first byte of a path component */
    A,       /* "AUX" */
    AU,
    THIRD,   /* third of a 3-byte sequence, e.g. "AUX", "NUL" */
    C,       /* "CON" or "COMn" */
    CO,
    COMLPT,  /* "COM" or "LPT" */
    COMLPTn,
    L,
    LP,
    N,
    NU,
    P,       /* "PRN" */
    PR,
    LDOT,    /* leading '.' */
    DOT,     /* '.' in a non-leading position */
    H,       /* ".h" */
    HGDI,    /* ".hg", ".d", or ".i" */
    SPACE,
    DEFAULT, /* byte of a path component after the first */
};
50
50
/* State machine for dir-encoding (_encodedir). */
enum dir_state {
    DDOT,     /* previous byte was '.' */
    DH,       /* matched ".h" */
    DHGDI,    /* matched ".hg", ".d", or ".i" */
    DDEFAULT, /* any other position */
};
58
58
/*
 * Test whether byte c is a member of a 256-bit set.
 *
 * bitset holds eight 32-bit words; byte value v is represented by bit
 * (v & 31) of word (v >> 5).  Returns 1 if that bit is set, 0 otherwise.
 */
static inline int inset(const uint32_t bitset[], char c)
{
    const uint8_t byte = (uint8_t)c;

    /*
     * Shift an unsigned one: "1 << 31" on a signed int is undefined
     * behaviour, and bit 31 is used (e.g. for '_' in the twobytes set).
     */
    return (bitset[byte >> 5] & ((uint32_t)1 << (byte & 31))) != 0;
}
63
63
64 static inline void charcopy(char *dest, Py_ssize_t *destlen, size_t destsize,
64 static inline void charcopy(char *dest, Py_ssize_t *destlen, size_t destsize,
65 char c)
65 char c)
66 {
66 {
67 if (dest) {
67 if (dest) {
68 assert(*destlen < destsize);
68 assert(*destlen < destsize);
69 dest[*destlen] = c;
69 dest[*destlen] = c;
70 }
70 }
71 (*destlen)++;
71 (*destlen)++;
72 }
72 }
73
73
74 static inline void memcopy(char *dest, Py_ssize_t *destlen, size_t destsize,
74 static inline void memcopy(char *dest, Py_ssize_t *destlen, size_t destsize,
75 const void *src, Py_ssize_t len)
75 const void *src, Py_ssize_t len)
76 {
76 {
77 if (dest) {
77 if (dest) {
78 assert(*destlen + len < destsize);
78 assert(*destlen + len < destsize);
79 memcpy((void *)&dest[*destlen], src, len);
79 memcpy((void *)&dest[*destlen], src, len);
80 }
80 }
81 *destlen += len;
81 *destlen += len;
82 }
82 }
83
83
84 static inline void hexencode(char *dest, Py_ssize_t *destlen, size_t destsize,
84 static inline void hexencode(char *dest, Py_ssize_t *destlen, size_t destsize,
85 uint8_t c)
85 uint8_t c)
86 {
86 {
87 static const char hexdigit[] = "0123456789abcdef";
87 static const char hexdigit[] = "0123456789abcdef";
88
88
89 charcopy(dest, destlen, destsize, hexdigit[c >> 4]);
89 charcopy(dest, destlen, destsize, hexdigit[c >> 4]);
90 charcopy(dest, destlen, destsize, hexdigit[c & 15]);
90 charcopy(dest, destlen, destsize, hexdigit[c & 15]);
91 }
91 }
92
92
93 /* 3-byte escape: tilde followed by two hex digits */
93 /* 3-byte escape: tilde followed by two hex digits */
94 static inline void escape3(char *dest, Py_ssize_t *destlen, size_t destsize,
94 static inline void escape3(char *dest, Py_ssize_t *destlen, size_t destsize,
95 char c)
95 char c)
96 {
96 {
97 charcopy(dest, destlen, destsize, '~');
97 charcopy(dest, destlen, destsize, '~');
98 hexencode(dest, destlen, destsize, c);
98 hexencode(dest, destlen, destsize, c);
99 }
99 }
100
100
101 static Py_ssize_t _encodedir(char *dest, size_t destsize, const char *src,
101 static Py_ssize_t _encodedir(char *dest, size_t destsize, const char *src,
102 Py_ssize_t len)
102 Py_ssize_t len)
103 {
103 {
104 enum dir_state state = DDEFAULT;
104 enum dir_state state = DDEFAULT;
105 Py_ssize_t i = 0, destlen = 0;
105 Py_ssize_t i = 0, destlen = 0;
106
106
107 while (i < len) {
107 while (i < len) {
108 switch (state) {
108 switch (state) {
109 case DDOT:
109 case DDOT:
110 switch (src[i]) {
110 switch (src[i]) {
111 case 'd':
111 case 'd':
112 case 'i':
112 case 'i':
113 state = DHGDI;
113 state = DHGDI;
114 charcopy(dest, &destlen, destsize, src[i++]);
114 charcopy(dest, &destlen, destsize, src[i++]);
115 break;
115 break;
116 case 'h':
116 case 'h':
117 state = DH;
117 state = DH;
118 charcopy(dest, &destlen, destsize, src[i++]);
118 charcopy(dest, &destlen, destsize, src[i++]);
119 break;
119 break;
120 default:
120 default:
121 state = DDEFAULT;
121 state = DDEFAULT;
122 break;
122 break;
123 }
123 }
124 break;
124 break;
125 case DH:
125 case DH:
126 if (src[i] == 'g') {
126 if (src[i] == 'g') {
127 state = DHGDI;
127 state = DHGDI;
128 charcopy(dest, &destlen, destsize, src[i++]);
128 charcopy(dest, &destlen, destsize, src[i++]);
129 } else
129 } else
130 state = DDEFAULT;
130 state = DDEFAULT;
131 break;
131 break;
132 case DHGDI:
132 case DHGDI:
133 if (src[i] == '/') {
133 if (src[i] == '/') {
134 memcopy(dest, &destlen, destsize, ".hg", 3);
134 memcopy(dest, &destlen, destsize, ".hg", 3);
135 charcopy(dest, &destlen, destsize, src[i++]);
135 charcopy(dest, &destlen, destsize, src[i++]);
136 }
136 }
137 state = DDEFAULT;
137 state = DDEFAULT;
138 break;
138 break;
139 case DDEFAULT:
139 case DDEFAULT:
140 if (src[i] == '.')
140 if (src[i] == '.')
141 state = DDOT;
141 state = DDOT;
142 charcopy(dest, &destlen, destsize, src[i++]);
142 charcopy(dest, &destlen, destsize, src[i++]);
143 break;
143 break;
144 }
144 }
145 }
145 }
146
146
147 return destlen;
147 return destlen;
148 }
148 }
149
149
150 PyObject *encodedir(PyObject *self, PyObject *args)
150 PyObject *encodedir(PyObject *self, PyObject *args)
151 {
151 {
152 Py_ssize_t len, newlen;
152 Py_ssize_t len, newlen;
153 PyObject *pathobj, *newobj;
153 PyObject *pathobj, *newobj;
154 char *path;
154 char *path;
155
155
156 if (!PyArg_ParseTuple(args, "O:encodedir", &pathobj))
156 if (!PyArg_ParseTuple(args, "O:encodedir", &pathobj))
157 return NULL;
157 return NULL;
158
158
159 if (PyBytes_AsStringAndSize(pathobj, &path, &len) == -1) {
159 if (PyBytes_AsStringAndSize(pathobj, &path, &len) == -1) {
160 PyErr_SetString(PyExc_TypeError, "expected a string");
160 PyErr_SetString(PyExc_TypeError, "expected a string");
161 return NULL;
161 return NULL;
162 }
162 }
163
163
164 newlen = len ? _encodedir(NULL, 0, path, len + 1) : 1;
164 newlen = len ? _encodedir(NULL, 0, path, len + 1) : 1;
165
165
166 if (newlen == len + 1) {
166 if (newlen == len + 1) {
167 Py_INCREF(pathobj);
167 Py_INCREF(pathobj);
168 return pathobj;
168 return pathobj;
169 }
169 }
170
170
171 newobj = PyBytes_FromStringAndSize(NULL, newlen);
171 newobj = PyBytes_FromStringAndSize(NULL, newlen);
172
172
173 if (newobj) {
173 if (newobj) {
174 assert(PyBytes_Check(newobj));
174 assert(PyBytes_Check(newobj));
175 Py_SIZE(newobj)--;
175 Py_SIZE(newobj)--;
176 _encodedir(PyBytes_AS_STRING(newobj), newlen, path, len + 1);
176 _encodedir(PyBytes_AS_STRING(newobj), newlen, path, len + 1);
177 }
177 }
178
178
179 return newobj;
179 return newobj;
180 }
180 }
181
181
182 static Py_ssize_t _encode(const uint32_t twobytes[8], const uint32_t onebyte[8],
182 static Py_ssize_t _encode(const uint32_t twobytes[8], const uint32_t onebyte[8],
183 char *dest, Py_ssize_t destlen, size_t destsize,
183 char *dest, Py_ssize_t destlen, size_t destsize,
184 const char *src, Py_ssize_t len, int encodedir)
184 const char *src, Py_ssize_t len, int encodedir)
185 {
185 {
186 enum path_state state = START;
186 enum path_state state = START;
187 Py_ssize_t i = 0;
187 Py_ssize_t i = 0;
188
188
189 /*
189 /*
190 * Python strings end with a zero byte, which we use as a
190 * Python strings end with a zero byte, which we use as a
191 * terminal token as they are not valid inside path names.
191 * terminal token as they are not valid inside path names.
192 */
192 */
193
193
194 while (i < len) {
194 while (i < len) {
195 switch (state) {
195 switch (state) {
196 case START:
196 case START:
197 switch (src[i]) {
197 switch (src[i]) {
198 case '/':
198 case '/':
199 charcopy(dest, &destlen, destsize, src[i++]);
199 charcopy(dest, &destlen, destsize, src[i++]);
200 break;
200 break;
201 case '.':
201 case '.':
202 state = LDOT;
202 state = LDOT;
203 escape3(dest, &destlen, destsize, src[i++]);
203 escape3(dest, &destlen, destsize, src[i++]);
204 break;
204 break;
205 case ' ':
205 case ' ':
206 state = DEFAULT;
206 state = DEFAULT;
207 escape3(dest, &destlen, destsize, src[i++]);
207 escape3(dest, &destlen, destsize, src[i++]);
208 break;
208 break;
209 case 'a':
209 case 'a':
210 state = A;
210 state = A;
211 charcopy(dest, &destlen, destsize, src[i++]);
211 charcopy(dest, &destlen, destsize, src[i++]);
212 break;
212 break;
213 case 'c':
213 case 'c':
214 state = C;
214 state = C;
215 charcopy(dest, &destlen, destsize, src[i++]);
215 charcopy(dest, &destlen, destsize, src[i++]);
216 break;
216 break;
217 case 'l':
217 case 'l':
218 state = L;
218 state = L;
219 charcopy(dest, &destlen, destsize, src[i++]);
219 charcopy(dest, &destlen, destsize, src[i++]);
220 break;
220 break;
221 case 'n':
221 case 'n':
222 state = N;
222 state = N;
223 charcopy(dest, &destlen, destsize, src[i++]);
223 charcopy(dest, &destlen, destsize, src[i++]);
224 break;
224 break;
225 case 'p':
225 case 'p':
226 state = P;
226 state = P;
227 charcopy(dest, &destlen, destsize, src[i++]);
227 charcopy(dest, &destlen, destsize, src[i++]);
228 break;
228 break;
229 default:
229 default:
230 state = DEFAULT;
230 state = DEFAULT;
231 break;
231 break;
232 }
232 }
233 break;
233 break;
234 case A:
234 case A:
235 if (src[i] == 'u') {
235 if (src[i] == 'u') {
236 state = AU;
236 state = AU;
237 charcopy(dest, &destlen, destsize, src[i++]);
237 charcopy(dest, &destlen, destsize, src[i++]);
238 } else
238 } else
239 state = DEFAULT;
239 state = DEFAULT;
240 break;
240 break;
241 case AU:
241 case AU:
242 if (src[i] == 'x') {
242 if (src[i] == 'x') {
243 state = THIRD;
243 state = THIRD;
244 i++;
244 i++;
245 } else
245 } else
246 state = DEFAULT;
246 state = DEFAULT;
247 break;
247 break;
248 case THIRD:
248 case THIRD:
249 state = DEFAULT;
249 state = DEFAULT;
250 switch (src[i]) {
250 switch (src[i]) {
251 case '.':
251 case '.':
252 case '/':
252 case '/':
253 case '\0':
253 case '\0':
254 escape3(dest, &destlen, destsize, src[i - 1]);
254 escape3(dest, &destlen, destsize, src[i - 1]);
255 break;
255 break;
256 default:
256 default:
257 i--;
257 i--;
258 break;
258 break;
259 }
259 }
260 break;
260 break;
261 case C:
261 case C:
262 if (src[i] == 'o') {
262 if (src[i] == 'o') {
263 state = CO;
263 state = CO;
264 charcopy(dest, &destlen, destsize, src[i++]);
264 charcopy(dest, &destlen, destsize, src[i++]);
265 } else
265 } else
266 state = DEFAULT;
266 state = DEFAULT;
267 break;
267 break;
268 case CO:
268 case CO:
269 if (src[i] == 'm') {
269 if (src[i] == 'm') {
270 state = COMLPT;
270 state = COMLPT;
271 i++;
271 i++;
272 } else if (src[i] == 'n') {
272 } else if (src[i] == 'n') {
273 state = THIRD;
273 state = THIRD;
274 i++;
274 i++;
275 } else
275 } else
276 state = DEFAULT;
276 state = DEFAULT;
277 break;
277 break;
278 case COMLPT:
278 case COMLPT:
279 switch (src[i]) {
279 switch (src[i]) {
280 case '1':
280 case '1':
281 case '2':
281 case '2':
282 case '3':
282 case '3':
283 case '4':
283 case '4':
284 case '5':
284 case '5':
285 case '6':
285 case '6':
286 case '7':
286 case '7':
287 case '8':
287 case '8':
288 case '9':
288 case '9':
289 state = COMLPTn;
289 state = COMLPTn;
290 i++;
290 i++;
291 break;
291 break;
292 default:
292 default:
293 state = DEFAULT;
293 state = DEFAULT;
294 charcopy(dest, &destlen, destsize, src[i - 1]);
294 charcopy(dest, &destlen, destsize, src[i - 1]);
295 break;
295 break;
296 }
296 }
297 break;
297 break;
298 case COMLPTn:
298 case COMLPTn:
299 state = DEFAULT;
299 state = DEFAULT;
300 switch (src[i]) {
300 switch (src[i]) {
301 case '.':
301 case '.':
302 case '/':
302 case '/':
303 case '\0':
303 case '\0':
304 escape3(dest, &destlen, destsize, src[i - 2]);
304 escape3(dest, &destlen, destsize, src[i - 2]);
305 charcopy(dest, &destlen, destsize, src[i - 1]);
305 charcopy(dest, &destlen, destsize, src[i - 1]);
306 break;
306 break;
307 default:
307 default:
308 memcopy(dest, &destlen, destsize, &src[i - 2],
308 memcopy(dest, &destlen, destsize, &src[i - 2],
309 2);
309 2);
310 break;
310 break;
311 }
311 }
312 break;
312 break;
313 case L:
313 case L:
314 if (src[i] == 'p') {
314 if (src[i] == 'p') {
315 state = LP;
315 state = LP;
316 charcopy(dest, &destlen, destsize, src[i++]);
316 charcopy(dest, &destlen, destsize, src[i++]);
317 } else
317 } else
318 state = DEFAULT;
318 state = DEFAULT;
319 break;
319 break;
320 case LP:
320 case LP:
321 if (src[i] == 't') {
321 if (src[i] == 't') {
322 state = COMLPT;
322 state = COMLPT;
323 i++;
323 i++;
324 } else
324 } else
325 state = DEFAULT;
325 state = DEFAULT;
326 break;
326 break;
327 case N:
327 case N:
328 if (src[i] == 'u') {
328 if (src[i] == 'u') {
329 state = NU;
329 state = NU;
330 charcopy(dest, &destlen, destsize, src[i++]);
330 charcopy(dest, &destlen, destsize, src[i++]);
331 } else
331 } else
332 state = DEFAULT;
332 state = DEFAULT;
333 break;
333 break;
334 case NU:
334 case NU:
335 if (src[i] == 'l') {
335 if (src[i] == 'l') {
336 state = THIRD;
336 state = THIRD;
337 i++;
337 i++;
338 } else
338 } else
339 state = DEFAULT;
339 state = DEFAULT;
340 break;
340 break;
341 case P:
341 case P:
342 if (src[i] == 'r') {
342 if (src[i] == 'r') {
343 state = PR;
343 state = PR;
344 charcopy(dest, &destlen, destsize, src[i++]);
344 charcopy(dest, &destlen, destsize, src[i++]);
345 } else
345 } else
346 state = DEFAULT;
346 state = DEFAULT;
347 break;
347 break;
348 case PR:
348 case PR:
349 if (src[i] == 'n') {
349 if (src[i] == 'n') {
350 state = THIRD;
350 state = THIRD;
351 i++;
351 i++;
352 } else
352 } else
353 state = DEFAULT;
353 state = DEFAULT;
354 break;
354 break;
355 case LDOT:
355 case LDOT:
356 switch (src[i]) {
356 switch (src[i]) {
357 case 'd':
357 case 'd':
358 case 'i':
358 case 'i':
359 state = HGDI;
359 state = HGDI;
360 charcopy(dest, &destlen, destsize, src[i++]);
360 charcopy(dest, &destlen, destsize, src[i++]);
361 break;
361 break;
362 case 'h':
362 case 'h':
363 state = H;
363 state = H;
364 charcopy(dest, &destlen, destsize, src[i++]);
364 charcopy(dest, &destlen, destsize, src[i++]);
365 break;
365 break;
366 default:
366 default:
367 state = DEFAULT;
367 state = DEFAULT;
368 break;
368 break;
369 }
369 }
370 break;
370 break;
371 case DOT:
371 case DOT:
372 switch (src[i]) {
372 switch (src[i]) {
373 case '/':
373 case '/':
374 case '\0':
374 case '\0':
375 state = START;
375 state = START;
376 memcopy(dest, &destlen, destsize, "~2e", 3);
376 memcopy(dest, &destlen, destsize, "~2e", 3);
377 charcopy(dest, &destlen, destsize, src[i++]);
377 charcopy(dest, &destlen, destsize, src[i++]);
378 break;
378 break;
379 case 'd':
379 case 'd':
380 case 'i':
380 case 'i':
381 state = HGDI;
381 state = HGDI;
382 charcopy(dest, &destlen, destsize, '.');
382 charcopy(dest, &destlen, destsize, '.');
383 charcopy(dest, &destlen, destsize, src[i++]);
383 charcopy(dest, &destlen, destsize, src[i++]);
384 break;
384 break;
385 case 'h':
385 case 'h':
386 state = H;
386 state = H;
387 memcopy(dest, &destlen, destsize, ".h", 2);
387 memcopy(dest, &destlen, destsize, ".h", 2);
388 i++;
388 i++;
389 break;
389 break;
390 default:
390 default:
391 state = DEFAULT;
391 state = DEFAULT;
392 charcopy(dest, &destlen, destsize, '.');
392 charcopy(dest, &destlen, destsize, '.');
393 break;
393 break;
394 }
394 }
395 break;
395 break;
396 case H:
396 case H:
397 if (src[i] == 'g') {
397 if (src[i] == 'g') {
398 state = HGDI;
398 state = HGDI;
399 charcopy(dest, &destlen, destsize, src[i++]);
399 charcopy(dest, &destlen, destsize, src[i++]);
400 } else
400 } else
401 state = DEFAULT;
401 state = DEFAULT;
402 break;
402 break;
403 case HGDI:
403 case HGDI:
404 if (src[i] == '/') {
404 if (src[i] == '/') {
405 state = START;
405 state = START;
406 if (encodedir)
406 if (encodedir)
407 memcopy(dest, &destlen, destsize, ".hg",
407 memcopy(dest, &destlen, destsize, ".hg",
408 3);
408 3);
409 charcopy(dest, &destlen, destsize, src[i++]);
409 charcopy(dest, &destlen, destsize, src[i++]);
410 } else
410 } else
411 state = DEFAULT;
411 state = DEFAULT;
412 break;
412 break;
413 case SPACE:
413 case SPACE:
414 switch (src[i]) {
414 switch (src[i]) {
415 case '/':
415 case '/':
416 case '\0':
416 case '\0':
417 state = START;
417 state = START;
418 memcopy(dest, &destlen, destsize, "~20", 3);
418 memcopy(dest, &destlen, destsize, "~20", 3);
419 charcopy(dest, &destlen, destsize, src[i++]);
419 charcopy(dest, &destlen, destsize, src[i++]);
420 break;
420 break;
421 default:
421 default:
422 state = DEFAULT;
422 state = DEFAULT;
423 charcopy(dest, &destlen, destsize, ' ');
423 charcopy(dest, &destlen, destsize, ' ');
424 break;
424 break;
425 }
425 }
426 break;
426 break;
427 case DEFAULT:
427 case DEFAULT:
428 while (inset(onebyte, src[i])) {
428 while (inset(onebyte, src[i])) {
429 charcopy(dest, &destlen, destsize, src[i++]);
429 charcopy(dest, &destlen, destsize, src[i++]);
430 if (i == len)
430 if (i == len)
431 goto done;
431 goto done;
432 }
432 }
433 switch (src[i]) {
433 switch (src[i]) {
434 case '.':
434 case '.':
435 state = DOT;
435 state = DOT;
436 i++;
436 i++;
437 break;
437 break;
438 case ' ':
438 case ' ':
439 state = SPACE;
439 state = SPACE;
440 i++;
440 i++;
441 break;
441 break;
442 case '/':
442 case '/':
443 state = START;
443 state = START;
444 charcopy(dest, &destlen, destsize, '/');
444 charcopy(dest, &destlen, destsize, '/');
445 i++;
445 i++;
446 break;
446 break;
447 default:
447 default:
448 if (inset(onebyte, src[i])) {
448 if (inset(onebyte, src[i])) {
449 do {
449 do {
450 charcopy(dest, &destlen,
450 charcopy(dest, &destlen,
451 destsize, src[i++]);
451 destsize, src[i++]);
452 } while (i < len &&
452 } while (i < len &&
453 inset(onebyte, src[i]));
453 inset(onebyte, src[i]));
454 } else if (inset(twobytes, src[i])) {
454 } else if (inset(twobytes, src[i])) {
455 char c = src[i++];
455 char c = src[i++];
456 charcopy(dest, &destlen, destsize, '_');
456 charcopy(dest, &destlen, destsize, '_');
457 charcopy(dest, &destlen, destsize,
457 charcopy(dest, &destlen, destsize,
458 c == '_' ? '_' : c + 32);
458 c == '_' ? '_' : c + 32);
459 } else
459 } else
460 escape3(dest, &destlen, destsize,
460 escape3(dest, &destlen, destsize,
461 src[i++]);
461 src[i++]);
462 break;
462 break;
463 }
463 }
464 break;
464 break;
465 }
465 }
466 }
466 }
467 done:
467 done:
468 return destlen;
468 return destlen;
469 }
469 }
470
470
471 static Py_ssize_t basicencode(char *dest, size_t destsize, const char *src,
471 static Py_ssize_t basicencode(char *dest, size_t destsize, const char *src,
472 Py_ssize_t len)
472 Py_ssize_t len)
473 {
473 {
474 static const uint32_t twobytes[8] = {0, 0, 0x87fffffe};
474 static const uint32_t twobytes[8] = {0, 0, 0x87fffffe};
475
475
476 static const uint32_t onebyte[8] = {
476 static const uint32_t onebyte[8] = {
477 1, 0x2bff3bfa, 0x68000001, 0x2fffffff,
477 1, 0x2bff3bfa, 0x68000001, 0x2fffffff,
478 };
478 };
479
479
480 Py_ssize_t destlen = 0;
480 Py_ssize_t destlen = 0;
481
481
482 return _encode(twobytes, onebyte, dest, destlen, destsize, src, len, 1);
482 return _encode(twobytes, onebyte, dest, destlen, destsize, src, len, 1);
483 }
483 }
484
484
485 static const Py_ssize_t maxstorepathlen = 120;
485 static const Py_ssize_t maxstorepathlen = 120;
486
486
487 static Py_ssize_t _lowerencode(char *dest, size_t destsize, const char *src,
487 static Py_ssize_t _lowerencode(char *dest, size_t destsize, const char *src,
488 Py_ssize_t len)
488 Py_ssize_t len)
489 {
489 {
490 static const uint32_t onebyte[8] = {1, 0x2bfffbfb, 0xe8000001,
490 static const uint32_t onebyte[8] = {1, 0x2bfffbfb, 0xe8000001,
491 0x2fffffff};
491 0x2fffffff};
492
492
493 static const uint32_t lower[8] = {0, 0, 0x7fffffe};
493 static const uint32_t lower[8] = {0, 0, 0x7fffffe};
494
494
495 Py_ssize_t i, destlen = 0;
495 Py_ssize_t i, destlen = 0;
496
496
497 for (i = 0; i < len; i++) {
497 for (i = 0; i < len; i++) {
498 if (inset(onebyte, src[i]))
498 if (inset(onebyte, src[i]))
499 charcopy(dest, &destlen, destsize, src[i]);
499 charcopy(dest, &destlen, destsize, src[i]);
500 else if (inset(lower, src[i]))
500 else if (inset(lower, src[i]))
501 charcopy(dest, &destlen, destsize, src[i] + 32);
501 charcopy(dest, &destlen, destsize, src[i] + 32);
502 else
502 else
503 escape3(dest, &destlen, destsize, src[i]);
503 escape3(dest, &destlen, destsize, src[i]);
504 }
504 }
505
505
506 return destlen;
506 return destlen;
507 }
507 }
508
508
509 PyObject *lowerencode(PyObject *self, PyObject *args)
509 PyObject *lowerencode(PyObject *self, PyObject *args)
510 {
510 {
511 char *path;
511 char *path;
512 Py_ssize_t len, newlen;
512 Py_ssize_t len, newlen;
513 PyObject *ret;
513 PyObject *ret;
514
514
515 if (!PyArg_ParseTuple(args, PY23("s#:lowerencode", "y#:lowerencode"),
515 if (!PyArg_ParseTuple(args, PY23("s#:lowerencode", "y#:lowerencode"),
516 &path, &len))
516 &path, &len))
517 return NULL;
517 return NULL;
518
518
519 newlen = _lowerencode(NULL, 0, path, len);
519 newlen = _lowerencode(NULL, 0, path, len);
520 ret = PyBytes_FromStringAndSize(NULL, newlen);
520 ret = PyBytes_FromStringAndSize(NULL, newlen);
521 if (ret)
521 if (ret)
522 _lowerencode(PyBytes_AS_STRING(ret), newlen, path, len);
522 _lowerencode(PyBytes_AS_STRING(ret), newlen, path, len);
523
523
524 return ret;
524 return ret;
525 }
525 }
526
526
527 /* See store.py:_auxencode for a description. */
527 /* See store.py:_auxencode for a description. */
528 static Py_ssize_t auxencode(char *dest, size_t destsize, const char *src,
528 static Py_ssize_t auxencode(char *dest, size_t destsize, const char *src,
529 Py_ssize_t len)
529 Py_ssize_t len)
530 {
530 {
531 static const uint32_t twobytes[8];
531 static const uint32_t twobytes[8];
532
532
533 static const uint32_t onebyte[8] = {
533 static const uint32_t onebyte[8] = {
534 ~0U, 0xffff3ffe, ~0U, ~0U, ~0U, ~0U, ~0U, ~0U,
534 ~0U, 0xffff3ffe, ~0U, ~0U, ~0U, ~0U, ~0U, ~0U,
535 };
535 };
536
536
537 return _encode(twobytes, onebyte, dest, 0, destsize, src, len, 0);
537 return _encode(twobytes, onebyte, dest, 0, destsize, src, len, 0);
538 }
538 }
539
539
540 static PyObject *hashmangle(const char *src, Py_ssize_t len, const char sha[20])
540 static PyObject *hashmangle(const char *src, Py_ssize_t len, const char sha[20])
541 {
541 {
542 static const Py_ssize_t dirprefixlen = 8;
542 static const Py_ssize_t dirprefixlen = 8;
543 static const Py_ssize_t maxshortdirslen = 68;
543 static const Py_ssize_t maxshortdirslen = 68;
544 char *dest;
544 char *dest;
545 PyObject *ret;
545 PyObject *ret;
546
546
547 Py_ssize_t i, d, p, lastslash = len - 1, lastdot = -1;
547 Py_ssize_t i, d, p, lastslash = len - 1, lastdot = -1;
548 Py_ssize_t destsize, destlen = 0, slop, used;
548 Py_ssize_t destsize, destlen = 0, slop, used;
549
549
550 while (lastslash >= 0 && src[lastslash] != '/') {
550 while (lastslash >= 0 && src[lastslash] != '/') {
551 if (src[lastslash] == '.' && lastdot == -1)
551 if (src[lastslash] == '.' && lastdot == -1)
552 lastdot = lastslash;
552 lastdot = lastslash;
553 lastslash--;
553 lastslash--;
554 }
554 }
555
555
556 #if 0
556 #if 0
557 /* All paths should end in a suffix of ".i" or ".d".
557 /* All paths should end in a suffix of ".i" or ".d".
558 Unfortunately, the file names in test-hybridencode.py
558 Unfortunately, the file names in test-hybridencode.py
559 violate this rule. */
559 violate this rule. */
560 if (lastdot != len - 3) {
560 if (lastdot != len - 3) {
561 PyErr_SetString(PyExc_ValueError,
561 PyErr_SetString(PyExc_ValueError,
562 "suffix missing or wrong length");
562 "suffix missing or wrong length");
563 return NULL;
563 return NULL;
564 }
564 }
565 #endif
565 #endif
566
566
567 /* If src contains a suffix, we will append it to the end of
567 /* If src contains a suffix, we will append it to the end of
568 the new string, so make room. */
568 the new string, so make room. */
569 destsize = 120;
569 destsize = 120;
570 if (lastdot >= 0)
570 if (lastdot >= 0)
571 destsize += len - lastdot - 1;
571 destsize += len - lastdot - 1;
572
572
573 ret = PyBytes_FromStringAndSize(NULL, destsize);
573 ret = PyBytes_FromStringAndSize(NULL, destsize);
574 if (ret == NULL)
574 if (ret == NULL)
575 return NULL;
575 return NULL;
576
576
577 dest = PyBytes_AS_STRING(ret);
577 dest = PyBytes_AS_STRING(ret);
578 memcopy(dest, &destlen, destsize, "dh/", 3);
578 memcopy(dest, &destlen, destsize, "dh/", 3);
579
579
580 /* Copy up to dirprefixlen bytes of each path component, up to
580 /* Copy up to dirprefixlen bytes of each path component, up to
581 a limit of maxshortdirslen bytes. */
581 a limit of maxshortdirslen bytes. */
582 for (i = d = p = 0; i < lastslash; i++, p++) {
582 for (i = d = p = 0; i < lastslash; i++, p++) {
583 if (src[i] == '/') {
583 if (src[i] == '/') {
584 char d = dest[destlen - 1];
584 char d = dest[destlen - 1];
585 /* After truncation, a directory name may end
585 /* After truncation, a directory name may end
586 in a space or dot, which are unportable. */
586 in a space or dot, which are unportable. */
587 if (d == '.' || d == ' ')
587 if (d == '.' || d == ' ')
588 dest[destlen - 1] = '_';
588 dest[destlen - 1] = '_';
589 /* The + 3 is to account for "dh/" in the beginning */
589 /* The + 3 is to account for "dh/" in the beginning */
590 if (destlen > maxshortdirslen + 3)
590 if (destlen > maxshortdirslen + 3)
591 break;
591 break;
592 charcopy(dest, &destlen, destsize, src[i]);
592 charcopy(dest, &destlen, destsize, src[i]);
593 p = -1;
593 p = -1;
594 } else if (p < dirprefixlen)
594 } else if (p < dirprefixlen)
595 charcopy(dest, &destlen, destsize, src[i]);
595 charcopy(dest, &destlen, destsize, src[i]);
596 }
596 }
597
597
598 /* Rewind to just before the last slash copied. */
598 /* Rewind to just before the last slash copied. */
599 if (destlen > maxshortdirslen + 3)
599 if (destlen > maxshortdirslen + 3)
600 do {
600 do {
601 destlen--;
601 destlen--;
602 } while (destlen > 0 && dest[destlen] != '/');
602 } while (destlen > 0 && dest[destlen] != '/');
603
603
604 if (destlen > 3) {
604 if (destlen > 3) {
605 if (lastslash > 0) {
605 if (lastslash > 0) {
606 char d = dest[destlen - 1];
606 char d = dest[destlen - 1];
607 /* The last directory component may be
607 /* The last directory component may be
608 truncated, so make it safe. */
608 truncated, so make it safe. */
609 if (d == '.' || d == ' ')
609 if (d == '.' || d == ' ')
610 dest[destlen - 1] = '_';
610 dest[destlen - 1] = '_';
611 }
611 }
612
612
613 charcopy(dest, &destlen, destsize, '/');
613 charcopy(dest, &destlen, destsize, '/');
614 }
614 }
615
615
616 /* Add a prefix of the original file's name. Its length
616 /* Add a prefix of the original file's name. Its length
617 depends on the number of bytes left after accounting for
617 depends on the number of bytes left after accounting for
618 hash and suffix. */
618 hash and suffix. */
619 used = destlen + 40;
619 used = destlen + 40;
620 if (lastdot >= 0)
620 if (lastdot >= 0)
621 used += len - lastdot - 1;
621 used += len - lastdot - 1;
622 slop = maxstorepathlen - used;
622 slop = maxstorepathlen - used;
623 if (slop > 0) {
623 if (slop > 0) {
624 Py_ssize_t basenamelen =
624 Py_ssize_t basenamelen =
625 lastslash >= 0 ? len - lastslash - 2 : len - 1;
625 lastslash >= 0 ? len - lastslash - 2 : len - 1;
626
626
627 if (basenamelen > slop)
627 if (basenamelen > slop)
628 basenamelen = slop;
628 basenamelen = slop;
629 if (basenamelen > 0)
629 if (basenamelen > 0)
630 memcopy(dest, &destlen, destsize, &src[lastslash + 1],
630 memcopy(dest, &destlen, destsize, &src[lastslash + 1],
631 basenamelen);
631 basenamelen);
632 }
632 }
633
633
634 /* Add hash and suffix. */
634 /* Add hash and suffix. */
635 for (i = 0; i < 20; i++)
635 for (i = 0; i < 20; i++)
636 hexencode(dest, &destlen, destsize, sha[i]);
636 hexencode(dest, &destlen, destsize, sha[i]);
637
637
638 if (lastdot >= 0)
638 if (lastdot >= 0)
639 memcopy(dest, &destlen, destsize, &src[lastdot],
639 memcopy(dest, &destlen, destsize, &src[lastdot],
640 len - lastdot - 1);
640 len - lastdot - 1);
641
641
642 assert(PyBytes_Check(ret));
642 assert(PyBytes_Check(ret));
643 Py_SIZE(ret) = destlen;
643 Py_SIZE(ret) = destlen;
644
644
645 return ret;
645 return ret;
646 }
646 }
647
647
648 /*
648 /*
649 * Avoiding a trip through Python would improve performance by 50%,
649 * Avoiding a trip through Python would improve performance by 50%,
650 * but we don't encounter enough long names to be worth the code.
650 * but we don't encounter enough long names to be worth the code.
651 */
651 */
652 static int sha1hash(char hash[20], const char *str, Py_ssize_t len)
652 static int sha1hash(char hash[20], const char *str, Py_ssize_t len)
653 {
653 {
654 static PyObject *shafunc;
654 static PyObject *shafunc;
655 PyObject *shaobj, *hashobj;
655 PyObject *shaobj, *hashobj;
656
656
657 if (shafunc == NULL) {
657 if (shafunc == NULL) {
658 PyObject *hashlib, *name = PyBytes_FromString("hashlib");
658 PyObject *hashlib, *name = PyBytes_FromString("hashlib");
659
659
660 if (name == NULL)
660 if (name == NULL)
661 return -1;
661 return -1;
662
662
663 hashlib = PyImport_ImportModule("hashlib");
663 hashlib = PyImport_ImportModule("hashlib");
664 Py_DECREF(name);
664 Py_DECREF(name);
665
665
666 if (hashlib == NULL) {
666 if (hashlib == NULL) {
667 PyErr_SetString(PyExc_ImportError, "hashlib");
667 PyErr_SetString(PyExc_ImportError,
668 "pathencode failed to find hashlib");
668 return -1;
669 return -1;
669 }
670 }
670 shafunc = PyObject_GetAttrString(hashlib, "sha1");
671 shafunc = PyObject_GetAttrString(hashlib, "sha1");
671 Py_DECREF(hashlib);
672 Py_DECREF(hashlib);
672
673
673 if (shafunc == NULL) {
674 if (shafunc == NULL) {
674 PyErr_SetString(PyExc_AttributeError,
675 PyErr_SetString(PyExc_AttributeError,
675 "module 'hashlib' has no "
676 "module 'hashlib' has no "
676 "attribute 'sha1'");
677 "attribute 'sha1' in pathencode");
677 return -1;
678 return -1;
678 }
679 }
679 }
680 }
680
681
681 shaobj = PyObject_CallFunction(shafunc, PY23("s#", "y#"), str, len);
682 shaobj = PyObject_CallFunction(shafunc, PY23("s#", "y#"), str, len);
682
683
683 if (shaobj == NULL)
684 if (shaobj == NULL)
684 return -1;
685 return -1;
685
686
686 hashobj = PyObject_CallMethod(shaobj, "digest", "");
687 hashobj = PyObject_CallMethod(shaobj, "digest", "");
687 Py_DECREF(shaobj);
688 Py_DECREF(shaobj);
688 if (hashobj == NULL)
689 if (hashobj == NULL)
689 return -1;
690 return -1;
690
691
691 if (!PyBytes_Check(hashobj) || PyBytes_GET_SIZE(hashobj) != 20) {
692 if (!PyBytes_Check(hashobj) || PyBytes_GET_SIZE(hashobj) != 20) {
692 PyErr_SetString(PyExc_TypeError,
693 PyErr_SetString(PyExc_TypeError,
693 "result of digest is not a 20-byte hash");
694 "result of digest is not a 20-byte hash");
694 Py_DECREF(hashobj);
695 Py_DECREF(hashobj);
695 return -1;
696 return -1;
696 }
697 }
697
698
698 memcpy(hash, PyBytes_AS_STRING(hashobj), 20);
699 memcpy(hash, PyBytes_AS_STRING(hashobj), 20);
699 Py_DECREF(hashobj);
700 Py_DECREF(hashobj);
700 return 0;
701 return 0;
701 }
702 }
702
703
703 #define MAXENCODE 4096 * 4
704 #define MAXENCODE 4096 * 4
704
705
705 static PyObject *hashencode(const char *src, Py_ssize_t len)
706 static PyObject *hashencode(const char *src, Py_ssize_t len)
706 {
707 {
707 char dired[MAXENCODE];
708 char dired[MAXENCODE];
708 char lowered[MAXENCODE];
709 char lowered[MAXENCODE];
709 char auxed[MAXENCODE];
710 char auxed[MAXENCODE];
710 Py_ssize_t dirlen, lowerlen, auxlen, baselen;
711 Py_ssize_t dirlen, lowerlen, auxlen, baselen;
711 char sha[20];
712 char sha[20];
712
713
713 baselen = (len - 5) * 3;
714 baselen = (len - 5) * 3;
714 if (baselen >= MAXENCODE) {
715 if (baselen >= MAXENCODE) {
715 PyErr_SetString(PyExc_ValueError, "string too long");
716 PyErr_SetString(PyExc_ValueError, "string too long");
716 return NULL;
717 return NULL;
717 }
718 }
718
719
719 dirlen = _encodedir(dired, baselen, src, len);
720 dirlen = _encodedir(dired, baselen, src, len);
720 if (sha1hash(sha, dired, dirlen - 1) == -1)
721 if (sha1hash(sha, dired, dirlen - 1) == -1)
721 return NULL;
722 return NULL;
722 lowerlen = _lowerencode(lowered, baselen, dired + 5, dirlen - 5);
723 lowerlen = _lowerencode(lowered, baselen, dired + 5, dirlen - 5);
723 auxlen = auxencode(auxed, baselen, lowered, lowerlen);
724 auxlen = auxencode(auxed, baselen, lowered, lowerlen);
724 return hashmangle(auxed, auxlen, sha);
725 return hashmangle(auxed, auxlen, sha);
725 }
726 }
726
727
727 PyObject *pathencode(PyObject *self, PyObject *args)
728 PyObject *pathencode(PyObject *self, PyObject *args)
728 {
729 {
729 Py_ssize_t len, newlen;
730 Py_ssize_t len, newlen;
730 PyObject *pathobj, *newobj;
731 PyObject *pathobj, *newobj;
731 char *path;
732 char *path;
732
733
733 if (!PyArg_ParseTuple(args, "O:pathencode", &pathobj))
734 if (!PyArg_ParseTuple(args, "O:pathencode", &pathobj))
734 return NULL;
735 return NULL;
735
736
736 if (PyBytes_AsStringAndSize(pathobj, &path, &len) == -1) {
737 if (PyBytes_AsStringAndSize(pathobj, &path, &len) == -1) {
737 PyErr_SetString(PyExc_TypeError, "expected a string");
738 PyErr_SetString(PyExc_TypeError, "expected a string");
738 return NULL;
739 return NULL;
739 }
740 }
740
741
741 if (len > maxstorepathlen)
742 if (len > maxstorepathlen)
742 newlen = maxstorepathlen + 2;
743 newlen = maxstorepathlen + 2;
743 else
744 else
744 newlen = len ? basicencode(NULL, 0, path, len + 1) : 1;
745 newlen = len ? basicencode(NULL, 0, path, len + 1) : 1;
745
746
746 if (newlen <= maxstorepathlen + 1) {
747 if (newlen <= maxstorepathlen + 1) {
747 if (newlen == len + 1) {
748 if (newlen == len + 1) {
748 Py_INCREF(pathobj);
749 Py_INCREF(pathobj);
749 return pathobj;
750 return pathobj;
750 }
751 }
751
752
752 newobj = PyBytes_FromStringAndSize(NULL, newlen);
753 newobj = PyBytes_FromStringAndSize(NULL, newlen);
753
754
754 if (newobj) {
755 if (newobj) {
755 assert(PyBytes_Check(newobj));
756 assert(PyBytes_Check(newobj));
756 Py_SIZE(newobj)--;
757 Py_SIZE(newobj)--;
757 basicencode(PyBytes_AS_STRING(newobj), newlen, path,
758 basicencode(PyBytes_AS_STRING(newobj), newlen, path,
758 len + 1);
759 len + 1);
759 }
760 }
760 } else
761 } else
761 newobj = hashencode(path, len + 1);
762 newobj = hashencode(path, len + 1);
762
763
763 return newobj;
764 return newobj;
764 }
765 }
General Comments 0
You need to be logged in to leave comments. Login now