##// END OF EJS Templates
pathencode: use Py_SIZE directly...
Gregory Szorc -
r30102:a8c948ee default
parent child Browse files
Show More
@@ -1,762 +1,765
1 /*
1 /*
2 pathencode.c - efficient path name encoding
2 pathencode.c - efficient path name encoding
3
3
4 Copyright 2012 Facebook
4 Copyright 2012 Facebook
5
5
6 This software may be used and distributed according to the terms of
6 This software may be used and distributed according to the terms of
7 the GNU General Public License, incorporated herein by reference.
7 the GNU General Public License, incorporated herein by reference.
8 */
8 */
9
9
10 /*
10 /*
11 * An implementation of the name encoding scheme used by the fncache
11 * An implementation of the name encoding scheme used by the fncache
12 * store. The common case is of a path < 120 bytes long, which is
12 * store. The common case is of a path < 120 bytes long, which is
13 * handled either in a single pass with no allocations or two passes
13 * handled either in a single pass with no allocations or two passes
14 * with a single allocation. For longer paths, multiple passes are
14 * with a single allocation. For longer paths, multiple passes are
15 * required.
15 * required.
16 */
16 */
17
17
18 #define PY_SSIZE_T_CLEAN
18 #define PY_SSIZE_T_CLEAN
19 #include <Python.h>
19 #include <Python.h>
20 #include <assert.h>
20 #include <assert.h>
21 #include <ctype.h>
21 #include <ctype.h>
22 #include <stdlib.h>
22 #include <stdlib.h>
23 #include <string.h>
23 #include <string.h>
24
24
25 #include "util.h"
25 #include "util.h"
26
26
/*
 * States of the single-pass encoder in _encode().
 *
 * The letter-named states follow the first bytes of a path component
 * while it might still spell one of "aux", "con", "com<n>", "lpt<n>",
 * "nul" or "prn"; the dot states follow a possible ".hg", ".d" or ".i"
 * suffix.
 */
enum path_state {
	START,		/* first byte of a path component */
	A,		/* "a" seen, possibly "aux" */
	AU,		/* "au" seen */
	THIRD,		/* third byte of a 3-byte name ("aux", "nul", ...) seen */
	C,		/* "c" seen, possibly "con" or "com<n>" */
	CO,		/* "co" seen */
	COMLPT,		/* "com" or "lpt" seen */
	COMLPTn,	/* "com" or "lpt" followed by a digit 1-9 seen */
	L,		/* "l" seen, possibly "lpt<n>" */
	LP,		/* "lp" seen */
	N,		/* "n" seen, possibly "nul" */
	NU,		/* "nu" seen */
	P,		/* "p" seen, possibly "prn" */
	PR,		/* "pr" seen */
	LDOT,		/* leading '.' of a component seen */
	DOT,		/* '.' seen in a non-leading position */
	H,		/* ".h" seen */
	HGDI,		/* ".hg", ".d" or ".i" seen */
	SPACE,		/* ' ' seen in a non-leading position */
	DEFAULT		/* any byte of a component after the first */
};
50
50
/* States of the directory-suffix encoder in _encodedir(). */
enum dir_state {
	DDOT,		/* previous byte was '.' */
	DH,		/* ".h" seen */
	DHGDI,		/* ".hg", ".d" or ".i" seen */
	DDEFAULT	/* anything else */
};
58
58
/*
 * Test whether byte c is a member of a 256-bit set.
 *
 * bitset holds eight 32-bit words; byte value v is represented by bit
 * (v & 31) of word (v >> 5).  c is converted to an unsigned byte first,
 * so values >= 0x80 select words 4..7 instead of a negative index.
 *
 * Returns 1 when the bit is set and 0 otherwise.
 */
static inline int inset(const uint32_t bitset[], char c)
{
	const uint8_t byte = (uint8_t)c;

	/*
	 * Shift the word down rather than shifting a signed 1 up:
	 * "1 << 31" overflows int, and a 0x80000000 mask would not fit
	 * in the int return value either.
	 */
	return (int)((bitset[byte >> 5] >> (byte & 31)) & 1U);
}
63
63
64 static inline void charcopy(char *dest, Py_ssize_t *destlen, size_t destsize,
64 static inline void charcopy(char *dest, Py_ssize_t *destlen, size_t destsize,
65 char c)
65 char c)
66 {
66 {
67 if (dest) {
67 if (dest) {
68 assert(*destlen < destsize);
68 assert(*destlen < destsize);
69 dest[*destlen] = c;
69 dest[*destlen] = c;
70 }
70 }
71 (*destlen)++;
71 (*destlen)++;
72 }
72 }
73
73
74 static inline void memcopy(char *dest, Py_ssize_t *destlen, size_t destsize,
74 static inline void memcopy(char *dest, Py_ssize_t *destlen, size_t destsize,
75 const void *src, Py_ssize_t len)
75 const void *src, Py_ssize_t len)
76 {
76 {
77 if (dest) {
77 if (dest) {
78 assert(*destlen + len < destsize);
78 assert(*destlen + len < destsize);
79 memcpy((void *)&dest[*destlen], src, len);
79 memcpy((void *)&dest[*destlen], src, len);
80 }
80 }
81 *destlen += len;
81 *destlen += len;
82 }
82 }
83
83
84 static inline void hexencode(char *dest, Py_ssize_t *destlen, size_t destsize,
84 static inline void hexencode(char *dest, Py_ssize_t *destlen, size_t destsize,
85 uint8_t c)
85 uint8_t c)
86 {
86 {
87 static const char hexdigit[] = "0123456789abcdef";
87 static const char hexdigit[] = "0123456789abcdef";
88
88
89 charcopy(dest, destlen, destsize, hexdigit[c >> 4]);
89 charcopy(dest, destlen, destsize, hexdigit[c >> 4]);
90 charcopy(dest, destlen, destsize, hexdigit[c & 15]);
90 charcopy(dest, destlen, destsize, hexdigit[c & 15]);
91 }
91 }
92
92
93 /* 3-byte escape: tilde followed by two hex digits */
93 /* 3-byte escape: tilde followed by two hex digits */
94 static inline void escape3(char *dest, Py_ssize_t *destlen, size_t destsize,
94 static inline void escape3(char *dest, Py_ssize_t *destlen, size_t destsize,
95 char c)
95 char c)
96 {
96 {
97 charcopy(dest, destlen, destsize, '~');
97 charcopy(dest, destlen, destsize, '~');
98 hexencode(dest, destlen, destsize, c);
98 hexencode(dest, destlen, destsize, c);
99 }
99 }
100
100
101 static Py_ssize_t _encodedir(char *dest, size_t destsize,
101 static Py_ssize_t _encodedir(char *dest, size_t destsize,
102 const char *src, Py_ssize_t len)
102 const char *src, Py_ssize_t len)
103 {
103 {
104 enum dir_state state = DDEFAULT;
104 enum dir_state state = DDEFAULT;
105 Py_ssize_t i = 0, destlen = 0;
105 Py_ssize_t i = 0, destlen = 0;
106
106
107 while (i < len) {
107 while (i < len) {
108 switch (state) {
108 switch (state) {
109 case DDOT:
109 case DDOT:
110 switch (src[i]) {
110 switch (src[i]) {
111 case 'd':
111 case 'd':
112 case 'i':
112 case 'i':
113 state = DHGDI;
113 state = DHGDI;
114 charcopy(dest, &destlen, destsize, src[i++]);
114 charcopy(dest, &destlen, destsize, src[i++]);
115 break;
115 break;
116 case 'h':
116 case 'h':
117 state = DH;
117 state = DH;
118 charcopy(dest, &destlen, destsize, src[i++]);
118 charcopy(dest, &destlen, destsize, src[i++]);
119 break;
119 break;
120 default:
120 default:
121 state = DDEFAULT;
121 state = DDEFAULT;
122 break;
122 break;
123 }
123 }
124 break;
124 break;
125 case DH:
125 case DH:
126 if (src[i] == 'g') {
126 if (src[i] == 'g') {
127 state = DHGDI;
127 state = DHGDI;
128 charcopy(dest, &destlen, destsize, src[i++]);
128 charcopy(dest, &destlen, destsize, src[i++]);
129 }
129 }
130 else state = DDEFAULT;
130 else state = DDEFAULT;
131 break;
131 break;
132 case DHGDI:
132 case DHGDI:
133 if (src[i] == '/') {
133 if (src[i] == '/') {
134 memcopy(dest, &destlen, destsize, ".hg", 3);
134 memcopy(dest, &destlen, destsize, ".hg", 3);
135 charcopy(dest, &destlen, destsize, src[i++]);
135 charcopy(dest, &destlen, destsize, src[i++]);
136 }
136 }
137 state = DDEFAULT;
137 state = DDEFAULT;
138 break;
138 break;
139 case DDEFAULT:
139 case DDEFAULT:
140 if (src[i] == '.')
140 if (src[i] == '.')
141 state = DDOT;
141 state = DDOT;
142 charcopy(dest, &destlen, destsize, src[i++]);
142 charcopy(dest, &destlen, destsize, src[i++]);
143 break;
143 break;
144 }
144 }
145 }
145 }
146
146
147 return destlen;
147 return destlen;
148 }
148 }
149
149
150 PyObject *encodedir(PyObject *self, PyObject *args)
150 PyObject *encodedir(PyObject *self, PyObject *args)
151 {
151 {
152 Py_ssize_t len, newlen;
152 Py_ssize_t len, newlen;
153 PyObject *pathobj, *newobj;
153 PyObject *pathobj, *newobj;
154 char *path;
154 char *path;
155
155
156 if (!PyArg_ParseTuple(args, "O:encodedir", &pathobj))
156 if (!PyArg_ParseTuple(args, "O:encodedir", &pathobj))
157 return NULL;
157 return NULL;
158
158
159 if (PyBytes_AsStringAndSize(pathobj, &path, &len) == -1) {
159 if (PyBytes_AsStringAndSize(pathobj, &path, &len) == -1) {
160 PyErr_SetString(PyExc_TypeError, "expected a string");
160 PyErr_SetString(PyExc_TypeError, "expected a string");
161 return NULL;
161 return NULL;
162 }
162 }
163
163
164 newlen = len ? _encodedir(NULL, 0, path, len + 1) : 1;
164 newlen = len ? _encodedir(NULL, 0, path, len + 1) : 1;
165
165
166 if (newlen == len + 1) {
166 if (newlen == len + 1) {
167 Py_INCREF(pathobj);
167 Py_INCREF(pathobj);
168 return pathobj;
168 return pathobj;
169 }
169 }
170
170
171 newobj = PyBytes_FromStringAndSize(NULL, newlen);
171 newobj = PyBytes_FromStringAndSize(NULL, newlen);
172
172
173 if (newobj) {
173 if (newobj) {
174 PyBytes_GET_SIZE(newobj)--;
174 assert(PyBytes_Check(newobj));
175 Py_SIZE(newobj)--;
175 _encodedir(PyBytes_AS_STRING(newobj), newlen, path,
176 _encodedir(PyBytes_AS_STRING(newobj), newlen, path,
176 len + 1);
177 len + 1);
177 }
178 }
178
179
179 return newobj;
180 return newobj;
180 }
181 }
181
182
182 static Py_ssize_t _encode(const uint32_t twobytes[8], const uint32_t onebyte[8],
183 static Py_ssize_t _encode(const uint32_t twobytes[8], const uint32_t onebyte[8],
183 char *dest, Py_ssize_t destlen, size_t destsize,
184 char *dest, Py_ssize_t destlen, size_t destsize,
184 const char *src, Py_ssize_t len,
185 const char *src, Py_ssize_t len,
185 int encodedir)
186 int encodedir)
186 {
187 {
187 enum path_state state = START;
188 enum path_state state = START;
188 Py_ssize_t i = 0;
189 Py_ssize_t i = 0;
189
190
190 /*
191 /*
191 * Python strings end with a zero byte, which we use as a
192 * Python strings end with a zero byte, which we use as a
192 * terminal token as they are not valid inside path names.
193 * terminal token as they are not valid inside path names.
193 */
194 */
194
195
195 while (i < len) {
196 while (i < len) {
196 switch (state) {
197 switch (state) {
197 case START:
198 case START:
198 switch (src[i]) {
199 switch (src[i]) {
199 case '/':
200 case '/':
200 charcopy(dest, &destlen, destsize, src[i++]);
201 charcopy(dest, &destlen, destsize, src[i++]);
201 break;
202 break;
202 case '.':
203 case '.':
203 state = LDOT;
204 state = LDOT;
204 escape3(dest, &destlen, destsize, src[i++]);
205 escape3(dest, &destlen, destsize, src[i++]);
205 break;
206 break;
206 case ' ':
207 case ' ':
207 state = DEFAULT;
208 state = DEFAULT;
208 escape3(dest, &destlen, destsize, src[i++]);
209 escape3(dest, &destlen, destsize, src[i++]);
209 break;
210 break;
210 case 'a':
211 case 'a':
211 state = A;
212 state = A;
212 charcopy(dest, &destlen, destsize, src[i++]);
213 charcopy(dest, &destlen, destsize, src[i++]);
213 break;
214 break;
214 case 'c':
215 case 'c':
215 state = C;
216 state = C;
216 charcopy(dest, &destlen, destsize, src[i++]);
217 charcopy(dest, &destlen, destsize, src[i++]);
217 break;
218 break;
218 case 'l':
219 case 'l':
219 state = L;
220 state = L;
220 charcopy(dest, &destlen, destsize, src[i++]);
221 charcopy(dest, &destlen, destsize, src[i++]);
221 break;
222 break;
222 case 'n':
223 case 'n':
223 state = N;
224 state = N;
224 charcopy(dest, &destlen, destsize, src[i++]);
225 charcopy(dest, &destlen, destsize, src[i++]);
225 break;
226 break;
226 case 'p':
227 case 'p':
227 state = P;
228 state = P;
228 charcopy(dest, &destlen, destsize, src[i++]);
229 charcopy(dest, &destlen, destsize, src[i++]);
229 break;
230 break;
230 default:
231 default:
231 state = DEFAULT;
232 state = DEFAULT;
232 break;
233 break;
233 }
234 }
234 break;
235 break;
235 case A:
236 case A:
236 if (src[i] == 'u') {
237 if (src[i] == 'u') {
237 state = AU;
238 state = AU;
238 charcopy(dest, &destlen, destsize, src[i++]);
239 charcopy(dest, &destlen, destsize, src[i++]);
239 }
240 }
240 else state = DEFAULT;
241 else state = DEFAULT;
241 break;
242 break;
242 case AU:
243 case AU:
243 if (src[i] == 'x') {
244 if (src[i] == 'x') {
244 state = THIRD;
245 state = THIRD;
245 i++;
246 i++;
246 }
247 }
247 else state = DEFAULT;
248 else state = DEFAULT;
248 break;
249 break;
249 case THIRD:
250 case THIRD:
250 state = DEFAULT;
251 state = DEFAULT;
251 switch (src[i]) {
252 switch (src[i]) {
252 case '.':
253 case '.':
253 case '/':
254 case '/':
254 case '\0':
255 case '\0':
255 escape3(dest, &destlen, destsize, src[i - 1]);
256 escape3(dest, &destlen, destsize, src[i - 1]);
256 break;
257 break;
257 default:
258 default:
258 i--;
259 i--;
259 break;
260 break;
260 }
261 }
261 break;
262 break;
262 case C:
263 case C:
263 if (src[i] == 'o') {
264 if (src[i] == 'o') {
264 state = CO;
265 state = CO;
265 charcopy(dest, &destlen, destsize, src[i++]);
266 charcopy(dest, &destlen, destsize, src[i++]);
266 }
267 }
267 else state = DEFAULT;
268 else state = DEFAULT;
268 break;
269 break;
269 case CO:
270 case CO:
270 if (src[i] == 'm') {
271 if (src[i] == 'm') {
271 state = COMLPT;
272 state = COMLPT;
272 i++;
273 i++;
273 }
274 }
274 else if (src[i] == 'n') {
275 else if (src[i] == 'n') {
275 state = THIRD;
276 state = THIRD;
276 i++;
277 i++;
277 }
278 }
278 else state = DEFAULT;
279 else state = DEFAULT;
279 break;
280 break;
280 case COMLPT:
281 case COMLPT:
281 switch (src[i]) {
282 switch (src[i]) {
282 case '1': case '2': case '3': case '4': case '5':
283 case '1': case '2': case '3': case '4': case '5':
283 case '6': case '7': case '8': case '9':
284 case '6': case '7': case '8': case '9':
284 state = COMLPTn;
285 state = COMLPTn;
285 i++;
286 i++;
286 break;
287 break;
287 default:
288 default:
288 state = DEFAULT;
289 state = DEFAULT;
289 charcopy(dest, &destlen, destsize, src[i - 1]);
290 charcopy(dest, &destlen, destsize, src[i - 1]);
290 break;
291 break;
291 }
292 }
292 break;
293 break;
293 case COMLPTn:
294 case COMLPTn:
294 state = DEFAULT;
295 state = DEFAULT;
295 switch (src[i]) {
296 switch (src[i]) {
296 case '.':
297 case '.':
297 case '/':
298 case '/':
298 case '\0':
299 case '\0':
299 escape3(dest, &destlen, destsize, src[i - 2]);
300 escape3(dest, &destlen, destsize, src[i - 2]);
300 charcopy(dest, &destlen, destsize, src[i - 1]);
301 charcopy(dest, &destlen, destsize, src[i - 1]);
301 break;
302 break;
302 default:
303 default:
303 memcopy(dest, &destlen, destsize,
304 memcopy(dest, &destlen, destsize,
304 &src[i - 2], 2);
305 &src[i - 2], 2);
305 break;
306 break;
306 }
307 }
307 break;
308 break;
308 case L:
309 case L:
309 if (src[i] == 'p') {
310 if (src[i] == 'p') {
310 state = LP;
311 state = LP;
311 charcopy(dest, &destlen, destsize, src[i++]);
312 charcopy(dest, &destlen, destsize, src[i++]);
312 }
313 }
313 else state = DEFAULT;
314 else state = DEFAULT;
314 break;
315 break;
315 case LP:
316 case LP:
316 if (src[i] == 't') {
317 if (src[i] == 't') {
317 state = COMLPT;
318 state = COMLPT;
318 i++;
319 i++;
319 }
320 }
320 else state = DEFAULT;
321 else state = DEFAULT;
321 break;
322 break;
322 case N:
323 case N:
323 if (src[i] == 'u') {
324 if (src[i] == 'u') {
324 state = NU;
325 state = NU;
325 charcopy(dest, &destlen, destsize, src[i++]);
326 charcopy(dest, &destlen, destsize, src[i++]);
326 }
327 }
327 else state = DEFAULT;
328 else state = DEFAULT;
328 break;
329 break;
329 case NU:
330 case NU:
330 if (src[i] == 'l') {
331 if (src[i] == 'l') {
331 state = THIRD;
332 state = THIRD;
332 i++;
333 i++;
333 }
334 }
334 else state = DEFAULT;
335 else state = DEFAULT;
335 break;
336 break;
336 case P:
337 case P:
337 if (src[i] == 'r') {
338 if (src[i] == 'r') {
338 state = PR;
339 state = PR;
339 charcopy(dest, &destlen, destsize, src[i++]);
340 charcopy(dest, &destlen, destsize, src[i++]);
340 }
341 }
341 else state = DEFAULT;
342 else state = DEFAULT;
342 break;
343 break;
343 case PR:
344 case PR:
344 if (src[i] == 'n') {
345 if (src[i] == 'n') {
345 state = THIRD;
346 state = THIRD;
346 i++;
347 i++;
347 }
348 }
348 else state = DEFAULT;
349 else state = DEFAULT;
349 break;
350 break;
350 case LDOT:
351 case LDOT:
351 switch (src[i]) {
352 switch (src[i]) {
352 case 'd':
353 case 'd':
353 case 'i':
354 case 'i':
354 state = HGDI;
355 state = HGDI;
355 charcopy(dest, &destlen, destsize, src[i++]);
356 charcopy(dest, &destlen, destsize, src[i++]);
356 break;
357 break;
357 case 'h':
358 case 'h':
358 state = H;
359 state = H;
359 charcopy(dest, &destlen, destsize, src[i++]);
360 charcopy(dest, &destlen, destsize, src[i++]);
360 break;
361 break;
361 default:
362 default:
362 state = DEFAULT;
363 state = DEFAULT;
363 break;
364 break;
364 }
365 }
365 break;
366 break;
366 case DOT:
367 case DOT:
367 switch (src[i]) {
368 switch (src[i]) {
368 case '/':
369 case '/':
369 case '\0':
370 case '\0':
370 state = START;
371 state = START;
371 memcopy(dest, &destlen, destsize, "~2e", 3);
372 memcopy(dest, &destlen, destsize, "~2e", 3);
372 charcopy(dest, &destlen, destsize, src[i++]);
373 charcopy(dest, &destlen, destsize, src[i++]);
373 break;
374 break;
374 case 'd':
375 case 'd':
375 case 'i':
376 case 'i':
376 state = HGDI;
377 state = HGDI;
377 charcopy(dest, &destlen, destsize, '.');
378 charcopy(dest, &destlen, destsize, '.');
378 charcopy(dest, &destlen, destsize, src[i++]);
379 charcopy(dest, &destlen, destsize, src[i++]);
379 break;
380 break;
380 case 'h':
381 case 'h':
381 state = H;
382 state = H;
382 memcopy(dest, &destlen, destsize, ".h", 2);
383 memcopy(dest, &destlen, destsize, ".h", 2);
383 i++;
384 i++;
384 break;
385 break;
385 default:
386 default:
386 state = DEFAULT;
387 state = DEFAULT;
387 charcopy(dest, &destlen, destsize, '.');
388 charcopy(dest, &destlen, destsize, '.');
388 break;
389 break;
389 }
390 }
390 break;
391 break;
391 case H:
392 case H:
392 if (src[i] == 'g') {
393 if (src[i] == 'g') {
393 state = HGDI;
394 state = HGDI;
394 charcopy(dest, &destlen, destsize, src[i++]);
395 charcopy(dest, &destlen, destsize, src[i++]);
395 }
396 }
396 else state = DEFAULT;
397 else state = DEFAULT;
397 break;
398 break;
398 case HGDI:
399 case HGDI:
399 if (src[i] == '/') {
400 if (src[i] == '/') {
400 state = START;
401 state = START;
401 if (encodedir)
402 if (encodedir)
402 memcopy(dest, &destlen, destsize, ".hg",
403 memcopy(dest, &destlen, destsize, ".hg",
403 3);
404 3);
404 charcopy(dest, &destlen, destsize, src[i++]);
405 charcopy(dest, &destlen, destsize, src[i++]);
405 }
406 }
406 else state = DEFAULT;
407 else state = DEFAULT;
407 break;
408 break;
408 case SPACE:
409 case SPACE:
409 switch (src[i]) {
410 switch (src[i]) {
410 case '/':
411 case '/':
411 case '\0':
412 case '\0':
412 state = START;
413 state = START;
413 memcopy(dest, &destlen, destsize, "~20", 3);
414 memcopy(dest, &destlen, destsize, "~20", 3);
414 charcopy(dest, &destlen, destsize, src[i++]);
415 charcopy(dest, &destlen, destsize, src[i++]);
415 break;
416 break;
416 default:
417 default:
417 state = DEFAULT;
418 state = DEFAULT;
418 charcopy(dest, &destlen, destsize, ' ');
419 charcopy(dest, &destlen, destsize, ' ');
419 break;
420 break;
420 }
421 }
421 break;
422 break;
422 case DEFAULT:
423 case DEFAULT:
423 while (inset(onebyte, src[i])) {
424 while (inset(onebyte, src[i])) {
424 charcopy(dest, &destlen, destsize, src[i++]);
425 charcopy(dest, &destlen, destsize, src[i++]);
425 if (i == len)
426 if (i == len)
426 goto done;
427 goto done;
427 }
428 }
428 switch (src[i]) {
429 switch (src[i]) {
429 case '.':
430 case '.':
430 state = DOT;
431 state = DOT;
431 i++;
432 i++;
432 break;
433 break;
433 case ' ':
434 case ' ':
434 state = SPACE;
435 state = SPACE;
435 i++;
436 i++;
436 break;
437 break;
437 case '/':
438 case '/':
438 state = START;
439 state = START;
439 charcopy(dest, &destlen, destsize, '/');
440 charcopy(dest, &destlen, destsize, '/');
440 i++;
441 i++;
441 break;
442 break;
442 default:
443 default:
443 if (inset(onebyte, src[i])) {
444 if (inset(onebyte, src[i])) {
444 do {
445 do {
445 charcopy(dest, &destlen,
446 charcopy(dest, &destlen,
446 destsize, src[i++]);
447 destsize, src[i++]);
447 } while (i < len &&
448 } while (i < len &&
448 inset(onebyte, src[i]));
449 inset(onebyte, src[i]));
449 }
450 }
450 else if (inset(twobytes, src[i])) {
451 else if (inset(twobytes, src[i])) {
451 char c = src[i++];
452 char c = src[i++];
452 charcopy(dest, &destlen, destsize, '_');
453 charcopy(dest, &destlen, destsize, '_');
453 charcopy(dest, &destlen, destsize,
454 charcopy(dest, &destlen, destsize,
454 c == '_' ? '_' : c + 32);
455 c == '_' ? '_' : c + 32);
455 }
456 }
456 else
457 else
457 escape3(dest, &destlen, destsize,
458 escape3(dest, &destlen, destsize,
458 src[i++]);
459 src[i++]);
459 break;
460 break;
460 }
461 }
461 break;
462 break;
462 }
463 }
463 }
464 }
464 done:
465 done:
465 return destlen;
466 return destlen;
466 }
467 }
467
468
468 static Py_ssize_t basicencode(char *dest, size_t destsize,
469 static Py_ssize_t basicencode(char *dest, size_t destsize,
469 const char *src, Py_ssize_t len)
470 const char *src, Py_ssize_t len)
470 {
471 {
471 static const uint32_t twobytes[8] = { 0, 0, 0x87fffffe };
472 static const uint32_t twobytes[8] = { 0, 0, 0x87fffffe };
472
473
473 static const uint32_t onebyte[8] = {
474 static const uint32_t onebyte[8] = {
474 1, 0x2bff3bfa, 0x68000001, 0x2fffffff,
475 1, 0x2bff3bfa, 0x68000001, 0x2fffffff,
475 };
476 };
476
477
477 Py_ssize_t destlen = 0;
478 Py_ssize_t destlen = 0;
478
479
479 return _encode(twobytes, onebyte, dest, destlen, destsize,
480 return _encode(twobytes, onebyte, dest, destlen, destsize,
480 src, len, 1);
481 src, len, 1);
481 }
482 }
482
483
483 static const Py_ssize_t maxstorepathlen = 120;
484 static const Py_ssize_t maxstorepathlen = 120;
484
485
485 static Py_ssize_t _lowerencode(char *dest, size_t destsize,
486 static Py_ssize_t _lowerencode(char *dest, size_t destsize,
486 const char *src, Py_ssize_t len)
487 const char *src, Py_ssize_t len)
487 {
488 {
488 static const uint32_t onebyte[8] = {
489 static const uint32_t onebyte[8] = {
489 1, 0x2bfffbfb, 0xe8000001, 0x2fffffff
490 1, 0x2bfffbfb, 0xe8000001, 0x2fffffff
490 };
491 };
491
492
492 static const uint32_t lower[8] = { 0, 0, 0x7fffffe };
493 static const uint32_t lower[8] = { 0, 0, 0x7fffffe };
493
494
494 Py_ssize_t i, destlen = 0;
495 Py_ssize_t i, destlen = 0;
495
496
496 for (i = 0; i < len; i++) {
497 for (i = 0; i < len; i++) {
497 if (inset(onebyte, src[i]))
498 if (inset(onebyte, src[i]))
498 charcopy(dest, &destlen, destsize, src[i]);
499 charcopy(dest, &destlen, destsize, src[i]);
499 else if (inset(lower, src[i]))
500 else if (inset(lower, src[i]))
500 charcopy(dest, &destlen, destsize, src[i] + 32);
501 charcopy(dest, &destlen, destsize, src[i] + 32);
501 else
502 else
502 escape3(dest, &destlen, destsize, src[i]);
503 escape3(dest, &destlen, destsize, src[i]);
503 }
504 }
504
505
505 return destlen;
506 return destlen;
506 }
507 }
507
508
508 PyObject *lowerencode(PyObject *self, PyObject *args)
509 PyObject *lowerencode(PyObject *self, PyObject *args)
509 {
510 {
510 char *path;
511 char *path;
511 Py_ssize_t len, newlen;
512 Py_ssize_t len, newlen;
512 PyObject *ret;
513 PyObject *ret;
513
514
514 if (!PyArg_ParseTuple(args, "s#:lowerencode", &path, &len))
515 if (!PyArg_ParseTuple(args, "s#:lowerencode", &path, &len))
515 return NULL;
516 return NULL;
516
517
517 newlen = _lowerencode(NULL, 0, path, len);
518 newlen = _lowerencode(NULL, 0, path, len);
518 ret = PyBytes_FromStringAndSize(NULL, newlen);
519 ret = PyBytes_FromStringAndSize(NULL, newlen);
519 if (ret)
520 if (ret)
520 _lowerencode(PyBytes_AS_STRING(ret), newlen, path, len);
521 _lowerencode(PyBytes_AS_STRING(ret), newlen, path, len);
521
522
522 return ret;
523 return ret;
523 }
524 }
524
525
525 /* See store.py:_auxencode for a description. */
526 /* See store.py:_auxencode for a description. */
526 static Py_ssize_t auxencode(char *dest, size_t destsize,
527 static Py_ssize_t auxencode(char *dest, size_t destsize,
527 const char *src, Py_ssize_t len)
528 const char *src, Py_ssize_t len)
528 {
529 {
529 static const uint32_t twobytes[8];
530 static const uint32_t twobytes[8];
530
531
531 static const uint32_t onebyte[8] = {
532 static const uint32_t onebyte[8] = {
532 ~0U, 0xffff3ffe, ~0U, ~0U, ~0U, ~0U, ~0U, ~0U,
533 ~0U, 0xffff3ffe, ~0U, ~0U, ~0U, ~0U, ~0U, ~0U,
533 };
534 };
534
535
535 return _encode(twobytes, onebyte, dest, 0, destsize, src, len, 0);
536 return _encode(twobytes, onebyte, dest, 0, destsize, src, len, 0);
536 }
537 }
537
538
538 static PyObject *hashmangle(const char *src, Py_ssize_t len, const char sha[20])
539 static PyObject *hashmangle(const char *src, Py_ssize_t len, const char sha[20])
539 {
540 {
540 static const Py_ssize_t dirprefixlen = 8;
541 static const Py_ssize_t dirprefixlen = 8;
541 static const Py_ssize_t maxshortdirslen = 68;
542 static const Py_ssize_t maxshortdirslen = 68;
542 char *dest;
543 char *dest;
543 PyObject *ret;
544 PyObject *ret;
544
545
545 Py_ssize_t i, d, p, lastslash = len - 1, lastdot = -1;
546 Py_ssize_t i, d, p, lastslash = len - 1, lastdot = -1;
546 Py_ssize_t destsize, destlen = 0, slop, used;
547 Py_ssize_t destsize, destlen = 0, slop, used;
547
548
548 while (lastslash >= 0 && src[lastslash] != '/') {
549 while (lastslash >= 0 && src[lastslash] != '/') {
549 if (src[lastslash] == '.' && lastdot == -1)
550 if (src[lastslash] == '.' && lastdot == -1)
550 lastdot = lastslash;
551 lastdot = lastslash;
551 lastslash--;
552 lastslash--;
552 }
553 }
553
554
554 #if 0
555 #if 0
555 /* All paths should end in a suffix of ".i" or ".d".
556 /* All paths should end in a suffix of ".i" or ".d".
556 Unfortunately, the file names in test-hybridencode.py
557 Unfortunately, the file names in test-hybridencode.py
557 violate this rule. */
558 violate this rule. */
558 if (lastdot != len - 3) {
559 if (lastdot != len - 3) {
559 PyErr_SetString(PyExc_ValueError,
560 PyErr_SetString(PyExc_ValueError,
560 "suffix missing or wrong length");
561 "suffix missing or wrong length");
561 return NULL;
562 return NULL;
562 }
563 }
563 #endif
564 #endif
564
565
565 /* If src contains a suffix, we will append it to the end of
566 /* If src contains a suffix, we will append it to the end of
566 the new string, so make room. */
567 the new string, so make room. */
567 destsize = 120;
568 destsize = 120;
568 if (lastdot >= 0)
569 if (lastdot >= 0)
569 destsize += len - lastdot - 1;
570 destsize += len - lastdot - 1;
570
571
571 ret = PyBytes_FromStringAndSize(NULL, destsize);
572 ret = PyBytes_FromStringAndSize(NULL, destsize);
572 if (ret == NULL)
573 if (ret == NULL)
573 return NULL;
574 return NULL;
574
575
575 dest = PyBytes_AS_STRING(ret);
576 dest = PyBytes_AS_STRING(ret);
576 memcopy(dest, &destlen, destsize, "dh/", 3);
577 memcopy(dest, &destlen, destsize, "dh/", 3);
577
578
578 /* Copy up to dirprefixlen bytes of each path component, up to
579 /* Copy up to dirprefixlen bytes of each path component, up to
579 a limit of maxshortdirslen bytes. */
580 a limit of maxshortdirslen bytes. */
580 for (i = d = p = 0; i < lastslash; i++, p++) {
581 for (i = d = p = 0; i < lastslash; i++, p++) {
581 if (src[i] == '/') {
582 if (src[i] == '/') {
582 char d = dest[destlen - 1];
583 char d = dest[destlen - 1];
583 /* After truncation, a directory name may end
584 /* After truncation, a directory name may end
584 in a space or dot, which are unportable. */
585 in a space or dot, which are unportable. */
585 if (d == '.' || d == ' ')
586 if (d == '.' || d == ' ')
586 dest[destlen - 1] = '_';
587 dest[destlen - 1] = '_';
587 /* The + 3 is to account for "dh/" in the beginning */
588 /* The + 3 is to account for "dh/" in the beginning */
588 if (destlen > maxshortdirslen + 3)
589 if (destlen > maxshortdirslen + 3)
589 break;
590 break;
590 charcopy(dest, &destlen, destsize, src[i]);
591 charcopy(dest, &destlen, destsize, src[i]);
591 p = -1;
592 p = -1;
592 }
593 }
593 else if (p < dirprefixlen)
594 else if (p < dirprefixlen)
594 charcopy(dest, &destlen, destsize, src[i]);
595 charcopy(dest, &destlen, destsize, src[i]);
595 }
596 }
596
597
597 /* Rewind to just before the last slash copied. */
598 /* Rewind to just before the last slash copied. */
598 if (destlen > maxshortdirslen + 3)
599 if (destlen > maxshortdirslen + 3)
599 do {
600 do {
600 destlen--;
601 destlen--;
601 } while (destlen > 0 && dest[destlen] != '/');
602 } while (destlen > 0 && dest[destlen] != '/');
602
603
603 if (destlen > 3) {
604 if (destlen > 3) {
604 if (lastslash > 0) {
605 if (lastslash > 0) {
605 char d = dest[destlen - 1];
606 char d = dest[destlen - 1];
606 /* The last directory component may be
607 /* The last directory component may be
607 truncated, so make it safe. */
608 truncated, so make it safe. */
608 if (d == '.' || d == ' ')
609 if (d == '.' || d == ' ')
609 dest[destlen - 1] = '_';
610 dest[destlen - 1] = '_';
610 }
611 }
611
612
612 charcopy(dest, &destlen, destsize, '/');
613 charcopy(dest, &destlen, destsize, '/');
613 }
614 }
614
615
615 /* Add a prefix of the original file's name. Its length
616 /* Add a prefix of the original file's name. Its length
616 depends on the number of bytes left after accounting for
617 depends on the number of bytes left after accounting for
617 hash and suffix. */
618 hash and suffix. */
618 used = destlen + 40;
619 used = destlen + 40;
619 if (lastdot >= 0)
620 if (lastdot >= 0)
620 used += len - lastdot - 1;
621 used += len - lastdot - 1;
621 slop = maxstorepathlen - used;
622 slop = maxstorepathlen - used;
622 if (slop > 0) {
623 if (slop > 0) {
623 Py_ssize_t basenamelen =
624 Py_ssize_t basenamelen =
624 lastslash >= 0 ? len - lastslash - 2 : len - 1;
625 lastslash >= 0 ? len - lastslash - 2 : len - 1;
625
626
626 if (basenamelen > slop)
627 if (basenamelen > slop)
627 basenamelen = slop;
628 basenamelen = slop;
628 if (basenamelen > 0)
629 if (basenamelen > 0)
629 memcopy(dest, &destlen, destsize, &src[lastslash + 1],
630 memcopy(dest, &destlen, destsize, &src[lastslash + 1],
630 basenamelen);
631 basenamelen);
631 }
632 }
632
633
633 /* Add hash and suffix. */
634 /* Add hash and suffix. */
634 for (i = 0; i < 20; i++)
635 for (i = 0; i < 20; i++)
635 hexencode(dest, &destlen, destsize, sha[i]);
636 hexencode(dest, &destlen, destsize, sha[i]);
636
637
637 if (lastdot >= 0)
638 if (lastdot >= 0)
638 memcopy(dest, &destlen, destsize, &src[lastdot],
639 memcopy(dest, &destlen, destsize, &src[lastdot],
639 len - lastdot - 1);
640 len - lastdot - 1);
640
641
641 PyBytes_GET_SIZE(ret) = destlen;
642 PyBytes_Check(ret);
643 Py_SIZE(ret) = destlen;
642
644
643 return ret;
645 return ret;
644 }
646 }
645
647
646 /*
648 /*
647 * Avoiding a trip through Python would improve performance by 50%,
649 * Avoiding a trip through Python would improve performance by 50%,
648 * but we don't encounter enough long names to be worth the code.
650 * but we don't encounter enough long names to be worth the code.
649 */
651 */
650 static int sha1hash(char hash[20], const char *str, Py_ssize_t len)
652 static int sha1hash(char hash[20], const char *str, Py_ssize_t len)
651 {
653 {
652 static PyObject *shafunc;
654 static PyObject *shafunc;
653 PyObject *shaobj, *hashobj;
655 PyObject *shaobj, *hashobj;
654
656
655 if (shafunc == NULL) {
657 if (shafunc == NULL) {
656 PyObject *hashlib, *name = PyBytes_FromString("hashlib");
658 PyObject *hashlib, *name = PyBytes_FromString("hashlib");
657
659
658 if (name == NULL)
660 if (name == NULL)
659 return -1;
661 return -1;
660
662
661 hashlib = PyImport_Import(name);
663 hashlib = PyImport_Import(name);
662 Py_DECREF(name);
664 Py_DECREF(name);
663
665
664 if (hashlib == NULL) {
666 if (hashlib == NULL) {
665 PyErr_SetString(PyExc_ImportError, "hashlib");
667 PyErr_SetString(PyExc_ImportError, "hashlib");
666 return -1;
668 return -1;
667 }
669 }
668 shafunc = PyObject_GetAttrString(hashlib, "sha1");
670 shafunc = PyObject_GetAttrString(hashlib, "sha1");
669 Py_DECREF(hashlib);
671 Py_DECREF(hashlib);
670
672
671 if (shafunc == NULL) {
673 if (shafunc == NULL) {
672 PyErr_SetString(PyExc_AttributeError,
674 PyErr_SetString(PyExc_AttributeError,
673 "module 'hashlib' has no "
675 "module 'hashlib' has no "
674 "attribute 'sha1'");
676 "attribute 'sha1'");
675 return -1;
677 return -1;
676 }
678 }
677 }
679 }
678
680
679 shaobj = PyObject_CallFunction(shafunc, "s#", str, len);
681 shaobj = PyObject_CallFunction(shafunc, "s#", str, len);
680
682
681 if (shaobj == NULL)
683 if (shaobj == NULL)
682 return -1;
684 return -1;
683
685
684 hashobj = PyObject_CallMethod(shaobj, "digest", "");
686 hashobj = PyObject_CallMethod(shaobj, "digest", "");
685 Py_DECREF(shaobj);
687 Py_DECREF(shaobj);
686 if (hashobj == NULL)
688 if (hashobj == NULL)
687 return -1;
689 return -1;
688
690
689 if (!PyBytes_Check(hashobj) || PyBytes_GET_SIZE(hashobj) != 20) {
691 if (!PyBytes_Check(hashobj) || PyBytes_GET_SIZE(hashobj) != 20) {
690 PyErr_SetString(PyExc_TypeError,
692 PyErr_SetString(PyExc_TypeError,
691 "result of digest is not a 20-byte hash");
693 "result of digest is not a 20-byte hash");
692 Py_DECREF(hashobj);
694 Py_DECREF(hashobj);
693 return -1;
695 return -1;
694 }
696 }
695
697
696 memcpy(hash, PyBytes_AS_STRING(hashobj), 20);
698 memcpy(hash, PyBytes_AS_STRING(hashobj), 20);
697 Py_DECREF(hashobj);
699 Py_DECREF(hashobj);
698 return 0;
700 return 0;
699 }
701 }
700
702
/*
 * Size in bytes of each scratch buffer used by hashencode(). The
 * expansion is parenthesized so the macro is safe inside any expression.
 */
#define MAXENCODE (4096 * 4)
702
704
703 static PyObject *hashencode(const char *src, Py_ssize_t len)
705 static PyObject *hashencode(const char *src, Py_ssize_t len)
704 {
706 {
705 char dired[MAXENCODE];
707 char dired[MAXENCODE];
706 char lowered[MAXENCODE];
708 char lowered[MAXENCODE];
707 char auxed[MAXENCODE];
709 char auxed[MAXENCODE];
708 Py_ssize_t dirlen, lowerlen, auxlen, baselen;
710 Py_ssize_t dirlen, lowerlen, auxlen, baselen;
709 char sha[20];
711 char sha[20];
710
712
711 baselen = (len - 5) * 3;
713 baselen = (len - 5) * 3;
712 if (baselen >= MAXENCODE) {
714 if (baselen >= MAXENCODE) {
713 PyErr_SetString(PyExc_ValueError, "string too long");
715 PyErr_SetString(PyExc_ValueError, "string too long");
714 return NULL;
716 return NULL;
715 }
717 }
716
718
717 dirlen = _encodedir(dired, baselen, src, len);
719 dirlen = _encodedir(dired, baselen, src, len);
718 if (sha1hash(sha, dired, dirlen - 1) == -1)
720 if (sha1hash(sha, dired, dirlen - 1) == -1)
719 return NULL;
721 return NULL;
720 lowerlen = _lowerencode(lowered, baselen, dired + 5, dirlen - 5);
722 lowerlen = _lowerencode(lowered, baselen, dired + 5, dirlen - 5);
721 auxlen = auxencode(auxed, baselen, lowered, lowerlen);
723 auxlen = auxencode(auxed, baselen, lowered, lowerlen);
722 return hashmangle(auxed, auxlen, sha);
724 return hashmangle(auxed, auxlen, sha);
723 }
725 }
724
726
725 PyObject *pathencode(PyObject *self, PyObject *args)
727 PyObject *pathencode(PyObject *self, PyObject *args)
726 {
728 {
727 Py_ssize_t len, newlen;
729 Py_ssize_t len, newlen;
728 PyObject *pathobj, *newobj;
730 PyObject *pathobj, *newobj;
729 char *path;
731 char *path;
730
732
731 if (!PyArg_ParseTuple(args, "O:pathencode", &pathobj))
733 if (!PyArg_ParseTuple(args, "O:pathencode", &pathobj))
732 return NULL;
734 return NULL;
733
735
734 if (PyBytes_AsStringAndSize(pathobj, &path, &len) == -1) {
736 if (PyBytes_AsStringAndSize(pathobj, &path, &len) == -1) {
735 PyErr_SetString(PyExc_TypeError, "expected a string");
737 PyErr_SetString(PyExc_TypeError, "expected a string");
736 return NULL;
738 return NULL;
737 }
739 }
738
740
739 if (len > maxstorepathlen)
741 if (len > maxstorepathlen)
740 newlen = maxstorepathlen + 2;
742 newlen = maxstorepathlen + 2;
741 else
743 else
742 newlen = len ? basicencode(NULL, 0, path, len + 1) : 1;
744 newlen = len ? basicencode(NULL, 0, path, len + 1) : 1;
743
745
744 if (newlen <= maxstorepathlen + 1) {
746 if (newlen <= maxstorepathlen + 1) {
745 if (newlen == len + 1) {
747 if (newlen == len + 1) {
746 Py_INCREF(pathobj);
748 Py_INCREF(pathobj);
747 return pathobj;
749 return pathobj;
748 }
750 }
749
751
750 newobj = PyBytes_FromStringAndSize(NULL, newlen);
752 newobj = PyBytes_FromStringAndSize(NULL, newlen);
751
753
752 if (newobj) {
754 if (newobj) {
753 PyBytes_GET_SIZE(newobj)--;
755 PyBytes_Check(newobj);
756 Py_SIZE(newobj)--;
754 basicencode(PyBytes_AS_STRING(newobj), newlen, path,
757 basicencode(PyBytes_AS_STRING(newobj), newlen, path,
755 len + 1);
758 len + 1);
756 }
759 }
757 }
760 }
758 else
761 else
759 newobj = hashencode(path, len + 1);
762 newobj = hashencode(path, len + 1);
760
763
761 return newobj;
764 return newobj;
762 }
765 }
General Comments 0
You need to be logged in to leave comments. Login now