##// END OF EJS Templates
pathencode: simplify basicencode
Adrian Buehlmann -
r17691:c6c7e466 default
parent child Browse files
Show More
@@ -1,532 +1,525 b''
1 1 /*
2 2 pathencode.c - efficient path name encoding
3 3
4 4 Copyright 2012 Facebook
5 5
6 6 This software may be used and distributed according to the terms of
7 7 the GNU General Public License, incorporated herein by reference.
8 8 */
9 9
10 10 /*
11 11 * An implementation of the name encoding scheme used by the fncache
12 12 * store. The common case is of a path < 120 bytes long, which is
13 13 * handled either in a single pass with no allocations or two passes
14 14 * with a single allocation. For longer paths, multiple passes are
15 15 * required.
16 16 */
17 17
18 18 #include <Python.h>
19 19 #include <assert.h>
20 20 #include <ctype.h>
21 21 #include <stdlib.h>
22 22 #include <string.h>
23 23
24 24 #include "util.h"
25 25
26 26 /* state machine for the fast path */
27 27 enum path_state {
28 28 START, /* first byte of a path component */
29 29 A, /* "AUX" */
30 30 AU,
31 31 THIRD, /* third of a 3-byte sequence, e.g. "AUX", "NUL" */
32 32 C, /* "CON" or "COMn" */
33 33 CO,
34 34 COMLPT, /* "COM" or "LPT" */
35 35 COMLPTn,
36 36 L,
37 37 LP,
38 38 N,
39 39 NU,
40 40 P, /* "PRN" */
41 41 PR,
42 42 LDOT, /* leading '.' */
43 43 DOT, /* '.' in a non-leading position */
44 44 H, /* ".h" */
45 45 HGDI, /* ".hg", ".d", or ".i" */
46 46 SPACE,
47 47 DEFAULT, /* byte of a path component after the first */
48 48 };
49 49
50 50 /* state machine for dir-encoding */
51 51 enum dir_state {
52 52 DDOT,
53 53 DH,
54 54 DHGDI,
55 55 DDEFAULT,
56 56 };
57 57
58 58 static inline int isset(const uint32_t bitset[], char c)
59 59 {
60 60 return bitset[((uint8_t)c) >> 5] & (1 << (((uint8_t)c) & 31));
61 61 }
62 62
63 63 static inline void charcopy(char *dest, Py_ssize_t *destlen, size_t destsize,
64 64 char c)
65 65 {
66 66 if (dest) {
67 67 assert(*destlen < destsize);
68 68 dest[*destlen] = c;
69 69 }
70 70 (*destlen)++;
71 71 }
72 72
73 73 static inline void memcopy(char *dest, Py_ssize_t *destlen, size_t destsize,
74 74 const void *src, Py_ssize_t len)
75 75 {
76 76 if (dest) {
77 77 assert(*destlen + len < destsize);
78 78 memcpy((void *)&dest[*destlen], src, len);
79 79 }
80 80 *destlen += len;
81 81 }
82 82
83 83 static inline void hexencode(char *dest, Py_ssize_t *destlen, size_t destsize,
84 84 uint8_t c)
85 85 {
86 86 static const char hexdigit[] = "0123456789abcdef";
87 87
88 88 charcopy(dest, destlen, destsize, hexdigit[c >> 4]);
89 89 charcopy(dest, destlen, destsize, hexdigit[c & 15]);
90 90 }
91 91
92 92 /* 3-byte escape: tilde followed by two hex digits */
93 93 static inline void escape3(char *dest, Py_ssize_t *destlen, size_t destsize,
94 94 char c)
95 95 {
96 96 charcopy(dest, destlen, destsize, '~');
97 97 hexencode(dest, destlen, destsize, c);
98 98 }
99 99
100 100 static Py_ssize_t _encodedir(char *dest, size_t destsize,
101 101 const char *src, Py_ssize_t len)
102 102 {
103 103 enum dir_state state = DDEFAULT;
104 104 Py_ssize_t i = 0, destlen = 0;
105 105
106 106 while (i < len) {
107 107 switch (state) {
108 108 case DDOT:
109 109 switch (src[i]) {
110 110 case 'd':
111 111 case 'i':
112 112 state = DHGDI;
113 113 charcopy(dest, &destlen, destsize, src[i++]);
114 114 break;
115 115 case 'h':
116 116 state = DH;
117 117 charcopy(dest, &destlen, destsize, src[i++]);
118 118 break;
119 119 default:
120 120 state = DDEFAULT;
121 121 break;
122 122 }
123 123 break;
124 124 case DH:
125 125 if (src[i] == 'g') {
126 126 state = DHGDI;
127 127 charcopy(dest, &destlen, destsize, src[i++]);
128 128 }
129 129 else state = DDEFAULT;
130 130 break;
131 131 case DHGDI:
132 132 if (src[i] == '/') {
133 133 memcopy(dest, &destlen, destsize, ".hg", 3);
134 134 charcopy(dest, &destlen, destsize, src[i++]);
135 135 }
136 136 state = DDEFAULT;
137 137 break;
138 138 case DDEFAULT:
139 139 if (src[i] == '.')
140 140 state = DDOT;
141 141 charcopy(dest, &destlen, destsize, src[i++]);
142 142 break;
143 143 }
144 144 }
145 145
146 146 return destlen;
147 147 }
148 148
149 149 PyObject *encodedir(PyObject *self, PyObject *args)
150 150 {
151 151 Py_ssize_t len, newlen;
152 152 PyObject *pathobj, *newobj;
153 153 char *path;
154 154
155 155 if (!PyArg_ParseTuple(args, "O:encodedir", &pathobj))
156 156 return NULL;
157 157
158 158 if (PyString_AsStringAndSize(pathobj, &path, &len) == -1) {
159 159 PyErr_SetString(PyExc_TypeError, "expected a string");
160 160 return NULL;
161 161 }
162 162
163 163 newlen = len ? _encodedir(NULL, 0, path, len + 1) : 1;
164 164
165 165 if (newlen == len + 1) {
166 166 Py_INCREF(pathobj);
167 167 return pathobj;
168 168 }
169 169
170 170 newobj = PyString_FromStringAndSize(NULL, newlen);
171 171
172 172 if (newobj) {
173 173 PyString_GET_SIZE(newobj)--;
174 174 _encodedir(PyString_AS_STRING(newobj), newlen, path,
175 175 len + 1);
176 176 }
177 177
178 178 return newobj;
179 179 }
180 180
181 181 static Py_ssize_t _encode(const uint32_t twobytes[8], const uint32_t onebyte[8],
182 182 char *dest, Py_ssize_t destlen, size_t destsize,
183 183 const char *src, Py_ssize_t len,
184 184 int encodedir)
185 185 {
186 186 enum path_state state = START;
187 187 Py_ssize_t i = 0;
188 188
189 189 /*
190 190 * Python strings end with a zero byte, which we use as a
191 191 * terminal token as they are not valid inside path names.
192 192 */
193 193
194 194 while (i < len) {
195 195 switch (state) {
196 196 case START:
197 197 switch (src[i]) {
198 198 case '/':
199 199 charcopy(dest, &destlen, destsize, src[i++]);
200 200 break;
201 201 case '.':
202 202 state = LDOT;
203 203 escape3(dest, &destlen, destsize, src[i++]);
204 204 break;
205 205 case ' ':
206 206 state = DEFAULT;
207 207 escape3(dest, &destlen, destsize, src[i++]);
208 208 break;
209 209 case 'a':
210 210 state = A;
211 211 charcopy(dest, &destlen, destsize, src[i++]);
212 212 break;
213 213 case 'c':
214 214 state = C;
215 215 charcopy(dest, &destlen, destsize, src[i++]);
216 216 break;
217 217 case 'l':
218 218 state = L;
219 219 charcopy(dest, &destlen, destsize, src[i++]);
220 220 break;
221 221 case 'n':
222 222 state = N;
223 223 charcopy(dest, &destlen, destsize, src[i++]);
224 224 break;
225 225 case 'p':
226 226 state = P;
227 227 charcopy(dest, &destlen, destsize, src[i++]);
228 228 break;
229 229 default:
230 230 state = DEFAULT;
231 231 break;
232 232 }
233 233 break;
234 234 case A:
235 235 if (src[i] == 'u') {
236 236 state = AU;
237 237 charcopy(dest, &destlen, destsize, src[i++]);
238 238 }
239 239 else state = DEFAULT;
240 240 break;
241 241 case AU:
242 242 if (src[i] == 'x') {
243 243 state = THIRD;
244 244 i++;
245 245 }
246 246 else state = DEFAULT;
247 247 break;
248 248 case THIRD:
249 249 state = DEFAULT;
250 250 switch (src[i]) {
251 251 case '.':
252 252 case '/':
253 253 case '\0':
254 254 escape3(dest, &destlen, destsize, src[i - 1]);
255 255 break;
256 256 default:
257 257 i--;
258 258 break;
259 259 }
260 260 break;
261 261 case C:
262 262 if (src[i] == 'o') {
263 263 state = CO;
264 264 charcopy(dest, &destlen, destsize, src[i++]);
265 265 }
266 266 else state = DEFAULT;
267 267 break;
268 268 case CO:
269 269 if (src[i] == 'm') {
270 270 state = COMLPT;
271 271 i++;
272 272 }
273 273 else if (src[i] == 'n') {
274 274 state = THIRD;
275 275 i++;
276 276 }
277 277 else state = DEFAULT;
278 278 break;
279 279 case COMLPT:
280 280 switch (src[i]) {
281 281 case '1': case '2': case '3': case '4': case '5':
282 282 case '6': case '7': case '8': case '9':
283 283 state = COMLPTn;
284 284 i++;
285 285 break;
286 286 default:
287 287 state = DEFAULT;
288 288 charcopy(dest, &destlen, destsize, src[i - 1]);
289 289 break;
290 290 }
291 291 break;
292 292 case COMLPTn:
293 293 state = DEFAULT;
294 294 switch (src[i]) {
295 295 case '.':
296 296 case '/':
297 297 case '\0':
298 298 escape3(dest, &destlen, destsize, src[i - 2]);
299 299 charcopy(dest, &destlen, destsize, src[i - 1]);
300 300 break;
301 301 default:
302 302 memcopy(dest, &destlen, destsize,
303 303 &src[i - 2], 2);
304 304 break;
305 305 }
306 306 break;
307 307 case L:
308 308 if (src[i] == 'p') {
309 309 state = LP;
310 310 charcopy(dest, &destlen, destsize, src[i++]);
311 311 }
312 312 else state = DEFAULT;
313 313 break;
314 314 case LP:
315 315 if (src[i] == 't') {
316 316 state = COMLPT;
317 317 i++;
318 318 }
319 319 else state = DEFAULT;
320 320 break;
321 321 case N:
322 322 if (src[i] == 'u') {
323 323 state = NU;
324 324 charcopy(dest, &destlen, destsize, src[i++]);
325 325 }
326 326 else state = DEFAULT;
327 327 break;
328 328 case NU:
329 329 if (src[i] == 'l') {
330 330 state = THIRD;
331 331 i++;
332 332 }
333 333 else state = DEFAULT;
334 334 break;
335 335 case P:
336 336 if (src[i] == 'r') {
337 337 state = PR;
338 338 charcopy(dest, &destlen, destsize, src[i++]);
339 339 }
340 340 else state = DEFAULT;
341 341 break;
342 342 case PR:
343 343 if (src[i] == 'n') {
344 344 state = THIRD;
345 345 i++;
346 346 }
347 347 else state = DEFAULT;
348 348 break;
349 349 case LDOT:
350 350 switch (src[i]) {
351 351 case 'd':
352 352 case 'i':
353 353 state = HGDI;
354 354 charcopy(dest, &destlen, destsize, src[i++]);
355 355 break;
356 356 case 'h':
357 357 state = H;
358 358 charcopy(dest, &destlen, destsize, src[i++]);
359 359 break;
360 360 default:
361 361 state = DEFAULT;
362 362 break;
363 363 }
364 364 break;
365 365 case DOT:
366 366 switch (src[i]) {
367 367 case '/':
368 368 case '\0':
369 369 state = START;
370 370 memcopy(dest, &destlen, destsize, "~2e", 3);
371 371 charcopy(dest, &destlen, destsize, src[i++]);
372 372 break;
373 373 case 'd':
374 374 case 'i':
375 375 state = HGDI;
376 376 charcopy(dest, &destlen, destsize, '.');
377 377 charcopy(dest, &destlen, destsize, src[i++]);
378 378 break;
379 379 case 'h':
380 380 state = H;
381 381 memcopy(dest, &destlen, destsize, ".h", 2);
382 382 i++;
383 383 break;
384 384 default:
385 385 state = DEFAULT;
386 386 charcopy(dest, &destlen, destsize, '.');
387 387 break;
388 388 }
389 389 break;
390 390 case H:
391 391 if (src[i] == 'g') {
392 392 state = HGDI;
393 393 charcopy(dest, &destlen, destsize, src[i++]);
394 394 }
395 395 else state = DEFAULT;
396 396 break;
397 397 case HGDI:
398 398 if (src[i] == '/') {
399 399 state = START;
400 400 if (encodedir)
401 401 memcopy(dest, &destlen, destsize, ".hg",
402 402 3);
403 403 charcopy(dest, &destlen, destsize, src[i++]);
404 404 }
405 405 else state = DEFAULT;
406 406 break;
407 407 case SPACE:
408 408 switch (src[i]) {
409 409 case '/':
410 410 case '\0':
411 411 state = START;
412 412 memcopy(dest, &destlen, destsize, "~20", 3);
413 413 charcopy(dest, &destlen, destsize, src[i++]);
414 414 break;
415 415 default:
416 416 state = DEFAULT;
417 417 charcopy(dest, &destlen, destsize, ' ');
418 418 break;
419 419 }
420 420 break;
421 421 case DEFAULT:
422 422 while (isset(onebyte, src[i])) {
423 423 charcopy(dest, &destlen, destsize, src[i++]);
424 424 if (i == len)
425 425 goto done;
426 426 }
427 427 switch (src[i]) {
428 428 case '.':
429 429 state = DOT;
430 430 i++;
431 431 break;
432 432 case ' ':
433 433 state = SPACE;
434 434 i++;
435 435 break;
436 436 case '/':
437 437 state = START;
438 438 charcopy(dest, &destlen, destsize, '/');
439 439 i++;
440 440 break;
441 441 default:
442 442 if (isset(onebyte, src[i])) {
443 443 do {
444 444 charcopy(dest, &destlen,
445 445 destsize, src[i++]);
446 446 } while (i < len &&
447 447 isset(onebyte, src[i]));
448 448 }
449 449 else if (isset(twobytes, src[i])) {
450 450 char c = src[i++];
451 451 charcopy(dest, &destlen, destsize, '_');
452 452 charcopy(dest, &destlen, destsize,
453 453 c == '_' ? '_' : c + 32);
454 454 }
455 455 else
456 456 escape3(dest, &destlen, destsize,
457 457 src[i++]);
458 458 break;
459 459 }
460 460 break;
461 461 }
462 462 }
463 463 done:
464 464 return destlen;
465 465 }
466 466
467 467 static Py_ssize_t basicencode(char *dest, size_t destsize,
468 468 const char *src, Py_ssize_t len)
469 469 {
470 470 static const uint32_t twobytes[8] = { 0, 0, 0x87fffffe };
471 471
472 472 static const uint32_t onebyte[8] = {
473 473 1, 0x2bff3bfa, 0x68000001, 0x2fffffff,
474 474 };
475 475
476 476 Py_ssize_t destlen = 0;
477 477
478 if (len < 5 || memcmp(src, "data/", 5) != 0) {
479 memcopy(dest, &destlen, destsize, src, len);
480 return destlen;
481 }
482
483 memcopy(dest, &destlen, destsize, "data/", 5);
484
485 478 return _encode(twobytes, onebyte, dest, destlen, destsize,
486 src + 5, len - 5, 1);
479 src, len, 1);
487 480 }
488 481
489 482 static const Py_ssize_t maxstorepathlen = 120;
490 483
491 484 /*
492 485 * We currently implement only basic encoding.
493 486 *
494 487 * If a name is too long to encode due to Windows path name limits,
495 488 * this function returns None.
496 489 */
497 490 PyObject *pathencode(PyObject *self, PyObject *args)
498 491 {
499 492 Py_ssize_t len, newlen;
500 493 PyObject *pathobj, *newobj;
501 494 char *path;
502 495
503 496 if (!PyArg_ParseTuple(args, "O:pathencode", &pathobj))
504 497 return NULL;
505 498
506 499 if (PyString_AsStringAndSize(pathobj, &path, &len) == -1) {
507 500 PyErr_SetString(PyExc_TypeError, "expected a string");
508 501 return NULL;
509 502 }
510 503
511 504 newlen = len ? basicencode(NULL, 0, path, len + 1) : 1;
512 505
513 506 if (newlen <= maxstorepathlen + 1) {
514 507 if (newlen == len + 1) {
515 508 Py_INCREF(pathobj);
516 509 return pathobj;
517 510 }
518 511
519 512 newobj = PyString_FromStringAndSize(NULL, newlen);
520 513
521 514 if (newobj) {
522 515 PyString_GET_SIZE(newobj)--;
523 516 basicencode(PyString_AS_STRING(newobj), newlen, path,
524 517 len + 1);
525 518 }
526 519 } else {
527 520 newobj = Py_None;
528 521 Py_INCREF(newobj);
529 522 }
530 523
531 524 return newobj;
532 525 }
General Comments 0
You need to be logged in to leave comments. Login now