##// END OF EJS Templates
rhg: in path_encode, be a bit more conservative about memory usage...
Arseniy Alekseyev -
r51061:6ea3b1ac default
parent child Browse files
Show More
@@ -1,634 +1,635 b''
1 1 use sha1::{Digest, Sha1};
2 2
/// States of the per-byte state machine run by `_encode`.
///
/// Most states record how much of a special prefix has been matched at the
/// start of a path component (the text in quotes next to each variant); the
/// remaining ones track dots and spaces that may need escaping.
#[derive(PartialEq, Debug)]
#[allow(non_camel_case_types)]
#[allow(clippy::upper_case_acronyms)]
enum path_state {
    START, /* first byte of a path component */
    A,     /* "AUX" */
    AU,
    THIRD, /* third of a 3-byte sequence, e.g. "AUX", "NUL" */
    C,     /* "CON" or "COMn" */
    CO,
    COMLPT, /* "COM" or "LPT" */
    COMLPTn,
    L,
    LP,
    N,
    NU,
    P, /* "PRN" */
    PR,
    LDOT,  /* leading '.' */
    DOT,   /* '.' in a non-leading position */
    H,     /* ".h" */
    HGDI,  /* ".hg", ".d", or ".i" */
    SPACE, /* ' ' in a non-leading position, not yet written out */
    DEFAULT, /* byte of a path component after the first */
}
28 28
/* state machine for dir-encoding */
/// States used by `encode_dir` while scanning for a `.hg`, `.d` or `.i`
/// suffix directly in front of a `/`.
#[allow(non_camel_case_types)]
#[allow(clippy::upper_case_acronyms)]
enum dir_state {
    DDOT,     /* just saw '.' */
    DH,       /* just saw ".h" */
    DHGDI,    /* just saw ".hg", ".d" or ".i" */
    DDEFAULT, /* anything else */
}
38 38
/// Destination for encoded output.
///
/// The encoders in this file are generic over `Sink`, so the same code can
/// write into a fixed buffer, append to a `Vec<u8>`, or merely count bytes.
trait Sink {
    /// Appends the single byte `c`.
    fn write_byte(&mut self, c: u8);
    /// Appends every byte of `c`, in order.
    fn write_bytes(&mut self, c: &[u8]);
}
43 43
/// Tests whether byte `c` is a member of the 256-bit set `bitset`.
///
/// The set is stored as eight 32-bit words: the top three bits of `c` pick
/// the word and the low five bits pick the bit inside that word.
fn inset(bitset: &[u32; 8], c: u8) -> bool {
    let word = bitset[usize::from(c >> 5)];
    let mask = 1u32 << (c & 31);
    word & mask != 0
}
47 47
/// Capacity, in bytes, of each fixed-size scratch buffer used by
/// `hash_encode`, which refuses inputs whose `(len - 5) * 3` reaches it.
const MAXENCODE: usize = 4096 * 4;
49 49
/// Fixed-capacity output buffer of `N` bytes that implements `Sink`.
struct DestArr<const N: usize> {
    /// Backing storage; only the first `len` bytes hold written data.
    buf: [u8; N],
    /// Number of bytes written so far.
    pub len: usize,
}
54 54
55 55 impl<const N: usize> DestArr<N> {
56 56 pub fn create() -> Self {
57 57 DestArr {
58 58 buf: [0; N],
59 59 len: 0,
60 60 }
61 61 }
62 62
63 63 pub fn contents(&self) -> &[u8] {
64 64 &self.buf[..self.len]
65 65 }
66 66 }
67 67
68 68 impl<const N: usize> Sink for DestArr<N> {
69 69 fn write_byte(&mut self, c: u8) {
70 70 self.buf[self.len] = c;
71 71 self.len += 1;
72 72 }
73 73
74 74 fn write_bytes(&mut self, src: &[u8]) {
75 75 self.buf[self.len..self.len + src.len()].copy_from_slice(src);
76 76 self.len += src.len();
77 77 }
78 78 }
79 79
/// A `Sink` that discards its input and only counts how many bytes were
/// written, so an encoding's length can be measured without allocating.
struct MeasureDest {
    /// Total number of bytes "written" so far.
    pub len: usize,
}
83 83
84 84 impl Sink for Vec<u8> {
85 85 fn write_byte(&mut self, c: u8) {
86 86 self.push(c)
87 87 }
88 88
89 89 fn write_bytes(&mut self, src: &[u8]) {
90 90 self.extend_from_slice(src)
91 91 }
92 92 }
93 93
94 94 impl MeasureDest {
95 95 fn create() -> Self {
96 96 Self { len: 0 }
97 97 }
98 98 }
99 99
100 100 impl Sink for MeasureDest {
101 101 fn write_byte(&mut self, _c: u8) {
102 102 self.len += 1;
103 103 }
104 104
105 105 fn write_bytes(&mut self, src: &[u8]) {
106 106 self.len += src.len();
107 107 }
108 108 }
109 109
110 110 fn hexencode(dest: &mut impl Sink, c: u8) {
111 111 let hexdigit = b"0123456789abcdef";
112 112 dest.write_byte(hexdigit[(c as usize) >> 4]);
113 113 dest.write_byte(hexdigit[(c as usize) & 15]);
114 114 }
115 115
/* 3-byte escape: tilde followed by two hex digits */
/// Writes `c` to `dest` as `~` followed by its two-digit lowercase hex
/// value, e.g. `b'.'` (0x2e) becomes `~2e`.
fn escape3(dest: &mut impl Sink, c: u8) {
    dest.write_byte(b'~');
    hexencode(dest, c);
}
121 121
122 122 fn encode_dir(dest: &mut impl Sink, src: &[u8]) {
123 123 let mut state = dir_state::DDEFAULT;
124 124 let mut i = 0;
125 125
126 126 while i < src.len() {
127 127 match state {
128 128 dir_state::DDOT => match src[i] {
129 129 b'd' | b'i' => {
130 130 state = dir_state::DHGDI;
131 131 dest.write_byte(src[i]);
132 132 i += 1;
133 133 }
134 134 b'h' => {
135 135 state = dir_state::DH;
136 136 dest.write_byte(src[i]);
137 137 i += 1;
138 138 }
139 139 _ => {
140 140 state = dir_state::DDEFAULT;
141 141 }
142 142 },
143 143 dir_state::DH => {
144 144 if src[i] == b'g' {
145 145 state = dir_state::DHGDI;
146 146 dest.write_byte(src[i]);
147 147 i += 1;
148 148 } else {
149 149 state = dir_state::DDEFAULT;
150 150 }
151 151 }
152 152 dir_state::DHGDI => {
153 153 if src[i] == b'/' {
154 154 dest.write_bytes(b".hg");
155 155 dest.write_byte(src[i]);
156 156 i += 1;
157 157 }
158 158 state = dir_state::DDEFAULT;
159 159 }
160 160 dir_state::DDEFAULT => {
161 161 if src[i] == b'.' {
162 162 state = dir_state::DDOT
163 163 }
164 164 dest.write_byte(src[i]);
165 165 i += 1;
166 166 }
167 167 }
168 168 }
169 169 }
170 170
171 171 fn _encode(
172 172 twobytes: &[u32; 8],
173 173 onebyte: &[u32; 8],
174 174 dest: &mut impl Sink,
175 175 src: &[u8],
176 176 encodedir: bool,
177 177 ) {
178 178 let mut state = path_state::START;
179 179 let mut i = 0;
180 180 let len = src.len();
181 181
182 182 while i < len {
183 183 match state {
184 184 path_state::START => match src[i] {
185 185 b'/' => {
186 186 dest.write_byte(src[i]);
187 187 i += 1;
188 188 }
189 189 b'.' => {
190 190 state = path_state::LDOT;
191 191 escape3(dest, src[i]);
192 192 i += 1;
193 193 }
194 194 b' ' => {
195 195 state = path_state::DEFAULT;
196 196 escape3(dest, src[i]);
197 197 i += 1;
198 198 }
199 199 b'a' => {
200 200 state = path_state::A;
201 201 dest.write_byte(src[i]);
202 202 i += 1;
203 203 }
204 204 b'c' => {
205 205 state = path_state::C;
206 206 dest.write_byte(src[i]);
207 207 i += 1;
208 208 }
209 209 b'l' => {
210 210 state = path_state::L;
211 211 dest.write_byte(src[i]);
212 212 i += 1;
213 213 }
214 214 b'n' => {
215 215 state = path_state::N;
216 216 dest.write_byte(src[i]);
217 217 i += 1;
218 218 }
219 219 b'p' => {
220 220 state = path_state::P;
221 221 dest.write_byte(src[i]);
222 222 i += 1;
223 223 }
224 224 _ => {
225 225 state = path_state::DEFAULT;
226 226 }
227 227 },
228 228 path_state::A => {
229 229 if src[i] == b'u' {
230 230 state = path_state::AU;
231 231 dest.write_byte(src[i]);
232 232 i += 1;
233 233 } else {
234 234 state = path_state::DEFAULT;
235 235 }
236 236 }
237 237 path_state::AU => {
238 238 if src[i] == b'x' {
239 239 state = path_state::THIRD;
240 240 i += 1;
241 241 } else {
242 242 state = path_state::DEFAULT;
243 243 }
244 244 }
245 245 path_state::THIRD => {
246 246 state = path_state::DEFAULT;
247 247 match src[i] {
248 248 b'.' | b'/' | b'\0' => escape3(dest, src[i - 1]),
249 249 _ => i -= 1,
250 250 }
251 251 }
252 252 path_state::C => {
253 253 if src[i] == b'o' {
254 254 state = path_state::CO;
255 255 dest.write_byte(src[i]);
256 256 i += 1;
257 257 } else {
258 258 state = path_state::DEFAULT;
259 259 }
260 260 }
261 261 path_state::CO => {
262 262 if src[i] == b'm' {
263 263 state = path_state::COMLPT;
264 264 i += 1;
265 265 } else if src[i] == b'n' {
266 266 state = path_state::THIRD;
267 267 i += 1;
268 268 } else {
269 269 state = path_state::DEFAULT;
270 270 }
271 271 }
272 272 path_state::COMLPT => {
273 273 if src[i] >= b'1' && src[i] <= b'9' {
274 274 state = path_state::COMLPTn;
275 275 i += 1;
276 276 } else {
277 277 state = path_state::DEFAULT;
278 278 dest.write_byte(src[i - 1]);
279 279 }
280 280 }
281 281 path_state::COMLPTn => {
282 282 state = path_state::DEFAULT;
283 283 match src[i] {
284 284 b'.' | b'/' | b'\0' => {
285 285 escape3(dest, src[i - 2]);
286 286 dest.write_byte(src[i - 1]);
287 287 }
288 288 _ => {
289 289 dest.write_bytes(&src[i - 2..i]);
290 290 }
291 291 }
292 292 }
293 293 path_state::L => {
294 294 if src[i] == b'p' {
295 295 state = path_state::LP;
296 296 dest.write_byte(src[i]);
297 297 i += 1;
298 298 } else {
299 299 state = path_state::DEFAULT;
300 300 }
301 301 }
302 302 path_state::LP => {
303 303 if src[i] == b't' {
304 304 state = path_state::COMLPT;
305 305 i += 1;
306 306 } else {
307 307 state = path_state::DEFAULT;
308 308 }
309 309 }
310 310 path_state::N => {
311 311 if src[i] == b'u' {
312 312 state = path_state::NU;
313 313 dest.write_byte(src[i]);
314 314 i += 1;
315 315 } else {
316 316 state = path_state::DEFAULT;
317 317 }
318 318 }
319 319 path_state::NU => {
320 320 if src[i] == b'l' {
321 321 state = path_state::THIRD;
322 322 i += 1;
323 323 } else {
324 324 state = path_state::DEFAULT;
325 325 }
326 326 }
327 327 path_state::P => {
328 328 if src[i] == b'r' {
329 329 state = path_state::PR;
330 330 dest.write_byte(src[i]);
331 331 i += 1;
332 332 } else {
333 333 state = path_state::DEFAULT;
334 334 }
335 335 }
336 336 path_state::PR => {
337 337 if src[i] == b'n' {
338 338 state = path_state::THIRD;
339 339 i += 1;
340 340 } else {
341 341 state = path_state::DEFAULT;
342 342 }
343 343 }
344 344 path_state::LDOT => match src[i] {
345 345 b'd' | b'i' => {
346 346 state = path_state::HGDI;
347 347 dest.write_byte(src[i]);
348 348 i += 1;
349 349 }
350 350 b'h' => {
351 351 state = path_state::H;
352 352 dest.write_byte(src[i]);
353 353 i += 1;
354 354 }
355 355 _ => {
356 356 state = path_state::DEFAULT;
357 357 }
358 358 },
359 359 path_state::DOT => match src[i] {
360 360 b'/' | b'\0' => {
361 361 state = path_state::START;
362 362 dest.write_bytes(b"~2e");
363 363 dest.write_byte(src[i]);
364 364 i += 1;
365 365 }
366 366 b'd' | b'i' => {
367 367 state = path_state::HGDI;
368 368 dest.write_byte(b'.');
369 369 dest.write_byte(src[i]);
370 370 i += 1;
371 371 }
372 372 b'h' => {
373 373 state = path_state::H;
374 374 dest.write_bytes(b".h");
375 375 i += 1;
376 376 }
377 377 _ => {
378 378 state = path_state::DEFAULT;
379 379 dest.write_byte(b'.');
380 380 }
381 381 },
382 382 path_state::H => {
383 383 if src[i] == b'g' {
384 384 state = path_state::HGDI;
385 385 dest.write_byte(src[i]);
386 386 i += 1;
387 387 } else {
388 388 state = path_state::DEFAULT;
389 389 }
390 390 }
391 391 path_state::HGDI => {
392 392 if src[i] == b'/' {
393 393 state = path_state::START;
394 394 if encodedir {
395 395 dest.write_bytes(b".hg");
396 396 }
397 397 dest.write_byte(src[i]);
398 398 i += 1
399 399 } else {
400 400 state = path_state::DEFAULT;
401 401 }
402 402 }
403 403 path_state::SPACE => match src[i] {
404 404 b'/' | b'\0' => {
405 405 state = path_state::START;
406 406 dest.write_bytes(b"~20");
407 407 dest.write_byte(src[i]);
408 408 i += 1;
409 409 }
410 410 _ => {
411 411 state = path_state::DEFAULT;
412 412 dest.write_byte(b' ');
413 413 }
414 414 },
415 415 path_state::DEFAULT => {
416 416 while i != len && inset(onebyte, src[i]) {
417 417 dest.write_byte(src[i]);
418 418 i += 1;
419 419 }
420 420 if i == len {
421 421 break;
422 422 }
423 423 match src[i] {
424 424 b'.' => {
425 425 state = path_state::DOT;
426 426 i += 1
427 427 }
428 428 b' ' => {
429 429 state = path_state::SPACE;
430 430 i += 1
431 431 }
432 432 b'/' => {
433 433 state = path_state::START;
434 434 dest.write_byte(b'/');
435 435 i += 1;
436 436 }
437 437 _ => {
438 438 if inset(onebyte, src[i]) {
439 439 loop {
440 440 dest.write_byte(src[i]);
441 441 i += 1;
442 442 if !(i < len && inset(onebyte, src[i])) {
443 443 break;
444 444 }
445 445 }
446 446 } else if inset(twobytes, src[i]) {
447 447 let c = src[i];
448 448 i += 1;
449 449 dest.write_byte(b'_');
450 450 dest.write_byte(if c == b'_' {
451 451 b'_'
452 452 } else {
453 453 c + 32
454 454 });
455 455 } else {
456 456 escape3(dest, src[i]);
457 457 i += 1;
458 458 }
459 459 }
460 460 }
461 461 }
462 462 }
463 463 }
464 464 match state {
465 465 path_state::START => (),
466 466 path_state::A => (),
467 467 path_state::AU => (),
468 468 path_state::THIRD => escape3(dest, src[i - 1]),
469 469 path_state::C => (),
470 470 path_state::CO => (),
471 471 path_state::COMLPT => dest.write_byte(src[i - 1]),
472 472 path_state::COMLPTn => {
473 473 escape3(dest, src[i - 2]);
474 474 dest.write_byte(src[i - 1]);
475 475 }
476 476 path_state::L => (),
477 477 path_state::LP => (),
478 478 path_state::N => (),
479 479 path_state::NU => (),
480 480 path_state::P => (),
481 481 path_state::PR => (),
482 482 path_state::LDOT => (),
483 483 path_state::DOT => {
484 484 dest.write_bytes(b"~2e");
485 485 }
486 486 path_state::H => (),
487 487 path_state::HGDI => (),
488 488 path_state::SPACE => {
489 489 dest.write_bytes(b"~20");
490 490 }
491 491 path_state::DEFAULT => (),
492 492 }
493 493 }
494 494
495 495 fn basic_encode(dest: &mut impl Sink, src: &[u8]) {
496 496 let twobytes: [u32; 8] = [0, 0, 0x87ff_fffe, 0, 0, 0, 0, 0];
497 497 let onebyte: [u32; 8] =
498 498 [1, 0x2bff_3bfa, 0x6800_0001, 0x2fff_ffff, 0, 0, 0, 0];
499 499 _encode(&twobytes, &onebyte, dest, src, true)
500 500 }
501 501
/// Longest path, in bytes, that `path_encode` keeps in its plain encoded
/// form; anything longer is hash-encoded. `hash_mangle` also uses it as the
/// length budget for its output.
const MAXSTOREPATHLEN: usize = 120;
503 503
504 504 fn lower_encode(dest: &mut impl Sink, src: &[u8]) {
505 505 let onebyte: [u32; 8] =
506 506 [1, 0x2bff_fbfb, 0xe800_0001, 0x2fff_ffff, 0, 0, 0, 0];
507 507 let lower: [u32; 8] = [0, 0, 0x07ff_fffe, 0, 0, 0, 0, 0];
508 508 for c in src {
509 509 if inset(&onebyte, *c) {
510 510 dest.write_byte(*c)
511 511 } else if inset(&lower, *c) {
512 512 dest.write_byte(*c + 32)
513 513 } else {
514 514 escape3(dest, *c)
515 515 }
516 516 }
517 517 }
518 518
519 519 fn aux_encode(dest: &mut impl Sink, src: &[u8]) {
520 520 let twobytes = [0; 8];
521 521 let onebyte: [u32; 8] = [!0, 0xffff_3ffe, !0, !0, !0, !0, !0, !0];
522 522 _encode(&twobytes, &onebyte, dest, src, false)
523 523 }
524 524
525 525 fn hash_mangle(src: &[u8], sha: &[u8]) -> Vec<u8> {
526 526 let dirprefixlen = 8;
527 527 let maxshortdirslen = 68;
528 528
529 529 let last_slash = src.iter().rposition(|b| *b == b'/');
530 530 let last_dot: Option<usize> = {
531 531 let s = last_slash.unwrap_or(0);
532 532 src[s..].iter().rposition(|b| *b == b'.').map(|i| i + s)
533 533 };
534 534
535 535 let mut dest = Vec::with_capacity(MAXSTOREPATHLEN);
536 536 dest.write_bytes(b"dh/");
537 537
538 538 if let Some(last_slash) = last_slash {
539 539 for slice in src[..last_slash].split(|b| *b == b'/') {
540 540 let slice = &slice[..std::cmp::min(slice.len(), dirprefixlen)];
541 541 if dest.len() + slice.len() > maxshortdirslen + 3 {
542 542 break;
543 543 } else {
544 544 dest.write_bytes(slice);
545 545 }
546 546 dest.write_byte(b'/');
547 547 }
548 548 }
549 549
550 550 let used = dest.len() + 40 + {
551 551 if let Some(l) = last_dot {
552 552 src.len() - l
553 553 } else {
554 554 0
555 555 }
556 556 };
557 557
558 558 if MAXSTOREPATHLEN > used {
559 559 let slop = MAXSTOREPATHLEN - used;
560 560 let basenamelen = match last_slash {
561 561 Some(l) => src.len() - l - 1,
562 562 None => src.len(),
563 563 };
564 564 let basenamelen = std::cmp::min(basenamelen, slop);
565 565 if basenamelen > 0 {
566 566 let start = match last_slash {
567 567 Some(l) => l + 1,
568 568 None => 0,
569 569 };
570 570 dest.write_bytes(&src[start..][..basenamelen])
571 571 }
572 572 }
573 573 for c in sha {
574 574 hexencode(&mut dest, *c);
575 575 }
576 576 if let Some(l) = last_dot {
577 577 dest.write_bytes(&src[l..]);
578 578 }
579 dest.shrink_to_fit();
579 580 dest
580 581 }
581 582
582 583 fn hash_encode(src: &[u8]) -> Vec<u8> {
583 584 let mut dired: DestArr<MAXENCODE> = DestArr::create();
584 585 let mut lowered: DestArr<MAXENCODE> = DestArr::create();
585 586 let mut auxed: DestArr<MAXENCODE> = DestArr::create();
586 587 let baselen = (src.len() - 5) * 3;
587 588 if baselen >= MAXENCODE {
588 589 panic!("path_encode::hash_encore: string too long: {}", baselen)
589 590 };
590 591 encode_dir(&mut dired, src);
591 592 let sha = Sha1::digest(dired.contents());
592 593 lower_encode(&mut lowered, &dired.contents()[5..]);
593 594 aux_encode(&mut auxed, lowered.contents());
594 595 hash_mangle(auxed.contents(), &sha)
595 596 }
596 597
597 598 pub fn path_encode(path: &[u8]) -> Vec<u8> {
598 599 let newlen = if path.len() <= MAXSTOREPATHLEN {
599 600 let mut measure = MeasureDest::create();
600 601 basic_encode(&mut measure, path);
601 602 measure.len
602 603 } else {
603 604 return hash_encode(path);
604 605 };
605 606 if newlen <= MAXSTOREPATHLEN {
606 607 if newlen == path.len() {
607 608 path.to_vec()
608 609 } else {
609 610 let mut dest = Vec::with_capacity(newlen);
610 611 basic_encode(&mut dest, path);
611 612 assert!(dest.len() == newlen);
612 613 dest
613 614 }
614 615 } else {
615 616 hash_encode(path)
616 617 }
617 618 }
618 619
#[cfg(test)]
mod tests {
    use super::*;
    use crate::utils::hg_path::HgPathBuf;

    /// Checks the hashed encoding of a long file name that has no directory
    /// component after the `data/` prefix. The result is expected to keep
    /// the lower-cased basename (including its `.i`) before the digest and
    /// repeat the `.i` extension after it.
    #[test]
    fn test_long_filename_at_root() {
        let input = b"data/ABCDEFGHIJABCDEFGHIJABCDEFGHIJABCDEFGHIJABCDEFGHIJABCDEFGHIJ.i";
        let expected = b"dh/abcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghij.i708243a2237a7afae259ea3545a72a2ef11c247b.i";
        let res = path_encode(input);
        // Compared as HgPathBuf rather than as raw byte slices.
        assert_eq!(
            HgPathBuf::from_bytes(&res),
            HgPathBuf::from_bytes(expected)
        );
    }
}
General Comments 0
You need to be logged in to leave comments. Login now