##// END OF EJS Templates
rust-discovery: takefullsample() core implementation...
Georges Racinet -
r42966:8041a1b4 default
parent child Browse files
Show More
@@ -17,6 +17,7 b' use self::rand::{thread_rng, RngCore, Se'
17 use super::{Graph, GraphError, Revision, NULL_REVISION};
17 use super::{Graph, GraphError, Revision, NULL_REVISION};
18 use crate::ancestors::MissingAncestors;
18 use crate::ancestors::MissingAncestors;
19 use crate::dagops;
19 use crate::dagops;
20 use std::cmp::{max, min};
20 use std::collections::{HashMap, HashSet, VecDeque};
21 use std::collections::{HashMap, HashSet, VecDeque};
21
22
22 type Rng = self::rand_pcg::Pcg32;
23 type Rng = self::rand_pcg::Pcg32;
@@ -26,8 +27,10 b' pub struct PartialDiscovery<G: Graph + C'
26 graph: G, // plays the role of self._repo
27 graph: G, // plays the role of self._repo
27 common: MissingAncestors<G>,
28 common: MissingAncestors<G>,
28 undecided: Option<HashSet<Revision>>,
29 undecided: Option<HashSet<Revision>>,
30 children_cache: Option<HashMap<Revision, Vec<Revision>>>,
29 missing: HashSet<Revision>,
31 missing: HashSet<Revision>,
30 rng: Rng,
32 rng: Rng,
33 respect_size: bool,
31 }
34 }
32
35
33 pub struct DiscoveryStats {
36 pub struct DiscoveryStats {
@@ -49,13 +52,16 b' pub struct DiscoveryStats {'
49 /// - `sample`: a sample to update
52 /// - `sample`: a sample to update
50 /// - `parentfn`: a callable to resolve parents for a revision
53 /// - `parentfn`: a callable to resolve parents for a revision
51 /// - `quicksamplesize`: optional target size of the sample
54 /// - `quicksamplesize`: optional target size of the sample
52 fn update_sample(
55 fn update_sample<I>(
53 revs: Option<&HashSet<Revision>>,
56 revs: Option<&HashSet<Revision>>,
54 heads: impl IntoIterator<Item = Revision>,
57 heads: impl IntoIterator<Item = Revision>,
55 sample: &mut HashSet<Revision>,
58 sample: &mut HashSet<Revision>,
56 parentsfn: impl Fn(Revision) -> Result<[Revision; 2], GraphError>,
59 parentsfn: impl Fn(Revision) -> Result<I, GraphError>,
57 quicksamplesize: Option<usize>,
60 quicksamplesize: Option<usize>,
58 ) -> Result<(), GraphError> {
61 ) -> Result<(), GraphError>
62 where
63 I: Iterator<Item = Revision>,
64 {
59 let mut distances: HashMap<Revision, u32> = HashMap::new();
65 let mut distances: HashMap<Revision, u32> = HashMap::new();
60 let mut visit: VecDeque<Revision> = heads.into_iter().collect();
66 let mut visit: VecDeque<Revision> = heads.into_iter().collect();
61 let mut factor: u32 = 1;
67 let mut factor: u32 = 1;
@@ -83,10 +89,7 b' fn update_sample('
83 }
89 }
84 }
90 }
85 }
91 }
86 for &p in &parentsfn(current)? {
92 for p in parentsfn(current)? {
87 if p == NULL_REVISION {
88 continue;
89 }
90 if let Some(revs) = revs {
93 if let Some(revs) = revs {
91 if !revs.contains(&p) {
94 if !revs.contains(&p) {
92 continue;
95 continue;
@@ -99,6 +102,39 b' fn update_sample('
99 Ok(())
102 Ok(())
100 }
103 }
101
104
105 struct ParentsIterator {
106 parents: [Revision; 2],
107 cur: usize,
108 }
109
110 impl ParentsIterator {
111 fn graph_parents(
112 graph: &impl Graph,
113 r: Revision,
114 ) -> Result<ParentsIterator, GraphError> {
115 Ok(ParentsIterator {
116 parents: graph.parents(r)?,
117 cur: 0,
118 })
119 }
120 }
121
122 impl Iterator for ParentsIterator {
123 type Item = Revision;
124
125 fn next(&mut self) -> Option<Revision> {
126 if self.cur > 1 {
127 return None;
128 }
129 let rev = self.parents[self.cur];
130 self.cur += 1;
131 if rev == NULL_REVISION {
132 return self.next();
133 }
134 Some(rev)
135 }
136 }
137
102 impl<G: Graph + Clone> PartialDiscovery<G> {
138 impl<G: Graph + Clone> PartialDiscovery<G> {
103 /// Create a PartialDiscovery object, with the intent
139 /// Create a PartialDiscovery object, with the intent
104 /// of comparing our `::<target_heads>` revset to the contents of another
140 /// of comparing our `::<target_heads>` revset to the contents of another
@@ -110,24 +146,36 b' impl<G: Graph + Clone> PartialDiscovery<'
110 /// If we want to make the signature more flexible,
146 /// If we want to make the signature more flexible,
111 /// we'll have to make it a type argument of `PartialDiscovery` or a trait
147 /// we'll have to make it a type argument of `PartialDiscovery` or a trait
112 /// object since we'll keep it in the meanwhile
148 /// object since we'll keep it in the meanwhile
113 pub fn new(graph: G, target_heads: Vec<Revision>) -> Self {
149 ///
150 /// The `respect_size` boolean controls how the sampling methods
151 /// will interpret the size argument requested by the caller. If it's
152 /// `false`, they are allowed to produce a sample whose size is more
153 /// appropriate to the situation (typically bigger).
154 pub fn new(
155 graph: G,
156 target_heads: Vec<Revision>,
157 respect_size: bool,
158 ) -> Self {
114 let mut seed: [u8; 16] = [0; 16];
159 let mut seed: [u8; 16] = [0; 16];
115 thread_rng().fill_bytes(&mut seed);
160 thread_rng().fill_bytes(&mut seed);
116 Self::new_with_seed(graph, target_heads, seed)
161 Self::new_with_seed(graph, target_heads, seed, respect_size)
117 }
162 }
118
163
119 pub fn new_with_seed(
164 pub fn new_with_seed(
120 graph: G,
165 graph: G,
121 target_heads: Vec<Revision>,
166 target_heads: Vec<Revision>,
122 seed: [u8; 16],
167 seed: [u8; 16],
168 respect_size: bool,
123 ) -> Self {
169 ) -> Self {
124 PartialDiscovery {
170 PartialDiscovery {
125 undecided: None,
171 undecided: None,
172 children_cache: None,
126 target_heads: Some(target_heads),
173 target_heads: Some(target_heads),
127 graph: graph.clone(),
174 graph: graph.clone(),
128 common: MissingAncestors::new(graph, vec![]),
175 common: MissingAncestors::new(graph, vec![]),
129 missing: HashSet::new(),
176 missing: HashSet::new(),
130 rng: Rng::from_seed(seed),
177 rng: Rng::from_seed(seed),
178 respect_size: respect_size,
131 }
179 }
132 }
180 }
133
181
@@ -228,6 +276,22 b' impl<G: Graph + Clone> PartialDiscovery<'
228 Ok(())
276 Ok(())
229 }
277 }
230
278
279 fn ensure_children_cache(&mut self) -> Result<(), GraphError> {
280 if self.children_cache.is_some() {
281 return Ok(());
282 }
283 self.ensure_undecided()?;
284
285 let mut children: HashMap<Revision, Vec<Revision>> = HashMap::new();
286 for &rev in self.undecided.as_ref().unwrap() {
287 for p in ParentsIterator::graph_parents(&self.graph, rev)? {
288 children.entry(p).or_insert_with(|| Vec::new()).push(rev);
289 }
290 }
291 self.children_cache = Some(children);
292 Ok(())
293 }
294
231 /// Provide statistics about the current state of the discovery process
295 /// Provide statistics about the current state of the discovery process
232 pub fn stats(&self) -> DiscoveryStats {
296 pub fn stats(&self) -> DiscoveryStats {
233 DiscoveryStats {
297 DiscoveryStats {
@@ -255,11 +319,114 b' impl<G: Graph + Clone> PartialDiscovery<'
255 None,
319 None,
256 headrevs,
320 headrevs,
257 &mut sample,
321 &mut sample,
258 |r| self.graph.parents(r),
322 |r| ParentsIterator::graph_parents(&self.graph, r),
259 Some(size),
323 Some(size),
260 )?;
324 )?;
261 Ok(sample.into_iter().collect())
325 Ok(sample.into_iter().collect())
262 }
326 }
327
328 /// Extract a sample from `self.undecided`, going from its heads and roots.
329 ///
330 /// The `size` parameter is used to avoid useless computations if
331 /// it turns out to be bigger than the whole set of undecided Revisions.
332 ///
333 /// The sample is taken by using `update_sample` from the heads, then
334 /// from the roots, working on the reverse DAG,
335 /// expressed by `self.children_cache`.
336 ///
337 /// No effort is being made to complete or limit the sample to `size`
338 /// but this method returns another interesting size that it derives
339 /// from its knowledge of the structure of the various sets, leaving
340 /// to the caller the decision to use it or not.
341 fn bidirectional_sample(
342 &mut self,
343 size: usize,
344 ) -> Result<(HashSet<Revision>, usize), GraphError> {
345 self.ensure_undecided()?;
346 {
347 // we don't want to compute children_cache before this
348 // but doing it after extracting self.undecided takes a mutable
349 // ref to self while a shareable one is still active.
350 let undecided = self.undecided.as_ref().unwrap();
351 if undecided.len() <= size {
352 return Ok((undecided.clone(), size));
353 }
354 }
355
356 self.ensure_children_cache()?;
357 let revs = self.undecided.as_ref().unwrap();
358 let mut sample: HashSet<Revision> = revs.clone();
359
360 // it's possible that leveraging the children cache would be more
361 // efficient here
362 dagops::retain_heads(&self.graph, &mut sample)?;
363 let revsheads = sample.clone(); // was again heads(revs) in python
364
365 // update from heads
366 update_sample(
367 Some(revs),
368 revsheads.iter().cloned(),
369 &mut sample,
370 |r| ParentsIterator::graph_parents(&self.graph, r),
371 None,
372 )?;
373
374 // update from roots
375 let revroots: HashSet<Revision> =
376 dagops::roots(&self.graph, revs)?.into_iter().collect();
377 let prescribed_size = max(size, min(revroots.len(), revsheads.len()));
378
379 let children = self.children_cache.as_ref().unwrap();
380 let empty_vec: Vec<Revision> = Vec::new();
381 update_sample(
382 Some(revs),
383 revroots,
384 &mut sample,
385 |r| Ok(children.get(&r).unwrap_or(&empty_vec).iter().cloned()),
386 None,
387 )?;
388 Ok((sample, prescribed_size))
389 }
390
391 /// Fill up sample up to the wished size with random undecided Revisions.
392 ///
393 /// This is intended to be used as a last resort completion if the
394 /// regular sampling algorithm returns too few elements.
395 fn random_complete_sample(
396 &mut self,
397 sample: &mut Vec<Revision>,
398 size: usize,
399 ) {
400 let sample_len = sample.len();
401 if size <= sample_len {
402 return;
403 }
404 let take_from: Vec<Revision> = self
405 .undecided
406 .as_ref()
407 .unwrap()
408 .iter()
409 .filter(|&r| !sample.contains(r))
410 .cloned()
411 .collect();
412 sample.extend(self.limit_sample(take_from, size - sample_len));
413 }
414
415 pub fn take_full_sample(
416 &mut self,
417 size: usize,
418 ) -> Result<Vec<Revision>, GraphError> {
419 let (sample_set, prescribed_size) = self.bidirectional_sample(size)?;
420 let size = if self.respect_size {
421 size
422 } else {
423 prescribed_size
424 };
425 let mut sample =
426 self.limit_sample(sample_set.into_iter().collect(), size);
427 self.random_complete_sample(&mut sample, size);
428 Ok(sample)
429 }
263 }
430 }
264
431
265 #[cfg(test)]
432 #[cfg(test)]
@@ -275,6 +442,7 b' mod tests {'
275 SampleGraph,
442 SampleGraph,
276 vec![10, 11, 12, 13],
443 vec![10, 11, 12, 13],
277 [0; 16],
444 [0; 16],
445 true,
278 )
446 )
279 }
447 }
280
448
@@ -282,7 +450,7 b' mod tests {'
282 ///
450 ///
283 /// To avoid actual randomness in tests, we give it a fixed random seed.
451 /// To avoid actual randomness in tests, we give it a fixed random seed.
284 fn disco12() -> PartialDiscovery<SampleGraph> {
452 fn disco12() -> PartialDiscovery<SampleGraph> {
285 PartialDiscovery::new_with_seed(SampleGraph, vec![12], [0; 16])
453 PartialDiscovery::new_with_seed(SampleGraph, vec![12], [0; 16], true)
286 }
454 }
287
455
288 fn sorted_undecided(
456 fn sorted_undecided(
@@ -390,4 +558,58 b' mod tests {'
390 assert_eq!(sample_vec, vec![4, 9, 12]);
558 assert_eq!(sample_vec, vec![4, 9, 12]);
391 Ok(())
559 Ok(())
392 }
560 }
561
562 #[test]
563 fn test_children_cache() -> Result<(), GraphError> {
564 let mut disco = full_disco();
565 disco.ensure_children_cache()?;
566
567 let cache = disco.children_cache.unwrap();
568 assert_eq!(cache.get(&2).cloned(), Some(vec![4]));
569 assert_eq!(cache.get(&10).cloned(), None);
570
571 let mut children_4 = cache.get(&4).cloned().unwrap();
572 children_4.sort();
573 assert_eq!(children_4, vec![5, 6, 7]);
574
575 let mut children_7 = cache.get(&7).cloned().unwrap();
576 children_7.sort();
577 assert_eq!(children_7, vec![9, 11]);
578
579 Ok(())
580 }
581
582 #[test]
583 fn test_complete_sample() {
584 let mut disco = full_disco();
585 let undecided: HashSet<Revision> =
586 [4, 7, 9, 2, 3].iter().cloned().collect();
587 disco.undecided = Some(undecided);
588
589 let mut sample = vec![0];
590 disco.random_complete_sample(&mut sample, 3);
591 assert_eq!(sample.len(), 3);
592
593 let mut sample = vec![2, 4, 7];
594 disco.random_complete_sample(&mut sample, 1);
595 assert_eq!(sample.len(), 3);
596 }
597
598 #[test]
599 fn test_bidirectional_sample() -> Result<(), GraphError> {
600 let mut disco = full_disco();
601 disco.undecided = Some((0..=13).into_iter().collect());
602
603 let (sample_set, size) = disco.bidirectional_sample(7)?;
604 assert_eq!(size, 7);
605 let mut sample: Vec<Revision> = sample_set.into_iter().collect();
606 sample.sort();
607 // our DAG is a bit too small for the results to be really interesting
608 // at least it shows that
609 // - we went both ways
610 // - we didn't take all Revisions (6 is not in the sample)
611 assert_eq!(sample, vec![0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13]);
612 Ok(())
613 }
614
393 }
615 }
@@ -36,7 +36,7 b' py_class!(pub class PartialDiscovery |py'
36 _cls,
36 _cls,
37 repo: PyObject,
37 repo: PyObject,
38 targetheads: PyObject,
38 targetheads: PyObject,
39 _respectsize: bool
39 respectsize: bool
40 ) -> PyResult<PartialDiscovery> {
40 ) -> PyResult<PartialDiscovery> {
41 let index = repo.getattr(py, "changelog")?.getattr(py, "index")?;
41 let index = repo.getattr(py, "changelog")?.getattr(py, "index")?;
42 Self::create_instance(
42 Self::create_instance(
@@ -44,6 +44,7 b' py_class!(pub class PartialDiscovery |py'
44 RefCell::new(Box::new(CorePartialDiscovery::new(
44 RefCell::new(Box::new(CorePartialDiscovery::new(
45 Index::new(py, index)?,
45 Index::new(py, index)?,
46 rev_pyiter_collect(py, &targetheads)?,
46 rev_pyiter_collect(py, &targetheads)?,
47 respectsize
47 )))
48 )))
48 )
49 )
49 }
50 }
General Comments 0
You need to be logged in to leave comments. Login now