diff --git a/rust/hg-core/Cargo.toml b/rust/hg-core/Cargo.toml --- a/rust/hg-core/Cargo.toml +++ b/rust/hg-core/Cargo.toml @@ -8,12 +8,10 @@ edition = "2018" [lib] name = "hg" -[dev-dependencies] -rand = "*" -rand_pcg = "*" - [dependencies] byteorder = "1.3.1" lazy_static = "1.3.0" memchr = "2.2.0" +rand = "> 0.6.4" +rand_pcg = "> 0.1.0" regex = "^1.1" diff --git a/rust/hg-core/src/discovery.rs b/rust/hg-core/src/discovery.rs --- a/rust/hg-core/src/discovery.rs +++ b/rust/hg-core/src/discovery.rs @@ -10,10 +10,16 @@ //! This is a Rust counterpart to the `partialdiscovery` class of //! `mercurial.setdiscovery` -use super::{Graph, GraphError, Revision}; +extern crate rand; +extern crate rand_pcg; +use self::rand::seq::SliceRandom; +use self::rand::{thread_rng, RngCore, SeedableRng}; +use super::{Graph, GraphError, Revision, NULL_REVISION}; use crate::ancestors::MissingAncestors; use crate::dagops; -use std::collections::HashSet; +use std::collections::{HashMap, HashSet, VecDeque}; + +type Rng = self::rand_pcg::Pcg32; pub struct PartialDiscovery { target_heads: Option>, @@ -21,12 +27,78 @@ pub struct PartialDiscovery, undecided: Option>, missing: HashSet, + rng: Rng, } pub struct DiscoveryStats { pub undecided: Option, } +/// Update an existing sample to match the expected size +/// +/// The sample is updated with revisions exponentially distant from each +/// element of `heads`. +/// +/// If a target size is specified, the sampling will stop once this size is +/// reached. Otherwise sampling will happen until roots of the set are +/// reached. +/// +/// - `revs`: set of revs we want to discover (if None, `assume` the whole dag +/// represented by `parentfn` +/// - `heads`: set of DAG head revs +/// - `sample`: a sample to update +/// - `parentfn`: a callable to resolve parents for a revision +/// - `quicksamplesize`: optional target size of the sample +fn update_sample( + revs: Option<&HashSet>, + heads: impl IntoIterator, + sample: &mut HashSet, + parentsfn: impl Fn(Revision) -> Result<[Revision; 2], GraphError>, + quicksamplesize: Option, +) -> Result<(), GraphError> { + let mut distances: HashMap = HashMap::new(); + let mut visit: VecDeque = heads.into_iter().collect(); + let mut factor: u32 = 1; + let mut seen: HashSet = HashSet::new(); + loop { + let current = match visit.pop_front() { + None => { + break; + } + Some(r) => r, + }; + if !seen.insert(current) { + continue; + } + + let d = *distances.entry(current).or_insert(1); + if d > factor { + factor *= 2; + } + if d == factor { + sample.insert(current); + if let Some(sz) = quicksamplesize { + if sample.len() >= sz { + return Ok(()); + } + } + } + for &p in &parentsfn(current)? { + if p == NULL_REVISION { + continue; + } + if let Some(revs) = revs { + if !revs.contains(&p) { + continue; + } + } + distances.entry(p).or_insert(d + 1); + visit.push_back(p); + } + } + Ok(()) +} + impl PartialDiscovery { /// Create a PartialDiscovery object, with the intent /// of comparing our `::` revset to the contents of another @@ -39,15 +111,47 @@ impl PartialDiscovery< /// we'll have to make it a type argument of `PartialDiscovery` or a trait /// object since we'll keep it in the meanwhile pub fn new(graph: G, target_heads: Vec) -> Self { + let mut seed: [u8; 16] = [0; 16]; + thread_rng().fill_bytes(&mut seed); + Self::new_with_seed(graph, target_heads, seed) + } + + pub fn new_with_seed( + graph: G, + target_heads: Vec, + seed: [u8; 16], + ) -> Self { PartialDiscovery { undecided: None, target_heads: Some(target_heads), graph: graph.clone(), common: MissingAncestors::new(graph, vec![]), missing: HashSet::new(), + rng: Rng::from_seed(seed), } } + /// Extract at most `size` random elements from sample and return them + /// as a vector + fn limit_sample( + &mut self, + mut sample: Vec, + size: usize, + ) -> Vec { + let sample_len = sample.len(); + if sample_len <= size { + return sample; + } + let rng = &mut self.rng; + let dropped_size = sample_len - size; + let limited_slice = if size < dropped_size { + sample.partial_shuffle(rng, size).0 + } else { + sample.partial_shuffle(rng, dropped_size).1 + }; + limited_slice.to_owned() + } + /// Register revisions known as being common pub fn add_common_revisions( &mut self, @@ -130,6 +234,32 @@ impl PartialDiscovery< undecided: self.undecided.as_ref().map(|s| s.len()), } } + + pub fn take_quick_sample( + &mut self, + headrevs: impl IntoIterator, + size: usize, + ) -> Result, GraphError> { + self.ensure_undecided()?; + let mut sample = { + let undecided = self.undecided.as_ref().unwrap(); + if undecided.len() <= size { + return Ok(undecided.iter().cloned().collect()); + } + dagops::heads(&self.graph, undecided.iter())? + }; + if sample.len() >= size { + return Ok(self.limit_sample(sample.into_iter().collect(), size)); + } + update_sample( + None, + headrevs, + &mut sample, + |r| self.graph.parents(r), + Some(size), + )?; + Ok(sample.into_iter().collect()) + } } #[cfg(test)] @@ -138,8 +268,21 @@ mod tests { use crate::testing::SampleGraph; /// A PartialDiscovery as for pushing all the heads of `SampleGraph` + /// + /// To avoid actual randomness in tests, we give it a fixed random seed. fn full_disco() -> PartialDiscovery { - PartialDiscovery::new(SampleGraph, vec![10, 11, 12, 13]) + PartialDiscovery::new_with_seed( + SampleGraph, + vec![10, 11, 12, 13], + [0; 16], + ) + } + + /// A PartialDiscovery as for pushing the 12 head of `SampleGraph` + /// + /// To avoid actual randomness in tests, we give it a fixed random seed. + fn disco12() -> PartialDiscovery { + PartialDiscovery::new_with_seed(SampleGraph, vec![12], [0; 16]) } fn sorted_undecided( @@ -206,4 +349,45 @@ mod tests { assert_eq!(sorted_common_heads(&disco)?, vec![5, 11, 12]); Ok(()) } + + #[test] + fn test_limit_sample_no_need_to() { + let sample = vec![1, 2, 3, 4]; + assert_eq!(full_disco().limit_sample(sample, 10), vec![1, 2, 3, 4]); + } + + #[test] + fn test_limit_sample_less_than_half() { + assert_eq!(full_disco().limit_sample((1..6).collect(), 2), vec![4, 2]); + } + + #[test] + fn test_limit_sample_more_than_half() { + assert_eq!(full_disco().limit_sample((1..4).collect(), 2), vec![3, 2]); + } + + #[test] + fn test_quick_sample_enough_undecided_heads() -> Result<(), GraphError> { + let mut disco = full_disco(); + disco.undecided = Some((1..=13).collect()); + + let mut sample_vec = disco.take_quick_sample(vec![], 4)?; + sample_vec.sort(); + assert_eq!(sample_vec, vec![10, 11, 12, 13]); + Ok(()) + } + + #[test] + fn test_quick_sample_climbing_from_12() -> Result<(), GraphError> { + let mut disco = disco12(); + disco.ensure_undecided()?; + + let mut sample_vec = disco.take_quick_sample(vec![12], 4)?; + sample_vec.sort(); + // r12's only parent is r9, whose unique grand-parent through the + // diamond shape is r4. This ends there because the distance from r4 + // to the root is only 3. + assert_eq!(sample_vec, vec![4, 9, 12]); + Ok(()) + } }