##// END OF EJS Templates
rust-discovery: exposing sampling to python...
rust-discovery: exposing sampling to python Differential Revision: https://phab.mercurial-scm.org/D6425

File last commit:

r42966:8041a1b4 default
r42967:1c4b5689 default
Show More
discovery.rs
615 lines | 19.7 KiB | application/rls-services+xml | RustLexer
// discovery.rs
//
// Copyright 2019 Georges Racinet <georges.racinet@octobus.net>
//
// This software may be used and distributed according to the terms of the
// GNU General Public License version 2 or any later version.
//! Discovery operations
//!
//! This is a Rust counterpart to the `partialdiscovery` class of
//! `mercurial.setdiscovery`
extern crate rand;
extern crate rand_pcg;
use self::rand::seq::SliceRandom;
use self::rand::{thread_rng, RngCore, SeedableRng};
use super::{Graph, GraphError, Revision, NULL_REVISION};
use crate::ancestors::MissingAncestors;
use crate::dagops;
use std::cmp::{max, min};
use std::collections::{HashMap, HashSet, VecDeque};
type Rng = self::rand_pcg::Pcg32;
pub struct PartialDiscovery<G: Graph + Clone> {
target_heads: Option<Vec<Revision>>,
graph: G, // plays the role of self._repo
common: MissingAncestors<G>,
undecided: Option<HashSet<Revision>>,
children_cache: Option<HashMap<Revision, Vec<Revision>>>,
missing: HashSet<Revision>,
rng: Rng,
respect_size: bool,
}
pub struct DiscoveryStats {
pub undecided: Option<usize>,
}
/// Update an existing sample to match the expected size
///
/// The sample is updated with revisions exponentially distant from each
/// element of `heads`.
///
/// If a target size is specified, the sampling will stop once this size is
/// reached. Otherwise sampling will happen until roots of the <revs> set are
/// reached.
///
/// - `revs`: set of revs we want to discover (if None, `assume` the whole dag
/// represented by `parentfn`
/// - `heads`: set of DAG head revs
/// - `sample`: a sample to update
/// - `parentfn`: a callable to resolve parents for a revision
/// - `quicksamplesize`: optional target size of the sample
fn update_sample<I>(
revs: Option<&HashSet<Revision>>,
heads: impl IntoIterator<Item = Revision>,
sample: &mut HashSet<Revision>,
parentsfn: impl Fn(Revision) -> Result<I, GraphError>,
quicksamplesize: Option<usize>,
) -> Result<(), GraphError>
where
I: Iterator<Item = Revision>,
{
let mut distances: HashMap<Revision, u32> = HashMap::new();
let mut visit: VecDeque<Revision> = heads.into_iter().collect();
let mut factor: u32 = 1;
let mut seen: HashSet<Revision> = HashSet::new();
loop {
let current = match visit.pop_front() {
None => {
break;
}
Some(r) => r,
};
if !seen.insert(current) {
continue;
}
let d = *distances.entry(current).or_insert(1);
if d > factor {
factor *= 2;
}
if d == factor {
sample.insert(current);
if let Some(sz) = quicksamplesize {
if sample.len() >= sz {
return Ok(());
}
}
}
for p in parentsfn(current)? {
if let Some(revs) = revs {
if !revs.contains(&p) {
continue;
}
}
distances.entry(p).or_insert(d + 1);
visit.push_back(p);
}
}
Ok(())
}
struct ParentsIterator {
parents: [Revision; 2],
cur: usize,
}
impl ParentsIterator {
fn graph_parents(
graph: &impl Graph,
r: Revision,
) -> Result<ParentsIterator, GraphError> {
Ok(ParentsIterator {
parents: graph.parents(r)?,
cur: 0,
})
}
}
impl Iterator for ParentsIterator {
type Item = Revision;
fn next(&mut self) -> Option<Revision> {
if self.cur > 1 {
return None;
}
let rev = self.parents[self.cur];
self.cur += 1;
if rev == NULL_REVISION {
return self.next();
}
Some(rev)
}
}
impl<G: Graph + Clone> PartialDiscovery<G> {
/// Create a PartialDiscovery object, with the intent
/// of comparing our `::<target_heads>` revset to the contents of another
/// repo.
///
/// For now `target_heads` is passed as a vector, and will be used
/// at the first call to `ensure_undecided()`.
///
/// If we want to make the signature more flexible,
/// we'll have to make it a type argument of `PartialDiscovery` or a trait
/// object since we'll keep it in the meanwhile
///
/// The `respect_size` boolean controls how the sampling methods
/// will interpret the size argument requested by the caller. If it's
/// `false`, they are allowed to produce a sample whose size is more
/// appropriate to the situation (typically bigger).
pub fn new(
graph: G,
target_heads: Vec<Revision>,
respect_size: bool,
) -> Self {
let mut seed: [u8; 16] = [0; 16];
thread_rng().fill_bytes(&mut seed);
Self::new_with_seed(graph, target_heads, seed, respect_size)
}
pub fn new_with_seed(
graph: G,
target_heads: Vec<Revision>,
seed: [u8; 16],
respect_size: bool,
) -> Self {
PartialDiscovery {
undecided: None,
children_cache: None,
target_heads: Some(target_heads),
graph: graph.clone(),
common: MissingAncestors::new(graph, vec![]),
missing: HashSet::new(),
rng: Rng::from_seed(seed),
respect_size: respect_size,
}
}
/// Extract at most `size` random elements from sample and return them
/// as a vector
fn limit_sample(
&mut self,
mut sample: Vec<Revision>,
size: usize,
) -> Vec<Revision> {
let sample_len = sample.len();
if sample_len <= size {
return sample;
}
let rng = &mut self.rng;
let dropped_size = sample_len - size;
let limited_slice = if size < dropped_size {
sample.partial_shuffle(rng, size).0
} else {
sample.partial_shuffle(rng, dropped_size).1
};
limited_slice.to_owned()
}
/// Register revisions known as being common
pub fn add_common_revisions(
&mut self,
common: impl IntoIterator<Item = Revision>,
) -> Result<(), GraphError> {
self.common.add_bases(common);
if let Some(ref mut undecided) = self.undecided {
self.common.remove_ancestors_from(undecided)?;
}
Ok(())
}
/// Register revisions known as being missing
pub fn add_missing_revisions(
&mut self,
missing: impl IntoIterator<Item = Revision>,
) -> Result<(), GraphError> {
self.ensure_undecided()?;
let range = dagops::range(
&self.graph,
missing,
self.undecided.as_ref().unwrap().iter().cloned(),
)?;
let undecided_mut = self.undecided.as_mut().unwrap();
for missrev in range {
self.missing.insert(missrev);
undecided_mut.remove(&missrev);
}
Ok(())
}
/// Do we have any information about the peer?
pub fn has_info(&self) -> bool {
self.common.has_bases()
}
/// Did we acquire full knowledge of our Revisions that the peer has?
pub fn is_complete(&self) -> bool {
self.undecided.as_ref().map_or(false, |s| s.is_empty())
}
/// Return the heads of the currently known common set of revisions.
///
/// If the discovery process is not complete (see `is_complete()`), the
/// caller must be aware that this is an intermediate state.
///
/// On the other hand, if it is complete, then this is currently
/// the only way to retrieve the end results of the discovery process.
///
/// We may introduce in the future an `into_common_heads` call that
/// would be more appropriate for normal Rust callers, dropping `self`
/// if it is complete.
pub fn common_heads(&self) -> Result<HashSet<Revision>, GraphError> {
self.common.bases_heads()
}
/// Force first computation of `self.undecided`
///
/// After this, `self.undecided.as_ref()` and `.as_mut()` can be
/// unwrapped to get workable immutable or mutable references without
/// any panic.
///
/// This is an imperative call instead of an access with added lazyness
/// to reduce easily the scope of mutable borrow for the caller,
/// compared to undecided(&'a mut self) -> &'a… that would keep it
/// as long as the resulting immutable one.
fn ensure_undecided(&mut self) -> Result<(), GraphError> {
if self.undecided.is_some() {
return Ok(());
}
let tgt = self.target_heads.take().unwrap();
self.undecided =
Some(self.common.missing_ancestors(tgt)?.into_iter().collect());
Ok(())
}
fn ensure_children_cache(&mut self) -> Result<(), GraphError> {
if self.children_cache.is_some() {
return Ok(());
}
self.ensure_undecided()?;
let mut children: HashMap<Revision, Vec<Revision>> = HashMap::new();
for &rev in self.undecided.as_ref().unwrap() {
for p in ParentsIterator::graph_parents(&self.graph, rev)? {
children.entry(p).or_insert_with(|| Vec::new()).push(rev);
}
}
self.children_cache = Some(children);
Ok(())
}
/// Provide statistics about the current state of the discovery process
pub fn stats(&self) -> DiscoveryStats {
DiscoveryStats {
undecided: self.undecided.as_ref().map(|s| s.len()),
}
}
pub fn take_quick_sample(
&mut self,
headrevs: impl IntoIterator<Item = Revision>,
size: usize,
) -> Result<Vec<Revision>, GraphError> {
self.ensure_undecided()?;
let mut sample = {
let undecided = self.undecided.as_ref().unwrap();
if undecided.len() <= size {
return Ok(undecided.iter().cloned().collect());
}
dagops::heads(&self.graph, undecided.iter())?
};
if sample.len() >= size {
return Ok(self.limit_sample(sample.into_iter().collect(), size));
}
update_sample(
None,
headrevs,
&mut sample,
|r| ParentsIterator::graph_parents(&self.graph, r),
Some(size),
)?;
Ok(sample.into_iter().collect())
}
/// Extract a sample from `self.undecided`, going from its heads and roots.
///
/// The `size` parameter is used to avoid useless computations if
/// it turns out to be bigger than the whole set of undecided Revisions.
///
/// The sample is taken by using `update_sample` from the heads, then
/// from the roots, working on the reverse DAG,
/// expressed by `self.children_cache`.
///
/// No effort is being made to complete or limit the sample to `size`
/// but this method returns another interesting size that it derives
/// from its knowledge of the structure of the various sets, leaving
/// to the caller the decision to use it or not.
fn bidirectional_sample(
&mut self,
size: usize,
) -> Result<(HashSet<Revision>, usize), GraphError> {
self.ensure_undecided()?;
{
// we don't want to compute children_cache before this
// but doing it after extracting self.undecided takes a mutable
// ref to self while a shareable one is still active.
let undecided = self.undecided.as_ref().unwrap();
if undecided.len() <= size {
return Ok((undecided.clone(), size));
}
}
self.ensure_children_cache()?;
let revs = self.undecided.as_ref().unwrap();
let mut sample: HashSet<Revision> = revs.clone();
// it's possible that leveraging the children cache would be more
// efficient here
dagops::retain_heads(&self.graph, &mut sample)?;
let revsheads = sample.clone(); // was again heads(revs) in python
// update from heads
update_sample(
Some(revs),
revsheads.iter().cloned(),
&mut sample,
|r| ParentsIterator::graph_parents(&self.graph, r),
None,
)?;
// update from roots
let revroots: HashSet<Revision> =
dagops::roots(&self.graph, revs)?.into_iter().collect();
let prescribed_size = max(size, min(revroots.len(), revsheads.len()));
let children = self.children_cache.as_ref().unwrap();
let empty_vec: Vec<Revision> = Vec::new();
update_sample(
Some(revs),
revroots,
&mut sample,
|r| Ok(children.get(&r).unwrap_or(&empty_vec).iter().cloned()),
None,
)?;
Ok((sample, prescribed_size))
}
/// Fill up sample up to the wished size with random undecided Revisions.
///
/// This is intended to be used as a last resort completion if the
/// regular sampling algorithm returns too few elements.
fn random_complete_sample(
&mut self,
sample: &mut Vec<Revision>,
size: usize,
) {
let sample_len = sample.len();
if size <= sample_len {
return;
}
let take_from: Vec<Revision> = self
.undecided
.as_ref()
.unwrap()
.iter()
.filter(|&r| !sample.contains(r))
.cloned()
.collect();
sample.extend(self.limit_sample(take_from, size - sample_len));
}
pub fn take_full_sample(
&mut self,
size: usize,
) -> Result<Vec<Revision>, GraphError> {
let (sample_set, prescribed_size) = self.bidirectional_sample(size)?;
let size = if self.respect_size {
size
} else {
prescribed_size
};
let mut sample =
self.limit_sample(sample_set.into_iter().collect(), size);
self.random_complete_sample(&mut sample, size);
Ok(sample)
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::testing::SampleGraph;
/// A PartialDiscovery as for pushing all the heads of `SampleGraph`
///
/// To avoid actual randomness in tests, we give it a fixed random seed.
fn full_disco() -> PartialDiscovery<SampleGraph> {
PartialDiscovery::new_with_seed(
SampleGraph,
vec![10, 11, 12, 13],
[0; 16],
true,
)
}
/// A PartialDiscovery as for pushing the 12 head of `SampleGraph`
///
/// To avoid actual randomness in tests, we give it a fixed random seed.
fn disco12() -> PartialDiscovery<SampleGraph> {
PartialDiscovery::new_with_seed(SampleGraph, vec![12], [0; 16], true)
}
fn sorted_undecided(
disco: &PartialDiscovery<SampleGraph>,
) -> Vec<Revision> {
let mut as_vec: Vec<Revision> =
disco.undecided.as_ref().unwrap().iter().cloned().collect();
as_vec.sort();
as_vec
}
fn sorted_missing(disco: &PartialDiscovery<SampleGraph>) -> Vec<Revision> {
let mut as_vec: Vec<Revision> =
disco.missing.iter().cloned().collect();
as_vec.sort();
as_vec
}
fn sorted_common_heads(
disco: &PartialDiscovery<SampleGraph>,
) -> Result<Vec<Revision>, GraphError> {
let mut as_vec: Vec<Revision> =
disco.common_heads()?.iter().cloned().collect();
as_vec.sort();
Ok(as_vec)
}
#[test]
fn test_add_common_get_undecided() -> Result<(), GraphError> {
let mut disco = full_disco();
assert_eq!(disco.undecided, None);
assert!(!disco.has_info());
assert_eq!(disco.stats().undecided, None);
disco.add_common_revisions(vec![11, 12])?;
assert!(disco.has_info());
assert!(!disco.is_complete());
assert!(disco.missing.is_empty());
// add_common_revisions did not trigger a premature computation
// of `undecided`, let's check that and ask for them
assert_eq!(disco.undecided, None);
disco.ensure_undecided()?;
assert_eq!(sorted_undecided(&disco), vec![5, 8, 10, 13]);
assert_eq!(disco.stats().undecided, Some(4));
Ok(())
}
/// in this test, we pretend that our peer misses exactly (8+10)::
/// and we're comparing all our repo to it (as in a bare push)
#[test]
fn test_discovery() -> Result<(), GraphError> {
let mut disco = full_disco();
disco.add_common_revisions(vec![11, 12])?;
disco.add_missing_revisions(vec![8, 10])?;
assert_eq!(sorted_undecided(&disco), vec![5]);
assert_eq!(sorted_missing(&disco), vec![8, 10, 13]);
assert!(!disco.is_complete());
disco.add_common_revisions(vec![5])?;
assert_eq!(sorted_undecided(&disco), vec![]);
assert_eq!(sorted_missing(&disco), vec![8, 10, 13]);
assert!(disco.is_complete());
assert_eq!(sorted_common_heads(&disco)?, vec![5, 11, 12]);
Ok(())
}
#[test]
fn test_limit_sample_no_need_to() {
let sample = vec![1, 2, 3, 4];
assert_eq!(full_disco().limit_sample(sample, 10), vec![1, 2, 3, 4]);
}
#[test]
fn test_limit_sample_less_than_half() {
assert_eq!(full_disco().limit_sample((1..6).collect(), 2), vec![4, 2]);
}
#[test]
fn test_limit_sample_more_than_half() {
assert_eq!(full_disco().limit_sample((1..4).collect(), 2), vec![3, 2]);
}
#[test]
fn test_quick_sample_enough_undecided_heads() -> Result<(), GraphError> {
let mut disco = full_disco();
disco.undecided = Some((1..=13).collect());
let mut sample_vec = disco.take_quick_sample(vec![], 4)?;
sample_vec.sort();
assert_eq!(sample_vec, vec![10, 11, 12, 13]);
Ok(())
}
#[test]
fn test_quick_sample_climbing_from_12() -> Result<(), GraphError> {
let mut disco = disco12();
disco.ensure_undecided()?;
let mut sample_vec = disco.take_quick_sample(vec![12], 4)?;
sample_vec.sort();
// r12's only parent is r9, whose unique grand-parent through the
// diamond shape is r4. This ends there because the distance from r4
// to the root is only 3.
assert_eq!(sample_vec, vec![4, 9, 12]);
Ok(())
}
#[test]
fn test_children_cache() -> Result<(), GraphError> {
let mut disco = full_disco();
disco.ensure_children_cache()?;
let cache = disco.children_cache.unwrap();
assert_eq!(cache.get(&2).cloned(), Some(vec![4]));
assert_eq!(cache.get(&10).cloned(), None);
let mut children_4 = cache.get(&4).cloned().unwrap();
children_4.sort();
assert_eq!(children_4, vec![5, 6, 7]);
let mut children_7 = cache.get(&7).cloned().unwrap();
children_7.sort();
assert_eq!(children_7, vec![9, 11]);
Ok(())
}
#[test]
fn test_complete_sample() {
let mut disco = full_disco();
let undecided: HashSet<Revision> =
[4, 7, 9, 2, 3].iter().cloned().collect();
disco.undecided = Some(undecided);
let mut sample = vec![0];
disco.random_complete_sample(&mut sample, 3);
assert_eq!(sample.len(), 3);
let mut sample = vec![2, 4, 7];
disco.random_complete_sample(&mut sample, 1);
assert_eq!(sample.len(), 3);
}
#[test]
fn test_bidirectional_sample() -> Result<(), GraphError> {
let mut disco = full_disco();
disco.undecided = Some((0..=13).into_iter().collect());
let (sample_set, size) = disco.bidirectional_sample(7)?;
assert_eq!(size, 7);
let mut sample: Vec<Revision> = sample_set.into_iter().collect();
sample.sort();
// our DAG is a bit too small for the results to be really interesting
// at least it shows that
// - we went both ways
// - we didn't take all Revisions (6 is not in the sample)
assert_eq!(sample, vec![0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13]);
Ok(())
}
}