upstream/mercurial-mirror Commit - r42968:4e7bd618

rust-discovery: optionally don't randomize at all, for tests...

Georges Racinet -

r42968:4e7bd618 default

parent child

mercurial/setdiscovery.py

0 +23 -9

             # setdiscovery.py - improved discovery of common nodeset for mercurial
             #
             # Copyright 2010 Benoit Boissinot <bboissin@gmail.com>
             # and Peter Arrenbrecht <peter@arrenbrecht.ch>
             #
             # This software may be used and distributed according to the terms of the
             # GNU General Public License version 2 or any later version.
             """
             The algorithm works in the following way. You have two repositories: local
             and remote. They both contain a DAG of changelists.
             The goal of the discovery protocol is to find one set of nodes, *common*:
             the set of nodes shared by local and remote.
             One of the issues with the original protocol was latency: it could
             potentially require lots of roundtrips to discover that the local repo was a
             subset of remote (which is a very common case, you usually have few changes
             compared to upstream, while upstream probably had lots of development).
             The new protocol only requires one interface for the remote repo: `known()`,
             which given a set of changelists tells you if they are present in the DAG.
             The algorithm then works as follows:
              - We will be using three sets, `common`, `missing`, `unknown`. Originally
              all nodes are in `unknown`.
              - Take a sample from `unknown`, call `remote.known(sample)`
                - For each node that remote knows, move it and all its ancestors to `common`
                - For each node that remote doesn't know, move it and all its descendants
                to `missing`
              - Iterate until `unknown` is empty
             There are a couple optimizations, first is instead of starting with a random
             sample of missing, start by sending all heads, in the case where the local
             repo is a subset, you computed the answer in one round trip.
             Then you can do something similar to the bisecting strategy used when
             finding faulty changesets. Instead of random samples, you can try picking
             nodes that will maximize the number of nodes that will be
             classified with it (since all ancestors or descendants will be marked as well).
             """
             from __future__ import absolute_import
             import collections
             import random
             from .i18n import _
             from .node import (
                 nullid,
                 nullrev,
             )
             from . import (
                 error,
                 util,
             )
             def _updatesample(revs, heads, sample, parentfn, quicksamplesize=0):
                 """update an existing sample to match the expected size
                 The sample is updated with revs exponentially distant from each head of the
                 <revs> set. (H~1, H~2, H~4, H~8, etc).
                 If a target size is specified, the sampling will stop once this size is
                 reached. Otherwise sampling will happen until roots of the <revs> set are
                 reached.
                 :revs:  set of revs we want to discover (if None, assume the whole dag)
                 :heads: set of DAG head revs
                 :sample: a sample to update
                 :parentfn: a callable to resolve parents for a revision
                 :quicksamplesize: optional target size of the sample"""
                 dist = {}
                 visit = collections.deque(heads)
                 seen = set()
                 factor = 1
                 while visit:
                     curr = visit.popleft()
                     if curr in seen:
                         continue
                     d = dist.setdefault(curr, 1)
                     if d > factor:
                         factor *= 2
                     if d == factor:
                         sample.add(curr)
                         if quicksamplesize and (len(sample) >= quicksamplesize):
                             return
                     seen.add(curr)
                     for p in parentfn(curr):
                         if p != nullrev and (not revs or p in revs):
                             dist.setdefault(p, d + 1)
                             visit.append(p)
-            def _limitsample(sample, desiredlen):
+            def _limitsample(sample, desiredlen, randomize=True):
-                """return a random subset of sample of at most desiredlen item"""
+                """return a random subset of sample of at most desiredlen item.
-                if len(sample) > desiredlen:
-                    sample = set(random.sample(sample, desiredlen))
+                If randomize is False, though, a deterministic subset is returned.
-                return sample
+                This is meant for integration tests.
+                """
+                if len(sample) <= desiredlen:
+                    return sample
+                if randomize:
+                    return set(random.sample(sample, desiredlen))
+                sample = list(sample)
+                sample.sort()
+                return set(sample[:desiredlen])
             class partialdiscovery(object):
                 """an object representing ongoing discovery

                 Fed with data from the remote repository, this object keeps track of
                 the current set of changesets in various states:

                 - common:    revs also known remotely
                 - undecided: revs we don't have information on yet
                 - missing:   revs missing remotely

                 (all tracked revisions are known locally)

                 :repo: the local repository
                 :targetheads: revs whose ancestors are the subject of the discovery
                 :respectsize: if False, takefullsample() may return more revs than the
                               requested size
                 :randomize: if True (the default), limiting and padding of samples
                             pick revs at random. If False, the smallest revs are
                             picked instead, which makes sampling deterministic.
                 """

                 def __init__(self, repo, targetheads, respectsize, randomize=True):
                     self._repo = repo
                     self._targetheads = targetheads
                     self._common = repo.changelog.incrementalmissingrevs()
                     # computed lazily, see the `undecided` property
                     self._undecided = None
                     self.missing = set()
                     # computed lazily, see _childrengetter()
                     self._childrenmap = None
                     self._respectsize = respectsize
                     self.randomize = randomize

                 def addcommons(self, commons):
                     """register nodes known as common"""
                     self._common.addbases(commons)
                     if self._undecided is not None:
                         self._common.removeancestorsfrom(self._undecided)

                 def addmissings(self, missings):
                     """register some nodes as missing

                     Every undecided rev selected by the `missings::undecided` revset is
                     moved from the undecided set to the missing set.
                     """
                     newmissing = self._repo.revs('%ld::%ld', missings, self.undecided)
                     if newmissing:
                         self.missing.update(newmissing)
                         self.undecided.difference_update(newmissing)

                 def addinfo(self, sample):
                     """consume an iterable of (rev, known) tuples"""
                     common = set()
                     missing = set()
                     for rev, known in sample:
                         if known:
                             common.add(rev)
                         else:
                             missing.add(rev)
                     if common:
                         self.addcommons(common)
                     if missing:
                         self.addmissings(missing)

                 def hasinfo(self):
                     """return True if we have any clue about the remote state"""
                     return self._common.hasbases()

                 def iscomplete(self):
                     """True if all the necessary data have been gathered

                     This is False as long as the undecided set has not been computed.
                     """
                     return self._undecided is not None and not self._undecided

                 @property
                 def undecided(self):
                     """the set of revs still undecided, computed on first access"""
                     if self._undecided is not None:
                         return self._undecided
                     self._undecided = set(self._common.missingancestors(self._targetheads))
                     return self._undecided

                 def stats(self):
                     """return a dict with the current size of the undecided set"""
                     return {
                         'undecided': len(self.undecided),
                     }

                 def commonheads(self):
                     """the heads of the known common set"""
                     # heads(common) == heads(common.bases) since common represents
                     # common.bases and all its ancestors
                     return self._common.basesheads()

                 def _parentsgetter(self):
                     """return a callable giving the parent revs of a rev

                     The parents are read from items 5 and 6 of the changelog index
                     entry, so nullrev may be part of the result.
                     """
                     getrev = self._repo.changelog.index.__getitem__
                     def getparents(r):
                         return getrev(r)[5:7]
                     return getparents

                 def _childrengetter(self):
                     """return a callable giving the undecided children of a rev

                     The underlying map is built once from the undecided set and then
                     reused by later calls.
                     """
                     if self._childrenmap is not None:
                         # During discovery, the `undecided` set keeps shrinking.
                         # Therefore, the map computed for an iteration N will be
                         # valid for iteration N+1. Instead of computing the same
                         # data over and over we cache it the first time.
                         return self._childrenmap.__getitem__
                     # _updatesample() essentially iterates over revisions to look
                     # up their children. This lookup is expensive and doing it in a loop is
                     # quadratic. We precompute the children for all relevant revisions and
                     # make the lookup in _updatesample() a simple dict lookup.
                     self._childrenmap = children = {}
                     parentrevs = self._parentsgetter()
                     revs = self.undecided
                     for rev in sorted(revs):
                         # Always ensure revision has an entry so we don't need to worry
                         # about missing keys.
                         children[rev] = []
                         for prev in parentrevs(rev):
                             if prev == nullrev:
                                 continue
                             # parents outside of the undecided set have no entry
                             c = children.get(prev)
                             if c is not None:
                                 c.append(rev)
                     return children.__getitem__

                 def takequicksample(self, headrevs, size):
                     """takes a quick sample of size <size>

                     It is meant for initial sampling and focuses on querying heads and close
                     ancestors of heads.

                     If the undecided set holds <size> revs or fewer, it is returned whole,
                     as a list. Otherwise a set is returned.

                     :headrevs: set of head revisions in local DAG to consider
                     :size: the maximum size of the sample"""
                     revs = self.undecided
                     if len(revs) <= size:
                         return list(revs)
                     sample = set(self._repo.revs('heads(%ld)', revs))
                     if len(sample) >= size:
                         return _limitsample(sample, size, randomize=self.randomize)
                     _updatesample(None, headrevs, sample, self._parentsgetter(),
                                   quicksamplesize=size)
                     return sample

                 def takefullsample(self, headrevs, size):
                     """takes a sample of the undecided set, walking from both ends

                     The sample starts with the heads of the undecided set, and is grown
                     by walking down from these heads, then up from the roots of the
                     undecided set. It is then cut down to <size>, or padded with other
                     undecided revs if it is smaller than <size>.

                     If the undecided set holds <size> revs or fewer, it is returned whole,
                     as a list. Otherwise a set is returned.

                     :headrevs: not used by this method
                     :size: the expected size of the sample. If the instance was created
                            with a false `respectsize`, it is raised to the smaller of
                            the number of undecided roots and undecided heads."""
                     revs = self.undecided
                     if len(revs) <= size:
                         return list(revs)
                     repo = self._repo
                     sample = set(repo.revs('heads(%ld)', revs))
                     parentrevs = self._parentsgetter()
                     # update from heads
                     revsheads = sample.copy()
                     _updatesample(revs, revsheads, sample, parentrevs)
                     # update from roots
                     revsroots = set(repo.revs('roots(%ld)', revs))
                     childrenrevs = self._childrengetter()
                     _updatesample(revs, revsroots, sample, childrenrevs)
                     assert sample
                     if not self._respectsize:
                         size = max(size, min(len(revsroots), len(revsheads)))
                     sample = _limitsample(sample, size, randomize=self.randomize)
                     if len(sample) < size:
                         # pad the sample with undecided revs it does not hold yet
                         more = size - len(sample)
                         takefrom = list(revs - sample)
                         if self.randomize:
                             sample.update(random.sample(takefrom, more))
                         else:
                             takefrom.sort()
                             sample.update(takefrom[:more])
                     return sample
             def findcommonheads(ui, local, remote,
                                 initialsamplesize=100,
                                 fullsamplesize=200,
                                 abortwhenunrelated=True,
                                 ancestorsof=None,
                                 samplegrowth=1.05):
                 '''Return a tuple (common, anyincoming, remoteheads) used to identify
                 missing nodes from or in remote.

                 :ui: used for debug, note, status and warning output, progress
                      reporting and logging
                 :local: the local repository
                 :remote: the peer, queried through its `heads` and `known` commands
                 :initialsamplesize: size of the quick sample, and cap on the number of
                                     local heads sent in the first query when
                                     `remote.limitedarguments` is set
                 :fullsamplesize: size requested for full samples. It grows by a factor
                                  of `samplegrowth` after each full sample, unless
                                  `remote.limitedarguments` is set
                 :abortwhenunrelated: if no common head is found while the remote is
                                      not empty, raise error.Abort when True, only
                                      warn when False
                 :ancestorsof: optional list of nodes. If given, the discovery is
                               restricted to their ancestors instead of the ancestors
                               of all local heads
                 :samplegrowth: growth factor applied to `fullsamplesize`
                 '''
                 start = util.timer()
                 roundtrips = 0
                 cl = local.changelog
                 clnode = cl.node
                 clrev = cl.rev
                 if ancestorsof is not None:
                     ownheads = [clrev(n) for n in ancestorsof]
                 else:
                     ownheads = [rev for rev in cl.headrevs() if rev != nullrev]
                 # early exit if we know all the specified remote heads already
                 ui.debug("query 1; heads\n")
                 roundtrips += 1
                 # We also ask remote about all the local heads. That set can be arbitrarily
                 # large, so we used to limit its size to `initialsamplesize`. We no longer
                 # do as it proved counterproductive. The skipped heads could lead to a
                 # large "undecided" set, slower to be clarified than if we asked the
                 # question for all heads right away.
                 #
                 # We are already fetching all server heads using the `heads` command,
                 # sending an equivalent number of heads the other way should not have a
                 # significant impact.  In addition, it is very likely that we are going to
                 # have to issue "known" requests for an equivalent amount of revisions in
                 # order to decide if these heads are common or missing.
                 #
                 # find a detailed analysis below.
                 #
                 # Case A: local and server both have few heads
                 #
                 #     Ownheads is below initialsamplesize, limit would not have any effect.
                 #
                 # Case B: local has few heads and server has many
                 #
                 #     Ownheads is below initialsamplesize, limit would not have any effect.
                 #
                 # Case C: local and server both have many heads
                 #
                 #     We now transfer some more data, but not significantly more than is
                 #     already transferred to carry the server heads.
                 #
                 # Case D: local has many heads, server has few
                 #
                 #   D.1 local heads are mostly known remotely
                 #
                 #     All the known heads will have to be part of a `known` request at
                 #     some point for the discovery to finish. Sending them all earlier is
                 #     actually helping.
                 #
                 #     (This case is fairly unlikely, it requires the numerous heads to all
                 #     be merged server side in only a few heads)
                 #
                 #   D.2 local heads are mostly missing remotely
                 #
                 #     To determine that the heads are missing, we'll have to issue `known`
                 #     requests for them or one of their ancestors. This amount of `known`
                 #     requests will likely be in the same order of magnitude as the amount
                 #     of local heads.
                 #
                 #     The only case where we can be more efficient using `known` requests
                 #     on ancestors is when all the "missing" local heads are based on a
                 #     few changesets, also "missing".  This means we would have a "complex"
                 #     graph (with many heads) attached to, but very independent from, the
                 #     "simple" graph on the server. This is a fairly unusual case and has
                 #     not been met in the wild so far.
                 if remote.limitedarguments:
                     sample = _limitsample(ownheads, initialsamplesize)
                     # indices between sample and externalized version must match
                     sample = list(sample)
                 else:
                     sample = ownheads
                 # both commands are issued through the same executor
                 with remote.commandexecutor() as e:
                     fheads = e.callcommand('heads', {})
                     fknown = e.callcommand('known', {
                         'nodes': [clnode(r) for r in sample],
                     })
                 srvheadhashes, yesno = fheads.result(), fknown.result()
                 # local tip is nullid (presumably an empty local repository): anything
                 # the remote has is incoming
                 if cl.tip() == nullid:
                     if srvheadhashes != [nullid]:
                         return [nullid], True, srvheadhashes
                     return [nullid], False, []
                 # start actual discovery (we note this before the next "if" for
                 # compatibility reasons)
                 ui.status(_("searching for changes\n"))
                 knownsrvheads = []  # revnos of remote heads that are known locally
                 for node in srvheadhashes:
                     if node == nullid:
                         continue
                     try:
                         knownsrvheads.append(clrev(node))
                     # Catches unknown and filtered nodes.
                     except error.LookupError:
                         continue
                 if len(knownsrvheads) == len(srvheadhashes):
                     ui.debug("all remote heads known locally\n")
                     return srvheadhashes, False, srvheadhashes
                 # only valid if the first query covered every local head
                 if len(sample) == len(ownheads) and all(yesno):
                     ui.note(_("all local heads known remotely\n"))
                     ownheadhashes = [clnode(r) for r in ownheads]
                     return ownheadhashes, True, srvheadhashes
                 # full blown discovery
                 # NOTE(review): `randomize` is not passed, so partialdiscovery uses its
                 # default (True) here
                 disco = partialdiscovery(local, ownheads, remote.limitedarguments)
                 # treat remote heads (and maybe own heads) as a first implicit sample
                 # response
                 disco.addcommons(knownsrvheads)
                 disco.addinfo(zip(sample, yesno))
                 full = False
                 progress = ui.makeprogress(_('searching'), unit=_('queries'))
                 while not disco.iscomplete():
                     if full or disco.hasinfo():
                         if full:
                             ui.note(_("sampling from both directions\n"))
                         else:
                             ui.debug("taking initial sample\n")
                         samplefunc = disco.takefullsample
                         targetsize = fullsamplesize
                         if not remote.limitedarguments:
                             # the next full sample will be bigger
                             fullsamplesize = int(fullsamplesize * samplegrowth)
                     else:
                         # use even cheaper initial sample
                         ui.debug("taking quick initial sample\n")
                         samplefunc = disco.takequicksample
                         targetsize = initialsamplesize
                     sample = samplefunc(ownheads, targetsize)
                     roundtrips += 1
                     progress.update(roundtrips)
                     stats = disco.stats()
                     ui.debug("query %i; still undecided: %i, sample size is: %i\n"
                              % (roundtrips, stats['undecided'], len(sample)))
                     # indices between sample and externalized version must match
                     sample = list(sample)
                     with remote.commandexecutor() as e:
                         yesno = e.callcommand('known', {
                             'nodes': [clnode(r) for r in sample],
                         }).result()
                     # every sample after the first one is a full sample
                     full = True
                     disco.addinfo(zip(sample, yesno))
                 result = disco.commonheads()
                 elapsed = util.timer() - start
                 progress.complete()
                 ui.debug("%d total queries in %.4fs\n" % (roundtrips, elapsed))
                 msg = ('found %d common and %d unknown server heads,'
                        ' %d roundtrips in %.4fs\n')
                 missing = set(result) - set(knownsrvheads)
                 ui.log('discovery', msg, len(result), len(missing), roundtrips,
                        elapsed)
                 if not result and srvheadhashes != [nullid]:
                     if abortwhenunrelated:
                         raise error.Abort(_("repository is unrelated"))
                     else:
                         ui.warn(_("warning: repository is unrelated\n"))
                     # NOTE(review): `common` is a set here, while the early returns
                     # above use lists
                     return ({nullid}, True, srvheadhashes,)
                 anyincoming = (srvheadhashes != [nullid])
                 # convert the common heads from revs to nodes
                 result = {clnode(r) for r in result}
                 return result, anyincoming, srvheadhashes

rust/hg-core/src/discovery.rs

0 +42 -4

             // discovery.rs
             //
             // Copyright 2019 Georges Racinet <georges.racinet@octobus.net>
             //
             // This software may be used and distributed according to the terms of the
             // GNU General Public License version 2 or any later version.
             //! Discovery operations
             //!
             //! This is a Rust counterpart to the `partialdiscovery` class of
             //! `mercurial.setdiscovery`
             extern crate rand;
             extern crate rand_pcg;
             use self::rand::seq::SliceRandom;
             use self::rand::{thread_rng, RngCore, SeedableRng};
             use super::{Graph, GraphError, Revision, NULL_REVISION};
             use crate::ancestors::MissingAncestors;
             use crate::dagops;
             use std::cmp::{max, min};
             use std::collections::{HashMap, HashSet, VecDeque};
             type Rng = self::rand_pcg::Pcg32;
             /// State of an ongoing discovery, the Rust counterpart of the
             /// `partialdiscovery` class of `mercurial.setdiscovery`
             pub struct PartialDiscovery<G: Graph + Clone> {
                 /// heads given at construction time, kept until they are used
                 /// to compute `undecided`
                 target_heads: Option<Vec<Revision>>,
                 graph: G, // plays the role of self._repo
                 /// the revisions known to be common, stored as bases
                 common: MissingAncestors<G>,
                 /// revisions we don't have information on yet; `None` as long
                 /// as the set has not been computed
                 undecided: Option<HashSet<Revision>>,
                 /// children lookup table; `None` as long as it has not been computed
                 children_cache: Option<HashMap<Revision, Vec<Revision>>>,
                 /// revisions known to be missing on the peer
                 missing: HashSet<Revision>,
                 /// random number generator used when limiting samples
                 rng: Rng,
                 /// whether sampling methods must honor the requested size
                 respect_size: bool,
                 /// if `false`, sample limiting is deterministic instead of random
                 randomize: bool,
             }
             /// Figures about an ongoing discovery
             pub struct DiscoveryStats {
                 /// Number of revisions still undecided.
                 // NOTE(review): presumably `None` while the undecided set has not
                 // been computed yet -- confirm against the code filling this struct
                 pub undecided: Option<usize>,
             }
             /// Grow an existing sample with revisions picked at doubling distances
             ///
             /// Starting from each element of `heads` (distance 1), the graph is
             /// walked breadth-first through `parentsfn`. A revision is inserted in
             /// `sample` when its distance matches the current threshold, which
             /// doubles each time a larger distance is met.
             ///
             /// If a target size is specified, the walk stops as soon as the sample
             /// has reached it. Otherwise it goes on until nothing is left to visit.
             ///
             /// - `revs`: set of revs the walk is restricted to (if `None`, the whole
             ///   dag represented by `parentsfn` is walked)
             /// - `heads`: the revisions to start from
             /// - `sample`: the sample to update
             /// - `parentsfn`: a callable to resolve parents for a revision
             /// - `quicksamplesize`: optional target size of the sample
             fn update_sample<I>(
                 revs: Option<&HashSet<Revision>>,
                 heads: impl IntoIterator<Item = Revision>,
                 sample: &mut HashSet<Revision>,
                 parentsfn: impl Fn(Revision) -> Result<I, GraphError>,
                 quicksamplesize: Option<usize>,
             ) -> Result<(), GraphError>
             where
                 I: Iterator<Item = Revision>,
             {
                 let mut depths: HashMap<Revision, u32> = HashMap::new();
                 let mut visited: HashSet<Revision> = HashSet::new();
                 let mut queue: VecDeque<Revision> = heads.into_iter().collect();
                 let mut step: u32 = 1;
                 while let Some(rev) = queue.pop_front() {
                     // `insert` returns false for a revision already visited
                     if !visited.insert(rev) {
                         continue;
                     }
                     let depth = *depths.entry(rev).or_insert(1);
                     if depth > step {
                         step *= 2;
                     }
                     if depth == step {
                         sample.insert(rev);
                         let full = quicksamplesize
                             .map_or(false, |wanted| sample.len() >= wanted);
                         if full {
                             return Ok(());
                         }
                     }
                     for parent in parentsfn(rev)? {
                         let excluded =
                             revs.map_or(false, |allowed| !allowed.contains(&parent));
                         if excluded {
                             continue;
                         }
                         // first visit wins: keep the distance already recorded
                         depths.entry(parent).or_insert(depth + 1);
                         queue.push_back(parent);
                     }
                 }
                 Ok(())
             }
             /// Iterator over the parents of a revision, skipping `NULL_REVISION`
             struct ParentsIterator {
                 /// both parent slots, as returned by the graph
                 parents: [Revision; 2],
                 /// index of the next slot of `parents` to look at
                 cur: usize,
             }
             impl ParentsIterator {
                 /// Build an iterator over the parents of `r` in `graph`
                 ///
                 /// Fails if the graph cannot provide the parents of `r`.
                 fn graph_parents(
                     graph: &impl Graph,
                     r: Revision,
                 ) -> Result<ParentsIterator, GraphError> {
                     let parents = graph.parents(r)?;
                     Ok(ParentsIterator { parents, cur: 0 })
                 }
             }
             impl Iterator for ParentsIterator {
                 type Item = Revision;

                 /// Yield the next parent that is not `NULL_REVISION`, if any
                 fn next(&mut self) -> Option<Revision> {
                     while self.cur <= 1 {
                         let rev = self.parents[self.cur];
                         self.cur += 1;
                         if rev != NULL_REVISION {
                             return Some(rev);
                         }
                     }
                     None
                 }
             }
             impl<G: Graph + Clone> PartialDiscovery<G> {
                 /// Create a PartialDiscovery object, with the intent
                 /// of comparing our `::<target_heads>` revset to the contents of another
                 /// repo.
                 ///
                 /// For now `target_heads` is passed as a vector, and will be used
                 /// at the first call to `ensure_undecided()`.
                 ///
                 /// If we want to make the signature more flexible,
                 /// we'll have to make it a type argument of `PartialDiscovery` or a trait
                 /// object since we'll keep it in the meanwhile
                 ///
                 /// The `respect_size` boolean controls how the sampling methods
                 /// will interpret the size argument requested by the caller. If it's
                 /// `false`, they are allowed to produce a sample whose size is more
                 /// appropriate to the situation (typically bigger).
+                ///
+                /// The `randomize` boolean affects sampling, and specifically how
+                /// limiting or last-minute expanding is been done:
+                ///
+                /// If `true`, both will perform random picking from `self.undecided`.
+                /// This is currently the best for actual discoveries.
+                ///
+                /// If `false`, a reproductible picking strategy is performed. This is
+                /// useful for integration tests.
                 pub fn new(
                     graph: G,
                     target_heads: Vec<Revision>,
                     respect_size: bool,
+                    randomize: bool,
                 ) -> Self {
                     let mut seed: [u8; 16] = [0; 16];
-                    thread_rng().fill_bytes(&mut seed);
+                    if randomize {
-                    Self::new_with_seed(graph, target_heads, seed, respect_size)
+                        thread_rng().fill_bytes(&mut seed);
+                    }
+                    Self::new_with_seed(graph, target_heads, seed, respect_size, randomize)
                 }
                 /// Create a PartialDiscovery object whose random number generator
                 /// is initialized from the given `seed`
                 ///
                 /// The other arguments are stored as they are. The undecided set and
                 /// the children cache start as `None`, the common set starts without
                 /// any base and the missing set starts empty.
                 pub fn new_with_seed(
                     graph: G,
                     target_heads: Vec<Revision>,
                     seed: [u8; 16],
                     respect_size: bool,
                     randomize: bool,
                 ) -> Self {
                     PartialDiscovery {
                         undecided: None,
                         children_cache: None,
                         target_heads: Some(target_heads),
                         // one copy is kept here, the other is owned by `common`
                         graph: graph.clone(),
                         common: MissingAncestors::new(graph, vec![]),
                         missing: HashSet::new(),
                         rng: Rng::from_seed(seed),
                         respect_size,
                         randomize,
                     }
                 }
                 /// Extract at most `size` elements from sample and return them
                 /// as a vector
                 ///
                 /// If `self.randomize` is `true`, the elements are picked at random,
                 /// and a sample that is already small enough is returned untouched.
                 ///
                 /// If `self.randomize` is `false`, the sample is sorted and its `size`
                 /// smallest elements are returned, so that the result is reproducible.
                 fn limit_sample(
                     &mut self,
                     mut sample: Vec<Revision>,
                     size: usize,
                 ) -> Vec<Revision> {
                     if !self.randomize {
                         sample.sort();
                         sample.truncate(size);
                         return sample;
                     }
                     let sample_len = sample.len();
                     if sample_len <= size {
                         return sample;
                     }
                     let rng = &mut self.rng;
                     let dropped_size = sample_len - size;
                     // shuffle the smaller side only: either the elements to keep
                     // or the elements to drop
                     let limited_slice = if size < dropped_size {
                         sample.partial_shuffle(rng, size).0
                     } else {
                         sample.partial_shuffle(rng, dropped_size).1
                     };
                     limited_slice.to_owned()
                 }
                 /// Record the given revisions as common with the peer
                 ///
                 /// They become bases of the common set. If the undecided set has
                 /// already been computed, the ancestors of the common set are
                 /// removed from it.
                 pub fn add_common_revisions(
                     &mut self,
                     common: impl IntoIterator<Item = Revision>,
                 ) -> Result<(), GraphError> {
                     self.common.add_bases(common);
                     if let Some(undecided) = self.undecided.as_mut() {
                         self.common.remove_ancestors_from(undecided)?;
                     }
                     Ok(())
                 }
                 /// Record `missing` as revisions that the peer is known not to have.
                 ///
                 /// The undecided set is computed first if needed. Every revision
                 /// yielded by `dagops::range` between `missing` and the undecided
                 /// revisions is then moved from the undecided set to `self.missing`.
                 pub fn add_missing_revisions(
                     &mut self,
                     missing: impl IntoIterator<Item = Revision>,
                 ) -> Result<(), GraphError> {
                     self.ensure_undecided()?;
                     // `ensure_undecided()` succeeded, hence these unwraps cannot panic
                     let newly_missing = dagops::range(
                         &self.graph,
                         missing,
                         self.undecided.as_ref().unwrap().iter().cloned(),
                     )?;
                     let pending = self.undecided.as_mut().unwrap();
                     for rev in newly_missing {
                         pending.remove(&rev);
                         self.missing.insert(rev);
                     }
                     Ok(())
                 }
                 /// Do we have any information about the peer?
                 ///
                 /// This is true as soon as `self.common` has bases; revisions recorded
                 /// as missing are not taken into account here.
                 pub fn has_info(&self) -> bool {
                     self.common.has_bases()
                 }
                 /// Did we acquire full knowledge of our Revisions that the peer has?
                 pub fn is_complete(&self) -> bool {
                     self.undecided.as_ref().map_or(false, |s| s.is_empty())
                 }
                 /// Give the heads of the set of revisions currently known as common.
                 ///
                 /// Unless `is_complete()` returns `true`, the result only reflects an
                 /// intermediate state of the discovery and the caller has to treat it
                 /// as such.
                 ///
                 /// Once the discovery is complete, calling this method is for now the
                 /// only way to get its end result. A consuming `into_common_heads`
                 /// may be introduced later on for the benefit of pure Rust callers.
                 pub fn common_heads(&self) -> Result<HashSet<Revision>, GraphError> {
                     let heads = self.common.bases_heads()?;
                     Ok(heads)
                 }
                 /// Compute `self.undecided` unless it is already there.
                 ///
                 /// Once this has returned `Ok`, `self.undecided.as_ref()` and
                 /// `self.undecided.as_mut()` can be unwrapped without any risk of
                 /// panic.
                 ///
                 /// The computation consumes `self.target_heads`: the undecided set is
                 /// made of the revisions returned for them by
                 /// `self.common.missing_ancestors()`.
                 ///
                 /// This is an explicit call rather than a lazy accessor on purpose: an
                 /// `undecided(&'a mut self) -> &'a…` accessor would keep `self`
                 /// mutably borrowed for as long as the returned reference lives.
                 fn ensure_undecided(&mut self) -> Result<(), GraphError> {
                     if self.undecided.is_none() {
                         let heads = self.target_heads.take().unwrap();
                         let missing = self.common.missing_ancestors(heads)?;
                         self.undecided = Some(missing.into_iter().collect());
                     }
                     Ok(())
                 }
                 /// Compute `self.children_cache` unless it is already there.
                 ///
                 /// The cache maps each parent of an undecided revision to the list of
                 /// its undecided children. The undecided set is computed first if
                 /// needed.
                 fn ensure_children_cache(&mut self) -> Result<(), GraphError> {
                     if self.children_cache.is_none() {
                         self.ensure_undecided()?;
                         let mut by_parent: HashMap<Revision, Vec<Revision>> =
                             HashMap::new();
                         for &child in self.undecided.as_ref().unwrap() {
                             let parents =
                                 ParentsIterator::graph_parents(&self.graph, child)?;
                             for parent in parents {
                                 by_parent
                                     .entry(parent)
                                     .or_insert_with(Vec::new)
                                     .push(child);
                             }
                         }
                         self.children_cache = Some(by_parent);
                     }
                     Ok(())
                 }
                 /// Provide statistics about the current state of the discovery process
                 pub fn stats(&self) -> DiscoveryStats {
                     DiscoveryStats {
                         undecided: self.undecided.as_ref().map(|s| s.len()),
                     }
                 }
                 /// Take a sample of the undecided revisions, starting from their heads.
                 ///
                 /// - if there are at most `size` undecided revisions, all of them are
                 ///   returned;
                 /// - if they have at least `size` heads, the heads are returned, after
                 ///   having been reduced to `size` elements by `limit_sample`;
                 /// - otherwise the heads are completed by `update_sample`, walking the
                 ///   parents from `headrevs` with `size` as quick sample size.
                 pub fn take_quick_sample(
                     &mut self,
                     headrevs: impl IntoIterator<Item = Revision>,
                     size: usize,
                 ) -> Result<Vec<Revision>, GraphError> {
                     self.ensure_undecided()?;
                     let mut heads = {
                         // scoped so that the shared borrow of `self.undecided` ends
                         // before `self` gets mutably borrowed by `limit_sample`
                         let pending = self.undecided.as_ref().unwrap();
                         if pending.len() <= size {
                             let everything: Vec<Revision> =
                                 pending.iter().cloned().collect();
                             return Ok(everything);
                         }
                         dagops::heads(&self.graph, pending.iter())?
                     };
                     if heads.len() < size {
                         let graph = &self.graph;
                         update_sample(
                             None,
                             headrevs,
                             &mut heads,
                             |rev| ParentsIterator::graph_parents(graph, rev),
                             Some(size),
                         )?;
                         return Ok(heads.into_iter().collect());
                     }
                     let candidates: Vec<Revision> = heads.into_iter().collect();
                     Ok(self.limit_sample(candidates, size))
                 }
                 /// Extract a sample from `self.undecided`, going from its heads and roots.
                 ///
                 /// The `size` parameter is used to avoid useless computations if
                 /// it turns out to be bigger than the whole set of undecided Revisions:
                 /// in that case, the whole undecided set is returned, together with
                 /// `size` itself.
                 ///
                 /// The sample is taken by using `update_sample` from the heads, then
                 /// from the roots, working on the reverse DAG,
                 /// expressed by `self.children_cache`.
                 ///
                 /// No effort is being made to complete or limit the sample to `size`
                 /// but this method returns another interesting size that it derives
                 /// from its knowledge of the structure of the various sets, leaving
                 /// to the caller the decision to use it or not.
                 ///
                 /// The return value is the pair made of the sample and of that
                 /// other size.
                 fn bidirectional_sample(
                     &mut self,
                     size: usize,
                 ) -> Result<(HashSet<Revision>, usize), GraphError> {
                     self.ensure_undecided()?;
                     {
                         // we don't want to compute children_cache before this
                         // but doing it after extracting self.undecided takes a mutable
                         // ref to self while a shareable one is still active.
                         let undecided = self.undecided.as_ref().unwrap();
                         if undecided.len() <= size {
                             return Ok((undecided.clone(), size));
                         }
                     }
                     self.ensure_children_cache()?;
                     let revs = self.undecided.as_ref().unwrap();
                     // start from a full copy of the undecided set, about to be
                     // reduced in place to its heads
                     let mut sample: HashSet<Revision> = revs.clone();
                     // it's possible that leveraging the children cache would be more
                     // efficient here
                     dagops::retain_heads(&self.graph, &mut sample)?;
                     let revsheads = sample.clone(); // was again heads(revs) in python
                     // update from heads
                     update_sample(
                         Some(revs),
                         revsheads.iter().cloned(),
                         &mut sample,
                         |r| ParentsIterator::graph_parents(&self.graph, r),
                         None,
                     )?;
                     // update from roots
                     let revroots: HashSet<Revision> =
                         dagops::roots(&self.graph, revs)?.into_iter().collect();
                     // never less than `size`; above it, the smaller of the numbers
                     // of roots and of heads
                     let prescribed_size = max(size, min(revroots.len(), revsheads.len()));
                     let children = self.children_cache.as_ref().unwrap();
                     // stands for the children of revisions that have no entry
                     // in the cache
                     let empty_vec: Vec<Revision> = Vec::new();
                     update_sample(
                         Some(revs),
                         revroots,
                         &mut sample,
                         |r| Ok(children.get(&r).unwrap_or(&empty_vec).iter().cloned()),
                         None,
                     )?;
                     Ok((sample, prescribed_size))
                 }
                 /// Complete `sample` up to `size` elements with undecided Revisions.
                 ///
                 /// The added revisions are picked by `limit_sample` among the
                 /// undecided ones that are not in `sample` yet. Nothing happens if
                 /// `sample` already has `size` elements or more.
                 ///
                 /// This is meant as a last resort, for cases where the regular
                 /// sampling algorithm does not return enough elements.
                 ///
                 /// Panics if the undecided set has not been computed yet.
                 fn random_complete_sample(
                     &mut self,
                     sample: &mut Vec<Revision>,
                     size: usize,
                 ) {
                     let current_len = sample.len();
                     if current_len >= size {
                         return;
                     }
                     let mut candidates: Vec<Revision> = Vec::new();
                     for rev in self.undecided.as_ref().unwrap() {
                         if !sample.contains(rev) {
                             candidates.push(*rev);
                         }
                     }
                     let extra = self.limit_sample(candidates, size - current_len);
                     sample.extend(extra);
                 }
                 /// Take a sample of the undecided revisions, from both heads and roots.
                 ///
                 /// The candidates come from `bidirectional_sample`. The target size is
                 /// `size` if `self.respect_size` is set, and the size prescribed by
                 /// `bidirectional_sample` otherwise. The candidates are reduced to
                 /// the target size by `limit_sample`, then completed up to it by
                 /// `random_complete_sample` if there are too few of them.
                 pub fn take_full_sample(
                     &mut self,
                     size: usize,
                 ) -> Result<Vec<Revision>, GraphError> {
                     let (candidates, prescribed_size) =
                         self.bidirectional_sample(size)?;
                     let target = if self.respect_size { size } else { prescribed_size };
                     let as_vec: Vec<Revision> = candidates.into_iter().collect();
                     let mut sample = self.limit_sample(as_vec, target);
                     self.random_complete_sample(&mut sample, target);
                     Ok(sample)
                 }
             }
             #[cfg(test)]
             mod tests {
                 use super::*;
                 use crate::testing::SampleGraph;
                 /// A PartialDiscovery as for pushing all the heads of `SampleGraph`
                 ///
-                /// To avoid actual randomness in tests, we give it a fixed random seed.
+                /// To avoid actual randomness in these tests, we give it a fixed
+                /// random seed, but by default we'll test the random version.
                 fn full_disco() -> PartialDiscovery<SampleGraph> {
                     PartialDiscovery::new_with_seed(
                         SampleGraph,
                         vec![10, 11, 12, 13],
                         [0; 16],
                         true,
+                        true,
                     )
                 }
                 /// A PartialDiscovery as for pushing the 12 head of `SampleGraph`
                 ///
                 /// To avoid actual randomness in tests, we give it a fixed random seed.
                 fn disco12() -> PartialDiscovery<SampleGraph> {
-                    PartialDiscovery::new_with_seed(SampleGraph, vec![12], [0; 16], true)
+                    PartialDiscovery::new_with_seed(
+                        SampleGraph,
+                        vec![12],
+                        [0; 16],
+                        true,
+                        true,
+                    )
                 }
                 /// The undecided revisions of `disco` as a sorted vector, so that
                 /// assertions don't depend on the iteration order of the set.
                 ///
                 /// Panics if the undecided set has not been computed yet.
                 fn sorted_undecided(
                     disco: &PartialDiscovery<SampleGraph>,
                 ) -> Vec<Revision> {
                     let mut as_vec: Vec<Revision> =
                         disco.undecided.as_ref().unwrap().iter().cloned().collect();
                     as_vec.sort();
                     as_vec
                 }
                 /// The missing revisions of `disco` as a sorted vector.
                 fn sorted_missing(disco: &PartialDiscovery<SampleGraph>) -> Vec<Revision> {
                     let mut as_vec: Vec<Revision> =
                         disco.missing.iter().cloned().collect();
                     as_vec.sort();
                     as_vec
                 }
                 /// The result of `disco.common_heads()` as a sorted vector.
                 fn sorted_common_heads(
                     disco: &PartialDiscovery<SampleGraph>,
                 ) -> Result<Vec<Revision>, GraphError> {
                     let mut as_vec: Vec<Revision> =
                         disco.common_heads()?.iter().cloned().collect();
                     as_vec.sort();
                     Ok(as_vec)
                 }
                 #[test]
                 fn test_add_common_get_undecided() -> Result<(), GraphError> {
                     let mut disco = full_disco();
                     assert_eq!(disco.undecided, None);
                     assert!(!disco.has_info());
                     assert_eq!(disco.stats().undecided, None);
                     disco.add_common_revisions(vec![11, 12])?;
                     assert!(disco.has_info());
                     assert!(!disco.is_complete());
                     assert!(disco.missing.is_empty());
                     // add_common_revisions did not trigger a premature computation
                     // of `undecided`, let's check that and ask for them
                     assert_eq!(disco.undecided, None);
                     disco.ensure_undecided()?;
                     assert_eq!(sorted_undecided(&disco), vec![5, 8, 10, 13]);
                     assert_eq!(disco.stats().undecided, Some(4));
                     Ok(())
                 }
                 /// in this test, we pretend that our peer misses exactly (8+10)::
                 /// and we're comparing all our repo to it (as in a bare push)
                 #[test]
                 fn test_discovery() -> Result<(), GraphError> {
                     let mut disco = full_disco();
                     disco.add_common_revisions(vec![11, 12])?;
                     disco.add_missing_revisions(vec![8, 10])?;
                     assert_eq!(sorted_undecided(&disco), vec![5]);
                     assert_eq!(sorted_missing(&disco), vec![8, 10, 13]);
                     assert!(!disco.is_complete());
                     disco.add_common_revisions(vec![5])?;
                     assert_eq!(sorted_undecided(&disco), vec![]);
                     assert_eq!(sorted_missing(&disco), vec![8, 10, 13]);
                     assert!(disco.is_complete());
                     assert_eq!(sorted_common_heads(&disco)?, vec![5, 11, 12]);
                     Ok(())
                 }
                 #[test]
                 fn test_limit_sample_no_need_to() {
                     // a sample that fits in the requested size is returned as is
                     let sample = vec![1, 2, 3, 4];
                     assert_eq!(full_disco().limit_sample(sample, 10), vec![1, 2, 3, 4]);
                 }
                 #[test]
                 fn test_limit_sample_less_than_half() {
                     // keeping 2 out of 5: the kept elements are the shuffled ones.
                     // The expected value is tied to the fixed seed of `full_disco()`
                     assert_eq!(full_disco().limit_sample((1..6).collect(), 2), vec![4, 2]);
                 }
                 #[test]
                 fn test_limit_sample_more_than_half() {
                     // keeping 2 out of 3: the dropped element is the shuffled one.
                     // The expected value is tied to the fixed seed of `full_disco()`
                     assert_eq!(full_disco().limit_sample((1..4).collect(), 2), vec![3, 2]);
                 }
                 #[test]
+                fn test_limit_sample_no_random() {
+                    let mut disco = full_disco();
+                    disco.randomize = false;
+                    assert_eq!(
+                        disco.limit_sample(vec![1, 8, 13, 5, 7, 3], 4),
+                        vec![1, 3, 5, 7]
+                    );
+                }
+                #[test]
                 fn test_quick_sample_enough_undecided_heads() -> Result<(), GraphError> {
                     // the undecided set is forced here, bypassing its computation
                     // from the target heads
                     let mut disco = full_disco();
                     disco.undecided = Some((1..=13).collect());
                     let mut sample_vec = disco.take_quick_sample(vec![], 4)?;
                     sample_vec.sort();
                     assert_eq!(sample_vec, vec![10, 11, 12, 13]);
                     Ok(())
                 }
                 #[test]
                 fn test_quick_sample_climbing_from_12() -> Result<(), GraphError> {
                     let mut disco = disco12();
                     disco.ensure_undecided()?;
                     let mut sample_vec = disco.take_quick_sample(vec![12], 4)?;
                     sample_vec.sort();
                     // r12's only parent is r9, whose unique grand-parent through the
                     // diamond shape is r4. This ends there because the distance from r4
                     // to the root is only 3.
                     assert_eq!(sample_vec, vec![4, 9, 12]);
                     Ok(())
                 }
                 #[test]
                 fn test_children_cache() -> Result<(), GraphError> {
                     let mut disco = full_disco();
                     disco.ensure_children_cache()?;
                     let cache = disco.children_cache.unwrap();
                     assert_eq!(cache.get(&2).cloned(), Some(vec![4]));
                     // r10 is one of the target heads of `full_disco()`: no entry
                     assert_eq!(cache.get(&10).cloned(), None);
                     // children lists are sorted before comparison, their order
                     // in the cache not being asserted
                     let mut children_4 = cache.get(&4).cloned().unwrap();
                     children_4.sort();
                     assert_eq!(children_4, vec![5, 6, 7]);
                     let mut children_7 = cache.get(&7).cloned().unwrap();
                     children_7.sort();
                     assert_eq!(children_7, vec![9, 11]);
                     Ok(())
                 }
                 #[test]
                 fn test_complete_sample() {
                     let mut disco = full_disco();
                     let undecided: HashSet<Revision> =
                         [4, 7, 9, 2, 3].iter().cloned().collect();
                     disco.undecided = Some(undecided);
                     // too small a sample gets completed up to the wished size
                     let mut sample = vec![0];
                     disco.random_complete_sample(&mut sample, 3);
                     assert_eq!(sample.len(), 3);
                     // a sample that is already big enough is left alone
                     let mut sample = vec![2, 4, 7];
                     disco.random_complete_sample(&mut sample, 1);
                     assert_eq!(sample.len(), 3);
                 }
                 #[test]
                 fn test_bidirectional_sample() -> Result<(), GraphError> {
                     let mut disco = full_disco();
                     disco.undecided = Some((0..=13).into_iter().collect());
                     let (sample_set, size) = disco.bidirectional_sample(7)?;
                     assert_eq!(size, 7);
                     let mut sample: Vec<Revision> = sample_set.into_iter().collect();
                     sample.sort();
                     // our DAG is a bit too small for the results to be really interesting
                     // at least it shows that
                     // - we went both ways
                     // - we didn't take all Revisions (6 is not in the sample)
                     assert_eq!(sample, vec![0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13]);
                     Ok(())
                 }
             }

rust/hg-cpython/src/discovery.rs

0 +4 -2

             // discovery.rs
             //
             // Copyright 2018 Georges Racinet <gracinet@anybox.fr>
             //
             // This software may be used and distributed according to the terms of the
             // GNU General Public License version 2 or any later version.
             //! Bindings for the `hg::discovery` module provided by the
             //! `hg-core` crate. From Python, this will be seen as `rustext.discovery`
             //!
             //! # Classes visible from Python:
             //! - [`PartialDiscover`] is the Rust implementation of
             //!   `mercurial.setdiscovery.partialdiscovery`.
             use crate::{
                 cindex::Index,
                 conversion::{py_set, rev_pyiter_collect},
                 exceptions::GraphError,
             };
             use cpython::{
                 exc, ObjectProtocol, PyDict, PyErr, PyModule, PyObject, PyResult,
                 PyTuple, Python, PythonObject, ToPyObject,
             };
             use hg::discovery::PartialDiscovery as CorePartialDiscovery;
             use hg::Revision;
             use std::cell::RefCell;
             py_class!(pub class PartialDiscovery |py| {
                 data inner: RefCell<Box<CorePartialDiscovery<Index>>>;
                 // Build the core discovery object from `repo.changelog.index` and
                 // from the `targetheads` revisions.
                 //
                 // `respectsize` and `randomize` are forwarded unchanged to the core
                 // constructor. `randomize` may be omitted by Python callers, in
                 // which case it is true.
                 def __new__(
                     _cls,
                     repo: PyObject,
                     targetheads: PyObject,
-                    respectsize: bool
+                    respectsize: bool,
+                    randomize: bool = true
                 ) -> PyResult<PartialDiscovery> {
                     let index = repo.getattr(py, "changelog")?.getattr(py, "index")?;
                     Self::create_instance(
                         py,
                         RefCell::new(Box::new(CorePartialDiscovery::new(
                             Index::new(py, index)?,
                             rev_pyiter_collect(py, &targetheads)?,
-                            respectsize
+                            respectsize,
+                            randomize,
                         )))
                     )
                 }
                 // Register an iterable of revisions as known to the peer.
                 // Errors of the core are converted to the Python `GraphError`.
                 def addcommons(&self, commons: PyObject) -> PyResult<PyObject> {
                     let mut inner = self.inner(py).borrow_mut();
                     let commons_vec: Vec<Revision> = rev_pyiter_collect(py, &commons)?;
                     inner.add_common_revisions(commons_vec)
                         .map_err(|e| GraphError::pynew(py, e))?;
                     Ok(py.None())
                 }
                 // Register an iterable of revisions as unknown to the peer.
                 // Errors of the core are converted to the Python `GraphError`.
                 def addmissings(&self, missings: PyObject) -> PyResult<PyObject> {
                     let mut inner = self.inner(py).borrow_mut();
                     let missings_vec: Vec<Revision> = rev_pyiter_collect(py, &missings)?;
                     inner.add_missing_revisions(missings_vec)
                         .map_err(|e| GraphError::pynew(py, e))?;
                     Ok(py.None())
                 }
                 // Register the answers of the peer about a sample.
                 //
                 // `sample` is an iterable of `(revision, known)` pairs. Known
                 // revisions are registered as common first, then the other ones as
                 // missing. A pair with less than two elements raises `ValueError`
                 // instead of making the Rust code panic; elements beyond the second
                 // one are ignored.
                 def addinfo(&self, sample: PyObject) -> PyResult<PyObject> {
                     let mut missing: Vec<Revision> = Vec::new();
                     let mut common: Vec<Revision> = Vec::new();
                     let malformed = || PyErr::new::<exc::ValueError, _>(
                         py,
                         "addinfo() expects an iterable of (revision, known) pairs",
                     );
                     for info in sample.iter(py)? { // info is a pair (Revision, bool)
                         let mut revknown = info?.iter(py)?;
                         let rev: Revision = match revknown.next() {
                             Some(item) => item?.extract(py)?,
                             None => return Err(malformed()),
                         };
                         let known: bool = match revknown.next() {
                             Some(item) => item?.extract(py)?,
                             None => return Err(malformed()),
                         };
                         if known {
                             common.push(rev);
                         } else {
                             missing.push(rev);
                         }
                     }
                     let mut inner = self.inner(py).borrow_mut();
                     inner.add_common_revisions(common)
                         .map_err(|e| GraphError::pynew(py, e))?;
                     inner.add_missing_revisions(missing)
                         .map_err(|e| GraphError::pynew(py, e))?;
                     Ok(py.None())
                 }
                 // Same as `has_info()` on the core object.
                 def hasinfo(&self) -> PyResult<bool> {
                     Ok(self.inner(py).borrow().has_info())
                 }
                 // Same as `is_complete()` on the core object.
                 def iscomplete(&self) -> PyResult<bool> {
                     Ok(self.inner(py).borrow().is_complete())
                 }
                 // Statistics of the core object as a dict. Its only key for now
                 // is "undecided", whose value is `None` if the core has no figure
                 // to report yet.
                 def stats(&self) -> PyResult<PyDict> {
                     let stats = self.inner(py).borrow().stats();
                     let as_dict: PyDict = PyDict::new(py);
                     as_dict.set_item(py, "undecided",
                                      stats.undecided.map(
                                          |l| l.to_py_object(py).into_object())
                                          .unwrap_or_else(|| py.None()))?;
                     Ok(as_dict)
                 }
                 // Heads of the currently known common revisions, as a Python set.
                 def commonheads(&self) -> PyResult<PyObject> {
                     py_set(
                         py,
                         &self.inner(py).borrow().common_heads()
                             .map_err(|e| GraphError::pynew(py, e))?
                     )
                 }
                 // Full sample of at most `size` revisions, unless the core was
                 // told not to respect the size, returned as a tuple.
                 // `_headrevs` is not used by this implementation.
                 def takefullsample(&self, _headrevs: PyObject,
                                    size: usize) -> PyResult<PyObject> {
                     let mut inner = self.inner(py).borrow_mut();
                     let sample = inner.take_full_sample(size)
                         .map_err(|e| GraphError::pynew(py, e))?;
                     let as_vec: Vec<PyObject> = sample
                         .iter()
                         .map(|rev| rev.to_py_object(py).into_object())
                         .collect();
                     Ok(PyTuple::new(py, as_vec.as_slice()).into_object())
                 }
                 // Quick sample taken from the `headrevs` revisions, returned as
                 // a tuple.
                 def takequicksample(&self, headrevs: PyObject,
                                     size: usize) -> PyResult<PyObject> {
                     let mut inner = self.inner(py).borrow_mut();
                     let revsvec: Vec<Revision> = rev_pyiter_collect(py, &headrevs)?;
                     let sample = inner.take_quick_sample(revsvec, size)
                         .map_err(|e| GraphError::pynew(py, e))?;
                     let as_vec: Vec<PyObject> = sample
                         .iter()
                         .map(|rev| rev.to_py_object(py).into_object())
                         .collect();
                     Ok(PyTuple::new(py, as_vec.as_slice()).into_object())
                 }
             });
             /// Build the `discovery` Python module as a submodule of `package`.
             ///
             /// The module gets its `__package__` and `__doc__` attributes, exposes
             /// the `PartialDiscovery` class and is registered in `sys.modules`
             /// under its dotted name.
             pub fn init_module(py: Python, package: &str) -> PyResult<PyModule> {
                 let full_name = format!("{}.discovery", package);
                 let module = PyModule::new(py, &full_name)?;
                 module.add(py, "__package__", package)?;
                 module.add(
                     py,
                     "__doc__",
                     "Discovery of common node sets - Rust implementation",
                 )?;
                 module.add_class::<PartialDiscovery>(py)?;
                 let sys_modules: PyDict = PyModule::import(py, "sys")?
                     .get(py, "modules")?
                     .extract(py)?;
                 sys_modules.set_item(py, &full_name, &module)?;
                 // Example C code (see pyexpat.c and import.c) will "give away the
                 // reference", but we won't because it will be consumed once the
                 // Rust PyObject is dropped.
                 Ok(module)
             }

tests/test-rust-discovery.py

0 +3 0

             from __future__ import absolute_import
             import unittest
             from mercurial import policy
             PartialDiscovery = policy.importrust('discovery', member='PartialDiscovery')
             try:
                 from mercurial.cext import parsers as cparsers
             except ImportError:
                 cparsers = None
             # picked from test-parse-index2, copied rather than imported
             # so that it stays stable even if test-parse-index2 changes or disappears.
             #
             # This is raw index data, given as is to the C index parser. The
             # `testindex` test below checks the assumption made about it by the
             # other tests: four revisions, each one but the first having the
             # previous one as only parent.
             data_non_inlined = (
                 b'\x00\x00\x00\x01\x00\x00\x00\x00\x00\x01D\x19'
                 b'\x00\x07e\x12\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\xff\xff'
                 b'\xff\xff\xff\xff\xd1\xf4\xbb\xb0\xbe\xfc\x13\xbd\x8c\xd3\x9d'
                 b'\x0f\xcd\xd9;\x8c\x07\x8cJ/\x00\x00\x00\x00\x00\x00\x00\x00\x00'
                 b'\x00\x00\x00\x00\x00\x00\x01D\x19\x00\x00\x00\x00\x00\xdf\x00'
                 b'\x00\x01q\x00\x00\x00\x01\x00\x00\x00\x01\x00\x00\x00\x00\xff'
                 b'\xff\xff\xff\xc1\x12\xb9\x04\x96\xa4Z1t\x91\xdfsJ\x90\xf0\x9bh'
                 b'\x07l&\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00'
                 b'\x00\x01D\xf8\x00\x00\x00\x00\x01\x1b\x00\x00\x01\xb8\x00\x00'
                 b'\x00\x01\x00\x00\x00\x02\x00\x00\x00\x01\xff\xff\xff\xff\x02\n'
                 b'\x0e\xc6&\xa1\x92\xae6\x0b\x02i\xfe-\xe5\xbao\x05\xd1\xe7\x00'
                 b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01F'
                 b'\x13\x00\x00\x00\x00\x01\xec\x00\x00\x03\x06\x00\x00\x00\x01'
                 b'\x00\x00\x00\x03\x00\x00\x00\x02\xff\xff\xff\xff\x12\xcb\xeby1'
                 b'\xb6\r\x98B\xcb\x07\xbd`\x8f\x92\xd9\xc4\x84\xbdK\x00\x00\x00'
                 b'\x00\x00\x00\x00\x00\x00\x00\x00\x00'
                 )
             class fakerepo(object):
                 """Minimal stand-in for a repository, exposing `changelog.index` only.

                 The object acts as its own changelog.
                 """
                 def __init__(self, idx):
                     """Arrange for `self.changelog.index` to be the given `idx`."""
                     self.changelog = self
                     self.index = idx
             @unittest.skipIf(PartialDiscovery is None or cparsers is None,
                              "rustext or the C Extension parsers module "
                              "discovery relies on is not available")
             class rustdiscoverytest(unittest.TestCase):
                 """Test the correctness of binding to Rust code.
                 This test is merely for the binding to Rust itself: extraction of
                 Python variable, giving back the results etc.
                 It is not meant to test the algorithmic correctness of the provided
                 methods. Hence the very simple embedded index data is good enough.
                 Algorithmic correctness is asserted by the Rust unit tests.
                 """
                 def parseindex(self):
                     return cparsers.parse_index2(data_non_inlined, False)[0]
                 def repo(self):
                     return fakerepo(self.parseindex())
                 def testindex(self):
                     idx = self.parseindex()
                     # checking our assumptions about the index binary data:
                     self.assertEqual({i: (r[5], r[6]) for i, r in enumerate(idx)},
                                      {0: (-1, -1),
 : (0, -1),
 : (1, -1),
 : (2, -1)})
                 def testaddcommonsmissings(self):
                     disco = PartialDiscovery(self.repo(), [3], True)
                     self.assertFalse(disco.hasinfo())
                     self.assertFalse(disco.iscomplete())
                     disco.addcommons([1])
                     self.assertTrue(disco.hasinfo())
                     self.assertFalse(disco.iscomplete())
                     disco.addmissings([2])
                     self.assertTrue(disco.hasinfo())
                     self.assertTrue(disco.iscomplete())
                     self.assertEqual(disco.commonheads(), {1})
                 def testaddmissingsstats(self):
                     disco = PartialDiscovery(self.repo(), [3], True)
                     self.assertIsNone(disco.stats()['undecided'], None)
                     disco.addmissings([2])
                     self.assertEqual(disco.stats()['undecided'], 2)
                 def testaddinfocommonfirst(self):
                     disco = PartialDiscovery(self.repo(), [3], True)
                     disco.addinfo([(1, True), (2, False)])
                     self.assertTrue(disco.hasinfo())
                     self.assertTrue(disco.iscomplete())
                     self.assertEqual(disco.commonheads(), {1})
                 def testaddinfomissingfirst(self):
                     disco = PartialDiscovery(self.repo(), [3], True)
                     disco.addinfo([(2, False), (1, True)])
                     self.assertTrue(disco.hasinfo())
                     self.assertTrue(disco.iscomplete())
                     self.assertEqual(disco.commonheads(), {1})
+                def testinitnorandom(self):
+                    PartialDiscovery(self.repo(), [3], True, randomize=False)
             if __name__ == '__main__':
                 import silenttestrunner
                 silenttestrunner.main(__name__)

General Comments 0

Write
Preview

You need to be logged in to leave comments. Login now

No TODOs yet

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages