upstream/mercurial-mirror Commit - r42966:8041a1b4

rust-discovery: takefullsample() core implementation...

Georges Racinet -

r42966:8041a1b4 default

parent child

rust/hg-core/src/discovery.rs

0 +233 -11

@@ -17,6 +17,7 b' use self::rand::{thread_rng, RngCore, Se'
17	use super::{Graph, GraphError, Revision, NULL_REVISION};	17	use super::{Graph, GraphError, Revision, NULL_REVISION};
18	use crate::ancestors::MissingAncestors;	18	use crate::ancestors::MissingAncestors;
19	use crate::dagops;	19	use crate::dagops;
		20	use std::cmp::{max, min};
20	use std::collections::{HashMap, HashSet, VecDeque};	21	use std::collections::{HashMap, HashSet, VecDeque};
21		22
22	type Rng = self::rand_pcg::Pcg32;	23	type Rng = self::rand_pcg::Pcg32;
@@ -26,8 +27,10 b' pub struct PartialDiscovery<G: Graph + C'
26	graph: G, // plays the role of self._repo	27	graph: G, // plays the role of self._repo
27	common: MissingAncestors<G>,	28	common: MissingAncestors<G>,
28	undecided: Option<HashSet<Revision>>,	29	undecided: Option<HashSet<Revision>>,
		30	children_cache: Option<HashMap<Revision, Vec<Revision>>>,
29	missing: HashSet<Revision>,	31	missing: HashSet<Revision>,
30	rng: Rng,	32	rng: Rng,
		33	respect_size: bool,
31	}	34	}
32		35
33	pub struct DiscoveryStats {	36	pub struct DiscoveryStats {
@@ -49,13 +52,16 b' pub struct DiscoveryStats {'
49	/// - `sample`: a sample to update	52	/// - `sample`: a sample to update
50	/// - `parentfn`: a callable to resolve parents for a revision	53	/// - `parentfn`: a callable to resolve parents for a revision
51	/// - `quicksamplesize`: optional target size of the sample	54	/// - `quicksamplesize`: optional target size of the sample
52	fn update_sample(	55	fn update_sample<I>(
53	revs: Option<&HashSet<Revision>>,	56	revs: Option<&HashSet<Revision>>,
54	heads: impl IntoIterator<Item = Revision>,	57	heads: impl IntoIterator<Item = Revision>,
55	sample: &mut HashSet<Revision>,	58	sample: &mut HashSet<Revision>,
56	parentsfn: impl Fn(Revision) -> Result<[~~Revision~~; 2], GraphError>,	59	parentsfn: impl Fn(Revision) -> Result<I, GraphError>,
57	quicksamplesize: Option<usize>,	60	quicksamplesize: Option<usize>,
58	) -> Result<(), GraphError> {	61	) -> Result<(), GraphError>
		62	where
		63	I: Iterator<Item = Revision>,
		64	{
59	let mut distances: HashMap<Revision, u32> = HashMap::new();	65	let mut distances: HashMap<Revision, u32> = HashMap::new();
60	let mut visit: VecDeque<Revision> = heads.into_iter().collect();	66	let mut visit: VecDeque<Revision> = heads.into_iter().collect();
61	let mut factor: u32 = 1;	67	let mut factor: u32 = 1;
@@ -83,10 +89,7 b' fn update_sample('
83	}	89	}
84	}	90	}
85	}	91	}
86	for &p in &parentsfn(current)? {	92	for p in parentsfn(current)? {
87	if p == NULL_REVISION {
88	continue;
89	}
90	if let Some(revs) = revs {	93	if let Some(revs) = revs {
91	if !revs.contains(&p) {	94	if !revs.contains(&p) {
92	continue;	95	continue;
@@ -99,6 +102,39 b' fn update_sample('
99	Ok(())	102	Ok(())
100	}	103	}
101		104
		105	struct ParentsIterator {
		106	parents: [Revision; 2],
		107	cur: usize,
		108	}
		109
		110	impl ParentsIterator {
		111	fn graph_parents(
		112	graph: &impl Graph,
		113	r: Revision,
		114	) -> Result<ParentsIterator, GraphError> {
		115	Ok(ParentsIterator {
		116	parents: graph.parents(r)?,
		117	cur: 0,
		118	})
		119	}
		120	}
		121
		122	impl Iterator for ParentsIterator {
		123	type Item = Revision;
		124
		125	fn next(&mut self) -> Option<Revision> {
		126	if self.cur > 1 {
		127	return None;
		128	}
		129	let rev = self.parents[self.cur];
		130	self.cur += 1;
		131	if rev == NULL_REVISION {
		132	return self.next();
		133	}
		134	Some(rev)
		135	}
		136	}
		137
102	impl<G: Graph + Clone> PartialDiscovery<G> {	138	impl<G: Graph + Clone> PartialDiscovery<G> {
103	/// Create a PartialDiscovery object, with the intent	139	/// Create a PartialDiscovery object, with the intent
104	/// of comparing our `::<target_heads>` revset to the contents of another	140	/// of comparing our `::<target_heads>` revset to the contents of another
@@ -110,24 +146,36 b' impl<G: Graph + Clone> PartialDiscovery<'
110	/// If we want to make the signature more flexible,	146	/// If we want to make the signature more flexible,
111	/// we'll have to make it a type argument of `PartialDiscovery` or a trait	147	/// we'll have to make it a type argument of `PartialDiscovery` or a trait
112	/// object since we'll keep it in the meanwhile	148	/// object since we'll keep it in the meanwhile
113	pub fn new(graph: G, target_heads: Vec<Revision>) -> Self {	149	///
		150	/// The `respect_size` boolean controls how the sampling methods
		151	/// will interpret the size argument requested by the caller. If it's
		152	/// `false`, they are allowed to produce a sample whose size is more
		153	/// appropriate to the situation (typically bigger).
		154	pub fn new(
		155	graph: G,
		156	target_heads: Vec<Revision>,
		157	respect_size: bool,
		158	) -> Self {
114	let mut seed: [u8; 16] = [0; 16];	159	let mut seed: [u8; 16] = [0; 16];
115	thread_rng().fill_bytes(&mut seed);	160	thread_rng().fill_bytes(&mut seed);
116	Self::new_with_seed(graph, target_heads, seed)	161	Self::new_with_seed(graph, target_heads, seed, respect_size)
117	}	162	}
118		163
119	pub fn new_with_seed(	164	pub fn new_with_seed(
120	graph: G,	165	graph: G,
121	target_heads: Vec<Revision>,	166	target_heads: Vec<Revision>,
122	seed: [u8; 16],	167	seed: [u8; 16],
		168	respect_size: bool,
123	) -> Self {	169	) -> Self {
124	PartialDiscovery {	170	PartialDiscovery {
125	undecided: None,	171	undecided: None,
		172	children_cache: None,
126	target_heads: Some(target_heads),	173	target_heads: Some(target_heads),
127	graph: graph.clone(),	174	graph: graph.clone(),
128	common: MissingAncestors::new(graph, vec![]),	175	common: MissingAncestors::new(graph, vec![]),
129	missing: HashSet::new(),	176	missing: HashSet::new(),
130	rng: Rng::from_seed(seed),	177	rng: Rng::from_seed(seed),
		178	respect_size: respect_size,
131	}	179	}
132	}	180	}
133		181
@@ -228,6 +276,22 b' impl<G: Graph + Clone> PartialDiscovery<'
228	Ok(())	276	Ok(())
229	}	277	}
230		278
		279	fn ensure_children_cache(&mut self) -> Result<(), GraphError> {
		280	if self.children_cache.is_some() {
		281	return Ok(());
		282	}
		283	self.ensure_undecided()?;
		284
		285	let mut children: HashMap<Revision, Vec<Revision>> = HashMap::new();
		286	for &rev in self.undecided.as_ref().unwrap() {
		287	for p in ParentsIterator::graph_parents(&self.graph, rev)? {
		288	children.entry(p).or_insert_with(\|\| Vec::new()).push(rev);
		289	}
		290	}
		291	self.children_cache = Some(children);
		292	Ok(())
		293	}
		294
231	/// Provide statistics about the current state of the discovery process	295	/// Provide statistics about the current state of the discovery process
232	pub fn stats(&self) -> DiscoveryStats {	296	pub fn stats(&self) -> DiscoveryStats {
233	DiscoveryStats {	297	DiscoveryStats {
@@ -255,11 +319,114 b' impl<G: Graph + Clone> PartialDiscovery<'
255	None,	319	None,
256	headrevs,	320	headrevs,
257	&mut sample,	321	&mut sample,
258	\|r\| self.graph.parents(r),	322	\|r\| ParentsIterator::graph_parents(&self.graph, r),
259	Some(size),	323	Some(size),
260	)?;	324	)?;
261	Ok(sample.into_iter().collect())	325	Ok(sample.into_iter().collect())
262	}	326	}
		327
		328	/// Extract a sample from `self.undecided`, going from its heads and roots.
		329	///
		330	/// The `size` parameter is used to avoid useless computations if
		331	/// it turns out to be bigger than the whole set of undecided Revisions.
		332	///
		333	/// The sample is taken by using `update_sample` from the heads, then
		334	/// from the roots, working on the reverse DAG,
		335	/// expressed by `self.children_cache`.
		336	///
		337	/// No effort is being made to complete or limit the sample to `size`
		338	/// but this method returns another interesting size that it derives
		339	/// from its knowledge of the structure of the various sets, leaving
		340	/// to the caller the decision to use it or not.
		341	fn bidirectional_sample(
		342	&mut self,
		343	size: usize,
		344	) -> Result<(HashSet<Revision>, usize), GraphError> {
		345	self.ensure_undecided()?;
		346	{
		347	// we don't want to compute children_cache before this
		348	// but doing it after extracting self.undecided takes a mutable
		349	// ref to self while a shareable one is still active.
		350	let undecided = self.undecided.as_ref().unwrap();
		351	if undecided.len() <= size {
		352	return Ok((undecided.clone(), size));
		353	}
		354	}
		355
		356	self.ensure_children_cache()?;
		357	let revs = self.undecided.as_ref().unwrap();
		358	let mut sample: HashSet<Revision> = revs.clone();
		359
		360	// it's possible that leveraging the children cache would be more
		361	// efficient here
		362	dagops::retain_heads(&self.graph, &mut sample)?;
		363	let revsheads = sample.clone(); // was again heads(revs) in python
		364
		365	// update from heads
		366	update_sample(
		367	Some(revs),
		368	revsheads.iter().cloned(),
		369	&mut sample,
		370	\|r\| ParentsIterator::graph_parents(&self.graph, r),
		371	None,
		372	)?;
		373
		374	// update from roots
		375	let revroots: HashSet<Revision> =
		376	dagops::roots(&self.graph, revs)?.into_iter().collect();
		377	let prescribed_size = max(size, min(revroots.len(), revsheads.len()));
		378
		379	let children = self.children_cache.as_ref().unwrap();
		380	let empty_vec: Vec<Revision> = Vec::new();
		381	update_sample(
		382	Some(revs),
		383	revroots,
		384	&mut sample,
		385	\|r\| Ok(children.get(&r).unwrap_or(&empty_vec).iter().cloned()),
		386	None,
		387	)?;
		388	Ok((sample, prescribed_size))
		389	}
		390
		391	/// Fill up sample up to the wished size with random undecided Revisions.
		392	///
		393	/// This is intended to be used as a last resort completion if the
		394	/// regular sampling algorithm returns too few elements.
		395	fn random_complete_sample(
		396	&mut self,
		397	sample: &mut Vec<Revision>,
		398	size: usize,
		399	) {
		400	let sample_len = sample.len();
		401	if size <= sample_len {
		402	return;
		403	}
		404	let take_from: Vec<Revision> = self
		405	.undecided
		406	.as_ref()
		407	.unwrap()
		408	.iter()
		409	.filter(\|&r\| !sample.contains(r))
		410	.cloned()
		411	.collect();
		412	sample.extend(self.limit_sample(take_from, size - sample_len));
		413	}
		414
		415	pub fn take_full_sample(
		416	&mut self,
		417	size: usize,
		418	) -> Result<Vec<Revision>, GraphError> {
		419	let (sample_set, prescribed_size) = self.bidirectional_sample(size)?;
		420	let size = if self.respect_size {
		421	size
		422	} else {
		423	prescribed_size
		424	};
		425	let mut sample =
		426	self.limit_sample(sample_set.into_iter().collect(), size);
		427	self.random_complete_sample(&mut sample, size);
		428	Ok(sample)
		429	}
263	}	430	}
264		431
265	#[cfg(test)]	432	#[cfg(test)]
@@ -275,6 +442,7 b' mod tests {'
275	SampleGraph,	442	SampleGraph,
276	vec![10, 11, 12, 13],	443	vec![10, 11, 12, 13],
277	[0; 16],	444	[0; 16],
		445	true,
278	)	446	)
279	}	447	}
280		448
@@ -282,7 +450,7 b' mod tests {'
282	///	450	///
283	/// To avoid actual randomness in tests, we give it a fixed random seed.	451	/// To avoid actual randomness in tests, we give it a fixed random seed.
284	fn disco12() -> PartialDiscovery<SampleGraph> {	452	fn disco12() -> PartialDiscovery<SampleGraph> {
285	PartialDiscovery::new_with_seed(SampleGraph, vec![12], [0; 16])	453	PartialDiscovery::new_with_seed(SampleGraph, vec![12], [0; 16], true)
286	}	454	}
287		455
288	fn sorted_undecided(	456	fn sorted_undecided(
@@ -390,4 +558,58 b' mod tests {'
390	assert_eq!(sample_vec, vec![4, 9, 12]);	558	assert_eq!(sample_vec, vec![4, 9, 12]);
391	Ok(())	559	Ok(())
392	}	560	}
		561
		562	#[test]
		563	fn test_children_cache() -> Result<(), GraphError> {
		564	let mut disco = full_disco();
		565	disco.ensure_children_cache()?;
		566
		567	let cache = disco.children_cache.unwrap();
		568	assert_eq!(cache.get(&2).cloned(), Some(vec![4]));
		569	assert_eq!(cache.get(&10).cloned(), None);
		570
		571	let mut children_4 = cache.get(&4).cloned().unwrap();
		572	children_4.sort();
		573	assert_eq!(children_4, vec![5, 6, 7]);
		574
		575	let mut children_7 = cache.get(&7).cloned().unwrap();
		576	children_7.sort();
		577	assert_eq!(children_7, vec![9, 11]);
		578
		579	Ok(())
		580	}
		581
		582	#[test]
		583	fn test_complete_sample() {
		584	let mut disco = full_disco();
		585	let undecided: HashSet<Revision> =
		586	[4, 7, 9, 2, 3].iter().cloned().collect();
		587	disco.undecided = Some(undecided);
		588
		589	let mut sample = vec![0];
		590	disco.random_complete_sample(&mut sample, 3);
		591	assert_eq!(sample.len(), 3);
		592
		593	let mut sample = vec![2, 4, 7];
		594	disco.random_complete_sample(&mut sample, 1);
		595	assert_eq!(sample.len(), 3);
		596	}
		597
		598	#[test]
		599	fn test_bidirectional_sample() -> Result<(), GraphError> {
		600	let mut disco = full_disco();
		601	disco.undecided = Some((0..=13).into_iter().collect());
		602
		603	let (sample_set, size) = disco.bidirectional_sample(7)?;
		604	assert_eq!(size, 7);
		605	let mut sample: Vec<Revision> = sample_set.into_iter().collect();
		606	sample.sort();
		607	// our DAG is a bit too small for the results to be really interesting
		608	// at least it shows that
		609	// - we went both ways
		610	// - we didn't take all Revisions (6 is not in the sample)
		611	assert_eq!(sample, vec![0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13]);
		612	Ok(())
		613	}
		614
393	}	615	}

rust/hg-cpython/src/discovery.rs

0 +2 -1

                     _cls,
                     repo: PyObject,
                     targetheads: PyObject,
-                    _respectsize: bool
+                    respectsize: bool
                 ) -> PyResult<PartialDiscovery> {
                     let index = repo.getattr(py, "changelog")?.getattr(py, "index")?;
                     Self::create_instance(
                         RefCell::new(Box::new(CorePartialDiscovery::new(
                             Index::new(py, index)?,
                             rev_pyiter_collect(py, &targetheads)?,
+                            respectsize
                         )))
                     )
                 }

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages