upstream/mercurial-mirror Files · contrib/merge-lists/src/main.rs

rhg: support "!" syntax for disabling extensions...

rhg: support "!" syntax for disabling extensions This makes it so that calls in test-log.t do not fall back immediately because of the disabled extension, instead going through the CLI parsing code, which breaks because of invalid UTF-8 in a flag. I *think* clap 3.x+ supports this? I'm not sure, and we have to upgrade the minimum Rust version to use clap 3.x anyway which is out of scope for this series, so let's just kick that can down the road a little bit.

Martin von Zweigbergk - - Load All Authors

File last commit:

r49875:b999edb1 default


                r50372:f3cd2d6e

default

Download file

             main.rs
        
                    300 lines
            
             | 10.0 KiB
            
                | application/rls-services+xml
            
             |
                RustLexer
            
             / contrib / merge-lists / src / main.rs
          
                    History
                
                 |
                  Source
                 | Raw
                 |Copy content
                 |Copy permalink

        Martin von Zweigbergk
    
merge-lists: make it possible to specify pattern to match...

              r49875
            
      use clap::{ArgGroup, Parser};

        Martin von Zweigbergk
    
contrib: add a partial-merge tool for sorted lists (such as Python imports)...

              r49874
            
      use itertools::Itertools;

      use regex::bytes::Regex;

      use similar::ChangeTag;

      use std::cmp::{max, min, Ordering};

      use std::collections::HashSet;

      use std::ffi::OsString;

      use std::ops::Range;

      use std::path::PathBuf;

      /// Runs a line-based patience diff of `old_bytes` against `new_bytes` and
      /// returns, for every change the diff tags as `Equal`, the byte range it
      /// covers in the old input paired with the byte range it covers in the new
      /// input.
      fn find_unchanged_ranges(
          old_bytes: &[u8],
          new_bytes: &[u8],
      ) -> Vec<(Range<usize>, Range<usize>)> {
          let line_diff = similar::TextDiff::configure()
              .algorithm(similar::Algorithm::Patience)
              .diff_lines(old_bytes, new_bytes);
          let mut unchanged = Vec::new();
          // Byte cursors into the old and the new input, advanced as the diff
          // is replayed.
          let (mut old_pos, mut new_pos) = (0, 0);
          for change in line_diff.iter_all_changes() {
              let len = change.value().len();
              match change.tag() {
                  ChangeTag::Delete => old_pos += len,
                  ChangeTag::Insert => new_pos += len,
                  ChangeTag::Equal => {
                      unchanged.push((old_pos..old_pos + len, new_pos..new_pos + len));
                      old_pos += len;
                      new_pos += len;
                  }
              }
          }
          unchanged
      }

      /// Returns a list of all the lines in the input (including trailing newlines),

      /// but only if they all match the regex and they are sorted.

      fn get_lines<'input>(

          input: &'input [u8],

          regex: &Regex,

      ) -> Option<Vec<&'input [u8]>> {

          let lines = input.split_inclusive(|x| *x == b'\n').collect_vec();

          let mut previous_line = "".as_bytes();

          for line in &lines {

              if *line < previous_line {

                  return None;

              }

              if !regex.is_match(line) {

                  return None;

              }

              previous_line = line;

          }

          Some(lines)

      }

      /// Tries to resolve one conflicting region as a 3-way merge of sorted line
      /// sets.
      ///
      /// Each of the three slices is split with `get_lines`; if any of them is
      /// rejected there, the region is left unresolved (`None`). Otherwise the
      /// result is the local line set, plus the lines `other` added relative to
      /// the base, minus the lines `other` removed relative to the base, sorted
      /// and concatenated.
      ///
      /// `None` is also returned when the merged result would place a line
      /// without a trailing `\n` (an unterminated last line of a file) in front
      /// of another line: concatenating would fuse the two lines into one.
      fn resolve_conflict(
          base_slice: &[u8],
          local_slice: &[u8],
          other_slice: &[u8],
          regex: &Regex,
      ) -> Option<Vec<u8>> {
          let base_lines: HashSet<&[u8]> =
              get_lines(base_slice, regex)?.into_iter().collect();
          let mut merged: HashSet<&[u8]> =
              get_lines(local_slice, regex)?.into_iter().collect();
          let other_lines: HashSet<&[u8]> =
              get_lines(other_slice, regex)?.into_iter().collect();

          // Apply the base -> other delta on top of the local side.
          merged.extend(other_lines.difference(&base_lines).copied());
          for removed in base_lines.difference(&other_lines) {
              merged.remove(removed);
          }

          let mut sorted_lines: Vec<&[u8]> = merged.into_iter().collect();
          sorted_lines.sort_unstable();

          // Only the very last line may lack its terminator; anywhere else it
          // would be glued to the following line by `concat()`.
          if let Some((_, leading)) = sorted_lines.split_last() {
              if leading.iter().any(|line| !line.ends_with(b"\n")) {
                  return None;
              }
          }
          Some(sorted_lines.concat())
      }

      fn resolve(

          base_bytes: &[u8],

          local_bytes: &[u8],

          other_bytes: &[u8],

          regex: &Regex,

      ) -> (Vec<u8>, Vec<u8>, Vec<u8>) {

          // Find unchanged ranges between the base and the two sides. We do that by

          // initially considering the whole base unchanged. Then we compare each

          // side with the base and intersect the unchanged ranges we find with

          // what we had before.

          let unchanged_ranges = vec![UnchangedRange {

              base_range: 0..base_bytes.len(),

              offsets: vec![],

          }];

          let unchanged_ranges = intersect_regions(

              unchanged_ranges,

              &find_unchanged_ranges(base_bytes, local_bytes),

          );

          let mut unchanged_ranges = intersect_regions(

              unchanged_ranges,

              &find_unchanged_ranges(base_bytes, other_bytes),

          );

          // Add an empty UnchangedRange at the end to make it easier to find change

          // ranges. That way there's a changed range before each UnchangedRange.

          unchanged_ranges.push(UnchangedRange {

              base_range: base_bytes.len()..base_bytes.len(),

              offsets: vec![

                  local_bytes.len().wrapping_sub(base_bytes.len()) as isize,

                  other_bytes.len().wrapping_sub(base_bytes.len()) as isize,

              ],

          });

          let mut new_base_bytes: Vec<u8> = vec![];

          let mut new_local_bytes: Vec<u8> = vec![];

          let mut new_other_bytes: Vec<u8> = vec![];

          let mut previous = UnchangedRange {

              base_range: 0..0,

              offsets: vec![0, 0],

          };

          for current in unchanged_ranges {

              let base_slice =

                  &base_bytes[previous.base_range.end..current.base_range.start];

              let local_slice = &local_bytes[previous.end(0)..current.start(0)];

              let other_slice = &other_bytes[previous.end(1)..current.start(1)];

              if let Some(resolution) =

                  resolve_conflict(base_slice, local_slice, other_slice, regex)

              {

                  new_base_bytes.extend(&resolution);

                  new_local_bytes.extend(&resolution);

                  new_other_bytes.extend(&resolution);

              } else {

                  new_base_bytes.extend(base_slice);

                  new_local_bytes.extend(local_slice);

                  new_other_bytes.extend(other_slice);

              }

              new_base_bytes.extend(&base_bytes[current.base_range.clone()]);

              new_local_bytes.extend(&local_bytes[current.start(0)..current.end(0)]);

              new_other_bytes.extend(&other_bytes[current.start(1)..current.end(1)]);

              previous = current;

          }

          (new_base_bytes, new_local_bytes, new_other_bytes)

      }

      /// A tool that performs a 3-way merge, resolving conflicts in sorted lists and

      /// leaving other conflicts unchanged. This is useful with Mercurial's support

      /// for partial merge tools (configured in `[partial-merge-tools]`).

      // NOTE: the `///` comments in this struct are left untouched on purpose;
      // presumably clap's derive turns them into help text — confirm before
      // rewording them.
      #[derive(Parser, Debug)]

      #[clap(version, about, long_about = None)]

        Martin von Zweigbergk
    
merge-lists: make it possible to specify pattern to match...

              r49875
            
      // `pattern` and `python-imports` form the required "match" group, so a
      // command line that supplies neither is rejected.
      #[clap(group(ArgGroup::new("match").required(true).args(&["pattern", "python-imports"])))]

        Martin von Zweigbergk
    
contrib: add a partial-merge tool for sorted lists (such as Python imports)...

              r49874
            
      struct Args {

          // The three paths are kept as `OsString`; `main` converts each one to
          // a `PathBuf` before reading (and possibly rewriting) the file.

          /// Path to the file's content in the "local" side

          local: OsString,

          /// Path to the file's content in the base

          base: OsString,

          /// Path to the file's content in the "other" side

          other: OsString,

        Martin von Zweigbergk
    
merge-lists: make it possible to specify pattern to match...

              r49875
            
          // Used by `get_regex` only when `python_imports` is false.

          /// Regular expression to use

          #[clap(long, short)]

          pattern: Option<String>,

          // Takes precedence over `pattern` in `get_regex`.

          /// Use built-in regular expression for Python imports

          #[clap(long)]

          python_imports: bool,

      }

      fn get_regex(args: &Args) -> Regex {

          let pattern = if args.python_imports {

              r"import \w+(\.\w+)*( +#.*)?\n|from (\w+(\.\w+)* import \w+( as \w+)?(, \w+( as \w+)?)*( +#.*)?)"

          } else if let Some(pattern) = &args.pattern {

              pattern

          } else {

              ".*"

          };

          let pattern = format!(r"{}\r?\n?", pattern);

          regex::bytes::Regex::new(&pattern).unwrap()

        Martin von Zweigbergk
    
contrib: add a partial-merge tool for sorted lists (such as Python imports)...

              r49874
            
      }

      fn main() {

          let args: Args = Args::parse();

          let base_path = PathBuf::from(&args.base);

          let local_path = PathBuf::from(&args.local);

          let other_path = PathBuf::from(&args.other);

          let base_bytes = std::fs::read(&base_path).unwrap();

          let local_bytes = std::fs::read(&local_path).unwrap();

          let other_bytes = std::fs::read(&other_path).unwrap();

        Martin von Zweigbergk
    
merge-lists: make it possible to specify pattern to match...

              r49875
            
          let regex = get_regex(&args);

        Martin von Zweigbergk
    
contrib: add a partial-merge tool for sorted lists (such as Python imports)...

              r49874
            
          let (new_base_bytes, new_local_bytes, new_other_bytes) =

              resolve(&base_bytes, &local_bytes, &other_bytes, &regex);

          // Write out the result if anything changed

          if new_base_bytes != base_bytes {

              std::fs::write(&base_path, new_base_bytes).unwrap();

          }

          if new_local_bytes != local_bytes {

              std::fs::write(&local_path, new_local_bytes).unwrap();

          }

          if new_other_bytes != other_bytes {

              std::fs::write(&other_path, new_other_bytes).unwrap();

          }

      }

      /// Adds the signed `offset` to `base`.
      ///
      /// # Panics
      ///
      /// Panics if the result does not fit in a `usize` (overflow or a negative
      /// result), or if `offset` is `isize::MIN`.
      fn checked_add(base: usize, offset: isize) -> usize {
          let sum = if offset >= 0 {
              base.checked_add(offset as usize)
          } else {
              let magnitude = offset.checked_abs().unwrap() as usize;
              base.checked_sub(magnitude)
          };
          sum.unwrap()
      }

      // The remainder of the file is copied from

      // https://github.com/martinvonz/jj/blob/main/lib/src/diff.rs

      /// A range of the base input that is unchanged on one or more other sides.
      #[derive(Clone, PartialEq, Eq, Debug)]
      struct UnchangedRange {
          /// Byte range in the base input.
          base_range: Range<usize>,
          /// One signed offset per side: added to a base position, it gives the
          /// corresponding position on that side.
          offsets: Vec<isize>,
      }

      impl UnchangedRange {
          /// Start of this range on side `side`. Panics if `side` has no
          /// recorded offset or the shifted position does not fit in a `usize`.
          fn start(&self, side: usize) -> usize {
              checked_add(self.base_range.start, self.offsets[side])
          }

          /// End (exclusive) of this range on side `side`. Panics under the same
          /// conditions as `start`.
          fn end(&self, side: usize) -> usize {
              checked_add(self.base_range.end, self.offsets[side])
          }
      }

      impl PartialOrd for UnchangedRange {
          fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
              Some(self.cmp(other))
          }
      }

      impl Ord for UnchangedRange {
          /// Orders by base range start, then base range end. `offsets` is the
          /// final tie-breaker so that `cmp` returns `Equal` exactly when the
          /// derived `==` holds, as the `Ord` contract requires.
          fn cmp(&self, other: &Self) -> Ordering {
              self.base_range
                  .start
                  .cmp(&other.base_range.start)
                  .then_with(|| self.base_range.end.cmp(&other.base_range.end))
                  .then_with(|| self.offsets.cmp(&other.offsets))
          }
      }

      /// Narrows `current_ranges` down to the parts that are also unchanged
      /// according to `new_unchanged_ranges`, the (base range, other-side range)
      /// pairs of a 2-way diff. Every range in the result carries one more entry
      /// in `offsets` than the range it was cut from: the distance from the new
      /// base range to its other-side counterpart.
      ///
      /// # Panics
      ///
      /// Panics if the two ranges of a pair differ in length.
      fn intersect_regions(
          current_ranges: Vec<UnchangedRange>,
          new_unchanged_ranges: &[(Range<usize>, Range<usize>)],
      ) -> Vec<UnchangedRange> {
          let mut intersected = Vec::new();
          let mut pending = current_ranges.into_iter().peekable();
          for (new_base, new_side) in new_unchanged_ranges {
              assert_eq!(new_base.len(), new_side.len());
              // Signed distance from base positions to positions on the new
              // side; the wrapping subtraction reinterpreted as `isize` yields
              // a negative value when the side range starts earlier.
              let side_offset = new_side.start.wrapping_sub(new_base.start) as isize;

              while let Some(existing) = pending.peek() {
                  let existing_base = &existing.base_range;
                  // The existing range lies entirely after the new one: move on
                  // to the next new range.
                  if existing_base.start >= new_base.end {
                      break;
                  }
                  // The existing range lies entirely before the new one: it
                  // cannot overlap anything from here on, so drop it.
                  if existing_base.end <= new_base.start {
                      pending.next();
                      continue;
                  }

                  let mut offsets = existing.offsets.clone();
                  offsets.push(side_offset);
                  let reaches_past_new = existing_base.end >= new_base.end;
                  intersected.push(UnchangedRange {
                      base_range: max(existing_base.start, new_base.start)
                          ..min(existing_base.end, new_base.end),
                      offsets,
                  });

                  if reaches_past_new {
                      // Keep the existing range around; a later new range may
                      // overlap its remainder.
                      break;
                  }
                  pending.next();
              }
          }
          intersected
      }

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

Martin von Zweigbergk merge-lists: make it possible to specify pattern to match...	r49875	use clap::{ArgGroup, Parser};
Martin von Zweigbergk contrib: add a partial-merge tool for sorted lists (such as Python imports)...	r49874	use itertools::Itertools;
		use regex::bytes::Regex;
		use similar::ChangeTag;
		use std::cmp::{max, min, Ordering};
		use std::collections::HashSet;
		use std::ffi::OsString;
		use std::ops::Range;
		use std::path::PathBuf;

		fn find_unchanged_ranges(
		old_bytes: &[u8],
		new_bytes: &[u8],
		) -> Vec<(Range<usize>, Range<usize>)> {
		let diff = similar::TextDiff::configure()
		.algorithm(similar::Algorithm::Patience)
		.diff_lines(old_bytes, new_bytes);
		let mut new_unchanged_ranges = vec![];
		let mut old_index = 0;
		let mut new_index = 0;
		for diff in diff.iter_all_changes() {
		match diff.tag() {
		ChangeTag::Equal => {
		new_unchanged_ranges.push((
		old_index..old_index + diff.value().len(),
		new_index..new_index + diff.value().len(),
		));
		old_index += diff.value().len();
		new_index += diff.value().len();
		}
		ChangeTag::Delete => {
		old_index += diff.value().len();
		}
		ChangeTag::Insert => {
		new_index += diff.value().len();
		}
		}
		}
		new_unchanged_ranges
		}

		/// Returns a list of all the lines in the input (including trailing newlines),
		/// but only if they all match the regex and they are sorted.
		fn get_lines<'input>(
		input: &'input [u8],
		regex: &Regex,
		) -> Option<Vec<&'input [u8]>> {
		let lines = input.split_inclusive(|x| *x == b'\n').collect_vec();
		let mut previous_line = "".as_bytes();
		for line in &lines {
		if *line < previous_line {
		return None;
		}
		if !regex.is_match(line) {
		return None;
		}
		previous_line = line;
		}
		Some(lines)
		}

		fn resolve_conflict(
		base_slice: &[u8],
		local_slice: &[u8],
		other_slice: &[u8],
		regex: &Regex,
		) -> Option<Vec<u8>> {
		let base_lines = get_lines(base_slice, regex)?;
		let local_lines = get_lines(local_slice, regex)?;
		let other_lines = get_lines(other_slice, regex)?;
		let base_lines_set: HashSet<_> = base_lines.iter().copied().collect();
		let local_lines_set: HashSet<_> = local_lines.iter().copied().collect();
		let other_lines_set: HashSet<_> = other_lines.iter().copied().collect();
		let mut result = local_lines_set;
		for to_add in other_lines_set.difference(&base_lines_set) {
		result.insert(to_add);
		}
		for to_remove in base_lines_set.difference(&other_lines_set) {
		result.remove(to_remove);
		}
		Some(result.into_iter().sorted().collect_vec().concat())
		}

		fn resolve(
		base_bytes: &[u8],
		local_bytes: &[u8],
		other_bytes: &[u8],
		regex: &Regex,
		) -> (Vec<u8>, Vec<u8>, Vec<u8>) {
		// Find unchanged ranges between the base and the two sides. We do that by
		// initially considering the whole base unchanged. Then we compare each
		// side with the base and intersect the unchanged ranges we find with
		// what we had before.
		let unchanged_ranges = vec![UnchangedRange {
		base_range: 0..base_bytes.len(),
		offsets: vec![],
		}];
		let unchanged_ranges = intersect_regions(
		unchanged_ranges,
		&find_unchanged_ranges(base_bytes, local_bytes),
		);
		let mut unchanged_ranges = intersect_regions(
		unchanged_ranges,
		&find_unchanged_ranges(base_bytes, other_bytes),
		);
		// Add an empty UnchangedRange at the end to make it easier to find change
		// ranges. That way there's a changed range before each UnchangedRange.
		unchanged_ranges.push(UnchangedRange {
		base_range: base_bytes.len()..base_bytes.len(),
		offsets: vec![
		local_bytes.len().wrapping_sub(base_bytes.len()) as isize,
		other_bytes.len().wrapping_sub(base_bytes.len()) as isize,
		],
		});

		let mut new_base_bytes: Vec<u8> = vec![];
		let mut new_local_bytes: Vec<u8> = vec![];
		let mut new_other_bytes: Vec<u8> = vec![];
		let mut previous = UnchangedRange {
		base_range: 0..0,
		offsets: vec![0, 0],
		};
		for current in unchanged_ranges {
		let base_slice =
		&base_bytes[previous.base_range.end..current.base_range.start];
		let local_slice = &local_bytes[previous.end(0)..current.start(0)];
		let other_slice = &other_bytes[previous.end(1)..current.start(1)];
		if let Some(resolution) =
		resolve_conflict(base_slice, local_slice, other_slice, regex)
		{
		new_base_bytes.extend(&resolution);
		new_local_bytes.extend(&resolution);
		new_other_bytes.extend(&resolution);
		} else {
		new_base_bytes.extend(base_slice);
		new_local_bytes.extend(local_slice);
		new_other_bytes.extend(other_slice);
		}
		new_base_bytes.extend(&base_bytes[current.base_range.clone()]);
		new_local_bytes.extend(&local_bytes[current.start(0)..current.end(0)]);
		new_other_bytes.extend(&other_bytes[current.start(1)..current.end(1)]);
		previous = current;
		}

		(new_base_bytes, new_local_bytes, new_other_bytes)
		}

		/// A tool that performs a 3-way merge, resolving conflicts in sorted lists and
		/// leaving other conflicts unchanged. This is useful with Mercurial's support
		/// for partial merge tools (configured in `[partial-merge-tools]`).
		#[derive(Parser, Debug)]
		#[clap(version, about, long_about = None)]
Martin von Zweigbergk merge-lists: make it possible to specify pattern to match...	r49875	#[clap(group(ArgGroup::new("match").required(true).args(&["pattern", "python-imports"])))]
Martin von Zweigbergk contrib: add a partial-merge tool for sorted lists (such as Python imports)...	r49874	struct Args {
		/// Path to the file's content in the "local" side
		local: OsString,

		/// Path to the file's content in the base
		base: OsString,

		/// Path to the file's content in the "other" side
		other: OsString,
Martin von Zweigbergk merge-lists: make it possible to specify pattern to match...	r49875
		/// Regular expression to use
		#[clap(long, short)]
		pattern: Option<String>,

		/// Use built-in regular expression for Python imports
		#[clap(long)]
		python_imports: bool,
		}

		fn get_regex(args: &Args) -> Regex {
		let pattern = if args.python_imports {
		r"import \w+(\.\w+)*( +#.*)?\n|from (\w+(\.\w+)* import \w+( as \w+)?(, \w+( as \w+)?)*( +#.*)?)"
		} else if let Some(pattern) = &args.pattern {
		pattern
		} else {
		".*"
		};
		let pattern = format!(r"{}\r?\n?", pattern);
		regex::bytes::Regex::new(&pattern).unwrap()
Martin von Zweigbergk contrib: add a partial-merge tool for sorted lists (such as Python imports)...	r49874	}

		fn main() {
		let args: Args = Args::parse();

		let base_path = PathBuf::from(&args.base);
		let local_path = PathBuf::from(&args.local);
		let other_path = PathBuf::from(&args.other);

		let base_bytes = std::fs::read(&base_path).unwrap();
		let local_bytes = std::fs::read(&local_path).unwrap();
		let other_bytes = std::fs::read(&other_path).unwrap();

Martin von Zweigbergk merge-lists: make it possible to specify pattern to match...	r49875	let regex = get_regex(&args);
Martin von Zweigbergk contrib: add a partial-merge tool for sorted lists (such as Python imports)...	r49874	let (new_base_bytes, new_local_bytes, new_other_bytes) =
		resolve(&base_bytes, &local_bytes, &other_bytes, &regex);

		// Write out the result if anything changed
		if new_base_bytes != base_bytes {
		std::fs::write(&base_path, new_base_bytes).unwrap();
		}
		if new_local_bytes != local_bytes {
		std::fs::write(&local_path, new_local_bytes).unwrap();
		}
		if new_other_bytes != other_bytes {
		std::fs::write(&other_path, new_other_bytes).unwrap();
		}
		}

		fn checked_add(base: usize, offset: isize) -> usize {
		if offset < 0 {
		base.checked_sub(offset.checked_abs().unwrap() as usize)
		.unwrap()
		} else {
		base.checked_add(offset as usize).unwrap()
		}
		}

		// The remainder of the file is copied from
		// https://github.com/martinvonz/jj/blob/main/lib/src/diff.rs

		#[derive(Clone, PartialEq, Eq, Debug)]
		struct UnchangedRange {
		base_range: Range<usize>,
		offsets: Vec<isize>,
		}

		impl UnchangedRange {
		fn start(&self, side: usize) -> usize {
		checked_add(self.base_range.start, self.offsets[side])
		}

		fn end(&self, side: usize) -> usize {
		checked_add(self.base_range.end, self.offsets[side])
		}
		}

		impl PartialOrd for UnchangedRange {
		fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
		Some(self.cmp(other))
		}
		}

		impl Ord for UnchangedRange {
		fn cmp(&self, other: &Self) -> Ordering {
		self.base_range
		.start
		.cmp(&other.base_range.start)
		.then_with(|| self.base_range.end.cmp(&other.base_range.end))
		}
		}

		/// Takes the current regions and intersects it with the new unchanged ranges
		/// from a 2-way diff. The result is a map of unchanged regions with one more
		/// offset in the map's values.
		fn intersect_regions(
		current_ranges: Vec<UnchangedRange>,
		new_unchanged_ranges: &[(Range<usize>, Range<usize>)],
		) -> Vec<UnchangedRange> {
		let mut result = vec![];
		let mut current_ranges_iter = current_ranges.into_iter().peekable();
		for (new_base_range, other_range) in new_unchanged_ranges.iter() {
		assert_eq!(new_base_range.len(), other_range.len());
		while let Some(UnchangedRange {
		base_range,
		offsets,
		}) = current_ranges_iter.peek()
		{
		// No need to look further if we're past the new range.
		if base_range.start >= new_base_range.end {
		break;
		}
		// Discard any current unchanged regions that don't match between
		// the base and the new input.
		if base_range.end <= new_base_range.start {
		current_ranges_iter.next();
		continue;
		}
		let new_start = max(base_range.start, new_base_range.start);
		let new_end = min(base_range.end, new_base_range.end);
		let mut new_offsets = offsets.clone();
		new_offsets
		.push(other_range.start.wrapping_sub(new_base_range.start)
		as isize);
		result.push(UnchangedRange {
		base_range: new_start..new_end,
		offsets: new_offsets,
		});
		if base_range.end >= new_base_range.end {
		// Break without consuming the item; there may be other new
		// ranges that overlap with it.
		break;
		}
		current_ranges_iter.next();
		}
		}
		result
		}