##// END OF EJS Templates
debug-revlog: keep the revlog open for the analysis duration...
debug-revlog: keep the revlog open for the analysis duration. This is more explicit. No performance differences were observed.

File last commit:

r51910:edc44ab7 default
r51910:edc44ab7 default
Show More
debug.py
712 lines | 21.2 KiB | text/x-python | PythonLexer
# revlogutils/debug.py - utility used for revlog debugging
#
# Copyright 2005-2007 Olivia Mackall <olivia@selenic.com>
# Copyright 2022 Octobus <contact@octobus.net>
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.
import collections
import string
from .. import (
mdiff,
node as nodemod,
revlogutils,
)
from . import (
constants,
deltas as deltautil,
)
# Registry of the columns displayed by `debug_index`, in registration order.
# Entries are appended by the `debug_column` decorator.
INDEX_ENTRY_DEBUG_COLUMN = []
# Sentinel used as a column size: the actual width is the one of a displayed
# node id, which is only known when the index is printed.
NODE_SIZE = object()
class _column_base:
    """Describe one column of the revlog index debug output.

    name: the column header,
    value_func: the function called to get the value displayed for a row,
    size: the width of the column; ``None`` selects a default of 8 and
          ``NODE_SIZE`` defers the width to the node id width passed to
          ``get_size``,
    verbose: only include the column in verbose mode (stored as
             ``verbose_only``).
    """

    def __init__(self, name, value_func, size=None, verbose=False):
        if size is NODE_SIZE:
            # keep the sentinel, the width is resolved by get_size()
            width = size
        else:
            requested = 8 if size is None else size  # 8: arbitrary default
            # a column is never narrower than its own header
            width = max(len(name), requested)
        self.name = name
        self.value_func = value_func
        self._size = width
        self.verbose_only = verbose

    def get_size(self, node_size):
        """Return the column width, `node_size` being the node id width."""
        return node_size if self._size is NODE_SIZE else self._size
def debug_column(name, size=None, verbose=False):
    """Build a decorator registering a function as an index debug column.

    name: the name of the column,
    size: the expected size of the column,
    verbose: only include the column in verbose mode.

    The decorator wraps the function in a `_column_base`, appends it to
    `INDEX_ENTRY_DEBUG_COLUMN` and returns that column object: the decorated
    name ends up bound to the column, not to the raw function.
    """

    def register(func):
        column = _column_base(name, func, size=size, verbose=verbose)
        INDEX_ENTRY_DEBUG_COLUMN.append(column)
        return column

    return register
@debug_column(b"rev", size=6)
def _rev(index, rev, entry, hexfn):
    """Column value: the revision number, in decimal."""
    return b"%d" % (rev,)
@debug_column(b"rank", size=6, verbose=True)
def rank(index, rev, entry, hexfn):
    """Column value: the ``ENTRY_RANK`` field of the index entry."""
    value = entry[constants.ENTRY_RANK]
    return b"%d" % value
@debug_column(b"linkrev", size=6)
def _linkrev(index, rev, entry, hexfn):
    """Column value: the ``ENTRY_LINK_REV`` field of the index entry."""
    value = entry[constants.ENTRY_LINK_REV]
    return b"%d" % value
@debug_column(b"nodeid", size=NODE_SIZE)
def _nodeid(index, rev, entry, hexfn):
    """Column value: the node id of the entry, rendered by ``hexfn``."""
    node = entry[constants.ENTRY_NODE_ID]
    return hexfn(node)
@debug_column(b"p1-rev", size=6, verbose=True)
def _p1_rev(index, rev, entry, hexfn):
    """Column value: the ``ENTRY_PARENT_1`` field of the index entry."""
    value = entry[constants.ENTRY_PARENT_1]
    return b"%d" % value
@debug_column(b"p1-nodeid", size=NODE_SIZE)
def _p1_node(index, rev, entry, hexfn):
    """Column value: the node id of the first parent, rendered by ``hexfn``.

    The parent entry is looked up in ``index`` using ``ENTRY_PARENT_1``.
    """
    p1_entry = index[entry[constants.ENTRY_PARENT_1]]
    return hexfn(p1_entry[constants.ENTRY_NODE_ID])
@debug_column(b"p2-rev", size=6, verbose=True)
def _p2_rev(index, rev, entry, hexfn):
    """Column value: the ``ENTRY_PARENT_2`` field of the index entry."""
    value = entry[constants.ENTRY_PARENT_2]
    return b"%d" % value
@debug_column(b"p2-nodeid", size=NODE_SIZE)
def _p2_node(index, rev, entry, hexfn):
    """Column value: the node id of the second parent, rendered by ``hexfn``.

    The parent entry is looked up in ``index`` using ``ENTRY_PARENT_2``.
    """
    p2_entry = index[entry[constants.ENTRY_PARENT_2]]
    return hexfn(p2_entry[constants.ENTRY_NODE_ID])
@debug_column(b"full-size", size=20, verbose=True)
def full_size(index, rev, entry, hexfn):
    """Column value: the ``ENTRY_DATA_UNCOMPRESSED_LENGTH`` field."""
    value = entry[constants.ENTRY_DATA_UNCOMPRESSED_LENGTH]
    return b"%d" % value
@debug_column(b"delta-base", size=6, verbose=True)
def delta_base(index, rev, entry, hexfn):
    """Column value: the ``ENTRY_DELTA_BASE`` field of the index entry."""
    value = entry[constants.ENTRY_DELTA_BASE]
    return b"%d" % value
@debug_column(b"flags", size=2, verbose=True)
def flags(index, rev, entry, hexfn):
    """Column value: the low 16 bits of the ``ENTRY_DATA_OFFSET`` field."""
    packed = entry[constants.ENTRY_DATA_OFFSET]
    return b"%d" % (packed & 0xFFFF)
@debug_column(b"comp-mode", size=4, verbose=True)
def compression_mode(index, rev, entry, hexfn):
    """Column value: the ``ENTRY_DATA_COMPRESSION_MODE`` field, as a number."""
    value = entry[constants.ENTRY_DATA_COMPRESSION_MODE]
    return b"%d" % value
@debug_column(b"data-offset", size=20, verbose=True)
def data_offset(index, rev, entry, hexfn):
    """Column value: the ``ENTRY_DATA_OFFSET`` field without its low 16 bits."""
    packed = entry[constants.ENTRY_DATA_OFFSET]
    return b"%d" % (packed >> 16)
@debug_column(b"chunk-size", size=10, verbose=True)
def data_chunk_size(index, rev, entry, hexfn):
    """Column value: the ``ENTRY_DATA_COMPRESSED_LENGTH`` field."""
    value = entry[constants.ENTRY_DATA_COMPRESSED_LENGTH]
    return b"%d" % value
@debug_column(b"sd-comp-mode", size=7, verbose=True)
def sidedata_compression_mode(index, rev, entry, hexfn):
    """Column value: the sidedata compression mode, by name when known.

    The ``COMP_MODE_PLAIN``, ``COMP_MODE_DEFAULT`` and ``COMP_MODE_INLINE``
    modes are displayed by name, any other value as a decimal number.
    """
    mode = entry[constants.ENTRY_SIDEDATA_COMPRESSION_MODE]
    if mode == constants.COMP_MODE_PLAIN:
        return b"plain"
    if mode == constants.COMP_MODE_DEFAULT:
        return b"default"
    if mode == constants.COMP_MODE_INLINE:
        return b"inline"
    # unknown mode: fall back to the raw value
    return b"%d" % mode
@debug_column(b"sidedata-offset", size=20, verbose=True)
def sidedata_offset(index, rev, entry, hexfn):
    """Column value: the ``ENTRY_SIDEDATA_OFFSET`` field of the index entry."""
    value = entry[constants.ENTRY_SIDEDATA_OFFSET]
    return b"%d" % value
@debug_column(b"sd-chunk-size", size=10, verbose=True)
def sidedata_chunk_size(index, rev, entry, hexfn):
    """Column value: the ``ENTRY_SIDEDATA_COMPRESSED_LENGTH`` field."""
    value = entry[constants.ENTRY_SIDEDATA_COMPRESSED_LENGTH]
    return b"%d" % value
def debug_index(
    ui,
    repo,
    formatter,
    revlog,
    full_node,
):
    """display index data for a revlog

    ui: read for its ``verbose`` attribute, which enables the verbose-only
        columns,
    repo: not used by this function,
    formatter: the output formatter receiving the header and the rows,
    revlog: the revlog whose index is displayed,
    full_node: render node ids with ``nodemod.hex`` when true, with
               ``nodemod.short`` otherwise.

    One row is emitted per revision, with one field per column registered in
    ``INDEX_ENTRY_DEBUG_COLUMN``.
    """
    hexfn = nodemod.hex if full_node else nodemod.short

    # node columns are as wide as the first displayed node id, if there is one
    idlen = 12
    first_rev = next(iter(revlog), None)
    if first_rev is not None:
        idlen = len(hexfn(revlog.node(first_rev)))

    fm = formatter

    # the displayed columns, each paired with its resolved width
    verbose = ui.verbose
    columns = [
        (col, col.get_size(idlen))
        for col in INDEX_ENTRY_DEBUG_COLUMN
        if verbose or not col.verbose_only
    ]

    header = b' '.join(col.name.rjust(width) for col, width in columns)
    fm.plain(header + b'\n')

    index = revlog.index
    for rev in revlog:
        fm.startitem()
        entry = index[rev]
        for position, (col, width) in enumerate(columns):
            if position:
                # separator between two consecutive fields
                fm.plain(b' ')
            value = col.value_func(index, rev, entry, hexfn)
            fm.write(col.name, b"%%%ds" % width, value)
        fm.plain(b'\n')

    fm.end()
def dump(ui, revlog):
    """perform the work for `hg debugrevlog --dump`

    Writes a header line to ``ui``, then one line per revision of ``revlog``
    with its parents, data offsets, raw size, running total of raw sizes,
    the ratio of that total to the end offset, the number of heads seen so
    far and the delta chain length.
    """
    # XXX seems redundant with debug index ?
    rev_count = len(revlog)
    header = (
        b"# rev p1rev p2rev start end deltastart base p1 p2"
        b" rawsize totalsize compression heads chainlen\n"
    )
    ui.write(header)

    row_format = (
        b"%5d %5d %5d %5d %5d %10d %4d %4d %4d %7d %9d "
        b"%11d %5d %8d\n"
    )
    total_raw = 0
    heads = set()
    for rev in range(rev_count):
        delta_rev = revlog.deltaparent(rev)
        if delta_rev == -1:
            # no delta parent: report the revision's own offset
            delta_rev = rev
        chain_base = revlog.chainbase(rev)
        chain_len = revlog.chainlen(rev)
        p1, p2 = revlog.parentrevs(rev)
        raw_size = revlog.rawsize(rev)
        total_raw += raw_size

        # a revision stops being a head as soon as it has a child
        heads.difference_update((p1, p2))
        heads.add(rev)

        end = revlog.end(rev)
        ratio = total_raw / end if end else 0

        row = (
            rev,
            p1,
            p2,
            revlog.start(rev),
            end,
            revlog.start(delta_rev),
            revlog.start(chain_base),
            revlog.start(p1),
            revlog.start(p2),
            raw_size,
            total_raw,
            ratio,
            len(heads),
            chain_len,
        )
        ui.write(row_format % row)
def debug_revlog(ui, revlog):
    """code for `hg debugrevlog`

    ui: the ui object the report is written to (``write`` and
        ``writenoi18n``),
    revlog: the revlog to analyse.

    Every revision is inspected while the revlog is kept open through
    ``revlog.reading()``. The report contains the format and flags, counts
    and sizes per kind of revision (empty, snapshot per depth, delta), counts
    and sizes per chunk type, delta chain statistics, and which base the
    deltas were computed against.

    An empty revlog is reported with zeroed statistics.
    """
    r = revlog
    format_version = r._format_version
    v = r._format_flags
    flags = []
    gdelta = False
    if v & constants.FLAG_INLINE_DATA:
        flags.append(b'inline')
    if v & constants.FLAG_GENERALDELTA:
        gdelta = True
        flags.append(b'generaldelta')
    if not flags:
        flags = [b'(none)']

    ### the total size of stored content if incompressed.
    full_text_total_size = 0
    ### tracks merge vs single parent
    nummerges = 0

    ### tracks ways the "delta" are build
    # nodelta
    numempty = 0
    numemptytext = 0
    numemptydelta = 0
    # full file content
    numfull = 0
    # intermediate snapshot against a prior snapshot
    numsemi = 0
    # snapshot count per depth
    numsnapdepth = collections.defaultdict(lambda: 0)
    # number of snapshots with a non-ancestor delta
    numsnapdepth_nad = collections.defaultdict(lambda: 0)
    # delta against previous revision
    numprev = 0
    # delta against prev, where prev is a non-ancestor
    numprev_nad = 0
    # delta against first or second parent (not prev)
    nump1 = 0
    nump2 = 0
    # delta against neither prev nor parents
    numother = 0
    # delta against other that is a non-ancestor
    numother_nad = 0
    # delta against prev that are also first or second parent
    # (details of `numprev`)
    nump1prev = 0
    nump2prev = 0

    # data about delta chain of each revs
    chainlengths = []
    chainbases = []
    chainspans = []

    # data about each revision, as [min, max, total] (the total is turned
    # into an average once every revision has been seen)
    datasize = [None, 0, 0]
    fullsize = [None, 0, 0]
    semisize = [None, 0, 0]
    # snapshot sizes per depth
    snapsizedepth = collections.defaultdict(lambda: [None, 0, 0])
    deltasize = [None, 0, 0]
    chunktypecounts = {}
    chunktypesizes = {}

    def addsize(size, l):
        """fold `size` into the [min, max, total] accumulator `l`"""
        if l[0] is None or size < l[0]:
            l[0] = size
        if size > l[1]:
            l[1] = size
        l[2] += size

    with r.reading():
        numrevs = len(r)
        for rev in range(numrevs):
            p1, p2 = r.parentrevs(rev)
            delta = r.deltaparent(rev)
            if format_version > 0:
                s = r.rawsize(rev)
                full_text_total_size += s
                addsize(s, datasize)
            if p2 != nodemod.nullrev:
                nummerges += 1
            size = r.length(rev)
            if delta == nodemod.nullrev:
                # stored without a delta base
                chainlengths.append(0)
                chainbases.append(r.start(rev))
                chainspans.append(size)
                if size == 0:
                    numempty += 1
                    numemptytext += 1
                else:
                    numfull += 1
                    numsnapdepth[0] += 1
                    addsize(size, fullsize)
                    addsize(size, snapsizedepth[0])
            else:
                # "nad": the delta base is a non-ancestor of the revision
                nad = (
                    delta != p1
                    and delta != p2
                    and not r.isancestorrev(delta, rev)
                )
                chainlengths.append(chainlengths[delta] + 1)
                baseaddr = chainbases[delta]
                revaddr = r.start(rev)
                chainbases.append(baseaddr)
                chainspans.append((revaddr - baseaddr) + size)
                if size == 0:
                    numempty += 1
                    numemptydelta += 1
                elif r.issnapshot(rev):
                    addsize(size, semisize)
                    numsemi += 1
                    depth = r.snapshotdepth(rev)
                    numsnapdepth[depth] += 1
                    if nad:
                        numsnapdepth_nad[depth] += 1
                    addsize(size, snapsizedepth[depth])
                else:
                    addsize(size, deltasize)
                    if delta == rev - 1:
                        numprev += 1
                        if delta == p1:
                            nump1prev += 1
                        elif delta == p2:
                            nump2prev += 1
                        elif nad:
                            numprev_nad += 1
                    elif delta == p1:
                        nump1 += 1
                    elif delta == p2:
                        nump2 += 1
                    elif delta != nodemod.nullrev:
                        numother += 1
                        # only the non-ancestor bases count here, the rest
                        # is reported as "deltas against ancs"
                        if nad:
                            numother_nad += 1

            # Obtain data on the raw chunks in the revlog.
            if hasattr(r, '_getsegmentforrevs'):
                segment = r._getsegmentforrevs(rev, rev)[1]
            else:
                segment = r._revlog._getsegmentforrevs(rev, rev)[1]
            if segment:
                chunktype = bytes(segment[0:1])
            else:
                chunktype = b'empty'

            if chunktype not in chunktypecounts:
                chunktypecounts[chunktype] = 0
                chunktypesizes[chunktype] = 0

            chunktypecounts[chunktype] += 1
            chunktypesizes[chunktype] += size

    # Adjust size min value for empty cases
    for stats in (datasize, fullsize, semisize, deltasize):
        if stats[0] is None:
            stats[0] = 0

    numdeltas = numrevs - numfull - numempty - numsemi
    numoprev = numprev - nump1prev - nump2prev - numprev_nad
    num_other_ancestors = numother - numother_nad
    totalrawsize = datasize[2]
    if numrevs:
        # an empty revlog keeps a zero average instead of dividing by zero
        datasize[2] /= numrevs
    fulltotal = fullsize[2]
    if numfull == 0:
        fullsize[2] = 0
    else:
        fullsize[2] /= numfull
    semitotal = semisize[2]
    snaptotal = {}
    if numsemi > 0:
        semisize[2] /= numsemi
    for depth in snapsizedepth:
        snaptotal[depth] = snapsizedepth[depth][2]
        snapsizedepth[depth][2] /= numsnapdepth[depth]

    deltatotal = deltasize[2]
    if numdeltas > 0:
        deltasize[2] /= numdeltas
    totalsize = fulltotal + semitotal + deltatotal
    # the chain statistics of an empty revlog are all zero
    avgchainlen = sum(chainlengths) / numrevs if numrevs else 0
    maxchainlen = max(chainlengths, default=0)
    maxchainspan = max(chainspans, default=0)
    compratio = 1
    if totalsize:
        compratio = totalrawsize / totalsize

    basedfmtstr = b'%%%dd\n'
    basepcfmtstr = b'%%%dd %s(%%5.2f%%%%)\n'

    def dfmtstr(largest):
        """format for a number, as wide as the largest displayed value"""
        return basedfmtstr % len(str(largest))

    def pcfmtstr(largest, padding=0):
        """format for a number followed by a percentage"""
        return basepcfmtstr % (len(str(largest)), b' ' * padding)

    def pcfmt(value, total):
        """return `value` and the percentage of `total` it represents"""
        # avoid division by zero
        if total:
            return (value, 100 * float(value) / total)
        else:
            return value, 100.0

    ui.writenoi18n(b'format : %d\n' % format_version)
    ui.writenoi18n(b'flags : %s\n' % b', '.join(flags))

    ui.write(b'\n')
    fmt = pcfmtstr(totalsize)
    fmt2 = dfmtstr(totalsize)
    ui.writenoi18n(b'revisions : ' + fmt2 % numrevs)
    ui.writenoi18n(b' merges : ' + fmt % pcfmt(nummerges, numrevs))
    ui.writenoi18n(
        b' normal : ' + fmt % pcfmt(numrevs - nummerges, numrevs)
    )
    ui.writenoi18n(b'revisions : ' + fmt2 % numrevs)
    ui.writenoi18n(b' empty : ' + fmt % pcfmt(numempty, numrevs))
    ui.writenoi18n(
        b' text : '
        + fmt % pcfmt(numemptytext, numemptytext + numemptydelta)
    )
    ui.writenoi18n(
        b' delta : '
        + fmt % pcfmt(numemptydelta, numemptytext + numemptydelta)
    )
    ui.writenoi18n(
        b' snapshot : ' + fmt % pcfmt(numfull + numsemi, numrevs)
    )
    for depth in sorted(numsnapdepth):
        base = b' lvl-%-3d : ' % depth
        count = fmt % pcfmt(numsnapdepth[depth], numrevs)
        pieces = [base, count]
        if numsnapdepth_nad[depth]:
            pieces[-1] = count = count[:-1]  # drop the final '\n'
            more = b' non-ancestor-bases: '
            anc_count = fmt
            anc_count %= pcfmt(numsnapdepth_nad[depth], numsnapdepth[depth])
            pieces.append(more)
            pieces.append(anc_count)
        ui.write(b''.join(pieces))
    ui.writenoi18n(b' deltas : ' + fmt % pcfmt(numdeltas, numrevs))
    ui.writenoi18n(b'revision size : ' + fmt2 % totalsize)
    ui.writenoi18n(
        b' snapshot : ' + fmt % pcfmt(fulltotal + semitotal, totalsize)
    )
    for depth in sorted(numsnapdepth):
        ui.write(
            (b' lvl-%-3d : ' % depth)
            + fmt % pcfmt(snaptotal[depth], totalsize)
        )
    ui.writenoi18n(b' deltas : ' + fmt % pcfmt(deltatotal, totalsize))

    letters = string.ascii_letters.encode('ascii')

    def fmtchunktype(chunktype):
        """label of a chunk type, with its letter when it is one"""
        if chunktype == b'empty':
            return b' %s : ' % chunktype
        elif chunktype in letters:
            return b' 0x%s (%s) : ' % (nodemod.hex(chunktype), chunktype)
        else:
            return b' 0x%s : ' % nodemod.hex(chunktype)

    ui.write(b'\n')
    ui.writenoi18n(b'chunks : ' + fmt2 % numrevs)
    for chunktype in sorted(chunktypecounts):
        ui.write(fmtchunktype(chunktype))
        ui.write(fmt % pcfmt(chunktypecounts[chunktype], numrevs))
    ui.writenoi18n(b'chunks size : ' + fmt2 % totalsize)
    for chunktype in sorted(chunktypecounts):
        ui.write(fmtchunktype(chunktype))
        ui.write(fmt % pcfmt(chunktypesizes[chunktype], totalsize))

    ui.write(b'\n')
    # display the total with its digits grouped by three
    b_total = b"%d" % full_text_total_size
    p_total = []
    while len(b_total) > 3:
        p_total.append(b_total[-3:])
        b_total = b_total[:-3]
    p_total.append(b_total)
    p_total.reverse()
    b_total = b' '.join(p_total)

    ui.write(b'\n')
    ui.writenoi18n(b'total-stored-content: %s bytes\n' % b_total)
    ui.write(b'\n')
    fmt = dfmtstr(max(avgchainlen, maxchainlen, maxchainspan, compratio))
    ui.writenoi18n(b'avg chain length : ' + fmt % avgchainlen)
    ui.writenoi18n(b'max chain length : ' + fmt % maxchainlen)
    ui.writenoi18n(b'max chain reach : ' + fmt % maxchainspan)
    ui.writenoi18n(b'compression ratio : ' + fmt % compratio)

    if format_version > 0:
        ui.write(b'\n')
        ui.writenoi18n(
            b'uncompressed data size (min/max/avg) : %d / %d / %d\n'
            % tuple(datasize)
        )
    ui.writenoi18n(
        b'full revision size (min/max/avg) : %d / %d / %d\n'
        % tuple(fullsize)
    )
    ui.writenoi18n(
        b'inter-snapshot size (min/max/avg) : %d / %d / %d\n'
        % tuple(semisize)
    )
    for depth in sorted(snapsizedepth):
        if depth == 0:
            # depth 0 is the "full revision size" line above
            continue
        ui.writenoi18n(
            b' level-%-3d (min/max/avg) : %d / %d / %d\n'
            % ((depth,) + tuple(snapsizedepth[depth]))
        )
    ui.writenoi18n(
        b'delta size (min/max/avg) : %d / %d / %d\n'
        % tuple(deltasize)
    )

    if numdeltas > 0:
        ui.write(b'\n')
        fmt = pcfmtstr(numdeltas)
        fmt2 = pcfmtstr(numdeltas, 4)
        ui.writenoi18n(
            b'deltas against prev : ' + fmt % pcfmt(numprev, numdeltas)
        )
        if numprev > 0:
            ui.writenoi18n(
                b' where prev = p1 : ' + fmt2 % pcfmt(nump1prev, numprev)
            )
            ui.writenoi18n(
                b' where prev = p2 : ' + fmt2 % pcfmt(nump2prev, numprev)
            )
            ui.writenoi18n(
                b' other-ancestor : ' + fmt2 % pcfmt(numoprev, numprev)
            )
            # prev that is neither a parent nor an ancestor
            ui.writenoi18n(
                b' unrelated : ' + fmt2 % pcfmt(numprev_nad, numprev)
            )
        if gdelta:
            ui.writenoi18n(
                b'deltas against p1 : ' + fmt % pcfmt(nump1, numdeltas)
            )
            ui.writenoi18n(
                b'deltas against p2 : ' + fmt % pcfmt(nump2, numdeltas)
            )
            ui.writenoi18n(
                b'deltas against ancs : '
                + fmt % pcfmt(num_other_ancestors, numdeltas)
            )
            ui.writenoi18n(
                b'deltas against other : '
                + fmt % pcfmt(numother_nad, numdeltas)
            )
def debug_delta_find(ui, revlog, rev, base_rev=nodemod.nullrev):
    """display the search process for a delta

    ui: provides the ``write`` function used for the debug output; the
        search itself is only traced when ``ui.quiet`` is false,
    revlog: the revlog holding the revision,
    rev: the revision whose delta search is replayed,
    base_rev: when not ``nullrev``, a delta against this revision is computed
              and handed to the search as a cached delta to try.
    """
    deltacomputer = deltautil.deltacomputer(
        revlog,
        write_debug=ui.write,
        debug_search=not ui.quiet,
    )

    node = revlog.node(rev)
    p1r, p2r = revlog.parentrevs(rev)
    p1 = revlog.node(p1r)
    p2 = revlog.node(p2r)
    full_text = revlog.revision(rev)
    btext = [full_text]
    textlen = len(btext[0])
    cachedelta = None
    flags = revlog.flags(rev)

    if base_rev != nodemod.nullrev:
        base_text = revlog.revision(base_rev)
        delta = mdiff.textdiff(base_text, full_text)

        cachedelta = (base_rev, delta, constants.DELTA_BASE_REUSE_TRY)
        # the full text is not provided along with a cached delta
        btext = [None]

    revinfo = revlogutils.revisioninfo(
        node,
        p1,
        p2,
        btext,
        textlen,
        cachedelta,
        flags,
    )

    # NOTE(review): assumes `_datafp()` returns a file-like object exposing
    # `close()` -- confirm against the revlog class.
    fh = revlog._datafp()
    try:
        deltacomputer.finddeltainfo(revinfo, fh, target_rev=rev)
    finally:
        # do not leak the data file handle, even when the search fails
        fh.close()
def debug_revlog_stats(
    repo, fm, changelog: bool, manifest: bool, filelogs: bool
):
    """Format revlog statistics for debugging purposes

    repo: the repository whose store is walked for revlogs,
    fm: the output formatter,
    changelog: include the changelog entries,
    manifest: include the manifest log entries,
    filelogs: include the filelog entries.

    A header line is emitted, then one item per selected revlog, sorted by
    revlog type and target id.
    """
    fm.plain(b'rev-count data-size inl type target \n')

    revlog_entries = sorted(
        (e for e in repo.store.walk() if e.is_revlog),
        key=lambda e: (e.revlog_type, e.target_id),
    )

    for entry in revlog_entries:
        excluded = (
            (not changelog and entry.is_changelog)
            or (not manifest and entry.is_manifestlog)
            or (not filelogs and entry.is_filelog)
        )
        if excluded:
            continue

        rlog = entry.get_revlog_instance(repo).get_revlog()
        fm.startitem()
        nb_rev = len(rlog)
        inline = rlog._inline
        data_size = rlog._get_data_offset(nb_rev - 1)

        # translate the revlog target into a displayable type and name
        target = rlog.target
        kind = target[0]
        if kind == constants.KIND_CHANGELOG:
            revlog_type, revlog_target = b'changelog', b''
        elif kind == constants.KIND_MANIFESTLOG:
            revlog_type, revlog_target = b'manifest', target[1]
        elif kind == constants.KIND_FILELOG:
            revlog_type, revlog_target = b'file', target[1]
        else:
            revlog_type, revlog_target = b'unknown', b''

        fm.write(b'revlog.rev-count', b'%9d', nb_rev)
        fm.write(b'revlog.data-size', b'%12d', data_size)
        fm.write(b'revlog.inline', b' %-3s', b'yes' if inline else b'no')
        fm.write(b'revlog.type', b' %-9s', revlog_type)
        fm.write(b'revlog.target', b' %s', revlog_target)

        fm.plain(b'\n')