upstream/mercurial-mirror Commit - r44810:2ea6a67f

1

# nodemap.py - nodemap related code and utilities

1

# nodemap.py - nodemap related code and utilities

2

#

2

#

3

4

5

#

5

#

6

# This software may be used and distributed according to the terms of the

6

# This software may be used and distributed according to the terms of the

7

# GNU General Public License version 2 or any later version.

7

# GNU General Public License version 2 or any later version.

8

9

from __future__ import absolute_import

9

from __future__ import absolute_import

10

11

import os

11

import os

12

import re

12

import re

13

import struct

13

import struct

14

15

from .. import (

15

from .. import (

16

error,

16

error,

17

node as nodemod,

17

node as nodemod,

18

util,

18

util,

19

)

19

)

20

21

22

class NodeMap(dict):

22

class NodeMap(dict):

23

def __missing__(self, x):

23

def __missing__(self, x):

24

raise error.RevlogError(b'unknown node: %s' % x)

24

raise error.RevlogError(b'unknown node: %s' % x)

25

26

27

def persisted_data(revlog):

27

def persisted_data(revlog):

28

"""read the nodemap for a revlog from disk"""

28

"""read the nodemap for a revlog from disk"""

29

if revlog.nodemap_file is None:

29

if revlog.nodemap_file is None:

30

return None

30

return None

31

pdata = revlog.opener.tryread(revlog.nodemap_file)

31

pdata = revlog.opener.tryread(revlog.nodemap_file)

32

if not pdata:

32

if not pdata:

33

return None

33

return None

34

offset = 0

34

offset = 0

35

(version,) = S_VERSION.unpack(pdata[offset : offset + S_VERSION.size])

35

(version,) = S_VERSION.unpack(pdata[offset : offset + S_VERSION.size])

36

if version != ONDISK_VERSION:

36

if version != ONDISK_VERSION:

37

return None

37

return None

38

offset += S_VERSION.size

38

offset += S_VERSION.size

39

headers = S_HEADER.unpack(pdata[offset : offset + S_HEADER.size])

39

headers = S_HEADER.unpack(pdata[offset : offset + S_HEADER.size])

40

uid_size, tip_rev, data_length, data_unused = headers

40

uid_size, tip_rev, data_length, data_unused = headers

41

offset += S_HEADER.size

41

offset += S_HEADER.size

42

docket = NodeMapDocket(pdata[offset : offset + uid_size])

42

docket = NodeMapDocket(pdata[offset : offset + uid_size])

43

docket.tip_rev = tip_rev

43

docket.tip_rev = tip_rev

44

docket.data_length = data_length

44

docket.data_length = data_length

45

docket.data_unused = data_unused

45

docket.data_unused = data_unused

46

47

filename = _rawdata_filepath(revlog, docket)

47

filename = _rawdata_filepath(revlog, docket)

48

return docket, revlog.opener.tryread(filename)

48

return docket, revlog.opener.tryread(filename)

49

50

51

def setup_persistent_nodemap(tr, revlog):

51

def setup_persistent_nodemap(tr, revlog):

52

"""Install whatever is needed transaction side to persist a nodemap on disk

52

"""Install whatever is needed transaction side to persist a nodemap on disk

53

54

(only actually persist the nodemap if this is relevant for this revlog)

54

(only actually persist the nodemap if this is relevant for this revlog)

55

"""

55

"""

56

if revlog._inline:

56

if revlog._inline:

57

return # inlined revlog are too small for this to be relevant

57

return # inlined revlog are too small for this to be relevant

58

if revlog.nodemap_file is None:

58

if revlog.nodemap_file is None:

59

return # we do not use persistent_nodemap on this revlog

59

return # we do not use persistent_nodemap on this revlog

60

callback_id = b"revlog-persistent-nodemap-%s" % revlog.nodemap_file

60

callback_id = b"revlog-persistent-nodemap-%s" % revlog.nodemap_file

61

if tr.hasfinalize(callback_id):

61

if tr.hasfinalize(callback_id):

62

return # no need to register again

62

return # no need to register again

63

tr.addfinalize(callback_id, lambda tr: _persist_nodemap(tr, revlog))

63

tr.addfinalize(callback_id, lambda tr: _persist_nodemap(tr, revlog))

64

65

66

def _persist_nodemap(tr, revlog):

66

def _persist_nodemap(tr, revlog):

67

"""Write nodemap data on disk for a given revlog

67

"""Write nodemap data on disk for a given revlog

68

"""

68

"""

69

if getattr(revlog, 'filteredrevs', ()):

69

if getattr(revlog, 'filteredrevs', ()):

70

raise error.ProgrammingError(

70

raise error.ProgrammingError(

71

"cannot persist nodemap of a filtered changelog"

71

"cannot persist nodemap of a filtered changelog"

72

)

72

)

73

if revlog.nodemap_file is None:

73

if revlog.nodemap_file is None:

74

msg = "calling persist nodemap on a revlog without the feature enableb"

74

msg = "calling persist nodemap on a revlog without the feature enableb"

75

raise error.ProgrammingError(msg)

75

raise error.ProgrammingError(msg)

76

77

can_incremental = util.safehasattr(revlog.index, "nodemap_data_incremental")

77

can_incremental = util.safehasattr(revlog.index, "nodemap_data_incremental")

78

ondisk_docket = revlog._nodemap_docket

78

ondisk_docket = revlog._nodemap_docket

79

80

data = None

80

data = None

81

# first attemp an incremental update of the data

81

# first attemp an incremental update of the data

82

if can_incremental and ondisk_docket is not None:

82

if can_incremental and ondisk_docket is not None:

83

target_docket = revlog._nodemap_docket.copy()

83

target_docket = revlog._nodemap_docket.copy()

84

(

84

(

85

src_docket,

85

src_docket,

86

data_changed_count,

86

data_changed_count,

87

data,

87

data,

88

) = revlog.index.nodemap_data_incremental()

88

) = revlog.index.nodemap_data_incremental()

89

if src_docket != target_docket:

89

if src_docket != target_docket:

90

data = None

90

data = None

91

else:

91

else:

92

datafile = _rawdata_filepath(revlog, target_docket)

92

datafile = _rawdata_filepath(revlog, target_docket)

93

# EXP-TODO: if this is a cache, this should use a cache vfs, not a

93

# EXP-TODO: if this is a cache, this should use a cache vfs, not a

94

# store vfs

94

# store vfs

95

with revlog.opener(datafile, b'a') as fd:

95

with revlog.opener(datafile, b'r+') as fd:

96

fd.seek(target_docket.data_length)

96

fd.write(data)

97

fd.write(data)

97

target_docket.data_length += len(data)

98

target_docket.data_length += len(data)

98

target_docket.data_unused += data_changed_count

99

target_docket.data_unused += data_changed_count

99

100

if data is None:

101

if data is None:

101

# otherwise fallback to a full new export

102

# otherwise fallback to a full new export

102

target_docket = NodeMapDocket()

103

target_docket = NodeMapDocket()

103

datafile = _rawdata_filepath(revlog, target_docket)

104

datafile = _rawdata_filepath(revlog, target_docket)

104

if util.safehasattr(revlog.index, "nodemap_data_all"):

105

if util.safehasattr(revlog.index, "nodemap_data_all"):

105

data = revlog.index.nodemap_data_all()

106

data = revlog.index.nodemap_data_all()

106

else:

107

else:

107

data = persistent_data(revlog.index)

108

data = persistent_data(revlog.index)

108

# EXP-TODO: if this is a cache, this should use a cache vfs, not a

109

# EXP-TODO: if this is a cache, this should use a cache vfs, not a

109

# store vfs

110

# store vfs

110

with revlog.opener(datafile, b'w') as fd:

111

with revlog.opener(datafile, b'w') as fd:

111

fd.write(data)

112

fd.write(data)

112

target_docket.data_length = len(data)

113

target_docket.data_length = len(data)

113

target_docket.tip_rev = revlog.tiprev()

114

target_docket.tip_rev = revlog.tiprev()

114

# EXP-TODO: if this is a cache, this should use a cache vfs, not a

115

# EXP-TODO: if this is a cache, this should use a cache vfs, not a

115

# store vfs

116

# store vfs

116

with revlog.opener(revlog.nodemap_file, b'w', atomictemp=True) as fp:

117

with revlog.opener(revlog.nodemap_file, b'w', atomictemp=True) as fp:

117

fp.write(target_docket.serialize())

118

fp.write(target_docket.serialize())

118

revlog._nodemap_docket = target_docket

119

revlog._nodemap_docket = target_docket

119

# EXP-TODO: if the transaction abort, we should remove the new data and

120

# EXP-TODO: if the transaction abort, we should remove the new data and

120

# reinstall the old one.

121

# reinstall the old one.

121

122

# search for old index file in all cases, some older process might have

123

# search for old index file in all cases, some older process might have

123

# left one behind.

124

# left one behind.

124

olds = _other_rawdata_filepath(revlog, target_docket)

125

olds = _other_rawdata_filepath(revlog, target_docket)

125

if olds:

126

if olds:

126

realvfs = getattr(revlog, '_realopener', revlog.opener)

127

realvfs = getattr(revlog, '_realopener', revlog.opener)

127

128

def cleanup(tr):

129

def cleanup(tr):

129

for oldfile in olds:

130

for oldfile in olds:

130

realvfs.tryunlink(oldfile)

131

realvfs.tryunlink(oldfile)

131

132

callback_id = b"revlog-cleanup-nodemap-%s" % revlog.nodemap_file

133

callback_id = b"revlog-cleanup-nodemap-%s" % revlog.nodemap_file

133

tr.addpostclose(callback_id, cleanup)

134

tr.addpostclose(callback_id, cleanup)

134

135

136

### Nodemap docket file

137

### Nodemap docket file

137

#

138

#

138

# The nodemap data are stored on disk using 2 files:

139

# The nodemap data are stored on disk using 2 files:

139

#

140

#

140

# * a raw data files containing a persistent nodemap

141

# * a raw data files containing a persistent nodemap

141

# (see `Nodemap Trie` section)

142

# (see `Nodemap Trie` section)

142

#

143

#

143

# * a small "docket" file containing medatadata

144

# * a small "docket" file containing medatadata

144

#

145

#

145

# While the nodemap data can be multiple tens of megabytes, the "docket" is

146

# While the nodemap data can be multiple tens of megabytes, the "docket" is

146

# small, it is easy to update it automatically or to duplicated its content

147

# small, it is easy to update it automatically or to duplicated its content

147

# during a transaction.

148

# during a transaction.

148

#

149

#

149

# Multiple raw data can exist at the same time (The currently valid one and a

150

# Multiple raw data can exist at the same time (The currently valid one and a

150

# new one beind used by an in progress transaction). To accomodate this, the

151

# new one beind used by an in progress transaction). To accomodate this, the

151

# filename hosting the raw data has a variable parts. The exact filename is

152

# filename hosting the raw data has a variable parts. The exact filename is

152

# specified inside the "docket" file.

153

# specified inside the "docket" file.

153

#

154

#

154

# The docket file contains information to find, qualify and validate the raw

155

# The docket file contains information to find, qualify and validate the raw

155

# data. Its content is currently very light, but it will expand as the on disk

156

# data. Its content is currently very light, but it will expand as the on disk

156

# nodemap gains the necessary features to be used in production.

157

# nodemap gains the necessary features to be used in production.

157

158

# version 0 is experimental, no BC garantee, do no use outside of tests.

159

# version 0 is experimental, no BC garantee, do no use outside of tests.

159

ONDISK_VERSION = 0

160

ONDISK_VERSION = 0

160

S_VERSION = struct.Struct(">B")

161

S_VERSION = struct.Struct(">B")

161

S_HEADER = struct.Struct(">BQQQ")

162

S_HEADER = struct.Struct(">BQQQ")

162

163

ID_SIZE = 8

164

ID_SIZE = 8

164

165

166

def _make_uid():

167

def _make_uid():

167

"""return a new unique identifier.

168

"""return a new unique identifier.

168

169

The identifier is random and composed of ascii characters."""

170

The identifier is random and composed of ascii characters."""

170

return nodemod.hex(os.urandom(ID_SIZE))

171

return nodemod.hex(os.urandom(ID_SIZE))

171

172

173

class NodeMapDocket(object):

174

class NodeMapDocket(object):

174

"""metadata associated with persistent nodemap data

175

"""metadata associated with persistent nodemap data

175

176

The persistent data may come from disk or be on their way to disk.

177

The persistent data may come from disk or be on their way to disk.

177

"""

178

"""

178

179

def __init__(self, uid=None):

180

def __init__(self, uid=None):

180

if uid is None:

181

if uid is None:

181

uid = _make_uid()

182

uid = _make_uid()

182

self.uid = uid

183

self.uid = uid

183

self.tip_rev = None

184

self.tip_rev = None

184

self.data_length = None

185

self.data_length = None

185

self.data_unused = 0

186

self.data_unused = 0

186

187

def copy(self):

188

def copy(self):

188

new = NodeMapDocket(uid=self.uid)

189

new = NodeMapDocket(uid=self.uid)

189

new.tip_rev = self.tip_rev

190

new.tip_rev = self.tip_rev

190

new.data_length = self.data_length

191

new.data_length = self.data_length

191

new.data_unused = self.data_unused

192

new.data_unused = self.data_unused

192

return new

193

return new

193

194

def __cmp__(self, other):

195

def __cmp__(self, other):

195

if self.uid < other.uid:

196

if self.uid < other.uid:

196

return -1

197

return -1

197

if self.uid > other.uid:

198

if self.uid > other.uid:

198

return 1

199

return 1

199

elif self.data_length < other.data_length:

200

elif self.data_length < other.data_length:

200

return -1

201

return -1

201

elif self.data_length > other.data_length:

202

elif self.data_length > other.data_length:

202

return 1

203

return 1

203

return 0

204

return 0

204

205

def __eq__(self, other):

206

def __eq__(self, other):

206

return self.uid == other.uid and self.data_length == other.data_length

207

return self.uid == other.uid and self.data_length == other.data_length

207

208

def serialize(self):

209

def serialize(self):

209

"""return serialized bytes for a docket using the passed uid"""

210

"""return serialized bytes for a docket using the passed uid"""

210

data = []

211

data = []

211

data.append(S_VERSION.pack(ONDISK_VERSION))

212

data.append(S_VERSION.pack(ONDISK_VERSION))

212

headers = (

213

headers = (

213

len(self.uid),

214

len(self.uid),

214

self.tip_rev,

215

self.tip_rev,

215

self.data_length,

216

self.data_length,

216

self.data_unused,

217

self.data_unused,

217

)

218

)

218

data.append(S_HEADER.pack(*headers))

219

data.append(S_HEADER.pack(*headers))

219

data.append(self.uid)

220

data.append(self.uid)

220

return b''.join(data)

221

return b''.join(data)

221

222

223

def _rawdata_filepath(revlog, docket):

224

def _rawdata_filepath(revlog, docket):

224

"""The (vfs relative) nodemap's rawdata file for a given uid"""

225

"""The (vfs relative) nodemap's rawdata file for a given uid"""

225

prefix = revlog.nodemap_file[:-2]

226

prefix = revlog.nodemap_file[:-2]

226

return b"%s-%s.nd" % (prefix, docket.uid)

227

return b"%s-%s.nd" % (prefix, docket.uid)

227

228

229

def _other_rawdata_filepath(revlog, docket):

230

def _other_rawdata_filepath(revlog, docket):

230

prefix = revlog.nodemap_file[:-2]

231

prefix = revlog.nodemap_file[:-2]

231

pattern = re.compile(b"(^|/)%s-[0-9a-f]+\.nd$" % prefix)

232

pattern = re.compile(b"(^|/)%s-[0-9a-f]+\.nd$" % prefix)

232

new_file_path = _rawdata_filepath(revlog, docket)

233

new_file_path = _rawdata_filepath(revlog, docket)

233

new_file_name = revlog.opener.basename(new_file_path)

234

new_file_name = revlog.opener.basename(new_file_path)

234

dirpath = revlog.opener.dirname(new_file_path)

235

dirpath = revlog.opener.dirname(new_file_path)

235

others = []

236

others = []

236

for f in revlog.opener.listdir(dirpath):

237

for f in revlog.opener.listdir(dirpath):

237

if pattern.match(f) and f != new_file_name:

238

if pattern.match(f) and f != new_file_name:

238

others.append(f)

239

others.append(f)

239

return others

240

return others

240

241

242

### Nodemap Trie

243

### Nodemap Trie

243

#

244

#

244

# This is a simple reference implementation to compute and persist a nodemap

245

# This is a simple reference implementation to compute and persist a nodemap

245

# trie. This reference implementation is write only. The python version of this

246

# trie. This reference implementation is write only. The python version of this

246

# is not expected to be actually used, since it wont provide performance

247

# is not expected to be actually used, since it wont provide performance

247

# improvement over existing non-persistent C implementation.

248

# improvement over existing non-persistent C implementation.

248

#

249

#

249

# The nodemap is persisted as Trie using 4bits-address/16-entries block. each

250

# The nodemap is persisted as Trie using 4bits-address/16-entries block. each

250

# revision can be adressed using its node shortest prefix.

251

# revision can be adressed using its node shortest prefix.

251

#

252

#

252

# The trie is stored as a sequence of block. Each block contains 16 entries

253

# The trie is stored as a sequence of block. Each block contains 16 entries

253

# (signed 64bit integer, big endian). Each entry can be one of the following:

254

# (signed 64bit integer, big endian). Each entry can be one of the following:

254

#

255

#

255

# * value >= 0 -> index of sub-block

256

# * value >= 0 -> index of sub-block

256

# * value == -1 -> no value

257

# * value == -1 -> no value

257

# * value < -1 -> a revision value: rev = -(value+10)

258

# * value < -1 -> a revision value: rev = -(value+10)

258

#

259

#

259

# The implementation focus on simplicity, not on performance. A Rust

260

# The implementation focus on simplicity, not on performance. A Rust

260

# implementation should provide a efficient version of the same binary

261

# implementation should provide a efficient version of the same binary

261

# persistence. This reference python implementation is never meant to be

262

# persistence. This reference python implementation is never meant to be

262

# extensively use in production.

263

# extensively use in production.

263

264

265

def persistent_data(index):

266

def persistent_data(index):

266

"""return the persistent binary form for a nodemap for a given index

267

"""return the persistent binary form for a nodemap for a given index

267

"""

268

"""

268

trie = _build_trie(index)

269

trie = _build_trie(index)

269

return _persist_trie(trie)

270

return _persist_trie(trie)

270

271

272

def update_persistent_data(index, root, max_idx, last_rev):

273

def update_persistent_data(index, root, max_idx, last_rev):

273

"""return the incremental update for persistent nodemap from a given index

274

"""return the incremental update for persistent nodemap from a given index

274

"""

275

"""

275

changed_block, trie = _update_trie(index, root, last_rev)

276

changed_block, trie = _update_trie(index, root, last_rev)

276

return (

277

return (

277

changed_block * S_BLOCK.size,

278

changed_block * S_BLOCK.size,

278

_persist_trie(trie, existing_idx=max_idx),

279

_persist_trie(trie, existing_idx=max_idx),

279

)

280

)

280

281

282

S_BLOCK = struct.Struct(">" + ("l" * 16))

283

S_BLOCK = struct.Struct(">" + ("l" * 16))

283

284

NO_ENTRY = -1

285

NO_ENTRY = -1

285

# rev 0 need to be -2 because 0 is used by block, -1 is a special value.

286

# rev 0 need to be -2 because 0 is used by block, -1 is a special value.

286

REV_OFFSET = 2

287

REV_OFFSET = 2

287

288

289

def _transform_rev(rev):

290

def _transform_rev(rev):

290

"""Return the number used to represent the rev in the tree.

291

"""Return the number used to represent the rev in the tree.

291

292

(or retrieve a rev number from such representation)

293

(or retrieve a rev number from such representation)

293

294

Note that this is an involution, a function equal to its inverse (i.e.

295

Note that this is an involution, a function equal to its inverse (i.e.

295

which gives the identity when applied to itself).

296

which gives the identity when applied to itself).

296

"""

297

"""

297

return -(rev + REV_OFFSET)

298

return -(rev + REV_OFFSET)

298

299

300

def _to_int(hex_digit):

301

def _to_int(hex_digit):

301

"""turn an hexadecimal digit into a proper integer"""

302

"""turn an hexadecimal digit into a proper integer"""

302

return int(hex_digit, 16)

303

return int(hex_digit, 16)

303

304

305

class Block(dict):

306

class Block(dict):

306

"""represent a block of the Trie

307

"""represent a block of the Trie

307

308

contains up to 16 entry indexed from 0 to 15"""

309

contains up to 16 entry indexed from 0 to 15"""

309

310

def __init__(self):

311

def __init__(self):

311

super(Block, self).__init__()

312

super(Block, self).__init__()

312

# If this block exist on disk, here is its ID

313

# If this block exist on disk, here is its ID

313

self.ondisk_id = None

314

self.ondisk_id = None

314

315

def __iter__(self):

316

def __iter__(self):

316

return iter(self.get(i) for i in range(16))

317

return iter(self.get(i) for i in range(16))

317

318

319

def _build_trie(index):

320

def _build_trie(index):

320

"""build a nodemap trie

321

"""build a nodemap trie

321

322

The nodemap stores revision number for each unique prefix.

323

The nodemap stores revision number for each unique prefix.

323

324

Each block is a dictionary with keys in `[0, 15]`. Values are either

325

Each block is a dictionary with keys in `[0, 15]`. Values are either

325

another block or a revision number.

326

another block or a revision number.

326

"""

327

"""

327

root = Block()

328

root = Block()

328

for rev in range(len(index)):

329

for rev in range(len(index)):

329

hex = nodemod.hex(index[rev][7])

330

hex = nodemod.hex(index[rev][7])

330

_insert_into_block(index, 0, root, rev, hex)

331

_insert_into_block(index, 0, root, rev, hex)

331

return root

332

return root

332

333

334

def _update_trie(index, root, last_rev):

335

def _update_trie(index, root, last_rev):

335

"""consume"""

336

"""consume"""

336

changed = 0

337

changed = 0

337

for rev in range(last_rev + 1, len(index)):

338

for rev in range(last_rev + 1, len(index)):

338

hex = nodemod.hex(index[rev][7])

339

hex = nodemod.hex(index[rev][7])

339

changed += _insert_into_block(index, 0, root, rev, hex)

340

changed += _insert_into_block(index, 0, root, rev, hex)

340

return changed, root

341

return changed, root

341

342

343

def _insert_into_block(index, level, block, current_rev, current_hex):

344

def _insert_into_block(index, level, block, current_rev, current_hex):

344

"""insert a new revision in a block

345

"""insert a new revision in a block

345

346

index: the index we are adding revision for

347

index: the index we are adding revision for

347

level: the depth of the current block in the trie

348

level: the depth of the current block in the trie

348

block: the block currently being considered

349

block: the block currently being considered

349

current_rev: the revision number we are adding

350

current_rev: the revision number we are adding

350

current_hex: the hexadecimal representation of the of that revision

351

current_hex: the hexadecimal representation of the of that revision

351

"""

352

"""

352

changed = 1

353

changed = 1

353

if block.ondisk_id is not None:

354

if block.ondisk_id is not None:

354

block.ondisk_id = None

355

block.ondisk_id = None

355

hex_digit = _to_int(current_hex[level : level + 1])

356

hex_digit = _to_int(current_hex[level : level + 1])

356

entry = block.get(hex_digit)

357

entry = block.get(hex_digit)

357

if entry is None:

358

if entry is None:

358

# no entry, simply store the revision number

359

# no entry, simply store the revision number

359

block[hex_digit] = current_rev

360

block[hex_digit] = current_rev

360

elif isinstance(entry, dict):

361

elif isinstance(entry, dict):

361

# need to recurse to an underlying block

362

# need to recurse to an underlying block

362

changed += _insert_into_block(

363

changed += _insert_into_block(

363

index, level + 1, entry, current_rev, current_hex

364

index, level + 1, entry, current_rev, current_hex

364

)

365

)

365

else:

366

else:

366

# collision with a previously unique prefix, inserting new

367

# collision with a previously unique prefix, inserting new

367

# vertices to fit both entry.

368

# vertices to fit both entry.

368

other_hex = nodemod.hex(index[entry][7])

369

other_hex = nodemod.hex(index[entry][7])

369

other_rev = entry

370

other_rev = entry

370

new = Block()

371

new = Block()

371

block[hex_digit] = new

372

block[hex_digit] = new

372

_insert_into_block(index, level + 1, new, other_rev, other_hex)

373

_insert_into_block(index, level + 1, new, other_rev, other_hex)

373

_insert_into_block(index, level + 1, new, current_rev, current_hex)

374

_insert_into_block(index, level + 1, new, current_rev, current_hex)

374

return changed

375

return changed

375

376

377

def _persist_trie(root, existing_idx=None):

378

def _persist_trie(root, existing_idx=None):

378

"""turn a nodemap trie into persistent binary data

379

"""turn a nodemap trie into persistent binary data

379

380

See `_build_trie` for nodemap trie structure"""

381

See `_build_trie` for nodemap trie structure"""

381

block_map = {}

382

block_map = {}

382

if existing_idx is not None:

383

if existing_idx is not None:

383

base_idx = existing_idx + 1

384

base_idx = existing_idx + 1

384

else:

385

else:

385

base_idx = 0

386

base_idx = 0

386

chunks = []

387

chunks = []

387

for tn in _walk_trie(root):

388

for tn in _walk_trie(root):

388

if tn.ondisk_id is not None:

389

if tn.ondisk_id is not None:

389

block_map[id(tn)] = tn.ondisk_id

390

block_map[id(tn)] = tn.ondisk_id

390

else:

391

else:

391

block_map[id(tn)] = len(chunks) + base_idx

392

block_map[id(tn)] = len(chunks) + base_idx

392

chunks.append(_persist_block(tn, block_map))

393

chunks.append(_persist_block(tn, block_map))

393

return b''.join(chunks)

394

return b''.join(chunks)

394

395

396

def _walk_trie(block):

397

def _walk_trie(block):

397

"""yield all the block in a trie

398

"""yield all the block in a trie

398

399

Children blocks are always yield before their parent block.

400

Children blocks are always yield before their parent block.

400

"""

401

"""

401

for (_, item) in sorted(block.items()):

402

for (_, item) in sorted(block.items()):

402

if isinstance(item, dict):

403

if isinstance(item, dict):

403

for sub_block in _walk_trie(item):

404

for sub_block in _walk_trie(item):

404

yield sub_block

405

yield sub_block

405

yield block

406

yield block

406

407

408

def _persist_block(block_node, block_map):

409

def _persist_block(block_node, block_map):

409

"""produce persistent binary data for a single block

410

"""produce persistent binary data for a single block

410

411

Children block are assumed to be already persisted and present in

412

Children block are assumed to be already persisted and present in

412

block_map.

413

block_map.

413

"""

414

"""

414

data = tuple(_to_value(v, block_map) for v in block_node)

415

data = tuple(_to_value(v, block_map) for v in block_node)

415

return S_BLOCK.pack(*data)

416

return S_BLOCK.pack(*data)

416

417

418

def _to_value(item, block_map):

419

def _to_value(item, block_map):

419

"""persist any value as an integer"""

420

"""persist any value as an integer"""

420

if item is None:

421

if item is None:

421

return NO_ENTRY

422

return NO_ENTRY

422

elif isinstance(item, dict):

423

elif isinstance(item, dict):

423

return block_map[id(item)]

424

return block_map[id(item)]

424

else:

425

else:

425

return _transform_rev(item)

426

return _transform_rev(item)

426

427

428

def parse_data(data):

429

def parse_data(data):

429

"""parse parse nodemap data into a nodemap Trie"""

430

"""parse parse nodemap data into a nodemap Trie"""

430

if (len(data) % S_BLOCK.size) != 0:

431

if (len(data) % S_BLOCK.size) != 0:

431

msg = "nodemap data size is not a multiple of block size (%d): %d"

432

msg = "nodemap data size is not a multiple of block size (%d): %d"

432

raise error.Abort(msg % (S_BLOCK.size, len(data)))

433

raise error.Abort(msg % (S_BLOCK.size, len(data)))

433

if not data:

434

if not data:

434

return Block(), None

435

return Block(), None

435

block_map = {}

436

block_map = {}

436

new_blocks = []

437

new_blocks = []

437

for i in range(0, len(data), S_BLOCK.size):

438

for i in range(0, len(data), S_BLOCK.size):

438

block = Block()

439

block = Block()

439

block.ondisk_id = len(block_map)

440

block.ondisk_id = len(block_map)

440

block_map[block.ondisk_id] = block

441

block_map[block.ondisk_id] = block

441

block_data = data[i : i + S_BLOCK.size]

442

block_data = data[i : i + S_BLOCK.size]

442

values = S_BLOCK.unpack(block_data)

443

values = S_BLOCK.unpack(block_data)

443

new_blocks.append((block, values))

444

new_blocks.append((block, values))

444

for b, values in new_blocks:

445

for b, values in new_blocks:

445

for idx, v in enumerate(values):

446

for idx, v in enumerate(values):

446

if v == NO_ENTRY:

447

if v == NO_ENTRY:

447

continue

448

continue

448

elif v >= 0:

449

elif v >= 0:

449

b[idx] = block_map[v]

450

b[idx] = block_map[v]

450

else:

451

else:

451

b[idx] = _transform_rev(v)

452

b[idx] = _transform_rev(v)

452

return block, i // S_BLOCK.size

453

return block, i // S_BLOCK.size

453

454

455

# debug utility

456

# debug utility

456

457

458

def check_data(ui, index, data):

459

def check_data(ui, index, data):

459

"""verify that the provided nodemap data are valid for the given idex"""

460

"""verify that the provided nodemap data are valid for the given idex"""

460

ret = 0

461

ret = 0

461

ui.status((b"revision in index: %d\n") % len(index))

462

ui.status((b"revision in index: %d\n") % len(index))

462

root, __ = parse_data(data)

463

root, __ = parse_data(data)

463

all_revs = set(_all_revisions(root))

464

all_revs = set(_all_revisions(root))

464

ui.status((b"revision in nodemap: %d\n") % len(all_revs))

465

ui.status((b"revision in nodemap: %d\n") % len(all_revs))

465

for r in range(len(index)):

466

for r in range(len(index)):

466

if r not in all_revs:

467

if r not in all_revs:

467

msg = b" revision missing from nodemap: %d\n" % r

468

msg = b" revision missing from nodemap: %d\n" % r

468

ui.write_err(msg)

469

ui.write_err(msg)

469

ret = 1

470

ret = 1

470

else:

471

else:

471

all_revs.remove(r)

472

all_revs.remove(r)

472

nm_rev = _find_node(root, nodemod.hex(index[r][7]))

473

nm_rev = _find_node(root, nodemod.hex(index[r][7]))

473

if nm_rev is None:

474

if nm_rev is None:

474

msg = b" revision node does not match any entries: %d\n" % r

475

msg = b" revision node does not match any entries: %d\n" % r

475

ui.write_err(msg)

476

ui.write_err(msg)

476

ret = 1

477

ret = 1

477

elif nm_rev != r:

478

elif nm_rev != r:

478

msg = (

479

msg = (

479

b" revision node does not match the expected revision: "

480

b" revision node does not match the expected revision: "

480

b"%d != %d\n" % (r, nm_rev)

481

b"%d != %d\n" % (r, nm_rev)

481

)

482

)

482

ui.write_err(msg)

483

ui.write_err(msg)

483

ret = 1

484

ret = 1

484

485

if all_revs:

486

if all_revs:

486

for r in sorted(all_revs):

487

for r in sorted(all_revs):

487

msg = b" extra revision in nodemap: %d\n" % r

488

msg = b" extra revision in nodemap: %d\n" % r

488

ui.write_err(msg)

489

ui.write_err(msg)

489

ret = 1

490

ret = 1

490

return ret

491

return ret

491

492

493

def _all_revisions(root):

494

def _all_revisions(root):

494

"""return all revisions stored in a Trie"""

495

"""return all revisions stored in a Trie"""

495

for block in _walk_trie(root):

496

for block in _walk_trie(root):

496

for v in block:

497

for v in block:

497

if v is None or isinstance(v, Block):

498

if v is None or isinstance(v, Block):

498

continue

499

continue

499

yield v

500

yield v

500

501

502

def _find_node(block, node):

503

def _find_node(block, node):

503

"""find the revision associated with a given node"""

504

"""find the revision associated with a given node"""

504

entry = block.get(_to_int(node[0:1]))

505

entry = block.get(_to_int(node[0:1]))

505

if isinstance(entry, dict):

506

if isinstance(entry, dict):

506

return _find_node(entry, node[1:])

507

return _find_node(entry, node[1:])

507

return entry

508

return entry

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

             # nodemap.py - nodemap related code and utilities
             #
             # Copyright 2019 Pierre-Yves David <pierre-yves.david@octobus.net>
             # Copyright 2019 George Racinet <georges.racinet@octobus.net>
             #
             # This software may be used and distributed according to the terms of the
             # GNU General Public License version 2 or any later version.
             from __future__ import absolute_import
             import os
             import re
             import struct
             from .. import (
                 error,
                 node as nodemod,
                 util,
             )
             class NodeMap(dict):
                 def __missing__(self, x):
                     raise error.RevlogError(b'unknown node: %s' % x)
             def persisted_data(revlog):
                 """read the nodemap for a revlog from disk"""
                 if revlog.nodemap_file is None:
                     return None
                 pdata = revlog.opener.tryread(revlog.nodemap_file)
                 if not pdata:
                     return None
                 offset = 0
                 (version,) = S_VERSION.unpack(pdata[offset : offset + S_VERSION.size])
                 if version != ONDISK_VERSION:
                     return None
                 offset += S_VERSION.size
                 headers = S_HEADER.unpack(pdata[offset : offset + S_HEADER.size])
                 uid_size, tip_rev, data_length, data_unused = headers
                 offset += S_HEADER.size
                 docket = NodeMapDocket(pdata[offset : offset + uid_size])
                 docket.tip_rev = tip_rev
                 docket.data_length = data_length
                 docket.data_unused = data_unused
                 filename = _rawdata_filepath(revlog, docket)
                 return docket, revlog.opener.tryread(filename)
             def setup_persistent_nodemap(tr, revlog):
                 """Install whatever is needed transaction side to persist a nodemap on disk
                 (only actually persist the nodemap if this is relevant for this revlog)
                 """
                 if revlog._inline:
                     return  # inlined revlog are too small for this to be relevant
                 if revlog.nodemap_file is None:
                     return  # we do not use persistent_nodemap on this revlog
                 callback_id = b"revlog-persistent-nodemap-%s" % revlog.nodemap_file
                 if tr.hasfinalize(callback_id):
                     return  # no need to register again
                 tr.addfinalize(callback_id, lambda tr: _persist_nodemap(tr, revlog))
             def _persist_nodemap(tr, revlog):
                 """Write nodemap data on disk for a given revlog
                 """
                 if getattr(revlog, 'filteredrevs', ()):
                     raise error.ProgrammingError(
                         "cannot persist nodemap of a filtered changelog"
                     )
                 if revlog.nodemap_file is None:
                     msg = "calling persist nodemap on a revlog without the feature enableb"
                     raise error.ProgrammingError(msg)
                 can_incremental = util.safehasattr(revlog.index, "nodemap_data_incremental")
                 ondisk_docket = revlog._nodemap_docket
                 data = None
                 # first attemp an incremental update of the data
                 if can_incremental and ondisk_docket is not None:
                     target_docket = revlog._nodemap_docket.copy()
                     (
                         src_docket,
                         data_changed_count,
                         data,
                     ) = revlog.index.nodemap_data_incremental()
                     if src_docket != target_docket:
                         data = None
                     else:
                         datafile = _rawdata_filepath(revlog, target_docket)
                         # EXP-TODO: if this is a cache, this should use a cache vfs, not a
                         # store vfs
-                        with revlog.opener(datafile, b'a') as fd:
+                        with revlog.opener(datafile, b'r+') as fd:
+                            fd.seek(target_docket.data_length)
                             fd.write(data)
                         target_docket.data_length += len(data)
                         target_docket.data_unused += data_changed_count
                 if data is None:
                     # otherwise fallback to a full new export
                     target_docket = NodeMapDocket()
                     datafile = _rawdata_filepath(revlog, target_docket)
                     if util.safehasattr(revlog.index, "nodemap_data_all"):
                         data = revlog.index.nodemap_data_all()
                     else:
                         data = persistent_data(revlog.index)
                     # EXP-TODO: if this is a cache, this should use a cache vfs, not a
                     # store vfs
                     with revlog.opener(datafile, b'w') as fd:
                         fd.write(data)
                     target_docket.data_length = len(data)
                 target_docket.tip_rev = revlog.tiprev()
                 # EXP-TODO: if this is a cache, this should use a cache vfs, not a
                 # store vfs
                 with revlog.opener(revlog.nodemap_file, b'w', atomictemp=True) as fp:
                     fp.write(target_docket.serialize())
                 revlog._nodemap_docket = target_docket
                 # EXP-TODO: if the transaction abort, we should remove the new data and
                 # reinstall the old one.
                 # search for old index file in all cases, some older process might have
                 # left one behind.
                 olds = _other_rawdata_filepath(revlog, target_docket)
                 if olds:
                     realvfs = getattr(revlog, '_realopener', revlog.opener)
                     def cleanup(tr):
                         for oldfile in olds:
                             realvfs.tryunlink(oldfile)
                     callback_id = b"revlog-cleanup-nodemap-%s" % revlog.nodemap_file
                     tr.addpostclose(callback_id, cleanup)
             ### Nodemap docket file
             #
             # The nodemap data are stored on disk using 2 files:
             #
             # * a raw data files containing a persistent nodemap
             #   (see `Nodemap Trie` section)
             #
             # * a small "docket" file containing medatadata
             #
             # While the nodemap data can be multiple tens of megabytes, the "docket" is
             # small, it is easy to update it automatically or to duplicated its content
             # during a transaction.
             #
             # Multiple raw data can exist at the same time (The currently valid one and a
             # new one beind used by an in progress transaction). To accomodate this, the
             # filename hosting the raw data has a variable parts. The exact filename is
             # specified inside the "docket" file.
             #
             # The docket file contains information to find, qualify and validate the raw
             # data. Its content is currently very light, but it will expand as the on disk
             # nodemap gains the necessary features to be used in production.
             # version 0 is experimental, no BC garantee, do no use outside of tests.
             ONDISK_VERSION = 0
             S_VERSION = struct.Struct(">B")
             S_HEADER = struct.Struct(">BQQQ")
             ID_SIZE = 8
             def _make_uid():
                 """return a new unique identifier.
                 The identifier is random and composed of ascii characters."""
                 return nodemod.hex(os.urandom(ID_SIZE))
             class NodeMapDocket(object):
                 """metadata associated with persistent nodemap data
                 The persistent data may come from disk or be on their way to disk.
                 """
                 def __init__(self, uid=None):
                     if uid is None:
                         uid = _make_uid()
                     self.uid = uid
                     self.tip_rev = None
                     self.data_length = None
                     self.data_unused = 0
                 def copy(self):
                     new = NodeMapDocket(uid=self.uid)
                     new.tip_rev = self.tip_rev
                     new.data_length = self.data_length
                     new.data_unused = self.data_unused
                     return new
                 def __cmp__(self, other):
                     if self.uid < other.uid:
                         return -1
                     if self.uid > other.uid:
                         return 1
                     elif self.data_length < other.data_length:
                         return -1
                     elif self.data_length > other.data_length:
                         return 1
                     return 0
                 def __eq__(self, other):
                     return self.uid == other.uid and self.data_length == other.data_length
                 def serialize(self):
                     """return serialized bytes for a docket using the passed uid"""
                     data = []
                     data.append(S_VERSION.pack(ONDISK_VERSION))
                     headers = (
                         len(self.uid),
                         self.tip_rev,
                         self.data_length,
                         self.data_unused,
                     )
                     data.append(S_HEADER.pack(*headers))
                     data.append(self.uid)
                     return b''.join(data)
             def _rawdata_filepath(revlog, docket):
                 """The (vfs relative) nodemap's rawdata file for a given uid"""
                 prefix = revlog.nodemap_file[:-2]
                 return b"%s-%s.nd" % (prefix, docket.uid)
             def _other_rawdata_filepath(revlog, docket):
                 prefix = revlog.nodemap_file[:-2]
                 pattern = re.compile(b"(^|/)%s-[0-9a-f]+\.nd$" % prefix)
                 new_file_path = _rawdata_filepath(revlog, docket)
                 new_file_name = revlog.opener.basename(new_file_path)
                 dirpath = revlog.opener.dirname(new_file_path)
                 others = []
                 for f in revlog.opener.listdir(dirpath):
                     if pattern.match(f) and f != new_file_name:
                         others.append(f)
                 return others
             ### Nodemap Trie
             #
             # This is a simple reference implementation to compute and persist a nodemap
             # trie. This reference implementation is write only. The python version of this
             # is not expected to be actually used, since it wont provide performance
             # improvement over existing non-persistent C implementation.
             #
             # The nodemap is persisted as Trie using 4bits-address/16-entries block. each
             # revision can be adressed using its node shortest prefix.
             #
             # The trie is stored as a sequence of block. Each block contains 16 entries
             # (signed 64bit integer, big endian). Each entry can be one of the following:
             #
             #  * value >=  0 -> index of sub-block
             #  * value == -1 -> no value
             #  * value <  -1 -> a revision value: rev = -(value+10)
             #
             # The implementation focus on simplicity, not on performance. A Rust
             # implementation should provide a efficient version of the same binary
             # persistence. This reference python implementation is never meant to be
             # extensively use in production.
             def persistent_data(index):
                 """return the persistent binary form for a nodemap for a given index
                 """
                 trie = _build_trie(index)
                 return _persist_trie(trie)
             def update_persistent_data(index, root, max_idx, last_rev):
                 """return the incremental update for persistent nodemap from a given index
                 """
                 changed_block, trie = _update_trie(index, root, last_rev)
                 return (
                     changed_block * S_BLOCK.size,
                     _persist_trie(trie, existing_idx=max_idx),
                 )
             S_BLOCK = struct.Struct(">" + ("l" * 16))
             NO_ENTRY = -1
             # rev 0 need to be -2 because 0 is used by block, -1 is a special value.
             REV_OFFSET = 2
             def _transform_rev(rev):
                 """Return the number used to represent the rev in the tree.
                 (or retrieve a rev number from such representation)
                 Note that this is an involution, a function equal to its inverse (i.e.
                 which gives the identity when applied to itself).
                 """
                 return -(rev + REV_OFFSET)
             def _to_int(hex_digit):
                 """turn an hexadecimal digit into a proper integer"""
                 return int(hex_digit, 16)
             class Block(dict):
                 """represent a block of the Trie
                 contains up to 16 entry indexed from 0 to 15"""
                 def __init__(self):
                     super(Block, self).__init__()
                     # If this block exist on disk, here is its ID
                     self.ondisk_id = None
                 def __iter__(self):
                     return iter(self.get(i) for i in range(16))
             def _build_trie(index):
                 """build a nodemap trie
                 The nodemap stores revision number for each unique prefix.
                 Each block is a dictionary with keys in `[0, 15]`. Values are either
                 another block or a revision number.
                 """
                 root = Block()
                 for rev in range(len(index)):
                     hex = nodemod.hex(index[rev][7])
                     _insert_into_block(index, 0, root, rev, hex)
                 return root
             def _update_trie(index, root, last_rev):
                 """consume"""
                 changed = 0
                 for rev in range(last_rev + 1, len(index)):
                     hex = nodemod.hex(index[rev][7])
                     changed += _insert_into_block(index, 0, root, rev, hex)
                 return changed, root
             def _insert_into_block(index, level, block, current_rev, current_hex):
                 """insert a new revision in a block
                 index: the index we are adding revision for
                 level: the depth of the current block in the trie
                 block: the block currently being considered
                 current_rev: the revision number we are adding
                 current_hex: the hexadecimal representation of the of that revision
                 """
                 changed = 1
                 if block.ondisk_id is not None:
                     block.ondisk_id = None
                 hex_digit = _to_int(current_hex[level : level + 1])
                 entry = block.get(hex_digit)
                 if entry is None:
                     # no entry, simply store the revision number
                     block[hex_digit] = current_rev
                 elif isinstance(entry, dict):
                     # need to recurse to an underlying block
                     changed += _insert_into_block(
                         index, level + 1, entry, current_rev, current_hex
                     )
                 else:
                     # collision with a previously unique prefix, inserting new
                     # vertices to fit both entry.
                     other_hex = nodemod.hex(index[entry][7])
                     other_rev = entry
                     new = Block()
                     block[hex_digit] = new
                     _insert_into_block(index, level + 1, new, other_rev, other_hex)
                     _insert_into_block(index, level + 1, new, current_rev, current_hex)
                 return changed
             def _persist_trie(root, existing_idx=None):
                 """turn a nodemap trie into persistent binary data
                 See `_build_trie` for nodemap trie structure"""
                 block_map = {}
                 if existing_idx is not None:
                     base_idx = existing_idx + 1
                 else:
                     base_idx = 0
                 chunks = []
                 for tn in _walk_trie(root):
                     if tn.ondisk_id is not None:
                         block_map[id(tn)] = tn.ondisk_id
                     else:
                         block_map[id(tn)] = len(chunks) + base_idx
                         chunks.append(_persist_block(tn, block_map))
                 return b''.join(chunks)
             def _walk_trie(block):
                 """yield all the block in a trie
                 Children blocks are always yield before their parent block.
                 """
                 for (_, item) in sorted(block.items()):
                     if isinstance(item, dict):
                         for sub_block in _walk_trie(item):
                             yield sub_block
                 yield block
             def _persist_block(block_node, block_map):
                 """produce persistent binary data for a single block
                 Children block are assumed to be already persisted and present in
                 block_map.
                 """
                 data = tuple(_to_value(v, block_map) for v in block_node)
                 return S_BLOCK.pack(*data)
             def _to_value(item, block_map):
                 """persist any value as an integer"""
                 if item is None:
                     return NO_ENTRY
                 elif isinstance(item, dict):
                     return block_map[id(item)]
                 else:
                     return _transform_rev(item)
             def parse_data(data):
                 """parse parse nodemap data into a nodemap Trie"""
                 if (len(data) % S_BLOCK.size) != 0:
                     msg = "nodemap data size is not a multiple of block size (%d): %d"
                     raise error.Abort(msg % (S_BLOCK.size, len(data)))
                 if not data:
                     return Block(), None
                 block_map = {}
                 new_blocks = []
                 for i in range(0, len(data), S_BLOCK.size):
                     block = Block()
                     block.ondisk_id = len(block_map)
                     block_map[block.ondisk_id] = block
                     block_data = data[i : i + S_BLOCK.size]
                     values = S_BLOCK.unpack(block_data)
                     new_blocks.append((block, values))
                 for b, values in new_blocks:
                     for idx, v in enumerate(values):
                         if v == NO_ENTRY:
                             continue
                         elif v >= 0:
                             b[idx] = block_map[v]
                         else:
                             b[idx] = _transform_rev(v)
                 return block, i // S_BLOCK.size
             # debug utility
             def check_data(ui, index, data):
                 """verify that the provided nodemap data are valid for the given idex"""
                 ret = 0
                 ui.status((b"revision in index:   %d\n") % len(index))
                 root, __ = parse_data(data)
                 all_revs = set(_all_revisions(root))
                 ui.status((b"revision in nodemap: %d\n") % len(all_revs))
                 for r in range(len(index)):
                     if r not in all_revs:
                         msg = b"  revision missing from nodemap: %d\n" % r
                         ui.write_err(msg)
                         ret = 1
                     else:
                         all_revs.remove(r)
                     nm_rev = _find_node(root, nodemod.hex(index[r][7]))
                     if nm_rev is None:
                         msg = b"  revision node does not match any entries: %d\n" % r
                         ui.write_err(msg)
                         ret = 1
                     elif nm_rev != r:
                         msg = (
                             b"  revision node does not match the expected revision: "
                             b"%d != %d\n" % (r, nm_rev)
                         )
                         ui.write_err(msg)
                         ret = 1
                 if all_revs:
                     for r in sorted(all_revs):
                         msg = b"  extra revision in  nodemap: %d\n" % r
                         ui.write_err(msg)
                     ret = 1
                 return ret
             def _all_revisions(root):
                 """return all revisions stored in a Trie"""
                 for block in _walk_trie(root):
                     for v in block:
                         if v is None or isinstance(v, Block):
                             continue
                         yield v
             def _find_node(block, node):
                 """find the revision associated with a given node"""
                 entry = block.get(_to_int(node[0:1]))
                 if isinstance(entry, dict):
                     return _find_node(entry, node[1:])
                 return entry