upstream/kallithea Commit - r3922:d8e02de5

1

# -*- coding: utf-8 -*-

1

# -*- coding: utf-8 -*-

2

"""

2

"""

3

rhodecode.lib.indexers.daemon

3

rhodecode.lib.indexers.daemon

4

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

4

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

5

6

A daemon will read from task table and run tasks

6

A daemon will read from task table and run tasks

7

8

:created_on: Jan 26, 2010

8

:created_on: Jan 26, 2010

9

:author: marcink

9

:author: marcink

10

11

:license: GPLv3, see COPYING for more details.

11

:license: GPLv3, see COPYING for more details.

12

"""

12

"""

13

# This program is free software: you can redistribute it and/or modify

13

# This program is free software: you can redistribute it and/or modify

14

# it under the terms of the GNU General Public License as published by

14

# it under the terms of the GNU General Public License as published by

15

# the Free Software Foundation, either version 3 of the License, or

15

# the Free Software Foundation, either version 3 of the License, or

16

# (at your option) any later version.

16

# (at your option) any later version.

17

#

17

#

18

# This program is distributed in the hope that it will be useful,

18

# This program is distributed in the hope that it will be useful,

19

# but WITHOUT ANY WARRANTY; without even the implied warranty of

19

# but WITHOUT ANY WARRANTY; without even the implied warranty of

20

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

20

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

21

# GNU General Public License for more details.

21

# GNU General Public License for more details.

22

#

22

#

23

# You should have received a copy of the GNU General Public License

23

# You should have received a copy of the GNU General Public License

24

# along with this program. If not, see <http://www.gnu.org/licenses/>.

24

# along with this program. If not, see <http://www.gnu.org/licenses/>.

25

from __future__ import with_statement

25

from __future__ import with_statement

26

27

import os

27

import os

28

import sys

28

import sys

29

import logging

29

import logging

30

import traceback

30

import traceback

31

32

from shutil import rmtree

32

from shutil import rmtree

33

from time import mktime

33

from time import mktime

34

35

from os.path import dirname as dn

35

from os.path import dirname as dn

36

from os.path import join as jn

36

from os.path import join as jn

37

38

#to get the rhodecode import

38

#to get the rhodecode import

39

project_path = dn(dn(dn(dn(os.path.realpath(__file__)))))

39

project_path = dn(dn(dn(dn(os.path.realpath(__file__)))))

40

sys.path.append(project_path)

40

sys.path.append(project_path)

41

42

from rhodecode.config.conf import INDEX_EXTENSIONS

42

from rhodecode.config.conf import INDEX_EXTENSIONS

43

from rhodecode.model.scm import ScmModel

43

from rhodecode.model.scm import ScmModel

44

from rhodecode.model.db import Repository

44

from rhodecode.model.db import Repository

45

from rhodecode.lib.utils2 import safe_unicode, safe_str

45

from rhodecode.lib.utils2 import safe_unicode, safe_str

46

from rhodecode.lib.indexers import SCHEMA, IDX_NAME, CHGSETS_SCHEMA, \

46

from rhodecode.lib.indexers import SCHEMA, IDX_NAME, CHGSETS_SCHEMA, \

47

CHGSET_IDX_NAME

47

CHGSET_IDX_NAME

48

49

from rhodecode.lib.vcs.exceptions import ChangesetError, RepositoryError, \

49

from rhodecode.lib.vcs.exceptions import ChangesetError, RepositoryError, \

50

NodeDoesNotExistError

50

NodeDoesNotExistError

51

52

from whoosh.index import create_in, open_dir, exists_in

52

from whoosh.index import create_in, open_dir, exists_in

53

from whoosh.query import *

53

from whoosh.query import *

54

from whoosh.qparser import QueryParser

54

from whoosh.qparser import QueryParser

55

56

log = logging.getLogger('whoosh_indexer')

56

log = logging.getLogger('whoosh_indexer')

57

58

59

class WhooshIndexingDaemon(object):

59

class WhooshIndexingDaemon(object):

60

"""

60

"""

61

Daemon for atomic indexing jobs

61

Daemon for atomic indexing jobs

62

"""

62

"""

63

64

def __init__(self, indexname=IDX_NAME, index_location=None,
             repo_location=None, sa=None, repo_list=None,
             repo_update_list=None):
    """
    Prepare the daemon: check the locations, scan the repositories,
    apply the optional name filters and decide if a full build is needed.

    :param indexname: name of the file content index opened on updates
    :param index_location: directory holding the whoosh indexes
    :param repo_location: root directory scanned for repositories
    :param sa: passed unchanged to ``ScmModel``
    :param repo_list: optional repository names; when given, only these
        are kept in ``self.repo_paths``
    :param repo_update_list: optional repository names; when given,
        ``self.repo_paths`` is narrowed to them and they are remembered
        in ``self.filtered_repo_update_paths``
    :raises Exception: when ``index_location`` or ``repo_location``
        is empty
    """
    self.indexname = indexname

    self.index_location = index_location
    if not index_location:
        raise Exception('You have to provide index location')

    self.repo_location = repo_location
    if not repo_location:
        raise Exception('You have to provide repositories location')

    self.repo_paths = ScmModel(sa).repo_scan(self.repo_location)

    # keep only the explicitly requested repositories
    if repo_list:
        # non-ascii names are converted so they compare as unicode
        wanted = [safe_unicode(name) for name in repo_list]
        self.filtered_repo_paths = dict(
            (name, scm_repo)
            for name, scm_repo in self.repo_paths.items()
            if name in wanted
        )
        self.repo_paths = self.filtered_repo_paths

    # keep only the repositories requested for an update
    # NOTE(review): unlike repo_list these names are not passed through
    # safe_unicode before the comparison - confirm this is intended
    self.filtered_repo_update_paths = {}
    if repo_update_list:
        self.filtered_repo_update_paths = dict(
            (name, scm_repo)
            for name, scm_repo in self.repo_paths.items()
            if name in repo_update_list
        )
        self.repo_paths = self.filtered_repo_update_paths

    # a full build is required unless the directory and both indexes exist
    self.initial = True
    index_dir = self.index_location
    if not os.path.isdir(index_dir):
        os.makedirs(index_dir)
        reason = ('Cannot run incremental index since it does not '
                  'yet exist running full build')
    elif not exists_in(index_dir, IDX_NAME):
        reason = ('Running full index build as the file content '
                  'index does not exist')
    elif not exists_in(index_dir, CHGSET_IDX_NAME):
        reason = ('Running full index build as the changeset '
                  'index does not exist')
    else:
        reason = None
        self.initial = False

    if reason is not None:
        log.info(reason)

112

113

def _get_index_revision(self, repo):
    """
    Return the revision to index for ``repo``.

    The database record is looked up by ``repo.name``; its
    ``landing_rev`` is returned, or ``'tip'`` when no record is found.
    """
    db_repo = Repository.get_by_repo_name(repo.name)
    if not db_repo:
        return 'tip'
    return db_repo.landing_rev

119

120

def _get_index_changeset(self, repo):

120

def _get_index_changeset(self, repo):

121

index_rev = self._get_index_revision(repo)

121

index_rev = self._get_index_revision(repo)

122

cs = repo.get_changeset(index_rev)

122

cs = repo.get_changeset(index_rev)

123

return cs

123

return cs

124

125

def get_paths(self, repo):
    """
    Walk the indexed changeset of ``repo`` from ``'/'`` and return the
    set of file paths found, each joined onto ``repo.path``.

    A ``RepositoryError`` raised on the way is logged at debug level and
    whatever was collected up to that point is returned.
    """
    found = set()
    try:
        changeset = self._get_index_changeset(repo)
        for _top, _subdirs, file_nodes in changeset.walk('/'):
            for file_node in file_nodes:
                full_path = os.path.join(safe_str(repo.path),
                                         safe_str(file_node.path))
                found.add(full_path)
    except RepositoryError:
        # best effort: keep the paths gathered so far
        log.debug(traceback.format_exc())
    return found

141

142

def get_node(self, repo, path):
    """
    Get a file node for the given full path. Paths are handled as plain
    strings for hg/git compatibility.

    :param repo: scm repo instance
    :param path: full path including the repository root location
    :return: FileNode
    """
    root_prefix = safe_str(repo.path) + '/'
    # everything after the first occurrence of the root prefix is the
    # path inside the repository; when the prefix does not occur,
    # partition() leaves this part empty
    _before, _prefix, inner_path = safe_str(path).partition(root_prefix)
    changeset = self._get_index_changeset(repo)
    return changeset.get_node(inner_path)

156

157

def get_node_mtime(self, node):
    """
    Return the date of the node's last changeset as a ``mktime``
    timestamp.
    """
    changed_on = node.last_changeset.date
    return mktime(changed_on.timetuple())

159

160

def add_doc(self, writer, path, repo, repo_name):
    """
    Add one file document to ``writer``; the node data is fetched from
    the vcs backend through ``get_node``.

    :param writer: whoosh index writer receiving the document
    :param path: full path of the file, including the repository root
    :param repo: vcs repository instance the file belongs to
    :param repo_name: repository name stored with the document
    :return: tuple ``(indexed, indexed_w_content)``; ``indexed`` is 1 when
        only the file name was indexed, ``indexed_w_content`` is 1 when
        the unicode content was indexed too
    """
    node = self.get_node(repo, path)
    name_only = with_content = 0
    text = u''

    # content is indexed only for chosen extensions and non binary files
    wants_content = node.extension in INDEX_EXTENSIONS and not node.is_binary
    if not wants_content:
        # just index file name without its content
        log.debug(' >> %s' % path)
        name_only = 1
    else:
        raw = node.content
        if isinstance(raw, unicode):
            log.debug(' >> %s [WITH CONTENT]' % path)
            text = raw
            with_content = 1
        else:
            # neither counter is bumped here; the document is still added
            log.warning(' >> %s Could not get this content as unicode '
                        'replacing with empty content' % path)

    u_path = safe_unicode(path)
    writer.add_document(
        fileid=u_path,
        owner=unicode(repo.contact),
        repository=safe_unicode(repo_name),
        path=u_path,
        content=text,
        modtime=self.get_node_mtime(node),
        extension=node.extension
    )
    return name_only, with_content

196

197

def index_changesets(self, writer, repo_name, repo, start_rev=None):
    """
    Add all changesets of the vcs repo, starting at start_rev, to the
    index writer.

    :param writer: the whoosh index writer to add to
    :param repo_name: name of the repository the changesets originate
        from, including the repository group
    :param repo: the vcs repository instance to index changesets for,
        the presumption is the repo has changesets to index
    :param start_rev=None: the full sha id to start indexing from;
        if start_rev is None then index from the first changeset in
        the repo
    :return: number of changesets added to the writer
    """

    def _joined_paths(nodes):
        # lower-cased, space separated paths of the given nodes
        return u' '.join([safe_unicode(node.path) for node in nodes]).lower()

    if start_rev is None:
        start_rev = repo[0].raw_id

    log.debug('indexing changesets in %s starting at rev: %s' %
              (repo_name, start_rev))

    # identical for every document of this repository
    owner = unicode(repo.contact)
    repository = safe_unicode(repo_name)

    indexed = 0
    cs_iter = repo.get_changesets(start=start_rev)
    total = len(cs_iter)
    for cs in cs_iter:
        log.debug(' >> %s/%s' % (cs, total))
        writer.add_document(
            raw_id=unicode(cs.raw_id),
            owner=owner,
            date=cs._timestamp,
            repository=repository,
            author=cs.author,
            message=cs.message,
            last=cs.last,
            added=_joined_paths(cs.added),
            removed=_joined_paths(cs.removed),
            changed=_joined_paths(cs.changed),
            # a dedicated loop name: a Python 2 list comprehension leaks
            # its variable, so reusing ``cs`` here would rebind the
            # changeset being indexed to its last parent
            parents=u' '.join([parent.raw_id for parent in cs.parents]),
        )
        indexed += 1

    log.debug('indexed %d changesets for repo %s' % (indexed, repo_name))
    return indexed

238

240

239

def index_files(self, file_idx_writer, repo_name, repo):
    """
    Index every file path of the given repository.

    :param file_idx_writer: the whoosh index writer to add to
    :param repo_name: name of the repository being indexed
    :param repo: instance of vcs repo
    :return: tuple of the summed ``add_doc`` counters
        ``(indexed, indexed_w_content)``
    """
    name_only_total = with_content_total = 0
    log.debug('building index for %s @revision:%s' % (
        repo.path, self._get_index_revision(repo)))

    for idx_path in self.get_paths(repo):
        name_only, with_content = self.add_doc(
            file_idx_writer, idx_path, repo, repo_name)
        name_only_total += name_only
        with_content_total += with_content

    all_added = name_only_total + with_content_total
    log.debug('added %s files %s with content for repo %s' %
              (all_added, with_content_total, repo.path))
    return name_only_total, with_content_total

258

260

259

def update_changeset_index(self):
    """
    Incrementally update the changeset index for ``self.repo_paths``.

    For each non-empty repository the document flagged ``last`` is looked
    up. If the repository has more revisions than that changeset
    accounts for (or nothing usable was found), the stale ``last``
    documents are deleted and changesets are re-indexed from that point
    on. The writer is committed when something was written and cancelled
    otherwise, so it is never left open.
    """
    idx = open_dir(self.index_location, indexname=CHGSET_IDX_NAME)
    # one parser serves every repository
    qp = QueryParser('repository', schema=CHGSETS_SCHEMA)

    with idx.searcher() as searcher:
        writer = idx.writer()
        writer_is_dirty = False
        try:
            indexed_total = 0
            # pre-set so the summary log below also works when there is
            # no repository to loop over
            repo_name = None
            for repo_name, repo in self.repo_paths.items():
                # skip indexing if there aren't any revs in the repo
                num_of_revs = len(repo)
                if num_of_revs < 1:
                    continue

                q = qp.parse(u"last:t AND %s" % repo_name)
                results = searcher.search(q)

                # default to scanning the entire repo
                last_rev = 0
                start_id = None

                if len(results) > 0:
                    # assuming that there is only one result, if not this
                    # may require a full re-index.
                    start_id = results[0]['raw_id']
                    last_rev = repo.get_changeset(revision=start_id).revision

                # there are new changesets to index or a new repo to index
                if last_rev == 0 or num_of_revs > last_rev + 1:
                    # delete the docs in the index for the previous
                    # last changeset(s)
                    for hit in results:
                        q = qp.parse(u"last:t AND %s AND raw_id:%s" %
                                     (repo_name, hit['raw_id']))
                        writer.delete_by_query(q)

                    # index from the previous last changeset + all new ones
                    indexed_total += self.index_changesets(
                        writer, repo_name, repo, start_id)
                    writer_is_dirty = True

            log.debug('indexed %s changesets for repo %s' % (
                indexed_total, repo_name)
            )
        finally:
            if writer_is_dirty:
                log.debug('>> COMMITING CHANGES TO CHANGESET INDEX<<')
                writer.commit(merge=True)
                log.debug('>>> FINISHED REBUILDING CHANGESET INDEX <<<')
            else:
                log.debug('>> NOTHING TO COMMIT TO CHANGESET INDEX<<')
                # close the unused writer, as update_file_index does
                writer.cancel()

312

314

313

def update_file_index(self):
    """
    Incrementally update the file content index.

    First pass: every stored document is read. For documents belonging
    to ``self.filtered_repo_update_paths``, files whose node is newer
    than the stored ``modtime`` are deleted and queued for re-indexing,
    and files that no longer exist are deleted.

    Second pass: every path of every non-empty repository in
    ``self.repo_paths`` is visited; queued paths and paths not yet in the
    index are added through ``add_doc``.

    The writer is committed when anything changed and cancelled otherwise.
    """
    log.debug((u'STARTING INCREMENTAL INDEXING UPDATE FOR EXTENSIONS %s '
               'AND REPOS %s') % (INDEX_EXTENSIONS, self.repo_paths.keys()))

    idx = open_dir(self.index_location, indexname=self.indexname)
    # The set of all paths in the index
    indexed_paths = set()
    # The set of all paths we need to re-index
    to_index = set()

    writer = idx.writer()
    writer_is_dirty = False
    try:
        with idx.reader() as reader:

            # Loop over the stored fields in the index
            for fields in reader.all_stored_fields():
                indexed_path = fields['path']
                indexed_repo_path = fields['repository']
                indexed_paths.add(indexed_path)

                if indexed_repo_path not in self.filtered_repo_update_paths:
                    continue

                repo = self.repo_paths[indexed_repo_path]

                try:
                    node = self.get_node(repo, indexed_path)
                    # Check if this file was changed since it was indexed
                    indexed_time = fields['modtime']
                    mtime = self.get_node_mtime(node)
                    if mtime > indexed_time:
                        # The file has changed, delete it and add it to
                        # the list of files to reindex
                        log.debug(
                            'adding to reindex list %s mtime: %s vs %s' % (
                                indexed_path, mtime, indexed_time)
                        )
                        writer.delete_by_term('fileid', indexed_path)
                        writer_is_dirty = True

                        to_index.add(indexed_path)
                except (ChangesetError, NodeDoesNotExistError):
                    # This file was deleted since it was indexed
                    log.debug('removing from index %s' % indexed_path)
                    # delete through the same ``fileid`` field as the
                    # changed-file branch above; add_doc stores the same
                    # value in ``fileid`` and ``path``
                    writer.delete_by_term('fileid', indexed_path)
                    writer_is_dirty = True

        # Loop over the files in the repositories and index the ones
        # that changed or were never indexed
        ri_cnt_total = 0  # indexed
        riwc_cnt_total = 0  # indexed with content
        for repo_name, repo in self.repo_paths.items():
            # skip indexing if there aren't any revisions
            if len(repo) < 1:
                continue
            ri_cnt = 0  # indexed
            riwc_cnt = 0  # indexed with content
            for path in self.get_paths(repo):
                path = safe_unicode(path)
                if path in to_index or path not in indexed_paths:

                    # This is either a file that's changed, or a new file
                    # that wasn't indexed before. So index it!
                    i, iwc = self.add_doc(writer, path, repo, repo_name)
                    writer_is_dirty = True
                    log.debug('re indexing %s' % path)
                    ri_cnt += i
                    ri_cnt_total += 1
                    riwc_cnt += iwc
                    riwc_cnt_total += iwc
            log.debug('added %s files %s with content for repo %s' % (
                ri_cnt + riwc_cnt, riwc_cnt, repo.path)
            )
        log.debug('indexed %s files in total and %s with content' % (
            ri_cnt_total, riwc_cnt_total)
        )
    finally:
        if writer_is_dirty:
            log.debug('>> COMMITING CHANGES TO FILE INDEX <<')
            writer.commit(merge=True)
            log.debug('>>> FINISHED REBUILDING FILE INDEX <<<')
        else:
            log.debug('>> NOTHING TO COMMIT TO FILE INDEX <<')
            writer.cancel()

399

401

400

def build_indexes(self):
    """
    Rebuild both indexes from scratch.

    Any existing index directory is removed and recreated, the changeset
    and file indexes are created in it, every non-empty repository in
    ``self.repo_paths`` is indexed, and both writers are committed.
    """
    location = self.index_location

    if os.path.exists(location):
        log.debug('removing previous index')
        rmtree(location)

    if not os.path.exists(location):
        os.mkdir(location)

    chgset_idx_writer = create_in(
        location, CHGSETS_SCHEMA, indexname=CHGSET_IDX_NAME).writer()
    file_idx_writer = create_in(
        location, SCHEMA, indexname=IDX_NAME).writer()

    log.debug('BUILDING INDEX FOR EXTENSIONS %s '
              'AND REPOS %s' % (INDEX_EXTENSIONS, self.repo_paths.keys()))

    for repo_name, repo in self.repo_paths.items():
        # only repositories with at least one revision are indexed
        if len(repo) >= 1:
            self.index_files(file_idx_writer, repo_name, repo)
            self.index_changesets(chgset_idx_writer, repo_name, repo)

    log.debug('>> COMMITING CHANGES <<')
    file_idx_writer.commit(merge=True)
    chgset_idx_writer.commit(merge=True)
    log.debug('>>> FINISHED BUILDING INDEX <<<')

429

431

430

def update_indexes(self):
    """
    Run the incremental update of the file index, then of the
    changeset index.
    """
    update_steps = (self.update_file_index, self.update_changeset_index)
    for step in update_steps:
        step()

433

435

434

def run(self, full_index=False):
    """
    Run daemon.

    :param full_index: when true the indexes are rebuilt from scratch;
        a rebuild also happens whenever ``self.initial`` is set,
        otherwise the existing indexes are updated incrementally
    """
    rebuild = full_index or self.initial
    if not rebuild:
        self.update_indexes()
        return
    self.build_indexes()

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

             # -*- coding: utf-8 -*-
             """
                 rhodecode.lib.indexers.daemon
                 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                 A daemon will read from task table and run tasks
                 :created_on: Jan 26, 2010
                 :author: marcink
                 :copyright: (C) 2010-2012 Marcin Kuzminski <marcin@python-works.com>
                 :license: GPLv3, see COPYING for more details.
             """
             # This program is free software: you can redistribute it and/or modify
             # it under the terms of the GNU General Public License as published by
             # the Free Software Foundation, either version 3 of the License, or
             # (at your option) any later version.
             #
             # This program is distributed in the hope that it will be useful,
             # but WITHOUT ANY WARRANTY; without even the implied warranty of
             # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
             # GNU General Public License for more details.
             #
             # You should have received a copy of the GNU General Public License
             # along with this program.  If not, see <http://www.gnu.org/licenses/>.
             from __future__ import with_statement
             import os
             import sys
             import logging
             import traceback
             from shutil import rmtree
             from time import mktime
             from os.path import dirname as dn
             from os.path import join as jn
             # Put the directory four levels above this file on sys.path so that the
             # ``rhodecode`` imports below can be resolved.
             project_path = dn(dn(dn(dn(os.path.realpath(__file__)))))
             sys.path.append(project_path)
             from rhodecode.config.conf import INDEX_EXTENSIONS
             from rhodecode.model.scm import ScmModel
             from rhodecode.model.db import Repository
             from rhodecode.lib.utils2 import safe_unicode, safe_str
             from rhodecode.lib.indexers import SCHEMA, IDX_NAME, CHGSETS_SCHEMA, \
                 CHGSET_IDX_NAME
             from rhodecode.lib.vcs.exceptions import ChangesetError, RepositoryError, \
                 NodeDoesNotExistError
             from whoosh.index import create_in, open_dir, exists_in
             from whoosh.query import *
             from whoosh.qparser import QueryParser
             # Module-level logger used by every indexing step in this file.
             log = logging.getLogger('whoosh_indexer')
             class WhooshIndexingDaemon(object):
                 """
                 Daemon for atomic indexing jobs
                 """
                 def __init__(self, indexname=IDX_NAME, index_location=None,
                              repo_location=None, sa=None, repo_list=None,
                              repo_update_list=None):
                     """
                     Scan the repositories and decide whether a full build is needed

                     :param indexname: name of the file content index that is opened
                         for incremental updates
                     :param index_location: directory holding the whoosh indexes; it
                         is created when it does not exist yet
                     :param repo_location: root directory scanned for repositories
                     :param sa: handed to ``ScmModel`` unchanged (presumably a
                         database session - confirm against ScmModel)
                     :param repo_list: optional repository names; when given, only
                         these repositories are kept in ``self.repo_paths``
                     :param repo_update_list: optional repository names; when given,
                         ``self.repo_paths`` is narrowed to them and they are
                         remembered in ``self.filtered_repo_update_paths``
                     :raises Exception: when ``index_location`` or ``repo_location``
                         is empty
                     """
                     self.indexname = indexname
                     self.index_location = index_location
                     if not index_location:
                         raise Exception('You have to provide index location')
                     self.repo_location = repo_location
                     if not repo_location:
                         raise Exception('You have to provide repositories location')
                     self.repo_paths = ScmModel(sa).repo_scan(self.repo_location)
                     # filter repo list
                     if repo_list:
                         # normalise to unicode so non-ascii repo names match
                         wanted = set(safe_unicode(name) for name in repo_list)
                         self.filtered_repo_paths = dict(
                             (repo_name, repo)
                             for repo_name, repo in self.repo_paths.items()
                             if repo_name in wanted
                         )
                         self.repo_paths = self.filtered_repo_paths
                     # filter update repo list
                     self.filtered_repo_update_paths = {}
                     if repo_update_list:
                         # same unicode normalisation as for repo_list, otherwise
                         # non-ascii names passed as byte strings never match
                         wanted = set(safe_unicode(name) for name in repo_update_list)
                         for repo_name, repo in self.repo_paths.items():
                             if repo_name in wanted:
                                 self.filtered_repo_update_paths[repo_name] = repo
                         self.repo_paths = self.filtered_repo_update_paths
                     # a full build is required unless the index directory and both
                     # indexes already exist
                     self.initial = True
                     if not os.path.isdir(self.index_location):
                         os.makedirs(self.index_location)
                         log.info('Cannot run incremental index since it does not '
                                  'yet exist running full build')
                     elif not exists_in(self.index_location, IDX_NAME):
                         log.info('Running full index build as the file content '
                                  'index does not exist')
                     elif not exists_in(self.index_location, CHGSET_IDX_NAME):
                         log.info('Running full index build as the changeset '
                                  'index does not exist')
                     else:
                         self.initial = False
                 def _get_index_revision(self, repo):
                     """
                     Return the revision that should be indexed for ``repo``

                     :param repo: scm repo instance; its ``name`` is used to look up
                         the database entry
                     :return: the ``landing_rev`` of the database entry, or 'tip'
                         when no entry is found
                     """
                     db_entry = Repository.get_by_repo_name(repo.name)
                     if not db_entry:
                         return 'tip'
                     return db_entry.landing_rev
                 def _get_index_changeset(self, repo):
                     index_rev = self._get_index_revision(repo)
                     cs = repo.get_changeset(index_rev)
                     return cs
                 def get_paths(self, repo):
                     """
                     Walk the index changeset of ``repo`` from its root and collect
                     the full path of every file found

                     :param repo: scm repo instance
                     :return: set of paths, each one the repository path joined with
                         the path of a file; a RepositoryError is logged and the
                         paths gathered so far are returned
                     """
                     collected = set()
                     try:
                         changeset = self._get_index_changeset(repo)
                         repo_root = safe_str(repo.path)
                         for _topnode, _dirs, files in changeset.walk('/'):
                             collected.update(
                                 jn(repo_root, safe_str(f.path)) for f in files
                             )
                     except RepositoryError:
                         log.debug(traceback.format_exc())
                     return collected
                 def get_node(self, repo, path):
                     """
                     Get the node for a full path. Paths are converted with
                     ``safe_str`` for hg/git compatibility.

                     :param repo: scm repo instance
                     :param path: full path including root location
                     :return: the node returned by the index changeset (a FileNode)
                     """
                     prefix = safe_str(repo.path) + '/'
                     # keep what follows the first occurrence of the repository root
                     _head, _sep, relative_path = safe_str(path).partition(prefix)
                     changeset = self._get_index_changeset(repo)
                     return changeset.get_node(relative_path)
                 def get_node_mtime(self, node):
                     """
                     Return the date of the node's last changeset as a timestamp

                     :param node: object exposing ``last_changeset.date``
                     :return: result of ``mktime`` for that date
                     """
                     last_change = node.last_changeset.date
                     return mktime(last_change.timetuple())
                 def add_doc(self, writer, path, repo, repo_name):
                     """
                     Add one file to the index writer; the data is fetched from the
                     vcs backend

                     Content is indexed only for non-binary files whose extension is
                     listed in INDEX_EXTENSIONS and whose content is unicode. Every
                     other file is indexed with empty content.

                     :param writer: whoosh index writer receiving the document
                     :param path: full path of the file including root location
                     :param repo: scm repo instance
                     :param repo_name: name of the repository, stored on the document
                     :return: tuple ``(indexed, indexed_w_content)``; exactly one of
                         the two is 1, depending on whether content was indexed
                     """
                     node = self.get_node(repo, path)
                     indexed = indexed_w_content = 0
                     # we just index the content of chosen files, and skip binary files
                     if node.extension in INDEX_EXTENSIONS and not node.is_binary:
                         u_content = node.content
                         if not isinstance(u_content, unicode):
                             log.warning('  >> %s Could not get this content as unicode '
                                         'replacing with empty content' % path)
                             u_content = u''
                             # the file is still added below, by name only, so it
                             # has to be counted like any other content-less file
                             indexed += 1
                         else:
                             log.debug('    >> %s [WITH CONTENT]' % path)
                             indexed_w_content += 1
                     else:
                         log.debug('    >> %s' % path)
                         # just index the file name without its content
                         u_content = u''
                         indexed += 1
                     p = safe_unicode(path)
                     writer.add_document(
                         fileid=p,
                         owner=unicode(repo.contact),
                         repository=safe_unicode(repo_name),
                         path=p,
                         content=u_content,
                         modtime=self.get_node_mtime(node),
                         extension=node.extension
                     )
                     return indexed, indexed_w_content
                 def index_changesets(self, writer, repo_name, repo, start_rev=None):
                     """
                     Add all changesets in the vcs repo starting at start_rev
                     to the index writer

                     :param writer: the whoosh index writer to add to
                     :param repo_name: name of the repository from whence the
                       changeset originates including the repository group
                     :param repo: the vcs repository instance to index changesets for,
                       the presumption is the repo has changesets to index
                     :param start_rev: the full sha id to start indexing from;
                       if start_rev is None then index from the first changeset in
                       the repo
                     :return: number of changesets added to the writer
                     """
                     if start_rev is None:
                         start_rev = repo[0].raw_id
                     log.debug('indexing changesets in %s starting at rev: %s' %
                               (repo_name, start_rev))
                     indexed = 0
                     cs_iter = repo.get_changesets(start=start_rev)
                     # relies on the object returned by get_changesets supporting len()
                     total = len(cs_iter)
                     for cs in cs_iter:
                         log.debug('    >> %s/%s' % (cs, total))
                         writer.add_document(
                             raw_id=unicode(cs.raw_id),
                             owner=unicode(repo.contact),
                             date=cs._timestamp,
                             repository=safe_unicode(repo_name),
                             author=cs.author,
                             message=cs.message,
                             last=cs.last,
                             added=u' '.join([safe_unicode(node.path) for node in cs.added]).lower(),
                             removed=u' '.join([safe_unicode(node.path) for node in cs.removed]).lower(),
                             changed=u' '.join([safe_unicode(node.path) for node in cs.changed]).lower(),
                             # a distinct name keeps the comprehension from
                             # rebinding the loop variable ``cs``
                             parents=u' '.join([parent.raw_id for parent in cs.parents]),
                         )
                         indexed += 1
                     log.debug('indexed %d changesets for repo %s' % (indexed, repo_name))
                     return indexed
                 def index_files(self, file_idx_writer, repo_name, repo):
                     """
                     Add every file of the repository's index revision to the writer

                     :param file_idx_writer: the whoosh index writer to add to
                     :param repo_name: name of the repository we're indexing
                     :param repo: instance of vcs repo
                     :return: tuple of (files indexed without content,
                         files indexed with content)
                     """
                     log.debug('building index for %s @revision:%s' % (repo.path,
                                                             self._get_index_revision(repo)))
                     name_only_total = 0
                     with_content_total = 0
                     for idx_path in self.get_paths(repo):
                         name_only, with_content = self.add_doc(
                             file_idx_writer, idx_path, repo, repo_name)
                         name_only_total += name_only
                         with_content_total += with_content
                     log.debug('added %s files %s with content for repo %s' %
                               (name_only_total + with_content_total,
                                with_content_total, repo.path))
                     return name_only_total, with_content_total
                 def update_changeset_index(self):
                     """
                     Incrementally update the changeset index

                     For every repository with at least one revision the index is
                     searched for the document flagged ``last``. When the repository
                     holds newer revisions, or nothing was indexed yet, the previous
                     ``last`` document(s) are deleted and changesets are indexed
                     again starting from that changeset. The writer is committed
                     when something changed and cancelled otherwise.
                     """
                     idx = open_dir(self.index_location, indexname=CHGSET_IDX_NAME)
                     with idx.searcher() as searcher:
                         writer = idx.writer()
                         writer_is_dirty = False
                         try:
                             indexed_total = 0
                             repo_name = None
                             for repo_name, repo in self.repo_paths.items():
                                 # skip indexing if there aren't any revs in the repo
                                 num_of_revs = len(repo)
                                 if num_of_revs < 1:
                                     continue
                                 qp = QueryParser('repository', schema=CHGSETS_SCHEMA)
                                 q = qp.parse(u"last:t AND %s" % repo_name)
                                 results = searcher.search(q)
                                 # default to scanning the entire repo
                                 last_rev = 0
                                 start_id = None
                                 if len(results) > 0:
                                     # assuming that there is only one result, if not this
                                     # may require a full re-index.
                                     start_id = results[0]['raw_id']
                                     last_rev = repo.get_changeset(revision=start_id).revision
                                 # there are new changesets to index or a new repo to index
                                 if last_rev == 0 or num_of_revs > last_rev + 1:
                                     # delete the docs in the index for the previous
                                     # last changeset(s)
                                     for hit in results:
                                         q = qp.parse(u"last:t AND %s AND raw_id:%s" %
                                                         (repo_name, hit['raw_id']))
                                         writer.delete_by_query(q)
                                     # index from the previous last changeset + all new ones
                                     indexed_total += self.index_changesets(writer,
                                                             repo_name, repo, start_id)
                                     writer_is_dirty = True
                             # NOTE: the count is the total over all repositories, while
                             # repo_name is the last repository iterated (or None)
                             log.debug('indexed %s changesets for repo %s' % (
                                       indexed_total, repo_name)
                             )
                         finally:
                             if writer_is_dirty:
                                 log.debug('>> COMMITING CHANGES TO CHANGESET INDEX<<')
                                 writer.commit(merge=True)
                                 log.debug('>>> FINISHED REBUILDING CHANGESET INDEX <<<')
                             else:
                                 log.debug('>> NOTHING TO COMMIT TO CHANGESET INDEX<<')
                                 # a writer must be committed or cancelled to unlock
                                 # the index; update_file_index does the same
                                 writer.cancel()
                 def update_file_index(self):
                     """
                     Incrementally update the file content index

                     First pass: walk the stored fields of every indexed document.
                     For documents whose repository is in
                     ``self.filtered_repo_update_paths``, a file that changed since
                     it was indexed is deleted and queued for re-indexing, and a
                     file that no longer exists is deleted from the index.

                     Second pass: walk the current paths of every repository in
                     ``self.repo_paths`` and add each path that is queued for
                     re-indexing or was not in the index at all.

                     The writer is committed when anything changed and cancelled
                     otherwise.
                     """
                     log.debug((u'STARTING INCREMENTAL INDEXING UPDATE FOR EXTENSIONS %s '
                                'AND REPOS %s') % (INDEX_EXTENSIONS, self.repo_paths.keys()))
                     idx = open_dir(self.index_location, indexname=self.indexname)
                     # The set of all paths in the index
                     indexed_paths = set()
                     # The set of all paths we need to re-index
                     to_index = set()
                     writer = idx.writer()
                     writer_is_dirty = False
                     try:
                         with idx.reader() as reader:
                             # Loop over the stored fields in the index
                             for fields in reader.all_stored_fields():
                                 indexed_path = fields['path']
                                 indexed_repo_path = fields['repository']
                                 indexed_paths.add(indexed_path)
                                 # only documents of repositories selected for update
                                 # are checked for changes or removal
                                 if not indexed_repo_path in self.filtered_repo_update_paths:
                                     continue
                                 repo = self.repo_paths[indexed_repo_path]
                                 try:
                                     node = self.get_node(repo, indexed_path)
                                     # Check if this file was changed since it was indexed
                                     indexed_time = fields['modtime']
                                     mtime = self.get_node_mtime(node)
                                     if mtime > indexed_time:
                                         # The file has changed, delete it and add it to
                                         # the list of files to reindex
                                         log.debug(
                                             'adding to reindex list %s mtime: %s vs %s' % (
                                                 indexed_path, mtime, indexed_time)
                                         )
                                         writer.delete_by_term('fileid', indexed_path)
                                         writer_is_dirty = True
                                         to_index.add(indexed_path)
                                 except (ChangesetError, NodeDoesNotExistError):
                                     # This file was deleted since it was indexed
                                     log.debug('removing from index %s' % indexed_path)
                                     writer.delete_by_term('path', indexed_path)
                                     writer_is_dirty = True
                         # Loop over the files currently present in each repository
                         # (as reported by get_paths) and index the changed and new ones
                         ri_cnt_total = 0  # indexed
                         riwc_cnt_total = 0  # indexed with content
                         for repo_name, repo in self.repo_paths.items():
                             # skip indexing if there aren't any revisions
                             if len(repo) < 1:
                                 continue
                             ri_cnt = 0   # indexed
                             riwc_cnt = 0  # indexed with content
                             for path in self.get_paths(repo):
                                 path = safe_unicode(path)
                                 if path in to_index or path not in indexed_paths:
                                     # This is either a file that's changed, or a new file
                                     # that wasn't indexed before. So index it!
                                     i, iwc = self.add_doc(writer, path, repo, repo_name)
                                     writer_is_dirty = True
                                     log.debug('re indexing %s' % path)
                                     ri_cnt += i
                                     # NOTE: the total counts every re-indexed file,
                                     # whereas ri_cnt only adds the name-only count
                                     ri_cnt_total += 1
                                     riwc_cnt += iwc
                                     riwc_cnt_total += iwc
                             log.debug('added %s files %s with content for repo %s' % (
                                          ri_cnt + riwc_cnt, riwc_cnt, repo.path)
                             )
                         log.debug('indexed %s files in total and %s with content' % (
                                     ri_cnt_total, riwc_cnt_total)
                         )
                     finally:
                         # runs on errors too: changes made so far are committed,
                         # an untouched writer is cancelled
                         if writer_is_dirty:
                             log.debug('>> COMMITING CHANGES TO FILE INDEX <<')
                             writer.commit(merge=True)
                             log.debug('>>> FINISHED REBUILDING FILE INDEX <<<')
                         else:
                             log.debug('>> NOTHING TO COMMIT TO FILE INDEX <<')
                             writer.cancel()
                 def build_indexes(self):
                     """
                     Rebuild the file and changeset indexes from scratch

                     Any existing index directory is removed and recreated, both
                     indexes are created in it, every repository with at least one
                     revision is indexed, and both writers are committed.
                     """
                     index_dir = self.index_location
                     if os.path.exists(index_dir):
                         log.debug('removing previous index')
                         rmtree(index_dir)
                     if not os.path.exists(index_dir):
                         os.mkdir(index_dir)
                     changeset_index = create_in(index_dir, CHGSETS_SCHEMA,
                                                 indexname=CHGSET_IDX_NAME)
                     changeset_writer = changeset_index.writer()
                     file_index = create_in(index_dir, SCHEMA, indexname=IDX_NAME)
                     file_writer = file_index.writer()
                     log.debug('BUILDING INDEX FOR EXTENSIONS %s '
                               'AND REPOS %s' % (INDEX_EXTENSIONS, self.repo_paths.keys()))
                     for repo_name, repo in self.repo_paths.items():
                         # only repositories that have revisions are indexed
                         if len(repo) >= 1:
                             self.index_files(file_writer, repo_name, repo)
                             self.index_changesets(changeset_writer, repo_name, repo)
                     log.debug('>> COMMITING CHANGES <<')
                     file_writer.commit(merge=True)
                     changeset_writer.commit(merge=True)
                     log.debug('>>> FINISHED BUILDING INDEX <<<')
                 def update_indexes(self):
                     """
                     Run the incremental update of the file index, then of the
                     changeset index
                     """
                     for refresh in (self.update_file_index,
                                     self.update_changeset_index):
                         refresh()
                 def run(self, full_index=False):
                     """
                     Run daemon

                     :param full_index: when true, rebuild the indexes from scratch;
                         a rebuild also happens whenever ``self.initial`` is set,
                         otherwise the indexes are updated incrementally
                     """
                     needs_full_build = full_index or self.initial
                     if not needs_full_build:
                         self.update_indexes()
                         return
                     self.build_indexes()