upstream/kallithea Commit - r2839:c0ddc86b

1

# -*- coding: utf-8 -*-

1

# -*- coding: utf-8 -*-

2

"""

2

"""

3

rhodecode.lib.indexers.daemon

3

rhodecode.lib.indexers.daemon

4

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

4

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

5

6

A daemon will read from task table and run tasks

6

A daemon will read from task table and run tasks

7

8

:created_on: Jan 26, 2010

8

:created_on: Jan 26, 2010

9

:author: marcink

9

:author: marcink

10

11

:license: GPLv3, see COPYING for more details.

11

:license: GPLv3, see COPYING for more details.

12

"""

12

"""

13

# This program is free software: you can redistribute it and/or modify

13

# This program is free software: you can redistribute it and/or modify

14

# it under the terms of the GNU General Public License as published by

14

# it under the terms of the GNU General Public License as published by

15

# the Free Software Foundation, either version 3 of the License, or

15

# the Free Software Foundation, either version 3 of the License, or

16

# (at your option) any later version.

16

# (at your option) any later version.

17

#

17

#

18

# This program is distributed in the hope that it will be useful,

18

# This program is distributed in the hope that it will be useful,

19

# but WITHOUT ANY WARRANTY; without even the implied warranty of

19

# but WITHOUT ANY WARRANTY; without even the implied warranty of

20

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

20

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

21

# GNU General Public License for more details.

21

# GNU General Public License for more details.

22

#

22

#

23

# You should have received a copy of the GNU General Public License

23

# You should have received a copy of the GNU General Public License

24

# along with this program. If not, see <http://www.gnu.org/licenses/>.

24

# along with this program. If not, see <http://www.gnu.org/licenses/>.

25

from __future__ import with_statement

25

from __future__ import with_statement

26

27

import os

27

import os

28

import sys

28

import sys

29

import logging

29

import logging

30

import traceback

30

import traceback

31

32

from shutil import rmtree

32

from shutil import rmtree

33

from time import mktime

33

from time import mktime

34

35

from os.path import dirname as dn

35

from os.path import dirname as dn

36

from os.path import join as jn

36

from os.path import join as jn

37

38

#to get the rhodecode import

38

#to get the rhodecode import

39

project_path = dn(dn(dn(dn(os.path.realpath(__file__)))))

39

project_path = dn(dn(dn(dn(os.path.realpath(__file__)))))

40

sys.path.append(project_path)

40

sys.path.append(project_path)

41

42

from rhodecode.config.conf import INDEX_EXTENSIONS

42

from rhodecode.config.conf import INDEX_EXTENSIONS

43

from rhodecode.model.scm import ScmModel

43

from rhodecode.model.scm import ScmModel

44

from rhodecode.lib.utils2 import safe_unicode

44

from rhodecode.lib.utils2 import safe_unicode

45

from rhodecode.lib.indexers import SCHEMA, IDX_NAME, CHGSETS_SCHEMA, \

45

from rhodecode.lib.indexers import SCHEMA, IDX_NAME, CHGSETS_SCHEMA, \

46

CHGSET_IDX_NAME

46

CHGSET_IDX_NAME

47

48

from rhodecode.lib.vcs.exceptions import ChangesetError, RepositoryError, \

48

from rhodecode.lib.vcs.exceptions import ChangesetError, RepositoryError, \

49

NodeDoesNotExistError

49

NodeDoesNotExistError

50

51

from whoosh.index import create_in, open_dir, exists_in

51

from whoosh.index import create_in, open_dir, exists_in

52

from whoosh.query import *

52

from whoosh.query import *

53

from whoosh.qparser import QueryParser

53

from whoosh.qparser import QueryParser

54

55

log = logging.getLogger('whoosh_indexer')

55

log = logging.getLogger('whoosh_indexer')

56

57

58

class WhooshIndexingDaemon(object):

58

class WhooshIndexingDaemon(object):

59

"""

59

"""

60

Daemon for atomic indexing jobs

60

Daemon for atomic indexing jobs

61

"""

61

"""

62

63

def __init__(self, indexname=IDX_NAME, index_location=None,
             repo_location=None, sa=None, repo_list=None,
             repo_update_list=None):
    """
    Set up the daemon and decide whether a full index build is needed.

    :param indexname: name of the file content index, stored as
        ``self.indexname``
    :param index_location: directory that holds the indexes (required);
        it is created when it does not exist yet
    :param repo_location: location handed to ``ScmModel.repo_scan`` to
        discover repositories (required)
    :param sa: passed straight through to ``ScmModel``
    :param repo_list: optional collection of repository names; when
        given, only repositories named in it are kept
    :param repo_update_list: optional collection of repository names;
        when given, the remaining repositories are narrowed to it
    :raises Exception: when ``index_location`` or ``repo_location`` is
        empty
    """
    self.indexname = indexname

    self.index_location = index_location
    if not index_location:
        raise Exception('You have to provide index location')

    self.repo_location = repo_location
    if not repo_location:
        raise Exception('You have to provide repositories location')

    self.repo_paths = ScmModel(sa).repo_scan(self.repo_location)

    # filter repo list
    if repo_list:
        self.filtered_repo_paths = dict(
            (repo_name, repo)
            for repo_name, repo in self.repo_paths.items()
            if repo_name in repo_list
        )
        self.repo_paths = self.filtered_repo_paths

    # filter update repo list; the attribute always exists (empty when no
    # update list was given) because update_file_index reads it
    self.filtered_repo_update_paths = {}
    if repo_update_list:
        self.filtered_repo_update_paths = dict(
            (repo_name, repo)
            for repo_name, repo in self.repo_paths.items()
            if repo_name in repo_update_list
        )
        self.repo_paths = self.filtered_repo_update_paths

    # a full build is required unless the index directory and both the
    # file content index and the changeset index already exist
    self.initial = True
    if not os.path.isdir(self.index_location):
        os.makedirs(self.index_location)
        log.info('Cannot run incremental index since it does not'
                 ' yet exist running full build')
    elif not exists_in(self.index_location, IDX_NAME):
        log.info('Running full index build as the file content'
                 ' index does not exist')
    elif not exists_in(self.index_location, CHGSET_IDX_NAME):
        log.info('Running full index build as the changeset'
                 ' index does not exist')
    else:
        self.initial = False

109

110

def get_paths(self, repo):
    """
    Collect the paths of all files found by walking the ``tip``
    changeset of ``repo`` from ``/``.

    :param repo: vcs repository instance
    :returns: set of file paths, each joined onto ``repo.path``. If a
        ``RepositoryError`` is raised, the traceback is logged at debug
        level and whatever was collected so far (possibly nothing) is
        returned.
    """
    index_paths_ = set()
    try:
        tip = repo.get_changeset('tip')
        for _topnode, _dirs, files in tip.walk('/'):
            index_paths_.update(jn(repo.path, f.path) for f in files)
    except RepositoryError:
        # best effort: a failing repository yields the paths gathered so far
        log.debug(traceback.format_exc())
    return index_paths_

126

127

def get_node(self, repo, path):
    """
    Look up the node for ``path`` in the changeset returned by
    ``repo.get_changeset()`` called without arguments.

    :param repo: vcs repository instance
    :param path: path that begins with ``repo.path`` followed by one
        separator character
    :returns: whatever the changeset's ``get_node`` returns for the
        repository-relative path
    """
    # drop "<repo.path>" plus the single separator character after it
    n_path = path[len(repo.path) + 1:]
    return repo.get_changeset().get_node(n_path)

131

132

def get_node_mtime(self, node):
    """
    Return the date of the node's last changeset as a ``time.mktime``
    timestamp.

    :param node: object exposing ``last_changeset.date`` with a
        ``timetuple()`` method
    :returns: float produced by ``mktime`` for that time tuple
    """
    last_change = node.last_changeset.date
    return mktime(last_change.timetuple())

134

135

def add_doc(self, writer, path, repo, repo_name):
    """
    Add one file document to ``writer``; the file data itself is fetched
    from the vcs backend instance.

    :param writer: index writer; ``add_document`` is called on it once
    :param path: path of the file, resolved through ``self.get_node``
    :param repo: vcs repository instance
    :param repo_name: name of the repository the file belongs to
    :returns: tuple ``(indexed, indexed_w_content)``. ``indexed`` is 1
        when only the file name was indexed because the extension is not
        in ``INDEX_EXTENSIONS`` or the node is binary.
        ``indexed_w_content`` is 1 when the content was indexed as well.
        Both are 0 when the content was wanted but was not unicode; the
        document is still added, with empty content.
    """
    node = self.get_node(repo, path)
    indexed = indexed_w_content = 0

    # we just index the content of chosen files, and skip binary files
    if node.extension in INDEX_EXTENSIONS and not node.is_binary:
        u_content = node.content
        if not isinstance(u_content, unicode):
            log.warning(' >> %s Could not get this content as unicode '
                        'replacing with empty content', path)
            u_content = u''
        else:
            log.debug(' >> %s [WITH CONTENT]', path)
            indexed_w_content += 1
    else:
        log.debug(' >> %s', path)
        # just index the file name without its content
        u_content = u''
        indexed += 1

    p = safe_unicode(path)
    writer.add_document(
        fileid=p,
        owner=unicode(repo.contact),
        repository=safe_unicode(repo_name),
        path=p,
        content=u_content,
        modtime=self.get_node_mtime(node),
        extension=node.extension
    )
    return indexed, indexed_w_content

171

172

def index_changesets(self, writer, repo_name, repo, start_rev=None):
    """
    Add all changesets in the vcs repo starting at start_rev
    to the index writer

    :param writer: the whoosh index writer to add to
    :param repo_name: name of the repository from whence the
        changeset originates including the repository group
    :param repo: the vcs repository instance to index changesets for,
        the presumption is the repo has changesets to index
    :param start_rev: the full sha id to start indexing from;
        if start_rev is None then index from the first changeset in
        the repo
    :returns: number of changesets added to the writer
    """
    if start_rev is None:
        start_rev = repo[0].raw_id

    log.debug('indexing changesets in %s starting at rev: %s',
              repo_name, start_rev)

    indexed = 0
    for cs in repo.get_changesets(start=start_rev):
        log.debug(' >> %s', cs)
        writer.add_document(
            raw_id=unicode(cs.raw_id),
            owner=unicode(repo.contact),
            date=cs._timestamp,
            repository=safe_unicode(repo_name),
            author=cs.author,
            message=cs.message,
            last=cs.last,
            added=u' '.join(
                [safe_unicode(node.path) for node in cs.added]).lower(),
            removed=u' '.join(
                [safe_unicode(node.path) for node in cs.removed]).lower(),
            changed=u' '.join(
                [safe_unicode(node.path) for node in cs.changed]).lower(),
            # a distinct loop name: under Python 2 a list comprehension
            # variable leaks and would rebind ``cs``
            parents=u' '.join([parent.raw_id for parent in cs.parents]),
        )
        indexed += 1

    log.debug('indexed %d changesets for repo %s', indexed, repo_name)
    return indexed

213

214

def index_files(self, file_idx_writer, repo_name, repo):
    """
    Index files for given repo_name

    Every path returned by ``self.get_paths(repo)`` is handed to
    ``self.add_doc``.

    :param file_idx_writer: the whoosh index writer to add to
    :param repo_name: name of the repository we're indexing
    :param repo: instance of vcs repo
    :returns: tuple ``(i_cnt, iwc_cnt)``, the sums of the two counters
        returned by ``add_doc`` over all paths
    """
    i_cnt = iwc_cnt = 0
    log.debug('building index for [%s]', repo.path)
    for idx_path in self.get_paths(repo):
        i, iwc = self.add_doc(file_idx_writer, idx_path, repo, repo_name)
        i_cnt += i
        iwc_cnt += iwc

    log.debug('added %s files %s with content for repo %s',
              i_cnt + iwc_cnt, iwc_cnt, repo.path)
    return i_cnt, iwc_cnt

232

233

def update_changeset_index(self):
    """
    Incrementally update the changeset index for every repository in
    ``self.repo_paths``.

    For each repository with at least one revision, the index is
    searched for documents matching ``last:t`` and the repository name.
    The first hit's ``raw_id`` becomes the starting point. When the
    repository holds more revisions than that, or nothing was found, the
    matching documents are deleted and ``index_changesets`` is run from
    the starting point.

    The writer is committed when anything was changed and cancelled
    otherwise; this happens in a ``finally`` clause, so it also runs
    when an exception propagates.
    """
    idx = open_dir(self.index_location, indexname=CHGSET_IDX_NAME)

    with idx.searcher() as searcher:
        writer = idx.writer()
        writer_is_dirty = False
        try:
            indexed_total = 0
            # pre-bound so the summary log after the loop cannot hit an
            # unbound name when there are no repositories
            repo_name = None
            # the parser does not depend on the repository: build it once
            qp = QueryParser('repository', schema=CHGSETS_SCHEMA)

            for repo_name, repo in self.repo_paths.items():
                # skip indexing if there aren't any revs in the repo
                num_of_revs = len(repo)
                if num_of_revs < 1:
                    continue

                q = qp.parse(u"last:t AND %s" % repo_name)
                results = searcher.search(q)

                # default to scanning the entire repo
                last_rev = 0
                start_id = None

                if len(results) > 0:
                    # assuming that there is only one result, if not this
                    # may require a full re-index.
                    start_id = results[0]['raw_id']
                    last_rev = repo.get_changeset(
                        revision=start_id).revision

                # there are new changesets to index or a new repo to index
                if last_rev == 0 or num_of_revs > last_rev + 1:
                    # delete the docs in the index for the previous
                    # last changeset(s)
                    for hit in results:
                        q = qp.parse(u"last:t AND %s AND raw_id:%s" %
                                     (repo_name, hit['raw_id']))
                        writer.delete_by_query(q)

                    # index from the previous last changeset + all new ones
                    indexed_total += self.index_changesets(
                        writer, repo_name, repo, start_id)
                    writer_is_dirty = True

            log.debug('indexed %s changesets for repo %s' % (
                indexed_total, repo_name)
            )
        finally:
            if writer_is_dirty:
                log.debug('>> COMMITING CHANGES TO CHANGESET INDEX<<')
                writer.commit(merge=True)
                log.debug('>> COMMITTED CHANGES TO CHANGESET INDEX<<')
            else:
                # must be *called*: a bare ``writer.cancel`` is a no-op
                # attribute lookup and never cancels the writer
                writer.cancel()
                log.debug('>> NOTHING TO COMMIT<<')

286

287

def update_file_index(self):
    """
    Incrementally update the file content index.

    First pass, over the documents stored in the index: every stored
    path is remembered. For documents whose repository is in
    ``self.filtered_repo_update_paths``, the document is deleted and
    queued for re-indexing when the node's mtime is newer than the
    stored ``modtime``; it is deleted outright when the node can no
    longer be fetched (``ChangesetError`` / ``NodeDoesNotExistError``).

    Second pass, over the files of every repository in
    ``self.repo_paths``: every path that was queued, or that is not in
    the index yet, is added through ``self.add_doc``.

    The writer is committed when anything was changed and cancelled
    otherwise; this happens in a ``finally`` clause, so it also runs
    when an exception propagates.
    """
    log.debug((u'STARTING INCREMENTAL INDEXING UPDATE FOR EXTENSIONS %s '
               'AND REPOS %s') % (INDEX_EXTENSIONS, self.repo_paths.keys()))

    idx = open_dir(self.index_location, indexname=self.indexname)
    # The set of all paths in the index
    indexed_paths = set()
    # The set of all paths we need to re-index
    to_index = set()

    writer = idx.writer()
    writer_is_dirty = False
    try:
        with idx.reader() as reader:

            # Loop over the stored fields in the index
            for fields in reader.all_stored_fields():
                indexed_path = fields['path']
                indexed_repo_path = fields['repository']
                indexed_paths.add(indexed_path)

                # only repositories on the update list are checked for
                # changed or deleted files
                if indexed_repo_path not in self.filtered_repo_update_paths:
                    continue

                repo = self.repo_paths[indexed_repo_path]

                try:
                    node = self.get_node(repo, indexed_path)
                    # Check if this file was changed since it was indexed
                    indexed_time = fields['modtime']
                    mtime = self.get_node_mtime(node)
                    if mtime > indexed_time:
                        # The file has changed, delete it and add it to
                        # the list of files to reindex
                        log.debug(
                            'adding to reindex list %s mtime: %s vs %s' % (
                                indexed_path, mtime, indexed_time)
                        )
                        writer.delete_by_term('fileid', indexed_path)
                        writer_is_dirty = True

                        to_index.add(indexed_path)
                except (ChangesetError, NodeDoesNotExistError):
                    # This file was deleted since it was indexed
                    log.debug('removing from index %s' % indexed_path)
                    writer.delete_by_term('path', indexed_path)
                    writer_is_dirty = True

        # Loop over the files of every repository and (re)index the ones
        # that changed or that were never indexed
        ri_cnt_total = 0  # indexed
        riwc_cnt_total = 0  # indexed with content
        for repo_name, repo in self.repo_paths.items():
            # skip indexing if there aren't any revisions
            if len(repo) < 1:
                continue
            ri_cnt = 0  # indexed
            riwc_cnt = 0  # indexed with content
            for path in self.get_paths(repo):
                path = safe_unicode(path)
                if path in to_index or path not in indexed_paths:

                    # This is either a file that's changed, or a new file
                    # that wasn't indexed before. So index it!
                    i, iwc = self.add_doc(writer, path, repo, repo_name)
                    writer_is_dirty = True
                    log.debug('re indexing %s' % path)
                    ri_cnt += i
                    ri_cnt_total += 1
                    riwc_cnt += iwc
                    riwc_cnt_total += iwc
            log.debug('added %s files %s with content for repo %s' % (
                ri_cnt + riwc_cnt, riwc_cnt, repo.path)
            )
        log.debug('indexed %s files in total and %s with content' % (
            ri_cnt_total, riwc_cnt_total)
        )
    finally:
        if writer_is_dirty:
            log.debug('>> COMMITING CHANGES <<')
            writer.commit(merge=True)
            log.debug('>>> FINISHED REBUILDING INDEX <<<')
        else:
            log.debug('>> NOTHING TO COMMIT<<')
            writer.cancel()

373

374

def build_indexes(self):
    """
    Build the file content index and the changeset index from scratch.

    Any existing index directory is removed and recreated. Both indexes
    are then created, and every repository in ``self.repo_paths`` that
    has at least one revision is indexed through ``index_files`` and
    ``index_changesets``. Both writers are committed at the end.
    """
    if os.path.exists(self.index_location):
        log.debug('removing previous index')
        rmtree(self.index_location)

    if not os.path.exists(self.index_location):
        os.mkdir(self.index_location)

    chgset_idx = create_in(self.index_location, CHGSETS_SCHEMA,
                           indexname=CHGSET_IDX_NAME)
    chgset_idx_writer = chgset_idx.writer()

    file_idx = create_in(self.index_location, SCHEMA, indexname=IDX_NAME)
    file_idx_writer = file_idx.writer()
    log.debug('BUILDING INDEX FOR EXTENSIONS %s '
              'AND REPOS %s' % (INDEX_EXTENSIONS, self.repo_paths.keys()))

    for repo_name, repo in self.repo_paths.items():
        # skip indexing if there aren't any revisions
        if len(repo) < 1:
            continue

        self.index_files(file_idx_writer, repo_name, repo)
        self.index_changesets(chgset_idx_writer, repo_name, repo)

    log.debug('>> COMMITING CHANGES <<')
    file_idx_writer.commit(merge=True)
    chgset_idx_writer.commit(merge=True)
    log.debug('>>> FINISHED BUILDING INDEX <<<')

403

404

def update_indexes(self):
    """
    Run the incremental update of the file content index, then of the
    changeset index.
    """
    self.update_file_index()
    self.update_changeset_index()

407

408

def run(self, full_index=False):
    """
    Run daemon

    :param full_index: when true, rebuild the indexes from scratch; a
        full build is also done when ``self.initial`` is true, otherwise
        the indexes are updated incrementally
    """
    if full_index or self.initial:
        self.build_indexes()
    else:
        self.update_indexes()

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

             # -*- coding: utf-8 -*-
             """
                 rhodecode.lib.indexers.daemon
                 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                 A daemon will read from task table and run tasks
                 :created_on: Jan 26, 2010
                 :author: marcink
                 :copyright: (C) 2010-2012 Marcin Kuzminski <marcin@python-works.com>
                 :license: GPLv3, see COPYING for more details.
             """
             # This program is free software: you can redistribute it and/or modify
             # it under the terms of the GNU General Public License as published by
             # the Free Software Foundation, either version 3 of the License, or
             # (at your option) any later version.
             #
             # This program is distributed in the hope that it will be useful,
             # but WITHOUT ANY WARRANTY; without even the implied warranty of
             # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
             # GNU General Public License for more details.
             #
             # You should have received a copy of the GNU General Public License
             # along with this program.  If not, see <http://www.gnu.org/licenses/>.
             from __future__ import with_statement
             import os
             import sys
             import logging
             import traceback
             from shutil import rmtree
             from time import mktime
             from os.path import dirname as dn
             from os.path import join as jn
             #to get the rhodecode import
             project_path = dn(dn(dn(dn(os.path.realpath(__file__)))))
             sys.path.append(project_path)
             from rhodecode.config.conf import INDEX_EXTENSIONS
             from rhodecode.model.scm import ScmModel
             from rhodecode.lib.utils2 import safe_unicode
             from rhodecode.lib.indexers import SCHEMA, IDX_NAME, CHGSETS_SCHEMA, \
                 CHGSET_IDX_NAME
             from rhodecode.lib.vcs.exceptions import ChangesetError, RepositoryError, \
                 NodeDoesNotExistError
             from whoosh.index import create_in, open_dir, exists_in
             from whoosh.query import *
             from whoosh.qparser import QueryParser
             log = logging.getLogger('whoosh_indexer')
             class WhooshIndexingDaemon(object):
                 """
                 Daemon for atomic indexing jobs
                 """
                 def __init__(self, indexname=IDX_NAME, index_location=None,
                              repo_location=None, sa=None, repo_list=None,
                              repo_update_list=None):
                     """
                     Set up the daemon and decide whether a full build is required

                     :param indexname: name of the file content index
                     :param index_location: directory holding the whoosh indexes,
                       it is created when it does not exist yet
                     :param repo_location: root directory scanned for repositories
                     :param sa: handed over unchanged to ``ScmModel``
                     :param repo_list: when given, only repositories whose names are
                       in this collection are kept
                     :param repo_update_list: when given, only repositories whose
                       names are in this collection are kept; they are also stored
                       in ``filtered_repo_update_paths``
                     :raises Exception: when ``index_location`` or ``repo_location``
                       is empty
                     """
                     self.indexname = indexname
                     self.index_location = index_location
                     if not index_location:
                         raise Exception('You have to provide index location')

                     self.repo_location = repo_location
                     if not repo_location:
                         raise Exception('You have to provide repositories location')

                     self.repo_paths = ScmModel(sa).repo_scan(self.repo_location)

                     # keep only the explicitly requested repositories
                     if repo_list:
                         self.filtered_repo_paths = dict(
                             (name, scanned_repo)
                             for name, scanned_repo in self.repo_paths.items()
                             if name in repo_list
                         )
                         self.repo_paths = self.filtered_repo_paths

                     # keep only the repositories selected for an update run
                     self.filtered_repo_update_paths = {}
                     if repo_update_list:
                         self.filtered_repo_update_paths = dict(
                             (name, scanned_repo)
                             for name, scanned_repo in self.repo_paths.items()
                             if name in repo_update_list
                         )
                         self.repo_paths = self.filtered_repo_update_paths

                     # a full build is needed unless the directory and both indexes
                     # are already in place
                     needs_full_build = True
                     if not os.path.isdir(self.index_location):
                         os.makedirs(self.index_location)
                         log.info('Cannot run incremental index since it does not'
                                  ' yet exist running full build')
                     elif not exists_in(self.index_location, IDX_NAME):
                         log.info('Running full index build as the file content'
                                  ' index does not exist')
                     elif not exists_in(self.index_location, CHGSET_IDX_NAME):
                         log.info('Running full index build as the changeset'
                                  ' index does not exist')
                     else:
                         needs_full_build = False
                     self.initial = needs_full_build
                 def get_paths(self, repo):
                     """
                     Collect the paths of all files found by walking the ``tip``
                     changeset of the given repository from its root

                     Every returned path is the repository path joined with the
                     path of the file. A ``RepositoryError`` raised on the way is
                     logged at debug level and whatever was collected up to that
                     point is returned.

                     :param repo: instance of vcs repo
                     """
                     collected = set()
                     try:
                         walker = repo.get_changeset('tip').walk('/')
                         for _top, _subdirs, file_nodes in walker:
                             collected.update(
                                 jn(repo.path, file_node.path)
                                 for file_node in file_nodes
                             )
                     except RepositoryError:
                         log.debug(traceback.format_exc())
                     return collected
                 def get_node(self, repo, path):
                     """
                     Return the node for ``path`` from the default changeset of repo

                     :param repo: instance of vcs repo
                     :param path: path starting with ``repo.path``; that prefix and
                       the one character following it are cut off before the lookup
                     """
                     prefix_length = len(repo.path) + 1
                     relative_path = path[prefix_length:]
                     changeset = repo.get_changeset()
                     return changeset.get_node(relative_path)
                 def get_node_mtime(self, node):
                     """
                     Return the date of the node's last changeset converted with
                     ``mktime``

                     :param node: vcs node exposing ``last_changeset.date``
                     """
                     changed_on = node.last_changeset.date
                     return mktime(changed_on.timetuple())
                 def add_doc(self, writer, path, repo, repo_name):
                     """
                     Add one file as a document to the writer, the data is fetched
                     from the instance of vcs backend

                     Content is only indexed for non binary files whose extension is
                     in ``INDEX_EXTENSIONS``; every other file is added with empty
                     content. A file whose content is not unicode is added with
                     empty content too and is counted in neither of the returned
                     counters.

                     :param writer: the whoosh index writer to add to
                     :param path: path of the file, prefixed with ``repo.path``
                     :param repo: instance of vcs repo
                     :param repo_name: name of the repository the file belongs to
                     :returns: tuple ``(indexed, indexed_w_content)`` of 0/1 counters
                     """
                     node = self.get_node(repo, path)
                     name_only = with_content = 0

                     wants_content = (node.extension in INDEX_EXTENSIONS
                                      and not node.is_binary)
                     if not wants_content:
                         log.debug('    >> %s' % path)
                         # only the file name gets indexed, not the content
                         u_content = u''
                         name_only += 1
                     else:
                         u_content = node.content
                         if isinstance(u_content, unicode):
                             log.debug('    >> %s [WITH CONTENT]' % path)
                             with_content += 1
                         else:
                             log.warning('  >> %s Could not get this content as unicode '
                                         'replacing with empty content' % path)
                             u_content = u''

                     unicode_path = safe_unicode(path)
                     writer.add_document(
                         fileid=unicode_path,
                         owner=unicode(repo.contact),
                         repository=safe_unicode(repo_name),
                         path=unicode_path,
                         content=u_content,
                         modtime=self.get_node_mtime(node),
                         extension=node.extension
                     )
                     return name_only, with_content
                 def index_changesets(self, writer, repo_name, repo, start_rev=None):
                     """
                     Feed every changeset of the vcs repo, beginning at start_rev,
                     into the index writer

                     :param writer: the whoosh index writer to add to
                     :param repo_name: name of the repository the changesets come
                       from, including the repository group
                     :param repo: the vcs repository instance to index changesets for,
                       it is presumed to contain at least one changeset
                     :param start_rev=None: the full sha id to start indexing from,
                       when it is None indexing starts at the first changeset of
                       the repo
                     :returns: number of changesets added to the writer
                     """
                     if start_rev is None:
                         start_rev = repo[0].raw_id

                     log.debug('indexing changesets in %s starting at rev: %s' %
                               (repo_name, start_rev))

                     def joined_paths(nodes):
                         # lower-cased, space separated paths of the given nodes
                         return u' '.join(
                             [safe_unicode(node.path) for node in nodes]).lower()

                     count = 0
                     for cs in repo.get_changesets(start=start_rev):
                         log.debug('    >> %s' % cs)
                         writer.add_document(
                             raw_id=unicode(cs.raw_id),
                             owner=unicode(repo.contact),
                             date=cs._timestamp,
                             repository=safe_unicode(repo_name),
                             author=cs.author,
                             message=cs.message,
                             last=cs.last,
                             added=joined_paths(cs.added),
                             removed=joined_paths(cs.removed),
                             changed=joined_paths(cs.changed),
                             parents=u' '.join(
                                 [parent.raw_id for parent in cs.parents]),
                         )
                         count += 1

                     log.debug('indexed %d changesets for repo %s' % (count, repo_name))
                     return count
                 def index_files(self, file_idx_writer, repo_name, repo):
                     """
                     Add every file returned by ``get_paths`` for the repo to the
                     file index writer

                     :param file_idx_writer: the whoosh index writer to add to
                     :param repo_name: name of the repository we're indexing
                     :param repo: instance of vcs repo
                     :returns: tuple of summed up ``add_doc`` counters
                       ``(indexed, indexed_with_content)``
                     """
                     name_only_total = 0
                     with_content_total = 0
                     log.debug('building index for [%s]' % repo.path)

                     for file_path in self.get_paths(repo):
                         name_only, with_content = self.add_doc(
                             file_idx_writer, file_path, repo, repo_name)
                         name_only_total += name_only
                         with_content_total += with_content

                     log.debug('added %s files %s with content for repo %s' %
                               (name_only_total + with_content_total,
                                with_content_total, repo.path))
                     return name_only_total, with_content_total
                 def update_changeset_index(self):
                     """
                     Incrementally update the changeset index

                     For every repository with at least one revision the document
                     flagged as ``last`` is looked up in the index. When nothing is
                     indexed yet for the repository, or the repository holds more
                     revisions than the indexed last one, the previous ``last``
                     document(s) are deleted and the changesets are indexed again
                     starting from that changeset. The writer is committed when
                     something was written and cancelled otherwise.
                     """
                     idx = open_dir(self.index_location, indexname=CHGSET_IDX_NAME)
                     # the parser does not depend on the repository, build it once
                     qp = QueryParser('repository', schema=CHGSETS_SCHEMA)

                     with idx.searcher() as searcher:
                         writer = idx.writer()
                         writer_is_dirty = False
                         try:
                             indexed_total = 0
                             # keeps the summary log below working when there are
                             # no repositories to loop over
                             repo_name = None
                             for repo_name, repo in self.repo_paths.items():
                                 # skip indexing if there aren't any revs in the repo
                                 num_of_revs = len(repo)
                                 if num_of_revs < 1:
                                     continue

                                 q = qp.parse(u"last:t AND %s" % repo_name)
                                 results = searcher.search(q)

                                 # default to scanning the entire repo
                                 last_rev = 0
                                 start_id = None

                                 if len(results) > 0:
                                     # assuming that there is only one result, if not this
                                     # may require a full re-index.
                                     start_id = results[0]['raw_id']
                                     last_rev = repo.get_changeset(revision=start_id).revision

                                 # there are new changesets to index or a new repo to index
                                 if last_rev == 0 or num_of_revs > last_rev + 1:
                                     # delete the docs in the index for the previous
                                     # last changeset(s)
                                     for hit in results:
                                         q = qp.parse(u"last:t AND %s AND raw_id:%s" %
                                                         (repo_name, hit['raw_id']))
                                         writer.delete_by_query(q)

                                     # index from the previous last changeset + all new ones
                                     indexed_total += self.index_changesets(writer,
                                                             repo_name, repo, start_id)
                                     writer_is_dirty = True

                             log.debug('indexed %s changesets for repo %s' % (
                                          indexed_total, repo_name)
                             )
                         finally:
                             if writer_is_dirty:
                                 log.debug('>> COMMITING CHANGES TO CHANGESET INDEX<<')
                                 writer.commit(merge=True)
                                 log.debug('>> COMMITTED CHANGES TO CHANGESET INDEX<<')
                             else:
                                 # cancel() has to be called, a bare attribute access
                                 # does nothing and leaves the writer open
                                 writer.cancel()
                                 log.debug('>> NOTHING TO COMMIT<<')
                 def update_file_index(self):
                     """
                     Incrementally update the file content index

                     First pass: walk over all stored documents. For documents of
                     repositories listed in ``filtered_repo_update_paths`` the stored
                     ``modtime`` is compared with the current one; changed files are
                     deleted and remembered for re-indexing, files that cannot be
                     found any more are deleted from the index.

                     Second pass: walk over the files of every repository that has
                     revisions and add those that changed or are not indexed yet.

                     The writer is committed when something was written and
                     cancelled otherwise.
                     """
                     log.debug((u'STARTING INCREMENTAL INDEXING UPDATE FOR EXTENSIONS %s '
                                'AND REPOS %s') % (INDEX_EXTENSIONS, self.repo_paths.keys()))

                     idx = open_dir(self.index_location, indexname=self.indexname)
                     # every path that is currently stored in the index
                     known_paths = set()
                     # paths that changed and have to be indexed again
                     stale_paths = set()

                     writer = idx.writer()
                     dirty = False
                     try:
                         with idx.reader() as reader:
                             for stored in reader.all_stored_fields():
                                 stored_path = stored['path']
                                 stored_repo = stored['repository']
                                 known_paths.add(stored_path)

                                 if stored_repo not in self.filtered_repo_update_paths:
                                     continue

                                 repo = self.repo_paths[stored_repo]
                                 try:
                                     node = self.get_node(repo, stored_path)
                                     # compare the time stored at indexing with the
                                     # current one
                                     stored_mtime = stored['modtime']
                                     current_mtime = self.get_node_mtime(node)
                                     if current_mtime > stored_mtime:
                                         # changed since it was indexed: drop the old
                                         # document and schedule a re-index
                                         log.debug(
                                             'adding to reindex list %s mtime: %s vs %s' % (
                                                 stored_path, current_mtime, stored_mtime)
                                         )
                                         writer.delete_by_term('fileid', stored_path)
                                         dirty = True
                                         stale_paths.add(stored_path)
                                 except (ChangesetError, NodeDoesNotExistError):
                                     # the file is gone from the repository
                                     log.debug('removing from index %s' % stored_path)
                                     writer.delete_by_term('path', stored_path)
                                     dirty = True

                         # second pass over the files currently in the repositories
                         files_total = 0  # documents added over all repositories
                         content_total = 0  # of those, indexed with content
                         for repo_name, repo in self.repo_paths.items():
                             # skip indexing if there aren't any revisions
                             if len(repo) < 1:
                                 continue

                             repo_name_only = 0  # indexed without content
                             repo_with_content = 0  # indexed with content
                             for path in self.get_paths(repo):
                                 path = safe_unicode(path)
                                 if path not in stale_paths and path in known_paths:
                                     # indexed already and unchanged
                                     continue
                                 # either a changed file or one never indexed before
                                 name_only, with_content = self.add_doc(
                                     writer, path, repo, repo_name)
                                 dirty = True
                                 log.debug('re indexing %s' % path)
                                 repo_name_only += name_only
                                 files_total += 1
                                 repo_with_content += with_content
                                 content_total += with_content

                             log.debug('added %s files %s with content for repo %s' % (
                                          repo_name_only + repo_with_content,
                                          repo_with_content, repo.path)
                             )

                         log.debug('indexed %s files in total and %s with content' % (
                                     files_total, content_total)
                         )
                     finally:
                         if dirty:
                             log.debug('>> COMMITING CHANGES <<')
                             writer.commit(merge=True)
                             log.debug('>>> FINISHED REBUILDING INDEX <<<')
                         else:
                             log.debug('>> NOTHING TO COMMIT<<')
                             writer.cancel()
                 def build_indexes(self):
                     """
                     Build the changeset index and the file content index from
                     scratch

                     An existing index directory is removed and created again, then
                     the files and changesets of every repository that has at least
                     one revision are indexed and both writers are committed.
                     """
                     location = self.index_location
                     if os.path.exists(location):
                         log.debug('removing previous index')
                         rmtree(location)

                     if not os.path.exists(location):
                         os.mkdir(location)

                     changesets_index = create_in(location, CHGSETS_SCHEMA,
                                                  indexname=CHGSET_IDX_NAME)
                     changesets_writer = changesets_index.writer()

                     files_index = create_in(location, SCHEMA, indexname=IDX_NAME)
                     files_writer = files_index.writer()

                     log.debug('BUILDING INDEX FOR EXTENSIONS %s '
                               'AND REPOS %s' % (INDEX_EXTENSIONS, self.repo_paths.keys()))

                     for repo_name, repo in self.repo_paths.items():
                         # only repositories with revisions have something to index
                         if len(repo) >= 1:
                             self.index_files(files_writer, repo_name, repo)
                             self.index_changesets(changesets_writer, repo_name, repo)

                     log.debug('>> COMMITING CHANGES <<')
                     files_writer.commit(merge=True)
                     changesets_writer.commit(merge=True)
                     log.debug('>>> FINISHED BUILDING INDEX <<<')
                 def update_indexes(self):
                     self.update_file_index()
                     self.update_changeset_index()
                 def run(self, full_index=False):
                     """Run daemon"""
                     if full_index or self.initial:
                         self.build_indexes()
                     else:
                         self.update_indexes()