upstream/kallithea Commit - r2641:cfcd981d

1

# -*- coding: utf-8 -*-

1

# -*- coding: utf-8 -*-

2

"""

2

"""

3

rhodecode.lib.indexers.daemon

3

rhodecode.lib.indexers.daemon

4

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

4

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

5

6

A daemon will read from task table and run tasks

6

A daemon will read from task table and run tasks

7

8

:created_on: Jan 26, 2010

8

:created_on: Jan 26, 2010

9

:author: marcink

9

:author: marcink

10

11

:license: GPLv3, see COPYING for more details.

11

:license: GPLv3, see COPYING for more details.

12

"""

12

"""

13

# This program is free software: you can redistribute it and/or modify

13

# This program is free software: you can redistribute it and/or modify

14

# it under the terms of the GNU General Public License as published by

14

# it under the terms of the GNU General Public License as published by

15

# the Free Software Foundation, either version 3 of the License, or

15

# the Free Software Foundation, either version 3 of the License, or

16

# (at your option) any later version.

16

# (at your option) any later version.

17

#

17

#

18

# This program is distributed in the hope that it will be useful,

18

# This program is distributed in the hope that it will be useful,

19

# but WITHOUT ANY WARRANTY; without even the implied warranty of

19

# but WITHOUT ANY WARRANTY; without even the implied warranty of

20

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

20

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

21

# GNU General Public License for more details.

21

# GNU General Public License for more details.

22

#

22

#

23

# You should have received a copy of the GNU General Public License

23

# You should have received a copy of the GNU General Public License

24

# along with this program. If not, see <http://www.gnu.org/licenses/>.

24

# along with this program. If not, see <http://www.gnu.org/licenses/>.

25

from __future__ import with_statement

25

26

import os

27

import os

27

import sys

28

import sys

28

import logging

29

import logging

29

import traceback

30

import traceback

30

31

from shutil import rmtree

32

from shutil import rmtree

32

from time import mktime

33

from time import mktime

33

34

from os.path import dirname as dn

35

from os.path import dirname as dn

35

from os.path import join as jn

36

from os.path import join as jn

36

37

#to get the rhodecode import

38

#to get the rhodecode import

38

project_path = dn(dn(dn(dn(os.path.realpath(__file__)))))

39

project_path = dn(dn(dn(dn(os.path.realpath(__file__)))))

39

sys.path.append(project_path)

40

sys.path.append(project_path)

40

41

from rhodecode.config.conf import INDEX_EXTENSIONS

42

from rhodecode.config.conf import INDEX_EXTENSIONS

42

from rhodecode.model.scm import ScmModel

43

from rhodecode.model.scm import ScmModel

43

from rhodecode.lib.utils2 import safe_unicode

44

from rhodecode.lib.utils2 import safe_unicode

44

from rhodecode.lib.indexers import SCHEMA, IDX_NAME, CHGSETS_SCHEMA, CHGSET_IDX_NAME

45

from rhodecode.lib.indexers import SCHEMA, IDX_NAME, CHGSETS_SCHEMA, CHGSET_IDX_NAME

45

46

from rhodecode.lib.vcs.exceptions import ChangesetError, RepositoryError, \

47

from rhodecode.lib.vcs.exceptions import ChangesetError, RepositoryError, \

47

NodeDoesNotExistError

48

NodeDoesNotExistError

48

49

from whoosh.index import create_in, open_dir, exists_in

50

from whoosh.index import create_in, open_dir, exists_in

50

from whoosh.query import *

51

from whoosh.query import *

51

from whoosh.qparser import QueryParser

52

from whoosh.qparser import QueryParser

52

53

log = logging.getLogger('whoosh_indexer')

54

log = logging.getLogger('whoosh_indexer')

54

55

56

class WhooshIndexingDaemon(object):

57

class WhooshIndexingDaemon(object):

57

"""

58

"""

58

Daemon for atomic indexing jobs

59

Daemon for atomic indexing jobs

59

"""

60

"""

60

61

def __init__(self, indexname=IDX_NAME, index_location=None,
             repo_location=None, sa=None, repo_list=None,
             repo_update_list=None):
    """
    Prepare the daemon and decide whether a full index build is needed

    :param indexname: name of the file content index
    :param index_location: directory that holds the indexes (required)
    :param repo_location: directory scanned for repositories (required)
    :param sa: handed over to ``ScmModel``
    :param repo_list: optional collection of repository names; when
        given, only repositories named in it are kept
    :param repo_update_list: optional collection of repository names;
        when given, the (already filtered) repositories are narrowed
        down to the ones named in it
    :raises Exception: when index_location or repo_location is not given
    """
    self.indexname = indexname

    self.index_location = index_location
    if not index_location:
        raise Exception('You have to provide index location')

    self.repo_location = repo_location
    if not repo_location:
        raise Exception('You have to provide repositories location')

    self.repo_paths = ScmModel(sa).repo_scan(self.repo_location)

    # keep only the repositories that were explicitly asked for
    if repo_list:
        self.filtered_repo_paths = dict(
            (name, scm_repo)
            for name, scm_repo in self.repo_paths.items()
            if name in repo_list
        )
        self.repo_paths = self.filtered_repo_paths

    # same filtering again for the update list; the attribute always
    # exists, it simply stays empty when no update list was passed
    self.filtered_repo_update_paths = {}
    if repo_update_list:
        self.filtered_repo_update_paths = dict(
            (name, scm_repo)
            for name, scm_repo in self.repo_paths.items()
            if name in repo_update_list
        )
        self.repo_paths = self.filtered_repo_update_paths

    # incremental updates are only possible when the directory and both
    # indexes already exist, anything else results in a full build
    self.initial = True
    if not os.path.isdir(self.index_location):
        os.makedirs(self.index_location)
        log.info('Cannot run incremental index since it does not'
                 ' yet exist running full build')
    elif not exists_in(self.index_location, IDX_NAME):
        log.info('Running full index build as the file content'
                 ' index does not exist')
    elif not exists_in(self.index_location, CHGSET_IDX_NAME):
        log.info('Running full index build as the changeset'
                 ' index does not exist')
    else:
        self.initial = False

107

108

def get_paths(self, repo):
    """
    Collect the paths of all files found in the ``tip`` changeset

    The changeset tree is walked starting at ``/`` and every file path
    is joined with ``repo.path``.

    :param repo: vcs repository instance to walk
    :returns: set of joined file paths; if a ``RepositoryError`` is
        raised the traceback is logged at debug level and the paths
        collected so far are returned
    """
    index_paths_ = set()
    try:
        tip = repo.get_changeset('tip')
        for _topnode, _dirs, files in tip.walk('/'):
            index_paths_.update(jn(repo.path, f.path) for f in files)
    except RepositoryError:
        # the error is only logged, the caller gets a (possibly partial)
        # result instead of an exception
        log.debug(traceback.format_exc())
    return index_paths_

124

125

def get_node(self, repo, path):
    """
    Fetch the node for an index path

    ``path`` is expected to start with ``repo.path`` followed by one
    separator character; that prefix is cut off before the lookup.

    :param repo: vcs repository instance
    :param path: path as produced by ``get_paths``
    :returns: whatever ``get_node`` of the changeset returned by
        ``repo.get_changeset()`` gives back for the relative path
    """
    # +1 skips the separator between the repository path and the file
    relative_path = path[len(repo.path) + 1:]
    return repo.get_changeset().get_node(relative_path)

129

130

def get_node_mtime(self, node):
    """
    Return the date of the node's last changeset as a timestamp

    :param node: object exposing ``last_changeset.date``
    :returns: result of ``time.mktime`` for that date's time tuple
    """
    changed_on = node.last_changeset.date
    return mktime(changed_on.timetuple())

132

133

def add_doc(self, writer, path, repo, repo_name):
    """
    Add a single file document to the writer; the data is fetched from
    the vcs backend instance by this function itself

    :param writer: index writer that receives the document
    :param path: index path of the file (see ``get_paths``)
    :param repo: vcs repository instance the file belongs to
    :param repo_name: name stored in the ``repository`` field
    :returns: tuple ``(indexed, indexed_w_content)``; ``indexed`` is 1
        when only the file name was indexed, ``indexed_w_content`` is 1
        when unicode content was indexed, both are 0 when the content
        could not be obtained as unicode
    """
    node = self.get_node(repo, path)
    indexed = 0
    indexed_w_content = 0

    # content is only indexed for chosen extensions, binary files are
    # skipped
    wants_content = node.extension in INDEX_EXTENSIONS and not node.is_binary
    if not wants_content:
        log.debug(' >> %s' % path)
        # just index the file name without its content
        u_content = u''
        indexed += 1
    else:
        u_content = node.content
        if isinstance(u_content, unicode):
            log.debug(' >> %s [WITH CONTENT]' % path)
            indexed_w_content += 1
        else:
            log.warning(' >> %s Could not get this content as unicode '
                        'replacing with empty content' % path)
            u_content = u''

    p = safe_unicode(path)
    writer.add_document(
        fileid=p,
        owner=unicode(repo.contact),
        repository=safe_unicode(repo_name),
        path=p,
        content=u_content,
        modtime=self.get_node_mtime(node),
        extension=node.extension
    )
    return indexed, indexed_w_content

169

170

def index_changesets(self, writer, repo_name, repo, start_rev=0):
    """
    Add all changeset in the vcs repo starting at start_rev
    to the index writer

    :param writer: index writer receiving one document per changeset
    :param repo_name: name stored in the ``repository`` field
    :param repo: vcs repository instance, sliced as ``repo[start_rev:]``
    :param start_rev: position of the first changeset to add
    """
    log.debug('indexing changesets in %s[%d:]' % (repo_name, start_rev))

    indexed = 0
    for cs in repo[start_rev:]:
        writer.add_document(
            path=unicode(cs.raw_id),
            owner=unicode(repo.contact),
            repository=safe_unicode(repo_name),
            author=cs.author,
            message=cs.message,
            revision=cs.revision,
            last=cs.last,
            added=u' '.join(node.path for node in cs.added).lower(),
            removed=u' '.join(node.path for node in cs.removed).lower(),
            changed=u' '.join(node.path for node in cs.changed).lower(),
            # the parent gets its own name: reusing ``cs`` inside a list
            # comprehension rebinds the loop variable under Python 2
            parents=u' '.join(parent.raw_id for parent in cs.parents),
        )
        indexed += 1

    log.debug('indexed %d changesets for repo %s' % (indexed, repo_name))

196

197

def index_files(self, file_idx_writer, repo_name, repo):
    """
    Add every file returned by ``get_paths`` for repo to the writer

    :param file_idx_writer: writer of the file content index
    :param repo_name: name stored in the ``repository`` field
    :param repo: vcs repository instance to index
    """
    name_only_total = 0
    with_content_total = 0
    log.debug('building index for [%s]' % repo.path)
    for file_path in self.get_paths(repo):
        name_only, with_content = self.add_doc(file_idx_writer, file_path,
                                               repo, repo_name)
        name_only_total += name_only
        with_content_total += with_content

    summary = (name_only_total + with_content_total, with_content_total,
               repo.path)
    log.debug('added %s files %s with content for repo %s' % summary)

206

207

def update_changeset_index(self):
    """
    Incrementally update the changeset index

    For every repository with at least one revision the index is
    searched for documents flagged ``last:t``. When nothing was found
    for the repository, or the repository has more revisions than the
    stored last revision accounts for, the previous "last" documents
    are deleted and changesets are indexed again starting at that
    revision.

    The writer is committed when something changed and cancelled
    otherwise; this happens in a ``finally`` clause, so it also runs
    when indexing raised.
    """
    idx = open_dir(self.index_location, indexname=CHGSET_IDX_NAME)

    with idx.searcher() as searcher:
        writer = idx.writer()
        writer_is_dirty = False
        # the parser does not depend on the repository, build it once
        qp = QueryParser('repository', schema=CHGSETS_SCHEMA)
        try:
            for repo_name, repo in self.repo_paths.items():
                # skip indexing if there aren't any revs in the repo
                revs = repo.revisions
                if len(revs) < 1:
                    continue

                q = qp.parse(u"last:t AND %s" % repo_name)
                results = searcher.search(q, sortedby='revision')

                last_rev = 0
                if len(results) > 0:
                    last_rev = results[0]['revision']

                # there are new changesets to index or a new repo to index
                if last_rev == 0 or len(revs) > last_rev + 1:
                    # delete the docs in the index for the previous
                    # last changeset(s)
                    for hit in results:
                        q = qp.parse(u"last:t AND %s AND path:%s" %
                                     (repo_name, hit['path']))
                        writer.delete_by_query(q)

                    # index from the previous last changeset + all new ones
                    self.index_changesets(writer, repo_name, repo, last_rev)
                    writer_is_dirty = True

        finally:
            if writer_is_dirty:
                log.debug('>> COMMITING CHANGES TO CHANGESET INDEX<<')
                writer.commit(merge=True)
                log.debug('>> COMMITTED CHANGES TO CHANGESET INDEX<<')
            else:
                # must be a call: the bare attribute ``writer.cancel`` is
                # a no-op and leaves the unused writer open
                writer.cancel()

248

249

def update_file_index(self):
    """
    Incrementally update the file content index

    First pass: walk the stored documents, delete the ones whose file
    changed (newer mtime) or no longer exists and remember the changed
    ones. Second pass: walk the files of every repository and index
    those that changed or were never indexed.

    The writer is committed when something changed and cancelled
    otherwise; this happens in a ``finally`` clause.
    """
    log.debug((u'STARTING INCREMENTAL INDEXING UPDATE FOR EXTENSIONS %s '
               'AND REPOS %s') % (INDEX_EXTENSIONS, self.repo_paths.keys()))

    idx = open_dir(self.index_location, indexname=self.indexname)
    # every path currently stored in the index
    known_paths = set()
    # stored paths whose file changed and has to be indexed again
    stale_paths = set()

    writer = idx.writer()
    writer_is_dirty = False
    try:
        with idx.reader() as reader:

            for stored in reader.all_stored_fields():
                stored_path = stored['path']
                stored_repo = stored['repository']
                known_paths.add(stored_path)

                # NOTE(review): filtered_repo_update_paths is only
                # populated when a repo_update_list was passed to the
                # constructor, so without one no stored document is
                # checked for changes here - confirm this is intended
                if stored_repo not in self.filtered_repo_update_paths:
                    continue

                repo = self.repo_paths[stored_repo]

                try:
                    node = self.get_node(repo, stored_path)
                    # compare the stored mtime with the current one
                    indexed_time = stored['modtime']
                    mtime = self.get_node_mtime(node)
                    if mtime > indexed_time:
                        # changed since it was indexed: drop the old
                        # document and queue the path for reindexing
                        log.debug('adding to reindex list %s mtime: %s vs %s' % (
                            stored_path, mtime, indexed_time)
                        )
                        writer.delete_by_term('fileid', stored_path)
                        writer_is_dirty = True

                        stale_paths.add(stored_path)
                except (ChangesetError, NodeDoesNotExistError):
                    # the file was deleted since it was indexed
                    log.debug('removing from index %s' % stored_path)
                    writer.delete_by_term('path', stored_path)
                    writer_is_dirty = True

        # second pass over the files that exist in the repositories
        ri_cnt_total = 0  # documents added in this run
        riwc_cnt_total = 0  # of those, the ones indexed with content
        for repo_name, repo in self.repo_paths.items():
            # skip indexing if there aren't any revisions
            if len(repo) < 1:
                continue
            ri_cnt = 0  # indexed
            riwc_cnt = 0  # indexed with content
            for path in self.get_paths(repo):
                path = safe_unicode(path)
                # already indexed and unchanged, nothing to do
                if path not in stale_paths and path in known_paths:
                    continue

                # either a changed file or a new file that was not
                # indexed before
                i, iwc = self.add_doc(writer, path, repo, repo_name)
                writer_is_dirty = True
                log.debug('re indexing %s' % path)
                ri_cnt += i
                ri_cnt_total += 1
                riwc_cnt += iwc
                riwc_cnt_total += iwc
            log.debug('added %s files %s with content for repo %s' % (
                ri_cnt + riwc_cnt, riwc_cnt, repo.path)
            )
        log.debug('indexed %s files in total and %s with content' % (
            ri_cnt_total, riwc_cnt_total)
        )
    finally:
        if writer_is_dirty:
            log.debug('>> COMMITING CHANGES <<')
            writer.commit(merge=True)
            log.debug('>>> FINISHED REBUILDING INDEX <<<')
        else:
            writer.cancel()

333

334

def build_indexes(self):
    """
    Build the file content index and the changeset index from scratch

    An existing index directory is removed and created again, both
    indexes are created in it, every repository with at least one
    revision is indexed and finally both writers are committed.
    """
    location = self.index_location
    if os.path.exists(location):
        log.debug('removing previous index')
        rmtree(location)

    if not os.path.exists(location):
        os.mkdir(location)

    chgset_idx = create_in(location, CHGSETS_SCHEMA,
                           indexname=CHGSET_IDX_NAME)
    chgset_idx_writer = chgset_idx.writer()

    file_idx = create_in(location, SCHEMA, indexname=IDX_NAME)
    file_idx_writer = file_idx.writer()
    log.debug('BUILDING INDEX FOR EXTENSIONS %s '
              'AND REPOS %s' % (INDEX_EXTENSIONS, self.repo_paths.keys()))

    for repo_name, repo in self.repo_paths.items():
        # repositories without any revision have nothing to index
        if len(repo) >= 1:
            self.index_files(file_idx_writer, repo_name, repo)
            self.index_changesets(chgset_idx_writer, repo_name, repo)

    log.debug('>> COMMITING CHANGES <<')
    file_idx_writer.commit(merge=True)
    chgset_idx_writer.commit(merge=True)
    log.debug('>>> FINISHED BUILDING INDEX <<<')

362

363

def update_indexes(self):
    """
    Run the incremental update of both indexes, file content index
    first and changeset index second
    """
    # file content index
    self.update_file_index()
    # changeset index
    self.update_changeset_index()

366

367

def run(self, full_index=False):
    """
    Run daemon

    :param full_index: force a full build; a full build is also done
        when ``self.initial`` is set, otherwise the existing indexes
        are updated incrementally
    """
    if not (full_index or self.initial):
        self.update_indexes()
        return
    self.build_indexes()

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

             # -*- coding: utf-8 -*-
             """
                 rhodecode.lib.indexers.daemon
                 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                 A daemon will read from task table and run tasks
                 :created_on: Jan 26, 2010
                 :author: marcink
                 :copyright: (C) 2010-2012 Marcin Kuzminski <marcin@python-works.com>
                 :license: GPLv3, see COPYING for more details.
             """
             # This program is free software: you can redistribute it and/or modify
             # it under the terms of the GNU General Public License as published by
             # the Free Software Foundation, either version 3 of the License, or
             # (at your option) any later version.
             #
             # This program is distributed in the hope that it will be useful,
             # but WITHOUT ANY WARRANTY; without even the implied warranty of
             # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
             # GNU General Public License for more details.
             #
             # You should have received a copy of the GNU General Public License
             # along with this program.  If not, see <http://www.gnu.org/licenses/>.
+            from __future__ import with_statement
             import os
             import sys
             import logging
             import traceback
             from shutil import rmtree
             from time import mktime
             from os.path import dirname as dn
             from os.path import join as jn
             # Append the directory four levels above this file's real path to
             # sys.path so that the ``rhodecode`` imports below can be resolved.
             project_path = dn(dn(dn(dn(os.path.realpath(__file__)))))
             sys.path.append(project_path)
             from rhodecode.config.conf import INDEX_EXTENSIONS
             from rhodecode.model.scm import ScmModel
             from rhodecode.lib.utils2 import safe_unicode
             from rhodecode.lib.indexers import SCHEMA, IDX_NAME, CHGSETS_SCHEMA, CHGSET_IDX_NAME
             from rhodecode.lib.vcs.exceptions import ChangesetError, RepositoryError, \
                 NodeDoesNotExistError
             from whoosh.index import create_in, open_dir, exists_in
             from whoosh.query import *
             from whoosh.qparser import QueryParser
             # Module-level logger; all indexer messages go to 'whoosh_indexer'.
             log = logging.getLogger('whoosh_indexer')
             class WhooshIndexingDaemon(object):
                 """
                 Daemon for atomic indexing jobs

                 Maintains two Whoosh indexes inside ``index_location``: the file
                 index (``IDX_NAME``) and the changeset index (``CHGSET_IDX_NAME``).
                 ``run`` either rebuilds both from scratch or updates them
                 incrementally.
                 """

                 def __init__(self, indexname=IDX_NAME, index_location=None,
                              repo_location=None, sa=None, repo_list=None,
                              repo_update_list=None):
                     """
                     Scan repositories and decide whether a full build is required

                     :param indexname: name of the file index opened by
                         ``update_file_index``
                     :param index_location: directory holding the indexes (required)
                     :param repo_location: directory scanned for repositories
                         (required)
                     :param sa: passed through to ``ScmModel``
                     :param repo_list: optional collection of repository names; when
                         given, only these repositories are kept
                     :param repo_update_list: optional collection of repository names;
                         when given, only these repositories are kept and they are
                         the ones checked for changed files on incremental updates
                     :raises Exception: if ``index_location`` or ``repo_location``
                         is not provided
                     """
                     self.indexname = indexname
                     self.index_location = index_location
                     if not index_location:
                         raise Exception('You have to provide index location')

                     self.repo_location = repo_location
                     if not repo_location:
                         raise Exception('You have to provide repositories location')

                     self.repo_paths = ScmModel(sa).repo_scan(self.repo_location)

                     # filter repo list
                     if repo_list:
                         self.filtered_repo_paths = {}
                         for repo_name, repo in self.repo_paths.items():
                             if repo_name in repo_list:
                                 self.filtered_repo_paths[repo_name] = repo
                         self.repo_paths = self.filtered_repo_paths

                     # filter update repo list
                     self.filtered_repo_update_paths = {}
                     if repo_update_list:
                         for repo_name, repo in self.repo_paths.items():
                             if repo_name in repo_update_list:
                                 self.filtered_repo_update_paths[repo_name] = repo
                         self.repo_paths = self.filtered_repo_update_paths

                     # ``initial`` stays True (forcing a full build in ``run``) unless
                     # the index directory and both indexes already exist.
                     self.initial = True
                     if not os.path.isdir(self.index_location):
                         os.makedirs(self.index_location)
                         log.info('Cannot run incremental index since it does not'
                                  ' yet exist running full build')
                     elif not exists_in(self.index_location, IDX_NAME):
                         log.info('Running full index build as the file content'
                                  ' index does not exist')
                     elif not exists_in(self.index_location, CHGSET_IDX_NAME):
                         log.info('Running full index build as the changeset'
                                  ' index does not exist')
                     else:
                         self.initial = False

                 def get_paths(self, repo):
                     """
                     recursive walk in root dir and return a set of all path in that dir
                     based on repository walk function

                     Each path is ``repo.path`` joined with the file's path in the
                     ``tip`` changeset. A ``RepositoryError`` is logged at debug level
                     and whatever was collected so far is returned.
                     """
                     index_paths_ = set()
                     try:
                         tip = repo.get_changeset('tip')
                         for _topnode, _dirs, files in tip.walk('/'):
                             for f in files:
                                 index_paths_.add(jn(repo.path, f.path))
                     except RepositoryError:
                         # best effort: keep the paths gathered before the failure
                         log.debug(traceback.format_exc())
                     return index_paths_

                 def get_node(self, repo, path):
                     """
                     Return the node for ``path`` from ``repo.get_changeset()``

                     ``path`` is expected to start with ``repo.path`` followed by one
                     separator character; that prefix is stripped before the lookup.
                     """
                     n_path = path[len(repo.path) + 1:]
                     node = repo.get_changeset().get_node(n_path)
                     return node

                 def get_node_mtime(self, node):
                     """
                     Return the date of the node's last changeset converted with
                     ``time.mktime``
                     """
                     return mktime(node.last_changeset.date.timetuple())

                 def add_doc(self, writer, path, repo, repo_name):
                     """
                     Adding doc to writer this function itself fetches data from
                     the instance of vcs backend

                     :returns: tuple ``(indexed, indexed_w_content)``; each is 0 or 1.
                         A file whose content could not be obtained as unicode is
                         still added (with empty content) but counted in neither.
                     """
                     node = self.get_node(repo, path)
                     indexed = indexed_w_content = 0
                     # we just index the content of chosen files, and skip binary files
                     if node.extension in INDEX_EXTENSIONS and not node.is_binary:
                         u_content = node.content
                         if not isinstance(u_content, unicode):
                             log.warning('  >> %s Could not get this content as unicode '
                                         'replacing with empty content' % path)
                             u_content = u''
                         else:
                             log.debug('    >> %s [WITH CONTENT]' % path)
                             indexed_w_content += 1
                     else:
                         log.debug('    >> %s' % path)
                         # just index file name without it's content
                         u_content = u''
                         indexed += 1

                     p = safe_unicode(path)
                     writer.add_document(
                         fileid=p,
                         owner=unicode(repo.contact),
                         repository=safe_unicode(repo_name),
                         path=p,
                         content=u_content,
                         modtime=self.get_node_mtime(node),
                         extension=node.extension
                     )
                     return indexed, indexed_w_content

                 def index_changesets(self, writer, repo_name, repo, start_rev=0):
                     """
                     Add all changeset in the vcs repo starting at start_rev
                     to the index writer
                     """
                     log.debug('indexing changesets in %s[%d:]' % (repo_name, start_rev))

                     indexed = 0
                     for cs in repo[start_rev:]:
                         writer.add_document(
                             path=unicode(cs.raw_id),
                             owner=unicode(repo.contact),
                             repository=safe_unicode(repo_name),
                             author=cs.author,
                             message=cs.message,
                             revision=cs.revision,
                             last=cs.last,
                             added=u' '.join([node.path for node in cs.added]).lower(),
                             removed=u' '.join([node.path for node in cs.removed]).lower(),
                             changed=u' '.join([node.path for node in cs.changed]).lower(),
                             # use a distinct loop name: in Python 2 a list
                             # comprehension variable leaks and would rebind ``cs``
                             parents=u' '.join([parent.raw_id for parent in cs.parents]),
                         )
                         indexed += 1

                     log.debug('indexed %d changesets for repo %s' % (indexed, repo_name))

                 def index_files(self, file_idx_writer, repo_name, repo):
                     """
                     Add every path returned by ``get_paths(repo)`` to the file index
                     writer and log how many files were added
                     """
                     i_cnt = iwc_cnt = 0
                     log.debug('building index for [%s]' % repo.path)
                     for idx_path in self.get_paths(repo):
                         i, iwc = self.add_doc(file_idx_writer, idx_path, repo, repo_name)
                         i_cnt += i
                         iwc_cnt += iwc

                     log.debug('added %s files %s with content for repo %s' % (i_cnt + iwc_cnt, iwc_cnt, repo.path))

                 def update_changeset_index(self):
                     """
                     Incrementally update the changeset index

                     For every repository with at least one revision the index is
                     searched for documents matching ``last:t`` for that repository.
                     When none is found with a non-zero revision, or the repository
                     has more revisions than the recorded one, those documents are
                     deleted and changesets are indexed again starting at the
                     recorded revision. The writer is committed only if something
                     was written, otherwise it is cancelled.
                     """
                     idx = open_dir(self.index_location, indexname=CHGSET_IDX_NAME)

                     with idx.searcher() as searcher:
                         writer = idx.writer()
                         writer_is_dirty = False
                         try:
                             for repo_name, repo in self.repo_paths.items():
                                 # skip indexing if there aren't any revs in the repo
                                 revs = repo.revisions
                                 if len(revs) < 1:
                                     continue

                                 qp = QueryParser('repository', schema=CHGSETS_SCHEMA)
                                 q = qp.parse(u"last:t AND %s" % repo_name)

                                 results = searcher.search(q, sortedby='revision')

                                 last_rev = 0
                                 if len(results) > 0:
                                     last_rev = results[0]['revision']

                                 # there are new changesets to index or a new repo to index
                                 if last_rev == 0 or len(revs) > last_rev + 1:
                                     # delete the docs in the index for the previous last changeset(s)
                                     for hit in results:
                                         q = qp.parse(u"last:t AND %s AND path:%s" %
                                                         (repo_name, hit['path']))
                                         writer.delete_by_query(q)

                                     # index from the previous last changeset + all new ones
                                     self.index_changesets(writer, repo_name, repo, last_rev)
                                     writer_is_dirty = True

                         finally:
                             if writer_is_dirty:
                                 log.debug('>> COMMITING CHANGES TO CHANGESET INDEX<<')
                                 writer.commit(merge=True)
                                 log.debug('>> COMMITTED CHANGES TO CHANGESET INDEX<<')
                             else:
                                 # must actually be called, a bare attribute access
                                 # would leave the writer open
                                 writer.cancel()

                 def update_file_index(self):
                     """
                     Incrementally update the file index

                     First pass: walk the stored fields of every indexed document,
                     remember its path, and for documents of repositories in
                     ``filtered_repo_update_paths`` delete the entry when the file is
                     gone or schedule it for re-indexing when its mtime is newer than
                     the stored ``modtime``. Second pass: walk the current files of
                     every repository and index those scheduled or not indexed yet.
                     The writer is committed only if something changed, otherwise it
                     is cancelled.
                     """
                     log.debug((u'STARTING INCREMENTAL INDEXING UPDATE FOR EXTENSIONS %s '
                                'AND REPOS %s') % (INDEX_EXTENSIONS, self.repo_paths.keys()))

                     idx = open_dir(self.index_location, indexname=self.indexname)
                     # The set of all paths in the index
                     indexed_paths = set()
                     # The set of all paths we need to re-index
                     to_index = set()

                     writer = idx.writer()
                     writer_is_dirty = False
                     try:
                         with idx.reader() as reader:

                             # Loop over the stored fields in the index
                             for fields in reader.all_stored_fields():
                                 indexed_path = fields['path']
                                 indexed_repo_path = fields['repository']
                                 indexed_paths.add(indexed_path)

                                 # NOTE(review): when no ``repo_update_list`` was given
                                 # to __init__ this dict is empty, so every stored
                                 # document is skipped here and only new paths are
                                 # indexed below - confirm this is intended
                                 if indexed_repo_path not in self.filtered_repo_update_paths:
                                     continue

                                 repo = self.repo_paths[indexed_repo_path]

                                 try:
                                     node = self.get_node(repo, indexed_path)
                                     # Check if this file was changed since it was indexed
                                     indexed_time = fields['modtime']
                                     mtime = self.get_node_mtime(node)
                                     if mtime > indexed_time:
                                         # The file has changed, delete it and add it to the list of
                                         # files to reindex
                                         log.debug('adding to reindex list %s mtime: %s vs %s' % (
                                                         indexed_path, mtime, indexed_time)
                                         )
                                         writer.delete_by_term('fileid', indexed_path)
                                         writer_is_dirty = True

                                         to_index.add(indexed_path)
                                 except (ChangesetError, NodeDoesNotExistError):
                                     # This file was deleted since it was indexed
                                     log.debug('removing from index %s' % indexed_path)
                                     # NOTE(review): the branch above deletes by
                                     # 'fileid' while this one deletes by 'path'; both
                                     # fields get the same value in add_doc - confirm
                                     # 'path' is matched as a whole term
                                     writer.delete_by_term('path', indexed_path)
                                     writer_is_dirty = True

                         # Loop over the files in the filesystem
                         # Assume we have a function that gathers the filenames of the
                         # documents to be indexed
                         ri_cnt_total = 0  # indexed
                         riwc_cnt_total = 0  # indexed with content
                         for repo_name, repo in self.repo_paths.items():
                             # skip indexing if there aren't any revisions
                             if len(repo) < 1:
                                 continue
                             ri_cnt = 0   # indexed
                             riwc_cnt = 0  # indexed with content
                             for path in self.get_paths(repo):
                                 path = safe_unicode(path)
                                 if path in to_index or path not in indexed_paths:

                                     # This is either a file that's changed, or a new file
                                     # that wasn't indexed before. So index it!
                                     i, iwc = self.add_doc(writer, path, repo, repo_name)
                                     writer_is_dirty = True
                                     log.debug('re indexing %s' % path)
                                     ri_cnt += i
                                     ri_cnt_total += 1
                                     riwc_cnt += iwc
                                     riwc_cnt_total += iwc
                             log.debug('added %s files %s with content for repo %s' % (
                                          ri_cnt + riwc_cnt, riwc_cnt, repo.path)
                             )
                         log.debug('indexed %s files in total and %s with content' % (
                                     ri_cnt_total, riwc_cnt_total)
                         )
                     finally:
                         if writer_is_dirty:
                             log.debug('>> COMMITING CHANGES <<')
                             writer.commit(merge=True)
                             log.debug('>>> FINISHED REBUILDING INDEX <<<')
                         else:
                             writer.cancel()

                 def build_indexes(self):
                     """
                     Rebuild both indexes from scratch

                     Removes any existing ``index_location`` directory, recreates it,
                     creates the changeset and file indexes, indexes every repository
                     that has at least one revision and commits both writers.
                     """
                     if os.path.exists(self.index_location):
                         log.debug('removing previous index')
                         rmtree(self.index_location)

                     if not os.path.exists(self.index_location):
                         os.mkdir(self.index_location)

                     chgset_idx = create_in(self.index_location, CHGSETS_SCHEMA, indexname=CHGSET_IDX_NAME)
                     chgset_idx_writer = chgset_idx.writer()

                     file_idx = create_in(self.index_location, SCHEMA, indexname=IDX_NAME)
                     file_idx_writer = file_idx.writer()
                     log.debug('BUILDING INDEX FOR EXTENSIONS %s '
                               'AND REPOS %s' % (INDEX_EXTENSIONS, self.repo_paths.keys()))

                     for repo_name, repo in self.repo_paths.items():
                         # skip indexing if there aren't any revisions
                         if len(repo) < 1:
                             continue

                         self.index_files(file_idx_writer, repo_name, repo)
                         self.index_changesets(chgset_idx_writer, repo_name, repo)

                     log.debug('>> COMMITING CHANGES <<')
                     file_idx_writer.commit(merge=True)
                     chgset_idx_writer.commit(merge=True)
                     log.debug('>>> FINISHED BUILDING INDEX <<<')

                 def update_indexes(self):
                     """Incrementally update the file index, then the changeset index"""
                     self.update_file_index()
                     self.update_changeset_index()

                 def run(self, full_index=False):
                     """Run daemon

                     :param full_index: when true, rebuild the indexes from scratch; a
                         full build is also done when ``self.initial`` is set.
                         Otherwise the existing indexes are updated incrementally.
                     """
                     if full_index or self.initial:
                         self.build_indexes()
                     else:
                         self.update_indexes()