fixed #850 Whoosh indexer should use the default revision flag to make index
marcink
r3916:ba08786c beta
--- a/rhodecode/lib/indexers/daemon.py
+++ b/rhodecode/lib/indexers/daemon.py
@@ -1,416 +1,430 @@
 # -*- coding: utf-8 -*-
 """
     rhodecode.lib.indexers.daemon
     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

     A daemon will read from task table and run tasks

     :created_on: Jan 26, 2010
     :author: marcink
     :copyright: (C) 2010-2012 Marcin Kuzminski <marcin@python-works.com>
     :license: GPLv3, see COPYING for more details.
 """
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation, either version 3 of the License, or
 # (at your option) any later version.
 #
 # This program is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 # GNU General Public License for more details.
 #
 # You should have received a copy of the GNU General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 from __future__ import with_statement

 import os
 import sys
 import logging
 import traceback

 from shutil import rmtree
 from time import mktime

 from os.path import dirname as dn
 from os.path import join as jn

 #to get the rhodecode import
 project_path = dn(dn(dn(dn(os.path.realpath(__file__)))))
 sys.path.append(project_path)

 from rhodecode.config.conf import INDEX_EXTENSIONS
 from rhodecode.model.scm import ScmModel
+from rhodecode.model.db import Repository
 from rhodecode.lib.utils2 import safe_unicode, safe_str
 from rhodecode.lib.indexers import SCHEMA, IDX_NAME, CHGSETS_SCHEMA, \
     CHGSET_IDX_NAME

 from rhodecode.lib.vcs.exceptions import ChangesetError, RepositoryError, \
     NodeDoesNotExistError

 from whoosh.index import create_in, open_dir, exists_in
 from whoosh.query import *
 from whoosh.qparser import QueryParser

 log = logging.getLogger('whoosh_indexer')


 class WhooshIndexingDaemon(object):
     """
     Daemon for atomic indexing jobs
     """

     def __init__(self, indexname=IDX_NAME, index_location=None,
                  repo_location=None, sa=None, repo_list=None,
                  repo_update_list=None):
         self.indexname = indexname

         self.index_location = index_location
         if not index_location:
             raise Exception('You have to provide index location')

         self.repo_location = repo_location
         if not repo_location:
             raise Exception('You have to provide repositories location')

         self.repo_paths = ScmModel(sa).repo_scan(self.repo_location)

         #filter repo list
         if repo_list:
             #Fix non-ascii repo names to unicode
             repo_list = map(safe_unicode, repo_list)
             self.filtered_repo_paths = {}
             for repo_name, repo in self.repo_paths.items():
                 if repo_name in repo_list:
                     self.filtered_repo_paths[repo_name] = repo

             self.repo_paths = self.filtered_repo_paths

         #filter update repo list
         self.filtered_repo_update_paths = {}
         if repo_update_list:
             self.filtered_repo_update_paths = {}
             for repo_name, repo in self.repo_paths.items():
                 if repo_name in repo_update_list:
                     self.filtered_repo_update_paths[repo_name] = repo
             self.repo_paths = self.filtered_repo_update_paths

         self.initial = True
         if not os.path.isdir(self.index_location):
             os.makedirs(self.index_location)
-            log.info('Cannot run incremental index since it does not'
-                     ' yet exist running full build')
+            log.info('Cannot run incremental index since it does not '
+                     'yet exist running full build')
         elif not exists_in(self.index_location, IDX_NAME):
-            log.info('Running full index build as the file content'
-                     ' index does not exist')
+            log.info('Running full index build as the file content '
+                     'index does not exist')
         elif not exists_in(self.index_location, CHGSET_IDX_NAME):
-            log.info('Running full index build as the changeset'
-                     ' index does not exist')
+            log.info('Running full index build as the changeset '
+                     'index does not exist')
         else:
             self.initial = False

+    def _get_index_revision(self, repo):
+        db_repo = Repository.get_by_repo_name(repo.name)
+        landing_rev = 'tip'
+        if db_repo:
+            landing_rev = db_repo.landing_rev
+        return landing_rev
+
+    def _get_index_changeset(self, repo):
+        index_rev = self._get_index_revision(repo)
+        cs = repo.get_changeset(index_rev)
+        return cs
+
     def get_paths(self, repo):
         """
         recursive walk in root dir and return a set of all path in that dir
         based on repository walk function
         """
         index_paths_ = set()
         try:
-            tip = repo.get_changeset('tip')
-            for _topnode, _dirs, files in tip.walk('/'):
+            cs = self._get_index_changeset(repo)
+            for _topnode, _dirs, files in cs.walk('/'):
                 for f in files:
                     index_paths_.add(jn(safe_str(repo.path), safe_str(f.path)))

         except RepositoryError:
             log.debug(traceback.format_exc())
             pass
         return index_paths_

     def get_node(self, repo, path):
         n_path = path[len(repo.path) + 1:]
-        node = repo.get_changeset().get_node(n_path)
+        cs = self._get_index_changeset(repo)
+        node = cs.get_node(n_path)
         return node

     def get_node_mtime(self, node):
         return mktime(node.last_changeset.date.timetuple())

     def add_doc(self, writer, path, repo, repo_name):
         """
         Adding doc to writer this function itself fetches data from
         the instance of vcs backend
         """

         node = self.get_node(repo, path)
         indexed = indexed_w_content = 0
         # we just index the content of chosen files, and skip binary files
         if node.extension in INDEX_EXTENSIONS and not node.is_binary:
             u_content = node.content
             if not isinstance(u_content, unicode):
                 log.warning(' >> %s Could not get this content as unicode '
                             'replacing with empty content' % path)
                 u_content = u''
             else:
                 log.debug(' >> %s [WITH CONTENT]' % path)
                 indexed_w_content += 1

         else:
             log.debug(' >> %s' % path)
             # just index file name without it's content
             u_content = u''
             indexed += 1

         p = safe_unicode(path)
         writer.add_document(
             fileid=p,
             owner=unicode(repo.contact),
             repository=safe_unicode(repo_name),
             path=p,
             content=u_content,
             modtime=self.get_node_mtime(node),
             extension=node.extension
         )
         return indexed, indexed_w_content

     def index_changesets(self, writer, repo_name, repo, start_rev=None):
         """
         Add all changeset in the vcs repo starting at start_rev
         to the index writer

         :param writer: the whoosh index writer to add to
         :param repo_name: name of the repository from whence the
           changeset originates including the repository group
         :param repo: the vcs repository instance to index changesets for,
           the presumption is the repo has changesets to index
         :param start_rev=None: the full sha id to start indexing from
           if start_rev is None then index from the first changeset in
           the repo
         """

         if start_rev is None:
             start_rev = repo[0].raw_id

         log.debug('indexing changesets in %s starting at rev: %s' %
                   (repo_name, start_rev))

         indexed = 0
         for cs in repo.get_changesets(start=start_rev):
             log.debug(' >> %s' % cs)
             writer.add_document(
                 raw_id=unicode(cs.raw_id),
                 owner=unicode(repo.contact),
                 date=cs._timestamp,
                 repository=safe_unicode(repo_name),
                 author=cs.author,
                 message=cs.message,
                 last=cs.last,
                 added=u' '.join([safe_unicode(node.path) for node in cs.added]).lower(),
                 removed=u' '.join([safe_unicode(node.path) for node in cs.removed]).lower(),
                 changed=u' '.join([safe_unicode(node.path) for node in cs.changed]).lower(),
                 parents=u' '.join([cs.raw_id for cs in cs.parents]),
             )
             indexed += 1

         log.debug('indexed %d changesets for repo %s' % (indexed, repo_name))
         return indexed

     def index_files(self, file_idx_writer, repo_name, repo):
         """
         Index files for given repo_name

         :param file_idx_writer: the whoosh index writer to add to
         :param repo_name: name of the repository we're indexing
         :param repo: instance of vcs repo
         """
         i_cnt = iwc_cnt = 0
-        log.debug('building index for [%s]' % repo.path)
+        log.debug('building index for %s @revision:%s' % (repo.path,
+                                                self._get_index_revision(repo)))
         for idx_path in self.get_paths(repo):
             i, iwc = self.add_doc(file_idx_writer, idx_path, repo, repo_name)
             i_cnt += i
             iwc_cnt += iwc

         log.debug('added %s files %s with content for repo %s' %
                   (i_cnt + iwc_cnt, iwc_cnt, repo.path))
         return i_cnt, iwc_cnt

     def update_changeset_index(self):
         idx = open_dir(self.index_location, indexname=CHGSET_IDX_NAME)

         with idx.searcher() as searcher:
             writer = idx.writer()
             writer_is_dirty = False
             try:
                 indexed_total = 0
                 repo_name = None
                 for repo_name, repo in self.repo_paths.items():
                     # skip indexing if there aren't any revs in the repo
                     num_of_revs = len(repo)
                     if num_of_revs < 1:
                         continue

                     qp = QueryParser('repository', schema=CHGSETS_SCHEMA)
                     q = qp.parse(u"last:t AND %s" % repo_name)

                     results = searcher.search(q)

                     # default to scanning the entire repo
                     last_rev = 0
                     start_id = None

                     if len(results) > 0:
                         # assuming that there is only one result, if not this
                         # may require a full re-index.
                         start_id = results[0]['raw_id']
                         last_rev = repo.get_changeset(revision=start_id).revision

                     # there are new changesets to index or a new repo to index
                     if last_rev == 0 or num_of_revs > last_rev + 1:
                         # delete the docs in the index for the previous
                         # last changeset(s)
                         for hit in results:
                             q = qp.parse(u"last:t AND %s AND raw_id:%s" %
                                          (repo_name, hit['raw_id']))
                             writer.delete_by_query(q)

                         # index from the previous last changeset + all new ones
                         indexed_total += self.index_changesets(writer,
                                                 repo_name, repo, start_id)
                         writer_is_dirty = True
                 log.debug('indexed %s changesets for repo %s' % (
                           indexed_total, repo_name)
                 )
             finally:
                 if writer_is_dirty:
                     log.debug('>> COMMITING CHANGES TO CHANGESET INDEX<<')
                     writer.commit(merge=True)
                     log.debug('>>> FINISHED REBUILDING CHANGESET INDEX <<<')
                 else:
-                    writer.cancel
                     log.debug('>> NOTHING TO COMMIT TO CHANGESET INDEX<<')

     def update_file_index(self):
         log.debug((u'STARTING INCREMENTAL INDEXING UPDATE FOR EXTENSIONS %s '
                    'AND REPOS %s') % (INDEX_EXTENSIONS, self.repo_paths.keys()))

         idx = open_dir(self.index_location, indexname=self.indexname)
         # The set of all paths in the index
         indexed_paths = set()
         # The set of all paths we need to re-index
         to_index = set()

         writer = idx.writer()
         writer_is_dirty = False
         try:
             with idx.reader() as reader:

                 # Loop over the stored fields in the index
                 for fields in reader.all_stored_fields():
                     indexed_path = fields['path']
                     indexed_repo_path = fields['repository']
                     indexed_paths.add(indexed_path)

                     if not indexed_repo_path in self.filtered_repo_update_paths:
                         continue

                     repo = self.repo_paths[indexed_repo_path]

                     try:
                         node = self.get_node(repo, indexed_path)
                         # Check if this file was changed since it was indexed
                         indexed_time = fields['modtime']
                         mtime = self.get_node_mtime(node)
                         if mtime > indexed_time:
                             # The file has changed, delete it and add it to
                             # the list of files to reindex
                             log.debug(
                                 'adding to reindex list %s mtime: %s vs %s' % (
                                     indexed_path, mtime, indexed_time)
                             )
                             writer.delete_by_term('fileid', indexed_path)
                             writer_is_dirty = True

                             to_index.add(indexed_path)
                     except (ChangesetError, NodeDoesNotExistError):
                         # This file was deleted since it was indexed
                         log.debug('removing from index %s' % indexed_path)
                         writer.delete_by_term('path', indexed_path)
                         writer_is_dirty = True

             # Loop over the files in the filesystem
             # Assume we have a function that gathers the filenames of the
             # documents to be indexed
             ri_cnt_total = 0  # indexed
             riwc_cnt_total = 0  # indexed with content
             for repo_name, repo in self.repo_paths.items():
                 # skip indexing if there aren't any revisions
                 if len(repo) < 1:
                     continue
                 ri_cnt = 0  # indexed
                 riwc_cnt = 0  # indexed with content
                 for path in self.get_paths(repo):
                     path = safe_unicode(path)
                     if path in to_index or path not in indexed_paths:

                         # This is either a file that's changed, or a new file
                         # that wasn't indexed before. So index it!
                         i, iwc = self.add_doc(writer, path, repo, repo_name)
                         writer_is_dirty = True
                         log.debug('re indexing %s' % path)
                         ri_cnt += i
                         ri_cnt_total += 1
                         riwc_cnt += iwc
                         riwc_cnt_total += iwc
                 log.debug('added %s files %s with content for repo %s' % (
                           ri_cnt + riwc_cnt, riwc_cnt, repo.path)
                 )
             log.debug('indexed %s files in total and %s with content' % (
                       ri_cnt_total, riwc_cnt_total)
             )
         finally:
             if writer_is_dirty:
                 log.debug('>> COMMITING CHANGES TO FILE INDEX <<')
                 writer.commit(merge=True)
                 log.debug('>>> FINISHED REBUILDING FILE INDEX <<<')
             else:
                 log.debug('>> NOTHING TO COMMIT TO FILE INDEX <<')
                 writer.cancel()

     def build_indexes(self):
         if os.path.exists(self.index_location):
             log.debug('removing previous index')
             rmtree(self.index_location)

         if not os.path.exists(self.index_location):
             os.mkdir(self.index_location)

         chgset_idx = create_in(self.index_location, CHGSETS_SCHEMA,
                                indexname=CHGSET_IDX_NAME)
         chgset_idx_writer = chgset_idx.writer()

         file_idx = create_in(self.index_location, SCHEMA, indexname=IDX_NAME)
         file_idx_writer = file_idx.writer()
         log.debug('BUILDING INDEX FOR EXTENSIONS %s '
                   'AND REPOS %s' % (INDEX_EXTENSIONS, self.repo_paths.keys()))

         for repo_name, repo in self.repo_paths.items():
             # skip indexing if there aren't any revisions
             if len(repo) < 1:
                 continue

             self.index_files(file_idx_writer, repo_name, repo)
             self.index_changesets(chgset_idx_writer, repo_name, repo)

         log.debug('>> COMMITING CHANGES <<')
         file_idx_writer.commit(merge=True)
         chgset_idx_writer.commit(merge=True)
         log.debug('>>> FINISHED BUILDING INDEX <<<')

     def update_indexes(self):
         self.update_file_index()
         self.update_changeset_index()

     def run(self, full_index=False):
         """Run daemon"""
         if full_index or self.initial:
             self.build_indexes()
         else:
             self.update_indexes()
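The substance of this change: the file index is now built against each repository's configured landing revision (Repository.landing_rev, falling back to 'tip' when the repo has no database record) instead of a hard-coded 'tip'. A minimal usage sketch of the daemon after this change; the two paths are hypothetical examples, while WhooshIndexingDaemon, its constructor arguments, and run() come from the module above:

    # Run the Whoosh indexing daemon against a RhodeCode installation.
    # Paths are hypothetical; the daemon now walks each repo at the
    # revision resolved by _get_index_revision(), not at 'tip'.
    from rhodecode.lib.indexers.daemon import WhooshIndexingDaemon

    daemon = WhooshIndexingDaemon(
        index_location='/var/lib/rhodecode/index',  # hypothetical path
        repo_location='/var/lib/rhodecode/repos',   # hypothetical path
    )
    daemon.run(full_index=False)  # incremental; True forces a full rebuild

And a sketch of reading the file-content index the daemon builds, assuming the same hypothetical index location; open_dir, QueryParser, and searcher.search are standard Whoosh calls, and IDX_NAME/SCHEMA are the same objects this module imports from rhodecode.lib.indexers:

    # Query the file index built above (Python 2, matching the codebase).
    from whoosh.index import open_dir
    from whoosh.qparser import QueryParser

    from rhodecode.lib.indexers import IDX_NAME, SCHEMA

    idx = open_dir('/var/lib/rhodecode/index', indexname=IDX_NAME)
    with idx.searcher() as searcher:
        # 'content', 'repository', and 'path' are fields defined in SCHEMA
        q = QueryParser('content', schema=SCHEMA).parse(u'landing_rev')
        for hit in searcher.search(q, limit=10):
            print hit['repository'], hit['path']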