fixed problems with re-indexing non-ascii names of repositories
marcink
r2841:2fa3c09f beta
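
The fix maps every entry of repo_list through safe_unicode before the membership tests against self.repo_paths, whose keys come back from repo_scan() as unicode. What goes wrong without it, as a minimal standalone sketch (Python 2; safe_unicode below is a simplified stand-in for rhodecode.lib.utils2.safe_unicode, and the repository name is made up):

# -*- coding: utf-8 -*-

def safe_unicode(s, encoding='utf-8'):
    # simplified stand-in: decode byte strings, pass unicode through
    if isinstance(s, unicode):
        return s
    return s.decode(encoding, 'replace')

repo_name = u'zażółć'                               # repo_scan() keys are unicode
repo_list = ['za\xc5\xbc\xc3\xb3\xc5\x82\xc4\x87']  # CLI args arrive as UTF-8 bytes

print repo_name in repo_list                      # False: unicode vs str never match
print repo_name in map(safe_unicode, repo_list)   # True once normalized
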
@@ -1,414 +1,416 @@
# -*- coding: utf-8 -*-
"""
    rhodecode.lib.indexers.daemon
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    A daemon that reads from the task table and runs tasks

    :created_on: Jan 26, 2010
    :author: marcink
    :copyright: (C) 2010-2012 Marcin Kuzminski <marcin@python-works.com>
    :license: GPLv3, see COPYING for more details.
"""
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from __future__ import with_statement

import os
import sys
import logging
import traceback

from shutil import rmtree
from time import mktime

from os.path import dirname as dn
from os.path import join as jn

# to get the rhodecode import
project_path = dn(dn(dn(dn(os.path.realpath(__file__)))))
sys.path.append(project_path)

from rhodecode.config.conf import INDEX_EXTENSIONS
from rhodecode.model.scm import ScmModel
from rhodecode.lib.utils2 import safe_unicode
from rhodecode.lib.indexers import SCHEMA, IDX_NAME, CHGSETS_SCHEMA, \
    CHGSET_IDX_NAME

from rhodecode.lib.vcs.exceptions import ChangesetError, RepositoryError, \
    NodeDoesNotExistError

from whoosh.index import create_in, open_dir, exists_in
from whoosh.query import *
from whoosh.qparser import QueryParser

log = logging.getLogger('whoosh_indexer')

class WhooshIndexingDaemon(object):
    """
    Daemon for atomic indexing jobs
    """

    def __init__(self, indexname=IDX_NAME, index_location=None,
                 repo_location=None, sa=None, repo_list=None,
                 repo_update_list=None):
        self.indexname = indexname

        self.index_location = index_location
        if not index_location:
            raise Exception('You have to provide index location')

        self.repo_location = repo_location
        if not repo_location:
            raise Exception('You have to provide repositories location')

        self.repo_paths = ScmModel(sa).repo_scan(self.repo_location)

        # filter repo list
        if repo_list:
+           # fix non-ascii repo names to unicode
+           repo_list = map(safe_unicode, repo_list)
            self.filtered_repo_paths = {}
            for repo_name, repo in self.repo_paths.items():
                if repo_name in repo_list:
                    self.filtered_repo_paths[repo_name] = repo

            self.repo_paths = self.filtered_repo_paths

        # filter update repo list
        self.filtered_repo_update_paths = {}
        if repo_update_list:
            for repo_name, repo in self.repo_paths.items():
                if repo_name in repo_update_list:
                    self.filtered_repo_update_paths[repo_name] = repo
            self.repo_paths = self.filtered_repo_update_paths

        self.initial = True
        if not os.path.isdir(self.index_location):
            os.makedirs(self.index_location)
            log.info('Cannot run incremental index since it does not'
                     ' yet exist; running full build')
        elif not exists_in(self.index_location, IDX_NAME):
            log.info('Running full index build as the file content'
                     ' index does not exist')
        elif not exists_in(self.index_location, CHGSET_IDX_NAME):
            log.info('Running full index build as the changeset'
                     ' index does not exist')
        else:
            self.initial = False

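The constructor's build-versus-update decision, distilled into a standalone check (a sketch; the index location is illustrative, the constants are the ones imported above):

import os
from whoosh.index import exists_in
from rhodecode.lib.indexers import IDX_NAME, CHGSET_IDX_NAME

index_location = '/var/rhodecode/index'  # illustrative
# a full build is forced when the index dir or either index is missing
full_build = (not os.path.isdir(index_location)
              or not exists_in(index_location, IDX_NAME)
              or not exists_in(index_location, CHGSET_IDX_NAME))
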
    def get_paths(self, repo):
        """
        Recursively walk from the root dir and return a set of all paths
        in that dir, based on the repository walk function
        """
        index_paths_ = set()
        try:
            tip = repo.get_changeset('tip')
            for _topnode, _dirs, files in tip.walk('/'):
                for f in files:
                    index_paths_.add(jn(repo.path, f.path))

        except RepositoryError:
            log.debug(traceback.format_exc())
        return index_paths_

    def get_node(self, repo, path):
        n_path = path[len(repo.path) + 1:]
        node = repo.get_changeset().get_node(n_path)
        return node

    def get_node_mtime(self, node):
        return mktime(node.last_changeset.date.timetuple())

    def add_doc(self, writer, path, repo, repo_name):
        """
        Add a doc to the writer; this function itself fetches the data
        from the instance of the vcs backend
        """

        node = self.get_node(repo, path)
        indexed = indexed_w_content = 0
        # we just index the content of chosen files, and skip binary files
        if node.extension in INDEX_EXTENSIONS and not node.is_binary:
            u_content = node.content
            if not isinstance(u_content, unicode):
                log.warning(' >> %s Could not get this content as unicode;'
                            ' replacing with empty content' % path)
                u_content = u''
            else:
                log.debug(' >> %s [WITH CONTENT]' % path)
                indexed_w_content += 1

        else:
            log.debug(' >> %s' % path)
            # just index file name without its content
            u_content = u''
            indexed += 1

        p = safe_unicode(path)
        writer.add_document(
            fileid=p,
            owner=unicode(repo.contact),
            repository=safe_unicode(repo_name),
            path=p,
            content=u_content,
            modtime=self.get_node_mtime(node),
            extension=node.extension
        )
        return indexed, indexed_w_content

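The gate that decides between content and name-only indexing, restated as a helper (a sketch; node stands for a vcs file node exposing the extension and is_binary attributes used above, and INDEX_EXTENSIONS is the whitelist imported from rhodecode.config.conf):

def should_index_content(node, extensions=INDEX_EXTENSIONS):
    # mirrors add_doc(): index file content only for whitelisted,
    # non-binary files; everything else is indexed by name alone
    return node.extension in extensions and not node.is_binary
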
    def index_changesets(self, writer, repo_name, repo, start_rev=None):
        """
        Add all changesets in the vcs repo starting at start_rev
        to the index writer

        :param writer: the whoosh index writer to add to
        :param repo_name: name of the repository from whence the
          changeset originates, including the repository group
        :param repo: the vcs repository instance to index changesets for;
          the presumption is the repo has changesets to index
        :param start_rev: the full sha id to start indexing from;
          if start_rev is None then index from the first changeset in
          the repo
        """

        if start_rev is None:
            start_rev = repo[0].raw_id

        log.debug('indexing changesets in %s starting at rev: %s' %
                  (repo_name, start_rev))

        indexed = 0
        for cs in repo.get_changesets(start=start_rev):
            log.debug(' >> %s' % cs)
            writer.add_document(
                raw_id=unicode(cs.raw_id),
                owner=unicode(repo.contact),
                date=cs._timestamp,
                repository=safe_unicode(repo_name),
                author=cs.author,
                message=cs.message,
                last=cs.last,
                added=u' '.join([safe_unicode(node.path) for node in cs.added]).lower(),
                removed=u' '.join([safe_unicode(node.path) for node in cs.removed]).lower(),
                changed=u' '.join([safe_unicode(node.path) for node in cs.changed]).lower(),
                # use a distinct name so the comprehension does not rebind
                # the loop variable cs (Python 2 list comps leak their vars)
                parents=u' '.join([p.raw_id for p in cs.parents]),
            )
            indexed += 1

        log.debug('indexed %d changesets for repo %s' % (indexed, repo_name))
        return indexed

    def index_files(self, file_idx_writer, repo_name, repo):
        """
        Index files for given repo_name

        :param file_idx_writer: the whoosh index writer to add to
        :param repo_name: name of the repository we're indexing
        :param repo: instance of vcs repo
        """
        i_cnt = iwc_cnt = 0
        log.debug('building index for [%s]' % repo.path)
        for idx_path in self.get_paths(repo):
            i, iwc = self.add_doc(file_idx_writer, idx_path, repo, repo_name)
            i_cnt += i
            iwc_cnt += iwc

        log.debug('added %s files (%s with content) for repo %s' %
                  (i_cnt + iwc_cnt, iwc_cnt, repo.path))
        return i_cnt, iwc_cnt

    def update_changeset_index(self):
        idx = open_dir(self.index_location, indexname=CHGSET_IDX_NAME)

        with idx.searcher() as searcher:
            writer = idx.writer()
            writer_is_dirty = False
            try:
                indexed_total = 0
                repo_name = None
                for repo_name, repo in self.repo_paths.items():
                    # skip indexing if there aren't any revs in the repo
                    num_of_revs = len(repo)
                    if num_of_revs < 1:
                        continue

                    qp = QueryParser('repository', schema=CHGSETS_SCHEMA)
                    q = qp.parse(u"last:t AND %s" % repo_name)

                    results = searcher.search(q)

                    # default to scanning the entire repo
                    last_rev = 0
                    start_id = None

                    if len(results) > 0:
                        # assuming that there is only one result; if not, this
                        # may require a full re-index
                        start_id = results[0]['raw_id']
                        last_rev = repo.get_changeset(revision=start_id).revision

                    # there are new changesets to index or a new repo to index
                    if last_rev == 0 or num_of_revs > last_rev + 1:
                        # delete the docs in the index for the previous
                        # last changeset(s)
                        for hit in results:
                            q = qp.parse(u"last:t AND %s AND raw_id:%s" %
                                         (repo_name, hit['raw_id']))
                            writer.delete_by_query(q)

                        # index from the previous last changeset + all new ones
                        indexed_total += self.index_changesets(writer,
                                                repo_name, repo, start_id)
                        writer_is_dirty = True
                        log.debug('indexed %s changesets for repo %s' %
                                  (indexed_total, repo_name))
            finally:
                if writer_is_dirty:
                    log.debug('>> COMMITTING CHANGES TO CHANGESET INDEX <<')
                    writer.commit(merge=True)
                    log.debug('>>> FINISHED REBUILDING CHANGESET INDEX <<<')
                else:
                    writer.cancel()
                    log.debug('>> NOTHING TO COMMIT TO CHANGESET INDEX <<')

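Outside the daemon, the same last:t bookkeeping can be inspected directly with Whoosh (a sketch; the index path and repository name are illustrative, the schema and index name are the constants imported above):

from whoosh.index import open_dir
from whoosh.qparser import QueryParser
from rhodecode.lib.indexers import CHGSETS_SCHEMA, CHGSET_IDX_NAME

idx = open_dir('/var/rhodecode/index', indexname=CHGSET_IDX_NAME)
qp = QueryParser('repository', schema=CHGSETS_SCHEMA)
with idx.searcher() as searcher:
    # each repo should carry exactly one doc flagged last:t -- its tip
    for hit in searcher.search(qp.parse(u'last:t AND myrepo')):
        print hit['raw_id']  # resume incremental indexing from here
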
    def update_file_index(self):
        log.debug((u'STARTING INCREMENTAL INDEXING UPDATE FOR EXTENSIONS %s '
                   'AND REPOS %s') % (INDEX_EXTENSIONS, self.repo_paths.keys()))

        idx = open_dir(self.index_location, indexname=self.indexname)
        # The set of all paths in the index
        indexed_paths = set()
        # The set of all paths we need to re-index
        to_index = set()

        writer = idx.writer()
        writer_is_dirty = False
        try:
            with idx.reader() as reader:

                # Loop over the stored fields in the index
                for fields in reader.all_stored_fields():
                    indexed_path = fields['path']
                    indexed_repo_path = fields['repository']
                    indexed_paths.add(indexed_path)

                    if indexed_repo_path not in self.filtered_repo_update_paths:
                        continue

                    repo = self.repo_paths[indexed_repo_path]

                    try:
                        node = self.get_node(repo, indexed_path)
                        # Check if this file was changed since it was indexed
                        indexed_time = fields['modtime']
                        mtime = self.get_node_mtime(node)
                        if mtime > indexed_time:
                            # The file has changed: delete it and add it to
                            # the list of files to reindex
                            log.debug('adding to reindex list %s mtime: %s vs %s' %
                                      (indexed_path, mtime, indexed_time))
                            writer.delete_by_term('fileid', indexed_path)
                            writer_is_dirty = True

                            to_index.add(indexed_path)
                    except (ChangesetError, NodeDoesNotExistError):
                        # This file was deleted since it was indexed
                        log.debug('removing from index %s' % indexed_path)
                        writer.delete_by_term('path', indexed_path)
                        writer_is_dirty = True

            # Loop over the files in the filesystem
            # Assume we have a function that gathers the filenames of the
            # documents to be indexed
            ri_cnt_total = 0  # indexed
            riwc_cnt_total = 0  # indexed with content
            for repo_name, repo in self.repo_paths.items():
                # skip indexing if there aren't any revisions
                if len(repo) < 1:
                    continue
                ri_cnt = 0  # indexed
                riwc_cnt = 0  # indexed with content
                for path in self.get_paths(repo):
                    path = safe_unicode(path)
                    if path in to_index or path not in indexed_paths:

                        # This is either a file that's changed, or a new file
                        # that wasn't indexed before. So index it!
                        i, iwc = self.add_doc(writer, path, repo, repo_name)
                        writer_is_dirty = True
                        log.debug('re-indexing %s' % path)
                        ri_cnt += i
                        ri_cnt_total += 1
                        riwc_cnt += iwc
                        riwc_cnt_total += iwc
                log.debug('added %s files (%s with content) for repo %s' %
                          (ri_cnt + riwc_cnt, riwc_cnt, repo.path))
            log.debug('indexed %s files in total and %s with content' %
                      (ri_cnt_total, riwc_cnt_total))
        finally:
            if writer_is_dirty:
                log.debug('>> COMMITTING CHANGES TO FILE INDEX <<')
                writer.commit(merge=True)
                log.debug('>>> FINISHED REBUILDING FILE INDEX <<<')
            else:
                log.debug('>> NOTHING TO COMMIT TO FILE INDEX <<')
                writer.cancel()

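The staleness test at the heart of the loop above, isolated (a sketch; node mirrors the vcs node objects that get_node_mtime() receives, stored_modtime is the modtime field read back from the index):

from time import mktime

def needs_reindex(node, stored_modtime):
    # a file is stale when its last changeset is newer than the
    # modtime recorded in the index on the previous run
    mtime = mktime(node.last_changeset.date.timetuple())
    return mtime > stored_modtime
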
    def build_indexes(self):
        if os.path.exists(self.index_location):
            log.debug('removing previous index')
            rmtree(self.index_location)

        if not os.path.exists(self.index_location):
            os.mkdir(self.index_location)

        chgset_idx = create_in(self.index_location, CHGSETS_SCHEMA,
                               indexname=CHGSET_IDX_NAME)
        chgset_idx_writer = chgset_idx.writer()

        file_idx = create_in(self.index_location, SCHEMA, indexname=IDX_NAME)
        file_idx_writer = file_idx.writer()
        log.debug('BUILDING INDEX FOR EXTENSIONS %s '
                  'AND REPOS %s' % (INDEX_EXTENSIONS, self.repo_paths.keys()))

        for repo_name, repo in self.repo_paths.items():
            # skip indexing if there aren't any revisions
            if len(repo) < 1:
                continue

            self.index_files(file_idx_writer, repo_name, repo)
            self.index_changesets(chgset_idx_writer, repo_name, repo)

        log.debug('>> COMMITTING CHANGES <<')
        file_idx_writer.commit(merge=True)
        chgset_idx_writer.commit(merge=True)
        log.debug('>>> FINISHED BUILDING INDEX <<<')

    def update_indexes(self):
        self.update_file_index()
        self.update_changeset_index()

    def run(self, full_index=False):
        """Run daemon"""
        if full_index or self.initial:
            self.build_indexes()
        else:
            self.update_indexes()
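
A hypothetical end-to-end invocation (a sketch; both locations are illustrative, and the non-ascii byte-string repo name is exactly the case this commit repairs):

daemon = WhooshIndexingDaemon(
    index_location='/var/rhodecode/index',
    repo_location='/var/rhodecode/repos',
    repo_list=['za\xc5\xbc\xc3\xb3\xc5\x82\xc4\x87'],  # normalized by safe_unicode
)
daemon.run(full_index=True)   # first run: build both indexes from scratch
daemon.run()                  # later runs: incremental update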