##// END OF EJS Templates
fixed #851 and #563: make-index crashes on non-ascii files
marcink -
r3921:932c84e8 beta
parent child Browse files
Show More
@@ -1,430 +1,439 b''
1 # -*- coding: utf-8 -*-
1 # -*- coding: utf-8 -*-
2 """
2 """
3 rhodecode.lib.indexers.daemon
3 rhodecode.lib.indexers.daemon
4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
5
5
6 A daemon will read from task table and run tasks
6 A daemon will read from task table and run tasks
7
7
8 :created_on: Jan 26, 2010
8 :created_on: Jan 26, 2010
9 :author: marcink
9 :author: marcink
10 :copyright: (C) 2010-2012 Marcin Kuzminski <marcin@python-works.com>
10 :copyright: (C) 2010-2012 Marcin Kuzminski <marcin@python-works.com>
11 :license: GPLv3, see COPYING for more details.
11 :license: GPLv3, see COPYING for more details.
12 """
12 """
13 # This program is free software: you can redistribute it and/or modify
13 # This program is free software: you can redistribute it and/or modify
14 # it under the terms of the GNU General Public License as published by
14 # it under the terms of the GNU General Public License as published by
15 # the Free Software Foundation, either version 3 of the License, or
15 # the Free Software Foundation, either version 3 of the License, or
16 # (at your option) any later version.
16 # (at your option) any later version.
17 #
17 #
18 # This program is distributed in the hope that it will be useful,
18 # This program is distributed in the hope that it will be useful,
19 # but WITHOUT ANY WARRANTY; without even the implied warranty of
19 # but WITHOUT ANY WARRANTY; without even the implied warranty of
20 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 # GNU General Public License for more details.
21 # GNU General Public License for more details.
22 #
22 #
23 # You should have received a copy of the GNU General Public License
23 # You should have received a copy of the GNU General Public License
24 # along with this program. If not, see <http://www.gnu.org/licenses/>.
24 # along with this program. If not, see <http://www.gnu.org/licenses/>.
25 from __future__ import with_statement
25 from __future__ import with_statement
26
26
27 import os
27 import os
28 import sys
28 import sys
29 import logging
29 import logging
30 import traceback
30 import traceback
31
31
32 from shutil import rmtree
32 from shutil import rmtree
33 from time import mktime
33 from time import mktime
34
34
35 from os.path import dirname as dn
35 from os.path import dirname as dn
36 from os.path import join as jn
36 from os.path import join as jn
37
37
38 #to get the rhodecode import
38 #to get the rhodecode import
39 project_path = dn(dn(dn(dn(os.path.realpath(__file__)))))
39 project_path = dn(dn(dn(dn(os.path.realpath(__file__)))))
40 sys.path.append(project_path)
40 sys.path.append(project_path)
41
41
42 from rhodecode.config.conf import INDEX_EXTENSIONS
42 from rhodecode.config.conf import INDEX_EXTENSIONS
43 from rhodecode.model.scm import ScmModel
43 from rhodecode.model.scm import ScmModel
44 from rhodecode.model.db import Repository
44 from rhodecode.model.db import Repository
45 from rhodecode.lib.utils2 import safe_unicode, safe_str
45 from rhodecode.lib.utils2 import safe_unicode, safe_str
46 from rhodecode.lib.indexers import SCHEMA, IDX_NAME, CHGSETS_SCHEMA, \
46 from rhodecode.lib.indexers import SCHEMA, IDX_NAME, CHGSETS_SCHEMA, \
47 CHGSET_IDX_NAME
47 CHGSET_IDX_NAME
48
48
49 from rhodecode.lib.vcs.exceptions import ChangesetError, RepositoryError, \
49 from rhodecode.lib.vcs.exceptions import ChangesetError, RepositoryError, \
50 NodeDoesNotExistError
50 NodeDoesNotExistError
51
51
52 from whoosh.index import create_in, open_dir, exists_in
52 from whoosh.index import create_in, open_dir, exists_in
53 from whoosh.query import *
53 from whoosh.query import *
54 from whoosh.qparser import QueryParser
54 from whoosh.qparser import QueryParser
55
55
56 log = logging.getLogger('whoosh_indexer')
56 log = logging.getLogger('whoosh_indexer')
57
57
58
58
class WhooshIndexingDaemon(object):
    """
    Daemon for atomic indexing jobs
    """

    def __init__(self, indexname=IDX_NAME, index_location=None,
                 repo_location=None, sa=None, repo_list=None,
                 repo_update_list=None):
        """
        :param indexname: name of the whoosh file-content index
        :param index_location: directory holding the whoosh indexes (required)
        :param repo_location: root location of the repositories (required)
        :param sa: optional sqlalchemy session passed to ScmModel
        :param repo_list: if given, restrict indexing to these repo names
        :param repo_update_list: if given, restrict incremental updates to
            these repo names
        """
        self.indexname = indexname

        self.index_location = index_location
        if not index_location:
            raise Exception('You have to provide index location')

        self.repo_location = repo_location
        if not repo_location:
            raise Exception('You have to provide repositories location')

        self.repo_paths = ScmModel(sa).repo_scan(self.repo_location)

        #filter repo list
        if repo_list:
            #Fix non-ascii repo names to unicode
            repo_list = map(safe_unicode, repo_list)
            self.filtered_repo_paths = {}
            for repo_name, repo in self.repo_paths.items():
                if repo_name in repo_list:
                    self.filtered_repo_paths[repo_name] = repo

            self.repo_paths = self.filtered_repo_paths

        #filter update repo list
        # NOTE: initialized once here; a duplicate re-initialization inside
        # the ``if`` branch was removed
        self.filtered_repo_update_paths = {}
        if repo_update_list:
            for repo_name, repo in self.repo_paths.items():
                if repo_name in repo_update_list:
                    self.filtered_repo_update_paths[repo_name] = repo
            self.repo_paths = self.filtered_repo_update_paths

        # self.initial decides between a full build and an incremental
        # update in run(); any missing index directory/segment forces full
        self.initial = True
        if not os.path.isdir(self.index_location):
            os.makedirs(self.index_location)
            log.info('Cannot run incremental index since it does not '
                     'yet exist running full build')
        elif not exists_in(self.index_location, IDX_NAME):
            log.info('Running full index build as the file content '
                     'index does not exist')
        elif not exists_in(self.index_location, CHGSET_IDX_NAME):
            log.info('Running full index build as the changeset '
                     'index does not exist')
        else:
            self.initial = False
113 def _get_index_revision(self, repo):
113 def _get_index_revision(self, repo):
114 db_repo = Repository.get_by_repo_name(repo.name)
114 db_repo = Repository.get_by_repo_name(repo.name)
115 landing_rev = 'tip'
115 landing_rev = 'tip'
116 if db_repo:
116 if db_repo:
117 landing_rev = db_repo.landing_rev
117 landing_rev = db_repo.landing_rev
118 return landing_rev
118 return landing_rev
119
119
120 def _get_index_changeset(self, repo):
120 def _get_index_changeset(self, repo):
121 index_rev = self._get_index_revision(repo)
121 index_rev = self._get_index_revision(repo)
122 cs = repo.get_changeset(index_rev)
122 cs = repo.get_changeset(index_rev)
123 return cs
123 return cs
124
124
125 def get_paths(self, repo):
125 def get_paths(self, repo):
126 """
126 """
127 recursive walk in root dir and return a set of all path in that dir
127 recursive walk in root dir and return a set of all path in that dir
128 based on repository walk function
128 based on repository walk function
129 """
129 """
130 index_paths_ = set()
130 index_paths_ = set()
131 try:
131 try:
132 cs = self._get_index_changeset(repo)
132 cs = self._get_index_changeset(repo)
133 for _topnode, _dirs, files in cs.walk('/'):
133 for _topnode, _dirs, files in cs.walk('/'):
134 for f in files:
134 for f in files:
135 index_paths_.add(jn(safe_str(repo.path), safe_str(f.path)))
135 index_paths_.add(jn(safe_str(repo.path), safe_str(f.path)))
136
136
137 except RepositoryError:
137 except RepositoryError:
138 log.debug(traceback.format_exc())
138 log.debug(traceback.format_exc())
139 pass
139 pass
140 return index_paths_
140 return index_paths_
141
141
142 def get_node(self, repo, path):
142 def get_node(self, repo, path):
143 n_path = path[len(repo.path) + 1:]
143 """
144 gets a filenode based on given full path.It operates on string for
145 hg git compatability.
146
147 :param repo: scm repo instance
148 :param path: full path including root location
149 :return: FileNode
150 """
151 root_path = safe_str(repo.path)+'/'
152 parts = safe_str(path).partition(root_path)
144 cs = self._get_index_changeset(repo)
153 cs = self._get_index_changeset(repo)
145 node = cs.get_node(n_path)
154 node = cs.get_node(parts[-1])
146 return node
155 return node
147
156
148 def get_node_mtime(self, node):
157 def get_node_mtime(self, node):
149 return mktime(node.last_changeset.date.timetuple())
158 return mktime(node.last_changeset.date.timetuple())
150
159
151 def add_doc(self, writer, path, repo, repo_name):
160 def add_doc(self, writer, path, repo, repo_name):
152 """
161 """
153 Adding doc to writer this function itself fetches data from
162 Adding doc to writer this function itself fetches data from
154 the instance of vcs backend
163 the instance of vcs backend
155 """
164 """
156
165
157 node = self.get_node(repo, path)
166 node = self.get_node(repo, path)
158 indexed = indexed_w_content = 0
167 indexed = indexed_w_content = 0
159 # we just index the content of chosen files, and skip binary files
168 # we just index the content of chosen files, and skip binary files
160 if node.extension in INDEX_EXTENSIONS and not node.is_binary:
169 if node.extension in INDEX_EXTENSIONS and not node.is_binary:
161 u_content = node.content
170 u_content = node.content
162 if not isinstance(u_content, unicode):
171 if not isinstance(u_content, unicode):
163 log.warning(' >> %s Could not get this content as unicode '
172 log.warning(' >> %s Could not get this content as unicode '
164 'replacing with empty content' % path)
173 'replacing with empty content' % path)
165 u_content = u''
174 u_content = u''
166 else:
175 else:
167 log.debug(' >> %s [WITH CONTENT]' % path)
176 log.debug(' >> %s [WITH CONTENT]' % path)
168 indexed_w_content += 1
177 indexed_w_content += 1
169
178
170 else:
179 else:
171 log.debug(' >> %s' % path)
180 log.debug(' >> %s' % path)
172 # just index file name without it's content
181 # just index file name without it's content
173 u_content = u''
182 u_content = u''
174 indexed += 1
183 indexed += 1
175
184
176 p = safe_unicode(path)
185 p = safe_unicode(path)
177 writer.add_document(
186 writer.add_document(
178 fileid=p,
187 fileid=p,
179 owner=unicode(repo.contact),
188 owner=unicode(repo.contact),
180 repository=safe_unicode(repo_name),
189 repository=safe_unicode(repo_name),
181 path=p,
190 path=p,
182 content=u_content,
191 content=u_content,
183 modtime=self.get_node_mtime(node),
192 modtime=self.get_node_mtime(node),
184 extension=node.extension
193 extension=node.extension
185 )
194 )
186 return indexed, indexed_w_content
195 return indexed, indexed_w_content
187
196
188 def index_changesets(self, writer, repo_name, repo, start_rev=None):
197 def index_changesets(self, writer, repo_name, repo, start_rev=None):
189 """
198 """
190 Add all changeset in the vcs repo starting at start_rev
199 Add all changeset in the vcs repo starting at start_rev
191 to the index writer
200 to the index writer
192
201
193 :param writer: the whoosh index writer to add to
202 :param writer: the whoosh index writer to add to
194 :param repo_name: name of the repository from whence the
203 :param repo_name: name of the repository from whence the
195 changeset originates including the repository group
204 changeset originates including the repository group
196 :param repo: the vcs repository instance to index changesets for,
205 :param repo: the vcs repository instance to index changesets for,
197 the presumption is the repo has changesets to index
206 the presumption is the repo has changesets to index
198 :param start_rev=None: the full sha id to start indexing from
207 :param start_rev=None: the full sha id to start indexing from
199 if start_rev is None then index from the first changeset in
208 if start_rev is None then index from the first changeset in
200 the repo
209 the repo
201 """
210 """
202
211
203 if start_rev is None:
212 if start_rev is None:
204 start_rev = repo[0].raw_id
213 start_rev = repo[0].raw_id
205
214
206 log.debug('indexing changesets in %s starting at rev: %s' %
215 log.debug('indexing changesets in %s starting at rev: %s' %
207 (repo_name, start_rev))
216 (repo_name, start_rev))
208
217
209 indexed = 0
218 indexed = 0
210 for cs in repo.get_changesets(start=start_rev):
219 for cs in repo.get_changesets(start=start_rev):
211 log.debug(' >> %s' % cs)
220 log.debug(' >> %s' % cs)
212 writer.add_document(
221 writer.add_document(
213 raw_id=unicode(cs.raw_id),
222 raw_id=unicode(cs.raw_id),
214 owner=unicode(repo.contact),
223 owner=unicode(repo.contact),
215 date=cs._timestamp,
224 date=cs._timestamp,
216 repository=safe_unicode(repo_name),
225 repository=safe_unicode(repo_name),
217 author=cs.author,
226 author=cs.author,
218 message=cs.message,
227 message=cs.message,
219 last=cs.last,
228 last=cs.last,
220 added=u' '.join([safe_unicode(node.path) for node in cs.added]).lower(),
229 added=u' '.join([safe_unicode(node.path) for node in cs.added]).lower(),
221 removed=u' '.join([safe_unicode(node.path) for node in cs.removed]).lower(),
230 removed=u' '.join([safe_unicode(node.path) for node in cs.removed]).lower(),
222 changed=u' '.join([safe_unicode(node.path) for node in cs.changed]).lower(),
231 changed=u' '.join([safe_unicode(node.path) for node in cs.changed]).lower(),
223 parents=u' '.join([cs.raw_id for cs in cs.parents]),
232 parents=u' '.join([cs.raw_id for cs in cs.parents]),
224 )
233 )
225 indexed += 1
234 indexed += 1
226
235
227 log.debug('indexed %d changesets for repo %s' % (indexed, repo_name))
236 log.debug('indexed %d changesets for repo %s' % (indexed, repo_name))
228 return indexed
237 return indexed
229
238
230 def index_files(self, file_idx_writer, repo_name, repo):
239 def index_files(self, file_idx_writer, repo_name, repo):
231 """
240 """
232 Index files for given repo_name
241 Index files for given repo_name
233
242
234 :param file_idx_writer: the whoosh index writer to add to
243 :param file_idx_writer: the whoosh index writer to add to
235 :param repo_name: name of the repository we're indexing
244 :param repo_name: name of the repository we're indexing
236 :param repo: instance of vcs repo
245 :param repo: instance of vcs repo
237 """
246 """
238 i_cnt = iwc_cnt = 0
247 i_cnt = iwc_cnt = 0
239 log.debug('building index for %s @revision:%s' % (repo.path,
248 log.debug('building index for %s @revision:%s' % (repo.path,
240 self._get_index_revision(repo)))
249 self._get_index_revision(repo)))
241 for idx_path in self.get_paths(repo):
250 for idx_path in self.get_paths(repo):
242 i, iwc = self.add_doc(file_idx_writer, idx_path, repo, repo_name)
251 i, iwc = self.add_doc(file_idx_writer, idx_path, repo, repo_name)
243 i_cnt += i
252 i_cnt += i
244 iwc_cnt += iwc
253 iwc_cnt += iwc
245
254
246 log.debug('added %s files %s with content for repo %s' %
255 log.debug('added %s files %s with content for repo %s' %
247 (i_cnt + iwc_cnt, iwc_cnt, repo.path))
256 (i_cnt + iwc_cnt, iwc_cnt, repo.path))
248 return i_cnt, iwc_cnt
257 return i_cnt, iwc_cnt
249
258
250 def update_changeset_index(self):
259 def update_changeset_index(self):
251 idx = open_dir(self.index_location, indexname=CHGSET_IDX_NAME)
260 idx = open_dir(self.index_location, indexname=CHGSET_IDX_NAME)
252
261
253 with idx.searcher() as searcher:
262 with idx.searcher() as searcher:
254 writer = idx.writer()
263 writer = idx.writer()
255 writer_is_dirty = False
264 writer_is_dirty = False
256 try:
265 try:
257 indexed_total = 0
266 indexed_total = 0
258 repo_name = None
267 repo_name = None
259 for repo_name, repo in self.repo_paths.items():
268 for repo_name, repo in self.repo_paths.items():
260 # skip indexing if there aren't any revs in the repo
269 # skip indexing if there aren't any revs in the repo
261 num_of_revs = len(repo)
270 num_of_revs = len(repo)
262 if num_of_revs < 1:
271 if num_of_revs < 1:
263 continue
272 continue
264
273
265 qp = QueryParser('repository', schema=CHGSETS_SCHEMA)
274 qp = QueryParser('repository', schema=CHGSETS_SCHEMA)
266 q = qp.parse(u"last:t AND %s" % repo_name)
275 q = qp.parse(u"last:t AND %s" % repo_name)
267
276
268 results = searcher.search(q)
277 results = searcher.search(q)
269
278
270 # default to scanning the entire repo
279 # default to scanning the entire repo
271 last_rev = 0
280 last_rev = 0
272 start_id = None
281 start_id = None
273
282
274 if len(results) > 0:
283 if len(results) > 0:
275 # assuming that there is only one result, if not this
284 # assuming that there is only one result, if not this
276 # may require a full re-index.
285 # may require a full re-index.
277 start_id = results[0]['raw_id']
286 start_id = results[0]['raw_id']
278 last_rev = repo.get_changeset(revision=start_id).revision
287 last_rev = repo.get_changeset(revision=start_id).revision
279
288
280 # there are new changesets to index or a new repo to index
289 # there are new changesets to index or a new repo to index
281 if last_rev == 0 or num_of_revs > last_rev + 1:
290 if last_rev == 0 or num_of_revs > last_rev + 1:
282 # delete the docs in the index for the previous
291 # delete the docs in the index for the previous
283 # last changeset(s)
292 # last changeset(s)
284 for hit in results:
293 for hit in results:
285 q = qp.parse(u"last:t AND %s AND raw_id:%s" %
294 q = qp.parse(u"last:t AND %s AND raw_id:%s" %
286 (repo_name, hit['raw_id']))
295 (repo_name, hit['raw_id']))
287 writer.delete_by_query(q)
296 writer.delete_by_query(q)
288
297
289 # index from the previous last changeset + all new ones
298 # index from the previous last changeset + all new ones
290 indexed_total += self.index_changesets(writer,
299 indexed_total += self.index_changesets(writer,
291 repo_name, repo, start_id)
300 repo_name, repo, start_id)
292 writer_is_dirty = True
301 writer_is_dirty = True
293 log.debug('indexed %s changesets for repo %s' % (
302 log.debug('indexed %s changesets for repo %s' % (
294 indexed_total, repo_name)
303 indexed_total, repo_name)
295 )
304 )
296 finally:
305 finally:
297 if writer_is_dirty:
306 if writer_is_dirty:
298 log.debug('>> COMMITING CHANGES TO CHANGESET INDEX<<')
307 log.debug('>> COMMITING CHANGES TO CHANGESET INDEX<<')
299 writer.commit(merge=True)
308 writer.commit(merge=True)
300 log.debug('>>> FINISHED REBUILDING CHANGESET INDEX <<<')
309 log.debug('>>> FINISHED REBUILDING CHANGESET INDEX <<<')
301 else:
310 else:
302 log.debug('>> NOTHING TO COMMIT TO CHANGESET INDEX<<')
311 log.debug('>> NOTHING TO COMMIT TO CHANGESET INDEX<<')
303
312
304 def update_file_index(self):
313 def update_file_index(self):
305 log.debug((u'STARTING INCREMENTAL INDEXING UPDATE FOR EXTENSIONS %s '
314 log.debug((u'STARTING INCREMENTAL INDEXING UPDATE FOR EXTENSIONS %s '
306 'AND REPOS %s') % (INDEX_EXTENSIONS, self.repo_paths.keys()))
315 'AND REPOS %s') % (INDEX_EXTENSIONS, self.repo_paths.keys()))
307
316
308 idx = open_dir(self.index_location, indexname=self.indexname)
317 idx = open_dir(self.index_location, indexname=self.indexname)
309 # The set of all paths in the index
318 # The set of all paths in the index
310 indexed_paths = set()
319 indexed_paths = set()
311 # The set of all paths we need to re-index
320 # The set of all paths we need to re-index
312 to_index = set()
321 to_index = set()
313
322
314 writer = idx.writer()
323 writer = idx.writer()
315 writer_is_dirty = False
324 writer_is_dirty = False
316 try:
325 try:
317 with idx.reader() as reader:
326 with idx.reader() as reader:
318
327
319 # Loop over the stored fields in the index
328 # Loop over the stored fields in the index
320 for fields in reader.all_stored_fields():
329 for fields in reader.all_stored_fields():
321 indexed_path = fields['path']
330 indexed_path = fields['path']
322 indexed_repo_path = fields['repository']
331 indexed_repo_path = fields['repository']
323 indexed_paths.add(indexed_path)
332 indexed_paths.add(indexed_path)
324
333
325 if not indexed_repo_path in self.filtered_repo_update_paths:
334 if not indexed_repo_path in self.filtered_repo_update_paths:
326 continue
335 continue
327
336
328 repo = self.repo_paths[indexed_repo_path]
337 repo = self.repo_paths[indexed_repo_path]
329
338
330 try:
339 try:
331 node = self.get_node(repo, indexed_path)
340 node = self.get_node(repo, indexed_path)
332 # Check if this file was changed since it was indexed
341 # Check if this file was changed since it was indexed
333 indexed_time = fields['modtime']
342 indexed_time = fields['modtime']
334 mtime = self.get_node_mtime(node)
343 mtime = self.get_node_mtime(node)
335 if mtime > indexed_time:
344 if mtime > indexed_time:
336 # The file has changed, delete it and add it to
345 # The file has changed, delete it and add it to
337 # the list of files to reindex
346 # the list of files to reindex
338 log.debug(
347 log.debug(
339 'adding to reindex list %s mtime: %s vs %s' % (
348 'adding to reindex list %s mtime: %s vs %s' % (
340 indexed_path, mtime, indexed_time)
349 indexed_path, mtime, indexed_time)
341 )
350 )
342 writer.delete_by_term('fileid', indexed_path)
351 writer.delete_by_term('fileid', indexed_path)
343 writer_is_dirty = True
352 writer_is_dirty = True
344
353
345 to_index.add(indexed_path)
354 to_index.add(indexed_path)
346 except (ChangesetError, NodeDoesNotExistError):
355 except (ChangesetError, NodeDoesNotExistError):
347 # This file was deleted since it was indexed
356 # This file was deleted since it was indexed
348 log.debug('removing from index %s' % indexed_path)
357 log.debug('removing from index %s' % indexed_path)
349 writer.delete_by_term('path', indexed_path)
358 writer.delete_by_term('path', indexed_path)
350 writer_is_dirty = True
359 writer_is_dirty = True
351
360
352 # Loop over the files in the filesystem
361 # Loop over the files in the filesystem
353 # Assume we have a function that gathers the filenames of the
362 # Assume we have a function that gathers the filenames of the
354 # documents to be indexed
363 # documents to be indexed
355 ri_cnt_total = 0 # indexed
364 ri_cnt_total = 0 # indexed
356 riwc_cnt_total = 0 # indexed with content
365 riwc_cnt_total = 0 # indexed with content
357 for repo_name, repo in self.repo_paths.items():
366 for repo_name, repo in self.repo_paths.items():
358 # skip indexing if there aren't any revisions
367 # skip indexing if there aren't any revisions
359 if len(repo) < 1:
368 if len(repo) < 1:
360 continue
369 continue
361 ri_cnt = 0 # indexed
370 ri_cnt = 0 # indexed
362 riwc_cnt = 0 # indexed with content
371 riwc_cnt = 0 # indexed with content
363 for path in self.get_paths(repo):
372 for path in self.get_paths(repo):
364 path = safe_unicode(path)
373 path = safe_unicode(path)
365 if path in to_index or path not in indexed_paths:
374 if path in to_index or path not in indexed_paths:
366
375
367 # This is either a file that's changed, or a new file
376 # This is either a file that's changed, or a new file
368 # that wasn't indexed before. So index it!
377 # that wasn't indexed before. So index it!
369 i, iwc = self.add_doc(writer, path, repo, repo_name)
378 i, iwc = self.add_doc(writer, path, repo, repo_name)
370 writer_is_dirty = True
379 writer_is_dirty = True
371 log.debug('re indexing %s' % path)
380 log.debug('re indexing %s' % path)
372 ri_cnt += i
381 ri_cnt += i
373 ri_cnt_total += 1
382 ri_cnt_total += 1
374 riwc_cnt += iwc
383 riwc_cnt += iwc
375 riwc_cnt_total += iwc
384 riwc_cnt_total += iwc
376 log.debug('added %s files %s with content for repo %s' % (
385 log.debug('added %s files %s with content for repo %s' % (
377 ri_cnt + riwc_cnt, riwc_cnt, repo.path)
386 ri_cnt + riwc_cnt, riwc_cnt, repo.path)
378 )
387 )
379 log.debug('indexed %s files in total and %s with content' % (
388 log.debug('indexed %s files in total and %s with content' % (
380 ri_cnt_total, riwc_cnt_total)
389 ri_cnt_total, riwc_cnt_total)
381 )
390 )
382 finally:
391 finally:
383 if writer_is_dirty:
392 if writer_is_dirty:
384 log.debug('>> COMMITING CHANGES TO FILE INDEX <<')
393 log.debug('>> COMMITING CHANGES TO FILE INDEX <<')
385 writer.commit(merge=True)
394 writer.commit(merge=True)
386 log.debug('>>> FINISHED REBUILDING FILE INDEX <<<')
395 log.debug('>>> FINISHED REBUILDING FILE INDEX <<<')
387 else:
396 else:
388 log.debug('>> NOTHING TO COMMIT TO FILE INDEX <<')
397 log.debug('>> NOTHING TO COMMIT TO FILE INDEX <<')
389 writer.cancel()
398 writer.cancel()
390
399
391 def build_indexes(self):
400 def build_indexes(self):
392 if os.path.exists(self.index_location):
401 if os.path.exists(self.index_location):
393 log.debug('removing previous index')
402 log.debug('removing previous index')
394 rmtree(self.index_location)
403 rmtree(self.index_location)
395
404
396 if not os.path.exists(self.index_location):
405 if not os.path.exists(self.index_location):
397 os.mkdir(self.index_location)
406 os.mkdir(self.index_location)
398
407
399 chgset_idx = create_in(self.index_location, CHGSETS_SCHEMA,
408 chgset_idx = create_in(self.index_location, CHGSETS_SCHEMA,
400 indexname=CHGSET_IDX_NAME)
409 indexname=CHGSET_IDX_NAME)
401 chgset_idx_writer = chgset_idx.writer()
410 chgset_idx_writer = chgset_idx.writer()
402
411
403 file_idx = create_in(self.index_location, SCHEMA, indexname=IDX_NAME)
412 file_idx = create_in(self.index_location, SCHEMA, indexname=IDX_NAME)
404 file_idx_writer = file_idx.writer()
413 file_idx_writer = file_idx.writer()
405 log.debug('BUILDING INDEX FOR EXTENSIONS %s '
414 log.debug('BUILDING INDEX FOR EXTENSIONS %s '
406 'AND REPOS %s' % (INDEX_EXTENSIONS, self.repo_paths.keys()))
415 'AND REPOS %s' % (INDEX_EXTENSIONS, self.repo_paths.keys()))
407
416
408 for repo_name, repo in self.repo_paths.items():
417 for repo_name, repo in self.repo_paths.items():
409 # skip indexing if there aren't any revisions
418 # skip indexing if there aren't any revisions
410 if len(repo) < 1:
419 if len(repo) < 1:
411 continue
420 continue
412
421
413 self.index_files(file_idx_writer, repo_name, repo)
422 self.index_files(file_idx_writer, repo_name, repo)
414 self.index_changesets(chgset_idx_writer, repo_name, repo)
423 self.index_changesets(chgset_idx_writer, repo_name, repo)
415
424
416 log.debug('>> COMMITING CHANGES <<')
425 log.debug('>> COMMITING CHANGES <<')
417 file_idx_writer.commit(merge=True)
426 file_idx_writer.commit(merge=True)
418 chgset_idx_writer.commit(merge=True)
427 chgset_idx_writer.commit(merge=True)
419 log.debug('>>> FINISHED BUILDING INDEX <<<')
428 log.debug('>>> FINISHED BUILDING INDEX <<<')
420
429
421 def update_indexes(self):
430 def update_indexes(self):
422 self.update_file_index()
431 self.update_file_index()
423 self.update_changeset_index()
432 self.update_changeset_index()
424
433
425 def run(self, full_index=False):
434 def run(self, full_index=False):
426 """Run daemon"""
435 """Run daemon"""
427 if full_index or self.initial:
436 if full_index or self.initial:
428 self.build_indexes()
437 self.build_indexes()
429 else:
438 else:
430 self.update_indexes()
439 self.update_indexes()
General Comments 0
You need to be logged in to leave comments. Login now