##// END OF EJS Templates
#469 added --update-only option to whoosh to re-index only given list...
#469 added --update-only option to whoosh to re-index only given list of repos in index

File last commit:

r2226:ce04e6ef beta
r2373:1828eb7f beta
Show More
rhodecode_crawler.py
184 lines | 4.8 KiB | text/x-python | PythonLexer
/ rhodecode / tests / rhodecode_crawler.py
renamed crawler to not be runned at tests, bug found by slestak.
r1374 # -*- coding: utf-8 -*-
"""
rhodecode.tests.test_crawer
~~~~~~~~~~~~~~~~~~~~~~~~~~~
Test for crawling a project for memory usage
This should be run as a regular script, together
with a watch script that will show memory usage.
auto white-space removal
r1818
renamed crawler to not be runned at tests, bug found by slestak.
r1374 watch -n1 ./rhodecode/tests/mem_watch
:created_on: Apr 21, 2010
:author: marcink
2012 copyrights
r1824 :copyright: (C) 2010-2012 Marcin Kuzminski <marcin@python-works.com>
renamed crawler to not be runned at tests, bug found by slestak.
r1374 :license: GPLv3, see COPYING for more details.
"""
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import cookielib
import urllib
import urllib2
import time
small improvements in rhodecode_crawler
r2211 import os
import sys
from os.path import join as jn
from os.path import dirname as dn
renamed crawler to not be runned at tests, bug found by slestak.
r1374
small improvements in rhodecode_crawler
# Make the project root (three directory levels above this file) importable,
# so the ``rhodecode`` imports below resolve when this file is run directly
# as a script.
__here__ = os.path.abspath(__file__)
__root__ = dn(dn(dn(__here__)))
# guard against piling up duplicate entries when the module is re-imported
if __root__ not in sys.path:
    sys.path.append(__root__)
Added VCS into rhodecode core for faster and easier deployments of new versions
r2007 from rhodecode.lib import vcs
improvements for rhodecode crawler
r2226 from rhodecode.lib.compat import OrderedSet
from rhodecode.lib.vcs.exceptions import RepositoryError
renamed crawler to not be runned at tests, bug found by slestak.
r1374
improvements for rhodecode crawler
# Number of complete crawl passes performed over every project.
PASES = 3
HOST = 'http://127.0.0.1'
PORT = 5000
BASE_URI = '%s:%s/' % (HOST, PORT)
# A single command-line argument overrides the default base URI.
if len(sys.argv) == 2:
    BASE_URI = sys.argv[1]
if not BASE_URI.endswith('/'):
    BASE_URI += '/'
print('Crawling @ %s' % BASE_URI)
# From here on BASE_URI is a %-format template (``BASE_URI % page``).
# Literal '%' characters coming from a user-supplied URI are escaped so
# that the later formatting does not fail or mangle the address.
BASE_URI = BASE_URI.replace('%', '%%') + '%s'
renamed crawler to not be runned at tests, bug found by slestak.
# Directory that holds the repositories to crawl.
PROJECT_PATH = jn('/', 'home', 'marcink', 'hg_repos')
# Names of the repositories (relative to PROJECT_PATH) that get crawled.
PROJECTS = [
    'linux-magx-pbranch',
    'CPython',
    'rhodecode_tip',
]
renamed crawler to not be runned at tests, bug found by slestak.
r1374
# Cookie-aware opener shared by all crawl functions in this module.
cj = cookielib.FileCookieJar('/tmp/rc_test_cookie.txt')
o = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
o.addheaders = [
    ('User-agent', 'rhodecode-crawler'),
    # language tags and q-values must not contain embedded whitespace
    ('Accept-Language', 'en-us,en;q=0.5'),
]

# also make it the default opener for plain ``urllib2.urlopen`` calls
urllib2.install_opener(o)
improvements for rhodecode crawler
def _get_repo(proj):
    """
    Resolve ``proj`` into a ``(repo, name)`` pair.

    :param proj: either a repository name (string), which is opened from
        ``PROJECT_PATH`` with ``vcs.get_repo``, or an already opened
        repository object exposing a ``name`` attribute
    :returns: tuple of (repository object, repository name)
    """
    if isinstance(proj, basestring):
        return vcs.get_repo(jn(PROJECT_PATH, proj)), proj
    return proj, proj.name
small improvements in rhodecode_crawler
def test_changelog_walk(proj, pages=100):
    """
    Request the changelog pages of a repository one by one and report the
    response size and timing of every request.

    :param proj: repository name or repository object (see ``_get_repo``)
    :param pages: number of changelog pages to request, starting at page 1
    """
    repo, proj = _get_repo(proj)
    # the path part is the same for every page, only the query changes
    page = '/'.join((proj, 'changelog',))
    total_time = 0
    requests = 0
    # pages are numbered from 1; request exactly ``pages`` of them so the
    # average below is computed over the real number of requests
    for i in range(1, pages + 1):
        full_uri = (BASE_URI % page) + '?' + urllib.urlencode({'page': i})
        started = time.time()
        response = o.open(full_uri)
        try:
            size = len(response.read())
        finally:
            # release the connection even when reading the body fails
            response.close()
        elapsed = time.time() - started
        total_time += elapsed
        requests += 1
        # time.time() deltas are seconds, so label them as such
        print('visited %s size:%s req:%s s' % (full_uri, size, elapsed))

    print('total_time %s' % total_time)
    # nothing to average when no request was made (pages <= 0)
    average = total_time / float(requests) if requests else 0
    print('average on req %s' % average)
small improvements in rhodecode_crawler
def test_changeset_walk(proj, limit=None):
    """
    Request the changeset page of every changeset yielded by iterating the
    repository and report the response size and timing of every request.

    :param proj: repository name or repository object (see ``_get_repo``)
    :param limit: maximum number of changesets to visit; a false value
        (``None`` or ``0``) means no limit
    """
    repo, proj = _get_repo(proj)
    print('processing %s' % jn(PROJECT_PATH, proj))
    total_time = 0
    # number of changesets actually requested so far
    cnt = 0
    for changeset in repo:
        # check the limit *before* counting, so exactly ``limit`` changesets
        # are visited and ``cnt`` always equals the number of requests made
        if limit and cnt >= limit:
            break
        cnt += 1
        raw_cs = '/'.join((proj, 'changeset', changeset.raw_id))
        full_uri = (BASE_URI % raw_cs)
        # the output separates the uri and the changeset with a backslash
        print('%s visiting %s\\%s' % (cnt, full_uri, changeset))
        started = time.time()
        response = o.open(full_uri)
        try:
            size = len(response.read())
        finally:
            # release the connection even when reading the body fails
            response.close()
        elapsed = time.time() - started
        total_time += elapsed
        # time.time() deltas are seconds, so label them as such
        print('%s visited %s\\%s size:%s req:%s s'
              % (cnt, full_uri, changeset, size, elapsed))

    print('total_time %s' % total_time)
    # an empty repository yields no requests; avoid dividing by zero
    average = total_time / float(cnt) if cnt else 0
    print('average on req %s' % average)
small improvements in rhodecode_crawler
def _collect_tip_paths(repo):
    """
    Collect the node paths found by walking the ``tip`` changeset.

    The result always contains the empty path ``''`` first. For every
    ``(topnode, dirs, files)`` triple yielded by ``tip.walk('/')`` the paths
    of the directories, of the nodes obtained by iterating each directory,
    and of the files are added, in that order.

    :param repo: repository object providing ``get_changeset``
    :returns: ``OrderedSet`` of paths
    """
    paths_ = OrderedSet([''])
    try:
        tip = repo.get_changeset('tip')
        for topnode, dirs, files in tip.walk('/'):
            for dir_node in dirs:
                paths_.add(dir_node.path)
                for child in dir_node:
                    paths_.add(child.path)
            for file_node in files:
                paths_.add(file_node.path)
    except RepositoryError:
        # best effort: keep whatever was collected, but say why the walk
        # stopped instead of hiding the problem
        print('could not walk tip changeset: %s' % (sys.exc_info()[1],))
    return paths_


def test_files_walk(proj, limit=100):
    """
    Request the ``files`` page at ``tip`` for the paths of a repository and
    report the response size and timing of every request.

    :param proj: repository name or repository object (see ``_get_repo``)
    :param limit: maximum number of paths to visit; a false value
        (``None`` or ``0``) means no limit
    """
    repo, proj = _get_repo(proj)
    print('processing %s' % jn(PROJECT_PATH, proj))
    total_time = 0
    # number of paths actually requested so far
    cnt = 0
    for path in _collect_tip_paths(repo):
        # check the limit *before* counting, so exactly ``limit`` paths are
        # visited and ``cnt`` always equals the number of requests made
        if limit and cnt >= limit:
            break
        cnt += 1
        file_path = '/'.join((proj, 'files', 'tip', path))
        full_uri = (BASE_URI % file_path)
        print('%s visiting %s' % (cnt, full_uri))
        started = time.time()
        response = o.open(full_uri)
        try:
            size = len(response.read())
        finally:
            # release the connection even when reading the body fails
            response.close()
        elapsed = time.time() - started
        total_time += elapsed
        # time.time() deltas are seconds, so label them as such
        print('%s visited OK size:%s req:%s s' % (cnt, size, elapsed))

    print('total_time %s' % total_time)
    # guard the average for the case where nothing was requested
    average = total_time / float(cnt) if cnt else 0
    print('average on req %s' % average)
small improvements in rhodecode_crawler
if __name__ == '__main__':
    # Crawl every configured project PASES times; each pass walks the
    # changelog, the changesets and the files of the repository.
    for path in PROJECTS:
        repo = vcs.get_repo(jn(PROJECT_PATH, path))
        for i in range(PASES):
            # number passes from 1 so the last one reads "PASES/PASES"
            print('PASS %s/%s' % (i + 1, PASES))
            test_changelog_walk(repo, pages=80)
            test_changeset_walk(repo, limit=100)
            test_files_walk(repo, limit=100)