upstream/kallithea Commit - r443:e5157e2a

added safe unicode function, and implemented it in whoosh indexer

marcink -

r443:e5157e2a default

parent child

pylons_app/lib/helpers.py

0 +16 0

             """Helper functions
             Consists of functions to typically be used within templates, but also
             available to Controllers. This module is available to both as 'h'.
             """
             from pygments.formatters import HtmlFormatter
             from pygments import highlight as code_highlight
             from pylons import url, app_globals as g
             from pylons.i18n.translation import _, ungettext
             from vcs.utils.annotate import annotate_highlight
             from webhelpers.html import literal, HTML, escape
             from webhelpers.html.tools import *
             from webhelpers.html.builder import make_tag
             from webhelpers.html.tags import auto_discovery_link, checkbox, css_classes, \
                 end_form, file, form, hidden, image, javascript_link, link_to, link_to_if, \
                 link_to_unless, ol, required_legend, select, stylesheet_link, submit, text, \
                 password, textarea, title, ul, xml_declaration, radio
             from webhelpers.html.tools import auto_link, button_to, highlight, js_obfuscate, \
                 mail_to, strip_links, strip_tags, tag_re
             from webhelpers.number import format_byte_size, format_bit_size
             from webhelpers.pylonslib import Flash as _Flash
             from webhelpers.pylonslib.secure_form import secure_form
             from webhelpers.text import chop_at, collapse, convert_accented_entities, \
                 convert_misc_entities, lchop, plural, rchop, remove_formatting, \
                 replace_whitespace, urlify, truncate, wrap_paragraphs
             #Custom helpers here :)
             class _Link(object):
                 """
                 Build an html anchor from a label and route arguments.

                 The target address is generated by calling ``url`` with the given
                 positional and keyword arguments; the anchor is rendered by
                 ``link_to``.

                 @param label: text of the link; when it is None or empty the
                     generated address is used as the text instead
                 @param url_: positional arguments passed on to ``url``
                 @param urlargs: keyword arguments passed on to ``url``
                 @return: whatever ``link_to`` returns for the label and address
                 """
                 def __call__(self, label='', *url_, **urlargs):
                     target = url(*url_, **urlargs)
                     # The former test ``label is None or ''`` never matched an empty
                     # string, and its fallback assigned the ``url`` generator object
                     # itself instead of the generated address.
                     if not label:
                         label = target
                     return link_to(label, target)
             link = _Link()
             class _GetError(object):
                 """
                 Render the validation error of a single form field.

                 @param field_name: key of the field in the errors mapping
                 @param form_errors: mapping of field name to error message; may be
                     None or empty
                 @return: the message wrapped in a ``span.error_msg`` element as a
                     ``literal``, or None when there is no error for the field
                 """
                 def __call__(self, field_name, form_errors):
                     tmpl = """<span class="error_msg">%s</span>"""
                     # ``in`` replaces the deprecated ``dict.has_key``
                     if form_errors and field_name in form_errors:
                         return literal(tmpl % form_errors.get(field_name))
                     return None
             get_error = _GetError()
             def recursive_replace(str, replace=' '):
                 """
                 Collapse every run of a repeated sign into a single instance
                 @param str: given string
                 @param replace: char to find and replace multiple instances; an
                     empty value leaves the string untouched
                 @return: the string without two adjacent copies of ``replace``
                 Examples::
                 >>> recursive_replace("Mighty---Mighty-Bo--sstones",'-')
                 'Mighty-Mighty-Bo-sstones'
                 """
                 # An empty sign is "found" in every string and replacing it changes
                 # nothing, so the former recursion never terminated for it.
                 if not replace:
                     return str
                 doubled = replace * 2
                 # Same passes as the former recursion, without growing the stack
                 while doubled in str:
                     str = str.replace(doubled, replace)
                 return str
             class _ToolTip(object):
                 """
                 Helper behind ``h.tooltip``: calling it formats the text of a
                 tooltip, ``activate`` returns the javascript that displays tooltips.
                 """
                 def __call__(self, tooltip_title, trim_at=50):
                     """
                     Wrap the text with ``wrap_paragraphs`` and turn every newline of
                     the result into a ``<br/>`` tag
                     @param tooltip_title: text of the tooltip
                     @param trim_at: passed to ``wrap_paragraphs`` as its second
                         argument
                     @return: the formatted text as a ``literal``
                     """
                     return literal(wrap_paragraphs(tooltip_title, trim_at)\
                                    .replace('\n', '<br/>'))
                 def activate(self):
                     """
                     Return the javascript that adds the tooltip mechanism to a page.
                     All tooltips have to have the class ``tooltip`` and the attribute
                     ``tooltip_title`` set; a tooltip is generated from that attribute
                     with the yui js Tooltip widget.
                     @return: the script source as a ``literal``
                     """
                     # The script is emitted verbatim; elements without an id get a
                     # generated one because the widget is given a list of ids.
                     js = '''
                     YAHOO.util.Event.onDOMReady(function(){
                         function toolTipsId(){
                             var ids = [];
                             var tts = YAHOO.util.Dom.getElementsByClassName('tooltip');
                             for (var i = 0; i < tts.length; i++) {
                                 //if element doesn not have and id autgenerate one for tooltip
                                 if (!tts[i].id){
                                     tts[i].id='tt'+i*100;
                                 }
                                 ids.push(tts[i].id);
                             }
                             return ids
                         };
                         var myToolTips = new YAHOO.widget.Tooltip("tooltip", {
                             context: toolTipsId(),
                             monitorresize:false,
                             xyoffset :[0,0],
                             autodismissdelay:300000,
                             hidedelay:5,
                             showdelay:20,
                         });
                         //Mouse Over event disabled for new repositories since they dont
                         //have last commit message
                         myToolTips.contextMouseOverEvent.subscribe(
                             function(type, args) {
                                 var context = args[0];
                                 var txt = context.getAttribute('tooltip_title');
                                 if(txt){
                                     return true;
                                 }
                                 else{
                                     return false;
                                 }
                             });
                         // Set the text for the tooltip just before we display it. Lazy method
                         myToolTips.contextTriggerEvent.subscribe(
                              function(type, args) {
                                     var context = args[0];
                                     var txt = context.getAttribute('tooltip_title');
                                     this.cfg.setProperty("text", txt);
                                     // positioning of tooltip
                                     var tt_w = this.element.clientWidth;
                                     var tt_h = this.element.clientHeight;
                                     var context_w = context.offsetWidth;
                                     var context_h = context.offsetHeight;
                                     var pos_x = YAHOO.util.Dom.getX(context);
                                     var pos_y = YAHOO.util.Dom.getY(context);
                                     var display_strategy = 'top';
                                     var xy_pos = [0,0];
                                     switch (display_strategy){
                                         case 'top':
                                             var cur_x = (pos_x+context_w/2)-(tt_w/2);
                                             var cur_y = pos_y-tt_h-4;
                                             xy_pos = [cur_x,cur_y];
                                             break;
                                         case 'bottom':
                                             var cur_x = (pos_x+context_w/2)-(tt_w/2);
                                             var cur_y = pos_y+context_h+4;
                                             xy_pos = [cur_x,cur_y];
                                             break;
                                         case 'left':
                                             var cur_x = (pos_x-tt_w-4);
                                             var cur_y = pos_y-((tt_h/2)-context_h/2);
                                             xy_pos = [cur_x,cur_y];
                                             break;
                                         case 'right':
                                             var cur_x = (pos_x+context_w+4);
                                             var cur_y = pos_y-((tt_h/2)-context_h/2);
                                             xy_pos = [cur_x,cur_y];
                                             break;
                                          default:
                                             var cur_x = (pos_x+context_w/2)-(tt_w/2);
                                             var cur_y = pos_y-tt_h-4;
                                             xy_pos = [cur_x,cur_y];
                                             break;
                                     }
                                     this.cfg.setProperty("xy",xy_pos);
                               });
                         //Mouse out
                         myToolTips.contextMouseOutEvent.subscribe(
                             function(type, args) {
                                 var context = args[0];
                             });
                     });
                     '''
                     return literal(js)
             tooltip = _ToolTip()
             class _FilesBreadCrumbs(object):
                 """
                 Render the breadcrumb navigation of the files browser: a link to
                 the root of the repository followed by one link per non-empty
                 segment of the path, joined with `` / ``.
                 """
                 def __call__(self, repo_name, rev, paths):
                     def crumb(text, f_path):
                         # every crumb points at 'files_home' of the same revision
                         return link_to(text, url('files_home',
                                                  repo_name=repo_name,
                                                  revision=rev,
                                                  f_path=f_path))
                     crumbs = [crumb(repo_name, '')]
                     segments = paths.split('/')
                     # a crumb links to the path made of all segments up to itself
                     crumbs.extend(crumb(name, '/'.join(segments[:depth]))
                                   for depth, name in enumerate(segments, 1)
                                   if name != '')
                     return literal(' / '.join(crumbs))
             files_breadcrumbs = _FilesBreadCrumbs()
             def pygmentize(filenode, **kwargs):
                 """
                 Highlight the content of a file node with pygments
                 @param filenode: node providing ``content`` and ``lexer``
                 @param kwargs: options handed to the pygments ``HtmlFormatter``
                 @return: the highlighted markup as a ``literal``
                 """
                 content = filenode.content
                 lexer = filenode.lexer
                 formatter = HtmlFormatter(**kwargs)
                 return literal(code_highlight(content, lexer, formatter))
             def pygmentize_annotation(filenode, **kwargs):
                 """
                 pygmentize function for annotation

                 Each changeset handed to the annotation callback is rendered as a
                 link to the changeset, colored per changeset and carrying a tooltip
                 with its author, date and message.
                 @param filenode: file node passed to ``annotate_highlight``
                 @param kwargs: passed on to ``annotate_highlight``
                 @return: the result of ``annotate_highlight`` as a ``literal``
                 """
                 color_dict = {}
                 def gen_color():
                     """generator of evenly distributed web friendly colors using
                     hsv color and golden ratio, always in the same order.
                     """
                     import colorsys
                     golden_ratio = 0.618033988749895
                     h = 0.22717784590367374
                     # The generator used to stop after 10k colors, which raised
                     # StopIteration for the next new changeset; it is endless now
                     # and yields the same colors for the first 10k.
                     while True:
                         h += golden_ratio
                         h %= 1
                         rgb = colorsys.hsv_to_rgb(h, 0.95, 0.95)
                         yield [str(int(x * 256)) for x in rgb]
                 cgenerator = gen_color()
                 def get_color_string(cs):
                     # one color per changeset, assigned on first use
                     if cs not in color_dict:
                         color_dict[cs] = next(cgenerator)
                     return "color: rgb(%s)! important;" % (', '.join(color_dict[cs]))
                 def url_func(changeset):
                     # NOTE(review): author, date and message are interpolated into
                     # markup without escaping -- confirm how the tooltip renders it.
                     # The stray ``</b>`` that followed the date has been removed.
                     tooltip_html = ("<div style='font-size:0.8em'><b>Author:</b>"
                                     " %s<br/><b>Date:</b> %s<br/><b>Message:</b>"
                                     " %s<br/></div>")
                     tooltip_html = tooltip_html % (changeset.author,
                                                    changeset.date,
                                                    tooltip(changeset.message))
                     lnk_format = 'r%-5s:%s' % (changeset.revision,
                                                changeset.raw_id)
                     uri = link_to(
                             lnk_format,
                             url('changeset_home', repo_name=changeset.repository.name,
                                 revision=changeset.raw_id),
                             style=get_color_string(changeset.raw_id),
                             class_='tooltip',
                             tooltip_title=tooltip_html
                           )
                     uri += '\n'
                     return uri
                 return literal(annotate_highlight(filenode, url_func, **kwargs))
             def repo_name_slug(value):
                 """
                 Build the slug of a repository name: the name is passed through
                 ``urlify``, each sign of the list below becomes a dash and runs
                 of dashes are collapsed with ``recursive_replace``
                 """
                 forbidden = """=[]\;'"<>,/~!@#$%^&*()+{}|:"""
                 cleaned = urlify(value)
                 for sign in forbidden:
                     cleaned = cleaned.replace(sign, '-')
                 return recursive_replace(cleaned, '-')
             flash = _Flash()
             #===============================================================================
             # MERCURIAL FILTERS available via h.
             #===============================================================================
             from mercurial import util
             from mercurial.templatefilters import age as _age, person as _person
             # Thin wrappers exposing mercurial helpers to templates under ``h``.
             # ``age`` and ``person`` delegate to mercurial.templatefilters.
             age = lambda  x:_age(x)
             capitalize = lambda x: x.capitalize()
             # ``date`` formats with the default format of util.datestr
             date = lambda x: util.datestr(x)
             email = util.email
             # None when util.email returns its argument unchanged
             email_or_none = lambda x: util.email(x) if util.email(x) != x else None
             person = lambda x: _person(x)
             # x is formatted with two ``%d`` fields, so it has to be a pair of numbers
             hgdate = lambda  x: "%d %d" % x
             isodate = lambda  x: util.datestr(x, '%Y-%m-%d %H:%M %1%2')
             isodatesec = lambda  x: util.datestr(x, '%Y-%m-%d %H:%M:%S %1%2')
             # keeps x[0] and pairs it with the second item of util.makedate()
             # (presumably the local timezone offset -- TODO confirm)
             localdate = lambda  x: (x[0], util.makedate()[1])
             rfc822date = lambda  x: util.datestr(x, "%a, %d %b %Y %H:%M:%S %1%2")
             rfc3339date = lambda  x: util.datestr(x, "%Y-%m-%dT%H:%M:%S%1:%2")
             # NOTE(review): unlike every other filter here this hands the result of
             # _age(x), not x itself, to util.datestr -- confirm this is intended
             time_ago = lambda x: util.datestr(_age(x), "%a, %d %b %Y %H:%M:%S %1%2")
             #===============================================================================
             # PERMS
             #===============================================================================
             from pylons_app.lib.auth import HasPermissionAny, HasPermissionAll, \
             HasRepoPermissionAny, HasRepoPermissionAll
             #===============================================================================
             # GRAVATAR URL
             #===============================================================================
             import hashlib
             import urllib
             from pylons import request
             def gravatar_url(email_address, size=30):
                 """
                 Build the url of the gravatar image for an email address

                 The secure gravatar host is used when the ``HTTP_X_URL_SCHEME`` key
                 of the request environ equals ``https``.
                 @param email_address: address of the user; None is treated as an
                     empty address
                 @param size: sent as the ``s`` query parameter
                 @return: url of the avatar, requesting ``identicon`` as the default
                     image through the ``d`` query parameter
                 """
                 ssl_enabled = 'https' == request.environ.get('HTTP_X_URL_SCHEME')
                 if ssl_enabled:
                     baseurl = "https://secure.gravatar.com/avatar/"
                 else:
                     baseurl = "http://www.gravatar.com/avatar/"
                 # gravatar hashes the trimmed, lower-cased address; a missing
                 # address used to raise AttributeError on ``.lower()``
                 normalized = (email_address or '').strip().lower()
                 if isinstance(normalized, unicode):
                     # md5 needs bytes; the implicit ascii encoding of a unicode
                     # address fails on non-ascii characters
                     normalized = normalized.encode('utf-8')
                 query = urllib.urlencode({'d': 'identicon', 's': str(size)})
                 return baseurl + hashlib.md5(normalized).hexdigest() + "?" + query
+            def safe_unicode(str):
+                """safe unicode function. In case of UnicodeDecode error we try to return
+                unicode with errors replace, if this failes we return unicode with
+                string_escape decoding """
+                try:
+                    u_str = unicode(str)
+                except UnicodeDecodeError:
+                    try:
+                        u_str = unicode(str, 'utf-8', 'replace')
+                    except UnicodeDecodeError:
+                        #incase we have a decode error just represent as byte string
+                        u_str = unicode(str(str).encode('string_escape'))
+                return u_str
  No newline at end of file

pylons_app/lib/indexers/daemon.py

0 +2 -5

             #!/usr/bin/env python
             # encoding: utf-8
             # whoosh indexer daemon for hg-app
             # Copyright (C) 2009-2010 Marcin Kuzminski <marcin@python-works.com>
             #
             # This program is free software; you can redistribute it and/or
             # modify it under the terms of the GNU General Public License
             # as published by the Free Software Foundation; version 2
             # of the License or (at your option) any later version of the license.
             #
             # This program is distributed in the hope that it will be useful,
             # but WITHOUT ANY WARRANTY; without even the implied warranty of
             # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
             # GNU General Public License for more details.
             #
             # You should have received a copy of the GNU General Public License
             # along with this program; if not, write to the Free Software
             # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
             # MA  02110-1301, USA.
             """
             Created on Jan 26, 2010
             @author: marcink
             A daemon will read from task table and run tasks
             """
             import sys
             import os
             from os.path import dirname as dn
             from os.path import join as jn
             #to get the pylons_app import
             project_path = dn(dn(dn(dn(os.path.realpath(__file__)))))
             sys.path.append(project_path)
             from pidlock import LockHeld, DaemonLock
             import traceback
             from pylons_app.config.environment import load_environment
             from pylons_app.model.hg_model import HgModel
+            from pylons_app.lib.helpers import safe_unicode
             from whoosh.index import create_in, open_dir
             from shutil import rmtree
             from pylons_app.lib.indexers import ANALYZER, INDEX_EXTENSIONS, IDX_LOCATION, \
             SCHEMA, IDX_NAME
             import logging
             import logging.config
             logging.config.fileConfig(jn(project_path, 'development.ini'))
             log = logging.getLogger('whooshIndexer')
             def scan_paths(root_location):
                 """Return the result of ``HgModel.repo_scan`` for ``root_location``,
                 called with '/' as first argument, no third argument and True as
                 the last one"""
                 repos = HgModel.repo_scan('/', root_location, None, True)
                 return repos
             class WhooshIndexingDaemon(object):
                 """Deamon for atomic jobs"""
                 def __init__(self, indexname='HG_INDEX', repo_location=None):
                     self.indexname = indexname
                     self.repo_location = repo_location
                 def get_paths(self, root_dir):
                     """recursive walk in root dir and return a set of all path in that dir
                     excluding files in .hg dir
                     @param root_dir: directory to walk
                     @return: set of the paths of all files found
                     """
                     index_paths_ = set()
                     for path, dirs, files in os.walk(root_dir):
                         # Prune in place so os.walk never descends into a ``.hg``
                         # directory. The former substring test on the whole path
                         # dropped every file as soon as any part of the path, the
                         # root included, merely contained ".hg".
                         dirs[:] = [d for d in dirs if d != '.hg']
                         for f in files:
                             index_paths_.add(jn(path, f))
                     return index_paths_
                 def add_doc(self, writer, path, repo):
                     """Add one document describing the file at ``path`` to ``writer``

                     The extension is the lower-cased text after the last dot of the
                     last path segment (the whole name when it has no dot). The
                     content of the file is indexed only when that extension is in
                     INDEX_EXTENSIONS, otherwise an empty content is stored.
                     @param writer: index writer whose ``add_document`` is called
                     @param path: path of the file to index
                     @param repo: repository of the file; its ``contact`` and
                         ``name`` are stored with the document
                     @raise OSError: any OSError other than ENOENT raised for the path
                     """
                     ext = unicode(path.split('/')[-1].split('.')[-1].lower())
                     #we just index the content of chosen files
                     if ext in INDEX_EXTENSIONS:
                         log.debug('    >> %s [WITH CONTENT]' % path)
                         # NOTE(review): the handle stays open if read() raises
                         fobj = open(path, 'rb')
                         content = fobj.read()
                         fobj.close()
-                        try:
+                        u_content = safe_unicode(content)
-                            u_content = unicode(content)
-                        except UnicodeDecodeError:
-                            #incase we have a decode error just represent as byte string
-                            u_content = unicode(str(content).encode('string_escape'))
                     else:
                         log.debug('    >> %s' % path)
                         #just index file name without its content
                         u_content = u''
                     try:
                         # raises OSError early when the path cannot be stat-ed
                         os.stat(path)
                         writer.add_document(owner=unicode(repo.contact),
                                         repository=u"%s" % repo.name,
                                         path=u"%s" % path,
                                         content=u_content,
                                         modtime=os.path.getmtime(path),
                                         extension=ext)
                     except OSError, e:
                         import errno
                         # a missing path is only logged, anything else is re-raised
                         if e.errno == errno.ENOENT:
                             log.debug('path %s does not exist or is a broken symlink' % path)
                         else:
                             raise e
                 def build_index(self):
                     """Build the index from scratch: an existing IDX_LOCATION is
                     removed and recreated, then every file of every repository
                     found by ``scan_paths`` is added and the writer is committed"""
                     if os.path.exists(IDX_LOCATION):
                         log.debug('removing previos index')
                         rmtree(IDX_LOCATION)
                     if not os.path.exists(IDX_LOCATION):
                         os.mkdir(IDX_LOCATION)
                     index = create_in(IDX_LOCATION, SCHEMA, indexname=IDX_NAME)
                     idx_writer = index.writer()
                     repos = scan_paths(self.repo_location)
                     for repo in repos.values():
                         log.debug('building index @ %s' % repo.path)
                         for doc_path in self.get_paths(repo.path):
                             self.add_doc(idx_writer, doc_path, repo)
                     idx_writer.commit(merge=True)
                     log.debug('>>> FINISHED BUILDING INDEX <<<')
                 def update_index(self):
                     """Incremental update of the index named ``self.indexname``

                     Documents whose file no longer exists are deleted, documents
                     whose file is newer than the stored ``modtime`` are deleted and
                     re-added, and files that were never indexed are added.
                     """
                     log.debug('STARTING INCREMENTAL INDEXING UPDATE')
                     index = open_dir(IDX_LOCATION, indexname=self.indexname)
                     # every path stored in the index / paths that need re-indexing
                     indexed_paths = set()
                     to_index = set()
                     reader = index.reader()
                     writer = index.writer()
                     # first pass: walk over what the index already knows
                     for stored in reader.all_stored_fields():
                         known_path = stored['path']
                         indexed_paths.add(known_path)
                         if not os.path.exists(known_path):
                             # the file was deleted after it had been indexed
                             log.debug('removing from index %s' % known_path)
                             writer.delete_by_term('path', known_path)
                             continue
                         if stored['modtime'] < os.path.getmtime(known_path):
                             # changed since indexing: drop the stale document and
                             # queue the file for the second pass
                             log.debug('adding to reindex list %s' % known_path)
                             writer.delete_by_term('path', known_path)
                             to_index.add(known_path)
                     # second pass: walk over the files of all repositories
                     for repo in scan_paths(self.repo_location).values():
                         for path in self.get_paths(repo.path):
                             if path not in to_index and path in indexed_paths:
                                 # indexed already and unchanged
                                 continue
                             # either a changed file or one never indexed before
                             self.add_doc(writer, path, repo)
                             log.debug('reindexing %s' % path)
                     writer.commit(merge=True)
                     log.debug('>>> FINISHED <<<')
                 def run(self, full_index=False):
                     """Run daemon"""
                     if full_index:
                         self.build_index()
                     else:
                         self.update_index()
             if __name__ == "__main__":
                 # Script entry point: take the daemon lock, run one indexing pass
                 # and exit with status 1 when the lock is already held.
                 repo_location = '/home/marcink/hg_repos/*'
                 full_index = True # False means looking just for changes
                 try:
                     l = DaemonLock()
                     try:
                         WhooshIndexingDaemon(repo_location=repo_location)\
                             .run(full_index=full_index)
                     finally:
                         # release() used to be skipped whenever indexing raised
                         l.release()
                 except LockHeld:
                     sys.exit(1)

General Comments 0

Write
Preview

You need to be logged in to leave comments. Login now

No TODOs yet

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages