# HG changeset patch # User FUJIWARA Katsunori # Date 2015-01-10 14:18:11 # Node ID 91dbb98b35135aaa576303203fee373b1f0559c0 # Parent 9d25bb84cf6ce4f09519c805454df54c8064c244 revset: make tokenize extensible to parse alias declarations and definitions Before this patch, "tokenize" doesn't recognize a symbol starting with "$" as a valid one. This prevents revset alias declarations and definitions from being parsed with "tokenize", because "$" may be used as the initial letter of alias arguments. BTW, the alias argument name doesn't actually require a leading "$" itself. But we have to assume that users may use "$" as the initial letter of argument names in their aliases, because the examples in "hg help revsets" have used such names for a long time. To make "tokenize" extensible enough to parse alias declarations and definitions, this patch introduces the optional arguments "syminitletters" and "symletters". Passing these sets makes it easy to change the policy of what constitutes a "valid symbol" during tokenization. This patch keeps the original examination of letter validity for reviewability, even though there is redundant interchanging between "chr"/"ord" at initialization of "_syminitletters" and "_symletters". At most 256 examinations (per initialization) are cheap enough compared to revset evaluation itself. This patch is part of the preparation for parsing alias declarations and definitions more strictly. 
diff --git a/mercurial/revset.py b/mercurial/revset.py --- a/mercurial/revset.py +++ b/mercurial/revset.py @@ -129,15 +129,39 @@ elements = { keywords = set(['and', 'or', 'not']) -def tokenize(program, lookup=None): +# default set of valid characters for the initial letter of symbols +_syminitletters = set(c for c in [chr(i) for i in xrange(256)] + if c.isalnum() or c in '._@' or ord(c) > 127) + +# default set of valid characters for non-initial letters of symbols +_symletters = set(c for c in [chr(i) for i in xrange(256)] + if c.isalnum() or c in '-._/@' or ord(c) > 127) + +def tokenize(program, lookup=None, syminitletters=None, symletters=None): ''' Parse a revset statement into a stream of tokens + ``syminitletters`` is the set of valid characters for the initial + letter of symbols. + + By default, character ``c`` is recognized as valid for initial + letter of symbols, if ``c.isalnum() or c in '._@' or ord(c) > 127``. + + ``symletters`` is the set of valid characters for non-initial + letters of symbols. + + By default, character ``c`` is recognized as valid for non-initial + letters of symbols, if ``c.isalnum() or c in '-._/@' or ord(c) > 127``. + Check that @ is a valid unquoted token character (issue3686): >>> list(tokenize("@::")) [('symbol', '@', 0), ('::', None, 1), ('end', None, 3)] ''' + if syminitletters is None: + syminitletters = _syminitletters + if symletters is None: + symletters = _symletters pos, l = 0, len(program) while pos < l: @@ -177,12 +201,12 @@ def tokenize(program, lookup=None): else: raise error.ParseError(_("unterminated string"), s) # gather up a symbol/keyword - elif c.isalnum() or c in '._@' or ord(c) > 127: + elif c in syminitletters: s = pos pos += 1 while pos < l: # find end of symbol d = program[pos] - if not (d.isalnum() or d in "-._/@" or ord(d) > 127): + if d not in symletters: break if d == '.' and program[pos - 1] == '.': # special case for .. pos -= 1