rhodecode-enterprise-ce Commit - r1358:f0122102

1

# -*- coding: utf-8 -*-

1

# -*- coding: utf-8 -*-

2

3

4

#

4

#

5

# This program is free software: you can redistribute it and/or modify

5

# This program is free software: you can redistribute it and/or modify

6

# it under the terms of the GNU Affero General Public License, version 3

6

# it under the terms of the GNU Affero General Public License, version 3

7

# (only), as published by the Free Software Foundation.

7

# (only), as published by the Free Software Foundation.

8

#

8

#

9

# This program is distributed in the hope that it will be useful,

9

# This program is distributed in the hope that it will be useful,

10

# but WITHOUT ANY WARRANTY; without even the implied warranty of

10

# but WITHOUT ANY WARRANTY; without even the implied warranty of

11

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

11

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

12

# GNU General Public License for more details.

12

# GNU General Public License for more details.

13

#

13

#

14

# You should have received a copy of the GNU Affero General Public License

14

# You should have received a copy of the GNU Affero General Public License

15

# along with this program. If not, see <http://www.gnu.org/licenses/>.

15

# along with this program. If not, see <http://www.gnu.org/licenses/>.

16

#

16

#

17

# This program is dual-licensed. If you wish to learn more about the

17

# This program is dual-licensed. If you wish to learn more about the

18

# RhodeCode Enterprise Edition, including its added features, Support services,

18

# RhodeCode Enterprise Edition, including its added features, Support services,

19

# and proprietary license terms, please see https://rhodecode.com/licenses/

19

# and proprietary license terms, please see https://rhodecode.com/licenses/

20

21

import logging

21

import logging

22

import difflib

22

import difflib

23

from itertools import groupby

23

from itertools import groupby

24

25

from pygments import lex

25

from pygments import lex

26

from pygments.formatters.html import _get_ttype_class as pygment_token_class

26

from pygments.formatters.html import _get_ttype_class as pygment_token_class

27

from rhodecode.lib.helpers import (

27

from rhodecode.lib.helpers import (

28

get_lexer_for_filenode, ~~get_lexer_safe~~, html_escape)

28

get_lexer_for_filenode, html_escape)

29

from rhodecode.lib.utils2 import AttributeDict

29

from rhodecode.lib.utils2 import AttributeDict

30

from rhodecode.lib.vcs.nodes import FileNode

30

from rhodecode.lib.vcs.nodes import FileNode

31

from rhodecode.lib.diff_match_patch import diff_match_patch

31

from rhodecode.lib.diff_match_patch import diff_match_patch

32

from rhodecode.lib.diffs import LimitedDiffContainer

32

from rhodecode.lib.diffs import LimitedDiffContainer

33

from pygments.lexers import get_lexer_by_name

33

from pygments.lexers import get_lexer_by_name

34

35

# Shared plain-text lexer; it is the default source/target lexer in
# DiffSet.render_patch. The strip*/ensurenl options are all disabled on it.
plain_text_lexer = get_lexer_by_name(
    'text', **{'stripall': False, 'stripnl': False, 'ensurenl': False})

37

38

39

# Module-level logger. It is named after this module instead of being the
# root logger, so its records carry the module name and can be filtered or
# configured independently of everything else logging to the root.
log = logging.getLogger(__name__)

40

41

42

def filenode_as_lines_tokens(filenode, lexer=None):
    """
    Tokenize the content of a file node and group the tokens per line.

    :param filenode: node whose ``content`` attribute gets tokenized
    :param lexer: optional lexer to use; when it is not given (or falsy)
        the lexer is picked with `get_lexer_for_filenode`
    :return: list with one entry per line, each entry being the list of
        (token_class, token_text) tuples of that line
    """
    requested_lexer = lexer
    if not lexer:
        lexer = get_lexer_for_filenode(filenode)

    log.debug('Generating file node pygment tokens for %s, %s, org_lexer:%s',
              lexer, filenode, requested_lexer)

    token_stream = tokenize_string(filenode.content, lexer)
    return list(split_token_stream(token_stream, split_string='\n'))

51

52

53

def tokenize_string(content, lexer):
    """
    Lex ``content`` with the given pygments lexer and yield one
    (token_class, token_text) tuple per token.

    The ``stripall``, ``stripnl`` and ``ensurenl`` options are switched off
    on the lexer before lexing so that all original new lines and
    whitespace is preserved.
    """
    # note: this changes attributes of the lexer object passed in
    for option_name in ('stripall', 'stripnl', 'ensurenl'):
        setattr(lexer, option_name, False)

    for ttype, text in lex(content, lexer):
        yield pygment_token_class(ttype), text

64

65

66

def split_token_stream(tokens, split_string=u'\n'):
    """
    Take an iterable of (token_class, text) tuples and split it into lines.

    This is a generator. Every yielded item is the list of
    (token_class, text) tuples making up one line; the ``split_string``
    itself is not part of the output. A token whose text contains
    ``split_string`` is cut into one piece per line, every piece keeping the
    token class. Whatever follows the last ``split_string`` (possibly an
    empty text) forms the final line.

    >>> list(split_token_stream([(TEXT, 'some\\ntext'), (TEXT, 'more\\n')]))
    [[(TEXT, 'some')],
     [(TEXT, 'text'), (TEXT, 'more')],
     [(TEXT, '')]]

    :param tokens: iterable of (token_class, text) tuples
    :param split_string: separator marking the end of a line
    """

    line_tokens = []
    for token_class, token_text in tokens:
        parts = token_text.split(split_string)

        # every part but the last one is terminated by split_string and
        # therefore closes the line collected so far
        for part in parts[:-1]:
            line_tokens.append((token_class, part))
            yield line_tokens
            line_tokens = []

        # the last part stays open, following tokens may continue the line
        line_tokens.append((token_class, parts[-1]))

    if line_tokens:
        yield line_tokens

87

88

89

def filenode_as_annotated_lines_tokens(filenode):
    """
    Yield the tokenized lines of a file node grouped by annotation.

    Consecutive lines sharing the same annotation end up in one group; a
    line without an annotation gets None as its annotation.

    eg:

    [
        (annotation1, [
            (1, line1_tokens_list),
            (2, line2_tokens_list),
        ]),
        (annotation2, [
            (3, line1_tokens_list),
        ]),
        (None, [
            (4, line1_tokens_list),
        ]),
        (annotation1, [
            (5, line1_tokens_list),
            (6, line2_tokens_list),
        ])
    ]
    """

    # commit_id => result of its commit_getter, so every getter result is
    # fetched only once per commit
    resolved_commits = {}

    annotation_by_line = {}
    for line_no, commit_id, commit_getter, line_content in filenode.annotate:
        if commit_id not in resolved_commits:
            resolved_commits[commit_id] = commit_getter()
        annotation_by_line[line_no] = resolved_commits[commit_id]

    numbered_lines = enumerate(filenode_as_lines_tokens(filenode), 1)
    annotated_lines = (
        (annotation_by_line.get(line_no), line_no, tokens)
        for line_no, tokens in numbered_lines)

    def annotation_of(entry):
        return entry[0]

    for annotation, run in groupby(annotated_lines, annotation_of):
        yield annotation, [(line_no, tokens) for _, line_no, tokens in run]

138

139

140

def render_tokenstream(tokenstream):
    """
    Render a token stream to an html string.

    The stream is first grouped with `rollup_tokenstream`. Every group of
    tokens sharing a class is wrapped in one ``<span>`` element (carrying a
    ``class`` attribute when the token class is not empty). Inside of it
    every run of text with a non empty operation is wrapped in a tag named
    after the operation, eg. ``<ins>`` or ``<del>``. Token text is passed
    through `html_escape` before being emitted.

    :param tokenstream: tokens in a format accepted by `rollup_tokenstream`
    :return: html string
    """
    result = []
    for token_class, token_ops_texts in rollup_tokenstream(tokenstream):

        if token_class:
            result.append(u'<span class="%s">' % token_class)
        else:
            result.append(u'<span>')

        for op_tag, token_text in token_ops_texts:

            if op_tag:
                result.append(u'<%s>' % op_tag)

            escaped_text = html_escape(token_text)

            # TODO: dan: investigate showing hidden characters like space/nl/tab
            # escaped_text = escaped_text.replace(' ', '<sp> </sp>')
            # escaped_text = escaped_text.replace('\n', '<nl>\n</nl>')
            # escaped_text = escaped_text.replace('\t', '<tab>\t</tab>')

            result.append(escaped_text)

            if op_tag:
                result.append(u'</%s>' % op_tag)

        result.append(u'</span>')

    html = ''.join(result)
    return html

170

171

172

def rollup_tokenstream(tokenstream):
    """
    Group a token stream of the format:

        ('class', 'op', 'text')
    or
        ('class', 'text')

    into

        [('class1',
            [('op1', 'text'),
             ('op2', 'text')]),
         ('class2',
            [('op3', 'text')])]

    Only neighbouring tokens are merged: consecutive tokens of the same
    class form one group and, inside a group, the texts of consecutive
    tokens with the same op are joined.

    This is used to get the minimal tags necessary when
    rendering to html eg for a token stream ie.

        <span class="A"><ins>he</ins>llo</span>
    vs
        <span class="A"><ins>he</ins></span><span class="A">llo</span>

    If a 2 tuple is passed in, the output op will be an empty string. The
    format is detected from the first token only.

    eg:

    >>> rollup_tokenstream([('classA', '', 'h'),
                            ('classA', 'del', 'ell'),
                            ('classA', '', 'o'),
                            ('classB', '', ' '),
                            ('classA', '', 'the'),
                            ('classA', '', 're'),
                            ])

    [('classA', [('', 'h'), ('del', 'ell'), ('', 'o')]),
     ('classB', [('', ' ')]),
     ('classA', [('', 'there')])]

    :param tokenstream: any iterable of 2 or 3 tuples as described above
    :return: list of (token_class, [(op, text), ...]) tuples
    """
    # materialize first: the format check below indexes the stream, which
    # iterators and generators do not support
    tokenstream = list(tokenstream)
    if tokenstream and len(tokenstream[0]) == 2:
        tokenstream = [(t[0], '', t[1]) for t in tokenstream]

    result = []
    for token_class, class_group in groupby(tokenstream, lambda t: t[0]):
        ops = []
        for token_op, op_group in groupby(class_group, lambda o: o[1]):
            joined_text = ''.join(t_text for _, _, t_text in op_group)
            ops.append((token_op, joined_text))
        result.append((token_class, ops))
    return result

225

226

227

def _tag_tokens(tokens, op):
    """Turn (token_class, text) tuples into (token_class, op, text) tuples."""
    return [(token_class, op, token_text)
            for token_class, token_text in tokens]


def _char_diff_difflib(old_char_tokens, new_char_tokens):
    """Mark char tokens with ins/del ops using difflib's character diff."""
    old_string = ''.join(char for _, char in old_char_tokens)
    new_string = ''.join(char for _, char in new_char_tokens)
    matcher = difflib.SequenceMatcher(None, old_string, new_string)

    obuffer, nbuffer = [], []
    for ctag, co1, co2, cn1, cn2 in matcher.get_opcodes():
        # 'delete' comes with an empty new range and 'insert' with an empty
        # old range, so both sides can be handled uniformly for all tags
        if ctag == 'equal':
            old_op, new_op = '', ''
        else:
            old_op, new_op = 'del', 'ins'
        obuffer.extend(_tag_tokens(old_char_tokens[co1:co2], old_op))
        nbuffer.extend(_tag_tokens(new_char_tokens[cn1:cn2], new_op))
    return obuffer, nbuffer


def _char_diff_dmp(old_char_tokens, new_char_tokens):
    """Mark char tokens with ins/del ops using the diff_match_patch library."""
    old_string = ''.join(char for _, char in old_char_tokens)
    new_string = ''.join(char for _, char in new_char_tokens)

    dmp = diff_match_patch()
    dmp.Diff_EditCost = 11  # TODO: dan: extract this to a setting
    reps = dmp.diff_main(old_string, new_string)
    dmp.diff_cleanupEfficiency(reps)

    obuffer, nbuffer = [], []
    a, b = 0, 0  # current position in old_char_tokens / new_char_tokens
    for op, rep in reps:
        length = len(rep)
        if op == 0:
            for i, c in enumerate(rep):
                obuffer.append((old_char_tokens[a+i][0], '', c))
                nbuffer.append((new_char_tokens[b+i][0], '', c))
            a += length
            b += length
        elif op == -1:
            for i, c in enumerate(rep):
                obuffer.append((old_char_tokens[a+i][0], 'del', c))
            a += length
        elif op == 1:
            for i, c in enumerate(rep):
                nbuffer.append((new_char_tokens[b+i][0], 'ins', c))
            b += length
    return obuffer, nbuffer


def tokens_diff(old_tokens, new_tokens, use_diff_match_patch=True):
    """
    Converts a list of (token_class, token_text) tuples to a list of
    (token_class, token_op, token_text) tuples where token_op is one of
    ('ins', 'del', '')

    When the joined texts of both lines have a similarity ratio below 0.6
    the lines are considered too different and all tokens are returned with
    an empty token_op.

    :param old_tokens: list of (token_class, token_text) tuples of old line
    :param new_tokens: list of (token_class, token_text) tuples of new line
    :param use_diff_match_patch: boolean, will use google's diff match patch
        library which has options to 'smooth' out the character by character
        differences making nicer ins/del blocks
    :return: tuple of (old_tokens_result, new_tokens_result, similarity)
    """

    old_tokens_result = []
    new_tokens_result = []

    similarity = difflib.SequenceMatcher(
        None,
        ''.join(token_text for token_class, token_text in old_tokens),
        ''.join(token_text for token_class, token_text in new_tokens)
    ).ratio()

    if similarity < 0.6:  # return, the blocks are too different
        old_tokens_result.extend(_tag_tokens(old_tokens, ''))
        new_tokens_result.extend(_tag_tokens(new_tokens, ''))
        return old_tokens_result, new_tokens_result, similarity

    token_sequence_matcher = difflib.SequenceMatcher(
        None,
        [x[1] for x in old_tokens],
        [x[1] for x in new_tokens])

    for tag, o1, o2, n1, n2 in token_sequence_matcher.get_opcodes():
        # check the differences by token block types first to give a more
        # nicer "block" level replacement vs character diffs

        if tag == 'equal':
            old_tokens_result.extend(_tag_tokens(old_tokens[o1:o2], ''))
            new_tokens_result.extend(_tag_tokens(new_tokens[n1:n2], ''))
        elif tag == 'delete':
            old_tokens_result.extend(_tag_tokens(old_tokens[o1:o2], 'del'))
        elif tag == 'insert':
            new_tokens_result.extend(_tag_tokens(new_tokens[n1:n2], 'ins'))
        elif tag == 'replace':
            # if same type token blocks must be replaced, do a diff on the
            # characters in the token blocks to show individual changes
            old_char_tokens = [
                (token_class, char)
                for token_class, token_text in old_tokens[o1:o2]
                for char in token_text]
            new_char_tokens = [
                (token_class, char)
                for token_class, token_text in new_tokens[n1:n2]
                for char in token_text]

            # only the selected character diff is computed
            if use_diff_match_patch:
                char_diff = _char_diff_dmp
            else:
                char_diff = _char_diff_difflib
            obuffer, nbuffer = char_diff(old_char_tokens, new_char_tokens)

            old_tokens_result.extend(obuffer)
            new_tokens_result.extend(nbuffer)

    return old_tokens_result, new_tokens_result, similarity

344

345

346

class DiffSet(object):

346

class DiffSet(object):

347

"""

347

"""

348

An object for parsing the diff result from diffs.DiffProcessor and

348

An object for parsing the diff result from diffs.DiffProcessor and

349

adding highlighting, side by side/unified renderings and line diffs

349

adding highlighting, side by side/unified renderings and line diffs

350

"""

350

"""

351

352

HL_REAL = 'REAL' # highlights using original file, slow

352

HL_REAL = 'REAL' # highlights using original file, slow

353

HL_FAST = 'FAST' # highlights using just the line, fast but not correct

353

HL_FAST = 'FAST' # highlights using just the line, fast but not correct

354

# in the case of multiline code

354

# in the case of multiline code

355

HL_NONE = 'NONE' # no highlighting, fastest

355

HL_NONE = 'NONE' # no highlighting, fastest

356

357

def __init__(self, highlight_mode=HL_REAL, repo_name=None,

357

def __init__(self, highlight_mode=HL_REAL, repo_name=None,

358

source_repo_name=None,

358

source_repo_name=None,

359

source_node_getter=lambda filename: None,

359

source_node_getter=lambda filename: None,

360

target_node_getter=lambda filename: None,

360

target_node_getter=lambda filename: None,

361

source_nodes=None, target_nodes=None,

361

source_nodes=None, target_nodes=None,

362

max_file_size_limit=150 * 1024, # files over this size will

362

max_file_size_limit=150 * 1024, # files over this size will

363

# use fast highlighting

363

# use fast highlighting

364

comments=None,

364

comments=None,

365

):

365

):

366

367

self.highlight_mode = highlight_mode

367

self.highlight_mode = highlight_mode

368

self.highlighted_filenodes = {}

368

self.highlighted_filenodes = {}

369

self.source_node_getter = source_node_getter

369

self.source_node_getter = source_node_getter

370

self.target_node_getter = target_node_getter

370

self.target_node_getter = target_node_getter

371

self.source_nodes = source_nodes or {}

371

self.source_nodes = source_nodes or {}

372

self.target_nodes = target_nodes or {}

372

self.target_nodes = target_nodes or {}

373

self.repo_name = repo_name

373

self.repo_name = repo_name

374

self.source_repo_name = source_repo_name or repo_name

374

self.source_repo_name = source_repo_name or repo_name

375

self.comments = comments or {}

375

self.comments = comments or {}

376

self.comments_store = self.comments.copy()

376

self.comments_store = self.comments.copy()

377

self.max_file_size_limit = max_file_size_limit

377

self.max_file_size_limit = max_file_size_limit

378

379

def render_patchset(self, patchset, source_ref=None, target_ref=None):
    """
    Render every patch of a patchset and collect the results in a diffset.

    :param patchset: iterable of patches, each gets passed to
        `render_patch`
    :param source_ref: stored on the returned diffset as ``source_ref``
    :param target_ref: stored on the returned diffset as ``target_ref``
    :return: AttributeDict holding the rendered files and the summed up
        added/deleted line counts of all non binary patches
    """
    diffset = AttributeDict({
        'lines_added': 0,
        'lines_deleted': 0,
        'changed_files': 0,
        'files': [],
        'limited_diff': isinstance(patchset, LimitedDiffContainer),
        'repo_name': self.repo_name,
        'source_repo_name': self.source_repo_name,
        'source_ref': source_ref,
        'target_ref': target_ref,
    })

    for patch in patchset:
        filediff = self.render_patch(patch)
        filediff.diffset = diffset
        diffset.files.append(filediff)
        diffset.changed_files += 1

        stats = patch['stats']
        if stats['binary']:
            # binary patches do not contribute to the line counters
            continue
        diffset.lines_added += stats['added']
        diffset.lines_deleted += stats['deleted']

    return diffset

401

402

_lexer_cache = {}

402

_lexer_cache = {}

403

def _get_lexer_for_filename(self, filename, filenode=None):

403

def _get_lexer_for_filename(self, filename, filenode=None):

404

# cached because we might need to call it twice for source/target

404

# cached because we might need to call it twice for source/target

405

if filename not in self._lexer_cache:

405

if filename not in self._lexer_cache:

406

if filenode:

406

if filenode:

407

lexer = filenode.lexer

407

lexer = filenode.lexer

408

else:

408

else:

409

lexer = get_lexer~~_safe~~(file~~path~~=filename)

409

lexer = FileNode.get_lexer(filename=filename)

410

self._lexer_cache[filename] = lexer

410

self._lexer_cache[filename] = lexer

411

return self._lexer_cache[filename]

411

return self._lexer_cache[filename]

412

413

def render_patch(self, patch):

413

def render_patch(self, patch):

414

log.debug('rendering diff for %r' % patch['filename'])

414

log.debug('rendering diff for %r' % patch['filename'])

415

416

source_filename = patch['original_filename']

416

source_filename = patch['original_filename']

417

target_filename = patch['filename']

417

target_filename = patch['filename']

418

419

source_lexer = plain_text_lexer

419

source_lexer = plain_text_lexer

420

target_lexer = plain_text_lexer

420

target_lexer = plain_text_lexer

421

422

if not patch['stats']['binary']:

422

if not patch['stats']['binary']:

423

if self.highlight_mode == self.HL_REAL:

423

if self.highlight_mode == self.HL_REAL:

424

if (source_filename and patch['operation'] in ('D', 'M')

424

if (source_filename and patch['operation'] in ('D', 'M')

425

and source_filename not in self.source_nodes):

425

and source_filename not in self.source_nodes):

426

self.source_nodes[source_filename] = (

426

self.source_nodes[source_filename] = (

427

self.source_node_getter(source_filename))

427

self.source_node_getter(source_filename))

428

429

if (target_filename and patch['operation'] in ('A', 'M')

429

if (target_filename and patch['operation'] in ('A', 'M')

430

and target_filename not in self.target_nodes):

430

and target_filename not in self.target_nodes):

431

self.target_nodes[target_filename] = (

431

self.target_nodes[target_filename] = (

432

self.target_node_getter(target_filename))

432

self.target_node_getter(target_filename))

433

434

elif self.highlight_mode == self.HL_FAST:

434

elif self.highlight_mode == self.HL_FAST:

435

source_lexer = self._get_lexer_for_filename(source_filename)

435

source_lexer = self._get_lexer_for_filename(source_filename)

436

target_lexer = self._get_lexer_for_filename(target_filename)

436

target_lexer = self._get_lexer_for_filename(target_filename)

437

438

source_file = self.source_nodes.get(source_filename, source_filename)

438

source_file = self.source_nodes.get(source_filename, source_filename)

439

target_file = self.target_nodes.get(target_filename, target_filename)

439

target_file = self.target_nodes.get(target_filename, target_filename)

440

441

source_filenode, target_filenode = None, None

441

source_filenode, target_filenode = None, None

442

443

# TODO: dan: FileNode.lexer works on the content of the file - which

443

# TODO: dan: FileNode.lexer works on the content of the file - which

444

# can be slow - issue #4289 explains a lexer clean up - which once

444

# can be slow - issue #4289 explains a lexer clean up - which once

445

# done can allow caching a lexer for a filenode to avoid the file lookup

445

# done can allow caching a lexer for a filenode to avoid the file lookup

446

if isinstance(source_file, FileNode):

446

if isinstance(source_file, FileNode):

447

source_filenode = source_file

447

source_filenode = source_file

448

#source_lexer = source_file.lexer

448

#source_lexer = source_file.lexer

449

source_lexer = self._get_lexer_for_filename(source_filename)

449

source_lexer = self._get_lexer_for_filename(source_filename)

450

source_file.lexer = source_lexer

450

source_file.lexer = source_lexer

451

452

if isinstance(target_file, FileNode):

452

if isinstance(target_file, FileNode):

453

target_filenode = target_file

453

target_filenode = target_file

454

#target_lexer = target_file.lexer

454

#target_lexer = target_file.lexer

455

target_lexer = self._get_lexer_for_filename(target_filename)

455

target_lexer = self._get_lexer_for_filename(target_filename)

456

target_file.lexer = target_lexer

456

target_file.lexer = target_lexer

457

458

source_file_path, target_file_path = None, None

458

source_file_path, target_file_path = None, None

459

460

if source_filename != '/dev/null':

460

if source_filename != '/dev/null':

461

source_file_path = source_filename

461

source_file_path = source_filename

462

if target_filename != '/dev/null':

462

if target_filename != '/dev/null':

463

target_file_path = target_filename

463

target_file_path = target_filename

464

465

source_file_type = source_lexer.name

465

source_file_type = source_lexer.name

466

target_file_type = target_lexer.name

466

target_file_type = target_lexer.name

467

468

op_hunks = patch['chunks'][0]

468

op_hunks = patch['chunks'][0]

469

hunks = patch['chunks'][1:]

469

hunks = patch['chunks'][1:]

470

471

filediff = AttributeDict({

471

filediff = AttributeDict({

472

'source_file_path': source_file_path,

472

'source_file_path': source_file_path,

473

'target_file_path': target_file_path,

473

'target_file_path': target_file_path,

474

'source_filenode': source_filenode,

474

'source_filenode': source_filenode,

475

'target_filenode': target_filenode,

475

'target_filenode': target_filenode,

476

'hunks': [],

476

'hunks': [],

477

'source_file_type': target_file_type,

477

'source_file_type': target_file_type,

478

'target_file_type': source_file_type,

478

'target_file_type': source_file_type,

479

'patch': patch,

479

'patch': patch,

480

'source_mode': patch['stats']['old_mode'],

480

'source_mode': patch['stats']['old_mode'],

481

'target_mode': patch['stats']['new_mode'],

481

'target_mode': patch['stats']['new_mode'],

482

'limited_diff': isinstance(patch, LimitedDiffContainer),

482

'limited_diff': isinstance(patch, LimitedDiffContainer),

483

'diffset': self,

483

'diffset': self,

484

})

484

})

485

486

for hunk in hunks:

486

for hunk in hunks:

487

hunkbit = self.parse_hunk(hunk, source_file, target_file)

487

hunkbit = self.parse_hunk(hunk, source_file, target_file)

488

hunkbit.filediff = filediff

488

hunkbit.filediff = filediff

489

filediff.hunks.append(hunkbit)

489

filediff.hunks.append(hunkbit)

490

491

left_comments = {}

491

left_comments = {}

492

493

if source_file_path in self.comments_store:

493

if source_file_path in self.comments_store:

494

for lineno, comments in self.comments_store[source_file_path].items():

494

for lineno, comments in self.comments_store[source_file_path].items():

495

left_comments[lineno] = comments

495

left_comments[lineno] = comments

496

497

if target_file_path in self.comments_store:

497

if target_file_path in self.comments_store:

498

for lineno, comments in self.comments_store[target_file_path].items():

498

for lineno, comments in self.comments_store[target_file_path].items():

499

left_comments[lineno] = comments

499

left_comments[lineno] = comments

500

501

filediff.left_comments = left_comments

501

filediff.left_comments = left_comments

502

return filediff

502

return filediff

503

504

def parse_hunk(self, hunk, source_file, target_file):
    """
    Convert one parsed diff hunk into an AttributeDict ready for rendering.

    :param hunk: mapping with the keys `source_start`, `source_length`,
        `target_start`, `target_length`, `section_header` and `lines`;
        each entry of `lines` is a mapping with an `action` key
    :param source_file: handed unchanged to `parse_lines` for old lines
    :param target_file: handed unchanged to `parse_lines` for new lines
    :return: AttributeDict holding the hunk header values plus `lines`
        (rows produced by `parse_lines`), `unified` (the result of
        `as_unified` over those rows) and `sideside` (the same list object
        as `lines`)
    """
    header_keys = (
        'source_start', 'source_length',
        'target_start', 'target_length',
        'section_header',
    )
    header = dict((key, hunk[key]) for key in header_keys)
    header['lines'] = []
    parsed = AttributeDict(header)

    old_side, new_side = [], []
    for entry in hunk['lines']:
        action = entry['action']
        if action == 'unmod':
            # a context line ends the current run of changes: pair up what
            # was collected so far (parse_lines empties both buffers), then
            # queue the context line on both sides
            parsed.lines.extend(
                self.parse_lines(old_side, new_side, source_file, target_file))
            new_side.append(entry)
            old_side.append(entry)
        elif action in ('add', 'new-no-nl'):
            new_side.append(entry)
        elif action in ('del', 'old-no-nl'):
            old_side.append(entry)

    # pair up whatever is still buffered after the last line of the hunk
    parsed.lines.extend(
        self.parse_lines(old_side, new_side, source_file, target_file))
    parsed.unified = self.as_unified(parsed.lines)
    parsed.sideside = parsed.lines
    return parsed

536

537

def parse_lines(self, before_lines, after_lines, source_file, target_file):
    """
    Pair buffered old and new diff lines into side-by-side rows.

    Lines are taken from the front of `before_lines` and `after_lines` in
    lockstep until both lists are empty, so both input lists are consumed
    by this call.

    :param before_lines: list of old-side line mappings; the keys read are
        `action`, `line` and `old_lineno`
    :param after_lines: list of new-side line mappings; the keys read are
        `action`, `line` and `new_lineno`
    :param source_file: passed to `get_line_tokens` and `get_comments_for`
        for old-side lines
    :param target_file: passed to `get_line_tokens` and `get_comments_for`
        for new-side lines
    :return: list of AttributeDicts with an `original` and a `modified`
        entry; a side that had no line is left as an empty AttributeDict
    """
    # TODO: dan: investigate doing the diff comparison and fast highlighting
    # on the entire before and after buffered block lines rather than by
    # line, this means we can get better 'fast' highlighting if the context
    # allows it - eg.
    # line 4: """
    # line 5: this gets highlighted as a string
    # line 6: """

    rows = []
    while before_lines or after_lines:
        old_line = before_lines.pop(0) if before_lines else None
        new_line = after_lines.pop(0) if after_lines else None
        old_tokens = new_tokens = None

        original = AttributeDict()
        modified = AttributeDict()

        if old_line:
            if old_line['action'] != 'old-no-nl':
                old_tokens = self.get_line_tokens(
                    line_text=old_line['line'],
                    line_number=old_line['old_lineno'],
                    file=source_file)
            else:
                # "no newline at end of file" markers are not lexed
                old_tokens = [('nonl', old_line['line'])]
            original.lineno = old_line['old_lineno']
            original.content = old_line['line']
            original.action = self.action_to_op(old_line['action'])
            original.comments = self.get_comments_for(
                'old', source_file, old_line['old_lineno'])

        if new_line:
            if new_line['action'] != 'new-no-nl':
                new_tokens = self.get_line_tokens(
                    line_text=new_line['line'],
                    line_number=new_line['new_lineno'],
                    file=target_file)
            else:
                new_tokens = [('nonl', new_line['line'])]
            modified.lineno = new_line['new_lineno']
            modified.content = new_line['line']
            modified.action = self.action_to_op(new_line['action'])
            modified.comments = self.get_comments_for(
                'new', target_file, new_line['new_lineno'])

        # diff the lines; the raw text stored above is replaced by rendered
        # html whenever tokens are available for that side
        if old_tokens and new_tokens:
            o_tokens, m_tokens, similarity = tokens_diff(
                old_tokens, new_tokens)
            original.content = render_tokenstream(o_tokens)
            modified.content = render_tokenstream(m_tokens)
        elif old_tokens:
            original.content = render_tokenstream(
                [(x[0], '', x[1]) for x in old_tokens])
        elif new_tokens:
            modified.content = render_tokenstream(
                [(x[0], '', x[1]) for x in new_tokens])

        rows.append(AttributeDict({
            'original': original,
            'modified': modified,
        }))

    return rows

604

605

def get_comments_for(self, version, file, line_number):
    """
    Take the comments stored for one line of a file out of the store.

    The entry is popped from `self.comments_store`, so a second call for
    the same line returns None.

    :param version: 'old' or 'new'; selects the 'o' / 'n' line key prefix
    :param file: a path string, or an object with a `unicode_path`
        attribute which is then used as the path
    :param line_number: line number appended to the prefix to form the key
    :return: the stored comments, or None when the path is not a string or
        nothing is stored for that file and line
    """
    path = file.unicode_path if hasattr(file, 'unicode_path') else file
    if not isinstance(path, basestring):
        return None

    prefix = {
        'old': 'o',
        'new': 'n',
    }[version]
    key = prefix + str(line_number)

    if path not in self.comments_store:
        return None
    comments_of_file = self.comments_store[path]
    if key not in comments_of_file:
        return None
    return comments_of_file.pop(key)

621

622

def get_line_tokens(self, line_text, line_number, file=None):
    """
    Tokenize a single diff line according to the highlight mode.

    - HL_REAL with a FileNode: when `line_number` is truthy and the node's
      size is below `self.max_file_size_limit`, the line is taken from the
      tokenized whole file via `get_tokenized_filenode_line`
    - otherwise, HL_REAL or HL_FAST with a known filename: only `line_text`
      is tokenized, with the lexer looked up for that filename
    - otherwise `line_text` is tokenized with `plain_text_lexer`

    :param line_text: text of the line
    :param line_number: number of the line inside `file`
    :param file: a filename string, a FileNode, or None
    :return: list of tokens for the line
    """
    node, name = None, None
    if isinstance(file, basestring):
        name = file
    elif isinstance(file, FileNode):
        node, name = file, file.unicode_path

    mode = self.highlight_mode
    if mode == self.HL_REAL and node:
        lexer = self._get_lexer_for_filename(name)
        small_enough = file.size < self.max_file_size_limit
        if line_number and small_enough:
            return self.get_tokenized_filenode_line(
                file, line_number, lexer)

    if mode in (self.HL_REAL, self.HL_FAST) and name:
        line_lexer = self._get_lexer_for_filename(name)
    else:
        line_lexer = plain_text_lexer
    return list(tokenize_string(line_text, line_lexer))

644

645

def get_tokenized_filenode_line(self, filenode, line_number, lexer=None):
    """
    Return the tokens of one line of a file node.

    The node is tokenized with `filenode_as_lines_tokens` the first time it
    is requested and the result is kept in `self.highlighted_filenodes`,
    keyed by the node.

    :param filenode: node to read the line from (used as the cache key)
    :param line_number: 1-based line number
    :param lexer: lexer handed to `filenode_as_lines_tokens` on a cache miss
    """
    cache = self.highlighted_filenodes
    if filenode not in cache:
        cache[filenode] = filenode_as_lines_tokens(filenode, lexer)
    return cache[filenode][line_number - 1]

651

652

def action_to_op(self, action):
    """
    Translate a diff line action into its one-character marker.

    'add' maps to '+', 'del' to '-', and 'unmod', 'old-no-nl' and
    'new-no-nl' map to a single space. Any other value is returned
    unchanged.
    """
    markers = {
        'add': '+',
        'del': '-',
        'unmod': ' ',
        'old-no-nl': ' ',
        'new-no-nl': ' ',
    }
    try:
        return markers[action]
    except KeyError:
        return action

660

661

def as_unified(self, lines):
    """
    Return a generator that yields the lines of a diff in unified order

    Every yielded item is a tuple of
    (old_lineno, new_lineno, action, content, comments).

    :param lines: iterable of side-by-side rows, each exposing `original`
        and `modified` entries with `lineno`, `action`, `content` and
        `comments`
    """
    # additions that sit next to a deletion are held back so they come out
    # after the run of deletions instead of interleaved with it
    held_additions = []
    for row in lines:
        old, new = row.original, row.modified

        # a context row, or an addition-only row, ends the held-back run
        if (held_additions and not old) or old.action == ' ':
            for held in held_additions:
                yield held
            held_additions = []

        if old:
            if old.action == ' ':
                yield (old.lineno, new.lineno,
                       old.action, old.content, old.comments)
                continue

            if old.action == '-':
                yield (old.lineno, None,
                       old.action, old.content, old.comments)

            if new.action == '+':
                held_additions.append(
                    (None, new.lineno,
                     new.action, new.content, new.comments))
                continue

        if new:
            yield (None, new.lineno,
                   new.action, new.content, new.comments)

    for held in held_additions:
        yield held

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

             # -*- coding: utf-8 -*-
             # Copyright (C) 2011-2017 RhodeCode GmbH
             #
             # This program is free software: you can redistribute it and/or modify
             # it under the terms of the GNU Affero General Public License, version 3
             # (only), as published by the Free Software Foundation.
             #
             # This program is distributed in the hope that it will be useful,
             # but WITHOUT ANY WARRANTY; without even the implied warranty of
             # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
             # GNU General Public License for more details.
             #
             # You should have received a copy of the GNU Affero General Public License
             # along with this program.  If not, see <http://www.gnu.org/licenses/>.
             #
             # This program is dual-licensed. If you wish to learn more about the
             # RhodeCode Enterprise Edition, including its added features, Support services,
             # and proprietary license terms, please see https://rhodecode.com/licenses/
             import logging
             import difflib
             from itertools import groupby
             from pygments import lex
             from pygments.formatters.html import _get_ttype_class as pygment_token_class
             from rhodecode.lib.helpers import (
-                get_lexer_for_filenode, get_lexer_safe, html_escape)
+                get_lexer_for_filenode, html_escape)
             from rhodecode.lib.utils2 import AttributeDict
             from rhodecode.lib.vcs.nodes import FileNode
             from rhodecode.lib.diff_match_patch import diff_match_patch
             from rhodecode.lib.diffs import LimitedDiffContainer
             from pygments.lexers import get_lexer_by_name
             # Fallback lexer: pygments' 'text' lexer, created with stripping and
             # newline normalisation switched off.
             plain_text_lexer = get_lexer_by_name(
                 'text', stripall=False, stripnl=False, ensurenl=False)
             # Module-scoped logger instead of the root logger, so records carry this
             # module's name and can be configured per module.
             log = logging.getLogger(__name__)
             def filenode_as_lines_tokens(filenode, lexer=None):
                 """
                 Tokenize the content of a file node and group the tokens per line.

                 :param filenode: node whose `content` is tokenized
                 :param lexer: lexer to use; when falsy the lexer is looked up with
                     `get_lexer_for_filenode`
                 :return: list with one list of tokens for every line of the content
                 """
                 requested_lexer = lexer
                 if not lexer:
                     lexer = get_lexer_for_filenode(filenode)
                 log.debug('Generating file node pygment tokens for %s, %s, org_lexer:%s',
                           lexer, filenode, requested_lexer)
                 token_stream = tokenize_string(filenode.content, lexer)
                 return list(split_token_stream(token_stream, split_string='\n'))
             def tokenize_string(content, lexer):
                 """
                 Lex `content` with pygments and yield (token class, token text) pairs.

                 The `stripall`, `stripnl` and `ensurenl` options are set to False on
                 the given lexer object so that all original new lines and whitespace
                 are preserved.
                 """
                 for option in ('stripall', 'stripnl', 'ensurenl'):
                     setattr(lexer, option, False)
                 for token_type, token_text in lex(content, lexer):
                     yield pygment_token_class(token_type), token_text
             def split_token_stream(tokens, split_string=u'\n'):
                 """
                 Take an iterable of (TokenType, text) tuples and yield one list of
                 tokens per line, splitting token texts on `split_string`.

                 A token that spans several lines is cut into one token per line, and
                 text that ends with `split_string` produces a final line holding an
                 empty-text token.

                 >>> list(split_token_stream([(TEXT, 'some\\ntext'), (TEXT, 'more\\n')]))
                 [[(TEXT, 'some')],
                  [(TEXT, 'text'), (TEXT, 'more')],
                  [(TEXT, '')]]
                 """
                 line_tokens = []
                 for token_class, token_text in tokens:
                     pieces = token_text.split(split_string)
                     unfinished = pieces.pop()
                     # every piece before the last one closes the line it is on
                     for piece in pieces:
                         line_tokens.append((token_class, piece))
                         yield line_tokens
                         line_tokens = []
                     line_tokens.append((token_class, unfinished))
                 if line_tokens:
                     yield line_tokens
             def filenode_as_annotated_lines_tokens(filenode):
                 """
                 Take a file node and yield (annotation, lines) pairs, where lines is a
                 list of (line_no, line_tokens) for a run of consecutive lines sharing
                 the same annotation. If no annotation is found for a line, it will be
                 None.

                 eg:

                 [
                     (annotation1, [
                         (1, line1_tokens_list),
                         (2, line2_tokens_list),
                     ]),
                     (annotation2, [
                         (3, line1_tokens_list),
                     ]),
                     (None, [
                         (4, line1_tokens_list),
                     ]),
                     (annotation1, [
                         (5, line1_tokens_list),
                         (6, line2_tokens_list),
                     ])
                 ]
                 """
                 # each commit_getter is called at most once per commit id
                 resolved_commits = {}
                 annotation_by_line = {}
                 for line_no, commit_id, commit_getter, line_content in filenode.annotate:
                     if commit_id not in resolved_commits:
                         resolved_commits[commit_id] = commit_getter()
                     annotation_by_line[line_no] = resolved_commits[commit_id]

                 numbered_lines = enumerate(filenode_as_lines_tokens(filenode), 1)
                 tagged_lines = (
                     (annotation_by_line.get(line_no), line_no, tokens)
                     for line_no, tokens in numbered_lines)

                 for annotation, members in groupby(tagged_lines, lambda item: item[0]):
                     yield (
                         annotation,
                         [(line_no, tokens) for (_, line_no, tokens) in members]
                     )
             def render_tokenstream(tokenstream):
                 """
                 Render a token stream to an html string.

                 The stream is grouped with `rollup_tokenstream`; every class group
                 becomes one <span> (carrying a class attribute when the class is
                 truthy) and every text with a truthy op is wrapped in a tag named
                 after the op. Token text is passed through `html_escape`.
                 """
                 pieces = []
                 emit = pieces.append
                 for token_class, token_ops_texts in rollup_tokenstream(tokenstream):
                     emit((u'<span class="%s">' % token_class)
                          if token_class else u'<span>')
                     for op_tag, token_text in token_ops_texts:
                         if op_tag:
                             emit(u'<%s>' % op_tag)
                         # TODO: dan: investigate showing hidden characters like
                         # space/nl/tab by wrapping them in dedicated tags
                         emit(html_escape(token_text))
                         if op_tag:
                             emit(u'</%s>' % op_tag)
                     emit(u'</span>')
                 return ''.join(pieces)
             def rollup_tokenstream(tokenstream):
                 """
                 Group a token stream of the format:

                     ('class', 'op', 'text')

                 or

                     ('class', 'text')

                 into

                     [('class1',
                         [('op1', 'text'),
                          ('op2', 'text')]),
                      ('class2',
                         [('op3', 'text')])]

                 This is used to get the minimal tags necessary when
                 rendering to html eg for a token stream ie.

                 <span class="A"><ins>he</ins>llo</span>
                 vs
                 <span class="A"><ins>he</ins></span><span class="A">llo</span>

                 If a 2 tuple is passed in, the output op will be an empty string.
                 The tuple format is detected from the first token only.

                 :param tokenstream: any iterable of tokens (list, tuple, generator)
                 :return: list of (token_class, [(op, text), ...]) tuples

                 eg:

                 >>> rollup_tokenstream([('classA', '',      'h'),
                 ...                     ('classA', 'del',   'ell'),
                 ...                     ('classA', '',      'o'),
                 ...                     ('classB', '',      ' '),
                 ...                     ('classA', '',      'the'),
                 ...                     ('classA', '',      're'),
                 ...                     ])
                 [('classA', [('', 'h'), ('del', 'ell'), ('', 'o')]),
                  ('classB', [('', ' ')]),
                  ('classA', [('', 'there')])]
                 """
                 # materialise once so the first token can be inspected even when the
                 # caller passes a generator or another non-subscriptable iterable
                 tokens = list(tokenstream)
                 if tokens and len(tokens[0]) == 2:
                     tokens = [(t[0], '', t[1]) for t in tokens]

                 result = []
                 for token_class, op_list in groupby(tokens, lambda t: t[0]):
                     ops = []
                     for token_op, same_op_tokens in groupby(op_list, lambda o: o[1]):
                         # consecutive texts with the same class and op are merged
                         merged_text = ''.join(
                             t_text for t_class, t_op, t_text in same_op_tokens)
                         ops.append((token_op, merged_text))
                     result.append((token_class, ops))
                 return result
             def tokens_diff(old_tokens, new_tokens, use_diff_match_patch=True):
                 """
                 Converts a list of (token_class, token_text) tuples to a list of
                 (token_class, token_op, token_text) tuples where token_op is one of
                 ('ins', 'del', '')

                 Lines whose joined text has a difflib similarity ratio below 0.6 are
                 treated as too different and come back with every op set to ''.

                 :param old_tokens: list of (token_class, token_text) tuples of old line
                 :param new_tokens: list of (token_class, token_text) tuples of new line
                 :param use_diff_match_patch: boolean, will use google's diff match patch
                     library which has options to 'smooth' out the character by character
                     differences making nicer ins/del blocks
                 :return: tuple of (old_tokens_result, new_tokens_result, similarity)
                 """
                 old_tokens_result = []
                 new_tokens_result = []

                 similarity = difflib.SequenceMatcher(
                     None,
                     ''.join(token_text for token_class, token_text in old_tokens),
                     ''.join(token_text for token_class, token_text in new_tokens)
                 ).ratio()

                 if similarity < 0.6:  # return, the blocks are too different
                     old_tokens_result.extend(
                         (token_class, '', token_text)
                         for token_class, token_text in old_tokens)
                     new_tokens_result.extend(
                         (token_class, '', token_text)
                         for token_class, token_text in new_tokens)
                     return old_tokens_result, new_tokens_result, similarity

                 token_sequence_matcher = difflib.SequenceMatcher(
                     None,
                     [x[1] for x in old_tokens],
                     [x[1] for x in new_tokens])

                 for tag, o1, o2, n1, n2 in token_sequence_matcher.get_opcodes():
                     # check the differences by token block types first to give a more
                     # nicer "block" level replacement vs character diffs
                     if tag == 'equal':
                         old_tokens_result.extend(
                             (token_class, '', token_text)
                             for token_class, token_text in old_tokens[o1:o2])
                         new_tokens_result.extend(
                             (token_class, '', token_text)
                             for token_class, token_text in new_tokens[n1:n2])
                     elif tag == 'delete':
                         old_tokens_result.extend(
                             (token_class, 'del', token_text)
                             for token_class, token_text in old_tokens[o1:o2])
                     elif tag == 'insert':
                         new_tokens_result.extend(
                             (token_class, 'ins', token_text)
                             for token_class, token_text in new_tokens[n1:n2])
                     elif tag == 'replace':
                         # if same type token blocks must be replaced, do a diff on the
                         # characters in the token blocks to show individual changes
                         old_char_tokens = [
                             (token_class, char)
                             for token_class, token_text in old_tokens[o1:o2]
                             for char in token_text]
                         new_char_tokens = [
                             (token_class, char)
                             for token_class, token_text in new_tokens[n1:n2]
                             for char in token_text]

                         old_string = ''.join(
                             char for token_class, char in old_char_tokens)
                         new_string = ''.join(
                             char for token_class, char in new_char_tokens)

                         obuffer, nbuffer = [], []
                         if use_diff_match_patch:
                             dmp = diff_match_patch()
                             dmp.Diff_EditCost = 11  # TODO: dan: extract this to a setting
                             reps = dmp.diff_main(old_string, new_string)
                             dmp.diff_cleanupEfficiency(reps)

                             # a / b track the position reached in the old / new
                             # character lists
                             a, b = 0, 0
                             for op, rep in reps:
                                 rep_len = len(rep)
                                 if op == 0:
                                     for i, c in enumerate(rep):
                                         obuffer.append((old_char_tokens[a+i][0], '', c))
                                         nbuffer.append((new_char_tokens[b+i][0], '', c))
                                     a += rep_len
                                     b += rep_len
                                 elif op == -1:
                                     for i, c in enumerate(rep):
                                         obuffer.append((old_char_tokens[a+i][0], 'del', c))
                                     a += rep_len
                                 elif op == 1:
                                     for i, c in enumerate(rep):
                                         nbuffer.append((new_char_tokens[b+i][0], 'ins', c))
                                     b += rep_len
                         else:
                             # the character level difflib opcodes are only needed on
                             # this path, so they are not computed when
                             # diff_match_patch does the work
                             char_sequence = difflib.SequenceMatcher(
                                 None, old_string, new_string)
                             for ctag, co1, co2, cn1, cn2 in char_sequence.get_opcodes():
                                 if ctag == 'equal':
                                     old_op, new_op = '', ''
                                 else:
                                     # 'delete' has an empty new range and 'insert' an
                                     # empty old range, so marking both sides covers
                                     # 'delete', 'insert' and 'replace' alike
                                     old_op, new_op = 'del', 'ins'
                                 obuffer.extend(
                                     (token_class, old_op, char)
                                     for token_class, char in old_char_tokens[co1:co2])
                                 nbuffer.extend(
                                     (token_class, new_op, char)
                                     for token_class, char in new_char_tokens[cn1:cn2])

                         old_tokens_result.extend(obuffer)
                         new_tokens_result.extend(nbuffer)

                 return old_tokens_result, new_tokens_result, similarity
             class DiffSet(object):
                 """
                 An object for parsing the diff result from diffs.DiffProcessor and
                 adding highlighting, side by side/unified renderings and line diffs.

                 The ``HL_*`` constants below are the accepted values for the
                 ``highlight_mode`` constructor argument.
                 """
                 # highlights using the original file content; slow
                 HL_REAL = 'REAL'
                 # highlights using just the single diff line; fast, but not correct
                 # in the case of multiline code constructs
                 HL_FAST = 'FAST'
                 # no highlighting at all; fastest
                 HL_NONE = 'NONE'
                 def __init__(self, highlight_mode=HL_REAL, repo_name=None,
                              source_repo_name=None,
                              source_node_getter=lambda filename: None,
                              target_node_getter=lambda filename: None,
                              source_nodes=None, target_nodes=None,
                              max_file_size_limit=150 * 1024, # files over this size will
                                                              # use fast highlighting
                              comments=None,
                              ):
                     self.highlight_mode = highlight_mode
                     self.highlighted_filenodes = {}
                     self.source_node_getter = source_node_getter
                     self.target_node_getter = target_node_getter
                     self.source_nodes = source_nodes or {}
                     self.target_nodes = target_nodes or {}
                     self.repo_name = repo_name
                     self.source_repo_name = source_repo_name or repo_name
                     self.comments = comments or {}
                     self.comments_store = self.comments.copy()
                     self.max_file_size_limit = max_file_size_limit
                 def render_patchset(self, patchset, source_ref=None, target_ref=None):
                     """
                     Render every patch of ``patchset`` and collect the results.

                     :param patchset: iterable of parsed patches
                     :param source_ref: stored on the result as ``source_ref``
                     :param target_ref: stored on the result as ``target_ref``
                     :returns: AttributeDict holding the rendered ``files`` plus
                         aggregated ``lines_added``, ``lines_deleted`` and
                         ``changed_files`` counters
                     """
                     diffset = AttributeDict({
                         'lines_added': 0,
                         'lines_deleted': 0,
                         'changed_files': 0,
                         'files': [],
                         'limited_diff': isinstance(patchset, LimitedDiffContainer),
                         'repo_name': self.repo_name,
                         'source_repo_name': self.source_repo_name,
                         'source_ref': source_ref,
                         'target_ref': target_ref,
                     })

                     for patch in patchset:
                         rendered = self.render_patch(patch)
                         rendered.diffset = diffset
                         diffset.files.append(rendered)
                         diffset.changed_files += 1

                         stats = patch['stats']
                         if stats['binary']:
                             # binary patches do not contribute to line counters
                             continue
                         diffset.lines_added += stats['added']
                         diffset.lines_deleted += stats['deleted']

                     return diffset
                 _lexer_cache = {}
                 def _get_lexer_for_filename(self, filename, filenode=None):
                     # cached because we might need to call it twice for source/target
                     if filename not in self._lexer_cache:
                         if filenode:
                             lexer = filenode.lexer
                         else:
-                            lexer = get_lexer_safe(filepath=filename)
+                            lexer = FileNode.get_lexer(filename=filename)
                         self._lexer_cache[filename] = lexer
                     return self._lexer_cache[filename]
                 def render_patch(self, patch):
                     """
                     Build the renderable file diff for a single parsed patch.

                     :param patch: mapping providing the 'filename',
                         'original_filename', 'operation', 'stats' and 'chunks' keys
                     :returns: AttributeDict with source/target paths, filenodes,
                         file types and modes, the parsed ``hunks`` and
                         ``left_comments`` (comments still present in
                         ``comments_store`` for this file after hunk parsing)
                     """
                     log.debug('rendering diff for %r', patch['filename'])

                     source_filename = patch['original_filename']
                     target_filename = patch['filename']

                     source_lexer = plain_text_lexer
                     target_lexer = plain_text_lexer

                     if not patch['stats']['binary']:
                         if self.highlight_mode == self.HL_REAL:
                             # lazily fetch the nodes that were not passed in up front
                             if (source_filename and patch['operation'] in ('D', 'M')
                                 and source_filename not in self.source_nodes):
                                     self.source_nodes[source_filename] = (
                                         self.source_node_getter(source_filename))

                             if (target_filename and patch['operation'] in ('A', 'M')
                                 and target_filename not in self.target_nodes):
                                     self.target_nodes[target_filename] = (
                                         self.target_node_getter(target_filename))

                         elif self.highlight_mode == self.HL_FAST:
                             source_lexer = self._get_lexer_for_filename(source_filename)
                             target_lexer = self._get_lexer_for_filename(target_filename)

                     # fall back to the plain filename when no node is known
                     source_file = self.source_nodes.get(source_filename, source_filename)
                     target_file = self.target_nodes.get(target_filename, target_filename)

                     source_filenode, target_filenode = None, None

                     # TODO: dan: FileNode.lexer works on the content of the file - which
                     # can be slow - issue #4289 explains a lexer clean up - which once
                     # done can allow caching a lexer for a filenode to avoid the file lookup
                     if isinstance(source_file, FileNode):
                         source_filenode = source_file
                         source_lexer = self._get_lexer_for_filename(source_filename)
                         source_file.lexer = source_lexer

                     if isinstance(target_file, FileNode):
                         target_filenode = target_file
                         target_lexer = self._get_lexer_for_filename(target_filename)
                         target_file.lexer = target_lexer

                     source_file_path, target_file_path = None, None

                     if source_filename != '/dev/null':
                         source_file_path = source_filename
                     if target_filename != '/dev/null':
                         target_file_path = target_filename

                     source_file_type = source_lexer.name
                     target_file_type = target_lexer.name

                     # the first chunk is not rendered as a hunk; presumably it holds
                     # file operation metadata - verify against DiffProcessor
                     hunks = patch['chunks'][1:]

                     filediff = AttributeDict({
                         'source_file_path': source_file_path,
                         'target_file_path': target_file_path,
                         'source_filenode': source_filenode,
                         'target_filenode': target_filenode,
                         'hunks': [],
                         # each side reports the type of its own lexer
                         'source_file_type': source_file_type,
                         'target_file_type': target_file_type,
                         'patch': patch,
                         'source_mode': patch['stats']['old_mode'],
                         'target_mode': patch['stats']['new_mode'],
                         'limited_diff': isinstance(patch, LimitedDiffContainer),
                         'diffset': self,
                     })

                     for hunk in hunks:
                         hunkbit = self.parse_hunk(hunk, source_file, target_file)
                         hunkbit.filediff = filediff
                         filediff.hunks.append(hunkbit)

                     # get_comments_for() pops every comment it attaches to a line, so
                     # whatever is still stored for this file was not matched to a line
                     left_comments = {}
                     for file_path in (source_file_path, target_file_path):
                         if file_path in self.comments_store:
                             left_comments.update(self.comments_store[file_path])
                     filediff.left_comments = left_comments

                     return filediff
                 def parse_hunk(self, hunk, source_file, target_file):
                     """
                     Turn one raw hunk into a renderable hunk.

                     :param hunk: mapping with the hunk position keys,
                         'section_header' and the raw 'lines'
                     :param source_file: passed through to ``parse_lines``
                     :param target_file: passed through to ``parse_lines``
                     :returns: AttributeDict carrying the hunk header values, the
                         parsed ``lines``, and the ``unified`` / ``sideside`` views
                     """
                     parsed = AttributeDict({
                         'source_start': hunk['source_start'],
                         'source_length': hunk['source_length'],
                         'target_start': hunk['target_start'],
                         'target_length': hunk['target_length'],
                         'section_header': hunk['section_header'],
                         'lines': [],
                     })

                     # raw lines waiting to be paired up, per side
                     old_side, new_side = [], []

                     for entry in hunk['lines']:
                         kind = entry['action']
                         if kind == 'unmod':
                             # flush the buffered lines (parse_lines drains both
                             # lists), then queue the context line on both sides
                             parsed.lines.extend(
                                 self.parse_lines(
                                     old_side, new_side, source_file, target_file))
                             new_side.append(entry)
                             old_side.append(entry)
                         elif kind in ('add', 'new-no-nl'):
                             new_side.append(entry)
                         elif kind in ('del', 'old-no-nl'):
                             old_side.append(entry)

                     # whatever is still buffered at the end of the hunk
                     parsed.lines.extend(
                         self.parse_lines(old_side, new_side, source_file, target_file))

                     parsed.unified = self.as_unified(parsed.lines)
                     parsed.sideside = parsed.lines
                     return parsed
                 def parse_lines(self, before_lines, after_lines, source_file, target_file):
                     """
                     Pair up buffered old/new raw lines and render their content.

                     Both input lists are consumed from the front until empty; the
                     n-th old line is paired with the n-th new line, and the longer
                     list yields rows with one empty side.

                     :returns: list of AttributeDicts with ``original`` and
                         ``modified`` entries (each an AttributeDict, empty when
                         that side has no line)
                     """
                     # TODO: dan: investigate doing the diff comparison and fast highlighting
                     # on the entire before and after buffered block lines rather than by
                     # line, this means we can get better 'fast' highlighting if the context
                     # allows it - eg.
                     # line 4: """
                     # line 5: this gets highlighted as a string
                     # line 6: """

                     def build_side(raw, nonl_action, lineno_key, version, node):
                         # returns (side AttributeDict, tokens or None) for one raw line
                         side = AttributeDict()
                         if not raw:
                             return side, None
                         if raw['action'] == nonl_action:
                             tokens = [('nonl', raw['line'])]
                         else:
                             tokens = self.get_line_tokens(
                                 line_text=raw['line'], line_number=raw[lineno_key],
                                 file=node)
                         side.lineno = raw[lineno_key]
                         side.content = raw['line']
                         side.action = self.action_to_op(raw['action'])
                         side.comments = self.get_comments_for(
                             version, node, raw[lineno_key])
                         return side, tokens

                     rows = []
                     while before_lines or after_lines:
                         old_raw = before_lines.pop(0) if before_lines else None
                         new_raw = after_lines.pop(0) if after_lines else None

                         original, old_tokens = build_side(
                             old_raw, 'old-no-nl', 'old_lineno', 'old', source_file)
                         modified, new_tokens = build_side(
                             new_raw, 'new-no-nl', 'new_lineno', 'new', target_file)

                         # diff the lines
                         if old_tokens and new_tokens:
                             old_marked, new_marked, _ = tokens_diff(
                                 old_tokens, new_tokens)
                             original.content = render_tokenstream(old_marked)
                             modified.content = render_tokenstream(new_marked)
                         elif old_tokens:
                             original.content = render_tokenstream(
                                 [(tok[0], '', tok[1]) for tok in old_tokens])
                         elif new_tokens:
                             modified.content = render_tokenstream(
                                 [(tok[0], '', tok[1]) for tok in new_tokens])

                         rows.append(AttributeDict({
                             'original': original,
                             'modified': modified,
                         }))

                     return rows
                 def get_comments_for(self, version, file, line_number):
                     """
                     Remove and return the comments stored for one line of a file.

                     :param version: 'old' or 'new', selects the 'o' / 'n' key prefix
                     :param file: a path string, or an object exposing
                         ``unicode_path``
                     :param line_number: line number appended to the key prefix
                     :returns: the stored comments, or None when ``file`` does not
                         resolve to a string or nothing is stored for that line
                     """
                     path = file.unicode_path if hasattr(file, 'unicode_path') else file
                     if not isinstance(path, basestring):
                         return None

                     prefix = {
                         'old': 'o',
                         'new': 'n',
                     }[version]
                     line_key = prefix + str(line_number)

                     if path not in self.comments_store:
                         return None
                     per_file = self.comments_store[path]
                     if line_key not in per_file:
                         return None
                     # popped so that each comment is handed out only once
                     return per_file.pop(line_key)
                 def get_line_tokens(self, line_text, line_number, file=None):
                     """
                     Tokenize a single diff line according to the highlight mode.

                     :param line_text: text of the line, used for per-line tokenizing
                     :param line_number: line number inside the file; needed to pick
                         the line from a fully tokenized file node
                     :param file: a filename string, a FileNode, or None
                     :returns: list of tokens for the line
                     """
                     filenode, filename = None, None
                     if isinstance(file, basestring):
                         filename = file
                     elif isinstance(file, FileNode):
                         filenode, filename = file, file.unicode_path

                     if self.highlight_mode == self.HL_REAL and filenode:
                         lexer = self._get_lexer_for_filename(filename)
                         small_enough = file.size < self.max_file_size_limit
                         if line_number and small_enough:
                             # highlight based on the whole file content
                             return self.get_tokenized_filenode_line(
                                 file, line_number, lexer)

                     # per-line highlighting, or plain text when nothing else applies
                     lexer = plain_text_lexer
                     if self.highlight_mode in (self.HL_REAL, self.HL_FAST) and filename:
                         lexer = self._get_lexer_for_filename(filename)
                     return list(tokenize_string(line_text, lexer))
                 def get_tokenized_filenode_line(self, filenode, line_number, lexer=None):
                     if filenode not in self.highlighted_filenodes:
                         tokenized_lines = filenode_as_lines_tokens(filenode, lexer)
                         self.highlighted_filenodes[filenode] = tokenized_lines
                     return self.highlighted_filenodes[filenode][line_number - 1]
                 def action_to_op(self, action):
                     """
                     Translate a diff line action into its one character operation.

                     Unknown actions are returned unchanged.
                     """
                     context_op = ' '
                     op_by_action = {
                         'add': '+',
                         'del': '-',
                         'unmod': context_op,
                         'old-no-nl': context_op,
                         'new-no-nl': context_op,
                     }
                     return op_by_action.get(action, action)
                 def as_unified(self, lines):
                     """
                     Return a generator that yields the lines of a diff in unified order

                     Each yielded item is a tuple of
                     ``(old_lineno, new_lineno, action, content, comments)``. Added
                     lines that share a row with a removed line are held back and
                     emitted after the run of removed lines.
                     """
                     pending_additions = []
                     for row in lines:
                         old = row.original
                         new = row.modified

                         # release held back additions once the run of removals ends
                         if (pending_additions and not old) or old.action == ' ':
                             for item in pending_additions:
                                 yield item
                             pending_additions = []

                         if old:
                             if old.action == ' ':
                                 yield (old.lineno, new.lineno,
                                        old.action, old.content,
                                        old.comments)
                                 continue

                             if old.action == '-':
                                 yield (old.lineno, None,
                                        old.action, old.content,
                                        old.comments)

                             if new.action == '+':
                                 pending_additions.append((
                                     None, new.lineno,
                                     new.action, new.content,
                                     new.comments))
                                 continue

                         if new:
                             yield (None, new.lineno,
                                    new.action, new.content,
                                    new.comments)

                     for item in pending_additions:
                         yield item