@@ -1,687 +1,701 @@
1 | # -*- coding: utf-8 -*- |
|
1 | # -*- coding: utf-8 -*- | |
2 |
|
2 | |||
3 | # Copyright (C) 2011-2017 RhodeCode GmbH |
|
3 | # Copyright (C) 2011-2017 RhodeCode GmbH | |
4 | # |
|
4 | # | |
5 | # This program is free software: you can redistribute it and/or modify |
|
5 | # This program is free software: you can redistribute it and/or modify | |
6 | # it under the terms of the GNU Affero General Public License, version 3 |
|
6 | # it under the terms of the GNU Affero General Public License, version 3 | |
7 | # (only), as published by the Free Software Foundation. |
|
7 | # (only), as published by the Free Software Foundation. | |
8 | # |
|
8 | # | |
9 | # This program is distributed in the hope that it will be useful, |
|
9 | # This program is distributed in the hope that it will be useful, | |
10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
12 | # GNU General Public License for more details. |
|
12 | # GNU General Public License for more details. | |
13 | # |
|
13 | # | |
14 | # You should have received a copy of the GNU Affero General Public License |
|
14 | # You should have received a copy of the GNU Affero General Public License | |
15 | # along with this program. If not, see <http://www.gnu.org/licenses/>. |
|
15 | # along with this program. If not, see <http://www.gnu.org/licenses/>. | |
16 | # |
|
16 | # | |
17 | # This program is dual-licensed. If you wish to learn more about the |
|
17 | # This program is dual-licensed. If you wish to learn more about the | |
18 | # RhodeCode Enterprise Edition, including its added features, Support services, |
|
18 | # RhodeCode Enterprise Edition, including its added features, Support services, | |
19 | # and proprietary license terms, please see https://rhodecode.com/licenses/ |
|
19 | # and proprietary license terms, please see https://rhodecode.com/licenses/ | |
20 |
|
20 | |||
21 | import logging |
|
21 | import logging | |
22 | import difflib |
|
22 | import difflib | |
23 | from itertools import groupby |
|
23 | from itertools import groupby | |
24 |
|
24 | |||
25 | from pygments import lex |
|
25 | from pygments import lex | |
26 | from pygments.formatters.html import _get_ttype_class as pygment_token_class |
|
26 | from pygments.formatters.html import _get_ttype_class as pygment_token_class | |
27 | from rhodecode.lib.helpers import ( |
|
27 | from rhodecode.lib.helpers import ( | |
28 | get_lexer_for_filenode, get_lexer_safe, html_escape) |
|
28 | get_lexer_for_filenode, get_lexer_safe, html_escape) | |
29 | from rhodecode.lib.utils2 import AttributeDict |
|
29 | from rhodecode.lib.utils2 import AttributeDict | |
30 | from rhodecode.lib.vcs.nodes import FileNode |
|
30 | from rhodecode.lib.vcs.nodes import FileNode | |
31 | from rhodecode.lib.diff_match_patch import diff_match_patch |
|
31 | from rhodecode.lib.diff_match_patch import diff_match_patch | |
32 | from rhodecode.lib.diffs import LimitedDiffContainer |
|
32 | from rhodecode.lib.diffs import LimitedDiffContainer | |
33 | from pygments.lexers import get_lexer_by_name |
|
33 | from pygments.lexers import get_lexer_by_name | |
34 |
|
34 | |||
35 | plain_text_lexer = get_lexer_by_name( |
|
35 | plain_text_lexer = get_lexer_by_name( | |
36 | 'text', stripall=False, stripnl=False, ensurenl=False) |
|
36 | 'text', stripall=False, stripnl=False, ensurenl=False) | |
37 |
|
37 | |||
38 |
|
38 | |||
39 | log = logging.getLogger() |
|
39 | log = logging.getLogger() | |
40 |
|
40 | |||
41 |
|
41 | |||
42 | def filenode_as_lines_tokens(filenode, lexer=None): |
|
42 | def filenode_as_lines_tokens(filenode, lexer=None): | |
|
43 | org_lexer = lexer | |||
43 | lexer = lexer or get_lexer_for_filenode(filenode) |
|
44 | lexer = lexer or get_lexer_for_filenode(filenode) | |
44 | log.debug('Generating file node pygment tokens for %s, %s', lexer, filenode) |
|
45 | log.debug('Generating file node pygment tokens for %s, %s, org_lexer:%s', | |
|
46 | lexer, filenode, org_lexer) | |||
45 | tokens = tokenize_string(filenode.content, lexer) |
|
47 | tokens = tokenize_string(filenode.content, lexer) | |
46 | lines = split_token_stream(tokens, split_string='\n') |
|
48 | lines = split_token_stream(tokens, split_string='\n') | |
47 | rv = list(lines) |
|
49 | rv = list(lines) | |
48 | return rv |
|
50 | return rv | |
49 |
|
51 | |||
50 |
|
52 | |||
51 | def tokenize_string(content, lexer): |
|
53 | def tokenize_string(content, lexer): | |
52 | """ |
|
54 | """ | |
53 | Use pygments to tokenize some content based on a lexer |
|
55 | Use pygments to tokenize some content based on a lexer | |
54 | ensuring all original new lines and whitespace is preserved |
|
56 | ensuring all original new lines and whitespace is preserved | |
55 | """ |
|
57 | """ | |
56 |
|
58 | |||
57 | lexer.stripall = False |
|
59 | lexer.stripall = False | |
58 | lexer.stripnl = False |
|
60 | lexer.stripnl = False | |
59 | lexer.ensurenl = False |
|
61 | lexer.ensurenl = False | |
60 | for token_type, token_text in lex(content, lexer): |
|
62 | for token_type, token_text in lex(content, lexer): | |
61 | yield pygment_token_class(token_type), token_text |
|
63 | yield pygment_token_class(token_type), token_text | |
62 |
|
64 | |||
63 |
|
65 | |||
64 | def split_token_stream(tokens, split_string=u'\n'): |
|
66 | def split_token_stream(tokens, split_string=u'\n'): | |
65 | """ |
|
67 | """ | |
66 | Take a list of (TokenType, text) tuples and split them by a string |
|
68 | Take a list of (TokenType, text) tuples and split them by a string | |
67 |
|
69 | |||
68 | >>> split_token_stream([(TEXT, 'some\ntext'), (TEXT, 'more\n')]) |
|
70 | >>> split_token_stream([(TEXT, 'some\ntext'), (TEXT, 'more\n')]) | |
69 | [(TEXT, 'some'), (TEXT, 'text'), |
|
71 | [(TEXT, 'some'), (TEXT, 'text'), | |
70 | (TEXT, 'more'), (TEXT, 'text')] |
|
72 | (TEXT, 'more'), (TEXT, 'text')] | |
71 | """ |
|
73 | """ | |
72 |
|
74 | |||
73 | buffer = [] |
|
75 | buffer = [] | |
74 | for token_class, token_text in tokens: |
|
76 | for token_class, token_text in tokens: | |
75 | parts = token_text.split(split_string) |
|
77 | parts = token_text.split(split_string) | |
76 | for part in parts[:-1]: |
|
78 | for part in parts[:-1]: | |
77 | buffer.append((token_class, part)) |
|
79 | buffer.append((token_class, part)) | |
78 | yield buffer |
|
80 | yield buffer | |
79 | buffer = [] |
|
81 | buffer = [] | |
80 |
|
82 | |||
81 | buffer.append((token_class, parts[-1])) |
|
83 | buffer.append((token_class, parts[-1])) | |
82 |
|
84 | |||
83 | if buffer: |
|
85 | if buffer: | |
84 | yield buffer |
|
86 | yield buffer | |
85 |
|
87 | |||
86 |
|
88 | |||
87 | def filenode_as_annotated_lines_tokens(filenode): |
|
89 | def filenode_as_annotated_lines_tokens(filenode): | |
88 | """ |
|
90 | """ | |
89 | Take a file node and return a list of annotations => lines, if no annotation |
|
91 | Take a file node and return a list of annotations => lines, if no annotation | |
90 | is found, it will be None. |
|
92 | is found, it will be None. | |
91 |
|
93 | |||
92 | eg: |
|
94 | eg: | |
93 |
|
95 | |||
94 | [ |
|
96 | [ | |
95 | (annotation1, [ |
|
97 | (annotation1, [ | |
96 | (1, line1_tokens_list), |
|
98 | (1, line1_tokens_list), | |
97 | (2, line2_tokens_list), |
|
99 | (2, line2_tokens_list), | |
98 | ]), |
|
100 | ]), | |
99 | (annotation2, [ |
|
101 | (annotation2, [ | |
100 | (3, line1_tokens_list), |
|
102 | (3, line1_tokens_list), | |
101 | ]), |
|
103 | ]), | |
102 | (None, [ |
|
104 | (None, [ | |
103 | (4, line1_tokens_list), |
|
105 | (4, line1_tokens_list), | |
104 | ]), |
|
106 | ]), | |
105 | (annotation1, [ |
|
107 | (annotation1, [ | |
106 | (5, line1_tokens_list), |
|
108 | (5, line1_tokens_list), | |
107 | (6, line2_tokens_list), |
|
109 | (6, line2_tokens_list), | |
108 | ]) |
|
110 | ]) | |
109 | ] |
|
111 | ] | |
110 | """ |
|
112 | """ | |
111 |
|
113 | |||
112 | commit_cache = {} # cache commit_getter lookups |
|
114 | commit_cache = {} # cache commit_getter lookups | |
113 |
|
115 | |||
114 | def _get_annotation(commit_id, commit_getter): |
|
116 | def _get_annotation(commit_id, commit_getter): | |
115 | if commit_id not in commit_cache: |
|
117 | if commit_id not in commit_cache: | |
116 | commit_cache[commit_id] = commit_getter() |
|
118 | commit_cache[commit_id] = commit_getter() | |
117 | return commit_cache[commit_id] |
|
119 | return commit_cache[commit_id] | |
118 |
|
120 | |||
119 | annotation_lookup = { |
|
121 | annotation_lookup = { | |
120 | line_no: _get_annotation(commit_id, commit_getter) |
|
122 | line_no: _get_annotation(commit_id, commit_getter) | |
121 | for line_no, commit_id, commit_getter, line_content |
|
123 | for line_no, commit_id, commit_getter, line_content | |
122 | in filenode.annotate |
|
124 | in filenode.annotate | |
123 | } |
|
125 | } | |
124 |
|
126 | |||
125 | annotations_lines = ((annotation_lookup.get(line_no), line_no, tokens) |
|
127 | annotations_lines = ((annotation_lookup.get(line_no), line_no, tokens) | |
126 | for line_no, tokens |
|
128 | for line_no, tokens | |
127 | in enumerate(filenode_as_lines_tokens(filenode), 1)) |
|
129 | in enumerate(filenode_as_lines_tokens(filenode), 1)) | |
128 |
|
130 | |||
129 | grouped_annotations_lines = groupby(annotations_lines, lambda x: x[0]) |
|
131 | grouped_annotations_lines = groupby(annotations_lines, lambda x: x[0]) | |
130 |
|
132 | |||
131 | for annotation, group in grouped_annotations_lines: |
|
133 | for annotation, group in grouped_annotations_lines: | |
132 | yield ( |
|
134 | yield ( | |
133 | annotation, [(line_no, tokens) |
|
135 | annotation, [(line_no, tokens) | |
134 | for (_, line_no, tokens) in group] |
|
136 | for (_, line_no, tokens) in group] | |
135 | ) |
|
137 | ) | |
136 |
|
138 | |||
137 |
|
139 | |||
138 | def render_tokenstream(tokenstream): |
|
140 | def render_tokenstream(tokenstream): | |
139 | result = [] |
|
141 | result = [] | |
140 | for token_class, token_ops_texts in rollup_tokenstream(tokenstream): |
|
142 | for token_class, token_ops_texts in rollup_tokenstream(tokenstream): | |
141 |
|
143 | |||
142 | if token_class: |
|
144 | if token_class: | |
143 | result.append(u'<span class="%s">' % token_class) |
|
145 | result.append(u'<span class="%s">' % token_class) | |
144 | else: |
|
146 | else: | |
145 | result.append(u'<span>') |
|
147 | result.append(u'<span>') | |
146 |
|
148 | |||
147 | for op_tag, token_text in token_ops_texts: |
|
149 | for op_tag, token_text in token_ops_texts: | |
148 |
|
150 | |||
149 | if op_tag: |
|
151 | if op_tag: | |
150 | result.append(u'<%s>' % op_tag) |
|
152 | result.append(u'<%s>' % op_tag) | |
151 |
|
153 | |||
152 | escaped_text = html_escape(token_text) |
|
154 | escaped_text = html_escape(token_text) | |
153 |
|
155 | |||
154 | # TODO: dan: investigate showing hidden characters like space/nl/tab |
|
156 | # TODO: dan: investigate showing hidden characters like space/nl/tab | |
155 | # escaped_text = escaped_text.replace(' ', '<sp> </sp>') |
|
157 | # escaped_text = escaped_text.replace(' ', '<sp> </sp>') | |
156 | # escaped_text = escaped_text.replace('\n', '<nl>\n</nl>') |
|
158 | # escaped_text = escaped_text.replace('\n', '<nl>\n</nl>') | |
157 | # escaped_text = escaped_text.replace('\t', '<tab>\t</tab>') |
|
159 | # escaped_text = escaped_text.replace('\t', '<tab>\t</tab>') | |
158 |
|
160 | |||
159 | result.append(escaped_text) |
|
161 | result.append(escaped_text) | |
160 |
|
162 | |||
161 | if op_tag: |
|
163 | if op_tag: | |
162 | result.append(u'</%s>' % op_tag) |
|
164 | result.append(u'</%s>' % op_tag) | |
163 |
|
165 | |||
164 | result.append(u'</span>') |
|
166 | result.append(u'</span>') | |
165 |
|
167 | |||
166 | html = ''.join(result) |
|
168 | html = ''.join(result) | |
167 | return html |
|
169 | return html | |
168 |
|
170 | |||
169 |
|
171 | |||
170 | def rollup_tokenstream(tokenstream): |
|
172 | def rollup_tokenstream(tokenstream): | |
171 | """ |
|
173 | """ | |
172 | Group a token stream of the format: |
|
174 | Group a token stream of the format: | |
173 |
|
175 | |||
174 | ('class', 'op', 'text') |
|
176 | ('class', 'op', 'text') | |
175 | or |
|
177 | or | |
176 | ('class', 'text') |
|
178 | ('class', 'text') | |
177 |
|
179 | |||
178 | into |
|
180 | into | |
179 |
|
181 | |||
180 | [('class1', |
|
182 | [('class1', | |
181 | [('op1', 'text'), |
|
183 | [('op1', 'text'), | |
182 | ('op2', 'text')]), |
|
184 | ('op2', 'text')]), | |
183 | ('class2', |
|
185 | ('class2', | |
184 | [('op3', 'text')])] |
|
186 | [('op3', 'text')])] | |
185 |
|
187 | |||
186 | This is used to get the minimal tags necessary when |
|
188 | This is used to get the minimal tags necessary when | |
187 | rendering to html eg for a token stream ie. |
|
189 | rendering to html eg for a token stream ie. | |
188 |
|
190 | |||
189 | <span class="A"><ins>he</ins>llo</span> |
|
191 | <span class="A"><ins>he</ins>llo</span> | |
190 | vs |
|
192 | vs | |
191 | <span class="A"><ins>he</ins></span><span class="A">llo</span> |
|
193 | <span class="A"><ins>he</ins></span><span class="A">llo</span> | |
192 |
|
194 | |||
193 | If a 2 tuple is passed in, the output op will be an empty string. |
|
195 | If a 2 tuple is passed in, the output op will be an empty string. | |
194 |
|
196 | |||
195 | eg: |
|
197 | eg: | |
196 |
|
198 | |||
197 | >>> rollup_tokenstream([('classA', '', 'h'), |
|
199 | >>> rollup_tokenstream([('classA', '', 'h'), | |
198 | ('classA', 'del', 'ell'), |
|
200 | ('classA', 'del', 'ell'), | |
199 | ('classA', '', 'o'), |
|
201 | ('classA', '', 'o'), | |
200 | ('classB', '', ' '), |
|
202 | ('classB', '', ' '), | |
201 | ('classA', '', 'the'), |
|
203 | ('classA', '', 'the'), | |
202 | ('classA', '', 're'), |
|
204 | ('classA', '', 're'), | |
203 | ]) |
|
205 | ]) | |
204 |
|
206 | |||
205 | [('classA', [('', 'h'), ('del', 'ell'), ('', 'o')], |
|
207 | [('classA', [('', 'h'), ('del', 'ell'), ('', 'o')], | |
206 | ('classB', [('', ' ')], |
|
208 | ('classB', [('', ' ')], | |
207 | ('classA', [('', 'there')]] |
|
209 | ('classA', [('', 'there')]] | |
208 |
|
210 | |||
209 | """ |
|
211 | """ | |
210 | if tokenstream and len(tokenstream[0]) == 2: |
|
212 | if tokenstream and len(tokenstream[0]) == 2: | |
211 | tokenstream = ((t[0], '', t[1]) for t in tokenstream) |
|
213 | tokenstream = ((t[0], '', t[1]) for t in tokenstream) | |
212 |
|
214 | |||
213 | result = [] |
|
215 | result = [] | |
214 | for token_class, op_list in groupby(tokenstream, lambda t: t[0]): |
|
216 | for token_class, op_list in groupby(tokenstream, lambda t: t[0]): | |
215 | ops = [] |
|
217 | ops = [] | |
216 | for token_op, token_text_list in groupby(op_list, lambda o: o[1]): |
|
218 | for token_op, token_text_list in groupby(op_list, lambda o: o[1]): | |
217 | text_buffer = [] |
|
219 | text_buffer = [] | |
218 | for t_class, t_op, t_text in token_text_list: |
|
220 | for t_class, t_op, t_text in token_text_list: | |
219 | text_buffer.append(t_text) |
|
221 | text_buffer.append(t_text) | |
220 | ops.append((token_op, ''.join(text_buffer))) |
|
222 | ops.append((token_op, ''.join(text_buffer))) | |
221 | result.append((token_class, ops)) |
|
223 | result.append((token_class, ops)) | |
222 | return result |
|
224 | return result | |
223 |
|
225 | |||
224 |
|
226 | |||
225 | def tokens_diff(old_tokens, new_tokens, use_diff_match_patch=True): |
|
227 | def tokens_diff(old_tokens, new_tokens, use_diff_match_patch=True): | |
226 | """ |
|
228 | """ | |
227 | Converts a list of (token_class, token_text) tuples to a list of |
|
229 | Converts a list of (token_class, token_text) tuples to a list of | |
228 | (token_class, token_op, token_text) tuples where token_op is one of |
|
230 | (token_class, token_op, token_text) tuples where token_op is one of | |
229 | ('ins', 'del', '') |
|
231 | ('ins', 'del', '') | |
230 |
|
232 | |||
231 | :param old_tokens: list of (token_class, token_text) tuples of old line |
|
233 | :param old_tokens: list of (token_class, token_text) tuples of old line | |
232 | :param new_tokens: list of (token_class, token_text) tuples of new line |
|
234 | :param new_tokens: list of (token_class, token_text) tuples of new line | |
233 | :param use_diff_match_patch: boolean, will use google's diff match patch |
|
235 | :param use_diff_match_patch: boolean, will use google's diff match patch | |
234 | library which has options to 'smooth' out the character by character |
|
236 | library which has options to 'smooth' out the character by character | |
235 | differences making nicer ins/del blocks |
|
237 | differences making nicer ins/del blocks | |
236 | """ |
|
238 | """ | |
237 |
|
239 | |||
238 | old_tokens_result = [] |
|
240 | old_tokens_result = [] | |
239 | new_tokens_result = [] |
|
241 | new_tokens_result = [] | |
240 |
|
242 | |||
241 | similarity = difflib.SequenceMatcher(None, |
|
243 | similarity = difflib.SequenceMatcher(None, | |
242 | ''.join(token_text for token_class, token_text in old_tokens), |
|
244 | ''.join(token_text for token_class, token_text in old_tokens), | |
243 | ''.join(token_text for token_class, token_text in new_tokens) |
|
245 | ''.join(token_text for token_class, token_text in new_tokens) | |
244 | ).ratio() |
|
246 | ).ratio() | |
245 |
|
247 | |||
246 | if similarity < 0.6: # return, the blocks are too different |
|
248 | if similarity < 0.6: # return, the blocks are too different | |
247 | for token_class, token_text in old_tokens: |
|
249 | for token_class, token_text in old_tokens: | |
248 | old_tokens_result.append((token_class, '', token_text)) |
|
250 | old_tokens_result.append((token_class, '', token_text)) | |
249 | for token_class, token_text in new_tokens: |
|
251 | for token_class, token_text in new_tokens: | |
250 | new_tokens_result.append((token_class, '', token_text)) |
|
252 | new_tokens_result.append((token_class, '', token_text)) | |
251 | return old_tokens_result, new_tokens_result, similarity |
|
253 | return old_tokens_result, new_tokens_result, similarity | |
252 |
|
254 | |||
253 | token_sequence_matcher = difflib.SequenceMatcher(None, |
|
255 | token_sequence_matcher = difflib.SequenceMatcher(None, | |
254 | [x[1] for x in old_tokens], |
|
256 | [x[1] for x in old_tokens], | |
255 | [x[1] for x in new_tokens]) |
|
257 | [x[1] for x in new_tokens]) | |
256 |
|
258 | |||
257 | for tag, o1, o2, n1, n2 in token_sequence_matcher.get_opcodes(): |
|
259 | for tag, o1, o2, n1, n2 in token_sequence_matcher.get_opcodes(): | |
258 | # check the differences by token block types first to give a more |
|
260 | # check the differences by token block types first to give a more | |
259 | # nicer "block" level replacement vs character diffs |
|
261 | # nicer "block" level replacement vs character diffs | |
260 |
|
262 | |||
261 | if tag == 'equal': |
|
263 | if tag == 'equal': | |
262 | for token_class, token_text in old_tokens[o1:o2]: |
|
264 | for token_class, token_text in old_tokens[o1:o2]: | |
263 | old_tokens_result.append((token_class, '', token_text)) |
|
265 | old_tokens_result.append((token_class, '', token_text)) | |
264 | for token_class, token_text in new_tokens[n1:n2]: |
|
266 | for token_class, token_text in new_tokens[n1:n2]: | |
265 | new_tokens_result.append((token_class, '', token_text)) |
|
267 | new_tokens_result.append((token_class, '', token_text)) | |
266 | elif tag == 'delete': |
|
268 | elif tag == 'delete': | |
267 | for token_class, token_text in old_tokens[o1:o2]: |
|
269 | for token_class, token_text in old_tokens[o1:o2]: | |
268 | old_tokens_result.append((token_class, 'del', token_text)) |
|
270 | old_tokens_result.append((token_class, 'del', token_text)) | |
269 | elif tag == 'insert': |
|
271 | elif tag == 'insert': | |
270 | for token_class, token_text in new_tokens[n1:n2]: |
|
272 | for token_class, token_text in new_tokens[n1:n2]: | |
271 | new_tokens_result.append((token_class, 'ins', token_text)) |
|
273 | new_tokens_result.append((token_class, 'ins', token_text)) | |
272 | elif tag == 'replace': |
|
274 | elif tag == 'replace': | |
273 | # if same type token blocks must be replaced, do a diff on the |
|
275 | # if same type token blocks must be replaced, do a diff on the | |
274 | # characters in the token blocks to show individual changes |
|
276 | # characters in the token blocks to show individual changes | |
275 |
|
277 | |||
276 | old_char_tokens = [] |
|
278 | old_char_tokens = [] | |
277 | new_char_tokens = [] |
|
279 | new_char_tokens = [] | |
278 | for token_class, token_text in old_tokens[o1:o2]: |
|
280 | for token_class, token_text in old_tokens[o1:o2]: | |
279 | for char in token_text: |
|
281 | for char in token_text: | |
280 | old_char_tokens.append((token_class, char)) |
|
282 | old_char_tokens.append((token_class, char)) | |
281 |
|
283 | |||
282 | for token_class, token_text in new_tokens[n1:n2]: |
|
284 | for token_class, token_text in new_tokens[n1:n2]: | |
283 | for char in token_text: |
|
285 | for char in token_text: | |
284 | new_char_tokens.append((token_class, char)) |
|
286 | new_char_tokens.append((token_class, char)) | |
285 |
|
287 | |||
286 | old_string = ''.join([token_text for |
|
288 | old_string = ''.join([token_text for | |
287 | token_class, token_text in old_char_tokens]) |
|
289 | token_class, token_text in old_char_tokens]) | |
288 | new_string = ''.join([token_text for |
|
290 | new_string = ''.join([token_text for | |
289 | token_class, token_text in new_char_tokens]) |
|
291 | token_class, token_text in new_char_tokens]) | |
290 |
|
292 | |||
291 | char_sequence = difflib.SequenceMatcher( |
|
293 | char_sequence = difflib.SequenceMatcher( | |
292 | None, old_string, new_string) |
|
294 | None, old_string, new_string) | |
293 | copcodes = char_sequence.get_opcodes() |
|
295 | copcodes = char_sequence.get_opcodes() | |
294 | obuffer, nbuffer = [], [] |
|
296 | obuffer, nbuffer = [], [] | |
295 |
|
297 | |||
296 | if use_diff_match_patch: |
|
298 | if use_diff_match_patch: | |
297 | dmp = diff_match_patch() |
|
299 | dmp = diff_match_patch() | |
298 | dmp.Diff_EditCost = 11 # TODO: dan: extract this to a setting |
|
300 | dmp.Diff_EditCost = 11 # TODO: dan: extract this to a setting | |
299 | reps = dmp.diff_main(old_string, new_string) |
|
301 | reps = dmp.diff_main(old_string, new_string) | |
300 | dmp.diff_cleanupEfficiency(reps) |
|
302 | dmp.diff_cleanupEfficiency(reps) | |
301 |
|
303 | |||
302 | a, b = 0, 0 |
|
304 | a, b = 0, 0 | |
303 | for op, rep in reps: |
|
305 | for op, rep in reps: | |
304 | l = len(rep) |
|
306 | l = len(rep) | |
305 | if op == 0: |
|
307 | if op == 0: | |
306 | for i, c in enumerate(rep): |
|
308 | for i, c in enumerate(rep): | |
307 | obuffer.append((old_char_tokens[a+i][0], '', c)) |
|
309 | obuffer.append((old_char_tokens[a+i][0], '', c)) | |
308 | nbuffer.append((new_char_tokens[b+i][0], '', c)) |
|
310 | nbuffer.append((new_char_tokens[b+i][0], '', c)) | |
309 | a += l |
|
311 | a += l | |
310 | b += l |
|
312 | b += l | |
311 | elif op == -1: |
|
313 | elif op == -1: | |
312 | for i, c in enumerate(rep): |
|
314 | for i, c in enumerate(rep): | |
313 | obuffer.append((old_char_tokens[a+i][0], 'del', c)) |
|
315 | obuffer.append((old_char_tokens[a+i][0], 'del', c)) | |
314 | a += l |
|
316 | a += l | |
315 | elif op == 1: |
|
317 | elif op == 1: | |
316 | for i, c in enumerate(rep): |
|
318 | for i, c in enumerate(rep): | |
317 | nbuffer.append((new_char_tokens[b+i][0], 'ins', c)) |
|
319 | nbuffer.append((new_char_tokens[b+i][0], 'ins', c)) | |
318 | b += l |
|
320 | b += l | |
319 | else: |
|
321 | else: | |
320 | for ctag, co1, co2, cn1, cn2 in copcodes: |
|
322 | for ctag, co1, co2, cn1, cn2 in copcodes: | |
321 | if ctag == 'equal': |
|
323 | if ctag == 'equal': | |
322 | for token_class, token_text in old_char_tokens[co1:co2]: |
|
324 | for token_class, token_text in old_char_tokens[co1:co2]: | |
323 | obuffer.append((token_class, '', token_text)) |
|
325 | obuffer.append((token_class, '', token_text)) | |
324 | for token_class, token_text in new_char_tokens[cn1:cn2]: |
|
326 | for token_class, token_text in new_char_tokens[cn1:cn2]: | |
325 | nbuffer.append((token_class, '', token_text)) |
|
327 | nbuffer.append((token_class, '', token_text)) | |
326 | elif ctag == 'delete': |
|
328 | elif ctag == 'delete': | |
327 | for token_class, token_text in old_char_tokens[co1:co2]: |
|
329 | for token_class, token_text in old_char_tokens[co1:co2]: | |
328 | obuffer.append((token_class, 'del', token_text)) |
|
330 | obuffer.append((token_class, 'del', token_text)) | |
329 | elif ctag == 'insert': |
|
331 | elif ctag == 'insert': | |
330 | for token_class, token_text in new_char_tokens[cn1:cn2]: |
|
332 | for token_class, token_text in new_char_tokens[cn1:cn2]: | |
331 | nbuffer.append((token_class, 'ins', token_text)) |
|
333 | nbuffer.append((token_class, 'ins', token_text)) | |
332 | elif ctag == 'replace': |
|
334 | elif ctag == 'replace': | |
333 | for token_class, token_text in old_char_tokens[co1:co2]: |
|
335 | for token_class, token_text in old_char_tokens[co1:co2]: | |
334 | obuffer.append((token_class, 'del', token_text)) |
|
336 | obuffer.append((token_class, 'del', token_text)) | |
335 | for token_class, token_text in new_char_tokens[cn1:cn2]: |
|
337 | for token_class, token_text in new_char_tokens[cn1:cn2]: | |
336 | nbuffer.append((token_class, 'ins', token_text)) |
|
338 | nbuffer.append((token_class, 'ins', token_text)) | |
337 |
|
339 | |||
338 | old_tokens_result.extend(obuffer) |
|
340 | old_tokens_result.extend(obuffer) | |
339 | new_tokens_result.extend(nbuffer) |
|
341 | new_tokens_result.extend(nbuffer) | |
340 |
|
342 | |||
341 | return old_tokens_result, new_tokens_result, similarity |
|
343 | return old_tokens_result, new_tokens_result, similarity | |
342 |
|
344 | |||
343 |
|
345 | |||
344 | class DiffSet(object): |
|
346 | class DiffSet(object): | |
345 | """ |
|
347 | """ | |
346 | An object for parsing the diff result from diffs.DiffProcessor and |
|
348 | An object for parsing the diff result from diffs.DiffProcessor and | |
347 | adding highlighting, side by side/unified renderings and line diffs |
|
349 | adding highlighting, side by side/unified renderings and line diffs | |
348 | """ |
|
350 | """ | |
349 |
|
351 | |||
350 | HL_REAL = 'REAL' # highlights using original file, slow |
|
352 | HL_REAL = 'REAL' # highlights using original file, slow | |
351 | HL_FAST = 'FAST' # highlights using just the line, fast but not correct |
|
353 | HL_FAST = 'FAST' # highlights using just the line, fast but not correct | |
352 | # in the case of multiline code |
|
354 | # in the case of multiline code | |
353 | HL_NONE = 'NONE' # no highlighting, fastest |
|
355 | HL_NONE = 'NONE' # no highlighting, fastest | |
354 |
|
356 | |||
355 | def __init__(self, highlight_mode=HL_REAL, repo_name=None, |
|
357 | def __init__(self, highlight_mode=HL_REAL, repo_name=None, | |
356 | source_repo_name=None, |
|
358 | source_repo_name=None, | |
357 | source_node_getter=lambda filename: None, |
|
359 | source_node_getter=lambda filename: None, | |
358 | target_node_getter=lambda filename: None, |
|
360 | target_node_getter=lambda filename: None, | |
359 | source_nodes=None, target_nodes=None, |
|
361 | source_nodes=None, target_nodes=None, | |
360 | max_file_size_limit=150 * 1024, # files over this size will |
|
362 | max_file_size_limit=150 * 1024, # files over this size will | |
361 | # use fast highlighting |
|
363 | # use fast highlighting | |
362 | comments=None, |
|
364 | comments=None, | |
363 | ): |
|
365 | ): | |
364 |
|
366 | |||
365 | self.highlight_mode = highlight_mode |
|
367 | self.highlight_mode = highlight_mode | |
366 | self.highlighted_filenodes = {} |
|
368 | self.highlighted_filenodes = {} | |
367 | self.source_node_getter = source_node_getter |
|
369 | self.source_node_getter = source_node_getter | |
368 | self.target_node_getter = target_node_getter |
|
370 | self.target_node_getter = target_node_getter | |
369 | self.source_nodes = source_nodes or {} |
|
371 | self.source_nodes = source_nodes or {} | |
370 | self.target_nodes = target_nodes or {} |
|
372 | self.target_nodes = target_nodes or {} | |
371 | self.repo_name = repo_name |
|
373 | self.repo_name = repo_name | |
372 | self.source_repo_name = source_repo_name or repo_name |
|
374 | self.source_repo_name = source_repo_name or repo_name | |
373 | self.comments = comments or {} |
|
375 | self.comments = comments or {} | |
374 | self.comments_store = self.comments.copy() |
|
376 | self.comments_store = self.comments.copy() | |
375 | self.max_file_size_limit = max_file_size_limit |
|
377 | self.max_file_size_limit = max_file_size_limit | |
376 |
|
378 | |||
377 | def render_patchset(self, patchset, source_ref=None, target_ref=None): |
|
379 | def render_patchset(self, patchset, source_ref=None, target_ref=None): | |
378 | diffset = AttributeDict(dict( |
|
380 | diffset = AttributeDict(dict( | |
379 | lines_added=0, |
|
381 | lines_added=0, | |
380 | lines_deleted=0, |
|
382 | lines_deleted=0, | |
381 | changed_files=0, |
|
383 | changed_files=0, | |
382 | files=[], |
|
384 | files=[], | |
383 | limited_diff=isinstance(patchset, LimitedDiffContainer), |
|
385 | limited_diff=isinstance(patchset, LimitedDiffContainer), | |
384 | repo_name=self.repo_name, |
|
386 | repo_name=self.repo_name, | |
385 | source_repo_name=self.source_repo_name, |
|
387 | source_repo_name=self.source_repo_name, | |
386 | source_ref=source_ref, |
|
388 | source_ref=source_ref, | |
387 | target_ref=target_ref, |
|
389 | target_ref=target_ref, | |
388 | )) |
|
390 | )) | |
389 | for patch in patchset: |
|
391 | for patch in patchset: | |
390 | filediff = self.render_patch(patch) |
|
392 | filediff = self.render_patch(patch) | |
391 | filediff.diffset = diffset |
|
393 | filediff.diffset = diffset | |
392 | diffset.files.append(filediff) |
|
394 | diffset.files.append(filediff) | |
393 | diffset.changed_files += 1 |
|
395 | diffset.changed_files += 1 | |
394 | if not patch['stats']['binary']: |
|
396 | if not patch['stats']['binary']: | |
395 | diffset.lines_added += patch['stats']['added'] |
|
397 | diffset.lines_added += patch['stats']['added'] | |
396 | diffset.lines_deleted += patch['stats']['deleted'] |
|
398 | diffset.lines_deleted += patch['stats']['deleted'] | |
397 |
|
399 | |||
398 | return diffset |
|
400 | return diffset | |
399 |
|
401 | |||
400 | _lexer_cache = {} |
|
402 | _lexer_cache = {} | |
401 | def _get_lexer_for_filename(self, filename): |
|
403 | def _get_lexer_for_filename(self, filename, filenode=None): | |
402 | # cached because we might need to call it twice for source/target |
|
404 | # cached because we might need to call it twice for source/target | |
403 | if filename not in self._lexer_cache: |
|
405 | if filename not in self._lexer_cache: | |
404 | self._lexer_cache[filename] = get_lexer_safe(filepath=filename) |
|
406 | if filenode: | |
|
407 | lexer = filenode.lexer | |||
|
408 | else: | |||
|
409 | lexer = get_lexer_safe(filepath=filename) | |||
|
410 | self._lexer_cache[filename] = lexer | |||
405 | return self._lexer_cache[filename] |
|
411 | return self._lexer_cache[filename] | |
406 |
|
412 | |||
407 | def render_patch(self, patch): |
|
413 | def render_patch(self, patch): | |
408 | log.debug('rendering diff for %r' % patch['filename']) |
|
414 | log.debug('rendering diff for %r' % patch['filename']) | |
409 |
|
415 | |||
410 | source_filename = patch['original_filename'] |
|
416 | source_filename = patch['original_filename'] | |
411 | target_filename = patch['filename'] |
|
417 | target_filename = patch['filename'] | |
412 |
|
418 | |||
413 | source_lexer = plain_text_lexer |
|
419 | source_lexer = plain_text_lexer | |
414 | target_lexer = plain_text_lexer |
|
420 | target_lexer = plain_text_lexer | |
415 |
|
421 | |||
416 | if not patch['stats']['binary']: |
|
422 | if not patch['stats']['binary']: | |
417 | if self.highlight_mode == self.HL_REAL: |
|
423 | if self.highlight_mode == self.HL_REAL: | |
418 | if (source_filename and patch['operation'] in ('D', 'M') |
|
424 | if (source_filename and patch['operation'] in ('D', 'M') | |
419 | and source_filename not in self.source_nodes): |
|
425 | and source_filename not in self.source_nodes): | |
420 | self.source_nodes[source_filename] = ( |
|
426 | self.source_nodes[source_filename] = ( | |
421 | self.source_node_getter(source_filename)) |
|
427 | self.source_node_getter(source_filename)) | |
422 |
|
428 | |||
423 | if (target_filename and patch['operation'] in ('A', 'M') |
|
429 | if (target_filename and patch['operation'] in ('A', 'M') | |
424 | and target_filename not in self.target_nodes): |
|
430 | and target_filename not in self.target_nodes): | |
425 | self.target_nodes[target_filename] = ( |
|
431 | self.target_nodes[target_filename] = ( | |
426 | self.target_node_getter(target_filename)) |
|
432 | self.target_node_getter(target_filename)) | |
427 |
|
433 | |||
428 | elif self.highlight_mode == self.HL_FAST: |
|
434 | elif self.highlight_mode == self.HL_FAST: | |
429 | source_lexer = self._get_lexer_for_filename(source_filename) |
|
435 | source_lexer = self._get_lexer_for_filename(source_filename) | |
430 | target_lexer = self._get_lexer_for_filename(target_filename) |
|
436 | target_lexer = self._get_lexer_for_filename(target_filename) | |
431 |
|
437 | |||
432 | source_file = self.source_nodes.get(source_filename, source_filename) |
|
438 | source_file = self.source_nodes.get(source_filename, source_filename) | |
433 | target_file = self.target_nodes.get(target_filename, target_filename) |
|
439 | target_file = self.target_nodes.get(target_filename, target_filename) | |
434 |
|
440 | |||
435 | source_filenode, target_filenode = None, None |
|
441 | source_filenode, target_filenode = None, None | |
436 |
|
442 | |||
437 | # TODO: dan: FileNode.lexer works on the content of the file - which |
|
443 | # TODO: dan: FileNode.lexer works on the content of the file - which | |
438 | # can be slow - issue #4289 explains a lexer clean up - which once |
|
444 | # can be slow - issue #4289 explains a lexer clean up - which once | |
439 | # done can allow caching a lexer for a filenode to avoid the file lookup |
|
445 | # done can allow caching a lexer for a filenode to avoid the file lookup | |
440 | if isinstance(source_file, FileNode): |
|
446 | if isinstance(source_file, FileNode): | |
441 | source_filenode = source_file |
|
447 | source_filenode = source_file | |
442 | source_lexer = source_file.lexer |
|
448 | #source_lexer = source_file.lexer | |
|
449 | source_lexer = self._get_lexer_for_filename(source_filename) | |||
|
450 | source_file.lexer = source_lexer | |||
|
451 | ||||
443 | if isinstance(target_file, FileNode): |
|
452 | if isinstance(target_file, FileNode): | |
444 | target_filenode = target_file |
|
453 | target_filenode = target_file | |
445 | target_lexer = target_file.lexer |
|
454 | #target_lexer = target_file.lexer | |
|
455 | target_lexer = self._get_lexer_for_filename(target_filename) | |||
|
456 | target_file.lexer = target_lexer | |||
446 |
|
457 | |||
447 | source_file_path, target_file_path = None, None |
|
458 | source_file_path, target_file_path = None, None | |
448 |
|
459 | |||
449 | if source_filename != '/dev/null': |
|
460 | if source_filename != '/dev/null': | |
450 | source_file_path = source_filename |
|
461 | source_file_path = source_filename | |
451 | if target_filename != '/dev/null': |
|
462 | if target_filename != '/dev/null': | |
452 | target_file_path = target_filename |
|
463 | target_file_path = target_filename | |
453 |
|
464 | |||
454 | source_file_type = source_lexer.name |
|
465 | source_file_type = source_lexer.name | |
455 | target_file_type = target_lexer.name |
|
466 | target_file_type = target_lexer.name | |
456 |
|
467 | |||
457 | op_hunks = patch['chunks'][0] |
|
468 | op_hunks = patch['chunks'][0] | |
458 | hunks = patch['chunks'][1:] |
|
469 | hunks = patch['chunks'][1:] | |
459 |
|
470 | |||
460 | filediff = AttributeDict({ |
|
471 | filediff = AttributeDict({ | |
461 | 'source_file_path': source_file_path, |
|
472 | 'source_file_path': source_file_path, | |
462 | 'target_file_path': target_file_path, |
|
473 | 'target_file_path': target_file_path, | |
463 | 'source_filenode': source_filenode, |
|
474 | 'source_filenode': source_filenode, | |
464 | 'target_filenode': target_filenode, |
|
475 | 'target_filenode': target_filenode, | |
465 | 'hunks': [], |
|
476 | 'hunks': [], | |
466 | 'source_file_type': target_file_type, |
|
477 | 'source_file_type': target_file_type, | |
467 | 'target_file_type': source_file_type, |
|
478 | 'target_file_type': source_file_type, | |
468 | 'patch': patch, |
|
479 | 'patch': patch, | |
469 | 'source_mode': patch['stats']['old_mode'], |
|
480 | 'source_mode': patch['stats']['old_mode'], | |
470 | 'target_mode': patch['stats']['new_mode'], |
|
481 | 'target_mode': patch['stats']['new_mode'], | |
471 | 'limited_diff': isinstance(patch, LimitedDiffContainer), |
|
482 | 'limited_diff': isinstance(patch, LimitedDiffContainer), | |
472 | 'diffset': self, |
|
483 | 'diffset': self, | |
473 | }) |
|
484 | }) | |
474 |
|
485 | |||
475 | for hunk in hunks: |
|
486 | for hunk in hunks: | |
476 | hunkbit = self.parse_hunk(hunk, source_file, target_file) |
|
487 | hunkbit = self.parse_hunk(hunk, source_file, target_file) | |
477 | hunkbit.filediff = filediff |
|
488 | hunkbit.filediff = filediff | |
478 | filediff.hunks.append(hunkbit) |
|
489 | filediff.hunks.append(hunkbit) | |
479 |
|
490 | |||
480 | left_comments = {} |
|
491 | left_comments = {} | |
481 |
|
492 | |||
482 | if source_file_path in self.comments_store: |
|
493 | if source_file_path in self.comments_store: | |
483 | for lineno, comments in self.comments_store[source_file_path].items(): |
|
494 | for lineno, comments in self.comments_store[source_file_path].items(): | |
484 | left_comments[lineno] = comments |
|
495 | left_comments[lineno] = comments | |
485 |
|
496 | |||
486 | if target_file_path in self.comments_store: |
|
497 | if target_file_path in self.comments_store: | |
487 | for lineno, comments in self.comments_store[target_file_path].items(): |
|
498 | for lineno, comments in self.comments_store[target_file_path].items(): | |
488 | left_comments[lineno] = comments |
|
499 | left_comments[lineno] = comments | |
489 |
|
500 | |||
490 | filediff.left_comments = left_comments |
|
501 | filediff.left_comments = left_comments | |
491 | return filediff |
|
502 | return filediff | |
492 |
|
503 | |||
493 | def parse_hunk(self, hunk, source_file, target_file): |
|
504 | def parse_hunk(self, hunk, source_file, target_file): | |
494 | result = AttributeDict(dict( |
|
505 | result = AttributeDict(dict( | |
495 | source_start=hunk['source_start'], |
|
506 | source_start=hunk['source_start'], | |
496 | source_length=hunk['source_length'], |
|
507 | source_length=hunk['source_length'], | |
497 | target_start=hunk['target_start'], |
|
508 | target_start=hunk['target_start'], | |
498 | target_length=hunk['target_length'], |
|
509 | target_length=hunk['target_length'], | |
499 | section_header=hunk['section_header'], |
|
510 | section_header=hunk['section_header'], | |
500 | lines=[], |
|
511 | lines=[], | |
501 | )) |
|
512 | )) | |
502 | before, after = [], [] |
|
513 | before, after = [], [] | |
503 |
|
514 | |||
504 | for line in hunk['lines']: |
|
515 | for line in hunk['lines']: | |
505 | if line['action'] == 'unmod': |
|
516 | if line['action'] == 'unmod': | |
506 | result.lines.extend( |
|
517 | result.lines.extend( | |
507 | self.parse_lines(before, after, source_file, target_file)) |
|
518 | self.parse_lines(before, after, source_file, target_file)) | |
508 | after.append(line) |
|
519 | after.append(line) | |
509 | before.append(line) |
|
520 | before.append(line) | |
510 | elif line['action'] == 'add': |
|
521 | elif line['action'] == 'add': | |
511 | after.append(line) |
|
522 | after.append(line) | |
512 | elif line['action'] == 'del': |
|
523 | elif line['action'] == 'del': | |
513 | before.append(line) |
|
524 | before.append(line) | |
514 | elif line['action'] == 'old-no-nl': |
|
525 | elif line['action'] == 'old-no-nl': | |
515 | before.append(line) |
|
526 | before.append(line) | |
516 | elif line['action'] == 'new-no-nl': |
|
527 | elif line['action'] == 'new-no-nl': | |
517 | after.append(line) |
|
528 | after.append(line) | |
518 |
|
529 | |||
519 | result.lines.extend( |
|
530 | result.lines.extend( | |
520 | self.parse_lines(before, after, source_file, target_file)) |
|
531 | self.parse_lines(before, after, source_file, target_file)) | |
521 | result.unified = self.as_unified(result.lines) |
|
532 | result.unified = self.as_unified(result.lines) | |
522 | result.sideside = result.lines |
|
533 | result.sideside = result.lines | |
523 |
|
534 | |||
524 | return result |
|
535 | return result | |
525 |
|
536 | |||
526 | def parse_lines(self, before_lines, after_lines, source_file, target_file): |
|
537 | def parse_lines(self, before_lines, after_lines, source_file, target_file): | |
527 | # TODO: dan: investigate doing the diff comparison and fast highlighting |
|
538 | # TODO: dan: investigate doing the diff comparison and fast highlighting | |
528 | # on the entire before and after buffered block lines rather than by |
|
539 | # on the entire before and after buffered block lines rather than by | |
529 | # line, this means we can get better 'fast' highlighting if the context |
|
540 | # line, this means we can get better 'fast' highlighting if the context | |
530 | # allows it - eg. |
|
541 | # allows it - eg. | |
531 | # line 4: """ |
|
542 | # line 4: """ | |
532 | # line 5: this gets highlighted as a string |
|
543 | # line 5: this gets highlighted as a string | |
533 | # line 6: """ |
|
544 | # line 6: """ | |
534 |
|
545 | |||
535 | lines = [] |
|
546 | lines = [] | |
536 | while before_lines or after_lines: |
|
547 | while before_lines or after_lines: | |
537 | before, after = None, None |
|
548 | before, after = None, None | |
538 | before_tokens, after_tokens = None, None |
|
549 | before_tokens, after_tokens = None, None | |
539 |
|
550 | |||
540 | if before_lines: |
|
551 | if before_lines: | |
541 | before = before_lines.pop(0) |
|
552 | before = before_lines.pop(0) | |
542 | if after_lines: |
|
553 | if after_lines: | |
543 | after = after_lines.pop(0) |
|
554 | after = after_lines.pop(0) | |
544 |
|
555 | |||
545 | original = AttributeDict() |
|
556 | original = AttributeDict() | |
546 | modified = AttributeDict() |
|
557 | modified = AttributeDict() | |
547 |
|
558 | |||
548 | if before: |
|
559 | if before: | |
549 | if before['action'] == 'old-no-nl': |
|
560 | if before['action'] == 'old-no-nl': | |
550 | before_tokens = [('nonl', before['line'])] |
|
561 | before_tokens = [('nonl', before['line'])] | |
551 | else: |
|
562 | else: | |
552 | before_tokens = self.get_line_tokens( |
|
563 | before_tokens = self.get_line_tokens( | |
553 | line_text=before['line'], line_number=before['old_lineno'], |
|
564 | line_text=before['line'], line_number=before['old_lineno'], | |
554 | file=source_file) |
|
565 | file=source_file) | |
555 | original.lineno = before['old_lineno'] |
|
566 | original.lineno = before['old_lineno'] | |
556 | original.content = before['line'] |
|
567 | original.content = before['line'] | |
557 | original.action = self.action_to_op(before['action']) |
|
568 | original.action = self.action_to_op(before['action']) | |
558 | original.comments = self.get_comments_for('old', |
|
569 | original.comments = self.get_comments_for('old', | |
559 | source_file, before['old_lineno']) |
|
570 | source_file, before['old_lineno']) | |
560 |
|
571 | |||
561 | if after: |
|
572 | if after: | |
562 | if after['action'] == 'new-no-nl': |
|
573 | if after['action'] == 'new-no-nl': | |
563 | after_tokens = [('nonl', after['line'])] |
|
574 | after_tokens = [('nonl', after['line'])] | |
564 | else: |
|
575 | else: | |
565 | after_tokens = self.get_line_tokens( |
|
576 | after_tokens = self.get_line_tokens( | |
566 | line_text=after['line'], line_number=after['new_lineno'], |
|
577 | line_text=after['line'], line_number=after['new_lineno'], | |
567 | file=target_file) |
|
578 | file=target_file) | |
568 | modified.lineno = after['new_lineno'] |
|
579 | modified.lineno = after['new_lineno'] | |
569 | modified.content = after['line'] |
|
580 | modified.content = after['line'] | |
570 | modified.action = self.action_to_op(after['action']) |
|
581 | modified.action = self.action_to_op(after['action']) | |
571 | modified.comments = self.get_comments_for('new', |
|
582 | modified.comments = self.get_comments_for('new', | |
572 | target_file, after['new_lineno']) |
|
583 | target_file, after['new_lineno']) | |
573 |
|
584 | |||
574 | # diff the lines |
|
585 | # diff the lines | |
575 | if before_tokens and after_tokens: |
|
586 | if before_tokens and after_tokens: | |
576 | o_tokens, m_tokens, similarity = tokens_diff( |
|
587 | o_tokens, m_tokens, similarity = tokens_diff( | |
577 | before_tokens, after_tokens) |
|
588 | before_tokens, after_tokens) | |
578 | original.content = render_tokenstream(o_tokens) |
|
589 | original.content = render_tokenstream(o_tokens) | |
579 | modified.content = render_tokenstream(m_tokens) |
|
590 | modified.content = render_tokenstream(m_tokens) | |
580 | elif before_tokens: |
|
591 | elif before_tokens: | |
581 | original.content = render_tokenstream( |
|
592 | original.content = render_tokenstream( | |
582 | [(x[0], '', x[1]) for x in before_tokens]) |
|
593 | [(x[0], '', x[1]) for x in before_tokens]) | |
583 | elif after_tokens: |
|
594 | elif after_tokens: | |
584 | modified.content = render_tokenstream( |
|
595 | modified.content = render_tokenstream( | |
585 | [(x[0], '', x[1]) for x in after_tokens]) |
|
596 | [(x[0], '', x[1]) for x in after_tokens]) | |
586 |
|
597 | |||
587 | lines.append(AttributeDict({ |
|
598 | lines.append(AttributeDict({ | |
588 | 'original': original, |
|
599 | 'original': original, | |
589 | 'modified': modified, |
|
600 | 'modified': modified, | |
590 | })) |
|
601 | })) | |
591 |
|
602 | |||
592 | return lines |
|
603 | return lines | |
593 |
|
604 | |||
594 | def get_comments_for(self, version, file, line_number): |
|
605 | def get_comments_for(self, version, file, line_number): | |
595 | if hasattr(file, 'unicode_path'): |
|
606 | if hasattr(file, 'unicode_path'): | |
596 | file = file.unicode_path |
|
607 | file = file.unicode_path | |
597 |
|
608 | |||
598 | if not isinstance(file, basestring): |
|
609 | if not isinstance(file, basestring): | |
599 | return None |
|
610 | return None | |
600 |
|
611 | |||
601 | line_key = { |
|
612 | line_key = { | |
602 | 'old': 'o', |
|
613 | 'old': 'o', | |
603 | 'new': 'n', |
|
614 | 'new': 'n', | |
604 | }[version] + str(line_number) |
|
615 | }[version] + str(line_number) | |
605 |
|
616 | |||
606 | if file in self.comments_store: |
|
617 | if file in self.comments_store: | |
607 | file_comments = self.comments_store[file] |
|
618 | file_comments = self.comments_store[file] | |
608 | if line_key in file_comments: |
|
619 | if line_key in file_comments: | |
609 | return file_comments.pop(line_key) |
|
620 | return file_comments.pop(line_key) | |
610 |
|
621 | |||
611 | def get_line_tokens(self, line_text, line_number, file=None): |
|
622 | def get_line_tokens(self, line_text, line_number, file=None): | |
612 | filenode = None |
|
623 | filenode = None | |
613 | filename = None |
|
624 | filename = None | |
614 |
|
625 | |||
615 | if isinstance(file, basestring): |
|
626 | if isinstance(file, basestring): | |
616 | filename = file |
|
627 | filename = file | |
617 | elif isinstance(file, FileNode): |
|
628 | elif isinstance(file, FileNode): | |
618 | filenode = file |
|
629 | filenode = file | |
619 | filename = file.unicode_path |
|
630 | filename = file.unicode_path | |
620 |
|
631 | |||
621 | if self.highlight_mode == self.HL_REAL and filenode: |
|
632 | if self.highlight_mode == self.HL_REAL and filenode: | |
622 | if line_number and file.size < self.max_file_size_limit: |
|
633 | lexer = self._get_lexer_for_filename(filename) | |
623 | return self.get_tokenized_filenode_line(file, line_number) |
|
634 | file_size_allowed = file.size < self.max_file_size_limit | |
|
635 | if line_number and file_size_allowed: | |||
|
636 | return self.get_tokenized_filenode_line( | |||
|
637 | file, line_number, lexer) | |||
624 |
|
638 | |||
625 | if self.highlight_mode in (self.HL_REAL, self.HL_FAST) and filename: |
|
639 | if self.highlight_mode in (self.HL_REAL, self.HL_FAST) and filename: | |
626 | lexer = self._get_lexer_for_filename(filename) |
|
640 | lexer = self._get_lexer_for_filename(filename) | |
627 | return list(tokenize_string(line_text, lexer)) |
|
641 | return list(tokenize_string(line_text, lexer)) | |
628 |
|
642 | |||
629 | return list(tokenize_string(line_text, plain_text_lexer)) |
|
643 | return list(tokenize_string(line_text, plain_text_lexer)) | |
630 |
|
644 | |||
631 | def get_tokenized_filenode_line(self, filenode, line_number): |
|
645 | def get_tokenized_filenode_line(self, filenode, line_number, lexer=None): | |
632 |
|
646 | |||
633 | if filenode not in self.highlighted_filenodes: |
|
647 | if filenode not in self.highlighted_filenodes: | |
634 | tokenized_lines = filenode_as_lines_tokens(filenode) |
|
648 | tokenized_lines = filenode_as_lines_tokens(filenode, lexer) | |
635 | self.highlighted_filenodes[filenode] = tokenized_lines |
|
649 | self.highlighted_filenodes[filenode] = tokenized_lines | |
636 | return self.highlighted_filenodes[filenode][line_number - 1] |
|
650 | return self.highlighted_filenodes[filenode][line_number - 1] | |
637 |
|
651 | |||
638 | def action_to_op(self, action): |
|
652 | def action_to_op(self, action): | |
639 | return { |
|
653 | return { | |
640 | 'add': '+', |
|
654 | 'add': '+', | |
641 | 'del': '-', |
|
655 | 'del': '-', | |
642 | 'unmod': ' ', |
|
656 | 'unmod': ' ', | |
643 | 'old-no-nl': ' ', |
|
657 | 'old-no-nl': ' ', | |
644 | 'new-no-nl': ' ', |
|
658 | 'new-no-nl': ' ', | |
645 | }.get(action, action) |
|
659 | }.get(action, action) | |
646 |
|
660 | |||
647 | def as_unified(self, lines): |
|
661 | def as_unified(self, lines): | |
648 | """ |
|
662 | """ | |
649 | Return a generator that yields the lines of a diff in unified order |
|
663 | Return a generator that yields the lines of a diff in unified order | |
650 | """ |
|
664 | """ | |
651 | def generator(): |
|
665 | def generator(): | |
652 | buf = [] |
|
666 | buf = [] | |
653 | for line in lines: |
|
667 | for line in lines: | |
654 |
|
668 | |||
655 | if buf and not line.original or line.original.action == ' ': |
|
669 | if buf and not line.original or line.original.action == ' ': | |
656 | for b in buf: |
|
670 | for b in buf: | |
657 | yield b |
|
671 | yield b | |
658 | buf = [] |
|
672 | buf = [] | |
659 |
|
673 | |||
660 | if line.original: |
|
674 | if line.original: | |
661 | if line.original.action == ' ': |
|
675 | if line.original.action == ' ': | |
662 | yield (line.original.lineno, line.modified.lineno, |
|
676 | yield (line.original.lineno, line.modified.lineno, | |
663 | line.original.action, line.original.content, |
|
677 | line.original.action, line.original.content, | |
664 | line.original.comments) |
|
678 | line.original.comments) | |
665 | continue |
|
679 | continue | |
666 |
|
680 | |||
667 | if line.original.action == '-': |
|
681 | if line.original.action == '-': | |
668 | yield (line.original.lineno, None, |
|
682 | yield (line.original.lineno, None, | |
669 | line.original.action, line.original.content, |
|
683 | line.original.action, line.original.content, | |
670 | line.original.comments) |
|
684 | line.original.comments) | |
671 |
|
685 | |||
672 | if line.modified.action == '+': |
|
686 | if line.modified.action == '+': | |
673 | buf.append(( |
|
687 | buf.append(( | |
674 | None, line.modified.lineno, |
|
688 | None, line.modified.lineno, | |
675 | line.modified.action, line.modified.content, |
|
689 | line.modified.action, line.modified.content, | |
676 | line.modified.comments)) |
|
690 | line.modified.comments)) | |
677 | continue |
|
691 | continue | |
678 |
|
692 | |||
679 | if line.modified: |
|
693 | if line.modified: | |
680 | yield (None, line.modified.lineno, |
|
694 | yield (None, line.modified.lineno, | |
681 | line.modified.action, line.modified.content, |
|
695 | line.modified.action, line.modified.content, | |
682 | line.modified.comments) |
|
696 | line.modified.comments) | |
683 |
|
697 | |||
684 | for b in buf: |
|
698 | for b in buf: | |
685 | yield b |
|
699 | yield b | |
686 |
|
700 | |||
687 | return generator() |
|
701 | return generator() |
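
The center of this change is _get_lexer_for_filename(): lexers are now cached per filename, a passed-in FileNode's already-resolved lexer is preferred over a fresh filename lookup, and render_patch() writes the cached lexer back onto the node so get_line_tokens() and get_tokenized_filenode_line() can hand it straight to filenode_as_lines_tokens() instead of re-detecting a lexer from file content. Below is a minimal standalone sketch of that caching pattern, assuming only plain pygments; get_cached_lexer and _lexer_cache are illustrative names, and pygments' get_lexer_for_filename / TextLexer merely stand in for RhodeCode's get_lexer_safe helper and a real FileNode.lexer.

from pygments.lexers import TextLexer, get_lexer_for_filename
from pygments.util import ClassNotFound

_lexer_cache = {}

def get_cached_lexer(filename, filenode=None):
    # cached because the same filename can be resolved twice (source and target side)
    if filename not in _lexer_cache:
        if filenode is not None:
            # prefer the lexer the file node has already worked out from its content
            lexer = filenode.lexer
        else:
            try:
                lexer = get_lexer_for_filename(filename, stripnl=False, ensurenl=False)
            except ClassNotFound:
                lexer = TextLexer(stripnl=False, ensurenl=False)
        _lexer_cache[filename] = lexer
    return _lexer_cache[filename]

# the second lookup for the same name is a dict hit, not another pygments lookup
assert get_cached_lexer('setup.py') is get_cached_lexer('setup.py')

Routing every caller through one cache means lexer detection, and in particular FileNode content inspection, happens at most once per file for a diff render rather than once per highlighted line.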
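
Further down the module, tokens_diff() decides whether intra-line ins/del markup is worth computing at all: if the old and new line are less than 60% similar by difflib ratio they are rendered as plain removed/added lines, otherwise an opcode walk produces per-token and per-character del/ins spans. A small self-contained illustration of that cutoff, using the two source_lexer lines from the hunk above as sample input (stdlib difflib only):

import difflib

old_line = "source_lexer = source_file.lexer"
new_line = "source_lexer = self._get_lexer_for_filename(source_filename)"

similarity = difflib.SequenceMatcher(None, old_line, new_line).ratio()
if similarity < 0.6:
    # too different: tokens_diff() gives every token an empty op, so the pair
    # renders as a whole removed line plus a whole added line
    print('%.2f similarity -> no intra-line highlighting' % similarity)
else:
    # similar enough: walk the opcodes and mark deleted/inserted spans, which is
    # what the 'replace' branch of tokens_diff() does token by token, then char by char
    matcher = difflib.SequenceMatcher(None, old_line, new_line)
    for tag, o1, o2, n1, n2 in matcher.get_opcodes():
        print(tag, repr(old_line[o1:o2]), '->', repr(new_line[n1:n2]))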