diffs: fixed problem with potential diff display.
milka - r4592:8ef51af3 stable
@@ -1,794 +1,798 b''
1 1 # -*- coding: utf-8 -*-
2 2
3 3 # Copyright (C) 2011-2020 RhodeCode GmbH
4 4 #
5 5 # This program is free software: you can redistribute it and/or modify
6 6 # it under the terms of the GNU Affero General Public License, version 3
7 7 # (only), as published by the Free Software Foundation.
8 8 #
9 9 # This program is distributed in the hope that it will be useful,
10 10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 12 # GNU General Public License for more details.
13 13 #
14 14 # You should have received a copy of the GNU Affero General Public License
15 15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
16 16 #
17 17 # This program is dual-licensed. If you wish to learn more about the
18 18 # RhodeCode Enterprise Edition, including its added features, Support services,
19 19 # and proprietary license terms, please see https://rhodecode.com/licenses/
20 20
21 21 import logging
22 22 import difflib
23 23 from itertools import groupby
24 24
25 25 from pygments import lex
26 26 from pygments.formatters.html import _get_ttype_class as pygment_token_class
27 27 from pygments.lexers.special import TextLexer, Token
28 28 from pygments.lexers import get_lexer_by_name
29 29 from pyramid import compat
30 30
31 31 from rhodecode.lib.helpers import (
32 32 get_lexer_for_filenode, html_escape, get_custom_lexer)
33 33 from rhodecode.lib.utils2 import AttributeDict, StrictAttributeDict, safe_unicode
34 34 from rhodecode.lib.vcs.nodes import FileNode
35 35 from rhodecode.lib.vcs.exceptions import VCSError, NodeDoesNotExistError
36 36 from rhodecode.lib.diff_match_patch import diff_match_patch
37 37 from rhodecode.lib.diffs import LimitedDiffContainer, DEL_FILENODE, BIN_FILENODE
38 38
39 39
40 40 plain_text_lexer = get_lexer_by_name(
41 41 'text', stripall=False, stripnl=False, ensurenl=False)
42 42
43 43
44 44 log = logging.getLogger(__name__)
45 45
46 46
47 47 def filenode_as_lines_tokens(filenode, lexer=None):
48 48 org_lexer = lexer
49 49 lexer = lexer or get_lexer_for_filenode(filenode)
50 50 log.debug('Generating file node pygment tokens for %s, %s, org_lexer:%s',
51 51 lexer, filenode, org_lexer)
52 52 content = filenode.content
53 53 tokens = tokenize_string(content, lexer)
54 54 lines = split_token_stream(tokens, content)
55 55 rv = list(lines)
56 56 return rv
57 57
58 58
59 59 def tokenize_string(content, lexer):
60 60 """
61 61 Use pygments to tokenize some content based on a lexer
62 62 ensuring all original new lines and whitespace is preserved
63 63 """
64 64
65 65 lexer.stripall = False
66 66 lexer.stripnl = False
67 67 lexer.ensurenl = False
68 68
69 69 if isinstance(lexer, TextLexer):
70 70 lexed = [(Token.Text, content)]
71 71 else:
72 72 lexed = lex(content, lexer)
73 73
74 74 for token_type, token_text in lexed:
75 75 yield pygment_token_class(token_type), token_text
76 76
77 77
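# Minimal standalone illustration of the tokenization above (a sketch, not part
# of the diff): lex a snippet with a whitespace-preserving pygments lexer and map
# every token type to its CSS class name, exactly as tokenize_string() does.
# Printed classes vary with the installed pygments version.
from pygments import lex
from pygments.formatters.html import _get_ttype_class
from pygments.lexers import get_lexer_by_name

py_lexer = get_lexer_by_name('python', stripall=False, stripnl=False, ensurenl=False)
for token_type, token_text in lex(u"x = 1\n", py_lexer):
    print('%s %r' % (_get_ttype_class(token_type), token_text))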
78 78 def split_token_stream(tokens, content):
79 79 """
80 80 Take a list of (TokenType, text) tuples and split them by a string
81 81
82 82 split_token_stream([(TEXT, 'some\ntext'), (TEXT, 'more\n')])
83 83 [(TEXT, 'some')], [(TEXT, 'text'), (TEXT, 'more')],
84 84 [(TEXT, '')]
85 85 """
86 86
87 87 token_buffer = []
88 88 for token_class, token_text in tokens:
89 89 parts = token_text.split('\n')
90 90 for part in parts[:-1]:
91 91 token_buffer.append((token_class, part))
92 92 yield token_buffer
93 93 token_buffer = []
94 94
95 95 token_buffer.append((token_class, parts[-1]))
96 96
97 97 if token_buffer:
98 98 yield token_buffer
99 99 elif content:
100 100 # this is a special case, we have the content, but tokenization didn't produce
101 101 # any results. This can happen if known file extensions like .css have some bogus
102 102 # unicode content without any newline characters
103 103 yield [(pygment_token_class(Token.Text), content)]
104 104
105 105
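# Standalone sketch of the splitting technique used by split_token_stream()
# above: split each token's text on '\n' and flush the line buffer at every
# newline. Plain 't' strings stand in for pygments token classes.
def split_by_newline(tokens):
    buf = []
    for cls, text in tokens:
        parts = text.split('\n')
        for part in parts[:-1]:
            buf.append((cls, part))
            yield buf
            buf = []
        buf.append((cls, parts[-1]))
    if buf:
        yield buf

print(list(split_by_newline([('t', 'some\ntext'), ('t', ' more\n')])))
# -> [[('t', 'some')], [('t', 'text'), ('t', ' more')], [('t', '')]]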
106 106 def filenode_as_annotated_lines_tokens(filenode):
107 107 """
108 108 Take a file node and return a list of annotations => lines, if no annotation
109 109 is found, it will be None.
110 110
111 111 eg:
112 112
113 113 [
114 114 (annotation1, [
115 115 (1, line1_tokens_list),
116 116 (2, line2_tokens_list),
117 117 ]),
118 118 (annotation2, [
119 119 (3, line1_tokens_list),
120 120 ]),
121 121 (None, [
122 122 (4, line1_tokens_list),
123 123 ]),
124 124 (annotation1, [
125 125 (5, line1_tokens_list),
126 126 (6, line2_tokens_list),
127 127 ])
128 128 ]
129 129 """
130 130
131 131 commit_cache = {} # cache commit_getter lookups
132 132
133 133 def _get_annotation(commit_id, commit_getter):
134 134 if commit_id not in commit_cache:
135 135 commit_cache[commit_id] = commit_getter()
136 136 return commit_cache[commit_id]
137 137
138 138 annotation_lookup = {
139 139 line_no: _get_annotation(commit_id, commit_getter)
140 140 for line_no, commit_id, commit_getter, line_content
141 141 in filenode.annotate
142 142 }
143 143
144 144 annotations_lines = ((annotation_lookup.get(line_no), line_no, tokens)
145 145 for line_no, tokens
146 146 in enumerate(filenode_as_lines_tokens(filenode), 1))
147 147
148 148 grouped_annotations_lines = groupby(annotations_lines, lambda x: x[0])
149 149
150 150 for annotation, group in grouped_annotations_lines:
151 151 yield (
152 152 annotation, [(line_no, tokens)
153 153 for (_, line_no, tokens) in group]
154 154 )
155 155
156 156
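# Standalone sketch of the grouping step above: consecutive
# (annotation, line_no, tokens) rows are collapsed with itertools.groupby so
# each annotation block carries its numbered lines. Plain strings stand in for
# commit objects and token lists.
from itertools import groupby

rows = [('c1', 1, ['a']), ('c1', 2, ['b']), (None, 3, ['c']), ('c1', 4, ['d'])]
grouped = [(anno, [(no, toks) for _, no, toks in grp])
           for anno, grp in groupby(rows, lambda x: x[0])]
print(grouped)
# -> [('c1', [(1, ['a']), (2, ['b'])]), (None, [(3, ['c'])]), ('c1', [(4, ['d'])])]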
157 157 def render_tokenstream(tokenstream):
158 158 result = []
159 159 for token_class, token_ops_texts in rollup_tokenstream(tokenstream):
160 160
161 161 if token_class:
162 162 result.append(u'<span class="%s">' % token_class)
163 163 else:
164 164 result.append(u'<span>')
165 165
166 166 for op_tag, token_text in token_ops_texts:
167 167
168 168 if op_tag:
169 169 result.append(u'<%s>' % op_tag)
170 170
171 171 # NOTE(marcink): in some cases of mixed encodings we might run into
172 172 # trouble in html_escape; in that case we force token_text to unicode,
173 173 # which ensures "correct" data even at the cost of how it renders
174 174 try:
175 175 escaped_text = html_escape(token_text)
176 176 except TypeError:
177 177 escaped_text = html_escape(safe_unicode(token_text))
178 178
179 179 # TODO: dan: investigate showing hidden characters like space/nl/tab
180 180 # escaped_text = escaped_text.replace(' ', '<sp> </sp>')
181 181 # escaped_text = escaped_text.replace('\n', '<nl>\n</nl>')
182 182 # escaped_text = escaped_text.replace('\t', '<tab>\t</tab>')
183 183
184 184 result.append(escaped_text)
185 185
186 186 if op_tag:
187 187 result.append(u'</%s>' % op_tag)
188 188
189 189 result.append(u'</span>')
190 190
191 191 html = ''.join(result)
192 192 return html
193 193
194 194
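# Condensed sketch of the HTML rendering above: each (class, [(op, text), ...])
# group from rollup_tokenstream() becomes one <span>, with ins/del wrappers
# emitted only where an op is set. A tiny stand-in escape is used here instead
# of rhodecode's html_escape helper.
def esc(text):
    return text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')

def render(rolled_up):
    out = []
    for cls, ops in rolled_up:
        out.append('<span class="%s">' % cls if cls else '<span>')
        for op, text in ops:
            out.append('<%s>%s</%s>' % (op, esc(text), op) if op else esc(text))
        out.append('</span>')
    return ''.join(out)

print(render([('A', [('ins', 'he'), ('', 'llo')])]))
# -> <span class="A"><ins>he</ins>llo</span>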
195 195 def rollup_tokenstream(tokenstream):
196 196 """
197 197 Group a token stream of the format:
198 198
199 199 ('class', 'op', 'text')
200 200 or
201 201 ('class', 'text')
202 202
203 203 into
204 204
205 205 [('class1',
206 206 [('op1', 'text'),
207 207 ('op2', 'text')]),
208 208 ('class2',
209 209 [('op3', 'text')])]
210 210
211 211 This is used to produce the minimal set of tags necessary when
212 212 rendering a token stream to html, eg.
213 213
214 214 <span class="A"><ins>he</ins>llo</span>
215 215 vs
216 216 <span class="A"><ins>he</ins></span><span class="A">llo</span>
217 217
218 218 If a 2 tuple is passed in, the output op will be an empty string.
219 219
220 220 eg:
221 221
222 222 >>> rollup_tokenstream([('classA', '', 'h'),
223 223 ('classA', 'del', 'ell'),
224 224 ('classA', '', 'o'),
225 225 ('classB', '', ' '),
226 226 ('classA', '', 'the'),
227 227 ('classA', '', 're'),
228 228 ])
229 229
230 230 [('classA', [('', 'h'), ('del', 'ell'), ('', 'o')]),
231 231 ('classB', [('', ' ')]),
232 232 ('classA', [('', 'there')])]
233 233
234 234 """
235 235 if tokenstream and len(tokenstream[0]) == 2:
236 236 tokenstream = ((t[0], '', t[1]) for t in tokenstream)
237 237
238 238 result = []
239 239 for token_class, op_list in groupby(tokenstream, lambda t: t[0]):
240 240 ops = []
241 241 for token_op, token_text_list in groupby(op_list, lambda o: o[1]):
242 242 text_buffer = []
243 243 for t_class, t_op, t_text in token_text_list:
244 244 text_buffer.append(t_text)
245 245 ops.append((token_op, ''.join(text_buffer)))
246 246 result.append((token_class, ops))
247 247 return result
248 248
249 249
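# Standalone sketch of the two-level rollup described above: group by token
# class first, then by op within each class, joining the text pieces. This is
# the docstring example, reproduced with plain itertools.groupby.
from itertools import groupby

stream = [('classA', '', 'h'), ('classA', 'del', 'ell'), ('classA', '', 'o'),
          ('classB', '', ' '), ('classA', '', 'the'), ('classA', '', 're')]
rolled = []
for cls, by_class in groupby(stream, lambda t: t[0]):
    ops = [(op, ''.join(t[2] for t in chunk))
           for op, chunk in groupby(by_class, lambda t: t[1])]
    rolled.append((cls, ops))
print(rolled)
# -> [('classA', [('', 'h'), ('del', 'ell'), ('', 'o')]),
#     ('classB', [('', ' ')]),
#     ('classA', [('', 'there')])]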
250 250 def tokens_diff(old_tokens, new_tokens, use_diff_match_patch=True):
251 251 """
252 252 Converts a list of (token_class, token_text) tuples to a list of
253 253 (token_class, token_op, token_text) tuples where token_op is one of
254 254 ('ins', 'del', '')
255 255
256 256 :param old_tokens: list of (token_class, token_text) tuples of old line
257 257 :param new_tokens: list of (token_class, token_text) tuples of new line
258 258 :param use_diff_match_patch: boolean, will use google's diff match patch
259 259 library which has options to 'smooth' out the character by character
260 260 differences making nicer ins/del blocks
261 261 """
262 262
263 263 old_tokens_result = []
264 264 new_tokens_result = []
265 265
266 266 similarity = difflib.SequenceMatcher(None,
267 267 ''.join(token_text for token_class, token_text in old_tokens),
268 268 ''.join(token_text for token_class, token_text in new_tokens)
269 269 ).ratio()
270 270
271 271 if similarity < 0.6: # return, the blocks are too different
272 272 for token_class, token_text in old_tokens:
273 273 old_tokens_result.append((token_class, '', token_text))
274 274 for token_class, token_text in new_tokens:
275 275 new_tokens_result.append((token_class, '', token_text))
276 276 return old_tokens_result, new_tokens_result, similarity
277 277
278 278 token_sequence_matcher = difflib.SequenceMatcher(None,
279 279 [x[1] for x in old_tokens],
280 280 [x[1] for x in new_tokens])
281 281
282 282 for tag, o1, o2, n1, n2 in token_sequence_matcher.get_opcodes():
283 283 # check the differences by token block types first to give a more
284 284 # nicer "block" level replacement vs character diffs
285 285
286 286 if tag == 'equal':
287 287 for token_class, token_text in old_tokens[o1:o2]:
288 288 old_tokens_result.append((token_class, '', token_text))
289 289 for token_class, token_text in new_tokens[n1:n2]:
290 290 new_tokens_result.append((token_class, '', token_text))
291 291 elif tag == 'delete':
292 292 for token_class, token_text in old_tokens[o1:o2]:
293 293 old_tokens_result.append((token_class, 'del', token_text))
294 294 elif tag == 'insert':
295 295 for token_class, token_text in new_tokens[n1:n2]:
296 296 new_tokens_result.append((token_class, 'ins', token_text))
297 297 elif tag == 'replace':
298 298 # if same type token blocks must be replaced, do a diff on the
299 299 # characters in the token blocks to show individual changes
300 300
301 301 old_char_tokens = []
302 302 new_char_tokens = []
303 303 for token_class, token_text in old_tokens[o1:o2]:
304 304 for char in token_text:
305 305 old_char_tokens.append((token_class, char))
306 306
307 307 for token_class, token_text in new_tokens[n1:n2]:
308 308 for char in token_text:
309 309 new_char_tokens.append((token_class, char))
310 310
311 311 old_string = ''.join([token_text for
312 312 token_class, token_text in old_char_tokens])
313 313 new_string = ''.join([token_text for
314 314 token_class, token_text in new_char_tokens])
315 315
316 316 char_sequence = difflib.SequenceMatcher(
317 317 None, old_string, new_string)
318 318 copcodes = char_sequence.get_opcodes()
319 319 obuffer, nbuffer = [], []
320 320
321 321 if use_diff_match_patch:
322 322 dmp = diff_match_patch()
323 323 dmp.Diff_EditCost = 11 # TODO: dan: extract this to a setting
324 324 reps = dmp.diff_main(old_string, new_string)
325 325 dmp.diff_cleanupEfficiency(reps)
326 326
327 327 a, b = 0, 0
328 328 for op, rep in reps:
329 329 l = len(rep)
330 330 if op == 0:
331 331 for i, c in enumerate(rep):
332 332 obuffer.append((old_char_tokens[a+i][0], '', c))
333 333 nbuffer.append((new_char_tokens[b+i][0], '', c))
334 334 a += l
335 335 b += l
336 336 elif op == -1:
337 337 for i, c in enumerate(rep):
338 338 obuffer.append((old_char_tokens[a+i][0], 'del', c))
339 339 a += l
340 340 elif op == 1:
341 341 for i, c in enumerate(rep):
342 342 nbuffer.append((new_char_tokens[b+i][0], 'ins', c))
343 343 b += l
344 344 else:
345 345 for ctag, co1, co2, cn1, cn2 in copcodes:
346 346 if ctag == 'equal':
347 347 for token_class, token_text in old_char_tokens[co1:co2]:
348 348 obuffer.append((token_class, '', token_text))
349 349 for token_class, token_text in new_char_tokens[cn1:cn2]:
350 350 nbuffer.append((token_class, '', token_text))
351 351 elif ctag == 'delete':
352 352 for token_class, token_text in old_char_tokens[co1:co2]:
353 353 obuffer.append((token_class, 'del', token_text))
354 354 elif ctag == 'insert':
355 355 for token_class, token_text in new_char_tokens[cn1:cn2]:
356 356 nbuffer.append((token_class, 'ins', token_text))
357 357 elif ctag == 'replace':
358 358 for token_class, token_text in old_char_tokens[co1:co2]:
359 359 obuffer.append((token_class, 'del', token_text))
360 360 for token_class, token_text in new_char_tokens[cn1:cn2]:
361 361 nbuffer.append((token_class, 'ins', token_text))
362 362
363 363 old_tokens_result.extend(obuffer)
364 364 new_tokens_result.extend(nbuffer)
365 365
366 366 return old_tokens_result, new_tokens_result, similarity
367 367
368 368
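# Minimal sketch of the character-level step above: difflib.SequenceMatcher
# opcodes over the two joined strings become ('', 'del', 'ins') ops; the
# diff_match_patch branch is only an optional smoothing of the same idea.
import difflib

old, new = 'color = 1', 'colour = 2'
sm = difflib.SequenceMatcher(None, old, new)
old_ops, new_ops = [], []
for tag, o1, o2, n1, n2 in sm.get_opcodes():
    if tag == 'equal':
        old_ops.append(('', old[o1:o2]))
        new_ops.append(('', new[n1:n2]))
    if tag in ('delete', 'replace'):
        old_ops.append(('del', old[o1:o2]))
    if tag in ('insert', 'replace'):
        new_ops.append(('ins', new[n1:n2]))
print(old_ops)  # [('', 'colo'), ('', 'r = '), ('del', '1')]
print(new_ops)  # [('', 'colo'), ('ins', 'u'), ('', 'r = '), ('ins', '2')]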
369 369 def diffset_node_getter(commit):
370 370 def get_node(fname):
371 371 try:
372 372 return commit.get_node(fname)
373 373 except NodeDoesNotExistError:
374 374 return None
375 375
376 376 return get_node
377 377
378 378
379 379 class DiffSet(object):
380 380 """
381 381 An object for parsing the diff result from diffs.DiffProcessor and
382 382 adding highlighting, side by side/unified renderings and line diffs
383 383 """
384 384
385 385 HL_REAL = 'REAL' # highlights using original file, slow
386 386 HL_FAST = 'FAST' # highlights using just the line, fast but not correct
387 387 # in the case of multiline code
388 388 HL_NONE = 'NONE' # no highlighting, fastest
389 389
390 390 def __init__(self, highlight_mode=HL_REAL, repo_name=None,
391 391 source_repo_name=None,
392 392 source_node_getter=lambda filename: None,
393 393 target_repo_name=None,
394 394 target_node_getter=lambda filename: None,
395 395 source_nodes=None, target_nodes=None,
396 396 # files over this size will use fast highlighting
397 397 max_file_size_limit=150 * 1024,
398 398 ):
399 399
400 400 self.highlight_mode = highlight_mode
401 self.highlighted_filenodes = {}
401 self.highlighted_filenodes = {
402 'before': {},
403 'after': {}
404 }
402 405 self.source_node_getter = source_node_getter
403 406 self.target_node_getter = target_node_getter
404 407 self.source_nodes = source_nodes or {}
405 408 self.target_nodes = target_nodes or {}
406 409 self.repo_name = repo_name
407 410 self.target_repo_name = target_repo_name or repo_name
408 411 self.source_repo_name = source_repo_name or repo_name
409 412 self.max_file_size_limit = max_file_size_limit
410 413
411 414 def render_patchset(self, patchset, source_ref=None, target_ref=None):
412 415 diffset = AttributeDict(dict(
413 416 lines_added=0,
414 417 lines_deleted=0,
415 418 changed_files=0,
416 419 files=[],
417 420 file_stats={},
418 421 limited_diff=isinstance(patchset, LimitedDiffContainer),
419 422 repo_name=self.repo_name,
420 423 target_repo_name=self.target_repo_name,
421 424 source_repo_name=self.source_repo_name,
422 425 source_ref=source_ref,
423 426 target_ref=target_ref,
424 427 ))
425 428 for patch in patchset:
426 429 diffset.file_stats[patch['filename']] = patch['stats']
427 430 filediff = self.render_patch(patch)
428 431 filediff.diffset = StrictAttributeDict(dict(
429 432 source_ref=diffset.source_ref,
430 433 target_ref=diffset.target_ref,
431 434 repo_name=diffset.repo_name,
432 435 source_repo_name=diffset.source_repo_name,
433 436 target_repo_name=diffset.target_repo_name,
434 437 ))
435 438 diffset.files.append(filediff)
436 439 diffset.changed_files += 1
437 440 if not patch['stats']['binary']:
438 441 diffset.lines_added += patch['stats']['added']
439 442 diffset.lines_deleted += patch['stats']['deleted']
440 443
441 444 return diffset
442 445
443 446 _lexer_cache = {}
444 447
445 448 def _get_lexer_for_filename(self, filename, filenode=None):
446 449 # cached because we might need to call it twice for source/target
447 450 if filename not in self._lexer_cache:
448 451 if filenode:
449 452 lexer = filenode.lexer
450 453 extension = filenode.extension
451 454 else:
452 455 lexer = FileNode.get_lexer(filename=filename)
453 456 extension = filename.split('.')[-1]
454 457
455 458 lexer = get_custom_lexer(extension) or lexer
456 459 self._lexer_cache[filename] = lexer
457 460 return self._lexer_cache[filename]
458 461
459 462 def render_patch(self, patch):
460 463 log.debug('rendering diff for %r', patch['filename'])
461 464
462 465 source_filename = patch['original_filename']
463 466 target_filename = patch['filename']
464 467
465 468 source_lexer = plain_text_lexer
466 469 target_lexer = plain_text_lexer
467 470
468 471 if not patch['stats']['binary']:
469 472 node_hl_mode = self.HL_NONE if patch['chunks'] == [] else None
470 473 hl_mode = node_hl_mode or self.highlight_mode
471 474
472 475 if hl_mode == self.HL_REAL:
473 476 if (source_filename and patch['operation'] in ('D', 'M')
474 477 and source_filename not in self.source_nodes):
475 478 self.source_nodes[source_filename] = (
476 479 self.source_node_getter(source_filename))
477 480
478 481 if (target_filename and patch['operation'] in ('A', 'M')
479 482 and target_filename not in self.target_nodes):
480 483 self.target_nodes[target_filename] = (
481 484 self.target_node_getter(target_filename))
482 485
483 486 elif hl_mode == self.HL_FAST:
484 487 source_lexer = self._get_lexer_for_filename(source_filename)
485 488 target_lexer = self._get_lexer_for_filename(target_filename)
486 489
487 490 source_file = self.source_nodes.get(source_filename, source_filename)
488 491 target_file = self.target_nodes.get(target_filename, target_filename)
489 492 raw_id_uid = ''
490 493 if self.source_nodes.get(source_filename):
491 494 raw_id_uid = self.source_nodes[source_filename].commit.raw_id
492 495
493 496 if not raw_id_uid and self.target_nodes.get(target_filename):
494 497 # in case this is a new file we only have it in target
495 498 raw_id_uid = self.target_nodes[target_filename].commit.raw_id
496 499
497 500 source_filenode, target_filenode = None, None
498 501
499 502 # TODO: dan: FileNode.lexer works on the content of the file - which
500 503 # can be slow - issue #4289 explains a lexer clean up - which once
501 504 # done can allow caching a lexer for a filenode to avoid the file lookup
502 505 if isinstance(source_file, FileNode):
503 506 source_filenode = source_file
504 507 #source_lexer = source_file.lexer
505 508 source_lexer = self._get_lexer_for_filename(source_filename)
506 509 source_file.lexer = source_lexer
507 510
508 511 if isinstance(target_file, FileNode):
509 512 target_filenode = target_file
510 513 #target_lexer = target_file.lexer
511 514 target_lexer = self._get_lexer_for_filename(target_filename)
512 515 target_file.lexer = target_lexer
513 516
514 517 source_file_path, target_file_path = None, None
515 518
516 519 if source_filename != '/dev/null':
517 520 source_file_path = source_filename
518 521 if target_filename != '/dev/null':
519 522 target_file_path = target_filename
520 523
521 524 source_file_type = source_lexer.name
522 525 target_file_type = target_lexer.name
523 526
524 527 filediff = AttributeDict({
525 528 'source_file_path': source_file_path,
526 529 'target_file_path': target_file_path,
527 530 'source_filenode': source_filenode,
528 531 'target_filenode': target_filenode,
529 532 'source_file_type': target_file_type,
530 533 'target_file_type': source_file_type,
531 534 'patch': {'filename': patch['filename'], 'stats': patch['stats']},
532 535 'operation': patch['operation'],
533 536 'source_mode': patch['stats']['old_mode'],
534 537 'target_mode': patch['stats']['new_mode'],
535 538 'limited_diff': patch['is_limited_diff'],
536 539 'hunks': [],
537 540 'hunk_ops': None,
538 541 'diffset': self,
539 542 'raw_id': raw_id_uid,
540 543 })
541 544
542 545 file_chunks = patch['chunks'][1:]
543 546 for i, hunk in enumerate(file_chunks, 1):
544 547 hunkbit = self.parse_hunk(hunk, source_file, target_file)
545 548 hunkbit.source_file_path = source_file_path
546 549 hunkbit.target_file_path = target_file_path
547 550 hunkbit.index = i
548 551 filediff.hunks.append(hunkbit)
549 552
550 553 # Simulate hunk on OPS type line which doesn't really contain any diff
551 554 # this allows commenting on those
552 555 if not file_chunks:
553 556 actions = []
554 557 for op_id, op_text in filediff.patch['stats']['ops'].items():
555 558 if op_id == DEL_FILENODE:
556 559 actions.append(u'file was removed')
557 560 elif op_id == BIN_FILENODE:
558 561 actions.append(u'binary diff hidden')
559 562 else:
560 563 actions.append(safe_unicode(op_text))
561 564 action_line = u'NO CONTENT: ' + \
562 565 u', '.join(actions) or u'UNDEFINED_ACTION'
563 566
564 567 hunk_ops = {'source_length': 0, 'source_start': 0,
565 568 'lines': [
566 569 {'new_lineno': 0, 'old_lineno': 1,
567 570 'action': 'unmod-no-hl', 'line': action_line}
568 571 ],
569 572 'section_header': u'', 'target_start': 1, 'target_length': 1}
570 573
571 574 hunkbit = self.parse_hunk(hunk_ops, source_file, target_file)
572 575 hunkbit.source_file_path = source_file_path
573 576 hunkbit.target_file_path = target_file_path
574 577 filediff.hunk_ops = hunkbit
575 578 return filediff
576 579
577 580 def parse_hunk(self, hunk, source_file, target_file):
578 581 result = AttributeDict(dict(
579 582 source_start=hunk['source_start'],
580 583 source_length=hunk['source_length'],
581 584 target_start=hunk['target_start'],
582 585 target_length=hunk['target_length'],
583 586 section_header=hunk['section_header'],
584 587 lines=[],
585 588 ))
586 589 before, after = [], []
587 590
588 591 for line in hunk['lines']:
589 592 if line['action'] in ['unmod', 'unmod-no-hl']:
590 593 no_hl = line['action'] == 'unmod-no-hl'
591 594 result.lines.extend(
592 595 self.parse_lines(before, after, source_file, target_file, no_hl=no_hl))
593 596 after.append(line)
594 597 before.append(line)
595 598 elif line['action'] == 'add':
596 599 after.append(line)
597 600 elif line['action'] == 'del':
598 601 before.append(line)
599 602 elif line['action'] == 'old-no-nl':
600 603 before.append(line)
601 604 elif line['action'] == 'new-no-nl':
602 605 after.append(line)
603 606
604 607 all_actions = [x['action'] for x in after] + [x['action'] for x in before]
605 608 no_hl = {x for x in all_actions} == {'unmod-no-hl'}
606 609 result.lines.extend(
607 610 self.parse_lines(before, after, source_file, target_file, no_hl=no_hl))
608 611 # NOTE(marcink): we must keep list() call here so we can cache the result...
609 612 result.unified = list(self.as_unified(result.lines))
610 613 result.sideside = result.lines
611 614
612 615 return result
613 616
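# Simplified sketch of the buffering done in parse_hunk() above: deleted lines
# collect in a 'before' buffer, added lines in an 'after' buffer, and every
# unmodified line flushes the pair (in the real method each flushed pair goes
# through parse_lines() for tokenizing and diffing).
hunk_lines = [
    {'action': 'unmod', 'line': 'keep'},
    {'action': 'del', 'line': 'old text'},
    {'action': 'add', 'line': 'new text'},
    {'action': 'unmod', 'line': 'keep too'},
]

before, after, flushed = [], [], []
for line in hunk_lines:
    if line['action'] in ('unmod', 'unmod-no-hl'):
        flushed.append((before, after))     # diff this pair of blocks together
        before, after = [line], [line]      # context shows up on both sides
    elif line['action'] in ('del', 'old-no-nl'):
        before.append(line)
    elif line['action'] in ('add', 'new-no-nl'):
        after.append(line)
flushed.append((before, after))

for before_blk, after_blk in flushed:
    print('%s | %s' % ([l['line'] for l in before_blk],
                       [l['line'] for l in after_blk]))
# [] | []
# ['keep', 'old text'] | ['keep', 'new text']
# ['keep too'] | ['keep too']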
614 617 def parse_lines(self, before_lines, after_lines, source_file, target_file,
615 618 no_hl=False):
616 619 # TODO: dan: investigate doing the diff comparison and fast highlighting
617 620 # on the entire before and after buffered block lines rather than by
618 621 # line, this means we can get better 'fast' highlighting if the context
619 622 # allows it - eg.
620 623 # line 4: """
621 624 # line 5: this gets highlighted as a string
622 625 # line 6: """
623 626
624 627 lines = []
625 628
626 629 before_newline = AttributeDict()
627 630 after_newline = AttributeDict()
628 631 if before_lines and before_lines[-1]['action'] == 'old-no-nl':
629 632 before_newline_line = before_lines.pop(-1)
630 633 before_newline.content = '\n {}'.format(
631 634 render_tokenstream(
632 635 [(x[0], '', x[1])
633 636 for x in [('nonl', before_newline_line['line'])]]))
634 637
635 638 if after_lines and after_lines[-1]['action'] == 'new-no-nl':
636 639 after_newline_line = after_lines.pop(-1)
637 640 after_newline.content = '\n {}'.format(
638 641 render_tokenstream(
639 642 [(x[0], '', x[1])
640 643 for x in [('nonl', after_newline_line['line'])]]))
641 644
642 645 while before_lines or after_lines:
643 646 before, after = None, None
644 647 before_tokens, after_tokens = None, None
645 648
646 649 if before_lines:
647 650 before = before_lines.pop(0)
648 651 if after_lines:
649 652 after = after_lines.pop(0)
650 653
651 654 original = AttributeDict()
652 655 modified = AttributeDict()
653 656
654 657 if before:
655 658 if before['action'] == 'old-no-nl':
656 659 before_tokens = [('nonl', before['line'])]
657 660 else:
658 661 before_tokens = self.get_line_tokens(
659 662 line_text=before['line'], line_number=before['old_lineno'],
660 input_file=source_file, no_hl=no_hl)
663 input_file=source_file, no_hl=no_hl, source='before')
661 664 original.lineno = before['old_lineno']
662 665 original.content = before['line']
663 666 original.action = self.action_to_op(before['action'])
664 667
665 668 original.get_comment_args = (
666 669 source_file, 'o', before['old_lineno'])
667 670
668 671 if after:
669 672 if after['action'] == 'new-no-nl':
670 673 after_tokens = [('nonl', after['line'])]
671 674 else:
672 675 after_tokens = self.get_line_tokens(
673 676 line_text=after['line'], line_number=after['new_lineno'],
674 input_file=target_file, no_hl=no_hl)
677 input_file=target_file, no_hl=no_hl, source='after')
675 678 modified.lineno = after['new_lineno']
676 679 modified.content = after['line']
677 680 modified.action = self.action_to_op(after['action'])
678 681
679 682 modified.get_comment_args = (target_file, 'n', after['new_lineno'])
680 683
681 684 # diff the lines
682 685 if before_tokens and after_tokens:
683 686 o_tokens, m_tokens, similarity = tokens_diff(
684 687 before_tokens, after_tokens)
685 688 original.content = render_tokenstream(o_tokens)
686 689 modified.content = render_tokenstream(m_tokens)
687 690 elif before_tokens:
688 691 original.content = render_tokenstream(
689 692 [(x[0], '', x[1]) for x in before_tokens])
690 693 elif after_tokens:
691 694 modified.content = render_tokenstream(
692 695 [(x[0], '', x[1]) for x in after_tokens])
693 696
694 697 if not before_lines and before_newline:
695 698 original.content += before_newline.content
696 699 before_newline = None
697 700 if not after_lines and after_newline:
698 701 modified.content += after_newline.content
699 702 after_newline = None
700 703
701 704 lines.append(AttributeDict({
702 705 'original': original,
703 706 'modified': modified,
704 707 }))
705 708
706 709 return lines
707 710
708 def get_line_tokens(self, line_text, line_number, input_file=None, no_hl=False):
711 def get_line_tokens(self, line_text, line_number, input_file=None, no_hl=False, source=''):
709 712 filenode = None
710 713 filename = None
711 714
712 715 if isinstance(input_file, compat.string_types):
713 716 filename = input_file
714 717 elif isinstance(input_file, FileNode):
715 718 filenode = input_file
716 719 filename = input_file.unicode_path
717 720
718 721 hl_mode = self.HL_NONE if no_hl else self.highlight_mode
719 722 if hl_mode == self.HL_REAL and filenode:
720 723 lexer = self._get_lexer_for_filename(filename)
721 724 file_size_allowed = input_file.size < self.max_file_size_limit
722 725 if line_number and file_size_allowed:
723 return self.get_tokenized_filenode_line(
724 input_file, line_number, lexer)
726 return self.get_tokenized_filenode_line(input_file, line_number, lexer, source)
725 727
726 728 if hl_mode in (self.HL_REAL, self.HL_FAST) and filename:
727 729 lexer = self._get_lexer_for_filename(filename)
728 730 return list(tokenize_string(line_text, lexer))
729 731
730 732 return list(tokenize_string(line_text, plain_text_lexer))
731 733
732 def get_tokenized_filenode_line(self, filenode, line_number, lexer=None):
734 def get_tokenized_filenode_line(self, filenode, line_number, lexer=None, source=''):
733 735
734 if filenode not in self.highlighted_filenodes:
735 tokenized_lines = filenode_as_lines_tokens(filenode, lexer)
736 self.highlighted_filenodes[filenode] = tokenized_lines
736 def tokenize(_filenode):
737 self.highlighted_filenodes[source][filenode] = filenode_as_lines_tokens(filenode, lexer)
738
739 if filenode not in self.highlighted_filenodes[source]:
740 tokenize(filenode)
737 741
738 742 try:
739 return self.highlighted_filenodes[filenode][line_number - 1]
743 return self.highlighted_filenodes[source][filenode][line_number - 1]
740 744 except Exception:
741 745 log.exception('diff rendering error')
742 return [('', u'rhodecode diff rendering error')]
746 return [('', u'L{}: rhodecode diff rendering error'.format(line_number))]
743 747
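# Sketch of the per-side cache this change introduces: tokenized lines are now
# stored under 'before'/'after' keys so the source and target versions of a
# file no longer share a cache slot. Plain strings and lambdas stand in for
# FileNode objects and the real tokenizer.
highlighted = {'before': {}, 'after': {}}

def cached_line(cache, side, node, line_number, tokenize):
    if node not in cache[side]:
        cache[side][node] = tokenize(node)
    return cache[side][node][line_number - 1]

tokens_v1 = lambda node: [[('', 'old line 1')], [('', 'old line 2')]]
tokens_v2 = lambda node: [[('', 'new line 1')]]
print(cached_line(highlighted, 'before', 'setup.py', 1, tokens_v1))
print(cached_line(highlighted, 'after', 'setup.py', 1, tokens_v2))
# -> [('', 'old line 1')] then [('', 'new line 1')]; the two sides never collide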
744 748 def action_to_op(self, action):
745 749 return {
746 750 'add': '+',
747 751 'del': '-',
748 752 'unmod': ' ',
749 753 'unmod-no-hl': ' ',
750 754 'old-no-nl': ' ',
751 755 'new-no-nl': ' ',
752 756 }.get(action, action)
753 757
754 758 def as_unified(self, lines):
755 759 """
756 760 Return a generator that yields the lines of a diff in unified order
757 761 """
758 762 def generator():
759 763 buf = []
760 764 for line in lines:
761 765
762 766 if buf and not line.original or line.original.action == ' ':
763 767 for b in buf:
764 768 yield b
765 769 buf = []
766 770
767 771 if line.original:
768 772 if line.original.action == ' ':
769 773 yield (line.original.lineno, line.modified.lineno,
770 774 line.original.action, line.original.content,
771 775 line.original.get_comment_args)
772 776 continue
773 777
774 778 if line.original.action == '-':
775 779 yield (line.original.lineno, None,
776 780 line.original.action, line.original.content,
777 781 line.original.get_comment_args)
778 782
779 783 if line.modified.action == '+':
780 784 buf.append((
781 785 None, line.modified.lineno,
782 786 line.modified.action, line.modified.content,
783 787 line.modified.get_comment_args))
784 788 continue
785 789
786 790 if line.modified:
787 791 yield (None, line.modified.lineno,
788 792 line.modified.action, line.modified.content,
789 793 line.modified.get_comment_args)
790 794
791 795 for b in buf:
792 796 yield b
793 797
794 798 return generator()
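# Condensed sketch of the ordering rule in as_unified() above: deletions are
# yielded immediately, additions paired with a deletion are buffered, and the
# buffer is flushed at the next context line or at the end of the hunk.
# Simple (action, text) tuples stand in for the real line objects.
def unified_order(rows):
    buf = []
    for old, new in rows:  # each side is an (action, text) tuple or None
        if old and old[0] == ' ':
            for b in buf:           # flush buffered additions before context
                yield b
            buf = []
            yield old
            continue
        if old and old[0] == '-':
            yield old               # deletions come out immediately
            if new and new[0] == '+':
                buf.append(new)     # paired addition waits until the block ends
            continue
        if new and new[0] == '+':
            yield new               # pure additions need no reordering
    for b in buf:
        yield b

rows = [((' ', 'ctx'), (' ', 'ctx')),
        (('-', 'old'), ('+', 'new')),
        ((' ', 'ctx2'), (' ', 'ctx2'))]
print(list(unified_order(rows)))
# -> [(' ', 'ctx'), ('-', 'old'), ('+', 'new'), (' ', 'ctx2')]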