diffs: handle a very odd case of binary, corrupted diffs that crashed the diff parser.
dan
r3831:0f09c1a7 stable
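The change in this revision hardens render_tokenstream() in rhodecode/lib/codeblocks.py: the direct html_escape(token_text) call is wrapped in a try/except so that tokens coming from binary or corrupted diffs with mixed encodings are forced to unicode via safe_unicode() instead of crashing the diff parser. The following is a minimal standalone sketch of the same defensive pattern under Python 3; escape_token_text, escape_html and force_unicode are simplified stand-ins for RhodeCode's helpers, not the real implementations.

def force_unicode(value, encoding='utf-8'):
    # Stand-in for safe_unicode: decode byte strings, replacing
    # undecodable bytes instead of raising.
    if isinstance(value, bytes):
        return value.decode(encoding, 'replace')
    return value

def escape_html(text):
    # Stand-in for html_escape: escape the basic HTML metacharacters.
    return (text.replace('&', '&amp;')
                .replace('<', '&lt;')
                .replace('>', '&gt;'))

def escape_token_text(token_text):
    # Corrupted or binary diff content can mix str and bytes; escaping such
    # text may raise, so fall back to forcing unicode rather than letting
    # the whole diff render crash (mirrors the try/except added here).
    try:
        return escape_html(token_text)
    except TypeError:
        return escape_html(force_unicode(token_text))

print(escape_token_text(u'<b>ok</b>'))         # &lt;b&gt;ok&lt;/b&gt;
print(escape_token_text(b'\xff\xfe<binary>'))  # replacement chars + &lt;binary&gt;, no crash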
@@ -1,786 +1,792 @@
1 # -*- coding: utf-8 -*-
1 # -*- coding: utf-8 -*-
2
2
3 # Copyright (C) 2011-2019 RhodeCode GmbH
3 # Copyright (C) 2011-2019 RhodeCode GmbH
4 #
4 #
5 # This program is free software: you can redistribute it and/or modify
5 # This program is free software: you can redistribute it and/or modify
6 # it under the terms of the GNU Affero General Public License, version 3
6 # it under the terms of the GNU Affero General Public License, version 3
7 # (only), as published by the Free Software Foundation.
7 # (only), as published by the Free Software Foundation.
8 #
8 #
9 # This program is distributed in the hope that it will be useful,
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 # GNU General Public License for more details.
12 # GNU General Public License for more details.
13 #
13 #
14 # You should have received a copy of the GNU Affero General Public License
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
16 #
16 #
17 # This program is dual-licensed. If you wish to learn more about the
17 # This program is dual-licensed. If you wish to learn more about the
18 # RhodeCode Enterprise Edition, including its added features, Support services,
18 # RhodeCode Enterprise Edition, including its added features, Support services,
19 # and proprietary license terms, please see https://rhodecode.com/licenses/
19 # and proprietary license terms, please see https://rhodecode.com/licenses/
20
20
21 import logging
21 import logging
22 import difflib
22 import difflib
23 from itertools import groupby
23 from itertools import groupby
24
24
25 from pygments import lex
25 from pygments import lex
26 from pygments.formatters.html import _get_ttype_class as pygment_token_class
26 from pygments.formatters.html import _get_ttype_class as pygment_token_class
27 from pygments.lexers.special import TextLexer, Token
27 from pygments.lexers.special import TextLexer, Token
28 from pygments.lexers import get_lexer_by_name
28 from pygments.lexers import get_lexer_by_name
29 from pyramid import compat
29 from pyramid import compat
30
30
31 from rhodecode.lib.helpers import (
31 from rhodecode.lib.helpers import (
32 get_lexer_for_filenode, html_escape, get_custom_lexer)
32 get_lexer_for_filenode, html_escape, get_custom_lexer)
33 from rhodecode.lib.utils2 import AttributeDict, StrictAttributeDict, safe_unicode
33 from rhodecode.lib.utils2 import AttributeDict, StrictAttributeDict, safe_unicode
34 from rhodecode.lib.vcs.nodes import FileNode
34 from rhodecode.lib.vcs.nodes import FileNode
35 from rhodecode.lib.vcs.exceptions import VCSError, NodeDoesNotExistError
35 from rhodecode.lib.vcs.exceptions import VCSError, NodeDoesNotExistError
36 from rhodecode.lib.diff_match_patch import diff_match_patch
36 from rhodecode.lib.diff_match_patch import diff_match_patch
37 from rhodecode.lib.diffs import LimitedDiffContainer, DEL_FILENODE, BIN_FILENODE
37 from rhodecode.lib.diffs import LimitedDiffContainer, DEL_FILENODE, BIN_FILENODE
38
38
39
39
40 plain_text_lexer = get_lexer_by_name(
40 plain_text_lexer = get_lexer_by_name(
41 'text', stripall=False, stripnl=False, ensurenl=False)
41 'text', stripall=False, stripnl=False, ensurenl=False)
42
42
43
43
44 log = logging.getLogger(__name__)
44 log = logging.getLogger(__name__)
45
45
46
46
47 def filenode_as_lines_tokens(filenode, lexer=None):
47 def filenode_as_lines_tokens(filenode, lexer=None):
48 org_lexer = lexer
48 org_lexer = lexer
49 lexer = lexer or get_lexer_for_filenode(filenode)
49 lexer = lexer or get_lexer_for_filenode(filenode)
50 log.debug('Generating file node pygment tokens for %s, %s, org_lexer:%s',
50 log.debug('Generating file node pygment tokens for %s, %s, org_lexer:%s',
51 lexer, filenode, org_lexer)
51 lexer, filenode, org_lexer)
52 content = filenode.content
52 content = filenode.content
53 tokens = tokenize_string(content, lexer)
53 tokens = tokenize_string(content, lexer)
54 lines = split_token_stream(tokens, content)
54 lines = split_token_stream(tokens, content)
55 rv = list(lines)
55 rv = list(lines)
56 return rv
56 return rv
57
57
58
58
59 def tokenize_string(content, lexer):
59 def tokenize_string(content, lexer):
60 """
60 """
61 Use pygments to tokenize some content based on a lexer
61 Use pygments to tokenize some content based on a lexer
62 ensuring all original newlines and whitespace are preserved
62 ensuring all original newlines and whitespace are preserved
63 """
63 """
64
64
65 lexer.stripall = False
65 lexer.stripall = False
66 lexer.stripnl = False
66 lexer.stripnl = False
67 lexer.ensurenl = False
67 lexer.ensurenl = False
68
68
69 if isinstance(lexer, TextLexer):
69 if isinstance(lexer, TextLexer):
70 lexed = [(Token.Text, content)]
70 lexed = [(Token.Text, content)]
71 else:
71 else:
72 lexed = lex(content, lexer)
72 lexed = lex(content, lexer)
73
73
74 for token_type, token_text in lexed:
74 for token_type, token_text in lexed:
75 yield pygment_token_class(token_type), token_text
75 yield pygment_token_class(token_type), token_text
76
76
77
77
78 def split_token_stream(tokens, content):
78 def split_token_stream(tokens, content):
79 """
79 """
80 Take a list of (TokenType, text) tuples and split them by a string
80 Take a list of (TokenType, text) tuples and split them by a string
81
81
82 split_token_stream([(TEXT, 'some\ntext'), (TEXT, 'more\n')])
82 split_token_stream([(TEXT, 'some\ntext'), (TEXT, 'more\n')])
83 [(TEXT, 'some'), (TEXT, 'text'),
83 [(TEXT, 'some'), (TEXT, 'text'),
84 (TEXT, 'more'), (TEXT, '')]
84 (TEXT, 'more'), (TEXT, '')]
85 """
85 """
86
86
87 token_buffer = []
87 token_buffer = []
88 for token_class, token_text in tokens:
88 for token_class, token_text in tokens:
89 parts = token_text.split('\n')
89 parts = token_text.split('\n')
90 for part in parts[:-1]:
90 for part in parts[:-1]:
91 token_buffer.append((token_class, part))
91 token_buffer.append((token_class, part))
92 yield token_buffer
92 yield token_buffer
93 token_buffer = []
93 token_buffer = []
94
94
95 token_buffer.append((token_class, parts[-1]))
95 token_buffer.append((token_class, parts[-1]))
96
96
97 if token_buffer:
97 if token_buffer:
98 yield token_buffer
98 yield token_buffer
99 elif content:
99 elif content:
100 # this is a special case, we have the content, but tokenization didn't produce
100 # this is a special case, we have the content, but tokenization didn't produce
101 # any results. This can happen if known file extensions like .css have some bogus
101 # any results. This can happen if known file extensions like .css have some bogus
102 # unicode content without any newline characters
102 # unicode content without any newline characters
103 yield [(pygment_token_class(Token.Text), content)]
103 yield [(pygment_token_class(Token.Text), content)]
104
104
105
105
106 def filenode_as_annotated_lines_tokens(filenode):
106 def filenode_as_annotated_lines_tokens(filenode):
107 """
107 """
108 Take a file node and return a list of annotations => lines; if no annotation
108 Take a file node and return a list of annotations => lines; if no annotation
109 is found, the annotation will be None.
109 is found, the annotation will be None.
110
110
111 eg:
111 eg:
112
112
113 [
113 [
114 (annotation1, [
114 (annotation1, [
115 (1, line1_tokens_list),
115 (1, line1_tokens_list),
116 (2, line2_tokens_list),
116 (2, line2_tokens_list),
117 ]),
117 ]),
118 (annotation2, [
118 (annotation2, [
119 (3, line1_tokens_list),
119 (3, line1_tokens_list),
120 ]),
120 ]),
121 (None, [
121 (None, [
122 (4, line1_tokens_list),
122 (4, line1_tokens_list),
123 ]),
123 ]),
124 (annotation1, [
124 (annotation1, [
125 (5, line1_tokens_list),
125 (5, line1_tokens_list),
126 (6, line2_tokens_list),
126 (6, line2_tokens_list),
127 ])
127 ])
128 ]
128 ]
129 """
129 """
130
130
131 commit_cache = {} # cache commit_getter lookups
131 commit_cache = {} # cache commit_getter lookups
132
132
133 def _get_annotation(commit_id, commit_getter):
133 def _get_annotation(commit_id, commit_getter):
134 if commit_id not in commit_cache:
134 if commit_id not in commit_cache:
135 commit_cache[commit_id] = commit_getter()
135 commit_cache[commit_id] = commit_getter()
136 return commit_cache[commit_id]
136 return commit_cache[commit_id]
137
137
138 annotation_lookup = {
138 annotation_lookup = {
139 line_no: _get_annotation(commit_id, commit_getter)
139 line_no: _get_annotation(commit_id, commit_getter)
140 for line_no, commit_id, commit_getter, line_content
140 for line_no, commit_id, commit_getter, line_content
141 in filenode.annotate
141 in filenode.annotate
142 }
142 }
143
143
144 annotations_lines = ((annotation_lookup.get(line_no), line_no, tokens)
144 annotations_lines = ((annotation_lookup.get(line_no), line_no, tokens)
145 for line_no, tokens
145 for line_no, tokens
146 in enumerate(filenode_as_lines_tokens(filenode), 1))
146 in enumerate(filenode_as_lines_tokens(filenode), 1))
147
147
148 grouped_annotations_lines = groupby(annotations_lines, lambda x: x[0])
148 grouped_annotations_lines = groupby(annotations_lines, lambda x: x[0])
149
149
150 for annotation, group in grouped_annotations_lines:
150 for annotation, group in grouped_annotations_lines:
151 yield (
151 yield (
152 annotation, [(line_no, tokens)
152 annotation, [(line_no, tokens)
153 for (_, line_no, tokens) in group]
153 for (_, line_no, tokens) in group]
154 )
154 )
155
155
156
156
157 def render_tokenstream(tokenstream):
157 def render_tokenstream(tokenstream):
158 result = []
158 result = []
159 for token_class, token_ops_texts in rollup_tokenstream(tokenstream):
159 for token_class, token_ops_texts in rollup_tokenstream(tokenstream):
160
160
161 if token_class:
161 if token_class:
162 result.append(u'<span class="%s">' % token_class)
162 result.append(u'<span class="%s">' % token_class)
163 else:
163 else:
164 result.append(u'<span>')
164 result.append(u'<span>')
165
165
166 for op_tag, token_text in token_ops_texts:
166 for op_tag, token_text in token_ops_texts:
167
167
168 if op_tag:
168 if op_tag:
169 result.append(u'<%s>' % op_tag)
169 result.append(u'<%s>' % op_tag)
170
170
171 escaped_text = html_escape(token_text)
171 # NOTE(marcink): in some cases of mixed encodings we might run into
172 # trouble in html_escape; in that case we force token_text to unicode,
173 # which ensures "correct" data even at the cost of some characters rendering incorrectly
174 try:
175 escaped_text = html_escape(token_text)
176 except TypeError:
177 escaped_text = html_escape(safe_unicode(token_text))
172
178
173 # TODO: dan: investigate showing hidden characters like space/nl/tab
179 # TODO: dan: investigate showing hidden characters like space/nl/tab
174 # escaped_text = escaped_text.replace(' ', '<sp> </sp>')
180 # escaped_text = escaped_text.replace(' ', '<sp> </sp>')
175 # escaped_text = escaped_text.replace('\n', '<nl>\n</nl>')
181 # escaped_text = escaped_text.replace('\n', '<nl>\n</nl>')
176 # escaped_text = escaped_text.replace('\t', '<tab>\t</tab>')
182 # escaped_text = escaped_text.replace('\t', '<tab>\t</tab>')
177
183
178 result.append(escaped_text)
184 result.append(escaped_text)
179
185
180 if op_tag:
186 if op_tag:
181 result.append(u'</%s>' % op_tag)
187 result.append(u'</%s>' % op_tag)
182
188
183 result.append(u'</span>')
189 result.append(u'</span>')
184
190
185 html = ''.join(result)
191 html = ''.join(result)
186 return html
192 return html
187
193
188
194
189 def rollup_tokenstream(tokenstream):
195 def rollup_tokenstream(tokenstream):
190 """
196 """
191 Group a token stream of the format:
197 Group a token stream of the format:
192
198
193 ('class', 'op', 'text')
199 ('class', 'op', 'text')
194 or
200 or
195 ('class', 'text')
201 ('class', 'text')
196
202
197 into
203 into
198
204
199 [('class1',
205 [('class1',
200 [('op1', 'text'),
206 [('op1', 'text'),
201 ('op2', 'text')]),
207 ('op2', 'text')]),
202 ('class2',
208 ('class2',
203 [('op3', 'text')])]
209 [('op3', 'text')])]
204
210
205 This is used to get the minimal tags necessary when
211 This is used to get the minimal tags necessary when
206 rendering to html, e.g. for a token stream like:
212 rendering to html, e.g. for a token stream like:
207
213
208 <span class="A"><ins>he</ins>llo</span>
214 <span class="A"><ins>he</ins>llo</span>
209 vs
215 vs
210 <span class="A"><ins>he</ins></span><span class="A">llo</span>
216 <span class="A"><ins>he</ins></span><span class="A">llo</span>
211
217
212 If a 2 tuple is passed in, the output op will be an empty string.
218 If a 2 tuple is passed in, the output op will be an empty string.
213
219
214 eg:
220 eg:
215
221
216 >>> rollup_tokenstream([('classA', '', 'h'),
222 >>> rollup_tokenstream([('classA', '', 'h'),
217 ('classA', 'del', 'ell'),
223 ('classA', 'del', 'ell'),
218 ('classA', '', 'o'),
224 ('classA', '', 'o'),
219 ('classB', '', ' '),
225 ('classB', '', ' '),
220 ('classA', '', 'the'),
226 ('classA', '', 'the'),
221 ('classA', '', 're'),
227 ('classA', '', 're'),
222 ])
228 ])
223
229
224 [('classA', [('', 'h'), ('del', 'ell'), ('', 'o')],
230 [('classA', [('', 'h'), ('del', 'ell'), ('', 'o')],
225 ('classB', [('', ' ')],
231 ('classB', [('', ' ')],
226 ('classA', [('', 'there')]]
232 ('classA', [('', 'there')]]
227
233
228 """
234 """
229 if tokenstream and len(tokenstream[0]) == 2:
235 if tokenstream and len(tokenstream[0]) == 2:
230 tokenstream = ((t[0], '', t[1]) for t in tokenstream)
236 tokenstream = ((t[0], '', t[1]) for t in tokenstream)
231
237
232 result = []
238 result = []
233 for token_class, op_list in groupby(tokenstream, lambda t: t[0]):
239 for token_class, op_list in groupby(tokenstream, lambda t: t[0]):
234 ops = []
240 ops = []
235 for token_op, token_text_list in groupby(op_list, lambda o: o[1]):
241 for token_op, token_text_list in groupby(op_list, lambda o: o[1]):
236 text_buffer = []
242 text_buffer = []
237 for t_class, t_op, t_text in token_text_list:
243 for t_class, t_op, t_text in token_text_list:
238 text_buffer.append(t_text)
244 text_buffer.append(t_text)
239 ops.append((token_op, ''.join(text_buffer)))
245 ops.append((token_op, ''.join(text_buffer)))
240 result.append((token_class, ops))
246 result.append((token_class, ops))
241 return result
247 return result
242
248
243
249
244 def tokens_diff(old_tokens, new_tokens, use_diff_match_patch=True):
250 def tokens_diff(old_tokens, new_tokens, use_diff_match_patch=True):
245 """
251 """
246 Converts a list of (token_class, token_text) tuples to a list of
252 Converts a list of (token_class, token_text) tuples to a list of
247 (token_class, token_op, token_text) tuples where token_op is one of
253 (token_class, token_op, token_text) tuples where token_op is one of
248 ('ins', 'del', '')
254 ('ins', 'del', '')
249
255
250 :param old_tokens: list of (token_class, token_text) tuples of old line
256 :param old_tokens: list of (token_class, token_text) tuples of old line
251 :param new_tokens: list of (token_class, token_text) tuples of new line
257 :param new_tokens: list of (token_class, token_text) tuples of new line
252 :param use_diff_match_patch: boolean, will use google's diff match patch
258 :param use_diff_match_patch: boolean, will use google's diff match patch
253 library which has options to 'smooth' out the character by character
259 library which has options to 'smooth' out the character by character
254 differences making nicer ins/del blocks
260 differences making nicer ins/del blocks
255 """
261 """
256
262
257 old_tokens_result = []
263 old_tokens_result = []
258 new_tokens_result = []
264 new_tokens_result = []
259
265
260 similarity = difflib.SequenceMatcher(None,
266 similarity = difflib.SequenceMatcher(None,
261 ''.join(token_text for token_class, token_text in old_tokens),
267 ''.join(token_text for token_class, token_text in old_tokens),
262 ''.join(token_text for token_class, token_text in new_tokens)
268 ''.join(token_text for token_class, token_text in new_tokens)
263 ).ratio()
269 ).ratio()
264
270
265 if similarity < 0.6: # return, the blocks are too different
271 if similarity < 0.6: # return, the blocks are too different
266 for token_class, token_text in old_tokens:
272 for token_class, token_text in old_tokens:
267 old_tokens_result.append((token_class, '', token_text))
273 old_tokens_result.append((token_class, '', token_text))
268 for token_class, token_text in new_tokens:
274 for token_class, token_text in new_tokens:
269 new_tokens_result.append((token_class, '', token_text))
275 new_tokens_result.append((token_class, '', token_text))
270 return old_tokens_result, new_tokens_result, similarity
276 return old_tokens_result, new_tokens_result, similarity
271
277
272 token_sequence_matcher = difflib.SequenceMatcher(None,
278 token_sequence_matcher = difflib.SequenceMatcher(None,
273 [x[1] for x in old_tokens],
279 [x[1] for x in old_tokens],
274 [x[1] for x in new_tokens])
280 [x[1] for x in new_tokens])
275
281
276 for tag, o1, o2, n1, n2 in token_sequence_matcher.get_opcodes():
282 for tag, o1, o2, n1, n2 in token_sequence_matcher.get_opcodes():
277 # check the differences by token block types first to give a
283 # check the differences by token block types first to give a
278 # nicer "block" level replacement vs character diffs
284 # nicer "block" level replacement vs character diffs
279
285
280 if tag == 'equal':
286 if tag == 'equal':
281 for token_class, token_text in old_tokens[o1:o2]:
287 for token_class, token_text in old_tokens[o1:o2]:
282 old_tokens_result.append((token_class, '', token_text))
288 old_tokens_result.append((token_class, '', token_text))
283 for token_class, token_text in new_tokens[n1:n2]:
289 for token_class, token_text in new_tokens[n1:n2]:
284 new_tokens_result.append((token_class, '', token_text))
290 new_tokens_result.append((token_class, '', token_text))
285 elif tag == 'delete':
291 elif tag == 'delete':
286 for token_class, token_text in old_tokens[o1:o2]:
292 for token_class, token_text in old_tokens[o1:o2]:
287 old_tokens_result.append((token_class, 'del', token_text))
293 old_tokens_result.append((token_class, 'del', token_text))
288 elif tag == 'insert':
294 elif tag == 'insert':
289 for token_class, token_text in new_tokens[n1:n2]:
295 for token_class, token_text in new_tokens[n1:n2]:
290 new_tokens_result.append((token_class, 'ins', token_text))
296 new_tokens_result.append((token_class, 'ins', token_text))
291 elif tag == 'replace':
297 elif tag == 'replace':
292 # if same type token blocks must be replaced, do a diff on the
298 # if same type token blocks must be replaced, do a diff on the
293 # characters in the token blocks to show individual changes
299 # characters in the token blocks to show individual changes
294
300
295 old_char_tokens = []
301 old_char_tokens = []
296 new_char_tokens = []
302 new_char_tokens = []
297 for token_class, token_text in old_tokens[o1:o2]:
303 for token_class, token_text in old_tokens[o1:o2]:
298 for char in token_text:
304 for char in token_text:
299 old_char_tokens.append((token_class, char))
305 old_char_tokens.append((token_class, char))
300
306
301 for token_class, token_text in new_tokens[n1:n2]:
307 for token_class, token_text in new_tokens[n1:n2]:
302 for char in token_text:
308 for char in token_text:
303 new_char_tokens.append((token_class, char))
309 new_char_tokens.append((token_class, char))
304
310
305 old_string = ''.join([token_text for
311 old_string = ''.join([token_text for
306 token_class, token_text in old_char_tokens])
312 token_class, token_text in old_char_tokens])
307 new_string = ''.join([token_text for
313 new_string = ''.join([token_text for
308 token_class, token_text in new_char_tokens])
314 token_class, token_text in new_char_tokens])
309
315
310 char_sequence = difflib.SequenceMatcher(
316 char_sequence = difflib.SequenceMatcher(
311 None, old_string, new_string)
317 None, old_string, new_string)
312 copcodes = char_sequence.get_opcodes()
318 copcodes = char_sequence.get_opcodes()
313 obuffer, nbuffer = [], []
319 obuffer, nbuffer = [], []
314
320
315 if use_diff_match_patch:
321 if use_diff_match_patch:
316 dmp = diff_match_patch()
322 dmp = diff_match_patch()
317 dmp.Diff_EditCost = 11 # TODO: dan: extract this to a setting
323 dmp.Diff_EditCost = 11 # TODO: dan: extract this to a setting
318 reps = dmp.diff_main(old_string, new_string)
324 reps = dmp.diff_main(old_string, new_string)
319 dmp.diff_cleanupEfficiency(reps)
325 dmp.diff_cleanupEfficiency(reps)
320
326
321 a, b = 0, 0
327 a, b = 0, 0
322 for op, rep in reps:
328 for op, rep in reps:
323 l = len(rep)
329 l = len(rep)
324 if op == 0:
330 if op == 0:
325 for i, c in enumerate(rep):
331 for i, c in enumerate(rep):
326 obuffer.append((old_char_tokens[a+i][0], '', c))
332 obuffer.append((old_char_tokens[a+i][0], '', c))
327 nbuffer.append((new_char_tokens[b+i][0], '', c))
333 nbuffer.append((new_char_tokens[b+i][0], '', c))
328 a += l
334 a += l
329 b += l
335 b += l
330 elif op == -1:
336 elif op == -1:
331 for i, c in enumerate(rep):
337 for i, c in enumerate(rep):
332 obuffer.append((old_char_tokens[a+i][0], 'del', c))
338 obuffer.append((old_char_tokens[a+i][0], 'del', c))
333 a += l
339 a += l
334 elif op == 1:
340 elif op == 1:
335 for i, c in enumerate(rep):
341 for i, c in enumerate(rep):
336 nbuffer.append((new_char_tokens[b+i][0], 'ins', c))
342 nbuffer.append((new_char_tokens[b+i][0], 'ins', c))
337 b += l
343 b += l
338 else:
344 else:
339 for ctag, co1, co2, cn1, cn2 in copcodes:
345 for ctag, co1, co2, cn1, cn2 in copcodes:
340 if ctag == 'equal':
346 if ctag == 'equal':
341 for token_class, token_text in old_char_tokens[co1:co2]:
347 for token_class, token_text in old_char_tokens[co1:co2]:
342 obuffer.append((token_class, '', token_text))
348 obuffer.append((token_class, '', token_text))
343 for token_class, token_text in new_char_tokens[cn1:cn2]:
349 for token_class, token_text in new_char_tokens[cn1:cn2]:
344 nbuffer.append((token_class, '', token_text))
350 nbuffer.append((token_class, '', token_text))
345 elif ctag == 'delete':
351 elif ctag == 'delete':
346 for token_class, token_text in old_char_tokens[co1:co2]:
352 for token_class, token_text in old_char_tokens[co1:co2]:
347 obuffer.append((token_class, 'del', token_text))
353 obuffer.append((token_class, 'del', token_text))
348 elif ctag == 'insert':
354 elif ctag == 'insert':
349 for token_class, token_text in new_char_tokens[cn1:cn2]:
355 for token_class, token_text in new_char_tokens[cn1:cn2]:
350 nbuffer.append((token_class, 'ins', token_text))
356 nbuffer.append((token_class, 'ins', token_text))
351 elif ctag == 'replace':
357 elif ctag == 'replace':
352 for token_class, token_text in old_char_tokens[co1:co2]:
358 for token_class, token_text in old_char_tokens[co1:co2]:
353 obuffer.append((token_class, 'del', token_text))
359 obuffer.append((token_class, 'del', token_text))
354 for token_class, token_text in new_char_tokens[cn1:cn2]:
360 for token_class, token_text in new_char_tokens[cn1:cn2]:
355 nbuffer.append((token_class, 'ins', token_text))
361 nbuffer.append((token_class, 'ins', token_text))
356
362
357 old_tokens_result.extend(obuffer)
363 old_tokens_result.extend(obuffer)
358 new_tokens_result.extend(nbuffer)
364 new_tokens_result.extend(nbuffer)
359
365
360 return old_tokens_result, new_tokens_result, similarity
366 return old_tokens_result, new_tokens_result, similarity
361
367
362
368
363 def diffset_node_getter(commit):
369 def diffset_node_getter(commit):
364 def get_node(fname):
370 def get_node(fname):
365 try:
371 try:
366 return commit.get_node(fname)
372 return commit.get_node(fname)
367 except NodeDoesNotExistError:
373 except NodeDoesNotExistError:
368 return None
374 return None
369
375
370 return get_node
376 return get_node
371
377
372
378
373 class DiffSet(object):
379 class DiffSet(object):
374 """
380 """
375 An object for parsing the diff result from diffs.DiffProcessor and
381 An object for parsing the diff result from diffs.DiffProcessor and
376 adding highlighting, side by side/unified renderings and line diffs
382 adding highlighting, side by side/unified renderings and line diffs
377 """
383 """
378
384
379 HL_REAL = 'REAL' # highlights using original file, slow
385 HL_REAL = 'REAL' # highlights using original file, slow
380 HL_FAST = 'FAST' # highlights using just the line, fast but not correct
386 HL_FAST = 'FAST' # highlights using just the line, fast but not correct
381 # in the case of multiline code
387 # in the case of multiline code
382 HL_NONE = 'NONE' # no highlighting, fastest
388 HL_NONE = 'NONE' # no highlighting, fastest
383
389
384 def __init__(self, highlight_mode=HL_REAL, repo_name=None,
390 def __init__(self, highlight_mode=HL_REAL, repo_name=None,
385 source_repo_name=None,
391 source_repo_name=None,
386 source_node_getter=lambda filename: None,
392 source_node_getter=lambda filename: None,
387 target_repo_name=None,
393 target_repo_name=None,
388 target_node_getter=lambda filename: None,
394 target_node_getter=lambda filename: None,
389 source_nodes=None, target_nodes=None,
395 source_nodes=None, target_nodes=None,
390 # files over this size will use fast highlighting
396 # files over this size will use fast highlighting
391 max_file_size_limit=150 * 1024,
397 max_file_size_limit=150 * 1024,
392 ):
398 ):
393
399
394 self.highlight_mode = highlight_mode
400 self.highlight_mode = highlight_mode
395 self.highlighted_filenodes = {}
401 self.highlighted_filenodes = {}
396 self.source_node_getter = source_node_getter
402 self.source_node_getter = source_node_getter
397 self.target_node_getter = target_node_getter
403 self.target_node_getter = target_node_getter
398 self.source_nodes = source_nodes or {}
404 self.source_nodes = source_nodes or {}
399 self.target_nodes = target_nodes or {}
405 self.target_nodes = target_nodes or {}
400 self.repo_name = repo_name
406 self.repo_name = repo_name
401 self.target_repo_name = target_repo_name or repo_name
407 self.target_repo_name = target_repo_name or repo_name
402 self.source_repo_name = source_repo_name or repo_name
408 self.source_repo_name = source_repo_name or repo_name
403 self.max_file_size_limit = max_file_size_limit
409 self.max_file_size_limit = max_file_size_limit
404
410
405 def render_patchset(self, patchset, source_ref=None, target_ref=None):
411 def render_patchset(self, patchset, source_ref=None, target_ref=None):
406 diffset = AttributeDict(dict(
412 diffset = AttributeDict(dict(
407 lines_added=0,
413 lines_added=0,
408 lines_deleted=0,
414 lines_deleted=0,
409 changed_files=0,
415 changed_files=0,
410 files=[],
416 files=[],
411 file_stats={},
417 file_stats={},
412 limited_diff=isinstance(patchset, LimitedDiffContainer),
418 limited_diff=isinstance(patchset, LimitedDiffContainer),
413 repo_name=self.repo_name,
419 repo_name=self.repo_name,
414 target_repo_name=self.target_repo_name,
420 target_repo_name=self.target_repo_name,
415 source_repo_name=self.source_repo_name,
421 source_repo_name=self.source_repo_name,
416 source_ref=source_ref,
422 source_ref=source_ref,
417 target_ref=target_ref,
423 target_ref=target_ref,
418 ))
424 ))
419 for patch in patchset:
425 for patch in patchset:
420 diffset.file_stats[patch['filename']] = patch['stats']
426 diffset.file_stats[patch['filename']] = patch['stats']
421 filediff = self.render_patch(patch)
427 filediff = self.render_patch(patch)
422 filediff.diffset = StrictAttributeDict(dict(
428 filediff.diffset = StrictAttributeDict(dict(
423 source_ref=diffset.source_ref,
429 source_ref=diffset.source_ref,
424 target_ref=diffset.target_ref,
430 target_ref=diffset.target_ref,
425 repo_name=diffset.repo_name,
431 repo_name=diffset.repo_name,
426 source_repo_name=diffset.source_repo_name,
432 source_repo_name=diffset.source_repo_name,
427 target_repo_name=diffset.target_repo_name,
433 target_repo_name=diffset.target_repo_name,
428 ))
434 ))
429 diffset.files.append(filediff)
435 diffset.files.append(filediff)
430 diffset.changed_files += 1
436 diffset.changed_files += 1
431 if not patch['stats']['binary']:
437 if not patch['stats']['binary']:
432 diffset.lines_added += patch['stats']['added']
438 diffset.lines_added += patch['stats']['added']
433 diffset.lines_deleted += patch['stats']['deleted']
439 diffset.lines_deleted += patch['stats']['deleted']
434
440
435 return diffset
441 return diffset
436
442
437 _lexer_cache = {}
443 _lexer_cache = {}
438
444
439 def _get_lexer_for_filename(self, filename, filenode=None):
445 def _get_lexer_for_filename(self, filename, filenode=None):
440 # cached because we might need to call it twice for source/target
446 # cached because we might need to call it twice for source/target
441 if filename not in self._lexer_cache:
447 if filename not in self._lexer_cache:
442 if filenode:
448 if filenode:
443 lexer = filenode.lexer
449 lexer = filenode.lexer
444 extension = filenode.extension
450 extension = filenode.extension
445 else:
451 else:
446 lexer = FileNode.get_lexer(filename=filename)
452 lexer = FileNode.get_lexer(filename=filename)
447 extension = filename.split('.')[-1]
453 extension = filename.split('.')[-1]
448
454
449 lexer = get_custom_lexer(extension) or lexer
455 lexer = get_custom_lexer(extension) or lexer
450 self._lexer_cache[filename] = lexer
456 self._lexer_cache[filename] = lexer
451 return self._lexer_cache[filename]
457 return self._lexer_cache[filename]
452
458
453 def render_patch(self, patch):
459 def render_patch(self, patch):
454 log.debug('rendering diff for %r', patch['filename'])
460 log.debug('rendering diff for %r', patch['filename'])
455
461
456 source_filename = patch['original_filename']
462 source_filename = patch['original_filename']
457 target_filename = patch['filename']
463 target_filename = patch['filename']
458
464
459 source_lexer = plain_text_lexer
465 source_lexer = plain_text_lexer
460 target_lexer = plain_text_lexer
466 target_lexer = plain_text_lexer
461
467
462 if not patch['stats']['binary']:
468 if not patch['stats']['binary']:
463 node_hl_mode = self.HL_NONE if patch['chunks'] == [] else None
469 node_hl_mode = self.HL_NONE if patch['chunks'] == [] else None
464 hl_mode = node_hl_mode or self.highlight_mode
470 hl_mode = node_hl_mode or self.highlight_mode
465
471
466 if hl_mode == self.HL_REAL:
472 if hl_mode == self.HL_REAL:
467 if (source_filename and patch['operation'] in ('D', 'M')
473 if (source_filename and patch['operation'] in ('D', 'M')
468 and source_filename not in self.source_nodes):
474 and source_filename not in self.source_nodes):
469 self.source_nodes[source_filename] = (
475 self.source_nodes[source_filename] = (
470 self.source_node_getter(source_filename))
476 self.source_node_getter(source_filename))
471
477
472 if (target_filename and patch['operation'] in ('A', 'M')
478 if (target_filename and patch['operation'] in ('A', 'M')
473 and target_filename not in self.target_nodes):
479 and target_filename not in self.target_nodes):
474 self.target_nodes[target_filename] = (
480 self.target_nodes[target_filename] = (
475 self.target_node_getter(target_filename))
481 self.target_node_getter(target_filename))
476
482
477 elif hl_mode == self.HL_FAST:
483 elif hl_mode == self.HL_FAST:
478 source_lexer = self._get_lexer_for_filename(source_filename)
484 source_lexer = self._get_lexer_for_filename(source_filename)
479 target_lexer = self._get_lexer_for_filename(target_filename)
485 target_lexer = self._get_lexer_for_filename(target_filename)
480
486
481 source_file = self.source_nodes.get(source_filename, source_filename)
487 source_file = self.source_nodes.get(source_filename, source_filename)
482 target_file = self.target_nodes.get(target_filename, target_filename)
488 target_file = self.target_nodes.get(target_filename, target_filename)
483 raw_id_uid = ''
489 raw_id_uid = ''
484 if self.source_nodes.get(source_filename):
490 if self.source_nodes.get(source_filename):
485 raw_id_uid = self.source_nodes[source_filename].commit.raw_id
491 raw_id_uid = self.source_nodes[source_filename].commit.raw_id
486
492
487 if not raw_id_uid and self.target_nodes.get(target_filename):
493 if not raw_id_uid and self.target_nodes.get(target_filename):
488 # in case this is a new file we only have it in target
494 # in case this is a new file we only have it in target
489 raw_id_uid = self.target_nodes[target_filename].commit.raw_id
495 raw_id_uid = self.target_nodes[target_filename].commit.raw_id
490
496
491 source_filenode, target_filenode = None, None
497 source_filenode, target_filenode = None, None
492
498
493 # TODO: dan: FileNode.lexer works on the content of the file - which
499 # TODO: dan: FileNode.lexer works on the content of the file - which
494 # can be slow - issue #4289 explains a lexer clean up - which once
500 # can be slow - issue #4289 explains a lexer clean up - which once
495 # done can allow caching a lexer for a filenode to avoid the file lookup
501 # done can allow caching a lexer for a filenode to avoid the file lookup
496 if isinstance(source_file, FileNode):
502 if isinstance(source_file, FileNode):
497 source_filenode = source_file
503 source_filenode = source_file
498 #source_lexer = source_file.lexer
504 #source_lexer = source_file.lexer
499 source_lexer = self._get_lexer_for_filename(source_filename)
505 source_lexer = self._get_lexer_for_filename(source_filename)
500 source_file.lexer = source_lexer
506 source_file.lexer = source_lexer
501
507
502 if isinstance(target_file, FileNode):
508 if isinstance(target_file, FileNode):
503 target_filenode = target_file
509 target_filenode = target_file
504 #target_lexer = target_file.lexer
510 #target_lexer = target_file.lexer
505 target_lexer = self._get_lexer_for_filename(target_filename)
511 target_lexer = self._get_lexer_for_filename(target_filename)
506 target_file.lexer = target_lexer
512 target_file.lexer = target_lexer
507
513
508 source_file_path, target_file_path = None, None
514 source_file_path, target_file_path = None, None
509
515
510 if source_filename != '/dev/null':
516 if source_filename != '/dev/null':
511 source_file_path = source_filename
517 source_file_path = source_filename
512 if target_filename != '/dev/null':
518 if target_filename != '/dev/null':
513 target_file_path = target_filename
519 target_file_path = target_filename
514
520
515 source_file_type = source_lexer.name
521 source_file_type = source_lexer.name
516 target_file_type = target_lexer.name
522 target_file_type = target_lexer.name
517
523
518 filediff = AttributeDict({
524 filediff = AttributeDict({
519 'source_file_path': source_file_path,
525 'source_file_path': source_file_path,
520 'target_file_path': target_file_path,
526 'target_file_path': target_file_path,
521 'source_filenode': source_filenode,
527 'source_filenode': source_filenode,
522 'target_filenode': target_filenode,
528 'target_filenode': target_filenode,
523 'source_file_type': target_file_type,
529 'source_file_type': target_file_type,
524 'target_file_type': source_file_type,
530 'target_file_type': source_file_type,
525 'patch': {'filename': patch['filename'], 'stats': patch['stats']},
531 'patch': {'filename': patch['filename'], 'stats': patch['stats']},
526 'operation': patch['operation'],
532 'operation': patch['operation'],
527 'source_mode': patch['stats']['old_mode'],
533 'source_mode': patch['stats']['old_mode'],
528 'target_mode': patch['stats']['new_mode'],
534 'target_mode': patch['stats']['new_mode'],
529 'limited_diff': patch['is_limited_diff'],
535 'limited_diff': patch['is_limited_diff'],
530 'hunks': [],
536 'hunks': [],
531 'hunk_ops': None,
537 'hunk_ops': None,
532 'diffset': self,
538 'diffset': self,
533 'raw_id': raw_id_uid,
539 'raw_id': raw_id_uid,
534 })
540 })
535
541
536 file_chunks = patch['chunks'][1:]
542 file_chunks = patch['chunks'][1:]
537 for hunk in file_chunks:
543 for hunk in file_chunks:
538 hunkbit = self.parse_hunk(hunk, source_file, target_file)
544 hunkbit = self.parse_hunk(hunk, source_file, target_file)
539 hunkbit.source_file_path = source_file_path
545 hunkbit.source_file_path = source_file_path
540 hunkbit.target_file_path = target_file_path
546 hunkbit.target_file_path = target_file_path
541 filediff.hunks.append(hunkbit)
547 filediff.hunks.append(hunkbit)
542
548
543 # Simulate hunk on OPS type line which doesn't really contain any diff
549 # Simulate hunk on OPS type line which doesn't really contain any diff
544 # this allows commenting on those
550 # this allows commenting on those
545 if not file_chunks:
551 if not file_chunks:
546 actions = []
552 actions = []
547 for op_id, op_text in filediff.patch['stats']['ops'].items():
553 for op_id, op_text in filediff.patch['stats']['ops'].items():
548 if op_id == DEL_FILENODE:
554 if op_id == DEL_FILENODE:
549 actions.append(u'file was removed')
555 actions.append(u'file was removed')
550 elif op_id == BIN_FILENODE:
556 elif op_id == BIN_FILENODE:
551 actions.append(u'binary diff hidden')
557 actions.append(u'binary diff hidden')
552 else:
558 else:
553 actions.append(safe_unicode(op_text))
559 actions.append(safe_unicode(op_text))
554 action_line = u'NO CONTENT: ' + \
560 action_line = u'NO CONTENT: ' + \
555 u', '.join(actions) or u'UNDEFINED_ACTION'
561 u', '.join(actions) or u'UNDEFINED_ACTION'
556
562
557 hunk_ops = {'source_length': 0, 'source_start': 0,
563 hunk_ops = {'source_length': 0, 'source_start': 0,
558 'lines': [
564 'lines': [
559 {'new_lineno': 0, 'old_lineno': 1,
565 {'new_lineno': 0, 'old_lineno': 1,
560 'action': 'unmod-no-hl', 'line': action_line}
566 'action': 'unmod-no-hl', 'line': action_line}
561 ],
567 ],
562 'section_header': u'', 'target_start': 1, 'target_length': 1}
568 'section_header': u'', 'target_start': 1, 'target_length': 1}
563
569
564 hunkbit = self.parse_hunk(hunk_ops, source_file, target_file)
570 hunkbit = self.parse_hunk(hunk_ops, source_file, target_file)
565 hunkbit.source_file_path = source_file_path
571 hunkbit.source_file_path = source_file_path
566 hunkbit.target_file_path = target_file_path
572 hunkbit.target_file_path = target_file_path
567 filediff.hunk_ops = hunkbit
573 filediff.hunk_ops = hunkbit
568 return filediff
574 return filediff
569
575
570 def parse_hunk(self, hunk, source_file, target_file):
576 def parse_hunk(self, hunk, source_file, target_file):
571 result = AttributeDict(dict(
577 result = AttributeDict(dict(
572 source_start=hunk['source_start'],
578 source_start=hunk['source_start'],
573 source_length=hunk['source_length'],
579 source_length=hunk['source_length'],
574 target_start=hunk['target_start'],
580 target_start=hunk['target_start'],
575 target_length=hunk['target_length'],
581 target_length=hunk['target_length'],
576 section_header=hunk['section_header'],
582 section_header=hunk['section_header'],
577 lines=[],
583 lines=[],
578 ))
584 ))
579 before, after = [], []
585 before, after = [], []
580
586
581 for line in hunk['lines']:
587 for line in hunk['lines']:
582 if line['action'] in ['unmod', 'unmod-no-hl']:
588 if line['action'] in ['unmod', 'unmod-no-hl']:
583 no_hl = line['action'] == 'unmod-no-hl'
589 no_hl = line['action'] == 'unmod-no-hl'
584 result.lines.extend(
590 result.lines.extend(
585 self.parse_lines(before, after, source_file, target_file, no_hl=no_hl))
591 self.parse_lines(before, after, source_file, target_file, no_hl=no_hl))
586 after.append(line)
592 after.append(line)
587 before.append(line)
593 before.append(line)
588 elif line['action'] == 'add':
594 elif line['action'] == 'add':
589 after.append(line)
595 after.append(line)
590 elif line['action'] == 'del':
596 elif line['action'] == 'del':
591 before.append(line)
597 before.append(line)
592 elif line['action'] == 'old-no-nl':
598 elif line['action'] == 'old-no-nl':
593 before.append(line)
599 before.append(line)
594 elif line['action'] == 'new-no-nl':
600 elif line['action'] == 'new-no-nl':
595 after.append(line)
601 after.append(line)
596
602
597 all_actions = [x['action'] for x in after] + [x['action'] for x in before]
603 all_actions = [x['action'] for x in after] + [x['action'] for x in before]
598 no_hl = {x for x in all_actions} == {'unmod-no-hl'}
604 no_hl = {x for x in all_actions} == {'unmod-no-hl'}
599 result.lines.extend(
605 result.lines.extend(
600 self.parse_lines(before, after, source_file, target_file, no_hl=no_hl))
606 self.parse_lines(before, after, source_file, target_file, no_hl=no_hl))
601 # NOTE(marcink): we must keep list() call here so we can cache the result...
607 # NOTE(marcink): we must keep list() call here so we can cache the result...
602 result.unified = list(self.as_unified(result.lines))
608 result.unified = list(self.as_unified(result.lines))
603 result.sideside = result.lines
609 result.sideside = result.lines
604
610
605 return result
611 return result
606
612
607 def parse_lines(self, before_lines, after_lines, source_file, target_file,
613 def parse_lines(self, before_lines, after_lines, source_file, target_file,
608 no_hl=False):
614 no_hl=False):
609 # TODO: dan: investigate doing the diff comparison and fast highlighting
615 # TODO: dan: investigate doing the diff comparison and fast highlighting
610 # on the entire before and after buffered block lines rather than by
616 # on the entire before and after buffered block lines rather than by
611 # line, this means we can get better 'fast' highlighting if the context
617 # line, this means we can get better 'fast' highlighting if the context
612 # allows it - eg.
618 # allows it - eg.
613 # line 4: """
619 # line 4: """
614 # line 5: this gets highlighted as a string
620 # line 5: this gets highlighted as a string
615 # line 6: """
621 # line 6: """
616
622
617 lines = []
623 lines = []
618
624
619 before_newline = AttributeDict()
625 before_newline = AttributeDict()
620 after_newline = AttributeDict()
626 after_newline = AttributeDict()
621 if before_lines and before_lines[-1]['action'] == 'old-no-nl':
627 if before_lines and before_lines[-1]['action'] == 'old-no-nl':
622 before_newline_line = before_lines.pop(-1)
628 before_newline_line = before_lines.pop(-1)
623 before_newline.content = '\n {}'.format(
629 before_newline.content = '\n {}'.format(
624 render_tokenstream(
630 render_tokenstream(
625 [(x[0], '', x[1])
631 [(x[0], '', x[1])
626 for x in [('nonl', before_newline_line['line'])]]))
632 for x in [('nonl', before_newline_line['line'])]]))
627
633
628 if after_lines and after_lines[-1]['action'] == 'new-no-nl':
634 if after_lines and after_lines[-1]['action'] == 'new-no-nl':
629 after_newline_line = after_lines.pop(-1)
635 after_newline_line = after_lines.pop(-1)
630 after_newline.content = '\n {}'.format(
636 after_newline.content = '\n {}'.format(
631 render_tokenstream(
637 render_tokenstream(
632 [(x[0], '', x[1])
638 [(x[0], '', x[1])
633 for x in [('nonl', after_newline_line['line'])]]))
639 for x in [('nonl', after_newline_line['line'])]]))
634
640
635 while before_lines or after_lines:
641 while before_lines or after_lines:
636 before, after = None, None
642 before, after = None, None
637 before_tokens, after_tokens = None, None
643 before_tokens, after_tokens = None, None
638
644
639 if before_lines:
645 if before_lines:
640 before = before_lines.pop(0)
646 before = before_lines.pop(0)
641 if after_lines:
647 if after_lines:
642 after = after_lines.pop(0)
648 after = after_lines.pop(0)
643
649
644 original = AttributeDict()
650 original = AttributeDict()
645 modified = AttributeDict()
651 modified = AttributeDict()
646
652
647 if before:
653 if before:
648 if before['action'] == 'old-no-nl':
654 if before['action'] == 'old-no-nl':
649 before_tokens = [('nonl', before['line'])]
655 before_tokens = [('nonl', before['line'])]
650 else:
656 else:
651 before_tokens = self.get_line_tokens(
657 before_tokens = self.get_line_tokens(
652 line_text=before['line'], line_number=before['old_lineno'],
658 line_text=before['line'], line_number=before['old_lineno'],
653 input_file=source_file, no_hl=no_hl)
659 input_file=source_file, no_hl=no_hl)
654 original.lineno = before['old_lineno']
660 original.lineno = before['old_lineno']
655 original.content = before['line']
661 original.content = before['line']
656 original.action = self.action_to_op(before['action'])
662 original.action = self.action_to_op(before['action'])
657
663
658 original.get_comment_args = (
664 original.get_comment_args = (
659 source_file, 'o', before['old_lineno'])
665 source_file, 'o', before['old_lineno'])
660
666
661 if after:
667 if after:
662 if after['action'] == 'new-no-nl':
668 if after['action'] == 'new-no-nl':
663 after_tokens = [('nonl', after['line'])]
669 after_tokens = [('nonl', after['line'])]
664 else:
670 else:
665 after_tokens = self.get_line_tokens(
671 after_tokens = self.get_line_tokens(
666 line_text=after['line'], line_number=after['new_lineno'],
672 line_text=after['line'], line_number=after['new_lineno'],
667 input_file=target_file, no_hl=no_hl)
673 input_file=target_file, no_hl=no_hl)
668 modified.lineno = after['new_lineno']
674 modified.lineno = after['new_lineno']
669 modified.content = after['line']
675 modified.content = after['line']
670 modified.action = self.action_to_op(after['action'])
676 modified.action = self.action_to_op(after['action'])
671
677
672 modified.get_comment_args = (target_file, 'n', after['new_lineno'])
678 modified.get_comment_args = (target_file, 'n', after['new_lineno'])
673
679
674 # diff the lines
680 # diff the lines
675 if before_tokens and after_tokens:
681 if before_tokens and after_tokens:
676 o_tokens, m_tokens, similarity = tokens_diff(
682 o_tokens, m_tokens, similarity = tokens_diff(
677 before_tokens, after_tokens)
683 before_tokens, after_tokens)
678 original.content = render_tokenstream(o_tokens)
684 original.content = render_tokenstream(o_tokens)
679 modified.content = render_tokenstream(m_tokens)
685 modified.content = render_tokenstream(m_tokens)
680 elif before_tokens:
686 elif before_tokens:
681 original.content = render_tokenstream(
687 original.content = render_tokenstream(
682 [(x[0], '', x[1]) for x in before_tokens])
688 [(x[0], '', x[1]) for x in before_tokens])
683 elif after_tokens:
689 elif after_tokens:
684 modified.content = render_tokenstream(
690 modified.content = render_tokenstream(
685 [(x[0], '', x[1]) for x in after_tokens])
691 [(x[0], '', x[1]) for x in after_tokens])
686
692
687 if not before_lines and before_newline:
693 if not before_lines and before_newline:
688 original.content += before_newline.content
694 original.content += before_newline.content
689 before_newline = None
695 before_newline = None
690 if not after_lines and after_newline:
696 if not after_lines and after_newline:
691 modified.content += after_newline.content
697 modified.content += after_newline.content
692 after_newline = None
698 after_newline = None
693
699
694 lines.append(AttributeDict({
700 lines.append(AttributeDict({
695 'original': original,
701 'original': original,
696 'modified': modified,
702 'modified': modified,
697 }))
703 }))
698
704
699 return lines
705 return lines
700
706
701 def get_line_tokens(self, line_text, line_number, input_file=None, no_hl=False):
707 def get_line_tokens(self, line_text, line_number, input_file=None, no_hl=False):
702 filenode = None
708 filenode = None
703 filename = None
709 filename = None
704
710
705 if isinstance(input_file, compat.string_types):
711 if isinstance(input_file, compat.string_types):
706 filename = input_file
712 filename = input_file
707 elif isinstance(input_file, FileNode):
713 elif isinstance(input_file, FileNode):
708 filenode = input_file
714 filenode = input_file
709 filename = input_file.unicode_path
715 filename = input_file.unicode_path
710
716
711 hl_mode = self.HL_NONE if no_hl else self.highlight_mode
717 hl_mode = self.HL_NONE if no_hl else self.highlight_mode
712 if hl_mode == self.HL_REAL and filenode:
718 if hl_mode == self.HL_REAL and filenode:
713 lexer = self._get_lexer_for_filename(filename)
719 lexer = self._get_lexer_for_filename(filename)
714 file_size_allowed = input_file.size < self.max_file_size_limit
720 file_size_allowed = input_file.size < self.max_file_size_limit
715 if line_number and file_size_allowed:
721 if line_number and file_size_allowed:
716 return self.get_tokenized_filenode_line(
722 return self.get_tokenized_filenode_line(
717 input_file, line_number, lexer)
723 input_file, line_number, lexer)
718
724
719 if hl_mode in (self.HL_REAL, self.HL_FAST) and filename:
725 if hl_mode in (self.HL_REAL, self.HL_FAST) and filename:
720 lexer = self._get_lexer_for_filename(filename)
726 lexer = self._get_lexer_for_filename(filename)
721 return list(tokenize_string(line_text, lexer))
727 return list(tokenize_string(line_text, lexer))
722
728
723 return list(tokenize_string(line_text, plain_text_lexer))
729 return list(tokenize_string(line_text, plain_text_lexer))
724
730
725 def get_tokenized_filenode_line(self, filenode, line_number, lexer=None):
731 def get_tokenized_filenode_line(self, filenode, line_number, lexer=None):
726
732
727 if filenode not in self.highlighted_filenodes:
733 if filenode not in self.highlighted_filenodes:
728 tokenized_lines = filenode_as_lines_tokens(filenode, lexer)
734 tokenized_lines = filenode_as_lines_tokens(filenode, lexer)
729 self.highlighted_filenodes[filenode] = tokenized_lines
735 self.highlighted_filenodes[filenode] = tokenized_lines
730
736
731 try:
737 try:
732 return self.highlighted_filenodes[filenode][line_number - 1]
738 return self.highlighted_filenodes[filenode][line_number - 1]
733 except Exception:
739 except Exception:
734 return [('', u'rhodecode diff rendering error')]
740 return [('', u'rhodecode diff rendering error')]
735
741
736 def action_to_op(self, action):
742 def action_to_op(self, action):
737 return {
743 return {
738 'add': '+',
744 'add': '+',
739 'del': '-',
745 'del': '-',
740 'unmod': ' ',
746 'unmod': ' ',
741 'unmod-no-hl': ' ',
747 'unmod-no-hl': ' ',
742 'old-no-nl': ' ',
748 'old-no-nl': ' ',
743 'new-no-nl': ' ',
749 'new-no-nl': ' ',
744 }.get(action, action)
750 }.get(action, action)
745
751
746 def as_unified(self, lines):
752 def as_unified(self, lines):
747 """
753 """
748 Return a generator that yields the lines of a diff in unified order
754 Return a generator that yields the lines of a diff in unified order
749 """
755 """
750 def generator():
756 def generator():
751 buf = []
757 buf = []
752 for line in lines:
758 for line in lines:
753
759
754 if buf and not line.original or line.original.action == ' ':
760 if buf and not line.original or line.original.action == ' ':
755 for b in buf:
761 for b in buf:
756 yield b
762 yield b
757 buf = []
763 buf = []
758
764
759 if line.original:
765 if line.original:
760 if line.original.action == ' ':
766 if line.original.action == ' ':
761 yield (line.original.lineno, line.modified.lineno,
767 yield (line.original.lineno, line.modified.lineno,
762 line.original.action, line.original.content,
768 line.original.action, line.original.content,
763 line.original.get_comment_args)
769 line.original.get_comment_args)
764 continue
770 continue
765
771
766 if line.original.action == '-':
772 if line.original.action == '-':
767 yield (line.original.lineno, None,
773 yield (line.original.lineno, None,
768 line.original.action, line.original.content,
774 line.original.action, line.original.content,
769 line.original.get_comment_args)
775 line.original.get_comment_args)
770
776
771 if line.modified.action == '+':
777 if line.modified.action == '+':
772 buf.append((
778 buf.append((
773 None, line.modified.lineno,
779 None, line.modified.lineno,
774 line.modified.action, line.modified.content,
780 line.modified.action, line.modified.content,
775 line.modified.get_comment_args))
781 line.modified.get_comment_args))
776 continue
782 continue
777
783
778 if line.modified:
784 if line.modified:
779 yield (None, line.modified.lineno,
785 yield (None, line.modified.lineno,
780 line.modified.action, line.modified.content,
786 line.modified.action, line.modified.content,
781 line.modified.get_comment_args)
787 line.modified.get_comment_args)
782
788
783 for b in buf:
789 for b in buf:
784 yield b
790 yield b
785
791
786 return generator()
792 return generator()
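For reference, the intra-line highlighting in the file above boils down to tokens_diff(): run difflib.SequenceMatcher over the token texts of the old and new line and tag each token with '', 'del' or 'ins' before render_tokenstream() turns them into spans. The standalone sketch below reproduces just that idea on plain (class, text) tuples; simple_tokens_diff is a hypothetical name, and the Pygments lexing, the similarity cut-off and the diff_match_patch character smoothing are deliberately left out, so this is an illustration rather than the module's actual code path.

import difflib

def simple_tokens_diff(old_tokens, new_tokens):
    # Tag (class, text) tokens with '', 'del' or 'ins', mirroring the
    # opcode handling in tokens_diff() above; character-level refinement
    # of 'replace' blocks is intentionally omitted in this sketch.
    old_out, new_out = [], []
    matcher = difflib.SequenceMatcher(
        None, [t[1] for t in old_tokens], [t[1] for t in new_tokens])
    for tag, o1, o2, n1, n2 in matcher.get_opcodes():
        if tag == 'equal':
            old_out.extend((c, '', t) for c, t in old_tokens[o1:o2])
            new_out.extend((c, '', t) for c, t in new_tokens[n1:n2])
            continue
        if tag in ('delete', 'replace'):
            old_out.extend((c, 'del', t) for c, t in old_tokens[o1:o2])
        if tag in ('insert', 'replace'):
            new_out.extend((c, 'ins', t) for c, t in new_tokens[n1:n2])
    return old_out, new_out

old_line = [('k', 'def'), ('', ' '), ('n', 'render')]
new_line = [('k', 'def'), ('', ' '), ('n', 'render_tokens')]
print(simple_tokens_diff(old_line, new_line))
# ([('k', '', 'def'), ('', '', ' '), ('n', 'del', 'render')],
#  [('k', '', 'def'), ('', '', ' '), ('n', 'ins', 'render_tokens')])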