comments: save comments that are not rendered to be displayed as outdated...
marcink
r1258:70c673b5 default
@@ -1,668 +1,687 @@
 # -*- coding: utf-8 -*-

 # Copyright (C) 2011-2016 RhodeCode GmbH
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU Affero General Public License, version 3
 # (only), as published by the Free Software Foundation.
 #
 # This program is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 # GNU General Public License for more details.
 #
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
 # This program is dual-licensed. If you wish to learn more about the
 # RhodeCode Enterprise Edition, including its added features, Support services,
 # and proprietary license terms, please see https://rhodecode.com/licenses/

 import logging
 import difflib
 from itertools import groupby

 from pygments import lex
 from pygments.formatters.html import _get_ttype_class as pygment_token_class
 from rhodecode.lib.helpers import (
     get_lexer_for_filenode, get_lexer_safe, html_escape)
 from rhodecode.lib.utils2 import AttributeDict
 from rhodecode.lib.vcs.nodes import FileNode
 from rhodecode.lib.diff_match_patch import diff_match_patch
 from rhodecode.lib.diffs import LimitedDiffContainer
 from pygments.lexers import get_lexer_by_name

 plain_text_lexer = get_lexer_by_name(
     'text', stripall=False, stripnl=False, ensurenl=False)


 log = logging.getLogger()


 def filenode_as_lines_tokens(filenode, lexer=None):
     lexer = lexer or get_lexer_for_filenode(filenode)
     log.debug('Generating file node pygment tokens for %s, %s', lexer, filenode)
     tokens = tokenize_string(filenode.content, lexer)
     lines = split_token_stream(tokens, split_string='\n')
     rv = list(lines)
     return rv


 def tokenize_string(content, lexer):
     """
     Use pygments to tokenize some content based on a lexer
     ensuring all original new lines and whitespace is preserved
     """

     lexer.stripall = False
     lexer.stripnl = False
     lexer.ensurenl = False
     for token_type, token_text in lex(content, lexer):
         yield pygment_token_class(token_type), token_text


 def split_token_stream(tokens, split_string=u'\n'):
     """
     Take a list of (TokenType, text) tuples and split them by a string

     >>> split_token_stream([(TEXT, 'some\ntext'), (TEXT, 'more\n')])
     [(TEXT, 'some'), (TEXT, 'text'),
      (TEXT, 'more'), (TEXT, 'text')]
     """

     buffer = []
     for token_class, token_text in tokens:
         parts = token_text.split(split_string)
         for part in parts[:-1]:
             buffer.append((token_class, part))
             yield buffer
             buffer = []

         buffer.append((token_class, parts[-1]))

     if buffer:
         yield buffer


 def filenode_as_annotated_lines_tokens(filenode):
     """
     Take a file node and return a list of annotations => lines, if no annotation
     is found, it will be None.

     eg:

     [
         (annotation1, [
             (1, line1_tokens_list),
             (2, line2_tokens_list),
         ]),
         (annotation2, [
             (3, line1_tokens_list),
         ]),
         (None, [
             (4, line1_tokens_list),
         ]),
         (annotation1, [
             (5, line1_tokens_list),
             (6, line2_tokens_list),
         ])
     ]
     """

     commit_cache = {} # cache commit_getter lookups

     def _get_annotation(commit_id, commit_getter):
         if commit_id not in commit_cache:
             commit_cache[commit_id] = commit_getter()
         return commit_cache[commit_id]

     annotation_lookup = {
         line_no: _get_annotation(commit_id, commit_getter)
         for line_no, commit_id, commit_getter, line_content
         in filenode.annotate
     }

     annotations_lines = ((annotation_lookup.get(line_no), line_no, tokens)
                          for line_no, tokens
                          in enumerate(filenode_as_lines_tokens(filenode), 1))

     grouped_annotations_lines = groupby(annotations_lines, lambda x: x[0])

     for annotation, group in grouped_annotations_lines:
         yield (
             annotation, [(line_no, tokens)
                          for (_, line_no, tokens) in group]
         )


 def render_tokenstream(tokenstream):
     result = []
     for token_class, token_ops_texts in rollup_tokenstream(tokenstream):

         if token_class:
             result.append(u'<span class="%s">' % token_class)
         else:
             result.append(u'<span>')

         for op_tag, token_text in token_ops_texts:

             if op_tag:
                 result.append(u'<%s>' % op_tag)

             escaped_text = html_escape(token_text)

             # TODO: dan: investigate showing hidden characters like space/nl/tab
             # escaped_text = escaped_text.replace(' ', '<sp> </sp>')
             # escaped_text = escaped_text.replace('\n', '<nl>\n</nl>')
             # escaped_text = escaped_text.replace('\t', '<tab>\t</tab>')

             result.append(escaped_text)

             if op_tag:
                 result.append(u'</%s>' % op_tag)

         result.append(u'</span>')

     html = ''.join(result)
     return html


 def rollup_tokenstream(tokenstream):
     """
     Group a token stream of the format:

         ('class', 'op', 'text')
     or
         ('class', 'text')

     into

         [('class1',
            [('op1', 'text'),
             ('op2', 'text')]),
          ('class2',
            [('op3', 'text')])]

     This is used to get the minimal tags necessary when
     rendering to html eg for a token stream ie.

     <span class="A"><ins>he</ins>llo</span>
     vs
     <span class="A"><ins>he</ins></span><span class="A">llo</span>

     If a 2 tuple is passed in, the output op will be an empty string.

     eg:

     >>> rollup_tokenstream([('classA', '', 'h'),
                             ('classA', 'del', 'ell'),
                             ('classA', '', 'o'),
                             ('classB', '', ' '),
                             ('classA', '', 'the'),
                             ('classA', '', 're'),
                             ])

     [('classA', [('', 'h'), ('del', 'ell'), ('', 'o')],
      ('classB', [('', ' ')],
      ('classA', [('', 'there')]]

     """
     if tokenstream and len(tokenstream[0]) == 2:
         tokenstream = ((t[0], '', t[1]) for t in tokenstream)

     result = []
     for token_class, op_list in groupby(tokenstream, lambda t: t[0]):
         ops = []
         for token_op, token_text_list in groupby(op_list, lambda o: o[1]):
             text_buffer = []
             for t_class, t_op, t_text in token_text_list:
                 text_buffer.append(t_text)
             ops.append((token_op, ''.join(text_buffer)))
         result.append((token_class, ops))
     return result


 def tokens_diff(old_tokens, new_tokens, use_diff_match_patch=True):
     """
     Converts a list of (token_class, token_text) tuples to a list of
     (token_class, token_op, token_text) tuples where token_op is one of
     ('ins', 'del', '')

     :param old_tokens: list of (token_class, token_text) tuples of old line
     :param new_tokens: list of (token_class, token_text) tuples of new line
     :param use_diff_match_patch: boolean, will use google's diff match patch
         library which has options to 'smooth' out the character by character
         differences making nicer ins/del blocks
     """

     old_tokens_result = []
     new_tokens_result = []

     similarity = difflib.SequenceMatcher(None,
         ''.join(token_text for token_class, token_text in old_tokens),
         ''.join(token_text for token_class, token_text in new_tokens)
     ).ratio()

     if similarity < 0.6: # return, the blocks are too different
         for token_class, token_text in old_tokens:
             old_tokens_result.append((token_class, '', token_text))
         for token_class, token_text in new_tokens:
             new_tokens_result.append((token_class, '', token_text))
         return old_tokens_result, new_tokens_result, similarity

     token_sequence_matcher = difflib.SequenceMatcher(None,
         [x[1] for x in old_tokens],
         [x[1] for x in new_tokens])

     for tag, o1, o2, n1, n2 in token_sequence_matcher.get_opcodes():
         # check the differences by token block types first to give a more
         # nicer "block" level replacement vs character diffs

         if tag == 'equal':
             for token_class, token_text in old_tokens[o1:o2]:
                 old_tokens_result.append((token_class, '', token_text))
             for token_class, token_text in new_tokens[n1:n2]:
                 new_tokens_result.append((token_class, '', token_text))
         elif tag == 'delete':
             for token_class, token_text in old_tokens[o1:o2]:
                 old_tokens_result.append((token_class, 'del', token_text))
         elif tag == 'insert':
             for token_class, token_text in new_tokens[n1:n2]:
                 new_tokens_result.append((token_class, 'ins', token_text))
         elif tag == 'replace':
             # if same type token blocks must be replaced, do a diff on the
             # characters in the token blocks to show individual changes

             old_char_tokens = []
             new_char_tokens = []
             for token_class, token_text in old_tokens[o1:o2]:
                 for char in token_text:
                     old_char_tokens.append((token_class, char))

             for token_class, token_text in new_tokens[n1:n2]:
                 for char in token_text:
                     new_char_tokens.append((token_class, char))

             old_string = ''.join([token_text for
                 token_class, token_text in old_char_tokens])
             new_string = ''.join([token_text for
                 token_class, token_text in new_char_tokens])

             char_sequence = difflib.SequenceMatcher(
                 None, old_string, new_string)
             copcodes = char_sequence.get_opcodes()
             obuffer, nbuffer = [], []

             if use_diff_match_patch:
                 dmp = diff_match_patch()
                 dmp.Diff_EditCost = 11 # TODO: dan: extract this to a setting
                 reps = dmp.diff_main(old_string, new_string)
                 dmp.diff_cleanupEfficiency(reps)

                 a, b = 0, 0
                 for op, rep in reps:
                     l = len(rep)
                     if op == 0:
                         for i, c in enumerate(rep):
                             obuffer.append((old_char_tokens[a+i][0], '', c))
                             nbuffer.append((new_char_tokens[b+i][0], '', c))
                         a += l
                         b += l
                     elif op == -1:
                         for i, c in enumerate(rep):
                             obuffer.append((old_char_tokens[a+i][0], 'del', c))
                         a += l
                     elif op == 1:
                         for i, c in enumerate(rep):
                             nbuffer.append((new_char_tokens[b+i][0], 'ins', c))
                         b += l
             else:
                 for ctag, co1, co2, cn1, cn2 in copcodes:
                     if ctag == 'equal':
                         for token_class, token_text in old_char_tokens[co1:co2]:
                             obuffer.append((token_class, '', token_text))
                         for token_class, token_text in new_char_tokens[cn1:cn2]:
                             nbuffer.append((token_class, '', token_text))
                     elif ctag == 'delete':
                         for token_class, token_text in old_char_tokens[co1:co2]:
                             obuffer.append((token_class, 'del', token_text))
                     elif ctag == 'insert':
                         for token_class, token_text in new_char_tokens[cn1:cn2]:
                             nbuffer.append((token_class, 'ins', token_text))
                     elif ctag == 'replace':
                         for token_class, token_text in old_char_tokens[co1:co2]:
                             obuffer.append((token_class, 'del', token_text))
                         for token_class, token_text in new_char_tokens[cn1:cn2]:
                             nbuffer.append((token_class, 'ins', token_text))

             old_tokens_result.extend(obuffer)
             new_tokens_result.extend(nbuffer)

     return old_tokens_result, new_tokens_result, similarity


 class DiffSet(object):
     """
     An object for parsing the diff result from diffs.DiffProcessor and
     adding highlighting, side by side/unified renderings and line diffs
     """

     HL_REAL = 'REAL' # highlights using original file, slow
     HL_FAST = 'FAST' # highlights using just the line, fast but not correct
                      # in the case of multiline code
     HL_NONE = 'NONE' # no highlighting, fastest

     def __init__(self, highlight_mode=HL_REAL, repo_name=None,
                  source_repo_name=None,
                  source_node_getter=lambda filename: None,
                  target_node_getter=lambda filename: None,
                  source_nodes=None, target_nodes=None,
                  max_file_size_limit=150 * 1024, # files over this size will
                                                  # use fast highlighting
                  comments=None,
                  ):

         self.highlight_mode = highlight_mode
         self.highlighted_filenodes = {}
         self.source_node_getter = source_node_getter
         self.target_node_getter = target_node_getter
         self.source_nodes = source_nodes or {}
         self.target_nodes = target_nodes or {}
         self.repo_name = repo_name
         self.source_repo_name = source_repo_name or repo_name
         self.comments = comments or {}
+        self.comments_store = self.comments.copy()
         self.max_file_size_limit = max_file_size_limit

     def render_patchset(self, patchset, source_ref=None, target_ref=None):
         diffset = AttributeDict(dict(
             lines_added=0,
             lines_deleted=0,
             changed_files=0,
             files=[],
             limited_diff=isinstance(patchset, LimitedDiffContainer),
             repo_name=self.repo_name,
             source_repo_name=self.source_repo_name,
             source_ref=source_ref,
             target_ref=target_ref,
         ))
         for patch in patchset:
             filediff = self.render_patch(patch)
             filediff.diffset = diffset
             diffset.files.append(filediff)
             diffset.changed_files += 1
             if not patch['stats']['binary']:
                 diffset.lines_added += patch['stats']['added']
                 diffset.lines_deleted += patch['stats']['deleted']

         return diffset

     _lexer_cache = {}
     def _get_lexer_for_filename(self, filename):
         # cached because we might need to call it twice for source/target
         if filename not in self._lexer_cache:
             self._lexer_cache[filename] = get_lexer_safe(filepath=filename)
         return self._lexer_cache[filename]

     def render_patch(self, patch):
         log.debug('rendering diff for %r' % patch['filename'])

         source_filename = patch['original_filename']
         target_filename = patch['filename']

         source_lexer = plain_text_lexer
         target_lexer = plain_text_lexer

         if not patch['stats']['binary']:
             if self.highlight_mode == self.HL_REAL:
                 if (source_filename and patch['operation'] in ('D', 'M')
                     and source_filename not in self.source_nodes):
                     self.source_nodes[source_filename] = (
                         self.source_node_getter(source_filename))

                 if (target_filename and patch['operation'] in ('A', 'M')
                     and target_filename not in self.target_nodes):
                     self.target_nodes[target_filename] = (
                         self.target_node_getter(target_filename))

             elif self.highlight_mode == self.HL_FAST:
                 source_lexer = self._get_lexer_for_filename(source_filename)
                 target_lexer = self._get_lexer_for_filename(target_filename)

         source_file = self.source_nodes.get(source_filename, source_filename)
         target_file = self.target_nodes.get(target_filename, target_filename)

         source_filenode, target_filenode = None, None

         # TODO: dan: FileNode.lexer works on the content of the file - which
         # can be slow - issue #4289 explains a lexer clean up - which once
         # done can allow caching a lexer for a filenode to avoid the file lookup
         if isinstance(source_file, FileNode):
             source_filenode = source_file
             source_lexer = source_file.lexer
         if isinstance(target_file, FileNode):
             target_filenode = target_file
             target_lexer = target_file.lexer

         source_file_path, target_file_path = None, None

         if source_filename != '/dev/null':
             source_file_path = source_filename
         if target_filename != '/dev/null':
             target_file_path = target_filename

         source_file_type = source_lexer.name
         target_file_type = target_lexer.name

         op_hunks = patch['chunks'][0]
         hunks = patch['chunks'][1:]

         filediff = AttributeDict({
             'source_file_path': source_file_path,
             'target_file_path': target_file_path,
             'source_filenode': source_filenode,
             'target_filenode': target_filenode,
             'hunks': [],
             'source_file_type': target_file_type,
             'target_file_type': source_file_type,
             'patch': patch,
             'source_mode': patch['stats']['old_mode'],
             'target_mode': patch['stats']['new_mode'],
             'limited_diff': isinstance(patch, LimitedDiffContainer),
             'diffset': self,
         })

         for hunk in hunks:
             hunkbit = self.parse_hunk(hunk, source_file, target_file)
             hunkbit.filediff = filediff
             filediff.hunks.append(hunkbit)
+
+        left_comments = {}
+
+        if source_file_path in self.comments_store:
+            for lineno, comments in self.comments_store[source_file_path].items():
+                left_comments[lineno] = comments
+
+        if target_file_path in self.comments_store:
+            for lineno, comments in self.comments_store[target_file_path].items():
+                left_comments[lineno] = comments
+
+        filediff.left_comments = left_comments
         return filediff

     def parse_hunk(self, hunk, source_file, target_file):
         result = AttributeDict(dict(
             source_start=hunk['source_start'],
             source_length=hunk['source_length'],
             target_start=hunk['target_start'],
             target_length=hunk['target_length'],
             section_header=hunk['section_header'],
             lines=[],
         ))
         before, after = [], []

         for line in hunk['lines']:
             if line['action'] == 'unmod':
                 result.lines.extend(
                     self.parse_lines(before, after, source_file, target_file))
                 after.append(line)
                 before.append(line)
             elif line['action'] == 'add':
                 after.append(line)
             elif line['action'] == 'del':
                 before.append(line)
             elif line['action'] == 'old-no-nl':
                 before.append(line)
             elif line['action'] == 'new-no-nl':
                 after.append(line)

         result.lines.extend(
             self.parse_lines(before, after, source_file, target_file))
         result.unified = self.as_unified(result.lines)
         result.sideside = result.lines
+
         return result

     def parse_lines(self, before_lines, after_lines, source_file, target_file):
         # TODO: dan: investigate doing the diff comparison and fast highlighting
         # on the entire before and after buffered block lines rather than by
         # line, this means we can get better 'fast' highlighting if the context
         # allows it - eg.
         # line 4: """
         # line 5: this gets highlighted as a string
         # line 6: """

         lines = []
         while before_lines or after_lines:
             before, after = None, None
             before_tokens, after_tokens = None, None

             if before_lines:
                 before = before_lines.pop(0)
             if after_lines:
                 after = after_lines.pop(0)

             original = AttributeDict()
             modified = AttributeDict()

             if before:
                 if before['action'] == 'old-no-nl':
                     before_tokens = [('nonl', before['line'])]
                 else:
                     before_tokens = self.get_line_tokens(
                         line_text=before['line'], line_number=before['old_lineno'],
                         file=source_file)
                 original.lineno = before['old_lineno']
                 original.content = before['line']
                 original.action = self.action_to_op(before['action'])
                 original.comments = self.get_comments_for('old',
                     source_file, before['old_lineno'])

             if after:
                 if after['action'] == 'new-no-nl':
                     after_tokens = [('nonl', after['line'])]
                 else:
                     after_tokens = self.get_line_tokens(
                         line_text=after['line'], line_number=after['new_lineno'],
                         file=target_file)
                 modified.lineno = after['new_lineno']
                 modified.content = after['line']
                 modified.action = self.action_to_op(after['action'])
                 modified.comments = self.get_comments_for('new',
                     target_file, after['new_lineno'])

             # diff the lines
             if before_tokens and after_tokens:
                 o_tokens, m_tokens, similarity = tokens_diff(
                     before_tokens, after_tokens)
                 original.content = render_tokenstream(o_tokens)
                 modified.content = render_tokenstream(m_tokens)
             elif before_tokens:
                 original.content = render_tokenstream(
                     [(x[0], '', x[1]) for x in before_tokens])
             elif after_tokens:
                 modified.content = render_tokenstream(
                     [(x[0], '', x[1]) for x in after_tokens])

             lines.append(AttributeDict({
                 'original': original,
                 'modified': modified,
             }))

         return lines

     def get_comments_for(self, version, file, line_number):
         if hasattr(file, 'unicode_path'):
             file = file.unicode_path

         if not isinstance(file, basestring):
             return None

         line_key = {
             'old': 'o',
             'new': 'n',
         }[version] + str(line_number)

-        return self.comments.get(file, {}).get(line_key)
+        if file in self.comments_store:
+            file_comments = self.comments_store[file]
+            if line_key in file_comments:
+                return file_comments.pop(line_key)

     def get_line_tokens(self, line_text, line_number, file=None):
         filenode = None
         filename = None

         if isinstance(file, basestring):
             filename = file
         elif isinstance(file, FileNode):
             filenode = file
             filename = file.unicode_path

         if self.highlight_mode == self.HL_REAL and filenode:
             if line_number and file.size < self.max_file_size_limit:
                 return self.get_tokenized_filenode_line(file, line_number)

         if self.highlight_mode in (self.HL_REAL, self.HL_FAST) and filename:
             lexer = self._get_lexer_for_filename(filename)
             return list(tokenize_string(line_text, lexer))

         return list(tokenize_string(line_text, plain_text_lexer))

     def get_tokenized_filenode_line(self, filenode, line_number):

         if filenode not in self.highlighted_filenodes:
             tokenized_lines = filenode_as_lines_tokens(filenode, filenode.lexer)
             self.highlighted_filenodes[filenode] = tokenized_lines
         return self.highlighted_filenodes[filenode][line_number - 1]

     def action_to_op(self, action):
         return {
             'add': '+',
             'del': '-',
             'unmod': ' ',
             'old-no-nl': ' ',
             'new-no-nl': ' ',
         }.get(action, action)

     def as_unified(self, lines):
-        """ Return a generator that yields the lines of a diff in unified order """
+        """
+        Return a generator that yields the lines of a diff in unified order
+        """
         def generator():
             buf = []
             for line in lines:

                 if buf and not line.original or line.original.action == ' ':
                     for b in buf:
                         yield b
                     buf = []

                 if line.original:
                     if line.original.action == ' ':
                         yield (line.original.lineno, line.modified.lineno,
                                line.original.action, line.original.content,
                                line.original.comments)
                         continue

                     if line.original.action == '-':
                         yield (line.original.lineno, None,
                                line.original.action, line.original.content,
                                line.original.comments)

                     if line.modified.action == '+':
                         buf.append((
                             None, line.modified.lineno,
                             line.modified.action, line.modified.content,
                             line.modified.comments))
                         continue

                 if line.modified:
                     yield (None, line.modified.lineno,
                            line.modified.action, line.modified.content,
                            line.modified.comments)

             for b in buf:
                 yield b

         return generator()
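
A minimal sketch of the behaviour this change introduces, assuming the module path rhodecode.lib.codeblocks and a simple list payload for comments (both are illustrative assumptions, not part of the commit): get_comments_for() pops rendered comments out of comments_store, and render_patch() sweeps whatever is left into filediff.left_comments so it can be shown as outdated.

# Minimal sketch, assuming rhodecode.lib.codeblocks as the module path and
# plain lists as comment payloads; both are assumptions for illustration.
from rhodecode.lib.codeblocks import DiffSet

# comments are keyed by file path, then by 'o<old_lineno>' / 'n<new_lineno>',
# matching the line_key built in get_comments_for()
comments = {
    u'setup.py': {
        'n10': ['comment attached to new line 10'],
        'n999': ['comment on a line the new diff no longer contains'],
    },
}

diffset = DiffSet(comments=comments)

# While hunks are rendered, get_comments_for() pops each consumed entry out of
# diffset.comments_store (a copy of the incoming mapping)...
rendered = diffset.get_comments_for('new', u'setup.py', 10)  # -> the 'n10' list

# ...so whatever remains for the file is what render_patch() collects into
# filediff.left_comments, to be displayed as outdated comments.
leftover = diffset.comments_store[u'setup.py']  # -> {'n999': [...]}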
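For context on the per-line highlight pipeline this class drives, a small sketch of tokens_diff() feeding render_tokenstream(); the module path is again assumed, and 'k'/'nf' are just plausible pygments CSS classes standing in for real tokenize_string() output.

# Minimal sketch, assuming the module path rhodecode.lib.codeblocks; the
# (css_class, text) tuples mimic what tokenize_string() yields for one line.
from rhodecode.lib.codeblocks import tokens_diff, render_tokenstream

old_line = [('k', u'def'), ('', u' '), ('nf', u'render')]
new_line = [('k', u'def'), ('', u' '), ('nf', u'render_patch')]

# tokens_diff() upgrades (class, text) pairs to (class, op, text) triples,
# with op in ('', 'del', 'ins'), plus a similarity ratio for the two lines.
old_tokens, new_tokens, similarity = tokens_diff(old_line, new_line)

# render_tokenstream() rolls consecutive tokens of one class into a single
# <span>, wrapping inserted/deleted runs in <ins>/<del>; roughly:
#   <span class="k">def</span><span> </span><span class="nf">render<ins>_patch</ins></span>
print(render_tokenstream(new_tokens))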