upstream/mercurial-mirror Commit - r32042:8f8ad013

1

# worker.py - master-slave parallelism support

1

# worker.py - master-slave parallelism support

2

#

2

#

3

4

#

4

#

5

# This software may be used and distributed according to the terms of the

5

# This software may be used and distributed according to the terms of the

6

# GNU General Public License version 2 or any later version.

6

# GNU General Public License version 2 or any later version.

7

8

from __future__ import absolute_import

8

from __future__ import absolute_import

9

10

import errno

10

import errno

11

import os

11

import os

12

import signal

12

import signal

13

import sys

13

import sys

14

15

from .i18n import _

15

from .i18n import _

16

from . import (

16

from . import (

17

encoding,

17

encoding,

18

error,

18

error,

19

pycompat,

19

pycompat,

20

scmutil,

20

scmutil,

21

util,

21

util,

22

)

22

)

23

24

def countcpus():

24

def countcpus():

25

'''try to count the number of CPUs on the system'''

25

'''try to count the number of CPUs on the system'''

26

27

# posix

27

# posix

28

try:

28

try:

29

n = int(os.sysconf('SC_NPROCESSORS_ONLN'))

29

n = int(os.sysconf('SC_NPROCESSORS_ONLN'))

30

if n > 0:

30

if n > 0:

31

return n

31

return n

32

except (AttributeError, ValueError):

32

except (AttributeError, ValueError):

33

pass

33

pass

34

35

# windows

35

# windows

36

try:

36

try:

37

n = int(encoding.environ['NUMBER_OF_PROCESSORS'])

37

n = int(encoding.environ['NUMBER_OF_PROCESSORS'])

38

if n > 0:

38

if n > 0:

39

return n

39

return n

40

except (KeyError, ValueError):

40

except (KeyError, ValueError):

41

pass

41

pass

42

43

return 1

43

return 1

44

45

def _numworkers(ui):

45

def _numworkers(ui):

46

s = ui.config('worker', 'numcpus')

46

s = ui.config('worker', 'numcpus')

47

if s:

47

if s:

48

try:

48

try:

49

n = int(s)

49

n = int(s)

50

if n >= 1:

50

if n >= 1:

51

return n

51

return n

52

except ValueError:

52

except ValueError:

53

raise error.Abort(_('number of cpus must be an integer'))

53

raise error.Abort(_('number of cpus must be an integer'))

54

return min(max(countcpus(), 4), 32)

54

return min(max(countcpus(), 4), 32)

55

56

if pycompat.osname == 'posix':

56

if pycompat.osname == 'posix':

57

_startupcost = 0.01

57

_startupcost = 0.01

58

else:

58

else:

59

_startupcost = 1e30

59

_startupcost = 1e30

60

61

def worthwhile(ui, costperop, nops):

61

def worthwhile(ui, costperop, nops):

62

'''try to determine whether the benefit of multiple processes can

62

'''try to determine whether the benefit of multiple processes can

63

outweigh the cost of starting them'''

63

outweigh the cost of starting them'''

64

linear = costperop * nops

64

linear = costperop * nops

65

workers = _numworkers(ui)

65

workers = _numworkers(ui)

66

benefit = linear - (_startupcost * workers + linear / workers)

66

benefit = linear - (_startupcost * workers + linear / workers)

67

return benefit >= 0.15

67

return benefit >= 0.15

68

69

def worker(ui, costperarg, func, staticargs, args):

69

def worker(ui, costperarg, func, staticargs, args):

70

'''run a function, possibly in parallel in multiple worker

70

'''run a function, possibly in parallel in multiple worker

71

processes.

71

processes.

72

73

returns a progress iterator

73

returns a progress iterator

74

75

costperarg - cost of a single task

75

costperarg - cost of a single task

76

77

func - function to run

77

func - function to run

78

79

staticargs - arguments to pass to every invocation of the function

79

staticargs - arguments to pass to every invocation of the function

80

81

args - arguments to split into chunks, to pass to individual

81

args - arguments to split into chunks, to pass to individual

82

workers

82

workers

83

'''

83

'''

84

if worthwhile(ui, costperarg, len(args)):

84

if worthwhile(ui, costperarg, len(args)):

85

return _platformworker(ui, func, staticargs, args)

85

return _platformworker(ui, func, staticargs, args)

86

return func(*staticargs + (args,))

86

return func(*staticargs + (args,))

87

88

def _posixworker(ui, func, staticargs, args):

88

def _posixworker(ui, func, staticargs, args):

89

rfd, wfd = os.pipe()

89

rfd, wfd = os.pipe()

90

workers = _numworkers(ui)

90

workers = _numworkers(ui)

91

oldhandler = signal.getsignal(signal.SIGINT)

91

oldhandler = signal.getsignal(signal.SIGINT)

92

signal.signal(signal.SIGINT, signal.SIG_IGN)

92

signal.signal(signal.SIGINT, signal.SIG_IGN)

93

pids, problem = set(), [0]

93

pids, problem = set(), [0]

94

def killworkers():

94

def killworkers():

95

# unregister SIGCHLD handler as all children will be killed. This

95

# unregister SIGCHLD handler as all children will be killed. This

96

# function shouldn't be interrupted by another SIGCHLD; otherwise pids

96

# function shouldn't be interrupted by another SIGCHLD; otherwise pids

97

# could be updated while iterating, which would cause inconsistency.

97

# could be updated while iterating, which would cause inconsistency.

98

signal.signal(signal.SIGCHLD, oldchldhandler)

98

signal.signal(signal.SIGCHLD, oldchldhandler)

99

# if one worker bails, there's no good reason to wait for the rest

99

# if one worker bails, there's no good reason to wait for the rest

100

for p in pids:

100

for p in pids:

101

try:

101

try:

102

os.kill(p, signal.SIGTERM)

102

os.kill(p, signal.SIGTERM)

103

except OSError as err:

103

except OSError as err:

104

if err.errno != errno.ESRCH:

104

if err.errno != errno.ESRCH:

105

raise

105

raise

106

def waitforworkers(blocking=True):

106

def waitforworkers(blocking=True):

107

for pid in pids.copy():

107

for pid in pids.copy():

108

p = st = 0

108

p = st = 0

109

while True:

109

while True:

110

try:

110

try:

111

p, st = os.waitpid(pid, (0 if blocking else os.WNOHANG))

111

p, st = os.waitpid(pid, (0 if blocking else os.WNOHANG))

112

break

112

break

113

except OSError as e:

113

except OSError as e:

114

if e.errno == errno.EINTR:

114

if e.errno == errno.EINTR:

115

continue

115

continue

116

elif e.errno == errno.ECHILD:

116

elif e.errno == errno.ECHILD:

117

# child would already be reaped, but pids yet been

117

# child would already be reaped, but pids yet been

118

# updated (maybe interrupted just after waitpid)

118

# updated (maybe interrupted just after waitpid)

119

pids.discard(pid)

119

pids.discard(pid)

120

break

120

break

121

else:

121

else:

122

raise

122

raise

123

if not p:

123

if not p:

124

# skip subsequent steps, because child process should

124

# skip subsequent steps, because child process should

125

# be still running in this case

125

# be still running in this case

126

continue

126

continue

127

pids.discard(p)

127

pids.discard(p)

128

st = _exitstatus(st)

128

st = _exitstatus(st)

129

if st and not problem[0]:

129

if st and not problem[0]:

130

problem[0] = st

130

problem[0] = st

131

def sigchldhandler(signum, frame):

131

def sigchldhandler(signum, frame):

132

waitforworkers(blocking=False)

132

waitforworkers(blocking=False)

133

if problem[0]:

133

if problem[0]:

134

killworkers()

134

killworkers()

135

oldchldhandler = signal.signal(signal.SIGCHLD, sigchldhandler)

135

oldchldhandler = signal.signal(signal.SIGCHLD, sigchldhandler)

136

ui.flush()

136

ui.flush()

137

for pargs in partition(args, workers):

137

for pargs in partition(args, workers):

138

pid = os.fork()

138

pid = os.fork()

139

if pid == 0:

139

if pid == 0:

140

signal.signal(signal.SIGINT, oldhandler)

140

signal.signal(signal.SIGINT, oldhandler)

141

signal.signal(signal.SIGCHLD, oldchldhandler)

141

signal.signal(signal.SIGCHLD, oldchldhandler)

142

143

def workerfunc():

143

def workerfunc():

144

os.close(rfd)

144

os.close(rfd)

145

for i, item in func(*(staticargs + (pargs,))):

145

for i, item in func(*(staticargs + (pargs,))):

146

os.write(wfd, '%d %s\n' % (i, item))

146

os.write(wfd, '%d %s\n' % (i, item))

147

return 0

147

148

# make sure we use os._exit in all code paths. otherwise the worker

149

# make sure we use os._exit in all code paths. otherwise the worker

149

# may do some clean-ups which could cause surprises like deadlock.

150

# may do some clean-ups which could cause surprises like deadlock.

150

# see sshpeer.cleanup for example.

151

# see sshpeer.cleanup for example.

152

ret = 0

151

try:

153

try:

152

try:

154

try:

153

scmutil.callcatch(ui, workerfunc)

155

ret = scmutil.callcatch(ui, workerfunc)

154

finally:

156

finally:

155

ui.flush()

157

ui.flush()

156

except KeyboardInterrupt:

158

except KeyboardInterrupt:

157

os._exit(255)

159

os._exit(255)

158

except: # never return, therefore no re-raises

160

except: # never return, therefore no re-raises

159

try:

161

try:

160

ui.traceback()

162

ui.traceback()

161

ui.flush()

163

ui.flush()

162

finally:

164

finally:

163

os._exit(255)

165

os._exit(255)

164

else:

166

else:

165

os._exit(0)

167

os._exit(ret & 255)

166

pids.add(pid)

168

pids.add(pid)

167

os.close(wfd)

169

os.close(wfd)

168

fp = os.fdopen(rfd, pycompat.sysstr('rb'), 0)

170

fp = os.fdopen(rfd, pycompat.sysstr('rb'), 0)

169

def cleanup():

171

def cleanup():

170

signal.signal(signal.SIGINT, oldhandler)

172

signal.signal(signal.SIGINT, oldhandler)

171

waitforworkers()

173

waitforworkers()

172

signal.signal(signal.SIGCHLD, oldchldhandler)

174

signal.signal(signal.SIGCHLD, oldchldhandler)

173

status = problem[0]

175

status = problem[0]

174

if status:

176

if status:

175

if status < 0:

177

if status < 0:

176

os.kill(os.getpid(), -status)

178

os.kill(os.getpid(), -status)

177

sys.exit(status)

179

sys.exit(status)

178

try:

180

try:

179

for line in util.iterfile(fp):

181

for line in util.iterfile(fp):

180

l = line.split(' ', 1)

182

l = line.split(' ', 1)

181

yield int(l[0]), l[1][:-1]

183

yield int(l[0]), l[1][:-1]

182

except: # re-raises

184

except: # re-raises

183

killworkers()

185

killworkers()

184

cleanup()

186

cleanup()

185

raise

187

raise

186

cleanup()

188

cleanup()

187

189

188

def _posixexitstatus(code):

190

def _posixexitstatus(code):

189

'''convert a posix exit status into the same form returned by

191

'''convert a posix exit status into the same form returned by

190

os.spawnv

192

os.spawnv

191

193

192

returns None if the process was stopped instead of exiting'''

194

returns None if the process was stopped instead of exiting'''

193

if os.WIFEXITED(code):

195

if os.WIFEXITED(code):

194

return os.WEXITSTATUS(code)

196

return os.WEXITSTATUS(code)

195

elif os.WIFSIGNALED(code):

197

elif os.WIFSIGNALED(code):

196

return -os.WTERMSIG(code)

198

return -os.WTERMSIG(code)

197

199

198

if pycompat.osname != 'nt':

200

if pycompat.osname != 'nt':

199

_platformworker = _posixworker

201

_platformworker = _posixworker

200

_exitstatus = _posixexitstatus

202

_exitstatus = _posixexitstatus

201

203

202

def partition(lst, nslices):

204

def partition(lst, nslices):

203

'''partition a list into N slices of roughly equal size

205

'''partition a list into N slices of roughly equal size

204

206

205

The current strategy takes every Nth element from the input. If

207

The current strategy takes every Nth element from the input. If

206

we ever write workers that need to preserve grouping in input

208

we ever write workers that need to preserve grouping in input

207

we should consider allowing callers to specify a partition strategy.

209

we should consider allowing callers to specify a partition strategy.

208

210

209

mpm is not a fan of this partitioning strategy when files are involved.

211

mpm is not a fan of this partitioning strategy when files are involved.

210

In his words:

212

In his words:

211

213

212

Single-threaded Mercurial makes a point of creating and visiting

214

Single-threaded Mercurial makes a point of creating and visiting

213

files in a fixed order (alphabetical). When creating files in order,

215

files in a fixed order (alphabetical). When creating files in order,

214

a typical filesystem is likely to allocate them on nearby regions on

216

a typical filesystem is likely to allocate them on nearby regions on

215

disk. Thus, when revisiting in the same order, locality is maximized

217

disk. Thus, when revisiting in the same order, locality is maximized

216

and various forms of OS and disk-level caching and read-ahead get a

218

and various forms of OS and disk-level caching and read-ahead get a

217

chance to work.

219

chance to work.

218

220

219

This effect can be quite significant on spinning disks. I discovered it

221

This effect can be quite significant on spinning disks. I discovered it

220

circa Mercurial v0.4 when revlogs were named by hashes of filenames.

222

circa Mercurial v0.4 when revlogs were named by hashes of filenames.

221

Tarring a repo and copying it to another disk effectively randomized

223

Tarring a repo and copying it to another disk effectively randomized

222

the revlog ordering on disk by sorting the revlogs by hash and suddenly

224

the revlog ordering on disk by sorting the revlogs by hash and suddenly

223

performance of my kernel checkout benchmark dropped by ~10x because the

225

performance of my kernel checkout benchmark dropped by ~10x because the

224

"working set" of sectors visited no longer fit in the drive's cache and

226

"working set" of sectors visited no longer fit in the drive's cache and

225

the workload switched from streaming to random I/O.

227

the workload switched from streaming to random I/O.

226

228

227

What we should really be doing is have workers read filenames from a

229

What we should really be doing is have workers read filenames from a

228

ordered queue. This preserves locality and also keeps any worker from

230

ordered queue. This preserves locality and also keeps any worker from

229

getting more than one file out of balance.

231

getting more than one file out of balance.

230

'''

232

'''

231

for i in range(nslices):

233

for i in range(nslices):

232

yield lst[i::nslices]

234

yield lst[i::nslices]

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

             # worker.py - master-slave parallelism support
             #
             # Copyright 2013 Facebook, Inc.
             #
             # This software may be used and distributed according to the terms of the
             # GNU General Public License version 2 or any later version.
             from __future__ import absolute_import
             import errno
             import os
             import signal
             import sys
             from .i18n import _
             from . import (
                 encoding,
                 error,
                 pycompat,
                 scmutil,
                 util,
             )
             def countcpus():
                 '''best-effort count of the CPUs available on this machine

                 The POSIX sysconf value is tried first, then the
                 NUMBER_OF_PROCESSORS environment variable. If neither yields a
                 positive integer the answer is 1.
                 '''
                 # posix: sysconf can be missing altogether (AttributeError) or
                 # not know this name (ValueError)
                 try:
                     online = int(os.sysconf('SC_NPROCESSORS_ONLN'))
                 except (AttributeError, ValueError):
                     online = 0
                 if online > 0:
                     return online

                 # windows: the variable can be unset or hold a non-number
                 try:
                     declared = int(encoding.environ['NUMBER_OF_PROCESSORS'])
                 except (KeyError, ValueError):
                     declared = 0
                 if declared > 0:
                     return declared

                 return 1
             def _numworkers(ui):
                 '''return how many worker processes to use

                 A worker.numcpus setting of 1 or more is used as is. A setting
                 that is not an integer aborts. With no setting, or one below 1,
                 the detected CPU count is used, clamped to the range 4..32.
                 '''
                 configured = ui.config('worker', 'numcpus')
                 if configured:
                     try:
                         requested = int(configured)
                     except ValueError:
                         raise error.Abort(_('number of cpus must be an integer'))
                     if requested >= 1:
                         return requested
                 detected = countcpus()
                 return min(max(detected, 4), 32)
             # Per-worker startup cost fed into worthwhile(). Anywhere but posix the
             # value is so large that worthwhile() effectively never reports a
             # benefit, so the work stays in the calling process there.
             _startupcost = 0.01 if pycompat.osname == 'posix' else 1e30
             def worthwhile(ui, costperop, nops):
                 '''estimate whether spreading the work over several processes
                 beats doing it serially

                 ui - consulted for the number of workers
                 costperop - cost of a single operation
                 nops - number of operations

                 returns True when the estimated saving is at least 0.15
                 '''
                 serialcost = costperop * nops
                 nworkers = _numworkers(ui)
                 # every worker pays the startup cost, then the serial work is
                 # assumed to divide evenly between them
                 parallelcost = _startupcost * nworkers + serialcost / nworkers
                 return serialcost - parallelcost >= 0.15
             def worker(ui, costperarg, func, staticargs, args):
                 '''run func over args, using several worker processes when that is
                 estimated to pay off

                 ui - consulted for the worker count, handed to the platform worker
                 costperarg - cost of a single task
                 func - function to run
                 staticargs - tuple of arguments passed to every invocation of func
                 args - arguments to split into chunks, one chunk per worker

                 returns a progress iterator. When parallelism is not worthwhile
                 this is simply whatever func(*staticargs, args) returns.
                 '''
                 if not worthwhile(ui, costperarg, len(args)):
                     # serial path: a single call gets the whole argument list
                     return func(*(staticargs + (args,)))
                 return _platformworker(ui, func, staticargs, args)
             def _posixworker(ui, func, staticargs, args):
                 '''run func over slices of args in forked worker processes

                 One child is forked per slice produced by partition(args, workers).
                 Each child calls func(*(staticargs + (pargs,))) and writes every
                 (i, item) pair it yields to a shared pipe, one "%d %s" line per
                 pair. The parent reads those lines back and yields
                 (int, item text) tuples.

                 This is a generator, so nothing here (including the forks) runs
                 until it is first iterated.

                 When a worker ends with a non-zero status the remaining workers are
                 sent SIGTERM, and cleanup() ends the parent with that status (a
                 negative status is first re-delivered to the parent as a signal).
                 '''
                 rfd, wfd = os.pipe()
                 workers = _numworkers(ui)
                 # the parent ignores SIGINT while workers exist; each child puts
                 # oldhandler back right after the fork
                 oldhandler = signal.getsignal(signal.SIGINT)
                 signal.signal(signal.SIGINT, signal.SIG_IGN)
                 # pids: children not yet reaped
                 # problem: one-element list holding the first non-zero worker
                 # status, so the nested functions can update it in place
                 pids, problem = set(), [0]
                 def killworkers():
                     # unregister SIGCHLD handler as all children will be killed. This
                     # function shouldn't be interrupted by another SIGCHLD; otherwise pids
                     # could be updated while iterating, which would cause inconsistency.
                     signal.signal(signal.SIGCHLD, oldchldhandler)
                     # if one worker bails, there's no good reason to wait for the rest
                     for p in pids:
                         try:
                             os.kill(p, signal.SIGTERM)
                         except OSError as err:
                             # ESRCH: the process is already gone, nothing to kill
                             if err.errno != errno.ESRCH:
                                 raise
                 # reap children and remember the first non-zero status in problem[0];
                 # iterates over a copy because pids is modified along the way
                 def waitforworkers(blocking=True):
                     for pid in pids.copy():
                         p = st = 0
                         while True:
                             try:
                                 p, st = os.waitpid(pid, (0 if blocking else os.WNOHANG))
                                 break
                             except OSError as e:
                                 if e.errno == errno.EINTR:
                                     continue
                                 elif e.errno == errno.ECHILD:
                                     # child would already be reaped, but pids yet been
                                     # updated (maybe interrupted just after waitpid)
                                     pids.discard(pid)
                                     break
                                 else:
                                     raise
                         if not p:
                             # skip subsequent steps, because child process should
                             # be still running in this case
                             continue
                         pids.discard(p)
                         st = _exitstatus(st)
                         if st and not problem[0]:
                             problem[0] = st
                 def sigchldhandler(signum, frame):
                     waitforworkers(blocking=False)
                     if problem[0]:
                         killworkers()
                 # install the reaper; the previous handler is kept so that it can be
                 # restored in the children, in killworkers() and in cleanup()
                 oldchldhandler = signal.signal(signal.SIGCHLD, sigchldhandler)
                 # NOTE(review): presumably flushed here so pending output is not
                 # duplicated into the forked children - confirm
                 ui.flush()
                 for pargs in partition(args, workers):
                     pid = os.fork()
                     if pid == 0:
                         # child: undo the parent's signal setup
                         signal.signal(signal.SIGINT, oldhandler)
                         signal.signal(signal.SIGCHLD, oldchldhandler)
                         def workerfunc():
                             os.close(rfd)
                             for i, item in func(*(staticargs + (pargs,))):
                                 os.write(wfd, '%d %s\n' % (i, item))
+                            return 0
                         # make sure we use os._exit in all code paths. otherwise the worker
                         # may do some clean-ups which could cause surprises like deadlock.
                         # see sshpeer.cleanup for example.
+                        ret = 0
                         try:
                             try:
-                                scmutil.callcatch(ui, workerfunc)
+                                ret = scmutil.callcatch(ui, workerfunc)
                             finally:
                                 ui.flush()
                         except KeyboardInterrupt:
                             os._exit(255)
                         except: # never return, therefore no re-raises
                             try:
                                 ui.traceback()
                                 ui.flush()
                             finally:
                                 os._exit(255)
                         else:
-                            os._exit(0)
+                            os._exit(ret & 255)
                     # parent only: every branch of the child block ends in os._exit
                     pids.add(pid)
                 # the parent only reads; closing its copy of the write end lets the
                 # read loop below see EOF once every child has closed theirs
                 os.close(wfd)
                 # buffering argument 0: unbuffered file object
                 fp = os.fdopen(rfd, pycompat.sysstr('rb'), 0)
                 def cleanup():
                     signal.signal(signal.SIGINT, oldhandler)
                     waitforworkers()
                     signal.signal(signal.SIGCHLD, oldchldhandler)
                     status = problem[0]
                     if status:
                         if status < 0:
                             # a negative status is a negated signal number (see
                             # _posixexitstatus); deliver that signal to ourselves
                             os.kill(os.getpid(), -status)
                         sys.exit(status)
                 try:
                     for line in util.iterfile(fp):
                         # split off the leading integer; [:-1] drops the newline
                         # that terminates each line
                         l = line.split(' ', 1)
                         yield int(l[0]), l[1][:-1]
                 except: # re-raises
                     killworkers()
                     cleanup()
                     raise
                 cleanup()
             def _posixexitstatus(code):
                 '''translate a wait status into the form os.spawnv returns

                 code - status value as reported by os.waitpid

                 returns the exit code for a normal exit, the negated signal
                 number when the process was killed by a signal, and None when
                 the process was stopped instead of exiting
                 '''
                 if os.WIFEXITED(code):
                     return os.WEXITSTATUS(code)
                 if os.WIFSIGNALED(code):
                     return -os.WTERMSIG(code)
                 # neither exited nor killed
                 return None
             if pycompat.osname != 'nt':
                 # everything except Windows gets the fork()-based implementation;
                 # on 'nt' these two names are left undefined
                 _platformworker, _exitstatus = _posixworker, _posixexitstatus
             def partition(lst, nslices):
                 '''split a list into nslices interleaved slices of roughly equal size

                 Slice number k holds elements k, k + nslices, k + 2 * nslices, ...
                 of the input, i.e. every Nth element. Slices are yielded one at a
                 time; some are empty when the input is shorter than nslices.

                 If we ever write workers that need to preserve grouping in input
                 we should consider allowing callers to specify a partition strategy.

                 mpm is not a fan of this partitioning strategy when files are involved.
                 In his words:

                     Single-threaded Mercurial makes a point of creating and visiting
                     files in a fixed order (alphabetical). When creating files in order,
                     a typical filesystem is likely to allocate them on nearby regions on
                     disk. Thus, when revisiting in the same order, locality is maximized
                     and various forms of OS and disk-level caching and read-ahead get a
                     chance to work.

                     This effect can be quite significant on spinning disks. I discovered it
                     circa Mercurial v0.4 when revlogs were named by hashes of filenames.
                     Tarring a repo and copying it to another disk effectively randomized
                     the revlog ordering on disk by sorting the revlogs by hash and suddenly
                     performance of my kernel checkout benchmark dropped by ~10x because the
                     "working set" of sectors visited no longer fit in the drive's cache and
                     the workload switched from streaming to random I/O.

                     What we should really be doing is have workers read filenames from a
                     ordered queue. This preserves locality and also keeps any worker from
                     getting more than one file out of balance.
                 '''
                 for offset in range(nslices):
                     # extended slice: start at offset, then step by nslices
                     chunk = lst[offset::nslices]
                     yield chunk

             Test UI worker interaction
               $ cat > t.py <<EOF
               > from __future__ import absolute_import, print_function
               > from mercurial import (
               >     cmdutil,
               >     error,
               >     ui as uimod,
               >     worker,
               > )
               > def abort(ui, args):
               >     if args[0] == 0:
               >         # by first worker for test stability
               >         raise error.Abort('known exception')
               >     return runme(ui, [])
               > def runme(ui, args):
               >     for arg in args:
               >         ui.status('run\n')
               >         yield 1, arg
               > functable = {
               >     'abort': abort,
               >     'runme': runme,
               > }
               > cmdtable = {}
               > command = cmdutil.command(cmdtable)
               > @command('test', [], 'hg test [COST] [FUNC]')
               > def t(ui, repo, cost=1.0, func='runme'):
               >     cost = float(cost)
               >     func = functable[func]
               >     ui.status('start\n')
               >     runs = worker.worker(ui, cost, func, (ui,), range(8))
               >     for n, i in runs:
               >         pass
               >     ui.status('done\n')
               > EOF
               $ abspath=`pwd`/t.py
               $ hg init
             Run tests with worker enabled by forcing a high cost
               $ hg --config "extensions.t=$abspath" test 100000.0
               start
               run
               run
               run
               run
               run
               run
               run
               run
               done
             Run tests without worker by forcing a low cost
               $ hg --config "extensions.t=$abspath" test 0.0000001
               start
               run
               run
               run
               run
               run
               run
               run
               run
               done
             Known exception should be caught, but printed if --traceback is enabled
               $ hg --config "extensions.t=$abspath" --config 'worker.numcpus=2' \
               > test 100000.0 abort
               start
               abort: known exception
-              done
+              [255]
               $ hg --config "extensions.t=$abspath" --config 'worker.numcpus=2' \
               > test 100000.0 abort --traceback 2>&1 | grep '^Traceback'
               Traceback (most recent call last):
+              Traceback (most recent call last):