worker.py
224 lines
| 7.2 KiB
| text/x-python
|
PythonLexer
/ mercurial / worker.py
Bryan O'Sullivan
|
# worker.py - master-slave parallelism support
#
# Copyright 2013 Facebook, Inc.
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.
Gregory Szorc
|
r25992 | from __future__ import absolute_import | ||
import errno | ||||
import os | ||||
import signal | ||||
import sys | ||||
from .i18n import _ | ||||
Jun Wu
|
r30396 | from . import ( | ||
Pulkit Goyal
|
r30635 | encoding, | ||
Jun Wu
|
r30396 | error, | ||
Pulkit Goyal
|
r30639 | pycompat, | ||
Jun Wu
|
r30521 | scmutil, | ||
Jun Wu
|
r30396 | util, | ||
) | ||||
Bryan O'Sullivan
|
r18635 | |||
def countcpus():
    '''try to count the number of CPUs on the system

    Tries the POSIX sysconf value first, then the Windows
    NUMBER_OF_PROCESSORS environment variable. Returns 1 when neither
    source yields a positive integer.
    '''

    # posix
    try:
        n = int(os.sysconf('SC_NPROCESSORS_ONLN'))
        if n > 0:
            return n
    except (AttributeError, ValueError, OSError):
        # AttributeError: os.sysconf is missing on this platform
        # ValueError: the configuration name is unknown
        # OSError: the name is known but not supported by the host
        pass

    # windows
    try:
        n = int(encoding.environ['NUMBER_OF_PROCESSORS'])
        if n > 0:
            return n
    except (KeyError, ValueError):
        pass

    return 1
Bryan O'Sullivan
|
r18636 | |||
def _numworkers(ui): | ||||
s = ui.config('worker', 'numcpus') | ||||
if s: | ||||
try: | ||||
n = int(s) | ||||
if n >= 1: | ||||
return n | ||||
except ValueError: | ||||
Pierre-Yves David
|
r26587 | raise error.Abort(_('number of cpus must be an integer')) | ||
Bryan O'Sullivan
|
r18636 | return min(max(countcpus(), 4), 32) | ||
Pulkit Goyal
|
# Per-worker startup cost used by worthwhile(). The huge value on
# non-posix platforms makes the computed benefit negative, so
# worthwhile() always answers False there.
if pycompat.osname == 'posix':
    _startupcost = 0.01
else:
    _startupcost = 1e30
def worthwhile(ui, costperop, nops):
    '''try to determine whether the benefit of multiple processes can
    outweigh the cost of starting them

    ui - passed to _numworkers() to obtain the worker count

    costperop - estimated cost of a single operation

    nops - number of operations to perform

    returns True when the estimated saving is at least 0.15
    '''
    # cost of doing everything in a single process
    linear = costperop * nops
    workers = _numworkers(ui)
    # saving = serial cost - (startup of all workers + evenly split work)
    benefit = linear - (_startupcost * workers + linear / workers)
    return benefit >= 0.15
Bryan O'Sullivan
|
r18637 | |||
Bryan O'Sullivan
|
def worker(ui, costperarg, func, staticargs, args):
    '''run a function, possibly in parallel in multiple worker
    processes.

    returns a progress iterator

    costperarg - cost of a single task

    func - function to run

    staticargs - arguments to pass to every invocation of the function

    args - arguments to split into chunks, to pass to individual
    workers
    '''
    if worthwhile(ui, costperarg, len(args)):
        return _platformworker(ui, func, staticargs, args)
    # not worth forking: run everything in this process
    return func(*staticargs + (args,))
def _posixworker(ui, func, staticargs, args):
    '''run func over args in forked worker processes (generator)

    args is split with partition() into one slice per worker; each
    forked child calls func(*(staticargs + (slice,))) and writes every
    (i, item) pair it produces to a shared pipe as a '%d %s\\n' line.
    The parent reads those lines back and yields (int, str) tuples.

    While workers run, SIGINT is ignored in the parent and a SIGCHLD
    handler reaps finished children. The first non-zero child status is
    recorded; once one is seen the remaining workers are sent SIGTERM.
    When the run ends (normally or through an exception) the previous
    signal handlers are restored and, if a worker failed, the parent
    exits with that status (or re-raises the terminating signal on
    itself when the status is negative).
    '''
    rfd, wfd = os.pipe()
    workers = _numworkers(ui)
    oldhandler = signal.getsignal(signal.SIGINT)
    signal.signal(signal.SIGINT, signal.SIG_IGN)
    # problem is a one-element list so the nested functions below can
    # update the recorded status in place
    pids, problem = set(), [0]
    def killworkers():
        # unregister SIGCHLD handler as all children will be killed. This
        # function shouldn't be interrupted by another SIGCHLD; otherwise pids
        # could be updated while iterating, which would cause inconsistency.
        signal.signal(signal.SIGCHLD, oldchldhandler)
        # if one worker bails, there's no good reason to wait for the rest
        for p in pids:
            try:
                os.kill(p, signal.SIGTERM)
            except OSError as err:
                # ESRCH: the process is already gone, nothing to kill
                if err.errno != errno.ESRCH:
                    raise
    def waitforworkers(blocking=True):
        # iterate over a copy: pids is modified inside the loop
        for pid in pids.copy():
            p = st = 0
            while True:
                try:
                    p, st = os.waitpid(pid, (0 if blocking else os.WNOHANG))
                    break
                except OSError as e:
                    if e.errno == errno.EINTR:
                        # interrupted by a signal: retry the wait
                        continue
                    elif e.errno == errno.ECHILD:
                        # child would already be reaped, but pids yet been
                        # updated (maybe interrupted just after waitpid)
                        pids.discard(pid)
                        break
                    else:
                        raise
            # p stays 0 when a non-blocking wait found the child still
            # running
            if p:
                pids.discard(p)
                st = _exitstatus(st)
            # remember only the first failure
            if st and not problem[0]:
                problem[0] = st
    def sigchldhandler(signum, frame):
        waitforworkers(blocking=False)
        if problem[0]:
            killworkers()
    oldchldhandler = signal.signal(signal.SIGCHLD, sigchldhandler)
    for pargs in partition(args, workers):
        pid = os.fork()
        if pid == 0:
            # child: restore the handlers the parent had before the run
            signal.signal(signal.SIGINT, oldhandler)
            signal.signal(signal.SIGCHLD, oldchldhandler)

            def workerfunc():
                os.close(rfd)
                for i, item in func(*(staticargs + (pargs,))):
                    os.write(wfd, '%d %s\n' % (i, item))

            # make sure we use os._exit in all code paths. otherwise the worker
            # may do some clean-ups which could cause surprises like deadlock.
            # see sshpeer.cleanup for example.
            try:
                scmutil.callcatch(ui, workerfunc)
            except KeyboardInterrupt:
                os._exit(255)
            except: # never return, therefore no re-raises
                try:
                    ui.traceback()
                finally:
                    os._exit(255)
            else:
                os._exit(0)
        pids.add(pid)
    # parent keeps only the read end of the pipe
    os.close(wfd)
    fp = os.fdopen(rfd, 'rb', 0)
    def cleanup():
        signal.signal(signal.SIGINT, oldhandler)
        waitforworkers()
        signal.signal(signal.SIGCHLD, oldchldhandler)
        status = problem[0]
        if status:
            if status < 0:
                # a worker died from a signal: deliver the same signal
                # to this process
                os.kill(os.getpid(), -status)
            sys.exit(status)
    try:
        for line in util.iterfile(fp):
            l = line.split(' ', 1)
            # l[1][:-1] drops the trailing newline written by the worker
            yield int(l[0]), l[1][:-1]
    except: # re-raises
        killworkers()
        cleanup()
        raise
    cleanup()
Bryan O'Sullivan
|
r18707 | def _posixexitstatus(code): | ||
'''convert a posix exit status into the same form returned by | ||||
os.spawnv | ||||
returns None if the process was stopped instead of exiting''' | ||||
if os.WIFEXITED(code): | ||||
return os.WEXITSTATUS(code) | ||||
elif os.WIFSIGNALED(code): | ||||
return -os.WTERMSIG(code) | ||||
Pulkit Goyal
|
# Select the platform implementation used by worker(). Only defined on
# non-Windows platforms.
if pycompat.osname != 'nt':
    _platformworker = _posixworker
    _exitstatus = _posixexitstatus
Bryan O'Sullivan
|
r18638 | |||
Bryan O'Sullivan
|
def partition(lst, nslices):
    '''partition a list into N slices of roughly equal size

    The current strategy takes every Nth element from the input. If
    we ever write workers that need to preserve grouping in input
    we should consider allowing callers to specify a partition strategy.

    mpm is not a fan of this partitioning strategy when files are involved.
    In his words:

        Single-threaded Mercurial makes a point of creating and visiting
        files in a fixed order (alphabetical). When creating files in order,
        a typical filesystem is likely to allocate them on nearby regions on
        disk. Thus, when revisiting in the same order, locality is maximized
        and various forms of OS and disk-level caching and read-ahead get a
        chance to work.

        This effect can be quite significant on spinning disks. I discovered it
        circa Mercurial v0.4 when revlogs were named by hashes of filenames.
        Tarring a repo and copying it to another disk effectively randomized
        the revlog ordering on disk by sorting the revlogs by hash and suddenly
        performance of my kernel checkout benchmark dropped by ~10x because the
        "working set" of sectors visited no longer fit in the drive's cache and
        the workload switched from streaming to random I/O.

        What we should really be doing is have workers read filenames from an
        ordered queue. This preserves locality and also keeps any worker from
        getting more than one file out of balance.
    '''
    # slice i holds elements i, i + nslices, i + 2 * nslices, ...
    for i in range(nslices):
        yield lst[i::nslices]