diff --git a/tests/run-tests.py b/tests/run-tests.py
--- a/tests/run-tests.py
+++ b/tests/run-tests.py
@@ -1390,16 +1390,19 @@ class TestSuite(unittest.TestSuite):
                 done.put(('!', test, 'run-test raised an error, see traceback'))
                 raise
 
+        stoppedearly = False
+
         try:
             while tests or running:
                 if not done.empty() or running == self._jobs or not tests:
                     try:
                         done.get(True, 1)
+                        running -= 1
                         if result and result.shouldStop:
+                            stoppedearly = True
                             break
                     except queue.Empty:
                         continue
-                    running -= 1
                 if tests and not running == self._jobs:
                     test = tests.pop(0)
                     if self._loop:
@@ -1413,6 +1416,18 @@ class TestSuite(unittest.TestSuite):
                                          args=(test, result))
                     t.start()
                     running += 1
+
+            # If we stop early we still need to wait on started tests to
+            # finish. Otherwise, there is a race between the test completing
+            # and the test's cleanup code running. This could result in the
+            # test reporting incorrect.
+            if stoppedearly:
+                while running:
+                    try:
+                        done.get(True, 1)
+                        running -= 1
+                    except queue.Empty:
+                        continue
         except KeyboardInterrupt:
             for test in runtests:
                 test.abort()
diff --git a/tests/test-run-tests.t b/tests/test-run-tests.t
--- a/tests/test-run-tests.t
+++ b/tests/test-run-tests.t
@@ -265,7 +265,8 @@ failures in parallel with --first should
    this test is still more bytes than success.
   
   Failed test-failure*.t: output changed (glob)
-  # Ran 2 tests, 0 skipped, 0 warned, 1 failed.
+  Failed test-nothing.t: output changed
+  # Ran 2 tests, 0 skipped, 0 warned, 2 failed.
   python hash seed: * (glob)
   [1]