[SPARK-1740] [PySpark] kill the python worker #1643
Changes from 1 commit
PythonWorkerFactory.scala:

```diff
@@ -17,9 +17,11 @@
 package org.apache.spark.api.python
 
+import java.lang.Runtime
 import java.io.{DataInputStream, InputStream, OutputStreamWriter}
 import java.net.{InetAddress, ServerSocket, Socket, SocketException}
 
+import scala.collection.mutable
 import scala.collection.JavaConversions._
 
 import org.apache.spark._
```
```diff
@@ -39,6 +41,9 @@ private[spark] class PythonWorkerFactory(pythonExec: String, envVars: Map[String
   var daemon: Process = null
   val daemonHost = InetAddress.getByAddress(Array(127, 0, 0, 1))
   var daemonPort: Int = 0
+  var daemonWorkers = new mutable.WeakHashMap[Socket, Int]()
+
+  var simpleWorkers = new mutable.WeakHashMap[Socket, Process]()
 
   val pythonPath = PythonUtils.mergePythonPaths(
     PythonUtils.sparkPythonPath,
```
```diff
@@ -65,10 +70,11 @@
       // Attempt to connect, restart and retry once if it fails
       try {
         val socket = new Socket(daemonHost, daemonPort)
-        val launchStatus = new DataInputStream(socket.getInputStream).readInt()
-        if (launchStatus != 0) {
+        val pid = new DataInputStream(socket.getInputStream).readInt()
+        if (pid < 0) {
           throw new IllegalStateException("Python daemon failed to launch worker")
         }
+        daemonWorkers.put(socket, pid)
         socket
       } catch {
         case exc: SocketException =>
```
```diff
@@ -107,7 +113,9 @@
       // Wait for it to connect to our socket
       serverSocket.setSoTimeout(10000)
       try {
-        return serverSocket.accept()
+        val socket = serverSocket.accept()
+        simpleWorkers.put(socket, pb)
+        return socket
       } catch {
         case e: Exception =>
           throw new SparkException("Python worker did not connect back in time", e)
```
```diff
@@ -189,19 +197,34 @@ private[spark] class PythonWorkerFactory(pythonExec: String, envVars: Map[String
   private def stopDaemon() {
     synchronized {
-      // Request shutdown of existing daemon by sending SIGTERM
-      if (daemon != null) {
-        daemon.destroy()
-      }
+      if (useDaemon) {
+        // Request shutdown of existing daemon by sending SIGTERM
+        if (daemon != null) {
+          daemon.destroy()
+        }
 
-      daemon = null
-      daemonPort = 0
+        daemon = null
+        daemonPort = 0
+      } else {
+        simpleWorkers.mapValues(_.destroy())
+      }
     }
   }
 
   def stop() {
     stopDaemon()
   }
 
+  def stopWorker(worker: Socket) {
+    if (useDaemon) {
+      daemonWorkers.get(worker).foreach {
```
Contributor: The other accesses of …

Contributor: Actually, I think the current synchronization is fine: every call of `PythonWorkerFactory`'s public methods is guarded by `SparkEnv`'s lock.
```diff
+        pid => Runtime.getRuntime.exec("kill " + pid.toString)
+      }
+    } else {
+      simpleWorkers.get(worker).foreach(_.destroy())
+    }
+    worker.close()
+  }
 }
 
 private object PythonWorkerFactory {
```
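The synchronization comment above refers to how callers reach the factory through `SparkEnv` rather than calling it directly. A rough sketch of that guard is shown below; the `SparkEnv` side is not part of this diff, so the names `pythonWorkers`, `createPythonWorker`, `destroyPythonWorker`, and `create()` are assumed here for illustration only.

```scala
import java.net.Socket
import scala.collection.mutable

// Hypothetical sketch of the guard described in the review comment above:
// one PythonWorkerFactory per (pythonExec, envVars) pair, with every call
// into the factory serialized on this object's lock. Names are assumed,
// not quoted from this diff.
class SparkEnvSketch {
  private val pythonWorkers =
    mutable.HashMap[(String, Map[String, String]), PythonWorkerFactory]()

  def createPythonWorker(pythonExec: String, envVars: Map[String, String]): Socket = {
    synchronized {
      val key = (pythonExec, envVars)
      pythonWorkers.getOrElseUpdate(key, new PythonWorkerFactory(pythonExec, envVars)).create()
    }
  }

  def destroyPythonWorker(pythonExec: String, envVars: Map[String, String], worker: Socket) {
    synchronized {
      val key = (pythonExec, envVars)
      pythonWorkers.get(key).foreach(_.stopWorker(worker))
    }
  }
}
```

Under this pattern the factory's own maps are never touched by two threads at once, which is why the unsynchronized `daemonWorkers.get(worker)` above can be considered safe.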
pyspark/daemon.py:
```diff
@@ -67,7 +67,8 @@ def waitSocketClose(sock):
     outfile = os.fdopen(os.dup(sock.fileno()), "a+", 65536)
     exit_code = 0
     try:
-        write_int(0, outfile)  # Acknowledge that the fork was successful
+        # Acknowledge that the fork was successful
+        write_int(os.getpid(), outfile)
         outfile.flush()
         worker_main(infile, outfile)
     except SystemExit as exc:
```
```diff
@@ -131,8 +132,8 @@ def handle_sigchld(*args):
                 sock, addr = listen_sock.accept()
                 # Launch a worker process
                 try:
-                    fork_return_code = os.fork()
-                    if fork_return_code == 0:
+                    pid = os.fork()
+                    if pid == 0:
                         listen_sock.close()
                         try:
                             worker(sock)
```
```diff
@@ -141,13 +142,17 @@ def handle_sigchld(*args):
                             os._exit(1)
                         else:
                             os._exit(0)
-                    else:
+                    elif pid > 0:
                         sock.close()
+                    else:
+                        raise OSError("fork failed")
```
Contributor: I think that …
```diff
 
                 except OSError as e:
                     print >> sys.stderr, "Daemon failed to fork PySpark worker: %s" % e
+                    outfile = os.fdopen(os.dup(sock.fileno()), "a+", 65536)
+                    write_int(-1, outfile)  # Signal that the fork failed
+                    outfile.flush()
+                    outfile.close()
                     sock.close()
     finally:
         shutdown(1)
```
It looks like `simpleWorkers` is declared as a map of `Process`, but here you're storing a `ProcessBuilder`; IntelliJ displays this as an error, but it still seems to compile. Any idea what's going on here?

As an experiment, I added the line … and, as expected, this results in a `java.lang.ProcessBuilder cannot be cast to java.lang.Process` error. The compiler should have prevented this, so I think we've found a compiler bug.

Yep, this looks like a compiler bug. This file compiles in 2.10.4 and gives the expected error in 2.11.2:

For the curious, here are the actual implicit conversions that led to this bug (run `/scala-2.10.4/bin/scalac -Xprint:typer ImplicitBug.scala` to get this output):
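The `ImplicitBug.scala` file and the `-Xprint:typer` output referenced above are not reproduced here. As a rough, hypothetical sketch of the pattern under discussion — a `ProcessBuilder` stored into a map declared to hold `Process` values, with `JavaConversions` in scope — it would look something like the following; whether this exact file reproduces the 2.10.4 behaviour is an assumption, not something verified in this thread.

```scala
import java.net.Socket
import scala.collection.mutable
import scala.collection.JavaConversions._

// Hypothetical sketch, not the actual ImplicitBug.scala from the thread.
// The map holds Process values, but a ProcessBuilder is stored; per the
// comments above, Scala 2.10.4 accepted the original code while 2.11.2
// rejects it, and reading the value back as a Process fails at runtime with
// "java.lang.ProcessBuilder cannot be cast to java.lang.Process".
object ImplicitBugSketch {
  val simpleWorkers = new mutable.WeakHashMap[Socket, Process]()

  def register(socket: Socket, pb: ProcessBuilder): Unit = {
    simpleWorkers.put(socket, pb)  // should be a type error: ProcessBuilder is not a Process
  }

  def stop(socket: Socket): Unit = {
    // ClassCastException at runtime if a ProcessBuilder was stored above
    simpleWorkers.get(socket).foreach(_.destroy())
  }
}
```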