# 命令行提交Spark应用样例:
#./bin/spark-submit \
#  --class com.imooc.spark.Test.TestOfSparkContext2 \
#  --conf spark.master=spark://localhost:7077 \
#  --master local[2] \
#  /home/hadoop/data/test-jar/sql-1.0.jar arg1 arg2
if [ -z "${SPARK_HOME}" ]; then
  source "$(dirname "$0")"/find-spark-home
# disable randomized hash for string in Python 3.3+
exec "${SPARK_HOME}"/bin/spark-class org.apache.spark.deploy.SparkSubmit "$@"



  override def main(args: Array[String]): Unit = {
  	// 创建自定义SparkSubmit类,使用匿名子类的创建方式来override一些方法
    val submit = new SparkSubmit() {
      self => // 创建SparkSubmit类的一个别名
      // 自定义参数解析类匿名子类对象,主要自定义了如何打印日志
      override protected def parseArguments(args: Array[String]): SparkSubmitArguments = {
        new SparkSubmitArguments(args) {
          override protected def logInfo(msg: => String): Unit = self.logInfo(msg)
          override protected def logWarning(msg: => String): Unit = self.logWarning(msg)
      override protected def logInfo(msg: => String): Unit = printMessage(msg)
      override protected def logWarning(msg: => String): Unit = printMessage(s"Warning: $msg")
      // 重载此方法,主要是添加try...catch语句,捕获异常
      override def doSubmit(args: Array[String]): Unit = {
        try {
        } catch {
          case e: SparkUserAppException =>




  • 从参数--properties-file指定的文件中加载配置信息,作为默认的属性
  • 加载用户通过命令行指定的各项属性,包括--conf | --jars | --class等,作为优先级高于默认属性的配置
  • 如果用户没有通过参数--properties-file指定属性文件,则加载环境变量SPARK_CONF_DIR指定的路径或${SPARK_HOME}/conf路径下的spark-defaults.conf文件中的配置信息,并与前面读取的所有属性合并
  • 加载通过环境变量指定的各种属性,后续在访问每个变量时,优先使用相应的环境变量


/** args函数参数:通过启动脚本接收到的所有在/bin/spark-submit之后的参数 **/
private[deploy] class SparkSubmitArguments(args: Seq[String], env: Map[String, String] = sys.env)
  extends SparkSubmitArgumentsParser with Logging {
  /** Default properties present in the currently defined defaults file. */
  lazy val defaultSparkProperties: HashMap[String, String] = {
    val defaultProperties = new HashMap[String, String]()
    if (verbose) {
      logInfo(s"Using properties file: $propertiesFile")
    Option(propertiesFile).foreach { filename =>
      val properties = Utils.getPropertiesFromFile(filename)
      properties.foreach { case (k, v) =>
        defaultProperties(k) = v
      // Property files may contain sensitive information, so redact before printing
      if (verbose) {
        Utils.redact(properties).foreach { case (k, v) =>
          logInfo(s"Adding default property: $k=$v")
  // Set parameters from command line arguments
  // Populate `sparkProperties` map from properties file
  // Remove keys that don't start with "spark." from `sparkProperties`.
  // Use `sparkProperties` map along with env vars to fill in any missing parameters
  useRest = sparkProperties.getOrElse("spark.master.rest.enabled", "false").toBoolean

从下面代码可以看到Spark CLI支持4种操作,但这里主要关注submit流程,其它方法暂不深究,详细的分析见下一小节。

  def doSubmit(args: Array[String]): Unit = {
    // Initialize logging if it hasn't been done yet. Keep track of whether logging needs to
    // be reset before the application starts.
    val uninitLog = initializeLogIfNecessary(true, silent = true)
    val appArgs = parseArguments(args)
    if (appArgs.verbose) {
    appArgs.action match {
      case SparkSubmitAction.SUBMIT => submit(appArgs, uninitLog)
      case SparkSubmitAction.KILL => kill(appArgs)
      case SparkSubmitAction.REQUEST_STATUS => requestStatus(appArgs)
      case SparkSubmitAction.PRINT_VERSION => printVersion()


* Submit the application using the provided parameters. * This runs in two steps. First, we prepare the launch environment by setting up * the appropriate classpath, system properties, and application arguments for * running the child main class based on the cluster manager and the deploy mode. * Second, we use this launch environment to invoke the main method of the child * main class. @tailrec private def submit(args: SparkSubmitArguments, uninitLog: Boolean): Unit = { // 分类参数,child指用户指定的入口类的子进程的概念: // childArgs Array,包含了传递给 // childClasspath Array,包含了用户通过spark.jars属性、--jars参数及指定的 // 入口jar包,其中当提交的任务模式为client时,会首先尝试下载通过spark.jars或 val (childArgs, childClasspath, sparkConf, childMainClass) = prepareSubmitEnvironment(args) def doRunMain(): Unit = { if (args.proxyUser != null) { val proxyUser = UserGroupInformation.createProxyUser(args.proxyUser, UserGroupInformation.getCurrentUser()) try { proxyUser.doAs(new PrivilegedExceptionAction[Unit]() { override def run(): Unit = { runMain(childArgs, childClasspath, sparkConf, childMainClass, args.verbose) } catch { case e: Exception => // Hadoop's AuthorizationException suppresses the exception's stack trace, which // makes the message printed to the output by the JVM not very helpful. Instead, // detect exceptions with empty stack traces here, and treat them differently. if (e.getStackTrace().length == 0) { error(s"ERROR: ${e.getClass().getName()}: ${e.getMessage()}") } else { throw e } else { runMain(childArgs, childClasspath, sparkConf, childMainClass, args.verbose) // In standalone cluster mode, there are two submission gateways: // (1) The traditional RPC gateway using o.a.s.deploy.Client as a wrapper // (2) The new REST-based gateway introduced in Spark 1.3 // The latter is the default behavior as of Spark 1.3, but Spark submit will fail over // to use the legacy gateway if the master endpoint turns out to be not a REST server. 
if (args.isStandaloneCluster && args.useRest) { try { logInfo("Running Spark using the REST application submission protocol.") doRunMain() } catch { // Fail over to use the legacy submission gateway case e: SubmitRestConnectionException => logWarning(s"Master endpoint ${args.master} was not a REST server. " + "Falling back to legacy submission gateway instead.") args.useRest = false submit(args, false) // In all other modes, just run the main class as prepared } else { doRunMain()



* Run the main method of the child class using the provided launch environment. * Note that this main class will not be the one provided by the user if we're * running cluster deploy mode or python applications. private def runMain( childArgs: Seq[String], childClasspath: Seq[String], sparkConf: SparkConf, childMainClass: String, verbose: Boolean): Unit = { // ... 忽略添加jar包到JAVA的系统路径下的代码逻辑,这里会根据用户是否指定了 // spark.driver.userClassPathFirst // 这个参数,来选择添加jar包的优先级 var mainClass: Class[_] = null try { mainClass = Utils.classForName(childMainClass) } catch { case e: ClassNotFoundException => logWarning(s"Failed to load $childMainClass.", e) if (childMainClass.contains("thriftserver")) { logInfo(s"Failed to load main class $childMainClass.") logInfo("You need to build Spark with -Phive and -Phive-thriftserver.") throw new SparkUserAppException(CLASS_NOT_FOUND_EXIT_STATUS) case e: NoClassDefFoundError => logWarning(s"Failed to load $childMainClass: ${e.getMessage()}") if (e.getMessage.contains("org/apache/hadoop/hive")) { logInfo(s"Failed to load hive class.") logInfo("You need to build Spark with -Phive and -Phive-thriftserver.") throw new SparkUserAppException(CLASS_NOT_FOUND_EXIT_STATUS) val app: SparkApplication = if (classOf[SparkApplication].isAssignableFrom(mainClass)) { mainClass.newInstance().asInstanceOf[SparkApplication] } else { // SPARK-4170 if (classOf[scala.App].isAssignableFrom(mainClass)) { logWarning("Subclasses of scala.App may not work correctly. Use a main() method instead.") new JavaMainApplication(mainClass) @tailrec def findCause(t: Throwable): Throwable = t match { case e: UndeclaredThrowableException => if (e.getCause() != null) findCause(e.getCause()) else e case e: InvocationTargetException => if (e.getCause() != null) findCause(e.getCause()) else e case e: Throwable => try { app.start(childArgs.toArray, sparkConf) } catch { case t: Throwable => throw findCause(t)



