核心代碼
private void pi() { log.info("----- start pi -----"); final String JAVAHome = System.getenv("JAVA_HOME"); final String hadoopConfDir = System.getenv("HADOOP_CONF_DIR"); log.info("javaHome: " + javaHome); log.info("hadoopConfDir: " + hadoopConfDir); log.info("sparkHome: " + sparkHome); log.info("mode: " + deployMode); log.info("AppResource: " + sparkJar); log.info("mainClass: " + mainClass); final String[] args = new String[]{ "--jar", sparkJar, "--class", mainClass, "--arg", "10"}; String appName = "spark-yarn"; System.setProperty("SPARK_YARN_MODE", "true"); SparkConf sparkConf = new SparkConf(); sparkConf.setSparkHome(sparkHome); sparkConf.setMaster("yarn"); sparkConf.setAppName(appName); sparkConf.set("spark.submit.deployMode", "cluster"); String jarDir = "hdfs://sh01:9000/user/deployer/spark-jars/*.jar"; log.info("jarDir: " + jarDir); sparkConf.set("spark.yarn.jars", jarDir); if (enableKerberos) { log.info("---------------- enable kerberos ------------------"); sparkConf.set("spark.hadoop.hadoop.security.authentication", "kerberos"); sparkConf.set("spark.hadoop.hadoop.security.authorization", "true"); sparkConf.set("spark.hadoop.dfs.namenode.kerberos.principal", "hdfs/_HOST@KPP.COM"); sparkConf.set("spark.hadoop.yarn.resourcemanager.principal", "yarn/_HOST@KPP.COM"); } ClientArguments clientArguments = new ClientArguments(args); Client client = new Client(clientArguments, sparkConf);// client.run(); ApplicationId applicationId = client.submitApplication(); log.info("submit task [{}] and application id [{}] ", appName, applicationId.getId()); YarnAppReport yarnAppReport = client.monitorApplication(applicationId, false, true, 1000); log.info("task [{}] process result [{}]", appName, yarnAppReport.finalState()); if (yarnAppReport.finalState().equals(FinalApplicationStatus.SUCCEEDED)) { log.info("spark任務(wù)執(zhí)行成功"); } else { log.info("spark任務(wù)執(zhí)行失敗"); } log.info("----- finish pi -----"); }
兩種提交方式有什麼區別
client.run() 是同步的,spark 任務結束前該行以下的代碼不會執行。該方法無返回值,也就是說拿不到 spark 任務執行的任何信息。
client.submitApplication() 是異步的,提交任務後立即執行該行下的代碼。但是該方法會返回 ApplicationId,這個就很有用啦。接下來可以調用 monitorApplication 方法讓 java 代碼 block 住,並且拿到 spark 任務執行的一些信息。
YarnAppReport yarnAppReport = client.monitorApplication(applicationId, false, true, 1000);
public YarnAppReport monitorApplication(final ApplicationId appId, final boolean returnOnRunning, final boolean logApplicationReport, final long interval) {
    // 代碼就不貼了,有需要自己去看嘍。
}
- applicationId 就不用說啦,肯定是spark job 的 id。
- returnOnRunning
  - true:當 spark job 處于 RUNNING 狀態時,monitorApplication 方法結束 block,返回 yarnAppReport。
  - false:monitorApplication 等待 spark job 執行完畢後結束 block,返回 yarnAppReport。當然如果 spark job 里面有 bug,那該啥時返回就啥時返回,具體的可以看下源代碼,只需要看清楚幾個關鍵環節就行。
- logApplicationReport 控制是否輸出 spark job 執行時的日志。
- interval 間隔多長時間去輪詢一次 spark job。源代碼里面寫的是 while(true) 循環。

YarnAppReport 中持有 spark 任務的狀態以及其他信息,具體內容自己可以去里面搜。

很顯然,client.submitApplication() 更有操作空間。