TEZ fixes #2

Open · wants to merge 6 commits into addons
Changes from 3 commits
8 changes: 4 additions & 4 deletions app-conf/FetcherConf.xml
@@ -29,15 +29,15 @@
</fetcher>
-->
<fetchers>
<!--

<fetcher>
<applicationtype>mapreduce</applicationtype>


Add comments to explain what you have done.

<classname>com.linkedin.drelephant.mapreduce.fetchers.MapReduceFetcherHadoop2</classname>
<params>
<sampling_enabled>false</sampling_enabled>
</params>
</fetcher>
-->

<!--
This is a replacement for the MapReduceFetcherHadoop2 that attempts to burn
through queues of jobs faster by pulling data directly from HDFS rather than going through
@@ -51,7 +51,7 @@
To work properly, this fetcher should use the same timezone as the job history server.
If not set, the local timezone will be used.
-->

<!--
<fetcher>
<applicationtype>mapreduce</applicationtype>
<classname>com.linkedin.drelephant.mapreduce.fetchers.MapReduceFSFetcherHadoop2</classname>
@@ -62,7 +62,7 @@
</params>
</fetcher>


-->
<!--
FSFetcher for Spark. Loads the eventlog from HDFS and replays to get the metrics and application properties

2 changes: 1 addition & 1 deletion app-conf/GeneralConf.xml
@@ -39,7 +39,7 @@
</property>
<property>
<name>drelephant.executor.service.class.name</name>
<value>com.linkedin.drelephant.executors.QuartzExecutorService</value>
<value>com.linkedin.drelephant.executors.ThreadPoolExecutorService</value>
<description>Executor service (can be one of ThreadPoolExecutorService or QuartzExecutorService)</description>
</property>
<!--
36 changes: 36 additions & 0 deletions app/com/linkedin/drelephant/analysis/AnalyticJob.java
@@ -27,6 +27,8 @@
import models.AppHeuristicResult;
import models.AppHeuristicResultDetails;
import models.AppResult;
import models.FailedAppResult;
import org.apache.commons.lang.exception.ExceptionUtils;
import org.apache.log4j.Logger;


@@ -249,6 +251,40 @@ public AnalyticJob setTrackingUrl(String trackingUrl) {
return this;
}


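/**
 * Builds a FailedAppResult for an application whose analysis could not be completed.
 * Only the job configuration is fetched, so scheduler and flow information can still
 * be extracted and persisted together with the stack trace of the failure.
 */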
public FailedAppResult getFailedAppResult(Exception e) throws Exception {

ElephantFetcher fetcher = ElephantContext.instance().getFetcherForApplicationType(getAppType());
HadoopApplicationData data = fetcher.fetchConfData(this);

JobType jobType = ElephantContext.instance().matchJobType(data);
String jobTypeName = jobType == null ? UNKNOWN_JOB_TYPE : jobType.getName();

AppResult result = new AppResult();
InfoExtractor.loadInfo(result, data);

FailedAppResult failedApp = new FailedAppResult();
failedApp.appId = Utils.truncateField(getAppId(), AppResult.ID_LIMIT, getAppId());
failedApp.startTime = getStartTime();
failedApp.finishTime = getFinishTime();
failedApp.name = Utils.truncateField(getName(), AppResult.APP_NAME_LIMIT, getAppId());
failedApp.trackingUrl = Utils.truncateField(getTrackingUrl(), AppResult.TRACKING_URL_LIMIT, getAppId());
failedApp.jobType = Utils.truncateField(jobTypeName, AppResult.JOBTYPE_LIMIT, getAppId());
failedApp.scheduler = result.scheduler;
failedApp.jobName = result.jobName;
failedApp.jobDefId = result.jobDefId;
failedApp.jobExecId = result.jobExecId;
failedApp.flowDefId = result.flowDefId;
failedApp.flowExecId = result.flowExecId;
failedApp.jobDefUrl = result.jobDefUrl;
failedApp.jobExecUrl = result.jobExecUrl;
failedApp.flowExecUrl = result.flowExecUrl;
failedApp.error = ExceptionUtils.getStackTrace(e);

return failedApp;
}
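
The FailedAppResult model referenced above is introduced elsewhere in this PR and is not visible in this diff. As a rough sketch of what it has to provide for the field assignments and the save() call to compile — assuming it mirrors AppResult as a Play/Ebean entity; the table name, annotations, and anything not implied by the assignments above are hypothetical:

package models;

import javax.persistence.Entity;
import javax.persistence.Id;
import javax.persistence.Lob;
import javax.persistence.Table;

import play.db.ebean.Model;

// Hypothetical sketch, not the PR's actual definition.
@Entity
@Table(name = "failed_app_result")
public class FailedAppResult extends Model {

  @Id
  public String appId;

  public long startTime;
  public long finishTime;

  public String name;
  public String trackingUrl;
  public String jobType;
  public String scheduler;
  public String jobName;
  public String jobDefId;
  public String jobExecId;
  public String flowDefId;
  public String flowExecId;
  public String jobDefUrl;
  public String jobExecUrl;
  public String flowExecUrl;

  // Full stack trace of the analysis failure.
  @Lob
  public String error;
}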

/**
* Returns the analysed AppResult that could be directly serialized into DB.
*
@@ -28,6 +28,7 @@
import java.util.*;
import models.AppResult;
import models.CheckPoint;
import models.FailedAppResult;
import org.apache.commons.lang.exception.ExceptionUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.security.authentication.client.AuthenticatedURL;
@@ -244,6 +245,14 @@ public void analyseJob(AnalyticJob analyticJob) {
} else {
if (analyticJob != null) {
MetricsController.triggerJobRetriesExhaustionEvent();
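// Retries are exhausted: record the application and its exception as a
// FailedAppResult instead of dropping it silently.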
try {
FailedAppResult result = analyticJob.getFailedAppResult(e);
result.save();
} catch (Exception ex) {
logger.error("Failed to get info for failed app: " + analyticJob.getAppId());
logger.error(ExceptionUtils.getStackTrace(ex));
}
logger.error("Drop the analytic job. Reason: reached the max retries for application id = ["
+ analyticJob.getAppId() + "].");
}
3 changes: 3 additions & 0 deletions app/com/linkedin/drelephant/analysis/ElephantFetcher.java
@@ -32,4 +32,7 @@ public interface ElephantFetcher<T extends HadoopApplicationData> {
*/
public T fetchData(AnalyticJob job)
throws Exception;

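/**
 * Fetches only the configuration of an application, without task-level data.
 * Declared without checked exceptions so it can still be invoked for
 * applications whose full analysis has already failed.
 */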
public T fetchConfData(AnalyticJob job);

}
@@ -202,6 +202,11 @@ private DataFiles getHistoryFiles(AnalyticJob job) throws IOException {
return new DataFiles(jobConfPath, jobHistPath);
}

@Override
public MapReduceApplicationData fetchConfData(AnalyticJob analyticJob) {
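// Stub: conf data is not read separately from the history files, so an
// empty MapReduceApplicationData is returned.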
return new MapReduceApplicationData();
}

@Override
public MapReduceApplicationData fetchData(AnalyticJob job) throws IOException {
DataFiles files = getHistoryFiles(job);
@@ -71,19 +71,32 @@ public MapReduceFetcherHadoop2(FetcherConfigurationData fetcherConfData) throws
}

@Override
public MapReduceApplicationData fetchData(AnalyticJob analyticJob) throws IOException, AuthenticationException {
public MapReduceApplicationData fetchConfData(AnalyticJob analyticJob) {
String appId = analyticJob.getAppId();
MapReduceApplicationData jobData = new MapReduceApplicationData();
String jobId = Utils.getJobIdFromApplicationId(appId);
jobData.setAppId(appId).setJobId(jobId);
// Change job tracking url to job history page
analyticJob.setTrackingUrl(_jhistoryWebAddr + jobId);
try {

// Fetch job config
try {
Properties jobConf = _jsonFactory.getProperties(_urlFactory.getJobConfigURL(jobId));
jobData.setJobConf(jobConf);
} catch (Exception e) {
logger.error("Failed to fetch conf data: ", e);
}
return jobData;
}

@Override
public MapReduceApplicationData fetchData(AnalyticJob analyticJob) throws IOException, AuthenticationException {

String appId = analyticJob.getAppId();
String jobId = Utils.getJobIdFromApplicationId(appId);
MapReduceApplicationData jobData = fetchConfData(analyticJob);

try {
URL jobURL = _urlFactory.getJobURL(jobId);
String state = _jsonFactory.getState(jobURL);

2 changes: 2 additions & 0 deletions app/com/linkedin/drelephant/spark/fetchers/FSFetcher.scala
@@ -30,6 +30,8 @@ class FSFetcher(fetcherConfigurationData: FetcherConfigurationData)
extends ElephantFetcher[SparkApplicationData] {
lazy val legacyFetcher = new SparkFSFetcher(fetcherConfigurationData)

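// Conf-only fetching is not implemented for the Spark FS fetcher; `???`
// throws scala.NotImplementedError if this is ever called.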
override def fetchConfData(job: AnalyticJob): SparkApplicationData = ???

override def fetchData(analyticJob: AnalyticJob): SparkApplicationData = {
val legacyData = legacyFetcher.fetchData(analyticJob)
LegacyDataConverters.convert(legacyData)
2 changes: 2 additions & 0 deletions app/com/linkedin/drelephant/spark/fetchers/SparkFetcher.scala
@@ -76,6 +76,8 @@ class SparkFetcher(fetcherConfigurationData: FetcherConfigurationData)
new SparkLogClient(hadoopConfiguration, sparkConf, eventLogUri)
}

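// Not implemented for Spark: calling this throws scala.NotImplementedError.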
override def fetchConfData(job: AnalyticJob): SparkApplicationData = ???

override def fetchData(analyticJob: AnalyticJob): SparkApplicationData = {
doFetchData(analyticJob) match {
case Success(data) => data