From 8431dcfe52f5395a0fd9e3c00db009dbb2bcf6f5 Mon Sep 17 00:00:00 2001 From: Lewis John McGibbney Date: Sat, 21 Oct 2023 11:09:31 -0700 Subject: [PATCH] NUTCH-3013 Employ commons-lang3's StopWatch to simplify timing logic (#788) --- .github/workflows/master-build.yml | 1 - .gitignore | 1 + src/java/org/apache/nutch/crawl/CrawlDb.java | 19 ++++++++------- .../org/apache/nutch/crawl/CrawlDbMerger.java | 16 ++++++------- .../apache/nutch/crawl/DeduplicationJob.java | 16 ++++++------- .../org/apache/nutch/crawl/Generator.java | 17 ++++++------- src/java/org/apache/nutch/crawl/Injector.java | 16 ++++++------- src/java/org/apache/nutch/crawl/LinkDb.java | 15 ++++++------ .../org/apache/nutch/crawl/LinkDbMerger.java | 16 ++++++------- .../org/apache/nutch/crawl/LinkDbReader.java | 24 +++++++++---------- .../org/apache/nutch/fetcher/Fetcher.java | 17 ++++++------- .../org/apache/nutch/hostdb/ReadHostDb.java | 15 ++++++------ .../org/apache/nutch/hostdb/UpdateHostDb.java | 16 ++++++------- .../org/apache/nutch/indexer/CleaningJob.java | 16 ++++++------- .../org/apache/nutch/indexer/IndexingJob.java | 16 ++++++------- .../org/apache/nutch/parse/ParseSegment.java | 21 +++++++--------- .../nutch/scoring/webgraph/LinkDumper.java | 17 ++++++------- .../nutch/scoring/webgraph/LinkRank.java | 16 ++++++------- .../nutch/scoring/webgraph/NodeDumper.java | 16 ++++++------- .../nutch/scoring/webgraph/ScoreUpdater.java | 16 ++++++------- .../nutch/scoring/webgraph/WebGraph.java | 24 +++++++++---------- .../org/apache/nutch/tools/FreeGenerator.java | 16 ++++++------- .../nutch/tools/arc/ArcSegmentCreator.java | 16 ++++++------- .../apache/nutch/tools/warc/WARCExporter.java | 15 ++++++------ .../nutch/util/CrawlCompletionStats.java | 15 ++++++------ .../nutch/util/ProtocolStatusStatistics.java | 19 +++++++-------- .../apache/nutch/util/SitemapProcessor.java | 12 ++++++---- .../nutch/util/domain/DomainStatistics.java | 16 ++++++------- .../urlfilter/api/RegexURLFilterBaseTest.java | 11 +++++---- .../regex/TestRegexURLNormalizer.java | 8 +++++-- 30 files changed, 234 insertions(+), 225 deletions(-) diff --git a/.github/workflows/master-build.yml b/.github/workflows/master-build.yml index e3ed11c869..ba1d470ece 100644 --- a/.github/workflows/master-build.yml +++ b/.github/workflows/master-build.yml @@ -22,7 +22,6 @@ on: branches: [ master ] pull_request: branches: [ master ] - jobs: build: diff --git a/.gitignore b/.gitignore index 0612a99c23..b466908527 100644 --- a/.gitignore +++ b/.gitignore @@ -27,3 +27,4 @@ naivebayes-model csvindexwriter lib/spotbugs-* ivy/dependency-check-ant/* +.gradle* diff --git a/src/java/org/apache/nutch/crawl/CrawlDb.java b/src/java/org/apache/nutch/crawl/CrawlDb.java index 3819bb3a01..16394832bf 100644 --- a/src/java/org/apache/nutch/crawl/CrawlDb.java +++ b/src/java/org/apache/nutch/crawl/CrawlDb.java @@ -19,14 +19,15 @@ import java.io.File; import java.io.IOException; import java.lang.invoke.MethodHandles; -import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; import java.util.Map; import java.util.Random; +import java.util.concurrent.TimeUnit; +import org.apache.commons.lang3.time.StopWatch; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; @@ -49,7 +50,6 @@ import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; import org.apache.nutch.util.NutchTool; -import org.apache.nutch.util.TimingUtil; /** * This class 
takes the output of the fetcher and updates the crawldb @@ -85,10 +85,11 @@ public void update(Path crawlDb, Path[] segments, boolean normalize, public void update(Path crawlDb, Path[] segments, boolean normalize, boolean filter, boolean additionsAllowed, boolean force) throws IOException, InterruptedException, ClassNotFoundException { - Path lock = lock(getConf(), crawlDb, force); - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + + Path lock = lock(getConf(), crawlDb, force); Job job = CrawlDb.createJob(getConf(), crawlDb); Configuration conf = job.getConfiguration(); @@ -98,7 +99,7 @@ public void update(Path crawlDb, Path[] segments, boolean normalize, boolean url404Purging = conf.getBoolean(CRAWLDB_PURGE_404, false); - LOG.info("CrawlDb update: starting at {}", sdf.format(start)); + LOG.info("CrawlDb update: starting"); LOG.info("CrawlDb update: db: {}", crawlDb); LOG.info("CrawlDb update: segments: {}", Arrays.asList(segments)); LOG.info("CrawlDb update: additions allowed: {}", additionsAllowed); @@ -151,9 +152,9 @@ public void update(Path crawlDb, Path[] segments, boolean normalize, urlsFiltered); } - long end = System.currentTimeMillis(); - LOG.info("CrawlDb update: finished at " + sdf.format(end) + ", elapsed: " - + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("CrawlDb update: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } /* diff --git a/src/java/org/apache/nutch/crawl/CrawlDbMerger.java b/src/java/org/apache/nutch/crawl/CrawlDbMerger.java index 70c65135ec..1bf7243d38 100644 --- a/src/java/org/apache/nutch/crawl/CrawlDbMerger.java +++ b/src/java/org/apache/nutch/crawl/CrawlDbMerger.java @@ -18,11 +18,12 @@ import java.io.IOException; import java.lang.invoke.MethodHandles; -import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Map.Entry; import java.util.Random; +import java.util.concurrent.TimeUnit; +import org.apache.commons.lang3.time.StopWatch; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -44,7 +45,6 @@ import org.apache.hadoop.util.ToolRunner; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; -import org.apache.nutch.util.TimingUtil; /** * This tool merges several CrawlDb-s into one, optionally filtering URLs @@ -129,9 +129,9 @@ public void merge(Path output, Path[] dbs, boolean normalize, boolean filter) throws Exception { Path lock = CrawlDb.lock(getConf(), output, false); - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("CrawlDb merge: starting at {}", sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("CrawlDb merge: starting"); Job job = createMergeJob(getConf(), output, normalize, filter); for (int i = 0; i < dbs.length; i++) { @@ -155,9 +155,9 @@ public void merge(Path output, Path[] dbs, boolean normalize, boolean filter) NutchJob.cleanupAfterFailure(outPath, lock, fs); throw e; } - long end = System.currentTimeMillis(); - LOG.info("CrawlDb merge: finished at " + sdf.format(end) + ", elapsed: " - + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("CrawlDb merge: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } public static Job createMergeJob(Configuration conf, Path output, diff --git a/src/java/org/apache/nutch/crawl/DeduplicationJob.java
b/src/java/org/apache/nutch/crawl/DeduplicationJob.java index ae5ac37ce0..217005d415 100644 --- a/src/java/org/apache/nutch/crawl/DeduplicationJob.java +++ b/src/java/org/apache/nutch/crawl/DeduplicationJob.java @@ -21,11 +21,12 @@ import java.lang.invoke.MethodHandles; import java.net.URLDecoder; import java.nio.charset.StandardCharsets; -import java.text.SimpleDateFormat; import java.util.HashMap; import java.util.Map; import java.util.Random; +import java.util.concurrent.TimeUnit; +import org.apache.commons.lang3.time.StopWatch; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -48,7 +49,6 @@ import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; import org.apache.nutch.util.NutchTool; -import org.apache.nutch.util.TimingUtil; import org.apache.nutch.util.URLUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -298,9 +298,9 @@ public int run(String[] args) throws IOException { } } - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("DeduplicationJob: starting at " + sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("DeduplicationJob: starting"); Path tempDir = new Path(crawlDb, "dedup-temp-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); @@ -381,9 +381,9 @@ public int run(String[] args) throws IOException { // clean up fs.delete(tempDir, true); - long end = System.currentTimeMillis(); - LOG.info("Deduplication finished at " + sdf.format(end) + ", elapsed: " - + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("DeduplicationJob: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); return 0; } diff --git a/src/java/org/apache/nutch/crawl/Generator.java b/src/java/org/apache/nutch/crawl/Generator.java index d1569e1f03..1b62314e7a 100644 --- a/src/java/org/apache/nutch/crawl/Generator.java +++ b/src/java/org/apache/nutch/crawl/Generator.java @@ -30,7 +30,9 @@ import java.util.Locale; import java.util.Map; import java.util.Random; +import java.util.concurrent.TimeUnit; +import org.apache.commons.lang3.time.StopWatch; import org.apache.hadoop.conf.Configurable; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -76,7 +78,6 @@ import org.apache.nutch.util.NutchJob; import org.apache.nutch.util.NutchTool; import org.apache.nutch.util.SegmentReaderUtil; -import org.apache.nutch.util.TimingUtil; import org.apache.nutch.util.URLUtil; /** @@ -821,10 +822,10 @@ public Path[] generate(Path dbDir, Path segments, int numLists, long topN, Path lock = CrawlDb.lock(getConf(), dbDir, force); - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("Generator: starting at " + sdf.format(start)); - LOG.info("Generator: Selecting best-scoring urls due for fetch."); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("Generator: starting"); + LOG.info("Generator: selecting best-scoring urls due for fetch."); LOG.info("Generator: filtering: " + filter); LOG.info("Generator: normalizing: " + norm); if (topN != Long.MAX_VALUE) { @@ -982,9 +983,9 @@ public Path[] generate(Path dbDir, Path segments, int numLists, long topN, } fs.delete(tempDir, true); - long end = System.currentTimeMillis(); - LOG.info("Generator: finished at " + sdf.format(end) + ", elapsed: " - + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); +
LOG.info("Generator: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); Path[] patharray = new Path[generatedSegments.size()]; return generatedSegments.toArray(patharray); diff --git a/src/java/org/apache/nutch/crawl/Injector.java b/src/java/org/apache/nutch/crawl/Injector.java index 9fca719f62..9bfd1b4547 100644 --- a/src/java/org/apache/nutch/crawl/Injector.java +++ b/src/java/org/apache/nutch/crawl/Injector.java @@ -16,6 +16,7 @@ */ package org.apache.nutch.crawl; +import org.apache.commons.lang3.time.StopWatch; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; @@ -45,17 +46,16 @@ import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; import org.apache.nutch.util.NutchTool; -import org.apache.nutch.util.TimingUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; import java.lang.invoke.MethodHandles; -import java.text.SimpleDateFormat; import java.util.HashMap; import java.util.Map; import java.util.Random; +import java.util.concurrent.TimeUnit; /** * Injector takes a flat text file of URLs (or a folder containing text files) @@ -372,10 +372,11 @@ public void inject(Path crawlDb, Path urlDir, boolean overwrite, boolean update, boolean normalize, boolean filter, boolean filterNormalizeAll) throws IOException, ClassNotFoundException, InterruptedException { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("Injector: starting at {}", sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + + LOG.info("Injector: starting"); LOG.info("Injector: crawlDb: {}", crawlDb); LOG.info("Injector: urlDir: {}", urlDir); LOG.info("Injector: Converting injected urls to crawl db entries."); @@ -479,9 +480,8 @@ public void inject(Path crawlDb, Path urlDir, boolean overwrite, urlsPurged404); } - long end = System.currentTimeMillis(); - LOG.info("Injector: finished at {}, elapsed: {}", sdf.format(end), - TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("Injector: finished, elapsed: {} ms", stopWatch.getTime(TimeUnit.MILLISECONDS)); } } catch (IOException | InterruptedException | ClassNotFoundException | NullPointerException e) { LOG.error("Injector job failed: {}", e.getMessage()); diff --git a/src/java/org/apache/nutch/crawl/LinkDb.java b/src/java/org/apache/nutch/crawl/LinkDb.java index 2b3d2ed907..3c752ab1db 100644 --- a/src/java/org/apache/nutch/crawl/LinkDb.java +++ b/src/java/org/apache/nutch/crawl/LinkDb.java @@ -21,13 +21,14 @@ import java.lang.invoke.MethodHandles; import java.net.MalformedURLException; import java.net.URL; -import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.Map; import java.util.Random; +import java.util.concurrent.TimeUnit; +import org.apache.commons.lang3.time.StopWatch; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; @@ -54,7 +55,6 @@ import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; import org.apache.nutch.util.NutchTool; -import org.apache.nutch.util.TimingUtil; /** Maintains an inverted link map, listing incoming links for each url. 
*/ public class LinkDb extends NutchTool implements Tool { @@ -196,9 +196,9 @@ public void invert(Path linkDb, Path[] segments, boolean normalize, Path currentLinkDb = new Path(linkDb, CURRENT_NAME); Configuration conf = job.getConfiguration(); - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("LinkDb: starting at {}", sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("LinkDb: starting"); LOG.info("LinkDb: linkdb: {}", linkDb); LOG.info("LinkDb: URL normalize: {}", normalize); LOG.info("LinkDb: URL filter: {}", filter); @@ -260,8 +260,9 @@ public void invert(Path linkDb, Path[] segments, boolean normalize, } LinkDb.install(job, linkDb); - long end = System.currentTimeMillis(); - LOG.info("LinkDb: finished at {}, elapsed: {}", sdf.format(end), TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("LinkDb: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } private static Job createJob(Configuration config, Path linkDb, diff --git a/src/java/org/apache/nutch/crawl/LinkDbMerger.java b/src/java/org/apache/nutch/crawl/LinkDbMerger.java index f696c599e8..d6a41ab48c 100644 --- a/src/java/org/apache/nutch/crawl/LinkDbMerger.java +++ b/src/java/org/apache/nutch/crawl/LinkDbMerger.java @@ -18,11 +18,12 @@ import java.io.IOException; import java.lang.invoke.MethodHandles; -import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Iterator; import java.util.Random; +import java.util.concurrent.TimeUnit; +import org.apache.commons.lang3.time.StopWatch; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; @@ -41,7 +42,6 @@ import org.apache.hadoop.util.ToolRunner; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; -import org.apache.nutch.util.TimingUtil; /** * This tool merges several LinkDb-s into one, optionally filtering URLs through @@ -112,9 +112,9 @@ public void reduce(Text key, Iterable values, Context context) public void merge(Path output, Path[] dbs, boolean normalize, boolean filter) throws Exception { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("LinkDb merge: starting at " + sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("LinkDb merge: starting"); Job job = createMergeJob(getConf(), output, normalize, filter); for (int i = 0; i < dbs.length; i++) { @@ -137,9 +137,9 @@ public void merge(Path output, Path[] dbs, boolean normalize, boolean filter) fs.rename(FileOutputFormat.getOutputPath(job), new Path(output, LinkDb.CURRENT_NAME)); - long end = System.currentTimeMillis(); - LOG.info("LinkDb merge: finished at " + sdf.format(end) + ", elapsed: " - + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("LinkDb merge: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } public static Job createMergeJob(Configuration config, Path linkDb, diff --git a/src/java/org/apache/nutch/crawl/LinkDbReader.java b/src/java/org/apache/nutch/crawl/LinkDbReader.java index c307b985d5..fa01f20bf3 100644 --- a/src/java/org/apache/nutch/crawl/LinkDbReader.java +++ b/src/java/org/apache/nutch/crawl/LinkDbReader.java @@ -16,13 +16,15 @@ */ package org.apache.nutch.crawl; +import java.io.Closeable; import java.io.IOException; - import java.lang.invoke.MethodHandles; +import
java.util.concurrent.TimeUnit; import java.util.regex.Matcher; import java.util.regex.Pattern; +import java.util.Iterator; -// Commons Logging imports +import org.apache.commons.lang3.time.StopWatch; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -46,11 +48,8 @@ import org.apache.nutch.util.AbstractChecker; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; -import org.apache.nutch.util.TimingUtil; -import java.text.SimpleDateFormat; -import java.util.Iterator; -import java.io.Closeable; + /** * Read utility for the LinkDb. @@ -153,10 +152,9 @@ public void map(Text key, Inlinks value, Context context) public void processDumpJob(String linkdb, String output, String regex) throws IOException, InterruptedException, ClassNotFoundException { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - - LOG.info("LinkDb dump: starting at {}", sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("LinkDb dump: starting"); LOG.info("LinkDb dump: db: {}", linkdb); Path outFolder = new Path(output); @@ -192,9 +190,9 @@ public void processDumpJob(String linkdb, String output, String regex) throw e; } - long end = System.currentTimeMillis(); - LOG.info("LinkDb dump: finished at {}, elapsed: {}", - sdf.format(end), TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("LinkDb dump: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } @Override diff --git a/src/java/org/apache/nutch/fetcher/Fetcher.java b/src/java/org/apache/nutch/fetcher/Fetcher.java index 3727dcebef..92aef6f106 100644 --- a/src/java/org/apache/nutch/fetcher/Fetcher.java +++ b/src/java/org/apache/nutch/fetcher/Fetcher.java @@ -25,9 +25,11 @@ import java.util.LinkedList; import java.util.List; import java.util.Map; +import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; +import org.apache.commons.lang3.time.StopWatch; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.Path; @@ -454,11 +456,10 @@ public void fetch(Path segment, int threads) throws IOException, checkConfiguration(); - long start = System.currentTimeMillis(); - if (LOG.isInfoEnabled()) { - LOG.info("Fetcher: starting at {}", TimingUtil.logDateMillis(start)); - LOG.info("Fetcher: segment: {}", segment); - } + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("Fetcher: starting"); + LOG.info("Fetcher: segment: {}", segment); // set the actual time for the timelimit relative // to the beginning of the whole job and not of a specific task @@ -530,9 +531,9 @@ public void fetch(Path segment, int threads) throws IOException, throw e; } - long end = System.currentTimeMillis(); - LOG.info("Fetcher: finished at {}, elapsed: {}", - TimingUtil.logDateMillis(end), TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("Fetcher: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } /** diff --git a/src/java/org/apache/nutch/hostdb/ReadHostDb.java b/src/java/org/apache/nutch/hostdb/ReadHostDb.java index ffddb18898..0321a8652c 100644 --- a/src/java/org/apache/nutch/hostdb/ReadHostDb.java +++ b/src/java/org/apache/nutch/hostdb/ReadHostDb.java @@ -18,9 +18,10 @@ import java.io.IOException; import java.lang.invoke.MethodHandles; -import java.text.SimpleDateFormat; import java.util.Map; +import 
java.util.concurrent.TimeUnit; +import org.apache.commons.lang3.time.StopWatch; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; @@ -42,7 +43,6 @@ import org.apache.hadoop.util.ToolRunner; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; -import org.apache.nutch.util.TimingUtil; import org.apache.nutch.util.SegmentReaderUtil; import org.apache.commons.jexl3.JexlBuilder; @@ -168,9 +168,9 @@ public void map(Text key, HostDatum datum, Context context) throws IOException, // } private void readHostDb(Path hostDb, Path output, boolean dumpHomepages, boolean dumpHostnames, String expr) throws Exception { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("ReadHostDb: starting at " + sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("ReadHostDb: starting"); Configuration conf = getConf(); conf.setBoolean(HOSTDB_DUMP_HOMEPAGES, dumpHomepages); @@ -211,8 +211,9 @@ private void readHostDb(Path hostDb, Path output, boolean dumpHomepages, boolean throw e; } - long end = System.currentTimeMillis(); - LOG.info("ReadHostDb: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("ReadHostDb: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } private void getHostDbRecord(Path hostDb, String host) throws Exception { diff --git a/src/java/org/apache/nutch/hostdb/UpdateHostDb.java b/src/java/org/apache/nutch/hostdb/UpdateHostDb.java index ffa68d0963..65e45c55d8 100644 --- a/src/java/org/apache/nutch/hostdb/UpdateHostDb.java +++ b/src/java/org/apache/nutch/hostdb/UpdateHostDb.java @@ -17,9 +17,10 @@ package org.apache.nutch.hostdb; import java.lang.invoke.MethodHandles; -import java.text.SimpleDateFormat; import java.util.Random; +import java.util.concurrent.TimeUnit; +import org.apache.commons.lang3.time.StopWatch; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.Path; @@ -40,7 +41,6 @@ import org.apache.nutch.util.LockUtil; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; -import org.apache.nutch.util.TimingUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -73,9 +73,9 @@ private void updateHostDb(Path hostDb, Path crawlDb, Path topHosts, boolean checkFailed, boolean checkNew, boolean checkKnown, boolean force, boolean filter, boolean normalize) throws Exception { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("UpdateHostDb: starting at " + sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("UpdateHostDb: starting"); Job job = NutchJob.getInstance(getConf()); Configuration conf = job.getConfiguration(); @@ -149,9 +149,9 @@ private void updateHostDb(Path hostDb, Path crawlDb, Path topHosts, } LockUtil.removeLockFile(fs, lock); - long end = System.currentTimeMillis(); - LOG.info("UpdateHostDb: finished at " + sdf.format(end) + - ", elapsed: " + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("UpdateHostDb: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } public static void main(String args[]) throws Exception { diff --git a/src/java/org/apache/nutch/indexer/CleaningJob.java b/src/java/org/apache/nutch/indexer/CleaningJob.java index 
dc3ed69e4a..04b9c2efa5 100644 --- a/src/java/org/apache/nutch/indexer/CleaningJob.java +++ b/src/java/org/apache/nutch/indexer/CleaningJob.java @@ -18,7 +18,9 @@ import java.io.IOException; import java.lang.invoke.MethodHandles; -import java.text.SimpleDateFormat; +import java.util.concurrent.TimeUnit; + +import org.apache.commons.lang3.time.StopWatch; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.ByteWritable; @@ -36,7 +38,6 @@ import org.apache.nutch.crawl.CrawlDb; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; -import org.apache.nutch.util.TimingUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -139,9 +140,9 @@ public void reduce(ByteWritable key, Iterable values, public void delete(String crawldb, boolean noCommit) throws IOException, InterruptedException, ClassNotFoundException { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("CleaningJob: starting at " + sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("CleaningJob: starting"); Job job = NutchJob.getInstance(getConf()); Configuration conf = job.getConfiguration(); @@ -173,9 +174,8 @@ public void delete(String crawldb, boolean noCommit) throw e; } - long end = System.currentTimeMillis(); - LOG.info("CleaningJob: finished at " + sdf.format(end) + ", elapsed: " - + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("CleaningJob: finished, elapsed: {} ms", stopWatch.getTime(TimeUnit.MILLISECONDS)); } @Override diff --git a/src/java/org/apache/nutch/indexer/IndexingJob.java b/src/java/org/apache/nutch/indexer/IndexingJob.java index ff46bc0eff..d2115230c8 100644 --- a/src/java/org/apache/nutch/indexer/IndexingJob.java +++ b/src/java/org/apache/nutch/indexer/IndexingJob.java @@ -19,7 +19,6 @@ import java.io.File; import java.io.IOException; import java.lang.invoke.MethodHandles; -import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; @@ -27,7 +26,9 @@ import java.util.Locale; import java.util.Map; import java.util.Random; +import java.util.concurrent.TimeUnit; +import org.apache.commons.lang3.time.StopWatch; import org.apache.nutch.metadata.Nutch; import org.apache.nutch.segment.SegmentChecker; import org.apache.hadoop.conf.Configuration; @@ -44,7 +45,6 @@ import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; import org.apache.nutch.util.NutchTool; -import org.apache.nutch.util.TimingUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -104,9 +104,9 @@ public void index(Path crawlDb, Path linkDb, List segments, boolean filter, boolean normalize, boolean addBinaryContent, boolean base64) throws IOException, InterruptedException, ClassNotFoundException { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("Indexer: starting at {}", sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("Indexer: starting"); final Job job = NutchJob.getInstance(getConf()); job.setJobName("Indexer"); @@ -159,9 +159,9 @@ public void index(Path crawlDb, Path linkDb, List segments, String.format(Locale.ROOT, "%6d", counter.getValue()), counter.getName()); } - long end = System.currentTimeMillis(); - LOG.info("Indexer: finished at " + sdf.format(end) + ", elapsed: " - + TimingUtil.elapsedTime(start, 
end)); + stopWatch.stop(); + LOG.info("Indexer: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } finally { tmp.getFileSystem(conf).delete(tmp, true); } diff --git a/src/java/org/apache/nutch/parse/ParseSegment.java b/src/java/org/apache/nutch/parse/ParseSegment.java index c4e271feec..de45c463b9 100644 --- a/src/java/org/apache/nutch/parse/ParseSegment.java +++ b/src/java/org/apache/nutch/parse/ParseSegment.java @@ -16,6 +16,7 @@ */ package org.apache.nutch.parse; +import org.apache.commons.lang3.time.StopWatch; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.nutch.crawl.CrawlDatum; @@ -25,7 +26,6 @@ import org.apache.nutch.util.NutchJob; import org.apache.nutch.util.NutchTool; import org.apache.nutch.util.StringUtil; -import org.apache.nutch.util.TimingUtil; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.Reducer; @@ -50,13 +50,12 @@ import java.io.File; import java.io.IOException; import java.lang.invoke.MethodHandles; -import java.text.SimpleDateFormat; -import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.Iterator; import java.util.Map; import java.util.Map.Entry; +import java.util.concurrent.TimeUnit; /* Parse content in a segment. */ public class ParseSegment extends NutchTool implements Tool { @@ -228,12 +227,10 @@ public void parse(Path segment) throws IOException, return; } - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - if (LOG.isInfoEnabled()) { - LOG.info("ParseSegment: starting at {}", sdf.format(start)); - LOG.info("ParseSegment: segment: {}", segment); - } + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("ParseSegment: starting"); + LOG.info("ParseSegment: segment: {}", segment); Job job = NutchJob.getInstance(getConf()); job.setJobName("parse " + segment); @@ -263,9 +260,9 @@ public void parse(Path segment) throws IOException, throw e; } - long end = System.currentTimeMillis(); - LOG.info("ParseSegment: finished at " + sdf.format(end) + ", elapsed: " - + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("ParseSegment: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } public static void main(String[] args) throws Exception { diff --git a/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java b/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java index 54cd8b8ed1..4831d73f38 100644 --- a/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java +++ b/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java @@ -20,10 +20,11 @@ import java.io.DataOutput; import java.io.IOException; import java.lang.invoke.MethodHandles; -import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.List; import java.util.Random; +import java.util.concurrent.TimeUnit; + import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.GnuParser; @@ -31,6 +32,7 @@ import org.apache.commons.cli.Option; import org.apache.commons.cli.OptionBuilder; import org.apache.commons.cli.Options; +import org.apache.commons.lang3.time.StopWatch; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; @@ -57,7 +59,6 @@ import org.apache.nutch.util.FSUtils; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; -import org.apache.nutch.util.TimingUtil; /** 
* The LinkDumper tool creates a database of node to inlink information that can @@ -327,9 +328,9 @@ public void reduce(Text key, Iterable values, public void dumpLinks(Path webGraphDb) throws IOException, InterruptedException, ClassNotFoundException { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("NodeDumper: starting at " + sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("LinkDumper: starting"); Configuration conf = getConf(); FileSystem fs = webGraphDb.getFileSystem(conf); @@ -400,9 +401,9 @@ public void dumpLinks(Path webGraphDb) throws IOException, } fs.delete(tempInverted, true); - long end = System.currentTimeMillis(); - LOG.info("LinkDumper: finished at " + sdf.format(end) + ", elapsed: " - + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("LinkDumper: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } public static void main(String[] args) throws Exception { diff --git a/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java b/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java index 739fe6cec1..c226ad130b 100644 --- a/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java +++ b/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java @@ -21,12 +21,12 @@ import java.io.InputStream; import java.io.InputStreamReader; import java.lang.invoke.MethodHandles; -import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Random; import java.util.Set; +import java.util.concurrent.TimeUnit; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; @@ -35,6 +35,7 @@ import org.apache.commons.cli.Option; import org.apache.commons.cli.OptionBuilder; import org.apache.commons.cli.Options; +import org.apache.commons.lang3.time.StopWatch; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; @@ -65,7 +66,6 @@ import org.apache.nutch.util.FSUtils; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; -import org.apache.nutch.util.TimingUtil; import org.apache.nutch.util.URLUtil; public class LinkRank extends Configured implements Tool { @@ -651,9 +651,9 @@ public LinkRank(Configuration conf) { public void analyze(Path webGraphDb) throws IOException, ClassNotFoundException, InterruptedException { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("Analysis: starting at " + sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("LinkRank Analysis: starting"); // store the link rank under the webgraphdb temporarily, final scores get // upddated into the nodedb @@ -714,9 +714,9 @@ public void analyze(Path webGraphDb) throws IOException, // remove the temporary link rank folder fs.delete(linkRank, true); - long end = System.currentTimeMillis(); - LOG.info("Analysis: finished at " + sdf.format(end) + ", elapsed: " - + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("LinkRank Analysis: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } public static void main(String[] args) throws Exception { diff --git a/src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java b/src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java index ede9fa1c59..dfccccc19e 100644 ---
a/src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java +++ b/src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java @@ -18,7 +18,7 @@ import java.io.IOException; import java.lang.invoke.MethodHandles; -import java.text.SimpleDateFormat; +import java.util.concurrent.TimeUnit; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; @@ -27,6 +27,7 @@ import org.apache.commons.cli.Option; import org.apache.commons.cli.OptionBuilder; import org.apache.commons.cli.Options; +import org.apache.commons.lang3.time.StopWatch; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; @@ -48,7 +49,6 @@ import org.apache.hadoop.util.ToolRunner; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; -import org.apache.nutch.util.TimingUtil; import org.apache.nutch.util.URLUtil; /** @@ -293,9 +293,9 @@ public void dumpNodes(Path webGraphDb, DumpType type, long topN, Path output, boolean asEff, NameType nameType, AggrType aggrType, boolean asSequenceFile) throws Exception { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("NodeDumper: starting at " + sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("NodeDumper: starting"); Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR); Job dumper = NutchJob.getInstance(getConf()); @@ -357,9 +357,9 @@ public void dumpNodes(Path webGraphDb, DumpType type, long topN, Path output, LOG.error("NodeDumper job failed:", e); throw e; } - long end = System.currentTimeMillis(); - LOG.info("NodeDumper: finished at " + sdf.format(end) + ", elapsed: " - + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("NodeDumper: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } public static void main(String[] args) throws Exception { diff --git a/src/java/org/apache/nutch/scoring/webgraph/ScoreUpdater.java b/src/java/org/apache/nutch/scoring/webgraph/ScoreUpdater.java index 130e1b2a1c..c10a6e37b0 100644 --- a/src/java/org/apache/nutch/scoring/webgraph/ScoreUpdater.java +++ b/src/java/org/apache/nutch/scoring/webgraph/ScoreUpdater.java @@ -18,8 +18,8 @@ import java.io.IOException; import java.lang.invoke.MethodHandles; -import java.text.SimpleDateFormat; import java.util.Random; +import java.util.concurrent.TimeUnit; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; @@ -28,6 +28,7 @@ import org.apache.commons.cli.Option; import org.apache.commons.cli.OptionBuilder; import org.apache.commons.cli.Options; +import org.apache.commons.lang3.time.StopWatch; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; @@ -51,7 +52,6 @@ import org.apache.nutch.crawl.CrawlDb; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; -import org.apache.nutch.util.TimingUtil; /** * Updates the score from the WebGraph node database into the crawl database. 
@@ -156,9 +156,9 @@ public void reduce(Text key, Iterable values, public void update(Path crawlDb, Path webGraphDb) throws IOException, ClassNotFoundException, InterruptedException { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("ScoreUpdater: starting at " + sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("ScoreUpdater: starting"); Configuration conf = getConf(); @@ -213,9 +213,9 @@ public void update(Path crawlDb, Path webGraphDb) throws IOException, LOG.info("ScoreUpdater: installing new crawldb " + crawlDb); CrawlDb.install(updater, crawlDb); - long end = System.currentTimeMillis(); - LOG.info("ScoreUpdater: finished at " + sdf.format(end) + ", elapsed: " - + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("ScoreUpdater: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } public static void main(String[] args) throws Exception { diff --git a/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java b/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java index 63d0ead7da..b98329d1e0 100644 --- a/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java +++ b/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java @@ -18,7 +18,6 @@ import java.io.IOException; import java.lang.invoke.MethodHandles; -import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.HashSet; import java.util.LinkedHashMap; @@ -26,6 +25,7 @@ import java.util.Map; import java.util.Random; import java.util.Set; +import java.util.concurrent.TimeUnit; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; @@ -34,6 +34,7 @@ import org.apache.commons.cli.Option; import org.apache.commons.cli.OptionBuilder; import org.apache.commons.cli.Options; +import org.apache.commons.lang3.time.StopWatch; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; @@ -67,7 +68,6 @@ import org.apache.nutch.util.LockUtil; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; -import org.apache.nutch.util.TimingUtil; import org.apache.nutch.util.URLUtil; /** @@ -518,14 +518,12 @@ public void createWebGraph(Path webGraphDb, Path[] segments, boolean normalize, boolean filter) throws IOException, InterruptedException, ClassNotFoundException { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - if (LOG.isInfoEnabled()) { - LOG.info("WebGraphDb: starting at " + sdf.format(start)); - LOG.info("WebGraphDb: webgraphdb: " + webGraphDb); - LOG.info("WebGraphDb: URL normalize: " + normalize); - LOG.info("WebGraphDb: URL filter: " + filter); - } + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("WebGraphDb: starting"); + LOG.info("WebGraphDb: webgraphdb: " + webGraphDb); + LOG.info("WebGraphDb: URL normalize: " + normalize); + LOG.info("WebGraphDb: URL filter: " + filter); FileSystem fs = webGraphDb.getFileSystem(getConf()); @@ -715,9 +713,9 @@ public void createWebGraph(Path webGraphDb, Path[] segments, // remove the lock file for the webgraph LockUtil.removeLockFile(fs, lock); - long end = System.currentTimeMillis(); - LOG.info("WebGraphDb: finished at " + sdf.format(end) + ", elapsed: " - + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("WebGraphDb: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } public static void
main(String[] args) throws Exception { diff --git a/src/java/org/apache/nutch/tools/FreeGenerator.java b/src/java/org/apache/nutch/tools/FreeGenerator.java index 039bccaece..e9f5c87619 100644 --- a/src/java/org/apache/nutch/tools/FreeGenerator.java +++ b/src/java/org/apache/nutch/tools/FreeGenerator.java @@ -18,10 +18,11 @@ import java.io.IOException; import java.lang.invoke.MethodHandles; -import java.text.SimpleDateFormat; import java.util.HashMap; import java.util.Map.Entry; +import java.util.concurrent.TimeUnit; +import org.apache.commons.lang3.time.StopWatch; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.fs.Path; @@ -47,7 +48,6 @@ import org.apache.nutch.scoring.ScoringFilters; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; -import org.apache.nutch.util.TimingUtil; /** * This tool generates fetchlists (segments to be fetched) from plain text files @@ -180,9 +180,9 @@ public int run(String[] args) throws Exception { } } - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("FreeGenerator: starting at " + sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("FreeGenerator: starting"); Job job = NutchJob.getInstance(getConf()); Configuration conf = job.getConfiguration(); @@ -226,9 +226,9 @@ public int run(String[] args) throws Exception { LOG.error("FAILED: " + StringUtils.stringifyException(e)); return -1; } - long end = System.currentTimeMillis(); - LOG.info("FreeGenerator: finished at " + sdf.format(end) + ", elapsed: " - + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("FreeGenerator: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); return 0; } diff --git a/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java b/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java index 4e916dbd50..825e752cc0 100644 --- a/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java +++ b/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java @@ -21,7 +21,9 @@ import java.text.SimpleDateFormat; import java.util.Date; import java.util.Map.Entry; +import java.util.concurrent.TimeUnit; +import org.apache.commons.lang3.time.StopWatch; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; @@ -56,7 +58,6 @@ import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; import org.apache.nutch.util.StringUtil; -import org.apache.nutch.util.TimingUtil; /** *

@@ -368,10 +369,10 @@ public void map(Text key, BytesWritable bytes, public void createSegments(Path arcFiles, Path segmentsOutDir) throws IOException, InterruptedException, ClassNotFoundException { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); if (LOG.isInfoEnabled()) { - LOG.info("ArcSegmentCreator: starting at " + sdf.format(start)); + LOG.info("ArcSegmentCreator: starting"); LOG.info("ArcSegmentCreator: arc files dir: " + arcFiles); } @@ -402,10 +403,9 @@ public void createSegments(Path arcFiles, Path segmentsOutDir) throw e; } - - long end = System.currentTimeMillis(); - LOG.info("ArcSegmentCreator: finished at " + sdf.format(end) - + ", elapsed: " + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("ArcSegmentCreator: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } public static void main(String args[]) throws Exception { diff --git a/src/java/org/apache/nutch/tools/warc/WARCExporter.java b/src/java/org/apache/nutch/tools/warc/WARCExporter.java index cf000ba526..6d8a385572 100644 --- a/src/java/org/apache/nutch/tools/warc/WARCExporter.java +++ b/src/java/org/apache/nutch/tools/warc/WARCExporter.java @@ -29,8 +29,10 @@ import java.util.List; import java.util.Locale; import java.util.UUID; +import java.util.concurrent.TimeUnit; import org.apache.commons.lang.StringUtils; +import org.apache.commons.lang3.time.StopWatch; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.FileStatus; @@ -58,7 +60,6 @@ import org.apache.nutch.util.HadoopFSUtil; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; -import org.apache.nutch.util.TimingUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -428,9 +429,9 @@ protected JsonObject metadataToJson(Metadata meta) { public int generateWARC(String output, List segments, boolean onlySuccessfulResponses, boolean includeParseData, boolean includeParseText) throws IOException { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("WARCExporter: starting at {}", sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("WARCExporter: starting"); final Job job = NutchJob.getInstance(getConf()); job.setJobName("warc-exporter " + output); @@ -479,9 +480,9 @@ public int generateWARC(String output, List segments, throw new RuntimeException(message); } LOG.info(job.getCounters().toString()); - long end = System.currentTimeMillis(); - LOG.info("WARCExporter: finished at {}, elapsed: {}", sdf.format(end), - TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("WARCExporter: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } catch (IOException | InterruptedException | ClassNotFoundException e) { LOG.error("WARCExporter job failed: {}", e.getMessage()); return -1; diff --git a/src/java/org/apache/nutch/util/CrawlCompletionStats.java b/src/java/org/apache/nutch/util/CrawlCompletionStats.java index 7210ee83af..8696d28221 100644 --- a/src/java/org/apache/nutch/util/CrawlCompletionStats.java +++ b/src/java/org/apache/nutch/util/CrawlCompletionStats.java @@ -20,7 +20,7 @@ import java.lang.invoke.MethodHandles; import java.net.MalformedURLException; import java.net.URL; -import java.text.SimpleDateFormat; +import java.util.concurrent.TimeUnit; import
org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; @@ -30,6 +30,7 @@ import org.apache.commons.cli.Option; import org.apache.commons.cli.OptionBuilder; import org.apache.commons.cli.Options; +import org.apache.commons.lang3.time.StopWatch; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.Path; @@ -127,9 +128,9 @@ public int run(String[] args) throws Exception { numOfReducers = Integer.parseInt(args[3]); } - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("CrawlCompletionStats: starting at {}", sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("CrawlCompletionStats: starting"); int mode = 0; String jobName = "CrawlCompletionStats"; @@ -180,9 +181,9 @@ public int run(String[] args) throws Exception { throw e; } - long end = System.currentTimeMillis(); - LOG.info("CrawlCompletionStats: finished at {}, elapsed: {}", - sdf.format(end), TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("CrawlCompletionStats: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); return 0; } diff --git a/src/java/org/apache/nutch/util/ProtocolStatusStatistics.java b/src/java/org/apache/nutch/util/ProtocolStatusStatistics.java index 2499da0bfb..0fe6c57d03 100644 --- a/src/java/org/apache/nutch/util/ProtocolStatusStatistics.java +++ b/src/java/org/apache/nutch/util/ProtocolStatusStatistics.java @@ -16,10 +16,11 @@ */ package org.apache.nutch.util; -import java.io.File; import java.io.IOException; import java.lang.invoke.MethodHandles; -import java.text.SimpleDateFormat; +import java.util.concurrent.TimeUnit; + +import org.apache.commons.lang3.time.StopWatch; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; @@ -37,8 +38,6 @@ import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.util.NutchConfiguration; -import org.apache.nutch.util.TimingUtil; import org.apache.nutch.metadata.Nutch; /** @@ -86,9 +85,9 @@ public int run(String[] args) throws Exception { numOfReducers = Integer.parseInt(args[2]); } - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("ProtocolStatistics: starting at " + sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("ProtocolStatistics: starting"); String jobName = "ProtocolStatistics"; @@ -130,9 +129,9 @@ public int run(String[] args) throws Exception { throw e; } - long end = System.currentTimeMillis(); - LOG.info("ProtocolStatistics: finished at " + sdf.format(end) + ", elapsed: " - + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("ProtocolStatistics: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); return 0; } diff --git a/src/java/org/apache/nutch/util/SitemapProcessor.java b/src/java/org/apache/nutch/util/SitemapProcessor.java index 98f7df839d..66fa9b0e7a 100644 --- a/src/java/org/apache/nutch/util/SitemapProcessor.java +++ b/src/java/org/apache/nutch/util/SitemapProcessor.java @@ -22,7 +22,9 @@ import java.util.Collection; import java.util.List; import java.util.Random; +import java.util.concurrent.TimeUnit; +import org.apache.commons.lang3.time.StopWatch; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; 
import org.apache.hadoop.fs.FileSystem; @@ -359,8 +361,9 @@ else if(sitemapDatum != null) { public void sitemap(Path crawldb, Path hostdb, Path sitemapUrlDir, boolean strict, boolean filter, boolean normalize, int threads) throws Exception { - long start = System.currentTimeMillis(); - LOG.info("SitemapProcessor: Starting at {}", sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("SitemapProcessor: starting"); FileSystem fs = crawldb.getFileSystem(getConf()); Path old = new Path(crawldb, "old"); @@ -441,8 +444,9 @@ public void sitemap(Path crawldb, Path hostdb, Path sitemapUrlDir, boolean stric LOG.info("SitemapProcessor: Total failed sitemap fetches: {}", failedFetches); LOG.info("SitemapProcessor: Total new sitemap entries added: {}", newSitemapEntries); - long end = System.currentTimeMillis(); - LOG.info("SitemapProcessor: Finished at {}, elapsed: {}", sdf.format(end), TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("SitemapProcessor: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); } } catch (IOException | InterruptedException | ClassNotFoundException e) { LOG.error("SitemapProcessor_" + crawldb.toString(), e); diff --git a/src/java/org/apache/nutch/util/domain/DomainStatistics.java b/src/java/org/apache/nutch/util/domain/DomainStatistics.java index 638b6c94f1..f77b72bc5f 100644 --- a/src/java/org/apache/nutch/util/domain/DomainStatistics.java +++ b/src/java/org/apache/nutch/util/domain/DomainStatistics.java @@ -20,8 +20,9 @@ import java.lang.invoke.MethodHandles; import java.net.MalformedURLException; import java.net.URL; -import java.text.SimpleDateFormat; +import java.util.concurrent.TimeUnit; +import org.apache.commons.lang3.time.StopWatch; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.Path; @@ -39,7 +40,6 @@ import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; -import org.apache.nutch.util.TimingUtil; import org.apache.nutch.util.URLUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -92,9 +92,9 @@ public int run(String[] args) throws Exception { numOfReducers = Integer.parseInt(args[3]); } - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - long start = System.currentTimeMillis(); - LOG.info("DomainStatistics: starting at " + sdf.format(start)); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + LOG.info("DomainStatistics: starting"); int mode = 0; String jobName = "DomainStatistics"; @@ -151,9 +151,9 @@ public int run(String[] args) throws Exception { throw e; } - long end = System.currentTimeMillis(); - LOG.info("DomainStatistics: finished at " + sdf.format(end) + ", elapsed: " - + TimingUtil.elapsedTime(start, end)); + stopWatch.stop(); + LOG.info("DomainStatistics: finished, elapsed: {} ms", stopWatch.getTime( + TimeUnit.MILLISECONDS)); return 0; } diff --git a/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java b/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java index c77c67eb17..080b2e5870 100644 --- a/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java +++ b/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java @@ -16,7 +16,6 @@ */ package org.apache.nutch.urlfilter.api; -// JDK imports import java.lang.invoke.MethodHandles;
import java.io.BufferedReader; import java.io.FileReader; @@ -24,12 +23,13 @@ import java.io.Reader; import java.util.ArrayList; import java.util.List; +import java.util.concurrent.TimeUnit; +import org.apache.commons.lang3.time.StopWatch; import org.junit.Assert; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -// Nutch imports import org.apache.nutch.net.URLFilter; /** @@ -58,7 +58,8 @@ protected void bench(int loops, String file) { } protected void bench(int loops, Reader rules, Reader urls) { - long start = System.currentTimeMillis(); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); try { URLFilter filter = getURLFilter(rules); FilteredURL[] expected = readURLFile(urls); @@ -68,8 +69,8 @@ protected void bench(int loops, Reader rules, Reader urls) { } catch (Exception e) { Assert.fail(e.toString()); } - LOG.info("bench time (" + loops + ") " - + (System.currentTimeMillis() - start) + "ms"); + stopWatch.stop(); + LOG.info("bench time ({} loops) {} ms", loops, stopWatch.getTime(TimeUnit.MILLISECONDS)); } protected void bench(int loops, String rulesFile, String urlsFile) { diff --git a/src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java b/src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java index 1eee7183b7..4952a1da4c 100644 --- a/src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java +++ b/src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java @@ -25,11 +25,13 @@ import java.io.IOException; import java.io.InputStreamReader; import java.util.*; +import java.util.concurrent.TimeUnit; import org.junit.Assert; import org.junit.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.commons.lang3.time.StopWatch; import org.apache.hadoop.conf.Configuration; import org.apache.nutch.net.URLNormalizers; import org.apache.nutch.util.NutchConfiguration; @@ -104,7 +106,8 @@ private void normalizeTest(NormalizedURL[] urls, String scope) } private void bench(int loops, String scope) { - long start = System.currentTimeMillis(); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); try { NormalizedURL[] expected = (NormalizedURL[]) testData.get(scope); if (expected == null) @@ -115,8 +118,9 @@ private void bench(int loops, String scope) { } catch (Exception e) { Assert.fail(e.toString()); } + stopWatch.stop(); LOG.info("bench time (" + loops + ") " - + (System.currentTimeMillis() - start) + "ms"); + + (stopWatch.getTime(TimeUnit.MILLISECONDS)) + "ms"); } private static class NormalizedURL {