Skip to content

Commit

Permalink
NUTCH-3096 HostDB ResolverThread can create too many job counters
Browse files Browse the repository at this point in the history
(patch contributed by Markus Jelsma)
  • Loading branch information
sebastian-nagel committed Dec 4, 2024
1 parent e2a29d0 commit 5263b7c
Showing 1 changed file with 20 additions and 3 deletions.
23 changes: 20 additions & 3 deletions src/java/org/apache/nutch/hostdb/ResolverThread.java
Original file line number Diff line number Diff line change
Expand Up @@ -114,15 +114,32 @@ public void run() {
}
}

context.getCounter("UpdateHostDb",
Long.toString(datum.numFailures()) + "_times_failed").increment(1);
context.getCounter("UpdateHostDb", createFailureCounterLabel(datum)).increment(1);
} catch (Exception ioe) {
LOG.warn(StringUtils.stringifyException(ioe));
}
} catch (Exception e) {
LOG.warn(StringUtils.stringifyException(e));
}

context.getCounter("UpdateHostDb", "checked_hosts").increment(1);
}

/**
 * Builds the name of the "UpdateHostDb" job counter that records how often
 * a host has failed.
 *
 * <p>Hadoop will allow no more than 120 distinct counters. Using the raw
 * failure count as the counter name could exceed that limit, in which case
 * Hadoop complains and the job fails. To keep the number of distinct names
 * small, the failure count is grouped into buckets (NUTCH-3096):
 * counts below 4 keep their own value, 4 through 10 share {@code "4-10"},
 * and everything above shares {@code ">10"}.
 *
 * @param datum host record whose {@code numFailures()} value is bucketed
 * @return the bucket name followed by {@code "_times_failed"}
 */
private String createFailureCounterLabel(HostDatum datum) {
  final long failures = datum.numFailures();

  final String bucket;
  if (failures >= 11) {
    // Open-ended top bucket.
    bucket = ">10";
  } else if (failures >= 4) {
    bucket = "4-10";
  } else {
    // Low counts are few enough to each get their own counter.
    bucket = Long.toString(failures);
  }

  return bucket + "_times_failed";
}
}

0 comments on commit 5263b7c

Please sign in to comment.