Skip to content

Commit 4aeb2f7

Browse files
authored
Javadoc site crawler (#7300)
1 parent 29523e6 commit 4aeb2f7

File tree

7 files changed

+377
-0
lines changed

7 files changed

+377
-0
lines changed

.github/workflows/javadoc-crawler.yml

+26
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
# Visits the javadoc.io pages of the project's artifacts once a day by running
# the :javadoc-crawler:crawl Gradle task.
name: Javadoc.io site crawler (daily)

on:
  schedule:
    - cron: "30 1 * * *" # daily at 1:30 UTC
  # allows the crawl to be started manually as well
  workflow_dispatch:

# the job only needs to read the repository contents
permissions:
  contents: read

jobs:
  crawl:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      # the javadoc-crawler module is compiled with release 17
      - uses: actions/setup-java@c5195efecf7bdfc987ee8bae7a71cb8b11521c00 # v4.7.1
        with:
          distribution: temurin
          java-version: 17

      - name: Set up gradle
        uses: gradle/actions/setup-gradle@06832c7b30a0129d7fb559bcc6e43d26f6374244 # v4.3.1

      - name: Run crawler
        run: ./gradlew :javadoc-crawler:crawl

javadoc-crawler/README.md

+20
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# Javadoc Crawler
2+
3+
## Context
4+
5+
The javadoc.io website lazy loads content only when the artifacts have been accessed, which can
6+
lead to inaccuracies and confusion when someone loads the
7+
https://www.javadoc.io/doc/io.opentelemetry page, since the published `Latest version` will only be
8+
accurate if someone has accessed the page for the actual latest version.
9+
10+
This module provides a simple scraper that pulls the list of all `io.opentelemetry` artifacts from
11+
Maven Central and then visits each corresponding page on the javadoc.io website in order to trigger
12+
loading them into the site's system.
13+
14+
See https://github.com/open-telemetry/opentelemetry-java/issues/7294 for more information.
15+
16+
## How to run
17+
18+
```bash
19+
./gradlew :javadoc-crawler:crawl
20+
```

javadoc-crawler/build.gradle.kts

+34
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
plugins {
  id("otel.java-conventions")
}

dependencies {
  // used by the crawler to parse the Maven Central search response
  implementation("com.fasterxml.jackson.core:jackson-databind")
  testImplementation("org.assertj:assertj-core:3.27.3")
}

description = "OpenTelemetry Javadoc Crawler"
otelJava.moduleName.set("io.opentelemetry.javadocs")

tasks {
  // the crawler uses java.net.http.HttpClient and text blocks, so compile for Java 17
  withType<JavaCompile>().configureEach {
    sourceCompatibility = "17"
    targetCompatibility = "17"
    options.release.set(17)
  }

  // only test on java 17+
  val testJavaVersion: String? by project
  if (testJavaVersion != null && Integer.valueOf(testJavaVersion) < 17) {
    test {
      enabled = false
    }
  }

  // entry point used by the daily workflow: ./gradlew :javadoc-crawler:crawl
  val crawl by registering(JavaExec::class) {
    dependsOn(classes)

    mainClass.set("io.opentelemetry.javadocs.JavaDocsCrawler")
    classpath(sourceSets["main"].runtimeClasspath)
  }
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
/*
2+
* Copyright The OpenTelemetry Authors
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
package io.opentelemetry.javadocs;
7+
8+
public class Artifact {
9+
private final String name;
10+
private final String version;
11+
12+
public Artifact(String name, String version) {
13+
this.name = name;
14+
this.version = version;
15+
}
16+
17+
public String getName() {
18+
return name;
19+
}
20+
21+
public String getVersion() {
22+
return version;
23+
}
24+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,176 @@
1+
/*
2+
* Copyright The OpenTelemetry Authors
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
package io.opentelemetry.javadocs;
7+
8+
import com.fasterxml.jackson.databind.ObjectMapper;
9+
import java.io.IOException;
10+
import java.net.URI;
11+
import java.net.http.HttpClient;
12+
import java.net.http.HttpRequest;
13+
import java.net.http.HttpResponse;
14+
import java.util.ArrayList;
15+
import java.util.List;
16+
import java.util.Locale;
17+
import java.util.Map;
18+
import java.util.Optional;
19+
import java.util.logging.Level;
20+
import java.util.logging.Logger;
21+
22+
/**
23+
* The javadoc.io site relies on someone accessing the page for an artifact version in order to
24+
* update the contents of the site. This will query Maven Central for all artifacts under
25+
* io.opentelemetry in order to identify the latest versions. Then it will crawl the associated
26+
* pages on the javadoc.io site to trigger updates.
27+
*/
28+
public final class JavaDocsCrawler {
29+
private static final String GROUP = "io.opentelemetry";
30+
private static final String MAVEN_CENTRAL_BASE_URL =
31+
"https://search.maven.org/solrsearch/select?q=g:";
32+
private static final String JAVA_DOCS_BASE_URL = "https://javadoc.io/doc/";
33+
private static final int PAGE_SIZE = 20;
34+
private static final int THROTTLE_MS = 500;
35+
36+
// visible for testing
37+
static final String JAVA_DOC_DOWNLOADED_TEXT = "Javadoc is being downloaded";
38+
39+
private static final Logger logger = Logger.getLogger(JavaDocsCrawler.class.getName());
40+
private static final ObjectMapper objectMapper = new ObjectMapper();
41+
42+
public static void main(String[] args) throws Exception {
43+
HttpClient client = HttpClient.newHttpClient();
44+
List<Artifact> artifacts = getArtifacts(client);
45+
if (artifacts.isEmpty()) {
46+
logger.log(Level.SEVERE, "No artifacts found");
47+
return;
48+
}
49+
logger.info(String.format(Locale.ROOT, "Found %d artifacts", artifacts.size()));
50+
51+
List<String> updated = crawlJavaDocs(client, artifacts);
52+
if (updated.isEmpty()) {
53+
logger.info("No updates were needed");
54+
return;
55+
}
56+
57+
logger.info("Artifacts that triggered updates:\n" + String.join("\n", updated));
58+
}
59+
60+
static List<Artifact> getArtifacts(HttpClient client) throws IOException, InterruptedException {
61+
int start = 0;
62+
Integer numFound;
63+
List<Artifact> result = new ArrayList<>();
64+
65+
do {
66+
if (start != 0) {
67+
Thread.sleep(THROTTLE_MS); // try not to DDoS the site, it gets knocked over easily
68+
}
69+
70+
Map<?, ?> map = queryMavenCentral(client, start);
71+
72+
numFound =
73+
Optional.ofNullable(map)
74+
.map(mavenResult -> (Map<?, ?>) mavenResult.get("response"))
75+
.map(response -> (Integer) response.get("numFound"))
76+
.orElse(null);
77+
78+
List<Artifact> artifacts = convertToArtifacts(map);
79+
result.addAll(artifacts);
80+
81+
start += PAGE_SIZE;
82+
} while (numFound != null && start < numFound);
83+
84+
return result;
85+
}
86+
87+
private static List<Artifact> convertToArtifacts(Map<?, ?> map) {
88+
return Optional.ofNullable(map)
89+
.map(mavenResults -> (Map<?, ?>) mavenResults.get("response"))
90+
.map(response -> (List<?>) response.get("docs"))
91+
.map(
92+
docs -> {
93+
List<Artifact> artifacts = new ArrayList<>();
94+
for (Object doc : docs) {
95+
Map<?, ?> docMap = (Map<?, ?>) doc;
96+
String artifact = (String) docMap.get("a");
97+
String version = (String) docMap.get("latestVersion");
98+
if (artifact != null && version != null) {
99+
artifacts.add(new Artifact(artifact, version));
100+
}
101+
}
102+
return artifacts;
103+
})
104+
.orElseGet(ArrayList::new);
105+
}
106+
107+
private static Map<?, ?> queryMavenCentral(HttpClient client, int start)
108+
throws IOException, InterruptedException {
109+
URI uri =
110+
URI.create(
111+
String.format(
112+
Locale.ROOT,
113+
"%s%s&rows=%d&start=%d&wt=json",
114+
MAVEN_CENTRAL_BASE_URL,
115+
GROUP,
116+
PAGE_SIZE,
117+
start));
118+
119+
HttpRequest request = HttpRequest.newBuilder(uri).GET().build();
120+
121+
HttpResponse<String> response = client.send(request, HttpResponse.BodyHandlers.ofString());
122+
if (response.statusCode() != 200) {
123+
logger.log(
124+
Level.SEVERE,
125+
"Unexpected response code: " + response.statusCode() + ": " + response.body());
126+
throw new IOException("Unable to pull Maven central artifacts list");
127+
}
128+
return objectMapper.readValue(response.body(), Map.class);
129+
}
130+
131+
static List<String> crawlJavaDocs(HttpClient client, List<Artifact> artifacts)
132+
throws IOException, InterruptedException {
133+
List<String> updatedArtifacts = new ArrayList<>();
134+
135+
for (Artifact artifact : artifacts) {
136+
String[] parts = artifact.getName().split("-");
137+
StringBuilder path = new StringBuilder();
138+
path.append(JAVA_DOCS_BASE_URL)
139+
.append(GROUP)
140+
.append("/")
141+
.append(artifact.getName())
142+
.append("/")
143+
.append(artifact.getVersion())
144+
.append("/")
145+
.append(String.join("/", parts))
146+
.append("/package-summary.html");
147+
148+
HttpRequest crawlRequest = HttpRequest.newBuilder(URI.create(path.toString())).GET().build();
149+
HttpResponse<String> crawlResponse =
150+
client.send(crawlRequest, HttpResponse.BodyHandlers.ofString());
151+
152+
// gets a status code 303 when version exists and the site redirects it to use /latest/
153+
if (crawlResponse.statusCode() != 200 && crawlResponse.statusCode() != 303) {
154+
logger.log(
155+
Level.WARNING,
156+
String.format(
157+
Locale.ROOT,
158+
"Crawl failed for %s with status code %d at URL %s\nResponse: %s",
159+
artifact.getName(),
160+
crawlResponse.statusCode(),
161+
path,
162+
crawlResponse.body()));
163+
continue;
164+
}
165+
166+
if (crawlResponse.body().contains(JAVA_DOC_DOWNLOADED_TEXT)) {
167+
updatedArtifacts.add(artifact.getName());
168+
}
169+
170+
Thread.sleep(THROTTLE_MS); // some light throttling
171+
}
172+
return updatedArtifacts;
173+
}
174+
175+
private JavaDocsCrawler() {}
176+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
/*
2+
* Copyright The OpenTelemetry Authors
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
package io.opentelemetry.javadocs;
7+
8+
import static io.opentelemetry.javadocs.JavaDocsCrawler.JAVA_DOC_DOWNLOADED_TEXT;
9+
import static org.assertj.core.api.Assertions.assertThat;
10+
import static org.mockito.ArgumentMatchers.any;
11+
import static org.mockito.Mockito.times;
12+
import static org.mockito.Mockito.verify;
13+
import static org.mockito.Mockito.when;
14+
15+
import java.io.IOException;
16+
import java.net.http.HttpClient;
17+
import java.net.http.HttpRequest;
18+
import java.net.http.HttpResponse;
19+
import java.util.ArrayList;
20+
import java.util.List;
21+
import org.junit.jupiter.api.Test;
22+
import org.junit.jupiter.api.extension.ExtendWith;
23+
import org.mockito.ArgumentCaptor;
24+
import org.mockito.Mock;
25+
import org.mockito.junit.jupiter.MockitoExtension;
26+
27+
@ExtendWith(MockitoExtension.class)
28+
class JavaDocsCrawlerTest {
29+
@Mock HttpClient mockClient;
30+
@Mock HttpResponse<Object> mockMavenCentralRequest1;
31+
@Mock HttpResponse<Object> mockMavenCentralRequest2;
32+
@Mock HttpResponse<Object> mockJavaDocResponse;
33+
34+
@Test
35+
void testGetArtifactsHandlesPagination() throws IOException, InterruptedException {
36+
String page1Response =
37+
"""
38+
{
39+
"response": {
40+
"numFound": 40,
41+
"docs": [
42+
{"a": "artifact1", "latestVersion": "1.0"},
43+
{"a": "artifact2", "latestVersion": "1.1"}
44+
]
45+
}
46+
}
47+
""";
48+
String page2Response =
49+
"""
50+
{
51+
"response": {
52+
"numFound": 40,
53+
"docs": [
54+
{"a": "artifact3", "latestVersion": "2.0"}
55+
]
56+
}
57+
}
58+
""";
59+
60+
when(mockMavenCentralRequest1.body()).thenReturn(page1Response);
61+
when(mockMavenCentralRequest1.statusCode()).thenReturn(200);
62+
when(mockMavenCentralRequest2.body()).thenReturn(page2Response);
63+
when(mockMavenCentralRequest2.statusCode()).thenReturn(200);
64+
65+
when(mockClient.send(any(), any()))
66+
.thenReturn(mockMavenCentralRequest1)
67+
.thenReturn(mockMavenCentralRequest2);
68+
69+
List<Artifact> artifacts = JavaDocsCrawler.getArtifacts(mockClient);
70+
71+
// 2 calls for the pagination
72+
verify(mockClient, times(2)).send(any(), any());
73+
assertThat(artifacts.size()).isEqualTo(3);
74+
}
75+
76+
@Test
77+
void testCrawler() throws IOException, InterruptedException {
78+
List<Artifact> artifacts = new ArrayList<>();
79+
artifacts.add(new Artifact("opentelemetry-context", "1.49.0"));
80+
ArgumentCaptor<HttpRequest> requestCaptor = ArgumentCaptor.forClass(HttpRequest.class);
81+
82+
when(mockJavaDocResponse.body()).thenReturn(JAVA_DOC_DOWNLOADED_TEXT);
83+
when(mockJavaDocResponse.statusCode()).thenReturn(200);
84+
85+
when(mockClient.send(any(), any())).thenReturn(mockJavaDocResponse);
86+
87+
List<String> updated = JavaDocsCrawler.crawlJavaDocs(mockClient, artifacts);
88+
89+
verify(mockClient, times(1)).send(requestCaptor.capture(), any());
90+
91+
assertThat(requestCaptor.getValue().uri().toString())
92+
.isEqualTo(
93+
"https://javadoc.io/doc/io.opentelemetry/opentelemetry-context/1.49.0/opentelemetry/context/package-summary.html");
94+
assertThat(updated).containsExactly("opentelemetry-context");
95+
}
96+
}

settings.gradle.kts

+1
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ include(":integration-tests:otlp")
5151
include(":integration-tests:tracecontext")
5252
include(":integration-tests:graal")
5353
include(":integration-tests:graal-incubating")
54+
include(":javadoc-crawler")
5455
include(":opencensus-shim")
5556
include(":opentracing-shim")
5657
include(":perf-harness")

0 commit comments

Comments
 (0)