fix: optimise GitLab commit history scraper
Scrape commits based on the timestamp of the last time commits were scraped.

Showing 6 changed files with 102 additions and 18 deletions.
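
In short: when a repository already has a commit_history_scraped_at timestamp, the scraper asks GitLab only for commits since that moment and folds them into the stored history, instead of re-paging the full history on every run. A minimal sketch of the resulting request, with a made-up host, project path and timestamp (not part of this commit):

// Sketch only: the shape of the incremental request built in GitlabScraper.
// The host, project path and timestamp are made-up example values.
import java.net.URI;
import java.net.http.HttpRequest;
import java.time.Duration;

class IncrementalCommitRequestSketch {
    public static void main(String[] args) {
        String apiUri = "https://gitlab.example.com/api/v4";      // hypothetical GitLab instance
        String projectPath = "group%2Fproject";                   // URL-encoded "group/project"
        String commitHistoryScrapedAt = "2025-01-01T00:00:00Z";   // commit_history_scraped_at from the database

        // Only narrow the query when a previous scrape timestamp exists;
        // otherwise the full history is paged as before.
        String since = commitHistoryScrapedAt != null ? "&since=" + commitHistoryScrapedAt : "";

        HttpRequest request = HttpRequest.newBuilder().GET()
            .uri(URI.create(apiUri + "/projects/" + projectPath
                + "/repository/commits?per_page=100&order=default&page=1" + since))
            .timeout(Duration.ofSeconds(30))
            .build();

        System.out.println(request.uri());
    }
}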

scrapers/src/main/java/nl/esciencecenter/rsd/scraper/git/BasicRepositoryDataWithHistory.java (11 additions, 0 deletions)
@@ -0,0 +1,11 @@
+// SPDX-FileCopyrightText: 2025 Christian Meeßen (GFZ) <[email protected]>
+// SPDX-FileCopyrightText: 2025 Helmholtz Centre Potsdam - GFZ German Research Centre for Geosciences
+//
+// SPDX-License-Identifier: Apache-2.0
+
+package nl.esciencecenter.rsd.scraper.git;
+
+import java.util.UUID;
+
+public record BasicRepositoryDataWithHistory(UUID software, String url, String commitHistoryScrapedAt, CommitsPerWeek commitsPerWeek) {
+}
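
The new record carries everything the scraper needs per repository: the software UUID, the repository URL, the timestamp of the previous commit-history scrape (null if never scraped) and the weekly counts already stored. A hypothetical construction for illustration, assuming the sketch sits in the same package as the record; all values are invented:

// Illustrative only; the UUID, URL and timestamp are invented values.
import java.util.UUID;

class BasicRepositoryDataWithHistoryExample {
    public static void main(String[] args) {
        CommitsPerWeek existingHistory = new CommitsPerWeek();   // weekly counts already stored in the database
        BasicRepositoryDataWithHistory row = new BasicRepositoryDataWithHistory(
            UUID.randomUUID(),                                   // software id
            "https://gitlab.example.com/group/project",          // repository URL
            "2025-01-01T00:00:00Z",                              // commit_history_scraped_at, null if never scraped
            existingHistory
        );
        System.out.println(row.url() + " last scraped at " + row.commitHistoryScrapedAt());
    }
}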

CommitsPerWeek.java
@@ -1,5 +1,7 @@
 // SPDX-FileCopyrightText: 2023 Ewan Cahen (Netherlands eScience Center) <[email protected]>
 // SPDX-FileCopyrightText: 2023 Netherlands eScience Center
+// SPDX-FileCopyrightText: 2025 Christian Meeßen (GFZ) <[email protected]>
+// SPDX-FileCopyrightText: 2025 Helmholtz Centre Potsdam - GFZ German Research Centre for Geosciences
 //
 // SPDX-License-Identifier: Apache-2.0

@@ -26,12 +28,21 @@
  */
 public class CommitsPerWeek {

-    final SortedMap<Instant, Long> data = new TreeMap<>();
+    private final SortedMap<Instant, Long> data = new TreeMap<>();
     static final Gson gson = new GsonBuilder()
         .enableComplexMapKeySerialization()
         .registerTypeAdapter(Instant.class, (JsonSerializer<Instant>) (src, typeOfSrc, context) -> new JsonPrimitive(src.getEpochSecond()))
         .create();

+    public SortedMap<Instant, Long> getData() {
+        return new TreeMap<>(data);
+    }
+
+    public void setData(SortedMap<Instant, Long> data) {
+        this.data.clear();
+        this.data.putAll(data);
+    }
+
     public void addCommits(ZonedDateTime zonedDateTime, long count) {
         ZonedDateTime utcTime = zonedDateTime.withZoneSameInstant(ZoneOffset.UTC);
         Instant sundayMidnight = utcTime.truncatedTo(ChronoUnit.DAYS).with(TemporalAdjusters.previousOrSame(DayOfWeek.SUNDAY)).toInstant();
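
With data now private, callers go through getData()/setData(): getData() hands out a copy of the internal TreeMap, so a snapshot taken earlier does not reflect later additions, and setData() swaps the contents in place. A small sketch (the timestamp is arbitrary):

// Sketch: getData() returns a copy, so earlier snapshots do not see later additions.
import java.time.Instant;
import java.time.ZoneOffset;
import java.time.ZonedDateTime;
import java.util.SortedMap;

class CommitsPerWeekSnapshotExample {
    public static void main(String[] args) {
        CommitsPerWeek commitsPerWeek = new CommitsPerWeek();
        SortedMap<Instant, Long> before = commitsPerWeek.getData();   // empty snapshot of the internal TreeMap

        ZonedDateTime someCommitTime = ZonedDateTime.of(2025, 1, 6, 12, 0, 0, 0, ZoneOffset.UTC);
        commitsPerWeek.addCommits(someCommitTime, 3);

        System.out.println(before.size());                   // still 0: the snapshot is detached
        System.out.println(commitsPerWeek.getData().size()); // 1: counted under the preceding Sunday
    }
}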

GitlabScraper.java
@@ -1,7 +1,7 @@
-// SPDX-FileCopyrightText: 2022 - 2024 Christian Meeßen (GFZ) <[email protected]>
 // SPDX-FileCopyrightText: 2022 - 2024 Ewan Cahen (Netherlands eScience Center) <[email protected]>
-// SPDX-FileCopyrightText: 2022 - 2024 Helmholtz Centre Potsdam - GFZ German Research Centre for Geosciences
 // SPDX-FileCopyrightText: 2022 - 2024 Netherlands eScience Center
+// SPDX-FileCopyrightText: 2022 - 2025 Christian Meeßen (GFZ) <[email protected]>
+// SPDX-FileCopyrightText: 2022 - 2025 Helmholtz Centre Potsdam - GFZ German Research Centre for Geosciences
 //
 // SPDX-License-Identifier: Apache-2.0

@@ -26,6 +26,8 @@
 public class GitlabScraper implements GitScraper {
     private final String projectPath;
     private final String apiUri;
+    private final String commitHistoryScrapedAt;
+    private final CommitsPerWeek existingCommitsPerWeek = new CommitsPerWeek();

     /**
      * A GitLab scraper for API version 4.

@@ -36,6 +38,14 @@ public class GitlabScraper {
     public GitlabScraper(String gitLabApiUrl, String projectPath) {
         this.projectPath = projectPath.endsWith(".git") ? projectPath.substring(0, projectPath.length() - 4) : projectPath;
         this.apiUri = gitLabApiUrl + "/v4";
+        this.commitHistoryScrapedAt = null;
+    }
+
+    public GitlabScraper(String gitLabApiUrl, String projectPath, String commitHistoryScrapedAt, CommitsPerWeek existingCommitsPerWeek) {
+        this.projectPath = projectPath.endsWith(".git") ? projectPath.substring(0, projectPath.length() - 4) : projectPath;
+        this.apiUri = gitLabApiUrl + "/v4";
+        this.commitHistoryScrapedAt = commitHistoryScrapedAt;
+        if (existingCommitsPerWeek != null) this.existingCommitsPerWeek.setData(existingCommitsPerWeek.getData());
     }

     /**

@@ -81,12 +91,17 @@ public String languages() throws IOException, InterruptedException, RsdResponseException {
     @Override
     public CommitsPerWeek contributions() throws IOException, InterruptedException, RsdResponseException {
         CommitsPerWeek commits = new CommitsPerWeek();

+        String since="";
+        if (commitHistoryScrapedAt != null) {
+            since = "&since=" + commitHistoryScrapedAt;
+        }
         String page = "1";
         boolean done = false;
         while (!done) {
             HttpRequest request = HttpRequest.newBuilder().GET()
                 .uri(URI.create(apiUri + "/projects/" + Utils.urlEncode(projectPath)
-                    + "/repository/commits?per_page=100&order=default&page=" + page))
+                    + "/repository/commits?per_page=100&order=default&page=" + page + since))
                 .timeout(Duration.ofSeconds(30))
                 .build();
             HttpResponse<String> response;
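
A hedged usage sketch of the new four-argument constructor; the GitLab URL, project path and timestamp are invented, and the seeded history is presumably folded into the result inside contributions(), beyond the hunk shown above:

// Usage sketch with invented values; assumes the stored history and last-scrape
// timestamp were loaded from the database beforehand.
class GitlabScraperUsageSketch {
    public static void main(String[] args) throws Exception {
        CommitsPerWeek existing = new CommitsPerWeek();   // weekly counts already stored for this repository
        String lastScrapedAt = "2025-01-01T00:00:00Z";    // commit_history_scraped_at, ISO 8601

        GitlabScraper scraper = new GitlabScraper(
            "https://gitlab.example.com/api",             // API base, "/v4" is appended internally
            "group/project",
            lastScrapedAt,
            existing
        );

        // GitLab is asked only for commits since lastScrapedAt (the "since" query parameter).
        CommitsPerWeek updated = scraper.contributions();
        System.out.println(updated.getData());
    }
}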

Main commit scraper class (file name not preserved in this scrape)
@@ -1,7 +1,7 @@
 // SPDX-FileCopyrightText: 2022 - 2024 Ewan Cahen (Netherlands eScience Center) <[email protected]>
 // SPDX-FileCopyrightText: 2022 - 2024 Netherlands eScience Center
-// SPDX-FileCopyrightText: 2022 Christian Meeßen (GFZ) <[email protected]>
-// SPDX-FileCopyrightText: 2022 Helmholtz Centre Potsdam - GFZ German Research Centre for Geosciences
+// SPDX-FileCopyrightText: 2022 - 2025 Christian Meeßen (GFZ) <[email protected]>
+// SPDX-FileCopyrightText: 2022 - 2025 Helmholtz Centre Potsdam - GFZ German Research Centre for Geosciences
 //
 // SPDX-License-Identifier: Apache-2.0

@@ -40,31 +40,35 @@ public static void main(String[] args) {

     private static void scrapeGitLab() {
         PostgrestConnector softwareInfoRepository = new PostgrestConnector(Config.backendBaseUrl() + "/repository_url", CodePlatformProvider.GITLAB);
-        Collection<BasicRepositoryData> dataToScrape = softwareInfoRepository.commitData(Config.maxRequestsGitLab());
+        Collection<BasicRepositoryDataWithHistory> dataToScrape = softwareInfoRepository.commitDataWithHistory(Config.maxRequestsGitLab());
         CompletableFuture<?>[] futures = new CompletableFuture[dataToScrape.size()];
         ZonedDateTime scrapedAt = ZonedDateTime.now();
         int i = 0;
-        for (BasicRepositoryData commitData : dataToScrape) {
+        for (BasicRepositoryDataWithHistory repositoryDataToScrape : dataToScrape) {
             CompletableFuture<Void> future = CompletableFuture.runAsync(() -> {
                 try {
-                    String repoUrl = commitData.url();
+                    String repoUrl = repositoryDataToScrape.url();
+                    String commitHistoryScrapedAt = repositoryDataToScrape.commitHistoryScrapedAt();
                     String hostname = URI.create(repoUrl).getHost();
                     String apiUrl = "https://" + hostname + "/api";
                     String projectPath = repoUrl.replace("https://" + hostname + "/", "");
                     if (projectPath.endsWith("/")) projectPath = projectPath.substring(0, projectPath.length() - 1);

-                    CommitsPerWeek scrapedCommits = new GitlabScraper(apiUrl, projectPath).contributions();
-                    CommitData updatedData = new CommitData(commitData, scrapedCommits, scrapedAt);
+                    CommitsPerWeek existingCommitsPerWeek = repositoryDataToScrape.commitsPerWeek();
+                    CommitsPerWeek scrapedCommits = new GitlabScraper(apiUrl, projectPath, commitHistoryScrapedAt, existingCommitsPerWeek).contributions();
+
+                    BasicRepositoryData basicData = new BasicRepositoryData(repositoryDataToScrape.software(), repositoryDataToScrape.url());
+                    CommitData updatedData = new CommitData(basicData, scrapedCommits, scrapedAt);
                     softwareInfoRepository.saveCommitData(updatedData);
                 } catch (RsdRateLimitException e) {
-                    Utils.saveExceptionInDatabase("GitLab commit scraper", "repository_url", commitData.software(), e);
-                    Utils.saveErrorMessageInDatabase(e.getMessage(), "repository_url", "commit_history_last_error", commitData.software().toString(), "software", null, null);
+                    Utils.saveExceptionInDatabase("GitLab commit scraper", "repository_url", repositoryDataToScrape.software(), e);
+                    Utils.saveErrorMessageInDatabase(e.getMessage(), "repository_url", "commit_history_last_error", repositoryDataToScrape.software().toString(), "software", null, null);
                 } catch (RsdResponseException e) {
-                    Utils.saveExceptionInDatabase("GitLab commit scraper", "repository_url", commitData.software(), e);
-                    Utils.saveErrorMessageInDatabase(e.getMessage(), "repository_url", "commit_history_last_error", commitData.software().toString(), "software", scrapedAt, "commit_history_scraped_at");
+                    Utils.saveExceptionInDatabase("GitLab commit scraper", "repository_url", repositoryDataToScrape.software(), e);
+                    Utils.saveErrorMessageInDatabase(e.getMessage(), "repository_url", "commit_history_last_error", repositoryDataToScrape.software().toString(), "software", scrapedAt, "commit_history_scraped_at");
                 } catch (Exception e) {
-                    Utils.saveExceptionInDatabase("GitLab commit scraper", "repository_url", commitData.software(), e);
-                    Utils.saveErrorMessageInDatabase("Unknown error", "repository_url", "commit_history_last_error", commitData.software().toString(), "software", scrapedAt, "commit_history_scraped_at");
+                    Utils.saveExceptionInDatabase("GitLab commit scraper", "repository_url", repositoryDataToScrape.software(), e);
+                    Utils.saveErrorMessageInDatabase("Unknown error", "repository_url", "commit_history_last_error", repositoryDataToScrape.software().toString(), "software", scrapedAt, "commit_history_scraped_at");
                 }
             });
             futures[i] = future;
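
The loop fills futures with one task per repository; the hunk ends before the futures are awaited. The lines below are an assumption, not part of this commit, sketching the usual way such an array is joined:

// Assumption: not part of the shown diff; a typical way the collected futures are awaited.
CompletableFuture.allOf(futures).join();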

PostgrestConnector.java
@@ -1,5 +1,7 @@
 // SPDX-FileCopyrightText: 2022 - 2024 Ewan Cahen (Netherlands eScience Center) <[email protected]>
 // SPDX-FileCopyrightText: 2022 - 2024 Netherlands eScience Center
+// SPDX-FileCopyrightText: 2025 Christian Meeßen (GFZ) <[email protected]>
+// SPDX-FileCopyrightText: 2025 Helmholtz Centre Potsdam - GFZ German Research Centre for Geosciences
 //
 // SPDX-License-Identifier: Apache-2.0

@@ -12,10 +14,14 @@
 import com.google.gson.JsonParser;
 import nl.esciencecenter.rsd.scraper.Utils;

+import java.time.Instant;
 import java.time.format.DateTimeFormatter;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Objects;
 import java.util.Optional;
+import java.util.SortedMap;
+import java.util.TreeMap;
 import java.util.UUID;

 public class PostgrestConnector {

@@ -51,6 +57,11 @@ public Collection<BasicRepositoryData> commitData(int limit) {
         return parseBasicJsonData(data);
     }

+    public Collection<BasicRepositoryDataWithHistory> commitDataWithHistory(int limit) {
+        String data = Utils.getAsAdmin(backendUrl + "?" + filter + "&select=software,url,commit_history_scraped_at,commit_history&order=commit_history_scraped_at.asc.nullsfirst&limit=" + limit + "&" + Utils.atLeastOneHourAgoFilter("commit_history_scraped_at"));
+        return parseBasicJsonDataWithHistory(data);
+    }
+
     /**
      * Fetch basic data from PostgREST
      *

@@ -67,6 +78,31 @@ public Collection<BasicRepositoryData> contributorData(int limit) {
         return parseBasicJsonData(data);
     }

+    static Collection<BasicRepositoryDataWithHistory> parseBasicJsonDataWithHistory(String data) {
+        JsonArray dataInArray = JsonParser.parseString(data).getAsJsonArray();
+        Collection<BasicRepositoryDataWithHistory> result = new ArrayList<>();
+        for (JsonElement element : dataInArray) {
+            JsonObject jsonObject = element.getAsJsonObject();
+            String softwareUuid = jsonObject.getAsJsonPrimitive("software").getAsString();
+            UUID software = UUID.fromString(softwareUuid);
+            String url = jsonObject.getAsJsonPrimitive("url").getAsString();
+            String commitHistoryScrapedAt = Utils.stringOrNull(jsonObject.get("commit_history_scraped_at"));
+            SortedMap<Instant, Long> commitHistory = null;
+            if (!jsonObject.get("commit_history").isJsonNull()) {
+                JsonObject commitHistoryJsonObject = jsonObject.getAsJsonObject("commit_history");
+                commitHistory = new TreeMap<>();
+                for (String key : commitHistoryJsonObject.keySet()) {
+                    commitHistory.put(Instant.ofEpochSecond(Long.parseLong(key)), commitHistoryJsonObject.getAsJsonPrimitive(key).getAsLong());
+                }
+            }
+            CommitsPerWeek commitsPerWeek = new CommitsPerWeek();
+            commitsPerWeek.setData(commitHistory);
+
+            result.add(new BasicRepositoryDataWithHistory(software, url, commitHistoryScrapedAt, commitsPerWeek));
+        }
+        return result;
+    }
+
     static Collection<BasicRepositoryData> parseBasicJsonData(String data) {
         JsonArray dataInArray = JsonParser.parseString(data).getAsJsonArray();
         Collection<BasicRepositoryData> result = new ArrayList<>();
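
parseBasicJsonDataWithHistory expects rows in the shape selected by commitDataWithHistory, where commit_history is a JSON object keyed by epoch seconds (the same shape the Gson serializer in CommitsPerWeek writes). A self-contained sketch with a fabricated row; every value is invented and the sample deliberately uses a non-null commit_history:

// Sketch with a fabricated PostgREST row; the UUID, URL, timestamp and counts are invented.
// Assumes the class sits in the same package as PostgrestConnector (the parser is package-private).
class ParseWithHistoryExample {
    public static void main(String[] args) {
        String sampleRow = """
            [{
                "software": "9e2ccc1b-16b0-4c3f-8d7f-55c5dfc2b0f1",
                "url": "https://gitlab.example.com/group/project",
                "commit_history_scraped_at": "2025-01-01T00:00:00Z",
                "commit_history": {"1735430400": 4, "1736035200": 7}
            }]
            """;

        for (BasicRepositoryDataWithHistory row : PostgrestConnector.parseBasicJsonDataWithHistory(sampleRow)) {
            // commit_history keys are epoch seconds of the Sunday that starts each week.
            System.out.println(row.url() + " -> " + row.commitsPerWeek().getData());
        }
    }
}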

CommitsPerWeekTest.java
@@ -1,5 +1,7 @@
 // SPDX-FileCopyrightText: 2023 - 2024 Ewan Cahen (Netherlands eScience Center) <[email protected]>
 // SPDX-FileCopyrightText: 2023 - 2024 Netherlands eScience Center
+// SPDX-FileCopyrightText: 2025 Christian Meeßen (GFZ) <[email protected]>
+// SPDX-FileCopyrightText: 2025 Helmholtz Centre Potsdam - GFZ German Research Centre for Geosciences
 //
 // SPDX-License-Identifier: Apache-2.0

@@ -22,18 +24,20 @@ class CommitsPerWeekTest {
     @Test
     void givenInstance_whenValidOperations_thenCorrectResults() {
         CommitsPerWeek commitsPerWeek = new CommitsPerWeek();
-        Map<Instant, Long> underlyingMap = commitsPerWeek.data;
+        Map<Instant, Long> underlyingMap;

         Instant sundayMidnight1 = Instant.ofEpochSecond(1670716800);
         commitsPerWeek.addCommits(sundayMidnight1, 10);

+        underlyingMap = commitsPerWeek.getData();
         Assertions.assertEquals(1, underlyingMap.size());
         Assertions.assertTrue(underlyingMap.containsKey(sundayMidnight1));
         Assertions.assertEquals(10, underlyingMap.get(sundayMidnight1));


         commitsPerWeek.addCommits(sundayMidnight1, 20);

+        underlyingMap = commitsPerWeek.getData();
         Assertions.assertEquals(1, underlyingMap.size());
         Assertions.assertTrue(underlyingMap.containsKey(sundayMidnight1));
         Assertions.assertEquals(30, underlyingMap.get(sundayMidnight1));

@@ -43,6 +47,7 @@ void givenInstance_whenValidOperations_thenCorrectResults() {
             .plus(Duration.ofSeconds(12345));
         commitsPerWeek.addCommits(smallTimeAfterSundayMidnight1, 10);

+        underlyingMap = commitsPerWeek.getData();
         Assertions.assertEquals(1, underlyingMap.size());
         Assertions.assertTrue(underlyingMap.containsKey(sundayMidnight1));
         Assertions.assertEquals(40, underlyingMap.get(sundayMidnight1));

@@ -51,13 +56,15 @@ void givenInstance_whenValidOperations_thenCorrectResults() {
         Instant sundayMidnight2 = sundayMidnight1.plus(Period.ofWeeks(5));
         commitsPerWeek.addCommits(sundayMidnight2, 5);

+        underlyingMap = commitsPerWeek.getData();
         Assertions.assertEquals(2, underlyingMap.size());
         Assertions.assertTrue(underlyingMap.containsKey(sundayMidnight2));
         Assertions.assertEquals(5, underlyingMap.get(sundayMidnight2));
         Assertions.assertEquals(40, underlyingMap.get(sundayMidnight1));


         commitsPerWeek.addMissingZeros();
+        underlyingMap = commitsPerWeek.getData();
         Assertions.assertEquals(6, underlyingMap.size());
         Assertions.assertEquals(0, underlyingMap.get(sundayMidnight1.plus(Period.ofWeeks(1))));
         Assertions.assertEquals(0, underlyingMap.get(sundayMidnight1.plus(Period.ofWeeks(2))));
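
For reference, the epoch second 1670716800 used throughout the test is Sunday 2022-12-11T00:00:00Z, and the final assertions expect addMissingZeros() to give every intermediate week an explicit zero. A small sketch spelling out that expectation:

// Sketch spelling out the expected buckets after addMissingZeros() in the test above.
// 1670716800 is Sunday 2022-12-11T00:00:00Z; consecutive keys differ by one week.
import java.time.Instant;
import java.time.Period;

class AddMissingZerosExpectationSketch {
    public static void main(String[] args) {
        Instant sundayMidnight1 = Instant.ofEpochSecond(1670716800L);
        for (int week = 0; week <= 5; week++) {
            long expected = (week == 0) ? 40 : (week == 5) ? 5 : 0;
            System.out.println(sundayMidnight1.plus(Period.ofWeeks(week)) + " -> " + expected);
        }
    }
}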