Skip to content

Commit

Permalink
fix: optimise GitLab commit history scraper
Browse files Browse the repository at this point in the history
Scrape only the commits made since the last scrape, using the stored timestamp of when the commit history was last scraped.
  • Loading branch information
cmeessen committed Jan 24, 2025
1 parent d55cdd1 commit bf51083
Show file tree
Hide file tree
Showing 6 changed files with 102 additions and 18 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
// SPDX-FileCopyrightText: 2025 Christian Meeßen (GFZ) <[email protected]>
// SPDX-FileCopyrightText: 2025 Helmholtz Centre Potsdam - GFZ German Research Centre for Geosciences
//
// SPDX-License-Identifier: Apache-2.0

package nl.esciencecenter.rsd.scraper.git;

import java.util.UUID;

/**
 * Basic repository data together with the state of an earlier commit history scrape.
 *
 * @param software               the UUID of the software entry the repository belongs to
 * @param url                    the repository URL
 * @param commitHistoryScrapedAt the stored {@code commit_history_scraped_at} value as a string;
 *                               presumably {@code null} when the history was never scraped
 * @param commitsPerWeek         the commit counts per week that were already stored
 */
public record BasicRepositoryDataWithHistory(
    UUID software,
    String url,
    String commitHistoryScrapedAt,
    CommitsPerWeek commitsPerWeek
) {
}
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
// SPDX-FileCopyrightText: 2023 Ewan Cahen (Netherlands eScience Center) <[email protected]>
// SPDX-FileCopyrightText: 2023 Netherlands eScience Center
// SPDX-FileCopyrightText: 2025 Christian Meeßen (GFZ) <[email protected]>
// SPDX-FileCopyrightText: 2025 Helmholtz Centre Potsdam - GFZ German Research Centre for Geosciences
//
// SPDX-License-Identifier: Apache-2.0

Expand All @@ -26,12 +28,21 @@
*/
public class CommitsPerWeek {

final SortedMap<Instant, Long> data = new TreeMap<>();
private final SortedMap<Instant, Long> data = new TreeMap<>();
static final Gson gson = new GsonBuilder()
.enableComplexMapKeySerialization()
.registerTypeAdapter(Instant.class, (JsonSerializer<Instant>) (src, typeOfSrc, context) -> new JsonPrimitive(src.getEpochSecond()))
.create();

/**
 * Returns a copy of the stored commit counts, so that changes made by the caller
 * to the returned map do not affect this instance.
 *
 * @return a new sorted map holding the current entries
 */
public SortedMap<Instant, Long> getData() {
    SortedMap<Instant, Long> snapshot = new TreeMap<>(data);
    return snapshot;
}

/**
 * Replaces the stored commit counts with a copy of the entries of the given map.
 *
 * @param data the new entries; {@code null} is treated as "no data" and leaves this
 *             instance empty instead of failing in {@code putAll}
 */
public void setData(SortedMap<Instant, Long> data) {
    this.data.clear();
    // TreeMap.putAll throws a NullPointerException for a null argument, and callers
    // pass null when no commit history has been stored yet.
    if (data != null) {
        this.data.putAll(data);
    }
}

public void addCommits(ZonedDateTime zonedDateTime, long count) {
ZonedDateTime utcTime = zonedDateTime.withZoneSameInstant(ZoneOffset.UTC);
Instant sundayMidnight = utcTime.truncatedTo(ChronoUnit.DAYS).with(TemporalAdjusters.previousOrSame(DayOfWeek.SUNDAY)).toInstant();
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// SPDX-FileCopyrightText: 2022 - 2024 Christian Meeßen (GFZ) <[email protected]>
// SPDX-FileCopyrightText: 2022 - 2024 Ewan Cahen (Netherlands eScience Center) <[email protected]>
// SPDX-FileCopyrightText: 2022 - 2024 Helmholtz Centre Potsdam - GFZ German Research Centre for Geosciences
// SPDX-FileCopyrightText: 2022 - 2024 Netherlands eScience Center
// SPDX-FileCopyrightText: 2022 - 2025 Christian Meeßen (GFZ) <[email protected]>
// SPDX-FileCopyrightText: 2022 - 2025 Helmholtz Centre Potsdam - GFZ German Research Centre for Geosciences
//
// SPDX-License-Identifier: Apache-2.0

Expand All @@ -26,6 +26,8 @@
public class GitlabScraper implements GitScraper {
private final String projectPath;
private final String apiUri;
private final String commitHistoryScrapedAt;
private final CommitsPerWeek existingCommitsPerWeek = new CommitsPerWeek();

/**
* A GitLab scraper for API version 4.
Expand All @@ -36,6 +38,14 @@ public class GitlabScraper implements GitScraper {
public GitlabScraper(String gitLabApiUrl, String projectPath) {
    // Drop a trailing ".git" suffix from the project path, if present.
    String pathWithoutSuffix = projectPath;
    if (pathWithoutSuffix.endsWith(".git")) {
        pathWithoutSuffix = pathWithoutSuffix.substring(0, pathWithoutSuffix.length() - 4);
    }
    this.projectPath = pathWithoutSuffix;
    this.apiUri = gitLabApiUrl + "/v4";
    // No timestamp of an earlier scrape is known for this constructor.
    this.commitHistoryScrapedAt = null;
}

/**
 * A GitLab scraper for API version 4 that carries over the state of an earlier commit history scrape.
 *
 * @param gitLabApiUrl           the API base URL, to which {@code "/v4"} is appended
 * @param projectPath            the project path; a trailing {@code ".git"} is removed
 * @param commitHistoryScrapedAt the timestamp of the earlier scrape, or {@code null} if there is none
 * @param existingCommitsPerWeek the commit counts that were already stored, or {@code null};
 *                               its data is copied into this scraper
 */
public GitlabScraper(String gitLabApiUrl, String projectPath, String commitHistoryScrapedAt, CommitsPerWeek existingCommitsPerWeek) {
    // Drop a trailing ".git" suffix from the project path, if present.
    String pathWithoutSuffix = projectPath;
    if (pathWithoutSuffix.endsWith(".git")) {
        pathWithoutSuffix = pathWithoutSuffix.substring(0, pathWithoutSuffix.length() - 4);
    }
    this.projectPath = pathWithoutSuffix;
    this.apiUri = gitLabApiUrl + "/v4";
    this.commitHistoryScrapedAt = commitHistoryScrapedAt;
    if (existingCommitsPerWeek != null) {
        this.existingCommitsPerWeek.setData(existingCommitsPerWeek.getData());
    }
}

/**
Expand Down Expand Up @@ -81,12 +91,17 @@ public String languages() throws IOException, InterruptedException, RsdResponseE
@Override
public CommitsPerWeek contributions() throws IOException, InterruptedException, RsdResponseException {
CommitsPerWeek commits = new CommitsPerWeek();

String since="";
if (commitHistoryScrapedAt != null) {
since = "&since=" + commitHistoryScrapedAt;
}
String page = "1";
boolean done = false;
while (!done) {
HttpRequest request = HttpRequest.newBuilder().GET()
.uri(URI.create(apiUri + "/projects/" + Utils.urlEncode(projectPath)
+ "/repository/commits?per_page=100&order=default&page=" + page))
+ "/repository/commits?per_page=100&order=default&page=" + page + since))
.timeout(Duration.ofSeconds(30))
.build();
HttpResponse<String> response;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// SPDX-FileCopyrightText: 2022 - 2024 Ewan Cahen (Netherlands eScience Center) <[email protected]>
// SPDX-FileCopyrightText: 2022 - 2024 Netherlands eScience Center
// SPDX-FileCopyrightText: 2022 Christian Meeßen (GFZ) <[email protected]>
// SPDX-FileCopyrightText: 2022 Helmholtz Centre Potsdam - GFZ German Research Centre for Geosciences
// SPDX-FileCopyrightText: 2022 - 2025 Christian Meeßen (GFZ) <[email protected]>
// SPDX-FileCopyrightText: 2022 - 2025 Helmholtz Centre Potsdam - GFZ German Research Centre for Geosciences
//
// SPDX-License-Identifier: Apache-2.0

Expand Down Expand Up @@ -40,31 +40,35 @@ public static void main(String[] args) {

private static void scrapeGitLab() {
PostgrestConnector softwareInfoRepository = new PostgrestConnector(Config.backendBaseUrl() + "/repository_url", CodePlatformProvider.GITLAB);
Collection<BasicRepositoryData> dataToScrape = softwareInfoRepository.commitData(Config.maxRequestsGitLab());
Collection<BasicRepositoryDataWithHistory> dataToScrape = softwareInfoRepository.commitDataWithHistory(Config.maxRequestsGitLab());
CompletableFuture<?>[] futures = new CompletableFuture[dataToScrape.size()];
ZonedDateTime scrapedAt = ZonedDateTime.now();
int i = 0;
for (BasicRepositoryData commitData : dataToScrape) {
for (BasicRepositoryDataWithHistory repositoryDataToScrape : dataToScrape) {
CompletableFuture<Void> future = CompletableFuture.runAsync(() -> {
try {
String repoUrl = commitData.url();
String repoUrl = repositoryDataToScrape.url();
String commitHistoryScrapedAt = repositoryDataToScrape.commitHistoryScrapedAt();
String hostname = URI.create(repoUrl).getHost();
String apiUrl = "https://" + hostname + "/api";
String projectPath = repoUrl.replace("https://" + hostname + "/", "");
if (projectPath.endsWith("/")) projectPath = projectPath.substring(0, projectPath.length() - 1);

CommitsPerWeek existingCommitsPerWeek = repositoryDataToScrape.commitsPerWeek();
CommitsPerWeek scrapedCommits = new GitlabScraper(apiUrl, projectPath, commitHistoryScrapedAt, existingCommitsPerWeek).contributions();

CommitsPerWeek scrapedCommits = new GitlabScraper(apiUrl, projectPath).contributions();
CommitData updatedData = new CommitData(commitData, scrapedCommits, scrapedAt);
BasicRepositoryData basicData = new BasicRepositoryData(repositoryDataToScrape.software(), repositoryDataToScrape.url());
CommitData updatedData = new CommitData(basicData, scrapedCommits, scrapedAt);
softwareInfoRepository.saveCommitData(updatedData);
} catch (RsdRateLimitException e) {
Utils.saveExceptionInDatabase("GitLab commit scraper", "repository_url", commitData.software(), e);
Utils.saveErrorMessageInDatabase(e.getMessage(), "repository_url", "commit_history_last_error", commitData.software().toString(), "software", null, null);
Utils.saveExceptionInDatabase("GitLab commit scraper", "repository_url", repositoryDataToScrape.software(), e);
Utils.saveErrorMessageInDatabase(e.getMessage(), "repository_url", "commit_history_last_error", repositoryDataToScrape.software().toString(), "software", null, null);
} catch (RsdResponseException e) {
Utils.saveExceptionInDatabase("GitLab commit scraper", "repository_url", commitData.software(), e);
Utils.saveErrorMessageInDatabase(e.getMessage(), "repository_url", "commit_history_last_error", commitData.software().toString(), "software", scrapedAt, "commit_history_scraped_at");
Utils.saveExceptionInDatabase("GitLab commit scraper", "repository_url", repositoryDataToScrape.software(), e);
Utils.saveErrorMessageInDatabase(e.getMessage(), "repository_url", "commit_history_last_error", repositoryDataToScrape.software().toString(), "software", scrapedAt, "commit_history_scraped_at");
} catch (Exception e) {
Utils.saveExceptionInDatabase("GitLab commit scraper", "repository_url", commitData.software(), e);
Utils.saveErrorMessageInDatabase("Unknown error", "repository_url", "commit_history_last_error", commitData.software().toString(), "software", scrapedAt, "commit_history_scraped_at");
Utils.saveExceptionInDatabase("GitLab commit scraper", "repository_url", repositoryDataToScrape.software(), e);
Utils.saveErrorMessageInDatabase("Unknown error", "repository_url", "commit_history_last_error", repositoryDataToScrape.software().toString(), "software", scrapedAt, "commit_history_scraped_at");
}
});
futures[i] = future;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
// SPDX-FileCopyrightText: 2022 - 2024 Ewan Cahen (Netherlands eScience Center) <[email protected]>
// SPDX-FileCopyrightText: 2022 - 2024 Netherlands eScience Center
// SPDX-FileCopyrightText: 2025 Christian Meeßen (GFZ) <[email protected]>
// SPDX-FileCopyrightText: 2025 Helmholtz Centre Potsdam - GFZ German Research Centre for Geosciences
//
// SPDX-License-Identifier: Apache-2.0

Expand All @@ -12,10 +14,14 @@
import com.google.gson.JsonParser;
import nl.esciencecenter.rsd.scraper.Utils;

import java.time.Instant;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Objects;
import java.util.Optional;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.UUID;

public class PostgrestConnector {
Expand Down Expand Up @@ -51,6 +57,11 @@ public Collection<BasicRepositoryData> commitData(int limit) {
return parseBasicJsonData(data);
}

/**
 * Fetches repository data including the stored commit history and the time it was last scraped.
 * Rows are ordered by {@code commit_history_scraped_at} ascending with nulls first.
 *
 * @param limit the maximum number of rows to request
 * @return the parsed rows
 */
public Collection<BasicRepositoryDataWithHistory> commitDataWithHistory(int limit) {
    String query = "?" + filter
        + "&select=software,url,commit_history_scraped_at,commit_history"
        + "&order=commit_history_scraped_at.asc.nullsfirst"
        + "&limit=" + limit
        + "&" + Utils.atLeastOneHourAgoFilter("commit_history_scraped_at");
    String responseBody = Utils.getAsAdmin(backendUrl + query);
    return parseBasicJsonDataWithHistory(responseBody);
}

/**
* Fetch basic data from PostgREST
*
Expand All @@ -67,6 +78,31 @@ public Collection<BasicRepositoryData> contributorData(int limit) {
return parseBasicJsonData(data);
}

/**
 * Parses a JSON array of repository rows into {@link BasicRepositoryDataWithHistory} records.
 * <p>
 * Each row must contain {@code software} (a UUID string) and {@code url}. The
 * {@code commit_history} field, when present and not JSON null, must be an object whose
 * keys are epoch seconds and whose values are commit counts. A missing or null
 * {@code commit_history} results in an empty {@link CommitsPerWeek}.
 *
 * @param data the JSON array as a string
 * @return one record per array element, in array order
 */
static Collection<BasicRepositoryDataWithHistory> parseBasicJsonDataWithHistory(String data) {
    JsonArray dataInArray = JsonParser.parseString(data).getAsJsonArray();
    Collection<BasicRepositoryDataWithHistory> result = new ArrayList<>();
    for (JsonElement element : dataInArray) {
        JsonObject jsonObject = element.getAsJsonObject();
        String softwareUuid = jsonObject.getAsJsonPrimitive("software").getAsString();
        UUID software = UUID.fromString(softwareUuid);
        String url = jsonObject.getAsJsonPrimitive("url").getAsString();
        String commitHistoryScrapedAt = Utils.stringOrNull(jsonObject.get("commit_history_scraped_at"));

        // Start with an empty map: repositories without stored history must not hand
        // null to CommitsPerWeek.setData, which would fail and abort the whole batch.
        SortedMap<Instant, Long> commitHistory = new TreeMap<>();
        JsonElement commitHistoryElement = jsonObject.get("commit_history");
        if (commitHistoryElement != null && !commitHistoryElement.isJsonNull()) {
            JsonObject commitHistoryJsonObject = jsonObject.getAsJsonObject("commit_history");
            for (String key : commitHistoryJsonObject.keySet()) {
                commitHistory.put(Instant.ofEpochSecond(Long.parseLong(key)), commitHistoryJsonObject.getAsJsonPrimitive(key).getAsLong());
            }
        }
        CommitsPerWeek commitsPerWeek = new CommitsPerWeek();
        commitsPerWeek.setData(commitHistory);

        result.add(new BasicRepositoryDataWithHistory(software, url, commitHistoryScrapedAt, commitsPerWeek));
    }
    return result;
}

static Collection<BasicRepositoryData> parseBasicJsonData(String data) {
JsonArray dataInArray = JsonParser.parseString(data).getAsJsonArray();
Collection<BasicRepositoryData> result = new ArrayList<>();
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
// SPDX-FileCopyrightText: 2023 - 2024 Ewan Cahen (Netherlands eScience Center) <[email protected]>
// SPDX-FileCopyrightText: 2023 - 2024 Netherlands eScience Center
// SPDX-FileCopyrightText: 2025 Christian Meeßen (GFZ) <[email protected]>
// SPDX-FileCopyrightText: 2025 Helmholtz Centre Potsdam - GFZ German Research Centre for Geosciences
//
// SPDX-License-Identifier: Apache-2.0

Expand All @@ -22,18 +24,20 @@ class CommitsPerWeekTest {
@Test
void givenInstance_whenValidOperations_thenCorrectResults() {
CommitsPerWeek commitsPerWeek = new CommitsPerWeek();
Map<Instant, Long> underlyingMap = commitsPerWeek.data;
Map<Instant, Long> underlyingMap;

Instant sundayMidnight1 = Instant.ofEpochSecond(1670716800);
commitsPerWeek.addCommits(sundayMidnight1, 10);

underlyingMap = commitsPerWeek.getData();
Assertions.assertEquals(1, underlyingMap.size());
Assertions.assertTrue(underlyingMap.containsKey(sundayMidnight1));
Assertions.assertEquals(10, underlyingMap.get(sundayMidnight1));


commitsPerWeek.addCommits(sundayMidnight1, 20);

underlyingMap = commitsPerWeek.getData();
Assertions.assertEquals(1, underlyingMap.size());
Assertions.assertTrue(underlyingMap.containsKey(sundayMidnight1));
Assertions.assertEquals(30, underlyingMap.get(sundayMidnight1));
Expand All @@ -43,6 +47,7 @@ void givenInstance_whenValidOperations_thenCorrectResults() {
.plus(Duration.ofSeconds(12345));
commitsPerWeek.addCommits(smallTimeAfterSundayMidnight1, 10);

underlyingMap = commitsPerWeek.getData();
Assertions.assertEquals(1, underlyingMap.size());
Assertions.assertTrue(underlyingMap.containsKey(sundayMidnight1));
Assertions.assertEquals(40, underlyingMap.get(sundayMidnight1));
Expand All @@ -51,13 +56,15 @@ void givenInstance_whenValidOperations_thenCorrectResults() {
Instant sundayMidnight2 = sundayMidnight1.plus(Period.ofWeeks(5));
commitsPerWeek.addCommits(sundayMidnight2, 5);

underlyingMap = commitsPerWeek.getData();
Assertions.assertEquals(2, underlyingMap.size());
Assertions.assertTrue(underlyingMap.containsKey(sundayMidnight2));
Assertions.assertEquals(5, underlyingMap.get(sundayMidnight2));
Assertions.assertEquals(40, underlyingMap.get(sundayMidnight1));


commitsPerWeek.addMissingZeros();
underlyingMap = commitsPerWeek.getData();
Assertions.assertEquals(6, underlyingMap.size());
Assertions.assertEquals(0, underlyingMap.get(sundayMidnight1.plus(Period.ofWeeks(1))));
Assertions.assertEquals(0, underlyingMap.get(sundayMidnight1.plus(Period.ofWeeks(2))));
Expand Down

0 comments on commit bf51083

Please sign in to comment.