Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: optimise GitLab commit history scraper #1374

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
// SPDX-FileCopyrightText: 2023 - 2024 Dusan Mijatovic (Netherlands eScience Center)
// SPDX-FileCopyrightText: 2023 - 2024 Netherlands eScience Center
// SPDX-FileCopyrightText: 2024 Ewan Cahen (Netherlands eScience Center) <[email protected]>
// SPDX-FileCopyrightText: 2025 Christian Meeßen (GFZ) <[email protected]>
// SPDX-FileCopyrightText: 2025 Helmholtz Centre Potsdam - GFZ German Research Centre for Geosciences
//
// SPDX-License-Identifier: Apache-2.0

Expand All @@ -14,6 +16,10 @@ import HelpOutlineIcon from '@mui/icons-material/HelpOutline'
import ScheduleIcon from '@mui/icons-material/Schedule'
import DoDisturbOnIcon from '@mui/icons-material/DoDisturbOn'
import {CodePlatform} from '~/types/SoftwareTypes'
import IconButton from '@mui/material/IconButton'
import ListItemSecondaryAction from '@mui/material/ListItemSecondaryAction'
import DeleteIcon from '@mui/icons-material/Delete'
import {deleteServiceDataFromDb} from './apiSoftwareServices'

type ServiceInfoListItemProps={
readonly title:string
Expand All @@ -22,9 +28,16 @@ type ServiceInfoListItemProps={
readonly url: string|null
readonly platform: CodePlatform|null
readonly scraping_disabled_reason: string|null
readonly dbprops: string[]
}

export function ServiceInfoListItem({title,scraped_at,last_error,url,platform,scraping_disabled_reason}:ServiceInfoListItemProps){
// Lookup from a service title to an action label.
// NOTE(review): this map is not referenced in the visible part of the component,
// and 'delete prog' differs from the other two values - confirm the intended use.
const deleteAction = {
'Commit history': 'delete',
'Programming languages': 'delete prog',
'Repository statistics': 'delete'
}

export function ServiceInfoListItem({title,scraped_at,last_error,url,platform,scraping_disabled_reason,dbprops}:ServiceInfoListItemProps){
let status:'error'|'success'|'not_active'|'scheduled'|'not_supported' = 'not_active'

// set service status
Expand Down Expand Up @@ -79,6 +92,13 @@ export function ServiceInfoListItem({title,scraped_at,last_error,url,platform,sc
)
}

// Placeholder for clearing the scraped data of this service.
// It walks over every database property of the service but does nothing yet,
// because the actual delete request is still commented out.
async function clearServiceData() {
  for (const dbProp of dbprops) {
    // const resp = await deleteServiceDataFromDb({})
  }
}

return (
<ListItem
data-testid="software-service-item"
Expand All @@ -101,6 +121,17 @@ export function ServiceInfoListItem({title,scraped_at,last_error,url,platform,sc
primary={title}
secondary={getStatusMsg()}
/>

<ListItemSecondaryAction>
<IconButton
onClick={() => {}}
aria-label="delete"
size="large"
>
<DeleteIcon />
</IconButton>
</ListItemSecondaryAction>

</ListItem>
)
}
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
// SPDX-FileCopyrightText: 2023 - 2024 Dusan Mijatovic (Netherlands eScience Center)
// SPDX-FileCopyrightText: 2023 - 2024 Netherlands eScience Center
// SPDX-FileCopyrightText: 2024 Ewan Cahen (Netherlands eScience Center) <[email protected]>
// SPDX-FileCopyrightText: 2025 Christian Meeßen (GFZ) <[email protected]>
// SPDX-FileCopyrightText: 2025 Helmholtz Centre Potsdam - GFZ German Research Centre for Geosciences
//
// SPDX-License-Identifier: Apache-2.0

Expand Down Expand Up @@ -30,7 +32,8 @@ export default function SoftwareRepoServices() {
scraped_at: services ? services[service.props.scraped_at] : null,
last_error: services ? services[service.props.last_error] : null,
url: services ? services[service.props.url] : null,
platform: services ? services['code_platform'] : null
platform: services ? services['code_platform'] : null,
dbprops: service.dbprops
}
return (
<ServiceInfoListItem key={service.name} scraping_disabled_reason={null} {...props} />
Expand Down
32 changes: 32 additions & 0 deletions frontend/components/software/edit/services/apiSoftwareServices.tsx
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
// SPDX-FileCopyrightText: 2023 - 2024 Dusan Mijatovic (Netherlands eScience Center)
// SPDX-FileCopyrightText: 2023 - 2024 Netherlands eScience Center
// SPDX-FileCopyrightText: 2024 Ewan Cahen (Netherlands eScience Center) <[email protected]>
// SPDX-FileCopyrightText: 2025 Christian Meeßen (GFZ) <[email protected]>
// SPDX-FileCopyrightText: 2025 Helmholtz Centre Potsdam - GFZ German Research Centre for Geosciences
//
// SPDX-License-Identifier: Apache-2.0

Expand Down Expand Up @@ -160,3 +162,33 @@ export function useSoftwareServices(){
services
}
}

/**
 * Clear one column of a repository_url record by patching it to null.
 *
 * @param dbprop name of the repository_url column to set to null
 * @param software value matched against the `software` column (software=eq.<value>)
 * @param token passed to createJsonHeaders to build the request headers
 * @returns an object with the response status and a message; errors raised while
 * building or sending the request are caught, logged and returned with status 500
 */
export async function deleteServiceDataFromDb({dbprop, software, token}:
  {dbprop:string, software: string, token:string}){
  try {
    const query = `repository_url?software=eq.${encodeURIComponent(software)}`
    const url = `${getBaseUrl()}/${query}`
    const resp = await fetch(url, {
      method: 'PATCH',
      headers: {
        ...createJsonHeaders(token),
        'Content-Type': 'application/json'
      },
      // computed key: patch the column named by dbprop,
      // not a (non-existing) column that is literally called "dbprop"
      body: JSON.stringify({
        [dbprop]: null
      })
    })
    // accept every 2xx status: a successful PATCH is normally answered
    // with 204 (or 200 when a representation is requested), not with 201
    if (resp.ok) {
      return {
        status: resp.status,
        message: `${dbprop} cleared from database.`
      }
    }
    // always report failures instead of implicitly returning undefined
    const failure = resp.statusText || `Failed to clear ${dbprop} from database.`
    logger(`deleteServiceDataFromDb: ${resp.status} ${failure}`, 'error')
    return {
      status: resp.status,
      message: failure
    }
  } catch (e: unknown) {
    const message = e instanceof Error ? e.message : String(e)
    logger(`deleteServiceDataFromDb: ${message}`, 'error')
    return {
      status: 500,
      message
    }
  }
}
14 changes: 10 additions & 4 deletions frontend/components/software/edit/services/config.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
// SPDX-FileCopyrightText: 2023 - 2024 Dusan Mijatovic (Netherlands eScience Center)
// SPDX-FileCopyrightText: 2023 - 2024 Netherlands eScience Center
// SPDX-FileCopyrightText: 2025 Christian Meeßen (GFZ) <[email protected]>
// SPDX-FileCopyrightText: 2025 Helmholtz Centre Potsdam - GFZ German Research Centre for Geosciences
//
// SPDX-License-Identifier: Apache-2.0

Expand All @@ -12,21 +14,25 @@ type ServiceListProps={
scraped_at: keyof SoftwareServices,
last_error: keyof SoftwareServices,
url: keyof SoftwareServices
}
},
dbprops: string[]
}

export const repoServiceList:ServiceListProps[]=[{
name: 'Commit history',
desc: 'Information is extracted from the repository api (github/gitlab)',
props:{scraped_at:'commit_history_scraped_at',last_error:'commit_history_last_error',url:'url'}
props:{scraped_at:'commit_history_scraped_at',last_error:'commit_history_last_error',url:'url'},
dbprops: ['commit_history']
},{
name: 'Programming languages',
desc: 'Information is extracted from the repository api (github/gitlab)',
props:{scraped_at:'languages_scraped_at',last_error:'languages_last_error',url:'url'}
props:{scraped_at:'languages_scraped_at',last_error:'languages_last_error',url:'url'},
dbprops: ['languages']
},{
name: 'Repository statistics',
desc: 'Information is extracted from the repository api (github/gitlab)',
props:{scraped_at:'basic_data_scraped_at',last_error:'basic_data_last_error',url:'url'}
props:{scraped_at:'basic_data_scraped_at',last_error:'basic_data_last_error',url:'url'},
dbprops: ['star_count', 'fork_count', 'open_issue_count', 'contributor_count']
}]


Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
// SPDX-FileCopyrightText: 2025 Christian Meeßen (GFZ) <[email protected]>
// SPDX-FileCopyrightText: 2025 Helmholtz Centre Potsdam - GFZ German Research Centre for Geosciences
//
// SPDX-License-Identifier: Apache-2.0

package nl.esciencecenter.rsd.scraper.git;

import java.util.UUID;

/**
 * Immutable carrier for one repository entry together with its previously stored commit history.
 *
 * @param software               UUID of the software entry
 * @param url                    URL of the repository
 * @param commitHistoryScrapedAt moment of the last commit history scrape, kept as the raw string
 *                               (NOTE(review): presumably null when never scraped - confirm with the producer)
 * @param commitsPerWeek         commit counts that accompany this entry
 */
public record BasicRepositoryDataWithHistory(UUID software, String url, String commitHistoryScrapedAt, CommitsPerWeek commitsPerWeek) {
}
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
// SPDX-FileCopyrightText: 2023 Ewan Cahen (Netherlands eScience Center) <[email protected]>
// SPDX-FileCopyrightText: 2023 Netherlands eScience Center
// SPDX-FileCopyrightText: 2025 Christian Meeßen (GFZ) <[email protected]>
// SPDX-FileCopyrightText: 2025 Helmholtz Centre Potsdam - GFZ German Research Centre for Geosciences
//
// SPDX-License-Identifier: Apache-2.0

Expand All @@ -26,12 +28,23 @@
*/
public class CommitsPerWeek {

final SortedMap<Instant, Long> data = new TreeMap<>();
private final SortedMap<Instant, Long> data = new TreeMap<>();
static final Gson gson = new GsonBuilder()
.enableComplexMapKeySerialization()
.registerTypeAdapter(Instant.class, (JsonSerializer<Instant>) (src, typeOfSrc, context) -> new JsonPrimitive(src.getEpochSecond()))
.create();

/**
 * Returns a copy of the stored data, so callers cannot modify the internal map.
 *
 * @return a new sorted map holding the same entries as the internal one
 */
public SortedMap<Instant, Long> getData() {
    SortedMap<Instant, Long> snapshot = new TreeMap<>(data);
    return snapshot;
}

/**
 * Replaces the stored entries with the entries of the given map.
 * A {@code null} argument is ignored and leaves the current content untouched.
 *
 * @param data the entries to copy into this object, may be {@code null}
 */
public void setData(SortedMap<Instant, Long> data) {
    if (data == null) {
        return;
    }
    // copy the entries instead of keeping a reference to the caller's map
    this.data.clear();
    this.data.putAll(data);
}

public void addCommits(ZonedDateTime zonedDateTime, long count) {
ZonedDateTime utcTime = zonedDateTime.withZoneSameInstant(ZoneOffset.UTC);
Instant sundayMidnight = utcTime.truncatedTo(ChronoUnit.DAYS).with(TemporalAdjusters.previousOrSame(DayOfWeek.SUNDAY)).toInstant();
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// SPDX-FileCopyrightText: 2022 - 2024 Christian Meeßen (GFZ) <[email protected]>
// SPDX-FileCopyrightText: 2022 - 2024 Ewan Cahen (Netherlands eScience Center) <[email protected]>
// SPDX-FileCopyrightText: 2022 - 2024 Helmholtz Centre Potsdam - GFZ German Research Centre for Geosciences
// SPDX-FileCopyrightText: 2022 - 2024 Netherlands eScience Center
// SPDX-FileCopyrightText: 2022 - 2025 Christian Meeßen (GFZ) <[email protected]>
// SPDX-FileCopyrightText: 2022 - 2025 Helmholtz Centre Potsdam - GFZ German Research Centre for Geosciences
//
// SPDX-License-Identifier: Apache-2.0

Expand All @@ -26,6 +26,8 @@
public class GitlabScraper implements GitScraper {
private final String projectPath;
private final String apiUri;
private final String commitHistoryScrapedAt;
private final CommitsPerWeek existingCommitsPerWeek = new CommitsPerWeek();

/**
* A GitLab scraper for API version 4.
Expand All @@ -36,6 +38,14 @@ public class GitlabScraper implements GitScraper {
public GitlabScraper(String gitLabApiUrl, String projectPath) {
    // no previous scrape information: delegate with neither a timestamp nor existing commit data,
    // which leaves commitHistoryScrapedAt null and existingCommitsPerWeek empty
    this(gitLabApiUrl, projectPath, null, null);
}

/**
 * Creates a scraper that additionally knows when the commit history was last scraped
 * and which commit data was already collected.
 *
 * @param gitLabApiUrl           base API URL, "/v4" is appended to it
 * @param projectPath            path of the project, a trailing ".git" is removed
 * @param commitHistoryScrapedAt value stored as the last scrape moment, may be {@code null}
 * @param existingCommitsPerWeek previously collected commits, may be {@code null};
 *                               its data is copied into this scraper
 */
public GitlabScraper(String gitLabApiUrl, String projectPath, String commitHistoryScrapedAt, CommitsPerWeek existingCommitsPerWeek) {
    String gitSuffix = ".git";
    if (projectPath.endsWith(gitSuffix)) {
        this.projectPath = projectPath.substring(0, projectPath.length() - 4);
    } else {
        this.projectPath = projectPath;
    }
    this.apiUri = gitLabApiUrl + "/v4";
    this.commitHistoryScrapedAt = commitHistoryScrapedAt;
    if (existingCommitsPerWeek != null) {
        this.existingCommitsPerWeek.setData(existingCommitsPerWeek.getData());
    }
}

/**
Expand Down Expand Up @@ -81,12 +91,17 @@ public String languages() throws IOException, InterruptedException, RsdResponseE
@Override
public CommitsPerWeek contributions() throws IOException, InterruptedException, RsdResponseException {
CommitsPerWeek commits = new CommitsPerWeek();

String since="";
if (commitHistoryScrapedAt != null) {
since = "&since=" + commitHistoryScrapedAt;
}
String page = "1";
boolean done = false;
while (!done) {
HttpRequest request = HttpRequest.newBuilder().GET()
.uri(URI.create(apiUri + "/projects/" + Utils.urlEncode(projectPath)
+ "/repository/commits?per_page=100&order=default&page=" + page))
+ "/repository/commits?per_page=100&order=default&page=" + page + since))
.timeout(Duration.ofSeconds(30))
.build();
HttpResponse<String> response;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// SPDX-FileCopyrightText: 2022 - 2024 Ewan Cahen (Netherlands eScience Center) <[email protected]>
// SPDX-FileCopyrightText: 2022 - 2024 Netherlands eScience Center
// SPDX-FileCopyrightText: 2022 Christian Meeßen (GFZ) <[email protected]>
// SPDX-FileCopyrightText: 2022 Helmholtz Centre Potsdam - GFZ German Research Centre for Geosciences
// SPDX-FileCopyrightText: 2022 - 2025 Christian Meeßen (GFZ) <[email protected]>
// SPDX-FileCopyrightText: 2022 - 2025 Helmholtz Centre Potsdam - GFZ German Research Centre for Geosciences
//
// SPDX-License-Identifier: Apache-2.0

Expand Down Expand Up @@ -40,31 +40,35 @@ public static void main(String[] args) {

private static void scrapeGitLab() {
PostgrestConnector softwareInfoRepository = new PostgrestConnector(Config.backendBaseUrl() + "/repository_url", CodePlatformProvider.GITLAB);
Collection<BasicRepositoryData> dataToScrape = softwareInfoRepository.commitData(Config.maxRequestsGitLab());
Collection<BasicRepositoryDataWithHistory> dataToScrape = softwareInfoRepository.commitDataWithHistory(Config.maxRequestsGitLab());
CompletableFuture<?>[] futures = new CompletableFuture[dataToScrape.size()];
ZonedDateTime scrapedAt = ZonedDateTime.now();
int i = 0;
for (BasicRepositoryData commitData : dataToScrape) {
for (BasicRepositoryDataWithHistory repositoryDataToScrape : dataToScrape) {
CompletableFuture<Void> future = CompletableFuture.runAsync(() -> {
try {
String repoUrl = commitData.url();
String repoUrl = repositoryDataToScrape.url();
String commitHistoryScrapedAt = repositoryDataToScrape.commitHistoryScrapedAt();
String hostname = URI.create(repoUrl).getHost();
String apiUrl = "https://" + hostname + "/api";
String projectPath = repoUrl.replace("https://" + hostname + "/", "");
if (projectPath.endsWith("/")) projectPath = projectPath.substring(0, projectPath.length() - 1);

CommitsPerWeek scrapedCommits = new GitlabScraper(apiUrl, projectPath).contributions();
CommitData updatedData = new CommitData(commitData, scrapedCommits, scrapedAt);
CommitsPerWeek existingCommitsPerWeek = repositoryDataToScrape.commitsPerWeek();
CommitsPerWeek scrapedCommits = new GitlabScraper(apiUrl, projectPath, commitHistoryScrapedAt, existingCommitsPerWeek).contributions();

BasicRepositoryData basicData = new BasicRepositoryData(repositoryDataToScrape.software(), repositoryDataToScrape.url());
CommitData updatedData = new CommitData(basicData, scrapedCommits, scrapedAt);
softwareInfoRepository.saveCommitData(updatedData);
} catch (RsdRateLimitException e) {
Utils.saveExceptionInDatabase("GitLab commit scraper", "repository_url", commitData.software(), e);
Utils.saveErrorMessageInDatabase(e.getMessage(), "repository_url", "commit_history_last_error", commitData.software().toString(), "software", null, null);
Utils.saveExceptionInDatabase("GitLab commit scraper", "repository_url", repositoryDataToScrape.software(), e);
Utils.saveErrorMessageInDatabase(e.getMessage(), "repository_url", "commit_history_last_error", repositoryDataToScrape.software().toString(), "software", null, null);
} catch (RsdResponseException e) {
Utils.saveExceptionInDatabase("GitLab commit scraper", "repository_url", commitData.software(), e);
Utils.saveErrorMessageInDatabase(e.getMessage(), "repository_url", "commit_history_last_error", commitData.software().toString(), "software", scrapedAt, "commit_history_scraped_at");
Utils.saveExceptionInDatabase("GitLab commit scraper", "repository_url", repositoryDataToScrape.software(), e);
Utils.saveErrorMessageInDatabase(e.getMessage(), "repository_url", "commit_history_last_error", repositoryDataToScrape.software().toString(), "software", scrapedAt, "commit_history_scraped_at");
} catch (Exception e) {
Utils.saveExceptionInDatabase("GitLab commit scraper", "repository_url", commitData.software(), e);
Utils.saveErrorMessageInDatabase("Unknown error", "repository_url", "commit_history_last_error", commitData.software().toString(), "software", scrapedAt, "commit_history_scraped_at");
Utils.saveExceptionInDatabase("GitLab commit scraper", "repository_url", repositoryDataToScrape.software(), e);
Utils.saveErrorMessageInDatabase("Unknown error", "repository_url", "commit_history_last_error", repositoryDataToScrape.software().toString(), "software", scrapedAt, "commit_history_scraped_at");
}
});
futures[i] = future;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
// SPDX-FileCopyrightText: 2022 - 2024 Ewan Cahen (Netherlands eScience Center) <[email protected]>
// SPDX-FileCopyrightText: 2022 - 2024 Netherlands eScience Center
// SPDX-FileCopyrightText: 2025 Christian Meeßen (GFZ) <[email protected]>
// SPDX-FileCopyrightText: 2025 Helmholtz Centre Potsdam - GFZ German Research Centre for Geosciences
//
// SPDX-License-Identifier: Apache-2.0

Expand All @@ -12,10 +14,13 @@
import com.google.gson.JsonParser;
import nl.esciencecenter.rsd.scraper.Utils;

import java.time.Instant;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Objects;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.UUID;

public class PostgrestConnector {
Expand Down Expand Up @@ -51,6 +56,11 @@ public Collection<BasicRepositoryData> commitData(int limit) {
return parseBasicJsonData(data);
}

/**
 * Fetches repository entries together with their stored commit history,
 * ordered by commit_history_scraped_at ascending with nulls first.
 *
 * @param limit the maximum number of rows to request
 * @return the parsed entries
 */
public Collection<BasicRepositoryDataWithHistory> commitDataWithHistory(int limit) {
    String requestUrl = backendUrl + "?" + filter
        + "&select=software,url,commit_history_scraped_at,commit_history&order=commit_history_scraped_at.asc.nullsfirst&limit=" + limit
        + "&" + Utils.atLeastOneHourAgoFilter("commit_history_scraped_at");
    String responseBody = Utils.getAsAdmin(requestUrl);
    return parseBasicJsonDataWithHistory(responseBody);
}

/**
* Fetch basic data from PostgREST
*
Expand All @@ -67,6 +77,31 @@ public Collection<BasicRepositoryData> contributorData(int limit) {
return parseBasicJsonData(data);
}

/**
 * Parses a JSON array of repository rows into {@link BasicRepositoryDataWithHistory} objects.
 * <p>
 * Every row must contain the members "software" (a UUID string) and "url". The member
 * "commit_history", when present and not JSON null, must be an object whose keys are epoch
 * seconds and whose values are commit counts. A missing or null "commit_history" results in
 * an empty {@link CommitsPerWeek}.
 *
 * @param data the JSON array as a string
 * @return one entry per element of the array, in the same order
 */
static Collection<BasicRepositoryDataWithHistory> parseBasicJsonDataWithHistory(String data) {
    JsonArray dataInArray = JsonParser.parseString(data).getAsJsonArray();
    Collection<BasicRepositoryDataWithHistory> result = new ArrayList<>();
    for (JsonElement element : dataInArray) {
        JsonObject jsonObject = element.getAsJsonObject();
        String softwareUuid = jsonObject.getAsJsonPrimitive("software").getAsString();
        UUID software = UUID.fromString(softwareUuid);
        String url = jsonObject.getAsJsonPrimitive("url").getAsString();
        String commitHistoryScrapedAt = Utils.stringOrNull(jsonObject.get("commit_history_scraped_at"));

        CommitsPerWeek commitsPerWeek = new CommitsPerWeek();
        // JsonObject.get returns null for an absent member, so check for that
        // before asking whether the value is JSON null
        JsonElement commitHistoryElement = jsonObject.get("commit_history");
        if (commitHistoryElement != null && !commitHistoryElement.isJsonNull()) {
            JsonObject commitHistoryJsonObject = commitHistoryElement.getAsJsonObject();
            SortedMap<Instant, Long> commitHistory = new TreeMap<>();
            for (String key : commitHistoryJsonObject.keySet()) {
                // keys are epoch seconds, values are the commit counts
                commitHistory.put(Instant.ofEpochSecond(Long.parseLong(key)), commitHistoryJsonObject.getAsJsonPrimitive(key).getAsLong());
            }
            commitsPerWeek.setData(commitHistory);
        }

        result.add(new BasicRepositoryDataWithHistory(software, url, commitHistoryScrapedAt, commitsPerWeek));
    }
    return result;
}

static Collection<BasicRepositoryData> parseBasicJsonData(String data) {
JsonArray dataInArray = JsonParser.parseString(data).getAsJsonArray();
Collection<BasicRepositoryData> result = new ArrayList<>();
Expand Down
Loading
Loading