From 8c1021d9f10a0697c7abd58a6cb6a012812ef429 Mon Sep 17 00:00:00 2001 From: Ian Jaymes Iwata <97856957+ian-lastname@users.noreply.github.com> Date: Thu, 18 Apr 2024 17:51:52 -1000 Subject: [PATCH 01/80] Created parse_mbox_latest_date and refresh_mbox functions and updated helix config in accordance to new save file structure I have created the parse_mbox_latest_date and refresh_mbox functions. The latter function deletes the latest year and month mbox file that is currently downloaded (identified by parse_mbox_latest_date), and redownloads that along with any file beyond up until the current year. The naming convention of the downloaded files are also changed to what we have agreed on. Just to note, download_mod_mbox REMAINS UNCHANGED since I'm only using download_mod_mbox_per_month. --- NAMESPACE | 2 + R/mail.R | 127 +++++++++++++++++- conf/helix.yml | 20 +-- man/download_mod_mbox_per_month.Rd | 7 +- ...e_bugzilla_perceval_rest_issue_comments.Rd | 1 + ...lla_perceval_traditional_issue_comments.Rd | 1 + man/parse_bugzilla_rest_comments.Rd | 1 + man/parse_bugzilla_rest_issues.Rd | 1 + man/parse_bugzilla_rest_issues_comments.Rd | 1 + man/parse_commit_message_id.Rd | 1 + man/parse_dependencies.Rd | 1 + man/parse_dv8_clusters.Rd | 1 + man/parse_gitlog.Rd | 1 + man/parse_jira.Rd | 1 + man/parse_jira_latest_date.Rd | 1 + man/parse_jira_rss_xml.Rd | 1 + man/parse_mbox.Rd | 1 + man/parse_mbox_latest_date.Rd | 39 ++++++ man/parse_nvdfeed.Rd | 1 + man/refresh_mbox.Rd | 39 ++++++ vignettes/download_mod_mbox.Rmd | 19 ++- 21 files changed, 247 insertions(+), 20 deletions(-) create mode 100644 man/parse_mbox_latest_date.Rd create mode 100644 man/refresh_mbox.Rd diff --git a/NAMESPACE b/NAMESPACE index 2e3b17bf..e4525716 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -127,6 +127,7 @@ export(parse_jira_rss_xml) export(parse_line_metrics) export(parse_line_type_file) export(parse_mbox) +export(parse_mbox_latest_date) export(parse_nvdfeed) export(parse_r_dependencies) 
export(parse_r_function_definition) @@ -138,6 +139,7 @@ export(query_src_text_namespace) export(read_temporary_file) export(recolor_network_by_community) export(refresh_jira_issues) +export(refresh_mbox) export(smell_missing_links) export(smell_organizational_silo) export(smell_radio_silence) diff --git a/R/mail.R b/R/mail.R index 4a1257e5..daa76f21 100644 --- a/R/mail.R +++ b/R/mail.R @@ -183,15 +183,16 @@ download_mod_mbox <- function(base_url, mailing_list, from_year, to_year, save_f } #' Compose mod_mbox archives (.mbox) into a single mbox file for use with \code{\link{parse_mbox}} -#' @param base_url An url pointing to the mod_mbox directory (e.g. "http://mail-archives.apache.org/mod_mbox") without trailing slashes +#' @param archive_url A url pointing to the mod_mbox mailing list directory (e.g. "http://mail-archives.apache.org/mod_mbox/apr-dev") without trailing slashes #' @param mailing_list Name of the project mailing list (e.g. apr-dev) in the mod_mbox directory +#' @param archive_type Name of the archive that the project mailing list is archived in (e.g. apache) #' @param from_year First year in the range to be downloaded #' @param to_year Last year in the range to be downloaded #' @param save_folder_path the full *folder* path where the monthly downloaded mbox will be stored. #' @param verbose Prints progress during execution #' @return Returns the path of the downloaded mbox file. 
#' @export -download_mod_mbox_per_month <- function(base_url, mailing_list, from_year, to_year, save_folder_path,verbose=FALSE) { +download_mod_mbox_per_month <- function(archive_url, mailing_list, archive_type, from_year, to_year, save_folder_path,verbose=FALSE) { #Initialize variables @@ -209,14 +210,15 @@ download_mod_mbox_per_month <- function(base_url, mailing_list, from_year, to_ye #Generate file destinations for the monthly files in /tmp/ destination[[counter]] <- sprintf("%d%02d.mbox", year, month) + mbox_file_name <- stringi::stri_c(mailing_list, archive_type, destination[[counter]], sep = "_") if(verbose){ - print(stringi::stri_c("Downloading:",destination[[counter]],sep = " ")) + print(stringi::stri_c("Downloading:",mbox_file_name,sep = " ")) } #Try file download and save result - full_month_url <- stringi::stri_c(base_url, mailing_list, destination[[counter]], sep = "/") - full_tmp_save_path <- file.path(output,destination[[counter]]) + full_month_url <- stringi::stri_c(archive_url, destination[[counter]], sep = "/") + full_tmp_save_path <- file.path(output,mbox_file_name) x <- httr::GET(full_month_url, httr::write_disk(full_tmp_save_path,overwrite=TRUE)) @@ -236,6 +238,94 @@ download_mod_mbox_per_month <- function(base_url, mailing_list, from_year, to_ye return(output) } +#' Refresh mbox files +#' +#' Uses the adopted file name convention by \code{\link{download_mod_mbox_per_month}} to identify +#' the latest downloaded mbox year i and month j. It deletes the mbox file of year i and month j, +#' then redownloads it along with the remaining months past j up to 12. Then, it calls +#' \code{\link{download_mod_mbox_per_month}} with from_year being year i+1 and to_year being +#' the current real-life year so that all newer mbox files are downloaded. +#' +#' If the directory is empty, then it downloads all mbox files starting from a definable starting year to +#' the current real-life year. 
+#' +#' @param archive_url A url pointing to the mod_mbox mailing list directory (e.g. "http://mail-archives.apache.org/mod_mbox/apr-dev") without trailing slashes +#' @param mailing_list Name of the project mailing list (e.g. apr-dev) in the mod_mbox directory +#' @param archive_type Name of the archive that the project mailing list is archived in (e.g. apache) +#' @param from_year First year in the range to be downloaded in case there are no mod_mbox files already downloaded +#' @param save_folder_path the full *folder* path where the monthly downloaded mbox will be stored. +#' @param verbose Prints progress during execution +#' @export +refresh_mbox <- function(archive_url, mailing_list, archive_type, from_year, save_folder_path,verbose=FALSE) { + # Get a list of mbox files currently downloaded in save path folder + existing_mbox_files <- list.files(save_folder_path) + + # Get the current year + current_date <- Sys.Date() + current_year <- as.numeric(substr(current_date, 1, 4)) + + # If there are no mbox files downloaded, then download mbox files as normal using download_mod_mbox_per_month + if (length(existing_mbox_files) == 0) { + if (verbose) { + message("The folder is empty. Downloading mbox files from ", from_year, " to ", to_year, ". 
\n") + } + download_mod_mbox_per_month(archive_url = archive_url, + mailing_list = mailing_list, + archive_type = archive_type, + from_year = from_year, + to_year = current_year, + save_folder_path = save_folder_path, + verbose = verbose) + } else { + counter <- 0 + destination <- list() + latest_file_name <- parse_mbox_latest_date(save_folder_path) + extracted_year_month <- sub("[^_]*_[^_]*_", "", sub(".mbox", "", latest_file_name)) + output <- path.expand(save_folder_path) + + latest_downloaded_year <- as.numeric(substr(extracted_year_month, 1, 4)) + latest_downloaded_month <- as.numeric(substr(extracted_year_month, 5, 6)) + this_file <- paste(save_folder_path, latest_file_name, sep = "/") + file.remove(this_file) + # Download files starting from deleted file month to end of that year + for (month in (latest_downloaded_month:12)) { + counter <- counter + 1 + + #Generate file destinations for the monthly files in /tmp/ + destination[[counter]] <- sprintf("%d%02d.mbox", latest_downloaded_year, month) + mbox_file_name <- stringi::stri_c(mailing_list, archive_type, destination[[counter]], sep = "_") + + if(verbose){ + print(stringi::stri_c("Downloading:",mbox_file_name,sep = " ")) + } + + #Try file download and save result + full_month_url <- stringi::stri_c(archive_url, destination[[counter]], sep = "/") + full_tmp_save_path <- file.path(output,mbox_file_name) + x <- httr::GET(full_month_url, + httr::write_disk(full_tmp_save_path,overwrite=TRUE)) + + # Remove file if error + # Can only be done post-write, see https://github.com/r-lib/httr/issues/553 + if (httr::http_error(x) && file.exists(full_tmp_save_path)) { + warning(paste0("Unable to download: ",mbox_file_name)) + file.remove(full_tmp_save_path) + } + + } + + # Call the per-month-downloader to download the new mail missing from the user's machine + download_mod_mbox_per_month(archive_url = archive_url, + mailing_list = mailing_list, + archive_type = archive_type, + from_year = (latest_downloaded_year+1), + 
to_year = current_year, + save_folder_path = save_folder_path, + verbose = verbose) + } + # End of if-else +} + ############## Parsers ############## #' Parse mbox from Perceval @@ -279,6 +369,33 @@ parse_mbox <- function(perceval_path,mbox_path){ return(perceval_parsed) } +#' Parse mbox latest date +#' +#' Returns the name of the latest mod_mbox file downloaded in the specified folder +#' +#' The folder assumes the following convention: "(mailing_list)_(archive_type)_yearmonth.mbox" +#' For example: "geronimo-dev_apache_202401.mbox". This nomenclature is defined by \code{\link{download_mod_mbox_per_month}} +#' +#' @param mbox_path path to mbox archive file (ends in .mbox) +#' @return Returns the name of the latest mod_mbox file +#' @export +#' @family parsers +parse_mbox_latest_date <- function(mbox_path) { + file_list <- list.files(mbox_path) + date_list <- list() + # Checking if the save folder is empty + if(identical(file_list, character(0))){ + stop(stringi::stri_c("cannot open the connection")) + } + for(i in file_list){ + i <- sub(".mbox", "", i) + i <- sub("[^_]*_[^_]*_", "", i) + date_list <- append(date_list, i) + } + latest_date <- as.character(max(unlist(date_list))) + latest_mbox_file <- grep(latest_date, file_list, value = TRUE) + return(latest_mbox_file) +} ############## Fake Generator ############## diff --git a/conf/helix.yml b/conf/helix.yml index 2effc52a..431e6fce 100644 --- a/conf/helix.yml +++ b/conf/helix.yml @@ -48,15 +48,17 @@ version_control: - revert-1685-master mailing_list: - # Where is the mbox located locally? - # This is the path to the .git of the project repository you are analyzing. - # The .git is hidden, so you can see it using `ls -a` - mbox: ../../rawdata/mbox/helix_mbox - # What is the domain of the chosen mailing list archive? - domain: http://mail-archives.apache.org/mod_mbox - # Which lists of the domain will be used? 
- list_key: - - helix-dev + mod_mbox: + mail_key_1: + archive_url: http://mail-archives.apache.org/mod_mbox/helix-dev + mbox: ../../rawdata/helix/mod_mbox/helix-dev/ + mailing_list: helix-dev + archive_type: apache + mail_key_2: + archive_url: http://mail-archives.apache.org/mod_mbox/helix-user + mbox: ../../rawdata/helix/mod_mbox/helix-user/ + mailing_list: helix-user + archive_type: apache issue_tracker: jira: diff --git a/man/download_mod_mbox_per_month.Rd b/man/download_mod_mbox_per_month.Rd index 5be3c0a3..2debab7b 100644 --- a/man/download_mod_mbox_per_month.Rd +++ b/man/download_mod_mbox_per_month.Rd @@ -5,8 +5,9 @@ \title{Compose mod_mbox archives (.mbox) into a single mbox file for use with \code{\link{parse_mbox}}} \usage{ download_mod_mbox_per_month( - base_url, + archive_url, mailing_list, + archive_type, from_year, to_year, save_folder_path, @@ -14,10 +15,12 @@ download_mod_mbox_per_month( ) } \arguments{ -\item{base_url}{An url pointing to the mod_mbox directory (e.g. "http://mail-archives.apache.org/mod_mbox") without trailing slashes} +\item{archive_url}{A url pointing to the mod_mbox mailing list directory (e.g. "http://mail-archives.apache.org/mod_mbox/apr-dev") without trailing slashes} \item{mailing_list}{Name of the project mailing list (e.g. apr-dev) in the mod_mbox directory} +\item{archive_type}{Name of the archive that the project mailing list is archived in (e.g. 
apache)} + \item{from_year}{First year in the range to be downloaded} \item{to_year}{Last year in the range to be downloaded} diff --git a/man/parse_bugzilla_perceval_rest_issue_comments.Rd b/man/parse_bugzilla_perceval_rest_issue_comments.Rd index d8788d60..610eeb6f 100644 --- a/man/parse_bugzilla_perceval_rest_issue_comments.Rd +++ b/man/parse_bugzilla_perceval_rest_issue_comments.Rd @@ -35,6 +35,7 @@ Other parsers: \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, \code{\link{parse_jira}()}, +\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_mbox}()}, \code{\link{parse_nvdfeed}()} } diff --git a/man/parse_bugzilla_perceval_traditional_issue_comments.Rd b/man/parse_bugzilla_perceval_traditional_issue_comments.Rd index 06f9397d..f6f3b7f2 100644 --- a/man/parse_bugzilla_perceval_traditional_issue_comments.Rd +++ b/man/parse_bugzilla_perceval_traditional_issue_comments.Rd @@ -35,6 +35,7 @@ Other parsers: \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, \code{\link{parse_jira}()}, +\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_mbox}()}, \code{\link{parse_nvdfeed}()} } diff --git a/man/parse_bugzilla_rest_comments.Rd b/man/parse_bugzilla_rest_comments.Rd index 8121d873..57999ca2 100644 --- a/man/parse_bugzilla_rest_comments.Rd +++ b/man/parse_bugzilla_rest_comments.Rd @@ -28,6 +28,7 @@ Other parsers: \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, \code{\link{parse_jira}()}, +\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_mbox}()}, \code{\link{parse_nvdfeed}()} } diff --git a/man/parse_bugzilla_rest_issues.Rd b/man/parse_bugzilla_rest_issues.Rd index 69d55e6b..da912e4b 100644 --- a/man/parse_bugzilla_rest_issues.Rd +++ b/man/parse_bugzilla_rest_issues.Rd @@ -30,6 +30,7 @@ Other parsers: \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, \code{\link{parse_jira}()}, +\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_mbox}()}, 
\code{\link{parse_nvdfeed}()} } diff --git a/man/parse_bugzilla_rest_issues_comments.Rd b/man/parse_bugzilla_rest_issues_comments.Rd index 68939e2c..b884739f 100644 --- a/man/parse_bugzilla_rest_issues_comments.Rd +++ b/man/parse_bugzilla_rest_issues_comments.Rd @@ -32,6 +32,7 @@ Other parsers: \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, \code{\link{parse_jira}()}, +\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_mbox}()}, \code{\link{parse_nvdfeed}()} } diff --git a/man/parse_commit_message_id.Rd b/man/parse_commit_message_id.Rd index 1fe5fd3f..13d9e542 100644 --- a/man/parse_commit_message_id.Rd +++ b/man/parse_commit_message_id.Rd @@ -27,6 +27,7 @@ Other parsers: \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, \code{\link{parse_jira}()}, +\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_mbox}()}, \code{\link{parse_nvdfeed}()} } diff --git a/man/parse_dependencies.Rd b/man/parse_dependencies.Rd index 9cd9d487..a7136742 100644 --- a/man/parse_dependencies.Rd +++ b/man/parse_dependencies.Rd @@ -36,6 +36,7 @@ Other parsers: \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, \code{\link{parse_jira}()}, +\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_mbox}()}, \code{\link{parse_nvdfeed}()} } diff --git a/man/parse_dv8_clusters.Rd b/man/parse_dv8_clusters.Rd index 474205be..987936bf 100644 --- a/man/parse_dv8_clusters.Rd +++ b/man/parse_dv8_clusters.Rd @@ -25,6 +25,7 @@ Other parsers: \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, \code{\link{parse_jira}()}, +\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_mbox}()}, \code{\link{parse_nvdfeed}()} diff --git a/man/parse_gitlog.Rd b/man/parse_gitlog.Rd index 5552e83c..d4370808 100644 --- a/man/parse_gitlog.Rd +++ b/man/parse_gitlog.Rd @@ -31,6 +31,7 @@ Other parsers: \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, \code{\link{parse_jira}()}, 
+\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_mbox}()}, \code{\link{parse_nvdfeed}()} } diff --git a/man/parse_jira.Rd b/man/parse_jira.Rd index 26b2da1f..c3e8fe9a 100644 --- a/man/parse_jira.Rd +++ b/man/parse_jira.Rd @@ -41,6 +41,7 @@ Other parsers: \code{\link{parse_gitlog}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, +\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_mbox}()}, \code{\link{parse_nvdfeed}()} } diff --git a/man/parse_jira_latest_date.Rd b/man/parse_jira_latest_date.Rd index f5b8b18f..d05f3b82 100644 --- a/man/parse_jira_latest_date.Rd +++ b/man/parse_jira_latest_date.Rd @@ -33,6 +33,7 @@ Other parsers: \code{\link{parse_gitlog}()}, \code{\link{parse_jira_rss_xml}()}, \code{\link{parse_jira}()}, +\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_mbox}()}, \code{\link{parse_nvdfeed}()} } diff --git a/man/parse_jira_rss_xml.Rd b/man/parse_jira_rss_xml.Rd index 38bb6948..17b88ff5 100644 --- a/man/parse_jira_rss_xml.Rd +++ b/man/parse_jira_rss_xml.Rd @@ -36,6 +36,7 @@ Other parsers: \code{\link{parse_gitlog}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira}()}, +\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_mbox}()}, \code{\link{parse_nvdfeed}()} } diff --git a/man/parse_mbox.Rd b/man/parse_mbox.Rd index f048bd48..fd578695 100644 --- a/man/parse_mbox.Rd +++ b/man/parse_mbox.Rd @@ -32,6 +32,7 @@ Other parsers: \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, \code{\link{parse_jira}()}, +\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_nvdfeed}()} } \concept{parsers} diff --git a/man/parse_mbox_latest_date.Rd b/man/parse_mbox_latest_date.Rd new file mode 100644 index 00000000..b45f1cbd --- /dev/null +++ b/man/parse_mbox_latest_date.Rd @@ -0,0 +1,39 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/mail.R +\name{parse_mbox_latest_date} +\alias{parse_mbox_latest_date} +\title{Parse mbox latest 
date} +\usage{ +parse_mbox_latest_date(mbox_path) +} +\arguments{ +\item{mbox_path}{path to mbox archive file (ends in .mbox)} +} +\value{ +Returns the name of the latest mod_mbox file +} +\description{ +Returns the name of the latest mod_mbox file downloaded in the specified folder +} +\details{ +The folder assumes the following convention: "(mailing_list)_(archive_type)_yearmonth.mbox" +For example: "geronimo-dev_apache_202401.mbox". This nomenclature is defined by \code{\link{download_mod_mbox_per_month}} +} +\seealso{ +Other parsers: +\code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, +\code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, +\code{\link{parse_bugzilla_rest_comments}()}, +\code{\link{parse_bugzilla_rest_issues_comments}()}, +\code{\link{parse_bugzilla_rest_issues}()}, +\code{\link{parse_commit_message_id}()}, +\code{\link{parse_dependencies}()}, +\code{\link{parse_dv8_clusters}()}, +\code{\link{parse_gitlog}()}, +\code{\link{parse_jira_latest_date}()}, +\code{\link{parse_jira_rss_xml}()}, +\code{\link{parse_jira}()}, +\code{\link{parse_mbox}()}, +\code{\link{parse_nvdfeed}()} +} +\concept{parsers} diff --git a/man/parse_nvdfeed.Rd b/man/parse_nvdfeed.Rd index e861f2a3..1c4365bd 100644 --- a/man/parse_nvdfeed.Rd +++ b/man/parse_nvdfeed.Rd @@ -27,6 +27,7 @@ Other parsers: \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, \code{\link{parse_jira}()}, +\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_mbox}()} } \concept{parsers} diff --git a/man/refresh_mbox.Rd b/man/refresh_mbox.Rd new file mode 100644 index 00000000..755e2d69 --- /dev/null +++ b/man/refresh_mbox.Rd @@ -0,0 +1,39 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/mail.R +\name{refresh_mbox} +\alias{refresh_mbox} +\title{Refresh mbox files} +\usage{ +refresh_mbox( + archive_url, + mailing_list, + archive_type, + from_year, + save_folder_path, + verbose = FALSE +) +} +\arguments{ 
+\item{archive_url}{A url pointing to the mod_mbox mailing list directory (e.g. "http://mail-archives.apache.org/mod_mbox/apr-dev") without trailing slashes} + +\item{mailing_list}{Name of the project mailing list (e.g. apr-dev) in the mod_mbox directory} + +\item{archive_type}{Name of the archive that the project mailing list is archived in (e.g. apache)} + +\item{from_year}{First year in the range to be downloaded in case there are no mod_mbox files already downloaded} + +\item{save_folder_path}{the full *folder* path where the monthly downloaded mbox will be stored.} + +\item{verbose}{Prints progress during execution} +} +\description{ +Uses the adopted file name convention by \code{\link{download_mod_mbox_per_month}} to identify +the latest downloaded mbox year i and month j. It deletes the mbox file of year i and month j, +then redownloads it along with the remaining months past j up to 12. Then, it calls +\code{\link{download_mod_mbox_per_month}} with from_year being year i+1 and to_year being +the current real-life year so that all newer mbox files are downloaded. +} +\details{ +If the directory is empty, then it downloads all mbox files starting from a definable starting year to +the current real-life year. +} diff --git a/vignettes/download_mod_mbox.Rmd b/vignettes/download_mod_mbox.Rmd index 44a354b4..6733fe63 100644 --- a/vignettes/download_mod_mbox.Rmd +++ b/vignettes/download_mod_mbox.Rmd @@ -34,19 +34,30 @@ As usual, the first step is to load the project configuration file. 
```{r} conf <- yaml::read_yaml("../conf/helix.yml") -save_path_mbox <- conf[["mailing_list"]][["mbox"]] -mod_mbox_url <- conf[["mailing_list"]][["domain"]] -mailing_list <- conf[["mailing_list"]][["list_key"]] +save_path_mbox <- conf[["mailing_list"]][["mod_mbox"]][["mail_key_1"]][["mbox"]] +mod_mbox_url <- conf[["mailing_list"]][["mod_mbox"]][["mail_key_1"]][["archive_url"]] +mailing_list <- conf[["mailing_list"]][["mod_mbox"]][["mail_key_1"]][["mailing_list"]] +archive_type <- conf[["mailing_list"]][["mod_mbox"]][["mail_key_1"]][["archive_type"]] start_year <- 2017 end_year <- 2018 ``` ```{r eval = FALSE} -mbox <- download_mod_mbox_per_month(base_url = mod_mbox_url, +mbox <- download_mod_mbox_per_month(archive_url = mod_mbox_url, mailing_list = mailing_list, + archive_type = archive_type, from_year=start_year, to_year=end_year, save_folder_path = save_path_mbox, verbose = TRUE) ``` +```{r eval = FALSE} +mbox_latest <- parse_mbox_latest_date(save_path_mbox) +refresh_mbox(archive_url = mod_mbox_url, + mailing_list = mailing_list, + archive_type = archive_type, + from_year = start_year, + save_folder_path = save_path_mbox, + verbose = TRUE) +``` From 72238a759267f92bdf5c9d9fe0ead278c0130935 Mon Sep 17 00:00:00 2001 From: Ian Jaymes Iwata <97856957+ian-lastname@users.noreply.github.com> Date: Wed, 24 Apr 2024 00:35:31 -1000 Subject: [PATCH 02/80] Edited download_pipermail to save pipermail files as mbox files, created refresh_pipermail, updated news Found out that the pipermail downloader function already downloads the files by month and year, so all I really needed to do was change it so that it downloads the files as mbox files (change the extension from .txt to .mbox). Created the refresher for pipermail. I had no need to create a parse latest pipermail since they were mbox files anyway. 
--- NAMESPACE | 1 + NEWS.md | 3 + R/mail.R | 236 ++++++++++++++++++++++++++++++-- conf/openssl.yml | 14 +- man/download_pipermail.Rd | 14 +- man/refresh_pipermail.Rd | 35 +++++ vignettes/download_mod_mbox.Rmd | 28 ++++ 7 files changed, 306 insertions(+), 25 deletions(-) create mode 100644 man/refresh_pipermail.Rd diff --git a/NAMESPACE b/NAMESPACE index e4525716..59ac138a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -140,6 +140,7 @@ export(read_temporary_file) export(recolor_network_by_community) export(refresh_jira_issues) export(refresh_mbox) +export(refresh_pipermail) export(smell_missing_links) export(smell_organizational_silo) export(smell_radio_silence) diff --git a/NEWS.md b/NEWS.md index cf2de75b..57acc182 100644 --- a/NEWS.md +++ b/NEWS.md @@ -3,6 +3,8 @@ __kaiaulu 0.0.0.9700 (in development)__ ### NEW FEATURES + * `refresh_mbox()` and `refresh_pipermail()` has been added. They are both functions that downloads mbox issues that are not already downloaded up until the current year and month. [#284](https://github.com/sailuh/kaiaulu/issues/284) + * `parse_mbox_latest_date()` has been added. This function returns the file name of the downloaded mbox file containing the latest date for use by `download_mbox_per_month()` and `download_pipermail` to implement a refresh capability. [#284](https://github.com/sailuh/kaiaulu/issues/284) * `refresh_jira_issues()` had been added. It is a wrapper function for the previous downloader and downloads only issues greater than the greatest key already downloaded. * `download_jira_issues()`, `download_jira_issues_by_issue_key()`, and `download_jira_issues_by_date()` has been added. This allows for downloading of Jira issues without the use of JirAgileR [#275](https://github.com/sailuh/kaiaulu/issues/275) and specification of issue Id and created ranges. It also interacts with `parse_jira_latest_date` to implement a refresh capability. 
* `make_jira_issue()` and `make_jira_issue_tracker()` no longer create fake issues following JirAgileR format, but instead the raw data obtained from JIRA API. This is compatible with the new parser function for JIRA. [#277](https://github.com/sailuh/kaiaulu/issues/277) @@ -28,6 +30,7 @@ __kaiaulu 0.0.0.9700 (in development)__ ### MINOR IMPROVEMENTS + * `download_pipermail()` now downloads all the txt and txt.gz files in the accessed pipermail archive as mbox files. [#284](https://github.com/sailuh/kaiaulu/issues/284) * The line metrics notebook now provides further guidance on adjusting the snapshot and filtering. * The R File and R Function parser can now properly parse R folders which contain folders within (not following R package structure). Both `.r` and `.R` files are also now captured (previously only one of the two were specified, but R accepts both). [#235](https://github.com/sailuh/kaiaulu/issues/235) * Refactor GoF Notebook in Graph GoF and Text GoF Notebooks [#224](https://github.com/sailuh/kaiaulu/issues/224) diff --git a/R/mail.R b/R/mail.R index daa76f21..7548234b 100644 --- a/R/mail.R +++ b/R/mail.R @@ -6,14 +6,17 @@ ############## Downloader ############## -#' Download all pipermail files in an archive -#' @param url An url pointing to a pipermail archive +#' Download all pipermail files in an archive as mbox files +#' @param archive_url An url pointing to a pipermail archive +#' @param mailing_list The name of the mailing list being downloaded +#' @param archive_type The name of the type of archive that the mailing list is stored in +#' @param save_folder_path The folder path in which all the downloaded pipermail files will be stored #' @return Returns `destination`, a vector of the downloaded files in the current working directory #' @export -download_pipermail <- function(url) { +download_pipermail <- function(archive_url, mailing_list, archive_type, save_folder_path) { #Get page - pagedata <- httr::GET(url) + pagedata <- httr::GET(archive_url) 
#Parse html file into object tbls_xml <- XML::htmlParse(pagedata) @@ -26,32 +29,40 @@ download_pipermail <- function(url) { #Create Vector files <- vector() + file_names <- vector() #Compose download urls for both gunzipped and plain text files for (i in hrefs ){ if (endsWith(i, ".txt.gz")){ - i <- paste0(url, i) + f_month <- match(sub("[^_]*-","", sub(".txt.gz","",i)), month.name) + f_year <- sub("-[^_]*", "", i) + file_names <- c(file_names, sprintf("%s%02d.mbox", f_year, f_month)) + i <- stringi::stri_c(archive_url, i, sep = "/") files <- c(files, i) } else if (endsWith(i, ".txt")) { - i <- paste0(url, i) + f_month <- match(sub("[^_]*-","", sub(".txt","",i)), month.name) + f_year <- sub("-[^_]*", "", i) + file_names <- c(file_names, sprintf("%s%02d.mbox", f_year, f_month)) + i <- stringi::stri_c(archive_url, i, sep = "/") files <- c(files, i) } } - - destination <- vector() + amount <- length(files) # File downloading loop - for (i in files){ + for (i in 1:amount){ #split filename from url and create download destination out of it - splits <- stringi::stri_split_fixed(i, "/") - destination[[i]] <- paste0(splits[[1]][[length(splits[[1]])]]) + #splits <- stringi::stri_split_fixed(i, "/") + #destination[[i]] <- paste0(splits[[1]][[length(splits[[1]])]]) #download file and place it at the destination - httr::GET(i, httr::write_disk(destination[[i]], overwrite=TRUE)) + save_file_name <- stringi::stri_c(mailing_list, archive_type, file_names[[i]], sep = "_") + save_file_path <- stringi::stri_c(save_folder_path, save_file_name, sep = "/") + httr::GET(files[[i]], httr::write_disk(save_file_path, overwrite=TRUE)) } #Return filenames - return(destination) + return(save_folder_path) } @@ -326,6 +337,205 @@ refresh_mbox <- function(archive_url, mailing_list, archive_type, from_year, sav # End of if-else } +#' Refresh mbox files downloaded via pipermail +#' +#' Uses the adopted file name convention by \code{\link{download_pipermail}} to identify +#' the latest downloaded 
mbox year i and month j. It deletes the mbox file of year i and month j, +#' then redownloads it along with the remaining months past j up to 12. Then, it calls +#' \code{\link{download_mod_mbox_per_month}} with from_year being year i+1 and to_year being +#' the current real-life year so that all newer mbox files are downloaded. +#' +#' If the directory is empty, then it downloads all pipermail files (as mbox files) via \code{\link{download_pipermail}} +#' +#' @param archive_url A url pointing to the mod_mbox mailing list directory (e.g. "http://mail-archives.apache.org/mod_mbox/apr-dev") without trailing slashes +#' @param mailing_list Name of the project mailing list (e.g. apr-dev) in the mod_mbox directory +#' @param archive_type Name of the archive that the project mailing list is archived in (e.g. apache) +#' @param save_folder_path the full *folder* path where the monthly downloaded mbox will be stored. +#' @param verbose prints progress during execution +#' @export +refresh_pipermail <- function(archive_url, mailing_list, archive_type, save_folder_path,verbose=FALSE) { + # Get a list of mbox files currently downloaded in save path folder + existing_mbox_files <- list.files(save_folder_path) + + # Get the current year + current_date <- Sys.Date() + current_year <- as.numeric(substr(current_date, 1, 4)) + + # If there are no mbox files downloaded, then download mbox files as normal using download_pipermail + if (length(existing_mbox_files) == 0) { + if (verbose) { + message("The folder is empty. Downloading all pipermail files. 
\n") + } + download_pipermail(archive_url = archive_url, + mailing_list = mailing_list, + archive_type = archive_type, + save_folder_path = save_folder_path) + } else { + latest_file_name <- parse_mbox_latest_date(save_folder_path) + extracted_year_month <- sub("[^_]*_[^_]*_", "", sub(".mbox", "", latest_file_name)) + output <- path.expand(save_folder_path) + + latest_downloaded_year <- as.numeric(substr(extracted_year_month, 1, 4)) + latest_downloaded_month <- as.numeric(substr(extracted_year_month, 5, 6)) + this_file <- paste(save_folder_path, latest_file_name, sep = "/") + file.remove(this_file) + + # Download txt files starting from deleted file month to end of that year, save as mbox + download_txt_files_latest_downloaded_year <- function(archive_url, mailing_list, archive_type, latest_downloaded_year, latest_downloaded_month, save_folder_path) { + counter <- 0 + destination <- list() + mbox_correct_name_format <- list() + + for (month in (latest_downloaded_month:12)) { + counter <- counter + 1 + + #Generate file destinations for the monthly files in /tmp/ + destination[[counter]] <- sprintf("%d-%s.txt", latest_downloaded_year, month.name[month]) + mbox_correct_name_format[[counter]] <- sprintf("%d%02d.mbox", latest_downloaded_year, month) + mbox_file_name <- stringi::stri_c(mailing_list, archive_type, mbox_correct_name_format[[counter]], sep = "_") + + #Try file download and save result + full_month_url <- stringi::stri_c(archive_url, destination[[counter]], sep = "/") + full_tmp_save_path <- file.path(output,mbox_file_name) + x <- httr::GET(full_month_url, + httr::write_disk(full_tmp_save_path,overwrite=TRUE)) + + # Remove file if error + # Can only be done post-write, see https://github.com/r-lib/httr/issues/553 + if (httr::http_error(x) && file.exists(full_tmp_save_path)) { + file.remove(full_tmp_save_path) + } + + } + } + + # Download txt.gz files starting from deleted file month to the end of that year, save as mbox + 
download_txt_gz_files_latest_downloaded_year <- function(archive_url, mailing_list, archive_type, latest_downloaded_year, latest_downloaded_month, save_folder_path) { + + counter <- 0 + destination <- list() + mbox_correct_name_format <- list() + + for (month in (latest_downloaded_month:12)) { + counter <- counter + 1 + + #Generate file destinations for the monthly files in /tmp/ + destination[[counter]] <- sprintf("%d-%s.txt.gz", latest_downloaded_year, month.name[month]) + mbox_correct_name_format[[counter]] <- sprintf("%d%02d.mbox", latest_downloaded_year, month) + mbox_file_name <- stringi::stri_c(mailing_list, archive_type, mbox_correct_name_format[[counter]], sep = "_") + + #Try file download and save result + full_month_url <- stringi::stri_c(archive_url, destination[[counter]], sep = "/") + full_tmp_save_path <- file.path(output,mbox_file_name) + x <- httr::GET(full_month_url, + httr::write_disk(full_tmp_save_path,overwrite=TRUE)) + + # Remove file if error + # Can only be done post-write, see https://github.com/r-lib/httr/issues/553 + if (httr::http_error(x) && file.exists(full_tmp_save_path)) { + file.remove(full_tmp_save_path) + } + + } + } + + # Download txt files from the year after the latest downloaded year to the current real life year + download_txt_files_current_year <- function(archive_url, mailing_list, archive_type, latest_downloaded_year, current_year, save_folder_path) { + + counter <- 0 + destination <- list() + mbox_correct_name_format <- list() + + for (year in (latest_downloaded_year+1):current_year) { + for (month in (1:12)) { + counter <- counter + 1 + + #Generate file destinations for the monthly files in /tmp/ + destination[[counter]] <- sprintf("%d-%s.txt", year, month.name[month]) + mbox_correct_name_format[[counter]] <- sprintf("%d%02d.mbox", year, month) + mbox_file_name <- stringi::stri_c(mailing_list, archive_type, mbox_correct_name_format[[counter]], sep = "_") + + #Try file download and save result + full_month_url <- 
stringi::stri_c(archive_url, destination[[counter]], sep = "/") + full_tmp_save_path <- file.path(output,mbox_file_name) + x <- httr::GET(full_month_url, + httr::write_disk(full_tmp_save_path,overwrite=TRUE)) + + # Remove file if error + # Can only be done post-write, see https://github.com/r-lib/httr/issues/553 + if (httr::http_error(x) && file.exists(full_tmp_save_path)) { + file.remove(full_tmp_save_path) + } + + } + } + + } + + # Download txt.gz files from the year after the latest downloaded year to the current real life year + download_txt_gz_files_current_year <- function(archive_url, mailing_list, archive_type, latest_downloaded_year, current_year, save_folder_path) { + + counter <- 0 + destination <- list() + mbox_correct_name_format <- list() + + for (year in (latest_downloaded_year+1):current_year) { + for (month in (1:12)) { + counter <- counter + 1 + + #Generate file destinations for the monthly files in /tmp/ + destination[[counter]] <- sprintf("%d-%s.txt.gz", year, month.name[month]) + mbox_correct_name_format[[counter]] <- sprintf("%d%02d.mbox", year, month) + mbox_file_name <- stringi::stri_c(mailing_list, archive_type, mbox_correct_name_format[[counter]], sep = "_") + + #Try file download and save result + full_month_url <- stringi::stri_c(archive_url, destination[[counter]], sep = "/") + full_tmp_save_path <- file.path(output,mbox_file_name) + x <- httr::GET(full_month_url, + httr::write_disk(full_tmp_save_path,overwrite=TRUE)) + + # Remove file if error + # Can only be done post-write, see https://github.com/r-lib/httr/issues/553 + if (httr::http_error(x) && file.exists(full_tmp_save_path)) { + file.remove(full_tmp_save_path) + } + + } + } + + } + + download_txt_files_latest_downloaded_year(archive_url=archive_url, + mailing_list=mailing_list, + archive_type=archive_type, + latest_downloaded_year=latest_downloaded_year, + latest_downloaded_month=latest_downloaded_month, + save_folder_path=save_folder_path) + + 
download_txt_gz_files_latest_downloaded_year(archive_url=archive_url, + mailing_list=mailing_list, + archive_type=archive_type, + latest_downloaded_year=latest_downloaded_year, + latest_downloaded_month=latest_downloaded_month, + save_folder_path=save_folder_path) + + download_txt_files_current_year(archive_url=archive_url, + mailing_list=mailing_list, + archive_type=archive_type, + latest_downloaded_year=latest_downloaded_year, + current_year=current_year, + save_folder_path=save_folder_path) + + download_txt_gz_files_current_year(archive_url=archive_url, + mailing_list=mailing_list, + archive_type=archive_type, + latest_downloaded_year=latest_downloaded_year, + current_year=current_year, + save_folder_path=save_folder_path) + } + # End of if-else +} + ############## Parsers ############## #' Parse mbox from Perceval diff --git a/conf/openssl.yml b/conf/openssl.yml index aa7b2254..41ec5af2 100644 --- a/conf/openssl.yml +++ b/conf/openssl.yml @@ -45,14 +45,12 @@ version_control: - master mailing_list: - # Where is the mbox located locally? - #mbox: ../../rawdata/mbox/openssl_dev_mbox # 2004-2008 fields are complete - mbox: ../../rawdata/mbox/openssl-dev.mbx # 2002-2019 gmail field is redacted due to google groups - # What is the domain of the chosen mailing list archive? - #domain: http://mail-archives.apache.org/mod_mbox - # Which lists of the domain will be used? 
- #list_key: - # - apr-dev + pipermail: + mail_key_1: + archive_url: https://mta.openssl.org/pipermail/openssl-dev + pipermail: ../../rawdata/openssl/pipermail/openssl-dev/ + mailing_list: openssl-dev + archive_type: mta #issue_tracker: #jira: diff --git a/man/download_pipermail.Rd b/man/download_pipermail.Rd index 9f4db683..218527c6 100644 --- a/man/download_pipermail.Rd +++ b/man/download_pipermail.Rd @@ -2,16 +2,22 @@ % Please edit documentation in R/mail.R \name{download_pipermail} \alias{download_pipermail} -\title{Download all pipermail files in an archive} +\title{Download all pipermail files in an archive as mbox files} \usage{ -download_pipermail(url) +download_pipermail(archive_url, mailing_list, archive_type, save_folder_path) } \arguments{ -\item{url}{An url pointing to a pipermail archive} +\item{archive_url}{An url pointing to a pipermail archive} + +\item{mailing_list}{The name of the mailing list being downloaded} + +\item{archive_type}{The name of the type of archive that the mailing list is stored in} + +\item{save_folder_path}{The folder path in which all the downloaded pipermail files will be stored} } \value{ Returns `destination`, a vector of the downloaded files in the current working directory } \description{ -Download all pipermail files in an archive +Download all pipermail files in an archive as mbox files } diff --git a/man/refresh_pipermail.Rd b/man/refresh_pipermail.Rd new file mode 100644 index 00000000..427c66d2 --- /dev/null +++ b/man/refresh_pipermail.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/mail.R +\name{refresh_pipermail} +\alias{refresh_pipermail} +\title{Refresh mbox files downloaded via pipermail} +\usage{ +refresh_pipermail( + archive_url, + mailing_list, + archive_type, + save_folder_path, + verbose = FALSE +) +} +\arguments{ +\item{archive_url}{A url pointing to the mod_mbox mailing list directory (e.g. 
"http://mail-archives.apache.org/mod_mbox/apr-dev") without trailing slashes} + +\item{mailing_list}{Name of the project mailing list (e.g. apr-dev) in the mod_mbox directory} + +\item{archive_type}{Name of the archive that the project mailing list is archived in (e.g. apache)} + +\item{save_folder_path}{the full *folder* path where the monthly downloaded mbox will be stored.} + +\item{verbose}{prints progress during execution} +} +\description{ +Uses the adopted file name convention by \code{\link{download_pipermail}} to identify +the latest downloaded mbox year i and month j. It deletes the mbox file of year i and month j, +then redownloads it along with the remaining months past j up to 12. Then, it calls +\code{\link{download_mod_mbox_per_month}} with from_year being year i+1 and to_year being +the current real-life year so that all newer mbox files are downloaded. +} +\details{ +If the directory is empty, then it downloads all pipermail files (as mbox files) via \code{\link{download_pipermail}} +} diff --git a/vignettes/download_mod_mbox.Rmd b/vignettes/download_mod_mbox.Rmd index 6733fe63..ee6ab6a1 100644 --- a/vignettes/download_mod_mbox.Rmd +++ b/vignettes/download_mod_mbox.Rmd @@ -40,6 +40,14 @@ mailing_list <- conf[["mailing_list"]][["mod_mbox"]][["mail_key_1"]][["mailing_l archive_type <- conf[["mailing_list"]][["mod_mbox"]][["mail_key_1"]][["archive_type"]] start_year <- 2017 end_year <- 2018 + +conf2 <- yaml::read_yaml("../conf/openssl.yml") +save_path_pipermail <- conf2[["mailing_list"]][["pipermail"]][["mail_key_1"]][["pipermail"]] +pipermail_url <- conf2[["mailing_list"]][["pipermail"]][["mail_key_1"]][["archive_url"]] +mailing_list2 <- conf2[["mailing_list"]][["pipermail"]][["mail_key_1"]][["mailing_list"]] +archive_type2 <- conf2[["mailing_list"]][["pipermail"]][["mail_key_1"]][["archive_type"]] + +perceval_path <- yaml::read_yaml("../tools.yml")[["perceval"]] ``` ```{r eval = FALSE} @@ -61,3 +69,23 @@ refresh_mbox(archive_url = mod_mbox_url, 
save_folder_path = save_path_mbox, verbose = TRUE) ``` + +```{r eval = FALSE} +download_pipermail(archive_url = pipermail_url, + mailing_list = mailing_list2, + archive_type = archive_type2, + save_folder_path = save_path_pipermail) +``` + +```{r eval = FALSE} +mbox_latest <- parse_mbox_latest_date(save_path_pipermail) +refresh_pipermail(archive_url = pipermail_url, + mailing_list=mailing_list2, + archive_type=archive_type2, + save_folder_path=save_path_pipermail, + verbose=TRUE) +``` + +```{r eval = FALSE} +parse_mbox(perceval_path, save_path_pipermail) +``` From 99fb7e3ebf2ad48d5b4d6edf4124a84de6ec75dd Mon Sep 17 00:00:00 2001 From: Ian Jaymes Iwata <97856957+ian-lastname@users.noreply.github.com> Date: Wed, 24 Apr 2024 18:23:54 -1000 Subject: [PATCH 03/80] Changed function name from refresh_mbox to refresh_mod_mbox for consistency --- NAMESPACE | 2 +- NEWS.md | 2 +- R/mail.R | 2 +- man/{refresh_mbox.Rd => refresh_mod_mbox.Rd} | 6 +++--- vignettes/download_mod_mbox.Rmd | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) rename man/{refresh_mbox.Rd => refresh_mod_mbox.Rd} (95%) diff --git a/NAMESPACE b/NAMESPACE index 59ac138a..5f578622 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -139,7 +139,7 @@ export(query_src_text_namespace) export(read_temporary_file) export(recolor_network_by_community) export(refresh_jira_issues) -export(refresh_mbox) +export(refresh_mod_mbox) export(refresh_pipermail) export(smell_missing_links) export(smell_organizational_silo) diff --git a/NEWS.md b/NEWS.md index 57acc182..0abffa8e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -3,7 +3,7 @@ __kaiaulu 0.0.0.9700 (in development)__ ### NEW FEATURES - * `refresh_mbox()` and `refresh_pipermail()` has been added. They are both functions that downloads mbox issues that are not already downloaded up until the current year and month. [#284](https://github.com/sailuh/kaiaulu/issues/284) + * `refresh_mod_mbox()` and `refresh_pipermail()` has been added. 
They are both functions that downloads mbox issues that are not already downloaded up until the current year and month. [#284](https://github.com/sailuh/kaiaulu/issues/284) * `parse_mbox_latest_date()` has been added. This function returns the file name of the downloaded mbox file containing the latest date for use by `download_mbox_per_month()` and `download_pipermail` to implement a refresh capability. [#284](https://github.com/sailuh/kaiaulu/issues/284) * `refresh_jira_issues()` had been added. It is a wrapper function for the previous downloader and downloads only issues greater than the greatest key already downloaded. * `download_jira_issues()`, `download_jira_issues_by_issue_key()`, and `download_jira_issues_by_date()` has been added. This allows for downloading of Jira issues without the use of JirAgileR [#275](https://github.com/sailuh/kaiaulu/issues/275) and specification of issue Id and created ranges. It also interacts with `parse_jira_latest_date` to implement a refresh capability. diff --git a/R/mail.R b/R/mail.R index 7548234b..54fbccf8 100644 --- a/R/mail.R +++ b/R/mail.R @@ -267,7 +267,7 @@ download_mod_mbox_per_month <- function(archive_url, mailing_list, archive_type, #' @param save_folder_path the full *folder* path where the monthly downloaded mbox will be stored. 
#' @param verbose Prints progress during execution #' @export -refresh_mbox <- function(archive_url, mailing_list, archive_type, from_year, save_folder_path,verbose=FALSE) { +refresh_mod_mbox <- function(archive_url, mailing_list, archive_type, from_year, save_folder_path,verbose=FALSE) { # Get a list of mbox files currently downloaded in save path folder existing_mbox_files <- list.files(save_folder_path) diff --git a/man/refresh_mbox.Rd b/man/refresh_mod_mbox.Rd similarity index 95% rename from man/refresh_mbox.Rd rename to man/refresh_mod_mbox.Rd index 755e2d69..5022e7c8 100644 --- a/man/refresh_mbox.Rd +++ b/man/refresh_mod_mbox.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/mail.R -\name{refresh_mbox} -\alias{refresh_mbox} +\name{refresh_mod_mbox} +\alias{refresh_mod_mbox} \title{Refresh mbox files} \usage{ -refresh_mbox( +refresh_mod_mbox( archive_url, mailing_list, archive_type, diff --git a/vignettes/download_mod_mbox.Rmd b/vignettes/download_mod_mbox.Rmd index ee6ab6a1..bb054e6a 100644 --- a/vignettes/download_mod_mbox.Rmd +++ b/vignettes/download_mod_mbox.Rmd @@ -62,7 +62,7 @@ mbox <- download_mod_mbox_per_month(archive_url = mod_mbox_url, ```{r eval = FALSE} mbox_latest <- parse_mbox_latest_date(save_path_mbox) -refresh_mbox(archive_url = mod_mbox_url, +refresh_mod_mbox(archive_url = mod_mbox_url, mailing_list = mailing_list, archive_type = archive_type, from_year = start_year, From 618f2d0f275374338afb71da6a2aac170bafab01 Mon Sep 17 00:00:00 2001 From: Ian Jaymes Iwata <97856957+ian-lastname@users.noreply.github.com> Date: Wed, 24 Apr 2024 22:30:08 -1000 Subject: [PATCH 04/80] Added checks in refresh functions and in download_mod_mbox_per_month to ensure it does not download files past current year and month Added checks in the aforementioned functions so that the refreshers won't download "mail from the future" --- R/mail.R | 58 +++++++++++++++++++++++++-------- man/refresh_mod_mbox.Rd | 2 +- 
vignettes/download_mod_mbox.Rmd | 15 +++++++-- 3 files changed, 59 insertions(+), 16 deletions(-) diff --git a/R/mail.R b/R/mail.R index 54fbccf8..107e7d12 100644 --- a/R/mail.R +++ b/R/mail.R @@ -213,10 +213,18 @@ download_mod_mbox_per_month <- function(archive_url, mailing_list, archive_type, #Open file handle to output file output <- path.expand(save_folder_path) + current_date <- Sys.Date() + current_year <- as.numeric(substr(current_date, 1, 4)) + current_month <- as.numeric(substr(current_date, 6, 7)) + #Loop through time and compose the mbox file for (year in (from_year:to_year)) { for (month in 1:12) { + # Check to stop function when month iterates path current real life month + if (year == current_year && month > current_month) { + return(output) + } counter <- counter + 1 #Generate file destinations for the monthly files in /tmp/ @@ -240,7 +248,6 @@ download_mod_mbox_per_month <- function(archive_url, mailing_list, archive_type, file.remove(full_tmp_save_path) } - } } @@ -263,22 +270,24 @@ download_mod_mbox_per_month <- function(archive_url, mailing_list, archive_type, #' @param archive_url A url pointing to the mod_mbox mailing list directory (e.g. "http://mail-archives.apache.org/mod_mbox/apr-dev") without trailing slashes #' @param mailing_list Name of the project mailing list (e.g. apr-dev) in the mod_mbox directory #' @param archive_type Name of the archive that the project mailing list is archived in (e.g. apache) -#' @param from_year First year in the range to be downloaded in case there are no mod_mbox files already downloaded +#' @param from_year First year in the range to be downloaded in case there are no mod_mbox files already downloaded (e.g. 201401) #' @param save_folder_path the full *folder* path where the monthly downloaded mbox will be stored. 
#' @param verbose Prints progress during execution #' @export refresh_mod_mbox <- function(archive_url, mailing_list, archive_type, from_year, save_folder_path,verbose=FALSE) { # Get a list of mbox files currently downloaded in save path folder existing_mbox_files <- list.files(save_folder_path) + output <- save_folder_path # Get the current year current_date <- Sys.Date() current_year <- as.numeric(substr(current_date, 1, 4)) + current_month <- as.numeric(substr(current_date, 6, 7)) # If there are no mbox files downloaded, then download mbox files as normal using download_mod_mbox_per_month if (length(existing_mbox_files) == 0) { if (verbose) { - message("The folder is empty. Downloading mbox files from ", from_year, " to ", to_year, ". \n") + message("The folder is empty. Downloading mbox files from ", from_year, " to ", current_year, ". \n") } download_mod_mbox_per_month(archive_url = archive_url, mailing_list = mailing_list, @@ -295,11 +304,15 @@ refresh_mod_mbox <- function(archive_url, mailing_list, archive_type, from_year, output <- path.expand(save_folder_path) latest_downloaded_year <- as.numeric(substr(extracted_year_month, 1, 4)) - latest_downloaded_month <- as.numeric(substr(extracted_year_month, 5, 6)) + latest_downloaded_month <- as.numeric(substr(extracted_year_month, 6, 7)) this_file <- paste(save_folder_path, latest_file_name, sep = "/") file.remove(this_file) # Download files starting from deleted file month to end of that year for (month in (latest_downloaded_month:12)) { + # Checks to see if iterator goes beyond current month, stops function if it does + if (latest_downloaded_year == current_year && month > current_month) { + return(output) + } counter <- counter + 1 #Generate file destinations for the monthly files in /tmp/ @@ -360,6 +373,7 @@ refresh_pipermail <- function(archive_url, mailing_list, archive_type, save_fold # Get the current year current_date <- Sys.Date() current_year <- as.numeric(substr(current_date, 1, 4)) + current_month <- 
as.numeric(substr(current_date, 6, 7)) # If there are no mbox files downloaded, then download mbox files as normal using download_pipermail if (length(existing_mbox_files) == 0) { @@ -381,12 +395,16 @@ refresh_pipermail <- function(archive_url, mailing_list, archive_type, save_fold file.remove(this_file) # Download txt files starting from deleted file month to end of that year, save as mbox - download_txt_files_latest_downloaded_year <- function(archive_url, mailing_list, archive_type, latest_downloaded_year, latest_downloaded_month, save_folder_path) { + download_txt_files_latest_downloaded_year <- function(archive_url, mailing_list, archive_type, latest_downloaded_year, latest_downloaded_month, current_year, current_month, save_folder_path) { counter <- 0 destination <- list() mbox_correct_name_format <- list() + output <- save_folder_path for (month in (latest_downloaded_month:12)) { + if (latest_downloaded_year == current_year && month > current_month) { + return(output) + } counter <- counter + 1 #Generate file destinations for the monthly files in /tmp/ @@ -410,13 +428,17 @@ refresh_pipermail <- function(archive_url, mailing_list, archive_type, save_fold } # Download txt.gz files starting from deleted file month to the end of that year, save as mbox - download_txt_gz_files_latest_downloaded_year <- function(archive_url, mailing_list, archive_type, latest_downloaded_year, latest_downloaded_month, save_folder_path) { + download_txt_gz_files_latest_downloaded_year <- function(archive_url, mailing_list, archive_type, latest_downloaded_year, latest_downloaded_month, current_year, current_month, save_folder_path) { counter <- 0 destination <- list() mbox_correct_name_format <- list() + output <- save_folder_path for (month in (latest_downloaded_month:12)) { + if (latest_downloaded_year == current_year && month > current_month) { + return(output) + } counter <- counter + 1 #Generate file destinations for the monthly files in /tmp/ @@ -440,14 +462,18 @@ 
refresh_pipermail <- function(archive_url, mailing_list, archive_type, save_fold } # Download txt files from the year after the latest downloaded year to the current real life year - download_txt_files_current_year <- function(archive_url, mailing_list, archive_type, latest_downloaded_year, current_year, save_folder_path) { + download_txt_files_current_year <- function(archive_url, mailing_list, archive_type, latest_downloaded_year, current_year, current_month, save_folder_path) { counter <- 0 destination <- list() mbox_correct_name_format <- list() + output <- save_folder_path for (year in (latest_downloaded_year+1):current_year) { for (month in (1:12)) { + if (year == current_year && month > current_month) { + return(output) + } counter <- counter + 1 #Generate file destinations for the monthly files in /tmp/ @@ -473,14 +499,18 @@ refresh_pipermail <- function(archive_url, mailing_list, archive_type, save_fold } # Download txt.gz files from the year after the latest downloaded year to the current real life year - download_txt_gz_files_current_year <- function(archive_url, mailing_list, archive_type, latest_downloaded_year, current_year, save_folder_path) { + download_txt_gz_files_current_year <- function(archive_url, mailing_list, archive_type, latest_downloaded_year, current_year, current_month, save_folder_path) { counter <- 0 destination <- list() mbox_correct_name_format <- list() + output <- save_folder_path for (year in (latest_downloaded_year+1):current_year) { for (month in (1:12)) { + if (year == current_year && month > current_month) { + return(output) + } counter <- counter + 1 #Generate file destinations for the monthly files in /tmp/ @@ -510,6 +540,8 @@ refresh_pipermail <- function(archive_url, mailing_list, archive_type, save_fold archive_type=archive_type, latest_downloaded_year=latest_downloaded_year, latest_downloaded_month=latest_downloaded_month, + current_year = current_year, + current_month = current_month, save_folder_path=save_folder_path) 
download_txt_gz_files_latest_downloaded_year(archive_url=archive_url, @@ -517,6 +549,8 @@ refresh_pipermail <- function(archive_url, mailing_list, archive_type, save_fold archive_type=archive_type, latest_downloaded_year=latest_downloaded_year, latest_downloaded_month=latest_downloaded_month, + current_year = current_year, + current_month = current_month, save_folder_path=save_folder_path) download_txt_files_current_year(archive_url=archive_url, @@ -524,13 +558,15 @@ refresh_pipermail <- function(archive_url, mailing_list, archive_type, save_fold archive_type=archive_type, latest_downloaded_year=latest_downloaded_year, current_year=current_year, + current_month = current_month, save_folder_path=save_folder_path) download_txt_gz_files_current_year(archive_url=archive_url, mailing_list=mailing_list, archive_type=archive_type, latest_downloaded_year=latest_downloaded_year, - current_year=current_year, + current_year = current_year, + current_month = current_month, save_folder_path=save_folder_path) } # End of if-else @@ -593,10 +629,6 @@ parse_mbox <- function(perceval_path,mbox_path){ parse_mbox_latest_date <- function(mbox_path) { file_list <- list.files(mbox_path) date_list <- list() - # Checking if the save folder is empty - if(identical(file_list, character(0))){ - stop(stringi::stri_c("cannot open the connection")) - } for(i in file_list){ i <- sub(".mbox", "", i) i <- sub("[^_]*_[^_]*_", "", i) diff --git a/man/refresh_mod_mbox.Rd b/man/refresh_mod_mbox.Rd index 5022e7c8..19132bd3 100644 --- a/man/refresh_mod_mbox.Rd +++ b/man/refresh_mod_mbox.Rd @@ -20,7 +20,7 @@ refresh_mod_mbox( \item{archive_type}{Name of the archive that the project mailing list is archived in (e.g. apache)} -\item{from_year}{First year in the range to be downloaded in case there are no mod_mbox files already downloaded} +\item{from_year}{First year in the range to be downloaded in case there are no mod_mbox files already downloaded (e.g. 
201401)} \item{save_folder_path}{the full *folder* path where the monthly downloaded mbox will be stored.} diff --git a/vignettes/download_mod_mbox.Rmd b/vignettes/download_mod_mbox.Rmd index bb054e6a..48ba38c4 100644 --- a/vignettes/download_mod_mbox.Rmd +++ b/vignettes/download_mod_mbox.Rmd @@ -38,7 +38,7 @@ save_path_mbox <- conf[["mailing_list"]][["mod_mbox"]][["mail_key_1"]][["mbox"]] mod_mbox_url <- conf[["mailing_list"]][["mod_mbox"]][["mail_key_1"]][["archive_url"]] mailing_list <- conf[["mailing_list"]][["mod_mbox"]][["mail_key_1"]][["mailing_list"]] archive_type <- conf[["mailing_list"]][["mod_mbox"]][["mail_key_1"]][["archive_type"]] -start_year <- 2017 +start_year <- 2024 end_year <- 2018 conf2 <- yaml::read_yaml("../conf/openssl.yml") @@ -50,6 +50,8 @@ archive_type2 <- conf2[["mailing_list"]][["pipermail"]][["mail_key_1"]][["archiv perceval_path <- yaml::read_yaml("../tools.yml")[["perceval"]] ``` +# Mod Mbox Downloader + ```{r eval = FALSE} mbox <- download_mod_mbox_per_month(archive_url = mod_mbox_url, mailing_list = mailing_list, @@ -60,16 +62,20 @@ mbox <- download_mod_mbox_per_month(archive_url = mod_mbox_url, verbose = TRUE) ``` +# Refresh Mod Mbox + ```{r eval = FALSE} mbox_latest <- parse_mbox_latest_date(save_path_mbox) refresh_mod_mbox(archive_url = mod_mbox_url, mailing_list = mailing_list, archive_type = archive_type, - from_year = start_year, + from_year = 2024, save_folder_path = save_path_mbox, verbose = TRUE) ``` +# Pipermail Downloader + ```{r eval = FALSE} download_pipermail(archive_url = pipermail_url, mailing_list = mailing_list2, @@ -77,6 +83,8 @@ download_pipermail(archive_url = pipermail_url, save_folder_path = save_path_pipermail) ``` +# Pipermail Refresher + ```{r eval = FALSE} mbox_latest <- parse_mbox_latest_date(save_path_pipermail) refresh_pipermail(archive_url = pipermail_url, @@ -86,6 +94,9 @@ refresh_pipermail(archive_url = pipermail_url, verbose=TRUE) ``` +# Parse Mbox + ```{r eval = FALSE} +parse_mbox(perceval_path, 
save_path_mbox) parse_mbox(perceval_path, save_path_pipermail) ``` From 075121800e7aa97a6c01cd1d37ced1ec041ed873 Mon Sep 17 00:00:00 2001 From: Carlos Paradis Date: Sun, 28 Apr 2024 02:38:01 -0700 Subject: [PATCH 05/80] fix github checks --- .github/workflows/R-CMD-check.yml | 4 ++-- .github/workflows/test-coverage.yml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/R-CMD-check.yml b/.github/workflows/R-CMD-check.yml index 39b5968f..2350bf81 100644 --- a/.github/workflows/R-CMD-check.yml +++ b/.github/workflows/R-CMD-check.yml @@ -12,7 +12,7 @@ name: R-CMD-check jobs: R-CMD-check: - runs-on: macOS-latest + runs-on: macOS-13 strategy: matrix: r-version: ['4.2'] @@ -65,7 +65,7 @@ jobs: - name: Install UCtags and Update tools.yml if: always() run: | - brew tap universal-ctags/universal-ctags + brew tap homebrew/core brew install --HEAD universal-ctags utags_head=$(ls /usr/local/Cellar/universal-ctags | tail -n 1) sed -i -e "s|utags: \/usr\/local\/Cellar\/universal-ctags\/HEAD-62f0144\/bin\/ctags|utags: \/usr\/local\/Cellar\/universal-ctags\/${utags_head}\/bin\/ctags|g" tools.yml diff --git a/.github/workflows/test-coverage.yml b/.github/workflows/test-coverage.yml index cfc2fecd..e70821a2 100644 --- a/.github/workflows/test-coverage.yml +++ b/.github/workflows/test-coverage.yml @@ -10,7 +10,7 @@ name: test-coverage jobs: test-coverage: - runs-on: macOS-latest + runs-on: macOS-13 strategy: matrix: r-version: ['4.2'] @@ -57,7 +57,7 @@ jobs: - name: Install UCtags and Update tools.yml if: always() run: | - brew tap universal-ctags/universal-ctags + brew tap homebrew/core brew install --HEAD universal-ctags utags_head=$(ls /usr/local/Cellar/universal-ctags | tail -n 1) sed -i -e "s|utags: \/usr\/local\/Cellar\/universal-ctags\/HEAD-62f0144\/bin\/ctags|utags: \/usr\/local\/Cellar\/universal-ctags\/${utags_head}\/bin\/ctags|g" tools.yml From be4ff329fa8a9dd270e7f001128c1c3f6488987a Mon Sep 17 00:00:00 2001 From: Ian Jaymes Iwata 
<97856957+ian-lastname@users.noreply.github.com> Date: Sun, 28 Apr 2024 23:35:04 -1000 Subject: [PATCH 06/80] Re-added error message in refresh_pipermail when an http error is encountered Done as requested by Carlos --- R/mail.R | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/R/mail.R b/R/mail.R index 107e7d12..f98bacdd 100644 --- a/R/mail.R +++ b/R/mail.R @@ -421,6 +421,7 @@ refresh_pipermail <- function(archive_url, mailing_list, archive_type, save_fold # Remove file if error # Can only be done post-write, see https://github.com/r-lib/httr/issues/553 if (httr::http_error(x) && file.exists(full_tmp_save_path)) { + warning(paste0("Unable to download: ",destination[[counter]])) file.remove(full_tmp_save_path) } @@ -455,6 +456,7 @@ refresh_pipermail <- function(archive_url, mailing_list, archive_type, save_fold # Remove file if error # Can only be done post-write, see https://github.com/r-lib/httr/issues/553 if (httr::http_error(x) && file.exists(full_tmp_save_path)) { + warning(paste0("Unable to download: ",destination[[counter]])) file.remove(full_tmp_save_path) } @@ -490,6 +492,7 @@ refresh_pipermail <- function(archive_url, mailing_list, archive_type, save_fold # Remove file if error # Can only be done post-write, see https://github.com/r-lib/httr/issues/553 if (httr::http_error(x) && file.exists(full_tmp_save_path)) { + warning(paste0("Unable to download: ",destination[[counter]])) file.remove(full_tmp_save_path) } @@ -527,6 +530,7 @@ refresh_pipermail <- function(archive_url, mailing_list, archive_type, save_fold # Remove file if error # Can only be done post-write, see https://github.com/r-lib/httr/issues/553 if (httr::http_error(x) && file.exists(full_tmp_save_path)) { + warning(paste0("Unable to download: ",destination[[counter]])) file.remove(full_tmp_save_path) } From b5be04e078d3b2241497f8c9ed81aa319cb6b78c Mon Sep 17 00:00:00 2001 From: Ian Jaymes Iwata <97856957+ian-lastname@users.noreply.github.com> Date: Mon, 29 Apr 2024 18:11:37 -1000 Subject: 
[PATCH 07/80] Added comments to download_pipermail --- R/mail.R | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/R/mail.R b/R/mail.R index f98bacdd..6090869d 100644 --- a/R/mail.R +++ b/R/mail.R @@ -34,12 +34,17 @@ download_pipermail <- function(archive_url, mailing_list, archive_type, save_fol #Compose download urls for both gunzipped and plain text files for (i in hrefs ){ if (endsWith(i, ".txt.gz")){ + # Converts month from text form into a number for the naming convention f_month <- match(sub("[^_]*-","", sub(".txt.gz","",i)), month.name) + # Retrieves year number for the naming convention f_year <- sub("-[^_]*", "", i) + # txt files are actually mbox files, so this renames the extension file_names <- c(file_names, sprintf("%s%02d.mbox", f_year, f_month)) + # Saves regular name so that function can access correct url i <- stringi::stri_c(archive_url, i, sep = "/") files <- c(files, i) } else if (endsWith(i, ".txt")) { + # Same logic, but with txt f_month <- match(sub("[^_]*-","", sub(".txt","",i)), month.name) f_year <- sub("-[^_]*", "", i) file_names <- c(file_names, sprintf("%s%02d.mbox", f_year, f_month)) @@ -51,10 +56,6 @@ download_pipermail <- function(archive_url, mailing_list, archive_type, save_fol # File downloading loop for (i in 1:amount){ - #split filename from url and create download destination out of it - #splits <- stringi::stri_split_fixed(i, "/") - #destination[[i]] <- paste0(splits[[1]][[length(splits[[1]])]]) - #download file and place it at the destination save_file_name <- stringi::stri_c(mailing_list, archive_type, file_names[[i]], sep = "_") save_file_path <- stringi::stri_c(save_folder_path, save_file_name, sep = "/") @@ -392,6 +393,7 @@ refresh_pipermail <- function(archive_url, mailing_list, archive_type, save_fold latest_downloaded_year <- as.numeric(substr(extracted_year_month, 1, 4)) latest_downloaded_month <- as.numeric(substr(extracted_year_month, 5, 6)) this_file <- paste(save_folder_path, 
latest_file_name, sep = "/") + # Overwrite file because new email may have been added at this point in this month file.remove(this_file) # Download txt files starting from deleted file month to end of that year, save as mbox From d2ce2227ade2777b56741b19ecfc840cbcd1c49c Mon Sep 17 00:00:00 2001 From: daomcgill <77309217+daomcgill@users.noreply.github.com> Date: Tue, 10 Sep 2024 12:00:52 -1000 Subject: [PATCH 08/80] Minor documentation update for setup verification. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3fc3e836..f31356f4 100644 --- a/README.md +++ b/README.md @@ -29,7 +29,7 @@ I also recommend you download the repo to have some example project configuratio 1. Clone this repo 2. Open `kaiaulu.Rproj` using RStudio - 3. Run the unit tests `devtools::test()`. If any fail, and you are not clear why, feel free to [ask in Discussions](https://github.com/sailuh/kaiaulu/discussions) + 3. Run the unit tests `devtools::test()`. If any fail and you are not clear why, feel free to [ask in Discussions](https://github.com/sailuh/kaiaulu/discussions) 4. Build the documentation `devtools::document(roclets = c('rd', 'collate', 'namespace'))`. 5. Build Kaiaulu (Top right pane in RStudio -> Build tab -> Install and Restart) 6. Run `vignettes/kaiaulu_architecture.Rmd` From 7c585aeda18537044f97e85f8648183bd010f10c Mon Sep 17 00:00:00 2001 From: daomcgill <77309217+daomcgill@users.noreply.github.com> Date: Sun, 15 Sep 2024 12:32:54 -1000 Subject: [PATCH 09/80] i #284 Refactor download_pipermail function - Remove archive_url and archive_type parameters from download_pipermail(). - Add start_year_month and end_year_month parameters for date filtering. - Remove convert_pipermail_to_mbox() function, as download_pipermail() now handles file conversion automatically. - Change file naming convention to 'kaiaulu_'YYYYMM.mbox'. 
- Attempt to download and decompress files directly without saving .gz to disk, but could not establish a valid connection. Signed-off-by: Dao McGill --- R/mail.R | 136 +++++++++++++++++++++++++++++++------------------ conf/helix.yml | 8 +++ 2 files changed, 94 insertions(+), 50 deletions(-) diff --git a/R/mail.R b/R/mail.R index 6090869d..34e5c7fa 100644 --- a/R/mail.R +++ b/R/mail.R @@ -7,63 +7,99 @@ ############## Downloader ############## #' Download all pipermail files in an archive as mbox files -#' @param archive_url An url pointing to a pipermail archive #' @param mailing_list The name of the mailing list being downloaded -#' @param archive_type The name of the type of archive that the mailing list is stored in +#' @param start_year_month The year and month of the first file to be downloaded +#' @param end_year_month The year and month of the last file to be downloaded #' @param save_folder_path The folder path in which all the downloaded pipermail files will be stored -#' @return Returns `destination`, a vector of the downloaded files in the current working directory +#' @return Returns `downloaded_files`, a vector of the downloaded files in the current working directory #' @export -download_pipermail <- function(archive_url, mailing_list, archive_type, save_folder_path) { - - #Get page - pagedata <- httr::GET(archive_url) - - #Parse html file into object - tbls_xml <- XML::htmlParse(pagedata) - - #Extract href tablenodes from html table - tableNodes <- XML::getNodeSet(tbls_xml, "//td/a[@href]") - - #Extract filenames from tablenode content with xmlGetAtrr - hrefs <- sapply(tableNodes, XML::xmlGetAttr, 'href') - - #Create Vector - files <- vector() - file_names <- vector() - - #Compose download urls for both gunzipped and plain text files - for (i in hrefs ){ - if (endsWith(i, ".txt.gz")){ - # Converts month from text form into a number for the naming convention - f_month <- match(sub("[^_]*-","", sub(".txt.gz","",i)), month.name) - # Retrieves year number 
for the naming convention - f_year <- sub("-[^_]*", "", i) - # txt files are actually mbox files, so this renames the extension - file_names <- c(file_names, sprintf("%s%02d.mbox", f_year, f_month)) - # Saves regular name so that function can access correct url - i <- stringi::stri_c(archive_url, i, sep = "/") - files <- c(files, i) - } else if (endsWith(i, ".txt")) { - # Same logic, but with txt - f_month <- match(sub("[^_]*-","", sub(".txt","",i)), month.name) - f_year <- sub("-[^_]*", "", i) - file_names <- c(file_names, sprintf("%s%02d.mbox", f_year, f_month)) - i <- stringi::stri_c(archive_url, i, sep = "/") - files <- c(files, i) +download_pipermail <- function(mailing_list, start_year_month, end_year_month, save_folder_path) { + + # Create directory if it does not exist + if (!dir.exists(save_folder_path)) { + dir.create(save_folder_path, recursive = TRUE) + } + + # Get mailing list contents + response <- GET(mailing_list) + + # Parse the response + parsed_response <- content(response, "text") + doc_obj <- htmlParse(parsed_response, asText = TRUE) + + # Table rows + rows <- getNodeSet(doc_obj, "//tr") + + # Skip header row + data_rows <- rows[-1] + + # Vector for link storage + links = c() + + # Extract the date and link from each row + for (row in data_rows) { + # Date in YYYYMM format + date_extracted <- xpathSApply(row, ".//td[1]", xmlValue) + date_cleaned <- stri_replace_last_regex(date_extracted, pattern = ":$", replacement = "") + date_cleaned <- stri_trim_both(date_cleaned) + # Parse the date + # Add 01 as dummy to make it a valid date + date_parsed <- as.Date(paste0("01 ", date_cleaned), format = "%d %B %Y") + year_month <- format(date_parsed, "%Y%m") + + # Check if date is within range + if (year_month >= start_year_month & year_month <= end_year_month) { + # get href from column 3 + link_nodes <- xpathSApply(row, ".//td[3]/a", xmlGetAttr, 'href') + # Store the link in links + link <- link_nodes[1] + links <- c(links, link) } } - amount <- 
length(files) - # File downloading loop - for (i in 1:amount){ - - #download file and place it at the destination - save_file_name <- stringi::stri_c(mailing_list, archive_type, file_names[[i]], sep = "_") - save_file_path <- stringi::stri_c(save_folder_path, save_file_name, sep = "/") - httr::GET(files[[i]], httr::write_disk(save_file_path, overwrite=TRUE)) + # Vector for downloaded files + downloaded_files <- c() + for (i in seq_along(links)) { + link <- links[i] + + # Extract the name without the .txt.gz extension + base_name <- gsub("\\.txt\\.gz$", "", link) + + # Parse the date from the base name + date_parsed <- as.Date(paste0("01-", base_name), format = "%d-%Y-%B") + year_month_clean <- format(date_parsed, "%Y%m") + + # Download URL + download_url <- paste0(mailing_list, link) + + # Define the destination file + # Rename (also converts to mbox by changing extension to .mbox) + dest_gz <- file.path(save_folder_path, paste0('kaiaulu_', year_month_clean, '.mbox.gz')) + dest <- file.path(save_folder_path, paste0('kaiaulu_', year_month_clean, '.mbox')) + + # Download the gz mbox file + cat("Downloading:", download_url, "\n") + GET(download_url, write_disk(dest_gz, overwrite = TRUE)) + + # Unzip the file + gz_con <- gzfile(dest_gz, open = "rb") + out_con <- file(dest, open = "wb") + while (TRUE) { + bytes <- readBin(gz_con, what = raw(), n = 1024 * 1024) + if (length(bytes) == 0) break + writeBin(bytes, out_con) + } + close(gz_con) + close(out_con) + + # Remove the gz file + file.remove(dest_gz) + + # Add the downloaded file to the list + downloaded_files <- c(downloaded_files, dest) } - #Return filenames - return(save_folder_path) + # Return downloaded files + return(downloaded_files) } diff --git a/conf/helix.yml b/conf/helix.yml index 431e6fce..a57a516a 100644 --- a/conf/helix.yml +++ b/conf/helix.yml @@ -59,6 +59,14 @@ mailing_list: mbox: ../../rawdata/helix/mod_mbox/helix-user/ mailing_list: helix-user archive_type: apache + # Using for testing 
R/mail.R/pipermail_downloader() + pipermail_key: + archive_url: https://mta.openssl.org/mailman/listinfo/ + mailing_list: https://mta.openssl.org/pipermail/openssl-users/ + # archive_type + start_year_month: 202310 + end_year_month: 202405 + save_folder_path: "save_folder_mail" issue_tracker: jira: From 69ca16374c9930ec86a6f1346cbc37a0178f5fd1 Mon Sep 17 00:00:00 2001 From: daomcgill <77309217+daomcgill@users.noreply.github.com> Date: Mon, 16 Sep 2024 14:18:07 -1000 Subject: [PATCH 10/80] i #284 Updated documentation and modified function for download_pipermail() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Modified helix.yml to use [[“mailing_list”]][[“pipermail”]][[“project_key_1”]] - Added project_key_2 to helix.yml - Created /vignettes/download_mail.Rmd to document information about pipermail downloader - Made function calls explicit for external libraries - ISSUE: Build -> Check is not passing. Seems to be having issues with utags_path, even though I changed the path to the one for universal-ctags in tools.yml --- R/mail.R | 50 +++++++++++++++-------- conf/helix.yml | 15 ++++--- tools.yml | 2 +- vignettes/download_mail.Rmd | 81 +++++++++++++++++++++++++++++++++++++ 4 files changed, 126 insertions(+), 22 deletions(-) create mode 100644 vignettes/download_mail.Rmd diff --git a/R/mail.R b/R/mail.R index 34e5c7fa..ef7899e3 100644 --- a/R/mail.R +++ b/R/mail.R @@ -20,42 +20,56 @@ download_pipermail <- function(mailing_list, start_year_month, end_year_month, s dir.create(save_folder_path, recursive = TRUE) } + # Ensure mailing_list URL ends with a slash + if (!stringi::stri_endswith_fixed(mailing_list, "/")) { + mailing_list <- paste0(mailing_list, "/") + } + # Get mailing list contents - response <- GET(mailing_list) + response <- httr::GET(mailing_list) # Parse the response - parsed_response <- content(response, "text") - doc_obj <- htmlParse(parsed_response, asText = TRUE) + parsed_response <- httr::content(response, 
"text") + doc_obj <- XML::htmlParse(parsed_response, asText = TRUE) # Table rows - rows <- getNodeSet(doc_obj, "//tr") + rows <- XML::getNodeSet(doc_obj, "//tr") # Skip header row data_rows <- rows[-1] # Vector for link storage - links = c() + links <- c() # Extract the date and link from each row for (row in data_rows) { # Date in YYYYMM format - date_extracted <- xpathSApply(row, ".//td[1]", xmlValue) - date_cleaned <- stri_replace_last_regex(date_extracted, pattern = ":$", replacement = "") - date_cleaned <- stri_trim_both(date_cleaned) + date_extracted <- XML::xpathSApply(row, ".//td[1]", XML::xmlValue) + date_cleaned <- stringi::stri_replace_last_regex(date_extracted, pattern = ":$", replacement = "") + date_cleaned <- stringi::stri_trim_both(date_cleaned) # Parse the date # Add 01 as dummy to make it a valid date date_parsed <- as.Date(paste0("01 ", date_cleaned), format = "%d %B %Y") + if (is.na(date_parsed)) { + warning("Date could not be parsed: ", date_cleaned) + next + } year_month <- format(date_parsed, "%Y%m") # Check if date is within range if (year_month >= start_year_month & year_month <= end_year_month) { - # get href from column 3 - link_nodes <- xpathSApply(row, ".//td[3]/a", xmlGetAttr, 'href') + # Get href from column 3 + link_nodes <- XML::xpathSApply(row, ".//td[3]/a", XML::xmlGetAttr, 'href') + if (length(link_nodes) == 0) { + warning("No link found in row for date: ", date_cleaned) + next + } # Store the link in links link <- link_nodes[1] links <- c(links, link) } } + # Vector for downloaded files downloaded_files <- c() for (i in seq_along(links)) { @@ -66,6 +80,10 @@ download_pipermail <- function(mailing_list, start_year_month, end_year_month, s # Parse the date from the base name date_parsed <- as.Date(paste0("01-", base_name), format = "%d-%Y-%B") + if (is.na(date_parsed)) { + warning("Could not parse date from link: ", link) + next + } year_month_clean <- format(date_parsed, "%Y%m") # Download URL @@ -78,7 +96,7 @@ download_pipermail 
<- function(mailing_list, start_year_month, end_year_month, s # Download the gz mbox file cat("Downloading:", download_url, "\n") - GET(download_url, write_disk(dest_gz, overwrite = TRUE)) + httr::GET(download_url, httr::write_disk(dest_gz, overwrite = TRUE)) # Unzip the file gz_con <- gzfile(dest_gz, open = "rb") @@ -100,10 +118,10 @@ download_pipermail <- function(mailing_list, start_year_month, end_year_month, s # Return downloaded files return(downloaded_files) - } + #' Convert pipermail archive files (.txt and .txt.gz) into an mbox format for use with \code{\link{parse_mbox}} #' @param filelist A vector of pipermail archive files from \code{\link{download_pipermail}} #' @return Returns `output`, the name of the resulting .mbox file in the current working directory @@ -417,10 +435,10 @@ refresh_pipermail <- function(archive_url, mailing_list, archive_type, save_fold if (verbose) { message("The folder is empty. Downloading all pipermail files. \n") } - download_pipermail(archive_url = archive_url, - mailing_list = mailing_list, - archive_type = archive_type, - save_folder_path = save_folder_path) + download_pipermail(mailing_list = mailing_list, + start_year_month = start_year_month, + end_year_month = end_year_month, + save_folder_path = save_folder_path) } else { latest_file_name <- parse_mbox_latest_date(save_folder_path) extracted_year_month <- sub("[^_]*_[^_]*_", "", sub(".mbox", "", latest_file_name)) diff --git a/conf/helix.yml b/conf/helix.yml index a57a516a..d0b623c0 100644 --- a/conf/helix.yml +++ b/conf/helix.yml @@ -59,14 +59,19 @@ mailing_list: mbox: ../../rawdata/helix/mod_mbox/helix-user/ mailing_list: helix-user archive_type: apache - # Using for testing R/mail.R/pipermail_downloader() - pipermail_key: - archive_url: https://mta.openssl.org/mailman/listinfo/ + pipermail: + project_key_1: + # archive_url: https://mta.openssl.org/mailman/listinfo/ mailing_list: https://mta.openssl.org/pipermail/openssl-users/ - # archive_type start_year_month: 
202310 end_year_month: 202405 - save_folder_path: "save_folder_mail" + save_folder_path: "../save_folder_mail" + project_key_2: + # archive_url: https://mta.openssl.org/mailman/listinfo/ + mailing_list: https://mta.openssl.org/pipermail/openssl-project/ + start_year_month: 201903 + end_year_month: 202103 + save_folder_path: "../save_folder_mail_2" issue_tracker: jira: diff --git a/tools.yml b/tools.yml index 27951fe6..fd4ac52a 100644 --- a/tools.yml +++ b/tools.yml @@ -7,7 +7,7 @@ refactoring_miner: ~/RefactoringMiner-1.0/bin/RefactoringMiner # https://github.com/boyter/scc scc: ~/scc/scc # universal-ctags -utags: /usr/local/Cellar/universal-ctags/HEAD-62f0144/bin/ctags +utags: /usr/local/Cellar/universal-ctags/p6.1.20240901.0/bin/ctags # https://archdia.com/ dv8: /Applications/DV84/bin/dv8-console # OSLOM: http://oslom.org/ diff --git a/vignettes/download_mail.Rmd b/vignettes/download_mail.Rmd new file mode 100644 index 00000000..fbf5a034 --- /dev/null +++ b/vignettes/download_mail.Rmd @@ -0,0 +1,81 @@ +--- +title: "Download Mod Mbox and Pipermail Mailing List Archives" +output: + html_document: + toc: true + number_sections: true +vignette: > + %\VignetteEngine{knitr::rmarkdown} + %\VignetteIndexEntry{Download Mod Mbox Mailing List Archives} + %\VignetteEncoding{UTF-8} +--- + + +```{r} +rm(list = ls()) +seed <- 1 +set.seed(seed) + +# Load libraries + library(kaiaulu) + library(data.table) + library(yaml) + library(stringi) + library(XML) + library(httr) +``` + + +# Introduction + +Mailing list data is stored in a variety of archives. See: +- Mod Mbox: [Apache Geronimo](https://geronimo.apache.org/mailing-lists.html)). +- Pipermail: [OpenSSL](https://mta.openssl.org/mailman/listinfo/). +is notebook demonstrates how to download and refresh mailing list archives from Mod Mbox and Pipermail. + +## Mailing List Organization + +Mailing lists are typically organized by topic or purpose. 
For example, the [OpenSSL project](https://www.openssl.org/community/mailinglists.html) maintains several mailing lists, each serving a different group: + +- **openssl-announce**: For important announcements. +- **openssl-commits**: For commit messages. +- **openssl-project**: For project discussions. +- **openssl-users**: For general user questions and discussions. + +Each mailing list maintains archives of past messages, often organized by month and year. These archives can be accessed and downloaded for analysis. + +# Project Configuration File + +To start, we load the project configuration file, which contains parameters for downloading the mailing list archives. + +// # Project Configuration File + +```{r} +conf <- yaml::read_yaml("conf/helix.yml") +mailing_list <- conf[["mailing_list"]][["pipermail"]][["project_key_1"]][["mailing_list"]] +start_year_month <- conf[["mailing_list"]][["pipermail"]][["project_key_1"]][["start_year_month"]] +end_year_month <- conf[["mailing_list"]][["pipermail"]][["project_key_1"]][["end_year_month"]] +save_folder_path <- conf[["mailing_list"]][["pipermail"]][["project_key_1"]][["save_folder_path"]] +``` + +### Explanation of Configuration Parameters +- mailing_list: The URL of the mailing list archive index page (e.g., https://lists.openssl.org/pipermail/openssl-users/). +- start_year_month: The starting date for downloading archives (in YYYYMM format). +- end_year_month: The ending date for downloading archives (in YYYYMM format). +- save_folder_path: The local directory where the downloaded archives will be saved. + + +# Pipermail Downloader + +```{r} +# Download archives +download_pipermail( + mailing_list = mailing_list, + start_year_month = start_year_month, + end_year_month = end_year_month, + save_folder_path = save_folder_path +) + +``` +After running this function, the .mbox files will be saved in the specified directory with filenames like kaiaulu_202310.mbox, kaiaulu_202311.mbox, etc. 
+ From b9a886b17de1d6ba31d532984261eae6a8790b89 Mon Sep 17 00:00:00 2001 From: daomcgill <77309217+daomcgill@users.noreply.github.com> Date: Tue, 17 Sep 2024 13:56:47 -1000 Subject: [PATCH 11/80] i #284 Edited download_pipermail() and Added refresh_pipermail() and process_gz_to_mbox_in_folder() - download_pipermail: Attempts to download .txt file first. If unavailable fallback to .gz. If using .gz file, unzips and writes output in .mbox - Added log messages - download_pipermail: Added timeout parameter to deal with case that server takes too long to respond - Added refresh_pipermail function - Updated vignettes/download_mail.Rmd to include refresh_pipermail - Added process_gz_to_mbox_in_folder function --- NAMESPACE | 3 +- R/mail.R | 501 +++++++--------------------- conf/helix.yml | 4 +- man/convert_pipermail_to_mbox.Rd | 17 - man/download_pipermail.Rd | 15 +- man/process_gz_to_mbox_in_folder.Rd | 23 ++ man/refresh_mod_mbox.Rd | 39 --- man/refresh_pipermail.Rd | 36 +- tools.yml | 2 +- vignettes/download_mail.Rmd | 20 +- 10 files changed, 197 insertions(+), 463 deletions(-) delete mode 100644 man/convert_pipermail_to_mbox.Rd create mode 100644 man/process_gz_to_mbox_in_folder.Rd delete mode 100644 man/refresh_mod_mbox.Rd diff --git a/NAMESPACE b/NAMESPACE index 5f578622..f6e15a60 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -5,7 +5,6 @@ export(assign_exact_identity) export(bipartite_graph_projection) export(commit_message_id_coverage) export(community_oslom) -export(convert_pipermail_to_mbox) export(dependencies_to_sdsmj) export(download_bugzilla_perceval_rest_issue_comments) export(download_bugzilla_perceval_traditional_issue_comments) @@ -133,13 +132,13 @@ export(parse_r_dependencies) export(parse_r_function_definition) export(parse_r_function_dependencies) export(parse_rfile_ast) +export(process_gz_to_mbox_in_folder) export(query_src_text) export(query_src_text_class_names) export(query_src_text_namespace) export(read_temporary_file) 
export(recolor_network_by_community) export(refresh_jira_issues) -export(refresh_mod_mbox) export(refresh_pipermail) export(smell_missing_links) export(smell_organizational_silo) diff --git a/R/mail.R b/R/mail.R index ef7899e3..9b304ab5 100644 --- a/R/mail.R +++ b/R/mail.R @@ -26,7 +26,10 @@ download_pipermail <- function(mailing_list, start_year_month, end_year_month, s } # Get mailing list contents - response <- httr::GET(mailing_list) + response <- httr::GET(mailing_list, httr::timeout(60)) + if (httr::status_code(response) != 200) { + stop("Failed to access the mailing list page.") + } # Parse the response parsed_response <- httr::content(response, "text") @@ -48,7 +51,6 @@ download_pipermail <- function(mailing_list, start_year_month, end_year_month, s date_cleaned <- stringi::stri_replace_last_regex(date_extracted, pattern = ":$", replacement = "") date_cleaned <- stringi::stri_trim_both(date_cleaned) # Parse the date - # Add 01 as dummy to make it a valid date date_parsed <- as.Date(paste0("01 ", date_cleaned), format = "%d %B %Y") if (is.na(date_parsed)) { warning("Date could not be parsed: ", date_cleaned) @@ -87,30 +89,53 @@ download_pipermail <- function(mailing_list, start_year_month, end_year_month, s year_month_clean <- format(date_parsed, "%Y%m") # Download URL - download_url <- paste0(mailing_list, link) + txt_url <- paste0(mailing_list, gsub("\\.gz$", "", link)) + gz_url <- paste0(mailing_list, link) + + # Attempt to download the .txt file first + download_url <- txt_url + response <- httr::GET(download_url, httr::timeout(60)) + + if (httr::status_code(response) != 200) { + # Fallback to .gz file if .txt is unavailable + download_url <- gz_url + response <- httr::GET(download_url, httr::timeout(60)) + if (httr::status_code(response) != 200) { + cat("Both .txt and .gz downloads failed for link: ", link, "\n") + next + } + } # Define the destination file - # Rename (also converts to mbox by changing extension to .mbox) - dest_gz <- 
file.path(save_folder_path, paste0('kaiaulu_', year_month_clean, '.mbox.gz')) dest <- file.path(save_folder_path, paste0('kaiaulu_', year_month_clean, '.mbox')) - # Download the gz mbox file - cat("Downloading:", download_url, "\n") - httr::GET(download_url, httr::write_disk(dest_gz, overwrite = TRUE)) - - # Unzip the file - gz_con <- gzfile(dest_gz, open = "rb") - out_con <- file(dest, open = "wb") - while (TRUE) { - bytes <- readBin(gz_con, what = raw(), n = 1024 * 1024) - if (length(bytes) == 0) break - writeBin(bytes, out_con) + # Print diagnostic info + cat("Downloading: ", download_url, "\n") + cat("Saving to: ", dest, "\n") + + # Write file to disk + if (grepl("\\.gz$", download_url)) { + # Download the .gz file + gz_file_path <- file.path(save_folder_path, paste0('kaiaulu_', year_month_clean, '.mbox.gz')) + httr::GET(download_url, httr::write_disk(gz_file_path, overwrite = TRUE), httr::timeout(60)) + + # Unzip the file + gz_con <- gzfile(gz_file_path, open = "rb") + out_con <- file(dest, open = "wb") + while (TRUE) { + bytes <- readBin(gz_con, what = raw(), n = 1024 * 1024) + if (length(bytes) == 0) break + writeBin(bytes, out_con) + } + close(gz_con) + close(out_con) + + # Remove the gz file after unzipping + file.remove(gz_file_path) + } else { + # Download the .txt file directly + httr::GET(download_url, httr::write_disk(dest, overwrite = TRUE), httr::timeout(60)) } - close(gz_con) - close(out_con) - - # Remove the gz file - file.remove(dest_gz) # Add the downloaded file to the list downloaded_files <- c(downloaded_files, dest) @@ -121,61 +146,109 @@ download_pipermail <- function(mailing_list, start_year_month, end_year_month, s } - -#' Convert pipermail archive files (.txt and .txt.gz) into an mbox format for use with \code{\link{parse_mbox}} -#' @param filelist A vector of pipermail archive files from \code{\link{download_pipermail}} -#' @return Returns `output`, the name of the resulting .mbox file in the current working directory +#' Refresh mbox 
files downloaded via pipermail +#' Uses the adopted file name convention by \code{\link{download_pipermail}} to identify +#' the latest downloaded mbox. It deletes this file, then redownloads it along with all future months +#' up to the current real-life month. +#' If the directory is empty, then it downloads all pipermail files (as mbox files) via \code{\link{download_pipermail}} +#' @param mailing_list The name of the mailing list being downloaded +#' @param start_year_month The year and month of the first file to be downloaded +#' @param save_folder_path The folder path in which all the downloaded pipermail files will be stored +#' @return Returns `downloaded_files`, a vector of the downloaded files in the current working directory #' @export -convert_pipermail_to_mbox <- function(filelist) { +refresh_pipermail <- function(mailing_list, start_year_month, save_folder_path) { - #at to @ replace function - pipermail_atreplacer <- function(string) { + # Create directory if it does not exist + if (!dir.exists(save_folder_path)) { + dir.create(save_folder_path, recursive = TRUE) + } - rstring <- sub(" at ", "@", string) + # Check if the folder is empty + files_in_folder <- list.files(save_folder_path, pattern = "kaiaulu_\\d{6}\\.mbox$") + if (length(files_in_folder) == 0) { + # If empty, download from start_year_month to the current month + end_year_month <- format(Sys.Date(), "%Y%m") + cat("Folder is empty. 
Downloading from", start_year_month, "to", end_year_month, "\n") + download_pipermail(mailing_list, start_year_month, end_year_month, save_folder_path) + return(NULL) + } + # If folder is not empty, find the most recent month + year_months <- gsub("kaiaulu_(\\d{6})\\.mbox$", "\\1", files_in_folder) + recent_month <- max(year_months) + + # Delete the most recent file + recent_file <- file.path(save_folder_path, paste0("kaiaulu_", recent_month, ".mbox")) + if (file.exists(recent_file)) { + file.remove(recent_file) + cat("Deleted the most recent file:", recent_file, "\n") + } - return(rstring) + # Redownload from the most recent month to the current real-life month + end_year_month <- format(Sys.Date(), "%Y%m") + cat("Redownloading from", recent_month, "to", end_year_month, "\n") + download_pipermail(mailing_list, recent_month, end_year_month, save_folder_path) +} - } - output <- "output.mbox" +#' Process .gz files in a folder, unzip and convert them to .mbox +#' Checks a folder for any .gz files, unzips them, and renames them +#' to .mbox format. The original .gz files are deleted after unzipping. If a .mbox +#' file with the same name already exists, it will be overwritten. +#' +#' @param folder_path The path to the folder containing both .gz and .mbox files. +#' @return A list of the .mbox files that were created or updated. 
+#' @export +process_gz_to_mbox_in_folder <- function(folder_path) { - #Create mbox file and file connection - file.create(output) - fileConn <- file(output, "w+") + # Get the list of files in the folder + files <- list.files(folder_path, full.names = TRUE) + # Find .gz files + gz_files <- files[grepl("\\.gz$", files)] - #Read lines from downloaded files and write them to mbox file - for (filename in filelist[]){ + # Check if there are no .gz files + if (length(gz_files) == 0) { + cat("This folder does not contain any .gz files.\n") + return(NULL) + } - #Open read connection - readCon <- file(filename, "r") + # Vector to store names of converted .mbox files + converted_mbox_files <- c() - data <- readLines(filename) + # Process .gz files + for (gz_file in gz_files) { + # Define the corresponding .mbox file path (remove .gz and replace with .mbox) + mbox_file <- gsub("\\.gz$", ".mbox", gz_file) - #Find email headers to send to 'at' to @ replacer - for (i in 1:length(data)) { + cat("Processing:", gz_file, " -> ", mbox_file, "\n") - data[i] <- sub("From:? 
\\S+ at \\S+", pipermail_atreplacer(data[i]), data[i]) + # Open .gz file and unzip its contents to .mbox + gz_con <- gzfile(gz_file, open = "rb") + out_con <- file(mbox_file, open = "wb") + # Read and write the contents + while (TRUE) { + bytes <- readBin(gz_con, what = raw(), n = 1024 * 1024) + if (length(bytes) == 0) break + writeBin(bytes, out_con) } - #Write files to output - writeLines(data, fileConn) + # Close connections + close(gz_con) + close(out_con) - #Close read connection - close(readCon) + # Remove the .gz file + file.remove(gz_file) - #Delete the file - unlink(filename, force = TRUE) + # Add the converted file to the list + converted_mbox_files <- c(converted_mbox_files, mbox_file) } - #Close connection to mbox file - close(fileConn) - - #return output location - return(output) + # Return the list of converted .mbox files + return(converted_mbox_files) } + #' Compose mod_mbox archives (.mbox) into a single mbox file for use with \code{\link{parse_mbox}} #' @param base_url An url pointing to the mod_mbox directory (e.g. "http://mail-archives.apache.org/mod_mbox") without trailing slashes #' @param mailing_list Name of the project mailing list (e.g. apr-dev) in the mod_mbox directory @@ -311,326 +384,6 @@ download_mod_mbox_per_month <- function(archive_url, mailing_list, archive_type, return(output) } -#' Refresh mbox files -#' -#' Uses the adopted file name convention by \code{\link{download_mod_mbox_per_month}} to identify -#' the latest downloaded mbox year i and month j. It deletes the mbox file of year i and month j, -#' then redownloads it along with the remaining months past j up to 12. Then, it calls -#' \code{\link{download_mod_mbox_per_month}} with from_year being year i+1 and to_year being -#' the current real-life year so that all newer mbox files are downloaded. -#' -#' If the directory is empty, then it downloads all mbox files starting from a definable starting year to -#' the current real-life year. 
-#' -#' @param archive_url A url pointing to the mod_mbox mailing list directory (e.g. "http://mail-archives.apache.org/mod_mbox/apr-dev") without trailing slashes -#' @param mailing_list Name of the project mailing list (e.g. apr-dev) in the mod_mbox directory -#' @param archive_type Name of the archive that the project mailing list is archived in (e.g. apache) -#' @param from_year First year in the range to be downloaded in case there are no mod_mbox files already downloaded (e.g. 201401) -#' @param save_folder_path the full *folder* path where the monthly downloaded mbox will be stored. -#' @param verbose Prints progress during execution -#' @export -refresh_mod_mbox <- function(archive_url, mailing_list, archive_type, from_year, save_folder_path,verbose=FALSE) { - # Get a list of mbox files currently downloaded in save path folder - existing_mbox_files <- list.files(save_folder_path) - output <- save_folder_path - - # Get the current year - current_date <- Sys.Date() - current_year <- as.numeric(substr(current_date, 1, 4)) - current_month <- as.numeric(substr(current_date, 6, 7)) - - # If there are no mbox files downloaded, then download mbox files as normal using download_mod_mbox_per_month - if (length(existing_mbox_files) == 0) { - if (verbose) { - message("The folder is empty. Downloading mbox files from ", from_year, " to ", current_year, ". 
\n") - } - download_mod_mbox_per_month(archive_url = archive_url, - mailing_list = mailing_list, - archive_type = archive_type, - from_year = from_year, - to_year = current_year, - save_folder_path = save_folder_path, - verbose = verbose) - } else { - counter <- 0 - destination <- list() - latest_file_name <- parse_mbox_latest_date(save_folder_path) - extracted_year_month <- sub("[^_]*_[^_]*_", "", sub(".mbox", "", latest_file_name)) - output <- path.expand(save_folder_path) - - latest_downloaded_year <- as.numeric(substr(extracted_year_month, 1, 4)) - latest_downloaded_month <- as.numeric(substr(extracted_year_month, 6, 7)) - this_file <- paste(save_folder_path, latest_file_name, sep = "/") - file.remove(this_file) - # Download files starting from deleted file month to end of that year - for (month in (latest_downloaded_month:12)) { - # Checks to see if iterator goes beyond current month, stops function if it does - if (latest_downloaded_year == current_year && month > current_month) { - return(output) - } - counter <- counter + 1 - - #Generate file destinations for the monthly files in /tmp/ - destination[[counter]] <- sprintf("%d%02d.mbox", latest_downloaded_year, month) - mbox_file_name <- stringi::stri_c(mailing_list, archive_type, destination[[counter]], sep = "_") - - if(verbose){ - print(stringi::stri_c("Downloading:",mbox_file_name,sep = " ")) - } - - #Try file download and save result - full_month_url <- stringi::stri_c(archive_url, destination[[counter]], sep = "/") - full_tmp_save_path <- file.path(output,mbox_file_name) - x <- httr::GET(full_month_url, - httr::write_disk(full_tmp_save_path,overwrite=TRUE)) - - # Remove file if error - # Can only be done post-write, see https://github.com/r-lib/httr/issues/553 - if (httr::http_error(x) && file.exists(full_tmp_save_path)) { - warning(paste0("Unable to download: ",mbox_file_name)) - file.remove(full_tmp_save_path) - } - - } - - # Call the per-month-downloader to download the new mail missing from the 
user's machine - download_mod_mbox_per_month(archive_url = archive_url, - mailing_list = mailing_list, - archive_type = archive_type, - from_year = (latest_downloaded_year+1), - to_year = current_year, - save_folder_path = save_folder_path, - verbose = verbose) - } - # End of if-else -} - -#' Refresh mbox files downloaded via pipermail -#' -#' Uses the adopted file name convention by \code{\link{download_pipermail}} to identify -#' the latest downloaded mbox year i and month j. It deletes the mbox file of year i and month j, -#' then redownloads it along with the remaining months past j up to 12. Then, it calls -#' \code{\link{download_mod_mbox_per_month}} with from_year being year i+1 and to_year being -#' the current real-life year so that all newer mbox files are downloaded. -#' -#' If the directory is empty, then it downloads all pipermail files (as mbox files) via \code{\link{download_pipermail}} -#' -#' @param archive_url A url pointing to the mod_mbox mailing list directory (e.g. "http://mail-archives.apache.org/mod_mbox/apr-dev") without trailing slashes -#' @param mailing_list Name of the project mailing list (e.g. apr-dev) in the mod_mbox directory -#' @param archive_type Name of the archive that the project mailing list is archived in (e.g. apache) -#' @param save_folder_path the full *folder* path where the monthly downloaded mbox will be stored. 
-#' @param verbose prints progress during execution -#' @export -refresh_pipermail <- function(archive_url, mailing_list, archive_type, save_folder_path,verbose=FALSE) { - # Get a list of mbox files currently downloaded in save path folder - existing_mbox_files <- list.files(save_folder_path) - - # Get the current year - current_date <- Sys.Date() - current_year <- as.numeric(substr(current_date, 1, 4)) - current_month <- as.numeric(substr(current_date, 6, 7)) - - # If there are no mbox files downloaded, then download mbox files as normal using download_pipermail - if (length(existing_mbox_files) == 0) { - if (verbose) { - message("The folder is empty. Downloading all pipermail files. \n") - } - download_pipermail(mailing_list = mailing_list, - start_year_month = start_year_month, - end_year_month = end_year_month, - save_folder_path = save_folder_path) - } else { - latest_file_name <- parse_mbox_latest_date(save_folder_path) - extracted_year_month <- sub("[^_]*_[^_]*_", "", sub(".mbox", "", latest_file_name)) - output <- path.expand(save_folder_path) - - latest_downloaded_year <- as.numeric(substr(extracted_year_month, 1, 4)) - latest_downloaded_month <- as.numeric(substr(extracted_year_month, 5, 6)) - this_file <- paste(save_folder_path, latest_file_name, sep = "/") - # Overwrite file because new email may have been added at this point in this month - file.remove(this_file) - - # Download txt files starting from deleted file month to end of that year, save as mbox - download_txt_files_latest_downloaded_year <- function(archive_url, mailing_list, archive_type, latest_downloaded_year, latest_downloaded_month, current_year, current_month, save_folder_path) { - counter <- 0 - destination <- list() - mbox_correct_name_format <- list() - output <- save_folder_path - - for (month in (latest_downloaded_month:12)) { - if (latest_downloaded_year == current_year && month > current_month) { - return(output) - } - counter <- counter + 1 - - #Generate file destinations for the 
monthly files in /tmp/ - destination[[counter]] <- sprintf("%d-%s.txt", latest_downloaded_year, month.name[month]) - mbox_correct_name_format[[counter]] <- sprintf("%d%02d.mbox", latest_downloaded_year, month) - mbox_file_name <- stringi::stri_c(mailing_list, archive_type, mbox_correct_name_format[[counter]], sep = "_") - - #Try file download and save result - full_month_url <- stringi::stri_c(archive_url, destination[[counter]], sep = "/") - full_tmp_save_path <- file.path(output,mbox_file_name) - x <- httr::GET(full_month_url, - httr::write_disk(full_tmp_save_path,overwrite=TRUE)) - - # Remove file if error - # Can only be done post-write, see https://github.com/r-lib/httr/issues/553 - if (httr::http_error(x) && file.exists(full_tmp_save_path)) { - warning(paste0("Unable to download: ",destination[[counter]])) - file.remove(full_tmp_save_path) - } - - } - } - - # Download txt.gz files starting from deleted file month to the end of that year, save as mbox - download_txt_gz_files_latest_downloaded_year <- function(archive_url, mailing_list, archive_type, latest_downloaded_year, latest_downloaded_month, current_year, current_month, save_folder_path) { - - counter <- 0 - destination <- list() - mbox_correct_name_format <- list() - output <- save_folder_path - - for (month in (latest_downloaded_month:12)) { - if (latest_downloaded_year == current_year && month > current_month) { - return(output) - } - counter <- counter + 1 - - #Generate file destinations for the monthly files in /tmp/ - destination[[counter]] <- sprintf("%d-%s.txt.gz", latest_downloaded_year, month.name[month]) - mbox_correct_name_format[[counter]] <- sprintf("%d%02d.mbox", latest_downloaded_year, month) - mbox_file_name <- stringi::stri_c(mailing_list, archive_type, mbox_correct_name_format[[counter]], sep = "_") - - #Try file download and save result - full_month_url <- stringi::stri_c(archive_url, destination[[counter]], sep = "/") - full_tmp_save_path <- file.path(output,mbox_file_name) - x <- 
httr::GET(full_month_url, - httr::write_disk(full_tmp_save_path,overwrite=TRUE)) - - # Remove file if error - # Can only be done post-write, see https://github.com/r-lib/httr/issues/553 - if (httr::http_error(x) && file.exists(full_tmp_save_path)) { - warning(paste0("Unable to download: ",destination[[counter]])) - file.remove(full_tmp_save_path) - } - - } - } - - # Download txt files from the year after the latest downloaded year to the current real life year - download_txt_files_current_year <- function(archive_url, mailing_list, archive_type, latest_downloaded_year, current_year, current_month, save_folder_path) { - - counter <- 0 - destination <- list() - mbox_correct_name_format <- list() - output <- save_folder_path - - for (year in (latest_downloaded_year+1):current_year) { - for (month in (1:12)) { - if (year == current_year && month > current_month) { - return(output) - } - counter <- counter + 1 - - #Generate file destinations for the monthly files in /tmp/ - destination[[counter]] <- sprintf("%d-%s.txt", year, month.name[month]) - mbox_correct_name_format[[counter]] <- sprintf("%d%02d.mbox", year, month) - mbox_file_name <- stringi::stri_c(mailing_list, archive_type, mbox_correct_name_format[[counter]], sep = "_") - - #Try file download and save result - full_month_url <- stringi::stri_c(archive_url, destination[[counter]], sep = "/") - full_tmp_save_path <- file.path(output,mbox_file_name) - x <- httr::GET(full_month_url, - httr::write_disk(full_tmp_save_path,overwrite=TRUE)) - - # Remove file if error - # Can only be done post-write, see https://github.com/r-lib/httr/issues/553 - if (httr::http_error(x) && file.exists(full_tmp_save_path)) { - warning(paste0("Unable to download: ",destination[[counter]])) - file.remove(full_tmp_save_path) - } - - } - } - - } - - # Download txt.gz files from the year after the latest downloaded year to the current real life year - download_txt_gz_files_current_year <- function(archive_url, mailing_list, archive_type, 
latest_downloaded_year, current_year, current_month, save_folder_path) { - - counter <- 0 - destination <- list() - mbox_correct_name_format <- list() - output <- save_folder_path - - for (year in (latest_downloaded_year+1):current_year) { - for (month in (1:12)) { - if (year == current_year && month > current_month) { - return(output) - } - counter <- counter + 1 - - #Generate file destinations for the monthly files in /tmp/ - destination[[counter]] <- sprintf("%d-%s.txt.gz", year, month.name[month]) - mbox_correct_name_format[[counter]] <- sprintf("%d%02d.mbox", year, month) - mbox_file_name <- stringi::stri_c(mailing_list, archive_type, mbox_correct_name_format[[counter]], sep = "_") - - #Try file download and save result - full_month_url <- stringi::stri_c(archive_url, destination[[counter]], sep = "/") - full_tmp_save_path <- file.path(output,mbox_file_name) - x <- httr::GET(full_month_url, - httr::write_disk(full_tmp_save_path,overwrite=TRUE)) - - # Remove file if error - # Can only be done post-write, see https://github.com/r-lib/httr/issues/553 - if (httr::http_error(x) && file.exists(full_tmp_save_path)) { - warning(paste0("Unable to download: ",destination[[counter]])) - file.remove(full_tmp_save_path) - } - - } - } - - } - - download_txt_files_latest_downloaded_year(archive_url=archive_url, - mailing_list=mailing_list, - archive_type=archive_type, - latest_downloaded_year=latest_downloaded_year, - latest_downloaded_month=latest_downloaded_month, - current_year = current_year, - current_month = current_month, - save_folder_path=save_folder_path) - - download_txt_gz_files_latest_downloaded_year(archive_url=archive_url, - mailing_list=mailing_list, - archive_type=archive_type, - latest_downloaded_year=latest_downloaded_year, - latest_downloaded_month=latest_downloaded_month, - current_year = current_year, - current_month = current_month, - save_folder_path=save_folder_path) - - download_txt_files_current_year(archive_url=archive_url, - 
mailing_list=mailing_list, - archive_type=archive_type, - latest_downloaded_year=latest_downloaded_year, - current_year=current_year, - current_month = current_month, - save_folder_path=save_folder_path) - - download_txt_gz_files_current_year(archive_url=archive_url, - mailing_list=mailing_list, - archive_type=archive_type, - latest_downloaded_year=latest_downloaded_year, - current_year = current_year, - current_month = current_month, - save_folder_path=save_folder_path) - } - # End of if-else -} ############## Parsers ############## diff --git a/conf/helix.yml b/conf/helix.yml index d0b623c0..ba69d6e9 100644 --- a/conf/helix.yml +++ b/conf/helix.yml @@ -69,8 +69,8 @@ mailing_list: project_key_2: # archive_url: https://mta.openssl.org/mailman/listinfo/ mailing_list: https://mta.openssl.org/pipermail/openssl-project/ - start_year_month: 201903 - end_year_month: 202103 + start_year_month: 202203 + end_year_month: 202303 save_folder_path: "../save_folder_mail_2" issue_tracker: diff --git a/man/convert_pipermail_to_mbox.Rd b/man/convert_pipermail_to_mbox.Rd deleted file mode 100644 index 441b1230..00000000 --- a/man/convert_pipermail_to_mbox.Rd +++ /dev/null @@ -1,17 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/mail.R -\name{convert_pipermail_to_mbox} -\alias{convert_pipermail_to_mbox} -\title{Convert pipermail archive files (.txt and .txt.gz) into an mbox format for use with \code{\link{parse_mbox}}} -\usage{ -convert_pipermail_to_mbox(filelist) -} -\arguments{ -\item{filelist}{A vector of pipermail archive files from \code{\link{download_pipermail}}} -} -\value{ -Returns `output`, the name of the resulting .mbox file in the current working directory -} -\description{ -Convert pipermail archive files (.txt and .txt.gz) into an mbox format for use with \code{\link{parse_mbox}} -} diff --git a/man/download_pipermail.Rd b/man/download_pipermail.Rd index 218527c6..b36a0d7a 100644 --- a/man/download_pipermail.Rd +++ 
b/man/download_pipermail.Rd @@ -4,19 +4,24 @@ \alias{download_pipermail} \title{Download all pipermail files in an archive as mbox files} \usage{ -download_pipermail(archive_url, mailing_list, archive_type, save_folder_path) +download_pipermail( + mailing_list, + start_year_month, + end_year_month, + save_folder_path +) } \arguments{ -\item{archive_url}{An url pointing to a pipermail archive} - \item{mailing_list}{The name of the mailing list being downloaded} -\item{archive_type}{The name of the type of archive that the mailing list is stored in} +\item{start_year_month}{The year and month of the first file to be downloaded} + +\item{end_year_month}{The year and month of the last file to be downloaded} \item{save_folder_path}{The folder path in which all the downloaded pipermail files will be stored} } \value{ -Returns `destination`, a vector of the downloaded files in the current working directory +Returns `downloaded_files`, a vector of the downloaded files in the current working directory } \description{ Download all pipermail files in an archive as mbox files diff --git a/man/process_gz_to_mbox_in_folder.Rd b/man/process_gz_to_mbox_in_folder.Rd new file mode 100644 index 00000000..3564ac9c --- /dev/null +++ b/man/process_gz_to_mbox_in_folder.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/mail.R +\name{process_gz_to_mbox_in_folder} +\alias{process_gz_to_mbox_in_folder} +\title{Process .gz files in a folder, unzip and convert them to .mbox +Checks a folder for any .gz files, unzips them, and renames them +to .mbox format. The original .gz files are deleted after unzipping. If a .mbox +file with the same name already exists, it will be overwritten.} +\usage{ +process_gz_to_mbox_in_folder(folder_path) +} +\arguments{ +\item{folder_path}{The path to the folder containing both .gz and .mbox files.} +} +\value{ +A list of the .mbox files that were created or updated. 
+} +\description{ +Process .gz files in a folder, unzip and convert them to .mbox +Checks a folder for any .gz files, unzips them, and renames them +to .mbox format. The original .gz files are deleted after unzipping. If a .mbox +file with the same name already exists, it will be overwritten. +} diff --git a/man/refresh_mod_mbox.Rd b/man/refresh_mod_mbox.Rd deleted file mode 100644 index 19132bd3..00000000 --- a/man/refresh_mod_mbox.Rd +++ /dev/null @@ -1,39 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/mail.R -\name{refresh_mod_mbox} -\alias{refresh_mod_mbox} -\title{Refresh mbox files} -\usage{ -refresh_mod_mbox( - archive_url, - mailing_list, - archive_type, - from_year, - save_folder_path, - verbose = FALSE -) -} -\arguments{ -\item{archive_url}{A url pointing to the mod_mbox mailing list directory (e.g. "http://mail-archives.apache.org/mod_mbox/apr-dev") without trailing slashes} - -\item{mailing_list}{Name of the project mailing list (e.g. apr-dev) in the mod_mbox directory} - -\item{archive_type}{Name of the archive that the project mailing list is archived in (e.g. apache)} - -\item{from_year}{First year in the range to be downloaded in case there are no mod_mbox files already downloaded (e.g. 201401)} - -\item{save_folder_path}{the full *folder* path where the monthly downloaded mbox will be stored.} - -\item{verbose}{Prints progress during execution} -} -\description{ -Uses the adopted file name convention by \code{\link{download_mod_mbox_per_month}} to identify -the latest downloaded mbox year i and month j. It deletes the mbox file of year i and month j, -then redownloads it along with the remaining months past j up to 12. Then, it calls -\code{\link{download_mod_mbox_per_month}} with from_year being year i+1 and to_year being -the current real-life year so that all newer mbox files are downloaded. 
-} -\details{ -If the directory is empty, then it downloads all mbox files starting from a definable starting year to -the current real-life year. -} diff --git a/man/refresh_pipermail.Rd b/man/refresh_pipermail.Rd index 427c66d2..ba2b2e15 100644 --- a/man/refresh_pipermail.Rd +++ b/man/refresh_pipermail.Rd @@ -2,34 +2,28 @@ % Please edit documentation in R/mail.R \name{refresh_pipermail} \alias{refresh_pipermail} -\title{Refresh mbox files downloaded via pipermail} +\title{Refresh mbox files downloaded via pipermail +Uses the adopted file name convention by \code{\link{download_pipermail}} to identify +the latest downloaded mbox. It deletes this file, then redownloads it along with all future months +up to the current real-life month. +If the directory is empty, then it downloads all pipermail files (as mbox files) via \code{\link{download_pipermail}}} \usage{ -refresh_pipermail( - archive_url, - mailing_list, - archive_type, - save_folder_path, - verbose = FALSE -) +refresh_pipermail(mailing_list, start_year_month, save_folder_path) } \arguments{ -\item{archive_url}{A url pointing to the mod_mbox mailing list directory (e.g. "http://mail-archives.apache.org/mod_mbox/apr-dev") without trailing slashes} - -\item{mailing_list}{Name of the project mailing list (e.g. apr-dev) in the mod_mbox directory} - -\item{archive_type}{Name of the archive that the project mailing list is archived in (e.g. 
apache)} +\item{mailing_list}{The name of the mailing list being downloaded} -\item{save_folder_path}{the full *folder* path where the monthly downloaded mbox will be stored.} +\item{start_year_month}{The year and month of the first file to be downloaded} -\item{verbose}{prints progress during execution} +\item{save_folder_path}{The folder path in which all the downloaded pipermail files will be stored} +} +\value{ +Returns `downloaded_files`, a vector of the downloaded files in the current working directory } \description{ +Refresh mbox files downloaded via pipermail Uses the adopted file name convention by \code{\link{download_pipermail}} to identify -the latest downloaded mbox year i and month j. It deletes the mbox file of year i and month j, -then redownloads it along with the remaining months past j up to 12. Then, it calls -\code{\link{download_mod_mbox_per_month}} with from_year being year i+1 and to_year being -the current real-life year so that all newer mbox files are downloaded. -} -\details{ +the latest downloaded mbox. It deletes this file, then redownloads it along with all future months +up to the current real-life month. 
If the directory is empty, then it downloads all pipermail files (as mbox files) via \code{\link{download_pipermail}} } diff --git a/tools.yml b/tools.yml index fd4ac52a..64667d8d 100644 --- a/tools.yml +++ b/tools.yml @@ -7,7 +7,7 @@ refactoring_miner: ~/RefactoringMiner-1.0/bin/RefactoringMiner # https://github.com/boyter/scc scc: ~/scc/scc # universal-ctags -utags: /usr/local/Cellar/universal-ctags/p6.1.20240901.0/bin/ctags +utags: /usr/local/Cellar/universal-ctags/HEAD-40b5861/bin/ctags # https://archdia.com/ dv8: /Applications/DV84/bin/dv8-console # OSLOM: http://oslom.org/ diff --git a/vignettes/download_mail.Rmd b/vignettes/download_mail.Rmd index fbf5a034..d7eb631d 100644 --- a/vignettes/download_mail.Rmd +++ b/vignettes/download_mail.Rmd @@ -64,9 +64,8 @@ save_folder_path <- conf[["mailing_list"]][["pipermail"]][["project_key_1"]][["s - end_year_month: The ending date for downloading archives (in YYYYMM format). - save_folder_path: The local directory where the downloaded archives will be saved. - # Pipermail Downloader - +You can download the archives using the download_pipermail() function, which downloads and saves .mbox files to the specified directory. The .mbox files are named with the format kaiaulu_YYYYMM.mbox, where YYYYMM refers to the year and month of the archive. ```{r} # Download archives download_pipermail( @@ -79,3 +78,20 @@ download_pipermail( ``` After running this function, the .mbox files will be saved in the specified directory with filenames like kaiaulu_202310.mbox, kaiaulu_202311.mbox, etc. +# Pipermail Refresher +In some cases, you may want to refresh the archive to ensure the most recent months are up-to-date or to handle updates to the mailing list. The refresh_pipermail() function helps automate this process. + +How refresh_pipermail Works +1. Checks if the folder is empty: If the folder is empty, it downloads archives starting from start_year_month to the current month using download_pipermail(). +2. 
Finds the most recent file: If the folder is not empty, the function checks for the most recent month’s file (based on the filename) and deletes it. +3. Redownloads from the most recent month: The function then redownloads the archive from the most recent month up to the current month. +```{r} +# Refresh archives +refresh_pipermail( + mailing_list = mailing_list, + start_year_month = start_year_month, + save_folder_path = save_folder_path +) + +``` +This function will ensure that the most recent archives are always up-to-date by redownloading the current month's archive if necessary and adding any new months that have been added to the mailing list. From 3c88140cb46bccda5e37c6a310b5dab74f99fc04 Mon Sep 17 00:00:00 2001 From: daomcgill <77309217+daomcgill@users.noreply.github.com> Date: Wed, 18 Sep 2024 14:18:20 -1000 Subject: [PATCH 12/80] i #284 Added more descriptive comments. Made minor changes to pipermail refresher. - Replaced paste0 with stringi::stri_c - Removed create directory if does not exist - Added more verbose descriptions/comments - Added dividers within functions - Added verbose parameter - Added else block for refresher - Added call to process_gz_to_mbox_in_folder at end of refresher - parse_mbox: stri_replace_last was not working, changed it to stringi::stri_replace_last_regex - Tested parse_mbox. Perceval was not returning any output. I will look further into why this is happening. --- R/mail.R | 264 +++++++++++++++++----------- man/download_pipermail.Rd | 21 ++- man/process_gz_to_mbox_in_folder.Rd | 17 +- man/refresh_pipermail.Rd | 36 ++-- tools.yml | 2 +- 5 files changed, 203 insertions(+), 137 deletions(-) diff --git a/R/mail.R b/R/mail.R index 9b304ab5..fc2b83ec 100644 --- a/R/mail.R +++ b/R/mail.R @@ -4,122 +4,140 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at https://mozilla.org/MPL/2.0/. 
-############## Downloader ############## +############## Pipermail Downloader ############## #' Download all pipermail files in an archive as mbox files -#' @param mailing_list The name of the mailing list being downloaded -#' @param start_year_month The year and month of the first file to be downloaded -#' @param end_year_month The year and month of the last file to be downloaded +#' +#' @description This function downloads pipermail archives from a specified pipermail mailing list as .mbox files. +#' It begins by downloading an .html file that contains the URLs for monthly archives in .txt or .gz formats. +#' The function first attempts to download the .txt file, and if unavailable, it falls back to downloading the .gz file. +#' +#' When a .gz file is downloaded, the function automatically unzips and converts it into an .mbox file, +#' overwriting any existing file with the same name. The original .gz file is deleted after extraction. +#' +#' The downloaded .mbox files are saved in the specified folder following the naming convention kaiaulu_YYYYMM.mbox. +#' The function only downloads files that fall between the specified start_year_month and end_year_month. +#' +#' @param mailing_list The name of the mailing list being downloaded (e.g. 
"https://mta.openssl.org/pipermail/openssl-announce/") +#' @param start_year_month The year and month of the first file to be downloaded (format: 'YYYYMM') +#' @param end_year_month The year and month of the last file to be downloaded (format: 'YYYYMM', or use 'format(Sys.Date(), "%Y%m")' for the current month) #' @param save_folder_path The folder path in which all the downloaded pipermail files will be stored +#' @param verbose Logical; if TRUE, prints diagnostic messages during the download process #' @return Returns `downloaded_files`, a vector of the downloaded files in the current working directory #' @export -download_pipermail <- function(mailing_list, start_year_month, end_year_month, save_folder_path) { +download_pipermail <- function(mailing_list, start_year_month, end_year_month, save_folder_path, verbose = TRUE) { - # Create directory if it does not exist - if (!dir.exists(save_folder_path)) { - dir.create(save_folder_path, recursive = TRUE) - } - - # Ensure mailing_list URL ends with a slash + ########## Download and Parse Mailing List HTML for Links ########## + # Ensure mailing_list url ends with a slash, which is important when constructing links for downloading files, + # since the extracted links are relative to the base URL. + # e.g.base url: https://mta.openssl.org/pipermail/openssl-announce/ and extracted link: 2024-June.txt.gz if (!stringi::stri_endswith_fixed(mailing_list, "/")) { - mailing_list <- paste0(mailing_list, "/") + mailing_list <- stringi::stri_c(mailing_list, "/") } - # Get mailing list contents + # Sends a GET request to the mailing list’s URL to retrieve contents. This is the main page of the mailing list archive, + # which contains links to individual month files (in .txt or .gz format). 
response <- httr::GET(mailing_list, httr::timeout(60)) if (httr::status_code(response) != 200) { stop("Failed to access the mailing list page.") } - # Parse the response + # The content is parsed as text to extract the rows of data from the table that contains the file links. parsed_response <- httr::content(response, "text") doc_obj <- XML::htmlParse(parsed_response, asText = TRUE) - # Table rows + # Get all table rows in the archive page. These rows contain the links to the individual month files. rows <- XML::getNodeSet(doc_obj, "//tr") - - # Skip header row + # Skip the header row, to get to data rows data_rows <- rows[-1] - - # Vector for link storage + # Create an empty vector for storing the links that will be extracted. links <- c() - # Extract the date and link from each row + ########## Extract Date and Links ########## + # Loop through the data rows and extract the date and link from each row. + # The date is in the first column, and the link is in the third column. for (row in data_rows) { - # Date in YYYYMM format + # Extract and clean the date, which is in the format "Month Year" (e.g., "June 2024"). date_extracted <- XML::xpathSApply(row, ".//td[1]", XML::xmlValue) date_cleaned <- stringi::stri_replace_last_regex(date_extracted, pattern = ":$", replacement = "") date_cleaned <- stringi::stri_trim_both(date_cleaned) - # Parse the date - date_parsed <- as.Date(paste0("01 ", date_cleaned), format = "%d %B %Y") + # Parse the cleaned date into a valid date object. This allows us to convert it into the "YYYYMM" format. + date_parsed <- as.Date(stringi::stri_c("01 ", date_cleaned), format = "%d %B %Y") if (is.na(date_parsed)) { warning("Date could not be parsed: ", date_cleaned) next } year_month <- format(date_parsed, "%Y%m") - # Check if date is within range + # Check if the extracted year_month falls within the specified range of start_year_month to end_year_month. + # If it does, proceed to extract the file link from the third column of the row. 
if (year_month >= start_year_month & year_month <= end_year_month) { - # Get href from column 3 + # Get the link (href) from the third column. This is the link to the .txt or .gz file for that month. link_nodes <- XML::xpathSApply(row, ".//td[3]/a", XML::xmlGetAttr, 'href') if (length(link_nodes) == 0) { warning("No link found in row for date: ", date_cleaned) next } - # Store the link in links + # Store the link in the links vector, for later download. link <- link_nodes[1] links <- c(links, link) } } - # Vector for downloaded files + ########## Use Links to Download Individual Files ########## + # Initialize a vector for storing the paths of the downloaded files. downloaded_files <- c() for (i in seq_along(links)) { link <- links[i] - # Extract the name without the .txt.gz extension + # Extract the base name of the file (without the .txt.gz extension), so we can construct the correct download paths. base_name <- gsub("\\.txt\\.gz$", "", link) - # Parse the date from the base name - date_parsed <- as.Date(paste0("01-", base_name), format = "%d-%Y-%B") + # Parse the date from the base name and convert it into "YYYYMM" format for consistency with our file naming. + date_parsed <- as.Date(stringi::stri_c("01-", base_name), format = "%d-%Y-%B") if (is.na(date_parsed)) { warning("Could not parse date from link: ", link) next } year_month_clean <- format(date_parsed, "%Y%m") - # Download URL - txt_url <- paste0(mailing_list, gsub("\\.gz$", "", link)) - gz_url <- paste0(mailing_list, link) + # Construct the download URLs for both the .txt and .gz versions of the file. + # The function will first attempt to download the .txt version. + txt_url <- stringi::stri_c(mailing_list, gsub("\\.gz$", "", link)) + gz_url <- stringi::stri_c(mailing_list, link) # Attempt to download the .txt file first download_url <- txt_url response <- httr::GET(download_url, httr::timeout(60)) + # If the response status code is not 200, the file is not available. 
if (httr::status_code(response) != 200) { # Fallback to .gz file if .txt is unavailable download_url <- gz_url response <- httr::GET(download_url, httr::timeout(60)) if (httr::status_code(response) != 200) { - cat("Both .txt and .gz downloads failed for link: ", link, "\n") + warning("Both .txt and .gz downloads failed for link: ", link, "\n") next } } - # Define the destination file - dest <- file.path(save_folder_path, paste0('kaiaulu_', year_month_clean, '.mbox')) + # Define the destination file name and path where the downloaded content will be saved as a .mbox file. + dest <- file.path(save_folder_path, stringi::stri_c('kaiaulu_', year_month_clean, '.mbox')) - # Print diagnostic info - cat("Downloading: ", download_url, "\n") - cat("Saving to: ", dest, "\n") + ########## Write Downloaded File to Disk ########## + # Print diagnostic info if verbose is TRUE + if (verbose) { + cat("Downloading: ", download_url, "\n") + cat("Saving to: ", dest, "\n") + } - # Write file to disk + # Write the downloaded file to disk. If the file is a .gz file, it needs to be unzipped and converted to .mbox format. if (grepl("\\.gz$", download_url)) { - # Download the .gz file - gz_file_path <- file.path(save_folder_path, paste0('kaiaulu_', year_month_clean, '.mbox.gz')) + # Download the .gz file to a temporary location. + gz_file_path <- file.path(save_folder_path, stringi::stri_c('kaiaulu_', year_month_clean, '.mbox.gz')) httr::GET(download_url, httr::write_disk(gz_file_path, overwrite = TRUE), httr::timeout(60)) - # Unzip the file + # Unzip the .gz file and save the contents as a .mbox file. gz_con <- gzfile(gz_file_path, open = "rb") out_con <- file(dest, open = "wb") while (TRUE) { @@ -130,121 +148,151 @@ download_pipermail <- function(mailing_list, start_year_month, end_year_month, s close(gz_con) close(out_con) - # Remove the gz file after unzipping + # Remove the .gz file after unzipping to avoid storing duplicate data. 
file.remove(gz_file_path) } else { - # Download the .txt file directly + # If the .txt file is available, download it directly and save it as a .mbox file. httr::GET(download_url, httr::write_disk(dest, overwrite = TRUE), httr::timeout(60)) } - # Add the downloaded file to the list + # Add the downloaded file path to the list of downloaded files. downloaded_files <- c(downloaded_files, dest) } - # Return downloaded files + ########## Return List of Downloaded Files ########## + # Return the list of downloaded .mbox files return(downloaded_files) } +############## Pipermail Refresher ############## #' Refresh mbox files downloaded via pipermail -#' Uses the adopted file name convention by \code{\link{download_pipermail}} to identify -#' the latest downloaded mbox. It deletes this file, then redownloads it along with all future months -#' up to the current real-life month. -#' If the directory is empty, then it downloads all pipermail files (as mbox files) via \code{\link{download_pipermail}} -#' @param mailing_list The name of the mailing list being downloaded -#' @param start_year_month The year and month of the first file to be downloaded -#' @param save_folder_path The folder path in which all the downloaded pipermail files will be stored -#' @return Returns `downloaded_files`, a vector of the downloaded files in the current working directory +#' +#' @description This function refreshes the mailing list files by checking the contents of a specified folder. +#' If the folder is empty, it calls \code{download_pipermail} to download all pipermail files from start_year_month to the current month. +#' If the folder contains already-downloaded mbox files, it identifies the most recent month, deletes that file, and redownloads it +#' along with all future months up to the current real-life month. +#' +#' The naming convention of files is kaiaulu_YYYYMM.mbox, and the function uses this pattern to identify the most recent month. 
+#' After deleting the most recent file, the function ensures that the month is redownloaded, along with all subsequent months up to the current month. +#' Redownloading the most recent file makes sure that any files added in that month after the latest refresh are included. +#' +#' @param mailing_list The URL of the mailing list being downloaded (e.g., "https://mta.openssl.org/pipermail/openssl-announce/") +#' @param start_year_month The year and month of the first file to be downloaded (format: 'YYYYMM'). +#' @param save_folder_path The folder path in which all the downloaded pipermail files will be stored. +#' @param verbose Logical; if TRUE, prints diagnostic messages. +#' @return Returns `downloaded_files`, a vector of the newly downloaded files in the current working directory. #' @export -refresh_pipermail <- function(mailing_list, start_year_month, save_folder_path) { +refresh_pipermail <- function(mailing_list, start_year_month, save_folder_path, verbose = TRUE) { - # Create directory if it does not exist - if (!dir.exists(save_folder_path)) { - dir.create(save_folder_path, recursive = TRUE) - } - - # Check if the folder is empty + ########## Check if Folder is Empty ########## + # Check the contents of the folder to see if any .mbox files are already present + # The function looks for files that match the naming pattern 'kaiaulu_YYYYMM.mbox' files_in_folder <- list.files(save_folder_path, pattern = "kaiaulu_\\d{6}\\.mbox$") + if (length(files_in_folder) == 0) { - # If empty, download from start_year_month to the current month + # If the folder is empty, download all pipermail files starting from the start_year_month + # The end date is set to the current month based on the system date end_year_month <- format(Sys.Date(), "%Y%m") - cat("Folder is empty. Downloading from", start_year_month, "to", end_year_month, "\n") + if (verbose) cat("Folder is empty. 
Downloading from", start_year_month, "to", end_year_month, "\n") + + # Call the download_pipermail function to download files from start_year_month to end_year_month download_pipermail(mailing_list, start_year_month, end_year_month, save_folder_path) - return(NULL) - } - # If folder is not empty, find the most recent month - year_months <- gsub("kaiaulu_(\\d{6})\\.mbox$", "\\1", files_in_folder) - recent_month <- max(year_months) - - # Delete the most recent file - recent_file <- file.path(save_folder_path, paste0("kaiaulu_", recent_month, ".mbox")) - if (file.exists(recent_file)) { - file.remove(recent_file) - cat("Deleted the most recent file:", recent_file, "\n") } + ########## Identify the Most Recent Month ########## + else { + # If the folder is not empty, identify the most recent month based on the filenames + # The filenames follow the pattern 'kaiaulu_YYYYMM.mbox', so we extract the YYYYMM part of the filenames + year_months <- gsub("kaiaulu_(\\d{6})\\.mbox$", "\\1", files_in_folder) + + # Find the most recent month by taking the maximum of the extracted YYYYMM values + recent_month <- max(year_months) + + # Delete the most recent file before redownloading it + recent_file <- file.path(save_folder_path, stringi::stri_c("kaiaulu_", recent_month, ".mbox")) + if (file.exists(recent_file)) { + file.remove(recent_file) + if (verbose) cat("Deleted the most recent file:", recent_file, "\n") + } + + ########## Redownload from the Most Recent Month ########## + # Set the end_year_month to the current month (based on the system date) + end_year_month <- format(Sys.Date(), "%Y%m") + + # Redownload files from the most recent month (that was just deleted) to the current month + if (verbose) cat("Redownloading from", recent_month, "to", end_year_month, "\n") - # Redownload from the most recent month to the current real-life month - end_year_month <- format(Sys.Date(), "%Y%m") - cat("Redownloading from", recent_month, "to", end_year_month, "\n") - 
download_pipermail(mailing_list, recent_month, end_year_month, save_folder_path) + # Call the download_pipermail function to redownload the deleted month and all subsequent months up to the current month + download_pipermail(mailing_list, recent_month, end_year_month, save_folder_path) + } + ########## Process .gz Files After Refresh ########## + # Call process_gz_to_mbox_in_folder to ensure all .gz files are converted to .mbox after the refresh + if (verbose) cat("Processing .gz files in the folder (if any) to convert them to .mbox format...\n") + process_gz_to_mbox_in_folder(folder_path = save_folder_path, verbose = verbose) } -#' Process .gz files in a folder, unzip and convert them to .mbox -#' Checks a folder for any .gz files, unzips them, and renames them -#' to .mbox format. The original .gz files are deleted after unzipping. If a .mbox -#' file with the same name already exists, it will be overwritten. +#' Process .gz files in a folder and convert them to .mbox +#' +#' @description This function scans a specified folder for any .gz files, unzips them, +#' and renames them to the .mbox format. After unzipping, the original .gz files are deleted. +#' If a .mbox file with the same name already exists, it will be overwritten. +#' This makes sure that all the files in the folder are in .mbox format, ready for parsing. #' #' @param folder_path The path to the folder containing both .gz and .mbox files. +#' @param verbose if TRUE, prints diagnostic messages during processing. #' @return A list of the .mbox files that were created or updated. 
#' @export -process_gz_to_mbox_in_folder <- function(folder_path) { +process_gz_to_mbox_in_folder <- function(folder_path, verbose = TRUE) { - # Get the list of files in the folder + # Get the list of all files in the folder, including full paths files <- list.files(folder_path, full.names = TRUE) - # Find .gz files + # Identify .gz files from the list of files gz_files <- files[grepl("\\.gz$", files)] - # Check if there are no .gz files + # If there are no .gz files, print a message (if verbose is TRUE) and return NULL if (length(gz_files) == 0) { - cat("This folder does not contain any .gz files.\n") + if (verbose) cat("This folder does not contain any .gz files.\n") return(NULL) } - # Vector to store names of converted .mbox files + # Create a vector to store the names of the converted .mbox files converted_mbox_files <- c() - # Process .gz files + ########## Process Each .gz File ########## + # Iterate over each .gz file, unzip it, and convert it to .mbox for (gz_file in gz_files) { - # Define the corresponding .mbox file path (remove .gz and replace with .mbox) + # Define the corresponding .mbox file path by replacing .gz with .mbox in the file name mbox_file <- gsub("\\.gz$", ".mbox", gz_file) - cat("Processing:", gz_file, " -> ", mbox_file, "\n") + if (verbose) cat("Processing:", gz_file, " -> ", mbox_file, "\n") - # Open .gz file and unzip its contents to .mbox + # Open the .gz file in binary mode for reading gz_con <- gzfile(gz_file, open = "rb") + + # Create a new .mbox file and open it in binary mode for writing out_con <- file(mbox_file, open = "wb") - # Read and write the contents + # Read the contents of the .gz file and write the chunks to the .mbox file while (TRUE) { bytes <- readBin(gz_con, what = raw(), n = 1024 * 1024) if (length(bytes) == 0) break writeBin(bytes, out_con) } - # Close connections + # Close both the input (gz) and output (mbox) file connections close(gz_con) close(out_con) - # Remove the .gz file + # After successfully converting 
the file, delete the original .gz file file.remove(gz_file) - # Add the converted file to the list + # Add the newly created .mbox file to the list of converted files converted_mbox_files <- c(converted_mbox_files, mbox_file) } - # Return the list of converted .mbox files + # Return the vector of all the .mbox files that were created or updated return(converted_mbox_files) } @@ -372,7 +420,7 @@ download_mod_mbox_per_month <- function(archive_url, mailing_list, archive_type, # Remove file if error # Can only be done post-write, see https://github.com/r-lib/httr/issues/553 if (httr::http_error(x) && file.exists(full_tmp_save_path)) { - warning(paste0("Unable to download: ",destination[[counter]])) + warning(stringi::stri_c("Unable to download: ",destination[[counter]])) file.remove(full_tmp_save_path) } @@ -399,17 +447,19 @@ download_mod_mbox_per_month <- function(archive_url, mailing_list, archive_type, #' @param mbox_path path to mbox archive file (ends in .mbox) #' @export #' @family parsers -parse_mbox <- function(perceval_path,mbox_path){ +parse_mbox <- function(perceval_path, mbox_path){ # Expand paths (e.g. "~/Desktop" => "/Users/someuser/Desktop") perceval_path <- path.expand(perceval_path) mbox_path <- path.expand(mbox_path) # Remove ".mbox" - mbox_uri <- stri_replace_last(mbox_path,replacement="",regex=".mbox") + mbox_uri <- stringi::stri_replace_last_regex(mbox_path, pattern = "\\.mbox$", replacement = "") + # Use percerval to parse mbox_path. --json line is required to be parsed by jsonlite::fromJSON. perceval_output <- system2(perceval_path, args = c('mbox',mbox_uri,mbox_path,'--json-line'), stdout = TRUE, stderr = FALSE) + # Parsed JSON output as a data.table. 
perceval_parsed <- data.table(jsonlite::stream_in(textConnection(perceval_output),verbose=FALSE)) @@ -479,12 +529,12 @@ make_mbox_reply <- function(mailing_list, reply_from_author, reply_from_email, r # format the date correctly cdate <- format(as.POSIXct(reply_datetime, format = "%Y-%m-%dT%H:%M:%S"), "%a, %e %b %Y %H:%M:%S ") - reply_from_full_info <- paste0(reply_from_author, " <", reply_from_email, ">") - reply_to_full_info <- paste0(reply_to_author, " <", reply_to_email, ">") - reply_cc_full_info <- paste0(reply_cc_author, " <", reply_cc_email, ">") + reply_from_full_info <- stringi::stri_c(reply_from_author, " <", reply_from_email, ">") + reply_to_full_info <- stringi::stri_c(reply_to_author, " <", reply_to_email, ">") + reply_cc_full_info <- stringi::stri_c(reply_cc_author, " <", reply_cc_email, ">") - mbox_content <- paste0( + mbox_content <- stringi::stri_c( "From MAILER-DAEMON Thu Jul 18 13:48:48 2013", "\nPath: example.com!not-for-mail", "\nFrom: ", reply_from_full_info, @@ -503,7 +553,7 @@ make_mbox_reply <- function(mailing_list, reply_from_author, reply_from_email, r "\nX-Accept-Language: en-us ", "\nOriginal-To: ", reply_to_full_info, " ", reply_cc_full_info, "\nPrecedence: bulk", - "\nX-Mailing-List: ", paste0(mailing_list, "@example.com"), + "\nX-Mailing-List: ", stringi::stri_c(mailing_list, "@example.com"), "\n\n", reply_body ) @@ -524,7 +574,7 @@ make_mbox_reply <- function(mailing_list, reply_from_author, reply_from_email, r make_mbox_mailing_list <- function(replies, folder_path = "/tmp", file_name) { # Create a unique filename for the mbox file - mbox_filepath <- file.path(folder_path, paste0(file_name, ".mbox")) + mbox_filepath <- file.path(folder_path, stringi::stri_c(file_name, ".mbox")) # make the file mbox_body <- stringi::stri_c(replies,collapse = "\n\n") diff --git a/man/download_pipermail.Rd b/man/download_pipermail.Rd index b36a0d7a..51a98bf2 100644 --- a/man/download_pipermail.Rd +++ b/man/download_pipermail.Rd @@ -8,21 +8,32 @@ 
download_pipermail( mailing_list, start_year_month, end_year_month, - save_folder_path + save_folder_path, + verbose = TRUE ) } \arguments{ -\item{mailing_list}{The name of the mailing list being downloaded} +\item{mailing_list}{The name of the mailing list being downloaded (e.g. "https://mta.openssl.org/pipermail/openssl-announce/")} -\item{start_year_month}{The year and month of the first file to be downloaded} +\item{start_year_month}{The year and month of the first file to be downloaded (format: 'YYYYMM')} -\item{end_year_month}{The year and month of the last file to be downloaded} +\item{end_year_month}{The year and month of the last file to be downloaded (format: 'YYYYMM', or use 'format(Sys.Date(), "%Y%m")' for the current month)} \item{save_folder_path}{The folder path in which all the downloaded pipermail files will be stored} + +\item{verbose}{Logical; if TRUE, prints diagnostic messages during the download process} } \value{ Returns `downloaded_files`, a vector of the downloaded files in the current working directory } \description{ -Download all pipermail files in an archive as mbox files +This function downloads pipermail archives from a specified pipermail mailing list as .mbox files. +It begins by downloading an .html file that contains the URLs for monthly archives in .txt or .gz formats. +The function first attempts to download the .txt file, and if unavailable, it falls back to downloading the .gz file. + +When a .gz file is downloaded, the function automatically unzips and converts it into an .mbox file, +overwriting any existing file with the same name. The original .gz file is deleted after extraction. + +The downloaded .mbox files are saved in the specified folder following the naming convention kaiaulu_YYYYMM.mbox. +The function only downloads files that fall between the specified start_year_month and end_year_month. 
} diff --git a/man/process_gz_to_mbox_in_folder.Rd b/man/process_gz_to_mbox_in_folder.Rd index 3564ac9c..1964df28 100644 --- a/man/process_gz_to_mbox_in_folder.Rd +++ b/man/process_gz_to_mbox_in_folder.Rd @@ -2,22 +2,21 @@ % Please edit documentation in R/mail.R \name{process_gz_to_mbox_in_folder} \alias{process_gz_to_mbox_in_folder} -\title{Process .gz files in a folder, unzip and convert them to .mbox -Checks a folder for any .gz files, unzips them, and renames them -to .mbox format. The original .gz files are deleted after unzipping. If a .mbox -file with the same name already exists, it will be overwritten.} +\title{Process .gz files in a folder and convert them to .mbox} \usage{ -process_gz_to_mbox_in_folder(folder_path) +process_gz_to_mbox_in_folder(folder_path, verbose = TRUE) } \arguments{ \item{folder_path}{The path to the folder containing both .gz and .mbox files.} + +\item{verbose}{if TRUE, prints diagnostic messages during processing.} } \value{ A list of the .mbox files that were created or updated. } \description{ -Process .gz files in a folder, unzip and convert them to .mbox -Checks a folder for any .gz files, unzips them, and renames them -to .mbox format. The original .gz files are deleted after unzipping. If a .mbox -file with the same name already exists, it will be overwritten. +This function scans a specified folder for any .gz files, unzips them, +and renames them to the .mbox format. After unzipping, the original .gz files are deleted. +If a .mbox file with the same name already exists, it will be overwritten. +This makes sure that all the files in the folder are in .mbox format, ready for parsing. 
} diff --git a/man/refresh_pipermail.Rd b/man/refresh_pipermail.Rd index ba2b2e15..934363a5 100644 --- a/man/refresh_pipermail.Rd +++ b/man/refresh_pipermail.Rd @@ -2,28 +2,34 @@ % Please edit documentation in R/mail.R \name{refresh_pipermail} \alias{refresh_pipermail} -\title{Refresh mbox files downloaded via pipermail -Uses the adopted file name convention by \code{\link{download_pipermail}} to identify -the latest downloaded mbox. It deletes this file, then redownloads it along with all future months -up to the current real-life month. -If the directory is empty, then it downloads all pipermail files (as mbox files) via \code{\link{download_pipermail}}} +\title{Refresh mbox files downloaded via pipermail} \usage{ -refresh_pipermail(mailing_list, start_year_month, save_folder_path) +refresh_pipermail( + mailing_list, + start_year_month, + save_folder_path, + verbose = TRUE +) } \arguments{ -\item{mailing_list}{The name of the mailing list being downloaded} +\item{mailing_list}{The URL of the mailing list being downloaded (e.g., "https://mta.openssl.org/pipermail/openssl-announce/")} -\item{start_year_month}{The year and month of the first file to be downloaded} +\item{start_year_month}{The year and month of the first file to be downloaded (format: 'YYYYMM').} -\item{save_folder_path}{The folder path in which all the downloaded pipermail files will be stored} +\item{save_folder_path}{The folder path in which all the downloaded pipermail files will be stored.} + +\item{verbose}{Logical; if TRUE, prints diagnostic messages.} } \value{ -Returns `downloaded_files`, a vector of the downloaded files in the current working directory +Returns `downloaded_files`, a vector of the newly downloaded files in the current working directory. } \description{ -Refresh mbox files downloaded via pipermail -Uses the adopted file name convention by \code{\link{download_pipermail}} to identify -the latest downloaded mbox. 
It deletes this file, then redownloads it along with all future months -up to the current real-life month. -If the directory is empty, then it downloads all pipermail files (as mbox files) via \code{\link{download_pipermail}} +This function refreshes the mailing list files by checking the contents of a specified folder. +If the folder is empty, it calls \code{download_pipermail} to download all pipermail files from start_year_month to the current month. +If the folder contains already-downloaded mbox files, it identifies the most recent month, deletes that file, and redownloads it +along with all future months up to the current real-life month. + +The naming convention of files is kaiaulu_YYYYMM.mbox, and the function uses this pattern to identify the most recent month. +After deleting the most recent file, the function ensures that the month is redownloaded, along with all subsequent months up to the current month. +Redownloading the most recent file makes sure that any files added in that month after the latest refresh are included. } diff --git a/tools.yml b/tools.yml index 64667d8d..d3bbc518 100644 --- a/tools.yml +++ b/tools.yml @@ -1,5 +1,5 @@ # https://github.com/chaoss/grimoirelab-perceval -perceval: ~/perceval/bin/perceval +perceval: /Users/dao/anaconda3/bin/perceval # https://github.com/multilang-depends/depends depends: ~/depends-0.9.6/depends.jar # https://github.com/tsantalis/RefactoringMiner#running-refactoringminer-from-the-command-line From 5de3aa248a422caed9ccf04a0bc2177d5effd7d6 Mon Sep 17 00:00:00 2001 From: Dao McGill Date: Wed, 18 Sep 2024 14:18:20 -1000 Subject: [PATCH 13/80] i #284 Added more descriptive comments. Made minor changes to pipermail refresher. 
- Replaced paste0 with stringi::stri_c - Removed create directory if does not exist - Added more verbose descriptions/comments - Added dividers within functions - Added verbose parameter - Added else block for refresher - Added call to process_gz_to_mbox_in_folder at end of refresher - parse_mbox: stri_replace_last was not working, changed it to stringi::stri_replace_last_regex - Tested parse_mbox. Perceval was not returning any output. I will look further into why this is happening. Signed-off-by: Dao McGill --- R/mail.R | 264 +++++++++++++++++----------- man/download_pipermail.Rd | 21 ++- man/process_gz_to_mbox_in_folder.Rd | 17 +- man/refresh_pipermail.Rd | 36 ++-- tools.yml | 2 +- 5 files changed, 203 insertions(+), 137 deletions(-) diff --git a/R/mail.R b/R/mail.R index 9b304ab5..fc2b83ec 100644 --- a/R/mail.R +++ b/R/mail.R @@ -4,122 +4,140 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at https://mozilla.org/MPL/2.0/. -############## Downloader ############## +############## Pipermail Downloader ############## #' Download all pipermail files in an archive as mbox files -#' @param mailing_list The name of the mailing list being downloaded -#' @param start_year_month The year and month of the first file to be downloaded -#' @param end_year_month The year and month of the last file to be downloaded +#' +#' @description This function downloads pipermail archives from a specified pipermail mailing list as .mbox files. +#' It begins by downloading an .html file that contains the URLs for monthly archives in .txt or .gz formats. +#' The function first attempts to download the .txt file, and if unavailable, it falls back to downloading the .gz file. +#' +#' When a .gz file is downloaded, the function automatically unzips and converts it into an .mbox file, +#' overwriting any existing file with the same name. The original .gz file is deleted after extraction. 
+#' +#' The downloaded .mbox files are saved in the specified folder following the naming convention kaiaulu_YYYYMM.mbox. +#' The function only downloads files that fall between the specified start_year_month and end_year_month. +#' +#' @param mailing_list The name of the mailing list being downloaded (e.g. "https://mta.openssl.org/pipermail/openssl-announce/") +#' @param start_year_month The year and month of the first file to be downloaded (format: 'YYYYMM') +#' @param end_year_month The year and month of the last file to be downloaded (format: 'YYYYMM', or use 'format(Sys.Date(), "%Y%m")' for the current month) #' @param save_folder_path The folder path in which all the downloaded pipermail files will be stored +#' @param verbose Logical; if TRUE, prints diagnostic messages during the download process #' @return Returns `downloaded_files`, a vector of the downloaded files in the current working directory #' @export -download_pipermail <- function(mailing_list, start_year_month, end_year_month, save_folder_path) { +download_pipermail <- function(mailing_list, start_year_month, end_year_month, save_folder_path, verbose = TRUE) { - # Create directory if it does not exist - if (!dir.exists(save_folder_path)) { - dir.create(save_folder_path, recursive = TRUE) - } - - # Ensure mailing_list URL ends with a slash + ########## Download and Parse Mailing List HTML for Links ########## + # Ensure mailing_list url ends with a slash, which is important when constructing links for downloading files, + # since the extracted links are relative to the base URL. + # e.g.base url: https://mta.openssl.org/pipermail/openssl-announce/ and extracted link: 2024-June.txt.gz if (!stringi::stri_endswith_fixed(mailing_list, "/")) { - mailing_list <- paste0(mailing_list, "/") + mailing_list <- stringi::stri_c(mailing_list, "/") } - # Get mailing list contents + # Sends a GET request to the mailing list’s URL to retrieve contents. 
This is the main page of the mailing list archive, + # which contains links to individual month files (in .txt or .gz format). response <- httr::GET(mailing_list, httr::timeout(60)) if (httr::status_code(response) != 200) { stop("Failed to access the mailing list page.") } - # Parse the response + # The content is parsed as text to extract the rows of data from the table that contains the file links. parsed_response <- httr::content(response, "text") doc_obj <- XML::htmlParse(parsed_response, asText = TRUE) - # Table rows + # Get all table rows in the archive page. These rows contain the links to the individual month files. rows <- XML::getNodeSet(doc_obj, "//tr") - - # Skip header row + # Skip the header row, to get to data rows data_rows <- rows[-1] - - # Vector for link storage + # Create an empty vector for storing the links that will be extracted. links <- c() - # Extract the date and link from each row + ########## Extract Date and Links ########## + # Loop through the data rows and extract the date and link from each row. + # The date is in the first column, and the link is in the third column. for (row in data_rows) { - # Date in YYYYMM format + # Extract and clean the date, which is in the format "Month Year" (e.g., "June 2024"). date_extracted <- XML::xpathSApply(row, ".//td[1]", XML::xmlValue) date_cleaned <- stringi::stri_replace_last_regex(date_extracted, pattern = ":$", replacement = "") date_cleaned <- stringi::stri_trim_both(date_cleaned) - # Parse the date - date_parsed <- as.Date(paste0("01 ", date_cleaned), format = "%d %B %Y") + # Parse the cleaned date into a valid date object. This allows us to convert it into the "YYYYMM" format. 
+ date_parsed <- as.Date(stringi::stri_c("01 ", date_cleaned), format = "%d %B %Y") if (is.na(date_parsed)) { warning("Date could not be parsed: ", date_cleaned) next } year_month <- format(date_parsed, "%Y%m") - # Check if date is within range + # Check if the extracted year_month falls within the specified range of start_year_month to end_year_month. + # If it does, proceed to extract the file link from the third column of the row. if (year_month >= start_year_month & year_month <= end_year_month) { - # Get href from column 3 + # Get the link (href) from the third column. This is the link to the .txt or .gz file for that month. link_nodes <- XML::xpathSApply(row, ".//td[3]/a", XML::xmlGetAttr, 'href') if (length(link_nodes) == 0) { warning("No link found in row for date: ", date_cleaned) next } - # Store the link in links + # Store the link in the links vector, for later download. link <- link_nodes[1] links <- c(links, link) } } - # Vector for downloaded files + ########## Use Links to Download Individual Files ########## + # Initialize a vector for storing the paths of the downloaded files. downloaded_files <- c() for (i in seq_along(links)) { link <- links[i] - # Extract the name without the .txt.gz extension + # Extract the base name of the file (without the .txt.gz extension), so we can construct the correct download paths. base_name <- gsub("\\.txt\\.gz$", "", link) - # Parse the date from the base name - date_parsed <- as.Date(paste0("01-", base_name), format = "%d-%Y-%B") + # Parse the date from the base name and convert it into "YYYYMM" format for consistency with our file naming. 
+ date_parsed <- as.Date(stringi::stri_c("01-", base_name), format = "%d-%Y-%B") if (is.na(date_parsed)) { warning("Could not parse date from link: ", link) next } year_month_clean <- format(date_parsed, "%Y%m") - # Download URL - txt_url <- paste0(mailing_list, gsub("\\.gz$", "", link)) - gz_url <- paste0(mailing_list, link) + # Construct the download URLs for both the .txt and .gz versions of the file. + # The function will first attempt to download the .txt version. + txt_url <- stringi::stri_c(mailing_list, gsub("\\.gz$", "", link)) + gz_url <- stringi::stri_c(mailing_list, link) # Attempt to download the .txt file first download_url <- txt_url response <- httr::GET(download_url, httr::timeout(60)) + # If the response status code is not 200, the file is not available. if (httr::status_code(response) != 200) { # Fallback to .gz file if .txt is unavailable download_url <- gz_url response <- httr::GET(download_url, httr::timeout(60)) if (httr::status_code(response) != 200) { - cat("Both .txt and .gz downloads failed for link: ", link, "\n") + warning("Both .txt and .gz downloads failed for link: ", link, "\n") next } } - # Define the destination file - dest <- file.path(save_folder_path, paste0('kaiaulu_', year_month_clean, '.mbox')) + # Define the destination file name and path where the downloaded content will be saved as a .mbox file. + dest <- file.path(save_folder_path, stringi::stri_c('kaiaulu_', year_month_clean, '.mbox')) - # Print diagnostic info - cat("Downloading: ", download_url, "\n") - cat("Saving to: ", dest, "\n") + ########## Write Downloaded File to Disk ########## + # Print diagnostic info if verbose is TRUE + if (verbose) { + cat("Downloading: ", download_url, "\n") + cat("Saving to: ", dest, "\n") + } - # Write file to disk + # Write the downloaded file to disk. If the file is a .gz file, it needs to be unzipped and converted to .mbox format. 
if (grepl("\\.gz$", download_url)) { - # Download the .gz file - gz_file_path <- file.path(save_folder_path, paste0('kaiaulu_', year_month_clean, '.mbox.gz')) + # Download the .gz file to a temporary location. + gz_file_path <- file.path(save_folder_path, stringi::stri_c('kaiaulu_', year_month_clean, '.mbox.gz')) httr::GET(download_url, httr::write_disk(gz_file_path, overwrite = TRUE), httr::timeout(60)) - # Unzip the file + # Unzip the .gz file and save the contents as a .mbox file. gz_con <- gzfile(gz_file_path, open = "rb") out_con <- file(dest, open = "wb") while (TRUE) { @@ -130,121 +148,151 @@ download_pipermail <- function(mailing_list, start_year_month, end_year_month, s close(gz_con) close(out_con) - # Remove the gz file after unzipping + # Remove the .gz file after unzipping to avoid storing duplicate data. file.remove(gz_file_path) } else { - # Download the .txt file directly + # If the .txt file is available, download it directly and save it as a .mbox file. httr::GET(download_url, httr::write_disk(dest, overwrite = TRUE), httr::timeout(60)) } - # Add the downloaded file to the list + # Add the downloaded file path to the list of downloaded files. downloaded_files <- c(downloaded_files, dest) } - # Return downloaded files + ########## Return List of Downloaded Files ########## + # Return the list of downloaded .mbox files return(downloaded_files) } +############## Pipermail Refresher ############## #' Refresh mbox files downloaded via pipermail -#' Uses the adopted file name convention by \code{\link{download_pipermail}} to identify -#' the latest downloaded mbox. It deletes this file, then redownloads it along with all future months -#' up to the current real-life month. 
-#' If the directory is empty, then it downloads all pipermail files (as mbox files) via \code{\link{download_pipermail}} -#' @param mailing_list The name of the mailing list being downloaded -#' @param start_year_month The year and month of the first file to be downloaded -#' @param save_folder_path The folder path in which all the downloaded pipermail files will be stored -#' @return Returns `downloaded_files`, a vector of the downloaded files in the current working directory +#' +#' @description This function refreshes the mailing list files by checking the contents of a specified folder. +#' If the folder is empty, it calls \code{download_pipermail} to download all pipermail files from start_year_month to the current month. +#' If the folder contains already-downloaded mbox files, it identifies the most recent month, deletes that file, and redownloads it +#' along with all future months up to the current real-life month. +#' +#' The naming convention of files is kaiaulu_YYYYMM.mbox, and the function uses this pattern to identify the most recent month. +#' After deleting the most recent file, the function ensures that the month is redownloaded, along with all subsequent months up to the current month. +#' Redownloading the most recent file makes sure that any files added in that month after the latest refresh are included. +#' +#' @param mailing_list The URL of the mailing list being downloaded (e.g., "https://mta.openssl.org/pipermail/openssl-announce/") +#' @param start_year_month The year and month of the first file to be downloaded (format: 'YYYYMM'). +#' @param save_folder_path The folder path in which all the downloaded pipermail files will be stored. +#' @param verbose Logical; if TRUE, prints diagnostic messages. +#' @return Returns `downloaded_files`, a vector of the newly downloaded files in the current working directory. 
#' @export -refresh_pipermail <- function(mailing_list, start_year_month, save_folder_path) { +refresh_pipermail <- function(mailing_list, start_year_month, save_folder_path, verbose = TRUE) { - # Create directory if it does not exist - if (!dir.exists(save_folder_path)) { - dir.create(save_folder_path, recursive = TRUE) - } - - # Check if the folder is empty + ########## Check if Folder is Empty ########## + # Check the contents of the folder to see if any .mbox files are already present + # The function looks for files that match the naming pattern 'kaiaulu_YYYYMM.mbox' files_in_folder <- list.files(save_folder_path, pattern = "kaiaulu_\\d{6}\\.mbox$") + if (length(files_in_folder) == 0) { - # If empty, download from start_year_month to the current month + # If the folder is empty, download all pipermail files starting from the start_year_month + # The end date is set to the current month based on the system date end_year_month <- format(Sys.Date(), "%Y%m") - cat("Folder is empty. Downloading from", start_year_month, "to", end_year_month, "\n") + if (verbose) cat("Folder is empty. 
Downloading from", start_year_month, "to", end_year_month, "\n") + + # Call the download_pipermail function to download files from start_year_month to end_year_month download_pipermail(mailing_list, start_year_month, end_year_month, save_folder_path) - return(NULL) - } - # If folder is not empty, find the most recent month - year_months <- gsub("kaiaulu_(\\d{6})\\.mbox$", "\\1", files_in_folder) - recent_month <- max(year_months) - - # Delete the most recent file - recent_file <- file.path(save_folder_path, paste0("kaiaulu_", recent_month, ".mbox")) - if (file.exists(recent_file)) { - file.remove(recent_file) - cat("Deleted the most recent file:", recent_file, "\n") } + ########## Identify the Most Recent Month ########## + else { + # If the folder is not empty, identify the most recent month based on the filenames + # The filenames follow the pattern 'kaiaulu_YYYYMM.mbox', so we extract the YYYYMM part of the filenames + year_months <- gsub("kaiaulu_(\\d{6})\\.mbox$", "\\1", files_in_folder) + + # Find the most recent month by taking the maximum of the extracted YYYYMM values + recent_month <- max(year_months) + + # Delete the most recent file before redownloading it + recent_file <- file.path(save_folder_path, stringi::stri_c("kaiaulu_", recent_month, ".mbox")) + if (file.exists(recent_file)) { + file.remove(recent_file) + if (verbose) cat("Deleted the most recent file:", recent_file, "\n") + } + + ########## Redownload from the Most Recent Month ########## + # Set the end_year_month to the current month (based on the system date) + end_year_month <- format(Sys.Date(), "%Y%m") + + # Redownload files from the most recent month (that was just deleted) to the current month + if (verbose) cat("Redownloading from", recent_month, "to", end_year_month, "\n") - # Redownload from the most recent month to the current real-life month - end_year_month <- format(Sys.Date(), "%Y%m") - cat("Redownloading from", recent_month, "to", end_year_month, "\n") - 
download_pipermail(mailing_list, recent_month, end_year_month, save_folder_path) + # Call the download_pipermail function to redownload the deleted month and all subsequent months up to the current month + download_pipermail(mailing_list, recent_month, end_year_month, save_folder_path) + } + ########## Process .gz Files After Refresh ########## + # Call process_gz_to_mbox_in_folder to ensure all .gz files are converted to .mbox after the refresh + if (verbose) cat("Processing .gz files in the folder (if any) to convert them to .mbox format...\n") + process_gz_to_mbox_in_folder(folder_path = save_folder_path, verbose = verbose) } -#' Process .gz files in a folder, unzip and convert them to .mbox -#' Checks a folder for any .gz files, unzips them, and renames them -#' to .mbox format. The original .gz files are deleted after unzipping. If a .mbox -#' file with the same name already exists, it will be overwritten. +#' Process .gz files in a folder and convert them to .mbox +#' +#' @description This function scans a specified folder for any .gz files, unzips them, +#' and renames them to the .mbox format. After unzipping, the original .gz files are deleted. +#' If a .mbox file with the same name already exists, it will be overwritten. +#' This makes sure that all the files in the folder are in .mbox format, ready for parsing. #' #' @param folder_path The path to the folder containing both .gz and .mbox files. +#' @param verbose if TRUE, prints diagnostic messages during processing. #' @return A list of the .mbox files that were created or updated. 
#' @export -process_gz_to_mbox_in_folder <- function(folder_path) { +process_gz_to_mbox_in_folder <- function(folder_path, verbose = TRUE) { - # Get the list of files in the folder + # Get the list of all files in the folder, including full paths files <- list.files(folder_path, full.names = TRUE) - # Find .gz files + # Identify .gz files from the list of files gz_files <- files[grepl("\\.gz$", files)] - # Check if there are no .gz files + # If there are no .gz files, print a message (if verbose is TRUE) and return NULL if (length(gz_files) == 0) { - cat("This folder does not contain any .gz files.\n") + if (verbose) cat("This folder does not contain any .gz files.\n") return(NULL) } - # Vector to store names of converted .mbox files + # Create a vector to store the names of the converted .mbox files converted_mbox_files <- c() - # Process .gz files + ########## Process Each .gz File ########## + # Iterate over each .gz file, unzip it, and convert it to .mbox for (gz_file in gz_files) { - # Define the corresponding .mbox file path (remove .gz and replace with .mbox) + # Define the corresponding .mbox file path by replacing .gz with .mbox in the file name mbox_file <- gsub("\\.gz$", ".mbox", gz_file) - cat("Processing:", gz_file, " -> ", mbox_file, "\n") + if (verbose) cat("Processing:", gz_file, " -> ", mbox_file, "\n") - # Open .gz file and unzip its contents to .mbox + # Open the .gz file in binary mode for reading gz_con <- gzfile(gz_file, open = "rb") + + # Create a new .mbox file and open it in binary mode for writing out_con <- file(mbox_file, open = "wb") - # Read and write the contents + # Read the contents of the .gz file and write the chunks to the .mbox file while (TRUE) { bytes <- readBin(gz_con, what = raw(), n = 1024 * 1024) if (length(bytes) == 0) break writeBin(bytes, out_con) } - # Close connections + # Close both the input (gz) and output (mbox) file connections close(gz_con) close(out_con) - # Remove the .gz file + # After successfully converting 
the file, delete the original .gz file file.remove(gz_file) - # Add the converted file to the list + # Add the newly created .mbox file to the list of converted files converted_mbox_files <- c(converted_mbox_files, mbox_file) } - # Return the list of converted .mbox files + # Return the vector of all the .mbox files that were created or updated return(converted_mbox_files) } @@ -372,7 +420,7 @@ download_mod_mbox_per_month <- function(archive_url, mailing_list, archive_type, # Remove file if error # Can only be done post-write, see https://github.com/r-lib/httr/issues/553 if (httr::http_error(x) && file.exists(full_tmp_save_path)) { - warning(paste0("Unable to download: ",destination[[counter]])) + warning(stringi::stri_c("Unable to download: ",destination[[counter]])) file.remove(full_tmp_save_path) } @@ -399,17 +447,19 @@ download_mod_mbox_per_month <- function(archive_url, mailing_list, archive_type, #' @param mbox_path path to mbox archive file (ends in .mbox) #' @export #' @family parsers -parse_mbox <- function(perceval_path,mbox_path){ +parse_mbox <- function(perceval_path, mbox_path){ # Expand paths (e.g. "~/Desktop" => "/Users/someuser/Desktop") perceval_path <- path.expand(perceval_path) mbox_path <- path.expand(mbox_path) # Remove ".mbox" - mbox_uri <- stri_replace_last(mbox_path,replacement="",regex=".mbox") + mbox_uri <- stringi::stri_replace_last_regex(mbox_path, pattern = "\\.mbox$", replacement = "") + # Use percerval to parse mbox_path. --json line is required to be parsed by jsonlite::fromJSON. perceval_output <- system2(perceval_path, args = c('mbox',mbox_uri,mbox_path,'--json-line'), stdout = TRUE, stderr = FALSE) + # Parsed JSON output as a data.table. 
perceval_parsed <- data.table(jsonlite::stream_in(textConnection(perceval_output),verbose=FALSE)) @@ -479,12 +529,12 @@ make_mbox_reply <- function(mailing_list, reply_from_author, reply_from_email, r # format the date correctly cdate <- format(as.POSIXct(reply_datetime, format = "%Y-%m-%dT%H:%M:%S"), "%a, %e %b %Y %H:%M:%S ") - reply_from_full_info <- paste0(reply_from_author, " <", reply_from_email, ">") - reply_to_full_info <- paste0(reply_to_author, " <", reply_to_email, ">") - reply_cc_full_info <- paste0(reply_cc_author, " <", reply_cc_email, ">") + reply_from_full_info <- stringi::stri_c(reply_from_author, " <", reply_from_email, ">") + reply_to_full_info <- stringi::stri_c(reply_to_author, " <", reply_to_email, ">") + reply_cc_full_info <- stringi::stri_c(reply_cc_author, " <", reply_cc_email, ">") - mbox_content <- paste0( + mbox_content <- stringi::stri_c( "From MAILER-DAEMON Thu Jul 18 13:48:48 2013", "\nPath: example.com!not-for-mail", "\nFrom: ", reply_from_full_info, @@ -503,7 +553,7 @@ make_mbox_reply <- function(mailing_list, reply_from_author, reply_from_email, r "\nX-Accept-Language: en-us ", "\nOriginal-To: ", reply_to_full_info, " ", reply_cc_full_info, "\nPrecedence: bulk", - "\nX-Mailing-List: ", paste0(mailing_list, "@example.com"), + "\nX-Mailing-List: ", stringi::stri_c(mailing_list, "@example.com"), "\n\n", reply_body ) @@ -524,7 +574,7 @@ make_mbox_reply <- function(mailing_list, reply_from_author, reply_from_email, r make_mbox_mailing_list <- function(replies, folder_path = "/tmp", file_name) { # Create a unique filename for the mbox file - mbox_filepath <- file.path(folder_path, paste0(file_name, ".mbox")) + mbox_filepath <- file.path(folder_path, stringi::stri_c(file_name, ".mbox")) # make the file mbox_body <- stringi::stri_c(replies,collapse = "\n\n") diff --git a/man/download_pipermail.Rd b/man/download_pipermail.Rd index b36a0d7a..51a98bf2 100644 --- a/man/download_pipermail.Rd +++ b/man/download_pipermail.Rd @@ -8,21 +8,32 @@ 
download_pipermail( mailing_list, start_year_month, end_year_month, - save_folder_path + save_folder_path, + verbose = TRUE ) } \arguments{ -\item{mailing_list}{The name of the mailing list being downloaded} +\item{mailing_list}{The name of the mailing list being downloaded (e.g. "https://mta.openssl.org/pipermail/openssl-announce/")} -\item{start_year_month}{The year and month of the first file to be downloaded} +\item{start_year_month}{The year and month of the first file to be downloaded (format: 'YYYYMM')} -\item{end_year_month}{The year and month of the last file to be downloaded} +\item{end_year_month}{The year and month of the last file to be downloaded (format: 'YYYYMM', or use 'format(Sys.Date(), "%Y%m")' for the current month)} \item{save_folder_path}{The folder path in which all the downloaded pipermail files will be stored} + +\item{verbose}{Logical; if TRUE, prints diagnostic messages during the download process} } \value{ Returns `downloaded_files`, a vector of the downloaded files in the current working directory } \description{ -Download all pipermail files in an archive as mbox files +This function downloads pipermail archives from a specified pipermail mailing list as .mbox files. +It begins by downloading an .html file that contains the URLs for monthly archives in .txt or .gz formats. +The function first attempts to download the .txt file, and if unavailable, it falls back to downloading the .gz file. + +When a .gz file is downloaded, the function automatically unzips and converts it into an .mbox file, +overwriting any existing file with the same name. The original .gz file is deleted after extraction. + +The downloaded .mbox files are saved in the specified folder following the naming convention kaiaulu_YYYYMM.mbox. +The function only downloads files that fall between the specified start_year_month and end_year_month. 
} diff --git a/man/process_gz_to_mbox_in_folder.Rd b/man/process_gz_to_mbox_in_folder.Rd index 3564ac9c..1964df28 100644 --- a/man/process_gz_to_mbox_in_folder.Rd +++ b/man/process_gz_to_mbox_in_folder.Rd @@ -2,22 +2,21 @@ % Please edit documentation in R/mail.R \name{process_gz_to_mbox_in_folder} \alias{process_gz_to_mbox_in_folder} -\title{Process .gz files in a folder, unzip and convert them to .mbox -Checks a folder for any .gz files, unzips them, and renames them -to .mbox format. The original .gz files are deleted after unzipping. If a .mbox -file with the same name already exists, it will be overwritten.} +\title{Process .gz files in a folder and convert them to .mbox} \usage{ -process_gz_to_mbox_in_folder(folder_path) +process_gz_to_mbox_in_folder(folder_path, verbose = TRUE) } \arguments{ \item{folder_path}{The path to the folder containing both .gz and .mbox files.} + +\item{verbose}{if TRUE, prints diagnostic messages during processing.} } \value{ A list of the .mbox files that were created or updated. } \description{ -Process .gz files in a folder, unzip and convert them to .mbox -Checks a folder for any .gz files, unzips them, and renames them -to .mbox format. The original .gz files are deleted after unzipping. If a .mbox -file with the same name already exists, it will be overwritten. +This function scans a specified folder for any .gz files, unzips them, +and renames them to the .mbox format. After unzipping, the original .gz files are deleted. +If a .mbox file with the same name already exists, it will be overwritten. +This makes sure that all the files in the folder are in .mbox format, ready for parsing. 
} diff --git a/man/refresh_pipermail.Rd b/man/refresh_pipermail.Rd index ba2b2e15..934363a5 100644 --- a/man/refresh_pipermail.Rd +++ b/man/refresh_pipermail.Rd @@ -2,28 +2,34 @@ % Please edit documentation in R/mail.R \name{refresh_pipermail} \alias{refresh_pipermail} -\title{Refresh mbox files downloaded via pipermail -Uses the adopted file name convention by \code{\link{download_pipermail}} to identify -the latest downloaded mbox. It deletes this file, then redownloads it along with all future months -up to the current real-life month. -If the directory is empty, then it downloads all pipermail files (as mbox files) via \code{\link{download_pipermail}}} +\title{Refresh mbox files downloaded via pipermail} \usage{ -refresh_pipermail(mailing_list, start_year_month, save_folder_path) +refresh_pipermail( + mailing_list, + start_year_month, + save_folder_path, + verbose = TRUE +) } \arguments{ -\item{mailing_list}{The name of the mailing list being downloaded} +\item{mailing_list}{The URL of the mailing list being downloaded (e.g., "https://mta.openssl.org/pipermail/openssl-announce/")} -\item{start_year_month}{The year and month of the first file to be downloaded} +\item{start_year_month}{The year and month of the first file to be downloaded (format: 'YYYYMM').} -\item{save_folder_path}{The folder path in which all the downloaded pipermail files will be stored} +\item{save_folder_path}{The folder path in which all the downloaded pipermail files will be stored.} + +\item{verbose}{Logical; if TRUE, prints diagnostic messages.} } \value{ -Returns `downloaded_files`, a vector of the downloaded files in the current working directory +Returns `downloaded_files`, a vector of the newly downloaded files in the current working directory. } \description{ -Refresh mbox files downloaded via pipermail -Uses the adopted file name convention by \code{\link{download_pipermail}} to identify -the latest downloaded mbox. 
It deletes this file, then redownloads it along with all future months -up to the current real-life month. -If the directory is empty, then it downloads all pipermail files (as mbox files) via \code{\link{download_pipermail}} +This function refreshes the mailing list files by checking the contents of a specified folder. +If the folder is empty, it calls \code{download_pipermail} to download all pipermail files from start_year_month to the current month. +If the folder contains already-downloaded mbox files, it identifies the most recent month, deletes that file, and redownloads it +along with all future months up to the current real-life month. + +The naming convention of files is kaiaulu_YYYYMM.mbox, and the function uses this pattern to identify the most recent month. +After deleting the most recent file, the function ensures that the month is redownloaded, along with all subsequent months up to the current month. +Redownloading the most recent file makes sure that any files added in that month after the latest refresh are included. } diff --git a/tools.yml b/tools.yml index 64667d8d..d3bbc518 100644 --- a/tools.yml +++ b/tools.yml @@ -1,5 +1,5 @@ # https://github.com/chaoss/grimoirelab-perceval -perceval: ~/perceval/bin/perceval +perceval: /Users/dao/anaconda3/bin/perceval # https://github.com/multilang-depends/depends depends: ~/depends-0.9.6/depends.jar # https://github.com/tsantalis/RefactoringMiner#running-refactoringminer-from-the-command-line From b91389b39f2fb50a65af9eb0e1e7a35e422c7750 Mon Sep 17 00:00:00 2001 From: daomcgill <77309217+daomcgill@users.noreply.github.com> Date: Sat, 21 Sep 2024 12:16:06 -1000 Subject: [PATCH 14/80] i #284 Added download_mod_mbox function and edited notebook Updated parameters for download_mod_mbox to use Apache Pony Mail links as Apache lists now redirect there - Modified downloads to use YYYYMM instead of YYYY - Removed the option for downloading by year for clearer functionality. 
- Updated vignette/download_mail.Rmd Signed-off-by: Dao McGill --- R/mail.R | 148 ++++++++++++++++++++---------------- conf/helix.yml | 16 ++-- man/download_mod_mbox.Rd | 33 ++++---- man/download_pipermail.Rd | 2 +- man/refresh_pipermail.Rd | 2 +- vignettes/download_mail.Rmd | 62 ++++++++++++--- 6 files changed, 161 insertions(+), 102 deletions(-) diff --git a/R/mail.R b/R/mail.R index fc2b83ec..49a041cb 100644 --- a/R/mail.R +++ b/R/mail.R @@ -22,7 +22,7 @@ #' @param start_year_month The year and month of the first file to be downloaded (format: 'YYYYMM') #' @param end_year_month The year and month of the last file to be downloaded (format: 'YYYYMM', or use 'format(Sys.Date(), "%Y%m")' for the current month) #' @param save_folder_path The folder path in which all the downloaded pipermail files will be stored -#' @param verbose Logical; if TRUE, prints diagnostic messages during the download process +#' @param verbose if TRUE, prints diagnostic messages during the download process #' @return Returns `downloaded_files`, a vector of the downloaded files in the current working directory #' @export download_pipermail <- function(mailing_list, start_year_month, end_year_month, save_folder_path, verbose = TRUE) { @@ -180,7 +180,7 @@ download_pipermail <- function(mailing_list, start_year_month, end_year_month, s #' @param mailing_list The URL of the mailing list being downloaded (e.g., "https://mta.openssl.org/pipermail/openssl-announce/") #' @param start_year_month The year and month of the first file to be downloaded (format: 'YYYYMM'). #' @param save_folder_path The folder path in which all the downloaded pipermail files will be stored. -#' @param verbose Logical; if TRUE, prints diagnostic messages. +#' @param verbose if TRUE, prints diagnostic messages. #' @return Returns `downloaded_files`, a vector of the newly downloaded files in the current working directory. 
#' @export refresh_pipermail <- function(mailing_list, start_year_month, save_folder_path, verbose = TRUE) { @@ -297,76 +297,96 @@ process_gz_to_mbox_in_folder <- function(folder_path, verbose = TRUE) { } -#' Compose mod_mbox archives (.mbox) into a single mbox file for use with \code{\link{parse_mbox}} -#' @param base_url An url pointing to the mod_mbox directory (e.g. "http://mail-archives.apache.org/mod_mbox") without trailing slashes -#' @param mailing_list Name of the project mailing list (e.g. apr-dev) in the mod_mbox directory -#' @param from_year First year in the range to be downloaded -#' @param to_year Last year in the range to be downloaded -#' @param save_file_path the full path, including file name and extension to save the file -#' @param is_per_month If TRUE, does not delete monthly files in tmp. (Default = TRUE) -#' @param verbose Prints progress during execution -#' @return Returns the path of the downloaded mbox file. -#' @export -download_mod_mbox <- function(base_url, mailing_list, from_year, to_year, save_file_path,is_per_month=TRUE,verbose=FALSE) { - - - #Initialize variables - counter <- 0 - destination <- list() - - #Open file handle to output file - output <- path.expand(save_file_path) - fileConn <- file(output, "w+") - - #Loop through time and compose the mbox file - for (year in (from_year:to_year)) { +############## Mod Mbox Downloader ############## +#' Download all mod_mbox files in a mailing list as mbox files +#' +#' @description This function downloads mod_mbox archives from a specified Apache Pony Mail mailing list as .mbox files. +#' It constructs the download URLs for each month based on the start and end date range and downloads the mbox files +#' in the format "YYYY-MM". The downloaded .mbox files are saved in the specified folder, with a naming convention +#' of kaiaulu_YYYYMM.mbox. 
+#' +#' The function loops through each month in the range specified by `start_year_month` and `end_year_month`, +#' and constructs the appropriate URL to download each month's data. If any download fails, an error message is printed. +#' +#' @param mailing_list The URL of the Apache Pony Mail list from which mbox files are to be downloaded +#' (e.g., "https://lists.apache.org/list.html?announce@apache.org"). +#' @param start_year_month The year and month of the first file to be downloaded (format: 'YYYYMM'). +#' @param end_year_month The year and month of the last file to be downloaded (format: 'YYYYMM'). +#' @param save_file_path The folder path where all the downloaded mbox files will be stored. +#' @param verbose if TRUE, prints detailed messages during the download process. +#' @return Returns `save_file_path`, the folder path where the mbox files are stored. +#' @export +download_mod_mbox <- function(mailing_list, start_year_month, end_year_month, save_file_path, verbose = FALSE) { + + ########## Extract Mailing List Name ########## + # Extract the mailing list name from the given URL. This is because the actual list name is + # embedded within the URL (after the 'list.html?'). + # We are using 'sub()' to perform a simple string replacement, extracting everything after 'list.html?'. + mailing_list_name <- sub(".*list.html\\?(.+)", "\\1", mailing_list) + if (verbose) cat("Base list extracted:", mailing_list_name, "\n") + + ########## Prepare Year and Month ########## + # The start_year_month and end_year_month are in the format "YYYYMM". + # Split them into year and month for easier looping. + # Extract first 4 digits as start year, and last 2 digits as start month. + start_year <- as.numeric(substr(start_year_month, 1, 4)) + start_month <- as.numeric(substr(start_year_month, 5, 6)) + # Extract first 4 digits as end year, and last 2 digits as end month. 
+ end_year <- as.numeric(substr(end_year_month, 1, 4)) + end_month <- as.numeric(substr(end_year_month, 5, 6)) + + ########## Download Loop ########## + # Iterate over the years and months from start_year/month to end_year/month. + # This is done by looping over the years, and for each year, looping over the 12 months. + for (year in start_year:end_year) { for (month in 1:12) { - counter <- counter + 1 - - #Generate file destinations for the monthly files in /tmp/ - destination[[counter]] <- sprintf("%d%02d.mbox", year, month) - - if(verbose){ - print(stringi::stri_c("Downloading:",destination[[counter]],sep = " ")) - } - - #Try file download and save result - full_month_url <- stringi::stri_c(base_url, mailing_list, destination[[counter]], sep = "/") - full_tmp_save_path <- file.path('/tmp',destination[[counter]]) - x <- httr::GET(full_month_url, - httr::write_disk(full_tmp_save_path,overwrite=TRUE)) - - #If download was successful, write to mbox file, if not, delete file - if (httr::http_error(x) == FALSE) { - - #Open read connection - readCon <- file(full_tmp_save_path, "r") - - data <- readLines(full_tmp_save_path) - - #Write data to output - writeLines(data, fileConn) - - #Close read connection - close(readCon) + # Skip months before the start_month or after the end_month for the start and end year. + if (year == start_year && month < start_month) next + if (year == end_year && month > end_month) break + + ######### Construct URL and Save Path ########## + # Construct the month string (e.g., '2023-04') and the full download URL. + # Make sure the month has two digits. + month_str <- sprintf("%02d", month) + # Create a string in the format "YYYY-MM" + year_month_str <- sprintf("%04d-%02d", year, month) + # This constructs the URL from which the mbox for the current year and month will be downloaded. + # The format for the URL is fixed by Apache's Pony Mail service. 
+ download_url <- stringi::stri_c("https://lists.apache.org/api/mbox.lua?list=", mailing_list_name, "&date=", year_month_str) + + # Create the file name where the mbox will be saved locally, in the format ''kaiaulu_'YYYYMM.mbox'. + file_name <- stringi::stri_c("kaiaulu_", year, month_str, ".mbox") + file_path <- file.path(save_file_path, file_name) + + if (verbose) { + cat("Constructed URL:", download_url, "\n") + cat("Saving to file:", file_path, "\n") } - #Delete the /tmp/ monthly files - if(!is_per_month){ - unlink(full_tmp_save_path, force = TRUE) + ########## Download Mbox File ########## + # Download the file using httr::GET, saving it directly to the destination file path. + response <- httr::GET(download_url, httr::write_disk(file_path, overwrite = TRUE)) + # Get the status code to see if the download succeeded. + status_code <- httr::status_code(response) + + # Check for successful download (status code 200). + if (status_code == 200) { + if (verbose) cat("Successfully downloaded:", download_url, "\n") + } else { + if (verbose) { + cat("Failed to download:", download_url, "\n") + cat("HTTP Status Code:", status_code, "\n") + } + # Remove failed download file. + unlink(file_path) } - - } - } - #Close connection to target mbox file - close(fileConn) - - #return output location - return(output) + ########## Return Save Path ########## + # Return the folder path where all mbox files were saved. 
+ return(save_file_path) } #' Compose mod_mbox archives (.mbox) into a single mbox file for use with \code{\link{parse_mbox}} diff --git a/conf/helix.yml b/conf/helix.yml index ba69d6e9..c5f62d27 100644 --- a/conf/helix.yml +++ b/conf/helix.yml @@ -50,15 +50,15 @@ version_control: mailing_list: mod_mbox: mail_key_1: - archive_url: http://mail-archives.apache.org/mod_mbox/helix-dev - mbox: ../../rawdata/helix/mod_mbox/helix-dev/ - mailing_list: helix-dev - archive_type: apache + mailing_list: https://lists.apache.org/list.html?announce@apache.org + start_year_month: 202310 + end_year_month: 202405 + save_file_path: "../save_mbox_mail" mail_key_2: - archive_url: http://mail-archives.apache.org/mod_mbox/helix-user - mbox: ../../rawdata/helix/mod_mbox/helix-user/ - mailing_list: helix-user - archive_type: apache + mailing_list: https://lists.apache.org/list.html?dev@felix.apache.org + start_year_month: 202201 + end_year_month: 202401 + save_file_path: "../save_mbox_mail" pipermail: project_key_1: # archive_url: https://mta.openssl.org/mailman/listinfo/ diff --git a/man/download_mod_mbox.Rd b/man/download_mod_mbox.Rd index 3f4ec8e5..26a765e3 100644 --- a/man/download_mod_mbox.Rd +++ b/man/download_mod_mbox.Rd @@ -2,36 +2,37 @@ % Please edit documentation in R/mail.R \name{download_mod_mbox} \alias{download_mod_mbox} -\title{Compose mod_mbox archives (.mbox) into a single mbox file for use with \code{\link{parse_mbox}}} +\title{Download all mod_mbox files in a mailing list as mbox files} \usage{ download_mod_mbox( - base_url, mailing_list, - from_year, - to_year, + start_year_month, + end_year_month, save_file_path, - is_per_month = TRUE, verbose = FALSE ) } \arguments{ -\item{base_url}{An url pointing to the mod_mbox directory (e.g. 
"http://mail-archives.apache.org/mod_mbox") without trailing slashes} +\item{mailing_list}{The URL of the Apache Pony Mail list from which mbox files are to be downloaded +(e.g., "https://lists.apache.org/list.html?announce@apache.org").} -\item{mailing_list}{Name of the project mailing list (e.g. apr-dev) in the mod_mbox directory} +\item{start_year_month}{The year and month of the first file to be downloaded (format: 'YYYYMM').} -\item{from_year}{First year in the range to be downloaded} +\item{end_year_month}{The year and month of the last file to be downloaded (format: 'YYYYMM').} -\item{to_year}{Last year in the range to be downloaded} +\item{save_file_path}{The folder path where all the downloaded mbox files will be stored.} -\item{save_file_path}{the full path, including file name and extension to save the file} - -\item{is_per_month}{If TRUE, does not delete monthly files in tmp. (Default = TRUE)} - -\item{verbose}{Prints progress during execution} +\item{verbose}{if TRUE, prints detailed messages during the download process.} } \value{ -Returns the path of the downloaded mbox file. +Returns `save_file_path`, the folder path where the mbox files are stored. } \description{ -Compose mod_mbox archives (.mbox) into a single mbox file for use with \code{\link{parse_mbox}} +This function downloads mod_mbox archives from a specified Apache Pony Mail mailing list as .mbox files. +It constructs the download URLs for each month based on the start and end date range and downloads the mbox files +in the format "YYYY-MM". The downloaded .mbox files are saved in the specified folder, with a naming convention +of kaiaulu_YYYYMM.mbox. + +The function loops through each month in the range specified by `start_year_month` and `end_year_month`, +and constructs the appropriate URL to download each month's data. If any download fails, an error message is printed. 
} diff --git a/man/download_pipermail.Rd b/man/download_pipermail.Rd index 51a98bf2..0aa1bc50 100644 --- a/man/download_pipermail.Rd +++ b/man/download_pipermail.Rd @@ -21,7 +21,7 @@ download_pipermail( \item{save_folder_path}{The folder path in which all the downloaded pipermail files will be stored} -\item{verbose}{Logical; if TRUE, prints diagnostic messages during the download process} +\item{verbose}{if TRUE, prints diagnostic messages during the download process} } \value{ Returns `downloaded_files`, a vector of the downloaded files in the current working directory diff --git a/man/refresh_pipermail.Rd b/man/refresh_pipermail.Rd index 934363a5..dc2ce0b2 100644 --- a/man/refresh_pipermail.Rd +++ b/man/refresh_pipermail.Rd @@ -18,7 +18,7 @@ refresh_pipermail( \item{save_folder_path}{The folder path in which all the downloaded pipermail files will be stored.} -\item{verbose}{Logical; if TRUE, prints diagnostic messages.} +\item{verbose}{if TRUE, prints diagnostic messages.} } \value{ Returns `downloaded_files`, a vector of the newly downloaded files in the current working directory. diff --git a/vignettes/download_mail.Rmd b/vignettes/download_mail.Rmd index d7eb631d..a5f7f53a 100644 --- a/vignettes/download_mail.Rmd +++ b/vignettes/download_mail.Rmd @@ -17,12 +17,12 @@ seed <- 1 set.seed(seed) # Load libraries - library(kaiaulu) - library(data.table) - library(yaml) - library(stringi) - library(XML) - library(httr) + require(kaiaulu) + require(data.table) + require(yaml) + require(stringi) + require(XML) + require(httr) ``` @@ -31,7 +31,9 @@ set.seed(seed) Mailing list data is stored in a variety of archives. See: - Mod Mbox: [Apache Geronimo](https://geronimo.apache.org/mailing-lists.html)). - Pipermail: [OpenSSL](https://mta.openssl.org/mailman/listinfo/). -is notebook demonstrates how to download and refresh mailing list archives from Mod Mbox and Pipermail. 
+This notebook demonstrates how to download and refresh mailing list archives from Mod Mbox and Pipermail. + +# Pipermail ## Mailing List Organization @@ -44,12 +46,10 @@ Mailing lists are typically organized by topic or purpose. For example, the [Ope Each mailing list maintains archives of past messages, often organized by month and year. These archives can be accessed and downloaded for analysis. -# Project Configuration File +## Project Configuration File To start, we load the project configuration file, which contains parameters for downloading the mailing list archives. -// # Project Configuration File - ```{r} conf <- yaml::read_yaml("conf/helix.yml") mailing_list <- conf[["mailing_list"]][["pipermail"]][["project_key_1"]][["mailing_list"]] @@ -64,7 +64,7 @@ save_folder_path <- conf[["mailing_list"]][["pipermail"]][["project_key_1"]][["s - end_year_month: The ending date for downloading archives (in YYYYMM format). - save_folder_path: The local directory where the downloaded archives will be saved. -# Pipermail Downloader +## Pipermail Downloader You can download the archives using the download_pipermail() function, which downloads and saves .mbox files to the specified directory. The .mbox files are named with the format kaiaulu_YYYYMM.mbox, where YYYYMM refers to the year and month of the archive. ```{r} # Download archives @@ -78,7 +78,7 @@ download_pipermail( ``` After running this function, the .mbox files will be saved in the specified directory with filenames like kaiaulu_202310.mbox, kaiaulu_202311.mbox, etc. -# Pipermail Refresher +## Pipermail Refresher In some cases, you may want to refresh the archive to ensure the most recent months are up-to-date or to handle updates to the mailing list. The refresh_pipermail() function helps automate this process. 
How refresh_pipermail Works @@ -95,3 +95,41 @@ refresh_pipermail( ``` This function will ensure that the most recent archives are always up-to-date by redownloading the current month's archive if necessary and adding any new months that have been added to the mailing list. + +# Mod Mbox + +## Mailing List Organization +Mod Mbox archives also organize mailing lists by topic. The apache mailing list archives can be found at https://lists.apache.org/. + +## Project Configuration File +Similar to Pipermail, we load the configuration for Mod Mbox from the YAML file, which includes the mailing list URL, the date range, and the save folder path. + +```{r} +mod_mbox_list <- conf[["mailing_list"]][["mod_mbox"]][["mail_key_1"]][["mailing_list"]] +mod_start_year_month <- conf[["mailing_list"]][["mod_mbox"]][["mail_key_1"]][["start_year_month"]] +mod_end_year_month <- conf[["mailing_list"]][["mod_mbox"]][["mail_key_1"]][["end_year_month"]] +mod_save_file_path <- conf[["mailing_list"]][["mod_mbox"]][["mail_key_1"]][["save_file_path"]] +``` + +### Explanation of Configuration Parameters +- mailing_list: The URL of the Mod Mbox mailing list (e.g., https://lists.apache.org/list.html?announce@apache.org). +- start_year_month: The first month to download (format: YYYYMM). +- end_year_month: The last month to download (format: YYYYMM). +- save_file_path: The directory where the downloaded .mbox files will be saved. + +##Mod Mbox Downloader +The download_mod_mbox() function downloads Mod Mbox archives by constructing URLs based on the mailing list and date range, saving them as .mbox files named kaiaulu_YYYYMM.mbox. 
+ +```{r} +download_mod_mbox( + mailing_list = mod_mbox_list, + start_year_month = mod_start_year_month, + end_year_month = mod_end_year_month, + save_file_path = mod_save_file_path, + verbose = TRUE +``` + +After running the function, it constructs URLs like: https://lists.apache.org/api/mbox.lua?list=announce@apache.org&date=2024-01 +and saves the files in the specified folder. + + From 0cc41231724e9e096672c753deeb52e040bb8627 Mon Sep 17 00:00:00 2001 From: daomcgill <77309217+daomcgill@users.noreply.github.com> Date: Sun, 22 Sep 2024 12:02:41 -1000 Subject: [PATCH 15/80] i #284 Added refresh_mod_mbox function for refreshing Mod Mbox archives - Created `refresh_mod_mbox` function to automatically refresh mailing list archives downloaded using Mod Mbox. - The function checks for the latest downloaded file, deletes it, and redownloads the archive from that month to the current date. - Added documentation for `refresh_mod_mbox` to the notebook. Signed-off-by: Dao McGill --- NAMESPACE | 2 +- R/mail.R | 111 +++++++++++++++-------------- conf/helix.yml | 4 +- man/download_mod_mbox.Rd | 6 +- man/download_mod_mbox_per_month.Rd | 37 ---------- man/refresh_mod_mbox.Rd | 35 +++++++++ vignettes/download_mail.Rmd | 22 +++++- 7 files changed, 116 insertions(+), 101 deletions(-) delete mode 100644 man/download_mod_mbox_per_month.Rd create mode 100644 man/refresh_mod_mbox.Rd diff --git a/NAMESPACE b/NAMESPACE index f6e15a60..75f0d9fd 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -15,7 +15,6 @@ export(download_jira_issues) export(download_jira_issues_by_date) export(download_jira_issues_by_issue_key) export(download_mod_mbox) -export(download_mod_mbox_per_month) export(download_pipermail) export(dv8_clsxb_to_clsxj) export(dv8_clsxj_to_clsxb) @@ -139,6 +138,7 @@ export(query_src_text_namespace) export(read_temporary_file) export(recolor_network_by_community) export(refresh_jira_issues) +export(refresh_mod_mbox) export(refresh_pipermail) export(smell_missing_links) 
export(smell_organizational_silo) diff --git a/R/mail.R b/R/mail.R index 49a041cb..c58152cc 100644 --- a/R/mail.R +++ b/R/mail.R @@ -313,11 +313,11 @@ process_gz_to_mbox_in_folder <- function(folder_path, verbose = TRUE) { #' (e.g., "https://lists.apache.org/list.html?announce@apache.org"). #' @param start_year_month The year and month of the first file to be downloaded (format: 'YYYYMM'). #' @param end_year_month The year and month of the last file to be downloaded (format: 'YYYYMM'). -#' @param save_file_path The folder path where all the downloaded mbox files will be stored. +#' @param save_folder_path The folder path where all the downloaded mbox files will be stored. #' @param verbose if TRUE, prints detailed messages during the download process. -#' @return Returns `save_file_path`, the folder path where the mbox files are stored. +#' @return Returns `save_folder_path`, the folder path where the mbox files are stored. #' @export -download_mod_mbox <- function(mailing_list, start_year_month, end_year_month, save_file_path, verbose = FALSE) { +download_mod_mbox <- function(mailing_list, start_year_month, end_year_month, save_folder_path, verbose = FALSE) { ########## Extract Mailing List Name ########## # Extract the mailing list name from the given URL. This is because the actual list name is @@ -357,7 +357,7 @@ download_mod_mbox <- function(mailing_list, start_year_month, end_year_month, sa # Create the file name where the mbox will be saved locally, in the format ''kaiaulu_'YYYYMM.mbox'. file_name <- stringi::stri_c("kaiaulu_", year, month_str, ".mbox") - file_path <- file.path(save_file_path, file_name) + file_path <- file.path(save_folder_path, file_name) if (verbose) { cat("Constructed URL:", download_url, "\n") @@ -386,70 +386,71 @@ download_mod_mbox <- function(mailing_list, start_year_month, end_year_month, sa ########## Return Save Path ########## # Return the folder path where all mbox files were saved. 
- return(save_file_path) + return(save_folder_path) } -#' Compose mod_mbox archives (.mbox) into a single mbox file for use with \code{\link{parse_mbox}} -#' @param archive_url A url pointing to the mod_mbox mailing list directory (e.g. "http://mail-archives.apache.org/mod_mbox/apr-dev") without trailing slashes -#' @param mailing_list Name of the project mailing list (e.g. apr-dev) in the mod_mbox directory -#' @param archive_type Name of the archive that the project mailing list is archived in (e.g. apache) -#' @param from_year First year in the range to be downloaded -#' @param to_year Last year in the range to be downloaded -#' @param save_folder_path the full *folder* path where the monthly downloaded mbox will be stored. -#' @param verbose Prints progress during execution -#' @return Returns the path of the downloaded mbox file. -#' @export -download_mod_mbox_per_month <- function(archive_url, mailing_list, archive_type, from_year, to_year, save_folder_path,verbose=FALSE) { - - #Initialize variables - counter <- 0 - destination <- list() +############## Mod Mbox Refresher ############## - #Open file handle to output file - output <- path.expand(save_folder_path) +#' Refresh mbox files downloaded via mod_mbox +#' +#' @description This function refreshes the mailing list files by checking the contents of a specified folder. +#' If the folder is empty, it calls \code{download_mod_mbox} to download all mod_mbox files from start_year_month to the current month. +#' If the folder contains already-downloaded mbox files, it identifies the most recent month, deletes that file, and redownloads it +#' along with all future months up to the current real-life month. +#' +#' The naming convention of files is kaiaulu_YYYYMM.mbox, and the function uses this pattern to identify the most recent month. +#' After deleting the most recent file, the function ensures that the month is redownloaded, along with all subsequent months up to the current month. 
+#' Redownloading the most recent file ensures any files added in that month after the latest refresh are included. +#' +#' @param mailing_list The URL of the mailing list being downloaded (e.g., 'https://lists.apache.org/list.html?announce@apache.org') +#' @param start_year_month The year and month of the first file to be downloaded (format: 'YYYYMM'). +#' @param save_folder_path The folder path in which all the downloaded mod_mbox files will be stored. +#' @param verbose if TRUE, prints diagnostic messages. +#' @return Returns `downloaded_files`, a vector of the newly downloaded files in the current working directory. +#' @export +refresh_mod_mbox <- function(mailing_list, start_year_month, save_folder_path, verbose = TRUE) { - current_date <- Sys.Date() - current_year <- as.numeric(substr(current_date, 1, 4)) - current_month <- as.numeric(substr(current_date, 6, 7)) + ########## Check if Folder is Empty ########## + # Check the contents of the folder to see if any .mbox files are already present. + # The function looks for files that match the naming pattern 'kaiaulu_YYYYMM.mbox' + files_in_folder <- list.files(save_folder_path, pattern = "kaiaulu_\\d{6}\\.mbox$") - #Loop through time and compose the mbox file - for (year in (from_year:to_year)) { + if (length(files_in_folder) == 0) { + # If the folder is empty, download all mod_mbox files starting from start_year_month + # The end date is set to the current month based on the system date + end_year_month <- format(Sys.Date(), "%Y%m") + if (verbose) cat("Folder is empty. 
Downloading from", start_year_month, "to", end_year_month, "\n") - for (month in 1:12) { - # Check to stop function when month iterates path current real life month - if (year == current_year && month > current_month) { - return(output) - } - counter <- counter + 1 + # Call the download_mod_mbox function to download files from start_year_month to end_year_month + download_mod_mbox(mailing_list, start_year_month, end_year_month, save_folder_path, verbose = verbose) + } + ########## Identify the Most Recent Month ########## + else { + # If the folder is not empty, identify the most recent month based on the filenames + # The filenames follow the pattern 'kaiaulu_YYYYMM.mbox', so we extract the YYYYMM part of the filenames + year_months <- gsub("kaiaulu_(\\d{6})\\.mbox$", "\\1", files_in_folder) - #Generate file destinations for the monthly files in /tmp/ - destination[[counter]] <- sprintf("%d%02d.mbox", year, month) - mbox_file_name <- stringi::stri_c(mailing_list, archive_type, destination[[counter]], sep = "_") + # Find the most recent month by taking the maximum of the extracted YYYYMM values + recent_month <- max(year_months) - if(verbose){ - print(stringi::stri_c("Downloading:",mbox_file_name,sep = " ")) - } + # Delete the most recent file before redownloading it + recent_file <- file.path(save_folder_path, stringi::stri_c("kaiaulu_", recent_month, ".mbox")) + if (file.exists(recent_file)) { + file.remove(recent_file) + if (verbose) cat("Deleted the most recent file:", recent_file, "\n") + } - #Try file download and save result - full_month_url <- stringi::stri_c(archive_url, destination[[counter]], sep = "/") - full_tmp_save_path <- file.path(output,mbox_file_name) - x <- httr::GET(full_month_url, - httr::write_disk(full_tmp_save_path,overwrite=TRUE)) - - # Remove file if error - # Can only be done post-write, see https://github.com/r-lib/httr/issues/553 - if (httr::http_error(x) && file.exists(full_tmp_save_path)) { - warning(stringi::stri_c("Unable to 
download: ",destination[[counter]])) - file.remove(full_tmp_save_path) - } + ########## Redownload from the Most Recent Month ########## + # Set the end_year_month to the current month (based on the system date) + end_year_month <- format(Sys.Date(), "%Y%m") - } + # Redownload files from the most recent month (that was just deleted) to the current month + if (verbose) cat("Redownloading from", recent_month, "to", end_year_month, "\n") + # Call the download_mod_mbox function to redownload the deleted month and all subsequent months up to the current month + download_mod_mbox(mailing_list, recent_month, end_year_month, save_folder_path, verbose = verbose) } - - #return output location - return(output) } diff --git a/conf/helix.yml b/conf/helix.yml index c5f62d27..18b1bc6d 100644 --- a/conf/helix.yml +++ b/conf/helix.yml @@ -53,12 +53,12 @@ mailing_list: mailing_list: https://lists.apache.org/list.html?announce@apache.org start_year_month: 202310 end_year_month: 202405 - save_file_path: "../save_mbox_mail" + save_folder_path: "../save_mbox_mail" mail_key_2: mailing_list: https://lists.apache.org/list.html?dev@felix.apache.org start_year_month: 202201 end_year_month: 202401 - save_file_path: "../save_mbox_mail" + save_folder_path: "../save_mbox_mail" pipermail: project_key_1: # archive_url: https://mta.openssl.org/mailman/listinfo/ diff --git a/man/download_mod_mbox.Rd b/man/download_mod_mbox.Rd index 26a765e3..c628be38 100644 --- a/man/download_mod_mbox.Rd +++ b/man/download_mod_mbox.Rd @@ -8,7 +8,7 @@ download_mod_mbox( mailing_list, start_year_month, end_year_month, - save_file_path, + save_folder_path, verbose = FALSE ) } @@ -20,12 +20,12 @@ download_mod_mbox( \item{end_year_month}{The year and month of the last file to be downloaded (format: 'YYYYMM').} -\item{save_file_path}{The folder path where all the downloaded mbox files will be stored.} +\item{save_folder_path}{The folder path where all the downloaded mbox files will be stored.} \item{verbose}{if TRUE, 
prints detailed messages during the download process.} } \value{ -Returns `save_file_path`, the folder path where the mbox files are stored. +Returns `save_folder_path`, the folder path where the mbox files are stored. } \description{ This function downloads mod_mbox archives from a specified Apache Pony Mail mailing list as .mbox files. diff --git a/man/download_mod_mbox_per_month.Rd b/man/download_mod_mbox_per_month.Rd deleted file mode 100644 index 2debab7b..00000000 --- a/man/download_mod_mbox_per_month.Rd +++ /dev/null @@ -1,37 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/mail.R -\name{download_mod_mbox_per_month} -\alias{download_mod_mbox_per_month} -\title{Compose mod_mbox archives (.mbox) into a single mbox file for use with \code{\link{parse_mbox}}} -\usage{ -download_mod_mbox_per_month( - archive_url, - mailing_list, - archive_type, - from_year, - to_year, - save_folder_path, - verbose = FALSE -) -} -\arguments{ -\item{archive_url}{A url pointing to the mod_mbox mailing list directory (e.g. "http://mail-archives.apache.org/mod_mbox/apr-dev") without trailing slashes} - -\item{mailing_list}{Name of the project mailing list (e.g. apr-dev) in the mod_mbox directory} - -\item{archive_type}{Name of the archive that the project mailing list is archived in (e.g. apache)} - -\item{from_year}{First year in the range to be downloaded} - -\item{to_year}{Last year in the range to be downloaded} - -\item{save_folder_path}{the full *folder* path where the monthly downloaded mbox will be stored.} - -\item{verbose}{Prints progress during execution} -} -\value{ -Returns the path of the downloaded mbox file. 
-} -\description{ -Compose mod_mbox archives (.mbox) into a single mbox file for use with \code{\link{parse_mbox}} -} diff --git a/man/refresh_mod_mbox.Rd b/man/refresh_mod_mbox.Rd new file mode 100644 index 00000000..43f6349a --- /dev/null +++ b/man/refresh_mod_mbox.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/mail.R +\name{refresh_mod_mbox} +\alias{refresh_mod_mbox} +\title{Refresh mbox files downloaded via mod_mbox} +\usage{ +refresh_mod_mbox( + mailing_list, + start_year_month, + save_folder_path, + verbose = TRUE +) +} +\arguments{ +\item{mailing_list}{The URL of the mailing list being downloaded (e.g., 'https://lists.apache.org/list.html?announce@apache.org')} + +\item{start_year_month}{The year and month of the first file to be downloaded (format: 'YYYYMM').} + +\item{save_folder_path}{The folder path in which all the downloaded mod_mbox files will be stored.} + +\item{verbose}{if TRUE, prints diagnostic messages.} +} +\value{ +Returns `downloaded_files`, a vector of the newly downloaded files in the current working directory. +} +\description{ +This function refreshes the mailing list files by checking the contents of a specified folder. +If the folder is empty, it calls \code{download_mod_mbox} to download all mod_mbox files from start_year_month to the current month. +If the folder contains already-downloaded mbox files, it identifies the most recent month, deletes that file, and redownloads it +along with all future months up to the current real-life month. + +The naming convention of files is kaiaulu_YYYYMM.mbox, and the function uses this pattern to identify the most recent month. +After deleting the most recent file, the function ensures that the month is redownloaded, along with all subsequent months up to the current month. +Redownloading the most recent file ensures any files added in that month after the latest refresh are included. 
+} diff --git a/vignettes/download_mail.Rmd b/vignettes/download_mail.Rmd index a5f7f53a..1e635350 100644 --- a/vignettes/download_mail.Rmd +++ b/vignettes/download_mail.Rmd @@ -108,14 +108,14 @@ Similar to Pipermail, we load the configuration for Mod Mbox from the YAML file, mod_mbox_list <- conf[["mailing_list"]][["mod_mbox"]][["mail_key_1"]][["mailing_list"]] mod_start_year_month <- conf[["mailing_list"]][["mod_mbox"]][["mail_key_1"]][["start_year_month"]] mod_end_year_month <- conf[["mailing_list"]][["mod_mbox"]][["mail_key_1"]][["end_year_month"]] -mod_save_file_path <- conf[["mailing_list"]][["mod_mbox"]][["mail_key_1"]][["save_file_path"]] +mod_save_folder_path <- conf[["mailing_list"]][["mod_mbox"]][["mail_key_1"]][["save_folder_path"]] ``` ### Explanation of Configuration Parameters - mailing_list: The URL of the Mod Mbox mailing list (e.g., https://lists.apache.org/list.html?announce@apache.org). - start_year_month: The first month to download (format: YYYYMM). - end_year_month: The last month to download (format: YYYYMM). -- save_file_path: The directory where the downloaded .mbox files will be saved. +- save_folder_path: The directory where the downloaded .mbox files will be saved. ##Mod Mbox Downloader The download_mod_mbox() function downloads Mod Mbox archives by constructing URLs based on the mailing list and date range, saving them as .mbox files named kaiaulu_YYYYMM.mbox. @@ -125,11 +125,27 @@ download_mod_mbox( mailing_list = mod_mbox_list, start_year_month = mod_start_year_month, end_year_month = mod_end_year_month, - save_file_path = mod_save_file_path, + save_folder_path = mod_save_folder_path, verbose = TRUE ``` After running the function, it constructs URLs like: https://lists.apache.org/api/mbox.lua?list=announce@apache.org&date=2024-01 and saves the files in the specified folder. +## Mod Mbox Refresher +To refresh these archives to ensure that you have the latest messages, you can use the refresh_mod_mbox function. 
This function works similarly to the Pipermail refresher. +How refresh_mod_mbox Works +1. Checks if the folder is empty and, if so, downloads the archives starting from start_year_month to the current month by calling download_mod_mbox(). +2. If the folder contains files, it identifies the most recent one using the YYYYMM found in the filename. This file is deleted, and then redownloaded along with all future months. + +```{r} +refresh_mod_mbox( + mailing_list = mod_mbox_list, + start_year_month = mod_start_year_month, + save_folder_path = mod_save_folder_path + verbose = TRUE +) +``` + +This ensures your archive is up-to-date, accounting for new data that may have been added to the mailing list since the last download. From 0dc60013b730b2057b907ed0b14ada241d497702 Mon Sep 17 00:00:00 2001 From: daomcgill <77309217+daomcgill@users.noreply.github.com> Date: Mon, 30 Sep 2024 17:17:41 -1000 Subject: [PATCH 16/80] i #284 Updated Notebook - Updated vignettes/download_mail.Rmd to working version - Fixed errors in helix.yml - Minor edits in mail.R Signed-off-by: Dao McGill --- R/mail.R | 4 +-- conf/helix.yml | 8 ++++-- vignettes/download_mail.Rmd | 49 ++++++++++++++++++++++++++++++------- 3 files changed, 48 insertions(+), 13 deletions(-) diff --git a/R/mail.R b/R/mail.R index c58152cc..861d32d2 100644 --- a/R/mail.R +++ b/R/mail.R @@ -254,7 +254,7 @@ process_gz_to_mbox_in_folder <- function(folder_path, verbose = TRUE) { # If there are no .gz files, print a message (if verbose is TRUE) and return NULL if (length(gz_files) == 0) { if (verbose) cat("This folder does not contain any .gz files.\n") - return(NULL) + return(invisible(NULL)) } # Create a vector to store the names of the converted .mbox files @@ -317,7 +317,7 @@ process_gz_to_mbox_in_folder <- function(folder_path, verbose = TRUE) { #' @param verbose if TRUE, prints detailed messages during the download process. #' @return Returns `save_folder_path`, the folder path where the mbox files are stored. 
#' @export -download_mod_mbox <- function(mailing_list, start_year_month, end_year_month, save_folder_path, verbose = FALSE) { +download_mod_mbox <- function(mailing_list, start_year_month, end_year_month, save_folder_path, verbose = TRUE) { ########## Extract Mailing List Name ########## # Extract the mailing list name from the given URL. This is because the actual list name is diff --git a/conf/helix.yml b/conf/helix.yml index 18b1bc6d..a3464fab 100644 --- a/conf/helix.yml +++ b/conf/helix.yml @@ -49,16 +49,18 @@ version_control: mailing_list: mod_mbox: - mail_key_1: + project_key_1: mailing_list: https://lists.apache.org/list.html?announce@apache.org start_year_month: 202310 end_year_month: 202405 save_folder_path: "../save_mbox_mail" - mail_key_2: + save_parsed_folder_path: "../save_parsed_mail" + project_key_2: mailing_list: https://lists.apache.org/list.html?dev@felix.apache.org start_year_month: 202201 end_year_month: 202401 save_folder_path: "../save_mbox_mail" + save_parsed_folder_path: "../save_parsed_mail" pipermail: project_key_1: # archive_url: https://mta.openssl.org/mailman/listinfo/ @@ -66,12 +68,14 @@ mailing_list: start_year_month: 202310 end_year_month: 202405 save_folder_path: "../save_folder_mail" + save_parsed_folder_path: "../save_parsed_mail" project_key_2: # archive_url: https://mta.openssl.org/mailman/listinfo/ mailing_list: https://mta.openssl.org/pipermail/openssl-project/ start_year_month: 202203 end_year_month: 202303 save_folder_path: "../save_folder_mail_2" + save_parsed_folder_path: "../save_parsed_mail" issue_tracker: jira: diff --git a/vignettes/download_mail.Rmd b/vignettes/download_mail.Rmd index 1e635350..a1e91950 100644 --- a/vignettes/download_mail.Rmd +++ b/vignettes/download_mail.Rmd @@ -51,7 +51,7 @@ Each mailing list maintains archives of past messages, often organized by month To start, we load the project configuration file, which contains parameters for downloading the mailing list archives. 
```{r} -conf <- yaml::read_yaml("conf/helix.yml") +conf <- yaml::read_yaml("../conf/helix.yml") mailing_list <- conf[["mailing_list"]][["pipermail"]][["project_key_1"]][["mailing_list"]] start_year_month <- conf[["mailing_list"]][["pipermail"]][["project_key_1"]][["start_year_month"]] end_year_month <- conf[["mailing_list"]][["pipermail"]][["project_key_1"]][["end_year_month"]] @@ -72,7 +72,8 @@ download_pipermail( mailing_list = mailing_list, start_year_month = start_year_month, end_year_month = end_year_month, - save_folder_path = save_folder_path + save_folder_path = save_folder_path, + verbose = TRUE ) ``` @@ -90,7 +91,8 @@ How refresh_pipermail Works refresh_pipermail( mailing_list = mailing_list, start_year_month = start_year_month, - save_folder_path = save_folder_path + save_folder_path = save_folder_path, + verbose = TRUE ) ``` @@ -105,10 +107,10 @@ Mod Mbox archives also organize mailing lists by topic. The apache mailing list Similar to Pipermail, we load the configuration for Mod Mbox from the YAML file, which includes the mailing list URL, the date range, and the save folder path. 
```{r} -mod_mbox_list <- conf[["mailing_list"]][["mod_mbox"]][["mail_key_1"]][["mailing_list"]] -mod_start_year_month <- conf[["mailing_list"]][["mod_mbox"]][["mail_key_1"]][["start_year_month"]] -mod_end_year_month <- conf[["mailing_list"]][["mod_mbox"]][["mail_key_1"]][["end_year_month"]] -mod_save_folder_path <- conf[["mailing_list"]][["mod_mbox"]][["mail_key_1"]][["save_folder_path"]] +mod_mbox_list <- conf[["mailing_list"]][["mod_mbox"]][["project_key_1"]][["mailing_list"]] +mod_start_year_month <- conf[["mailing_list"]][["mod_mbox"]][["project_key_1"]][["start_year_month"]] +mod_end_year_month <- conf[["mailing_list"]][["mod_mbox"]][["project_key_1"]][["end_year_month"]] +mod_save_folder_path <- conf[["mailing_list"]][["mod_mbox"]][["project_key_1"]][["save_folder_path"]] ``` ### Explanation of Configuration Parameters @@ -117,7 +119,7 @@ mod_save_folder_path <- conf[["mailing_list"]][["mod_mbox"]][["mail_key_1"]][["s - end_year_month: The last month to download (format: YYYYMM). - save_folder_path: The directory where the downloaded .mbox files will be saved. -##Mod Mbox Downloader +## Mod Mbox Downloader The download_mod_mbox() function downloads Mod Mbox archives by constructing URLs based on the mailing list and date range, saving them as .mbox files named kaiaulu_YYYYMM.mbox. ```{r} @@ -127,6 +129,7 @@ download_mod_mbox( end_year_month = mod_end_year_month, save_folder_path = mod_save_folder_path, verbose = TRUE + ) ``` After running the function, it constructs URLs like: https://lists.apache.org/api/mbox.lua?list=announce@apache.org&date=2024-01 @@ -143,9 +146,37 @@ How refresh_mod_mbox Works refresh_mod_mbox( mailing_list = mod_mbox_list, start_year_month = mod_start_year_month, - save_folder_path = mod_save_folder_path + save_folder_path = mod_save_folder_path, verbose = TRUE ) ``` This ensures your archive is up-to-date, accounting for new data that may have been added to the mailing list since the last download. 
+ +# Parser + +After downloading the mailing list archives as .mbox files, the next step is to parse these files to extract meaningful information for analysis. The parse_mbox() function utilizes the Perceval library to parse .mbox files and convert them into structured data tables. This enables easier manipulation and analysis of mailing list data. + +## Mbox Parser +The parse_mbox() function takes an .mbox file and parses it into a structured data.table using the Perceval library. + +For the configuration, make sure you have the correct path to the Perceval library in the conf file. + +```{r} +tools_config <- yaml::read_yaml("../tools.yml") +parse_perceval_path <- tools_config[["perceval"]] + +conf <- yaml::read_yaml("../conf/helix.yml") +parse_mbox_path <- conf[["mailing_list"]][["mod_mbox"]][["project_key_1"]][["save_folder_path"]] +``` +Run the function using this: +```{r} +parsed_mail <- parse_mbox( + perceval_path = parse_perceval_path, + mbox_path = parse_mbox_path +) +``` +This will store the parsed data into the parsed_mail variable. 
To view the table, use: +```{r} +View(parsed_mail) +``` From f0027dcd266628a399258cf57d4ba42fbac79eb8 Mon Sep 17 00:00:00 2001 From: daomcgill <77309217+daomcgill@users.noreply.github.com> Date: Tue, 1 Oct 2024 16:45:06 -1000 Subject: [PATCH 17/80] i #284 Testing Github Actions - Check works locally - Commit all changed files --- DESCRIPTION | 2 +- man/commit_message_id_coverage.Rd | 2 +- man/download_jira_issues_by_date.Rd | 4 ++-- man/download_jira_issues_by_issue_key.Rd | 4 ++-- man/download_mod_mbox.Rd | 2 +- man/metric_churn_per_commit_interval.Rd | 2 +- man/metric_churn_per_commit_per_file.Rd | 2 +- man/metric_file_bug_churn.Rd | 2 +- man/metric_file_bug_frequency.Rd | 2 +- man/metric_file_churn.Rd | 2 +- man/metric_file_non_bug_churn.Rd | 2 +- man/metric_file_non_bug_frequency.Rd | 2 +- man/motif_factory_anti_square.Rd | 4 ++-- man/motif_factory_anti_triangle.Rd | 4 ++-- man/motif_factory_square.Rd | 4 ++-- man/motif_factory_triangle.Rd | 4 ++-- man/parse_bugzilla_perceval_rest_issue_comments.Rd | 6 +++--- man/parse_bugzilla_perceval_traditional_issue_comments.Rd | 6 +++--- man/parse_bugzilla_rest_comments.Rd | 6 +++--- man/parse_bugzilla_rest_issues.Rd | 4 ++-- man/parse_bugzilla_rest_issues_comments.Rd | 4 ++-- man/parse_commit_message_id.Rd | 6 +++--- man/parse_dependencies.Rd | 6 +++--- man/parse_dv8_clusters.Rd | 6 +++--- man/parse_gitlog.Rd | 6 +++--- man/parse_jira.Rd | 4 ++-- man/parse_jira_latest_date.Rd | 6 +++--- man/parse_jira_rss_xml.Rd | 6 +++--- man/parse_mbox.Rd | 4 ++-- man/parse_mbox_latest_date.Rd | 4 ++-- man/parse_nvdfeed.Rd | 8 ++++---- man/refresh_jira_issues.Rd | 8 ++++---- 32 files changed, 67 insertions(+), 67 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 128ee129..1607fb2f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -49,4 +49,4 @@ Imports: VignetteBuilder: knitr URL: https://github.com/sailuh/kaiaulu BugReports: https://github.com/sailuh/kaiaulu/issues -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.2 diff --git 
a/man/commit_message_id_coverage.Rd b/man/commit_message_id_coverage.Rd index 68fad761..e7f0c6ef 100644 --- a/man/commit_message_id_coverage.Rd +++ b/man/commit_message_id_coverage.Rd @@ -22,9 +22,9 @@ Calculates the number of commits from the git log which contains the message id. \code{\link{parse_gitlog}} to obtain additions and deletions from gitlog Other {metrics}: +\code{\link{metric_churn}()}, \code{\link{metric_churn_per_commit_interval}()}, \code{\link{metric_churn_per_commit_per_file}()}, -\code{\link{metric_churn}()}, \code{\link{metric_file_bug_churn}()}, \code{\link{metric_file_bug_frequency}()}, \code{\link{metric_file_churn}()}, diff --git a/man/download_jira_issues_by_date.Rd b/man/download_jira_issues_by_date.Rd index 697fdb48..ccb3c7c2 100644 --- a/man/download_jira_issues_by_date.Rd +++ b/man/download_jira_issues_by_date.Rd @@ -72,13 +72,13 @@ For further details on the `created` JQL Query see [the associated JIRA API docu \code{\link{refresh_jira_issues}} to obtain more recent data from any of the downloader functions Other jira: -\code{\link{download_jira_issues_by_issue_key}()}, \code{\link{download_jira_issues}()}, +\code{\link{download_jira_issues_by_issue_key}()}, \code{\link{refresh_jira_issues}()} Other downloaders: -\code{\link{download_jira_issues_by_issue_key}()}, \code{\link{download_jira_issues}()}, +\code{\link{download_jira_issues_by_issue_key}()}, \code{\link{refresh_jira_issues}()} } \concept{downloaders} diff --git a/man/download_jira_issues_by_issue_key.Rd b/man/download_jira_issues_by_issue_key.Rd index b452878f..8213ee17 100644 --- a/man/download_jira_issues_by_issue_key.Rd +++ b/man/download_jira_issues_by_issue_key.Rd @@ -67,13 +67,13 @@ For further details on the `issueKey` JQL Query see [the associated JIRA API doc \code{\link{refresh_jira_issues}} to obtain more recent data from any of the downloader functions Other jira: -\code{\link{download_jira_issues_by_date}()}, \code{\link{download_jira_issues}()}, 
+\code{\link{download_jira_issues_by_date}()}, \code{\link{refresh_jira_issues}()} Other downloaders: -\code{\link{download_jira_issues_by_date}()}, \code{\link{download_jira_issues}()}, +\code{\link{download_jira_issues_by_date}()}, \code{\link{refresh_jira_issues}()} } \concept{downloaders} diff --git a/man/download_mod_mbox.Rd b/man/download_mod_mbox.Rd index c628be38..c02cf5d8 100644 --- a/man/download_mod_mbox.Rd +++ b/man/download_mod_mbox.Rd @@ -9,7 +9,7 @@ download_mod_mbox( start_year_month, end_year_month, save_folder_path, - verbose = FALSE + verbose = TRUE ) } \arguments{ diff --git a/man/metric_churn_per_commit_interval.Rd b/man/metric_churn_per_commit_interval.Rd index 21f5e494..6969492d 100644 --- a/man/metric_churn_per_commit_interval.Rd +++ b/man/metric_churn_per_commit_interval.Rd @@ -20,8 +20,8 @@ Calculates the churn metric for a sequence of commits Other {metrics}: \code{\link{commit_message_id_coverage}()}, -\code{\link{metric_churn_per_commit_per_file}()}, \code{\link{metric_churn}()}, +\code{\link{metric_churn_per_commit_per_file}()}, \code{\link{metric_file_bug_churn}()}, \code{\link{metric_file_bug_frequency}()}, \code{\link{metric_file_churn}()}, diff --git a/man/metric_churn_per_commit_per_file.Rd b/man/metric_churn_per_commit_per_file.Rd index 75b48c85..577d3f63 100644 --- a/man/metric_churn_per_commit_per_file.Rd +++ b/man/metric_churn_per_commit_per_file.Rd @@ -20,8 +20,8 @@ Calculates the churn metric for a sequence of commits per commit per file Other {metrics}: \code{\link{commit_message_id_coverage}()}, -\code{\link{metric_churn_per_commit_interval}()}, \code{\link{metric_churn}()}, +\code{\link{metric_churn_per_commit_interval}()}, \code{\link{metric_file_bug_churn}()}, \code{\link{metric_file_bug_frequency}()}, \code{\link{metric_file_churn}()}, diff --git a/man/metric_file_bug_churn.Rd b/man/metric_file_bug_churn.Rd index 29bef17d..7bea610e 100644 --- a/man/metric_file_bug_churn.Rd +++ b/man/metric_file_bug_churn.Rd @@ -20,9 
+20,9 @@ The total churn sum of commits of all closed bug type issues the file was involv \seealso{ Other {metrics}: \code{\link{commit_message_id_coverage}()}, +\code{\link{metric_churn}()}, \code{\link{metric_churn_per_commit_interval}()}, \code{\link{metric_churn_per_commit_per_file}()}, -\code{\link{metric_churn}()}, \code{\link{metric_file_bug_frequency}()}, \code{\link{metric_file_churn}()}, \code{\link{metric_file_non_bug_churn}()}, diff --git a/man/metric_file_bug_frequency.Rd b/man/metric_file_bug_frequency.Rd index 607aef62..f978666e 100644 --- a/man/metric_file_bug_frequency.Rd +++ b/man/metric_file_bug_frequency.Rd @@ -20,9 +20,9 @@ The total number of commits of all closed bug type issues the file was involved. \seealso{ Other {metrics}: \code{\link{commit_message_id_coverage}()}, +\code{\link{metric_churn}()}, \code{\link{metric_churn_per_commit_interval}()}, \code{\link{metric_churn_per_commit_per_file}()}, -\code{\link{metric_churn}()}, \code{\link{metric_file_bug_churn}()}, \code{\link{metric_file_churn}()}, \code{\link{metric_file_non_bug_churn}()}, diff --git a/man/metric_file_churn.Rd b/man/metric_file_churn.Rd index 3e2babd4..67049ea5 100644 --- a/man/metric_file_churn.Rd +++ b/man/metric_file_churn.Rd @@ -18,9 +18,9 @@ The total churn of a file \seealso{ Other {metrics}: \code{\link{commit_message_id_coverage}()}, +\code{\link{metric_churn}()}, \code{\link{metric_churn_per_commit_interval}()}, \code{\link{metric_churn_per_commit_per_file}()}, -\code{\link{metric_churn}()}, \code{\link{metric_file_bug_churn}()}, \code{\link{metric_file_bug_frequency}()}, \code{\link{metric_file_non_bug_churn}()}, diff --git a/man/metric_file_non_bug_churn.Rd b/man/metric_file_non_bug_churn.Rd index bf35bb1c..049b9cd8 100644 --- a/man/metric_file_non_bug_churn.Rd +++ b/man/metric_file_non_bug_churn.Rd @@ -20,9 +20,9 @@ The total churn sum of commits of all closed non-bug type issues the file was in \seealso{ Other {metrics}: 
\code{\link{commit_message_id_coverage}()}, +\code{\link{metric_churn}()}, \code{\link{metric_churn_per_commit_interval}()}, \code{\link{metric_churn_per_commit_per_file}()}, -\code{\link{metric_churn}()}, \code{\link{metric_file_bug_churn}()}, \code{\link{metric_file_bug_frequency}()}, \code{\link{metric_file_churn}()}, diff --git a/man/metric_file_non_bug_frequency.Rd b/man/metric_file_non_bug_frequency.Rd index da87d00a..9516ce61 100644 --- a/man/metric_file_non_bug_frequency.Rd +++ b/man/metric_file_non_bug_frequency.Rd @@ -20,9 +20,9 @@ The total number of commits of all closed non-bug type issues the file was invol \seealso{ Other {metrics}: \code{\link{commit_message_id_coverage}()}, +\code{\link{metric_churn}()}, \code{\link{metric_churn_per_commit_interval}()}, \code{\link{metric_churn_per_commit_per_file}()}, -\code{\link{metric_churn}()}, \code{\link{metric_file_bug_churn}()}, \code{\link{metric_file_bug_frequency}()}, \code{\link{metric_file_churn}()}, diff --git a/man/motif_factory_anti_square.Rd b/man/motif_factory_anti_square.Rd index 06cefd8f..8850ff38 100644 --- a/man/motif_factory_anti_square.Rd +++ b/man/motif_factory_anti_square.Rd @@ -26,9 +26,9 @@ in IEEE Transactions on Software Engineering, vol. 48, no. 8, pp. 3159-3184, } \seealso{ Other motif: +\code{\link{motif_factory}()}, \code{\link{motif_factory_anti_triangle}()}, \code{\link{motif_factory_square}()}, -\code{\link{motif_factory_triangle}()}, -\code{\link{motif_factory}()} +\code{\link{motif_factory_triangle}()} } \concept{motif} diff --git a/man/motif_factory_anti_triangle.Rd b/man/motif_factory_anti_triangle.Rd index b5a789a8..349cce19 100644 --- a/man/motif_factory_anti_triangle.Rd +++ b/man/motif_factory_anti_triangle.Rd @@ -22,9 +22,9 @@ in IEEE Transactions on Software Engineering, vol. 48, no. 8, pp. 
3159-3184, } \seealso{ Other motif: +\code{\link{motif_factory}()}, \code{\link{motif_factory_anti_square}()}, \code{\link{motif_factory_square}()}, -\code{\link{motif_factory_triangle}()}, -\code{\link{motif_factory}()} +\code{\link{motif_factory_triangle}()} } \concept{motif} diff --git a/man/motif_factory_square.Rd b/man/motif_factory_square.Rd index 74101dfd..1c94e2a3 100644 --- a/man/motif_factory_square.Rd +++ b/man/motif_factory_square.Rd @@ -26,9 +26,9 @@ in IEEE Transactions on Software Engineering, vol. 48, no. 8, pp. 3159-3184, } \seealso{ Other motif: +\code{\link{motif_factory}()}, \code{\link{motif_factory_anti_square}()}, \code{\link{motif_factory_anti_triangle}()}, -\code{\link{motif_factory_triangle}()}, -\code{\link{motif_factory}()} +\code{\link{motif_factory_triangle}()} } \concept{motif} diff --git a/man/motif_factory_triangle.Rd b/man/motif_factory_triangle.Rd index 61d81313..0a99faa8 100644 --- a/man/motif_factory_triangle.Rd +++ b/man/motif_factory_triangle.Rd @@ -22,9 +22,9 @@ in IEEE Transactions on Software Engineering, vol. 48, no. 8, pp. 
3159-3184, } \seealso{ Other motif: +\code{\link{motif_factory}()}, \code{\link{motif_factory_anti_square}()}, \code{\link{motif_factory_anti_triangle}()}, -\code{\link{motif_factory_square}()}, -\code{\link{motif_factory}()} +\code{\link{motif_factory_square}()} } \concept{motif} diff --git a/man/parse_bugzilla_perceval_rest_issue_comments.Rd b/man/parse_bugzilla_perceval_rest_issue_comments.Rd index 610eeb6f..87d29f0c 100644 --- a/man/parse_bugzilla_perceval_rest_issue_comments.Rd +++ b/man/parse_bugzilla_perceval_rest_issue_comments.Rd @@ -26,17 +26,17 @@ Parse Bugzilla data obtained from Perceval REST API Bugzilla backend Other parsers: \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, \code{\link{parse_bugzilla_rest_comments}()}, -\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_bugzilla_rest_issues}()}, +\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_commit_message_id}()}, \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, \code{\link{parse_gitlog}()}, +\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, -\code{\link{parse_jira}()}, -\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_mbox}()}, +\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_nvdfeed}()} } \concept{parsers} diff --git a/man/parse_bugzilla_perceval_traditional_issue_comments.Rd b/man/parse_bugzilla_perceval_traditional_issue_comments.Rd index f6f3b7f2..0cfacfd0 100644 --- a/man/parse_bugzilla_perceval_traditional_issue_comments.Rd +++ b/man/parse_bugzilla_perceval_traditional_issue_comments.Rd @@ -26,17 +26,17 @@ Parse Bugzilla data obtained from Perceval traditional Bugzilla backend Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_rest_comments}()}, -\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_bugzilla_rest_issues}()}, 
+\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_commit_message_id}()}, \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, \code{\link{parse_gitlog}()}, +\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, -\code{\link{parse_jira}()}, -\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_mbox}()}, +\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_nvdfeed}()} } \concept{parsers} diff --git a/man/parse_bugzilla_rest_comments.Rd b/man/parse_bugzilla_rest_comments.Rd index 57999ca2..b12be91b 100644 --- a/man/parse_bugzilla_rest_comments.Rd +++ b/man/parse_bugzilla_rest_comments.Rd @@ -19,17 +19,17 @@ Parse Bugzilla comments data obtained from json files from Bugzilla crawler \cod Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, -\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_bugzilla_rest_issues}()}, +\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_commit_message_id}()}, \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, \code{\link{parse_gitlog}()}, +\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, -\code{\link{parse_jira}()}, -\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_mbox}()}, +\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_nvdfeed}()} } \concept{parsers} diff --git a/man/parse_bugzilla_rest_issues.Rd b/man/parse_bugzilla_rest_issues.Rd index da912e4b..bdd8bdde 100644 --- a/man/parse_bugzilla_rest_issues.Rd +++ b/man/parse_bugzilla_rest_issues.Rd @@ -27,11 +27,11 @@ Other parsers: \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, \code{\link{parse_gitlog}()}, +\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, -\code{\link{parse_jira}()}, 
-\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_mbox}()}, +\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_nvdfeed}()} } \concept{parsers} diff --git a/man/parse_bugzilla_rest_issues_comments.Rd b/man/parse_bugzilla_rest_issues_comments.Rd index b884739f..05da2855 100644 --- a/man/parse_bugzilla_rest_issues_comments.Rd +++ b/man/parse_bugzilla_rest_issues_comments.Rd @@ -29,11 +29,11 @@ Other parsers: \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, \code{\link{parse_gitlog}()}, +\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, -\code{\link{parse_jira}()}, -\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_mbox}()}, +\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_nvdfeed}()} } \concept{parsers} diff --git a/man/parse_commit_message_id.Rd b/man/parse_commit_message_id.Rd index 13d9e542..e090ef19 100644 --- a/man/parse_commit_message_id.Rd +++ b/man/parse_commit_message_id.Rd @@ -19,16 +19,16 @@ Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, \code{\link{parse_bugzilla_rest_comments}()}, -\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_bugzilla_rest_issues}()}, +\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, \code{\link{parse_gitlog}()}, +\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, -\code{\link{parse_jira}()}, -\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_mbox}()}, +\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_nvdfeed}()} } \concept{parsers} diff --git a/man/parse_dependencies.Rd b/man/parse_dependencies.Rd index a7136742..e4c58051 100644 --- a/man/parse_dependencies.Rd +++ b/man/parse_dependencies.Rd @@ -28,16 +28,16 @@ Other parsers: 
\code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, \code{\link{parse_bugzilla_rest_comments}()}, -\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_bugzilla_rest_issues}()}, +\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_commit_message_id}()}, \code{\link{parse_dv8_clusters}()}, \code{\link{parse_gitlog}()}, +\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, -\code{\link{parse_jira}()}, -\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_mbox}()}, +\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_nvdfeed}()} } \concept{parsers} diff --git a/man/parse_dv8_clusters.Rd b/man/parse_dv8_clusters.Rd index 987936bf..b4dc6249 100644 --- a/man/parse_dv8_clusters.Rd +++ b/man/parse_dv8_clusters.Rd @@ -17,16 +17,16 @@ Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, \code{\link{parse_bugzilla_rest_comments}()}, -\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_bugzilla_rest_issues}()}, +\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_commit_message_id}()}, \code{\link{parse_dependencies}()}, \code{\link{parse_gitlog}()}, +\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, -\code{\link{parse_jira}()}, -\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_mbox}()}, +\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_nvdfeed}()} Other dv8: diff --git a/man/parse_gitlog.Rd b/man/parse_gitlog.Rd index d4370808..7d65786f 100644 --- a/man/parse_gitlog.Rd +++ b/man/parse_gitlog.Rd @@ -23,16 +23,16 @@ Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, \code{\link{parse_bugzilla_rest_comments}()}, 
-\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_bugzilla_rest_issues}()}, +\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_commit_message_id}()}, \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, +\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, -\code{\link{parse_jira}()}, -\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_mbox}()}, +\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_nvdfeed}()} } \concept{parsers} diff --git a/man/parse_jira.Rd b/man/parse_jira.Rd index c3e8fe9a..0db0e226 100644 --- a/man/parse_jira.Rd +++ b/man/parse_jira.Rd @@ -33,16 +33,16 @@ Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, \code{\link{parse_bugzilla_rest_comments}()}, -\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_bugzilla_rest_issues}()}, +\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_commit_message_id}()}, \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, \code{\link{parse_gitlog}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, -\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_mbox}()}, +\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_nvdfeed}()} } \concept{parsers} diff --git a/man/parse_jira_latest_date.Rd b/man/parse_jira_latest_date.Rd index d05f3b82..e2a730b5 100644 --- a/man/parse_jira_latest_date.Rd +++ b/man/parse_jira_latest_date.Rd @@ -25,16 +25,16 @@ Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, \code{\link{parse_bugzilla_rest_comments}()}, -\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_bugzilla_rest_issues}()}, +\code{\link{parse_bugzilla_rest_issues_comments}()}, 
\code{\link{parse_commit_message_id}()}, \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, \code{\link{parse_gitlog}()}, -\code{\link{parse_jira_rss_xml}()}, \code{\link{parse_jira}()}, -\code{\link{parse_mbox_latest_date}()}, +\code{\link{parse_jira_rss_xml}()}, \code{\link{parse_mbox}()}, +\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_nvdfeed}()} } \concept{parsers} diff --git a/man/parse_jira_rss_xml.Rd b/man/parse_jira_rss_xml.Rd index 17b88ff5..1c0abecb 100644 --- a/man/parse_jira_rss_xml.Rd +++ b/man/parse_jira_rss_xml.Rd @@ -28,16 +28,16 @@ Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, \code{\link{parse_bugzilla_rest_comments}()}, -\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_bugzilla_rest_issues}()}, +\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_commit_message_id}()}, \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, \code{\link{parse_gitlog}()}, -\code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira}()}, -\code{\link{parse_mbox_latest_date}()}, +\code{\link{parse_jira_latest_date}()}, \code{\link{parse_mbox}()}, +\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_nvdfeed}()} } \concept{parsers} diff --git a/man/parse_mbox.Rd b/man/parse_mbox.Rd index fd578695..d4852995 100644 --- a/man/parse_mbox.Rd +++ b/man/parse_mbox.Rd @@ -23,15 +23,15 @@ Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, \code{\link{parse_bugzilla_rest_comments}()}, -\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_bugzilla_rest_issues}()}, +\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_commit_message_id}()}, \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, \code{\link{parse_gitlog}()}, 
+\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, -\code{\link{parse_jira}()}, \code{\link{parse_mbox_latest_date}()}, \code{\link{parse_nvdfeed}()} } diff --git a/man/parse_mbox_latest_date.Rd b/man/parse_mbox_latest_date.Rd index b45f1cbd..82187a41 100644 --- a/man/parse_mbox_latest_date.Rd +++ b/man/parse_mbox_latest_date.Rd @@ -24,15 +24,15 @@ Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, \code{\link{parse_bugzilla_rest_comments}()}, -\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_bugzilla_rest_issues}()}, +\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_commit_message_id}()}, \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, \code{\link{parse_gitlog}()}, +\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, -\code{\link{parse_jira}()}, \code{\link{parse_mbox}()}, \code{\link{parse_nvdfeed}()} } diff --git a/man/parse_nvdfeed.Rd b/man/parse_nvdfeed.Rd index 1c4365bd..0accc69d 100644 --- a/man/parse_nvdfeed.Rd +++ b/man/parse_nvdfeed.Rd @@ -18,16 +18,16 @@ Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, \code{\link{parse_bugzilla_rest_comments}()}, -\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_bugzilla_rest_issues}()}, +\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_commit_message_id}()}, \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, \code{\link{parse_gitlog}()}, +\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, -\code{\link{parse_jira}()}, -\code{\link{parse_mbox_latest_date}()}, -\code{\link{parse_mbox}()} +\code{\link{parse_mbox}()}, 
+\code{\link{parse_mbox_latest_date}()} } \concept{parsers} diff --git a/man/refresh_jira_issues.Rd b/man/refresh_jira_issues.Rd index 6e7118eb..20be8882 100644 --- a/man/refresh_jira_issues.Rd +++ b/man/refresh_jira_issues.Rd @@ -61,14 +61,14 @@ data. \code{\link{parse_jira_latest_date}} to retrieve the file path of the latest issue key Other downloaders: +\code{\link{download_jira_issues}()}, \code{\link{download_jira_issues_by_date}()}, -\code{\link{download_jira_issues_by_issue_key}()}, -\code{\link{download_jira_issues}()} +\code{\link{download_jira_issues_by_issue_key}()} Other jira: +\code{\link{download_jira_issues}()}, \code{\link{download_jira_issues_by_date}()}, -\code{\link{download_jira_issues_by_issue_key}()}, -\code{\link{download_jira_issues}()} +\code{\link{download_jira_issues_by_issue_key}()} } \concept{downloaders} \concept{jira} From 9b9c8963c25db4432113f80f90a94d6e3cfa53dd Mon Sep 17 00:00:00 2001 From: daomcgill <77309217+daomcgill@users.noreply.github.com> Date: Wed, 2 Oct 2024 13:57:42 -1000 Subject: [PATCH 18/80] i #284 Renamed save_folder_mail parameter to mbox - Renamed for match with convention set by issue #230 Signed-off-by: Dao McGill --- R/mail.R | 75 ++++++++++++++------------- conf/helix.yml | 16 +++--- man/download_mod_mbox.Rd | 6 +-- man/download_pipermail.Rd | 4 +- man/make_mbox_mailing_list.Rd | 4 +- man/parse_mbox.Rd | 4 +- man/parse_mbox_latest_date.Rd | 4 +- man/process_gz_to_mbox_in_folder.Rd | 4 +- man/refresh_mod_mbox.Rd | 9 +--- man/refresh_pipermail.Rd | 9 +--- tools.yml | 2 +- vignettes/download_mail.Rmd | 20 +++---- vignettes/parallelized_parse_mbox.Rmd | 51 ++++++++++++++++++ 13 files changed, 125 insertions(+), 83 deletions(-) create mode 100644 vignettes/parallelized_parse_mbox.Rmd diff --git a/R/mail.R b/R/mail.R index 861d32d2..a00025ee 100644 --- a/R/mail.R +++ b/R/mail.R @@ -21,11 +21,11 @@ #' @param mailing_list The name of the mailing list being downloaded (e.g. 
"https://mta.openssl.org/pipermail/openssl-announce/") #' @param start_year_month The year and month of the first file to be downloaded (format: 'YYYYMM') #' @param end_year_month The year and month of the last file to be downloaded (format: 'YYYYMM', or use 'format(Sys.Date(), "%Y%m")' for the current month) -#' @param save_folder_path The folder path in which all the downloaded pipermail files will be stored +#' @param mbox The folder path in which all the downloaded pipermail files will be stored #' @param verbose if TRUE, prints diagnostic messages during the download process #' @return Returns `downloaded_files`, a vector of the downloaded files in the current working directory #' @export -download_pipermail <- function(mailing_list, start_year_month, end_year_month, save_folder_path, verbose = TRUE) { +download_pipermail <- function(mailing_list, start_year_month, end_year_month, mbox, verbose = TRUE) { ########## Download and Parse Mailing List HTML for Links ########## # Ensure mailing_list url ends with a slash, which is important when constructing links for downloading files, @@ -122,7 +122,7 @@ download_pipermail <- function(mailing_list, start_year_month, end_year_month, s } # Define the destination file name and path where the downloaded content will be saved as a .mbox file. - dest <- file.path(save_folder_path, stringi::stri_c('kaiaulu_', year_month_clean, '.mbox')) + dest <- file.path(mbox, stringi::stri_c('kaiaulu_', year_month_clean, '.mbox')) ########## Write Downloaded File to Disk ########## # Print diagnostic info if verbose is TRUE @@ -134,7 +134,7 @@ download_pipermail <- function(mailing_list, start_year_month, end_year_month, s # Write the downloaded file to disk. If the file is a .gz file, it needs to be unzipped and converted to .mbox format. if (grepl("\\.gz$", download_url)) { # Download the .gz file to a temporary location. 
- gz_file_path <- file.path(save_folder_path, stringi::stri_c('kaiaulu_', year_month_clean, '.mbox.gz')) + gz_file_path <- file.path(mbox, stringi::stri_c('kaiaulu_', year_month_clean, '.mbox.gz')) httr::GET(download_url, httr::write_disk(gz_file_path, overwrite = TRUE), httr::timeout(60)) # Unzip the .gz file and save the contents as a .mbox file. @@ -179,16 +179,16 @@ download_pipermail <- function(mailing_list, start_year_month, end_year_month, s #' #' @param mailing_list The URL of the mailing list being downloaded (e.g., "https://mta.openssl.org/pipermail/openssl-announce/") #' @param start_year_month The year and month of the first file to be downloaded (format: 'YYYYMM'). -#' @param save_folder_path The folder path in which all the downloaded pipermail files will be stored. +#' @param mbox The folder path in which all the downloaded pipermail files will be stored. #' @param verbose if TRUE, prints diagnostic messages. #' @return Returns `downloaded_files`, a vector of the newly downloaded files in the current working directory. #' @export -refresh_pipermail <- function(mailing_list, start_year_month, save_folder_path, verbose = TRUE) { +refresh_pipermail <- function(mailing_list, start_year_month, mbox, verbose = TRUE) { ########## Check if Folder is Empty ########## # Check the contents of the folder to see if any .mbox files are already present # The function looks for files that match the naming pattern 'kaiaulu_YYYYMM.mbox' - files_in_folder <- list.files(save_folder_path, pattern = "kaiaulu_\\d{6}\\.mbox$") + files_in_folder <- list.files(mbox, pattern = "kaiaulu_\\d{6}\\.mbox$") if (length(files_in_folder) == 0) { # If the folder is empty, download all pipermail files starting from the start_year_month @@ -197,7 +197,7 @@ refresh_pipermail <- function(mailing_list, start_year_month, save_folder_path, if (verbose) cat("Folder is empty. 
Downloading from", start_year_month, "to", end_year_month, "\n") # Call the download_pipermail function to download files from start_year_month to end_year_month - download_pipermail(mailing_list, start_year_month, end_year_month, save_folder_path) + download_pipermail(mailing_list, start_year_month, end_year_month, mbox) } ########## Identify the Most Recent Month ########## else { @@ -209,7 +209,7 @@ refresh_pipermail <- function(mailing_list, start_year_month, save_folder_path, recent_month <- max(year_months) # Delete the most recent file before redownloading it - recent_file <- file.path(save_folder_path, stringi::stri_c("kaiaulu_", recent_month, ".mbox")) + recent_file <- file.path(mbox, stringi::stri_c("kaiaulu_", recent_month, ".mbox")) if (file.exists(recent_file)) { file.remove(recent_file) if (verbose) cat("Deleted the most recent file:", recent_file, "\n") @@ -223,12 +223,12 @@ refresh_pipermail <- function(mailing_list, start_year_month, save_folder_path, if (verbose) cat("Redownloading from", recent_month, "to", end_year_month, "\n") # Call the download_pipermail function to redownload the deleted month and all subsequent months up to the current month - download_pipermail(mailing_list, recent_month, end_year_month, save_folder_path) + download_pipermail(mailing_list, recent_month, end_year_month, mbox) } ########## Process .gz Files After Refresh ########## # Call process_gz_to_mbox_in_folder to ensure all .gz files are converted to .mbox after the refresh if (verbose) cat("Processing .gz files in the folder (if any) to convert them to .mbox format...\n") - process_gz_to_mbox_in_folder(folder_path = save_folder_path, verbose = verbose) + process_gz_to_mbox_in_folder(mbox = mbox, verbose = verbose) } @@ -239,14 +239,14 @@ refresh_pipermail <- function(mailing_list, start_year_month, save_folder_path, #' If a .mbox file with the same name already exists, it will be overwritten. 
#' This makes sure that all the files in the folder are in .mbox format, ready for parsing. #' -#' @param folder_path The path to the folder containing both .gz and .mbox files. +#' @param mbox The path to the folder containing both .gz and .mbox files. #' @param verbose if TRUE, prints diagnostic messages during processing. #' @return A list of the .mbox files that were created or updated. #' @export -process_gz_to_mbox_in_folder <- function(folder_path, verbose = TRUE) { +process_gz_to_mbox_in_folder <- function(mbox, verbose = TRUE) { # Get the list of all files in the folder, including full paths - files <- list.files(folder_path, full.names = TRUE) + files <- list.files(mbox, full.names = TRUE) # Identify .gz files from the list of files gz_files <- files[grepl("\\.gz$", files)] @@ -313,11 +313,11 @@ process_gz_to_mbox_in_folder <- function(folder_path, verbose = TRUE) { #' (e.g., "https://lists.apache.org/list.html?announce@apache.org"). #' @param start_year_month The year and month of the first file to be downloaded (format: 'YYYYMM'). #' @param end_year_month The year and month of the last file to be downloaded (format: 'YYYYMM'). -#' @param save_folder_path The folder path where all the downloaded mbox files will be stored. +#' @param mbox The folder path where all the downloaded mbox files will be stored. #' @param verbose if TRUE, prints detailed messages during the download process. -#' @return Returns `save_folder_path`, the folder path where the mbox files are stored. +#' @return Returns `mbox`, the folder path where the mbox files are stored. #' @export -download_mod_mbox <- function(mailing_list, start_year_month, end_year_month, save_folder_path, verbose = TRUE) { +download_mod_mbox <- function(mailing_list, start_year_month, end_year_month, mbox, verbose = TRUE) { ########## Extract Mailing List Name ########## # Extract the mailing list name from the given URL. 
This is because the actual list name is @@ -357,7 +357,7 @@ download_mod_mbox <- function(mailing_list, start_year_month, end_year_month, sa # Create the file name where the mbox will be saved locally, in the format ''kaiaulu_'YYYYMM.mbox'. file_name <- stringi::stri_c("kaiaulu_", year, month_str, ".mbox") - file_path <- file.path(save_folder_path, file_name) + file_path <- file.path(mbox, file_name) if (verbose) { cat("Constructed URL:", download_url, "\n") @@ -386,7 +386,7 @@ download_mod_mbox <- function(mailing_list, start_year_month, end_year_month, sa ########## Return Save Path ########## # Return the folder path where all mbox files were saved. - return(save_folder_path) + return(mbox) } @@ -405,16 +405,16 @@ download_mod_mbox <- function(mailing_list, start_year_month, end_year_month, sa #' #' @param mailing_list The URL of the mailing list being downloaded (e.g., 'https://lists.apache.org/list.html?announce@apache.org') #' @param start_year_month The year and month of the first file to be downloaded (format: 'YYYYMM'). -#' @param save_folder_path The folder path in which all the downloaded mod_mbox files will be stored. +#' @param mbox The folder path in which all the downloaded mod_mbox files will be stored. #' @param verbose if TRUE, prints diagnostic messages. #' @return Returns `downloaded_files`, a vector of the newly downloaded files in the current working directory. #' @export -refresh_mod_mbox <- function(mailing_list, start_year_month, save_folder_path, verbose = TRUE) { +refresh_mod_mbox <- function(mailing_list, start_year_month, mbox, verbose = TRUE) { ########## Check if Folder is Empty ########## # Check the contents of the folder to see if any .mbox files are already present. 
# The function looks for files that match the naming pattern 'kaiaulu_YYYYMM.mbox' - files_in_folder <- list.files(save_folder_path, pattern = "kaiaulu_\\d{6}\\.mbox$") + files_in_folder <- list.files(mbox, pattern = "kaiaulu_\\d{6}\\.mbox$") if (length(files_in_folder) == 0) { # If the folder is empty, download all mod_mbox files starting from start_year_month @@ -423,7 +423,7 @@ refresh_mod_mbox <- function(mailing_list, start_year_month, save_folder_path, v if (verbose) cat("Folder is empty. Downloading from", start_year_month, "to", end_year_month, "\n") # Call the download_mod_mbox function to download files from start_year_month to end_year_month - download_mod_mbox(mailing_list, start_year_month, end_year_month, save_folder_path, verbose = verbose) + download_mod_mbox(mailing_list, start_year_month, end_year_month, mbox, verbose = verbose) } ########## Identify the Most Recent Month ########## else { @@ -435,7 +435,7 @@ refresh_mod_mbox <- function(mailing_list, start_year_month, save_folder_path, v recent_month <- max(year_months) # Delete the most recent file before redownloading it - recent_file <- file.path(save_folder_path, stringi::stri_c("kaiaulu_", recent_month, ".mbox")) + recent_file <- file.path(mbox, stringi::stri_c("kaiaulu_", recent_month, ".mbox")) if (file.exists(recent_file)) { file.remove(recent_file) if (verbose) cat("Deleted the most recent file:", recent_file, "\n") @@ -449,7 +449,7 @@ refresh_mod_mbox <- function(mailing_list, start_year_month, save_folder_path, v if (verbose) cat("Redownloading from", recent_month, "to", end_year_month, "\n") # Call the download_mod_mbox function to redownload the deleted month and all subsequent months up to the current month - download_mod_mbox(mailing_list, recent_month, end_year_month, save_folder_path, verbose = verbose) + download_mod_mbox(mailing_list, recent_month, end_year_month, mbox, verbose = verbose) } } @@ -465,19 +465,19 @@ refresh_mod_mbox <- function(mailing_list, start_year_month, 
save_folder_path, v #' consistently renamed for clarity. #' #' @param perceval_path path to perceval binary -#' @param mbox_path path to mbox archive file (ends in .mbox) +#' @param mbox path to mbox archive file (ends in .mbox) #' @export #' @family parsers -parse_mbox <- function(perceval_path, mbox_path){ +parse_mbox <- function(perceval_path, mbox){ # Expand paths (e.g. "~/Desktop" => "/Users/someuser/Desktop") perceval_path <- path.expand(perceval_path) - mbox_path <- path.expand(mbox_path) + mbox <- path.expand(mbox) # Remove ".mbox" - mbox_uri <- stringi::stri_replace_last_regex(mbox_path, pattern = "\\.mbox$", replacement = "") + mbox_uri <- stringi::stri_replace_last_regex(mbox, pattern = "\\.mbox$", replacement = "") - # Use percerval to parse mbox_path. --json line is required to be parsed by jsonlite::fromJSON. + # Use percerval to parse mbox. --json line is required to be parsed by jsonlite::fromJSON. perceval_output <- system2(perceval_path, - args = c('mbox',mbox_uri,mbox_path,'--json-line'), + args = c('mbox',mbox_uri,mbox,'--json-line'), stdout = TRUE, stderr = FALSE) @@ -506,12 +506,12 @@ parse_mbox <- function(perceval_path, mbox_path){ #' The folder assumes the following convention: "(mailing_list)_(archive_type)_yearmonth.mbox" #' For example: "geronimo-dev_apache_202401.mbox". 
This nomenclature is defined by \code{\link{download_mod_mbox_per_month}} #' -#' @param mbox_path path to mbox archive file (ends in .mbox) +#' @param mbox path to mbox archive file (ends in .mbox) #' @return Returns the name of the latest mod_mbox file #' @export #' @family parsers -parse_mbox_latest_date <- function(mbox_path) { - file_list <- list.files(mbox_path) +parse_mbox_latest_date <- function(mbox) { + file_list <- list.files(mbox) date_list <- list() for(i in file_list){ i <- sub(".mbox", "", i) @@ -588,14 +588,14 @@ make_mbox_reply <- function(mailing_list, reply_from_author, reply_from_email, r #' fake .mbox file #' #' @param replies An array of replies that have been created with \code{\link{make_mbox_reply}} -#' @param folder_path Folder path for the .mbox file being created. Defaulted at /tmp +#' @param mbox Folder path for the .mbox file being created. Defaulted at /tmp #' @param file_name Name of the file that will store the .mbox file #' @return the path of the .mbox file that was created #' @export -make_mbox_mailing_list <- function(replies, folder_path = "/tmp", file_name) { +make_mbox_mailing_list <- function(replies, mbox = "/tmp", file_name) { # Create a unique filename for the mbox file - mbox_filepath <- file.path(folder_path, stringi::stri_c(file_name, ".mbox")) + mbox_filepath <- file.path(mbox, stringi::stri_c(file_name, ".mbox")) # make the file mbox_body <- stringi::stri_c(replies,collapse = "\n\n") @@ -608,3 +608,4 @@ make_mbox_mailing_list <- function(replies, folder_path = "/tmp", file_name) { + diff --git a/conf/helix.yml b/conf/helix.yml index a3464fab..f2b13969 100644 --- a/conf/helix.yml +++ b/conf/helix.yml @@ -53,29 +53,29 @@ mailing_list: mailing_list: https://lists.apache.org/list.html?announce@apache.org start_year_month: 202310 end_year_month: 202405 - save_folder_path: "../save_mbox_mail" - save_parsed_folder_path: "../save_parsed_mail" + mbox: "../../extdata/save_mbox_mail" + save_parsed_folder_path: 
"../../extdata/save_parsed_mail" project_key_2: mailing_list: https://lists.apache.org/list.html?dev@felix.apache.org start_year_month: 202201 end_year_month: 202401 - save_folder_path: "../save_mbox_mail" - save_parsed_folder_path: "../save_parsed_mail" + mbox: "../../extdata/save_mbox_mail" + save_parsed_folder_path: "../../../extdata/save_parsed_mail" pipermail: project_key_1: # archive_url: https://mta.openssl.org/mailman/listinfo/ mailing_list: https://mta.openssl.org/pipermail/openssl-users/ start_year_month: 202310 end_year_month: 202405 - save_folder_path: "../save_folder_mail" - save_parsed_folder_path: "../save_parsed_mail" + mbox: "../../extdata/save_folder_mail" + save_parsed_folder_path: "../../../extdata/save_parsed_mail" project_key_2: # archive_url: https://mta.openssl.org/mailman/listinfo/ mailing_list: https://mta.openssl.org/pipermail/openssl-project/ start_year_month: 202203 end_year_month: 202303 - save_folder_path: "../save_folder_mail_2" - save_parsed_folder_path: "../save_parsed_mail" + mbox: "../../extdata/save_folder_mail_2" + save_parsed_folder_path: "../../../extdata/save_parsed_mail" issue_tracker: jira: diff --git a/man/download_mod_mbox.Rd b/man/download_mod_mbox.Rd index c02cf5d8..49d75910 100644 --- a/man/download_mod_mbox.Rd +++ b/man/download_mod_mbox.Rd @@ -8,7 +8,7 @@ download_mod_mbox( mailing_list, start_year_month, end_year_month, - save_folder_path, + mbox, verbose = TRUE ) } @@ -20,12 +20,12 @@ download_mod_mbox( \item{end_year_month}{The year and month of the last file to be downloaded (format: 'YYYYMM').} -\item{save_folder_path}{The folder path where all the downloaded mbox files will be stored.} +\item{mbox}{The folder path where all the downloaded mbox files will be stored.} \item{verbose}{if TRUE, prints detailed messages during the download process.} } \value{ -Returns `save_folder_path`, the folder path where the mbox files are stored. +Returns `mbox`, the folder path where the mbox files are stored. 
} \description{ This function downloads mod_mbox archives from a specified Apache Pony Mail mailing list as .mbox files. diff --git a/man/download_pipermail.Rd b/man/download_pipermail.Rd index 0aa1bc50..0b2f1d33 100644 --- a/man/download_pipermail.Rd +++ b/man/download_pipermail.Rd @@ -8,7 +8,7 @@ download_pipermail( mailing_list, start_year_month, end_year_month, - save_folder_path, + mbox, verbose = TRUE ) } @@ -19,7 +19,7 @@ download_pipermail( \item{end_year_month}{The year and month of the last file to be downloaded (format: 'YYYYMM', or use 'format(Sys.Date(), "%Y%m")' for the current month)} -\item{save_folder_path}{The folder path in which all the downloaded pipermail files will be stored} +\item{mbox}{The folder path in which all the downloaded pipermail files will be stored} \item{verbose}{if TRUE, prints diagnostic messages during the download process} } diff --git a/man/make_mbox_mailing_list.Rd b/man/make_mbox_mailing_list.Rd index c81dbfd3..2ab66721 100644 --- a/man/make_mbox_mailing_list.Rd +++ b/man/make_mbox_mailing_list.Rd @@ -4,12 +4,12 @@ \alias{make_mbox_mailing_list} \title{Takes in mbox replies and creates a .mbox file} \usage{ -make_mbox_mailing_list(replies, folder_path = "/tmp", file_name) +make_mbox_mailing_list(replies, mbox = "/tmp", file_name) } \arguments{ \item{replies}{An array of replies that have been created with \code{\link{make_mbox_reply}}} -\item{folder_path}{Folder path for the .mbox file being created. Defaulted at /tmp} +\item{mbox}{Folder path for the .mbox file being created. 
Defaulted at /tmp} \item{file_name}{Name of the file that will store the .mbox file} } diff --git a/man/parse_mbox.Rd b/man/parse_mbox.Rd index d4852995..99bbdaeb 100644 --- a/man/parse_mbox.Rd +++ b/man/parse_mbox.Rd @@ -4,12 +4,12 @@ \alias{parse_mbox} \title{Parse mbox from Perceval} \usage{ -parse_mbox(perceval_path, mbox_path) +parse_mbox(perceval_path, mbox) } \arguments{ \item{perceval_path}{path to perceval binary} -\item{mbox_path}{path to mbox archive file (ends in .mbox)} +\item{mbox}{path to mbox archive file (ends in .mbox)} } \description{ Parses an mbox file, which consists of emails in a mailbox, using the Perceval library. diff --git a/man/parse_mbox_latest_date.Rd b/man/parse_mbox_latest_date.Rd index 82187a41..3ce065fd 100644 --- a/man/parse_mbox_latest_date.Rd +++ b/man/parse_mbox_latest_date.Rd @@ -4,10 +4,10 @@ \alias{parse_mbox_latest_date} \title{Parse mbox latest date} \usage{ -parse_mbox_latest_date(mbox_path) +parse_mbox_latest_date(mbox) } \arguments{ -\item{mbox_path}{path to mbox archive file (ends in .mbox)} +\item{mbox}{path to mbox archive file (ends in .mbox)} } \value{ Returns the name of the latest mod_mbox file diff --git a/man/process_gz_to_mbox_in_folder.Rd b/man/process_gz_to_mbox_in_folder.Rd index 1964df28..93536f12 100644 --- a/man/process_gz_to_mbox_in_folder.Rd +++ b/man/process_gz_to_mbox_in_folder.Rd @@ -4,10 +4,10 @@ \alias{process_gz_to_mbox_in_folder} \title{Process .gz files in a folder and convert them to .mbox} \usage{ -process_gz_to_mbox_in_folder(folder_path, verbose = TRUE) +process_gz_to_mbox_in_folder(mbox, verbose = TRUE) } \arguments{ -\item{folder_path}{The path to the folder containing both .gz and .mbox files.} +\item{mbox}{The path to the folder containing both .gz and .mbox files.} \item{verbose}{if TRUE, prints diagnostic messages during processing.} } diff --git a/man/refresh_mod_mbox.Rd b/man/refresh_mod_mbox.Rd index 43f6349a..e83f8e1f 100644 --- a/man/refresh_mod_mbox.Rd +++ 
b/man/refresh_mod_mbox.Rd @@ -4,19 +4,14 @@ \alias{refresh_mod_mbox} \title{Refresh mbox files downloaded via mod_mbox} \usage{ -refresh_mod_mbox( - mailing_list, - start_year_month, - save_folder_path, - verbose = TRUE -) +refresh_mod_mbox(mailing_list, start_year_month, mbox, verbose = TRUE) } \arguments{ \item{mailing_list}{The URL of the mailing list being downloaded (e.g., 'https://lists.apache.org/list.html?announce@apache.org')} \item{start_year_month}{The year and month of the first file to be downloaded (format: 'YYYYMM').} -\item{save_folder_path}{The folder path in which all the downloaded mod_mbox files will be stored.} +\item{mbox}{The folder path in which all the downloaded mod_mbox files will be stored.} \item{verbose}{if TRUE, prints diagnostic messages.} } diff --git a/man/refresh_pipermail.Rd b/man/refresh_pipermail.Rd index dc2ce0b2..bb541307 100644 --- a/man/refresh_pipermail.Rd +++ b/man/refresh_pipermail.Rd @@ -4,19 +4,14 @@ \alias{refresh_pipermail} \title{Refresh mbox files downloaded via pipermail} \usage{ -refresh_pipermail( - mailing_list, - start_year_month, - save_folder_path, - verbose = TRUE -) +refresh_pipermail(mailing_list, start_year_month, mbox, verbose = TRUE) } \arguments{ \item{mailing_list}{The URL of the mailing list being downloaded (e.g., "https://mta.openssl.org/pipermail/openssl-announce/")} \item{start_year_month}{The year and month of the first file to be downloaded (format: 'YYYYMM').} -\item{save_folder_path}{The folder path in which all the downloaded pipermail files will be stored.} +\item{mbox}{The folder path in which all the downloaded pipermail files will be stored.} \item{verbose}{if TRUE, prints diagnostic messages.} } diff --git a/tools.yml b/tools.yml index d3bbc518..51f9155d 100644 --- a/tools.yml +++ b/tools.yml @@ -7,7 +7,7 @@ refactoring_miner: ~/RefactoringMiner-1.0/bin/RefactoringMiner # https://github.com/boyter/scc scc: ~/scc/scc # universal-ctags -utags: 
/usr/local/Cellar/universal-ctags/HEAD-40b5861/bin/ctags +utags: /usr/local/bin/ctags # https://archdia.com/ dv8: /Applications/DV84/bin/dv8-console # OSLOM: http://oslom.org/ diff --git a/vignettes/download_mail.Rmd b/vignettes/download_mail.Rmd index a1e91950..7c2feb9f 100644 --- a/vignettes/download_mail.Rmd +++ b/vignettes/download_mail.Rmd @@ -55,14 +55,14 @@ conf <- yaml::read_yaml("../conf/helix.yml") mailing_list <- conf[["mailing_list"]][["pipermail"]][["project_key_1"]][["mailing_list"]] start_year_month <- conf[["mailing_list"]][["pipermail"]][["project_key_1"]][["start_year_month"]] end_year_month <- conf[["mailing_list"]][["pipermail"]][["project_key_1"]][["end_year_month"]] -save_folder_path <- conf[["mailing_list"]][["pipermail"]][["project_key_1"]][["save_folder_path"]] +mbox <- conf[["mailing_list"]][["pipermail"]][["project_key_1"]][["mbox"]] ``` ### Explanation of Configuration Parameters - mailing_list: The URL of the mailing list archive index page (e.g., https://lists.openssl.org/pipermail/openssl-users/). - start_year_month: The starting date for downloading archives (in YYYYMM format). - end_year_month: The ending date for downloading archives (in YYYYMM format). -- save_folder_path: The local directory where the downloaded archives will be saved. +- mbox: The local directory where the downloaded archives will be saved. ## Pipermail Downloader You can download the archives using the download_pipermail() function, which downloads and saves .mbox files to the specified directory. The .mbox files are named with the format kaiaulu_YYYYMM.mbox, where YYYYMM refers to the year and month of the archive. 
@@ -72,7 +72,7 @@ download_pipermail( mailing_list = mailing_list, start_year_month = start_year_month, end_year_month = end_year_month, - save_folder_path = save_folder_path, + mbox = mbox, verbose = TRUE ) @@ -91,7 +91,7 @@ How refresh_pipermail Works refresh_pipermail( mailing_list = mailing_list, start_year_month = start_year_month, - save_folder_path = save_folder_path, + mbox = mbox, verbose = TRUE ) @@ -110,14 +110,14 @@ Similar to Pipermail, we load the configuration for Mod Mbox from the YAML file, mod_mbox_list <- conf[["mailing_list"]][["mod_mbox"]][["project_key_1"]][["mailing_list"]] mod_start_year_month <- conf[["mailing_list"]][["mod_mbox"]][["project_key_1"]][["start_year_month"]] mod_end_year_month <- conf[["mailing_list"]][["mod_mbox"]][["project_key_1"]][["end_year_month"]] -mod_save_folder_path <- conf[["mailing_list"]][["mod_mbox"]][["project_key_1"]][["save_folder_path"]] +mod_mbox <- conf[["mailing_list"]][["mod_mbox"]][["project_key_1"]][["mbox"]] ``` ### Explanation of Configuration Parameters - mailing_list: The URL of the Mod Mbox mailing list (e.g., https://lists.apache.org/list.html?announce@apache.org). - start_year_month: The first month to download (format: YYYYMM). - end_year_month: The last month to download (format: YYYYMM). -- save_folder_path: The directory where the downloaded .mbox files will be saved. +- mbox: The directory where the downloaded .mbox files will be saved. ## Mod Mbox Downloader The download_mod_mbox() function downloads Mod Mbox archives by constructing URLs based on the mailing list and date range, saving them as .mbox files named kaiaulu_YYYYMM.mbox. 
@@ -127,7 +127,7 @@ download_mod_mbox( mailing_list = mod_mbox_list, start_year_month = mod_start_year_month, end_year_month = mod_end_year_month, - save_folder_path = mod_save_folder_path, + mbox = mod_mbox, verbose = TRUE ) ``` @@ -146,7 +146,7 @@ How refresh_mod_mbox Works refresh_mod_mbox( mailing_list = mod_mbox_list, start_year_month = mod_start_year_month, - save_folder_path = mod_save_folder_path, + mbox = mod_mbox, verbose = TRUE ) ``` @@ -167,13 +167,13 @@ tools_config <- yaml::read_yaml("../tools.yml") parse_perceval_path <- tools_config[["perceval"]] conf <- yaml::read_yaml("../conf/helix.yml") -parse_mbox_path <- conf[["mailing_list"]][["mod_mbox"]][["project_key_1"]][["save_folder_path"]] +parse_mbox <- conf[["mailing_list"]][["mod_mbox"]][["project_key_1"]][["mbox"]] ``` Run the function using this: ```{r} parsed_mail <- parse_mbox( perceval_path = parse_perceval_path, - mbox_path = parse_mbox_path + mbox = parse_mbox ) ``` This will store the parsed data into the parsed_mail variable. To view the table, use: diff --git a/vignettes/parallelized_parse_mbox.Rmd b/vignettes/parallelized_parse_mbox.Rmd new file mode 100644 index 00000000..979e0f14 --- /dev/null +++ b/vignettes/parallelized_parse_mbox.Rmd @@ -0,0 +1,51 @@ +--- +title: "Parallel Parsing of Mbox Files Using Python and R" +output: + html_document: + toc: true + number_sections: true +vignette: > + %\VignetteEngine{knitr::rmarkdown} + %\VignetteIndexEntry{Parallel Parsing of Mbox Files Using Python and R} + %\VignetteEncoding{UTF-8} +--- + +rm(list = ls()) +seed <- 1 +set.seed(seed) + +# Load required libraries +require(kaiaulu) +require(data.table) +require(yaml) +require(cli) + +# Introduction +This notebook demonstrates how to efficiently parse .mbox files using parallel processing. Python's ThreadPoolExecutor handles the parallel parsing of multiple .mbox files using an R script (parsembox.R) through a Python wrapper. 
This allows concurrent processing of mailing list archives, which is useful when dealing with large datasets. + +# Prerequisites +1. A collection of .mbox files to parse (see download_mail.Rmd for help with this) +2. Perceval + +## Project Configuration File + +```{r} +tools_config <- yaml::read_yaml("../tools.yml") +perceval_path <- tools_config[["perceval"]] + +conf <- yaml::read_yaml("../conf/helix.yml") +save_folder_path <- conf[["mailing_list"]][["pipermail"]][["project_key_1"]][["save_folder_path"]] +``` + +## Python Thread Manager +We will use the parse_mbox.py Python script to execute the parsing of multiple .mbox files in parallel. The Python script uses the ThreadPoolExecutor to run the R script (parsembox.R) concurrently on each .mbox file. + +Here is how it works: + +1. Python's ThreadPoolExecutor is used to handle parallel processing. +2. Each thread calls the R script for one .mbox file. +3. The R script processes the file and saves the output as a CSV file. + +```{bash} +python3 ../inst/python/parse_mbox.py +``` From 7249c9b51ca8adb7c576b0c36e08a368015d7fb6 Mon Sep 17 00:00:00 2001 From: daomcgill <77309217+daomcgill@users.noreply.github.com> Date: Wed, 2 Oct 2024 21:52:28 -1000 Subject: [PATCH 19/80] i #284 Updated Notebook download_mail.Rmd - Reverted name change of save_folder_mail - Removed previous documentation file for mail (download_mod_mbox.Rmd) - Updates to download_mail.Rmd --- DESCRIPTION | 3 +- NEWS.md | 2 + R/mail.R | 60 ++++++------ conf/helix.yml | 22 +++-- man/download_mod_mbox.Rd | 6 +- man/download_pipermail.Rd | 4 +- man/parse_mbox.Rd | 4 +- man/process_gz_to_mbox_in_folder.Rd | 4 +- man/refresh_mod_mbox.Rd | 9 +- man/refresh_pipermail.Rd | 9 +- vignettes/download_mail.Rmd | 134 +++++++++++++++++++------- vignettes/download_mod_mbox.Rmd | 102 -------------------- vignettes/parallelized_parse_mbox.Rmd | 51 ---------- 13 files changed, 166 insertions(+), 244 deletions(-) delete mode 100644 vignettes/download_mod_mbox.Rmd delete mode
100644 vignettes/parallelized_parse_mbox.Rmd diff --git a/DESCRIPTION b/DESCRIPTION index 1607fb2f..5a793074 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -20,7 +20,8 @@ Authors@R: c( person('Nicole', 'Hoess', role = c('ctb')), person('Anthony', 'Lau', role = c('ctb')), person('Sean', 'Sunoo', role = c('ctb')), - person('Ian Jaymes', 'Iwata', role= c('ctb')) + person('Ian Jaymes', 'Iwata', role= c('ctb')), + person('Dao', 'McGill', role= c('ctb')) ) Maintainer: Carlos Paradis License: MPL-2.0 | file LICENSE diff --git a/NEWS.md b/NEWS.md index 0abffa8e..869a7d8a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -3,6 +3,7 @@ __kaiaulu 0.0.0.9700 (in development)__ ### NEW FEATURES + * Refactor of all R/mail.R mailing list functions for downloading and refreshing both pipermail and mod mbox archives. [#284](https://github.com/sailuh/kaiaulu/issues/284) * `refresh_mod_mbox()` and `refresh_pipermail()` has been added. They are both functions that downloads mbox issues that are not already downloaded up until the current year and month. [#284](https://github.com/sailuh/kaiaulu/issues/284) * `parse_mbox_latest_date()` has been added. This function returns the file name of the downloaded mbox file containing the latest date for use by `download_mbox_per_month()` and `download_pipermail` to implement a refresh capability. [#284](https://github.com/sailuh/kaiaulu/issues/284) * `refresh_jira_issues()` had been added. It is a wrapper function for the previous downloader and downloads only issues greater than the greatest key already downloaded. @@ -30,6 +31,7 @@ __kaiaulu 0.0.0.9700 (in development)__ ### MINOR IMPROVEMENTS + * All mailing list documentation can now be found in 'download_mail.Rmd'. [#284](https://github.com/sailuh/kaiaulu/issues/284) * `download_pipermail()` now downloads all the txt and txt.gz files in the accessed pipermail archive as mbox files. 
[#284](https://github.com/sailuh/kaiaulu/issues/284) * The line metrics notebook now provides further guidance on adjusting the snapshot and filtering. * The R File and R Function parser can now properly parse R folders which contain folders within (not following R package structure). Both `.r` and `.R` files are also now captured (previously only one of the two were specified, but R accepts both). [#235](https://github.com/sailuh/kaiaulu/issues/235) diff --git a/R/mail.R b/R/mail.R index a00025ee..710ccaa2 100644 --- a/R/mail.R +++ b/R/mail.R @@ -21,11 +21,11 @@ #' @param mailing_list The name of the mailing list being downloaded (e.g. "https://mta.openssl.org/pipermail/openssl-announce/") #' @param start_year_month The year and month of the first file to be downloaded (format: 'YYYYMM') #' @param end_year_month The year and month of the last file to be downloaded (format: 'YYYYMM', or use 'format(Sys.Date(), "%Y%m")' for the current month) -#' @param mbox The folder path in which all the downloaded pipermail files will be stored +#' @param save_folder_path The folder path in which all the downloaded pipermail files will be stored #' @param verbose if TRUE, prints diagnostic messages during the download process #' @return Returns `downloaded_files`, a vector of the downloaded files in the current working directory #' @export -download_pipermail <- function(mailing_list, start_year_month, end_year_month, mbox, verbose = TRUE) { +download_pipermail <- function(mailing_list, start_year_month, end_year_month, save_folder_path, verbose = TRUE) { ########## Download and Parse Mailing List HTML for Links ########## # Ensure mailing_list url ends with a slash, which is important when constructing links for downloading files, @@ -122,7 +122,7 @@ download_pipermail <- function(mailing_list, start_year_month, end_year_month, m } # Define the destination file name and path where the downloaded content will be saved as a .mbox file. 
- dest <- file.path(mbox, stringi::stri_c('kaiaulu_', year_month_clean, '.mbox')) + dest <- file.path(save_folder_path, stringi::stri_c('kaiaulu_', year_month_clean, '.mbox')) ########## Write Downloaded File to Disk ########## # Print diagnostic info if verbose is TRUE @@ -134,7 +134,7 @@ download_pipermail <- function(mailing_list, start_year_month, end_year_month, m # Write the downloaded file to disk. If the file is a .gz file, it needs to be unzipped and converted to .mbox format. if (grepl("\\.gz$", download_url)) { # Download the .gz file to a temporary location. - gz_file_path <- file.path(mbox, stringi::stri_c('kaiaulu_', year_month_clean, '.mbox.gz')) + gz_file_path <- file.path(save_folder_path, stringi::stri_c('kaiaulu_', year_month_clean, '.mbox.gz')) httr::GET(download_url, httr::write_disk(gz_file_path, overwrite = TRUE), httr::timeout(60)) # Unzip the .gz file and save the contents as a .mbox file. @@ -179,16 +179,16 @@ download_pipermail <- function(mailing_list, start_year_month, end_year_month, m #' #' @param mailing_list The URL of the mailing list being downloaded (e.g., "https://mta.openssl.org/pipermail/openssl-announce/") #' @param start_year_month The year and month of the first file to be downloaded (format: 'YYYYMM'). -#' @param mbox The folder path in which all the downloaded pipermail files will be stored. +#' @param save_folder_path The folder path in which all the downloaded pipermail files will be stored. #' @param verbose if TRUE, prints diagnostic messages. #' @return Returns `downloaded_files`, a vector of the newly downloaded files in the current working directory. 
#' @export -refresh_pipermail <- function(mailing_list, start_year_month, mbox, verbose = TRUE) { +refresh_pipermail <- function(mailing_list, start_year_month, save_folder_path, verbose = TRUE) { ########## Check if Folder is Empty ########## # Check the contents of the folder to see if any .mbox files are already present # The function looks for files that match the naming pattern 'kaiaulu_YYYYMM.mbox' - files_in_folder <- list.files(mbox, pattern = "kaiaulu_\\d{6}\\.mbox$") + files_in_folder <- list.files(save_folder_path, pattern = "kaiaulu_\\d{6}\\.mbox$") if (length(files_in_folder) == 0) { # If the folder is empty, download all pipermail files starting from the start_year_month @@ -197,7 +197,7 @@ refresh_pipermail <- function(mailing_list, start_year_month, mbox, verbose = TR if (verbose) cat("Folder is empty. Downloading from", start_year_month, "to", end_year_month, "\n") # Call the download_pipermail function to download files from start_year_month to end_year_month - download_pipermail(mailing_list, start_year_month, end_year_month, mbox) + download_pipermail(mailing_list, start_year_month, end_year_month, save_folder_path) } ########## Identify the Most Recent Month ########## else { @@ -209,7 +209,7 @@ refresh_pipermail <- function(mailing_list, start_year_month, mbox, verbose = TR recent_month <- max(year_months) # Delete the most recent file before redownloading it - recent_file <- file.path(mbox, stringi::stri_c("kaiaulu_", recent_month, ".mbox")) + recent_file <- file.path(save_folder_path, stringi::stri_c("kaiaulu_", recent_month, ".mbox")) if (file.exists(recent_file)) { file.remove(recent_file) if (verbose) cat("Deleted the most recent file:", recent_file, "\n") @@ -223,12 +223,12 @@ refresh_pipermail <- function(mailing_list, start_year_month, mbox, verbose = TR if (verbose) cat("Redownloading from", recent_month, "to", end_year_month, "\n") # Call the download_pipermail function to redownload the deleted month and all subsequent months up to 
the current month - download_pipermail(mailing_list, recent_month, end_year_month, mbox) + download_pipermail(mailing_list, recent_month, end_year_month, save_folder_path) } ########## Process .gz Files After Refresh ########## # Call process_gz_to_mbox_in_folder to ensure all .gz files are converted to .mbox after the refresh if (verbose) cat("Processing .gz files in the folder (if any) to convert them to .mbox format...\n") - process_gz_to_mbox_in_folder(mbox = mbox, verbose = verbose) + process_gz_to_mbox_in_folder(save_folder_path = save_folder_path, verbose = verbose) } @@ -239,14 +239,14 @@ refresh_pipermail <- function(mailing_list, start_year_month, mbox, verbose = TR #' If a .mbox file with the same name already exists, it will be overwritten. #' This makes sure that all the files in the folder are in .mbox format, ready for parsing. #' -#' @param mbox The path to the folder containing both .gz and .mbox files. +#' @param save_folder_path The path to the folder containing both .gz and .mbox files. #' @param verbose if TRUE, prints diagnostic messages during processing. #' @return A list of the .mbox files that were created or updated. #' @export -process_gz_to_mbox_in_folder <- function(mbox, verbose = TRUE) { +process_gz_to_mbox_in_folder <- function(save_folder_path, verbose = TRUE) { # Get the list of all files in the folder, including full paths - files <- list.files(mbox, full.names = TRUE) + files <- list.files(save_folder_path, full.names = TRUE) # Identify .gz files from the list of files gz_files <- files[grepl("\\.gz$", files)] @@ -313,11 +313,11 @@ process_gz_to_mbox_in_folder <- function(mbox, verbose = TRUE) { #' (e.g., "https://lists.apache.org/list.html?announce@apache.org"). #' @param start_year_month The year and month of the first file to be downloaded (format: 'YYYYMM'). #' @param end_year_month The year and month of the last file to be downloaded (format: 'YYYYMM'). 
-#' @param mbox The folder path where all the downloaded mbox files will be stored. +#' @param save_folder_path The folder path where all the downloaded mbox files will be stored. #' @param verbose if TRUE, prints detailed messages during the download process. -#' @return Returns `mbox`, the folder path where the mbox files are stored. +#' @return Returns `save_folder_path`, the folder path where the mbox files are stored. #' @export -download_mod_mbox <- function(mailing_list, start_year_month, end_year_month, mbox, verbose = TRUE) { +download_mod_mbox <- function(mailing_list, start_year_month, end_year_month, save_folder_path, verbose = TRUE) { ########## Extract Mailing List Name ########## # Extract the mailing list name from the given URL. This is because the actual list name is @@ -357,7 +357,7 @@ download_mod_mbox <- function(mailing_list, start_year_month, end_year_month, mb # Create the file name where the mbox will be saved locally, in the format ''kaiaulu_'YYYYMM.mbox'. file_name <- stringi::stri_c("kaiaulu_", year, month_str, ".mbox") - file_path <- file.path(mbox, file_name) + file_path <- file.path(save_folder_path, file_name) if (verbose) { cat("Constructed URL:", download_url, "\n") @@ -386,7 +386,7 @@ download_mod_mbox <- function(mailing_list, start_year_month, end_year_month, mb ########## Return Save Path ########## # Return the folder path where all mbox files were saved. - return(mbox) + return(save_folder_path) } @@ -405,16 +405,16 @@ download_mod_mbox <- function(mailing_list, start_year_month, end_year_month, mb #' #' @param mailing_list The URL of the mailing list being downloaded (e.g., 'https://lists.apache.org/list.html?announce@apache.org') #' @param start_year_month The year and month of the first file to be downloaded (format: 'YYYYMM'). -#' @param mbox The folder path in which all the downloaded mod_mbox files will be stored. +#' @param save_folder_path The folder path in which all the downloaded mod_mbox files will be stored. 
#' @param verbose if TRUE, prints diagnostic messages. #' @return Returns `downloaded_files`, a vector of the newly downloaded files in the current working directory. #' @export -refresh_mod_mbox <- function(mailing_list, start_year_month, mbox, verbose = TRUE) { +refresh_mod_mbox <- function(mailing_list, start_year_month, save_folder_path, verbose = TRUE) { ########## Check if Folder is Empty ########## # Check the contents of the folder to see if any .mbox files are already present. # The function looks for files that match the naming pattern 'kaiaulu_YYYYMM.mbox' - files_in_folder <- list.files(mbox, pattern = "kaiaulu_\\d{6}\\.mbox$") + files_in_folder <- list.files(save_folder_path, pattern = "kaiaulu_\\d{6}\\.mbox$") if (length(files_in_folder) == 0) { # If the folder is empty, download all mod_mbox files starting from start_year_month @@ -423,7 +423,7 @@ refresh_mod_mbox <- function(mailing_list, start_year_month, mbox, verbose = TRU if (verbose) cat("Folder is empty. Downloading from", start_year_month, "to", end_year_month, "\n") # Call the download_mod_mbox function to download files from start_year_month to end_year_month - download_mod_mbox(mailing_list, start_year_month, end_year_month, mbox, verbose = verbose) + download_mod_mbox(mailing_list, start_year_month, end_year_month, save_folder_path, verbose = verbose) } ########## Identify the Most Recent Month ########## else { @@ -435,7 +435,7 @@ refresh_mod_mbox <- function(mailing_list, start_year_month, mbox, verbose = TRU recent_month <- max(year_months) # Delete the most recent file before redownloading it - recent_file <- file.path(mbox, stringi::stri_c("kaiaulu_", recent_month, ".mbox")) + recent_file <- file.path(save_folder_path, stringi::stri_c("kaiaulu_", recent_month, ".mbox")) if (file.exists(recent_file)) { file.remove(recent_file) if (verbose) cat("Deleted the most recent file:", recent_file, "\n") @@ -449,7 +449,7 @@ refresh_mod_mbox <- function(mailing_list, start_year_month, mbox, 
verbose = TRU if (verbose) cat("Redownloading from", recent_month, "to", end_year_month, "\n") # Call the download_mod_mbox function to redownload the deleted month and all subsequent months up to the current month - download_mod_mbox(mailing_list, recent_month, end_year_month, mbox, verbose = verbose) + download_mod_mbox(mailing_list, recent_month, end_year_month, save_folder_path, verbose = verbose) } } @@ -465,19 +465,19 @@ refresh_mod_mbox <- function(mailing_list, start_year_month, mbox, verbose = TRU #' consistently renamed for clarity. #' #' @param perceval_path path to perceval binary -#' @param mbox path to mbox archive file (ends in .mbox) +#' @param mbox_path path to mbox archive file (ends in .mbox) #' @export #' @family parsers -parse_mbox <- function(perceval_path, mbox){ +parse_mbox <- function(perceval_path, mbox_path){ # Expand paths (e.g. "~/Desktop" => "/Users/someuser/Desktop") perceval_path <- path.expand(perceval_path) - mbox <- path.expand(mbox) + mbox_path <- path.expand(mbox_path) # Remove ".mbox" - mbox_uri <- stringi::stri_replace_last_regex(mbox, pattern = "\\.mbox$", replacement = "") + mbox_uri <- stringi::stri_replace_last_regex(mbox_path, pattern = "\\.mbox$", replacement = "") # Use percerval to parse mbox. --json line is required to be parsed by jsonlite::fromJSON. perceval_output <- system2(perceval_path, - args = c('mbox',mbox_uri,mbox,'--json-line'), + args = c('mbox',mbox_uri,mbox_path,'--json-line'), stdout = TRUE, stderr = FALSE) diff --git a/conf/helix.yml b/conf/helix.yml index f2b13969..2612fcf3 100644 --- a/conf/helix.yml +++ b/conf/helix.yml @@ -53,29 +53,31 @@ mailing_list: mailing_list: https://lists.apache.org/list.html?announce@apache.org start_year_month: 202310 end_year_month: 202405 - mbox: "../../extdata/save_mbox_mail" - save_parsed_folder_path: "../../extdata/save_parsed_mail" + save_folder_path: "../../extdata/save_mbox_mail" + # mbox_path is for use only with parse_mbox() function. It is the file to parse. 
+ mbox_path: "../../extdata/save_mbox_mail/kaiaulu_202410.mbox" project_key_2: mailing_list: https://lists.apache.org/list.html?dev@felix.apache.org start_year_month: 202201 end_year_month: 202401 - mbox: "../../extdata/save_mbox_mail" - save_parsed_folder_path: "../../../extdata/save_parsed_mail" + save_folder_path: "../../extdata/save_mbox_mail" + # mbox_path is for use only with parse_mbox() function. It is the file to parse. + mbox_path: "../../extdata/save_mbox_mail/kaiaulu_202210.mbox" pipermail: project_key_1: - # archive_url: https://mta.openssl.org/mailman/listinfo/ mailing_list: https://mta.openssl.org/pipermail/openssl-users/ start_year_month: 202310 end_year_month: 202405 - mbox: "../../extdata/save_folder_mail" - save_parsed_folder_path: "../../../extdata/save_parsed_mail" + save_folder_path: "../../extdata/save_folder_mail" + # mbox_path is for use only with parse_mbox() function. It is the file to parse. + mbox_path: "../../extdata/save_mbox_mail/kaiaulu_202310.mbox" project_key_2: - # archive_url: https://mta.openssl.org/mailman/listinfo/ mailing_list: https://mta.openssl.org/pipermail/openssl-project/ start_year_month: 202203 end_year_month: 202303 - mbox: "../../extdata/save_folder_mail_2" - save_parsed_folder_path: "../../../extdata/save_parsed_mail" + save_folder_path: "../../extdata/save_folder_mail_2" + # mbox_path is for use only with parse_mbox() function. It is the file to parse. 
+ mbox_path: "../../extdata/save_mbox_mail/kaiaulu_202210.mbox" issue_tracker: jira: diff --git a/man/download_mod_mbox.Rd b/man/download_mod_mbox.Rd index 49d75910..c02cf5d8 100644 --- a/man/download_mod_mbox.Rd +++ b/man/download_mod_mbox.Rd @@ -8,7 +8,7 @@ download_mod_mbox( mailing_list, start_year_month, end_year_month, - mbox, + save_folder_path, verbose = TRUE ) } @@ -20,12 +20,12 @@ download_mod_mbox( \item{end_year_month}{The year and month of the last file to be downloaded (format: 'YYYYMM').} -\item{mbox}{The folder path where all the downloaded mbox files will be stored.} +\item{save_folder_path}{The folder path where all the downloaded mbox files will be stored.} \item{verbose}{if TRUE, prints detailed messages during the download process.} } \value{ -Returns `mbox`, the folder path where the mbox files are stored. +Returns `save_folder_path`, the folder path where the mbox files are stored. } \description{ This function downloads mod_mbox archives from a specified Apache Pony Mail mailing list as .mbox files. 
diff --git a/man/download_pipermail.Rd b/man/download_pipermail.Rd index 0b2f1d33..0aa1bc50 100644 --- a/man/download_pipermail.Rd +++ b/man/download_pipermail.Rd @@ -8,7 +8,7 @@ download_pipermail( mailing_list, start_year_month, end_year_month, - mbox, + save_folder_path, verbose = TRUE ) } @@ -19,7 +19,7 @@ download_pipermail( \item{end_year_month}{The year and month of the last file to be downloaded (format: 'YYYYMM', or use 'format(Sys.Date(), "%Y%m")' for the current month)} -\item{mbox}{The folder path in which all the downloaded pipermail files will be stored} +\item{save_folder_path}{The folder path in which all the downloaded pipermail files will be stored} \item{verbose}{if TRUE, prints diagnostic messages during the download process} } diff --git a/man/parse_mbox.Rd b/man/parse_mbox.Rd index 99bbdaeb..d4852995 100644 --- a/man/parse_mbox.Rd +++ b/man/parse_mbox.Rd @@ -4,12 +4,12 @@ \alias{parse_mbox} \title{Parse mbox from Perceval} \usage{ -parse_mbox(perceval_path, mbox) +parse_mbox(perceval_path, mbox_path) } \arguments{ \item{perceval_path}{path to perceval binary} -\item{mbox}{path to mbox archive file (ends in .mbox)} +\item{mbox_path}{path to mbox archive file (ends in .mbox)} } \description{ Parses an mbox file, which consists of emails in a mailbox, using the Perceval library. 
diff --git a/man/process_gz_to_mbox_in_folder.Rd b/man/process_gz_to_mbox_in_folder.Rd index 93536f12..681022f2 100644 --- a/man/process_gz_to_mbox_in_folder.Rd +++ b/man/process_gz_to_mbox_in_folder.Rd @@ -4,10 +4,10 @@ \alias{process_gz_to_mbox_in_folder} \title{Process .gz files in a folder and convert them to .mbox} \usage{ -process_gz_to_mbox_in_folder(mbox, verbose = TRUE) +process_gz_to_mbox_in_folder(save_folder_path, verbose = TRUE) } \arguments{ -\item{mbox}{The path to the folder containing both .gz and .mbox files.} +\item{save_folder_path}{The path to the folder containing both .gz and .mbox files.} \item{verbose}{if TRUE, prints diagnostic messages during processing.} } diff --git a/man/refresh_mod_mbox.Rd b/man/refresh_mod_mbox.Rd index e83f8e1f..43f6349a 100644 --- a/man/refresh_mod_mbox.Rd +++ b/man/refresh_mod_mbox.Rd @@ -4,14 +4,19 @@ \alias{refresh_mod_mbox} \title{Refresh mbox files downloaded via mod_mbox} \usage{ -refresh_mod_mbox(mailing_list, start_year_month, mbox, verbose = TRUE) +refresh_mod_mbox( + mailing_list, + start_year_month, + save_folder_path, + verbose = TRUE +) } \arguments{ \item{mailing_list}{The URL of the mailing list being downloaded (e.g., 'https://lists.apache.org/list.html?announce@apache.org')} \item{start_year_month}{The year and month of the first file to be downloaded (format: 'YYYYMM').} -\item{mbox}{The folder path in which all the downloaded mod_mbox files will be stored.} +\item{save_folder_path}{The folder path in which all the downloaded mod_mbox files will be stored.} \item{verbose}{if TRUE, prints diagnostic messages.} } diff --git a/man/refresh_pipermail.Rd b/man/refresh_pipermail.Rd index bb541307..dc2ce0b2 100644 --- a/man/refresh_pipermail.Rd +++ b/man/refresh_pipermail.Rd @@ -4,14 +4,19 @@ \alias{refresh_pipermail} \title{Refresh mbox files downloaded via pipermail} \usage{ -refresh_pipermail(mailing_list, start_year_month, mbox, verbose = TRUE) +refresh_pipermail( + mailing_list, + start_year_month, + 
save_folder_path, + verbose = TRUE +) } \arguments{ \item{mailing_list}{The URL of the mailing list being downloaded (e.g., "https://mta.openssl.org/pipermail/openssl-announce/")} \item{start_year_month}{The year and month of the first file to be downloaded (format: 'YYYYMM').} -\item{mbox}{The folder path in which all the downloaded pipermail files will be stored.} +\item{save_folder_path}{The folder path in which all the downloaded pipermail files will be stored.} \item{verbose}{if TRUE, prints diagnostic messages.} } diff --git a/vignettes/download_mail.Rmd b/vignettes/download_mail.Rmd index 7c2feb9f..3ea7a547 100644 --- a/vignettes/download_mail.Rmd +++ b/vignettes/download_mail.Rmd @@ -27,56 +27,76 @@ set.seed(seed) # Introduction - Mailing list data is stored in a variety of archives. See: - Mod Mbox: [Apache Geronimo](https://geronimo.apache.org/mailing-lists.html)). - Pipermail: [OpenSSL](https://mta.openssl.org/mailman/listinfo/). -This notebook demonstrates how to download and refresh mailing list archives from Mod Mbox and Pipermail. +(More information on this in the sections below.) This notebook demonstrates how to download and refresh mailing list archives from Mod Mbox and Pipermail. # Pipermail ## Mailing List Organization - Mailing lists are typically organized by topic or purpose. For example, the [OpenSSL project](https://www.openssl.org/community/mailinglists.html) maintains several mailing lists, each serving a different group: -- **openssl-announce**: For important announcements. -- **openssl-commits**: For commit messages. -- **openssl-project**: For project discussions. -- **openssl-users**: For general user questions and discussions. +- **project-announce**: For important announcements. +- **project-commits**: For commit messages. +- **project-project**: For project discussions. +- **project-users**: For general user questions and discussions. -Each mailing list maintains archives of past messages, often organized by month and year. 
These archives can be accessed and downloaded for analysis. +Each mailing list maintains archives of past messages, often organized by month and year. These archives can be accessed and downloaded for analysis. However, it is important to note that mailing list archives may be split into multiple formats or locations, and not all archives contain the same information. Different archives can differ in completeness, date ranges, and the data they contain. Some archives might lack important fields like "In-Reply-To," which is important for reconstructing message threads. It is, therefore, important the archive being used is carefully selected, since this effects the quality and completeness of analysis. ## Project Configuration File +To start, we load the project configuration file, which contains parameters for downloading the mailing list archives. Instead of hard-coding these values in the notebook, we store them in a project configuration file in YAML format. This makes the parameters easier to manage. +Here is an example of the pipermail mailing list section from the configuration file (conf/helix.yml): + +```{yaml} +# top-level key for mailing list config +mailing_list: + # for pipermail + pipermail: + project_key_1: + mailing_list: https://mta.openssl.org/pipermail/openssl-users/ + start_year_month: 202310 + end_year_month: 202405 + save_folder_path: "../../extdata/save_folder_mail" + +``` + +The configuration file contains the following parameters for each mailing list archive: + +- project_key_1: A unique key for the project. There can be multiple projects in both the pipermail and mod mbox sections. +- pipermail/ mod_mbox: Indicates whether the setting are for pipermail or mod mbox. Although the parameters are the same, this helps to differentiate between the two types of mailing list archives. +- mailing_list: The URL of the mailing list archive page. Note that this URL should point to the page containing links to the monthly archives (e.g. 
https://mta.openssl.org/pipermail/openssl-users/), not the top-level mailing list page that contains all the different types of archives (e.g. https://mta.openssl.org/mailman/listinfo/). +- start_year_month: The starting date for downloading archives (in YYYYMM format). +- end_year_month: The ending date for downloading archives (in YYYYMM format). +- save_folder_path: The local directory where the downloaded archives will be saved (if you run the code in this notebook, the archives will be saved in a folder 'extdata', located in the parent directory of kaiaulu (wherever your kaiaulu folder is kept)). -To start, we load the project configuration file, which contains parameters for downloading the mailing list archives. +By organizing the configuration in this way, you can manage multiple projects and mailing lists easily. The notebook reads these parameters and uses them to download and process the archives. + +## Pipermail Downloader +The following code reads the configuration parameters for project_key_1 of pipermail: ```{r} conf <- yaml::read_yaml("../conf/helix.yml") mailing_list <- conf[["mailing_list"]][["pipermail"]][["project_key_1"]][["mailing_list"]] start_year_month <- conf[["mailing_list"]][["pipermail"]][["project_key_1"]][["start_year_month"]] end_year_month <- conf[["mailing_list"]][["pipermail"]][["project_key_1"]][["end_year_month"]] -mbox <- conf[["mailing_list"]][["pipermail"]][["project_key_1"]][["mbox"]] +save_folder_path <- conf[["mailing_list"]][["pipermail"]][["project_key_1"]][["save_folder_path"]] ``` -### Explanation of Configuration Parameters -- mailing_list: The URL of the mailing list archive index page (e.g., https://lists.openssl.org/pipermail/openssl-users/). -- start_year_month: The starting date for downloading archives (in YYYYMM format). -- end_year_month: The ending date for downloading archives (in YYYYMM format). -- mbox: The local directory where the downloaded archives will be saved. 
+After setting the configurations above, you can download the archives using the download_pipermail() function, which downloads and saves .mbox files to the specified directory (save_folder_path). The .mbox files are named with the format kaiaulu_YYYYMM.mbox, where YYYYMM refers to the year and month of the archive. -## Pipermail Downloader -You can download the archives using the download_pipermail() function, which downloads and saves .mbox files to the specified directory. The .mbox files are named with the format kaiaulu_YYYYMM.mbox, where YYYYMM refers to the year and month of the archive. ```{r} # Download archives download_pipermail( mailing_list = mailing_list, start_year_month = start_year_month, end_year_month = end_year_month, - mbox = mbox, + save_folder_path = save_folder_path, verbose = TRUE ) ``` + After running this function, the .mbox files will be saved in the specified directory with filenames like kaiaulu_202310.mbox, kaiaulu_202311.mbox, etc. ## Pipermail Refresher @@ -86,50 +106,73 @@ How refresh_pipermail Works 1. Checks if the folder is empty: If the folder is empty, it downloads archives starting from start_year_month to the current month using download_pipermail(). 2. Finds the most recent file: If the folder is not empty, the function checks for the most recent month’s file (based on the filename) and deletes it. 3. Redownloads from the most recent month: The function then redownloads the archive from the most recent month up to the current month. + ```{r} # Refresh archives refresh_pipermail( mailing_list = mailing_list, start_year_month = start_year_month, - mbox = mbox, + save_folder_path = save_folder_path, verbose = TRUE ) ``` -This function will ensure that the most recent archives are always up-to-date by redownloading the current month's archive if necessary and adding any new months that have been added to the mailing list. 
+ +This function will ensure that the most recent archives are always up-to-date by redownloading the current month's archive and adding any new months that have been added to the mailing list. # Mod Mbox ## Mailing List Organization Mod Mbox archives also organize mailing lists by topic. The apache mailing list archives can be found at https://lists.apache.org/. +Just like with Pipermail, mailing list archives in Mod Mbox can be split across different formats or locations, and vary in completeness and available metadata. It is important to select the appropriate archive that is compatible with Kaiaulu and suits your analysis needs. + ## Project Configuration File -Similar to Pipermail, we load the configuration for Mod Mbox from the YAML file, which includes the mailing list URL, the date range, and the save folder path. +Like in Pipermail, we load the configuration for Mod Mbox from the YAML file, which includes the mailing list URL, the date range, and the save folder path. + +Here's an example of the relevant section in the configuration file (conf/helix.yml): +```{yaml} +# top-level key for mailing list config +mailing_list: + # for mod mbox + mod_mbox: + project_key_1: + mailing_list: https://lists.apache.org/list.html?announce@apache.org + start_year_month: 202310 + end_year_month: 202405 + save_folder_path: "../../extdata/save_mbox_mail" + +``` + +The configuration parameters are the same as the ones explained in the section at the top of this notebook, except that the mailing_list should point to a Mod Mbox mailing list URL. 
+ +The following code reads the configuration parameters: ```{r} -mod_mbox_list <- conf[["mailing_list"]][["mod_mbox"]][["project_key_1"]][["mailing_list"]] -mod_start_year_month <- conf[["mailing_list"]][["mod_mbox"]][["project_key_1"]][["start_year_month"]] -mod_end_year_month <- conf[["mailing_list"]][["mod_mbox"]][["project_key_1"]][["end_year_month"]] -mod_mbox <- conf[["mailing_list"]][["mod_mbox"]][["project_key_1"]][["mbox"]] +conf <- yaml::read_yaml("../conf/helix.yml") +mailing_list <- conf[["mailing_list"]][["mod_mbox"]][["project_key_1"]][["mailing_list"]] +start_year_month <- conf[["mailing_list"]][["mod_mbox"]][["project_key_1"]][["start_year_month"]] +end_year_month <- conf[["mailing_list"]][["mod_mbox"]][["project_key_1"]][["end_year_month"]] +save_folder_path <- conf[["mailing_list"]][["mod_mbox"]][["project_key_1"]][["save_folder_path"]] ``` -### Explanation of Configuration Parameters - mailing_list: The URL of the Mod Mbox mailing list (e.g., https://lists.apache.org/list.html?announce@apache.org). - start_year_month: The first month to download (format: YYYYMM). - end_year_month: The last month to download (format: YYYYMM). -- mbox: The directory where the downloaded .mbox files will be saved. +- save_folder_path: The directory where the downloaded .mbox files will be saved. ## Mod Mbox Downloader The download_mod_mbox() function downloads Mod Mbox archives by constructing URLs based on the mailing list and date range, saving them as .mbox files named kaiaulu_YYYYMM.mbox. 
```{r} download_mod_mbox( - mailing_list = mod_mbox_list, - start_year_month = mod_start_year_month, - end_year_month = mod_end_year_month, - mbox = mod_mbox, + mailing_list = mailing_list, + start_year_month = start_year_month, + end_year_month = end_year_month, + save_folder_path = save_folder_path, verbose = TRUE ) + ``` After running the function, it constructs URLs like: https://lists.apache.org/api/mbox.lua?list=announce@apache.org&date=2024-01 @@ -144,9 +187,9 @@ How refresh_mod_mbox Works ```{r} refresh_mod_mbox( - mailing_list = mod_mbox_list, - start_year_month = mod_start_year_month, - mbox = mod_mbox, + mailing_list = mailing_list, + start_year_month = start_year_month, + save_folder_path= save_folder_path, verbose = TRUE ) ``` @@ -162,18 +205,35 @@ The parse_mbox() function takes an .mbox file and parses it into a structured da For the configuration, make sure you have the correct path to the Perceval library in the conf file. +Here's an example of the relevant section in the tools.yml file: +```{yaml} +perceval: /usr/local/bin/perceval +``` +And in the helix.yml configuration file: +```{yaml} +mailing_list: + # for mod mbox + mod_mbox: + project_key_1: + mbox_path: "../../extdata/save_mbox_mail.kaiaulu_202310.mbox" +``` +perceval: found in tools.yml, this should be set to your local path to the perceval binary (use > which perceval to locate the path). +mbox_path: should point to the saved .mbox file that will be parsed. See the mbox_path in the mailing_list sections of helix.yml. 
+ +Load the configuration: ```{r} tools_config <- yaml::read_yaml("../tools.yml") parse_perceval_path <- tools_config[["perceval"]] conf <- yaml::read_yaml("../conf/helix.yml") -parse_mbox <- conf[["mailing_list"]][["mod_mbox"]][["project_key_1"]][["mbox"]] +mailing_list <- conf[["mailing_list"]][["mod_mbox"]][["project_key_1"]][["mbox_path"]] + ``` -Run the function using this: +Run the parser: ```{r} parsed_mail <- parse_mbox( perceval_path = parse_perceval_path, - mbox = parse_mbox + mbox_path = mbox_path ) ``` This will store the parsed data into the parsed_mail variable. To view the table, use: diff --git a/vignettes/download_mod_mbox.Rmd b/vignettes/download_mod_mbox.Rmd deleted file mode 100644 index 48ba38c4..00000000 --- a/vignettes/download_mod_mbox.Rmd +++ /dev/null @@ -1,102 +0,0 @@ ---- -title: "Download Mod Mbox Mailing List Archives" -output: - html_document: - toc: true - number_sections: true -vignette: > - %\VignetteEngine{knitr::rmarkdown} - %\VignetteIndexEntry{Download Mod Mbox Mailing List Archives} - %\VignetteEncoding{UTF-8} ---- - - -```{r} -rm(list = ls()) -seed <- 1 -set.seed(seed) -``` - -```{r warning=FALSE,message=FALSE} -require(kaiaulu) -require(data.table) -``` - -# Introduction - -Mailing list data is stored in a variety of archives (e.g. see [Apache Geronimo](https://geronimo.apache.org/mailing-lists.html)). This notebook showcases how to obtain data from mod_mbox archives, which is adopted by the Apache Software Foundation. - -## Project Configuration File - -As usual, the first step is to load the project configuration file. 
- -# Project Configuration File - -```{r} -conf <- yaml::read_yaml("../conf/helix.yml") -save_path_mbox <- conf[["mailing_list"]][["mod_mbox"]][["mail_key_1"]][["mbox"]] -mod_mbox_url <- conf[["mailing_list"]][["mod_mbox"]][["mail_key_1"]][["archive_url"]] -mailing_list <- conf[["mailing_list"]][["mod_mbox"]][["mail_key_1"]][["mailing_list"]] -archive_type <- conf[["mailing_list"]][["mod_mbox"]][["mail_key_1"]][["archive_type"]] -start_year <- 2024 -end_year <- 2018 - -conf2 <- yaml::read_yaml("../conf/openssl.yml") -save_path_pipermail <- conf2[["mailing_list"]][["pipermail"]][["mail_key_1"]][["pipermail"]] -pipermail_url <- conf2[["mailing_list"]][["pipermail"]][["mail_key_1"]][["archive_url"]] -mailing_list2 <- conf2[["mailing_list"]][["pipermail"]][["mail_key_1"]][["mailing_list"]] -archive_type2 <- conf2[["mailing_list"]][["pipermail"]][["mail_key_1"]][["archive_type"]] - -perceval_path <- yaml::read_yaml("../tools.yml")[["perceval"]] -``` - -# Mod Mbox Downloader - -```{r eval = FALSE} -mbox <- download_mod_mbox_per_month(archive_url = mod_mbox_url, - mailing_list = mailing_list, - archive_type = archive_type, - from_year=start_year, - to_year=end_year, - save_folder_path = save_path_mbox, - verbose = TRUE) -``` - -# Refresh Mod Mbox - -```{r eval = FALSE} -mbox_latest <- parse_mbox_latest_date(save_path_mbox) -refresh_mod_mbox(archive_url = mod_mbox_url, - mailing_list = mailing_list, - archive_type = archive_type, - from_year = 2024, - save_folder_path = save_path_mbox, - verbose = TRUE) -``` - -# Pipermail Downloader - -```{r eval = FALSE} -download_pipermail(archive_url = pipermail_url, - mailing_list = mailing_list2, - archive_type = archive_type2, - save_folder_path = save_path_pipermail) -``` - -# Pipermail Refresher - -```{r eval = FALSE} -mbox_latest <- parse_mbox_latest_date(save_path_pipermail) -refresh_pipermail(archive_url = pipermail_url, - mailing_list=mailing_list2, - archive_type=archive_type2, - save_folder_path=save_path_pipermail, - 
verbose=TRUE) -``` - -# Parse Mbox - -```{r eval = FALSE} -parse_mbox(perceval_path, save_path_mbox) -parse_mbox(perceval_path, save_path_pipermail) -``` diff --git a/vignettes/parallelized_parse_mbox.Rmd b/vignettes/parallelized_parse_mbox.Rmd deleted file mode 100644 index 979e0f14..00000000 --- a/vignettes/parallelized_parse_mbox.Rmd +++ /dev/null @@ -1,51 +0,0 @@ ---- -title: "Parallel Parsing of Mbox Files Using Python and R" -output: - html_document: - toc: true - number_sections: true -vignette: > - %\VignetteEngine{knitr::rmarkdown} - %\VignetteIndexEntry{Parallel Parsing of Mbox Files Using Python and R} - %\VignetteEncoding{UTF-8} ---- - -rm(list = ls()) -seed <- 1 -set.seed(seed) - -# Load required libraries -require(kaiaulu) -require(data.table) -require(yaml) -require(cli) - -# Introduction -This notebook demonstrates how to efficiently parse .mbox files using parallel processing. Python's ThreadPoolExecutor handles the parallel parsing of multiple .mbox files using an R script (parsembox.R) through a Python wrapper. This allows concurrent processing of mailing list archives, which is useful when dealing with large datasets. - -# Prerequisites -1. A collection of .mbox files to parse (see download_mail.Rmd for help with this) -2. Perceval - -## Project Configuration File - -```{r} -tools_config <- yaml::read_yaml("../tools.yml") -perceval_path <- tools_config[["perceval"]] - -conf <- yaml::read_yaml("../conf/helix.yml") -save_folder_path <- conf[["mailing_list"]][["pipermail"]][["project_key_1"]][["save_folder_path"]] -``` - -## Python Thread Manager -We will use the parse_mbox.py Python script to execute the parsing of multiple .mbox files in parallel. The Python script uses the ThreadPoolExecutor to run the R script (parsembox.R) concurrently on each .mbox file. - -Here is how it works: - -1. Python's ThreadPoolExecutor is used to handle parallel processing. -2. Each thread calls the R script for one .mbox file. -3. 
The R script processes the file and saves the output as a CSV file. - -```{bash} -python3 ../inst/python/parse_mbox.py -``` From 2a1ba984af7f03a717acabbf9db0a81c5c3d366e Mon Sep 17 00:00:00 2001 From: daomcgill <77309217+daomcgill@users.noreply.github.com> Date: Thu, 3 Oct 2024 09:33:46 -1000 Subject: [PATCH 20/80] Revert "i #284 Testing Github Actions" This reverts commit f0027dcd266628a399258cf57d4ba42fbac79eb8. --- DESCRIPTION | 2 +- man/commit_message_id_coverage.Rd | 2 +- man/download_jira_issues_by_date.Rd | 4 ++-- man/download_jira_issues_by_issue_key.Rd | 4 ++-- man/download_mod_mbox.Rd | 2 +- man/metric_churn_per_commit_interval.Rd | 2 +- man/metric_churn_per_commit_per_file.Rd | 2 +- man/metric_file_bug_churn.Rd | 2 +- man/metric_file_bug_frequency.Rd | 2 +- man/metric_file_churn.Rd | 2 +- man/metric_file_non_bug_churn.Rd | 2 +- man/metric_file_non_bug_frequency.Rd | 2 +- man/motif_factory_anti_square.Rd | 4 ++-- man/motif_factory_anti_triangle.Rd | 4 ++-- man/motif_factory_square.Rd | 4 ++-- man/motif_factory_triangle.Rd | 4 ++-- man/parse_bugzilla_perceval_rest_issue_comments.Rd | 6 +++--- man/parse_bugzilla_perceval_traditional_issue_comments.Rd | 6 +++--- man/parse_bugzilla_rest_comments.Rd | 6 +++--- man/parse_bugzilla_rest_issues.Rd | 4 ++-- man/parse_bugzilla_rest_issues_comments.Rd | 4 ++-- man/parse_commit_message_id.Rd | 6 +++--- man/parse_dependencies.Rd | 6 +++--- man/parse_dv8_clusters.Rd | 6 +++--- man/parse_gitlog.Rd | 6 +++--- man/parse_jira.Rd | 4 ++-- man/parse_jira_latest_date.Rd | 6 +++--- man/parse_jira_rss_xml.Rd | 6 +++--- man/parse_mbox.Rd | 4 ++-- man/parse_mbox_latest_date.Rd | 4 ++-- man/parse_nvdfeed.Rd | 8 ++++---- man/refresh_jira_issues.Rd | 8 ++++---- 32 files changed, 67 insertions(+), 67 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 5a793074..ae28c702 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -50,4 +50,4 @@ Imports: VignetteBuilder: knitr URL: https://github.com/sailuh/kaiaulu BugReports: 
https://github.com/sailuh/kaiaulu/issues -RoxygenNote: 7.3.2 +RoxygenNote: 7.2.3 diff --git a/man/commit_message_id_coverage.Rd b/man/commit_message_id_coverage.Rd index e7f0c6ef..68fad761 100644 --- a/man/commit_message_id_coverage.Rd +++ b/man/commit_message_id_coverage.Rd @@ -22,9 +22,9 @@ Calculates the number of commits from the git log which contains the message id. \code{\link{parse_gitlog}} to obtain additions and deletions from gitlog Other {metrics}: -\code{\link{metric_churn}()}, \code{\link{metric_churn_per_commit_interval}()}, \code{\link{metric_churn_per_commit_per_file}()}, +\code{\link{metric_churn}()}, \code{\link{metric_file_bug_churn}()}, \code{\link{metric_file_bug_frequency}()}, \code{\link{metric_file_churn}()}, diff --git a/man/download_jira_issues_by_date.Rd b/man/download_jira_issues_by_date.Rd index ccb3c7c2..697fdb48 100644 --- a/man/download_jira_issues_by_date.Rd +++ b/man/download_jira_issues_by_date.Rd @@ -72,13 +72,13 @@ For further details on the `created` JQL Query see [the associated JIRA API docu \code{\link{refresh_jira_issues}} to obtain more recent data from any of the downloader functions Other jira: -\code{\link{download_jira_issues}()}, \code{\link{download_jira_issues_by_issue_key}()}, +\code{\link{download_jira_issues}()}, \code{\link{refresh_jira_issues}()} Other downloaders: -\code{\link{download_jira_issues}()}, \code{\link{download_jira_issues_by_issue_key}()}, +\code{\link{download_jira_issues}()}, \code{\link{refresh_jira_issues}()} } \concept{downloaders} diff --git a/man/download_jira_issues_by_issue_key.Rd b/man/download_jira_issues_by_issue_key.Rd index 8213ee17..b452878f 100644 --- a/man/download_jira_issues_by_issue_key.Rd +++ b/man/download_jira_issues_by_issue_key.Rd @@ -67,13 +67,13 @@ For further details on the `issueKey` JQL Query see [the associated JIRA API doc \code{\link{refresh_jira_issues}} to obtain more recent data from any of the downloader functions Other jira: 
-\code{\link{download_jira_issues}()}, \code{\link{download_jira_issues_by_date}()}, +\code{\link{download_jira_issues}()}, \code{\link{refresh_jira_issues}()} Other downloaders: -\code{\link{download_jira_issues}()}, \code{\link{download_jira_issues_by_date}()}, +\code{\link{download_jira_issues}()}, \code{\link{refresh_jira_issues}()} } \concept{downloaders} diff --git a/man/download_mod_mbox.Rd b/man/download_mod_mbox.Rd index c02cf5d8..c628be38 100644 --- a/man/download_mod_mbox.Rd +++ b/man/download_mod_mbox.Rd @@ -9,7 +9,7 @@ download_mod_mbox( start_year_month, end_year_month, save_folder_path, - verbose = TRUE + verbose = FALSE ) } \arguments{ diff --git a/man/metric_churn_per_commit_interval.Rd b/man/metric_churn_per_commit_interval.Rd index 6969492d..21f5e494 100644 --- a/man/metric_churn_per_commit_interval.Rd +++ b/man/metric_churn_per_commit_interval.Rd @@ -20,8 +20,8 @@ Calculates the churn metric for a sequence of commits Other {metrics}: \code{\link{commit_message_id_coverage}()}, -\code{\link{metric_churn}()}, \code{\link{metric_churn_per_commit_per_file}()}, +\code{\link{metric_churn}()}, \code{\link{metric_file_bug_churn}()}, \code{\link{metric_file_bug_frequency}()}, \code{\link{metric_file_churn}()}, diff --git a/man/metric_churn_per_commit_per_file.Rd b/man/metric_churn_per_commit_per_file.Rd index 577d3f63..75b48c85 100644 --- a/man/metric_churn_per_commit_per_file.Rd +++ b/man/metric_churn_per_commit_per_file.Rd @@ -20,8 +20,8 @@ Calculates the churn metric for a sequence of commits per commit per file Other {metrics}: \code{\link{commit_message_id_coverage}()}, -\code{\link{metric_churn}()}, \code{\link{metric_churn_per_commit_interval}()}, +\code{\link{metric_churn}()}, \code{\link{metric_file_bug_churn}()}, \code{\link{metric_file_bug_frequency}()}, \code{\link{metric_file_churn}()}, diff --git a/man/metric_file_bug_churn.Rd b/man/metric_file_bug_churn.Rd index 7bea610e..29bef17d 100644 --- a/man/metric_file_bug_churn.Rd +++ 
b/man/metric_file_bug_churn.Rd @@ -20,9 +20,9 @@ The total churn sum of commits of all closed bug type issues the file was involv \seealso{ Other {metrics}: \code{\link{commit_message_id_coverage}()}, -\code{\link{metric_churn}()}, \code{\link{metric_churn_per_commit_interval}()}, \code{\link{metric_churn_per_commit_per_file}()}, +\code{\link{metric_churn}()}, \code{\link{metric_file_bug_frequency}()}, \code{\link{metric_file_churn}()}, \code{\link{metric_file_non_bug_churn}()}, diff --git a/man/metric_file_bug_frequency.Rd b/man/metric_file_bug_frequency.Rd index f978666e..607aef62 100644 --- a/man/metric_file_bug_frequency.Rd +++ b/man/metric_file_bug_frequency.Rd @@ -20,9 +20,9 @@ The total number of commits of all closed bug type issues the file was involved. \seealso{ Other {metrics}: \code{\link{commit_message_id_coverage}()}, -\code{\link{metric_churn}()}, \code{\link{metric_churn_per_commit_interval}()}, \code{\link{metric_churn_per_commit_per_file}()}, +\code{\link{metric_churn}()}, \code{\link{metric_file_bug_churn}()}, \code{\link{metric_file_churn}()}, \code{\link{metric_file_non_bug_churn}()}, diff --git a/man/metric_file_churn.Rd b/man/metric_file_churn.Rd index 67049ea5..3e2babd4 100644 --- a/man/metric_file_churn.Rd +++ b/man/metric_file_churn.Rd @@ -18,9 +18,9 @@ The total churn of a file \seealso{ Other {metrics}: \code{\link{commit_message_id_coverage}()}, -\code{\link{metric_churn}()}, \code{\link{metric_churn_per_commit_interval}()}, \code{\link{metric_churn_per_commit_per_file}()}, +\code{\link{metric_churn}()}, \code{\link{metric_file_bug_churn}()}, \code{\link{metric_file_bug_frequency}()}, \code{\link{metric_file_non_bug_churn}()}, diff --git a/man/metric_file_non_bug_churn.Rd b/man/metric_file_non_bug_churn.Rd index 049b9cd8..bf35bb1c 100644 --- a/man/metric_file_non_bug_churn.Rd +++ b/man/metric_file_non_bug_churn.Rd @@ -20,9 +20,9 @@ The total churn sum of commits of all closed non-bug type issues the file was in \seealso{ Other 
{metrics}: \code{\link{commit_message_id_coverage}()}, -\code{\link{metric_churn}()}, \code{\link{metric_churn_per_commit_interval}()}, \code{\link{metric_churn_per_commit_per_file}()}, +\code{\link{metric_churn}()}, \code{\link{metric_file_bug_churn}()}, \code{\link{metric_file_bug_frequency}()}, \code{\link{metric_file_churn}()}, diff --git a/man/metric_file_non_bug_frequency.Rd b/man/metric_file_non_bug_frequency.Rd index 9516ce61..da87d00a 100644 --- a/man/metric_file_non_bug_frequency.Rd +++ b/man/metric_file_non_bug_frequency.Rd @@ -20,9 +20,9 @@ The total number of commits of all closed non-bug type issues the file was invol \seealso{ Other {metrics}: \code{\link{commit_message_id_coverage}()}, -\code{\link{metric_churn}()}, \code{\link{metric_churn_per_commit_interval}()}, \code{\link{metric_churn_per_commit_per_file}()}, +\code{\link{metric_churn}()}, \code{\link{metric_file_bug_churn}()}, \code{\link{metric_file_bug_frequency}()}, \code{\link{metric_file_churn}()}, diff --git a/man/motif_factory_anti_square.Rd b/man/motif_factory_anti_square.Rd index 8850ff38..06cefd8f 100644 --- a/man/motif_factory_anti_square.Rd +++ b/man/motif_factory_anti_square.Rd @@ -26,9 +26,9 @@ in IEEE Transactions on Software Engineering, vol. 48, no. 8, pp. 3159-3184, } \seealso{ Other motif: -\code{\link{motif_factory}()}, \code{\link{motif_factory_anti_triangle}()}, \code{\link{motif_factory_square}()}, -\code{\link{motif_factory_triangle}()} +\code{\link{motif_factory_triangle}()}, +\code{\link{motif_factory}()} } \concept{motif} diff --git a/man/motif_factory_anti_triangle.Rd b/man/motif_factory_anti_triangle.Rd index 349cce19..b5a789a8 100644 --- a/man/motif_factory_anti_triangle.Rd +++ b/man/motif_factory_anti_triangle.Rd @@ -22,9 +22,9 @@ in IEEE Transactions on Software Engineering, vol. 48, no. 8, pp. 
3159-3184, } \seealso{ Other motif: -\code{\link{motif_factory}()}, \code{\link{motif_factory_anti_square}()}, \code{\link{motif_factory_square}()}, -\code{\link{motif_factory_triangle}()} +\code{\link{motif_factory_triangle}()}, +\code{\link{motif_factory}()} } \concept{motif} diff --git a/man/motif_factory_square.Rd b/man/motif_factory_square.Rd index 1c94e2a3..74101dfd 100644 --- a/man/motif_factory_square.Rd +++ b/man/motif_factory_square.Rd @@ -26,9 +26,9 @@ in IEEE Transactions on Software Engineering, vol. 48, no. 8, pp. 3159-3184, } \seealso{ Other motif: -\code{\link{motif_factory}()}, \code{\link{motif_factory_anti_square}()}, \code{\link{motif_factory_anti_triangle}()}, -\code{\link{motif_factory_triangle}()} +\code{\link{motif_factory_triangle}()}, +\code{\link{motif_factory}()} } \concept{motif} diff --git a/man/motif_factory_triangle.Rd b/man/motif_factory_triangle.Rd index 0a99faa8..61d81313 100644 --- a/man/motif_factory_triangle.Rd +++ b/man/motif_factory_triangle.Rd @@ -22,9 +22,9 @@ in IEEE Transactions on Software Engineering, vol. 48, no. 8, pp. 
3159-3184, } \seealso{ Other motif: -\code{\link{motif_factory}()}, \code{\link{motif_factory_anti_square}()}, \code{\link{motif_factory_anti_triangle}()}, -\code{\link{motif_factory_square}()} +\code{\link{motif_factory_square}()}, +\code{\link{motif_factory}()} } \concept{motif} diff --git a/man/parse_bugzilla_perceval_rest_issue_comments.Rd b/man/parse_bugzilla_perceval_rest_issue_comments.Rd index 87d29f0c..610eeb6f 100644 --- a/man/parse_bugzilla_perceval_rest_issue_comments.Rd +++ b/man/parse_bugzilla_perceval_rest_issue_comments.Rd @@ -26,17 +26,17 @@ Parse Bugzilla data obtained from Perceval REST API Bugzilla backend Other parsers: \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, \code{\link{parse_bugzilla_rest_comments}()}, -\code{\link{parse_bugzilla_rest_issues}()}, \code{\link{parse_bugzilla_rest_issues_comments}()}, +\code{\link{parse_bugzilla_rest_issues}()}, \code{\link{parse_commit_message_id}()}, \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, \code{\link{parse_gitlog}()}, -\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, -\code{\link{parse_mbox}()}, +\code{\link{parse_jira}()}, \code{\link{parse_mbox_latest_date}()}, +\code{\link{parse_mbox}()}, \code{\link{parse_nvdfeed}()} } \concept{parsers} diff --git a/man/parse_bugzilla_perceval_traditional_issue_comments.Rd b/man/parse_bugzilla_perceval_traditional_issue_comments.Rd index 0cfacfd0..f6f3b7f2 100644 --- a/man/parse_bugzilla_perceval_traditional_issue_comments.Rd +++ b/man/parse_bugzilla_perceval_traditional_issue_comments.Rd @@ -26,17 +26,17 @@ Parse Bugzilla data obtained from Perceval traditional Bugzilla backend Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_rest_comments}()}, -\code{\link{parse_bugzilla_rest_issues}()}, \code{\link{parse_bugzilla_rest_issues_comments}()}, +\code{\link{parse_bugzilla_rest_issues}()}, 
\code{\link{parse_commit_message_id}()}, \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, \code{\link{parse_gitlog}()}, -\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, -\code{\link{parse_mbox}()}, +\code{\link{parse_jira}()}, \code{\link{parse_mbox_latest_date}()}, +\code{\link{parse_mbox}()}, \code{\link{parse_nvdfeed}()} } \concept{parsers} diff --git a/man/parse_bugzilla_rest_comments.Rd b/man/parse_bugzilla_rest_comments.Rd index b12be91b..57999ca2 100644 --- a/man/parse_bugzilla_rest_comments.Rd +++ b/man/parse_bugzilla_rest_comments.Rd @@ -19,17 +19,17 @@ Parse Bugzilla comments data obtained from json files from Bugzilla crawler \cod Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, -\code{\link{parse_bugzilla_rest_issues}()}, \code{\link{parse_bugzilla_rest_issues_comments}()}, +\code{\link{parse_bugzilla_rest_issues}()}, \code{\link{parse_commit_message_id}()}, \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, \code{\link{parse_gitlog}()}, -\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, -\code{\link{parse_mbox}()}, +\code{\link{parse_jira}()}, \code{\link{parse_mbox_latest_date}()}, +\code{\link{parse_mbox}()}, \code{\link{parse_nvdfeed}()} } \concept{parsers} diff --git a/man/parse_bugzilla_rest_issues.Rd b/man/parse_bugzilla_rest_issues.Rd index bdd8bdde..da912e4b 100644 --- a/man/parse_bugzilla_rest_issues.Rd +++ b/man/parse_bugzilla_rest_issues.Rd @@ -27,11 +27,11 @@ Other parsers: \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, \code{\link{parse_gitlog}()}, -\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, -\code{\link{parse_mbox}()}, +\code{\link{parse_jira}()}, \code{\link{parse_mbox_latest_date}()}, 
+\code{\link{parse_mbox}()}, \code{\link{parse_nvdfeed}()} } \concept{parsers} diff --git a/man/parse_bugzilla_rest_issues_comments.Rd b/man/parse_bugzilla_rest_issues_comments.Rd index 05da2855..b884739f 100644 --- a/man/parse_bugzilla_rest_issues_comments.Rd +++ b/man/parse_bugzilla_rest_issues_comments.Rd @@ -29,11 +29,11 @@ Other parsers: \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, \code{\link{parse_gitlog}()}, -\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, -\code{\link{parse_mbox}()}, +\code{\link{parse_jira}()}, \code{\link{parse_mbox_latest_date}()}, +\code{\link{parse_mbox}()}, \code{\link{parse_nvdfeed}()} } \concept{parsers} diff --git a/man/parse_commit_message_id.Rd b/man/parse_commit_message_id.Rd index e090ef19..13d9e542 100644 --- a/man/parse_commit_message_id.Rd +++ b/man/parse_commit_message_id.Rd @@ -19,16 +19,16 @@ Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, \code{\link{parse_bugzilla_rest_comments}()}, -\code{\link{parse_bugzilla_rest_issues}()}, \code{\link{parse_bugzilla_rest_issues_comments}()}, +\code{\link{parse_bugzilla_rest_issues}()}, \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, \code{\link{parse_gitlog}()}, -\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, -\code{\link{parse_mbox}()}, +\code{\link{parse_jira}()}, \code{\link{parse_mbox_latest_date}()}, +\code{\link{parse_mbox}()}, \code{\link{parse_nvdfeed}()} } \concept{parsers} diff --git a/man/parse_dependencies.Rd b/man/parse_dependencies.Rd index e4c58051..a7136742 100644 --- a/man/parse_dependencies.Rd +++ b/man/parse_dependencies.Rd @@ -28,16 +28,16 @@ Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, 
\code{\link{parse_bugzilla_rest_comments}()}, -\code{\link{parse_bugzilla_rest_issues}()}, \code{\link{parse_bugzilla_rest_issues_comments}()}, +\code{\link{parse_bugzilla_rest_issues}()}, \code{\link{parse_commit_message_id}()}, \code{\link{parse_dv8_clusters}()}, \code{\link{parse_gitlog}()}, -\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, -\code{\link{parse_mbox}()}, +\code{\link{parse_jira}()}, \code{\link{parse_mbox_latest_date}()}, +\code{\link{parse_mbox}()}, \code{\link{parse_nvdfeed}()} } \concept{parsers} diff --git a/man/parse_dv8_clusters.Rd b/man/parse_dv8_clusters.Rd index b4dc6249..987936bf 100644 --- a/man/parse_dv8_clusters.Rd +++ b/man/parse_dv8_clusters.Rd @@ -17,16 +17,16 @@ Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, \code{\link{parse_bugzilla_rest_comments}()}, -\code{\link{parse_bugzilla_rest_issues}()}, \code{\link{parse_bugzilla_rest_issues_comments}()}, +\code{\link{parse_bugzilla_rest_issues}()}, \code{\link{parse_commit_message_id}()}, \code{\link{parse_dependencies}()}, \code{\link{parse_gitlog}()}, -\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, -\code{\link{parse_mbox}()}, +\code{\link{parse_jira}()}, \code{\link{parse_mbox_latest_date}()}, +\code{\link{parse_mbox}()}, \code{\link{parse_nvdfeed}()} Other dv8: diff --git a/man/parse_gitlog.Rd b/man/parse_gitlog.Rd index 7d65786f..d4370808 100644 --- a/man/parse_gitlog.Rd +++ b/man/parse_gitlog.Rd @@ -23,16 +23,16 @@ Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, \code{\link{parse_bugzilla_rest_comments}()}, -\code{\link{parse_bugzilla_rest_issues}()}, \code{\link{parse_bugzilla_rest_issues_comments}()}, +\code{\link{parse_bugzilla_rest_issues}()}, 
\code{\link{parse_commit_message_id}()}, \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, -\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, -\code{\link{parse_mbox}()}, +\code{\link{parse_jira}()}, \code{\link{parse_mbox_latest_date}()}, +\code{\link{parse_mbox}()}, \code{\link{parse_nvdfeed}()} } \concept{parsers} diff --git a/man/parse_jira.Rd b/man/parse_jira.Rd index 0db0e226..c3e8fe9a 100644 --- a/man/parse_jira.Rd +++ b/man/parse_jira.Rd @@ -33,16 +33,16 @@ Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, \code{\link{parse_bugzilla_rest_comments}()}, -\code{\link{parse_bugzilla_rest_issues}()}, \code{\link{parse_bugzilla_rest_issues_comments}()}, +\code{\link{parse_bugzilla_rest_issues}()}, \code{\link{parse_commit_message_id}()}, \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, \code{\link{parse_gitlog}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, -\code{\link{parse_mbox}()}, \code{\link{parse_mbox_latest_date}()}, +\code{\link{parse_mbox}()}, \code{\link{parse_nvdfeed}()} } \concept{parsers} diff --git a/man/parse_jira_latest_date.Rd b/man/parse_jira_latest_date.Rd index e2a730b5..d05f3b82 100644 --- a/man/parse_jira_latest_date.Rd +++ b/man/parse_jira_latest_date.Rd @@ -25,16 +25,16 @@ Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, \code{\link{parse_bugzilla_rest_comments}()}, -\code{\link{parse_bugzilla_rest_issues}()}, \code{\link{parse_bugzilla_rest_issues_comments}()}, +\code{\link{parse_bugzilla_rest_issues}()}, \code{\link{parse_commit_message_id}()}, \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, \code{\link{parse_gitlog}()}, -\code{\link{parse_jira}()}, \code{\link{parse_jira_rss_xml}()}, 
-\code{\link{parse_mbox}()}, +\code{\link{parse_jira}()}, \code{\link{parse_mbox_latest_date}()}, +\code{\link{parse_mbox}()}, \code{\link{parse_nvdfeed}()} } \concept{parsers} diff --git a/man/parse_jira_rss_xml.Rd b/man/parse_jira_rss_xml.Rd index 1c0abecb..17b88ff5 100644 --- a/man/parse_jira_rss_xml.Rd +++ b/man/parse_jira_rss_xml.Rd @@ -28,16 +28,16 @@ Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, \code{\link{parse_bugzilla_rest_comments}()}, -\code{\link{parse_bugzilla_rest_issues}()}, \code{\link{parse_bugzilla_rest_issues_comments}()}, +\code{\link{parse_bugzilla_rest_issues}()}, \code{\link{parse_commit_message_id}()}, \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, \code{\link{parse_gitlog}()}, -\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, -\code{\link{parse_mbox}()}, +\code{\link{parse_jira}()}, \code{\link{parse_mbox_latest_date}()}, +\code{\link{parse_mbox}()}, \code{\link{parse_nvdfeed}()} } \concept{parsers} diff --git a/man/parse_mbox.Rd b/man/parse_mbox.Rd index d4852995..fd578695 100644 --- a/man/parse_mbox.Rd +++ b/man/parse_mbox.Rd @@ -23,15 +23,15 @@ Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, \code{\link{parse_bugzilla_rest_comments}()}, -\code{\link{parse_bugzilla_rest_issues}()}, \code{\link{parse_bugzilla_rest_issues_comments}()}, +\code{\link{parse_bugzilla_rest_issues}()}, \code{\link{parse_commit_message_id}()}, \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, \code{\link{parse_gitlog}()}, -\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, +\code{\link{parse_jira}()}, \code{\link{parse_mbox_latest_date}()}, \code{\link{parse_nvdfeed}()} } diff --git a/man/parse_mbox_latest_date.Rd b/man/parse_mbox_latest_date.Rd index 
3ce065fd..149caec7 100644 --- a/man/parse_mbox_latest_date.Rd +++ b/man/parse_mbox_latest_date.Rd @@ -24,15 +24,15 @@ Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, \code{\link{parse_bugzilla_rest_comments}()}, -\code{\link{parse_bugzilla_rest_issues}()}, \code{\link{parse_bugzilla_rest_issues_comments}()}, +\code{\link{parse_bugzilla_rest_issues}()}, \code{\link{parse_commit_message_id}()}, \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, \code{\link{parse_gitlog}()}, -\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, +\code{\link{parse_jira}()}, \code{\link{parse_mbox}()}, \code{\link{parse_nvdfeed}()} } diff --git a/man/parse_nvdfeed.Rd b/man/parse_nvdfeed.Rd index 0accc69d..1c4365bd 100644 --- a/man/parse_nvdfeed.Rd +++ b/man/parse_nvdfeed.Rd @@ -18,16 +18,16 @@ Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, \code{\link{parse_bugzilla_rest_comments}()}, -\code{\link{parse_bugzilla_rest_issues}()}, \code{\link{parse_bugzilla_rest_issues_comments}()}, +\code{\link{parse_bugzilla_rest_issues}()}, \code{\link{parse_commit_message_id}()}, \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, \code{\link{parse_gitlog}()}, -\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, -\code{\link{parse_mbox}()}, -\code{\link{parse_mbox_latest_date}()} +\code{\link{parse_jira}()}, +\code{\link{parse_mbox_latest_date}()}, +\code{\link{parse_mbox}()} } \concept{parsers} diff --git a/man/refresh_jira_issues.Rd b/man/refresh_jira_issues.Rd index 20be8882..6e7118eb 100644 --- a/man/refresh_jira_issues.Rd +++ b/man/refresh_jira_issues.Rd @@ -61,14 +61,14 @@ data. 
\code{\link{parse_jira_latest_date}} to retrieve the file path of the latest issue key Other downloaders: -\code{\link{download_jira_issues}()}, \code{\link{download_jira_issues_by_date}()}, -\code{\link{download_jira_issues_by_issue_key}()} +\code{\link{download_jira_issues_by_issue_key}()}, +\code{\link{download_jira_issues}()} Other jira: -\code{\link{download_jira_issues}()}, \code{\link{download_jira_issues_by_date}()}, -\code{\link{download_jira_issues_by_issue_key}()} +\code{\link{download_jira_issues_by_issue_key}()}, +\code{\link{download_jira_issues}()} } \concept{downloaders} \concept{jira} From 7bf8ba6510c4050a44d625965181fb8246b53ad4 Mon Sep 17 00:00:00 2001 From: daomcgill <77309217+daomcgill@users.noreply.github.com> Date: Thu, 3 Oct 2024 10:15:04 -1000 Subject: [PATCH 21/80] i #284 Refactored parse_mbox_latest_date and Fixed Roxygen Errors - parse_mbox_latest_date() now uses new naming convention for files - Added to download_mail.Rmd - Fixed documentation for download_pipermail() Signed-off-by: Dao McGill --- DESCRIPTION | 2 +- R/mail.R | 45 ++++++++++++++++++------------- man/commit_message_id_coverage.Rd | 2 +- man/download_mod_mbox.Rd | 2 +- man/download_pipermail.Rd | 6 ++--- man/parse_mbox.Rd | 4 +-- man/parse_mbox_latest_date.Rd | 17 +++++------- vignettes/download_mail.Rmd | 32 +++++++++++++++++++++- 8 files changed, 73 insertions(+), 37 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index ae28c702..5a793074 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -50,4 +50,4 @@ Imports: VignetteBuilder: knitr URL: https://github.com/sailuh/kaiaulu BugReports: https://github.com/sailuh/kaiaulu/issues -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.2 diff --git a/R/mail.R b/R/mail.R index 710ccaa2..c2e355c6 100644 --- a/R/mail.R +++ b/R/mail.R @@ -18,9 +18,9 @@ #' The downloaded .mbox files are saved in the specified folder following the naming convention kaiaulu_YYYYMM.mbox.
#' The function only downloads files that fall between the specified start_year_month and end_year_month. #' -#' @param mailing_list The name of the mailing list being downloaded (e.g. "https://mta.openssl.org/pipermail/openssl-announce/") -#' @param start_year_month The year and month of the first file to be downloaded (format: 'YYYYMM') -#' @param end_year_month The year and month of the last file to be downloaded (format: 'YYYYMM', or use 'format(Sys.Date(), "%Y%m")' for the current month) +#' @param mailing_list The name of the mailing list being downloaded e.g. "https://mta.openssl.org/pipermail/openssl-announce/" +#' @param start_year_month The year and month of the first file to be downloaded format: 'YYYYMM' +#' @param end_year_month The year and month of the last file to be downloaded format: 'YYYYMM', or use Sys.Date #' @param save_folder_path The folder path in which all the downloaded pipermail files will be stored #' @param verbose if TRUE, prints diagnostic messages during the download process #' @return Returns `downloaded_files`, a vector of the downloaded files in the current working directory @@ -501,25 +501,34 @@ parse_mbox <- function(perceval_path, mbox_path){ #' Parse mbox latest date #' -#' Returns the name of the latest mod_mbox file downloaded in the specified folder +#' @description This function returns the name of the latest mod_mbox file downloaded in the specified folder +#' based on the naming convention `kaiaulu_YYYYMM.mbox`. For example: `kaiaulu_202401.mbox`. #' -#' The folder assumes the following convention: "(mailing_list)_(archive_type)_yearmonth.mbox" -#' For example: "geronimo-dev_apache_202401.mbox". 
This nomenclature is defined by \code{\link{download_mod_mbox_per_month}} -#' -#' @param mbox path to mbox archive file (ends in .mbox) -#' @return Returns the name of the latest mod_mbox file +#' @param save_folder_path path to the folder containing the mbox files +#' @return `latest_mbox_file` the name of the latest mod_mbox file #' @export #' @family parsers -parse_mbox_latest_date <- function(mbox) { - file_list <- list.files(mbox) - date_list <- list() - for(i in file_list){ - i <- sub(".mbox", "", i) - i <- sub("[^_]*_[^_]*_", "", i) - date_list <- append(date_list, i) +parse_mbox_latest_date <- function(save_folder_path) { + # List all .mbox files in the folder with the expected naming pattern + file_list <- list.files(save_folder_path, pattern = "kaiaulu_\\d{6}\\.mbox$") + + if (length(file_list) == 0) { + warning("No .mbox files found in the folder.") + return(invisible(NULL)) } - latest_date <- as.character(max(unlist(date_list))) - latest_mbox_file <- grep(latest_date, file_list, value = TRUE) + + # Extract the dates from the filenames + date_list <- sub("kaiaulu_(\\d{6})\\.mbox$", "\\1", file_list) + + # Convert dates to numeric for comparison + date_numeric <- as.numeric(date_list) + + # Find the latest date + latest_date <- max(date_numeric, na.rm = TRUE) + + # Find the file corresponding to the latest date + latest_mbox_file <- file_list[date_numeric == latest_date] + return(latest_mbox_file) } diff --git a/man/commit_message_id_coverage.Rd b/man/commit_message_id_coverage.Rd index 68fad761..e7f0c6ef 100644 --- a/man/commit_message_id_coverage.Rd +++ b/man/commit_message_id_coverage.Rd @@ -22,9 +22,9 @@ Calculates the number of commits from the git log which contains the message id. 
\code{\link{parse_gitlog}} to obtain additions and deletions from gitlog Other {metrics}: +\code{\link{metric_churn}()}, \code{\link{metric_churn_per_commit_interval}()}, \code{\link{metric_churn_per_commit_per_file}()}, -\code{\link{metric_churn}()}, \code{\link{metric_file_bug_churn}()}, \code{\link{metric_file_bug_frequency}()}, \code{\link{metric_file_churn}()}, diff --git a/man/download_mod_mbox.Rd b/man/download_mod_mbox.Rd index c628be38..c02cf5d8 100644 --- a/man/download_mod_mbox.Rd +++ b/man/download_mod_mbox.Rd @@ -9,7 +9,7 @@ download_mod_mbox( start_year_month, end_year_month, save_folder_path, - verbose = FALSE + verbose = TRUE ) } \arguments{ diff --git a/man/download_pipermail.Rd b/man/download_pipermail.Rd index 0aa1bc50..a4e2fdd8 100644 --- a/man/download_pipermail.Rd +++ b/man/download_pipermail.Rd @@ -13,11 +13,11 @@ download_pipermail( ) } \arguments{ -\item{mailing_list}{The name of the mailing list being downloaded (e.g. "https://mta.openssl.org/pipermail/openssl-announce/")} +\item{mailing_list}{The name of the mailing list being downloaded e.g. 
"https://mta.openssl.org/pipermail/openssl-announce/"} -\item{start_year_month}{The year and month of the first file to be downloaded (format: 'YYYYMM')} +\item{start_year_month}{The year and month of the first file to be downloaded format: 'YYYYMM'} -\item{end_year_month}{The year and month of the last file to be downloaded (format: 'YYYYMM', or use 'format(Sys.Date(), "%Y%m")' for the current month)} +\item{end_year_month}{The year and month of the last file to be downloaded format: 'YYYYMM', or use Sys.Date} \item{save_folder_path}{The folder path in which all the downloaded pipermail files will be stored} diff --git a/man/parse_mbox.Rd b/man/parse_mbox.Rd index fd578695..d4852995 100644 --- a/man/parse_mbox.Rd +++ b/man/parse_mbox.Rd @@ -23,15 +23,15 @@ Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, \code{\link{parse_bugzilla_rest_comments}()}, -\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_bugzilla_rest_issues}()}, +\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_commit_message_id}()}, \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, \code{\link{parse_gitlog}()}, +\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, -\code{\link{parse_jira}()}, \code{\link{parse_mbox_latest_date}()}, \code{\link{parse_nvdfeed}()} } diff --git a/man/parse_mbox_latest_date.Rd b/man/parse_mbox_latest_date.Rd index 149caec7..eedf9633 100644 --- a/man/parse_mbox_latest_date.Rd +++ b/man/parse_mbox_latest_date.Rd @@ -4,35 +4,32 @@ \alias{parse_mbox_latest_date} \title{Parse mbox latest date} \usage{ -parse_mbox_latest_date(mbox) +parse_mbox_latest_date(save_folder_path) } \arguments{ -\item{mbox}{path to mbox archive file (ends in .mbox)} +\item{save_folder_path}{path to the folder containing the mbox files} } \value{ -Returns the name of the latest mod_mbox file 
+`latest_mbox_file` the name of the latest mod_mbox file } \description{ -Returns the name of the latest mod_mbox file downloaded in the specified folder -} -\details{ -The folder assumes the following convention: "(mailing_list)_(archive_type)_yearmonth.mbox" -For example: "geronimo-dev_apache_202401.mbox". This nomenclature is defined by \code{\link{download_mod_mbox_per_month}} +This function returns the name of the latest mod_mbox file downloaded in the specified folder +based on the naming convention `kaiaulu_YYYYMM.mbox`. For example: `kaiaulu_202401.mbox`. } \seealso{ Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, \code{\link{parse_bugzilla_rest_comments}()}, -\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_bugzilla_rest_issues}()}, +\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_commit_message_id}()}, \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, \code{\link{parse_gitlog}()}, +\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, -\code{\link{parse_jira}()}, \code{\link{parse_mbox}()}, \code{\link{parse_nvdfeed}()} } diff --git a/vignettes/download_mail.Rmd b/vignettes/download_mail.Rmd index 3ea7a547..a6c17804 100644 --- a/vignettes/download_mail.Rmd +++ b/vignettes/download_mail.Rmd @@ -57,7 +57,7 @@ mailing_list: mailing_list: https://mta.openssl.org/pipermail/openssl-users/ start_year_month: 202310 end_year_month: 202405 - save_folder_path: "../../extdata/save_folder_mail" + save_folder_path: "../extdata/save_folder_mail" ``` @@ -240,3 +240,33 @@ This will store the parsed data into the parsed_mail variable. To view the table ```{r} View(parsed_mail) ``` + +## Retrieve the Latest Mbox File +We can use the parse_mbox_latest_date() function to identify the most recent .mbox file in the specified folder. 
This can be useful when you want to automate the parsing of the latest data without manually specifying the file name. + +First, make sure that the save_folder_path is correctly set to the directory where your .mbox files are stored. +```{r} +# Get the latest mbox file +latest_mbox_file <- parse_mbox_latest_date(save_folder_path = save_folder_path) +print(latest_mbox_file) +``` +This will output the name of the latest .mbox file based on the YYYYMM pattern in the filename. +We can use this to update mbox_path to point to the latest file, and call the parse_mbox() function to parse the latest data. +```{r} +# Update mbox_path to use the latest file +mbox_path <- file.path(save_folder_path, latest_mbox_file) +print(mbox_path) +``` +To parse this file: +```{r} +# Parse the latest mbox file +parsed_mail <- parse_mbox( + perceval_path = parse_perceval_path, + mbox_path = mbox_path +) +``` +Now, parsed_mail contains the parsed data from the latest .mbox file. +```{r} +# View the parsed data +View(parsed_mail) +``` From aa6064878e086c41ec68c36d8abb08eb25cd90a6 Mon Sep 17 00:00:00 2001 From: daomcgill <77309217+daomcgill@users.noreply.github.com> Date: Thu, 3 Oct 2024 10:16:23 -1000 Subject: [PATCH 22/80] i #284 Update NEWS.md - added parse_mbox_latest_date --- NEWS.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 869a7d8a..ff498384 100644 --- a/NEWS.md +++ b/NEWS.md @@ -31,7 +31,8 @@ __kaiaulu 0.0.0.9700 (in development)__ ### MINOR IMPROVEMENTS - * All mailing list documentation can now be found in 'download_mail.Rmd'. [#284](https://github.com/sailuh/kaiaulu/issues/284) + * `parse_mbox_latest_date()` now uses the new mbox naming convention for the latest date. [#284](https://github.com/sailuh/kaiaulu/issues/284) + * All mailing list documentation can now be found in `download_mail.Rmd`. 
[#284](https://github.com/sailuh/kaiaulu/issues/284) * `download_pipermail()` now downloads all the txt and txt.gz files in the accessed pipermail archive as mbox files. [#284](https://github.com/sailuh/kaiaulu/issues/284) * The line metrics notebook now provides further guidance on adjusting the snapshot and filtering. * The R File and R Function parser can now properly parse R folders which contain folders within (not following R package structure). Both `.r` and `.R` files are also now captured (previously only one of the two were specified, but R accepts both). [#235](https://github.com/sailuh/kaiaulu/issues/235) From 64e06469e768e859f83e01ffb99f5bdf07b27457 Mon Sep 17 00:00:00 2001 From: daomcgill <77309217+daomcgill@users.noreply.github.com> Date: Sat, 5 Oct 2024 14:30:12 -1000 Subject: [PATCH 23/80] i #284 Updated Notebook, exec/mailinglist.R and R/mail.R - Update pkgdown.yml - Set eval to False for notebook - Added warning for failed downloads - Added check for missing months in the date range within save_folder_path - Changed mbox_path in parsers to mbox_file_path - Use gt package to view tables - Made changes so Knit works for download_mail.Rmd - Updated exec/mailinglist.R to use new functions - To do: Use getter functions once they are merged Signed-off-by: Dao McGill --- R/mail.R | 110 ++++++++++++++++- _pkgdown.yml | 8 +- conf/helix.yml | 8 +- exec/mailinglist.R | 116 +++++++++--------- man/download_jira_issues_by_date.Rd | 4 +- man/download_jira_issues_by_issue_key.Rd | 4 +- man/download_mod_mbox.Rd | 4 +- man/download_pipermail.Rd | 2 + man/metric_churn_per_commit_interval.Rd | 2 +- man/metric_churn_per_commit_per_file.Rd | 2 +- man/metric_file_bug_churn.Rd | 2 +- man/metric_file_bug_frequency.Rd | 2 +- man/metric_file_churn.Rd | 2 +- man/metric_file_non_bug_churn.Rd | 2 +- man/metric_file_non_bug_frequency.Rd | 2 +- man/motif_factory_anti_square.Rd | 4 +- man/motif_factory_anti_triangle.Rd | 4 +- man/motif_factory_square.Rd | 4 +- 
man/motif_factory_triangle.Rd | 4 +- ...e_bugzilla_perceval_rest_issue_comments.Rd | 6 +- ...lla_perceval_traditional_issue_comments.Rd | 6 +- man/parse_bugzilla_rest_comments.Rd | 6 +- man/parse_bugzilla_rest_issues.Rd | 4 +- man/parse_bugzilla_rest_issues_comments.Rd | 4 +- man/parse_commit_message_id.Rd | 6 +- man/parse_dependencies.Rd | 6 +- man/parse_dv8_clusters.Rd | 6 +- man/parse_gitlog.Rd | 6 +- man/parse_jira.Rd | 4 +- man/parse_jira_latest_date.Rd | 6 +- man/parse_jira_rss_xml.Rd | 6 +- man/parse_mbox.Rd | 4 +- man/parse_nvdfeed.Rd | 8 +- man/refresh_jira_issues.Rd | 8 +- vignettes/download_mail.Rmd | 104 ++++++++++------ 35 files changed, 306 insertions(+), 170 deletions(-) diff --git a/R/mail.R b/R/mail.R index c2e355c6..dd4e28b4 100644 --- a/R/mail.R +++ b/R/mail.R @@ -17,6 +17,8 @@ #' #' The downloaded .mbox files are saved in the specified folder following the naming convention kaiaulu_YYYYMM.mbox. #' The function only downloads files that fall between the specified start_year_month and end_year_month. +#' When both formats fail to download, the function issues a warning indicating the missing month. +#' At the end, the function summarizes the downloads, indicating the range of dates present and any missing months. #' #' @param mailing_list The name of the mailing list being downloaded e.g. "https://mta.openssl.org/pipermail/openssl-announce/" #' @param start_year_month The year and month of the first file to be downloaded format: 'YYYYMM' @@ -84,6 +86,9 @@ download_pipermail <- function(mailing_list, start_year_month, end_year_month, s } } + ########## Initialize Vector for Failed Months ########## + failed_months <- character() + ########## Use Links to Download Individual Files ########## # Initialize a vector for storing the paths of the downloaded files. 
downloaded_files <- c() @@ -117,6 +122,7 @@ download_pipermail <- function(mailing_list, start_year_month, end_year_month, s response <- httr::GET(download_url, httr::timeout(60)) if (httr::status_code(response) != 200) { warning("Both .txt and .gz downloads failed for link: ", link, "\n") + failed_months <- c(failed_months, year_month_clean) next } } @@ -159,6 +165,48 @@ download_pipermail <- function(mailing_list, start_year_month, end_year_month, s downloaded_files <- c(downloaded_files, dest) } + ########## Summary of Downloads ########## + if (length(failed_months) > 0) { + warning("The following months could not be downloaded (no data available or other error):\n", paste(failed_months, collapse = ", ")) + } + # List the files in the save_folder_path. + downloaded_files_in_folder <- list.files(save_folder_path, pattern = "kaiaulu_\\d{6}\\.mbox$", full.names = FALSE) + + # Extract the YYYYMM from the file names. + downloaded_dates <- as.numeric(sub("kaiaulu_(\\d{6})\\.mbox", "\\1", downloaded_files_in_folder)) + + # Create the expected list of YYYYMM between start_year_month and end_year_month. + start_date <- as.Date(paste0(start_year_month, "01"), format = "%Y%m%d") + end_date <- as.Date(paste0(end_year_month, "01"), format = "%Y%m%d") + all_dates <- seq(start_date, end_date, by = "month") + expected_dates <- as.numeric(format(all_dates, "%Y%m")) + + # Identify missing months. + missing_months <- setdiff(expected_dates, downloaded_dates) + + # Determine the earliest and latest dates downloaded. 
+ if (length(downloaded_dates) > 0) { + min_downloaded_date <- min(downloaded_dates) + max_downloaded_date <- max(downloaded_dates) + + if (verbose) { + cat("\nSummary of Downloads:\n") + cat("save_folder_path contains mail from date ", min_downloaded_date, " to ", max_downloaded_date, "\n") + } + } else { + if (verbose) { + cat("No files found in save_folder_path\n") + } + } + + if (length(missing_months) == 0) { + if (verbose) { + cat("No missing months\n") + } + } else { + warning("Months missing in the date range: ", paste(missing_months, collapse = ", "), "\n") + } + ########## Return List of Downloaded Files ########## # Return the list of downloaded .mbox files return(downloaded_files) @@ -307,7 +355,9 @@ process_gz_to_mbox_in_folder <- function(save_folder_path, verbose = TRUE) { #' of kaiaulu_YYYYMM.mbox. #' #' The function loops through each month in the range specified by `start_year_month` and `end_year_month`, -#' and constructs the appropriate URL to download each month's data. If any download fails, an error message is printed. +#' and constructs the appropriate URL to download each month's data. If any download fails, a warning is issued for the failed months. +#' This means the file could not be found and that month's data may not exist. +#' At the end, the function summarizes the downloads, indicating the range of dates present and any missing months. #' #' @param mailing_list The URL of the Apache Pony Mail list from which mbox files are to be downloaded #' (e.g., "https://lists.apache.org/list.html?announce@apache.org"). @@ -336,6 +386,10 @@ download_mod_mbox <- function(mailing_list, start_year_month, end_year_month, sa end_year <- as.numeric(substr(end_year_month, 1, 4)) end_month <- as.numeric(substr(end_year_month, 5, 6)) + ########## Initialize Vectors for Failed Months ########## + # Vectors to track failed downloads.
+ failed_months <- character() + ########## Download Loop ########## # Iterate over the years and months from start_year/month to end_year/month. # This is done by looping over the years, and for each year, looping over the 12 months. @@ -380,10 +434,54 @@ download_mod_mbox <- function(mailing_list, start_year_month, end_year_month, sa } # Remove failed download file. unlink(file_path) + failed_months <- c(failed_months, year_month_str) } } } + ########## Summary of Failed Downloads ########## + if (length(failed_months) > 0) { + warning("The following months could not be downloaded (no data available or other error):\n", paste(failed_months, collapse = ", ")) + } + + # List the files in the save_folder_path + downloaded_files <- list.files(save_folder_path, pattern = "kaiaulu_\\d{6}\\.mbox$", full.names = FALSE) + + # Extract the YYYYMM from the file names + downloaded_dates <- as.numeric(sub("kaiaulu_(\\d{6})\\.mbox", "\\1", downloaded_files)) + + # Find the expected list of YYYYMM between start_year_month and end_year_month + start_date <- as.Date(paste0(start_year_month, "01"), format = "%Y%m%d") + end_date <- as.Date(paste0(end_year_month, "01"), format = "%Y%m%d") + all_dates <- seq(start_date, end_date, by = "month") + expected_dates <- as.numeric(format(all_dates, "%Y%m")) + + # Identify missing months + missing_months <- setdiff(expected_dates, downloaded_dates) + + # Determine the earliest and latest dates downloaded + if (length(downloaded_dates) > 0) { + min_downloaded_date <- min(downloaded_dates) + max_downloaded_date <- max(downloaded_dates) + + if (verbose) { + cat("\nSummary of Downloads:\n") + cat("save_folder_path contains mail from date", min_downloaded_date, "to", max_downloaded_date, "\n") + } + } else { + if (verbose) { + cat("No files found in save_folder_path\n") + } + } + + if (length(missing_months) == 0) { + if (verbose) { + cat("No missing months\n") + } + } else { + warning("Months missing in the date range:", paste(missing_months, 
collapse = ", "), "\n") + } + ########## Return Save Path ########## # Return the folder path where all mbox files were saved. return(save_folder_path) @@ -465,19 +563,19 @@ refresh_mod_mbox <- function(mailing_list, start_year_month, save_folder_path, v #' consistently renamed for clarity. #' #' @param perceval_path path to perceval binary -#' @param mbox_path path to mbox archive file (ends in .mbox) +#' @param mbox_file_path path to mbox archive file (ends in .mbox) #' @export #' @family parsers -parse_mbox <- function(perceval_path, mbox_path){ +parse_mbox <- function(perceval_path, mbox_file_path){ # Expand paths (e.g. "~/Desktop" => "/Users/someuser/Desktop") perceval_path <- path.expand(perceval_path) - mbox_path <- path.expand(mbox_path) + mbox_file_path <- path.expand(mbox_file_path) # Remove ".mbox" - mbox_uri <- stringi::stri_replace_last_regex(mbox_path, pattern = "\\.mbox$", replacement = "") + mbox_uri <- stringi::stri_replace_last_regex(mbox_file_path, pattern = "\\.mbox$", replacement = "") # Use percerval to parse mbox. --json line is required to be parsed by jsonlite::fromJSON. perceval_output <- system2(perceval_path, - args = c('mbox',mbox_uri,mbox_path,'--json-line'), + args = c('mbox',mbox_uri,mbox_file_path,'--json-line'), stdout = TRUE, stderr = FALSE) diff --git a/_pkgdown.yml b/_pkgdown.yml index 98b851a8..3166eb33 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -91,15 +91,17 @@ reference: - title: __Mail__ desc: > Download, parsing and data generation of mailing lists. - See the [Download Mbox](../articles/download_mod_mbox.html) + See the [Download Mail](../articles/download_mail.html) and [Reply](../articles/reply_communication_showcase.html) Notebooks for details. 
- contents: - download_pipermail - - convert_pipermail_to_mbox + - refresh_pipermail - download_mod_mbox - - download_mod_mbox_per_month + - refresh_mod_mbox + - process_gz_to_mbox_in_folder - parse_mbox + - parse_mbox_latest_date - make_mbox_reply - make_mbox_mailing_list - title: __JIRA__ diff --git a/conf/helix.yml b/conf/helix.yml index 2612fcf3..adf7ef98 100644 --- a/conf/helix.yml +++ b/conf/helix.yml @@ -55,14 +55,14 @@ mailing_list: end_year_month: 202405 save_folder_path: "../../extdata/save_mbox_mail" # mbox_path is for use only with parse_mbox() function. It is the file to parse. - mbox_path: "../../extdata/save_mbox_mail/kaiaulu_202410.mbox" + mbox_file_path: "../../extdata/save_mbox_mail/kaiaulu_202410.mbox" project_key_2: mailing_list: https://lists.apache.org/list.html?dev@felix.apache.org start_year_month: 202201 end_year_month: 202401 save_folder_path: "../../extdata/save_mbox_mail" # mbox_path is for use only with parse_mbox() function. It is the file to parse. - mbox_path: "../../extdata/save_mbox_mail/kaiaulu_202210.mbox" + mbox_file_path: "../../extdata/save_mbox_mail/kaiaulu_202210.mbox" pipermail: project_key_1: mailing_list: https://mta.openssl.org/pipermail/openssl-users/ @@ -70,14 +70,14 @@ mailing_list: end_year_month: 202405 save_folder_path: "../../extdata/save_folder_mail" # mbox_path is for use only with parse_mbox() function. It is the file to parse. - mbox_path: "../../extdata/save_mbox_mail/kaiaulu_202310.mbox" + mbox_file_path: "../../extdata/save_mbox_mail/kaiaulu_202310.mbox" project_key_2: mailing_list: https://mta.openssl.org/pipermail/openssl-project/ start_year_month: 202203 end_year_month: 202303 save_folder_path: "../../extdata/save_folder_mail_2" # mbox_path is for use only with parse_mbox() function. It is the file to parse. 
- mbox_path: "../../extdata/save_mbox_mail/kaiaulu_202210.mbox" + mbox_file_path: "../../extdata/save_mbox_mail/kaiaulu_202210.mbox" issue_tracker: jira: diff --git a/exec/mailinglist.R b/exec/mailinglist.R index 278edbdd..ffcb0b87 100755 --- a/exec/mailinglist.R +++ b/exec/mailinglist.R @@ -6,42 +6,38 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at https://mozilla.org/MPL/2.0/. -require(yaml,quietly=TRUE) -require(cli,quietly=TRUE) -require(docopt,quietly=TRUE) -require(kaiaulu,quietly=TRUE) -require(data.table,quietly=TRUE) - - +require(yaml, quietly = TRUE) +require(cli, quietly = TRUE) +require(docopt, quietly = TRUE) +require(kaiaulu, quietly = TRUE) +require(data.table, quietly = TRUE) doc <- " USAGE: mailinglist.R tabulate help mailinglist.R tabulate mailinglist.R download modmbox help - mailinglist.R download modmbox - mailinglist.R download modmboxmonth help - mailinglist.R download modmboxmonth + mailinglist.R download modmbox + mailinglist.R download pipermail help + mailinglist.R download pipermail mailinglist.R (-h | --help) mailinglist.R --version DESCRIPTION: Provides a suite of functions to interact with Mailing Lists. Please see - Kaiaulu's README.md for instructions on how to create + Kaiaulu's README.md for instructions on how to create and . - OPTIONS: -h --help Show this screen. --version Show version. 
" - - arguments <- docopt::docopt(doc, version = 'Kaiaulu 0.0.0.9600') -if(arguments[["tabulate"]] & arguments[["help"]]){ - cli_alert_info("Tabulates a mailing list using parse_mbox().") -}else if(arguments[["tabulate"]]){ + +if (arguments[["tabulate"]] & arguments[["help"]]) { + cli::cli_alert_info("Tabulates a mailing list using parse_mbox().") +} else if (arguments[["tabulate"]]) { tools_path <- arguments[[""]] conf_path <- arguments[[""]] @@ -51,56 +47,62 @@ if(arguments[["tabulate"]] & arguments[["help"]]){ conf <- yaml::read_yaml(conf_path) perceval_path <- path.expand(tool[["perceval"]]) - mbox_path <- path.expand(conf[["mailing_list"]][["mbox"]]) - - project_mbox <- parse_mbox(perceval_path,mbox_path) + mbox_file_path <- path.expand(conf[["mailing_list"]][["mod_mbox"]][["project_key_1"]][["mbox_file_path"]]) - cli_alert_success(paste0("Tabulated mailing list was saved at: ",save_path)) + project_mbox <- parse_mbox(perceval_path, mbox_file_path) - data.table::fwrite(project_mbox,save_path) -}else if(arguments[["download"]] & arguments[["modmbox"]] & arguments[["help"]]){ - cli_alert_info("Saves a mailing list archive from mod_mbox as a .mbox file - using download_mod_mbox().") -}else if(arguments[["download"]] & arguments[["modmbox"]]){ + data.table::fwrite(project_mbox, save_path) + cli::cli_alert_success(paste0("Tabulated mailing list was saved at: ", save_path)) +} else if (arguments[["download"]] & arguments[["modmbox"]] & arguments[["help"]]) { + cli::cli_alert_info("Downloads mailing list archives from mod_mbox using download_mod_mbox().") +} else if (arguments[["download"]] & arguments[["modmbox"]]) { conf_path <- arguments[[""]] - save_path <- arguments[[""]] - conf <- yaml::read_yaml(conf_path) + start_year_month <- arguments[[""]] + end_year_month <- arguments[[""]] + save_folder_path <- arguments[[""]] - mod_mbox_url <- conf[["mailing_list"]][["domain"]] - mailing_list <- conf[["mailing_list"]][["list_key"]][1] - - start_year <- arguments[[""]] - 
end_year <- arguments[[""]] + conf <- yaml::read_yaml(conf_path) + mailing_list <- conf[["mailing_list"]][["mod_mbox"]][["project_key_1"]][["mailing_list"]] - mbox <- download_mod_mbox(base_url = mod_mbox_url, - mailing_list = mailing_list, - from_year=start_year, - to_year=end_year, - save_file_path = save_path, - verbose = TRUE) + download_mod_mbox( + mailing_list = mailing_list, + start_year_month = start_year_month, + end_year_month = end_year_month, + save_folder_path = save_folder_path, + verbose = TRUE + ) - cli_alert_success(paste0("Downloaded mailing list was saved at: ",save_path)) -}else if(arguments[["download"]] & arguments[["modmboxmonth"]]){ + cli::cli_alert_success(paste0("Downloaded mailing list archives were saved at: ", save_folder_path)) +} else if (arguments[["download"]] & arguments[["pipermail"]] & arguments[["help"]]) { + cli::cli_alert_info("Downloads mailing list archives from pipermail using download_pipermail().") +} else if (arguments[["download"]] & arguments[["pipermail"]]) { conf_path <- arguments[[""]] - save_path <- arguments[[""]] - conf <- yaml::read_yaml(conf_path) - - mod_mbox_url <- conf[["mailing_list"]][["domain"]] - mailing_list <- conf[["mailing_list"]][["list_key"]][1] + start_year_month <- arguments[[""]] + end_year_month <- arguments[[""]] + save_folder_path <- arguments[[""]] - start_year <- arguments[[""]] - end_year <- arguments[[""]] - - mbox <- download_mod_mbox_per_month(base_url = mod_mbox_url, - mailing_list = mailing_list, - from_year=start_year, - to_year=end_year, - save_folder_path = save_path, - verbose = TRUE) - - cli_alert_success(paste0("Downloaded mailing list was saved at: ",save_path)) + conf <- yaml::read_yaml(conf_path) + mailing_list <- conf[["mailing_list"]][["pipermail"]][["project_key_1"]][["mailing_list"]] + + download_pipermail( + mailing_list = mailing_list, + start_year_month = start_year_month, + end_year_month = end_year_month, + save_folder_path = save_folder_path, + verbose = TRUE + ) + 
+ cli::cli_alert_success(paste0("Downloaded mailing list archives were saved at: ", save_folder_path)) + +} else if (arguments[["-h"]] || arguments[["--help"]]) { + cli::cli_alert_info(doc) +} else if (arguments[["--version"]]) { + cli::cli_alert_info('Kaiaulu 0.0.0.9600') +} else { + cli::cli_alert_danger("Invalid command or arguments. Use --help for usage information.") } + diff --git a/man/download_jira_issues_by_date.Rd b/man/download_jira_issues_by_date.Rd index 697fdb48..ccb3c7c2 100644 --- a/man/download_jira_issues_by_date.Rd +++ b/man/download_jira_issues_by_date.Rd @@ -72,13 +72,13 @@ For further details on the `created` JQL Query see [the associated JIRA API docu \code{\link{refresh_jira_issues}} to obtain more recent data from any of the downloader functions Other jira: -\code{\link{download_jira_issues_by_issue_key}()}, \code{\link{download_jira_issues}()}, +\code{\link{download_jira_issues_by_issue_key}()}, \code{\link{refresh_jira_issues}()} Other downloaders: -\code{\link{download_jira_issues_by_issue_key}()}, \code{\link{download_jira_issues}()}, +\code{\link{download_jira_issues_by_issue_key}()}, \code{\link{refresh_jira_issues}()} } \concept{downloaders} diff --git a/man/download_jira_issues_by_issue_key.Rd b/man/download_jira_issues_by_issue_key.Rd index b452878f..8213ee17 100644 --- a/man/download_jira_issues_by_issue_key.Rd +++ b/man/download_jira_issues_by_issue_key.Rd @@ -67,13 +67,13 @@ For further details on the `issueKey` JQL Query see [the associated JIRA API doc \code{\link{refresh_jira_issues}} to obtain more recent data from any of the downloader functions Other jira: -\code{\link{download_jira_issues_by_date}()}, \code{\link{download_jira_issues}()}, +\code{\link{download_jira_issues_by_date}()}, \code{\link{refresh_jira_issues}()} Other downloaders: -\code{\link{download_jira_issues_by_date}()}, \code{\link{download_jira_issues}()}, +\code{\link{download_jira_issues_by_date}()}, \code{\link{refresh_jira_issues}()} } 
\concept{downloaders} diff --git a/man/download_mod_mbox.Rd b/man/download_mod_mbox.Rd index c02cf5d8..e1835761 100644 --- a/man/download_mod_mbox.Rd +++ b/man/download_mod_mbox.Rd @@ -34,5 +34,7 @@ in the format "YYYY-MM". The downloaded .mbox files are saved in the specified f of kaiaulu_YYYYMM.mbox. The function loops through each month in the range specified by `start_year_month` and `end_year_month`, -and constructs the appropriate URL to download each month's data. If any download fails, an error message is printed. +and constructs the appropriate URL to download each month's data. If any download fails, a warning is issued for the failed months. +This means the file could not be found and that month's data may not exist. +At the end, the function summarizes the downloads, indicating the range of dates present and any missing months. } diff --git a/man/download_pipermail.Rd b/man/download_pipermail.Rd index a4e2fdd8..0244abbd 100644 --- a/man/download_pipermail.Rd +++ b/man/download_pipermail.Rd @@ -36,4 +36,6 @@ overwriting any existing file with the same name. The original .gz file is delet The downloaded .mbox files are saved in the specified folder following the naming convention kaiaulu_YYYYMM.mbox. The function only downloads files that fall between the specified start_year_month and end_year_month. +When both formats fail to download, the function issues a warning indicating the missing month. +At the end, the function summarizes the downloads, indicating the range of dates present and any missing months.
} diff --git a/man/metric_churn_per_commit_interval.Rd b/man/metric_churn_per_commit_interval.Rd index 21f5e494..6969492d 100644 --- a/man/metric_churn_per_commit_interval.Rd +++ b/man/metric_churn_per_commit_interval.Rd @@ -20,8 +20,8 @@ Calculates the churn metric for a sequence of commits Other {metrics}: \code{\link{commit_message_id_coverage}()}, -\code{\link{metric_churn_per_commit_per_file}()}, \code{\link{metric_churn}()}, +\code{\link{metric_churn_per_commit_per_file}()}, \code{\link{metric_file_bug_churn}()}, \code{\link{metric_file_bug_frequency}()}, \code{\link{metric_file_churn}()}, diff --git a/man/metric_churn_per_commit_per_file.Rd b/man/metric_churn_per_commit_per_file.Rd index 75b48c85..577d3f63 100644 --- a/man/metric_churn_per_commit_per_file.Rd +++ b/man/metric_churn_per_commit_per_file.Rd @@ -20,8 +20,8 @@ Calculates the churn metric for a sequence of commits per commit per file Other {metrics}: \code{\link{commit_message_id_coverage}()}, -\code{\link{metric_churn_per_commit_interval}()}, \code{\link{metric_churn}()}, +\code{\link{metric_churn_per_commit_interval}()}, \code{\link{metric_file_bug_churn}()}, \code{\link{metric_file_bug_frequency}()}, \code{\link{metric_file_churn}()}, diff --git a/man/metric_file_bug_churn.Rd b/man/metric_file_bug_churn.Rd index 29bef17d..7bea610e 100644 --- a/man/metric_file_bug_churn.Rd +++ b/man/metric_file_bug_churn.Rd @@ -20,9 +20,9 @@ The total churn sum of commits of all closed bug type issues the file was involv \seealso{ Other {metrics}: \code{\link{commit_message_id_coverage}()}, +\code{\link{metric_churn}()}, \code{\link{metric_churn_per_commit_interval}()}, \code{\link{metric_churn_per_commit_per_file}()}, -\code{\link{metric_churn}()}, \code{\link{metric_file_bug_frequency}()}, \code{\link{metric_file_churn}()}, \code{\link{metric_file_non_bug_churn}()}, diff --git a/man/metric_file_bug_frequency.Rd b/man/metric_file_bug_frequency.Rd index 607aef62..f978666e 100644 --- 
a/man/metric_file_bug_frequency.Rd +++ b/man/metric_file_bug_frequency.Rd @@ -20,9 +20,9 @@ The total number of commits of all closed bug type issues the file was involved. \seealso{ Other {metrics}: \code{\link{commit_message_id_coverage}()}, +\code{\link{metric_churn}()}, \code{\link{metric_churn_per_commit_interval}()}, \code{\link{metric_churn_per_commit_per_file}()}, -\code{\link{metric_churn}()}, \code{\link{metric_file_bug_churn}()}, \code{\link{metric_file_churn}()}, \code{\link{metric_file_non_bug_churn}()}, diff --git a/man/metric_file_churn.Rd b/man/metric_file_churn.Rd index 3e2babd4..67049ea5 100644 --- a/man/metric_file_churn.Rd +++ b/man/metric_file_churn.Rd @@ -18,9 +18,9 @@ The total churn of a file \seealso{ Other {metrics}: \code{\link{commit_message_id_coverage}()}, +\code{\link{metric_churn}()}, \code{\link{metric_churn_per_commit_interval}()}, \code{\link{metric_churn_per_commit_per_file}()}, -\code{\link{metric_churn}()}, \code{\link{metric_file_bug_churn}()}, \code{\link{metric_file_bug_frequency}()}, \code{\link{metric_file_non_bug_churn}()}, diff --git a/man/metric_file_non_bug_churn.Rd b/man/metric_file_non_bug_churn.Rd index bf35bb1c..049b9cd8 100644 --- a/man/metric_file_non_bug_churn.Rd +++ b/man/metric_file_non_bug_churn.Rd @@ -20,9 +20,9 @@ The total churn sum of commits of all closed non-bug type issues the file was in \seealso{ Other {metrics}: \code{\link{commit_message_id_coverage}()}, +\code{\link{metric_churn}()}, \code{\link{metric_churn_per_commit_interval}()}, \code{\link{metric_churn_per_commit_per_file}()}, -\code{\link{metric_churn}()}, \code{\link{metric_file_bug_churn}()}, \code{\link{metric_file_bug_frequency}()}, \code{\link{metric_file_churn}()}, diff --git a/man/metric_file_non_bug_frequency.Rd b/man/metric_file_non_bug_frequency.Rd index da87d00a..9516ce61 100644 --- a/man/metric_file_non_bug_frequency.Rd +++ b/man/metric_file_non_bug_frequency.Rd @@ -20,9 +20,9 @@ The total number of commits of all closed non-bug 
type issues the file was invol \seealso{ Other {metrics}: \code{\link{commit_message_id_coverage}()}, +\code{\link{metric_churn}()}, \code{\link{metric_churn_per_commit_interval}()}, \code{\link{metric_churn_per_commit_per_file}()}, -\code{\link{metric_churn}()}, \code{\link{metric_file_bug_churn}()}, \code{\link{metric_file_bug_frequency}()}, \code{\link{metric_file_churn}()}, diff --git a/man/motif_factory_anti_square.Rd b/man/motif_factory_anti_square.Rd index 06cefd8f..8850ff38 100644 --- a/man/motif_factory_anti_square.Rd +++ b/man/motif_factory_anti_square.Rd @@ -26,9 +26,9 @@ in IEEE Transactions on Software Engineering, vol. 48, no. 8, pp. 3159-3184, } \seealso{ Other motif: +\code{\link{motif_factory}()}, \code{\link{motif_factory_anti_triangle}()}, \code{\link{motif_factory_square}()}, -\code{\link{motif_factory_triangle}()}, -\code{\link{motif_factory}()} +\code{\link{motif_factory_triangle}()} } \concept{motif} diff --git a/man/motif_factory_anti_triangle.Rd b/man/motif_factory_anti_triangle.Rd index b5a789a8..349cce19 100644 --- a/man/motif_factory_anti_triangle.Rd +++ b/man/motif_factory_anti_triangle.Rd @@ -22,9 +22,9 @@ in IEEE Transactions on Software Engineering, vol. 48, no. 8, pp. 3159-3184, } \seealso{ Other motif: +\code{\link{motif_factory}()}, \code{\link{motif_factory_anti_square}()}, \code{\link{motif_factory_square}()}, -\code{\link{motif_factory_triangle}()}, -\code{\link{motif_factory}()} +\code{\link{motif_factory_triangle}()} } \concept{motif} diff --git a/man/motif_factory_square.Rd b/man/motif_factory_square.Rd index 74101dfd..1c94e2a3 100644 --- a/man/motif_factory_square.Rd +++ b/man/motif_factory_square.Rd @@ -26,9 +26,9 @@ in IEEE Transactions on Software Engineering, vol. 48, no. 8, pp. 
3159-3184, } \seealso{ Other motif: +\code{\link{motif_factory}()}, \code{\link{motif_factory_anti_square}()}, \code{\link{motif_factory_anti_triangle}()}, -\code{\link{motif_factory_triangle}()}, -\code{\link{motif_factory}()} +\code{\link{motif_factory_triangle}()} } \concept{motif} diff --git a/man/motif_factory_triangle.Rd b/man/motif_factory_triangle.Rd index 61d81313..0a99faa8 100644 --- a/man/motif_factory_triangle.Rd +++ b/man/motif_factory_triangle.Rd @@ -22,9 +22,9 @@ in IEEE Transactions on Software Engineering, vol. 48, no. 8, pp. 3159-3184, } \seealso{ Other motif: +\code{\link{motif_factory}()}, \code{\link{motif_factory_anti_square}()}, \code{\link{motif_factory_anti_triangle}()}, -\code{\link{motif_factory_square}()}, -\code{\link{motif_factory}()} +\code{\link{motif_factory_square}()} } \concept{motif} diff --git a/man/parse_bugzilla_perceval_rest_issue_comments.Rd b/man/parse_bugzilla_perceval_rest_issue_comments.Rd index 610eeb6f..87d29f0c 100644 --- a/man/parse_bugzilla_perceval_rest_issue_comments.Rd +++ b/man/parse_bugzilla_perceval_rest_issue_comments.Rd @@ -26,17 +26,17 @@ Parse Bugzilla data obtained from Perceval REST API Bugzilla backend Other parsers: \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, \code{\link{parse_bugzilla_rest_comments}()}, -\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_bugzilla_rest_issues}()}, +\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_commit_message_id}()}, \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, \code{\link{parse_gitlog}()}, +\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, -\code{\link{parse_jira}()}, -\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_mbox}()}, +\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_nvdfeed}()} } \concept{parsers} diff --git a/man/parse_bugzilla_perceval_traditional_issue_comments.Rd 
b/man/parse_bugzilla_perceval_traditional_issue_comments.Rd index f6f3b7f2..0cfacfd0 100644 --- a/man/parse_bugzilla_perceval_traditional_issue_comments.Rd +++ b/man/parse_bugzilla_perceval_traditional_issue_comments.Rd @@ -26,17 +26,17 @@ Parse Bugzilla data obtained from Perceval traditional Bugzilla backend Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_rest_comments}()}, -\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_bugzilla_rest_issues}()}, +\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_commit_message_id}()}, \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, \code{\link{parse_gitlog}()}, +\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, -\code{\link{parse_jira}()}, -\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_mbox}()}, +\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_nvdfeed}()} } \concept{parsers} diff --git a/man/parse_bugzilla_rest_comments.Rd b/man/parse_bugzilla_rest_comments.Rd index 57999ca2..b12be91b 100644 --- a/man/parse_bugzilla_rest_comments.Rd +++ b/man/parse_bugzilla_rest_comments.Rd @@ -19,17 +19,17 @@ Parse Bugzilla comments data obtained from json files from Bugzilla crawler \cod Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, -\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_bugzilla_rest_issues}()}, +\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_commit_message_id}()}, \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, \code{\link{parse_gitlog}()}, +\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, -\code{\link{parse_jira}()}, -\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_mbox}()}, 
+\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_nvdfeed}()} } \concept{parsers} diff --git a/man/parse_bugzilla_rest_issues.Rd b/man/parse_bugzilla_rest_issues.Rd index da912e4b..bdd8bdde 100644 --- a/man/parse_bugzilla_rest_issues.Rd +++ b/man/parse_bugzilla_rest_issues.Rd @@ -27,11 +27,11 @@ Other parsers: \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, \code{\link{parse_gitlog}()}, +\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, -\code{\link{parse_jira}()}, -\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_mbox}()}, +\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_nvdfeed}()} } \concept{parsers} diff --git a/man/parse_bugzilla_rest_issues_comments.Rd b/man/parse_bugzilla_rest_issues_comments.Rd index b884739f..05da2855 100644 --- a/man/parse_bugzilla_rest_issues_comments.Rd +++ b/man/parse_bugzilla_rest_issues_comments.Rd @@ -29,11 +29,11 @@ Other parsers: \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, \code{\link{parse_gitlog}()}, +\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, -\code{\link{parse_jira}()}, -\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_mbox}()}, +\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_nvdfeed}()} } \concept{parsers} diff --git a/man/parse_commit_message_id.Rd b/man/parse_commit_message_id.Rd index 13d9e542..e090ef19 100644 --- a/man/parse_commit_message_id.Rd +++ b/man/parse_commit_message_id.Rd @@ -19,16 +19,16 @@ Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, \code{\link{parse_bugzilla_rest_comments}()}, -\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_bugzilla_rest_issues}()}, +\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, 
\code{\link{parse_gitlog}()}, +\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, -\code{\link{parse_jira}()}, -\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_mbox}()}, +\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_nvdfeed}()} } \concept{parsers} diff --git a/man/parse_dependencies.Rd b/man/parse_dependencies.Rd index a7136742..e4c58051 100644 --- a/man/parse_dependencies.Rd +++ b/man/parse_dependencies.Rd @@ -28,16 +28,16 @@ Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, \code{\link{parse_bugzilla_rest_comments}()}, -\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_bugzilla_rest_issues}()}, +\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_commit_message_id}()}, \code{\link{parse_dv8_clusters}()}, \code{\link{parse_gitlog}()}, +\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, -\code{\link{parse_jira}()}, -\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_mbox}()}, +\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_nvdfeed}()} } \concept{parsers} diff --git a/man/parse_dv8_clusters.Rd b/man/parse_dv8_clusters.Rd index 987936bf..b4dc6249 100644 --- a/man/parse_dv8_clusters.Rd +++ b/man/parse_dv8_clusters.Rd @@ -17,16 +17,16 @@ Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, \code{\link{parse_bugzilla_rest_comments}()}, -\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_bugzilla_rest_issues}()}, +\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_commit_message_id}()}, \code{\link{parse_dependencies}()}, \code{\link{parse_gitlog}()}, +\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, 
-\code{\link{parse_jira}()}, -\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_mbox}()}, +\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_nvdfeed}()} Other dv8: diff --git a/man/parse_gitlog.Rd b/man/parse_gitlog.Rd index d4370808..7d65786f 100644 --- a/man/parse_gitlog.Rd +++ b/man/parse_gitlog.Rd @@ -23,16 +23,16 @@ Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, \code{\link{parse_bugzilla_rest_comments}()}, -\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_bugzilla_rest_issues}()}, +\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_commit_message_id}()}, \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, +\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, -\code{\link{parse_jira}()}, -\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_mbox}()}, +\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_nvdfeed}()} } \concept{parsers} diff --git a/man/parse_jira.Rd b/man/parse_jira.Rd index c3e8fe9a..0db0e226 100644 --- a/man/parse_jira.Rd +++ b/man/parse_jira.Rd @@ -33,16 +33,16 @@ Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, \code{\link{parse_bugzilla_rest_comments}()}, -\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_bugzilla_rest_issues}()}, +\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_commit_message_id}()}, \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, \code{\link{parse_gitlog}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, -\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_mbox}()}, +\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_nvdfeed}()} } \concept{parsers} diff --git 
a/man/parse_jira_latest_date.Rd b/man/parse_jira_latest_date.Rd index d05f3b82..e2a730b5 100644 --- a/man/parse_jira_latest_date.Rd +++ b/man/parse_jira_latest_date.Rd @@ -25,16 +25,16 @@ Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, \code{\link{parse_bugzilla_rest_comments}()}, -\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_bugzilla_rest_issues}()}, +\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_commit_message_id}()}, \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, \code{\link{parse_gitlog}()}, -\code{\link{parse_jira_rss_xml}()}, \code{\link{parse_jira}()}, -\code{\link{parse_mbox_latest_date}()}, +\code{\link{parse_jira_rss_xml}()}, \code{\link{parse_mbox}()}, +\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_nvdfeed}()} } \concept{parsers} diff --git a/man/parse_jira_rss_xml.Rd b/man/parse_jira_rss_xml.Rd index 17b88ff5..1c0abecb 100644 --- a/man/parse_jira_rss_xml.Rd +++ b/man/parse_jira_rss_xml.Rd @@ -28,16 +28,16 @@ Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, \code{\link{parse_bugzilla_rest_comments}()}, -\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_bugzilla_rest_issues}()}, +\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_commit_message_id}()}, \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, \code{\link{parse_gitlog}()}, -\code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira}()}, -\code{\link{parse_mbox_latest_date}()}, +\code{\link{parse_jira_latest_date}()}, \code{\link{parse_mbox}()}, +\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_nvdfeed}()} } \concept{parsers} diff --git a/man/parse_mbox.Rd b/man/parse_mbox.Rd index d4852995..9b128dd8 100644 --- a/man/parse_mbox.Rd +++ 
b/man/parse_mbox.Rd @@ -4,12 +4,12 @@ \alias{parse_mbox} \title{Parse mbox from Perceval} \usage{ -parse_mbox(perceval_path, mbox_path) +parse_mbox(perceval_path, mbox_file_path) } \arguments{ \item{perceval_path}{path to perceval binary} -\item{mbox_path}{path to mbox archive file (ends in .mbox)} +\item{mbox_file_path}{path to mbox archive file (ends in .mbox)} } \description{ Parses an mbox file, which consists of emails in a mailbox, using the Perceval library. diff --git a/man/parse_nvdfeed.Rd b/man/parse_nvdfeed.Rd index 1c4365bd..0accc69d 100644 --- a/man/parse_nvdfeed.Rd +++ b/man/parse_nvdfeed.Rd @@ -18,16 +18,16 @@ Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, \code{\link{parse_bugzilla_rest_comments}()}, -\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_bugzilla_rest_issues}()}, +\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_commit_message_id}()}, \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, \code{\link{parse_gitlog}()}, +\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, -\code{\link{parse_jira}()}, -\code{\link{parse_mbox_latest_date}()}, -\code{\link{parse_mbox}()} +\code{\link{parse_mbox}()}, +\code{\link{parse_mbox_latest_date}()} } \concept{parsers} diff --git a/man/refresh_jira_issues.Rd b/man/refresh_jira_issues.Rd index 6e7118eb..20be8882 100644 --- a/man/refresh_jira_issues.Rd +++ b/man/refresh_jira_issues.Rd @@ -61,14 +61,14 @@ data. 
\code{\link{parse_jira_latest_date}} to retrieve the file path of the latest issue key Other downloaders: +\code{\link{download_jira_issues}()}, \code{\link{download_jira_issues_by_date}()}, -\code{\link{download_jira_issues_by_issue_key}()}, -\code{\link{download_jira_issues}()} +\code{\link{download_jira_issues_by_issue_key}()} Other jira: +\code{\link{download_jira_issues}()}, \code{\link{download_jira_issues_by_date}()}, -\code{\link{download_jira_issues_by_issue_key}()}, -\code{\link{download_jira_issues}()} +\code{\link{download_jira_issues_by_issue_key}()} } \concept{downloaders} \concept{jira} diff --git a/vignettes/download_mail.Rmd b/vignettes/download_mail.Rmd index a6c17804..94f516c6 100644 --- a/vignettes/download_mail.Rmd +++ b/vignettes/download_mail.Rmd @@ -11,7 +11,7 @@ vignette: > --- -```{r} +```{r eval=FALSE} rm(list = ls()) seed <- 1 set.seed(seed) @@ -23,18 +23,17 @@ set.seed(seed) require(stringi) require(XML) require(httr) + require(gt) ``` # Introduction + Mailing list data is stored in a variety of archives. See: - Mod Mbox: [Apache Geronimo](https://geronimo.apache.org/mailing-lists.html)). - Pipermail: [OpenSSL](https://mta.openssl.org/mailman/listinfo/). (More information on this in the sections below.) This notebook demonstrates how to download and refresh mailing list archives from Mod Mbox and Pipermail. -# Pipermail - -## Mailing List Organization Mailing lists are typically organized by topic or purpose. For example, the [OpenSSL project](https://www.openssl.org/community/mailinglists.html) maintains several mailing lists, each serving a different group: - **project-announce**: For important announcements. @@ -42,13 +41,18 @@ Mailing lists are typically organized by topic or purpose. For example, the [Ope - **project-project**: For project discussions. - **project-users**: For general user questions and discussions. +Mod Mbox archives also organize mailing lists by topic. 
The apache mailing list archives can be found at https://lists.apache.org/. + Each mailing list maintains archives of past messages, often organized by month and year. These archives can be accessed and downloaded for analysis. However, it is important to note that mailing list archives may be split into multiple formats or locations, and not all archives contain the same information. Different archives can differ in completeness, date ranges, and the data they contain. Some archives might lack important fields like "In-Reply-To," which is important for reconstructing message threads. It is, therefore, important the archive being used is carefully selected, since this effects the quality and completeness of analysis. +# Pipermail + ## Project Configuration File + To start, we load the project configuration file, which contains parameters for downloading the mailing list archives. Instead of hard-coding these values in the notebook, we store them in a project configuration file in YAML format. This makes the parameters easier to manage. Here is an example of the pipermail mailing list section from the configuration file (conf/helix.yml): -```{yaml} +``` # top-level key for mailing list config mailing_list: # for pipermail @@ -73,6 +77,7 @@ The configuration file contains the following parameters for each mailing list a By organizing the configuration in this way, you can manage multiple projects and mailing lists easily. The notebook reads these parameters and uses them to download and process the archives. ## Pipermail Downloader + The following code reads the configuration parameters for project_key_1 of pipermail: ```{r} @@ -85,7 +90,7 @@ save_folder_path <- conf[["mailing_list"]][["pipermail"]][["project_key_1"]][["s After setting the configurations above, you can download the archives using the download_pipermail() function, which downloads and saves .mbox files to the specified directory (save_folder_path). 
The .mbox files are named with the format kaiaulu_YYYYMM.mbox, where YYYYMM refers to the year and month of the archive. -```{r} +```{r eval=FALSE} # Download archives download_pipermail( mailing_list = mailing_list, @@ -100,14 +105,16 @@ download_pipermail( After running this function, the .mbox files will be saved in the specified directory with filenames like kaiaulu_202310.mbox, kaiaulu_202311.mbox, etc. ## Pipermail Refresher + In some cases, you may want to refresh the archive to ensure the most recent months are up-to-date or to handle updates to the mailing list. The refresh_pipermail() function helps automate this process. How refresh_pipermail Works 1. Checks if the folder is empty: If the folder is empty, it downloads archives starting from start_year_month to the current month using download_pipermail(). 2. Finds the most recent file: If the folder is not empty, the function checks for the most recent month’s file (based on the filename) and deletes it. 3. Redownloads from the most recent month: The function then redownloads the archive from the most recent month up to the current month. +# add warning for files do not exist -```{r} +```{r eval=FALSE} # Refresh archives refresh_pipermail( mailing_list = mailing_list, @@ -122,16 +129,13 @@ This function will ensure that the most recent archives are always up-to-date by # Mod Mbox -## Mailing List Organization -Mod Mbox archives also organize mailing lists by topic. The apache mailing list archives can be found at https://lists.apache.org/. - -Just like with Pipermail, mailing list archives in Mod Mbox can be split across different formats or locations, and vary in completeness and available metadata. It is important to select the appropriate archive that is compatible with Kaiaulu and suits your analysis needs. - ## Project Configuration File + Like in Pipermail, we load the configuration for Mod Mbox from the YAML file, which includes the mailing list URL, the date range, and the save folder path. 
Here's an example of the relevant section in the configuration file (conf/helix.yml): -```{yaml} + +``` # top-level key for mailing list config mailing_list: # for mod mbox @@ -148,7 +152,7 @@ The configuration parameters are the same as the ones explained in the section a The following code reads the configuration parameters: -```{r} +```{r eval=FALSE} conf <- yaml::read_yaml("../conf/helix.yml") mailing_list <- conf[["mailing_list"]][["mod_mbox"]][["project_key_1"]][["mailing_list"]] start_year_month <- conf[["mailing_list"]][["mod_mbox"]][["project_key_1"]][["start_year_month"]] @@ -162,9 +166,10 @@ save_folder_path <- conf[["mailing_list"]][["mod_mbox"]][["project_key_1"]][["sa - save_folder_path: The directory where the downloaded .mbox files will be saved. ## Mod Mbox Downloader + The download_mod_mbox() function downloads Mod Mbox archives by constructing URLs based on the mailing list and date range, saving them as .mbox files named kaiaulu_YYYYMM.mbox. -```{r} +```{r eval=FALSE} download_mod_mbox( mailing_list = mailing_list, start_year_month = start_year_month, @@ -179,13 +184,14 @@ After running the function, it constructs URLs like: https://lists.apache.org/ap and saves the files in the specified folder. ## Mod Mbox Refresher + To refresh these archives to ensure that you have the latest messages, you can use the refresh_mod_mbox function. This function works similarly to the Pipermail refresher. How refresh_mod_mbox Works 1. Checks if the folder is empty and, if so, downloads the archives starting from start_year_month to the current month by calling download_mod_mbox(). 2. If the folder contains files, it identifies the most recent one using the YYYYMM found in the filename. This file is deleted, and then redownloaded along with all future months. 
-```{r} +```{r eval=FALSE} refresh_mod_mbox( mailing_list = mailing_list, start_year_month = start_year_month, @@ -201,72 +207,96 @@ This ensures your archive is up-to-date, accounting for new data that may have b After downloading the mailing list archives as .mbox files, the next step is to parse these files to extract meaningful information for analysis. The parse_mbox() function utilizes the Perceval library to parse .mbox files and convert them into structured data tables. This enables easier manipulation and analysis of mailing list data. ## Mbox Parser + The parse_mbox() function takes an .mbox file and parses it into a structured data.table using the Perceval library. For the configuration, make sure you have the correct path to the Perceval library in the conf file. Here's an example of the relevant section in the tools.yml file: -```{yaml} + +``` perceval: /usr/local/bin/perceval ``` + And in the helix.yml configuration file: -```{yaml} + +``` mailing_list: # for mod mbox mod_mbox: project_key_1: - mbox_path: "../../extdata/save_mbox_mail.kaiaulu_202310.mbox" + mbox_file_path: "../../extdata/save_mbox_mail.kaiaulu_202310.mbox" ``` + perceval: found in tools.yml, this should be set to your local path to the perceval binary (use > which perceval to locate the path). -mbox_path: should point to the saved .mbox file that will be parsed. See the mbox_path in the mailing_list sections of helix.yml. +mbox_file_path: should point to the saved .mbox file that will be parsed. See the mbox_path in the mailing_list sections of helix.yml. 
Load the configuration: -```{r} + +```{r eval=FALSE} tools_config <- yaml::read_yaml("../tools.yml") parse_perceval_path <- tools_config[["perceval"]] conf <- yaml::read_yaml("../conf/helix.yml") -mailing_list <- conf[["mailing_list"]][["mod_mbox"]][["project_key_1"]][["mbox_path"]] +mbox_file_path <- conf[["mailing_list"]][["mod_mbox"]][["project_key_1"]][["mbox_file_path"]] ``` + Run the parser: -```{r} + +```{r eval=FALSE} parsed_mail <- parse_mbox( perceval_path = parse_perceval_path, - mbox_path = mbox_path + mbox_file_path = mbox_file_path ) ``` + This will store the parsed data into the parsed_mail variable. To view the table, use: -```{r} -View(parsed_mail) + +```{r eval=FALSE} +# Display the first 10 rows of the parsed data using gt +# Refer to the gt documentation for more options on displaying tables +parsed_mail %>% + head(10) %>% + gt() ``` ## Retrieve the Latest Mbox File + We can use the parse_mbox_latest_date() function to identify the most recent .mbox file in the specified folder. This can be useful when you want to automate the parsing of the latest data without manually specifying the file name. First, make sure that the save_folder_path is correctly set to the directory where your .mbox files are stored. -```{r} + +```{r eval=FALSE} # Get the latest mbox file latest_mbox_file <- parse_mbox_latest_date(save_folder_path = save_folder_path) print(latest_mbox_file) ``` This will output the name of the latest .mbox file based on the YYYYMM pattern in the filename. -We can use this to update mbox_path to point to the latest file, and call the parse_mbox() function to parse the latest data. -```{r} -# Update mbox_path to use the latest file -mbox_path <- file.path(save_folder_path, latest_mbox_file) -print(mbox_path) +We can use this to update mbox_file_path to point to the latest file, and call the parse_mbox() function to parse the latest data. 
+ +```{r eval=FALSE} +# Update mbox_file_path to use the latest file +mbox_file_path <- file.path(save_folder_path, latest_mbox_file) +print(mbox_file_path) ``` + To parse this file: -```{r} + +```{r eval=FALSE} # Parse the latest mbox file parsed_mail <- parse_mbox( perceval_path = parse_perceval_path, - mbox_path = mbox_path + mbox_file_path = mbox_file_path ) ``` + Now, parsed_mail contains the parsed data from the latest .mbox file. -```{r} -# View the parsed data -View(parsed_mail) + +```{r eval=FALSE} +# Display the first 10 rows of parsed_mail using gt +# Refer to the gt documentation for more options on displaying tables +parsed_mail %>% + head(10) %>% + gt() ``` From 2b6a963bf7bc0e02e626ab8cf93c1d423f040635 Mon Sep 17 00:00:00 2001 From: daomcgill <77309217+daomcgill@users.noreply.github.com> Date: Sun, 6 Oct 2024 11:41:54 -1000 Subject: [PATCH 24/80] i #284 Changed Notebook to Use Project Working Directory --- vignettes/download_mail.Rmd | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vignettes/download_mail.Rmd b/vignettes/download_mail.Rmd index 94f516c6..a3bd3485 100644 --- a/vignettes/download_mail.Rmd +++ b/vignettes/download_mail.Rmd @@ -81,7 +81,7 @@ By organizing the configuration in this way, you can manage multiple projects an The following code reads the configuration parameters for project_key_1 of pipermail: ```{r} -conf <- yaml::read_yaml("../conf/helix.yml") +conf <- yaml::read_yaml("conf/helix.yml") mailing_list <- conf[["mailing_list"]][["pipermail"]][["project_key_1"]][["mailing_list"]] start_year_month <- conf[["mailing_list"]][["pipermail"]][["project_key_1"]][["start_year_month"]] end_year_month <- conf[["mailing_list"]][["pipermail"]][["project_key_1"]][["end_year_month"]] @@ -153,7 +153,7 @@ The configuration parameters are the same as the ones explained in the section a The following code reads the configuration parameters: ```{r eval=FALSE} -conf <- yaml::read_yaml("../conf/helix.yml") +conf <- 
yaml::read_yaml("conf/helix.yml") mailing_list <- conf[["mailing_list"]][["mod_mbox"]][["project_key_1"]][["mailing_list"]] start_year_month <- conf[["mailing_list"]][["mod_mbox"]][["project_key_1"]][["start_year_month"]] end_year_month <- conf[["mailing_list"]][["mod_mbox"]][["project_key_1"]][["end_year_month"]] @@ -234,10 +234,10 @@ mbox_file_path: should point to the saved .mbox file that will be parsed. See th Load the configuration: ```{r eval=FALSE} -tools_config <- yaml::read_yaml("../tools.yml") +tools_config <- yaml::read_yaml("tools.yml") parse_perceval_path <- tools_config[["perceval"]] -conf <- yaml::read_yaml("../conf/helix.yml") +conf <- yaml::read_yaml("conf/helix.yml") mbox_file_path <- conf[["mailing_list"]][["mod_mbox"]][["project_key_1"]][["mbox_file_path"]] ``` From dc40dbaf80a049505628f87723a5a33d085a8c98 Mon Sep 17 00:00:00 2001 From: daomcgill <77309217+daomcgill@users.noreply.github.com> Date: Sun, 6 Oct 2024 13:39:38 -1000 Subject: [PATCH 25/80] i #284 Minor Fix: Folder Paths in helix.yml --- conf/helix.yml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/conf/helix.yml b/conf/helix.yml index adf7ef98..ea47741e 100644 --- a/conf/helix.yml +++ b/conf/helix.yml @@ -53,31 +53,31 @@ mailing_list: mailing_list: https://lists.apache.org/list.html?announce@apache.org start_year_month: 202310 end_year_month: 202405 - save_folder_path: "../../extdata/save_mbox_mail" + save_folder_path: "../extdata/save_mbox_mail" # mbox_path is for use only with parse_mbox() function. It is the file to parse. - mbox_file_path: "../../extdata/save_mbox_mail/kaiaulu_202410.mbox" + mbox_file_path: "../extdata/save_mbox_mail/kaiaulu_202410.mbox" project_key_2: mailing_list: https://lists.apache.org/list.html?dev@felix.apache.org start_year_month: 202201 end_year_month: 202401 - save_folder_path: "../../extdata/save_mbox_mail" + save_folder_path: "../extdata/save_mbox_mail" # mbox_path is for use only with parse_mbox() function. 
It is the file to parse. - mbox_file_path: "../../extdata/save_mbox_mail/kaiaulu_202210.mbox" + mbox_file_path: "../extdata/save_mbox_mail/kaiaulu_202210.mbox" pipermail: project_key_1: mailing_list: https://mta.openssl.org/pipermail/openssl-users/ start_year_month: 202310 end_year_month: 202405 - save_folder_path: "../../extdata/save_folder_mail" + save_folder_path: "../extdata/save_folder_mail" # mbox_path is for use only with parse_mbox() function. It is the file to parse. - mbox_file_path: "../../extdata/save_mbox_mail/kaiaulu_202310.mbox" + mbox_file_path: "../extdata/save_mbox_mail/kaiaulu_202310.mbox" project_key_2: mailing_list: https://mta.openssl.org/pipermail/openssl-project/ start_year_month: 202203 end_year_month: 202303 - save_folder_path: "../../extdata/save_folder_mail_2" + save_folder_path: "../extdata/save_folder_mail_2" # mbox_path is for use only with parse_mbox() function. It is the file to parse. - mbox_file_path: "../../extdata/save_mbox_mail/kaiaulu_202210.mbox" + mbox_file_path: "../extdata/save_mbox_mail/kaiaulu_202210.mbox" issue_tracker: jira: From d6f3b4197c6766d4aad1e070a40a1eb145b8197e Mon Sep 17 00:00:00 2001 From: Carlos Paradis Date: Wed, 9 Oct 2024 05:04:33 -0700 Subject: [PATCH 26/80] i #284 fixes incorrect call R/example.R contained an unused parameter, triggering warnings on build. 
Signed-off-by: Carlos Paradis --- R/example.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/example.R b/R/example.R index f70bd5c6..389bcf28 100644 --- a/R/example.R +++ b/R/example.R @@ -584,7 +584,7 @@ example_mailing_list_two_threads <- function(folder_path = "/tmp", folder_name, replies <- c(thread_1_reply_1, thread_1_reply_2, thread_2_reply_1) # Create mbox file from the list of replies - mbox_path <- make_mbox_mailing_list(replies = replies, folder_path = folder_path, file_name = file_name) + mbox_path <- make_mbox_mailing_list(replies = replies, file_name = file_name) return(mbox_path) } From f02ecb1d8c9d78348110f091b016dfa5f4c49700 Mon Sep 17 00:00:00 2001 From: Carlos Paradis Date: Wed, 9 Oct 2024 05:24:16 -0700 Subject: [PATCH 27/80] i #284 attempt fix on Actions Actions is failing due to being unable to install XML. Some new error yet again on Actions. Trying to make the version requirement less strict to see if it is able to install. Signed-off-by: Carlos Paradis --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 5a793074..9c783555 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -43,7 +43,7 @@ Imports: httr (>= 1.4.1), curl (>= 4.3), gh (>= 1.2.0), - XML (>= 3.99-0), + XML (>= 3.99), RColorBrewer (>= 1.1-2), cli (>= 2.0.2), docopt (>= 0.7.1) From 7f38d1c7bf1802035fc5bdca142f07dc16b07261 Mon Sep 17 00:00:00 2001 From: Carlos Paradis Date: Wed, 9 Oct 2024 05:47:02 -0700 Subject: [PATCH 28/80] i #284 incomplete storytelling review The story is a bit too dry and assumes much of the user. The file format stored is not brief. Modified it a bit to add an example on how it can be revised. 
Signed-off-by: Carlos Paradis --- vignettes/download_mail.Rmd | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/vignettes/download_mail.Rmd b/vignettes/download_mail.Rmd index a3bd3485..db622760 100644 --- a/vignettes/download_mail.Rmd +++ b/vignettes/download_mail.Rmd @@ -29,6 +29,33 @@ set.seed(seed) # Introduction +Open source projects require a means for developers to communicate. These may include mailing lists, issue trackers, discord, etc. This notebooks showcases how to download data from mailing list archives. Two often used archive types are [mod_mbox](https://httpd.apache.org/mod_mbox/) and [pipermail](https://en.wikipedia.org/wiki/GNU_Mailman#cite_note-9), which Kaiaulu offer functions to download data from. The former is commonly used by the Apache Software Foundation projects. The latter, is more commonly use in GNU related projects, but this can vary. + +# Project Configuration File + +Mailing List archives are hosted by their respective open source projects. Therefore, in order to use Kaiaulu downloaders to obtain mail data, you will need to access the respective open source project, and find out the URL tied to the archive you are interested. Generally, that is the developer mailing list, if your interest is to understand communication patterns among developers. Alternatively, if the focus of the research is Q&A from the userbase, then a user mailing list may make more sense. + +Because project lifetime can go as far as a few decades, to have the full picture of what communication took place in the project, if your analysis include a long period of time, you may need to download multiple archives to combine them after turning them into tables using Kaiaulu parser. + +The information you need to find out for each open source project is documented in Kaiaulu using a project configuration file format. 
For pipermail and mod_mbox this is as follows: + +``` +# top-level key for mailing list config +mailing_list: + # for pipermail + pipermail: + project_key_1: + mailing_list: https://mta.openssl.org/pipermail/openssl-users/ + start_year_month: 202310 + end_year_month: 202405 + save_folder_path: "../extdata/save_folder_mail" + +``` + +Regardless of which mail archive you choose, the downloaders will store the mail data in monthly files, in a `.mbox` format. This is a simple text file that contains some markings to identify the header of the e-mail containing title, authors, etc. You can open any of the .mbox downloaded files with any text editor. + +#### Edit below + Mailing list data is stored in a variety of archives. See: - Mod Mbox: [Apache Geronimo](https://geronimo.apache.org/mailing-lists.html)). - Pipermail: [OpenSSL](https://mta.openssl.org/mailman/listinfo/). @@ -81,7 +108,7 @@ By organizing the configuration in this way, you can manage multiple projects an The following code reads the configuration parameters for project_key_1 of pipermail: ```{r} -conf <- yaml::read_yaml("conf/helix.yml") +conf <- yaml::read_yaml("../conf/helix.yml") mailing_list <- conf[["mailing_list"]][["pipermail"]][["project_key_1"]][["mailing_list"]] start_year_month <- conf[["mailing_list"]][["pipermail"]][["project_key_1"]][["start_year_month"]] end_year_month <- conf[["mailing_list"]][["pipermail"]][["project_key_1"]][["end_year_month"]] From 309fa34d3dd7457ac72e41151bf14ffb208c5507 Mon Sep 17 00:00:00 2001 From: Carlos Paradis Date: Wed, 9 Oct 2024 05:58:57 -0700 Subject: [PATCH 29/80] i #284 downgrade version of R for XML In case the error of XML compile is tied to this issue: https://github.com/r-lib/actions/issues/559 revert to 4.1 to see if it solves the problem. 
Signed-off-by: Carlos Paradis --- .github/workflows/R-CMD-check.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/R-CMD-check.yml b/.github/workflows/R-CMD-check.yml index 2350bf81..770e1334 100644 --- a/.github/workflows/R-CMD-check.yml +++ b/.github/workflows/R-CMD-check.yml @@ -15,7 +15,7 @@ jobs: runs-on: macOS-13 strategy: matrix: - r-version: ['4.2'] + r-version: ['4.1'] steps: - uses: actions/checkout@v3 From e04bd3196467a959983fa729a64698097f230c61 Mon Sep 17 00:00:00 2001 From: Carlos Paradis Date: Wed, 9 Oct 2024 06:10:02 -0700 Subject: [PATCH 30/80] i #284 gcc not found on Actions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Issue seems to be tied to gcc compiler not working. Attempt to bump OS X version up rather than downgrade R. See GitHub Action for CHECK on the line: "checking whether the C compiler works... no" right before: "ERROR: configuration failed for package ‘XML’" Signed-off-by: Carlos Paradis --- .github/workflows/R-CMD-check.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/R-CMD-check.yml b/.github/workflows/R-CMD-check.yml index 770e1334..5d08f1cf 100644 --- a/.github/workflows/R-CMD-check.yml +++ b/.github/workflows/R-CMD-check.yml @@ -12,10 +12,10 @@ name: R-CMD-check jobs: R-CMD-check: - runs-on: macOS-13 + runs-on: macOS-14 strategy: matrix: - r-version: ['4.1'] + r-version: ['4.2'] steps: - uses: actions/checkout@v3 From dbd7092271de766e84fb9dc7032470d63e396c68 Mon Sep 17 00:00:00 2001 From: Nicholas Beydler Date: Wed, 9 Oct 2024 17:53:33 -1000 Subject: [PATCH 31/80] i #284 Refactored download_mail.Rmd - Refactored the download_mail.Rmd notebook to expect the use of the getters from R/config.R (i #230 contains the getter functions in R/config.R). 
--- vignettes/download_mail.Rmd | 33 ++++++++++++++------------------- 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/vignettes/download_mail.Rmd b/vignettes/download_mail.Rmd index db622760..28d135a6 100644 --- a/vignettes/download_mail.Rmd +++ b/vignettes/download_mail.Rmd @@ -86,8 +86,6 @@ mailing_list: pipermail: project_key_1: mailing_list: https://mta.openssl.org/pipermail/openssl-users/ - start_year_month: 202310 - end_year_month: 202405 save_folder_path: "../extdata/save_folder_mail" ``` @@ -108,11 +106,11 @@ By organizing the configuration in this way, you can manage multiple projects an The following code reads the configuration parameters for project_key_1 of pipermail: ```{r} -conf <- yaml::read_yaml("../conf/helix.yml") -mailing_list <- conf[["mailing_list"]][["pipermail"]][["project_key_1"]][["mailing_list"]] -start_year_month <- conf[["mailing_list"]][["pipermail"]][["project_key_1"]][["start_year_month"]] -end_year_month <- conf[["mailing_list"]][["pipermail"]][["project_key_1"]][["end_year_month"]] -save_folder_path <- conf[["mailing_list"]][["pipermail"]][["project_key_1"]][["save_folder_path"]] +conf <- parse_config("conf/helix.yml") +mailing_list <- get_pipermail_domain(conf, "project_key_1") +start_year_month <- 202310 +end_year_month <- 202405 +save_folder_path <- get_pipermail_path(conf, "project_key_1") ``` After setting the configurations above, you can download the archives using the download_pipermail() function, which downloads and saves .mbox files to the specified directory (save_folder_path). The .mbox files are named with the format kaiaulu_YYYYMM.mbox, where YYYYMM refers to the year and month of the archive. 
@@ -169,8 +167,6 @@ mailing_list: mod_mbox: project_key_1: mailing_list: https://lists.apache.org/list.html?announce@apache.org - start_year_month: 202310 - end_year_month: 202405 save_folder_path: "../../extdata/save_mbox_mail" ``` @@ -180,11 +176,11 @@ The configuration parameters are the same as the ones explained in the section a The following code reads the configuration parameters: ```{r eval=FALSE} -conf <- yaml::read_yaml("conf/helix.yml") -mailing_list <- conf[["mailing_list"]][["mod_mbox"]][["project_key_1"]][["mailing_list"]] -start_year_month <- conf[["mailing_list"]][["mod_mbox"]][["project_key_1"]][["start_year_month"]] -end_year_month <- conf[["mailing_list"]][["mod_mbox"]][["project_key_1"]][["end_year_month"]] -save_folder_path <- conf[["mailing_list"]][["mod_mbox"]][["project_key_1"]][["save_folder_path"]] +conf <- parse_config("conf/helix.yml") +mailing_list <- get_mbox_domain(conf, "project_key_1") +start_year_month <- 202310 +end_year_month <- 202405 +save_folder_path <- get_mbox_path(conf, "project_key_1") ``` - mailing_list: The URL of the Mod Mbox mailing list (e.g., https://lists.apache.org/list.html?announce@apache.org). @@ -261,12 +257,11 @@ mbox_file_path: should point to the saved .mbox file that will be parsed. 
See th Load the configuration: ```{r eval=FALSE} -tools_config <- yaml::read_yaml("tools.yml") -parse_perceval_path <- tools_config[["perceval"]] - -conf <- yaml::read_yaml("conf/helix.yml") -mbox_file_path <- conf[["mailing_list"]][["mod_mbox"]][["project_key_1"]][["mbox_file_path"]] +tools <- parse_config("tools.yml") +parse_perceval_path <- get_tool_project("perceval", tools) +conf <- parse_config("conf/helix.yml") +mbox_file_path <- get_mbox_input_file(conf, "project_key_1") ``` Run the parser: From c4b9d16f6186c04001da978d80ba7483ce0aece1 Mon Sep 17 00:00:00 2001 From: daomcgill <77309217+daomcgill@users.noreply.github.com> Date: Thu, 10 Oct 2024 12:24:49 -1000 Subject: [PATCH 32/80] i #284 Testing GitHub Actions after Merge --- conf/helix.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conf/helix.yml b/conf/helix.yml index 8c4d02e4..779d4b52 100644 --- a/conf/helix.yml +++ b/conf/helix.yml @@ -69,14 +69,14 @@ mailing_list: start_year_month: 202310 end_year_month: 202405 save_folder_path: "../extdata/save_folder_mail" - # mbox_path is for use only with parse_mbox() function. It is the file to parse. + # mbox_file_path is for use only with parse_mbox() function. It is the file to parse. mbox_file_path: "../extdata/save_mbox_mail/kaiaulu_202310.mbox" project_key_2: mailing_list: https://mta.openssl.org/pipermail/openssl-project/ start_year_month: 202203 end_year_month: 202303 save_folder_path: "../extdata/save_folder_mail_2" - # mbox_path is for use only with parse_mbox() function. It is the file to parse. + # mbox_file_path is for use only with parse_mbox() function. It is the file to parse. 
mbox_file_path: "../extdata/save_mbox_mail/kaiaulu_202210.mbox" issue_tracker: From 90b05ed883a57d014ae646a11de66e8e467bf5f1 Mon Sep 17 00:00:00 2001 From: daomcgill <77309217+daomcgill@users.noreply.github.com> Date: Thu, 10 Oct 2024 12:39:15 -1000 Subject: [PATCH 33/80] i #284 GH Actions (changed perceval path) --- tools.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools.yml b/tools.yml index 51f9155d..4515ae80 100644 --- a/tools.yml +++ b/tools.yml @@ -1,5 +1,5 @@ # https://github.com/chaoss/grimoirelab-perceval -perceval: /Users/dao/anaconda3/bin/perceval +perceval: ~/perceval/bin/perceval # https://github.com/multilang-depends/depends depends: ~/depends-0.9.6/depends.jar # https://github.com/tsantalis/RefactoringMiner#running-refactoringminer-from-the-command-line From 3e5f8f78e4e6708064678315a0d60a0552a49808 Mon Sep 17 00:00:00 2001 From: daomcgill <77309217+daomcgill@users.noreply.github.com> Date: Thu, 10 Oct 2024 12:47:03 -1000 Subject: [PATCH 34/80] i #284 Change Roxygen version --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 9c783555..4b9b0460 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -50,4 +50,4 @@ Imports: VignetteBuilder: knitr URL: https://github.com/sailuh/kaiaulu BugReports: https://github.com/sailuh/kaiaulu/issues -RoxygenNote: 7.3.2 +RoxygenNote: 7.2.3 From 4af2c21f8c5013adb43cfed839de0ac470bbf9bd Mon Sep 17 00:00:00 2001 From: daomcgill <77309217+daomcgill@users.noreply.github.com> Date: Thu, 10 Oct 2024 15:27:12 -1000 Subject: [PATCH 35/80] i #284 Update Notebook and config file - This should fail until the getters are merged. 
Signed-off-by: Dao McGill --- conf/helix.yml | 32 ++--- vignettes/download_mail.Rmd | 253 ++++++++++++++++++++---------------- 2 files changed, 155 insertions(+), 130 deletions(-) diff --git a/conf/helix.yml b/conf/helix.yml index 779d4b52..3a049411 100644 --- a/conf/helix.yml +++ b/conf/helix.yml @@ -51,33 +51,25 @@ mailing_list: mod_mbox: project_key_1: mailing_list: https://lists.apache.org/list.html?announce@apache.org - start_year_month: 202310 - end_year_month: 202405 - save_folder_path: "../extdata/save_mbox_mail" - # mbox_path is for use only with parse_mbox() function. It is the file to parse. - mbox_file_path: "../extdata/save_mbox_mail/kaiaulu_202410.mbox" + save_folder_path: ../../rawdata/kaiaulu/mod_mbox/save_mbox_mail/ + # mbox_file_path is for use only with parse_mbox() function. It is the file to parse + mbox_file_path: ../../rawdata/kaiaulu/mod_mbox/save_mbox_mail/kaiaulu.mbox project_key_2: mailing_list: https://lists.apache.org/list.html?dev@felix.apache.org - start_year_month: 202201 - end_year_month: 202401 - save_folder_path: "../extdata/save_mbox_mail" - # mbox_path is for use only with parse_mbox() function. It is the file to parse. - mbox_file_path: "../extdata/save_mbox_mail/kaiaulu_202210.mbox" + save_folder_path: ../../rawdata/kaiaulu/mod_mbox/save_mbox_mail_2/ + # mbox_file_path is for use only with parse_mbox() function. It is the file to parse + mbox_file_path: ../../rawdata/kaiaulu/mod_mbox/save_mbox_mail_2/kaiaulu.mbox pipermail: project_key_1: mailing_list: https://mta.openssl.org/pipermail/openssl-users/ - start_year_month: 202310 - end_year_month: 202405 - save_folder_path: "../extdata/save_folder_mail" - # mbox_file_path is for use only with parse_mbox() function. It is the file to parse. - mbox_file_path: "../extdata/save_mbox_mail/kaiaulu_202310.mbox" + save_folder_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail/ + # mbox_file_path is for use only with parse_mbox() function. 
It is the file to parse + mbox_file_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail/kaiaulu.mbox project_key_2: mailing_list: https://mta.openssl.org/pipermail/openssl-project/ - start_year_month: 202203 - end_year_month: 202303 - save_folder_path: "../extdata/save_folder_mail_2" - # mbox_file_path is for use only with parse_mbox() function. It is the file to parse. - mbox_file_path: "../extdata/save_mbox_mail/kaiaulu_202210.mbox" + save_folder_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail_2/ + # mbox_file_path is for use only with parse_mbox() function. It is the file to parse + mbox_file_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail_2/kaiaulu.mbox issue_tracker: jira: diff --git a/vignettes/download_mail.Rmd b/vignettes/download_mail.Rmd index 28d135a6..194c3d82 100644 --- a/vignettes/download_mail.Rmd +++ b/vignettes/download_mail.Rmd @@ -31,30 +31,7 @@ set.seed(seed) Open source projects require a means for developers to communicate. These may include mailing lists, issue trackers, discord, etc. This notebooks showcases how to download data from mailing list archives. Two often used archive types are [mod_mbox](https://httpd.apache.org/mod_mbox/) and [pipermail](https://en.wikipedia.org/wiki/GNU_Mailman#cite_note-9), which Kaiaulu offer functions to download data from. The former is commonly used by the Apache Software Foundation projects. The latter, is more commonly use in GNU related projects, but this can vary. -# Project Configuration File - -Mailing List archives are hosted by their respective open source projects. Therefore, in order to use Kaiaulu downloaders to obtain mail data, you will need to access the respective open source project, and find out the URL tied to the archive you are interested. Generally, that is the developer mailing list, if your interest is to understand communication patterns among developers. Alternatively, if the focus of the research is Q&A from the userbase, then a user mailing list may make more sense. 
- -Because project lifetime can go as far as a few decades, to have the full picture of what communication took place in the project, if your analysis include a long period of time, you may need to download multiple archives to combine them after turning them into tables using Kaiaulu parser. - -The information you need to find out for each open source project is documented in Kaiaulu using a project configuration file format. For pipermail and mod_mbox this is as follows: - -``` -# top-level key for mailing list config -mailing_list: - # for pipermail - pipermail: - project_key_1: - mailing_list: https://mta.openssl.org/pipermail/openssl-users/ - start_year_month: 202310 - end_year_month: 202405 - save_folder_path: "../extdata/save_folder_mail" - -``` - -Regardless of which mail archive you choose, the downloaders will store the mail data in monthly files, in a `.mbox` format. This is a simple text file that contains some markings to identify the header of the e-mail containing title, authors, etc. You can open any of the .mbox downloaded files with any text editor. - -#### Edit below +# Mailing List Organization Mailing list data is stored in a variety of archives. See: - Mod Mbox: [Apache Geronimo](https://geronimo.apache.org/mailing-lists.html)). @@ -72,12 +49,13 @@ Mod Mbox archives also organize mailing lists by topic. The apache mailing list Each mailing list maintains archives of past messages, often organized by month and year. These archives can be accessed and downloaded for analysis. However, it is important to note that mailing list archives may be split into multiple formats or locations, and not all archives contain the same information. Different archives can differ in completeness, date ranges, and the data they contain. Some archives might lack important fields like "In-Reply-To," which is important for reconstructing message threads. 
It is, therefore, important the archive being used is carefully selected, since this effects the quality and completeness of analysis. -# Pipermail +# Project Configuration File + +Mailing List archives are hosted by their respective open source projects. Therefore, in order to use Kaiaulu downloaders to obtain mail data, you will need to access the respective open source project, and find out the URL tied to the archive you are interested in. Generally, that is the developer mailing list, if your interest is to understand communication patterns among developers. Alternatively, if the focus of the research is Q&A from the user base, then a user mailing list may make more sense. -## Project Configuration File +Because project lifetime can go as far as a few decades, to have the full picture of what communication took place in the project you may need to download multiple archives and combine them, after turning them into tables using the Kaiaulu parser. -To start, we load the project configuration file, which contains parameters for downloading the mailing list archives. Instead of hard-coding these values in the notebook, we store them in a project configuration file in YAML format. This makes the parameters easier to manage. -Here is an example of the pipermail mailing list section from the configuration file (conf/helix.yml): +The information you need to find out for each open source project is documented in Kaiaulu using a project configuration file format. For pipermail and mod_mbox this is as follows: ``` # top-level key for mailing list config @@ -86,24 +64,48 @@ mailing_list: pipermail: project_key_1: mailing_list: https://mta.openssl.org/pipermail/openssl-users/ - save_folder_path: "../extdata/save_folder_mail" - + start_year_month: 202310 + end_year_month: 202405 + save_folder_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail/ + # mbox_file_path is for use only with parse_mbox() function.
It is the file to parse + mbox_file_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail/kaiaulu.mbox + # for mod mbox + mod_mbox: + apache_announce: + mailing_list: https://lists.apache.org/list.html?announce@apache.org + start_year_month: 202310 + end_year_month: 202405 + save_folder_path: ../../rawdata/kaiaulu/mod_mbox/save_mbox_mail/ + # mbox_file_path is for use only with parse_mbox() function. It is the file to parse + mbox_file_path: ../../rawdata/kaiaulu/mod_mbox/save_mbox_mail/kaiaulu.mbox ``` +Explanation: -The configuration file contains the following parameters for each mailing list archive: - +- mailing_list: The top-level key for mailing list configurations. - project_key_1: A unique key for the project. There can be multiple projects in both the pipermail and mod mbox sections. - pipermail/ mod_mbox: Indicates whether the setting are for pipermail or mod mbox. Although the parameters are the same, this helps to differentiate between the two types of mailing list archives. - mailing_list: The URL of the mailing list archive page. Note that this URL should point to the page containing links to the monthly archives (e.g. https://mta.openssl.org/pipermail/openssl-users/), not the top-level mailing list page that contains all the different types of archives (e.g. https://mta.openssl.org/mailman/listinfo/). - start_year_month: The starting date for downloading archives (in YYYYMM format). - end_year_month: The ending date for downloading archives (in YYYYMM format). - save_folder_path: The local directory where the downloaded archives will be saved (if you run the code in this notebook, the archives will be saved in a folder 'extdata', located in the parent directory of kaiaulu (wherever your kaiaulu folder is kept)). +- mbox_file_path: The path to the .mbox file used by the parse_mbox() function. + +Note: It is important that the paths specified in save_folder_path and mbox_file_path are accurate and do not conflict between projects.
By organizing the configuration in this way, you can manage multiple projects and mailing lists easily. The notebook reads these parameters and uses them to download and process the archives. -## Pipermail Downloader +Regardless of which mail archive you choose, the downloaders will store the mail data in monthly files, in a `.mbox` format. This is a simple text file that contains some markings to identify the header of the e-mail containing title, authors, etc. You can open any of the .mbox downloaded files with any text editor. + +## Pipermail Configuration -The following code reads the configuration parameters for project_key_1 of pipermail: +For Pipermail, we need to specify the project key, which is used to retrieve the configuration parameters for the specific project. The project key is used to identify the project in the configuration file. + +```{r} +# Define the project key +project_key <- "project_key_1" +``` + +Now, we can use the getter functions to retrieve the configuration parameters for the specified project key. ```{r} conf <- parse_config("conf/helix.yml") @@ -113,7 +115,86 @@ end_year_month <- 202405 save_folder_path <- get_pipermail_path(conf, "project_key_1") ``` -After setting the configurations above, you can download the archives using the download_pipermail() function, which downloads and saves .mbox files to the specified directory (save_folder_path). The .mbox files are named with the format kaiaulu_YYYYMM.mbox, where YYYYMM refers to the year and month of the archive. +Note that the date range is not set with a getter. The range for downloads changes often, and should be set manually using the YYYYMM format. + +Explanation of Getters: + +- get_pipermail_domain(config_file, project_key_index): Retrieves the mailing list URL. +- get_pipermail_path(config_file, project_key_index): Retrieves the local folder path for saving archives. 
+- get_pipermail_input_file(config_file, project_key_index): Retrieves the .mbox file path for parsing (parse_mbox function). + +## Mbox Configuration + +Similarly to Pipermail, we need to specify the project key for Mod Mbox. The project key is used to retrieve the configuration parameters for the specific project. + +```{r} +# Define the project key +project_key <- "project_key_1" +``` + +Use the getters to extract the parameters: + +```{r eval=FALSE} +conf <- parse_config("conf/helix.yml") +mailing_list <- get_mbox_domain(conf, "project_key_1") +start_year_month <- 202310 +end_year_month <- 202405 +save_folder_path <- get_mbox_path(conf, "project_key_1") +``` + +Explanation of Getters: + +- get_mbox_domain(config_file, project_key_index): Retrieves the mailing list URL. +- get_mbox_path(config_file, project_key_index): Retrieves the local folder path for saving archives. +- get_mbox_input_file(config_file, project_key_index): Retrieves the .mbox file path for parsing. + +start_year_month and end_year_month should be set manually, as with pipermail. + +## Tools Configuration + +In addition to the mailing list configurations, you need to specify the path to the perceval binary in tools.yml, which is used by the parse_mbox() function to parse .mbox files. It should look something like this: + +``` +perceval: /usr/local/bin/perceval +``` + +Now, you can load the configurations in your R script or notebook using the following code: + +```{r} +# Load tools configuration +tools <- parse_config("tools.yml") +parse_perceval_path <- get_tool("perceval", tools) + +# Load project configuration +conf <- parse_config("conf/helix.yml") +mbox_file_path <- get_mbox_input_file(conf, "project_key_1") +``` + +Explanation of Getters: + +parse_config(): Function to parse the YAML configuration files. +get_tool("perceval", tools): Retrieves the Perceval path from the tools configuration.
+get_mbox_input_file(conf, "project_key_1"): Retrieves the .mbox file path for project_key_1 from the helix configuration. + +Now that you have loaded the configurations, you can proceed to use them in downloading and parsing the mailing list archives. + +# Downloaders and Refreshers + +## Pipermail Downloader + +### How download_pipermail() Works +The download_pipermail() function downloads Pipermail archives from a specified mailing list within a given date range. Here's how it operates: + +- Archive Index Retrieval: It begins by downloading an HTML page that lists the URLs for the monthly archives, which are typically available in .txt or .gz formats. +- File Downloading: The function attempts to download the .txt file for each month. If the .txt file is unavailable, it falls back to downloading the .gz (gzipped) file. +- File Processing: If a .gz file is downloaded, the function unzips it and converts it into an .mbox file. The original .gz file is deleted after extraction to save space. +- File Saving: The downloaded .mbox files are saved in the specified folder with the naming convention kaiaulu_YYYYMM.mbox, where YYYYMM represents the year and month. +- Date Range Filtering: Only files within the specified start_year_month and end_year_month are downloaded. +- Error Handling: If both .txt and .gz formats fail to download for a particular month, a warning is issued indicating the missing month. +- Summary Output: At the end of the process, the function summarizes the downloads, indicating the range of dates present and any missing months. +- Set verbose to TRUE to see status updates and detailed output. + +### Example Usage ```{r eval=FALSE} # Download archives @@ -137,7 +218,8 @@ How refresh_pipermail Works 1. Checks if the folder is empty: If the folder is empty, it downloads archives starting from start_year_month to the current month using download_pipermail(). 2. 
Finds the most recent file: If the folder is not empty, the function checks for the most recent month’s file (based on the filename) and deletes it. 3. Redownloads from the most recent month: The function then redownloads the archive from the most recent month up to the current month. -# add warning for files do not exist + +### Example Usage ```{r eval=FALSE} # Refresh archives @@ -152,46 +234,22 @@ refresh_pipermail( This function will ensure that the most recent archives are always up-to-date by redownloading the current month's archive and adding any new months that have been added to the mailing list. -# Mod Mbox - -## Project Configuration File - -Like in Pipermail, we load the configuration for Mod Mbox from the YAML file, which includes the mailing list URL, the date range, and the save folder path. - -Here's an example of the relevant section in the configuration file (conf/helix.yml): - -``` -# top-level key for mailing list config -mailing_list: - # for mod mbox - mod_mbox: - project_key_1: - mailing_list: https://lists.apache.org/list.html?announce@apache.org - save_folder_path: "../../extdata/save_mbox_mail" - -``` - -The configuration parameters are the same as the ones explained in the section at the top of this notebook, except that the mailing_list should point to a Mod Mbox mailing list URL. - -The following code reads the configuration parameters: - -```{r eval=FALSE} -conf <- parse_config("conf/helix.yml") -mailing_list <- get_mbox_domain(conf, "project_key_1") -start_year_month <- 202310 -end_year_month <- 202405 -save_folder_path <- get_mbox_path(conf, "project_key_1") -``` +## Mod Mbox Downloader -- mailing_list: The URL of the Mod Mbox mailing list (e.g., https://lists.apache.org/list.html?announce@apache.org). -- start_year_month: The first month to download (format: YYYYMM). -- end_year_month: The last month to download (format: YYYYMM). -- save_folder_path: The directory where the downloaded .mbox files will be saved. 
+### How download_mod_mbox() Works +The download_mod_mbox() function downloads Mod Mbox archives from a specified Apache Pony Mail mailing list over a given date range: -## Mod Mbox Downloader +- URL Construction: It constructs the download URLs for each month based on the mailing list URL and the date range. +- File Downloading: Downloads the .mbox file for each month in the format "YYYY-MM". +- File Saving: Saves the downloaded .mbox files in the specified folder with the naming convention kaiaulu_YYYYMM.mbox. +- Date Range Looping: Iterates through each month between start_year_month and end_year_month. +- Error Handling: Issues a warning if a download fails for a specific month, indicating that the month's data may not exist. +- Summary Output: Provides a summary of the downloads, including any missing months. The download_mod_mbox() function downloads Mod Mbox archives by constructing URLs based on the mailing list and date range, saving them as .mbox files named kaiaulu_YYYYMM.mbox. +### Example Usage + ```{r eval=FALSE} download_mod_mbox( mailing_list = mailing_list, @@ -214,6 +272,8 @@ How refresh_mod_mbox Works 1. Checks if the folder is empty and, if so, downloads the archives starting from start_year_month to the current month by calling download_mod_mbox(). 2. If the folder contains files, it identifies the most recent one using the YYYYMM found in the filename. This file is deleted, and then redownloaded along with all future months. +### Example Usage + ```{r eval=FALSE} refresh_mod_mbox( mailing_list = mailing_list, @@ -225,46 +285,20 @@ refresh_mod_mbox( This ensures your archive is up-to-date, accounting for new data that may have been added to the mailing list since the last download. -# Parser +# Parsers After downloading the mailing list archives as .mbox files, the next step is to parse these files to extract meaningful information for analysis. 
The parse_mbox() function utilizes the Perceval library to parse .mbox files and convert them into structured data tables. This enables easier manipulation and analysis of mailing list data. ## Mbox Parser -The parse_mbox() function takes an .mbox file and parses it into a structured data.table using the Perceval library. +### ow parse_mbox() Works +- Perceval Integration: Interfaces with the Perceval library to parse the .mbox file. +- Flexible Parsing: Handles variations in .mbox file structures, which may have inconsistent fields due to different email headers. +- Data Extraction: Extracts key information such as email content, sender, recipients, dates, and threading information. +- Consistent Column Naming: Ensures that columns of interest are consistently renamed for clarity, even if the raw data varies. -For the configuration, make sure you have the correct path to the Perceval library in the conf file. -Here's an example of the relevant section in the tools.yml file: - -``` -perceval: /usr/local/bin/perceval -``` - -And in the helix.yml configuration file: - -``` -mailing_list: - # for mod mbox - mod_mbox: - project_key_1: - mbox_file_path: "../../extdata/save_mbox_mail.kaiaulu_202310.mbox" -``` - -perceval: found in tools.yml, this should be set to your local path to the perceval binary (use > which perceval to locate the path). -mbox_file_path: should point to the saved .mbox file that will be parsed. See the mbox_path in the mailing_list sections of helix.yml. - -Load the configuration: - -```{r eval=FALSE} -tools <- parse_config("tools.yml") -parse_perceval_path <- get_tool_project("perceval", tools) - -conf <- parse_config("conf/helix.yml") -mbox_file_path <- get_mbox_input_file(conf, "project_key_1") -``` - -Run the parser: +### Example Usage ```{r eval=FALSE} parsed_mail <- parse_mbox( @@ -273,7 +307,7 @@ parsed_mail <- parse_mbox( ) ``` -This will store the parsed data into the parsed_mail variable. 
To view the table, use: +This will store the parsed data into the parsed_mail variable. You can use the gt package to display the parsed data in a readable format: ```{r eval=FALSE} # Display the first 10 rows of the parsed data using gt @@ -283,20 +317,19 @@ parsed_mail %>% gt() ``` +Note: Displaying the entire dataset may not be practical if it's large. Showing a sample provides a glimpse of the structure. + ## Retrieve the Latest Mbox File We can use the parse_mbox_latest_date() function to identify the most recent .mbox file in the specified folder. This can be useful when you want to automate the parsing of the latest data without manually specifying the file name. First, make sure that the save_folder_path is correctly set to the directory where your .mbox files are stored. -```{r eval=FALSE} -# Get the latest mbox file -latest_mbox_file <- parse_mbox_latest_date(save_folder_path = save_folder_path) -print(latest_mbox_file) -``` This will output the name of the latest .mbox file based on the YYYYMM pattern in the filename. We can use this to update mbox_file_path to point to the latest file, and call the parse_mbox() function to parse the latest data. +### Example Usage + ```{r eval=FALSE} # Update mbox_file_path to use the latest file mbox_file_path <- file.path(save_folder_path, latest_mbox_file) From 8094402bd20ee86419399e5f417c3b45c5d56f84 Mon Sep 17 00:00:00 2001 From: daomcgill <77309217+daomcgill@users.noreply.github.com> Date: Mon, 14 Oct 2024 17:21:10 -1000 Subject: [PATCH 36/80] i #284 Final Updates for Mail Notebook --- vignettes/download_mail.Rmd | 82 ++++++++++++++++++++----------------- 1 file changed, 45 insertions(+), 37 deletions(-) diff --git a/vignettes/download_mail.Rmd b/vignettes/download_mail.Rmd index 194c3d82..9adfae1f 100644 --- a/vignettes/download_mail.Rmd +++ b/vignettes/download_mail.Rmd @@ -31,6 +31,8 @@ set.seed(seed) Open source projects require a means for developers to communicate. 
These may include mailing lists, issue trackers, discord, etc. This notebooks showcases how to download data from mailing list archives. Two often used archive types are [mod_mbox](https://httpd.apache.org/mod_mbox/) and [pipermail](https://en.wikipedia.org/wiki/GNU_Mailman#cite_note-9), which Kaiaulu offer functions to download data from. The former is commonly used by the Apache Software Foundation projects. The latter, is more commonly use in GNU related projects, but this can vary. +Each mailing list maintains archives of past messages, often organized by month and year. These archives can be accessed and downloaded for analysis. However, it is important to note that mailing list archives may be split into multiple formats or locations, and not all archives contain the same information. Different archives can differ in completeness, date ranges, and the data they contain. Some archives might lack important fields like "In-Reply-To," which is important for reconstructing message threads. It is, therefore, important that the archive being used is carefully selected, since this affects the quality and completeness of analysis. + # Mailing List Organization Mailing list data is stored in a variety of archives. See: @@ -47,8 +49,6 @@ Mailing lists are typically organized by topic or purpose. For example, the [Ope Mod Mbox archives also organize mailing lists by topic. The apache mailing list archives can be found at https://lists.apache.org/. -Each mailing list maintains archives of past messages, often organized by month and year. These archives can be accessed and downloaded for analysis. However, it is important to note that mailing list archives may be split into multiple formats or locations, and not all archives contain the same information. Different archives can differ in completeness, date ranges, and the data they contain. Some archives might lack important fields like "In-Reply-To," which is important for reconstructing message threads.
It is, therefore, important the archive being used is carefully selected, since this effects the quality and completeness of analysis. - # Project Configuration File Mailing List archives are hosted by their respective open source projects. Therefore, in order to use Kaiaulu downloaders to obtain mail data, you will need to access the respective open source project, and find out the URL tied to the archive you are interested in. Generally, that is the developer mailing list, if your interest is to understand communication patterns among developers. Alternatively, if the focus of the research is Q&A from the user base, then a user mailing list may make more sense. @@ -176,13 +176,14 @@ parse_config(): Function to parse the YAML configuration files. get_tool("perceval", tools): Retrieves the Perceval path from the tools configuration. get_mbox_input_file(conf, "project_key_1"): Retrieves the .mbox file path for project_key_1 from the helix configuration. -Now that you have loaded the configurations, you can proceed to use them in downloading and parsing the mailing list archives. - # Downloaders and Refreshers -## Pipermail Downloader +## Downloaders + +With the configurations loaded, we can proceed to download the mailing list archives. The downloaders are responsible for fetching the archives from the specified mailing lists and saving them locally in .mbox format. + +### Pipermail Downloader -### How download_pipermail() Works The download_pipermail() function downloads Pipermail archives from a specified mailing list within a given date range. Here's how it operates: - Archive Index Retrieval: It begins by downloading an HTML page that lists the URLs for the monthly archives, which are typically available in .txt or .gz formats. 
@@ -194,7 +195,7 @@ The download_pipermail() function downloads Pipermail archives from a specified - Summary Output: At the end of the process, the function summarizes the downloads, indicating the range of dates present and any missing months. - Set verbose to TRUE to see status updates and detailed output. -### Example Usage +#### Example Usage ```{r eval=FALSE} # Download archives @@ -210,33 +211,8 @@ download_pipermail( After running this function, the .mbox files will be saved in the specified directory with filenames like kaiaulu_202310.mbox, kaiaulu_202311.mbox, etc. -## Pipermail Refresher - -In some cases, you may want to refresh the archive to ensure the most recent months are up-to-date or to handle updates to the mailing list. The refresh_pipermail() function helps automate this process. - -How refresh_pipermail Works -1. Checks if the folder is empty: If the folder is empty, it downloads archives starting from start_year_month to the current month using download_pipermail(). -2. Finds the most recent file: If the folder is not empty, the function checks for the most recent month’s file (based on the filename) and deletes it. -3. Redownloads from the most recent month: The function then redownloads the archive from the most recent month up to the current month. - -### Example Usage - -```{r eval=FALSE} -# Refresh archives -refresh_pipermail( - mailing_list = mailing_list, - start_year_month = start_year_month, - save_folder_path = save_folder_path, - verbose = TRUE -) - -``` - -This function will ensure that the most recent archives are always up-to-date by redownloading the current month's archive and adding any new months that have been added to the mailing list. 
- -## Mod Mbox Downloader +### Mod Mbox Downloader -### How download_mod_mbox() Works The download_mod_mbox() function downloads Mod Mbox archives from a specified Apache Pony Mail mailing list over a given date range: - URL Construction: It constructs the download URLs for each month based on the mailing list URL and the date range. @@ -248,7 +224,7 @@ The download_mod_mbox() function downloads Mod Mbox archives from a specified Ap The download_mod_mbox() function downloads Mod Mbox archives by constructing URLs based on the mailing list and date range, saving them as .mbox files named kaiaulu_YYYYMM.mbox. -### Example Usage +#### Example Usage ```{r eval=FALSE} download_mod_mbox( @@ -264,7 +240,37 @@ download_mod_mbox( After running the function, it constructs URLs like: https://lists.apache.org/api/mbox.lua?list=announce@apache.org&date=2024-01 and saves the files in the specified folder. -## Mod Mbox Refresher +## Refreshers + +Over time, new messages are added to mailing lists. It's important to keep your local archives up-to-date to ensure that your analysis includes the latest communications. The refreshers are functions designed to update your existing archives efficiently. + +Mailing lists are dynamic, with new emails being added regularly. If you're conducting ongoing analysis or need the most recent data, it's important to refresh your downloaded archives. Manually redownloading all archives can be time-consuming and inefficient. The refresher functions automate this process by updating only the necessary parts of your archives, saving time and ensuring data completeness. + +### Pipermail Refresher + +In some cases, you may want to refresh the archive to ensure the most recent months are up-to-date or to handle updates to the mailing list. The refresh_pipermail() function helps automate this process. + +How refresh_pipermail Works +1. 
Checks if the folder is empty: If the folder is empty, it downloads archives starting from start_year_month to the current month using download_pipermail(). +2. Finds the most recent file: If the folder is not empty, the function checks for the most recent month’s file (based on the filename) and deletes it. +3. Redownloads from the most recent month: The function then redownloads the archive from the most recent month up to the current month. + +#### Example Usage + +```{r eval=FALSE} +# Refresh archives +refresh_pipermail( + mailing_list = mailing_list, + start_year_month = start_year_month, + save_folder_path = save_folder_path, + verbose = TRUE +) + +``` + +This function will ensure that the most recent archives are always up-to-date by redownloading the current month's archive and adding any new months that have been added to the mailing list. + +### Mod Mbox Refresher To refresh these archives to ensure that you have the latest messages, you can use the refresh_mod_mbox function. This function works similarly to the Pipermail refresher. @@ -272,7 +278,7 @@ How refresh_mod_mbox Works 1. Checks if the folder is empty and, if so, downloads the archives starting from start_year_month to the current month by calling download_mod_mbox(). 2. If the folder contains files, it identifies the most recent one using the YYYYMM found in the filename. This file is deleted, and then redownloaded along with all future months. -### Example Usage +#### Example Usage ```{r eval=FALSE} refresh_mod_mbox( @@ -291,7 +297,9 @@ After downloading the mailing list archives as .mbox files, the next step is to ## Mbox Parser -### ow parse_mbox() Works +After downloading the mailing list archives as .mbox files, the next step is to parse these files to extract meaningful information for analysis. The parse_mbox() function utilizes the Perceval library to parse .mbox files and convert them into structured data tables. This enables easier manipulation and analysis of mailing list data. 
+ +### How parse_mbox() Works - Perceval Integration: Interfaces with the Perceval library to parse the .mbox file. - Flexible Parsing: Handles variations in .mbox file structures, which may have inconsistent fields due to different email headers. - Data Extraction: Extracts key information such as email content, sender, recipients, dates, and threading information. From 5fb3af7537befdf8e97fc13e82d669106076bc00 Mon Sep 17 00:00:00 2001 From: Nicholas Beydler Date: Thu, 17 Oct 2024 17:56:32 -1000 Subject: [PATCH 37/80] i #284 Fixed Relative Paths in a Notebook - The project configuration section of a notebook was incorrectly using the project directory (kaiaulu/) as its working directory rather than the directory that it resides in (/vignettes/) as its working directory. --- vignettes/download_mail.Rmd | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vignettes/download_mail.Rmd b/vignettes/download_mail.Rmd index 9adfae1f..c9ac8be5 100644 --- a/vignettes/download_mail.Rmd +++ b/vignettes/download_mail.Rmd @@ -108,7 +108,7 @@ project_key <- "project_key_1" Now, we can use the getter functions to retrieve the configuration parameters for the specified project key. 
```{r} -conf <- parse_config("conf/helix.yml") +conf <- parse_config("../conf/helix.yml") mailing_list <- get_pipermail_domain(conf, "project_key_1") start_year_month <- 202310 end_year_month <- 202405 @@ -135,7 +135,7 @@ project_key <- "project_key_1" Use the getters to extract the parameters: ```{r eval=FALSE} -conf <- parse_config("conf/helix.yml") +conf <- parse_config("../conf/helix.yml") mailing_list <- get_mbox_domain(conf, "project_key_1") start_year_month <- 202310 end_year_month <- 202405 save_folder_path <- get_mbox_path(conf, "project_key_1") @@ -162,11 +162,11 @@ Now, you can load the configurations in your R script or notebook using the foll ```{r} # Load tools configuration -tools <- parse_config("tools.yml") +tools <- parse_config("../tools.yml") parse_perceval_path <- get_tool("perceval", tools) # Load project configuration -conf <- parse_config("conf/helix.yml") +conf <- parse_config("../conf/helix.yml") mbox_file_path <- get_mbox_input_file(conf, "project_key_1") ``` From e56848a64238a409f6f3388524b7ff841d470b5f Mon Sep 17 00:00:00 2001 From: Anthony Lau <98019016+anthonyjlau@users.noreply.github.com> Date: Mon, 11 Nov 2024 17:59:47 -1000 Subject: [PATCH 38/80] i #230 create config file interface This commit performs a major refactoring of how Kaiaulu interfaces with config files, and of the suggested folder organization to store rawdata and analysis. The configuration files are generalized to account for anomalous cases when performing project analysis. For instance, long-lived projects may contain multiple repositories, issue trackers, mailing lists, etc. The new template of the configuration file allows this information to be accounted for. Moreover, changes to the config template cascaded into changes to all notebooks, as the access to the config was hardcoded to the file organization. A new set of get_ functions should make this the last commit in which a change in template cascades into notebooks.
All actively maintained notebooks (not prefixed by underline under vignettes/) have been updated to use the get functions. Future changes, therefore, will only affect the get() functions in R/config.R. The folder organization of the filepaths has also been modified. Previously, filepaths assumed as default in the versioned config files suggested organizing code as rawdata/git_repo/projectX ; rawdata/jira/projectY. This organization was not practical for sharing data manually, as the user would need to zip several folders individually. The new organization is now rawdata/projectX/git_repo ; rawdata/projectX/jira. This means users only need to zip projectX and that will contain all the data wanted to be shared. A minor typo on graph.R was also fixed for merge function calls from `sorted=` to `sort=`. --- DESCRIPTION | 4 +- NAMESPACE | 51 + NEWS.md | 3 + R/config.R | 1215 +++++++++++++++++ R/graph.R | 4 +- README.md | 6 +- _pkgdown.yml | 59 + conf/ambari.yml | 128 +- conf/apr.yml | 114 +- conf/calculator.yml | 112 +- conf/camel.yml | 111 +- conf/chromium.yml | 140 +- conf/geronimo.yml | 108 +- conf/helix.yml | 123 +- conf/junit5.yml | 130 +- conf/kaiaulu.yml | 139 +- conf/openssl.yml | 138 +- conf/redhat.yml | 248 ++++ conf/spark.yml | 138 +- conf/thrift.yml | 132 +- conf/tomcat.yml | 141 +- conf/tse_apex.yml | 112 +- man/get_bugzilla_issue_comment_path.Rd | 23 + man/get_bugzilla_issue_path.Rd | 23 + man/get_bugzilla_project_key.Rd | 23 + man/get_cveid_regex.Rd | 22 + man/get_depends_code_language.Rd | 21 + man/get_depends_keep_dependencies_type.Rd | 21 + man/get_dv8_flaws_params.Rd | 21 + man/get_dv8_folder_path.Rd | 22 + man/get_enumeration_commits.Rd | 21 + man/get_file_extensions.Rd | 21 + man/get_filter_commit_size.Rd | 21 + man/get_git_branches.Rd | 21 + man/get_git_repo_path.Rd | 21 + man/get_github_commit_path.Rd | 24 + man/get_github_issue_event_path.Rd | 25 + man/get_github_issue_or_pr_comment_path.Rd | 24 + man/get_github_issue_path.Rd | 23 + 
man/get_github_issue_search_path.Rd | 24 + man/get_github_keys.Rd | 21 + man/get_github_owner.Rd | 23 + man/get_github_pull_request_path.Rd | 25 + man/get_github_repo.Rd | 23 + man/get_issue_id_regex.Rd | 21 + man/get_jira_domain.Rd | 23 + man/get_jira_issues_comments_path.Rd | 24 + man/get_jira_issues_path.Rd | 23 + man/get_jira_keys.Rd | 21 + man/get_jira_project_key_name.Rd | 23 + man/get_mbox_domain.Rd | 23 + man/get_mbox_input_file.Rd | 24 + man/get_mbox_key_indexes.Rd | 21 + man/get_mbox_path.Rd | 24 + man/get_nvdfeed_folder_path.Rd | 22 + man/get_pattern4_filepath.Rd | 21 + man/get_pattern4_folder_path.Rd | 21 + man/get_pipermail_domain.Rd | 23 + man/get_pipermail_input_file.Rd | 24 + man/get_pipermail_path.Rd | 24 + man/get_srcml_filepath.Rd | 21 + man/get_substring_filepath.Rd | 21 + man/get_tool_project.Rd | 24 + man/get_topics.Rd | 21 + man/get_uctags_line_types.Rd | 21 + man/get_understand_code_language.Rd | 21 + man/get_understand_keep_dependencies_type.Rd | 21 + man/get_understand_output_path.Rd | 21 + man/get_understand_project_path.Rd | 21 + man/get_window_end_commit.Rd | 21 + man/get_window_size.Rd | 20 + man/get_window_start_commit.Rd | 21 + man/parse_config.Rd | 21 + ...motif_analysis.Rmd => _motif_analysis.Rmd} | 25 +- ...ase.Rmd => _refactoringminer_showcase.Rmd} | 14 +- ....Rmd => _reply_communication_showcase.Rmd} | 11 +- ...howcase.Rmd => _social_smell_showcase.Rmd} | 38 +- vignettes/blamed_line_types_showcase.Rmd | 16 +- vignettes/bug_count.Rmd | 17 +- vignettes/causal_flaws.Rmd | 45 +- vignettes/community_detection_showcase.Rmd | 26 +- vignettes/depends_showcase.Rmd | 18 +- vignettes/download_github_comments.Rmd | 38 +- vignettes/download_jira_issues.Rmd | 22 +- vignettes/download_mod_mbox.Rmd | 52 - vignettes/dv8_showcase.Rmd | 26 +- vignettes/github_api_showcase.Rmd | 27 +- vignettes/gitlog_entity_showcase.Rmd | 16 +- vignettes/gitlog_showcase.Rmd | 16 +- vignettes/gitlog_vulnerabilities_showcase.Rmd | 16 +- 
vignettes/graph_gof_showcase.Rmd | 20 +- vignettes/issue_social_smell_showcase.Rmd | 35 +- vignettes/line_metrics_showcase.Rmd | 14 +- vignettes/text_gof_showcase.Rmd | 16 +- 94 files changed, 4364 insertions(+), 630 deletions(-) create mode 100644 R/config.R create mode 100644 conf/redhat.yml create mode 100644 man/get_bugzilla_issue_comment_path.Rd create mode 100644 man/get_bugzilla_issue_path.Rd create mode 100644 man/get_bugzilla_project_key.Rd create mode 100644 man/get_cveid_regex.Rd create mode 100644 man/get_depends_code_language.Rd create mode 100644 man/get_depends_keep_dependencies_type.Rd create mode 100644 man/get_dv8_flaws_params.Rd create mode 100644 man/get_dv8_folder_path.Rd create mode 100644 man/get_enumeration_commits.Rd create mode 100644 man/get_file_extensions.Rd create mode 100644 man/get_filter_commit_size.Rd create mode 100644 man/get_git_branches.Rd create mode 100644 man/get_git_repo_path.Rd create mode 100644 man/get_github_commit_path.Rd create mode 100644 man/get_github_issue_event_path.Rd create mode 100644 man/get_github_issue_or_pr_comment_path.Rd create mode 100644 man/get_github_issue_path.Rd create mode 100644 man/get_github_issue_search_path.Rd create mode 100644 man/get_github_keys.Rd create mode 100644 man/get_github_owner.Rd create mode 100644 man/get_github_pull_request_path.Rd create mode 100644 man/get_github_repo.Rd create mode 100644 man/get_issue_id_regex.Rd create mode 100644 man/get_jira_domain.Rd create mode 100644 man/get_jira_issues_comments_path.Rd create mode 100644 man/get_jira_issues_path.Rd create mode 100644 man/get_jira_keys.Rd create mode 100644 man/get_jira_project_key_name.Rd create mode 100644 man/get_mbox_domain.Rd create mode 100644 man/get_mbox_input_file.Rd create mode 100644 man/get_mbox_key_indexes.Rd create mode 100644 man/get_mbox_path.Rd create mode 100644 man/get_nvdfeed_folder_path.Rd create mode 100644 man/get_pattern4_filepath.Rd create mode 100644 man/get_pattern4_folder_path.Rd create 
mode 100644 man/get_pipermail_domain.Rd create mode 100644 man/get_pipermail_input_file.Rd create mode 100644 man/get_pipermail_path.Rd create mode 100644 man/get_srcml_filepath.Rd create mode 100644 man/get_substring_filepath.Rd create mode 100644 man/get_tool_project.Rd create mode 100644 man/get_topics.Rd create mode 100644 man/get_uctags_line_types.Rd create mode 100644 man/get_understand_code_language.Rd create mode 100644 man/get_understand_keep_dependencies_type.Rd create mode 100644 man/get_understand_output_path.Rd create mode 100644 man/get_understand_project_path.Rd create mode 100644 man/get_window_end_commit.Rd create mode 100644 man/get_window_size.Rd create mode 100644 man/get_window_start_commit.Rd create mode 100644 man/parse_config.Rd rename vignettes/{motif_analysis.Rmd => _motif_analysis.Rmd} (95%) rename vignettes/{refactoringminer_showcase.Rmd => _refactoringminer_showcase.Rmd} (67%) rename vignettes/{reply_communication_showcase.Rmd => _reply_communication_showcase.Rmd} (95%) rename vignettes/{social_smell_showcase.Rmd => _social_smell_showcase.Rmd} (97%) delete mode 100644 vignettes/download_mod_mbox.Rmd diff --git a/DESCRIPTION b/DESCRIPTION index f693b7a6..7c257f6b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -20,7 +20,9 @@ Authors@R: c( person('Nicole', 'Hoess', role = c('ctb')), person('Anthony', 'Lau', role = c('ctb')), person('Sean', 'Sunoo', role = c('ctb')), - person('Ian Jaymes', 'Iwata', role= c('ctb')) + person('Ian Jaymes', 'Iwata', role= c('ctb')), + person('Nicholas', 'Beydler', role = c('ctb')), + person('Mark', 'Burgess', role = c('ctb')) ) Maintainer: Carlos Paradis License: MPL-2.0 | file LICENSE diff --git a/NAMESPACE b/NAMESPACE index 2e3b17bf..bc0561fc 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -48,7 +48,57 @@ export(filter_by_file_extension) export(filter_by_filepath_substring) export(filter_by_last_files_change) export(format_name_email) +export(get_bugzilla_issue_comment_path) +export(get_bugzilla_issue_path) 
+export(get_bugzilla_project_key) +export(get_cveid_regex) export(get_date_from_commit_hash) +export(get_depends_code_language) +export(get_depends_keep_dependencies_type) +export(get_dv8_flaws_params) +export(get_dv8_folder_path) +export(get_enumeration_commits) +export(get_file_extensions) +export(get_filter_commit_size) +export(get_git_branches) +export(get_git_repo_path) +export(get_github_commit_path) +export(get_github_issue_event_path) +export(get_github_issue_or_pr_comment_path) +export(get_github_issue_path) +export(get_github_issue_search_path) +export(get_github_keys) +export(get_github_owner) +export(get_github_pull_request_path) +export(get_github_repo) +export(get_issue_id_regex) +export(get_jira_domain) +export(get_jira_issues_comments_path) +export(get_jira_issues_path) +export(get_jira_keys) +export(get_jira_project_key_name) +export(get_mbox_domain) +export(get_mbox_input_file) +export(get_mbox_key_indexes) +export(get_mbox_path) +export(get_nvdfeed_folder_path) +export(get_pattern4_filepath) +export(get_pattern4_folder_path) +export(get_pipermail_domain) +export(get_pipermail_input_file) +export(get_pipermail_path) +export(get_srcml_filepath) +export(get_substring_filepath) +export(get_tool_project) +export(get_topics) +export(get_uctags_line_types) +export(get_understand_code_language) +export(get_understand_keep_dependencies_type) +export(get_understand_output_path) +export(get_understand_project_path) +export(get_window_end_commit) +export(get_window_size) +export(get_window_start_commit) export(git_add) export(git_blame) export(git_checkout) @@ -110,6 +160,7 @@ export(parse_bugzilla_rest_comments) export(parse_bugzilla_rest_issues) export(parse_bugzilla_rest_issues_comments) export(parse_commit_message_id) +export(parse_config) export(parse_dependencies) export(parse_dv8_architectural_flaws) export(parse_dv8_clusters) diff --git a/NEWS.md b/NEWS.md index 5da1b52c..7877d397 100644 --- a/NEWS.md +++ b/NEWS.md @@ -3,6 +3,7 @@ __kaiaulu 
0.0.0.9700 (in development)__ ### NEW FEATURES + * `config.R` now contains a set of getter functions used to centralize the gathering of configuration data and these getter functions are used to refactor configuration file information gathering. For example, loading configuration file information with variable assignment is as follows `git_repo_path <- config_file[["version_control"]][["log"]]` but refactoring with a config.R getter function becomes `git_repo_path <- get_git_repo_path(config_file)`. [#230](https://github.com/sailuh/kaiaulu/issues/230) * `refresh_jira_issues()` had been added. It is a wrapper function for the previous downloader and downloads only issues greater than the greatest key already downloaded. [#275](https://github.com/sailuh/kaiaulu/issues/275) * `download_jira_issues()`, `download_jira_issues_by_issue_key()`, and `download_jira_issues_by_date()` has been added. This allows for downloading of Jira issues without the use of JirAgileR and specification of issue Id and created ranges. It also interacts with `parse_jira_latest_date()` to implement a refresh capability. [#275](https://github.com/sailuh/kaiaulu/issues/275) * `make_jira_issue()` and `make_jira_issue_tracker()` no longer create fake issues following JirAgileR format, but instead the raw data obtained from JIRA API. This is compatible with the new parser function for JIRA. [#277](https://github.com/sailuh/kaiaulu/issues/277) @@ -46,6 +47,7 @@ __kaiaulu 0.0.0.9700 (in development)__ * graph_to_dsmj is now vectorized, increasing performance [#209](https://github.com/sailuh/kaiaulu/issues/209) * Bugzilla API now allows for output file to be specified. [#202](https://github.com/sailuh/kaiaulu/issues/202) * Paired parser functions now expects a filepath instead of a json string character. [#202](https://github.com/sailuh/kaiaulu/issues/202) + * refactored file organization in config files for clearer hierarchy. 
[#230](https://github.com/sailuh/kaiaulu/issues/230) ### BUG FIXES @@ -222,6 +224,7 @@ __kaiaulu [0.0.0.9000](https://github.com/sailuh/kaiaulu/milestone/1) (04/24/202 * lubridate dependency was removed, this package now uses base R POSIXct to handle dates. [#13](https://github.com/sailuh/kaiaulu/issues/13) * stringr was replaced by stringi to respect license terms of this and stringr packages. [#21](https://github.com/sailuh/kaiaulu/issues/21) + ### BUG FIXES * non defined function parameter on mbox has been fixed. [#25](https://github.com/sailuh/kaiaulu/issues/25) diff --git a/R/config.R b/R/config.R new file mode 100644 index 00000000..4b80ec14 --- /dev/null +++ b/R/config.R @@ -0,0 +1,1215 @@ +# Kaiaulu - https://github.com/sailuh/kaiaulu +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at https://mozilla.org/MPL/2.0/. + + +########## Configuration File Parser Functions ########## + +#' Returns the parsed configuration file (.yml). +#' +#' @description The input file is expected to be in the .yml format. +#' The function returns a parsed version of the input .yml file, and it will +#' inform the user if the input .yml file path does not exist. The contents +#' of the input .yml file may contain machine-dependent paths that may need to +#' be modified by the user. +#' +#' @param config_path The path of the config file from the kaiaulu directory (e.g. "conf/kaiaulu.yml"). +#' @return The parsed config file whose path is specified by `config_path`. +#' @export +parse_config <- function(config_path) { + + conf <- yaml::read_yaml(config_path) + + if (is.null(conf)) { + warning("Path does not exist.") + } + + return(conf) +} + +##### Git Getter Functions ##### + +#' Returns the path to the .git of the project repository that is being analyzed. 
+#' +#' @description This function returns the specific path to the .git of the +#' project repository that is being analyzed specified in the input parameter +#' `config_file`. The input, `config_file` must be a parsed configuration file. +#' The function will inform the user if the .git path of the project repository +#' exists in the parsed configuration file, `config_file`. +#' +#' @param config_file The parsed configuration file obtained from \code{\link{parse_config}}. +#' @return The local git repository path specified in `config_file`. +#' @export +get_git_repo_path <- function(config_file) { + + git_repo_path <- config_file[["version_control"]][["log"]] + + if (is.null(git_repo_path)) { + warning("Attribute does not exist in the configuration file.") + } + + return(git_repo_path) +} + +#' Returns the list of git branches used for analysis in the current project. +#' +#' @description This function returns a list of the git branches used for +#' analysis in the current project specified in the input parameter +#' `config_file`. The input, `config_file` must be a parsed configuration file. +#' The function will inform the user if the list of branches to be analyzed +#' exists in the parsed configuration file, `config_file`. +#' +#' @param config_file The parsed configuration file obtained from \code{\link{parse_config}}. +#' @return The list of git branches. +#' @export +get_git_branches <- function(config_file) { + + git_branch <- config_file[["version_control"]][["branch"]] + + if (is.null(git_branch)) { + warning("Attribute does not exist in the configuration file.") + } + + return(git_branch) +} + + + +##### Mailing List Functions Start ##### + +#' Returns the list of mailing list mod mbox project keys. +#' +#' @description This function returns the list of mailing list mod mbox project +#' keys, that is specified in the input parameter `config_file`. The input, +#' `config_file` must be a parsed configuration file. 
The function will inform +#' the user if the project keys exist in the parsed configuration +#' file, `config_file`. +#' +#' @param config_file The parsed configuration file obtained from \code{\link{parse_config}}. +#' @return The list of mod mbox mailing list keys. +#' @export +get_mbox_key_indexes <- function(config_file) { + + mbox_keys <- config_file[["mailing_list"]][["mod_mbox"]] + + if (is.null(mbox_keys)) { + warning("Attribute does not exist in the configuration file.") + } + + return(mbox_keys) +} + +#' Returns the URL to the archives for mbox for a specific project key. +#' +#' @description This function returns the URL to the archives for a specific +#' project key, `project_key_index`, that is specified in the input parameter +#' `config_file`. The input, `config_file` must be a parsed configuration file. +#' The function will inform the user if the specific URL to the archives for +#' mbox exists in the parsed configuration file, `config_file`. +#' +#' @param config_file The parsed configuration file obtained from \code{\link{parse_config}}. +#' @param project_key_index The name of the index of the project key (e.g. "project_key_1" or "project_key_2"). +#' @return The URL of the mbox mailing list archive for project specified by key `project_key_index`. +#' @export +get_mbox_domain <- function(config_file, project_key_index) { + + mbox_url <- config_file[["mailing_list"]][["mod_mbox"]][[project_key_index]][["mailing_list"]] + + if (is.null(mbox_url)) { + warning("Attribute does not exist in the configuration file.") + } + + return(mbox_url) +} + +#' Returns the local folder path to store mbox data for a specific project key. +#' +#' @description This function returns the local folder path used to store +#' mbox data for a specific project key, `project_key_index`, that is specified +#' in the input parameter `config_file`. The input, `config_file` must be a +#' parsed configuration file. 
The function will inform the user if the specific +#' local folder path to store mbox data exists in the parsed configuration +#' file, `config_file`. +#' +#' @param config_file The parsed configuration file obtained from \code{\link{parse_config}}. +#' @param project_key_index The name of the index of the project key (e.g. "project_key_1" or "project_key_2"). +#' @return The local mbox path for project specified by key `project_key_index`. +#' @export +get_mbox_path <- function(config_file, project_key_index) { + + mbox_path <- config_file[["mailing_list"]][["mod_mbox"]][[project_key_index]][["save_folder_path"]] + + if (is.null(mbox_path)) { + warning("Attribute does not exist in the configuration file.") + } + + return(mbox_path) +} + +#' Returns the local input file for mbox for a specific project key. +#' +#' @description This function returns the local file used for input for +#' mbox for a specific project key, `project_key_index`, that is specified +#' in the input parameter `config_file`. The input, `config_file` must be a +#' parsed configuration file. The function will inform the user if the specific +#' local input file path for mbox exists in the parsed configuration file, +#' `config_file`. +#' +#' @param config_file The parsed configuration file obtained from \code{\link{parse_config}}. +#' @param project_key_index The name of the index of the project key (e.g. "project_key_1" or "project_key_2"). +#' @return The local input file mbox path for project specified by key `project_key_index`. +#' @export +get_mbox_input_file <- function(config_file, project_key_index) { + + mbox_input <- config_file[["mailing_list"]][["mod_mbox"]][[project_key_index]][["mbox_file_path"]] + + if (is.null(mbox_input)) { + warning("Attribute does not exist in the configuration file.") + } + + return(mbox_input) +} + +#' Returns the URL to the archives for pipermail for a specific project key. 
+#' +#' @description This function returns the URL to the archives for a specific +#' project key, `project_key_index`, that is specified in the input parameter +#' `config_file`. The input, `config_file` must be a parsed configuration file. +#' The function will inform the user if the specific URL to the archives for +#' pipermail exists in the parsed configuration file, `config_file`. +#' +#' @param config_file The parsed configuration file obtained from \code{\link{parse_config}}. +#' @param project_key_index The name of the index of the project key (e.g. "project_key_1" or "project_key_2"). +#' @return The URL of the pipermail mailing list archive for project specified by key `project_key_index`. +#' @export +get_pipermail_domain <- function(config_file, project_key_index) { + + pipermail_url <- config_file[["mailing_list"]][["pipermail"]][[project_key_index]][["mailing_list"]] + + if (is.null(pipermail_url)) { + warning("Attribute does not exist in the configuration file.") + } + + return(pipermail_url) +} + +#' Returns the local folder path to store pipermail data for a specific project key. +#' +#' @description This function returns the local folder path used to store +#' pipermail data for a specific project key, `project_key_index`, that is specified +#' in the input parameter `config_file`. The input, `config_file` must be a +#' parsed configuration file. The function will inform the user if the specific +#' local folder path to store pipermail data exists in the parsed configuration +#' file, `config_file`. +#' +#' @param config_file The parsed configuration file obtained from \code{\link{parse_config}}. +#' @param project_key_index The name of the index of the project key (e.g. "project_key_1" or "project_key_2"). +#' @return The local pipermail path for project specified by key `project_key_index`. 
+#' @export +get_pipermail_path <- function(config_file, project_key_index) { + + pipermail_path <- config_file[["mailing_list"]][["pipermail"]][[project_key_index]][["save_folder_path"]] + + if (is.null(pipermail_path)) { + warning("Attribute does not exist in the configuration file.") + } + + return(pipermail_path) +} + +#' Returns the local input file for pipermail for a specific project key. +#' +#' @description This function returns the local file used for input for +#' pipermail for a specific project key, `project_key_index`, that is specified +#' in the input parameter `config_file`. The input, `config_file` must be a +#' parsed configuration file. The function will inform the user if the specific +#' local input file path for pipermail exists in the parsed configuration file, +#' `config_file`. +#' +#' @param config_file The parsed configuration file obtained from \code{\link{parse_config}}. +#' @param project_key_index The name of the index of the project key (e.g. "project_key_1" or "project_key_2"). +#' @return The local input file pipermail path for project specified by key `project_key_index`. +#' @export +get_pipermail_input_file <- function(config_file, project_key_index) { + + pipermail_input <- config_file[["mailing_list"]][["pipermail"]][[project_key_index]][["mbox_file_path"]] + + if (is.null(pipermail_input)) { + warning("Attribute does not exist in the configuration file.") + } + + return(pipermail_input) +} + +##### Mailing List Functions End ##### + +##### Issue Tracker Functions Start ##### + +##### Jira Functions ##### + +#' Returns the list of Jira issue tracker project keys. +#' +#' @description This function returns the list of Jira issue tracker project +#' keys, that is specified in the input parameter `config_file`. The input, +#' `config_file` must be a parsed configuration file. The function will inform +#' the user if the project keys exist in the parsed configuration +#' file, `config_file`. 
+#' +#' @param config_file The parsed configuration file obtained from \code{\link{parse_config}}. +#' @return The list of Jira issue tracker project keys. +#' @export +get_jira_keys <- function(config_file) { + + jira_key <- config_file[["issue_tracker"]][["jira"]] + + if (is.null(jira_key)) { + warning("Attribute does not exist in the configuration file.") + } + + return(jira_key) +} + +#' Returns the Jira project domain for a specific project key. +#' +#' @description This function returns the Jira project domain for a specific +#' project key, that is specified in the input parameter `config_file`. +#' The input, `config_file` must be a parsed configuration file. The function +#' will inform the user if the domain exists in the parsed configuration file, +#' `config_file`. +#' +#' @param config_file The parsed configuration file obtained from \code{\link{parse_config}}. +#' @param project_key_index The name of the index of the project key (e.g. "project_key_1" or "project_key_2"). +#' @return The Jira domain for project specified by key `project_key_index`. +#' @export +get_jira_domain <- function(config_file, project_key_index) { + + domain <- config_file[["issue_tracker"]][["jira"]][[project_key_index]][["domain"]] + + if (is.null(domain)) { + warning("Attribute does not exist in the configuration file.") + } + + return(domain) +} + +#' Returns the name of the Jira project key for a specific project key. +#' +#' @description This function returns the Jira project key name for a specific +#' project key, that is specified in the input parameter `config_file`. +#' The input, `config_file` must be a parsed configuration file. The function +#' will inform the user if the project key name exists in the parsed +#' configuration file, `config_file`. +#' +#' @param config_file The parsed configuration file obtained from \code{\link{parse_config}}. +#' @param project_key_index The name of the index of the project key (e.g. "project_key_1" or "project_key_2"). 
+#' @return The Jira project key name for project specified by key `project_key_index`. +#' @export +get_jira_project_key_name <- function(config_file, project_key_index) { + + name <- config_file[["issue_tracker"]][["jira"]][[project_key_index]][["project_key"]] + + if (is.null(name)) { + warning("Attribute does not exist in the configuration file.") + } + + return(name) +} + +#' Returns the local folder path for Jira issues for a specific project key. +#' +#' @description This function returns the folder path for Jira issues for a +#' specific project key, that is specified in the input parameter `config_file`. +#' The input, `config_file` must be a parsed configuration file. The function +#' will inform the user if the folder path for Jira issues exists in the parsed +#' configuration file, `config_file`. +#' +#' @param config_file The parsed configuration file obtained from \code{\link{parse_config}}. +#' @param project_key_index The name of the index of the project key (e.g. "project_key_1" or "project_key_2"). +#' @return The Jira issue folder path for project specified by key `project_key_index`. +#' @export +get_jira_issues_path <- function(config_file, project_key_index) { + + jira_issues_path <- config_file[["issue_tracker"]][["jira"]][[project_key_index]][["issues"]] + + if (is.null(jira_issues_path)) { + warning("Attribute does not exist in the configuration file.") + } + + return(jira_issues_path) +} + +#' Returns the local folder path for Jira issue comments for a specific +#' project key. +#' +#' @description This function returns the local folder path for Jira issue +#' comments for a specific project key, that is specified in the input +#' parameter `config_file`. The input, `config_file` must be a parsed +#' configuration file. The function will inform the user if the local folder +#' path for the comments exists in the parsed configuration file, `config_file`. 
+#' +#' @param config_file The parsed configuration file obtained from \code{\link{parse_config}}. +#' @param project_key_index The name of the index of the project key (e.g. "project_key_1" or "project_key_2"). +#' @return The folder path for Jira issue comments for project specified by key `project_key_index`. +#' @export +get_jira_issues_comments_path <- function(config_file, project_key_index) { + + jira_issue_comments_path <- config_file[["issue_tracker"]][["jira"]][[project_key_index]][["issue_comments"]] + + if (is.null(jira_issue_comments_path)) { + warning("Attribute does not exist in the configuration file.") + } + + return(jira_issue_comments_path) +} + +##### Github Functions ##### + +#' Returns the list of GitHub issue tracker project keys. +#' +#' @description This function returns the list of GitHub issue tracker project +#' keys, that is specified in the input parameter `config_file`. The input, +#' `config_file` must be a parsed configuration file. The function will inform +#' the user if the project keys exist in the parsed configuration +#' file, `config_file`. +#' +#' @param config_file The parsed configuration file obtained from \code{\link{parse_config}}. +#' @return The list of GitHub issue tracker project keys. +#' @export +get_github_keys <- function(config_file) { + + keys <- config_file[["issue_tracker"]][["github"]] + + if (is.null(keys)) { + warning("Attribute does not exist in the configuration file.") + } + + return(keys) +} + +#' Returns the owner for a GitHub repository for a specific project key. +#' +#' @description This function returns the owner for a GitHub repository for a +#' specific project key, that is specified in the input parameter `config_file`. +#' The input, `config_file` must be a parsed configuration file. The function +#' will inform the user if the owner for the GitHub repository exists in the +#' parsed configuration file, `config_file`. 
+#' +#' @param config_file The parsed configuration file obtained from \code{\link{parse_config}}. +#' @param project_key_index The name of the index of the project key (e.g. "project_key_1" or "project_key_2"). +#' @return The GitHub project owner name for project specified by key `project_key_index`. +#' @export +get_github_owner <- function(config_file, project_key_index) { + + owner <- config_file[["issue_tracker"]][["github"]][[project_key_index]][["owner"]] + + if (is.null(owner)) { + warning("Attribute does not exist in the configuration file.") + } + + return(owner) +} + +#' Returns the name of the GitHub repository for a specific project key. +#' +#' @description This function returns the name of the GitHub repository for a +#' specific project key, that is specified in the input parameter `config_file`. +#' The input, `config_file` must be a parsed configuration file. The function +#' will inform the user if the name of the GitHub repository exists in the +#' parsed configuration file, `config_file`. +#' +#' @param config_file The parsed configuration file obtained from \code{\link{parse_config}}. +#' @param project_key_index The name of the index of the project key (e.g. "project_key_1" or "project_key_2"). +#' @return The name of the GitHub repository for project specified by key `project_key_index`. +#' @export +get_github_repo <- function(config_file, project_key_index) { + + repo <- config_file[["issue_tracker"]][["github"]][[project_key_index]][["repo"]] + + if (is.null(repo)) { + warning("Attribute does not exist in the configuration file.") + } + + return(repo) +} + +#' Returns the local folder path for GitHub issues for a specific project key. +#' +#' @description This function returns the local folder path for GitHub issues +#' for a specific project key, that is specified in the input parameter +#' `config_file`. The input, `config_file` must be a parsed configuration file. 
+#' The function will inform the user if the folder path for GitHub issues exists +#' in the parsed configuration file, `config_file`. +#' +#' @param config_file The parsed configuration file obtained from \code{\link{parse_config}}. +#' @param project_key_index The name of the index of the project key (e.g. "project_key_1" or "project_key_2"). +#' @return The local folder path for GitHub issues for project specified by key `project_key_index`. +#' @export +get_github_issue_path <- function(config_file, project_key_index) { + + issue_path <- config_file[["issue_tracker"]][["github"]][[project_key_index]][["issue"]] + + if (is.null(issue_path)) { + warning("Attribute does not exist in the configuration file.") + } + + return(issue_path) +} + +#' Returns the local folder path for GitHub Issue or Pull Request comments for +#' a specific project key. +#' +#' @description This function returns the local folder path for GitHub Issue or +#' Pull Request comments for a specific project key, that is specified in the +#' input parameter `config_file`. The input, `config_file` must be a parsed +#' configuration file. The function will inform the user if the local folder +#' path for the comments exists in the parsed configuration file, `config_file`. +#' +#' @param config_file The parsed configuration file obtained from \code{\link{parse_config}}. +#' @param project_key_index The name of the index of the project key (e.g. "project_key_1" or "project_key_2"). +#' @return The local folder path for GitHub Issues or PR comments for project specified by key `project_key_index`. 
+#' @export +get_github_issue_or_pr_comment_path <- function(config_file, project_key_index) { + + issue_or_pr_comment_path <- config_file[["issue_tracker"]][["github"]][[project_key_index]][["issue_or_pr_comment"]] + + if (is.null(issue_or_pr_comment_path)) { + warning("Attribute does not exist in the configuration file.") + } + + return(issue_or_pr_comment_path) +} + +#' Returns the local folder path for GitHub Issue Searches for a specific +#' project key. +#' +#' @description This function returns the local folder path for GitHub Issue +#' Searches for a specific project key, that is specified in the input parameter +#' `config_file`. The input, `config_file` must be a parsed configuration file. +#' The function will inform the user if the local folder path for the issue +#' searches exists in the parsed configuration file, `config_file`. +#' +#' @param config_file The parsed configuration file obtained from \code{\link{parse_config}}. +#' @param project_key_index The name of the index of the project key (e.g. "project_key_1" or "project_key_2"). +#' @return The local folder path for GitHub issue search for project specified by key `project_key_index`. +#' @export +get_github_issue_search_path <- function(config_file, project_key_index) { + + issue_search_path <- config_file[["issue_tracker"]][["github"]][[project_key_index]][["issue_search"]] + + if (is.null(issue_search_path)) { + warning("Attribute does not exist in the configuration file.") + } + + return(issue_search_path) +} + +#' Returns the local folder path for GitHub Pull Requests for a specific +#' project key. +#' +#' @description This function returns the local folder path for GitHub Pull +#' Requests for a specific project key, that is specified in the input +#' parameter `config_file`. The input, `config_file` must be a parsed +#' configuration file. The function will inform the user if the local folder +#' path for the pull requests exists in the parsed configuration file, +#' `config_file`. 
+#' +#' @param config_file The parsed configuration file obtained from \code{\link{parse_config}}. +#' @param project_key_index The name of the index of the project key (e.g. "project_key_1" or "project_key_2"). +#' @return The local folder path for GitHub pull requests for project specified by key `project_key_index`. +#' @export +get_github_pull_request_path <- function(config_file, project_key_index) { + + pull_request_path <- config_file[["issue_tracker"]][["github"]][[project_key_index]][["pull_request"]] + + if (is.null(pull_request_path)) { + warning("Attribute does not exist in the configuration file.") + } + + return(pull_request_path) +} + +#' Returns the local folder path for GitHub issue events for a specific project +#' key. +#' +#' @description This function returns the local folder path for GitHub issue +#' events for a specific project key, that is specified in the input +#' parameter `config_file`. The input, `config_file` must be a parsed +#' configuration file. The function will inform the user if the local folder +#' path for the issue events exists in the parsed configuration file, +#' `config_file`. +#' +#' @param config_file The parsed configuration file obtained from \code{\link{parse_config}}. +#' @param project_key_index The name of the index of the project key (e.g. "project_key_1" or "project_key_2"). +#' @return The local folder path for GitHub issue events for project specified by key `project_key_index`. +#' @export +get_github_issue_event_path <- function(config_file, project_key_index) { + + issue_event_path <- config_file[["issue_tracker"]][["github"]][[project_key_index]][["issue_event"]] + + if (is.null(issue_event_path)) { + warning("Attribute does not exist in the configuration file.") + } + + return(issue_event_path) +} + +#' Returns the local folder path for GitHub commits for a specific project key. 
+#' +#' @description This function returns the local folder path for GitHub commits +#' for a specific project key, that is specified in the input +#' parameter `config_file`. The input, `config_file` must be a parsed +#' configuration file. The function will inform the user if the local folder +#' path for the commits exists in the parsed configuration file, +#' `config_file`. +#' +#' @param config_file The parsed configuration file obtained from \code{\link{parse_config}}. +#' @param project_key_index The name of the index of the project key (e.g. "project_key_1" or "project_key_2"). +#' @return The local folder path for GitHub commits for project specified by key `project_key_index`. +#' @export +get_github_commit_path <- function(config_file, project_key_index) { + + commit_path <- config_file[["issue_tracker"]][["github"]][[project_key_index]][["commit"]] + + if (is.null(commit_path)) { + warning("Attribute does not exist in the configuration file.") + } + + return(commit_path) +} + +### Bugzilla Functions ##### + +#' Returns the name of the Bugzilla project key for a specific project key index. +#' +#' @description This function returns the name of the Bugzilla project key for +#' a specific project key, that is specified in the input parameter +#' `config_file`. The input, `config_file` must be a parsed configuration file. +#' The function will inform the user if the name of the Bugzilla project key +#' exists in the parsed configuration file, `config_file`. +#' +#' @param config_file The parsed configuration file obtained from \code{\link{parse_config}}. +#' @param project_key_index The name of the index of the project key (e.g. "project_key_1" or "project_key_2"). +#' @return The Bugzilla project key name for project specified by key `project_key_index`. 
+#' @export +get_bugzilla_project_key <- function(config_file, project_key_index) { + + bugzilla_key <- config_file[["issue_tracker"]][["bugzilla"]][[project_key_index]][["project_key"]] + + if (is.null(bugzilla_key)) { + warning("Attribute does not exist in the configuration file.") + } + + return(bugzilla_key) +} + +#' Returns the local folder path for Bugzilla issues for a specific project key. +#' +#' @description This function returns the local folder path for Bugzilla issues +#' for a specific project key, that is specified in the input parameter +#' `config_file`. The input, `config_file` must be a parsed configuration file. +#' The function will inform the user if the folder path for Bugzilla issues +#' exists in the parsed configuration file, `config_file`. +#' +#' @param config_file The parsed configuration file obtained from \code{\link{parse_config}}. +#' @param project_key_index The name of the index of the project key (e.g. "project_key_1" or "project_key_2"). +#' @return The local folder path for Bugzilla issues for project specified by key `project_key_index`. +#' @export +get_bugzilla_issue_path <- function(config_file, project_key_index) { + + issue_path <- config_file[["issue_tracker"]][["bugzilla"]][[project_key_index]][["issues"]] + + if (is.null(issue_path)) { + warning("Attribute does not exist in the configuration file.") + } + + return(issue_path) +} + +#' Returns the local folder path for Bugzilla issue comments for a specific project key. +#' +#' @description This function returns the local folder path for Bugzilla issue +#' comments for a specific project key, that is specified in the input parameter +#' `config_file`. The input, `config_file` must be a parsed configuration file. +#' The function will inform the user if the folder path for Bugzilla issue +#' comments exists in the parsed configuration file, `config_file`. +#' +#' @param config_file The parsed configuration file obtained from \code{\link{parse_config}}. 
+#' @param project_key_index The name of the index of the project key (e.g. "project_key_1" or "project_key_2"). +#' @return The local folder path for Bugzilla issue comments for project specified by key `project_key_index`. +#' @export +get_bugzilla_issue_comment_path <- function(config_file, project_key_index) { + + issue_comment_path <- config_file[["issue_tracker"]][["bugzilla"]][[project_key_index]][["issue_comments"]] + + if (is.null(issue_comment_path)) { + warning("Attribute does not exist in the configuration file.") + } + + return(issue_comment_path) +} + +##### Issue Tracker Functions End ##### + + + +##### Vulnerabilities Functions Start ##### + +#' Returns the local folder path that contains the nvd (National Vulnerability +#' Database) feeds. +#' +#' @description This function returns the local folder path for nvd feeds, +#' that is specified in the input parameter `config_file`. The input, +#' `config_file` must be a parsed configuration file. The function will inform +#' the user if the local folder path for the nvd feeds exists in the parsed +#' configuration file, `config_file`. +#' +#' @param config_file The parsed configuration file obtained from \code{\link{parse_config}}. +#' @return The folder path with nvd feeds. +#' @export +get_nvdfeed_folder_path <- function(config_file) { + + nvdfeed_folder_path <- config_file[["vulnerabilities"]][["nvd_feed"]] + + if (is.null(nvdfeed_folder_path)) { + warning("Attribute does not exist in the configuration file.") + } + + return(nvdfeed_folder_path) +} + +##### Vulnerabilities Functions End ##### + + + +##### Regular Expression Functions Start ##### + +#' Returns the issue Id regular expression for commit messages. +#' +#' @description This function returns the issue Id regular expression for commit +#' messages, that is specified in the input parameter `config_file`. The input, +#' `config_file` must be a parsed configuration file. 
The function will inform +#' the user if the issue Id regular expression for commit messages exists in the +#' parsed configuration file, `config_file`. +#' +#' @param config_file The parsed configuration file obtained from \code{\link{parse_config}}. +#' @return The commit message issue Id regular expression. +#' @export +get_issue_id_regex <- function(config_file) { + + issue_id_regex <- config_file[["commit_message_id_regex"]][["issue_id"]] + + if (is.null(issue_id_regex)) { + warning("Attribute does not exist in the configuration file.") + } + + return(issue_id_regex) +} + +#' Returns the cve (Common Vulnerabilities and Exposures) regular expression +#' for commit messages. +#' +#' @description This function returns the cve regular expression for commit +#' messages, that is specified in the input parameter `config_file`. The input, +#' `config_file` must be a parsed configuration file. The function will inform +#' the user if the cve regular expression for commit messages exists in the +#' parsed configuration file, `config_file`. +#' +#' @param config_file The parsed configuration file obtained from \code{\link{parse_config}}. +#' @return The commit message CVE regular expression. +#' @export +get_cveid_regex <- function(config_file) { + + cveid_regex <- config_file[["commit_message_id_regex"]][["cve_id"]] + + if (is.null(cveid_regex)) { + warning("Attribute does not exist in the configuration file.") + } + + return(cveid_regex) +} + +##### Regular Expression Functions End ##### + + + +##### Filter Functions Start ##### + +#' Returns the list of file extensions used for filtering files to keep. +#' +#' @description This function returns the list of file extensions that will be +#' used for filtering files specified in the input parameter `config_file`. The +#' input, `config_file` must be a parsed configuration file. The function will +#' inform the user if the list of file extensions exists in the parsed +#' configuration file, `config_file`. 
+#' +#' @param config_file The parsed configuration file obtained from \code{\link{parse_config}}. +#' @return The list of file extensions to keep. +#' @export +get_file_extensions <- function(config_file) { + + file_extensions <- config_file[["filter"]][["keep_filepaths_ending_with"]] + + if (is.null(file_extensions)) { + warning("Attribute does not exist in the configuration file.") + } + + return(file_extensions) +} + +#' Returns the list of file path substrings used for filtering files to remove. +#' +#' @description This function returns the list of file path substrings that will be +#' used for filtering files specified in the input parameter `config_file`. The +#' input, `config_file` must be a parsed configuration file. The function will +#' inform the user if the list of file path substrings exists in the parsed +#' configuration file, `config_file`. +#' +#' @param config_file The parsed configuration file obtained from \code{\link{parse_config}}. +#' @return The list of file path substrings used to remove files. +#' @export +get_substring_filepath <- function(config_file) { + + substring_filepath <- config_file[["filter"]][["remove_filepaths_containing"]] + + if (is.null(substring_filepath)) { + warning("Attribute does not exist in the configuration file.") + } + + return(substring_filepath) +} + +#' Returns the commit size threshold to remove file paths. +#' +#' @description This function returns an integer number that represents the +#' threshold for a commit size to remove file paths specified in the input +#' parameter `config_file`. The input, `config_file` must be a parsed +#' configuration file. The function will inform the user if the commit size +#' threshold exists in the parsed configuration file, `config_file`. +#' +#' @param config_file The parsed configuration file obtained from \code{\link{parse_config}}. +#' @return The commit size to filter out.
+#' @export +get_filter_commit_size <- function(config_file) { + + filter_commit_size <- config_file[["filter"]][["remove_filepaths_on_commit_size_greather_than"]] + + if (is.null(filter_commit_size)) { + warning("Attribute does not exist in the configuration file.") + } + + return(filter_commit_size) +} + +##### Filter Functions End ##### + + + +##### Third Party Tools Functions Start ##### + +#' Returns the specified tool project from a parsed tool configuration file. +#' +#' @description This function returns a path to a specified tool from a +#' specified parsed tool configuration file. The function takes the input +#' `tool_name` and uses it to index a specific tool project in a parsed +#' tool configuration file, `config_file`, where it then returns the specified +#' tool project. The function will inform the user if the specified attribute, +#' `tool_name`, exists in the parsed configuration file, `config_file`. +#' +#' @param tool_name The name of the tool (e.g. "perceval" or "dv8"). +#' @param config_file The parsed configuration file obtained from \code{\link{parse_config}}. +#' @return The specified `tool_name` tool project from `config_file`. +#' @export +get_tool_project <- function(tool_name, config_file) { + + tool_path <- config_file[[tool_name]] + + if (is.null(tool_path)) { + warning("Attribute does not exist.") + } + + return(tool_path) +} + +#' Returns the depends code language for analysis. +#' +#' @description This function returns the specified code language that should +#' be used to parse file-file static dependencies with the depends tool, that +#' is specified in the input parameter `config_file`. The input, `config_file` +#' must be a parsed configuration file. The function will inform the user if +#' the depends code language exists in the parsed configuration file, `config_file`. +#' +#' @param config_file The parsed configuration file obtained from \code{\link{parse_config}}. 
+#' @return The code language for parsing file-file static dependencies. +#' @export +get_depends_code_language <- function(config_file) { + + language <- config_file[["tool"]][["depends"]][["code_language"]] + + if (is.null(language)) { + warning("Attribute does not exist in the configuration file.") + } + + return(language) +} + +#' Returns a list of the types of dependencies to keep for analysis. +#' +#' @description This function returns the specified types of dependencies to +#' keep for analysis with the depends tool, that is specified in the input +#' parameter `config_file`. The input, `config_file` must be a parsed +#' configuration file. The function will inform the user if the list of the +#' types of dependencies exists in the parsed configuration file, `config_file`. +#' +#' @param config_file The parsed configuration file obtained from \code{\link{parse_config}}. +#' @return A list of the types of depends dependencies to keep for analysis. +#' @export +get_depends_keep_dependencies_type <- function(config_file) { + + keep_dependencies_type <- config_file[["tool"]][["depends"]][["keep_dependencies_type"]] + + if (is.null(keep_dependencies_type)) { + warning("Attribute does not exist in the configuration file.") + } + + return(keep_dependencies_type) +} + +#' Returns the path to the folder used to store files for DV8 analysis. +#' +#' @description This function returns the path to the folder that will be +#' used to store various intermediate files for DV8 analysis, that is specified +#' in the input parameter `config_file`. The input, `config_file` must be a +#' parsed configuration file. The function will inform the user if the +#' path to the folder for intermediate file storage for DV8 analysis exists in +#' the parsed configuration file, `config_file`. +#' +#' @param config_file The parsed configuration file obtained from \code{\link{parse_config}}. +#' @return The DV8 project folder path.
+#' @export +get_dv8_folder_path <- function(config_file) { + + project_path <- config_file[["tool"]][["dv8"]][["folder_path"]] + + if (is.null(project_path)) { + warning("Attribute does not exist in the configuration file.") + } + + return(project_path) +} + +#' Returns the list of architectural flaws thresholds for DV8 analysis. +#' +#' @description This function returns the list of architectural flaws thresholds +#' for DV8 analysis, that is specified in the input parameter `config_file`. +#' The input, `config_file` must be a parsed configuration file. The function +#' will inform the user if the list of architectural flaws thresholds +#' exists in the parsed configuration file, `config_file`. +#' +#' @param config_file The parsed configuration file obtained from \code{\link{parse_config}}. +#' @return The list of DV8 architectural flaws thresholds. +#' @export +get_dv8_flaws_params <- function(config_file) { + + dv8_flaws_params <- config_file[["tool"]][["dv8"]][["architectural_flaws"]] + + if (is.null(dv8_flaws_params)) { + warning("Attribute does not exist in the configuration file.") + } + + return(dv8_flaws_params) +} + +#' Returns the types to keep to be considered for analysis. +#' +#' @description This function returns the types of file-file dependencies that +#' should be considered, that are specified in the input parameter +#' `config_file`. The input, `config_file` must be a parsed configuration file. +#' The function will inform the user if the lines type to keep exists in the +#' parsed configuration file, `config_file`. +#' +#' @param config_file The parsed configuration file obtained from \code{\link{parse_config}}. +#' @return The lines type to keep for analysis.
+#' @export +get_uctags_line_types <- function(config_file) { + + kinds <- config_file[["tool"]][["uctags"]][["keep_lines_type"]] + + if (is.null(kinds)) { + warning("Attribute does not exist in the configuration file.") + } + + return(kinds) +} + +#' Returns the file path for the output of the srcML analysis for the project. +#' +#' @description This function returns the file path to be used to store the +#' output of the srcML analysis for the project, that is specified in the +#' input parameter `config_file`. The input, `config_file` must be a parsed +#' configuration file. The function will inform the user if the file path +#' exists in the parsed configuration file, `config_file`. +#' +#' @param config_file The parsed configuration file obtained from \code{\link{parse_config}}. +#' @return The output file path for srcML analysis. +#' @export +get_srcml_filepath <- function(config_file) { + + srcml_filepath <- config_file[["tool"]][["srcml"]][["srcml_path"]] + + if (is.null(srcml_filepath)) { + warning("Attribute does not exist in the configuration file.") + } + + return(srcml_filepath) +} + +#' Returns the folder path for class pattern4 analysis. +#' +#' @description This function returns the folder path used to store the classes +#' for the pattern4 analysis for the project, that is specified in the input +#' parameter `config_file`. The input, `config_file` must be a parsed +#' configuration file. The function will inform the user if the folder path +#' exists in the parsed configuration file, `config_file`. +#' +#' @param config_file The parsed configuration file obtained from \code{\link{parse_config}}. +#' @return The Pattern4 class folder path. 
+#' @export +get_pattern4_folder_path <- function(config_file) { + + pattern4_folder_path <- config_file[["tool"]][["pattern4"]][["class_folder_path"]] + + if (is.null(pattern4_folder_path)) { + warning("Attribute does not exist in the configuration file.") + } + + return(pattern4_folder_path) +} + +#' Returns the folder path for the output of the pattern4 analysis. +#' +#' @description This function returns the folder path that contains the +#' output of the pattern4 analysis for the project, that is specified in the +#' input parameter `config_file`. The input, `config_file` must be a parsed +#' configuration file. The function will inform the user if the folder path +#' exists in the parsed configuration file, `config_file`. +#' +#' @param config_file The parsed configuration file obtained from \code{\link{parse_config}}. +#' @return The Pattern4 output folder path. +#' @export +get_pattern4_filepath <- function(config_file) { + + pattern4_filepath <- config_file[["tool"]][["pattern4"]][["output_filepath"]] + + if (is.null(pattern4_filepath)) { + warning("Attribute does not exist in the configuration file.") + } + + return(pattern4_filepath) +} + +#' Returns the understand code language for analysis. +#' +#' @description This function returns the specified code language that should +#' be used to parse dependencies with the understand tool, that +#' is specified in the input parameter `config_file`. The input, `config_file` +#' must be a parsed configuration file. The function will inform the user if +#' the understand code language exists in the parsed configuration file, `config_file`. +#' +#' @param config_file The parsed configuration file obtained from \code{\link{parse_config}}. +#' @return The code language for parsing with the understand tool. 
+#' @export +get_understand_code_language <- function(config_file) { + + language <- config_file[["tool"]][["understand"]][["code_language"]] + + if (is.null(language)) { + warning("Attribute does not exist in the configuration file.") + } + + return(language) +} + +#' Returns a list of the types of understand dependencies to keep for analysis. +#' +#' @description This function returns the specified types of dependencies to +#' keep for analysis with the understand tool, that is specified in the input +#' parameter `config_file`. The input, `config_file` must be a parsed +#' configuration file. The function will inform the user if the list of the +#' types of understand dependencies exists in the parsed configuration file, `config_file`. +#' +#' @param config_file The parsed configuration file obtained from \code{\link{parse_config}}. +#' @return A list of the types of understand dependencies to keep for analysis. +#' @export +get_understand_keep_dependencies_type <- function(config_file) { + + keep_dependencies_type <- config_file[["tool"]][["understand"]][["keep_dependencies_type"]] + + if (is.null(keep_dependencies_type)) { + warning("Attribute does not exist in the configuration file.") + } + + return(keep_dependencies_type) +} + +#' Returns the folder path for the input of the understand analysis. +#' +#' @description This function returns the folder path that contains the +#' input of the understand analysis for the project, that is specified in the +#' input parameter `config_file`. The input, `config_file` must be a parsed +#' configuration file. The function will inform the user if the folder path +#' exists in the parsed configuration file, `config_file`. +#' +#' @param config_file The parsed configuration file obtained from \code{\link{parse_config}}. +#' @return The understand project folder path. 
+#' @export +get_understand_project_path <- function(config_file) { + + understand_project_path <- config_file[["tool"]][["understand"]][["project_path"]] + + if (is.null(understand_project_path)) { + warning("Attribute does not exist in the configuration file.") + } + + return(understand_project_path) +} + +#' Returns the folder path for the output of the understand analysis. +#' +#' @description This function returns the folder path that contains the +#' output of the understand analysis for the project, that is specified in the +#' input parameter `config_file`. The input, `config_file` must be a parsed +#' configuration file. The function will inform the user if the folder path +#' exists in the parsed configuration file, `config_file`. +#' +#' @param config_file The parsed configuration file obtained from \code{\link{parse_config}}. +#' @return The understand output folder path. +#' @export +get_understand_output_path <- function(config_file) { + + understand_output_path <- config_file[["tool"]][["understand"]][["output_path"]] + + if (is.null(understand_output_path)) { + warning("Attribute does not exist in the configuration file.") + } + + return(understand_output_path) +} + +##### Third Party Tools Functions End ##### + + + +##### Analysis Functions Start ##### + +#' Returns the list of topics and keywords for analysis. +#' +#' @description This function returns the list of keywords and topics for +#' analysis, that is specified in the input parameter `config_file`. The +#' input, `config_file` must be a parsed configuration file. The function will +#' inform the user if the list of keywords and topics exists in the parsed +#' configuration file, `config_file`. +#' +#' @param config_file The parsed configuration file obtained from \code{\link{parse_config}}. +#' @return The list of keywords and topics for analysis. 
+#' @export +get_topics <- function(config_file) { + + topics <- config_file[["analysis"]][["topics"]] + + if (is.null(topics)) { + warning("Attribute does not exist in the configuration file.") + } + + return(topics) +} + +#' Returns the starting commit for a window for analysis. +#' +#' @description This function returns the starting commit for a window of time +#' for analysis (the time stamp is inferred from gitlog), that is specified in +#' the input parameter `config_file`. The input, `config_file` must be a parsed +#' configuration file. The function will inform the user if the start commit +#' exists in the parsed configuration file, `config_file`. +#' +#' @param config_file The parsed configuration file obtained from \code{\link{parse_config}}. +#' @return The start commit for a window for analysis. +#' @export +get_window_start_commit <- function(config_file) { + + start_commit <- config_file[["analysis"]][["window"]][["start_commit"]] + + if (is.null(start_commit)) { + warning("Attribute does not exist in the configuration file.") + } + + return(start_commit) +} + +#' Returns the ending commit for a window for analysis. +#' +#' @description This function returns the ending commit for a window of time +#' for analysis (the time stamp is inferred from gitlog), that is specified in +#' the input parameter `config_file`. The input, `config_file` must be a parsed +#' configuration file. The function will inform the user if the end commit +#' exists in the parsed configuration file, `config_file`. +#' +#' @param config_file The parsed configuration file obtained from \code{\link{parse_config}}. +#' @return The end commit for a window for analysis. +#' @export +get_window_end_commit <- function(config_file) { + + end_commit <- config_file[["analysis"]][["window"]][["end_commit"]] + + if (is.null(end_commit)) { + warning("Attribute does not exist in the configuration file.") + } + + return(end_commit) +} + +#' Returns the size of a window for analysis. 
+#' +#' @description This function returns the size of a window, that is +#' specified in the input parameter `config_file`. The input, `config_file` +#' must be a parsed configuration file. The function will inform the user if +#' the window size exists in the parsed configuration file, `config_file`. +#' +#' @param config_file The parsed configuration file obtained from \code{\link{parse_config}}. +#' @return The size of a window for analysis. +#' @export +get_window_size <- function(config_file) { + + window_size <- config_file[["analysis"]][["window"]][["size_days"]] + + if (is.null(window_size)) { + warning("Attribute does not exist in the configuration file.") + } + + return(window_size) +} + +#' Returns the list of enumerated commit intervals for analysis. +#' +#' @description This function returns a list of enumerated commit intervals, +#' that is specified in the input parameter `config_file`. The input, +#' `config_file` must be a parsed configuration file. The function will inform +#' the user if the list of enumerated commit intervals exists in the parsed +#' configuration file, `config_file`. +#' +#' @param config_file The parsed configuration file obtained from \code{\link{parse_config}}. +#' @return The list of enumerated commit intervals. +#' @export +get_enumeration_commits <- function(config_file) { + + enumeration_commit <- config_file[["analysis"]][["enumeration"]][["commit"]] + + if (is.null(enumeration_commit)) { + warning("Attribute does not exist in the configuration file.") + } + + return(enumeration_commit) +} + +##### Analysis Functions End ##### diff --git a/R/graph.R b/R/graph.R index f533244d..0b3dba58 100644 --- a/R/graph.R +++ b/R/graph.R @@ -434,12 +434,12 @@ temporal_graph_projection <- function(graph,mode,weight_scheme_function = NULL,t # use the weight_scheme_cum_temporal(). 
combinations <- merge(combinations,edgelist,all.x=TRUE,by.x = "from_edgeid", by.y="edgeid", - sorted = FALSE) + sort = FALSE) setnames(combinations, old=c("from","weight","datetimetz"), new=c("from_projection","from_weight","from_datetimetz")) combinations <- merge(combinations,edgelist,all.x=TRUE,by.x = "to_edgeid", by.y="edgeid", - sorted = FALSE) + sort = FALSE) setnames(combinations, old=c("from","weight","datetimetz"), new=c("to_projection","to_weight","to_datetimetz")) diff --git a/README.md b/README.md index 3fc3e836..5e77998f 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,6 @@ For detailed functionality, see Kaiaulu's [API](http://itm0.shidler.hawaii.edu/k Please [ask questions on Discussions](https://github.com/sailuh/kaiaulu/discussions) or open an issue on the [issue tracker](https://github.com/sailuh/kaiaulu/issues) if you found a bug, or your answer can't be found in the documentation. A more comprehensive and ever growing list of features is available on the [Project Wiki](https://github.com/sailuh/kaiaulu/wiki). - ## Installation Kaiaulu has been tested on OS X and Ubuntu. For Windows and other OS users, try [Virtualbox](https://www.virtualbox.org/), @@ -33,7 +32,7 @@ I also recommend you download the repo to have some example project configuratio 4. Build the documentation `devtools::document(roclets = c('rd', 'collate', 'namespace'))`. 5. Build Kaiaulu (Top right pane in RStudio -> Build tab -> Install and Restart) 6. Run `vignettes/kaiaulu_architecture.Rmd` - 7. See the Wiki's [Third Party Tools Setup](https://github.com/sailuh/kaiaulu/wiki/Third-Party-Tools-Setup) if you are using a Notebook that relies on them. These require very minimal overhead by downloading a binary file, and specifying their path on `tools.yml` (see example on the repository). + 7. See the Wiki's [Third Party Tools Setup](https://github.com/sailuh/kaiaulu/wiki/Third-Party-Tools-Setup) if you are using a Notebook that relies on them. 
These require very minimal overhead by downloading a binary file and specifying their path on `tools.yml` (see example on the repository). ### Getting started @@ -41,13 +40,10 @@ To get started, browse through [the docs](http://itm0.shidler.hawaii.edu/kaiaulu ### Cheatsheets - | Social Smells | Architectural Flaws | | ------------- | ------------- | | | | - - ## Stay up-to-date * Read the [NEWS file](https://github.com/sailuh/kaiaulu/blob/master/NEWS.md). diff --git a/_pkgdown.yml b/_pkgdown.yml index c3c69f8d..e468fe60 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -285,3 +285,62 @@ reference: - dv8_clsxb_to_clsxj - dv8_clsxj_to_clsxb - parse_dv8_clusters +- title: __Config__ + desc: > + Get functions to access config fields. + The use of get() in Notebooks and executable + scripts is encouraged for notebook compatibility + in future versions of the config. +- contents: + - parse_config + - get_tool_project + - gitlog_to_hdsmj + - get_bugzilla_issue_comment_path + - get_bugzilla_issue_path + - get_bugzilla_project_key + - get_cveid_regex + - get_depends_code_language + - get_depends_keep_dependencies_type + - get_dv8_flaws_params + - get_dv8_folder_path + - get_enumeration_commits + - get_file_extensions + - get_filter_commit_size + - get_git_branches + - get_git_repo_path + - get_github_commit_path + - get_github_issue_event_path + - get_github_issue_or_pr_comment_path + - get_github_issue_path + - get_github_issue_search_path + - get_github_keys + - get_github_owner + - get_github_pull_request_path + - get_github_repo + - get_issue_id_regex + - get_jira_domain + - get_jira_issues_comments_path + - get_jira_issues_path + - get_jira_keys + - get_jira_project_key_name + - get_mbox_domain + - get_mbox_input_file + - get_mbox_key_indexes + - get_mbox_path + - get_nvdfeed_folder_path + - get_pattern4_filepath + - get_pattern4_folder_path + - get_pipermail_domain + - get_pipermail_input_file + - get_pipermail_path + - get_srcml_filepath + - get_substring_filepath + -
get_topics + - get_uctags_line_types + - get_understand_code_language + - get_understand_keep_dependencies_type + - get_understand_output_path + - get_understand_project_path + - get_window_end_commit + - get_window_size + - get_window_start_commit diff --git a/conf/ambari.yml b/conf/ambari.yml index 405ad4d9..45d695d0 100644 --- a/conf/ambari.yml +++ b/conf/ambari.yml @@ -36,7 +36,8 @@ version_control: # Where is the git log located locally? # This is the path to the .git of the project repository you are analyzing. # The .git is hidden, so you can see it using `ls -a` - log: ../../rawdata/git_repo/ambari/.git + # log: ../../rawdata/git_repo/ambari/.git + log: ../../rawdata/ambari/git_repo/.git # From where the git log was downloaded? log_url: https://github.com/apache/ambari # List of branches used for analysis @@ -51,28 +52,56 @@ version_control: - branch-2.5 mailing_list: - # Where is the mbox located locally? - mbox: ../../rawdata/mbox/ambari-dev.mbox - # What is the domain of the chosen mailing list archive? - domain: http://mail-archives.apache.org/mod_mbox - # Which lists of the domain will be used? - list_key: - - ambari-dev + mod_mbox: + project_key_1: + mailing_list: http://mail-archives.apache.org/mod_mbox/ambari-dev + save_folder_path: ../../rawdata/ambari/mod_mbox/save_mbox_mail/ + # mbox_file_path is for use only with parse_mbox() function. It is the file to parse +# mbox_file_path: ../../rawdata/ambari/mod_mbox/save_mbox_mail/ambari.mbox + project_key_2: + mailing_list: http://mail-archives.apache.org/mod_mbox/ambari-user + save_folder_path: ../../rawdata/ambari/mod_mbox/save_mbox_mail_2/ + # mbox_file_path is for use only with parse_mbox() function. 
It is the file to parse +# mbox_file_path: ../../rawdata/ambari/mod_mbox/save_mbox_mail_2/ambari.mbox +# pipermail: +# project_key_1: +# mailing_list: https://mta.openssl.org/pipermail/kaiaulu-dev/ +# save_folder_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail/ +# # mbox_file_path is for use only with parse_mbox() function. It is the file to parse +# mbox_file_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail/kaiaulu.mbox +# project_key_2: +# mailing_list: https://mta.openssl.org/pipermail/kaiaulu-users/ +# save_folder_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail_2/ +# # mbox_file_path is for use only with parse_mbox() function. It is the file to parse +# mbox_file_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail_2/kaiaulu.mbox issue_tracker: jira: - # Obtained from the project's JIRA URL - domain: https://issues.apache.org/jira - project_key: AMBARI - # Download using `download_jira_data.Rmd` - #issues: ../../rawdata/issue_tracker/ambari_issues.json - #issue_comments: ../../rawdata/issue_tracker/ambari_issue_comments.json + project_key_1: + # Obtained from the project's JIRA URL + domain: https://issues.apache.org/jira + project_key: AMBARI + # Download using `download_jira_data.Rmd` + # issues: ../../rawdata/ambari/jira/issues/ambari/ + # issue_comments: ../../rawdata/ambari/jira/issue_comments/ambari/ github: - # Obtained from the project's GitHub URL - owner: apache - repo: ambari - # Download using `download_github_comments.Rmd` - #replies: ../../rawdata/github/ambari/ + project_key_1: + # Obtained from the project's GitHub URL + owner: apache + repo: ambari + # Download using `download_github_comments.Rmd` + issue_or_pr_comment: ../../rawdata/ambari/github/issue_or_pr_comment/apache_ambari/ + issue: ../../rawdata/ambari/github/issue/apache_ambari/ + issue_search: ../../rawdata/ambari/github/issue_search/apache_ambari/ + issue_event: ../../rawdata/ambari/github/issue_event/apache_ambari/ + pull_request: 
../../rawdata/ambari/github/pull_request/apache_ambari/ + commit: ../../rawdata/ambari/github/commit/apache_ambari/ + # bugzilla: + # project_key_1: + # project_key: ambari + # issues: ../../rawdata/ambari/bugzilla/issues/ambari/ + # issue_comments: ../../rawdata/ambari/bugzilla/issue_comments/ambari/ + #vulnerabilities: # Folder path with nvd cve feeds (e.g. nvdcve-1.1-2018.json) @@ -97,7 +126,6 @@ filter: remove_filepaths_containing: - test - # Third Party Tools Configuration # # # See Kaiaulu's README.md for details on how to setup these tools. @@ -122,6 +150,32 @@ tool: - Throw - Parameter - Contain + # dv8: + # # The project folder path to store various intermediate + # # files for DV8 Analysis + # # The folder name will be used in the file names. + # folder_path: ../../analysis/junit/dv8/ + # # the architectural flaws thresholds that should be used + # architectural_flaws: + # cliqueDepends: + # - call + # - use + # crossingCochange: 2 + # crossingFanIn: 4 + # crossingFanOut: 4 + # mvCochange: 2 + # uiCochange: 2 + # uihDepends: + # - call + # - use + # uihInheritance: + # - extend + # - implement + # - public + # - private + # - virtual + # uiHistoryImpact: 10 + # uiStructImpact: 0.01 # Uctags allows finer file-file dependency parsing (e.g. functions, classes, structs) uctags: # See https://github.com/sailuh/kaiaulu/wiki/Universal-Ctags for details @@ -141,14 +195,44 @@ tool: - f # functions r: - f # functions + # # srcML allow to parse src code as text (e.g. 
identifiers) + # srcml: + # # The file path to where you wish to store the srcml output of the project + # srcml_path: ../../analysis/junit5/srcml/srcml_junit.xml + # pattern4: + # # The file path to where you wish to store the classes of the pattern4 analysis + # class_folder_path: ../../rawdata/junit5/pattern4/junit-platform-engine/build/classes/java/main/org/junit/platform/engine/ + # # The file path to where you wish to store the output of the pattern4 analysis + # output_filepath: ../../analysis/junit5/pattern4/ + # compile_note: > + # 1. Switch Java version to Java 17: + # https://stackoverflow.com/questions/69875335/macos-how-to-install-java-17 + # 2. Disable VPN to pull modules from Gradle Plugin Portal. + # 3. Use sudo ./gradlew build + # 4. After building, locate the engine class files and specify as the class_folder_path: + # in this case they are in: /path/to/junit5/analysis/junit-platform-engine/build/classes/java/main/org/junit/platform/engine/ + # understand: + # # Accepts one language at a time: ada, assembly, c/c++, c#, fortran, java, jovial, delphi/pascal, python, vhdl, basic, javascript + # code_language: java + # # Specify which types of Dependencies to keep + # keep_dependencies_type: + # - Import + # - Call + # - Create + # - Use + # - Type GenericArgument + # # Where the files to analyze should be stored + # project_path: ../../rawdata/kaiaulu/git_repo/understand/ + # # Where the output for the understands analysis is stored + # output_path: ../../analysis/kaiaulu/understand/ # Analysis Configuration # analysis: # You can specify the intervals in 2 ways: window, or enumeration window: # If using gitlog, use start_commit and end_commit. 
Timestamp is inferred from gitlog - #start_commit: b53fa3c4755b7ae6af86cf893924d3e88d449401 - #end_commit: 1a711038698490b1f6423e3e4801ae98598d0366 +# start_commit: b53fa3c4755b7ae6af86cf893924d3e88d449401 +# end_commit: 1a711038698490b1f6423e3e4801ae98598d0366 # Alternatively, you can specify the start and end datetime (UTC timezone is assumed). start_datetime: 2017-01-31 00:00:00 end_datetime: 2019-01-31 00:00:00 diff --git a/conf/apr.yml b/conf/apr.yml index 65892ed4..60dc6263 100644 --- a/conf/apr.yml +++ b/conf/apr.yml @@ -36,7 +36,7 @@ version_control: # Where is the git log located locally? # This is the path to the .git of the project repository you are analyzing. # The .git is hidden, so you can see it using `ls -a` - log: ../../rawdata/git_repo/APR/.git + log: ../../rawdata/apr/git_repo/APR/.git # From where the git log was downloaded? log_url: https://github.com/apache/apr # List of branches used for analysis @@ -45,33 +45,73 @@ version_control: - trunk mailing_list: - # Where is the mbox located locally? - mbox: ../../rawdata/mbox/apr-dev_2012_2019.mbox - # What is the domain of the chosen mailing list archive? - domain: http://mail-archives.apache.org/mod_mbox - # Which lists of the domain will be used? - list_key: - - apr-dev + mod_mbox: + project_key_1: + mailing_list: http://mail-archives.apache.org/mod_mbox/apr-dev + save_folder_path: ../../rawdata/apr/mod_mbox/save_mbox_mail/ + # mbox_file_path is for use only with parse_mbox() function. It is the file to parse +# mbox_file_path: ../../rawdata/apr/mod_mbox/save_mbox_mail/apr.mbox +# project_key_2: +# mailing_list: http://mail-archives.apache.org/mod_mbox/kaiaulu-user +# save_folder_path: ../../rawdata/kaiaulu/mod_mbox/save_mbox_mail_2/ +# # mbox_file_path is for use only with parse_mbox() function. 
It is the file to parse +# mbox_file_path: ../../rawdata/kaiaulu/mod_mbox/save_mbox_mail_2/kaiaulu.mbox +# pipermail: +# project_key_1: +# mailing_list: https://mta.openssl.org/pipermail/kaiaulu-dev/ +# save_folder_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail/ +# # mbox_file_path is for use only with parse_mbox() function. It is the file to parse +# mbox_file_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail/kaiaulu.mbox +# project_key_2: +# mailing_list: https://mta.openssl.org/pipermail/kaiaulu-users/ +# save_folder_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail_2/ +# # mbox_file_path is for use only with parse_mbox() function. It is the file to parse +# mbox_file_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail_2/kaiaulu.mbox issue_tracker: - jira: - # Obtained from the project's JIRA URL - domain: https://issues.apache.org/jira - #project_key: HELIX - # Download using `download_jira_data.Rmd` - #issues: ../../rawdata/issue_tracker/helix_issues.json - #issue_comments: ../../rawdata/issue_tracker/helix_issue_comments.json + # jira: + # project_key_1: + # # Obtained from the project's JIRA URL + # domain: https://issues.apache.org/jira + # project_key: HELIX + # # Download using `download_jira_data.Rmd` + # issues: ../../rawdata/apr/jira/issues/helix/ + # issue_comments: ../../rawdata/apr/jira/issue_comments/helix/ github: - # Obtained from the project's GitHub URL - owner: apache - repo: apr - # Download using `download_github_comments.Rmd` - replies: ../../rawdata/github/apr/ + project_key_1: + # Obtained from the project's GitHub URL + owner: apache + repo: apr + # Download using `download_github_comments.Rmd` + issue_or_pr_comment: ../../rawdata/apr/github/issue_or_pr_comment/apache_apr/ + issue: ../../rawdata/apr/github/issue/apache_apr/ + issue_search: ../../rawdata/apr/github/issue_search/apache_apr/ + issue_event: ../../rawdata/apr/github/issue_event/apache_apr/ + pull_request: ../../rawdata/apr/github/pull_request/apache_apr/ + 
commit: ../../rawdata/apr/github/commit/apache_apr/ +# project_key_2: +# # Obtained from the project's GitHub URL +# owner: ssunoo2 +# repo: kaiaulu +# # Download using `download_github_comments.Rmd` +# issue_or_pr_comment: ../../rawdata/kaiaulu/github/issue_or_pr_comment/ssunoo2_kaiaulu/ +# issue: ../../rawdata/kaiaulu/github/issue/ssunoo2_kaiaulu/ +# issue_search: ../../rawdata/kaiaulu/github/issue_search/ssunoo2_kaiaulu/ +# issue_event: ../../rawdata/kaiaulu/github/issue_event/ssunoo2_kaiaulu/ +# pull_request: ../../rawdata/kaiaulu/github/pull_request/ssunoo2_kaiaulu/ +# commit: ../../rawdata/kaiaulu/github/commit/ssunoo2_kaiaulu/ +# bugzilla: +# project_key_1: +# project_key: kaiaulu +# issues: ../../rawdata/kaiaulu/bugzilla/issues/kaiaulu/ +# issue_comments: ../../rawdata/kaiaulu/bugzilla/issue_comments/kaiaulu/ + + #vulnerabilities: # Folder path with nvd cve feeds (e.g. nvdcve-1.1-2018.json) # Download at: https://nvd.nist.gov/vuln/data-feeds - #nvd_feed: rawdata/nvdfeed + # nvd_feed: rawdata/nvdfeed # Commit message CVE or Issue Regular Expression (regex) # See project's commit message for examples to create the regex @@ -120,7 +160,7 @@ tool: # The project folder path to store various intermediate # files for DV8 Analysis # The folder name will be used in the file names. - folder_path: ../../analysis/dv8/apr + folder_path: ../../analysis/apr/dv8/ # the architectural flaws thresholds that should be used architectural_flaws: cliqueDepends: @@ -161,6 +201,36 @@ tool: - f # functions r: - f # functions + # # srcML allow to parse src code as text (e.g. 
identifiers) + # srcml: + # # The file path to where you wish to store the srcml output of the project + # srcml_path: ../../analysis/junit5/srcml/srcml_junit.xml + # pattern4: + # # The file path to where you wish to store the classes of the pattern4 analysis + # class_folder_path: ../../rawdata/junit5/pattern4/classes/junit-platform-engine/build/classes/java/main/org/junit/platform/engine/ + # # The file path to where you wish to store the output of the pattern4 analysis + # output_filepath: ../../analysis/junit5/pattern4/ + # compile_note: > + # 1. Switch Java version to Java 17: + # https://stackoverflow.com/questions/69875335/macos-how-to-install-java-17 + # 2. Disable VPN to pull modules from Gradle Plugin Portal. + # 3. Use sudo ./gradlew build + # 4. After building, locate the engine class files and specify as the class_folder_path: + # in this case they are in: /path/to/junit5/analysis/junit-platform-engine/build/classes/java/main/org/junit/platform/engine/ + # understand: + # # Accepts one language at a time: ada, assembly, c/c++, c#, fortran, java, jovial, delphi/pascal, python, vhdl, basic, javascript + # code_language: java + # # Specify which types of Dependencies to keep + # keep_dependencies_type: + # - Import + # - Call + # - Create + # - Use + # - Type GenericArgument + # # Where the files to analyze should be stored + # project_path: ../../rawdata/kaiaulu/git_repo/understand/ + # # Where the output for the understands analysis is stored + # output_path: ../../analysis/kaiaulu/understand/ # Analysis Configuration # analysis: diff --git a/conf/calculator.yml b/conf/calculator.yml index 2b95b6f8..250786d9 100644 --- a/conf/calculator.yml +++ b/conf/calculator.yml @@ -36,36 +36,74 @@ version_control: # Where is the git log located locally? # This is the path to the .git of the project repository you are analyzing. 
# The .git is hidden, so you can see it using `ls -a` - log: ../../rawdata/git_repo/Calculator/.git + log: ../../rawdata/Calculator/git_repo/Calculator/.git # From where the git log was downloaded? log_url: https://github.com/HouariZegai/Calculator # List of branches used for analysis branch: - master -mailing_list: - # Where is the mbox located locally? - #mbox: ../../rawdata/mbox/apr-dev_2012_2019.mbox - # What is the domain of the chosen mailing list archive? - #domain: http://mail-archives.apache.org/mod_mbox - # Which lists of the domain will be used? - #list_key: - # - apr-dev +# mailing_list: +# mod_mbox: +# project_key_1: +# mailing_list: http://mail-archives.apache.org/mod_mbox/kaiaulu-dev +# save_folder_path: ../../rawdata/kaiaulu/mod_mbox/save_mbox_mail/ +# # mbox_file_path is for use only with parse_mbox() function. It is the file to parse +# mbox_file_path: ../../rawdata/kaiaulu/mod_mbox/save_mbox_mail/kaiaulu.mbox +# project_key_2: +# mailing_list: http://mail-archives.apache.org/mod_mbox/kaiaulu-user +# save_folder_path: ../../rawdata/kaiaulu/mod_mbox/save_mbox_mail_2/ +# # mbox_file_path is for use only with parse_mbox() function. It is the file to parse +# mbox_file_path: ../../rawdata/kaiaulu/mod_mbox/save_mbox_mail_2/kaiaulu.mbox +# pipermail: +# project_key_1: +# mailing_list: https://mta.openssl.org/pipermail/kaiaulu-dev/ +# save_folder_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail/ +# # mbox_file_path is for use only with parse_mbox() function. It is the file to parse +# mbox_file_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail/kaiaulu.mbox +# project_key_2: +# mailing_list: https://mta.openssl.org/pipermail/kaiaulu-users/ +# save_folder_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail_2/ +# # mbox_file_path is for use only with parse_mbox() function. 
It is the file to parse +# mbox_file_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail_2/kaiaulu.mbox -#issue_tracker: +issue_tracker: # jira: - # Obtained from the project's JIRA URL -# domain: https://issues.apache.org/jira - #project_key: HELIX - # Download using `download_jira_data.Rmd` - #issues: ../../rawdata/issue_tracker/helix_issues.json - #issue_comments: ../../rawdata/issue_tracker/helix_issue_comments.json +# project_key_1: +# # Obtained from the project's JIRA URL +# domain: https://sailuh.atlassian.net +# project_key: SAILUH +# # Download using `download_jira_data.Rmd` +# issues: ../../rawdata/kaiaulu/jira/issues/sailuh/ +# issue_comments: ../../rawdata/kaiaulu/jira/issue_comments/sailuh/ github: - # Obtained from the project's GitHub URL - owner: HouariZegai - repo: Calculator - # Download using `download_github_comments.Rmd` - replies: ../../rawdata/github/Calculator/ + project_key_1: + # Obtained from the project's GitHub URL + owner: HouariZegai + repo: Calculator + # Download using `download_github_comments.Rmd` + issue_or_pr_comment: ../../rawdata/Calculator/github/issue_or_pr_comment/HouariZegai_Calculator/ + issue: ../../rawdata/Calculator/github/issue/HouariZegai_Calculator/ + issue_search: ../../rawdata/Calculator/github/issue_search/HouariZegai_Calculator/ + issue_event: ../../rawdata/Calculator/github/issue_event/HouariZegai_Calculator/ + pull_request: ../../rawdata/Calculator/github/pull_request/HouariZegai_Calculator/ + commit: ../../rawdata/Calculator/github/commit/HouariZegai_Calculator/ +# project_key_2: +# # Obtained from the project's GitHub URL +# owner: ssunoo2 +# repo: kaiaulu +# # Download using `download_github_comments.Rmd` +# issue_or_pr_comment: ../../rawdata/kaiaulu/github/issue_or_pr_comment/ssunoo2_kaiaulu/ +# issue: ../../rawdata/kaiaulu/github/issue/ssunoo2_kaiaulu/ +# issue_search: ../../rawdata/kaiaulu/github/issue_search/ssunoo2_kaiaulu/ +# issue_event: ../../rawdata/kaiaulu/github/issue_event/ssunoo2_kaiaulu/ +# 
pull_request: ../../rawdata/kaiaulu/github/pull_request/ssunoo2_kaiaulu/ +# commit: ../../rawdata/kaiaulu/github/commit/ssunoo2_kaiaulu/ +# bugzilla: +# project_key_1: +# project_key: kaiaulu +# issues: ../../rawdata/kaiaulu/bugzilla/issues/kaiaulu/ +# issue_comments: ../../rawdata/kaiaulu/bugzilla/issue_comments/kaiaulu/ #vulnerabilities: # Folder path with nvd cve feeds (e.g. nvdcve-1.1-2018.json) @@ -120,7 +158,7 @@ tool: # The project folder path to store various intermediate # files for DV8 Analysis # The folder name will be used in the file names. - folder_path: ../../analysis/dv8/calculator + folder_path: ../../analysis/Calculator/dv8/ # the architectural flaws thresholds that should be used architectural_flaws: cliqueDepends: @@ -161,6 +199,36 @@ tool: - f # functions r: - f # functions + # # srcML allow to parse src code as text (e.g. identifiers) + # srcml: + # # The file path to where you wish to store the srcml output of the project + # srcml_path: ../../analysis/junit5/srcml/srcml_junit.xml + # pattern4: + # # The file path to where you wish to store the classes of the pattern4 analysis + # class_folder_path: ../../rawdata/junit5/pattern4/classes/junit-platform-engine/build/classes/java/main/org/junit/platform/engine/ + # # The file path to where you wish to store the output of the pattern4 analysis + # output_filepath: ../../analysis/junit5/pattern4/ + # compile_note: > + # 1. Switch Java version to Java 17: + # https://stackoverflow.com/questions/69875335/macos-how-to-install-java-17 + # 2. Disable VPN to pull modules from Gradle Plugin Portal. + # 3. Use sudo ./gradlew build + # 4. 
After building, locate the engine class files and specify as the class_folder_path: + # in this case they are in: /path/to/junit5/analysis/junit-platform-engine/build/classes/java/main/org/junit/platform/engine/ + # understand: + # # Accepts one language at a time: ada, assembly, c/c++, c#, fortran, java, jovial, delphi/pascal, python, vhdl, basic, javascript + # code_language: java + # # Specify which types of Dependencies to keep + # keep_dependencies_type: + # - Import + # - Call + # - Create + # - Use + # - Type GenericArgument + # # Where the files to analyze should be stored + # project_path: ../../rawdata/kaiaulu/git_repo/understand/ + # # Where the output for the understands analysis is stored + # output_path: ../../analysis/kaiaulu/understand/ # Analysis Configuration # analysis: diff --git a/conf/camel.yml b/conf/camel.yml index 04dbbfca..ae897ca9 100644 --- a/conf/camel.yml +++ b/conf/camel.yml @@ -36,7 +36,7 @@ version_control: # Where is the git log located locally? # This is the path to the .git of the project repository you are analyzing. # The .git is hidden, so you can see it using `ls -a` - log: ../../rawdata/git_repo/camel/.git + log: ../../rawdata/camel/git_repo/.git # From where the git log was downloaded? log_url: https://github.com/apache/camel # List of branches used for analysis @@ -46,29 +46,67 @@ version_control: - camel-2.11.4 - camel-3.21.0 -#mailing_list: - # Where is the mbox located locally? -# mbox: ../../rawdata/mbox/apr-dev_2012_2019.mbox - # What is the domain of the chosen mailing list archive? -# domain: http://mail-archives.apache.org/mod_mbox - # Which lists of the domain will be used? -# list_key: -# - apr-dev +mailing_list: + mod_mbox: + project_key_1: + mailing_list: http://mail-archives.apache.org/mod_mbox/camel-dev + save_folder_path: ../../rawdata/camel/mod_mbox/save_mbox_mail/ + # mbox_file_path is for use only with parse_mbox() function. 
It is the file to parse +# mbox_file_path: ../../rawdata/camel/mod_mbox/save_mbox_mail/camel.mbox + project_key_2: + mailing_list: http://mail-archives.apache.org/mod_mbox/camel-users + save_folder_path: ../../rawdata/camel/mod_mbox/save_mbox_mail_2/ + # mbox_file_path is for use only with parse_mbox() function. It is the file to parse +# mbox_file_path: ../../rawdata/camel/mod_mbox/save_mbox_mail_2/camel.mbox +# pipermail: +# project_key_1: +# mailing_list: https://mta.openssl.org/pipermail/kaiaulu-dev/ +# save_folder_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail/ +# # mbox_file_path is for use only with parse_mbox() function. It is the file to parse +# mbox_file_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail/kaiaulu.mbox +# project_key_2: +# mailing_list: https://mta.openssl.org/pipermail/kaiaulu-users/ +# save_folder_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail_2/ +# # mbox_file_path is for use only with parse_mbox() function. It is the file to parse +# mbox_file_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail_2/kaiaulu.mbox issue_tracker: jira: - # Obtained from the project's JIRA URL - domain: https://issues.apache.org/jira - project_key: CAMEL - # Download using `download_jira_data.Rmd` - issues: ../../rawdata/issue_tracker/camel/issues/ - issue_comments: ../../rawdata/issue_tracker/camel/issue_comments/ -# github: - # Obtained from the project's GitHub URL -# owner: apache -# repo: apr - # Download using `download_github_comments.Rmd` -# replies: ../../rawdata/github/apr/ + project_key_1: + # Obtained from the project's JIRA URL + domain: https://issues.apache.org/jira + project_key: CAMEL + # Download using `download_jira_data.Rmd` + issues: ../../rawdata/camel/jira/issues/ + issue_comments: ../../rawdata/camel/jira/issue_comments/ +# github: +# project_key_1: +# # Obtained from the project's GitHub URL +# owner: sailuh +# repo: kaiaulu +# # Download using `download_github_comments.Rmd` +# issue_or_pr_comment:
../../rawdata/kaiaulu/github/issue_or_pr_comment/sailuh_kaiaulu/ +# issue: ../../rawdata/kaiaulu/github/issue/sailuh_kaiaulu/ +# issue_search: ../../rawdata/kaiaulu/github/issue_search/sailuh_kaiaulu/ +# issue_event: ../../rawdata/kaiaulu/github/issue_event/sailuh_kaiaulu/ +# pull_request: ../../rawdata/kaiaulu/github/pull_request/sailuh_kaiaulu/ +# commit: ../../rawdata/kaiaulu/github/commit/sailuh_kaiaulu/ +# project_key_2: +# # Obtained from the project's GitHub URL +# owner: ssunoo2 +# repo: kaiaulu +# # Download using `download_github_comments.Rmd` +# issue_or_pr_comment: ../../rawdata/kaiaulu/github/issue_or_pr_comment/ssunoo2_kaiaulu/ +# issue: ../../rawdata/kaiaulu/github/issue/ssunoo2_kaiaulu/ +# issue_search: ../../rawdata/kaiaulu/github/issue_search/ssunoo2_kaiaulu/ +# issue_event: ../../rawdata/kaiaulu/github/issue_event/ssunoo2_kaiaulu/ +# pull_request: ../../rawdata/kaiaulu/github/pull_request/ssunoo2_kaiaulu/ +# commit: ../../rawdata/kaiaulu/github/commit/ssunoo2_kaiaulu/ +# bugzilla: +# project_key_1: +# project_key: kaiaulu +# issues: ../../rawdata/kaiaulu/bugzilla/issues/kaiaulu/ +# issue_comments: ../../rawdata/kaiaulu/bugzilla/issue_comments/kaiaulu/ #vulnerabilities: # Folder path with nvd cve feeds (e.g. nvdcve-1.1-2018.json) @@ -124,7 +162,7 @@ tool: # The project folder path to store various intermediate # files for DV8 Analysis # The folder name will be used in the file names. - folder_path: ../../analysis/dv8/camel_1_6 + folder_path: ../../analysis/camel/dv8/camel_1_6 # the architectural flaws thresholds that should be used architectural_flaws: cliqueDepends: @@ -168,8 +206,35 @@ tool: # srcML allow to parse src code as text (e.g. identifiers) srcml: # The file path to where you wish to store the srcml output of the project - srcml_path: ../../analysis/camel/srcml_camel.xml + srcml_path: ../../analysis/camel/srcml/srcml_camel.xml # Specify which types of Dependencies to keep - see the Depends tool README.md for details. 
+ # pattern4: + # # The file path to where you wish to store the classes of the pattern4 analysis + # class_folder_path: ../../rawdata/junit5/pattern4/classes/junit-platform-engine/build/classes/java/main/org/junit/platform/engine/ + # # The file path to where you wish to store the output of the pattern4 analysis + # output_filepath: ../../analysis/junit5/pattern4/ + # compile_note: > + # 1. Switch Java version to Java 17: + # https://stackoverflow.com/questions/69875335/macos-how-to-install-java-17 + # 2. Disable VPN to pull modules from Gradle Plugin Portal. + # 3. Use sudo ./gradlew build + # 4. After building, locate the engine class files and specify as the class_folder_path: + # in this case they are in: /path/to/junit5/analysis/junit-platform-engine/build/classes/java/main/org/junit/platform/engine/ + # understand: + # # Accepts one language at a time: ada, assembly, c/c++, c#, fortran, java, jovial, delphi/pascal, python, vhdl, basic, javascript + # code_language: java + # # Specify which types of Dependencies to keep + # keep_dependencies_type: + # - Import + # - Call + # - Create + # - Use + # - Type GenericArgument + # # Where the files to analyze should be stored + # project_path: ../../rawdata/kaiaulu/git_repo/understand/ + # # Where the output for the understands analysis is stored + # output_path: ../../analysis/kaiaulu/understand/ + # Analysis Configuration # analysis: # A list of topic and keywords (see src_text_showcase.Rmd). diff --git a/conf/chromium.yml b/conf/chromium.yml index 3d0f1288..a0b2ef7c 100644 --- a/conf/chromium.yml +++ b/conf/chromium.yml @@ -36,36 +36,74 @@ version_control: # Where is the git log located locally? # This is the path to the .git of the project repository you are analyzing. # The .git is hidden, so you can see it using `ls -a` - log: ../../rawdata/git_repo/chromium/.git + log: ../../rawdata/chromium/git_repo/chromium/.git # From where the git log was downloaded? 
log_url: https://chromium.googlesource.com/chromium/src # List of branches used for analysis branch: - master -mailing_list: - # Where is the mbox located locally? - #mbox: ../../rawdata/mbox/geronimo-dev.mbox - # What is the domain of the chosen mailing list archive? - #domain: http://mail-archives.apache.org/mod_mbox - # Which lists of the domain will be used? - #list_key: - # - geronimo-dev +# mailing_list: +# mod_mbox: +# project_key_1: +# mailing_list: http://mail-archives.apache.org/mod_mbox/kaiaulu-dev +# save_folder_path: ../../rawdata/kaiaulu/mod_mbox/save_mbox_mail/ +# # mbox_file_path is for use only with parse_mbox() function. It is the file to parse +# mbox_file_path: ../../rawdata/kaiaulu/mod_mbox/save_mbox_mail/kaiaulu.mbox +# project_key_2: +# mailing_list: http://mail-archives.apache.org/mod_mbox/kaiaulu-user +# save_folder_path: ../../rawdata/kaiaulu/mod_mbox/save_mbox_mail_2/ +# # mbox_file_path is for use only with parse_mbox() function. It is the file to parse +# mbox_file_path: ../../rawdata/kaiaulu/mod_mbox/save_mbox_mail_2/kaiaulu.mbox +# pipermail: +# project_key_1: +# mailing_list: https://mta.openssl.org/pipermail/kaiaulu-dev/ +# save_folder_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail/ +# # mbox_file_path is for use only with parse_mbox() function. It is the file to parse +# mbox_file_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail/kaiaulu.mbox +# project_key_2: +# mailing_list: https://mta.openssl.org/pipermail/kaiaulu-users/ +# save_folder_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail_2/ +# # mbox_file_path is for use only with parse_mbox() function. 
It is the file to parse +# mbox_file_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail_2/kaiaulu.mbox -issue_tracker: - jira: - # Obtained from the project's JIRA URL - #domain: https://issues.apache.org/jira - #project_key: GERONIMO - # Download using `download_jira_data.Rmd` - #issues: ../../rawdata/issue_tracker/geronimo_issues.json - #issue_comments: ../../rawdata/issue_tracker/geronimo_issue_comments.json - github: - # Obtained from the project's GitHub URL - #owner: sailuh - #repo: kaiaulu - # Download using `download_github_comments.Rmd` - #replies: ../../rawdata/github/kaiaulu/ +# issue_tracker: +# jira: +# project_key_1: +# # Obtained from the project's JIRA URL +# domain: https://sailuh.atlassian.net +# project_key: SAILUH +# # Download using `download_jira_data.Rmd` +# issues: ../../rawdata/kaiaulu/jira/issues/sailuh/ +# issue_comments: ../../rawdata/kaiaulu/jira/issue_comments/sailuh/ +# github: +# project_key_1: +# # Obtained from the project's GitHub URL +# owner: sailuh +# repo: kaiaulu +# # Download using `download_github_comments.Rmd` +# issue_or_pr_comment: ../../rawdata/kaiaulu/github/issue_or_pr_comment/sailuh_kaiaulu/ +# issue: ../../rawdata/kaiaulu/github/issue/sailuh_kaiaulu/ +# issue_search: ../../rawdata/kaiaulu/github/issue_search/sailuh_kaiaulu/ +# issue_event: ../../rawdata/kaiaulu/github/issue_event/sailuh_kaiaulu/ +# pull_request: ../../rawdata/kaiaulu/github/pull_request/sailuh_kaiaulu/ +# commit: ../../rawdata/kaiaulu/github/commit/sailuh_kaiaulu/ +# project_key_2: +# # Obtained from the project's GitHub URL +# owner: ssunoo2 +# repo: kaiaulu +# # Download using `download_github_comments.Rmd` +# issue_or_pr_comment: ../../rawdata/kaiaulu/github/issue_or_pr_comment/ssunoo2_kaiaulu/ +# issue: ../../rawdata/kaiaulu/github/issue/ssunoo2_kaiaulu/ +# issue_search: ../../rawdata/kaiaulu/github/issue_search/ssunoo2_kaiaulu/ +# issue_event: ../../rawdata/kaiaulu/github/issue_event/ssunoo2_kaiaulu/ +# pull_request: 
../../rawdata/kaiaulu/github/pull_request/ssunoo2_kaiaulu/ +# commit: ../../rawdata/kaiaulu/github/commit/ssunoo2_kaiaulu/ +# bugzilla: +# project_key_1: +# project_key: kaiaulu +# issues: ../../rawdata/kaiaulu/bugzilla/issues/kaiaulu/ +# issue_comments: ../../rawdata/kaiaulu/bugzilla/issue_comments/kaiaulu/ vulnerabilities: # Folder path with nvd cve feeds (e.g. nvdcve-1.1-2018.json) @@ -117,6 +155,32 @@ tool: - Throw - Parameter - Contain + # dv8: + # # The project folder path to store various intermediate + # # files for DV8 Analysis + # # The folder name will be used in the file names. + # folder_path: ../../analysis/junit/dv8/ + # # the architectural flaws thresholds that should be used + # architectural_flaws: + # cliqueDepends: + # - call + # - use + # crossingCochange: 2 + # crossingFanIn: 4 + # crossingFanOut: 4 + # mvCochange: 2 + # uiCochange: 2 + # uihDepends: + # - call + # - use + # uihInheritance: + # - extend + # - implement + # - public + # - private + # - virtual + # uiHistoryImpact: 10 + # uiStructImpact: 0.01 # Uctags allows finer file-file dependency parsing (e.g. functions, classes, structs) uctags: # See https://github.com/sailuh/kaiaulu/wiki/Universal-Ctags for details @@ -136,6 +200,36 @@ tool: - f # functions r: - f # functions + # # srcML allow to parse src code as text (e.g. identifiers) + # srcml: + # # The file path to where you wish to store the srcml output of the project + # srcml_path: ../../analysis/junit5/srcml/srcml_junit.xml + # pattern4: + # # The file path to where you wish to store the classes of the pattern4 analysis + # class_folder_path: ../../rawdata/junit5/pattern4/classes/junit-platform-engine/build/classes/java/main/org/junit/platform/engine/ + # # The file path to where you wish to store the output of the pattern4 analysis + # output_filepath: ../../analysis/junit5/pattern4/ + # compile_note: > + # 1. Switch Java version to Java 17: + # https://stackoverflow.com/questions/69875335/macos-how-to-install-java-17 + # 2. 
Disable VPN to pull modules from Gradle Plugin Portal. + # 3. Use sudo ./gradlew build + # 4. After building, locate the engine class files and specify as the class_folder_path: + # in this case they are in: /path/to/junit5/analysis/junit-platform-engine/build/classes/java/main/org/junit/platform/engine/ + # understand: + # # Accepts one language at a time: ada, assembly, c/c++, c#, fortran, java, jovial, delphi/pascal, python, vhdl, basic, javascript + # code_language: java + # # Specify which types of Dependencies to keep + # keep_dependencies_type: + # - Import + # - Call + # - Create + # - Use + # - Type GenericArgument + # # Where the files to analyze should be stored + # project_path: ../../rawdata/kaiaulu/git_repo/understand/ + # # Where the output for the understands analysis is stored + # output_path: ../../analysis/kaiaulu/understand/ # Analysis Configuration # analysis: diff --git a/conf/geronimo.yml b/conf/geronimo.yml index 56841607..925cc558 100644 --- a/conf/geronimo.yml +++ b/conf/geronimo.yml @@ -36,7 +36,7 @@ version_control: # Where is the git log located locally? # This is the path to the .git of the project repository you are analyzing. # The .git is hidden, so you can see it using `ls -a` - log: ../../rawdata/git_repo/geronimo/.git + log: ../../rawdata/geronimo/git_repo/.git # From where the git log was downloaded? log_url: https://github.com/apache/geronimo # List of branches used for analysis @@ -44,28 +44,66 @@ version_control: - trunk mailing_list: - # Where is the mbox located locally? - mbox: ../../rawdata/mbox/geronimo-dev.mbox - # What is the domain of the chosen mailing list archive? - domain: http://mail-archives.apache.org/mod_mbox - # Which lists of the domain will be used? - list_key: - - geronimo-dev + mod_mbox: + project_key_1: + mailing_list: http://mail-archives.apache.org/mod_mbox/geronimo-dev + save_folder_path: ../../rawdata/geronimo/mod_mbox/save_mbox_mail/ + # mbox_file_path is for use only with parse_mbox() function. 
It is the file to parse +# mbox_file_path: ../../rawdata/geronimo/mod_mbox/save_mbox_mail/geronimo.mbox + project_key_2: + mailing_list: http://mail-archives.apache.org/mod_mbox/geronimo-user + save_folder_path: ../../rawdata/geronimo/mod_mbox/save_mbox_mail_2/ +# # mbox_file_path is for use only with parse_mbox() function. It is the file to parse +# mbox_file_path: ../../rawdata/kaiaulu/mod_mbox/save_mbox_mail_2/kaiaulu.mbox +# pipermail: +# project_key_1: +# mailing_list: https://mta.openssl.org/pipermail/kaiaulu-dev/ +# save_folder_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail/ +# # mbox_file_path is for use only with parse_mbox() function. It is the file to parse +# mbox_file_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail/kaiaulu.mbox +# project_key_2: +# mailing_list: https://mta.openssl.org/pipermail/kaiaulu-users/ +# save_folder_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail_2/ +# # mbox_file_path is for use only with parse_mbox() function. It is the file to parse +# mbox_file_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail_2/kaiaulu.mbox issue_tracker: jira: - # Obtained from the project's JIRA URL - domain: https://issues.apache.org/jira - project_key: GERONIMO - # Download using `download_jira_data.Rmd` - issues: ../../rawdata/issue_tracker/geronimo/issues/ - issue_comments: ../../rawdata/issue_tracker/geronimo/issue_comments/ + project_key_1: + # Obtained from the project's JIRA URL + domain: https://issues.apache.org/jira + project_key: GERONIMO + # Download using `download_jira_data.Rmd` + issues: ../../rawdata/geronimo/jira/issues/ + issue_comments: ../../rawdata/geronimo/jira/issue_comments/ github: - # Obtained from the project's GitHub URL - owner: apache - repo: geronimo - # Download using `download_github_comments.Rmd` - replies: ../../rawdata/github/geronimo/ + project_key_1: + # Obtained from the project's GitHub URL + owner: apache + repo: geronimo + # Download using `download_github_comments.Rmd` + issue_or_pr_comment: 
../../rawdata/geronimo/github/issue_or_pr_comment/apache_geronimo/ + issue: ../../rawdata/geronimo/github/issue/apache_geronimo/ + issue_search: ../../rawdata/geronimo/github/issue_search/apache_geronimo/ + issue_event: ../../rawdata/geronimo/github/issue_event/apache_geronimo/ + pull_request: ../../rawdata/geronimo/github/pull_request/apache_geronimo/ + commit: ../../rawdata/geronimo/github/commit/apache_geronimo/ +# project_key_2: +# # Obtained from the project's GitHub URL +# owner: ssunoo2 +# repo: kaiaulu +# # Download using `download_github_comments.Rmd` +# issue_or_pr_comment: ../../rawdata/kaiaulu/github/issue_or_pr_comment/ssunoo2_kaiaulu/ +# issue: ../../rawdata/kaiaulu/github/issue/ssunoo2_kaiaulu/ +# issue_search: ../../rawdata/kaiaulu/github/issue_search/ssunoo2_kaiaulu/ +# issue_event: ../../rawdata/kaiaulu/github/issue_event/ssunoo2_kaiaulu/ +# pull_request: ../../rawdata/kaiaulu/github/pull_request/ssunoo2_kaiaulu/ +# commit: ../../rawdata/kaiaulu/github/commit/ssunoo2_kaiaulu/ +# bugzilla: +# project_key_1: +# project_key: kaiaulu +# issues: ../../rawdata/kaiaulu/bugzilla/issues/kaiaulu/ +# issue_comments: ../../rawdata/kaiaulu/bugzilla/issue_comments/kaiaulu/ #vulnerabilities: # Folder path with nvd cve feeds (e.g. nvdcve-1.1-2018.json) @@ -121,7 +159,7 @@ tool: # The project folder path to store various intermediate # files for DV8 Analysis # The folder name will be used in the file names. - folder_path: ../../analysis/dv8/geronimo + folder_path: ../../analysis/geronimo/dv8/ # the architectural flaws thresholds that should be used architectural_flaws: cliqueDepends: @@ -162,6 +200,36 @@ tool: - f # functions r: - f # functions + # # srcML allow to parse src code as text (e.g. 
identifiers) + # srcml: + # # The file path to where you wish to store the srcml output of the project + # srcml_path: ../../analysis/junit5/srcml/srcml_junit.xml + # pattern4: + # # The file path to where you wish to store the classes of the pattern4 analysis + # class_folder_path: ../../rawdata/junit5/pattern4/classes/junit-platform-engine/build/classes/java/main/org/junit/platform/engine/ + # # The file path to where you wish to store the output of the pattern4 analysis + # output_filepath: ../../analysis/junit5/pattern4/ + # compile_note: > + # 1. Switch Java version to Java 17: + # https://stackoverflow.com/questions/69875335/macos-how-to-install-java-17 + # 2. Disable VPN to pull modules from Gradle Plugin Portal. + # 3. Use sudo ./gradlew build + # 4. After building, locate the engine class files and specify as the class_folder_path: + # in this case they are in: /path/to/junit5/analysis/junit-platform-engine/build/classes/java/main/org/junit/platform/engine/ + # understand: + # # Accepts one language at a time: ada, assembly, c/c++, c#, fortran, java, jovial, delphi/pascal, python, vhdl, basic, javascript + # code_language: java + # # Specify which types of Dependencies to keep + # keep_dependencies_type: + # - Import + # - Call + # - Create + # - Use + # - Type GenericArgument + # # Where the files to analyze should be stored + # project_path: ../../rawdata/kaiaulu/git_repo/understand/ + # # Where the output of the Understand analysis is stored + # output_path: ../../analysis/kaiaulu/understand/ # Analysis Configuration # analysis: diff --git a/conf/helix.yml b/conf/helix.yml index 6ca5cdf1..d1bbf7f9 100644 --- a/conf/helix.yml +++ b/conf/helix.yml @@ -34,7 +34,7 @@ project: version_control: # Where is the git log located locally? - log: ../../rawdata/git_repo/helix/.git + log: ../../rawdata/helix/git_repo/.git # From where the git log was downloaded?
log_url: https://github.com/apache/helix # List of branches used for analysis @@ -48,30 +48,55 @@ version_control: - revert-1685-master mailing_list: - # Where is the mbox located locally? - # This is the path to the .git of the project repository you are analyzing. - # The .git is hidden, so you can see it using `ls -a` - mbox: ../../rawdata/mbox/helix_mbox - # What is the domain of the chosen mailing list archive? - domain: http://mail-archives.apache.org/mod_mbox - # Which lists of the domain will be used? - list_key: - - helix-dev + mod_mbox: + project_key_1: + mailing_list: https://lists.apache.org/list.html?announce@apache.org + save_folder_path: ../../rawdata/helix/mod_mbox/save_mbox_mail/ +# # mbox_file_path is for use only with parse_mbox() function. It is the file to parse +# mbox_file_path: ../../rawdata/helix/mod_mbox/save_mbox_mail/helix.mbox + project_key_2: + mailing_list: https://lists.apache.org/list.html?dev@felix.apache.org + save_folder_path: ../../rawdata/helix/mod_mbox/save_mbox_mail_2/ +# # mbox_file_path is for use only with parse_mbox() function. It is the file to parse +# mbox_file_path: ../../rawdata/kaiaulu/mod_mbox/save_mbox_mail_2/helix.mbox +# pipermail: +# project_key_1: +# mailing_list: https://mta.openssl.org/pipermail/openssl-users/ +# save_folder_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail/ +# # mbox_file_path is for use only with parse_mbox() function. It is the file to parse +# mbox_file_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail/kaiaulu.mbox +# project_key_2: +# mailing_list: https://mta.openssl.org/pipermail/openssl-project/ +# save_folder_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail_2/ +# # mbox_file_path is for use only with parse_mbox() function. 
It is the file to parse +# mbox_file_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail_2/kaiaulu.mbox issue_tracker: jira: - # Obtained from the project's JIRA URL - domain: https://issues.apache.org/jira - project_key: HELIX - # Download using `download_jira_data.Rmd` - issues: ../../rawdata/issue_tracker/helix/issues/ - issue_comments: ../../rawdata/issue_tracker/helix/issue_comments/ + project_key_1: + # Obtained from the project's JIRA URL + domain: https://issues.apache.org/jira + project_key: HELIX + # Download using `download_jira_data.Rmd` + issues: ../../rawdata/helix/jira/issues/helix + issue_comments: ../../rawdata/helix/jira/issue_comments/helix github: - # Obtained from the project's GitHub URL - owner: apache - repo: helix - # Download using `download_github_comments.Rmd` - replies: ../../rawdata/github/helix/ + project_key_1: + # Obtained from the project's GitHub URL + owner: apache + repo: helix + # Download using `download_github_comments.Rmd` + issue_or_pr_comment: ../../rawdata/helix/github/issue_or_pr_comment/apache_helix/ + issue: ../../rawdata/helix/github/issue/apache_helix/ + issue_search: ../../rawdata/helix/github/issue_search/apache_helix/ + issue_event: ../../rawdata/helix/github/issue_event/apache_helix/ + pull_request: ../../rawdata/helix/github/pull_request/apache_helix/ + commit: ../../rawdata/helix/github/commit/apache_helix/ + # bugzilla: + # project_key_1: + # project_key: helix + # issues: ../../rawdata/helix/bugzilla/issues/helix/ + # issue_comments: ../../rawdata/helix/bugzilla/issue_comments/helix/ #vulnerabilities: # Folder path with nvd cve feeds (e.g. nvdcve-1.1-2018.json) @@ -121,6 +146,32 @@ tool: - Throw - Parameter - Contain + # dv8: + # # The project folder path to store various intermediate + # # files for DV8 Analysis + # # The folder name will be used in the file names. 
+ # folder_path: ../../analysis/junit/dv8/ + # # the architectural flaws thresholds that should be used + # architectural_flaws: + # cliqueDepends: + # - call + # - use + # crossingCochange: 2 + # crossingFanIn: 4 + # crossingFanOut: 4 + # mvCochange: 2 + # uiCochange: 2 + # uihDepends: + # - call + # - use + # uihInheritance: + # - extend + # - implement + # - public + # - private + # - virtual + # uiHistoryImpact: 10 + # uiStructImpact: 0.01 # Uctags allows finer file-file dependency parsing (e.g. functions, classes, structs) uctags: # See https://github.com/sailuh/kaiaulu/wiki/Universal-Ctags for details @@ -140,6 +191,36 @@ tool: - f # functions r: - f # functions + # # srcML allows parsing source code as text (e.g. identifiers) + # srcml: + # # The file path to where you wish to store the srcml output of the project + # srcml_path: ../../analysis/junit5/srcml/srcml_junit.xml + # pattern4: + # # The file path to where you wish to store the classes of the pattern4 analysis + # class_folder_path: ../../rawdata/junit5/pattern4/classes/junit-platform-engine/build/classes/java/main/org/junit/platform/engine/ + # # The file path to where you wish to store the output of the pattern4 analysis + # output_filepath: ../../analysis/junit5/pattern4/ + # compile_note: > + # 1. Switch Java version to Java 17: + # https://stackoverflow.com/questions/69875335/macos-how-to-install-java-17 + # 2. Disable VPN to pull modules from Gradle Plugin Portal. + # 3. Use sudo ./gradlew build + # 4.
After building, locate the engine class files and specify as the class_folder_path: + # in this case they are in: /path/to/junit5/analysis/junit-platform-engine/build/classes/java/main/org/junit/platform/engine/ + # understand: + # # Accepts one language at a time: ada, assembly, c/c++, c#, fortran, java, jovial, delphi/pascal, python, vhdl, basic, javascript + # code_language: java + # # Specify which types of Dependencies to keep + # keep_dependencies_type: + # - Import + # - Call + # - Create + # - Use + # - Type GenericArgument + # # Where the files to analyze should be stored + # project_path: ../../rawdata/kaiaulu/git_repo/understand/ + # # Where the output for the understands analysis is stored + # output_path: ../../analysis/kaiaulu/understand/ # Analysis Configuration # analysis: diff --git a/conf/junit5.yml b/conf/junit5.yml index cdf0a332..cb37b4f2 100644 --- a/conf/junit5.yml +++ b/conf/junit5.yml @@ -36,47 +36,86 @@ version_control: # Where is the git log located locally? # This is the path to the .git of the project repository you are analyzing. # The .git is hidden, so you can see it using `ls -a` - log: ../../rawdata/git_repo/junit5/.git + # log: ../../rawdata/git_repo/junit5/.git + log: ../../rawdata/junit5/git_repo/junit5/.git # From where the git log was downloaded? log_url: https://github.com/junit-team/junit5/ # List of branches used for analysis branch: - main -#mailing_list: - # Where is the mbox located locally? -# mbox: ../../rawdata/mbox/apr-dev_2012_2019.mbox - # What is the domain of the chosen mailing list archive? -# domain: http://mail-archives.apache.org/mod_mbox - # Which lists of the domain will be used? -# list_key: -# - apr-dev +# mailing_list: +# mod_mbox: +# project_key_1: +# mailing_list: http://mail-archives.apache.org/mod_mbox/kaiaulu-dev +# save_folder_path: ../../rawdata/kaiaulu/mod_mbox/save_mbox_mail/ +# # mbox_file_path is for use only with parse_mbox() function. 
It is the file to parse +# mbox_file_path: ../../rawdata/kaiaulu/mod_mbox/save_mbox_mail/kaiaulu.mbox +# project_key_2: +# mailing_list: http://mail-archives.apache.org/mod_mbox/kaiaulu-user +# save_folder_path: ../../rawdata/kaiaulu/mod_mbox/save_mbox_mail_2/ +# # mbox_file_path is for use only with parse_mbox() function. It is the file to parse +# mbox_file_path: ../../rawdata/kaiaulu/mod_mbox/save_mbox_mail_2/kaiaulu.mbox +# pipermail: +# project_key_1: +# mailing_list: https://mta.openssl.org/pipermail/kaiaulu-dev/ +# save_folder_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail/ +# # mbox_file_path is for use only with parse_mbox() function. It is the file to parse +# mbox_file_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail/kaiaulu.mbox +# project_key_2: +# mailing_list: https://mta.openssl.org/pipermail/kaiaulu-users/ +# save_folder_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail_2/ +# # mbox_file_path is for use only with parse_mbox() function. It is the file to parse +# mbox_file_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail_2/kaiaulu.mbox -#issue_tracker: -# jira: - # Obtained from the project's JIRA URL -# domain: https://issues.apache.org/jira - #project_key: HELIX - # Download using `download_jira_data.Rmd` - #issues: ../../rawdata/issue_tracker/helix_issues.json - #issue_comments: ../../rawdata/issue_tracker/helix_issue_comments.json -# github: - # Obtained from the project's GitHub URL -# owner: apache -# repo: apr - # Download using `download_github_comments.Rmd` -# replies: ../../rawdata/github/apr/ +# issue_tracker: +# jira: +# project_key_1: +# # Obtained from the project's JIRA URL +# domain: https://sailuh.atlassian.net +# project_key: SAILUH +# # Download using `download_jira_data.Rmd` +# issues: ../../rawdata/kaiaulu/jira/issues/sailuh/ +# issue_comments: ../../rawdata/kaiaulu/jira/issue_comments/sailuh/ +# github: +# project_key_1: +# # Obtained from the project's GitHub URL +# owner: sailuh +# repo: kaiaulu +# # Download 
using `download_github_comments.Rmd` +# issue_or_pr_comment: ../../rawdata/kaiaulu/github/issue_or_pr_comment/sailuh_kaiaulu/ +# issue: ../../rawdata/kaiaulu/github/issue/sailuh_kaiaulu/ +# issue_search: ../../rawdata/kaiaulu/github/issue_search/sailuh_kaiaulu/ +# issue_event: ../../rawdata/kaiaulu/github/issue_event/sailuh_kaiaulu/ +# pull_request: ../../rawdata/kaiaulu/github/pull_request/sailuh_kaiaulu/ +# commit: ../../rawdata/kaiaulu/github/commit/sailuh_kaiaulu/ +# project_key_2: +# # Obtained from the project's GitHub URL +# owner: ssunoo2 +# repo: kaiaulu +# # Download using `download_github_comments.Rmd` +# issue_or_pr_comment: ../../rawdata/kaiaulu/github/issue_or_pr_comment/ssunoo2_kaiaulu/ +# issue: ../../rawdata/kaiaulu/github/issue/ssunoo2_kaiaulu/ +# issue_search: ../../rawdata/kaiaulu/github/issue_search/ssunoo2_kaiaulu/ +# issue_event: ../../rawdata/kaiaulu/github/issue_event/ssunoo2_kaiaulu/ +# pull_request: ../../rawdata/kaiaulu/github/pull_request/ssunoo2_kaiaulu/ +# commit: ../../rawdata/kaiaulu/github/commit/ssunoo2_kaiaulu/ +# bugzilla: +# project_key_1: +# project_key: kaiaulu +# issues: ../../rawdata/kaiaulu/bugzilla/issues/kaiaulu/ +# issue_comments: ../../rawdata/kaiaulu/bugzilla/issue_comments/kaiaulu/ #vulnerabilities: # Folder path with nvd cve feeds (e.g. nvdcve-1.1-2018.json) # Download at: https://nvd.nist.gov/vuln/data-feeds - #nvd_feed: rawdata/nvdfeed + # nvd_feed: rawdata/nvdfeed # Commit message CVE or Issue Regular Expression (regex) # See project's commit message for examples to create the regex -#commit_message_id_regex: +# commit_message_id_regex: # issue_id: \#[0-9]+ - #cve_id: ? +# cve_id: ? filter: keep_filepaths_ending_with: @@ -120,7 +159,7 @@ tool: # The project folder path to store various intermediate # files for DV8 Analysis # The folder name will be used in the file names. 
- folder_path: ../../analysis/dv8/junit + folder_path: ../../analysis/junit5/dv8/ # the architectural flaws thresholds that should be used architectural_flaws: cliqueDepends: @@ -164,17 +203,34 @@ tool: # srcML allow to parse src code as text (e.g. identifiers) srcml: # The file path to where you wish to store the srcml output of the project - srcml_path: ../../analysis/junit/srcml_junit.xml + srcml_path: ../../analysis/junit5/srcml/srcml_junit.xml pattern4: - # The file path to where you wish to store the srcml output of the project - class_folder_path: ../../rawdata/git_repo/junit5/junit-platform-engine/build/classes/java/main/org/junit/platform/engine/ + # The file path to where you wish to store the classes of the pattern4 analysis + class_folder_path: ../../rawdata/junit5/git_repo/junit5/junit-platform-engine/build/classes/java/main/org/junit/platform/engine/ + # The file path to where you wish to store the output of the pattern4 analysis + output_filepath: ../../analysis/junit5/pattern4/ compile_note: > 1. Switch Java version to Java 17: https://stackoverflow.com/questions/69875335/macos-how-to-install-java-17 2. Disable VPN to pull modules from Gradle Plugin Portal. 3. Use sudo ./gradlew build 4. 
After building, locate the engine class files and specify as the class_folder_path: - in this case they are in: /path/to/junit5/junit-platform-engine/build/classes/java/main/org/junit/platform/engine/ + in this case they are in: /path/to/junit5/analysis/junit-platform-engine/build/classes/java/main/org/junit/platform/engine/ + # understand: + # # Accepts one language at a time: ada, assembly, c/c++, c#, fortran, java, jovial, delphi/pascal, python, vhdl, basic, javascript + # code_language: java + # # Specify which types of Dependencies to keep + # keep_dependencies_type: + # - Import + # - Call + # - Create + # - Use + # - Type GenericArgument + # # Where the files to analyze should be stored + # project_path: ../../rawdata/kaiaulu/git_repo/understand/ + # # Where the output for the understands analysis is stored + # output_path: ../../analysis/kaiaulu/understand/ + # Analysis Configuration # analysis: # A list of topic and keywords (see src_text_showcase.Rmd). @@ -203,20 +259,20 @@ analysis: - command # You can specify the intervals in 2 ways: window, or enumeration # window: - # If using gitlog, use start_commit and end_commit. Timestamp is inferred from gitlog +# # If using gitlog, use start_commit and end_commit. Timestamp is inferred from gitlog # start_commit: 9eae9e96f15e1f216162810cef4271a439a74223 # end_commit: f8f9ec1f249dd552065aa37c983bed4d4d869bb0 - # Use datetime only if no gitlog is used in the analysis. - #start_datetime: 2013-05-01 00:00:00 - #end_datetime: 2013-11-01 00:00:00 +# # Use datetime only if no gitlog is used in the analysis. +# start_datetime: 2013-05-01 00:00:00 +# end_datetime: 2013-11-01 00:00:00 # size_days: 90 # enumeration: - # If using gitlog, specify the commits +# # If using gitlog, specify the commits # commit: # - 9eae9e96f15e1f216162810cef4271a439a74223 # - f1d2d568776b3708dd6a3077376e2331f9268b04 # - c33a2ce74c84f0d435bfa2dd8953d132ebf7a77a - # Use datetime only if no gitlog is used in the analysis. 
Timestamp is inferred from gitlog +# # Use datetime only if no gitlog is used in the analysis. Timestamp is inferred from gitlog # datetime: # - 2013-05-01 00:00:00 # - 2013-08-01 00:00:00 diff --git a/conf/kaiaulu.yml b/conf/kaiaulu.yml index 3d0e8e9b..14cc2ed9 100644 --- a/conf/kaiaulu.yml +++ b/conf/kaiaulu.yml @@ -36,41 +36,79 @@ version_control: # Where is the git log located locally? # This is the path to the .git of the project repository you are analyzing. # The .git is hidden, so you can see it using `ls -a` - log: ../../rawdata/git_repo/kaiaulu/.git + log: ../../rawdata/kaiaulu/git_repo/kaiaulu/.git # From where the git log was downloaded? log_url: https://github.com/sailuh/kaiaulu # List of branches used for analysis branch: - master -mailing_list: - # Where is the mbox located locally? - #mbox: ../../rawdata/mbox/geronimo-dev.mbox - # What is the domain of the chosen mailing list archive? - #domain: http://mail-archives.apache.org/mod_mbox - # Which lists of the domain will be used? - #list_key: - # - geronimo-dev +# mailing_list: +# mod_mbox: +# project_key_1: +# mailing_list: https://lists.apache.org/list.html?announce@apache.org +# save_folder_path: ../../rawdata/kaiaulu/mod_mbox/save_mbox_mail/ +# # mbox_file_path is for use only with parse_mbox() function. It is the file to parse +# mbox_file_path: ../../rawdata/kaiaulu/mod_mbox/save_mbox_mail/kaiaulu.mbox +# project_key_2: +# mailing_list: https://lists.apache.org/list.html?dev@felix.apache.org +# save_folder_path: ../../rawdata/kaiaulu/mod_mbox/save_mbox_mail_2/ +# # mbox_file_path is for use only with parse_mbox() function. It is the file to parse +# mbox_file_path: ../../rawdata/kaiaulu/mod_mbox/save_mbox_mail_2/kaiaulu.mbox +# pipermail: +# project_key_1: +# mailing_list: https://mta.openssl.org/pipermail/openssl-users/ +# save_folder_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail/ +# # mbox_file_path is for use only with parse_mbox() function. 
It is the file to parse +# mbox_file_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail/kaiaulu.mbox +# project_key_2: +# mailing_list: https://mta.openssl.org/pipermail/openssl-project/ +# save_folder_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail_2/ +# # mbox_file_path is for use only with parse_mbox() function. It is the file to parse +# mbox_file_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail_2/kaiaulu.mbox issue_tracker: jira: - # Obtained from the project's JIRA URL - domain: https://sailuh.atlassian.net - project_key: SAILUH - # Download using `download_jira_data.Rmd` - issues: ../../rawdata/issue_tracker/kaiaulu/issues/ - issue_comments: ../../rawdata/issue_tracker/kaiaulu/issue_comments/ + project_key_1: + # Obtained from the project's JIRA URL + domain: https://sailuh.atlassian.net + project_key: SAILUH + # Download using `download_jira_data.Rmd` + issues: ../../rawdata/kaiaulu/jira/sailuh/issues/ + issue_comments: ../../rawdata/kaiaulu/jira/sailuh/issue_comments/ github: - # Obtained from the project's GitHub URL - owner: sailuh - repo: kaiaulu - # Download using `download_github_comments.Rmd` - replies: ../../rawdata/github/kaiaulu + project_key_1: + # Obtained from the project's GitHub URL + owner: sailuh + repo: kaiaulu + # Download using `download_github_comments.Rmd` + issue_or_pr_comment: ../../rawdata/kaiaulu/github/sailuh_kaiaulu/issue_or_pr_comment/ + issue: ../../rawdata/kaiaulu/github/sailuh_kaiaulu/issue/ + issue_search: ../../rawdata/kaiaulu/github/sailuh_kaiaulu/issue_search/ + issue_event: ../../rawdata/kaiaulu/github/sailuh_kaiaulu/issue_event/ + pull_request: ../../rawdata/kaiaulu/github/sailuh_kaiaulu/pull_request/ + commit: ../../rawdata/kaiaulu/github/sailuh_kaiaulu/commit/ +# project_key_2: +# # Obtained from the project's GitHub URL +# owner: ssunoo2 +# repo: kaiaulu +# # Download using `download_github_comments.Rmd` +# issue_or_pr_comment: ../../rawdata/kaiaulu/github/issue_or_pr_comment/ssunoo2_kaiaulu/ +# issue: 
../../rawdata/kaiaulu/github/issue/ssunoo2_kaiaulu/ +# issue_search: ../../rawdata/kaiaulu/github/issue_search/ssunoo2_kaiaulu/ +# issue_event: ../../rawdata/kaiaulu/github/issue_event/ssunoo2_kaiaulu/ +# pull_request: ../../rawdata/kaiaulu/github/pull_request/ssunoo2_kaiaulu/ +# commit: ../../rawdata/kaiaulu/github/commit/ssunoo2_kaiaulu/ +# bugzilla: +# project_key_1: +# project_key: kaiaulu +# issues: ../../rawdata/kaiaulu/bugzilla/issues/kaiaulu/ +# issue_comments: ../../rawdata/kaiaulu/bugzilla/issue_comments/kaiaulu/ -#vulnerabilities: +vulnerabilities: # Folder path with nvd cve feeds (e.g. nvdcve-1.1-2018.json) # Download at: https://nvd.nist.gov/vuln/data-feeds - #nvd_feed: rawdata/nvdfeed + nvd_feed: rawdata/nvdfeed # Commit message CVE or Issue Regular Expression (regex) # See project's commit message for examples to create the regex @@ -109,6 +147,32 @@ tool: - Throw - Parameter - Contain + # dv8: + # # The project folder path to store various intermediate + # # files for DV8 Analysis + # # The folder name will be used in the file names. + # folder_path: ../../analysis/junit/dv8/ + # # the architectural flaws thresholds that should be used + # architectural_flaws: + # cliqueDepends: + # - call + # - use + # crossingCochange: 2 + # crossingFanIn: 4 + # crossingFanOut: 4 + # mvCochange: 2 + # uiCochange: 2 + # uihDepends: + # - call + # - use + # uihInheritance: + # - extend + # - implement + # - public + # - private + # - virtual + # uiHistoryImpact: 10 + # uiStructImpact: 0.01 # Uctags allows finer file-file dependency parsing (e.g. functions, classes, structs) uctags: # See https://github.com/sailuh/kaiaulu/wiki/Universal-Ctags for details @@ -128,6 +192,37 @@ tool: - f # functions r: - f # functions + # # srcML allow to parse src code as text (e.g. 
identifiers) + # srcml: + # # The file path to where you wish to store the srcml output of the project + # srcml_path: ../../analysis/junit5/srcml/srcml_junit.xml + # pattern4: + # # The file path to where you wish to store the classes of the pattern4 analysis + # class_folder_path: ../../rawdata/junit5/pattern4/classes/junit-platform-engine/build/classes/java/main/org/junit/platform/engine/ + # # The file path to where you wish to store the output of the pattern4 analysis + # output_filepath: ../../analysis/junit5/pattern4/ + # compile_note: > + # 1. Switch Java version to Java 17: + # https://stackoverflow.com/questions/69875335/macos-how-to-install-java-17 + # 2. Disable VPN to pull modules from Gradle Plugin Portal. + # 3. Use sudo ./gradlew build + # 4. After building, locate the engine class files and specify as the class_folder_path: + # in this case they are in: /path/to/junit5/analysis/junit-platform-engine/build/classes/java/main/org/junit/platform/engine/ + understand: + # Accepts one language at a time: ada, assembly, c/c++, c#, fortran, java, jovial, delphi/pascal, python, vhdl, basic, javascript + code_language: java + # Specify which types of Dependencies to keep + keep_dependencies_type: + - Import + - Call + - Create + - Use + - Type GenericArgument + # Where the files to analyze should be stored + project_path: ../../rawdata/kaiaulu/git_repo/understand/ + # Where the output for the understands analysis is stored + output_path: ../../analysis/kaiaulu/understand/ + # Analysis Configuration # analysis: diff --git a/conf/openssl.yml b/conf/openssl.yml index aa7b2254..d41cf319 100644 --- a/conf/openssl.yml +++ b/conf/openssl.yml @@ -36,7 +36,7 @@ version_control: # Where is the git log located locally? # This is the path to the .git of the project repository you are analyzing. 
# The .git is hidden, so you can see it using `ls -a` - log: ../../rawdata/git_repo/OpenSSL/.git + log: ../../rawdata/openssl/git_repo/.git # From where the git log was downloaded? log_url: https://github.com/openssl/openssl # List of branches used for analysis @@ -45,29 +45,65 @@ version_control: - master mailing_list: - # Where is the mbox located locally? - #mbox: ../../rawdata/mbox/openssl_dev_mbox # 2004-2008 fields are complete - mbox: ../../rawdata/mbox/openssl-dev.mbx # 2002-2019 gmail field is redacted due to google groups - # What is the domain of the chosen mailing list archive? - #domain: http://mail-archives.apache.org/mod_mbox - # Which lists of the domain will be used? - #list_key: - # - apr-dev + mod_mbox: + project_key_1: + mailing_list: http://mail-archives.apache.org/mod_mbox/kaiaulu-dev + save_folder_path: ../../rawdata/kaiaulu/mod_mbox/save_mbox_mail/ + # mbox_file_path is for use only with parse_mbox() function. It is the file to parse + mbox_file_path: ../../rawdata/kaiaulu/mod_mbox/save_mbox_mail/kaiaulu.mbox + project_key_2: + mailing_list: http://mail-archives.apache.org/mod_mbox/kaiaulu-user + save_folder_path: ../../rawdata/kaiaulu/mod_mbox/save_mbox_mail_2/ + # mbox_file_path is for use only with parse_mbox() function. It is the file to parse + mbox_file_path: ../../rawdata/kaiaulu/mod_mbox/save_mbox_mail_2/kaiaulu.mbox + pipermail: + project_key_1: + mailing_list: https://mta.openssl.org/pipermail/openssl-dev/ + save_folder_path: ../../rawdata/openssl/pipermail/save_mbox_mail/ + # mbox_file_path is for use only with parse_mbox() function. 
It is the file to parse +# mbox_file_path: ../../rawdata/openssl/pipermail/save_mbox_mail/openssl.mbox + project_key_2: + mailing_list: https://mta.openssl.org/pipermail/openssl-users/ + save_folder_path: ../../rawdata/openssl/pipermail/save_mbox_mail_2/ +# mbox_file_path: ../../rawdata/openssl/pipermail/save_mbox_mail_2/openssl.mbox -#issue_tracker: - #jira: - # Obtained from the project's JIRA URL - #domain: https://issues.apache.org/jira - #project_key: HELIX - # Download using `download_jira_data.Rmd` - #issues: ../../rawdata/issue_tracker/helix_issues.json - #issue_comments: ../../rawdata/issue_tracker/helix_issue_comments.json - #github: - # Obtained from the project's GitHub URL - #owner: apache - #repo: apr - # Download using `download_github_comments.Rmd` - #replies: ../../rawdata/github/apr/ +# issue_tracker: +# jira: +# project_key_1: +# # Obtained from the project's JIRA URL +# domain: https://sailuh.atlassian.net +# project_key: SAILUH +# # Download using `download_jira_data.Rmd` +# issues: ../../rawdata/kaiaulu/jira/issues/sailuh/ +# issue_comments: ../../rawdata/kaiaulu/jira/issue_comments/sailuh/ +# github: +# project_key_1: +# # Obtained from the project's GitHub URL +# owner: sailuh +# repo: kaiaulu +# # Download using `download_github_comments.Rmd` +# issue_or_pr_comment: ../../rawdata/kaiaulu/github/issue_or_pr_comment/sailuh_kaiaulu/ +# issue: ../../rawdata/kaiaulu/github/issue/sailuh_kaiaulu/ +# issue_search: ../../rawdata/kaiaulu/github/issue_search/sailuh_kaiaulu/ +# issue_event: ../../rawdata/kaiaulu/github/issue_event/sailuh_kaiaulu/ +# pull_request: ../../rawdata/kaiaulu/github/pull_request/sailuh_kaiaulu/ +# commit: ../../rawdata/kaiaulu/github/commit/sailuh_kaiaulu/ +# project_key_2: +# # Obtained from the project's GitHub URL +# owner: ssunoo2 +# repo: kaiaulu +# # Download using `download_github_comments.Rmd` +# issue_or_pr_comment: ../../rawdata/kaiaulu/github/issue_or_pr_comment/ssunoo2_kaiaulu/ +# issue: 
../../rawdata/kaiaulu/github/issue/ssunoo2_kaiaulu/ +# issue_search: ../../rawdata/kaiaulu/github/issue_search/ssunoo2_kaiaulu/ +# issue_event: ../../rawdata/kaiaulu/github/issue_event/ssunoo2_kaiaulu/ +# pull_request: ../../rawdata/kaiaulu/github/pull_request/ssunoo2_kaiaulu/ +# commit: ../../rawdata/kaiaulu/github/commit/ssunoo2_kaiaulu/ +# bugzilla: +# project_key_1: +# project_key: kaiaulu +# issues: ../../rawdata/kaiaulu/bugzilla/issues/kaiaulu/ +# issue_comments: ../../rawdata/kaiaulu/bugzilla/issue_comments/kaiaulu/ vulnerabilities: # Folder path with nvd cve feeds (e.g. nvdcve-1.1-2018.json) @@ -117,6 +153,32 @@ tool: - Throw - Parameter - Contain + # dv8: + # # The project folder path to store various intermediate + # # files for DV8 Analysis + # # The folder name will be used in the file names. + # folder_path: ../../analysis/junit/dv8/ + # # the architectural flaws thresholds that should be used + # architectural_flaws: + # cliqueDepends: + # - call + # - use + # crossingCochange: 2 + # crossingFanIn: 4 + # crossingFanOut: 4 + # mvCochange: 2 + # uiCochange: 2 + # uihDepends: + # - call + # - use + # uihInheritance: + # - extend + # - implement + # - public + # - private + # - virtual + # uiHistoryImpact: 10 + # uiStructImpact: 0.01 # Uctags allows finer file-file dependency parsing (e.g. functions, classes, structs) uctags: # See https://github.com/sailuh/kaiaulu/wiki/Universal-Ctags for details @@ -136,6 +198,36 @@ tool: - f # functions r: - f # functions + # # srcML allow to parse src code as text (e.g. 
identifiers) + # srcml: + # # The file path to where you wish to store the srcml output of the project + # srcml_path: ../../analysis/junit5/srcml/srcml_junit.xml + # pattern4: + # # The file path to where you wish to store the classes of the pattern4 analysis + # class_folder_path: ../../rawdata/junit5/pattern4/classes/junit-platform-engine/build/classes/java/main/org/junit/platform/engine/ + # # The file path to where you wish to store the output of the pattern4 analysis + # output_filepath: ../../analysis/junit5/pattern4/ + # compile_note: > + # 1. Switch Java version to Java 17: + # https://stackoverflow.com/questions/69875335/macos-how-to-install-java-17 + # 2. Disable VPN to pull modules from Gradle Plugin Portal. + # 3. Use sudo ./gradlew build + # 4. After building, locate the engine class files and specify as the class_folder_path: + # in this case they are in: /path/to/junit5/analysis/junit-platform-engine/build/classes/java/main/org/junit/platform/engine/ + # understand: + # # Accepts one language at a time: ada, assembly, c/c++, c#, fortran, java, jovial, delphi/pascal, python, vhdl, basic, javascript + # code_language: java + # # Specify which types of Dependencies to keep + # keep_dependencies_type: + # - Import + # - Call + # - Create + # - Use + # - Type GenericArgument + # # Where the files to analyze should be stored + # project_path: ../../rawdata/kaiaulu/git_repo/understand/ + # # Where the output for the understands analysis is stored + # output_path: ../../analysis/kaiaulu/understand/ # Analysis Configuration # analysis: diff --git a/conf/redhat.yml b/conf/redhat.yml new file mode 100644 index 00000000..49a8d746 --- /dev/null +++ b/conf/redhat.yml @@ -0,0 +1,248 @@ +# -*- yaml -*- +# https://github.com/sailuh/kaiaulu +# +# Copying and distribution of this file, with or without modification, +# are permitted in any medium without royalty provided the copyright +# notice and this notice are preserved. 
This file is offered as-is, +# without any warranty. + +# Project Configuration File # +# +# To perform analysis on open source projects, you need to manually +# collect some information from the project's website. As there is +# no standardized website format, this file serves to distill +# important data source information so it can be reused by others +# and understood by Kaiaulu. +# +# Please check https://github.com/sailuh/kaiaulu/tree/master/conf to +# see if a project configuration file already exists. Otherwise, we +# would appreciate if you share your curated file with us by sending a +# Pull Request: https://github.com/sailuh/kaiaulu/pulls +# +# Note, you do NOT need to specify this entire file to conduct analysis. +# Each R Notebook uses a different portion of this file. To know what +# information is used, see the project configuration file section at +# the start of each R Notebook. +# +# Please comment unused parameters instead of deleting them for clarity. +# If you have questions, please open a discussion: +# https://github.com/sailuh/kaiaulu/discussions + +project: + website: http://itm0.shidler.hawaii.edu/kaiaulu + openhub: https://www.openhub.net/p/kaiaulu + +version_control: + # Where is the git log located locally? + # This is the path to the .git of the project repository you are analyzing. + # The .git is hidden, so you can see it using `ls -a` + log: ../../rawdata/kaiaulu/git_repo/.git + # From where the git log was downloaded? + log_url: https://github.com/sailuh/kaiaulu + # List of branches used for analysis + branch: + - master + +# mailing_list: +# mod_mbox: +# project_key_1: +# mailing_list: http://mail-archives.apache.org/mod_mbox/kaiaulu-dev +# save_folder_path: ../../rawdata/kaiaulu/mod_mbox/save_mbox_mail/ +# # mbox_file_path is for use only with parse_mbox() function. 
It is the file to parse +# mbox_file_path: ../../rawdata/kaiaulu/mod_mbox/save_mbox_mail/kaiaulu.mbox +# project_key_2: +# mailing_list: http://mail-archives.apache.org/mod_mbox/kaiaulu-user +# save_folder_path: ../../rawdata/kaiaulu/mod_mbox/save_mbox_mail_2/ +# # mbox_file_path is for use only with parse_mbox() function. It is the file to parse +# mbox_file_path: ../../rawdata/kaiaulu/mod_mbox/save_mbox_mail_2/kaiaulu.mbox +# pipermail: +# project_key_1: +# mailing_list: https://mta.openssl.org/pipermail/kaiaulu-dev/ +# save_folder_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail/ +# # mbox_file_path is for use only with parse_mbox() function. It is the file to parse +# mbox_file_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail/kaiaulu.mbox +# project_key_2: +# mailing_list: https://mta.openssl.org/pipermail/kaiaulu-users/ +# save_folder_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail_2/ +# # mbox_file_path is for use only with parse_mbox() function. It is the file to parse +# mbox_file_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail_2/kaiaulu.mbox + +# issue_tracker: +# jira: +# project_key_1: +# # Obtained from the project's JIRA URL +# domain: https://sailuh.atlassian.net +# project_key: SAILUH +# # Download using `download_jira_data.Rmd` +# issues: ../../rawdata/kaiaulu/jira/issues/sailuh/ +# issue_comments: ../../rawdata/kaiaulu/jira/issue_comments/sailuh/ +# github: +# project_key_1: +# # Obtained from the project's GitHub URL +# owner: sailuh +# repo: kaiaulu +# # Download using `download_github_comments.Rmd` +# issue_or_pr_comment: ../../rawdata/kaiaulu/github/issue_or_pr_comment/sailuh_kaiaulu/ +# issue: ../../rawdata/kaiaulu/github/issue/sailuh_kaiaulu/ +# issue_search: ../../rawdata/kaiaulu/github/issue_search/sailuh_kaiaulu/ +# issue_event: ../../rawdata/kaiaulu/github/issue_event/sailuh_kaiaulu/ +# pull_request: ../../rawdata/kaiaulu/github/pull_request/sailuh_kaiaulu/ +# commit: ../../rawdata/kaiaulu/github/commit/sailuh_kaiaulu/ 
+# project_key_2: +# # Obtained from the project's GitHub URL +# owner: ssunoo2 +# repo: kaiaulu +# # Download using `download_github_comments.Rmd` +# issue_or_pr_comment: ../../rawdata/kaiaulu/github/issue_or_pr_comment/ssunoo2_kaiaulu/ +# issue: ../../rawdata/kaiaulu/github/issue/ssunoo2_kaiaulu/ +# issue_search: ../../rawdata/kaiaulu/github/issue_search/ssunoo2_kaiaulu/ +# issue_event: ../../rawdata/kaiaulu/github/issue_event/ssunoo2_kaiaulu/ +# pull_request: ../../rawdata/kaiaulu/github/pull_request/ssunoo2_kaiaulu/ +# commit: ../../rawdata/kaiaulu/github/commit/ssunoo2_kaiaulu/ +# bugzilla: +# project_key_1: +# project_key: kaiaulu +# issues: ../../rawdata/kaiaulu/bugzilla/issues/kaiaulu/ +# issue_comments: ../../rawdata/kaiaulu/bugzilla/issue_comments/kaiaulu/ + + +#vulnerabilities: + # Folder path with nvd cve feeds (e.g. nvdcve-1.1-2018.json) + # Download at: https://nvd.nist.gov/vuln/data-feeds + #nvd_feed: rawdata/nvdfeed + +# Commit message CVE or Issue Regular Expression (regex) +# See project's commit message for examples to create the regex +commit_message_id_regex: + issue_id: \#[0-9]+ + #cve_id: ? + +filter: + keep_filepaths_ending_with: + - R + remove_filepaths_containing: + - test + + +# Third Party Tools Configuration # +# +# See Kaiaulu's README.md for details on how to set up these tools. +tool: + # Depends allows parsing file-file static dependencies. + depends: + # accepts one language at a time: cpp, java, ruby, python, pom + # You can obtain this information on OpenHub or the project GitHub page right pane. + code_language: java + # Specify which types of Dependencies to keep - see the Depends tool README.md for details. + keep_dependencies_type: + - Cast + - Call + - Import + - Return + - Set + - Use + - Implement + - ImplLink + - Extend + - Create + - Throw + - Parameter + - Contain + # dv8: + # # The project folder path to store various intermediate + # # files for DV8 Analysis + # # The folder name will be used in the file names.
+ # folder_path: ../../analysis/junit/dv8/ + # # the architectural flaws thresholds that should be used + # architectural_flaws: + # cliqueDepends: + # - call + # - use + # crossingCochange: 2 + # crossingFanIn: 4 + # crossingFanOut: 4 + # mvCochange: 2 + # uiCochange: 2 + # uihDepends: + # - call + # - use + # uihInheritance: + # - extend + # - implement + # - public + # - private + # - virtual + # uiHistoryImpact: 10 + # uiStructImpact: 0.01 + # Uctags allows finer file-file dependency parsing (e.g. functions, classes, structs) + uctags: + # See https://github.com/sailuh/kaiaulu/wiki/Universal-Ctags for details + # What types of file-file dependencies should be considered? If all + # dependencies are specified, Kaiaulu will use all of them if available. + keep_lines_type: + c: + - f # function definition + cpp: + - c # classes + - f # function definition + java: + - c # classes + - m # methods + python: + - c # classes + - f # functions + r: + - f # functions + # # srcML allow to parse src code as text (e.g. identifiers) + # srcml: + # # The file path to where you wish to store the srcml output of the project + # srcml_path: ../../analysis/junit5/srcml/srcml_junit.xml + # pattern4: + # # The file path to where you wish to store the classes of the pattern4 analysis + # class_folder_path: ../../rawdata/junit5/pattern4/classes/junit-platform-engine/build/classes/java/main/org/junit/platform/engine/ + # # The file path to where you wish to store the output of the pattern4 analysis + # output_filepath: ../../analysis/junit5/pattern4/ + # compile_note: > + # 1. Switch Java version to Java 17: + # https://stackoverflow.com/questions/69875335/macos-how-to-install-java-17 + # 2. Disable VPN to pull modules from Gradle Plugin Portal. + # 3. Use sudo ./gradlew build + # 4. 
After building, locate the engine class files and specify as the class_folder_path: + # in this case they are in: /path/to/junit5/analysis/junit-platform-engine/build/classes/java/main/org/junit/platform/engine/ + # understand: + # # Accepts one language at a time: ada, assembly, c/c++, c#, fortran, java, jovial, delphi/pascal, python, vhdl, basic, javascript + # code_language: java + # # Specify which types of Dependencies to keep + # keep_dependencies_type: + # - Import + # - Call + # - Create + # - Use + # - Type GenericArgument + # # Where the files to analyze should be stored + # project_path: ../../rawdata/kaiaulu/git_repo/understand/ + # # Where the output for the Understand analysis is stored + # output_path: ../../analysis/kaiaulu/understand/ + +# Analysis Configuration # +analysis: + # You can specify the intervals in 2 ways: window, or enumeration + window: + # If using gitlog, use start_commit and end_commit. Timestamp is inferred from gitlog + start_commit: 224a729f44f554af311ca52cf01b105ded87499b + end_commit: 74cd4d4835a02e01e310476c6776192ad0d97173 + # Use datetime only if no gitlog is used in the analysis. + #start_datetime: 2013-05-01 00:00:00 + #end_datetime: 2013-11-01 00:00:00 + size_days: 30 +# enumeration: + # If using gitlog, specify the commits +# commit: +# - 9eae9e96f15e1f216162810cef4271a439a74223 +# - f1d2d568776b3708dd6a3077376e2331f9268b04 +# - c33a2ce74c84f0d435bfa2dd8953d132ebf7a77a + # Use datetime only if no gitlog is used in the analysis. Timestamp is inferred from gitlog +# datetime: +# - 2013-05-01 00:00:00 +# - 2013-08-01 00:00:00 +# - 2013-11-01 00:00:00 diff --git a/conf/spark.yml b/conf/spark.yml index bafca660..2e6bf2c2 100644 --- a/conf/spark.yml +++ b/conf/spark.yml @@ -34,7 +34,7 @@ project: version_control: # Where is the git log located locally? - log: ../../rawdata/git_repo/spark/.git + log: ../../rawdata/spark/git_repo/.git # From where the git log was downloaded?
log_url: https://github.com/apache/spark # List of branches used for analysis @@ -42,28 +42,66 @@ version_control: - master mailing_list: - # Where is the mbox located locally? - #mbox: ../rawdata/mbox/spark-dev.mbox - # What is the domain of the chosen mailing list archive? - #domain: http://mail-archives.apache.org/mod_mbox - # Which lists of the domain will be used? - #list_key: - # - spark-dev + mod_mbox: + project_key_1: + mailing_list: http://mail-archives.apache.org/mod_mbox/spark-dev + save_folder_path: ../../rawdata/spark/mod_mbox/save_mbox_mail/ + # mbox_file_path is for use only with parse_mbox() function. It is the file to parse +# mbox_file_path: ../../rawdata/spark/mod_mbox/save_mbox_mail/spark.mbox + project_key_2: + mailing_list: http://mail-archives.apache.org/mod_mbox/spark-user + save_folder_path: ../../rawdata/spark/mod_mbox/save_mbox_mail_2/ +# # mbox_file_path is for use only with parse_mbox() function. It is the file to parse +# mbox_file_path: ../../rawdata/kaiaulu/mod_mbox/save_mbox_mail_2/kaiaulu.mbox +# pipermail: +# project_key_1: +# mailing_list: https://mta.openssl.org/pipermail/kaiaulu-dev/ +# save_folder_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail/ +# # mbox_file_path is for use only with parse_mbox() function. It is the file to parse +# mbox_file_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail/kaiaulu.mbox +# project_key_2: +# mailing_list: https://mta.openssl.org/pipermail/kaiaulu-users/ +# save_folder_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail_2/ +# # mbox_file_path is for use only with parse_mbox() function. 
It is the file to parse +# mbox_file_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail_2/kaiaulu.mbox -issue_tracker: - jira: - # Obtained from the project's JIRA URL - #domain: https://issues.apache.org/jira - #project_key: SPARK - # Download using `download_jira_data.Rmd` - #issues: ../../rawdata/issue_tracker/spark_issues.json - #issue_comments: ../../rawdata/issue_tracker/sparj_issue_comments.json - github: - # Obtained from the project's GitHub URL - #owner: apache - #repo: spark - # Download using `download_github_comments.Rmd` - #replies: ../../rawdata/github/spark/ +# issue_tracker: +# jira: +# project_key_1: +# # Obtained from the project's JIRA URL +# domain: https://sailuh.atlassian.net +# project_key: SAILUH +# # Download using `download_jira_data.Rmd` +# issues: ../../rawdata/kaiaulu/jira/issues/sailuh/ +# issue_comments: ../../rawdata/kaiaulu/jira/issue_comments/sailuh/ +# github: +# project_key_1: +# # Obtained from the project's GitHub URL +# owner: sailuh +# repo: kaiaulu +# # Download using `download_github_comments.Rmd` +# issue_or_pr_comment: ../../rawdata/kaiaulu/github/issue_or_pr_comment/sailuh_kaiaulu/ +# issue: ../../rawdata/kaiaulu/github/issue/sailuh_kaiaulu/ +# issue_search: ../../rawdata/kaiaulu/github/issue_search/sailuh_kaiaulu/ +# issue_event: ../../rawdata/kaiaulu/github/issue_event/sailuh_kaiaulu/ +# pull_request: ../../rawdata/kaiaulu/github/pull_request/sailuh_kaiaulu/ +# commit: ../../rawdata/kaiaulu/github/commit/sailuh_kaiaulu/ +# project_key_2: +# # Obtained from the project's GitHub URL +# owner: ssunoo2 +# repo: kaiaulu +# # Download using `download_github_comments.Rmd` +# issue_or_pr_comment: ../../rawdata/kaiaulu/github/issue_or_pr_comment/ssunoo2_kaiaulu/ +# issue: ../../rawdata/kaiaulu/github/issue/ssunoo2_kaiaulu/ +# issue_search: ../../rawdata/kaiaulu/github/issue_search/ssunoo2_kaiaulu/ +# issue_event: ../../rawdata/kaiaulu/github/issue_event/ssunoo2_kaiaulu/ +# pull_request: 
../../rawdata/kaiaulu/github/pull_request/ssunoo2_kaiaulu/ +# commit: ../../rawdata/kaiaulu/github/commit/ssunoo2_kaiaulu/ +# bugzilla: +# project_key_1: +# project_key: kaiaulu +# issues: ../../rawdata/kaiaulu/bugzilla/issues/kaiaulu/ +# issue_comments: ../../rawdata/kaiaulu/bugzilla/issue_comments/kaiaulu/ #vulnerabilities: # Folder path with nvd cve feeds (e.g. nvdcve-1.1-2018.json) @@ -113,6 +151,32 @@ tool: # - Throw # - Parameter # - Contain + # dv8: + # # The project folder path to store various intermediate + # # files for DV8 Analysis + # # The folder name will be used in the file names. + # folder_path: ../../analysis/junit/dv8/ + # # the architectural flaws thresholds that should be used + # architectural_flaws: + # cliqueDepends: + # - call + # - use + # crossingCochange: 2 + # crossingFanIn: 4 + # crossingFanOut: 4 + # mvCochange: 2 + # uiCochange: 2 + # uihDepends: + # - call + # - use + # uihInheritance: + # - extend + # - implement + # - public + # - private + # - virtual + # uiHistoryImpact: 10 + # uiStructImpact: 0.01 # Uctags allows finer file-file dependency parsing (e.g. functions, classes, structs) uctags: # See https://github.com/sailuh/kaiaulu/wiki/Universal-Ctags for details @@ -132,6 +196,36 @@ tool: - f # functions r: - f # functions + # # srcML allow to parse src code as text (e.g. identifiers) + # srcml: + # # The file path to where you wish to store the srcml output of the project + # srcml_path: ../../analysis/junit5/srcml/srcml_junit.xml + # pattern4: + # # The file path to where you wish to store the classes of the pattern4 analysis + # class_folder_path: ../../rawdata/junit5/pattern4/classes/junit-platform-engine/build/classes/java/main/org/junit/platform/engine/ + # # The file path to where you wish to store the output of the pattern4 analysis + # output_filepath: ../../analysis/junit5/pattern4/ + # compile_note: > + # 1. 
Switch Java version to Java 17: + # https://stackoverflow.com/questions/69875335/macos-how-to-install-java-17 + # 2. Disable VPN to pull modules from Gradle Plugin Portal. + # 3. Use sudo ./gradlew build + # 4. After building, locate the engine class files and specify as the class_folder_path: + # in this case they are in: /path/to/junit5/analysis/junit-platform-engine/build/classes/java/main/org/junit/platform/engine/ + # understand: + # # Accepts one language at a time: ada, assembly, c/c++, c#, fortran, java, jovial, delphi/pascal, python, vhdl, basic, javascript + # code_language: java + # # Specify which types of Dependencies to keep + # keep_dependencies_type: + # - Import + # - Call + # - Create + # - Use + # - Type GenericArgument + # # Where the files to analyze should be stored + # project_path: ../../rawdata/kaiaulu/git_repo/understand/ + # # Where the output for the understands analysis is stored + # output_path: ../../analysis/kaiaulu/understand/ # Analysis Configuration # analysis: diff --git a/conf/thrift.yml b/conf/thrift.yml index f47062b2..1f181200 100644 --- a/conf/thrift.yml +++ b/conf/thrift.yml @@ -34,7 +34,7 @@ project: version_control: # Where is the git log located locally? - log: ../../rawdata/git_repo/thrift/.git # cloned Apache Thrift repo and put path to its .git file + log: ../../rawdata/thrift/git_repo/.git # cloned Apache Thrift repo and put path to its .git file # From where the git log was downloaded? log_url: https://github.com/apache/thrift # List of branches used for analysis @@ -42,28 +42,66 @@ version_control: - master mailing_list: - # Where is the mbox located locally? - mbox: ../../rawdata/mbox/thrift-dev.mbox # Download here: https://cdn.lfdr.de/stmc/ieee_tse_data/mail/thrift-dev.mbox - # What is the domain of the chosen mailing list archive? - domain: http://mail-archives.apache.org/mod_mbox - # Which lists of the domain will be used? 
- list_key: - - thrift-dev + mod_mbox: + project_key_1: + mailing_list: http://mail-archives.apache.org/mod_mbox/thrift-dev + save_folder_path: ../../rawdata/thrift/mod_mbox/save_mbox_mail/ + # mbox_file_path is for use only with parse_mbox() function. It is the file to parse +# mbox_file_path: ../../rawdata/thrift/mod_mbox/save_mbox_mail/thrift.mbox + project_key_2: + mailing_list: http://mail-archives.apache.org/mod_mbox/thrift-user + save_folder_path: ../../rawdata/thrift/mod_mbox/save_mbox_mail_2/ +# # mbox_file_path is for use only with parse_mbox() function. It is the file to parse +# mbox_file_path: ../../rawdata/kaiaulu/mod_mbox/save_mbox_mail_2/kaiaulu.mbox +# pipermail: +# project_key_1: +# mailing_list: https://mta.openssl.org/pipermail/kaiaulu-dev/ +# save_folder_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail/ +# # mbox_file_path is for use only with parse_mbox() function. It is the file to parse +# mbox_file_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail/kaiaulu.mbox +# project_key_2: +# mailing_list: https://mta.openssl.org/pipermail/kaiaulu-users/ +# save_folder_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail_2/ +# # mbox_file_path is for use only with parse_mbox() function. 
It is the file to parse +# mbox_file_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail_2/kaiaulu.mbox issue_tracker: jira: - # Obtained from the project's JIRA URL - domain: https://issues.apache.org/jira - project_key: THRIFT - # Download using `download_jira_data.Rmd` - issues: ../../rawdata/issue_tracker/thrift_issues.json - issue_comments: ../../rawdata/issue_tracker/thrift_issue_comments.json + project_key_1: + # Obtained from the project's JIRA URL + domain: https://issues.apache.org/jira + project_key: THRIFT + # Download using `download_jira_data.Rmd` + issues: ../../rawdata/thrift/jira/issues/thrift/ + issue_comments: ../../rawdata/thrift/jira/issue_comments/thrift/ github: - # Obtained from the project's GitHub URL - owner: apache - repo: thrift - # Download using `download_github_comments.Rmd` - replies: ../../rawdata/github/thrift/ + project_key_1: + # Obtained from the project's GitHub URL + owner: apache + repo: thrift + # Download using `download_github_comments.Rmd` + issue_or_pr_comment: ../../rawdata/thrift/github/issue_or_pr_comment/apache_thrift/ + issue: ../../rawdata/thrift/github/issue/apache_thrift/ + issue_search: ../../rawdata/thrift/github/issue_search/apache_thrift/ + issue_event: ../../rawdata/thrift/github/issue_event/apache_thrift/ + pull_request: ../../rawdata/thrift/github/pull_request/apache_thrift/ + commit: ../../rawdata/thrift/github/commit/apache_thrift/ +# project_key_2: +# # Obtained from the project's GitHub URL +# owner: ssunoo2 +# repo: kaiaulu +# # Download using `download_github_comments.Rmd` +# issue_or_pr_comment: ../../rawdata/kaiaulu/github/issue_or_pr_comment/ssunoo2_kaiaulu/ +# issue: ../../rawdata/kaiaulu/github/issue/ssunoo2_kaiaulu/ +# issue_search: ../../rawdata/kaiaulu/github/issue_search/ssunoo2_kaiaulu/ +# issue_event: ../../rawdata/kaiaulu/github/issue_event/ssunoo2_kaiaulu/ +# pull_request: ../../rawdata/kaiaulu/github/pull_request/ssunoo2_kaiaulu/ +# commit: 
../../rawdata/kaiaulu/github/commit/ssunoo2_kaiaulu/ +# bugzilla: +# project_key_1: +# project_key: kaiaulu +# issues: ../../rawdata/kaiaulu/bugzilla/issues/kaiaulu/ +# issue_comments: ../../rawdata/kaiaulu/bugzilla/issue_comments/kaiaulu/ #vulnerabilities: # Folder path with nvd cve feeds (e.g. nvdcve-1.1-2018.json) @@ -113,6 +151,32 @@ tool: - Throw - Parameter - Contain + # dv8: + # # The project folder path to store various intermediate + # # files for DV8 Analysis + # # The folder name will be used in the file names. + # folder_path: ../../analysis/junit/dv8/ + # # the architectural flaws thresholds that should be used + # architectural_flaws: + # cliqueDepends: + # - call + # - use + # crossingCochange: 2 + # crossingFanIn: 4 + # crossingFanOut: 4 + # mvCochange: 2 + # uiCochange: 2 + # uihDepends: + # - call + # - use + # uihInheritance: + # - extend + # - implement + # - public + # - private + # - virtual + # uiHistoryImpact: 10 + # uiStructImpact: 0.01 # Uctags allows finer file-file dependency parsing (e.g. functions, classes, structs) uctags: # See https://github.com/sailuh/kaiaulu/wiki/Universal-Ctags for details @@ -132,6 +196,36 @@ tool: - f # functions r: - f # functions + # # srcML allow to parse src code as text (e.g. identifiers) + # srcml: + # # The file path to where you wish to store the srcml output of the project + # srcml_path: ../../analysis/junit5/srcml/srcml_junit.xml + # pattern4: + # # The file path to where you wish to store the classes of the pattern4 analysis + # class_folder_path: ../../rawdata/junit5/pattern4/classes/junit-platform-engine/build/classes/java/main/org/junit/platform/engine/ + # # The file path to where you wish to store the output of the pattern4 analysis + # output_filepath: ../../analysis/junit5/pattern4/ + # compile_note: > + # 1. Switch Java version to Java 17: + # https://stackoverflow.com/questions/69875335/macos-how-to-install-java-17 + # 2. Disable VPN to pull modules from Gradle Plugin Portal. + # 3. 
Use sudo ./gradlew build + # 4. After building, locate the engine class files and specify as the class_folder_path: + # in this case they are in: /path/to/junit5/analysis/junit-platform-engine/build/classes/java/main/org/junit/platform/engine/ + # understand: + # # Accepts one language at a time: ada, assembly, c/c++, c#, fortran, java, jovial, delphi/pascal, python, vhdl, basic, javascript + # code_language: java + # # Specify which types of Dependencies to keep + # keep_dependencies_type: + # - Import + # - Call + # - Create + # - Use + # - Type GenericArgument + # # Where the files to analyze should be stored + # project_path: ../../rawdata/kaiaulu/git_repo/understand/ + # # Where the output for the understands analysis is stored + # output_path: ../../analysis/kaiaulu/understand/ # Analysis Configuration # analysis: diff --git a/conf/tomcat.yml b/conf/tomcat.yml index 33884bac..d689fa9d 100644 --- a/conf/tomcat.yml +++ b/conf/tomcat.yml @@ -36,7 +36,7 @@ version_control: # Where is the git log located locally? # This is the path to the .git of the project repository you are analyzing. # The .git is hidden, so you can see it using `ls -a` - log: ../../rawdata/git_repo/Tomcat/.git + log: ../../rawdata/tomcat/git_repo/.git # From where the git log was downloaded? log_url: https://github.com/apache/tomcat # List of branches used for analysis @@ -44,31 +44,66 @@ version_control: - master mailing_list: - # Where is the mbox located locally? - #mbox: ../../rawdata/mbox/tomcat-dev_2013_2019.mbox - # What is the domain of the chosen mailing list archive? - domain: http://mail-archives.apache.org/mod_mbox - # Which lists of the domain will be used? - list_key: - - tomcat-dev + mod_mbox: + project_key_1: + mailing_list: http://mail-archives.apache.org/mod_mbox/tomcat-dev + save_folder_path: ../../rawdata/tomcat/mod_mbox/save_mbox_mail/ + # mbox_file_path is for use only with parse_mbox() function. 
It is the file to parse +# mbox_file_path: ../../rawdata/tomcat/mod_mbox/save_mbox_mail/tomcat.mbox + project_key_2: + mailing_list: http://mail-archives.apache.org/mod_mbox/tomcat-users + save_folder_path: ../../rawdata/tomcat/mod_mbox/save_mbox_mail_2/ +# # mbox_file_path is for use only with parse_mbox() function. It is the file to parse +# mbox_file_path: ../../rawdata/kaiaulu/mod_mbox/save_mbox_mail_2/tomcat.mbox +# pipermail: +# project_key_1: +# mailing_list: https://mta.openssl.org/pipermail/kaiaulu-dev/ +# save_folder_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail/ +# # mbox_file_path is for use only with parse_mbox() function. It is the file to parse +# mbox_file_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail/kaiaulu.mbox +# project_key_2: +# mailing_list: https://mta.openssl.org/pipermail/kaiaulu-users/ +# save_folder_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail_2/ +# # mbox_file_path is for use only with parse_mbox() function. It is the file to parse +# mbox_file_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail_2/kaiaulu.mbox -issue_tracker: - jira: - # Obtained from the project's JIRA URL - #domain: https://issues.apache.org/jira - #project_key: GERONIMO - # Download using `download_jira_data.Rmd` - #issues: ../../rawdata/issue_tracker/geronimo_issues.json - #issue_comments: ../../rawdata/issue_tracker/geronimo_issue_comments.json - github: - # Obtained from the project's GitHub URL - #owner: apache - #repo: geronimo - # Download using `download_github_comments.Rmd` - #replies: ../../rawdata/github/geronimo/ - # Currently not supported - #bugzilla: - #url: https://bz.apache.org/bugzilla/buglist.cgi?bug_status=__open__&no_redirect=1&order=Importance&product=Tomcat%209&query_format=specific +# issue_tracker: +# jira: +# project_key_1: +# # Obtained from the project's JIRA URL +# domain: https://sailuh.atlassian.net +# project_key: SAILUH +# # Download using `download_jira_data.Rmd` +# issues: 
../../rawdata/kaiaulu/jira/issues/sailuh/ +# issue_comments: ../../rawdata/kaiaulu/jira/issue_comments/sailuh/ +# github: +# project_key_1: +# # Obtained from the project's GitHub URL +# owner: sailuh +# repo: kaiaulu +# # Download using `download_github_comments.Rmd` +# issue_or_pr_comment: ../../rawdata/kaiaulu/github/issue_or_pr_comment/sailuh_kaiaulu/ +# issue: ../../rawdata/kaiaulu/github/issue/sailuh_kaiaulu/ +# issue_search: ../../rawdata/kaiaulu/github/issue_search/sailuh_kaiaulu/ +# issue_event: ../../rawdata/kaiaulu/github/issue_event/sailuh_kaiaulu/ +# pull_request: ../../rawdata/kaiaulu/github/pull_request/sailuh_kaiaulu/ +# commit: ../../rawdata/kaiaulu/github/commit/sailuh_kaiaulu/ +# project_key_2: +# # Obtained from the project's GitHub URL +# owner: ssunoo2 +# repo: kaiaulu +# # Download using `download_github_comments.Rmd` +# issue_or_pr_comment: ../../rawdata/kaiaulu/github/issue_or_pr_comment/ssunoo2_kaiaulu/ +# issue: ../../rawdata/kaiaulu/github/issue/ssunoo2_kaiaulu/ +# issue_search: ../../rawdata/kaiaulu/github/issue_search/ssunoo2_kaiaulu/ +# issue_event: ../../rawdata/kaiaulu/github/issue_event/ssunoo2_kaiaulu/ +# pull_request: ../../rawdata/kaiaulu/github/pull_request/ssunoo2_kaiaulu/ +# commit: ../../rawdata/kaiaulu/github/commit/ssunoo2_kaiaulu/ +# bugzilla: +# project_key_1: +# project_key: kaiaulu +# issues: ../../rawdata/kaiaulu/bugzilla/issues/kaiaulu/ +# issue_comments: ../../rawdata/kaiaulu/bugzilla/issue_comments/kaiaulu/ #vulnerabilities: # Folder path with nvd cve feeds (e.g. nvdcve-1.1-2018.json) @@ -118,6 +153,32 @@ tool: - Throw - Parameter - Contain + # dv8: + # # The project folder path to store various intermediate + # # files for DV8 Analysis + # # The folder name will be used in the file names. 
+ # folder_path: ../../analysis/junit/dv8/ + # # the architectural flaws thresholds that should be used + # architectural_flaws: + # cliqueDepends: + # - call + # - use + # crossingCochange: 2 + # crossingFanIn: 4 + # crossingFanOut: 4 + # mvCochange: 2 + # uiCochange: 2 + # uihDepends: + # - call + # - use + # uihInheritance: + # - extend + # - implement + # - public + # - private + # - virtual + # uiHistoryImpact: 10 + # uiStructImpact: 0.01 # Uctags allows finer file-file dependency parsing (e.g. functions, classes, structs) uctags: # See https://github.com/sailuh/kaiaulu/wiki/Universal-Ctags for details @@ -137,6 +198,36 @@ tool: - f # functions r: - f # functions + # # srcML allows parsing source code as text (e.g. identifiers) + # srcml: + # # The file path to where you wish to store the srcml output of the project + # srcml_path: ../../analysis/junit5/srcml/srcml_junit.xml + # pattern4: + # # The file path to where you wish to store the classes of the pattern4 analysis + # class_folder_path: ../../rawdata/junit5/pattern4/classes/junit-platform-engine/build/classes/java/main/org/junit/platform/engine/ + # # The file path to where you wish to store the output of the pattern4 analysis + # output_filepath: ../../analysis/junit5/pattern4/ + # compile_note: > + # 1. Switch Java version to Java 17: + # https://stackoverflow.com/questions/69875335/macos-how-to-install-java-17 + # 2. Disable VPN to pull modules from Gradle Plugin Portal. + # 3. Use sudo ./gradlew build + # 4.
After building, locate the engine class files and specify as the class_folder_path: + # in this case they are in: /path/to/junit5/analysis/junit-platform-engine/build/classes/java/main/org/junit/platform/engine/ + # understand: + # # Accepts one language at a time: ada, assembly, c/c++, c#, fortran, java, jovial, delphi/pascal, python, vhdl, basic, javascript + # code_language: java + # # Specify which types of Dependencies to keep + # keep_dependencies_type: + # - Import + # - Call + # - Create + # - Use + # - Type GenericArgument + # # Where the files to analyze should be stored + # project_path: ../../rawdata/kaiaulu/git_repo/understand/ + # # Where the output for the Understand analysis is stored + # output_path: ../../analysis/kaiaulu/understand/ # Analysis Configuration # analysis: diff --git a/conf/tse_apex.yml b/conf/tse_apex.yml index 46841ce9..1472c340 100644 --- a/conf/tse_apex.yml +++ b/conf/tse_apex.yml @@ -36,36 +36,74 @@ version_control: # Where is the git log located locally? # This is the path to the .git of the project repository you are analyzing. # The .git is hidden, so you can see it using `ls -a` - log: ../../../../tse_motif_2021/dataset/gitlog/apex-core/.git + log: ../../rawdata/apex/git_repo/.git # From where the git log was downloaded? log_url: https://github.com/apache/apex-core # List of branches used for analysis branch: - master -mailing_list: - # Where is the mbox located locally? - mbox: ../../../../tse_motif_2021/dataset/mbox/apex-dev.mbox - # What is the domain of the chosen mailing list archive? -# domain: http://mail-archives.apache.org/mod_mbox - # Which lists of the domain will be used? -# list_key: -# - geronimo-dev +# mailing_list: +# mod_mbox: +# project_key_1: +# mailing_list: http://mail-archives.apache.org/mod_mbox/kaiaulu-dev +# save_folder_path: ../../rawdata/kaiaulu/mod_mbox/save_mbox_mail/ +# # mbox_file_path is for use only with parse_mbox() function.
It is the file to parse +# mbox_file_path: ../../rawdata/kaiaulu/mod_mbox/save_mbox_mail/kaiaulu.mbox +# project_key_2: +# mailing_list: http://mail-archives.apache.org/mod_mbox/kaiaulu-user +# save_folder_path: ../../rawdata/kaiaulu/mod_mbox/save_mbox_mail_2/ +# # mbox_file_path is for use only with parse_mbox() function. It is the file to parse +# mbox_file_path: ../../rawdata/kaiaulu/mod_mbox/save_mbox_mail_2/kaiaulu.mbox +# pipermail: +# project_key_1: +# mailing_list: https://mta.openssl.org/pipermail/kaiaulu-dev/ +# save_folder_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail/ +# # mbox_file_path is for use only with parse_mbox() function. It is the file to parse +# mbox_file_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail/kaiaulu.mbox +# project_key_2: +# mailing_list: https://mta.openssl.org/pipermail/kaiaulu-users/ +# save_folder_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail_2/ +# # mbox_file_path is for use only with parse_mbox() function. It is the file to parse +# mbox_file_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail_2/kaiaulu.mbox issue_tracker: jira: - # Obtained from the project's JIRA URL - domain: https://issues.apache.org/jira - project_key: APEXCORE - # Download using `download_jira_data.Rmd` - issues: ../../../../tse_motif_2021/dataset/jira/apex - #issue_comments: ../../rawdata/issue_tracker/geronimo_issue_comments.json - #github: - # Obtained from the project's GitHub URL - #owner: apache - #repo: geronimo - # Download using `download_github_comments.Rmd` - #replies: ../../rawdata/github/geronimo/ + project_key_1: + # Obtained from the project's JIRA URL + domain: https://issues.apache.org/jira + project_key: APEXCORE + # Download using `download_jira_data.Rmd` + issues: ../../rawdata/apex/jira/issues/apexcore/ + issue_comments: ../../rawdata/apex/jira/issue_comments/apexcore/ +# github: +# project_key_1: +# # Obtained from the project's GitHub URL +# owner: sailuh +# repo: kaiaulu +# # Download using 
`download_github_comments.Rmd` +# issue_or_pr_comment: ../../rawdata/kaiaulu/github/issue_or_pr_comment/sailuh_kaiaulu/ +# issue: ../../rawdata/kaiaulu/github/issue/sailuh_kaiaulu/ +# issue_search: ../../rawdata/kaiaulu/github/issue_search/sailuh_kaiaulu/ +# issue_event: ../../rawdata/kaiaulu/github/issue_event/sailuh_kaiaulu/ +# pull_request: ../../rawdata/kaiaulu/github/pull_request/sailuh_kaiaulu/ +# commit: ../../rawdata/kaiaulu/github/commit/sailuh_kaiaulu/ +# project_key_2: +# # Obtained from the project's GitHub URL +# owner: ssunoo2 +# repo: kaiaulu +# # Download using `download_github_comments.Rmd` +# issue_or_pr_comment: ../../rawdata/kaiaulu/github/issue_or_pr_comment/ssunoo2_kaiaulu/ +# issue: ../../rawdata/kaiaulu/github/issue/ssunoo2_kaiaulu/ +# issue_search: ../../rawdata/kaiaulu/github/issue_search/ssunoo2_kaiaulu/ +# issue_event: ../../rawdata/kaiaulu/github/issue_event/ssunoo2_kaiaulu/ +# pull_request: ../../rawdata/kaiaulu/github/pull_request/ssunoo2_kaiaulu/ +# commit: ../../rawdata/kaiaulu/github/commit/ssunoo2_kaiaulu/ +# bugzilla: +# project_key_1: +# project_key: kaiaulu +# issues: ../../rawdata/kaiaulu/bugzilla/issues/kaiaulu/ +# issue_comments: ../../rawdata/kaiaulu/bugzilla/issue_comments/kaiaulu/ #vulnerabilities: # Folder path with nvd cve feeds (e.g. nvdcve-1.1-2018.json) @@ -121,7 +159,7 @@ tool: # The project folder path to store various intermediate # files for DV8 Analysis # The folder name will be used in the file names. - folder_path: ../../analysis/dv8/tse_apex + folder_path: ../../analysis/apex/dv8/ # the architectural flaws thresholds that should be used architectural_flaws: cliqueDepends: @@ -162,6 +200,36 @@ tool: - f # functions r: - f # functions + # # srcML allows parsing source code as text (e.g.
identifiers) + # srcml: + # # The file path to where you wish to store the srcml output of the project + # srcml_path: ../../analysis/junit5/srcml/srcml_junit.xml + # pattern4: + # # The file path to where you wish to store the classes of the pattern4 analysis + # class_folder_path: ../../rawdata/junit5/pattern4/classes/junit-platform-engine/build/classes/java/main/org/junit/platform/engine/ + # # The file path to where you wish to store the output of the pattern4 analysis + # output_filepath: ../../analysis/junit5/pattern4/ + # compile_note: > + # 1. Switch Java version to Java 17: + # https://stackoverflow.com/questions/69875335/macos-how-to-install-java-17 + # 2. Disable VPN to pull modules from Gradle Plugin Portal. + # 3. Use sudo ./gradlew build + # 4. After building, locate the engine class files and specify as the class_folder_path: + # in this case they are in: /path/to/junit5/analysis/junit-platform-engine/build/classes/java/main/org/junit/platform/engine/ + # understand: + # # Accepts one language at a time: ada, assembly, c/c++, c#, fortran, java, jovial, delphi/pascal, python, vhdl, basic, javascript + # code_language: java + # # Specify which types of Dependencies to keep + # keep_dependencies_type: + # - Import + # - Call + # - Create + # - Use + # - Type GenericArgument + # # Where the files to analyze should be stored + # project_path: ../../rawdata/kaiaulu/git_repo/understand/ + # # Where the output for the Understand analysis is stored + # output_path: ../../analysis/kaiaulu/understand/ # Analysis Configuration # analysis: diff --git a/man/get_bugzilla_issue_comment_path.Rd b/man/get_bugzilla_issue_comment_path.Rd new file mode 100644 index 00000000..e8832ae1 --- /dev/null +++ b/man/get_bugzilla_issue_comment_path.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/config.R +\name{get_bugzilla_issue_comment_path} +\alias{get_bugzilla_issue_comment_path} +\title{Returns the local folder path for
Bugzilla issue comments for a specific project key.} +\usage{ +get_bugzilla_issue_comment_path(config_file, project_key_index) +} +\arguments{ +\item{config_file}{The parsed configuration file obtained from \code{\link{parse_config}}.} + +\item{project_key_index}{The name of the index of the project key (e.g. "project_key_1" or "project_key_2").} +} +\value{ +The local folder path for Bugzilla issue comments for project specified by key `project_key_index`. +} +\description{ +This function returns the local folder path for Bugzilla issue +comments for a specific project key, that is specified in the input parameter +`config_file`. The input, `config_file` must be a parsed configuration file. +The function will inform the user if the folder path for Bugzilla issue +comments exists in the parsed configuration file, `config_file`. +} diff --git a/man/get_bugzilla_issue_path.Rd b/man/get_bugzilla_issue_path.Rd new file mode 100644 index 00000000..f5438062 --- /dev/null +++ b/man/get_bugzilla_issue_path.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/config.R +\name{get_bugzilla_issue_path} +\alias{get_bugzilla_issue_path} +\title{Returns the local folder path for Bugzilla issues for a specific project key.} +\usage{ +get_bugzilla_issue_path(config_file, project_key_index) +} +\arguments{ +\item{config_file}{The parsed configuration file obtained from \code{\link{parse_config}}.} + +\item{project_key_index}{The name of the index of the project key (e.g. "project_key_1" or "project_key_2").} +} +\value{ +The local folder path for Bugzilla issues for project specified by key `project_key_index`. +} +\description{ +This function returns the local folder path for Bugzilla issues +for a specific project key, that is specified in the input parameter +`config_file`. The input, `config_file` must be a parsed configuration file. 
+The function will inform the user if the folder path for Bugzilla issues +exists in the parsed configuration file, `config_file`. +} diff --git a/man/get_bugzilla_project_key.Rd b/man/get_bugzilla_project_key.Rd new file mode 100644 index 00000000..a29601e6 --- /dev/null +++ b/man/get_bugzilla_project_key.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/config.R +\name{get_bugzilla_project_key} +\alias{get_bugzilla_project_key} +\title{Returns the name of the Bugzilla project key for a specific project key index.} +\usage{ +get_bugzilla_project_key(config_file, project_key_index) +} +\arguments{ +\item{config_file}{The parsed configuration file obtained from \code{\link{parse_config}}.} + +\item{project_key_index}{The name of the index of the project key (e.g. "project_key_1" or "project_key_2").} +} +\value{ +The Bugzilla project key name for project specified by key `project_key_index`. +} +\description{ +This function returns the name of the Bugzilla project key for +a specific project key, that is specified in the input parameter +`config_file`. The input, `config_file` must be a parsed configuration file. +The function will inform the user if the name of the Bugzilla project key +exists in the parsed configuration file, `config_file`. +} diff --git a/man/get_cveid_regex.Rd b/man/get_cveid_regex.Rd new file mode 100644 index 00000000..3e6064e9 --- /dev/null +++ b/man/get_cveid_regex.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/config.R +\name{get_cveid_regex} +\alias{get_cveid_regex} +\title{Returns the cve (Common Vulnerabilities and Exposures) regular expression +for commit messages.} +\usage{ +get_cveid_regex(config_file) +} +\arguments{ +\item{config_file}{The parsed configuration file obtained from \code{\link{parse_config}}.} +} +\value{ +The commit message CVE regular expression. 
+} +\description{ +This function returns the cve regular expression for commit +messages, that is specified in the input parameter `config_file`. The input, +`config_file` must be a parsed configuration file. The function will inform +the user if the cve regular expression for commit messages exists in the +parsed configuration file, `config_file`. +} diff --git a/man/get_depends_code_language.Rd b/man/get_depends_code_language.Rd new file mode 100644 index 00000000..bc8cca83 --- /dev/null +++ b/man/get_depends_code_language.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/config.R +\name{get_depends_code_language} +\alias{get_depends_code_language} +\title{Returns the depends code language for analysis.} +\usage{ +get_depends_code_language(config_file) +} +\arguments{ +\item{config_file}{The parsed configuration file obtained from \code{\link{parse_config}}.} +} +\value{ +The code language for parsing file-file static dependencies. +} +\description{ +This function returns the specified code language that should +be used to parse file-file static dependencies with the depends tool, that +is specified in the input parameter `config_file`. The input, `config_file` +must be a parsed configuration file. The function will inform the user if +the depends code language exists in the parsed configuration file, `config_file`. 
+} diff --git a/man/get_depends_keep_dependencies_type.Rd b/man/get_depends_keep_dependencies_type.Rd new file mode 100644 index 00000000..54897420 --- /dev/null +++ b/man/get_depends_keep_dependencies_type.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/config.R +\name{get_depends_keep_dependencies_type} +\alias{get_depends_keep_dependencies_type} +\title{Returns a list of the types of dependencies to keep for analysis.} +\usage{ +get_depends_keep_dependencies_type(config_file) +} +\arguments{ +\item{config_file}{The parsed configuration file obtained from \code{\link{parse_config}}.} +} +\value{ +A list of the types of depends dependencies to keep for analysis. +} +\description{ +This function returns the specified types of dependencies to +keep for analysis with the depends tool, that is specified in the input +parameter `config_file`. The input, `config_file` must be a parsed +configuration file. The function will inform the user if the list of the +types of dependencies exists in the parsed configuration file, `config_file`. +} diff --git a/man/get_dv8_flaws_params.Rd b/man/get_dv8_flaws_params.Rd new file mode 100644 index 00000000..41a076f4 --- /dev/null +++ b/man/get_dv8_flaws_params.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/config.R +\name{get_dv8_flaws_params} +\alias{get_dv8_flaws_params} +\title{Returns the list of architectural flaws thresholds for DV8 analysis.} +\usage{ +get_dv8_flaws_params(config_file) +} +\arguments{ +\item{config_file}{The parsed configuration file obtained from \code{\link{parse_config}}.} +} +\value{ +The list of DV8 architectural flaws thresholds. +} +\description{ +This function returns the list of architectural flaws thresholds +for DV8 analysis, that is specified in the input parameter `config_file`. +The input, `config_file` must be a parsed configuration file. 
The function +will inform the user if the list of architectural flaws thresholds +exists in the parsed configuration file, `config_file`. +} diff --git a/man/get_dv8_folder_path.Rd b/man/get_dv8_folder_path.Rd new file mode 100644 index 00000000..9c183712 --- /dev/null +++ b/man/get_dv8_folder_path.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/config.R +\name{get_dv8_folder_path} +\alias{get_dv8_folder_path} +\title{Returns the path to the folder used to store files for DV8 analysis.} +\usage{ +get_dv8_folder_path(config_file) +} +\arguments{ +\item{config_file}{The parsed configuration file obtained from \code{\link{parse_config}}.} +} +\value{ +The DV8 project folder path. +} +\description{ +This function returns the path to the folder that will be +used to store various intermediate files for DV8 analysis, that is specified +in the input parameter `config_file`. The input, `config_file` must be a +parsed configuration file. The function will inform the user if the +path to the folder for intermediate file storage for DV8 analysis exists in +the parsed configuration file, `config_file`. +} diff --git a/man/get_enumeration_commits.Rd b/man/get_enumeration_commits.Rd new file mode 100644 index 00000000..9f910b75 --- /dev/null +++ b/man/get_enumeration_commits.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/config.R +\name{get_enumeration_commits} +\alias{get_enumeration_commits} +\title{Returns the list of enumerated commit intervals for analysis.} +\usage{ +get_enumeration_commits(config_file) +} +\arguments{ +\item{config_file}{The parsed configuration file obtained from \code{\link{parse_config}}.} +} +\value{ +The list of enumerated commit intervals. +} +\description{ +This function returns a list of enumerated commit intervals, +that is specified in the input parameter `config_file`. The input, +`config_file` must be a parsed configuration file.
The function will inform +the user if the list of enumerated commit intervals exists in the parsed +configuration file, `config_file`. +} diff --git a/man/get_file_extensions.Rd b/man/get_file_extensions.Rd new file mode 100644 index 00000000..ea97656d --- /dev/null +++ b/man/get_file_extensions.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/config.R +\name{get_file_extensions} +\alias{get_file_extensions} +\title{Returns the list of file extensions used for filtering files to keep.} +\usage{ +get_file_extensions(config_file) +} +\arguments{ +\item{config_file}{The parsed configuration file obtained from \code{\link{parse_config}}.} +} +\value{ +The list of file extensions to keep. +} +\description{ +This function returns the list of file extensions that will be +used for filtering files specified in the input parameter `config_file`. The +input, `config_file` must be a parsed configuration file. The function will +inform the user if the list of file extensions exists in the parsed +configuration file, `config_file`. +} diff --git a/man/get_filter_commit_size.Rd b/man/get_filter_commit_size.Rd new file mode 100644 index 00000000..546be0cd --- /dev/null +++ b/man/get_filter_commit_size.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/config.R +\name{get_filter_commit_size} +\alias{get_filter_commit_size} +\title{Returns the commit size threshold to remove file paths.} +\usage{ +get_filter_commit_size(config_file) +} +\arguments{ +\item{config_file}{The parsed configuration file obtained from \code{\link{parse_config}}.} +} +\value{ +The commit size to filter out. +} +\description{ +This function returns an integer number that represents the +threshold for a commit size to remove file paths specified in the input +parameter `config_file`. The input, `config_file` must be a parsed +configuration file. 
The function will inform the user if the commit size +threshold exists in the parsed configuration file, `config_file`. +} diff --git a/man/get_git_branches.Rd b/man/get_git_branches.Rd new file mode 100644 index 00000000..4e89dca8 --- /dev/null +++ b/man/get_git_branches.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/config.R +\name{get_git_branches} +\alias{get_git_branches} +\title{Returns the list of git branches used for analysis in the current project.} +\usage{ +get_git_branches(config_file) +} +\arguments{ +\item{config_file}{The parsed configuration file obtained from \code{\link{parse_config}}.} +} +\value{ +The list of git branches. +} +\description{ +This function returns a list of the git branches used for +analysis in the current project specified in the input parameter +`config_file`. The input, `config_file` must be a parsed configuration file. +The function will inform the user if the list of branches to be analyzed +exists in the parsed configuration file, `config_file`. +} diff --git a/man/get_git_repo_path.Rd b/man/get_git_repo_path.Rd new file mode 100644 index 00000000..5a02751e --- /dev/null +++ b/man/get_git_repo_path.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/config.R +\name{get_git_repo_path} +\alias{get_git_repo_path} +\title{Returns the path to the .git of the project repository that is being analyzed.} +\usage{ +get_git_repo_path(config_file) +} +\arguments{ +\item{config_file}{The parsed configuration file obtained from \code{\link{parse_config}}.} +} +\value{ +The local git repository path specified in `config_file`. +} +\description{ +This function returns the specific path to the .git of the +project repository that is being analyzed specified in the input parameter +`config_file`. The input, `config_file` must be a parsed configuration file. 
+The function will inform the user if the .git path of the project repository +exists in the parsed configuration file, `config_file`. +} diff --git a/man/get_github_commit_path.Rd b/man/get_github_commit_path.Rd new file mode 100644 index 00000000..072cab45 --- /dev/null +++ b/man/get_github_commit_path.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/config.R +\name{get_github_commit_path} +\alias{get_github_commit_path} +\title{Returns the local folder path for GitHub commits for a specific project key.} +\usage{ +get_github_commit_path(config_file, project_key_index) +} +\arguments{ +\item{config_file}{The parsed configuration file obtained from \code{\link{parse_config}}.} + +\item{project_key_index}{The name of the index of the project key (e.g. "project_key_1" or "project_key_2").} +} +\value{ +The local folder path for GitHub commits for project specified by key `project_key_index`. +} +\description{ +This function returns the local folder path for GitHub commits +for a specific project key, that is specified in the input +parameter `config_file`. The input, `config_file` must be a parsed +configuration file. The function will inform the user if the local folder +path for the commits exists in the parsed configuration file, +`config_file`. 
+} diff --git a/man/get_github_issue_event_path.Rd b/man/get_github_issue_event_path.Rd new file mode 100644 index 00000000..b5ed60d4 --- /dev/null +++ b/man/get_github_issue_event_path.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/config.R +\name{get_github_issue_event_path} +\alias{get_github_issue_event_path} +\title{Returns the local folder path for GitHub issue events for a specific project +key.} +\usage{ +get_github_issue_event_path(config_file, project_key_index) +} +\arguments{ +\item{config_file}{The parsed configuration file obtained from \code{\link{parse_config}}.} + +\item{project_key_index}{The name of the index of the project key (e.g. "project_key_1" or "project_key_2").} +} +\value{ +The local folder path for GitHub issue events for project specified by key `project_key_index`. +} +\description{ +This function returns the local folder path for GitHub issue +events for a specific project key, that is specified in the input +parameter `config_file`. The input, `config_file` must be a parsed +configuration file. The function will inform the user if the local folder +path for the issue events exists in the parsed configuration file, +`config_file`. 
+} diff --git a/man/get_github_issue_or_pr_comment_path.Rd b/man/get_github_issue_or_pr_comment_path.Rd new file mode 100644 index 00000000..c41ef4ac --- /dev/null +++ b/man/get_github_issue_or_pr_comment_path.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/config.R +\name{get_github_issue_or_pr_comment_path} +\alias{get_github_issue_or_pr_comment_path} +\title{Returns the local folder path for GitHub Issue or Pull Request comments for +a specific project key.} +\usage{ +get_github_issue_or_pr_comment_path(config_file, project_key_index) +} +\arguments{ +\item{config_file}{The parsed configuration file obtained from \code{\link{parse_config}}.} + +\item{project_key_index}{The name of the index of the project key (e.g. "project_key_1" or "project_key_2").} +} +\value{ +The local folder path for GitHub Issues or PR comments for project specified by key `project_key_index`. +} +\description{ +This function returns the local folder path for GitHub Issue or +Pull Request comments for a specific project key, that is specified in the +input parameter `config_file`. The input, `config_file` must be a parsed +configuration file. The function will inform the user if the local folder +path for the comments exists in the parsed configuration file, `config_file`. +} diff --git a/man/get_github_issue_path.Rd b/man/get_github_issue_path.Rd new file mode 100644 index 00000000..40f4eb46 --- /dev/null +++ b/man/get_github_issue_path.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/config.R +\name{get_github_issue_path} +\alias{get_github_issue_path} +\title{Returns the local folder path for GitHub issues for a specific project key.} +\usage{ +get_github_issue_path(config_file, project_key_index) +} +\arguments{ +\item{config_file}{The parsed configuration file obtained from \code{\link{parse_config}}.} + +\item{project_key_index}{The name of the index of the project key (e.g. 
"project_key_1" or "project_key_2").} +} +\value{ +The local folder path for GitHub issues for project specified by key `project_key_index`. +} +\description{ +This function returns the local folder path for GitHub issues +for a specific project key, that is specified in the input parameter +`config_file`. The input, `config_file` must be a parsed configuration file. +The function will inform the user if the folder path for GitHub issues exists +in the parsed configuration file, `config_file`. +} diff --git a/man/get_github_issue_search_path.Rd b/man/get_github_issue_search_path.Rd new file mode 100644 index 00000000..b7b17056 --- /dev/null +++ b/man/get_github_issue_search_path.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/config.R +\name{get_github_issue_search_path} +\alias{get_github_issue_search_path} +\title{Returns the local folder path for GitHub Issue Searches for a specific +project key.} +\usage{ +get_github_issue_search_path(config_file, project_key_index) +} +\arguments{ +\item{config_file}{The parsed configuration file obtained from \code{\link{parse_config}}.} + +\item{project_key_index}{The name of the index of the project key (e.g. "project_key_1" or "project_key_2").} +} +\value{ +The local folder path for GitHub issue search for project specified by key `project_key_index`. +} +\description{ +This function returns the local folder path for GitHub Issue +Searches for a specific project key, that is specified in the input parameter +`config_file`. The input, `config_file` must be a parsed configuration file. +The function will inform the user if the local folder path for the issue +searches exists in the parsed configuration file, `config_file`. 
+} diff --git a/man/get_github_keys.Rd b/man/get_github_keys.Rd new file mode 100644 index 00000000..884101e6 --- /dev/null +++ b/man/get_github_keys.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/config.R +\name{get_github_keys} +\alias{get_github_keys} +\title{Returns the list of GitHub issue tracker project keys.} +\usage{ +get_github_keys(config_file) +} +\arguments{ +\item{config_file}{The parsed configuration file obtained from \code{\link{parse_config}}.} +} +\value{ +The list of GitHub issue tracker project keys. +} +\description{ +This function returns the list of GitHub issue tracker project +keys, that is specified in the input parameter `config_file`. The input, +`config_file` must be a parsed configuration file. The function will inform +the user if the project keys exist in the parsed configuration +file, `config_file`. +} diff --git a/man/get_github_owner.Rd b/man/get_github_owner.Rd new file mode 100644 index 00000000..af80dfb8 --- /dev/null +++ b/man/get_github_owner.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/config.R +\name{get_github_owner} +\alias{get_github_owner} +\title{Returns the owner for a GitHub repository for a specific project key.} +\usage{ +get_github_owner(config_file, project_key_index) +} +\arguments{ +\item{config_file}{The parsed configuration file obtained from \code{\link{parse_config}}.} + +\item{project_key_index}{The name of the index of the project key (e.g. "project_key_1" or "project_key_2").} +} +\value{ +The GitHub project owner name for project specified by key `project_key_index`. +} +\description{ +This function returns the owner for a GitHub repository for a +specific project key, that is specified in the input parameter `config_file`. +The input, `config_file` must be a parsed configuration file. 
The function +will inform the user if the owner for the GitHub repository exists in the +parsed configuration file, `config_file`. +} diff --git a/man/get_github_pull_request_path.Rd b/man/get_github_pull_request_path.Rd new file mode 100644 index 00000000..75af220d --- /dev/null +++ b/man/get_github_pull_request_path.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/config.R +\name{get_github_pull_request_path} +\alias{get_github_pull_request_path} +\title{Returns the local folder path for GitHub Pull Requests for a specific +project key.} +\usage{ +get_github_pull_request_path(config_file, project_key_index) +} +\arguments{ +\item{config_file}{The parsed configuration file obtained from \code{\link{parse_config}}.} + +\item{project_key_index}{The name of the index of the project key (e.g. "project_key_1" or "project_key_2").} +} +\value{ +The local folder path for GitHub pull requests for project specified by key `project_key_index`. +} +\description{ +This function returns the local folder path for GitHub Pull +Requests for a specific project key, that is specified in the input +parameter `config_file`. The input, `config_file` must be a parsed +configuration file. The function will inform the user if the local folder +path for the pull requests exists in the parsed configuration file, +`config_file`. 
+} diff --git a/man/get_github_repo.Rd b/man/get_github_repo.Rd new file mode 100644 index 00000000..5f474787 --- /dev/null +++ b/man/get_github_repo.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/config.R +\name{get_github_repo} +\alias{get_github_repo} +\title{Returns the name of the GitHub repository for a specific project key.} +\usage{ +get_github_repo(config_file, project_key_index) +} +\arguments{ +\item{config_file}{The parsed configuration file obtained from \code{\link{parse_config}}.} + +\item{project_key_index}{The name of the index of the project key (e.g. "project_key_1" or "project_key_2").} +} +\value{ +The name of the GitHub repository for project specified by key `project_key_index`. +} +\description{ +This function returns the name of the GitHub repository for a +specific project key, that is specified in the input parameter `config_file`. +The input, `config_file` must be a parsed configuration file. The function +will inform the user if the name of the GitHub repository exists in the +parsed configuration file, `config_file`. +} diff --git a/man/get_issue_id_regex.Rd b/man/get_issue_id_regex.Rd new file mode 100644 index 00000000..b8f47897 --- /dev/null +++ b/man/get_issue_id_regex.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/config.R +\name{get_issue_id_regex} +\alias{get_issue_id_regex} +\title{Returns the issue Id regular expression for commit messages.} +\usage{ +get_issue_id_regex(config_file) +} +\arguments{ +\item{config_file}{The parsed configuration file obtained from \code{\link{parse_config}}.} +} +\value{ +The commit message issue Id regular expression. +} +\description{ +This function returns the issue Id regular expression for commit +messages, that is specified in the input parameter `config_file`. The input, +`config_file` must be a parsed configuration file. 
The function will inform +the user if the issue Id regular expression for commit messages exists in the +parsed configuration file, `config_file`. +} diff --git a/man/get_jira_domain.Rd b/man/get_jira_domain.Rd new file mode 100644 index 00000000..7384713f --- /dev/null +++ b/man/get_jira_domain.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/config.R +\name{get_jira_domain} +\alias{get_jira_domain} +\title{Returns the Jira project domain for a specific project key.} +\usage{ +get_jira_domain(config_file, project_key_index) +} +\arguments{ +\item{config_file}{The parsed configuration file obtained from \code{\link{parse_config}}.} + +\item{project_key_index}{The name of the index of the project key (e.g. "project_key_1" or "project_key_2").} +} +\value{ +The Jira domain for project specified by key `project_key_index`. +} +\description{ +This function returns the Jira project domain for a specific +project key, that is specified in the input parameter `config_file`. +The input, `config_file` must be a parsed configuration file. The function +will inform the user if the domain exists in the parsed configuration file, +`config_file`. +} diff --git a/man/get_jira_issues_comments_path.Rd b/man/get_jira_issues_comments_path.Rd new file mode 100644 index 00000000..97078a2e --- /dev/null +++ b/man/get_jira_issues_comments_path.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/config.R +\name{get_jira_issues_comments_path} +\alias{get_jira_issues_comments_path} +\title{Returns the local folder path for Jira issue comments for a specific +project key.} +\usage{ +get_jira_issues_comments_path(config_file, project_key_index) +} +\arguments{ +\item{config_file}{The parsed configuration file obtained from \code{\link{parse_config}}.} + +\item{project_key_index}{The name of the index of the project key (e.g. 
"project_key_1" or "project_key_2").} +} +\value{ +The folder path for Jira issue comments for project specified by key `project_key_index`. +} +\description{ +This function returns the local folder path for Jira issue +comments for a specific project key, that is specified in the input +parameter `config_file`. The input, `config_file` must be a parsed +configuration file. The function will inform the user if the local folder +path for the comments exists in the parsed configuration file, `config_file`. +} diff --git a/man/get_jira_issues_path.Rd b/man/get_jira_issues_path.Rd new file mode 100644 index 00000000..6d3c31d9 --- /dev/null +++ b/man/get_jira_issues_path.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/config.R +\name{get_jira_issues_path} +\alias{get_jira_issues_path} +\title{Returns the local folder path for Jira issues for a specific project key.} +\usage{ +get_jira_issues_path(config_file, project_key_index) +} +\arguments{ +\item{config_file}{The parsed configuration file obtained from \code{\link{parse_config}}.} + +\item{project_key_index}{The name of the index of the project key (e.g. "project_key_1" or "project_key_2").} +} +\value{ +The Jira issue folder path for project specified by key `project_key_index`. +} +\description{ +This function returns the folder path for Jira issues for a +specific project key, that is specified in the input parameter `config_file`. +The input, `config_file` must be a parsed configuration file. The function +will inform the user if the folder path for Jira issues exists in the parsed +configuration file, `config_file`. 
+} diff --git a/man/get_jira_keys.Rd b/man/get_jira_keys.Rd new file mode 100644 index 00000000..46b1ca31 --- /dev/null +++ b/man/get_jira_keys.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/config.R +\name{get_jira_keys} +\alias{get_jira_keys} +\title{Returns the list of Jira issue tracker project keys.} +\usage{ +get_jira_keys(config_file) +} +\arguments{ +\item{config_file}{The parsed configuration file obtained from \code{\link{parse_config}}.} +} +\value{ +The list of Jira issue tracker project keys. +} +\description{ +This function returns the list of Jira issue tracker project +keys, that is specified in the input parameter `config_file`. The input, +`config_file` must be a parsed configuration file. The function will inform +the user if the project keys exist in the parsed configuration +file, `config_file`. +} diff --git a/man/get_jira_project_key_name.Rd b/man/get_jira_project_key_name.Rd new file mode 100644 index 00000000..bfe36336 --- /dev/null +++ b/man/get_jira_project_key_name.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/config.R +\name{get_jira_project_key_name} +\alias{get_jira_project_key_name} +\title{Returns the name of the Jira project key for a specific project key.} +\usage{ +get_jira_project_key_name(config_file, project_key_index) +} +\arguments{ +\item{config_file}{The parsed configuration file obtained from \code{\link{parse_config}}.} + +\item{project_key_index}{The name of the index of the project key (e.g. "project_key_1" or "project_key_2").} +} +\value{ +The Jira project key name for project specified by key `project_key_index`. +} +\description{ +This function returns the Jira project key name for a specific +project key, that is specified in the input parameter `config_file`. +The input, `config_file` must be a parsed configuration file. 
The function +will inform the user if the project key name exists in the parsed +configuration file, `config_file`. +} diff --git a/man/get_mbox_domain.Rd b/man/get_mbox_domain.Rd new file mode 100644 index 00000000..5ba02c6e --- /dev/null +++ b/man/get_mbox_domain.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/config.R +\name{get_mbox_domain} +\alias{get_mbox_domain} +\title{Returns the URL to the archives for mbox for a specific project key.} +\usage{ +get_mbox_domain(config_file, project_key_index) +} +\arguments{ +\item{config_file}{The parsed configuration file obtained from \code{\link{parse_config}}.} + +\item{project_key_index}{The name of the index of the project key (e.g. "project_key_1" or "project_key_2").} +} +\value{ +The URL of the mbox mailing list archive for project specified by key `project_key_index`. +} +\description{ +This function returns the URL to the archives for a specific +project key, `project_key_index`, that is specified in the input parameter +`config_file`. The input, `config_file` must be a parsed configuration file. +The function will inform the user if the specific URL to the archives for +mbox exists in the parsed configuration file, `config_file`. +} diff --git a/man/get_mbox_input_file.Rd b/man/get_mbox_input_file.Rd new file mode 100644 index 00000000..ced74f7f --- /dev/null +++ b/man/get_mbox_input_file.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/config.R +\name{get_mbox_input_file} +\alias{get_mbox_input_file} +\title{Returns the local input file for mbox for a specific project key.} +\usage{ +get_mbox_input_file(config_file, project_key_index) +} +\arguments{ +\item{config_file}{The parsed configuration file obtained from \code{\link{parse_config}}.} + +\item{project_key_index}{The name of the index of the project key (e.g. 
"project_key_1" or "project_key_2").} +} +\value{ +The local input file mbox path for project specified by key `project_key_index`. +} +\description{ +This function returns the local file used for input for +mbox for a specific project key, `project_key_index`, that is specified +in the input parameter `config_file`. The input, `config_file` must be a +parsed configuration file. The function will inform the user if the specific +local input file path for mbox exists in the parsed configuration file, +`config_file`. +} diff --git a/man/get_mbox_key_indexes.Rd b/man/get_mbox_key_indexes.Rd new file mode 100644 index 00000000..541a0dec --- /dev/null +++ b/man/get_mbox_key_indexes.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/config.R +\name{get_mbox_key_indexes} +\alias{get_mbox_key_indexes} +\title{Returns the list of mailing list mod mbox project keys.} +\usage{ +get_mbox_key_indexes(config_file) +} +\arguments{ +\item{config_file}{The parsed configuration file obtained from \code{\link{parse_config}}.} +} +\value{ +The list of mod mbox mailing list keys. +} +\description{ +This function returns the list of mailing list mod mbox project +keys, that is specified in the input parameter `config_file`. The input, +`config_file` must be a parsed configuration file. The function will inform +the user if the project keys exist in the parsed configuration +file, `config_file`. 
+} diff --git a/man/get_mbox_path.Rd b/man/get_mbox_path.Rd new file mode 100644 index 00000000..527f212c --- /dev/null +++ b/man/get_mbox_path.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/config.R +\name{get_mbox_path} +\alias{get_mbox_path} +\title{Returns the local folder path to store mbox data for a specific project key.} +\usage{ +get_mbox_path(config_file, project_key_index) +} +\arguments{ +\item{config_file}{The parsed configuration file obtained from \code{\link{parse_config}}.} + +\item{project_key_index}{The name of the index of the project key (e.g. "project_key_1" or "project_key_2").} +} +\value{ +The local mbox path for project specified by key `project_key_index`. +} +\description{ +This function returns the local folder path used to store +mbox data for a specific project key, `project_key_index`, that is specified +in the input parameter `config_file`. The input, `config_file` must be a +parsed configuration file. The function will inform the user if the specific +local folder path to store mbox data exists in the parsed configuration +file, `config_file`. +} diff --git a/man/get_nvdfeed_folder_path.Rd b/man/get_nvdfeed_folder_path.Rd new file mode 100644 index 00000000..2ad4b850 --- /dev/null +++ b/man/get_nvdfeed_folder_path.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/config.R +\name{get_nvdfeed_folder_path} +\alias{get_nvdfeed_folder_path} +\title{Returns the local folder path that contains the nvd (National Vulnerability +Database) feeds.} +\usage{ +get_nvdfeed_folder_path(config_file) +} +\arguments{ +\item{config_file}{The parsed configuration file obtained from \code{\link{parse_config}}.} +} +\value{ +The folder path with nvd feeds. +} +\description{ +This function returns the local folder path for nvd feeds, +that is specified in the input parameter `config_file`. The input, +`config_file` must be a parsed configuration file. 
The function will inform +the user if the local folder path for the nvd feeds exists in the parsed +configuration file, `config_file`. +} diff --git a/man/get_pattern4_filepath.Rd b/man/get_pattern4_filepath.Rd new file mode 100644 index 00000000..54a7ad36 --- /dev/null +++ b/man/get_pattern4_filepath.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/config.R +\name{get_pattern4_filepath} +\alias{get_pattern4_filepath} +\title{Returns the folder path for the output of the pattern4 analysis.} +\usage{ +get_pattern4_filepath(config_file) +} +\arguments{ +\item{config_file}{The parsed configuration file obtained from \code{\link{parse_config}}.} +} +\value{ +The Pattern4 output folder path. +} +\description{ +This function returns the folder path that contains the +output of the pattern4 analysis for the project, that is specified in the +input parameter `config_file`. The input, `config_file` must be a parsed +configuration file. The function will inform the user if the folder path +exists in the parsed configuration file, `config_file`. +} diff --git a/man/get_pattern4_folder_path.Rd b/man/get_pattern4_folder_path.Rd new file mode 100644 index 00000000..8ecfe622 --- /dev/null +++ b/man/get_pattern4_folder_path.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/config.R +\name{get_pattern4_folder_path} +\alias{get_pattern4_folder_path} +\title{Returns the folder path for class pattern4 analysis.} +\usage{ +get_pattern4_folder_path(config_file) +} +\arguments{ +\item{config_file}{The parsed configuration file obtained from \code{\link{parse_config}}.} +} +\value{ +The Pattern4 class folder path. +} +\description{ +This function returns the folder path used to store the classes +for the pattern4 analysis for the project, that is specified in the input +parameter `config_file`. The input, `config_file` must be a parsed +configuration file. 
The function will inform the user if the folder path +exists in the parsed configuration file, `config_file`. +} diff --git a/man/get_pipermail_domain.Rd b/man/get_pipermail_domain.Rd new file mode 100644 index 00000000..6b258d37 --- /dev/null +++ b/man/get_pipermail_domain.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/config.R +\name{get_pipermail_domain} +\alias{get_pipermail_domain} +\title{Returns the URL to the archives for pipermail for a specific project key.} +\usage{ +get_pipermail_domain(config_file, project_key_index) +} +\arguments{ +\item{config_file}{The parsed configuration file obtained from \code{\link{parse_config}}.} + +\item{project_key_index}{The name of the index of the project key (e.g. "project_key_1" or "project_key_2").} +} +\value{ +The URL of the pipermail mailing list archive for project specified by key `project_key_index`. +} +\description{ +This function returns the URL to the archives for a specific +project key, `project_key_index`, that is specified in the input parameter +`config_file`. The input, `config_file` must be a parsed configuration file. +The function will inform the user if the specific URL to the archives for +pipermail exists in the parsed configuration file, `config_file`. +} diff --git a/man/get_pipermail_input_file.Rd b/man/get_pipermail_input_file.Rd new file mode 100644 index 00000000..51e9a6b5 --- /dev/null +++ b/man/get_pipermail_input_file.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/config.R +\name{get_pipermail_input_file} +\alias{get_pipermail_input_file} +\title{Returns the local input file for pipermail for a specific project key.} +\usage{ +get_pipermail_input_file(config_file, project_key_index) +} +\arguments{ +\item{config_file}{The parsed configuration file obtained from \code{\link{parse_config}}.} + +\item{project_key_index}{The name of the index of the project key (e.g. 
"project_key_1" or "project_key_2").} +} +\value{ +The local input file pipermail path for project specified by key `project_key_index`. +} +\description{ +This function returns the local file used for input for +pipermail for a specific project key, `project_key_index`, that is specified +in the input parameter `config_file`. The input, `config_file` must be a +parsed configuration file. The function will inform the user if the specific +local input file path for pipermail exists in the parsed configuration file, +`config_file`. +} diff --git a/man/get_pipermail_path.Rd b/man/get_pipermail_path.Rd new file mode 100644 index 00000000..05d445bf --- /dev/null +++ b/man/get_pipermail_path.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/config.R +\name{get_pipermail_path} +\alias{get_pipermail_path} +\title{Returns the local folder path to store pipermail data for a specific project key.} +\usage{ +get_pipermail_path(config_file, project_key_index) +} +\arguments{ +\item{config_file}{The parsed configuration file obtained from \code{\link{parse_config}}.} + +\item{project_key_index}{The name of the index of the project key (e.g. "project_key_1" or "project_key_2").} +} +\value{ +The local pipermail path for project specified by key `project_key_index`. +} +\description{ +This function returns the local folder path used to store +pipermail data for a specific project key, `project_key_index`, that is specified +in the input parameter `config_file`. The input, `config_file` must be a +parsed configuration file. The function will inform the user if the specific +local folder path to store pipermail data exists in the parsed configuration +file, `config_file`. 
+} diff --git a/man/get_srcml_filepath.Rd b/man/get_srcml_filepath.Rd new file mode 100644 index 00000000..3bd0d746 --- /dev/null +++ b/man/get_srcml_filepath.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/config.R +\name{get_srcml_filepath} +\alias{get_srcml_filepath} +\title{Returns the file path for the output of the srcML analysis for the project.} +\usage{ +get_srcml_filepath(config_file) +} +\arguments{ +\item{config_file}{The parsed configuration file obtained from \code{\link{parse_config}}.} +} +\value{ +The output file path for srcML analysis. +} +\description{ +This function returns the file path to be used to store the +output of the srcML analysis for the project, that is specified in the +input parameter `config_file`. The input, `config_file` must be a parsed +configuration file. The function will inform the user if the file path +exists in the parsed configuration file, `config_file`. +} diff --git a/man/get_substring_filepath.Rd b/man/get_substring_filepath.Rd new file mode 100644 index 00000000..cb6346b1 --- /dev/null +++ b/man/get_substring_filepath.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/config.R +\name{get_substring_filepath} +\alias{get_substring_filepath} +\title{Returns the list of file path substrings used for filtering files to remove.} +\usage{ +get_substring_filepath(config_file) +} +\arguments{ +\item{config_file}{The parsed configuration file obtained from \code{\link{parse_config}}.} +} +\value{ +The list of file path substrings used to remove files. +} +\description{ +This function returns the list of file path substrings that will be +used for filtering files specified in the input parameter `config_file`. The +input, `config_file` must be a parsed configuration file. The function will +inform the user if the list of file path substrings exists in the parsed +configuration file, `config_file`.
+} diff --git a/man/get_tool_project.Rd b/man/get_tool_project.Rd new file mode 100644 index 00000000..26425912 --- /dev/null +++ b/man/get_tool_project.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/config.R +\name{get_tool_project} +\alias{get_tool_project} +\title{Returns the specified tool project from a parsed tool configuration file.} +\usage{ +get_tool_project(tool_name, config_file) +} +\arguments{ +\item{tool_name}{The name of the tool (e.g. "perceval" or "dv8").} + +\item{config_file}{The parsed configuration file obtained from \code{\link{parse_config}}.} +} +\value{ +The specified `tool_name` tool project from `config_file`. +} +\description{ +This function returns a path to a specified tool from a +specified parsed tool configuration file. The function takes the input +`tool_name` and uses it to index a specific tool project in a parsed +tool configuration file, `config_file`, where it then returns the specified +tool project. The function will inform the user if the specified attribute, +`tool_name`, exists in the parsed configuration file, `config_file`. +} diff --git a/man/get_topics.Rd b/man/get_topics.Rd new file mode 100644 index 00000000..8de86b5d --- /dev/null +++ b/man/get_topics.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/config.R +\name{get_topics} +\alias{get_topics} +\title{Returns the list of topics and keywords for analysis.} +\usage{ +get_topics(config_file) +} +\arguments{ +\item{config_file}{The parsed configuration file obtained from \code{\link{parse_config}}.} +} +\value{ +The list of keywords and topics for analysis. +} +\description{ +This function returns the list of keywords and topics for +analysis, that is specified in the input parameter `config_file`. The +input, `config_file` must be a parsed configuration file. 
The function will +inform the user if the list of keywords and topics exists in the parsed +configuration file, `config_file`. +} diff --git a/man/get_uctags_line_types.Rd b/man/get_uctags_line_types.Rd new file mode 100644 index 00000000..cff7af2d --- /dev/null +++ b/man/get_uctags_line_types.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/config.R +\name{get_uctags_line_types} +\alias{get_uctags_line_types} +\title{Returns the line types to keep for analysis.} +\usage{ +get_uctags_line_types(config_file) +} +\arguments{ +\item{config_file}{The parsed configuration file obtained from \code{\link{parse_config}}.} +} +\value{ +The line types to keep for analysis. +} +\description{ +This function returns the types of file-file dependencies that +should be considered, that are specified in the input parameter +`config_file`. The input, `config_file` must be a parsed configuration file. +The function will inform the user if the line types to keep exist in the +parsed configuration file, `config_file`. +} diff --git a/man/get_understand_code_language.Rd b/man/get_understand_code_language.Rd new file mode 100644 index 00000000..3e761bc1 --- /dev/null +++ b/man/get_understand_code_language.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/config.R +\name{get_understand_code_language} +\alias{get_understand_code_language} +\title{Returns the understand code language for analysis.} +\usage{ +get_understand_code_language(config_file) +} +\arguments{ +\item{config_file}{The parsed configuration file obtained from \code{\link{parse_config}}.} +} +\value{ +The code language for parsing with the understand tool. +} +\description{ +This function returns the specified code language that should +be used to parse dependencies with the understand tool, that +is specified in the input parameter `config_file`.
The input, `config_file` +must be a parsed configuration file. The function will inform the user if +the understand code language exists in the parsed configuration file, `config_file`. +} diff --git a/man/get_understand_keep_dependencies_type.Rd b/man/get_understand_keep_dependencies_type.Rd new file mode 100644 index 00000000..1172afab --- /dev/null +++ b/man/get_understand_keep_dependencies_type.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/config.R +\name{get_understand_keep_dependencies_type} +\alias{get_understand_keep_dependencies_type} +\title{Returns a list of the types of understand dependencies to keep for analysis.} +\usage{ +get_understand_keep_dependencies_type(config_file) +} +\arguments{ +\item{config_file}{The parsed configuration file obtained from \code{\link{parse_config}}.} +} +\value{ +A list of the types of understand dependencies to keep for analysis. +} +\description{ +This function returns the specified types of dependencies to +keep for analysis with the understand tool, that is specified in the input +parameter `config_file`. The input, `config_file` must be a parsed +configuration file. The function will inform the user if the list of the +types of understand dependencies exists in the parsed configuration file, `config_file`. +} diff --git a/man/get_understand_output_path.Rd b/man/get_understand_output_path.Rd new file mode 100644 index 00000000..7e314f03 --- /dev/null +++ b/man/get_understand_output_path.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/config.R +\name{get_understand_output_path} +\alias{get_understand_output_path} +\title{Returns the folder path for the output of the understand analysis.} +\usage{ +get_understand_output_path(config_file) +} +\arguments{ +\item{config_file}{The parsed configuration file obtained from \code{\link{parse_config}}.} +} +\value{ +The understand output folder path. 
+} +\description{ +This function returns the folder path that contains the +output of the understand analysis for the project, that is specified in the +input parameter `config_file`. The input, `config_file` must be a parsed +configuration file. The function will inform the user if the folder path +exists in the parsed configuration file, `config_file`. +} diff --git a/man/get_understand_project_path.Rd b/man/get_understand_project_path.Rd new file mode 100644 index 00000000..b2c2bc6d --- /dev/null +++ b/man/get_understand_project_path.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/config.R +\name{get_understand_project_path} +\alias{get_understand_project_path} +\title{Returns the folder path for the input of the understand analysis.} +\usage{ +get_understand_project_path(config_file) +} +\arguments{ +\item{config_file}{The parsed configuration file obtained from \code{\link{parse_config}}.} +} +\value{ +The understand project folder path. +} +\description{ +This function returns the folder path that contains the +input of the understand analysis for the project, that is specified in the +input parameter `config_file`. The input, `config_file` must be a parsed +configuration file. The function will inform the user if the folder path +exists in the parsed configuration file, `config_file`. +} diff --git a/man/get_window_end_commit.Rd b/man/get_window_end_commit.Rd new file mode 100644 index 00000000..20f24151 --- /dev/null +++ b/man/get_window_end_commit.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/config.R +\name{get_window_end_commit} +\alias{get_window_end_commit} +\title{Returns the ending commit for a window for analysis.} +\usage{ +get_window_end_commit(config_file) +} +\arguments{ +\item{config_file}{The parsed configuration file obtained from \code{\link{parse_config}}.} +} +\value{ +The end commit for a window for analysis. 
+} +\description{ +This function returns the ending commit for a window of time +for analysis (the time stamp is inferred from gitlog), that is specified in +the input parameter `config_file`. The input, `config_file` must be a parsed +configuration file. The function will inform the user if the end commit +exists in the parsed configuration file, `config_file`. +} diff --git a/man/get_window_size.Rd b/man/get_window_size.Rd new file mode 100644 index 00000000..d32899c9 --- /dev/null +++ b/man/get_window_size.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/config.R +\name{get_window_size} +\alias{get_window_size} +\title{Returns the size of a window for analysis.} +\usage{ +get_window_size(config_file) +} +\arguments{ +\item{config_file}{The parsed configuration file obtained from \code{\link{parse_config}}.} +} +\value{ +The size of a window for analysis. +} +\description{ +This function returns the size of a window, that is +specified in the input parameter `config_file`. The input, `config_file` +must be a parsed configuration file. The function will inform the user if +the window size exists in the parsed configuration file, `config_file`. +} diff --git a/man/get_window_start_commit.Rd b/man/get_window_start_commit.Rd new file mode 100644 index 00000000..75f25fcc --- /dev/null +++ b/man/get_window_start_commit.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/config.R +\name{get_window_start_commit} +\alias{get_window_start_commit} +\title{Returns the starting commit for a window for analysis.} +\usage{ +get_window_start_commit(config_file) +} +\arguments{ +\item{config_file}{The parsed configuration file obtained from \code{\link{parse_config}}.} +} +\value{ +The start commit for a window for analysis. 
+} +\description{ +This function returns the starting commit for a window of time +for analysis (the time stamp is inferred from gitlog), that is specified in +the input parameter `config_file`. The input, `config_file` must be a parsed +configuration file. The function will inform the user if the start commit +exists in the parsed configuration file, `config_file`. +} diff --git a/man/parse_config.Rd b/man/parse_config.Rd new file mode 100644 index 00000000..5d5ef6cc --- /dev/null +++ b/man/parse_config.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/config.R +\name{parse_config} +\alias{parse_config} +\title{Returns the parsed configuration file (.yml).} +\usage{ +parse_config(config_path) +} +\arguments{ +\item{config_path}{The path of the config file from the kaiaulu directory (e.g. "conf/kaiaulu.yml").} +} +\value{ +The parsed config file whose path is specified by `config_path`. +} +\description{ +The input file is expected to be in the .yml format. +The function returns a parsed version of the input .yml file, and it will +inform the user if the input .yml file path does not exist. The contents +of the input .yml file may contain machine-dependent paths that may need to +be modified by the user. +} diff --git a/vignettes/motif_analysis.Rmd b/vignettes/_motif_analysis.Rmd similarity index 95% rename from vignettes/motif_analysis.Rmd rename to vignettes/_motif_analysis.Rmd index cd58df16..3b7de8d5 100644 --- a/vignettes/motif_analysis.Rmd +++ b/vignettes/_motif_analysis.Rmd @@ -38,26 +38,23 @@ Kaiaulu re-implementation is easy to extend, and allow for any combination of mo We demonstrate here both the triangle and square motif as originally defined in our paper, which leverages all 3 types of the networks. The project of analysis is Kaiaulu itself, however, this can be applied to other open source projects! 
```{r} -tools_path <- "../tools.yml" -conf_path <- "../conf/kaiaulu.yml" +tool <- parse_config("../tools.yml") +conf <- parse_config("../conf/kaiaulu.yml") -tool <- yaml::read_yaml(tools_path) -scc_path <- tool[["scc"]] +scc_path <- get_tool_project("scc", tool) -oslom_dir_path <- tool[["oslom_dir"]] -oslom_undir_path <- tool[["oslom_undir"]] +oslom_dir_path <- get_tool_project("oslom_dir", tool) +oslom_undir_path <- get_tool_project("oslom_undir", tool) -conf <- yaml::read_yaml(conf_path) +perceval_path <- get_tool_project("perceval", tool) +git_repo_path <- get_git_repo_path(conf) +git_branch <- get_git_branches(conf)[1] -perceval_path <- tool[["perceval"]] -git_repo_path <- conf[["version_control"]][["log"]] -git_branch <- conf[["version_control"]][["branch"]][1] - -github_replies_path <- conf[["issue_tracker"]][["github"]][["replies"]] +github_replies_path <- get_github_issue_or_pr_comment_path(conf, "project_key_1") # Filters -file_extensions <- conf[["filter"]][["keep_filepaths_ending_with"]] -substring_filepath <- conf[["filter"]][["remove_filepaths_containing"]] +file_extensions <- get_file_extensions(conf) +substring_filepath <- get_substring_filepath(conf) ``` ## Parse Gitlog diff --git a/vignettes/refactoringminer_showcase.Rmd b/vignettes/_refactoringminer_showcase.Rmd similarity index 67% rename from vignettes/refactoringminer_showcase.Rmd rename to vignettes/_refactoringminer_showcase.Rmd index eb25a155..5d37ac2b 100644 --- a/vignettes/refactoringminer_showcase.Rmd +++ b/vignettes/_refactoringminer_showcase.Rmd @@ -17,14 +17,12 @@ require(kaiaulu) This is a wrapper for [Refactoring Miner](https://github.com/tsantalis/RefactoringMiner#running-refactoringminer-from-the-command-line). See README.md for setup details. 
```{r} -tool <- yaml::read_yaml("../tools.yml") -conf <- yaml::read_yaml("../conf/tomcat.yml") -rminer_path <- tool[["refactoring_miner"]] -git_repo_path <- conf[["version_control"]][["log"]] -start_commit <- conf[["analysis"]][["enumeration"]][["commit"]][1] -end_commit <- conf[["analysis"]][["enumeration"]][["commit"]][2] - - +tool <- parse_config("../tools.yml") +conf <- parse_config("../conf/tomcat.yml") +rminer_path <- get_tool_project("refactoring_miner", tool) +git_repo_path <- get_git_repo_path(conf) +start_commit <- get_enumeration_commits(conf)[1] +end_commit <- get_enumeration_commits(conf)[2] ``` A sample of parsed refactorings is shown below in json format: diff --git a/vignettes/reply_communication_showcase.Rmd b/vignettes/_reply_communication_showcase.Rmd similarity index 95% rename from vignettes/reply_communication_showcase.Rmd rename to vignettes/_reply_communication_showcase.Rmd index ad11ddc9..e8206172 100644 --- a/vignettes/reply_communication_showcase.Rmd +++ b/vignettes/_reply_communication_showcase.Rmd @@ -29,12 +29,11 @@ require(yaml) Load config file. 
```{r} -tool <- yaml::read_yaml("../tools.yml") -conf <- yaml::read_yaml("../conf/helix.yml") -perceval_path <- tool[["perceval"]] - -mbox_path <- conf[["mailing_list"]][["mbox"]] -jira_issue_comments_path <- conf[["issue_tracker"]][["jira"]][["issue_comments"]] +tool <- parse_config("../tools.yml") +conf <- parse_config("../conf/helix.yml") +perceval_path <- get_tool_project("perceval", tool) +mbox_path <- get_mbox_path(conf, "project_key_1") +jira_issue_comments_path <- get_jira_issues_comments_path(conf, "project_key_1") ``` diff --git a/vignettes/social_smell_showcase.Rmd b/vignettes/_social_smell_showcase.Rmd similarity index 97% rename from vignettes/social_smell_showcase.Rmd rename to vignettes/_social_smell_showcase.Rmd index eb34fb72..d2c6676e 100644 --- a/vignettes/social_smell_showcase.Rmd +++ b/vignettes/_social_smell_showcase.Rmd @@ -55,35 +55,29 @@ At the scope of this notebook, **only the first branch** (top) specified in the We also provide the path for `tools.yml`. Kaiaulu does not implement all available functionality from scratch. Conversely, it will also not expect all dependencies to be installed. Every function defined in the API expects as parameter a filepath to the external dependency binary. Tools.yml is a convenience file that stores all the binary paths, so it can be set once during setup and reused multiple times for analysis. You can find an example of `tools.yml` on the github repo from Kaiaulu root directory. For this notebook, you will need to install Perceval (use version 0.12.24) and OSLOM. Instructions to do so are available in the Kaiaulu README.md. Once you are finished, set the "perceval," "oslom_dir," and "oslom_undir" paths in your `tools.yml`. 
```{r} -tools_path <- "../tools.yml" -conf_path <- "../conf/helix.yml" +tool <- parse_config("../tools.yml") +conf <- parse_config("../conf/helix.yml") -tool <- yaml::read_yaml(tools_path) -scc_path <- tool[["scc"]] +scc_path <- get_tool_project("scc", tool) -oslom_dir_path <- tool[["oslom_dir"]] -oslom_undir_path <- tool[["oslom_undir"]] - -conf <- yaml::read_yaml(conf_path) - -perceval_path <- tool[["perceval"]] -git_repo_path <- conf[["version_control"]][["log"]] -git_branch <- conf[["version_control"]][["branch"]][1] - -start_commit <- conf[["analysis"]][["window"]][["start_commit"]] -end_commit <- conf[["analysis"]][["window"]][["end_commit"]] -window_size <- conf[["analysis"]][["window"]][["size_days"]] - -mbox_path <- conf[["mailing_list"]][["mbox"]] -github_replies_path <- conf[["issue_tracker"]][["github"]][["replies"]] -jira_issue_comments_path <- conf[["issue_tracker"]][["jira"]][["issue_comments"]] +oslom_dir_path <- get_tool_project("oslom_dir", tool) +oslom_undir_path <- get_tool_project("oslom_undir", tool) +perceval_path <- get_tool_project("perceval", tool) +git_repo_path <- get_git_repo_path(conf) +git_branch <- get_git_branches(conf)[1] +start_commit <- get_window_start_commit(conf) +end_commit <- get_window_end_commit(conf) +window_size <- get_window_size(conf) +mbox_path <- get_mbox_path(conf, "project_key_1") +github_replies_path <- get_github_issue_or_pr_comment_path(conf, "project_key_1") +jira_issue_comments_path <- get_jira_issues_comments_path(conf, "project_key_1") # Filters -file_extensions <- conf[["filter"]][["keep_filepaths_ending_with"]] -substring_filepath <- conf[["filter"]][["remove_filepaths_containing"]] +file_extensions <- get_file_extensions(conf) +substring_filepath <- get_substring_filepath(conf) ``` The remainder of this notebook does not require modifications. 
If you encounter an error in any code block below, chances are one or more parameters above have been specified incorrectly, or the project of choice may have led to an outlier case. Please open an issue if you encounter an error, or if not sure post on discussions in Kaiaulu's GitHub. **E-mailing bugs is discouraged as it is hard to track**. diff --git a/vignettes/blamed_line_types_showcase.Rmd b/vignettes/blamed_line_types_showcase.Rmd index 3f12eb05..9e4fa6b4 100644 --- a/vignettes/blamed_line_types_showcase.Rmd +++ b/vignettes/blamed_line_types_showcase.Rmd @@ -84,22 +84,22 @@ filter: We load the above various sections of the single .yaml project configuration file as follows: ```{r} -tool <- yaml::read_yaml("../tools.yml") -conf <- yaml::read_yaml("../conf/apr.yml") +tool <- parse_config("../tools.yml") +conf <- parse_config("../conf/apr.yml") # 3rd Party Tools -perceval_path <- tool[["perceval"]] -utags_path <- tool[["utags"]] +perceval_path <- get_tool_project("perceval", tool) +utags_path <- get_tool_project("utags", tool) # Ctags Line Types -kinds <- conf[["tool"]][["uctags"]][["keep_lines_type"]] +kinds <- get_uctags_line_types(conf) # Local Git Repo Folder Path -git_repo_path <- conf[["version_control"]][["log"]] +git_repo_path <- get_git_repo_path(conf) # File Filters -file_extensions <- conf[["filter"]][["keep_filepaths_ending_with"]] -substring_filepath <- conf[["filter"]][["remove_filepaths_containing"]] +file_extensions <- get_file_extensions(conf) +substring_filepath <- get_substring_filepath(conf) ``` diff --git a/vignettes/bug_count.Rmd b/vignettes/bug_count.Rmd index bcaf5774..894b9a87 100644 --- a/vignettes/bug_count.Rmd +++ b/vignettes/bug_count.Rmd @@ -36,19 +36,20 @@ This notebook leverages the JIRA issues data collected from the R Notebook `down As usual, the first step is to load the project configuration file. 
```{r} -tool <- yaml::read_yaml("../tools.yml") -conf <- yaml::read_yaml("../conf/geronimo.yml") -perceval_path <- tool[["perceval"]] -git_repo_path <- conf[["version_control"]][["log"]] +tool <- parse_config("../tools.yml") +conf <- parse_config("../conf/geronimo.yml") +perceval_path <- get_tool_project("perceval", tool) +git_repo_path <- get_git_repo_path(conf) # Issue ID Regex on Commit Messages -issue_id_regex <- conf[["commit_message_id_regex"]][["issue_id"]] +issue_id_regex <- get_issue_id_regex(conf) # Path to Jira Issues (obtained using `download_jira_data Notebook`) -jira_issues_path <- conf[["issue_tracker"]][["jira"]][["issues"]] +# Specify project_key_index in get_jira_issues_path() (e.g. "project_key_1") +jira_issues_path <- get_jira_issues_path(conf, "project_key_1") # Filters -file_extensions <- conf[["filter"]][["keep_filepaths_ending_with"]] -substring_filepath <- conf[["filter"]][["remove_filepaths_containing"]] +file_extensions <- get_file_extensions(conf) +substring_filepath <- get_substring_filepath(conf) ``` To establish bug count, we must map each file to its associated issue. In general (but not always), an open source project will adopt a commit message convention to label issue ids. For example, Kaiaulu uses `i #`. JIRA dictates issue ids in the format PROJECT-. We can use this assumption to find the issues that the git log commits are trying to fix. You can create your own regular expression by manually inspecting some of the commit messages (for example, compare the regular expression from the geronimo project used here against [its commit message](https://github.com/apache/geronimo/commits/trunk)). 
diff --git a/vignettes/causal_flaws.Rmd b/vignettes/causal_flaws.Rmd index a03cc3be..9676c2e2 100644 --- a/vignettes/causal_flaws.Rmd +++ b/vignettes/causal_flaws.Rmd @@ -44,46 +44,47 @@ For implementation details, please refer to the function reference documentation The first step is loading the project configuration file, which contains information for both the project data provenance, and the various parameters used for all the tools. Refer to [Kaiaulu's repo conf folder](https://github.com/sailuh/kaiaulu/tree/master/conf) for the configuration file of interest used in the code block below. ```{r} -tool <- yaml::read_yaml("../tools.yml") -#conf <- yaml::read_yaml("../conf/tse_cassandra.yml") -conf <- yaml::read_yaml("../conf/camel.yml") -perceval_path <- tool[["perceval"]] -dv8_path <- tool[["dv8"]] -scc_path <- tool[["scc"]] +tool <- parse_config("../tools.yml") +#conf <- parse_config("../conf/tse_cassandra.yml") +conf <- parse_config("../conf/camel.yml") +perceval_path <- get_tool_project("perceval", tool) +dv8_path <- get_tool_project("dv8", tool) +scc_path <- get_tool_project("scc", tool) # Gitlog parameters -git_repo_path <- conf[["version_control"]][["log"]] -git_branch <- conf[["version_control"]][["branch"]][1] # camel 1.6.0 +git_repo_path <- get_git_repo_path(conf) +git_branch <- get_git_branches(conf)[1] # camel 1.6.0 # Depends parameters -depends_jar_path <- tool[["depends"]] -language <- conf[["tool"]][["depends"]][["code_language"]] -keep_dependencies_type <- conf[["tool"]][["depends"]][["keep_dependencies_type"]] +depends_jar_path <- get_tool_project("depends", tool) +language <- get_depends_code_language(conf) +keep_dependencies_type <- get_depends_keep_dependencies_type(conf) # Mailing List -mbox_path <- conf[["mailing_list"]][["mbox"]] +# Specify project_key_index in get_mbox_path() (e.g. 
"project_key_1") +mbox_path <- get_mbox_path(conf, "project_key_1") # DV8 parameters -project_path <- conf[["tool"]][["dv8"]][["folder_path"]] +project_path <- get_dv8_folder_path(conf) project_name <- stringi::stri_split_regex(project_path,pattern = "/")[[1]] project_name <- project_name[length(project_name)] -flaws_params <- conf[["tool"]][["dv8"]][["architectural_flaws"]] +flaws_params <- get_dv8_flaws_params(conf) # Filters -file_extensions <- conf[["filter"]][["keep_filepaths_ending_with"]] -substring_filepath <- conf[["filter"]][["remove_filepaths_containing"]] -filter_commit_size <- conf[["filter"]][["remove_filepaths_on_commit_size_greather_than"]] +file_extensions <- get_file_extensions(conf) +substring_filepath <- get_substring_filepath(conf) +filter_commit_size <- get_filter_commit_size(conf) # Issue ID Regex on Commit Messages -issue_id_regex <- conf[["commit_message_id_regex"]][["issue_id"]] +issue_id_regex <- get_issue_id_regex(conf) # Path to Jira Issues (obtained using `download_jira_data Notebook`) -jira_issues_path <- conf[["issue_tracker"]][["jira"]][["issues"]] -jira_issue_comments_path <- conf[["issue_tracker"]][["jira"]][["issue_comments"]] - - +# Specify project_key_index in get_jira_issues_path() (e.g. "project_key_1") +jira_issues_path <- get_jira_issues_path(conf, "project_key_1") +# Specify project_key_index in get_jira_issues_comments_path() (e.g. "project_key_1") +jira_issue_comments_path <- get_jira_issues_comments_path(conf, "project_key_1") ``` # Raw Data Pre-Processing for DV8 diff --git a/vignettes/community_detection_showcase.Rmd b/vignettes/community_detection_showcase.Rmd index 5e77a4bd..ed9a9d74 100644 --- a/vignettes/community_detection_showcase.Rmd +++ b/vignettes/community_detection_showcase.Rmd @@ -48,24 +48,24 @@ The remainder of this Notebook illustrates two functions for some of Kaiaulu net As usual, the first step is to load the project configuration file. 
```{r} -tool <- yaml::read_yaml("../tools.yml") -conf <- yaml::read_yaml("../conf/apr.yml") +tool <- parse_config("../tools.yml") +conf <- parse_config("../conf/apr.yml") # 3rd Party Tools -perceval_path <- tool[["perceval"]] -utags_path <- tool[["utags"]] -oslom_dir_path <- tool[["oslom_dir"]] -oslom_undir_path <- tool[["oslom_undir"]] +perceval_path <- get_tool_project("perceval", tool) +utags_path <- get_tool_project("utags", tool) +oslom_dir_path <- get_tool_project("oslom_dir", tool) +oslom_undir_path <- get_tool_project("oslom_undir", tool) # Ctags Line Types -kinds <- conf[["tool"]][["uctags"]][["keep_lines_type"]] +kinds <- get_uctags_line_types(conf) # Local Git Repo Folder Path -git_repo_path <- conf[["version_control"]][["log"]] +git_repo_path <- get_git_repo_path(conf) # File Filters -file_extensions <- conf[["filter"]][["keep_filepaths_ending_with"]] -substring_filepath <- conf[["filter"]][["remove_filepaths_containing"]] +file_extensions <- get_file_extensions(conf) +substring_filepath <- get_substring_filepath(conf) ``` # Parse Git Log @@ -147,9 +147,9 @@ visIgraph(gcid,randomSeed = 1) ```{r} # Depends parameters -depends_jar_path <- tool[["depends"]] -language <- conf[["tool"]][["depends"]][["code_language"]] -keep_dependencies_type <- conf[["tool"]][["depends"]][["keep_dependencies_type"]] +depends_jar_path <- get_tool_project("depends", tool) +language <- get_depends_code_language(conf) +keep_dependencies_type <- get_depends_keep_dependencies_type(conf) ``` ```{r} diff --git a/vignettes/depends_showcase.Rmd b/vignettes/depends_showcase.Rmd index 925f5c23..d2c74e09 100644 --- a/vignettes/depends_showcase.Rmd +++ b/vignettes/depends_showcase.Rmd @@ -29,19 +29,19 @@ require(knitr) ``` ```{r} -tool <- yaml::read_yaml("../tools.yml") -#conf <- yaml::read_yaml("../conf/apr.yml") -conf <- yaml::read_yaml("../conf/helix.yml") -git_repo_path <- conf[["version_control"]][["log"]] +tool <- parse_config("../tools.yml") +#conf <- 
parse_config("../conf/apr.yml") +conf <- parse_config("../conf/helix.yml") +git_repo_path <- get_git_repo_path(conf) # Depends parameters -depends_jar_path <- tool[["depends"]] -language <- conf[["tool"]][["depends"]][["code_language"]] -keep_dependencies_type <- conf[["tool"]][["depends"]][["keep_dependencies_type"]] +depends_jar_path <- get_tool_project("depends", tool) +language <- get_depends_code_language(conf) +keep_dependencies_type <- get_depends_keep_dependencies_type(conf) # Filters -file_extensions <- conf[["filter"]][["keep_filepaths_ending_with"]] -substring_filepath <- conf[["filter"]][["remove_filepaths_containing"]] +file_extensions <- get_file_extensions(conf) +substring_filepath <- get_substring_filepath(conf) ``` # Construct File Network diff --git a/vignettes/download_github_comments.Rmd b/vignettes/download_github_comments.Rmd index 2cbcdadd..b0de1f36 100644 --- a/vignettes/download_github_comments.Rmd +++ b/vignettes/download_github_comments.Rmd @@ -44,10 +44,13 @@ Therefore, in this Notebook we have to rely on three endpoints from the GitHub A To use the pipeline, you must specify the organization and project of interest, and your token. Obtain a github token following the instructions [here](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token). ```{r} -conf <- yaml::read_yaml("../conf/kaiaulu.yml") -save_path <- path.expand(conf[["issue_tracker"]][["github"]][["replies"]]) # Path you wish to save all raw data. A folder with the repo name and sub-folders will be created. -owner <- conf[["issue_tracker"]][["github"]][["owner"]] # Has to match github organization (e.g. github.com/sailuh) -repo <- conf[["issue_tracker"]][["github"]][["repo"]] # Has to match github repository (e.g. 
github.com/sailuh/perceive) +conf <- parse_config("../conf/kaiaulu.yml") +save_path_issue_or_pr_comments <- path.expand(get_github_issue_or_pr_comment_path(conf, "project_key_1")) +save_path_issue <- get_github_issue_path(conf, "project_key_1") +save_path_pull_request <- get_github_pull_request_path(conf, "project_key_1") +save_path_commit <- get_github_commit_path(conf, "project_key_1") +owner <- get_github_owner(conf, "project_key_1") # Has to match github organization (e.g. github.com/sailuh) +repo <- get_github_repo(conf, "project_key_1") # Has to match github repository (e.g. github.com/sailuh/perceive) # your file github_token (a text file) contains the GitHub token API token <- scan("~/.ssh/github_token",what="character",quiet=TRUE) ``` @@ -56,19 +59,10 @@ token <- scan("~/.ssh/github_token",what="character",quiet=TRUE) In this section we obtain the raw data (.json) containing all information the GitHub API endpoint provides. We parse the information of interest in the subsequent section. -```{r eval = FALSE} -dir.create(paste0(save_path)) -``` - ## Issues First we will obtain all the issues (i.e. "first comments"). -```{r} -save_path_issue <- paste0(save_path,"/issue/") -``` - - ```{r Collect all issues, eval = FALSE} gh_response <- github_api_project_issue(owner,repo,token) dir.create(save_path_issue) @@ -81,11 +75,6 @@ github_api_iterate_pages(token,gh_response, Next we obtain the "first comment" of every pull request. -```{r} -save_path_pull_request <- paste0(save_path,"/pull_request/") -``` - - ```{r Collect all pull requests, eval = FALSE} gh_response <- github_api_project_pull_request(owner,repo,token) dir.create(save_path_pull_request) @@ -98,11 +87,6 @@ github_api_iterate_pages(token,gh_response, Finally we obtain the comments of both issue and pull requests (which does not contain the data obtained in the prior two endpoints). 
-```{r} -save_path_issue_or_pr_comments <- paste0(save_path,"/issue_or_pr_comment/") -``` - - ```{r Collect all issue and pull request comments, eval = FALSE} gh_response <- github_api_project_issue_or_pr_comments(owner,repo,token) dir.create(save_path_issue_or_pr_comments) @@ -117,10 +101,6 @@ The three endpoints used above do not contain author and e-mail information, onl To do so, we can use the committer endpoint. -```{r} -save_path_commit <- paste0(save_path,"/commit/") -``` - ```{r Collect all authors and committers name and e-mail, eval = FALSE} gh_response <- github_api_project_commits(owner,repo,token) dir.create(save_path_commit) @@ -189,8 +169,8 @@ Note because we obtain the authors and committers name and e-mail, **only commen Below we show the result of such merge, including the name and e-mail fields obtained from the commit table. As before, we do not display the body column to prevent breaking the HTML format. -```{r} -replies <- parse_github_replies(save_path) +```{r eval = FALSE} +replies <- parse_github_replies(save_path_issue_or_pr_comments) tail(replies,2) %>% gt(auto_align = FALSE) diff --git a/vignettes/download_jira_issues.Rmd b/vignettes/download_jira_issues.Rmd index da2df744..3a699c61 100644 --- a/vignettes/download_jira_issues.Rmd +++ b/vignettes/download_jira_issues.Rmd @@ -45,16 +45,26 @@ To try out this Notebook, try the geronimo configuration file, and follow along First, we will load the Kaiaulu configuration file: ```{r} -conf <- yaml::read_yaml("../conf/kaiaulu.yml") +conf <- parse_config("../conf/kaiaulu.yml") + # Project domain -issue_tracker_domain <- conf[["issue_tracker"]][["jira"]][["domain"]] +# Specify project_key_index in get_jira_domain() (e.g. "project_key_1") +issue_tracker_domain <- get_jira_domain(conf, "project_key_1") + # Project key -issue_tracker_project_key <- conf[["issue_tracker"]][["jira"]][["project_key"]] +# Specify project_key_index in get_jira_project_key_name() (e.g. 
"project_key_1") +issue_tracker_project_key <- get_jira_project_key_name(conf, "project_key_1") + # Altered save paths. Important for naming conventions -save_path_issue_tracker_issues <- conf[["issue_tracker"]][["jira"]][["issues"]] -save_path_issue_tracker_issue_comments <- conf[["issue_tracker"]][["jira"]][["issue_comments"]] +# Specify project_key_index in get_jira_issues_path() (e.g. "project_key_1") +save_path_issue_tracker_issues <- get_jira_issues_path(conf, "project_key_1") + +# Specify project_key_index in get_jira_issues_comments_path() (e.g. "project_key_1") +save_path_issue_tracker_issue_comments <- get_jira_issues_comments_path(conf, "project_key_1") + # Unaltered save paths from config file for use with refresh function -refresh_issues <- conf[["issue_tracker"]][["jira"]][["issues"]] +# Specify project_key_index in get_jira_issues_path() (e.g. "project_key_1") +refresh_issues <- get_jira_issues_path(conf, "project_key_1") ``` If authentication is needed, save your username (e-mail) and password (API token) in a file, e.g. atlassian_credentials, where the first line is the username, and the second the API token, e.g. diff --git a/vignettes/download_mod_mbox.Rmd b/vignettes/download_mod_mbox.Rmd deleted file mode 100644 index 44a354b4..00000000 --- a/vignettes/download_mod_mbox.Rmd +++ /dev/null @@ -1,52 +0,0 @@ ---- -title: "Download Mod Mbox Mailing List Archives" -output: - html_document: - toc: true - number_sections: true -vignette: > - %\VignetteEngine{knitr::rmarkdown} - %\VignetteIndexEntry{Download Mod Mbox Mailing List Archives} - %\VignetteEncoding{UTF-8} ---- - - -```{r} -rm(list = ls()) -seed <- 1 -set.seed(seed) -``` - -```{r warning=FALSE,message=FALSE} -require(kaiaulu) -require(data.table) -``` - -# Introduction - -Mailing list data is stored in a variety of archives (e.g. see [Apache Geronimo](https://geronimo.apache.org/mailing-lists.html)). 
This notebook showcases how to obtain data from mod_mbox archives, which is adopted by the Apache Software Foundation. - -## Project Configuration File - -As usual, the first step is to load the project configuration file. - -# Project Configuration File - -```{r} -conf <- yaml::read_yaml("../conf/helix.yml") -save_path_mbox <- conf[["mailing_list"]][["mbox"]] -mod_mbox_url <- conf[["mailing_list"]][["domain"]] -mailing_list <- conf[["mailing_list"]][["list_key"]] -start_year <- 2017 -end_year <- 2018 -``` - -```{r eval = FALSE} -mbox <- download_mod_mbox_per_month(base_url = mod_mbox_url, - mailing_list = mailing_list, - from_year=start_year, - to_year=end_year, - save_folder_path = save_path_mbox, - verbose = TRUE) -``` - diff --git a/vignettes/dv8_showcase.Rmd b/vignettes/dv8_showcase.Rmd index 3d95f145..2e3e9a3c 100644 --- a/vignettes/dv8_showcase.Rmd +++ b/vignettes/dv8_showcase.Rmd @@ -30,30 +30,30 @@ require(openxlsx) ``` ```{r} -tool <- yaml::read_yaml("../tools.yml") -conf <- yaml::read_yaml("../conf/calculator.yml") -perceval_path <- tool[["perceval"]] -dv8_path <- tool[["dv8"]] +tool <- parse_config("../tools.yml") +conf <- parse_config("../conf/calculator.yml") +perceval_path <- get_tool_project("perceval", tool) +dv8_path <- get_tool_project("dv8", tool) # Gitlog parameters -git_repo_path <- conf[["version_control"]][["log"]] +git_repo_path <- get_git_repo_path(conf) # Depends parameters -depends_jar_path <- tool[["depends"]] -language <- conf[["tool"]][["depends"]][["code_language"]] -keep_dependencies_type <- conf[["tool"]][["depends"]][["keep_dependencies_type"]] +depends_jar_path <- get_tool_project("depends", tool) +language <- get_depends_code_language(conf) +keep_dependencies_type <- get_depends_keep_dependencies_type(conf) # DV8 parameters -project_path <- conf[["tool"]][["dv8"]][["folder_path"]] +project_path <- get_dv8_folder_path(conf) project_name <- stringi::stri_split_regex(project_path,pattern = "/")[[1]] project_name <- 
project_name[length(project_name)] -flaws_params <- conf[["tool"]][["dv8"]][["architectural_flaws"]] +flaws_params <- get_dv8_flaws_params(conf) # Filters -file_extensions <- conf[["filter"]][["keep_filepaths_ending_with"]] -substring_filepath <- conf[["filter"]][["remove_filepaths_containing"]] -filter_commit_size <- conf[["filter"]][["remove_filepaths_on_commit_size_greather_than"]] +file_extensions <- get_file_extensions(conf) +substring_filepath <- get_substring_filepath(conf) +filter_commit_size <- get_filter_commit_size(conf) ``` # Preparing Gitlog and Dependencies for DV8 diff --git a/vignettes/github_api_showcase.Rmd b/vignettes/github_api_showcase.Rmd index 588f4482..900c5451 100644 --- a/vignettes/github_api_showcase.Rmd +++ b/vignettes/github_api_showcase.Rmd @@ -45,10 +45,12 @@ The goal of the following steps is to obtain the data when a project started ass To use the pipeline, you must specify the organization and project of interest, and your token. ```{r} -conf <- yaml::read_yaml("../conf/kaiaulu.yml") -owner <- conf[["issue_tracker"]][["github"]][["owner"]] # Has to match github organization (e.g. github.com/sailuh) -repo <- conf[["issue_tracker"]][["github"]][["repo"]] # Has to match github repository (e.g. github.com/sailuh/perceive) -save_path <- path.expand(conf[["issue_tracker"]][["github"]][["replies"]]) # Path you wish to save all raw data. A folder with the repo name and sub-folders will be created. +conf <- parse_config("../conf/kaiaulu.yml") +owner <- get_github_owner(conf, "project_key_1") # Has to match github organization (e.g. github.com/sailuh) +repo <- get_github_repo(conf, "project_key_1") # Has to match github repository (e.g. 
github.com/sailuh/perceive) +save_path_issue_or_pr_comments <- path.expand(get_github_issue_or_pr_comment_path(conf, "project_key_1")) +save_path_issue_event <- get_github_issue_event_path(conf, "project_key_1") +save_path_commit <- get_github_commit_path(conf, "project_key_1") # your file github_token contains the GitHub token API obtained in the steps above token <- scan("~/.ssh/github_token",what="character",quiet=TRUE) ``` @@ -57,23 +59,12 @@ token <- scan("~/.ssh/github_token",what="character",quiet=TRUE) In this section we obtain the raw data (.json) containing all information the GitHub API endpoint provides. We parse the information of interest in the subsequent section. -```{r eval = FALSE} -dir.create(paste0(save_path)) -``` - - ## Issue Events First we obtain all issue events of the project, so we may later subset issue assignments. -```{r} -save_path_issue_event <- paste0(save_path,"/issue_event/") -``` - - ```{r Collect all issue events, eval = FALSE} gh_response <- github_api_project_issue_events(owner,repo,token) -dir.create(save_path_issue_event) github_api_iterate_pages(token,gh_response,save_path_issue_event,prefix="issue_event") ``` @@ -81,14 +72,8 @@ github_api_iterate_pages(token,gh_response,save_path_issue_event,prefix="issue_e Next we download commit data from GitHub API. This will be used to know which users in the issue events have or not merge permissions. 
-```{r} -save_path_commit <- paste0(save_path,"/commit/") -``` - - ```{r Collect all project commit messages, eval = FALSE} gh_response <- github_api_project_commits(owner,repo,token) -dir.create(save_path_commit) github_api_iterate_pages(token,gh_response,save_path_commit,prefix="commit") ``` diff --git a/vignettes/gitlog_entity_showcase.Rmd b/vignettes/gitlog_entity_showcase.Rmd index d6e4cbb8..87a8747b 100644 --- a/vignettes/gitlog_entity_showcase.Rmd +++ b/vignettes/gitlog_entity_showcase.Rmd @@ -32,22 +32,22 @@ require(knitr) # Project Configuration File ```{r} -tool <- yaml::read_yaml("../tools.yml") -conf <- yaml::read_yaml("../conf/kaiaulu.yml") +tool <- parse_config("../tools.yml") +conf <- parse_config("../conf/kaiaulu.yml") # 3rd Party Tools -perceval_path <- tool[["perceval"]] -utags_path <- tool[["utags"]] +perceval_path <- get_tool_project("perceval", tool) +utags_path <- get_tool_project("utags", tool) # Ctags Line Types -kinds <- conf[["tool"]][["uctags"]][["keep_lines_type"]] +kinds <- get_uctags_line_types(conf) # Local Git Repo Folder Path -git_repo_path <- conf[["version_control"]][["log"]] +git_repo_path <- get_git_repo_path(conf) # File Filters -file_extensions <- conf[["filter"]][["keep_filepaths_ending_with"]] -substring_filepath <- conf[["filter"]][["remove_filepaths_containing"]] +file_extensions <- get_file_extensions(conf) +substring_filepath <- get_substring_filepath(conf) ``` # Construct Collaboration Network diff --git a/vignettes/gitlog_showcase.Rmd b/vignettes/gitlog_showcase.Rmd index 9e4a95b7..64854c2a 100644 --- a/vignettes/gitlog_showcase.Rmd +++ b/vignettes/gitlog_showcase.Rmd @@ -66,16 +66,16 @@ The file makes all assumptions explicit to you when using the code. 
Note these a The following code block reads the information explained just now: ```{r} -tool <- yaml::read_yaml("../tools.yml") -conf <- yaml::read_yaml("../conf/kaiaulu.yml") -perceval_path <- tool[["perceval"]] -git_repo_path <- conf[["version_control"]][["log"]] -git_branch <- conf[["version_control"]][["branch"]][1] -nvdfeed_folder_path <- conf[["vulnerabilities"]][["nvd_feed"]] +tool <- parse_config("../tools.yml") +conf <- parse_config("../conf/kaiaulu.yml") +perceval_path <- get_tool_project("perceval", tool) +git_repo_path <- get_git_repo_path(conf) +git_branch <- get_git_branches(conf)[1] +nvdfeed_folder_path <- get_nvdfeed_folder_path(conf) # Filters -file_extensions <- conf[["filter"]][["keep_filepaths_ending_with"]] -substring_filepath <- conf[["filter"]][["remove_filepaths_containing"]] +file_extensions <- get_file_extensions(conf) +substring_filepath <- get_substring_filepath(conf) ``` This is all the project configuration files are used for. If you inspect the variables above, you will see they are just strings. As a reminder, the tools.yml is where you store the filepaths to third party software in your computer. Please see Kaiaulu's README.md for details. As a rule of thumb, any R Notebooks in Kaiaulu load the project configuration file at the start, much like you would normally initialize variables at the start of your source code. diff --git a/vignettes/gitlog_vulnerabilities_showcase.Rmd b/vignettes/gitlog_vulnerabilities_showcase.Rmd index 09474cdb..5ed6b745 100644 --- a/vignettes/gitlog_vulnerabilities_showcase.Rmd +++ b/vignettes/gitlog_vulnerabilities_showcase.Rmd @@ -28,16 +28,16 @@ require(yaml) Load config file. 
```{r} -tool <- yaml::read_yaml("../tools.yml") -conf <- yaml::read_yaml("../conf/chromium.yml") -perceval_path <- tool[["perceval"]] -git_repo_path <- conf[["version_control"]][["log"]] -nvdfeed_folder_path <- conf[["vulnerabilities"]][["nvd_feed"]] -cveid_regex <- conf[["commit_message_id_regex"]][["cve_id"]] +tool <- parse_config("../tools.yml") +conf <- parse_config("../conf/chromium.yml") +perceval_path <- get_tool_project("perceval", tool) +git_repo_path <- get_git_repo_path(conf) +nvdfeed_folder_path <- get_nvdfeed_folder_path(conf) +cveid_regex <- get_cveid_regex(conf) # Filters -file_extensions <- conf[["filter"]][["keep_filepaths_ending_with"]] -substring_filepath <- conf[["filter"]][["remove_filepaths_containing"]] +file_extensions <- get_file_extensions(conf) +substring_filepath <- get_substring_filepath(conf) ``` # Construct Contribution Network diff --git a/vignettes/graph_gof_showcase.Rmd b/vignettes/graph_gof_showcase.Rmd index f6f9d76a..19c3c89f 100644 --- a/vignettes/graph_gof_showcase.Rmd +++ b/vignettes/graph_gof_showcase.Rmd @@ -89,22 +89,22 @@ require(gt) ```{r} -tool <- yaml::read_yaml("../tools.yml") -conf <- yaml::read_yaml("../conf/junit5.yml") -srcml_path <- tool[["srcml"]] -pattern4_path <- tool[["pattern4"]] +tool <- parse_config("../tools.yml") +conf <- parse_config("../conf/junit5.yml") +srcml_path <- get_tool_project("srcml", tool) +pattern4_path <- get_tool_project("pattern4", tool) -git_repo_path <- conf[["version_control"]][["log"]] +git_repo_path <- get_git_repo_path(conf) folder_path <- stri_replace_last(git_repo_path,replacement="",regex=".git") # Tool Parameters -srcml_filepath <- conf[["tool"]][["srcml"]][["srcml_path"]] -class_folder_path <- conf[["tool"]][["pattern4"]][["class_folder_path"]] -pattern4_output_filepath <- conf[["tool"]][["pattern4"]][["output_filepath"]] +srcml_filepath <- get_srcml_filepath(conf) +class_folder_path <- get_pattern4_folder_path(conf) +pattern4_output_filepath <- get_pattern4_filepath(conf) # 
Filters -file_extensions <- conf[["filter"]][["keep_filepaths_ending_with"]] -substring_filepath <- conf[["filter"]][["remove_filepaths_containing"]] +file_extensions <- get_file_extensions(conf) +substring_filepath <- get_substring_filepath(conf) ``` This is all the project configuration files are used for. If you inspect the variables above, you will see they are just strings. As a reminder, the tools.yml is where you store the filepaths to third party software in your computer. Please see Kaiaulu's README.md for details. As a rule of thumb, any R Notebooks in Kaiaulu load the project configuration file at the start, much like you would normally initialize variables at the start of your source code. diff --git a/vignettes/issue_social_smell_showcase.Rmd b/vignettes/issue_social_smell_showcase.Rmd index 42459ec0..8d537c54 100644 --- a/vignettes/issue_social_smell_showcase.Rmd +++ b/vignettes/issue_social_smell_showcase.Rmd @@ -55,33 +55,30 @@ At the scope of this notebook, **only the first branch** (top) specified in the We also provide the path for `tools.yml`. Kaiaulu does not implement all available functionality from scratch. Conversely, it will also not expect all dependencies to be installed. Every function defined in the API expects as parameter a filepath to the external dependency binary. Tools.yml is a convenience file that stores all the binary paths, so it can be set once during setup and reused multiple times for analysis. You can find an example of `tools.yml` on the github repo from Kaiaulu root directory. 
```{r} -tools_path <- "../tools.yml" -conf_path <- "../conf/openssl.yml" +tool <- parse_config("../tools.yml") +conf <- parse_config("../conf/openssl.yml") -tool <- yaml::read_yaml(tools_path) -scc_path <- tool[["scc"]] +scc_path <- get_tool_project("scc", tool) -oslom_dir_path <- tool[["oslom_dir"]] -oslom_undir_path <- tool[["oslom_undir"]] +oslom_dir_path <- get_tool_project("oslom_dir", tool) +oslom_undir_path <- get_tool_project("oslom_undir", tool) -conf <- yaml::read_yaml(conf_path) +perceval_path <- get_tool_project("perceval", tool) +git_repo_path <- get_git_repo_path(conf) +git_branch <- get_git_branches(conf)[1] -perceval_path <- tool[["perceval"]] -git_repo_path <- conf[["version_control"]][["log"]] -git_branch <- conf[["version_control"]][["branch"]][1] +#start_commit <- get_window_start_commit(conf) +#end_commit <- get_window_end_commit(conf) +window_size <- get_window_size(conf) -#start_commit <- conf[["analysis"]][["window"]][["start_commit"]] -#end_commit <- conf[["analysis"]][["window"]][["end_commit"]] -window_size <- conf[["analysis"]][["window"]][["size_days"]] +mbox_path <- get_mbox_path(conf, "project_key_1") -mbox_path <- conf[["mailing_list"]][["mbox"]] - -nvdfeed_folder_path <- conf[["vulnerabilities"]][["nvd_feed"]] -cveid_regex <- conf[["commit_message_id_regex"]][["cve_id"]] +nvdfeed_folder_path <- get_nvdfeed_folder_path(conf) +cveid_regex <- get_cveid_regex(conf) # Filters -file_extensions <- conf[["filter"]][["keep_filepaths_ending_with"]] -substring_filepath <- conf[["filter"]][["remove_filepaths_containing"]] +file_extensions <- get_file_extensions(conf) +substring_filepath <- get_substring_filepath(conf) ``` The remainder of this notebook does not require modifications. If you encounter an error in any code block below, chances are one or more parameters above have been specified incorrectly, or the project of choice may have led to an outlier case. 
Please open an issue if you encounter an error, or if not sure post on discussions in Kaiaulu's GitHub. **E-mailing bugs is discouraged as it is hard to track**. diff --git a/vignettes/line_metrics_showcase.Rmd b/vignettes/line_metrics_showcase.Rmd index 9bbd0d28..39b72d7d 100644 --- a/vignettes/line_metrics_showcase.Rmd +++ b/vignettes/line_metrics_showcase.Rmd @@ -57,16 +57,16 @@ filter: We load the necessary information in this code block: ```{r} -tool <- yaml::read_yaml("../tools.yml") -conf <- yaml::read_yaml("../conf/apr.yml") -scc_path <- tool[["scc"]] +tool <- parse_config("../tools.yml") +conf <- parse_config("../conf/apr.yml") +scc_path <- get_tool_project("scc", tool) -git_repo_path <- conf[["version_control"]][["log"]] -git_branch <- conf[["version_control"]][["branch"]][1] +git_repo_path <- get_git_repo_path(conf) +git_branch <- get_git_branches(conf)[1] # Filters -file_extensions <- conf[["filter"]][["keep_filepaths_ending_with"]] -substring_filepath <- conf[["filter"]][["remove_filepaths_containing"]] +file_extensions <- get_file_extensions(conf) +substring_filepath <- get_substring_filepath(conf) ``` The loaded variables are just strings or numbers. You can always inspect any variable to double check what is being used in the analysis. 
For instance, we are interested on analyzing a specific release of APR here: diff --git a/vignettes/text_gof_showcase.Rmd b/vignettes/text_gof_showcase.Rmd index bdb69649..1f065cba 100644 --- a/vignettes/text_gof_showcase.Rmd +++ b/vignettes/text_gof_showcase.Rmd @@ -112,22 +112,22 @@ require(gt) ```{r} -tool <- yaml::read_yaml("../tools.yml") -conf <- yaml::read_yaml("../conf/junit5.yml") -srcml_path <- tool[["srcml"]] +tool <- parse_config("../tools.yml") +conf <- parse_config("../conf/junit5.yml") +srcml_path <- get_tool_project("srcml", tool) -git_repo_path <- conf[["version_control"]][["log"]] +git_repo_path <- get_git_repo_path(conf) folder_path <- stri_replace_last(git_repo_path,replacement="",regex=".git") # Tool Parameters -srcml_filepath <- conf[["tool"]][["srcml"]][["srcml_path"]] +srcml_filepath <- get_srcml_filepath(conf) # Filters -file_extensions <- conf[["filter"]][["keep_filepaths_ending_with"]] -substring_filepath <- conf[["filter"]][["remove_filepaths_containing"]] +file_extensions <- get_file_extensions(conf) +substring_filepath <- get_substring_filepath(conf) # Analysis -topics <- conf[["analysis"]][["topics"]] +topics <- get_topics(conf) ``` This is all the project configuration files are used for. If you inspect the variables above, you will see they are just strings. As a reminder, the tools.yml is where you store the filepaths to third party software in your computer. Please see Kaiaulu's README.md for details. As a rule of thumb, any R Notebooks in Kaiaulu load the project configuration file at the start, much like you would normally initialize variables at the start of your source code. From def166089d5917bb4f86b2d3a6a7147c34945af9 Mon Sep 17 00:00:00 2001 From: Carlos Paradis Date: Tue, 12 Nov 2024 03:32:51 -0800 Subject: [PATCH 39/80] i #284 minor fixes and XML dependency Moves some of the narrative to functions. Function documentation can be accessed on demand instead of placed on Notebook. 
Some of the docs were moved inline into the pipermail function. Signed-off-by: Carlos Paradis --- DESCRIPTION | 2 +- R/mail.R | 12 ++++ man/commit_message_id_coverage.Rd | 2 +- man/download_jira_issues_by_date.Rd | 4 +- man/download_jira_issues_by_issue_key.Rd | 4 +- man/metric_churn_per_commit_interval.Rd | 2 +- man/metric_churn_per_commit_per_file.Rd | 2 +- man/metric_file_bug_churn.Rd | 2 +- man/metric_file_bug_frequency.Rd | 2 +- man/metric_file_churn.Rd | 2 +- man/metric_file_non_bug_churn.Rd | 2 +- man/metric_file_non_bug_frequency.Rd | 2 +- man/motif_factory_anti_square.Rd | 4 +- man/motif_factory_anti_triangle.Rd | 4 +- man/motif_factory_square.Rd | 4 +- man/motif_factory_triangle.Rd | 4 +- ...e_bugzilla_perceval_rest_issue_comments.Rd | 6 +- ...lla_perceval_traditional_issue_comments.Rd | 6 +- man/parse_bugzilla_rest_comments.Rd | 6 +- man/parse_bugzilla_rest_issues.Rd | 4 +- man/parse_bugzilla_rest_issues_comments.Rd | 4 +- man/parse_commit_message_id.Rd | 6 +- man/parse_dependencies.Rd | 6 +- man/parse_dv8_clusters.Rd | 6 +- man/parse_gitlog.Rd | 6 +- man/parse_jira.Rd | 4 +- man/parse_jira_latest_date.Rd | 6 +- man/parse_jira_rss_xml.Rd | 6 +- man/parse_mbox.Rd | 4 +- man/parse_mbox_latest_date.Rd | 4 +- man/parse_nvdfeed.Rd | 8 +-- man/refresh_jira_issues.Rd | 8 +-- vignettes/download_mail.Rmd | 59 ++----------------- 33 files changed, 82 insertions(+), 121 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 0725c39f..3cbecaaf 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -45,7 +45,7 @@ Imports: httr (>= 1.4.1), curl (>= 4.3), gh (>= 1.2.0), - XML (>= 3.99), + XML (>= 3.99-0.7), RColorBrewer (>= 1.1-2), cli (>= 2.0.2), docopt (>= 0.7.1) diff --git a/R/mail.R b/R/mail.R index dd4e28b4..d91c1653 100644 --- a/R/mail.R +++ b/R/mail.R @@ -30,6 +30,7 @@ download_pipermail <- function(mailing_list, start_year_month, end_year_month, save_folder_path, verbose = TRUE) { ########## Download and Parse Mailing List HTML for Links ########## + # Ensure 
mailing_list url ends with a slash, which is important when constructing links for downloading files, # since the extracted links are relative to the base URL. # e.g.base url: https://mta.openssl.org/pipermail/openssl-announce/ and extracted link: 2024-June.txt.gz @@ -37,6 +38,10 @@ download_pipermail <- function(mailing_list, start_year_month, end_year_month, s mailing_list <- stringi::stri_c(mailing_list, "/") } + # Archive Index Retrieval + # Begins by downloading an HTML page that lists the URLs + # for the monthly archives, which are typically available in .txt or .gz formats. + # Sends a GET request to the mailing list’s URL to retrieve contents. This is the main page of the mailing list archive, # which contains links to individual month files (in .txt or .gz format). response <- httr::GET(mailing_list, httr::timeout(60)) @@ -111,6 +116,9 @@ download_pipermail <- function(mailing_list, start_year_month, end_year_month, s txt_url <- stringi::stri_c(mailing_list, gsub("\\.gz$", "", link)) gz_url <- stringi::stri_c(mailing_list, link) + # The function attempts to download the .txt file for each month. + # If the .txt file is unavailable, it falls back to downloading the + # .gz (gzipped) file. # Attempt to download the .txt file first download_url <- txt_url response <- httr::GET(download_url, httr::timeout(60)) @@ -143,6 +151,8 @@ download_pipermail <- function(mailing_list, start_year_month, end_year_month, s gz_file_path <- file.path(save_folder_path, stringi::stri_c('kaiaulu_', year_month_clean, '.mbox.gz')) httr::GET(download_url, httr::write_disk(gz_file_path, overwrite = TRUE), httr::timeout(60)) + # If a .gz file is downloaded, the function unzips it and converts it into an .mbox file. + # The original .gz file is deleted after extraction to save space. # Unzip the .gz file and save the contents as a .mbox file. 
gz_con <- gzfile(gz_file_path, open = "rb") out_con <- file(dest, open = "wb") @@ -172,6 +182,8 @@ download_pipermail <- function(mailing_list, start_year_month, end_year_month, s # List the files in the save_folder_path. downloaded_files_in_folder <- list.files(save_folder_path, pattern = "kaiaulu_\\d{6}\\.mbox$", full.names = FALSE) + # The downloaded .mbox files are saved in the specified folder with the + # naming convention kaiaulu_YYYYMM.mbox, where YYYYMM represents the year and month. # Extract the YYYYMM from the file names. downloaded_dates <- as.numeric(sub("kaiaulu_(\\d{6})\\.mbox", "\\1", downloaded_files_in_folder)) diff --git a/man/commit_message_id_coverage.Rd b/man/commit_message_id_coverage.Rd index e7f0c6ef..68fad761 100644 --- a/man/commit_message_id_coverage.Rd +++ b/man/commit_message_id_coverage.Rd @@ -22,9 +22,9 @@ Calculates the number of commits from the git log which contains the message id. \code{\link{parse_gitlog}} to obtain additions and deletions from gitlog Other {metrics}: -\code{\link{metric_churn}()}, \code{\link{metric_churn_per_commit_interval}()}, \code{\link{metric_churn_per_commit_per_file}()}, +\code{\link{metric_churn}()}, \code{\link{metric_file_bug_churn}()}, \code{\link{metric_file_bug_frequency}()}, \code{\link{metric_file_churn}()}, diff --git a/man/download_jira_issues_by_date.Rd b/man/download_jira_issues_by_date.Rd index ccb3c7c2..697fdb48 100644 --- a/man/download_jira_issues_by_date.Rd +++ b/man/download_jira_issues_by_date.Rd @@ -72,13 +72,13 @@ For further details on the `created` JQL Query see [the associated JIRA API docu \code{\link{refresh_jira_issues}} to obtain more recent data from any of the downloader functions Other jira: -\code{\link{download_jira_issues}()}, \code{\link{download_jira_issues_by_issue_key}()}, +\code{\link{download_jira_issues}()}, \code{\link{refresh_jira_issues}()} Other downloaders: -\code{\link{download_jira_issues}()}, \code{\link{download_jira_issues_by_issue_key}()}, 
+\code{\link{download_jira_issues}()}, \code{\link{refresh_jira_issues}()} } \concept{downloaders} diff --git a/man/download_jira_issues_by_issue_key.Rd b/man/download_jira_issues_by_issue_key.Rd index 8213ee17..b452878f 100644 --- a/man/download_jira_issues_by_issue_key.Rd +++ b/man/download_jira_issues_by_issue_key.Rd @@ -67,13 +67,13 @@ For further details on the `issueKey` JQL Query see [the associated JIRA API doc \code{\link{refresh_jira_issues}} to obtain more recent data from any of the downloader functions Other jira: -\code{\link{download_jira_issues}()}, \code{\link{download_jira_issues_by_date}()}, +\code{\link{download_jira_issues}()}, \code{\link{refresh_jira_issues}()} Other downloaders: -\code{\link{download_jira_issues}()}, \code{\link{download_jira_issues_by_date}()}, +\code{\link{download_jira_issues}()}, \code{\link{refresh_jira_issues}()} } \concept{downloaders} diff --git a/man/metric_churn_per_commit_interval.Rd b/man/metric_churn_per_commit_interval.Rd index 6969492d..21f5e494 100644 --- a/man/metric_churn_per_commit_interval.Rd +++ b/man/metric_churn_per_commit_interval.Rd @@ -20,8 +20,8 @@ Calculates the churn metric for a sequence of commits Other {metrics}: \code{\link{commit_message_id_coverage}()}, -\code{\link{metric_churn}()}, \code{\link{metric_churn_per_commit_per_file}()}, +\code{\link{metric_churn}()}, \code{\link{metric_file_bug_churn}()}, \code{\link{metric_file_bug_frequency}()}, \code{\link{metric_file_churn}()}, diff --git a/man/metric_churn_per_commit_per_file.Rd b/man/metric_churn_per_commit_per_file.Rd index 577d3f63..75b48c85 100644 --- a/man/metric_churn_per_commit_per_file.Rd +++ b/man/metric_churn_per_commit_per_file.Rd @@ -20,8 +20,8 @@ Calculates the churn metric for a sequence of commits per commit per file Other {metrics}: \code{\link{commit_message_id_coverage}()}, -\code{\link{metric_churn}()}, \code{\link{metric_churn_per_commit_interval}()}, +\code{\link{metric_churn}()}, \code{\link{metric_file_bug_churn}()}, 
\code{\link{metric_file_bug_frequency}()}, \code{\link{metric_file_churn}()}, diff --git a/man/metric_file_bug_churn.Rd b/man/metric_file_bug_churn.Rd index 7bea610e..29bef17d 100644 --- a/man/metric_file_bug_churn.Rd +++ b/man/metric_file_bug_churn.Rd @@ -20,9 +20,9 @@ The total churn sum of commits of all closed bug type issues the file was involv \seealso{ Other {metrics}: \code{\link{commit_message_id_coverage}()}, -\code{\link{metric_churn}()}, \code{\link{metric_churn_per_commit_interval}()}, \code{\link{metric_churn_per_commit_per_file}()}, +\code{\link{metric_churn}()}, \code{\link{metric_file_bug_frequency}()}, \code{\link{metric_file_churn}()}, \code{\link{metric_file_non_bug_churn}()}, diff --git a/man/metric_file_bug_frequency.Rd b/man/metric_file_bug_frequency.Rd index f978666e..607aef62 100644 --- a/man/metric_file_bug_frequency.Rd +++ b/man/metric_file_bug_frequency.Rd @@ -20,9 +20,9 @@ The total number of commits of all closed bug type issues the file was involved. \seealso{ Other {metrics}: \code{\link{commit_message_id_coverage}()}, -\code{\link{metric_churn}()}, \code{\link{metric_churn_per_commit_interval}()}, \code{\link{metric_churn_per_commit_per_file}()}, +\code{\link{metric_churn}()}, \code{\link{metric_file_bug_churn}()}, \code{\link{metric_file_churn}()}, \code{\link{metric_file_non_bug_churn}()}, diff --git a/man/metric_file_churn.Rd b/man/metric_file_churn.Rd index 67049ea5..3e2babd4 100644 --- a/man/metric_file_churn.Rd +++ b/man/metric_file_churn.Rd @@ -18,9 +18,9 @@ The total churn of a file \seealso{ Other {metrics}: \code{\link{commit_message_id_coverage}()}, -\code{\link{metric_churn}()}, \code{\link{metric_churn_per_commit_interval}()}, \code{\link{metric_churn_per_commit_per_file}()}, +\code{\link{metric_churn}()}, \code{\link{metric_file_bug_churn}()}, \code{\link{metric_file_bug_frequency}()}, \code{\link{metric_file_non_bug_churn}()}, diff --git a/man/metric_file_non_bug_churn.Rd b/man/metric_file_non_bug_churn.Rd index 
049b9cd8..bf35bb1c 100644 --- a/man/metric_file_non_bug_churn.Rd +++ b/man/metric_file_non_bug_churn.Rd @@ -20,9 +20,9 @@ The total churn sum of commits of all closed non-bug type issues the file was in \seealso{ Other {metrics}: \code{\link{commit_message_id_coverage}()}, -\code{\link{metric_churn}()}, \code{\link{metric_churn_per_commit_interval}()}, \code{\link{metric_churn_per_commit_per_file}()}, +\code{\link{metric_churn}()}, \code{\link{metric_file_bug_churn}()}, \code{\link{metric_file_bug_frequency}()}, \code{\link{metric_file_churn}()}, diff --git a/man/metric_file_non_bug_frequency.Rd b/man/metric_file_non_bug_frequency.Rd index 9516ce61..da87d00a 100644 --- a/man/metric_file_non_bug_frequency.Rd +++ b/man/metric_file_non_bug_frequency.Rd @@ -20,9 +20,9 @@ The total number of commits of all closed non-bug type issues the file was invol \seealso{ Other {metrics}: \code{\link{commit_message_id_coverage}()}, -\code{\link{metric_churn}()}, \code{\link{metric_churn_per_commit_interval}()}, \code{\link{metric_churn_per_commit_per_file}()}, +\code{\link{metric_churn}()}, \code{\link{metric_file_bug_churn}()}, \code{\link{metric_file_bug_frequency}()}, \code{\link{metric_file_churn}()}, diff --git a/man/motif_factory_anti_square.Rd b/man/motif_factory_anti_square.Rd index 8850ff38..06cefd8f 100644 --- a/man/motif_factory_anti_square.Rd +++ b/man/motif_factory_anti_square.Rd @@ -26,9 +26,9 @@ in IEEE Transactions on Software Engineering, vol. 48, no. 8, pp. 
3159-3184, } \seealso{ Other motif: -\code{\link{motif_factory}()}, \code{\link{motif_factory_anti_triangle}()}, \code{\link{motif_factory_square}()}, -\code{\link{motif_factory_triangle}()} +\code{\link{motif_factory_triangle}()}, +\code{\link{motif_factory}()} } \concept{motif} diff --git a/man/motif_factory_anti_triangle.Rd b/man/motif_factory_anti_triangle.Rd index 349cce19..b5a789a8 100644 --- a/man/motif_factory_anti_triangle.Rd +++ b/man/motif_factory_anti_triangle.Rd @@ -22,9 +22,9 @@ in IEEE Transactions on Software Engineering, vol. 48, no. 8, pp. 3159-3184, } \seealso{ Other motif: -\code{\link{motif_factory}()}, \code{\link{motif_factory_anti_square}()}, \code{\link{motif_factory_square}()}, -\code{\link{motif_factory_triangle}()} +\code{\link{motif_factory_triangle}()}, +\code{\link{motif_factory}()} } \concept{motif} diff --git a/man/motif_factory_square.Rd b/man/motif_factory_square.Rd index 1c94e2a3..74101dfd 100644 --- a/man/motif_factory_square.Rd +++ b/man/motif_factory_square.Rd @@ -26,9 +26,9 @@ in IEEE Transactions on Software Engineering, vol. 48, no. 8, pp. 3159-3184, } \seealso{ Other motif: -\code{\link{motif_factory}()}, \code{\link{motif_factory_anti_square}()}, \code{\link{motif_factory_anti_triangle}()}, -\code{\link{motif_factory_triangle}()} +\code{\link{motif_factory_triangle}()}, +\code{\link{motif_factory}()} } \concept{motif} diff --git a/man/motif_factory_triangle.Rd b/man/motif_factory_triangle.Rd index 0a99faa8..61d81313 100644 --- a/man/motif_factory_triangle.Rd +++ b/man/motif_factory_triangle.Rd @@ -22,9 +22,9 @@ in IEEE Transactions on Software Engineering, vol. 48, no. 8, pp. 
3159-3184, } \seealso{ Other motif: -\code{\link{motif_factory}()}, \code{\link{motif_factory_anti_square}()}, \code{\link{motif_factory_anti_triangle}()}, -\code{\link{motif_factory_square}()} +\code{\link{motif_factory_square}()}, +\code{\link{motif_factory}()} } \concept{motif} diff --git a/man/parse_bugzilla_perceval_rest_issue_comments.Rd b/man/parse_bugzilla_perceval_rest_issue_comments.Rd index 87d29f0c..610eeb6f 100644 --- a/man/parse_bugzilla_perceval_rest_issue_comments.Rd +++ b/man/parse_bugzilla_perceval_rest_issue_comments.Rd @@ -26,17 +26,17 @@ Parse Bugzilla data obtained from Perceval REST API Bugzilla backend Other parsers: \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, \code{\link{parse_bugzilla_rest_comments}()}, -\code{\link{parse_bugzilla_rest_issues}()}, \code{\link{parse_bugzilla_rest_issues_comments}()}, +\code{\link{parse_bugzilla_rest_issues}()}, \code{\link{parse_commit_message_id}()}, \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, \code{\link{parse_gitlog}()}, -\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, -\code{\link{parse_mbox}()}, +\code{\link{parse_jira}()}, \code{\link{parse_mbox_latest_date}()}, +\code{\link{parse_mbox}()}, \code{\link{parse_nvdfeed}()} } \concept{parsers} diff --git a/man/parse_bugzilla_perceval_traditional_issue_comments.Rd b/man/parse_bugzilla_perceval_traditional_issue_comments.Rd index 0cfacfd0..f6f3b7f2 100644 --- a/man/parse_bugzilla_perceval_traditional_issue_comments.Rd +++ b/man/parse_bugzilla_perceval_traditional_issue_comments.Rd @@ -26,17 +26,17 @@ Parse Bugzilla data obtained from Perceval traditional Bugzilla backend Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_rest_comments}()}, -\code{\link{parse_bugzilla_rest_issues}()}, \code{\link{parse_bugzilla_rest_issues_comments}()}, +\code{\link{parse_bugzilla_rest_issues}()}, 
\code{\link{parse_commit_message_id}()}, \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, \code{\link{parse_gitlog}()}, -\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, -\code{\link{parse_mbox}()}, +\code{\link{parse_jira}()}, \code{\link{parse_mbox_latest_date}()}, +\code{\link{parse_mbox}()}, \code{\link{parse_nvdfeed}()} } \concept{parsers} diff --git a/man/parse_bugzilla_rest_comments.Rd b/man/parse_bugzilla_rest_comments.Rd index b12be91b..57999ca2 100644 --- a/man/parse_bugzilla_rest_comments.Rd +++ b/man/parse_bugzilla_rest_comments.Rd @@ -19,17 +19,17 @@ Parse Bugzilla comments data obtained from json files from Bugzilla crawler \cod Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, -\code{\link{parse_bugzilla_rest_issues}()}, \code{\link{parse_bugzilla_rest_issues_comments}()}, +\code{\link{parse_bugzilla_rest_issues}()}, \code{\link{parse_commit_message_id}()}, \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, \code{\link{parse_gitlog}()}, -\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, -\code{\link{parse_mbox}()}, +\code{\link{parse_jira}()}, \code{\link{parse_mbox_latest_date}()}, +\code{\link{parse_mbox}()}, \code{\link{parse_nvdfeed}()} } \concept{parsers} diff --git a/man/parse_bugzilla_rest_issues.Rd b/man/parse_bugzilla_rest_issues.Rd index bdd8bdde..da912e4b 100644 --- a/man/parse_bugzilla_rest_issues.Rd +++ b/man/parse_bugzilla_rest_issues.Rd @@ -27,11 +27,11 @@ Other parsers: \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, \code{\link{parse_gitlog}()}, -\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, -\code{\link{parse_mbox}()}, +\code{\link{parse_jira}()}, \code{\link{parse_mbox_latest_date}()}, 
+\code{\link{parse_mbox}()}, \code{\link{parse_nvdfeed}()} } \concept{parsers} diff --git a/man/parse_bugzilla_rest_issues_comments.Rd b/man/parse_bugzilla_rest_issues_comments.Rd index 05da2855..b884739f 100644 --- a/man/parse_bugzilla_rest_issues_comments.Rd +++ b/man/parse_bugzilla_rest_issues_comments.Rd @@ -29,11 +29,11 @@ Other parsers: \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, \code{\link{parse_gitlog}()}, -\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, -\code{\link{parse_mbox}()}, +\code{\link{parse_jira}()}, \code{\link{parse_mbox_latest_date}()}, +\code{\link{parse_mbox}()}, \code{\link{parse_nvdfeed}()} } \concept{parsers} diff --git a/man/parse_commit_message_id.Rd b/man/parse_commit_message_id.Rd index e090ef19..13d9e542 100644 --- a/man/parse_commit_message_id.Rd +++ b/man/parse_commit_message_id.Rd @@ -19,16 +19,16 @@ Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, \code{\link{parse_bugzilla_rest_comments}()}, -\code{\link{parse_bugzilla_rest_issues}()}, \code{\link{parse_bugzilla_rest_issues_comments}()}, +\code{\link{parse_bugzilla_rest_issues}()}, \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, \code{\link{parse_gitlog}()}, -\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, -\code{\link{parse_mbox}()}, +\code{\link{parse_jira}()}, \code{\link{parse_mbox_latest_date}()}, +\code{\link{parse_mbox}()}, \code{\link{parse_nvdfeed}()} } \concept{parsers} diff --git a/man/parse_dependencies.Rd b/man/parse_dependencies.Rd index e4c58051..a7136742 100644 --- a/man/parse_dependencies.Rd +++ b/man/parse_dependencies.Rd @@ -28,16 +28,16 @@ Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, 
\code{\link{parse_bugzilla_rest_comments}()}, -\code{\link{parse_bugzilla_rest_issues}()}, \code{\link{parse_bugzilla_rest_issues_comments}()}, +\code{\link{parse_bugzilla_rest_issues}()}, \code{\link{parse_commit_message_id}()}, \code{\link{parse_dv8_clusters}()}, \code{\link{parse_gitlog}()}, -\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, -\code{\link{parse_mbox}()}, +\code{\link{parse_jira}()}, \code{\link{parse_mbox_latest_date}()}, +\code{\link{parse_mbox}()}, \code{\link{parse_nvdfeed}()} } \concept{parsers} diff --git a/man/parse_dv8_clusters.Rd b/man/parse_dv8_clusters.Rd index b4dc6249..987936bf 100644 --- a/man/parse_dv8_clusters.Rd +++ b/man/parse_dv8_clusters.Rd @@ -17,16 +17,16 @@ Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, \code{\link{parse_bugzilla_rest_comments}()}, -\code{\link{parse_bugzilla_rest_issues}()}, \code{\link{parse_bugzilla_rest_issues_comments}()}, +\code{\link{parse_bugzilla_rest_issues}()}, \code{\link{parse_commit_message_id}()}, \code{\link{parse_dependencies}()}, \code{\link{parse_gitlog}()}, -\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, -\code{\link{parse_mbox}()}, +\code{\link{parse_jira}()}, \code{\link{parse_mbox_latest_date}()}, +\code{\link{parse_mbox}()}, \code{\link{parse_nvdfeed}()} Other dv8: diff --git a/man/parse_gitlog.Rd b/man/parse_gitlog.Rd index 7d65786f..d4370808 100644 --- a/man/parse_gitlog.Rd +++ b/man/parse_gitlog.Rd @@ -23,16 +23,16 @@ Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, \code{\link{parse_bugzilla_rest_comments}()}, -\code{\link{parse_bugzilla_rest_issues}()}, \code{\link{parse_bugzilla_rest_issues_comments}()}, +\code{\link{parse_bugzilla_rest_issues}()}, 
\code{\link{parse_commit_message_id}()}, \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, -\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, -\code{\link{parse_mbox}()}, +\code{\link{parse_jira}()}, \code{\link{parse_mbox_latest_date}()}, +\code{\link{parse_mbox}()}, \code{\link{parse_nvdfeed}()} } \concept{parsers} diff --git a/man/parse_jira.Rd b/man/parse_jira.Rd index 0db0e226..c3e8fe9a 100644 --- a/man/parse_jira.Rd +++ b/man/parse_jira.Rd @@ -33,16 +33,16 @@ Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, \code{\link{parse_bugzilla_rest_comments}()}, -\code{\link{parse_bugzilla_rest_issues}()}, \code{\link{parse_bugzilla_rest_issues_comments}()}, +\code{\link{parse_bugzilla_rest_issues}()}, \code{\link{parse_commit_message_id}()}, \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, \code{\link{parse_gitlog}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, -\code{\link{parse_mbox}()}, \code{\link{parse_mbox_latest_date}()}, +\code{\link{parse_mbox}()}, \code{\link{parse_nvdfeed}()} } \concept{parsers} diff --git a/man/parse_jira_latest_date.Rd b/man/parse_jira_latest_date.Rd index e2a730b5..d05f3b82 100644 --- a/man/parse_jira_latest_date.Rd +++ b/man/parse_jira_latest_date.Rd @@ -25,16 +25,16 @@ Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, \code{\link{parse_bugzilla_rest_comments}()}, -\code{\link{parse_bugzilla_rest_issues}()}, \code{\link{parse_bugzilla_rest_issues_comments}()}, +\code{\link{parse_bugzilla_rest_issues}()}, \code{\link{parse_commit_message_id}()}, \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, \code{\link{parse_gitlog}()}, -\code{\link{parse_jira}()}, \code{\link{parse_jira_rss_xml}()}, 
-\code{\link{parse_mbox}()}, +\code{\link{parse_jira}()}, \code{\link{parse_mbox_latest_date}()}, +\code{\link{parse_mbox}()}, \code{\link{parse_nvdfeed}()} } \concept{parsers} diff --git a/man/parse_jira_rss_xml.Rd b/man/parse_jira_rss_xml.Rd index 1c0abecb..17b88ff5 100644 --- a/man/parse_jira_rss_xml.Rd +++ b/man/parse_jira_rss_xml.Rd @@ -28,16 +28,16 @@ Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, \code{\link{parse_bugzilla_rest_comments}()}, -\code{\link{parse_bugzilla_rest_issues}()}, \code{\link{parse_bugzilla_rest_issues_comments}()}, +\code{\link{parse_bugzilla_rest_issues}()}, \code{\link{parse_commit_message_id}()}, \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, \code{\link{parse_gitlog}()}, -\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, -\code{\link{parse_mbox}()}, +\code{\link{parse_jira}()}, \code{\link{parse_mbox_latest_date}()}, +\code{\link{parse_mbox}()}, \code{\link{parse_nvdfeed}()} } \concept{parsers} diff --git a/man/parse_mbox.Rd b/man/parse_mbox.Rd index 9b128dd8..780e984f 100644 --- a/man/parse_mbox.Rd +++ b/man/parse_mbox.Rd @@ -23,15 +23,15 @@ Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, \code{\link{parse_bugzilla_rest_comments}()}, -\code{\link{parse_bugzilla_rest_issues}()}, \code{\link{parse_bugzilla_rest_issues_comments}()}, +\code{\link{parse_bugzilla_rest_issues}()}, \code{\link{parse_commit_message_id}()}, \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, \code{\link{parse_gitlog}()}, -\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, +\code{\link{parse_jira}()}, \code{\link{parse_mbox_latest_date}()}, \code{\link{parse_nvdfeed}()} } diff --git a/man/parse_mbox_latest_date.Rd b/man/parse_mbox_latest_date.Rd index 
eedf9633..486f35fd 100644 --- a/man/parse_mbox_latest_date.Rd +++ b/man/parse_mbox_latest_date.Rd @@ -21,15 +21,15 @@ Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, \code{\link{parse_bugzilla_rest_comments}()}, -\code{\link{parse_bugzilla_rest_issues}()}, \code{\link{parse_bugzilla_rest_issues_comments}()}, +\code{\link{parse_bugzilla_rest_issues}()}, \code{\link{parse_commit_message_id}()}, \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, \code{\link{parse_gitlog}()}, -\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, +\code{\link{parse_jira}()}, \code{\link{parse_mbox}()}, \code{\link{parse_nvdfeed}()} } diff --git a/man/parse_nvdfeed.Rd b/man/parse_nvdfeed.Rd index 0accc69d..1c4365bd 100644 --- a/man/parse_nvdfeed.Rd +++ b/man/parse_nvdfeed.Rd @@ -18,16 +18,16 @@ Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, \code{\link{parse_bugzilla_rest_comments}()}, -\code{\link{parse_bugzilla_rest_issues}()}, \code{\link{parse_bugzilla_rest_issues_comments}()}, +\code{\link{parse_bugzilla_rest_issues}()}, \code{\link{parse_commit_message_id}()}, \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, \code{\link{parse_gitlog}()}, -\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, -\code{\link{parse_mbox}()}, -\code{\link{parse_mbox_latest_date}()} +\code{\link{parse_jira}()}, +\code{\link{parse_mbox_latest_date}()}, +\code{\link{parse_mbox}()} } \concept{parsers} diff --git a/man/refresh_jira_issues.Rd b/man/refresh_jira_issues.Rd index 20be8882..6e7118eb 100644 --- a/man/refresh_jira_issues.Rd +++ b/man/refresh_jira_issues.Rd @@ -61,14 +61,14 @@ data. 
\code{\link{parse_jira_latest_date}} to retrieve the file path of the latest issue key Other downloaders: -\code{\link{download_jira_issues}()}, \code{\link{download_jira_issues_by_date}()}, -\code{\link{download_jira_issues_by_issue_key}()} +\code{\link{download_jira_issues_by_issue_key}()}, +\code{\link{download_jira_issues}()} Other jira: -\code{\link{download_jira_issues}()}, \code{\link{download_jira_issues_by_date}()}, -\code{\link{download_jira_issues_by_issue_key}()} +\code{\link{download_jira_issues_by_issue_key}()}, +\code{\link{download_jira_issues}()} } \concept{downloaders} \concept{jira} diff --git a/vignettes/download_mail.Rmd b/vignettes/download_mail.Rmd index c9ac8be5..db06e71f 100644 --- a/vignettes/download_mail.Rmd +++ b/vignettes/download_mail.Rmd @@ -100,11 +100,6 @@ Regardless of which mail archive you choose, the downloaders will store the mail For Pipermail, we need to specify the project key, which is used to retrieve the configuration parameters for the specific project. The project key is used to identify the project in the configuration file. -```{r} -# Define the project key -project_key <- "project_key_1" -``` - Now, we can use the getter functions to retrieve the configuration parameters for the specified project key. ```{r} @@ -117,21 +112,10 @@ save_folder_path <- get_pipermail_path(conf, "project_key_1") Note that the date range is not set with a getter. The range for downloads changes often, and should be set manually using the YYYYMM format. -Explanation of Getters: - -- get_pipermail_domain(config_file, project_key_index): Retrieves the mailing list URL. -- get_pipermail_path(config_file, project_key_index): Retrieves the local folder path for saving archives. -- get_pipermail_input_file(config_file, project_key_index): Retrieves the .mbox file path for parsing (parse_mbox function). - ## Mbox Configuration Similarly to Pipermail, we need to specify the project key for Mod Mbox. 
The project key is used to retrieve the configuration parameters for the specific project. -```{r} -# Define the project key -project_key <- "project_key_1" -``` - Use the getters to extract the parameters: ```{r eval=FALSE} @@ -142,21 +126,12 @@ end_year_month <- 202405 save_folder_path <- get_mbox_path(conf, "project_key_1") ``` -Explanation of Getters: - -get_mbox_domain(config_file, project_key_index): Retrieves the mailing list URL. -get_mbox_path(config_file, project_key_index): Retrieves the local folder path for saving archives. -get_mbox_input_file(config_file, project_key_index): Retrieves the .mbox file path for parsing. -start_year_month and end_year_month should be set manually, as with pipermail. +The `start_year_month` and `end_year_month` time range parameters should be set manually, as with pipermail. ## Tools Configuration -In addition to the mailing list configurations, you need to specify the path to the perceval binary in tools.yml, which is used by the parse_mbox() function to parse .mbox files.It should look something like this: - -```{r} -perceval: /usr/local/bin/perceval -``` +In addition to the mailing list configurations, you need to specify the path to the [Perceval](https://github.com/chaoss/grimoirelab-perceval) binary in tools.yml. See the wiki for further details on how to setup third party tools. Now, you can load the configurations in your R script or notebook using the following code: @@ -170,12 +145,6 @@ conf <- parse_config("../conf/helix.yml") mbox_file_path <- get_mbox_input_file(conf, "project_key_1") ``` -Explanation of Getters: - -parse_config(): Function to parse the YAML configuration files. -get_tool("perceval", tools): Retrieves the Perceval path from the tools configuration. -get_mbox_input_file(conf, "project_key_1"): Retrieves the .mbox file path for project_key_1 from the helix configuration. 
- # Downloaders and Refreshers ## Downloaders @@ -184,18 +153,7 @@ With the configurations loaded, we can proceed to download the mailing list arch ### Pipermail Downloader -The download_pipermail() function downloads Pipermail archives from a specified mailing list within a given date range. Here's how it operates: - -- Archive Index Retrieval: It begins by downloading an HTML page that lists the URLs for the monthly archives, which are typically available in .txt or .gz formats. -- File Downloading: The function attempts to download the .txt file for each month. If the .txt file is unavailable, it falls back to downloading the .gz (gzipped) file. -- File Processing: If a .gz file is downloaded, the function unzips it and converts it into an .mbox file. The original .gz file is deleted after extraction to save space. -- File Saving: The downloaded .mbox files are saved in the specified folder with the naming convention kaiaulu_YYYYMM.mbox, where YYYYMM represents the year and month. -- Date Range Filtering: Only files within the specified start_year_month and end_year_month are downloaded. -- Error Handling: If both .txt and .gz formats fail to download for a particular month, a warning is issued indicating the missing month. -- Summary Output: At the end of the process, the function summarizes the downloads, indicating the range of dates present and any missing months. -- Set verbose to TRUE to see status updates and detailed output. 
- -#### Example Usage +The download_pipermail() function downloads Pipermail archives from a specified mailing list within a given date range: ```{r eval=FALSE} # Download archives @@ -213,16 +171,7 @@ After running this function, the .mbox files will be saved in the specified dire ### Mod Mbox Downloader -The download_mod_mbox() function downloads Mod Mbox archives from a specified Apache Pony Mail mailing list over a given date range: - -- URL Construction: It constructs the download URLs for each month based on the mailing list URL and the date range. -- File Downloading: Downloads the .mbox file for each month in the format "YYYY-MM". -- File Saving: Saves the downloaded .mbox files in the specified folder with the naming convention kaiaulu_YYYYMM.mbox. -- Date Range Looping: Iterates through each month between start_year_month and end_year_month. -- Error Handling: Issues a warning if a download fails for a specific month, indicating that the month's data may not exist. -- Summary Output: Provides a summary of the downloads, including any missing months. - -The download_mod_mbox() function downloads Mod Mbox archives by constructing URLs based on the mailing list and date range, saving them as .mbox files named kaiaulu_YYYYMM.mbox. +The download_mod_mbox() function downloads Mod Mbox archives from a specified Apache Pony Mail mailing list over a given date range. The download_mod_mbox() function downloads Mod Mbox archives by constructing URLs based on the mailing list and date range, saving them as .mbox files named kaiaulu_YYYYMM.mbox. 
#### Example Usage From bfc75cb81fcae8efe8a402b61b451289097bfa96 Mon Sep 17 00:00:00 2001 From: Carlos Paradis Date: Tue, 12 Nov 2024 04:01:58 -0800 Subject: [PATCH 40/80] revert utags revert utags to match master Signed-off-by: Carlos Paradis --- tools.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools.yml b/tools.yml index 4515ae80..27951fe6 100644 --- a/tools.yml +++ b/tools.yml @@ -7,7 +7,7 @@ refactoring_miner: ~/RefactoringMiner-1.0/bin/RefactoringMiner # https://github.com/boyter/scc scc: ~/scc/scc # universal-ctags -utags: /usr/local/bin/ctags +utags: /usr/local/Cellar/universal-ctags/HEAD-62f0144/bin/ctags # https://archdia.com/ dv8: /Applications/DV84/bin/dv8-console # OSLOM: http://oslom.org/ From c1830f600c8245b7fede8270cc1ab58f740eb002 Mon Sep 17 00:00:00 2001 From: Carlos Paradis Date: Tue, 12 Nov 2024 04:28:14 -0800 Subject: [PATCH 41/80] i #284 More narrative and config fixes Signed-off-by: Carlos Paradis --- conf/helix.yml | 15 +++++++++------ vignettes/download_mail.Rmd | 14 ++++---------- 2 files changed, 13 insertions(+), 16 deletions(-) diff --git a/conf/helix.yml b/conf/helix.yml index 16cf9ac8..a56b3a4e 100644 --- a/conf/helix.yml +++ b/conf/helix.yml @@ -48,28 +48,31 @@ version_control: - revert-1685-master mailing_list: + # If projects uses Apache Mod Mbox mod_mbox: + # There can be multiple projects in both the pipermail and mod mbox sections. project_key_1: mailing_list: https://lists.apache.org/list.html?announce@apache.org - save_folder_path: ../../rawdata/kaiaulu/mod_mbox/save_mbox_mail/ + save_folder_path: ../../rawdata/helix/mod_mbox/save_mbox_mail/ # mbox_file_path is for use only with parse_mbox() function. 
It is the file to parse - mbox_file_path: ../../rawdata/kaiaulu/mod_mbox/save_mbox_mail/kaiaulu.mbox + mbox_file_path: ../../rawdata/helix/mod_mbox/save_mbox_mail/kaiaulu.mbox project_key_2: mailing_list: https://lists.apache.org/list.html?dev@felix.apache.org save_folder_path: ../../rawdata/kaiaulu/mod_mbox/save_mbox_mail_2/ # mbox_file_path is for use only with parse_mbox() function. It is the file to parse mbox_file_path: ../../rawdata/kaiaulu/mod_mbox/save_mbox_mail_2/kaiaulu.mbox + # If project uses Pipermail pipermail: project_key_1: mailing_list: https://mta.openssl.org/pipermail/openssl-users/ save_folder_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail/ # mbox_file_path is for use only with parse_mbox() function. It is the file to parse mbox_file_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail/kaiaulu.mbox - project_key_2: - mailing_list: https://mta.openssl.org/pipermail/openssl-project/ - save_folder_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail_2/ + #project_key_2: + # mailing_list: https://mta.openssl.org/pipermail/openssl-project/ + # save_folder_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail_2/ # mbox_file_path is for use only with parse_mbox() function. It is the file to parse - mbox_file_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail_2/kaiaulu.mbox + # mbox_file_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail_2/kaiaulu.mbox issue_tracker: jira: diff --git a/vignettes/download_mail.Rmd b/vignettes/download_mail.Rmd index db06e71f..3ccea773 100644 --- a/vignettes/download_mail.Rmd +++ b/vignettes/download_mail.Rmd @@ -79,16 +79,10 @@ mailing_list: # mbox_file_path is for use only with parse_mbox() function. It is the file to parse mbox_file_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail_2/kaiaulu.mbox ``` -Explanation: - -- mailing_list: The top-level key for mailing list configurations. -- project_key_1: A unique key for the project. There can be multiple projects in both the pipermail and mod mbox sections. 
-- pipermail/ mod_mbox: Indicates whether the setting are for pipermail or mod mbox. Although the parameters are the same, this helps to differentiate between the two types of mailing list archives. -- mailing_list: The URL of the mailing list archive page. Note that this URL should point to the page containing links to the monthly archives (e.g. https://mta.openssl.org/pipermail/openssl-users/), not the top-level mailing list page that contains all the different types of archives (e.g. https://mta.openssl.org/mailman/listinfo/). -- start_year_month: The starting date for downloading archives (in YYYYMM format). -- end_year_month: The ending date for downloading archives (in YYYYMM format). -- save_folder_path: The local directory where the downloaded archives will be saved (if you run the code in this notebook, the archives will be saved in a folder 'extdata', located in the parent directory of kaiaulu (wherever your kaiaulu folder is kept)). -- mbox_file_path: The path to the .mbox file used by the parse_mbox() function. + +The most time-intensive step is locating the URL of the mailing list archive you want on the project website. This is specified under `mailing_list`. Note that for pipermail this URL should point to the page containing links to the monthly archives (e.g. https://mta.openssl.org/pipermail/openssl-users/), not the top-level mailing list page that contains all the different types of archives (e.g. https://mta.openssl.org/mailman/listinfo/). + + Note: It is important that the paths specified in save_folder_path and mbox_file_path are accurate and do not conflict between projects. From 484210088694cb84031ae2d2291828314a2d265c Mon Sep 17 00:00:00 2001 From: Carlos Paradis Date: Tue, 12 Nov 2024 04:35:40 -0800 Subject: [PATCH 42/80] i #284 Remove description tags The rest of kaiaulu does not use this.
Signed-off-by: Carlos Paradis --- R/mail.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/R/mail.R b/R/mail.R index d91c1653..1f1befcd 100644 --- a/R/mail.R +++ b/R/mail.R @@ -228,7 +228,7 @@ download_pipermail <- function(mailing_list, start_year_month, end_year_month, s #' Refresh mbox files downloaded via pipermail #' -#' @description This function refreshes the mailing list files by checking the contents of a specified folder. +#' This function refreshes the mailing list files by checking the contents of a specified folder. #' If the folder is empty, it calls \code{download_pipermail} to download all pipermail files from start_year_month to the current month. #' If the folder contains already-downloaded mbox files, it identifies the most recent month, deletes that file, and redownloads it #' along with all future months up to the current real-life month. @@ -294,7 +294,7 @@ refresh_pipermail <- function(mailing_list, start_year_month, save_folder_path, #' Process .gz files in a folder and convert them to .mbox #' -#' @description This function scans a specified folder for any .gz files, unzips them, +#' This function scans a specified folder for any .gz files, unzips them, #' and renames them to the .mbox format. After unzipping, the original .gz files are deleted. #' If a .mbox file with the same name already exists, it will be overwritten. #' This makes sure that all the files in the folder are in .mbox format, ready for parsing. @@ -504,7 +504,7 @@ download_mod_mbox <- function(mailing_list, start_year_month, end_year_month, sa #' Refresh mbox files downloaded via mod_mbox #' -#' @description This function refreshes the mailing list files by checking the contents of a specified folder. +#' This function refreshes the mailing list files by checking the contents of a specified folder. #' If the folder is empty, it calls \code{download_mod_mbox} to download all mod_mbox files from start_year_month to the current month. 
#' If the folder contains already-downloaded mbox files, it identifies the most recent month, deletes that file, and redownloads it #' along with all future months up to the current real-life month. @@ -611,7 +611,7 @@ parse_mbox <- function(perceval_path, mbox_file_path){ #' Parse mbox latest date #' -#' @description This function returns the name of the latest mod_mbox file downloaded in the specified folder +#' This function returns the name of the latest mod_mbox file downloaded in the specified folder #' based on the naming convention `kaiaulu_YYYYMM.mbox`. For example: `kaiaulu_202401.mbox`. #' #' @param save_folder_path path to the folder containing the mbox files From 0f9769e3d3e86c97858e5116809c1e7e8be0d940 Mon Sep 17 00:00:00 2001 From: Carlos Paradis Date: Tue, 12 Nov 2024 04:46:10 -0800 Subject: [PATCH 43/80] i #284 more minor doc formatting fixes added url tag, etc. Signed-off-by: Carlos Paradis --- DESCRIPTION | 4 ++-- R/mail.R | 12 ++++++------ man/refresh_mod_mbox.Rd | 9 +++++---- man/refresh_pipermail.Rd | 9 +++++---- 4 files changed, 18 insertions(+), 16 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 3cbecaaf..1856f6a2 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -20,8 +20,8 @@ Authors@R: c( person('Nicole', 'Hoess', role = c('ctb')), person('Anthony', 'Lau', role = c('ctb')), person('Sean', 'Sunoo', role = c('ctb')), - person('Ian Jaymes', 'Iwata', role= c('ctb')), - person('Dao', 'McGill', role= c('ctb')), + person('Ian Jaymes', 'Iwata', role = c('ctb')), + person('Dao', 'McGill', role = c('ctb')), person('Nicholas', 'Beydler', role = c('ctb')), person('Mark', 'Burgess', role = c('ctb')) ) diff --git a/R/mail.R b/R/mail.R index 1f1befcd..537188d5 100644 --- a/R/mail.R +++ b/R/mail.R @@ -229,15 +229,15 @@ download_pipermail <- function(mailing_list, start_year_month, end_year_month, s #' Refresh mbox files downloaded via pipermail #' #' This function refreshes the mailing list files by checking the contents of a specified folder. 
-#' If the folder is empty, it calls \code{download_pipermail} to download all pipermail files from start_year_month to the current month. +#' If the folder is empty, it calls \code{\link{download_pipermail}} to download all pipermail files from start_year_month to the current month. #' If the folder contains already-downloaded mbox files, it identifies the most recent month, deletes that file, and redownloads it #' along with all future months up to the current real-life month. #' -#' The naming convention of files is kaiaulu_YYYYMM.mbox, and the function uses this pattern to identify the most recent month. +#' The naming convention of files is `kaiaulu_YYYYMM.mbox`, and the function uses this pattern to identify the most recent month. #' After deleting the most recent file, the function ensures that the month is redownloaded, along with all subsequent months up to the current month. #' Redownloading the most recent file makes sure that any files added in that month after the latest refresh are included. #' -#' @param mailing_list The URL of the mailing list being downloaded (e.g., "https://mta.openssl.org/pipermail/openssl-announce/") +#' @param mailing_list The URL of the mailing list being downloaded (e.g., \url{https://mta.openssl.org/pipermail/openssl-announce/}) #' @param start_year_month The year and month of the first file to be downloaded (format: 'YYYYMM'). #' @param save_folder_path The folder path in which all the downloaded pipermail files will be stored. #' @param verbose if TRUE, prints diagnostic messages. @@ -505,15 +505,15 @@ download_mod_mbox <- function(mailing_list, start_year_month, end_year_month, sa #' Refresh mbox files downloaded via mod_mbox #' #' This function refreshes the mailing list files by checking the contents of a specified folder. -#' If the folder is empty, it calls \code{download_mod_mbox} to download all mod_mbox files from start_year_month to the current month. 
+#' If the folder is empty, it calls \code{\link{download_mod_mbox}} to download all mod_mbox files from start_year_month to the current month. #' If the folder contains already-downloaded mbox files, it identifies the most recent month, deletes that file, and redownloads it #' along with all future months up to the current real-life month. #' -#' The naming convention of files is kaiaulu_YYYYMM.mbox, and the function uses this pattern to identify the most recent month. +#' The naming convention of files is `kaiaulu_YYYYMM.mbox`, and the function uses this pattern to identify the most recent month. #' After deleting the most recent file, the function ensures that the month is redownloaded, along with all subsequent months up to the current month. #' Redownloading the most recent file ensures any files added in that month after the latest refresh are included. #' -#' @param mailing_list The URL of the mailing list being downloaded (e.g., 'https://lists.apache.org/list.html?announce@apache.org') +#' @param mailing_list The URL of the mailing list being downloaded (e.g., \url{https://lists.apache.org/list.html?announce@apache.org}) #' @param start_year_month The year and month of the first file to be downloaded (format: 'YYYYMM'). #' @param save_folder_path The folder path in which all the downloaded mod_mbox files will be stored. #' @param verbose if TRUE, prints diagnostic messages. 
diff --git a/man/refresh_mod_mbox.Rd b/man/refresh_mod_mbox.Rd index 43f6349a..f8da91dd 100644 --- a/man/refresh_mod_mbox.Rd +++ b/man/refresh_mod_mbox.Rd @@ -12,7 +12,7 @@ refresh_mod_mbox( ) } \arguments{ -\item{mailing_list}{The URL of the mailing list being downloaded (e.g., 'https://lists.apache.org/list.html?announce@apache.org')} +\item{mailing_list}{The URL of the mailing list being downloaded (e.g., \url{https://lists.apache.org/list.html?announce@apache.org})} \item{start_year_month}{The year and month of the first file to be downloaded (format: 'YYYYMM').} @@ -25,11 +25,12 @@ Returns `downloaded_files`, a vector of the newly downloaded files in the curren } \description{ This function refreshes the mailing list files by checking the contents of a specified folder. -If the folder is empty, it calls \code{download_mod_mbox} to download all mod_mbox files from start_year_month to the current month. +If the folder is empty, it calls \code{\link{download_mod_mbox}} to download all mod_mbox files from start_year_month to the current month. If the folder contains already-downloaded mbox files, it identifies the most recent month, deletes that file, and redownloads it along with all future months up to the current real-life month. - -The naming convention of files is kaiaulu_YYYYMM.mbox, and the function uses this pattern to identify the most recent month. +} +\details{ +The naming convention of files is `kaiaulu_YYYYMM.mbox`, and the function uses this pattern to identify the most recent month. After deleting the most recent file, the function ensures that the month is redownloaded, along with all subsequent months up to the current month. Redownloading the most recent file ensures any files added in that month after the latest refresh are included. 
} diff --git a/man/refresh_pipermail.Rd b/man/refresh_pipermail.Rd index dc2ce0b2..0e88851e 100644 --- a/man/refresh_pipermail.Rd +++ b/man/refresh_pipermail.Rd @@ -12,7 +12,7 @@ refresh_pipermail( ) } \arguments{ -\item{mailing_list}{The URL of the mailing list being downloaded (e.g., "https://mta.openssl.org/pipermail/openssl-announce/")} +\item{mailing_list}{The URL of the mailing list being downloaded (e.g., \url{https://mta.openssl.org/pipermail/openssl-announce/})} \item{start_year_month}{The year and month of the first file to be downloaded (format: 'YYYYMM').} @@ -25,11 +25,12 @@ Returns `downloaded_files`, a vector of the newly downloaded files in the curren } \description{ This function refreshes the mailing list files by checking the contents of a specified folder. -If the folder is empty, it calls \code{download_pipermail} to download all pipermail files from start_year_month to the current month. +If the folder is empty, it calls \code{\link{download_pipermail}} to download all pipermail files from start_year_month to the current month. If the folder contains already-downloaded mbox files, it identifies the most recent month, deletes that file, and redownloads it along with all future months up to the current real-life month. - -The naming convention of files is kaiaulu_YYYYMM.mbox, and the function uses this pattern to identify the most recent month. +} +\details{ +The naming convention of files is `kaiaulu_YYYYMM.mbox`, and the function uses this pattern to identify the most recent month. After deleting the most recent file, the function ensures that the month is redownloaded, along with all subsequent months up to the current month. Redownloading the most recent file makes sure that any files added in that month after the latest refresh are included. 
} From 6f6a59b1765fadc3bdfda086950d9f215e7ce9f1 Mon Sep 17 00:00:00 2001 From: Dao McGill <77309217+daomcgill@users.noreply.github.com> Date: Tue, 12 Nov 2024 11:20:17 -1000 Subject: [PATCH 44/80] i #284 Updates to exec/mailinglist.R and Minor Fixes for Mail Configuration - Use refresh in exec - Use getters in exec - Change kaiaulu version in exec - Fix paths in helix.yml - Remove unused parameters - Change cat to message Signed-off-by: Dao McGill --- R/mail.R | 71 ++++++++++++++++------------------- conf/helix.yml | 18 ++++----- conf/openssl.yml | 1 - exec/mailinglist.R | 44 +++++++++++----------- man/download_mod_mbox.Rd | 2 +- man/download_pipermail.Rd | 4 +- man/make_mbox_mailing_list.Rd | 4 +- man/parse_mbox.Rd | 4 +- man/parse_mbox_latest_date.Rd | 4 +- vignettes/download_mail.Rmd | 2 +- 10 files changed, 75 insertions(+), 79 deletions(-) diff --git a/R/mail.R b/R/mail.R index 537188d5..9c169953 100644 --- a/R/mail.R +++ b/R/mail.R @@ -17,8 +17,8 @@ #' #' The downloaded .mbox files are saved in the specified folder following the naming convention kaiaulu_YYYYMM.mbox. #' The function only downloads files that fall between the specified start_year_month and end_year_month. -#' When both formats fail to download, the function issues a warning indicating the missing month. -#' At the end, the function summarizes the downloads, indicating the range of dates present and any missing months. +#' When both formats fail to download, the function issues a warning indicating the missing month. +#' At the end, the function summarizes the downloads, indicating the range of dates present and any missing months. #' #' @param mailing_list The name of the mailing list being downloaded e.g.
"https://mta.openssl.org/pipermail/openssl-announce/" #' @param start_year_month The year and month of the first file to be downloaded format: 'YYYYMM' @@ -141,13 +141,13 @@ download_pipermail <- function(mailing_list, start_year_month, end_year_month, s ########## Write Downloaded File to Disk ########## # Print diagnostic info if verbose is TRUE if (verbose) { - cat("Downloading: ", download_url, "\n") - cat("Saving to: ", dest, "\n") + message("Downloading: ", download_url, "\n") + message("Saving to: ", dest, "\n") } # Write the downloaded file to disk. If the file is a .gz file, it needs to be unzipped and converted to .mbox format. if (grepl("\\.gz$", download_url)) { - # Download the .gz file to a temporary location. + # Download the .gz file to a temporary location. gz_file_path <- file.path(save_folder_path, stringi::stri_c('kaiaulu_', year_month_clean, '.mbox.gz')) httr::GET(download_url, httr::write_disk(gz_file_path, overwrite = TRUE), httr::timeout(60)) @@ -164,7 +164,7 @@ download_pipermail <- function(mailing_list, start_year_month, end_year_month, s close(gz_con) close(out_con) - # Remove the .gz file after unzipping to avoid storing duplicate data. + # Remove the .gz file after unzipping to avoid storing duplicate data. file.remove(gz_file_path) } else { # If the .txt file is available, download it directly and save it as a .mbox file.
@@ -202,18 +202,18 @@ download_pipermail <- function(mailing_list, start_year_month, end_year_month, s max_downloaded_date <- max(downloaded_dates) if (verbose) { - cat("\nSummary of Downloads:\n") - cat("save_folder_path contains mail from date ", min_downloaded_date, " to ", max_downloaded_date, "\n") + message("\nSummary of Downloads:\n") + message("save_folder_path contains mail from date ", min_downloaded_date, " to ", max_downloaded_date, "\n") } } else { if (verbose) { - cat("No files found in save_folder_path\n") + message("No files found in save_folder_path\n") } } if (length(missing_months) == 0) { if (verbose) { - cat("No missing months\n") + message("No missing months\n") } } else { warning("Months missing in the date range: ", paste(missing_months, collapse = ", "), "\n") @@ -254,7 +254,7 @@ refresh_pipermail <- function(mailing_list, start_year_month, save_folder_path, # If the folder is empty, download all pipermail files starting from the start_year_month # The end date is set to the current month based on the system date end_year_month <- format(Sys.Date(), "%Y%m") - if (verbose) cat("Folder is empty. Downloading from", start_year_month, "to", end_year_month, "\n") + if (verbose) message("Folder is empty. 
Downloading from", start_year_month, "to", end_year_month, "\n") # Call the download_pipermail function to download files from start_year_month to end_year_month download_pipermail(mailing_list, start_year_month, end_year_month, save_folder_path) @@ -272,7 +272,7 @@ refresh_pipermail <- function(mailing_list, start_year_month, save_folder_path, recent_file <- file.path(save_folder_path, stringi::stri_c("kaiaulu_", recent_month, ".mbox")) if (file.exists(recent_file)) { file.remove(recent_file) - if (verbose) cat("Deleted the most recent file:", recent_file, "\n") + if (verbose) message("Deleted the most recent file:", recent_file, "\n") } ########## Redownload from the Most Recent Month ########## @@ -280,14 +280,14 @@ refresh_pipermail <- function(mailing_list, start_year_month, save_folder_path, end_year_month <- format(Sys.Date(), "%Y%m") # Redownload files from the most recent month (that was just deleted) to the current month - if (verbose) cat("Redownloading from", recent_month, "to", end_year_month, "\n") + if (verbose) message("Redownloading from", recent_month, "to", end_year_month, "\n") # Call the download_pipermail function to redownload the deleted month and all subsequent months up to the current month download_pipermail(mailing_list, recent_month, end_year_month, save_folder_path) } ########## Process .gz Files After Refresh ########## # Call process_gz_to_mbox_in_folder to ensure all .gz files are converted to .mbox after the refresh - if (verbose) cat("Processing .gz files in the folder (if any) to convert them to .mbox format...\n") + if (verbose) message("Processing .gz files in the folder (if any) to convert them to .mbox format...\n") process_gz_to_mbox_in_folder(save_folder_path = save_folder_path, verbose = verbose) } @@ -313,7 +313,7 @@ process_gz_to_mbox_in_folder <- function(save_folder_path, verbose = TRUE) { # If there are no .gz files, print a message (if verbose is TRUE) and return NULL if (length(gz_files) == 0) { - if (verbose) 
cat("This folder does not contain any .gz files.\n") + if (verbose) message("This folder does not contain any .gz files.\n") return(invisible(NULL)) } @@ -326,7 +326,7 @@ process_gz_to_mbox_in_folder <- function(save_folder_path, verbose = TRUE) { # Define the corresponding .mbox file path by replacing .gz with .mbox in the file name mbox_file <- gsub("\\.gz$", ".mbox", gz_file) - if (verbose) cat("Processing:", gz_file, " -> ", mbox_file, "\n") + if (verbose) message("Processing:", gz_file, " -> ", mbox_file, "\n") # Open the .gz file in binary mode for reading gz_con <- gzfile(gz_file, open = "rb") @@ -369,7 +369,7 @@ process_gz_to_mbox_in_folder <- function(save_folder_path, verbose = TRUE) { #' The function loops through each month in the range specified by `start_year_month` and `end_year_month`, #' and constructs the appropriate URL to download each month's data. If any download fails, an warning is issued for the failed months. #' This means the file could not be found and that month's data may not exist. -#' At the end, the function summarizes the downloads, indicating the range of dates present and any missing months. +#' At the end, the function summarizes the downloads, indicating the range of dates present and any missing months. #' #' @param mailing_list The URL of the Apache Pony Mail list from which mbox files are to be downloaded #' (e.g., "https://lists.apache.org/list.html?announce@apache.org"). @@ -386,7 +386,7 @@ download_mod_mbox <- function(mailing_list, start_year_month, end_year_month, sa # embedded within the URL (after the 'list.html?'). # We are using 'sub()' to perform a simple string replacement, extracting everything after 'list.html?'.
mailing_list_name <- sub(".*list.html\\?(.+)", "\\1", mailing_list) - if (verbose) cat("Base list extracted:", mailing_list_name, "\n") + if (verbose) message("Base list extracted:", mailing_list_name, "\n") ########## Prepare Year and Month ########## # The start_year_month and end_year_month are in the format "YYYYMM". @@ -426,8 +426,8 @@ download_mod_mbox <- function(mailing_list, start_year_month, end_year_month, sa file_path <- file.path(save_folder_path, file_name) if (verbose) { - cat("Constructed URL:", download_url, "\n") - cat("Saving to file:", file_path, "\n") + message("Constructed URL:", download_url, "\n") + message("Saving to file:", file_path, "\n") } ########## Download Mbox File ########## @@ -438,11 +438,11 @@ download_mod_mbox <- function(mailing_list, start_year_month, end_year_month, sa # Check for successful download (status code 200). if (status_code == 200) { - if (verbose) cat("Successfully downloaded:", download_url, "\n") + if (verbose) message("Successfully downloaded:", download_url, "\n") } else { if (verbose) { - cat("Failed to download:", download_url, "\n") - cat("HTTP Status Code:", status_code, "\n") + message("Failed to download:", download_url, "\n") + message("HTTP Status Code:", status_code, "\n") } # Remove failed download file. 
unlink(file_path) @@ -477,18 +477,18 @@ download_mod_mbox <- function(mailing_list, start_year_month, end_year_month, sa max_downloaded_date <- max(downloaded_dates) if (verbose) { - cat("\nSummary of Downloads:\n") - cat("save_folder_path contains mail from date", min_downloaded_date, "to", max_downloaded_date, "\n") + message("\nSummary of Downloads:\n") + message("save_folder_path contains mail from date", min_downloaded_date, "to", max_downloaded_date, "\n") } } else { if (verbose) { - cat("No files found in save_folder_path\n") + message("No files found in save_folder_path\n") } } if (length(missing_months) == 0) { if (verbose) { - cat("No missing months\n") + message("No missing months\n") } } else { warning("Months missing in the date range:", paste(missing_months, collapse = ", "), "\n") @@ -530,7 +530,7 @@ refresh_mod_mbox <- function(mailing_list, start_year_month, save_folder_path, v # If the folder is empty, download all mod_mbox files starting from start_year_month # The end date is set to the current month based on the system date end_year_month <- format(Sys.Date(), "%Y%m") - if (verbose) cat("Folder is empty. Downloading from", start_year_month, "to", end_year_month, "\n") + if (verbose) message("Folder is empty. 
Downloading from", start_year_month, "to", end_year_month, "\n") # Call the download_mod_mbox function to download files from start_year_month to end_year_month download_mod_mbox(mailing_list, start_year_month, end_year_month, save_folder_path, verbose = verbose) @@ -548,7 +548,7 @@ refresh_mod_mbox <- function(mailing_list, start_year_month, save_folder_path, v recent_file <- file.path(save_folder_path, stringi::stri_c("kaiaulu_", recent_month, ".mbox")) if (file.exists(recent_file)) { file.remove(recent_file) - if (verbose) cat("Deleted the most recent file:", recent_file, "\n") + if (verbose) message("Deleted the most recent file:", recent_file, "\n") } ########## Redownload from the Most Recent Month ########## @@ -556,7 +556,7 @@ refresh_mod_mbox <- function(mailing_list, start_year_month, save_folder_path, v end_year_month <- format(Sys.Date(), "%Y%m") # Redownload files from the most recent month (that was just deleted) to the current month - if (verbose) cat("Redownloading from", recent_month, "to", end_year_month, "\n") + if (verbose) message("Redownloading from", recent_month, "to", end_year_month, "\n") # Call the download_mod_mbox function to redownload the deleted month and all subsequent months up to the current month download_mod_mbox(mailing_list, recent_month, end_year_month, save_folder_path, verbose = verbose) @@ -707,14 +707,14 @@ make_mbox_reply <- function(mailing_list, reply_from_author, reply_from_email, r #' fake .mbox file #' #' @param replies An array of replies that have been created with \code{\link{make_mbox_reply}} -#' @param mbox Folder path for the .mbox file being created. Defaulted at /tmp +#' @param folder_path Folder path for the .mbox file being created. 
Defaulted at /tmp #' @param file_name Name of the file that will store the .mbox file #' @return the path of the .mbox file that was created #' @export -make_mbox_mailing_list <- function(replies, mbox = "/tmp", file_name) { +make_mbox_mailing_list <- function(replies, folder_path = "/tmp", file_name) { # Create a unique filename for the mbox file - mbox_filepath <- file.path(mbox, stringi::stri_c(file_name, ".mbox")) + mbox_filepath <- file.path(folder_path, stringi::stri_c(file_name, ".mbox")) # make the file mbox_body <- stringi::stri_c(replies,collapse = "\n\n") @@ -723,8 +723,3 @@ make_mbox_mailing_list <- function(replies, mbox = "/tmp", file_name) { # Return the path of the created mbox file return(mbox_filepath) } - - - - - diff --git a/conf/helix.yml b/conf/helix.yml index a56b3a4e..69b337b5 100644 --- a/conf/helix.yml +++ b/conf/helix.yml @@ -53,26 +53,26 @@ mailing_list: # There can be multiple projects in both the pipermail and mod mbox sections. project_key_1: mailing_list: https://lists.apache.org/list.html?announce@apache.org - save_folder_path: ../../rawdata/helix/mod_mbox/save_mbox_mail/ + save_folder_path: ../../rawdata/helix/mod_mbox/save_mbox_mail # mbox_file_path is for use only with parse_mbox() function. It is the file to parse mbox_file_path: ../../rawdata/helix/mod_mbox/save_mbox_mail/kaiaulu.mbox project_key_2: mailing_list: https://lists.apache.org/list.html?dev@felix.apache.org - save_folder_path: ../../rawdata/kaiaulu/mod_mbox/save_mbox_mail_2/ + save_folder_path: ../../helix/kaiaulu/mod_mbox/save_mbox_mail_2 # mbox_file_path is for use only with parse_mbox() function. 
It is the file to parse - mbox_file_path: ../../rawdata/kaiaulu/mod_mbox/save_mbox_mail_2/kaiaulu.mbox + mbox_file_path: ../../helix/kaiaulu/mod_mbox/save_mbox_mail_2/kaiaulu.mbox # If project uses Pipermail pipermail: project_key_1: mailing_list: https://mta.openssl.org/pipermail/openssl-users/ - save_folder_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail/ + save_folder_path: ../../rawdata/helix/pipermail/save_mbox_mail # mbox_file_path is for use only with parse_mbox() function. It is the file to parse - mbox_file_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail/kaiaulu.mbox - #project_key_2: - # mailing_list: https://mta.openssl.org/pipermail/openssl-project/ - # save_folder_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail_2/ + mbox_file_path: ../../rawdata/helix/pipermail/save_mbox_mail/kaiaulu.mbox + project_key_2: + mailing_list: https://mta.openssl.org/pipermail/openssl-project/ + save_folder_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail_2/ # mbox_file_path is for use only with parse_mbox() function. 
It is the file to parse - # mbox_file_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail_2/kaiaulu.mbox + mbox_file_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail_2/kaiaulu.mbox issue_tracker: jira: diff --git a/conf/openssl.yml b/conf/openssl.yml index 825134f2..6d5deaea 100644 --- a/conf/openssl.yml +++ b/conf/openssl.yml @@ -50,7 +50,6 @@ mailing_list: archive_url: https://mta.openssl.org/pipermail/openssl-dev pipermail: ../../rawdata/openssl/pipermail/openssl-dev/ mailing_list: openssl-dev - archive_type: mta # issue_tracker: # jira: diff --git a/exec/mailinglist.R b/exec/mailinglist.R index ffcb0b87..65d9de42 100755 --- a/exec/mailinglist.R +++ b/exec/mailinglist.R @@ -15,11 +15,11 @@ require(data.table, quietly = TRUE) doc <- " USAGE: mailinglist.R tabulate help - mailinglist.R tabulate + mailinglist.R tabulate mailinglist.R download modmbox help - mailinglist.R download modmbox + mailinglist.R download modmbox mailinglist.R download pipermail help - mailinglist.R download pipermail + mailinglist.R download pipermail mailinglist.R (-h | --help) mailinglist.R --version @@ -33,7 +33,7 @@ OPTIONS: --version Show version. 
" -arguments <- docopt::docopt(doc, version = 'Kaiaulu 0.0.0.9600') +arguments <- docopt::docopt(doc, version = 'Kaiaulu 0.0.0.9700') if (arguments[["tabulate"]] & arguments[["help"]]) { cli::cli_alert_info("Tabulates a mailing list using parse_mbox().") @@ -41,35 +41,39 @@ if (arguments[["tabulate"]] & arguments[["help"]]) { tools_path <- arguments[[""]] conf_path <- arguments[[""]] + project_key <- arguments[[""]] save_path <- arguments[[""]] - tool <- yaml::read_yaml(tools_path) + tools <- yaml::read_yaml(tools_path) conf <- yaml::read_yaml(conf_path) - perceval_path <- path.expand(tool[["perceval"]]) - mbox_file_path <- path.expand(conf[["mailing_list"]][["mod_mbox"]][["project_key_1"]][["mbox_file_path"]]) + perceval_path <- get_tool("perceval", tools) + mbox_file_path <- get_mbox_input_file(conf, project_key) - project_mbox <- parse_mbox(perceval_path, mbox_file_path) + parsed_mbox <- parse_mbox( + perceval_path = perceval_path, + mbox_file_path = mbox_file_path + ) - data.table::fwrite(project_mbox, save_path) + data.table::fwrite(parsed_mbox, save_path) cli::cli_alert_success(paste0("Tabulated mailing list was saved at: ", save_path)) } else if (arguments[["download"]] & arguments[["modmbox"]] & arguments[["help"]]) { cli::cli_alert_info("Downloads mailing list archives from mod_mbox using download_mod_mbox().") + } else if (arguments[["download"]] & arguments[["modmbox"]]) { conf_path <- arguments[[""]] + project_key <- arguments[[""]] start_year_month <- arguments[[""]] - end_year_month <- arguments[[""]] - save_folder_path <- arguments[[""]] conf <- yaml::read_yaml(conf_path) - mailing_list <- conf[["mailing_list"]][["mod_mbox"]][["project_key_1"]][["mailing_list"]] + mailing_list <- get_mbox_domain(conf, project_key) + save_folder_path <- get_mbox_path(conf, project_key) - download_mod_mbox( + refresh_mod_mbox( mailing_list = mailing_list, start_year_month = start_year_month, - end_year_month = end_year_month, save_folder_path = save_folder_path, 
verbose = TRUE ) @@ -81,17 +85,16 @@ if (arguments[["tabulate"]] & arguments[["help"]]) { } else if (arguments[["download"]] & arguments[["pipermail"]]) { conf_path <- arguments[[""]] + project_key <- arguments[[""]] start_year_month <- arguments[[""]] - end_year_month <- arguments[[""]] - save_folder_path <- arguments[[""]] conf <- yaml::read_yaml(conf_path) - mailing_list <- conf[["mailing_list"]][["pipermail"]][["project_key_1"]][["mailing_list"]] + mailing_list <- get_pipermail_domain(conf, project_key) + save_folder_path <- get_pipermail_path(conf, project_key) - download_pipermail( + refresh_pipermail( mailing_list = mailing_list, start_year_month = start_year_month, - end_year_month = end_year_month, save_folder_path = save_folder_path, verbose = TRUE ) @@ -101,8 +104,7 @@ if (arguments[["tabulate"]] & arguments[["help"]]) { } else if (arguments[["-h"]] || arguments[["--help"]]) { cli::cli_alert_info(doc) } else if (arguments[["--version"]]) { - cli::cli_alert_info('Kaiaulu 0.0.0.9600') + cli::cli_alert_info('Kaiaulu 0.0.0.9700') } else { cli::cli_alert_danger("Invalid command or arguments. Use --help for usage information.") } - diff --git a/man/download_mod_mbox.Rd b/man/download_mod_mbox.Rd index e1835761..33715d61 100644 --- a/man/download_mod_mbox.Rd +++ b/man/download_mod_mbox.Rd @@ -36,5 +36,5 @@ of kaiaulu_YYYYMM.mbox. The function loops through each month in the range specified by `start_year_month` and `end_year_month`, and constructs the appropriate URL to download each month's data. If any download fails, an warning is issued for the failed months. This means the file could not be found and that month's data may not exist. -At the end, the function summarizes the downloads, indicating the range of dates present and any missing months. +At the end, the function summarizes the downloads, indicating the range of dates present and any missing months.
} diff --git a/man/download_pipermail.Rd b/man/download_pipermail.Rd index 0244abbd..e8c19b83 100644 --- a/man/download_pipermail.Rd +++ b/man/download_pipermail.Rd @@ -36,6 +36,6 @@ overwriting any existing file with the same name. The original .gz file is delet The downloaded .mbox files are saved in the specified folder following the naming convention kaiaulu_YYYYMM.mbox. The function only downloads files that fall between the specified start_year_month and end_year_month. -When both formats fail to download, the function issues a warning indicating the missing month. -At the end, the function summarizes the downloads, indicating the range of dates present and any missing months. +When both formats fail to download, the function issues a warning indicating the missing month. +At the end, the function summarizes the downloads, indicating the range of dates present and any missing months. } diff --git a/man/make_mbox_mailing_list.Rd b/man/make_mbox_mailing_list.Rd index 2ab66721..c81dbfd3 100644 --- a/man/make_mbox_mailing_list.Rd +++ b/man/make_mbox_mailing_list.Rd @@ -4,12 +4,12 @@ \alias{make_mbox_mailing_list} \title{Takes in mbox replies and creates a .mbox file} \usage{ -make_mbox_mailing_list(replies, mbox = "/tmp", file_name) +make_mbox_mailing_list(replies, folder_path = "/tmp", file_name) } \arguments{ \item{replies}{An array of replies that have been created with \code{\link{make_mbox_reply}}} -\item{mbox}{Folder path for the .mbox file being created.
Defaulted at /tmp} \item{file_name}{Name of the file that will store the .mbox file} } diff --git a/man/parse_mbox.Rd b/man/parse_mbox.Rd index 780e984f..9b128dd8 100644 --- a/man/parse_mbox.Rd +++ b/man/parse_mbox.Rd @@ -23,15 +23,15 @@ Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, \code{\link{parse_bugzilla_rest_comments}()}, -\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_bugzilla_rest_issues}()}, +\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_commit_message_id}()}, \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, \code{\link{parse_gitlog}()}, +\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, -\code{\link{parse_jira}()}, \code{\link{parse_mbox_latest_date}()}, \code{\link{parse_nvdfeed}()} } diff --git a/man/parse_mbox_latest_date.Rd b/man/parse_mbox_latest_date.Rd index 486f35fd..eedf9633 100644 --- a/man/parse_mbox_latest_date.Rd +++ b/man/parse_mbox_latest_date.Rd @@ -21,15 +21,15 @@ Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, \code{\link{parse_bugzilla_rest_comments}()}, -\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_bugzilla_rest_issues}()}, +\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_commit_message_id}()}, \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, \code{\link{parse_gitlog}()}, +\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, -\code{\link{parse_jira}()}, \code{\link{parse_mbox}()}, \code{\link{parse_nvdfeed}()} } diff --git a/vignettes/download_mail.Rmd b/vignettes/download_mail.Rmd index 3ccea773..bfee954a 100644 --- a/vignettes/download_mail.Rmd +++ b/vignettes/download_mail.Rmd @@ -96,7 +96,7 @@ For 
Pipermail, we need to specify the project key, which is used to retrieve the Now, we can use the getter functions to retrieve the configuration parameters for the specified project key. -```{r} +```{r eval=FALSE} conf <- parse_config("../conf/helix.yml") mailing_list <- get_pipermail_domain(conf, "project_key_1") start_year_month <- 202310 From e27a6042d9cce0c9abd71ec7c358b65df38fbbf0 Mon Sep 17 00:00:00 2001 From: Dao McGill <77309217+daomcgill@users.noreply.github.com> Date: Tue, 12 Nov 2024 12:33:03 -1000 Subject: [PATCH 45/80] i #295 Change argument for exec from 'tabulate' to 'parse' - Change exec from 'tabulate' to 'parse' - Will update issue 310 to use this exec instead of its own Signed-off-by: Dao McGill --- exec/mailinglist.R | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/exec/mailinglist.R b/exec/mailinglist.R index 65d9de42..bbcc357d 100755 --- a/exec/mailinglist.R +++ b/exec/mailinglist.R @@ -14,8 +14,8 @@ require(data.table, quietly = TRUE) doc <- " USAGE: - mailinglist.R tabulate help - mailinglist.R tabulate + mailinglist.R parse help + mailinglist.R parse mailinglist.R download modmbox help mailinglist.R download modmbox mailinglist.R download pipermail help @@ -35,9 +35,9 @@ OPTIONS: arguments <- docopt::docopt(doc, version = 'Kaiaulu 0.0.0.9700') -if (arguments[["tabulate"]] & arguments[["help"]]) { - cli::cli_alert_info("Tabulates a mailing list using parse_mbox().") -} else if (arguments[["tabulate"]]) { +if (arguments[["parse"]] & arguments[["help"]]) { + cli::cli_alert_info("Parses an mbox file using parse_mbox().") +} else if (arguments[["parse"]]) { tools_path <- arguments[[""]] conf_path <- arguments[[""]] @@ -56,7 +56,7 @@ if (arguments[["tabulate"]] & arguments[["help"]]) { ) data.table::fwrite(parsed_mbox, save_path) - cli::cli_alert_success(paste0("Tabulated mailing list was saved at: ", save_path)) + cli::cli_alert_success(paste0("Parsed mbox file was saved at: ", save_path)) } else if 
(arguments[["download"]] & arguments[["modmbox"]] & arguments[["help"]]) { cli::cli_alert_info("Downloads mailing list archives from mod_mbox using download_mod_mbox().") From 6a5fed6c5ca7583474e1d35dfec85d1dc21c4983 Mon Sep 17 00:00:00 2001 From: Dao McGill <77309217+daomcgill@users.noreply.github.com> Date: Tue, 12 Nov 2024 14:35:11 -1000 Subject: [PATCH 46/80] i #284 Testing Fix for Actions - Fixed uri parameter in parse_mbox for perceval - Small change in example Signed-off-by: Dao McGill --- R/example.R | 4 ++-- R/mail.R | 38 ++++++++++++++++++++++---------------- 2 files changed, 24 insertions(+), 18 deletions(-) diff --git a/R/example.R b/R/example.R index 389bcf28..8746ceae 100644 --- a/R/example.R +++ b/R/example.R @@ -550,7 +550,7 @@ example_jira_issue_comments <- function(folder_path = "/tmp", folder_name) { example_mailing_list_two_threads <- function(folder_path = "/tmp", folder_name, file_name) { # Create folder & repo - folder_path <- io_make_folder(folder_path=folder_path, folder_name = folder_name) + folder_path <- io_make_folder(folder_path = folder_path, folder_name = folder_name) # Step 1: Create fake mbox replies and assign them to variables for easy editing thread_1_reply_1 <- make_mbox_reply(mailing_list="test-list", @@ -584,7 +584,7 @@ example_mailing_list_two_threads <- function(folder_path = "/tmp", folder_name, replies <- c(thread_1_reply_1, thread_1_reply_2, thread_2_reply_1) # Create mbox file from the list of replies - mbox_path <- make_mbox_mailing_list(replies = replies, file_name = file_name) + mbox_path <- make_mbox_mailing_list(replies = replies, folder_path = folder_path, file_name = file_name) return(mbox_path) } diff --git a/R/mail.R b/R/mail.R index 9c169953..9496002e 100644 --- a/R/mail.R +++ b/R/mail.R @@ -578,29 +578,36 @@ refresh_mod_mbox <- function(mailing_list, start_year_month, save_folder_path, v #' @param mbox_file_path path to mbox archive file (ends in .mbox) #' @export #' @family parsers -parse_mbox <- 
function(perceval_path, mbox_file_path){ - # Expand paths (e.g. "~/Desktop" => "/Users/someuser/Desktop") +#' @param perceval_path path to perceval binary +#' @param mbox_file_path path to mbox archive file (ends in .mbox) +#' @export +#' @family parsers +parse_mbox <- function(perceval_path, mbox_file_path) { + # Expand paths perceval_path <- path.expand(perceval_path) mbox_file_path <- path.expand(mbox_file_path) - # Remove ".mbox" - mbox_uri <- stringi::stri_replace_last_regex(mbox_file_path, pattern = "\\.mbox$", replacement = "") + mbox_dir <- dirname(mbox_file_path) # Extract directory path + mbox_uri <- mbox_file_path # URI points to the mbox file - # Use percerval to parse mbox. --json line is required to be parsed by jsonlite::fromJSON. + # Use Perceval to parse the mbox file perceval_output <- system2(perceval_path, - args = c('mbox',mbox_uri,mbox_file_path,'--json-line'), + args = c('mbox', mbox_uri, mbox_dir, '--json-line'), stdout = TRUE, - stderr = FALSE) + stderr = TRUE) + + # Filter JSON lines from Perceval output + json_lines <- perceval_output[grepl("^\\{", perceval_output)] # Escape the `{` character - # Parsed JSON output as a data.table. 
- perceval_parsed <- data.table(jsonlite::stream_in(textConnection(perceval_output),verbose=FALSE)) + # Parse JSON output as a data.table + perceval_parsed <- data.table(jsonlite::stream_in(textConnection(json_lines), verbose = FALSE)) - columns_of_interest <- c("data.Message.ID","data.In.Reply.To","data.Date","data.From","data.To","data.Cc","data.Subject","data.body.plain","data.body") - columns_rename <- c("reply_id","in_reply_to_id","reply_datetimetz","reply_from","reply_to","reply_cc","reply_subject","reply_body","reply_body") + columns_of_interest <- c("data.Message.ID", "data.In.Reply.To", "data.Date", "data.From", "data.To", "data.Cc", "data.Subject", "data.body.plain", "data.body") + columns_rename <- c("reply_id", "in_reply_to_id", "reply_datetimetz", "reply_from", "reply_to", "reply_cc", "reply_subject", "reply_body", "reply_body") is_available_column <- columns_of_interest %in% colnames(perceval_parsed) columns_of_interest <- columns_of_interest[is_available_column] - perceval_parsed <- perceval_parsed[,..columns_of_interest] + perceval_parsed <- perceval_parsed[, ..columns_of_interest] data.table::setnames(x = perceval_parsed, old = colnames(perceval_parsed), @@ -716,10 +723,9 @@ make_mbox_mailing_list <- function(replies, folder_path = "/tmp", file_name) { # Create a unique filename for the mbox file mbox_filepath <- file.path(folder_path, stringi::stri_c(file_name, ".mbox")) - # make the file - mbox_body <- stringi::stri_c(replies,collapse = "\n\n") - io_make_file(mbox_filepath,mbox_body) + # Write the mbox content + mbox_body <- stringi::stri_c(replies, collapse = "\n\n") + io_make_file(mbox_filepath, mbox_body) - # Return the path of the created mbox file return(mbox_filepath) } From ffb5c9ce7366b7145c4944d997e0e2c31bd7fe17 Mon Sep 17 00:00:00 2001 From: Dao McGill <77309217+daomcgill@users.noreply.github.com> Date: Tue, 12 Nov 2024 15:02:34 -1000 Subject: [PATCH 47/80] i #284 Try Adding Debugging Signed-off-by: Dao McGill --- R/git.R | 21 
+++++++++++++++ R/mail.R | 52 ++++++++++++++++++++++++++++++++++---- man/parse_gitlog.Rd | 6 ++--- man/parse_mbox.Rd | 16 ++++++++++++ tests/testthat/test-git.R | 16 ++++++++++++ tests/testthat/test-mail.R | 24 ++++++++++++++---- 6 files changed, 122 insertions(+), 13 deletions(-) diff --git a/R/git.R b/R/git.R index e9e8029a..9d0474f1 100644 --- a/R/git.R +++ b/R/git.R @@ -23,10 +23,19 @@ parse_gitlog <- function(perceval_path,git_repo_path,save_path=NA,perl_regex=NA) git_uri <- git_repo_path save_path <- ifelse(!is.na(save_path),path.expand(save_path),NA) + # DEBUG + print(paste("Perceval path:", perceval_path)) + print(paste("Git repo path:", git_repo_path)) + print(paste("Save path:", save_path)) + print(paste("Perl regex:", perl_regex)) + # Use percerval to parse .git --json line is required to be parsed by jsonlite::fromJSON. # The log will be saved to the /tmp/ folder gitlog_path <- "/tmp/gitlog.log" + # DEBUG + print(paste("Gitlog path:", gitlog_path)) + # Perceval suggested flags perceval_flags <- c( @@ -62,18 +71,30 @@ parse_gitlog <- function(perceval_path,git_repo_path,save_path=NA,perl_regex=NA) } } + # DEBUG + print("Git log call message:") + print(gitlog_call_message) + # Parsed JSON output. perceval_output <- system2(perceval_path, args = c('git', '--git-log',gitlog_path,git_uri,'--json-line'), stdout = TRUE, stderr = FALSE) + # DEBUG + print("Perceval Output:") + cat(perceval_output, sep = "\n") + perceval_parsed <- data.table(jsonlite::stream_in(textConnection(perceval_output),verbose = FALSE)) if(nrow(perceval_parsed) == 0){ stop("The repository specified has no commits.") } + # DEBUG + print("Parsed data structure:") + print(str(perceval_parsed)) + # APR very first commit is a weird single case of commit without files. We filter them here. 
is_commit_with_files <- !!sapply(perceval_parsed$data.files,length) perceval_parsed <- perceval_parsed[is_commit_with_files] diff --git a/R/mail.R b/R/mail.R index 9496002e..8e8b7620 100644 --- a/R/mail.R +++ b/R/mail.R @@ -589,17 +589,54 @@ parse_mbox <- function(perceval_path, mbox_file_path) { mbox_dir <- dirname(mbox_file_path) # Extract directory path mbox_uri <- mbox_file_path # URI points to the mbox file + + + # Debugging + print(paste("Perceval path:", perceval_path)) + print(paste("Mbox file path:", mbox_file_path)) + print(paste("Mbox directory path:", mbox_dir)) + # Use Perceval to parse the mbox file - perceval_output <- system2(perceval_path, - args = c('mbox', mbox_uri, mbox_dir, '--json-line'), - stdout = TRUE, - stderr = TRUE) + perceval_output <- tryCatch({ + system2(perceval_path, + args = c('mbox', mbox_uri, mbox_dir, '--json-line'), + stdout = TRUE, + stderr = TRUE) + }, error = function(e) { + print("Error running Perceval:") + print(e$message) + stop("Perceval execution failed.") + }) + + # Debugging Perceval output + print("Perceval Output:") + cat(perceval_output, sep = "\n") + + + # Filter JSON lines from Perceval output json_lines <- perceval_output[grepl("^\\{", perceval_output)] # Escape the `{` character + + if (length(json_lines) == 0) { + stop("No valid JSON lines found in Perceval output. 
Check the mbox file or Perceval configuration.") + } + + # Parse JSON output as a data.table - perceval_parsed <- data.table(jsonlite::stream_in(textConnection(json_lines), verbose = FALSE)) + perceval_parsed <- tryCatch({ + data.table(jsonlite::stream_in(textConnection(json_lines), verbose = FALSE)) + }, error = function(e) { + print("Error parsing JSON lines:") + print(e$message) + stop("JSON parsing failed.") + }) + + # Debugging parsed data + print("Parsed data structure:") + print(str(perceval_parsed)) + columns_of_interest <- c("data.Message.ID", "data.In.Reply.To", "data.Date", "data.From", "data.To", "data.Cc", "data.Subject", "data.body.plain", "data.body") columns_rename <- c("reply_id", "in_reply_to_id", "reply_datetimetz", "reply_from", "reply_to", "reply_cc", "reply_subject", "reply_body", "reply_body") @@ -613,6 +650,11 @@ parse_mbox <- function(perceval_path, mbox_file_path) { old = colnames(perceval_parsed), new = columns_rename[is_available_column]) + # Debugging final parsed data + print("Final parsed data:") + print(perceval_parsed) + + return(perceval_parsed) } diff --git a/man/parse_gitlog.Rd b/man/parse_gitlog.Rd index d4370808..7d65786f 100644 --- a/man/parse_gitlog.Rd +++ b/man/parse_gitlog.Rd @@ -23,16 +23,16 @@ Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, \code{\link{parse_bugzilla_rest_comments}()}, -\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_bugzilla_rest_issues}()}, +\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_commit_message_id}()}, \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, +\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, -\code{\link{parse_jira}()}, -\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_mbox}()}, +\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_nvdfeed}()} } 
\concept{parsers} diff --git a/man/parse_mbox.Rd b/man/parse_mbox.Rd index 9b128dd8..349d009c 100644 --- a/man/parse_mbox.Rd +++ b/man/parse_mbox.Rd @@ -19,6 +19,22 @@ data used. This function only ensures if columns of interest are available, then consistently renamed for clarity. } \seealso{ +Other parsers: +\code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, +\code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, +\code{\link{parse_bugzilla_rest_comments}()}, +\code{\link{parse_bugzilla_rest_issues}()}, +\code{\link{parse_bugzilla_rest_issues_comments}()}, +\code{\link{parse_commit_message_id}()}, +\code{\link{parse_dependencies}()}, +\code{\link{parse_dv8_clusters}()}, +\code{\link{parse_gitlog}()}, +\code{\link{parse_jira}()}, +\code{\link{parse_jira_latest_date}()}, +\code{\link{parse_jira_rss_xml}()}, +\code{\link{parse_mbox_latest_date}()}, +\code{\link{parse_nvdfeed}()} + Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, diff --git a/tests/testthat/test-git.R b/tests/testthat/test-git.R index 3659d6dd..740bb6ad 100644 --- a/tests/testthat/test-git.R +++ b/tests/testthat/test-git.R @@ -30,9 +30,25 @@ test_that("Calling parse_gitlog with correct perceval and correct git log path r tools_path <- file.path(tools_path) tool <- yaml::read_yaml(tools_path) perceval_path <- tool[["perceval"]] + + # Debugging output + print("Debugging parse_gitlog:") + print(paste("Tools path:", tools_path)) + print(paste("Perceval path:", perceval_path)) + git_repo_path <- suppressWarnings(git_create_sample_log()) + + # Debugging output + print(paste("Generated Git repo path:", git_repo_path)) + result <- parse_gitlog(perceval_path, git_repo_path) + + # Debugging output + print("Result of parse_gitlog:") + print(head(result)) + expect_is(result, "data.table") + suppressWarnings(git_delete_sample_log(git_repo_path)) }) diff --git a/tests/testthat/test-mail.R 
b/tests/testthat/test-mail.R index b7426917..bf190130 100644 --- a/tests/testthat/test-mail.R +++ b/tests/testthat/test-mail.R @@ -20,16 +20,30 @@ test_that("Calling parse_mbox with correct perceval and mbox path returns a data tools_path <- file.path(tools_path) tool <- yaml::read_yaml(tools_path) perceval_path <- tool[["perceval"]] - mbox_path <- example_mailing_list_two_threads(folder_path = "/tmp", - folder_name="example_two_threads_mailing_list", - file_name = "two_thread_mailing_list") + + # Debugging output + print("Debugging parse_mbox:") + print(paste("Tools path:", tools_path)) + print(paste("Perceval path:", perceval_path)) + + mbox_path <- example_mailing_list_two_threads( + folder_path = "/tmp", + folder_name = "example_two_threads_mailing_list", + file_name = "two_thread_mailing_list" + ) + + # Debugging output + print(paste("Generated Mbox path:", mbox_path)) + result <- parse_mbox(perceval_path, mbox_path) - io_delete_folder(folder_path="/tmp", folder_name="example_two_threads_mailing_list") + # Debugging output + print("Result of parse_mbox:") + print(head(result)) + io_delete_folder(folder_path = "/tmp", folder_name = "example_two_threads_mailing_list") expect_equal(result[reply_from == "John Doe "]$reply_subject, "Subject 1") expect_equal(result[reply_subject == "Re: Subject 1"]$reply_from, "Smithsonian Doe ") - }) From e55b6e2b493d4c427a1108365eefc7b9ae975c4e Mon Sep 17 00:00:00 2001 From: Dao McGill <77309217+daomcgill@users.noreply.github.com> Date: Tue, 12 Nov 2024 16:36:41 -1000 Subject: [PATCH 48/80] Revert "i #284 Try Adding Debugging" This reverts commit ffb5c9ce7366b7145c4944d997e0e2c31bd7fe17. 
--- R/git.R | 21 --------------- R/mail.R | 52 ++++---------------------------------- man/parse_gitlog.Rd | 6 ++--- man/parse_mbox.Rd | 16 ------------ tests/testthat/test-git.R | 16 ------------ tests/testthat/test-mail.R | 24 ++++-------------- 6 files changed, 13 insertions(+), 122 deletions(-) diff --git a/R/git.R b/R/git.R index 9d0474f1..e9e8029a 100644 --- a/R/git.R +++ b/R/git.R @@ -23,19 +23,10 @@ parse_gitlog <- function(perceval_path,git_repo_path,save_path=NA,perl_regex=NA) git_uri <- git_repo_path save_path <- ifelse(!is.na(save_path),path.expand(save_path),NA) - # DEBUG - print(paste("Perceval path:", perceval_path)) - print(paste("Git repo path:", git_repo_path)) - print(paste("Save path:", save_path)) - print(paste("Perl regex:", perl_regex)) - # Use percerval to parse .git --json line is required to be parsed by jsonlite::fromJSON. # The log will be saved to the /tmp/ folder gitlog_path <- "/tmp/gitlog.log" - # DEBUG - print(paste("Gitlog path:", gitlog_path)) - # Perceval suggested flags perceval_flags <- c( @@ -71,30 +62,18 @@ parse_gitlog <- function(perceval_path,git_repo_path,save_path=NA,perl_regex=NA) } } - # DEBUG - print("Git log call message:") - print(gitlog_call_message) - # Parsed JSON output. perceval_output <- system2(perceval_path, args = c('git', '--git-log',gitlog_path,git_uri,'--json-line'), stdout = TRUE, stderr = FALSE) - # DEBUG - print("Perceval Output:") - cat(perceval_output, sep = "\n") - perceval_parsed <- data.table(jsonlite::stream_in(textConnection(perceval_output),verbose = FALSE)) if(nrow(perceval_parsed) == 0){ stop("The repository specified has no commits.") } - # DEBUG - print("Parsed data structure:") - print(str(perceval_parsed)) - # APR very first commit is a weird single case of commit without files. We filter them here. 
is_commit_with_files <- !!sapply(perceval_parsed$data.files,length) perceval_parsed <- perceval_parsed[is_commit_with_files] diff --git a/R/mail.R b/R/mail.R index 8e8b7620..9496002e 100644 --- a/R/mail.R +++ b/R/mail.R @@ -589,54 +589,17 @@ parse_mbox <- function(perceval_path, mbox_file_path) { mbox_dir <- dirname(mbox_file_path) # Extract directory path mbox_uri <- mbox_file_path # URI points to the mbox file - - - # Debugging - print(paste("Perceval path:", perceval_path)) - print(paste("Mbox file path:", mbox_file_path)) - print(paste("Mbox directory path:", mbox_dir)) - # Use Perceval to parse the mbox file - perceval_output <- tryCatch({ - system2(perceval_path, - args = c('mbox', mbox_uri, mbox_dir, '--json-line'), - stdout = TRUE, - stderr = TRUE) - }, error = function(e) { - print("Error running Perceval:") - print(e$message) - stop("Perceval execution failed.") - }) - - # Debugging Perceval output - print("Perceval Output:") - cat(perceval_output, sep = "\n") - - - + perceval_output <- system2(perceval_path, + args = c('mbox', mbox_uri, mbox_dir, '--json-line'), + stdout = TRUE, + stderr = TRUE) # Filter JSON lines from Perceval output json_lines <- perceval_output[grepl("^\\{", perceval_output)] # Escape the `{` character - - if (length(json_lines) == 0) { - stop("No valid JSON lines found in Perceval output. 
Check the mbox file or Perceval configuration.") - } - - # Parse JSON output as a data.table - perceval_parsed <- tryCatch({ - data.table(jsonlite::stream_in(textConnection(json_lines), verbose = FALSE)) - }, error = function(e) { - print("Error parsing JSON lines:") - print(e$message) - stop("JSON parsing failed.") - }) - - # Debugging parsed data - print("Parsed data structure:") - print(str(perceval_parsed)) - + perceval_parsed <- data.table(jsonlite::stream_in(textConnection(json_lines), verbose = FALSE)) columns_of_interest <- c("data.Message.ID", "data.In.Reply.To", "data.Date", "data.From", "data.To", "data.Cc", "data.Subject", "data.body.plain", "data.body") columns_rename <- c("reply_id", "in_reply_to_id", "reply_datetimetz", "reply_from", "reply_to", "reply_cc", "reply_subject", "reply_body", "reply_body") @@ -650,11 +613,6 @@ parse_mbox <- function(perceval_path, mbox_file_path) { old = colnames(perceval_parsed), new = columns_rename[is_available_column]) - # Debugging final parsed data - print("Final parsed data:") - print(perceval_parsed) - - return(perceval_parsed) } diff --git a/man/parse_gitlog.Rd b/man/parse_gitlog.Rd index 7d65786f..d4370808 100644 --- a/man/parse_gitlog.Rd +++ b/man/parse_gitlog.Rd @@ -23,16 +23,16 @@ Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, \code{\link{parse_bugzilla_rest_comments}()}, -\code{\link{parse_bugzilla_rest_issues}()}, \code{\link{parse_bugzilla_rest_issues_comments}()}, +\code{\link{parse_bugzilla_rest_issues}()}, \code{\link{parse_commit_message_id}()}, \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, -\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, -\code{\link{parse_mbox}()}, +\code{\link{parse_jira}()}, \code{\link{parse_mbox_latest_date}()}, +\code{\link{parse_mbox}()}, \code{\link{parse_nvdfeed}()} } \concept{parsers} diff --git 
a/man/parse_mbox.Rd b/man/parse_mbox.Rd index 349d009c..9b128dd8 100644 --- a/man/parse_mbox.Rd +++ b/man/parse_mbox.Rd @@ -19,22 +19,6 @@ data used. This function only ensures if columns of interest are available, then consistently renamed for clarity. } \seealso{ -Other parsers: -\code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, -\code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, -\code{\link{parse_bugzilla_rest_comments}()}, -\code{\link{parse_bugzilla_rest_issues}()}, -\code{\link{parse_bugzilla_rest_issues_comments}()}, -\code{\link{parse_commit_message_id}()}, -\code{\link{parse_dependencies}()}, -\code{\link{parse_dv8_clusters}()}, -\code{\link{parse_gitlog}()}, -\code{\link{parse_jira}()}, -\code{\link{parse_jira_latest_date}()}, -\code{\link{parse_jira_rss_xml}()}, -\code{\link{parse_mbox_latest_date}()}, -\code{\link{parse_nvdfeed}()} - Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, diff --git a/tests/testthat/test-git.R b/tests/testthat/test-git.R index 740bb6ad..3659d6dd 100644 --- a/tests/testthat/test-git.R +++ b/tests/testthat/test-git.R @@ -30,25 +30,9 @@ test_that("Calling parse_gitlog with correct perceval and correct git log path r tools_path <- file.path(tools_path) tool <- yaml::read_yaml(tools_path) perceval_path <- tool[["perceval"]] - - # Debugging output - print("Debugging parse_gitlog:") - print(paste("Tools path:", tools_path)) - print(paste("Perceval path:", perceval_path)) - git_repo_path <- suppressWarnings(git_create_sample_log()) - - # Debugging output - print(paste("Generated Git repo path:", git_repo_path)) - result <- parse_gitlog(perceval_path, git_repo_path) - - # Debugging output - print("Result of parse_gitlog:") - print(head(result)) - expect_is(result, "data.table") - suppressWarnings(git_delete_sample_log(git_repo_path)) }) diff --git a/tests/testthat/test-mail.R b/tests/testthat/test-mail.R index 
bf190130..b7426917 100644 --- a/tests/testthat/test-mail.R +++ b/tests/testthat/test-mail.R @@ -20,30 +20,16 @@ test_that("Calling parse_mbox with correct perceval and mbox path returns a data tools_path <- file.path(tools_path) tool <- yaml::read_yaml(tools_path) perceval_path <- tool[["perceval"]] - - # Debugging output - print("Debugging parse_mbox:") - print(paste("Tools path:", tools_path)) - print(paste("Perceval path:", perceval_path)) - - mbox_path <- example_mailing_list_two_threads( - folder_path = "/tmp", - folder_name = "example_two_threads_mailing_list", - file_name = "two_thread_mailing_list" - ) - - # Debugging output - print(paste("Generated Mbox path:", mbox_path)) - + mbox_path <- example_mailing_list_two_threads(folder_path = "/tmp", + folder_name="example_two_threads_mailing_list", + file_name = "two_thread_mailing_list") result <- parse_mbox(perceval_path, mbox_path) - # Debugging output - print("Result of parse_mbox:") - print(head(result)) + io_delete_folder(folder_path="/tmp", folder_name="example_two_threads_mailing_list") - io_delete_folder(folder_path = "/tmp", folder_name = "example_two_threads_mailing_list") expect_equal(result[reply_from == "John Doe "]$reply_subject, "Subject 1") expect_equal(result[reply_subject == "Re: Subject 1"]$reply_from, "Smithsonian Doe ") + }) From c797219505427d378ee2c1c74f2cdc9e68b64648 Mon Sep 17 00:00:00 2001 From: Dao McGill <77309217+daomcgill@users.noreply.github.com> Date: Tue, 12 Nov 2024 17:14:23 -1000 Subject: [PATCH 49/80] i #284 Revert ctags version --- man/parse_gitlog.Rd | 6 +++--- man/parse_mbox.Rd | 16 ++++++++++++++++ tests/testthat/testdata/tools.yml | 2 +- 3 files changed, 20 insertions(+), 4 deletions(-) diff --git a/man/parse_gitlog.Rd b/man/parse_gitlog.Rd index d4370808..7d65786f 100644 --- a/man/parse_gitlog.Rd +++ b/man/parse_gitlog.Rd @@ -23,16 +23,16 @@ Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, 
\code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, \code{\link{parse_bugzilla_rest_comments}()}, -\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_bugzilla_rest_issues}()}, +\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_commit_message_id}()}, \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, +\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, -\code{\link{parse_jira}()}, -\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_mbox}()}, +\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_nvdfeed}()} } \concept{parsers} diff --git a/man/parse_mbox.Rd b/man/parse_mbox.Rd index 9b128dd8..349d009c 100644 --- a/man/parse_mbox.Rd +++ b/man/parse_mbox.Rd @@ -19,6 +19,22 @@ data used. This function only ensures if columns of interest are available, then consistently renamed for clarity. } \seealso{ +Other parsers: +\code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, +\code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, +\code{\link{parse_bugzilla_rest_comments}()}, +\code{\link{parse_bugzilla_rest_issues}()}, +\code{\link{parse_bugzilla_rest_issues_comments}()}, +\code{\link{parse_commit_message_id}()}, +\code{\link{parse_dependencies}()}, +\code{\link{parse_dv8_clusters}()}, +\code{\link{parse_gitlog}()}, +\code{\link{parse_jira}()}, +\code{\link{parse_jira_latest_date}()}, +\code{\link{parse_jira_rss_xml}()}, +\code{\link{parse_mbox_latest_date}()}, +\code{\link{parse_nvdfeed}()} + Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, diff --git a/tests/testthat/testdata/tools.yml b/tests/testthat/testdata/tools.yml index f023ce37..09074775 100644 --- a/tests/testthat/testdata/tools.yml +++ b/tests/testthat/testdata/tools.yml @@ -7,7 +7,7 @@ refactoring_miner: ~/RefactoringMiner-1.0/bin/RefactoringMiner # 
https://github.com/boyter/scc scc: ~/scc/scc # universal-ctags -utags: /usr/local/Cellar/universal-ctags/HEAD-62f0144/bin/ctags +utags: /opt/homebrew/opt/universal-ctags/bin/ctags # https://archdia.com/ dv8: /Applications/DV84/bin/dv8-console # OSLOM: http://oslom.org/ From 092e2abb2d2478e81690f3d248610816afac3db2 Mon Sep 17 00:00:00 2001 From: Dao McGill <77309217+daomcgill@users.noreply.github.com> Date: Tue, 12 Nov 2024 17:15:11 -1000 Subject: [PATCH 50/80] Update commit_message_id_coverage.Rd --- man/commit_message_id_coverage.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/commit_message_id_coverage.Rd b/man/commit_message_id_coverage.Rd index 68fad761..e7f0c6ef 100644 --- a/man/commit_message_id_coverage.Rd +++ b/man/commit_message_id_coverage.Rd @@ -22,9 +22,9 @@ Calculates the number of commits from the git log which contains the message id. \code{\link{parse_gitlog}} to obtain additions and deletions from gitlog Other {metrics}: +\code{\link{metric_churn}()}, \code{\link{metric_churn_per_commit_interval}()}, \code{\link{metric_churn_per_commit_per_file}()}, -\code{\link{metric_churn}()}, \code{\link{metric_file_bug_churn}()}, \code{\link{metric_file_bug_frequency}()}, \code{\link{metric_file_churn}()}, From 56dff9c27320a8740c2b2ccdd2fa9f28834de35d Mon Sep 17 00:00:00 2001 From: Dao McGill <77309217+daomcgill@users.noreply.github.com> Date: Tue, 12 Nov 2024 17:21:58 -1000 Subject: [PATCH 51/80] i #284 Please work --- .github/workflows/R-CMD-check.yml | 2 +- .github/workflows/test-coverage.yml | 2 +- tests/testthat/testdata/tools.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/R-CMD-check.yml b/.github/workflows/R-CMD-check.yml index 84ce2ac6..770e1334 100644 --- a/.github/workflows/R-CMD-check.yml +++ b/.github/workflows/R-CMD-check.yml @@ -15,7 +15,7 @@ jobs: runs-on: macOS-13 strategy: matrix: - r-version: ['4.4'] + r-version: ['4.1'] steps: - uses: actions/checkout@v3 diff --git 
a/.github/workflows/test-coverage.yml b/.github/workflows/test-coverage.yml index 01ab743a..3d326351 100644 --- a/.github/workflows/test-coverage.yml +++ b/.github/workflows/test-coverage.yml @@ -13,7 +13,7 @@ jobs: runs-on: macOS-13 strategy: matrix: - r-version: ['4.4'] + r-version: ['4.1'] env: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} steps: diff --git a/tests/testthat/testdata/tools.yml b/tests/testthat/testdata/tools.yml index 09074775..f023ce37 100644 --- a/tests/testthat/testdata/tools.yml +++ b/tests/testthat/testdata/tools.yml @@ -7,7 +7,7 @@ refactoring_miner: ~/RefactoringMiner-1.0/bin/RefactoringMiner # https://github.com/boyter/scc scc: ~/scc/scc # universal-ctags -utags: /opt/homebrew/opt/universal-ctags/bin/ctags +utags: /usr/local/Cellar/universal-ctags/HEAD-62f0144/bin/ctags # https://archdia.com/ dv8: /Applications/DV84/bin/dv8-console # OSLOM: http://oslom.org/ From fd97af0675577803badf12e6a925487fec8aa0ec Mon Sep 17 00:00:00 2001 From: Dao McGill <77309217+daomcgill@users.noreply.github.com> Date: Tue, 12 Nov 2024 17:43:30 -1000 Subject: [PATCH 52/80] i #295 Last try --- .github/workflows/R-CMD-check.yml | 10 ++++++---- .github/workflows/test-coverage.yml | 10 ++++++---- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/.github/workflows/R-CMD-check.yml b/.github/workflows/R-CMD-check.yml index 770e1334..b9191097 100644 --- a/.github/workflows/R-CMD-check.yml +++ b/.github/workflows/R-CMD-check.yml @@ -15,7 +15,7 @@ jobs: runs-on: macOS-13 strategy: matrix: - r-version: ['4.1'] + r-version: ['4.4'] steps: - uses: actions/checkout@v3 @@ -67,11 +67,13 @@ jobs: run: | brew tap homebrew/core brew install --HEAD universal-ctags - utags_head=$(ls /usr/local/Cellar/universal-ctags | tail -n 1) - sed -i -e "s|utags: \/usr\/local\/Cellar\/universal-ctags\/HEAD-62f0144\/bin\/ctags|utags: \/usr\/local\/Cellar\/universal-ctags\/${utags_head}\/bin\/ctags|g" tools.yml - sed -i -e "s|utags: 
\/usr\/local\/Cellar\/universal-ctags\/HEAD-62f0144\/bin\/ctags|utags: \/usr\/local\/Cellar\/universal-ctags\/${utags_head}\/bin\/ctags|g" tests/testthat/testdata/tools.yml + brew_prefix=$(brew --prefix) + utags_head=$(ls $brew_prefix/Cellar/universal-ctags | tail -n 1) + sed -i -e "s|utags: .*|utags: $brew_prefix/Cellar/universal-ctags/${utags_head}/bin/ctags|g" tools.yml + sed -i -e "s|utags: .*|utags: $brew_prefix/Cellar/universal-ctags/${utags_head}/bin/ctags|g" tests/testthat/testdata/tools.yml shell: bash + - name: Check env: _R_CHECK_CRAN_INCOMING_REMOTE_: false diff --git a/.github/workflows/test-coverage.yml b/.github/workflows/test-coverage.yml index 3d326351..0f196d31 100644 --- a/.github/workflows/test-coverage.yml +++ b/.github/workflows/test-coverage.yml @@ -13,7 +13,7 @@ jobs: runs-on: macOS-13 strategy: matrix: - r-version: ['4.1'] + r-version: ['4.4'] env: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} steps: @@ -59,11 +59,13 @@ jobs: run: | brew tap homebrew/core brew install --HEAD universal-ctags - utags_head=$(ls /usr/local/Cellar/universal-ctags | tail -n 1) - sed -i -e "s|utags: \/usr\/local\/Cellar\/universal-ctags\/HEAD-62f0144\/bin\/ctags|utags: \/usr\/local\/Cellar\/universal-ctags\/${utags_head}\/bin\/ctags|g" tools.yml - sed -i -e "s|utags: \/usr\/local\/Cellar\/universal-ctags\/HEAD-62f0144\/bin\/ctags|utags: \/usr\/local\/Cellar\/universal-ctags\/${utags_head}\/bin\/ctags|g" tests/testthat/testdata/tools.yml + brew_prefix=$(brew --prefix) + utags_head=$(ls $brew_prefix/Cellar/universal-ctags | tail -n 1) + sed -i -e "s|utags: .*|utags: $brew_prefix/Cellar/universal-ctags/${utags_head}/bin/ctags|g" tools.yml + sed -i -e "s|utags: .*|utags: $brew_prefix/Cellar/universal-ctags/${utags_head}/bin/ctags|g" tests/testthat/testdata/tools.yml shell: bash + - name: Test coverage run: | covr::codecov( From 8709b95bd0d94503a24222855c13414942c9015c Mon Sep 17 00:00:00 2001 From: Dao McGill <77309217+daomcgill@users.noreply.github.com> Date: Tue, 12 Nov 
2024 17:50:27 -1000 Subject: [PATCH 53/80] Revert "i #295 Last try" This reverts commit fd97af0675577803badf12e6a925487fec8aa0ec. --- .github/workflows/R-CMD-check.yml | 10 ++++------ .github/workflows/test-coverage.yml | 10 ++++------ 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/.github/workflows/R-CMD-check.yml b/.github/workflows/R-CMD-check.yml index b9191097..770e1334 100644 --- a/.github/workflows/R-CMD-check.yml +++ b/.github/workflows/R-CMD-check.yml @@ -15,7 +15,7 @@ jobs: runs-on: macOS-13 strategy: matrix: - r-version: ['4.4'] + r-version: ['4.1'] steps: - uses: actions/checkout@v3 @@ -67,13 +67,11 @@ jobs: run: | brew tap homebrew/core brew install --HEAD universal-ctags - brew_prefix=$(brew --prefix) - utags_head=$(ls $brew_prefix/Cellar/universal-ctags | tail -n 1) - sed -i -e "s|utags: .*|utags: $brew_prefix/Cellar/universal-ctags/${utags_head}/bin/ctags|g" tools.yml - sed -i -e "s|utags: .*|utags: $brew_prefix/Cellar/universal-ctags/${utags_head}/bin/ctags|g" tests/testthat/testdata/tools.yml + utags_head=$(ls /usr/local/Cellar/universal-ctags | tail -n 1) + sed -i -e "s|utags: \/usr\/local\/Cellar\/universal-ctags\/HEAD-62f0144\/bin\/ctags|utags: \/usr\/local\/Cellar\/universal-ctags\/${utags_head}\/bin\/ctags|g" tools.yml + sed -i -e "s|utags: \/usr\/local\/Cellar\/universal-ctags\/HEAD-62f0144\/bin\/ctags|utags: \/usr\/local\/Cellar\/universal-ctags\/${utags_head}\/bin\/ctags|g" tests/testthat/testdata/tools.yml shell: bash - - name: Check env: _R_CHECK_CRAN_INCOMING_REMOTE_: false diff --git a/.github/workflows/test-coverage.yml b/.github/workflows/test-coverage.yml index 0f196d31..3d326351 100644 --- a/.github/workflows/test-coverage.yml +++ b/.github/workflows/test-coverage.yml @@ -13,7 +13,7 @@ jobs: runs-on: macOS-13 strategy: matrix: - r-version: ['4.4'] + r-version: ['4.1'] env: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} steps: @@ -59,13 +59,11 @@ jobs: run: | brew tap homebrew/core brew install --HEAD universal-ctags - 
brew_prefix=$(brew --prefix) - utags_head=$(ls $brew_prefix/Cellar/universal-ctags | tail -n 1) - sed -i -e "s|utags: .*|utags: $brew_prefix/Cellar/universal-ctags/${utags_head}/bin/ctags|g" tools.yml - sed -i -e "s|utags: .*|utags: $brew_prefix/Cellar/universal-ctags/${utags_head}/bin/ctags|g" tests/testthat/testdata/tools.yml + utags_head=$(ls /usr/local/Cellar/universal-ctags | tail -n 1) + sed -i -e "s|utags: \/usr\/local\/Cellar\/universal-ctags\/HEAD-62f0144\/bin\/ctags|utags: \/usr\/local\/Cellar\/universal-ctags\/${utags_head}\/bin\/ctags|g" tools.yml + sed -i -e "s|utags: \/usr\/local\/Cellar\/universal-ctags\/HEAD-62f0144\/bin\/ctags|utags: \/usr\/local\/Cellar\/universal-ctags\/${utags_head}\/bin\/ctags|g" tests/testthat/testdata/tools.yml shell: bash - - name: Test coverage run: | covr::codecov( From 71054f998d031e3b68cf6c9075af4a0cb7c0f175 Mon Sep 17 00:00:00 2001 From: Dao McGill <77309217+daomcgill@users.noreply.github.com> Date: Tue, 12 Nov 2024 17:50:31 -1000 Subject: [PATCH 54/80] Revert "i #284 Please work" This reverts commit 56dff9c27320a8740c2b2ccdd2fa9f28834de35d. 
--- .github/workflows/R-CMD-check.yml | 2 +- .github/workflows/test-coverage.yml | 2 +- tests/testthat/testdata/tools.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/R-CMD-check.yml b/.github/workflows/R-CMD-check.yml index 770e1334..84ce2ac6 100644 --- a/.github/workflows/R-CMD-check.yml +++ b/.github/workflows/R-CMD-check.yml @@ -15,7 +15,7 @@ jobs: runs-on: macOS-13 strategy: matrix: - r-version: ['4.1'] + r-version: ['4.4'] steps: - uses: actions/checkout@v3 diff --git a/.github/workflows/test-coverage.yml b/.github/workflows/test-coverage.yml index 3d326351..01ab743a 100644 --- a/.github/workflows/test-coverage.yml +++ b/.github/workflows/test-coverage.yml @@ -13,7 +13,7 @@ jobs: runs-on: macOS-13 strategy: matrix: - r-version: ['4.1'] + r-version: ['4.4'] env: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} steps: diff --git a/tests/testthat/testdata/tools.yml b/tests/testthat/testdata/tools.yml index f023ce37..09074775 100644 --- a/tests/testthat/testdata/tools.yml +++ b/tests/testthat/testdata/tools.yml @@ -7,7 +7,7 @@ refactoring_miner: ~/RefactoringMiner-1.0/bin/RefactoringMiner # https://github.com/boyter/scc scc: ~/scc/scc # universal-ctags -utags: /usr/local/Cellar/universal-ctags/HEAD-62f0144/bin/ctags +utags: /opt/homebrew/opt/universal-ctags/bin/ctags # https://archdia.com/ dv8: /Applications/DV84/bin/dv8-console # OSLOM: http://oslom.org/ From 382383d05ed6a93f837b64fb6c56960d5f2936b6 Mon Sep 17 00:00:00 2001 From: Dao McGill <77309217+daomcgill@users.noreply.github.com> Date: Tue, 12 Nov 2024 17:50:39 -1000 Subject: [PATCH 55/80] Revert "Update commit_message_id_coverage.Rd" This reverts commit 092e2abb2d2478e81690f3d248610816afac3db2. 
--- man/commit_message_id_coverage.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/commit_message_id_coverage.Rd b/man/commit_message_id_coverage.Rd index e7f0c6ef..68fad761 100644 --- a/man/commit_message_id_coverage.Rd +++ b/man/commit_message_id_coverage.Rd @@ -22,9 +22,9 @@ Calculates the number of commits from the git log which contains the message id. \code{\link{parse_gitlog}} to obtain additions and deletions from gitlog Other {metrics}: -\code{\link{metric_churn}()}, \code{\link{metric_churn_per_commit_interval}()}, \code{\link{metric_churn_per_commit_per_file}()}, +\code{\link{metric_churn}()}, \code{\link{metric_file_bug_churn}()}, \code{\link{metric_file_bug_frequency}()}, \code{\link{metric_file_churn}()}, From f11e452979bab051e07d0310a97130a587568bbf Mon Sep 17 00:00:00 2001 From: Dao McGill <77309217+daomcgill@users.noreply.github.com> Date: Tue, 12 Nov 2024 17:50:52 -1000 Subject: [PATCH 56/80] Revert "i #284 Revert ctags version" This reverts commit c797219505427d378ee2c1c74f2cdc9e68b64648. 
--- man/parse_gitlog.Rd | 6 +++--- man/parse_mbox.Rd | 16 ---------------- tests/testthat/testdata/tools.yml | 2 +- 3 files changed, 4 insertions(+), 20 deletions(-) diff --git a/man/parse_gitlog.Rd b/man/parse_gitlog.Rd index 7d65786f..d4370808 100644 --- a/man/parse_gitlog.Rd +++ b/man/parse_gitlog.Rd @@ -23,16 +23,16 @@ Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, \code{\link{parse_bugzilla_rest_comments}()}, -\code{\link{parse_bugzilla_rest_issues}()}, \code{\link{parse_bugzilla_rest_issues_comments}()}, +\code{\link{parse_bugzilla_rest_issues}()}, \code{\link{parse_commit_message_id}()}, \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, -\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, -\code{\link{parse_mbox}()}, +\code{\link{parse_jira}()}, \code{\link{parse_mbox_latest_date}()}, +\code{\link{parse_mbox}()}, \code{\link{parse_nvdfeed}()} } \concept{parsers} diff --git a/man/parse_mbox.Rd b/man/parse_mbox.Rd index 349d009c..9b128dd8 100644 --- a/man/parse_mbox.Rd +++ b/man/parse_mbox.Rd @@ -19,22 +19,6 @@ data used. This function only ensures if columns of interest are available, then consistently renamed for clarity. 
} \seealso{ -Other parsers: -\code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, -\code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, -\code{\link{parse_bugzilla_rest_comments}()}, -\code{\link{parse_bugzilla_rest_issues}()}, -\code{\link{parse_bugzilla_rest_issues_comments}()}, -\code{\link{parse_commit_message_id}()}, -\code{\link{parse_dependencies}()}, -\code{\link{parse_dv8_clusters}()}, -\code{\link{parse_gitlog}()}, -\code{\link{parse_jira}()}, -\code{\link{parse_jira_latest_date}()}, -\code{\link{parse_jira_rss_xml}()}, -\code{\link{parse_mbox_latest_date}()}, -\code{\link{parse_nvdfeed}()} - Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, diff --git a/tests/testthat/testdata/tools.yml b/tests/testthat/testdata/tools.yml index 09074775..f023ce37 100644 --- a/tests/testthat/testdata/tools.yml +++ b/tests/testthat/testdata/tools.yml @@ -7,7 +7,7 @@ refactoring_miner: ~/RefactoringMiner-1.0/bin/RefactoringMiner # https://github.com/boyter/scc scc: ~/scc/scc # universal-ctags -utags: /opt/homebrew/opt/universal-ctags/bin/ctags +utags: /usr/local/Cellar/universal-ctags/HEAD-62f0144/bin/ctags # https://archdia.com/ dv8: /Applications/DV84/bin/dv8-console # OSLOM: http://oslom.org/ From 09d00c3daa24f6d554bba8b37323cb0600952bae Mon Sep 17 00:00:00 2001 From: Dao McGill <77309217+daomcgill@users.noreply.github.com> Date: Tue, 12 Nov 2024 17:50:59 -1000 Subject: [PATCH 57/80] Reapply "i #284 Try Adding Debugging" This reverts commit e55b6e2b493d4c427a1108365eefc7b9ae975c4e. 
--- R/git.R | 21 +++++++++++++++ R/mail.R | 52 ++++++++++++++++++++++++++++++++++---- man/parse_gitlog.Rd | 6 ++--- man/parse_mbox.Rd | 16 ++++++++++++ tests/testthat/test-git.R | 16 ++++++++++++ tests/testthat/test-mail.R | 24 ++++++++++++++---- 6 files changed, 122 insertions(+), 13 deletions(-) diff --git a/R/git.R b/R/git.R index e9e8029a..9d0474f1 100644 --- a/R/git.R +++ b/R/git.R @@ -23,10 +23,19 @@ parse_gitlog <- function(perceval_path,git_repo_path,save_path=NA,perl_regex=NA) git_uri <- git_repo_path save_path <- ifelse(!is.na(save_path),path.expand(save_path),NA) + # DEBUG + print(paste("Perceval path:", perceval_path)) + print(paste("Git repo path:", git_repo_path)) + print(paste("Save path:", save_path)) + print(paste("Perl regex:", perl_regex)) + # Use percerval to parse .git --json line is required to be parsed by jsonlite::fromJSON. # The log will be saved to the /tmp/ folder gitlog_path <- "/tmp/gitlog.log" + # DEBUG + print(paste("Gitlog path:", gitlog_path)) + # Perceval suggested flags perceval_flags <- c( @@ -62,18 +71,30 @@ parse_gitlog <- function(perceval_path,git_repo_path,save_path=NA,perl_regex=NA) } } + # DEBUG + print("Git log call message:") + print(gitlog_call_message) + # Parsed JSON output. perceval_output <- system2(perceval_path, args = c('git', '--git-log',gitlog_path,git_uri,'--json-line'), stdout = TRUE, stderr = FALSE) + # DEBUG + print("Perceval Output:") + cat(perceval_output, sep = "\n") + perceval_parsed <- data.table(jsonlite::stream_in(textConnection(perceval_output),verbose = FALSE)) if(nrow(perceval_parsed) == 0){ stop("The repository specified has no commits.") } + # DEBUG + print("Parsed data structure:") + print(str(perceval_parsed)) + # APR very first commit is a weird single case of commit without files. We filter them here. 
is_commit_with_files <- !!sapply(perceval_parsed$data.files,length) perceval_parsed <- perceval_parsed[is_commit_with_files] diff --git a/R/mail.R b/R/mail.R index 9496002e..8e8b7620 100644 --- a/R/mail.R +++ b/R/mail.R @@ -589,17 +589,54 @@ parse_mbox <- function(perceval_path, mbox_file_path) { mbox_dir <- dirname(mbox_file_path) # Extract directory path mbox_uri <- mbox_file_path # URI points to the mbox file + + + # Debugging + print(paste("Perceval path:", perceval_path)) + print(paste("Mbox file path:", mbox_file_path)) + print(paste("Mbox directory path:", mbox_dir)) + # Use Perceval to parse the mbox file - perceval_output <- system2(perceval_path, - args = c('mbox', mbox_uri, mbox_dir, '--json-line'), - stdout = TRUE, - stderr = TRUE) + perceval_output <- tryCatch({ + system2(perceval_path, + args = c('mbox', mbox_uri, mbox_dir, '--json-line'), + stdout = TRUE, + stderr = TRUE) + }, error = function(e) { + print("Error running Perceval:") + print(e$message) + stop("Perceval execution failed.") + }) + + # Debugging Perceval output + print("Perceval Output:") + cat(perceval_output, sep = "\n") + + + # Filter JSON lines from Perceval output json_lines <- perceval_output[grepl("^\\{", perceval_output)] # Escape the `{` character + + if (length(json_lines) == 0) { + stop("No valid JSON lines found in Perceval output. 
Check the mbox file or Perceval configuration.") + } + + # Parse JSON output as a data.table - perceval_parsed <- data.table(jsonlite::stream_in(textConnection(json_lines), verbose = FALSE)) + perceval_parsed <- tryCatch({ + data.table(jsonlite::stream_in(textConnection(json_lines), verbose = FALSE)) + }, error = function(e) { + print("Error parsing JSON lines:") + print(e$message) + stop("JSON parsing failed.") + }) + + # Debugging parsed data + print("Parsed data structure:") + print(str(perceval_parsed)) + columns_of_interest <- c("data.Message.ID", "data.In.Reply.To", "data.Date", "data.From", "data.To", "data.Cc", "data.Subject", "data.body.plain", "data.body") columns_rename <- c("reply_id", "in_reply_to_id", "reply_datetimetz", "reply_from", "reply_to", "reply_cc", "reply_subject", "reply_body", "reply_body") @@ -613,6 +650,11 @@ parse_mbox <- function(perceval_path, mbox_file_path) { old = colnames(perceval_parsed), new = columns_rename[is_available_column]) + # Debugging final parsed data + print("Final parsed data:") + print(perceval_parsed) + + return(perceval_parsed) } diff --git a/man/parse_gitlog.Rd b/man/parse_gitlog.Rd index d4370808..7d65786f 100644 --- a/man/parse_gitlog.Rd +++ b/man/parse_gitlog.Rd @@ -23,16 +23,16 @@ Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, \code{\link{parse_bugzilla_rest_comments}()}, -\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_bugzilla_rest_issues}()}, +\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_commit_message_id}()}, \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, +\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, -\code{\link{parse_jira}()}, -\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_mbox}()}, +\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_nvdfeed}()} } 
\concept{parsers} diff --git a/man/parse_mbox.Rd b/man/parse_mbox.Rd index 9b128dd8..349d009c 100644 --- a/man/parse_mbox.Rd +++ b/man/parse_mbox.Rd @@ -19,6 +19,22 @@ data used. This function only ensures if columns of interest are available, then consistently renamed for clarity. } \seealso{ +Other parsers: +\code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, +\code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, +\code{\link{parse_bugzilla_rest_comments}()}, +\code{\link{parse_bugzilla_rest_issues}()}, +\code{\link{parse_bugzilla_rest_issues_comments}()}, +\code{\link{parse_commit_message_id}()}, +\code{\link{parse_dependencies}()}, +\code{\link{parse_dv8_clusters}()}, +\code{\link{parse_gitlog}()}, +\code{\link{parse_jira}()}, +\code{\link{parse_jira_latest_date}()}, +\code{\link{parse_jira_rss_xml}()}, +\code{\link{parse_mbox_latest_date}()}, +\code{\link{parse_nvdfeed}()} + Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, diff --git a/tests/testthat/test-git.R b/tests/testthat/test-git.R index 3659d6dd..740bb6ad 100644 --- a/tests/testthat/test-git.R +++ b/tests/testthat/test-git.R @@ -30,9 +30,25 @@ test_that("Calling parse_gitlog with correct perceval and correct git log path r tools_path <- file.path(tools_path) tool <- yaml::read_yaml(tools_path) perceval_path <- tool[["perceval"]] + + # Debugging output + print("Debugging parse_gitlog:") + print(paste("Tools path:", tools_path)) + print(paste("Perceval path:", perceval_path)) + git_repo_path <- suppressWarnings(git_create_sample_log()) + + # Debugging output + print(paste("Generated Git repo path:", git_repo_path)) + result <- parse_gitlog(perceval_path, git_repo_path) + + # Debugging output + print("Result of parse_gitlog:") + print(head(result)) + expect_is(result, "data.table") + suppressWarnings(git_delete_sample_log(git_repo_path)) }) diff --git a/tests/testthat/test-mail.R 
b/tests/testthat/test-mail.R index b7426917..bf190130 100644 --- a/tests/testthat/test-mail.R +++ b/tests/testthat/test-mail.R @@ -20,16 +20,30 @@ test_that("Calling parse_mbox with correct perceval and mbox path returns a data tools_path <- file.path(tools_path) tool <- yaml::read_yaml(tools_path) perceval_path <- tool[["perceval"]] - mbox_path <- example_mailing_list_two_threads(folder_path = "/tmp", - folder_name="example_two_threads_mailing_list", - file_name = "two_thread_mailing_list") + + # Debugging output + print("Debugging parse_mbox:") + print(paste("Tools path:", tools_path)) + print(paste("Perceval path:", perceval_path)) + + mbox_path <- example_mailing_list_two_threads( + folder_path = "/tmp", + folder_name = "example_two_threads_mailing_list", + file_name = "two_thread_mailing_list" + ) + + # Debugging output + print(paste("Generated Mbox path:", mbox_path)) + result <- parse_mbox(perceval_path, mbox_path) - io_delete_folder(folder_path="/tmp", folder_name="example_two_threads_mailing_list") + # Debugging output + print("Result of parse_mbox:") + print(head(result)) + io_delete_folder(folder_path = "/tmp", folder_name = "example_two_threads_mailing_list") expect_equal(result[reply_from == "John Doe "]$reply_subject, "Subject 1") expect_equal(result[reply_subject == "Re: Subject 1"]$reply_from, "Smithsonian Doe ") - }) From 216fe07a1ee1829c95030d26cfe5b4d24369084c Mon Sep 17 00:00:00 2001 From: Dao McGill <77309217+daomcgill@users.noreply.github.com> Date: Tue, 12 Nov 2024 17:54:36 -1000 Subject: [PATCH 58/80] i #284 R version --- .github/workflows/R-CMD-check.yml | 2 +- .github/workflows/test-coverage.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/R-CMD-check.yml b/.github/workflows/R-CMD-check.yml index 84ce2ac6..770e1334 100644 --- a/.github/workflows/R-CMD-check.yml +++ b/.github/workflows/R-CMD-check.yml @@ -15,7 +15,7 @@ jobs: runs-on: macOS-13 strategy: matrix: - r-version: ['4.4'] + r-version: 
['4.1'] steps: - uses: actions/checkout@v3 diff --git a/.github/workflows/test-coverage.yml b/.github/workflows/test-coverage.yml index 01ab743a..3d326351 100644 --- a/.github/workflows/test-coverage.yml +++ b/.github/workflows/test-coverage.yml @@ -13,7 +13,7 @@ jobs: runs-on: macOS-13 strategy: matrix: - r-version: ['4.4'] + r-version: ['4.1'] env: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} steps: From 99823d76a1892a1c04c11f4bc924a7c316ad9ed0 Mon Sep 17 00:00:00 2001 From: Dao McGill <77309217+daomcgill@users.noreply.github.com> Date: Tue, 12 Nov 2024 18:12:17 -1000 Subject: [PATCH 59/80] i #284 another R version change attempt --- .github/workflows/R-CMD-check.yml | 2 +- .github/workflows/test-coverage.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/R-CMD-check.yml b/.github/workflows/R-CMD-check.yml index 770e1334..2350bf81 100644 --- a/.github/workflows/R-CMD-check.yml +++ b/.github/workflows/R-CMD-check.yml @@ -15,7 +15,7 @@ jobs: runs-on: macOS-13 strategy: matrix: - r-version: ['4.1'] + r-version: ['4.2'] steps: - uses: actions/checkout@v3 diff --git a/.github/workflows/test-coverage.yml b/.github/workflows/test-coverage.yml index 3d326351..e70821a2 100644 --- a/.github/workflows/test-coverage.yml +++ b/.github/workflows/test-coverage.yml @@ -13,7 +13,7 @@ jobs: runs-on: macOS-13 strategy: matrix: - r-version: ['4.1'] + r-version: ['4.2'] env: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} steps: From 6cd5e117e5b8823d1ddab60c230b52aababba33f Mon Sep 17 00:00:00 2001 From: Dao McGill <77309217+daomcgill@users.noreply.github.com> Date: Tue, 12 Nov 2024 19:01:02 -1000 Subject: [PATCH 60/80] i #284 Version that was passing check --- .github/workflows/R-CMD-check.yml | 9 +++++---- .github/workflows/test-coverage.yml | 9 +++++---- man/commit_message_id_coverage.Rd | 2 +- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/.github/workflows/R-CMD-check.yml b/.github/workflows/R-CMD-check.yml index 2350bf81..a94258e3 
100644 --- a/.github/workflows/R-CMD-check.yml +++ b/.github/workflows/R-CMD-check.yml @@ -15,7 +15,7 @@ jobs: runs-on: macOS-13 strategy: matrix: - r-version: ['4.2'] + r-version: ['4.4'] steps: - uses: actions/checkout@v3 @@ -67,9 +67,10 @@ jobs: run: | brew tap homebrew/core brew install --HEAD universal-ctags - utags_head=$(ls /usr/local/Cellar/universal-ctags | tail -n 1) - sed -i -e "s|utags: \/usr\/local\/Cellar\/universal-ctags\/HEAD-62f0144\/bin\/ctags|utags: \/usr\/local\/Cellar\/universal-ctags\/${utags_head}\/bin\/ctags|g" tools.yml - sed -i -e "s|utags: \/usr\/local\/Cellar\/universal-ctags\/HEAD-62f0144\/bin\/ctags|utags: \/usr\/local\/Cellar\/universal-ctags\/${utags_head}\/bin\/ctags|g" tests/testthat/testdata/tools.yml + brew_prefix=$(brew --prefix) + utags_head=$(ls $brew_prefix/Cellar/universal-ctags | tail -n 1) + sed -i -e "s|utags: .*|utags: $brew_prefix/Cellar/universal-ctags/${utags_head}/bin/ctags|g" tools.yml + sed -i -e "s|utags: .*|utags: $brew_prefix/Cellar/universal-ctags/${utags_head}/bin/ctags|g" tests/testthat/testdata/tools.yml shell: bash - name: Check diff --git a/.github/workflows/test-coverage.yml b/.github/workflows/test-coverage.yml index e70821a2..24273170 100644 --- a/.github/workflows/test-coverage.yml +++ b/.github/workflows/test-coverage.yml @@ -13,7 +13,7 @@ jobs: runs-on: macOS-13 strategy: matrix: - r-version: ['4.2'] + r-version: ['4.4'] env: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} steps: @@ -59,9 +59,10 @@ jobs: run: | brew tap homebrew/core brew install --HEAD universal-ctags - utags_head=$(ls /usr/local/Cellar/universal-ctags | tail -n 1) - sed -i -e "s|utags: \/usr\/local\/Cellar\/universal-ctags\/HEAD-62f0144\/bin\/ctags|utags: \/usr\/local\/Cellar\/universal-ctags\/${utags_head}\/bin\/ctags|g" tools.yml - sed -i -e "s|utags: \/usr\/local\/Cellar\/universal-ctags\/HEAD-62f0144\/bin\/ctags|utags: \/usr\/local\/Cellar\/universal-ctags\/${utags_head}\/bin\/ctags|g" tests/testthat/testdata/tools.yml + 
brew_prefix=$(brew --prefix) + utags_head=$(ls $brew_prefix/Cellar/universal-ctags | tail -n 1) + sed -i -e "s|utags: .*|utags: $brew_prefix/Cellar/universal-ctags/${utags_head}/bin/ctags|g" tools.yml + sed -i -e "s|utags: .*|utags: $brew_prefix/Cellar/universal-ctags/${utags_head}/bin/ctags|g" tests/testthat/testdata/tools.yml shell: bash - name: Test coverage diff --git a/man/commit_message_id_coverage.Rd b/man/commit_message_id_coverage.Rd index 68fad761..e7f0c6ef 100644 --- a/man/commit_message_id_coverage.Rd +++ b/man/commit_message_id_coverage.Rd @@ -22,9 +22,9 @@ Calculates the number of commits from the git log which contains the message id. \code{\link{parse_gitlog}} to obtain additions and deletions from gitlog Other {metrics}: +\code{\link{metric_churn}()}, \code{\link{metric_churn_per_commit_interval}()}, \code{\link{metric_churn_per_commit_per_file}()}, -\code{\link{metric_churn}()}, \code{\link{metric_file_bug_churn}()}, \code{\link{metric_file_bug_frequency}()}, \code{\link{metric_file_churn}()}, From 132355dc18db1e8c688a665d4bc388a6668acaa3 Mon Sep 17 00:00:00 2001 From: Dao McGill <77309217+daomcgill@users.noreply.github.com> Date: Sat, 30 Nov 2024 10:10:17 -1000 Subject: [PATCH 61/80] i #295 Small changes from updated config Signed-off-by: Dao McGill --- R/mail.R | 6 +++--- conf/helix.yml | 8 ++++---- vignettes/download_mail.Rmd | 4 ++-- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/R/mail.R b/R/mail.R index 8e8b7620..afb0f656 100644 --- a/R/mail.R +++ b/R/mail.R @@ -17,8 +17,8 @@ #' #' The downloaded .mbox files are saved in the specified folder following the naming convention kaiaulu_YYYYMM.mbox. #' The function only downloads files that fall between the specified start_year_month and end_year_month. -#' When both formats fail to download, the function issues a warning indimessageing the missing month. -#' At the end, the function summarizes the downloads, indimessageing the range of dates present and any missing months. 
+#' When both formats fail to download, the function issues a warning indicating the missing month. +#' At the end, the function summarizes the downloads, indicating the range of dates present and any missing months. #' #' @param mailing_list The name of the mailing list being downloaded e.g. "https://mta.openssl.org/pipermail/openssl-announce/" #' @param start_year_month The year and month of the first file to be downloaded format: 'YYYYMM' @@ -369,7 +369,7 @@ process_gz_to_mbox_in_folder <- function(save_folder_path, verbose = TRUE) { #' The function loops through each month in the range specified by `start_year_month` and `end_year_month`, #' and constructs the appropriate URL to download each month's data. If any download fails, an warning is issued for the failed months. #' This means the file could not be found and that month's data may not exist. -#' At the end, the function summarizes the downloads, indimessageing the range of dates present and any missing months. +#' At the end, the function summarizes the downloads, indicating the range of dates present and any missing months. #' #' @param mailing_list The URL of the Apache Pony Mail list from which mbox files are to be downloaded #' (e.g., "https://lists.apache.org/list.html?announce@apache.org"). diff --git a/conf/helix.yml b/conf/helix.yml index 69b337b5..8f6b60e6 100644 --- a/conf/helix.yml +++ b/conf/helix.yml @@ -55,24 +55,24 @@ mailing_list: mailing_list: https://lists.apache.org/list.html?announce@apache.org save_folder_path: ../../rawdata/helix/mod_mbox/save_mbox_mail # mbox_file_path is for use only with parse_mbox() function. 
It is the file to parse - mbox_file_path: ../../rawdata/helix/mod_mbox/save_mbox_mail/kaiaulu.mbox + mbox_file_path: ../../rawdata/helix/mod_mbox/save_mbox_mail/kaiaulu_202407.mbox project_key_2: mailing_list: https://lists.apache.org/list.html?dev@felix.apache.org save_folder_path: ../../helix/kaiaulu/mod_mbox/save_mbox_mail_2 # mbox_file_path is for use only with parse_mbox() function. It is the file to parse - mbox_file_path: ../../helix/kaiaulu/mod_mbox/save_mbox_mail_2/kaiaulu.mbox + mbox_file_path: ../../helix/kaiaulu/mod_mbox/save_mbox_mail_2/kaiaulu_202407.mbox # If project uses Pipermail pipermail: project_key_1: mailing_list: https://mta.openssl.org/pipermail/openssl-users/ save_folder_path: ../../rawdata/helix/pipermail/save_mbox_mail # mbox_file_path is for use only with parse_mbox() function. It is the file to parse - mbox_file_path: ../../rawdata/helix/pipermail/save_mbox_mail/kaiaulu.mbox + mbox_file_path: ../../rawdata/helix/pipermail/save_mbox_mail/kaiaulu_202407.mbox project_key_2: mailing_list: https://mta.openssl.org/pipermail/openssl-project/ save_folder_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail_2/ # mbox_file_path is for use only with parse_mbox() function. 
It is the file to parse - mbox_file_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail_2/kaiaulu.mbox + mbox_file_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail_2/kaiaulu_202407.mbox issue_tracker: jira: diff --git a/vignettes/download_mail.Rmd b/vignettes/download_mail.Rmd index bfee954a..367b7c5f 100644 --- a/vignettes/download_mail.Rmd +++ b/vignettes/download_mail.Rmd @@ -129,10 +129,10 @@ In addition to the mailing list configurations, you need to specify the path to Now, you can load the configurations in your R script or notebook using the following code: -```{r} +```{r eval=FALSE} # Load tools configuration tools <- parse_config("../tools.yml") -parse_perceval_path <- get_tool("perceval", tools) +parse_perceval_path <- get_tool_project("perceval", tools) # Load project configuration conf <- parse_config("../conf/helix.yml") From dceded0dec9004c148e30ca93b179a9cfa6be37a Mon Sep 17 00:00:00 2001 From: Dao McGill <77309217+daomcgill@users.noreply.github.com> Date: Sun, 1 Dec 2024 13:40:36 -1000 Subject: [PATCH 62/80] i #284 Updates to Mail Notebook Signed-off-by: Dao McGill --- conf/helix.yml | 6 +- vignettes/download_mail.Rmd | 192 +++++++++++++----------------------- 2 files changed, 71 insertions(+), 127 deletions(-) diff --git a/conf/helix.yml b/conf/helix.yml index 8f6b60e6..ce1e48fd 100644 --- a/conf/helix.yml +++ b/conf/helix.yml @@ -58,7 +58,7 @@ mailing_list: mbox_file_path: ../../rawdata/helix/mod_mbox/save_mbox_mail/kaiaulu_202407.mbox project_key_2: mailing_list: https://lists.apache.org/list.html?dev@felix.apache.org - save_folder_path: ../../helix/kaiaulu/mod_mbox/save_mbox_mail_2 + save_folder_path: ../../helix/mod_mbox/save_mbox_mail_2 # mbox_file_path is for use only with parse_mbox() function. 
It is the file to parse mbox_file_path: ../../helix/kaiaulu/mod_mbox/save_mbox_mail_2/kaiaulu_202407.mbox # If project uses Pipermail @@ -70,9 +70,9 @@ mailing_list: mbox_file_path: ../../rawdata/helix/pipermail/save_mbox_mail/kaiaulu_202407.mbox project_key_2: mailing_list: https://mta.openssl.org/pipermail/openssl-project/ - save_folder_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail_2/ + save_folder_path: ../../rawdata/helix/pipermail/save_mbox_mail_2/ # mbox_file_path is for use only with parse_mbox() function. It is the file to parse - mbox_file_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail_2/kaiaulu_202407.mbox + mbox_file_path: ../../rawdata/helix/pipermail/save_mbox_mail_2/kaiaulu_202407.mbox issue_tracker: jira: diff --git a/vignettes/download_mail.Rmd b/vignettes/download_mail.Rmd index 367b7c5f..894657bc 100644 --- a/vignettes/download_mail.Rmd +++ b/vignettes/download_mail.Rmd @@ -58,71 +58,32 @@ Because project lifetime can go as far as a few decades, to have the full pictur The information you need to find out for each open source project is documented in Kaiaulu using a project configuration file format. For pipermail and mod_mbox this is as follows: ``` -# top-level key for mailing list config mailing_list: # for pipermail pipermail: project_key_1: mailing_list: https://mta.openssl.org/pipermail/openssl-users/ - start_year_month: 202310 - end_year_month: 202405 - save_folder_path: ../../rawdata/kaiaulu/mod_mbox/save_mbox_mail/ + save_folder_path: ../../rawdata/helix/pipermail/save_mbox_mail # mbox_file_path is for use only with parse_mbox() function. 
It is the file to parse - mbox_file_path: ../../rawdata/kaiaulu/mod_mbox/save_mbox_mail/kaiaulu.mbox + mbox_file_path: ../../rawdata/helix/pipermail/save_mbox_mail/kaiaulu_202407.mbox # for mod mbox mod_mbox: - apache_announce: - mailing_list: https://lists.apache.org/list.html?announce@apache.org - start_year_month: 202310 - end_year_month: 202405 - save_folder_path: ../../rawdata/kaiaulu/mod_mbox/save_mbox_mail/ - # mbox_file_path is for use only with parse_mbox() function. It is the file to parse - mbox_file_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail_2/kaiaulu.mbox + project_key_1: + mailing_list: https://lists.apache.org/list.html?announce@apache.org + save_folder_path: ../../rawdata/helix/mod_mbox/save_mbox_mail + # mbox_file_path is for use only with parse_mbox() function. It is the file to parse + mbox_file_path: ../../rawdata/helix/mod_mbox/save_mbox_mail/kaiaulu_202407.mbox ``` The most time intensive step you will be required is to locate the URL of the mailing list archive you wish for in the project website. This is specified under `mailing_list`. Note for pipermail this URL should point to the page containing links to the monthly archives (e.g. https://mta.openssl.org/pipermail/openssl-users/), not the top-level mailing list page that contains all the different types of archives (e.g. https://mta.openssl.org/mailman/listinfo/). - Note: It is important that the paths specified in save_folder_path and mbox_file_path are accurate and do not conflict between projects. By organizing the configuration in this way, you can manage multiple projects and mailing lists easily. The notebook reads these parameters and uses them to download and process the archives. Regardless of which mail archive you choose, the downloaders will store the mail data in monthly files, in a `.mbox` format. This is a simple text file that contains some markings to identify the header of the e-mail containing title, authors, etc. 
You can open any of the .mbox downloaded files with any text editor. -## Pipermail Configuration - -For Pipermail, we need to specify the project key, which is used to retrieve the configuration parameters for the specific project. The project key is used to identify the project in the configuration file. - -Now, we can use the getter functions to retrieve the configuration parameters for the specified project key. - -```{r eval=FALSE} -conf <- parse_config("../conf/helix.yml") -mailing_list <- get_pipermail_domain(conf, "project_key_1") -start_year_month <- 202310 -end_year_month <- 202405 -save_folder_path <- get_pipermail_path(conf, "project_key_1") -``` - -Note that the date range is not set with a getter. The range for downloads changes often, and should be set manually using the YYYYMM format. - -## Mbox Configuration - -Similarly to Pipermail, we need to specify the project key for Mod Mbox. The project key is used to retrieve the configuration parameters for the specific project. - -Use the getters to extract the parameters: - -```{r eval=FALSE} -conf <- parse_config("../conf/helix.yml") -mailing_list <- get_mbox_domain(conf, "project_key_1") -start_year_month <- 202310 -end_year_month <- 202405 -save_folder_path <- get_mbox_path(conf, "project_key_1") -``` - - -The `start_year_month` and `end_year_month` time range parameters should be set manually, as with pipermail. - ## Tools Configuration In addition to the mailing list configurations, you need to specify the path to the [Perceval](https://github.com/chaoss/grimoirelab-perceval) binary in tools.yml. See the wiki for further details on how to setup third party tools. 
@@ -147,15 +108,31 @@ With the configurations loaded, we can proceed to download the mailing list arch ### Pipermail Downloader -The download_pipermail() function downloads Pipermail archives from a specified mailing list within a given date range: +For Pipermail, we need to specify the project key, which is used to retrieve the configuration parameters for the specific project. The project key is used to identify the project in the configuration file. + +Now, we can use the getter functions to retrieve the configuration parameters for the specified project key. + +```{r eval=FALSE} +conf <- parse_config("../conf/helix.yml") +pipermail_mailing_list <- get_pipermail_domain(conf, "project_key_1") +pipermail_save_folder_path <- get_pipermail_path(conf, "project_key_1") + +# Define the date range +pipermail_start_year_month <- 202310 +pipermail_end_year_month <- 202405 +``` + +Note that the date range is not set with a getter. The range for downloads changes often, and should be set manually using the YYYYMM format. + +With our configurations loaded, we can proceed to downloading the mailing list archives. ```{r eval=FALSE} # Download archives download_pipermail( - mailing_list = mailing_list, - start_year_month = start_year_month, - end_year_month = end_year_month, - save_folder_path = save_folder_path, + mailing_list = pipermail_mailing_list, + start_year_month = pipermail_start_year_month, + end_year_month = pipermail_end_year_month, + save_folder_path = pipermail_save_folder_path, verbose = TRUE ) @@ -167,14 +144,30 @@ After running this function, the .mbox files will be saved in the specified dire The download_mod_mbox() function downloads Mod Mbox archives from a specified Apache Pony Mail mailing list over a given date range. The download_mod_mbox() function downloads Mod Mbox archives by constructing URLs based on the mailing list and date range, saving them as .mbox files named kaiaulu_YYYYMM.mbox. 
-#### Example Usage +Similarly to Pipermail, we need to specify the project key for Mod Mbox. The project key is used to retrieve the configuration parameters for the specific project. + +Use the getters to extract the parameters: + +```{r eval=FALSE} +conf <- parse_config("../conf/helix.yml") +mbox_mailing_list <- get_mbox_domain(conf, "project_key_1") +mbox_save_folder_path <- get_mbox_path(conf, "project_key_1") + +# Define the date range +mbox_start_year_month <- 202310 +mbox_end_year_month <- 202405 +``` + + +The `start_year_month` and `end_year_month` time range parameters should be set manually, as with Pipermail. + ```{r eval=FALSE} download_mod_mbox( - mailing_list = mailing_list, - start_year_month = start_year_month, - end_year_month = end_year_month, - save_folder_path = save_folder_path, + mailing_list = mbox_mailing_list, + start_year_month = mbox_start_year_month, + end_year_month = mbox_end_year_month, + save_folder_path = mbox_save_folder_path, verbose = TRUE ) @@ -185,27 +178,25 @@ and saves the files in the specified folder. ## Refreshers -Over time, new messages are added to mailing lists. It's important to keep your local archives up-to-date to ensure that your analysis includes the latest communications. The refreshers are functions designed to update your existing archives efficiently. - -Mailing lists are dynamic, with new emails being added regularly. If you're conducting ongoing analysis or need the most recent data, it's important to refresh your downloaded archives. Manually redownloading all archives can be time-consuming and inefficient. The refresher functions automate this process by updating only the necessary parts of your archives, saving time and ensuring data completeness. +Mailing lists are dynamic, with new emails being added regularly. If you're conducting ongoing analysis or need the most recent data, it's important to refresh your downloaded archives. 
Manually re-downloading all archives can be time-consuming and inefficient. The refresher functions automate this process by updating only the necessary parts of your archives, saving time and ensuring data completeness. These functions will update your archives by downloading new messages without re-downloading all existing data. ### Pipermail Refresher -In some cases, you may want to refresh the archive to ensure the most recent months are up-to-date or to handle updates to the mailing list. The refresh_pipermail() function helps automate this process. +The refresh_pipermail function is designed to keep your local archives up-to-date with the latest messages from the mailing list. Here's how it works: + +First, it checks the save_folder_path to see if there are any existing files. If the folder is empty, it means you have not downloaded any archives yet. In this case, the function will download all available archives from your specified start_year_month up to the current month, so that you have a complete dataset to work with. -How refresh_pipermail Works -1. Checks if the folder is empty: If the folder is empty, it downloads archives starting from start_year_month to the current month using download_pipermail(). -2. Finds the most recent file: If the folder is not empty, the function checks for the most recent month’s file (based on the filename) and deletes it. -3. Redownloads from the most recent month: The function then redownloads the archive from the most recent month up to the current month. +If there are already files in the directory, the function takes a smart approach to updating them. It identifies the most recent archive file based on the filenames (which include the date, like kaiaulu_202311.mbox). It then deletes this most recent file because new messages might have been added to that month since your last download. After deleting it, the function re-downloads this file along with any newer archives that have been added to the mailing list. 
This way, you don't have to re-download all the archives. + +By operating on the directory specified in save_folder_path, the refresh_pipermail function efficiently updates all relevant files, keeping your local archives current without unnecessary downloads. -#### Example Usage ```{r eval=FALSE} # Refresh archives refresh_pipermail( - mailing_list = mailing_list, - start_year_month = start_year_month, - save_folder_path = save_folder_path, + mailing_list = pipermail_mailing_list, + start_year_month = pipermail_start_year_month, + save_folder_path = pipermail_save_folder_path, verbose = TRUE ) @@ -215,41 +206,31 @@ This function will ensure that the most recent archives are always up-to-date by ### Mod Mbox Refresher -To refresh these archives to ensure that you have the latest messages, you can use the refresh_mod_mbox function. This function works similarly to the Pipermail refresher. - -How refresh_mod_mbox Works -1. Checks if the folder is empty and, if so, downloads the archives starting from start_year_month to the current month by calling download_mod_mbox(). -2. If the folder contains files, it identifies the most recent one using the YYYYMM found in the filename. This file is deleted, and then redownloaded along with all future months. - -#### Example Usage +The behavior is similar to the Pipermail refresher, and makes sure that your Mod Mbox archives are up-to-date. ```{r eval=FALSE} refresh_mod_mbox( - mailing_list = mailing_list, - start_year_month = start_year_month, - save_folder_path= save_folder_path, + mailing_list = mbox_mailing_list, + start_year_month = mbox_start_year_month, + save_folder_path= mbox_save_folder_path, verbose = TRUE ) ``` -This ensures your archive is up-to-date, accounting for new data that may have been added to the mailing list since the last download. # Parsers After downloading the mailing list archives as .mbox files, the next step is to parse these files to extract meaningful information for analysis. 
The parse_mbox() function utilizes the Perceval library to parse .mbox files and convert them into structured data tables. This enables easier manipulation and analysis of mailing list data. -## Mbox Parser +## How parse_mbox() Works -After downloading the mailing list archives as .mbox files, the next step is to parse these files to extract meaningful information for analysis. The parse_mbox() function utilizes the Perceval library to parse .mbox files and convert them into structured data tables. This enables easier manipulation and analysis of mailing list data. +The parse_mbox function makes it easy to transform .mbox files into structured data that you can analyze. -### How parse_mbox() Works -- Perceval Integration: Interfaces with the Perceval library to parse the .mbox file. -- Flexible Parsing: Handles variations in .mbox file structures, which may have inconsistent fields due to different email headers. -- Data Extraction: Extracts key information such as email content, sender, recipients, dates, and threading information. -- Consistent Column Naming: Ensures that columns of interest are consistently renamed for clarity, even if the raw data varies. +The function uses the Perceval library to process .mbox files. Mailing list archives often have variations in their structure—different email headers, missing fields, or inconsistent formats. The parser is designed to handle these variations, so you do not have to worry about cleaning up the data. +As it processes the files, the parser extracts key details from each email, such as the content, sender, recipients and dates. These elements are crucial for understanding communication patterns and building insights. -### Example Usage +Finally, to keep things consistent, the function standardizes the column names in the output. Even if the raw data varies from one archive to another, the resulting table will always have predictable and labeled columns, making it easy to work with. 
```{r eval=FALSE} parsed_mail <- parse_mbox( @@ -269,40 +250,3 @@ parsed_mail %>% ``` Note: Displaying the entire dataset may not be practical if it's large. Showing a sample provides a glimpse of the structure. - -## Retrieve the Latest Mbox File - -We can use the parse_mbox_latest_date() function to identify the most recent .mbox file in the specified folder. This can be useful when you want to automate the parsing of the latest data without manually specifying the file name. - -First, make sure that the save_folder_path is correctly set to the directory where your .mbox files are stored. - -This will output the name of the latest .mbox file based on the YYYYMM pattern in the filename. -We can use this to update mbox_file_path to point to the latest file, and call the parse_mbox() function to parse the latest data. - -### Example Usage - -```{r eval=FALSE} -# Update mbox_file_path to use the latest file -mbox_file_path <- file.path(save_folder_path, latest_mbox_file) -print(mbox_file_path) -``` - -To parse this file: - -```{r eval=FALSE} -# Parse the latest mbox file -parsed_mail <- parse_mbox( - perceval_path = parse_perceval_path, - mbox_file_path = mbox_file_path -) -``` - -Now, parsed_mail contains the parsed data from the latest .mbox file. 
- -```{r eval=FALSE} -# Display the first 10 rows of parsed_mail using gt -# Refer to the gt documentation for more options on displaying tables -parsed_mail %>% - head(10) %>% - gt() -``` From 5515d7c10c06b0243b472190ea6aefa001233058 Mon Sep 17 00:00:00 2001 From: Dao McGill <77309217+daomcgill@users.noreply.github.com> Date: Thu, 5 Dec 2024 16:56:25 -1000 Subject: [PATCH 63/80] i #284 Update Mailing List Exec to use "refresh" Signed-off-by: Dao McGill --- exec/mailinglist.R | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/exec/mailinglist.R b/exec/mailinglist.R index bbcc357d..0d95f1c8 100755 --- a/exec/mailinglist.R +++ b/exec/mailinglist.R @@ -16,10 +16,10 @@ doc <- " USAGE: mailinglist.R parse help mailinglist.R parse - mailinglist.R download modmbox help - mailinglist.R download modmbox - mailinglist.R download pipermail help - mailinglist.R download pipermail + mailinglist.R refresh modmbox help + mailinglist.R refresh modmbox + mailinglist.R refresh pipermail help + mailinglist.R refresh pipermail mailinglist.R (-h | --help) mailinglist.R --version @@ -58,10 +58,10 @@ if (arguments[["parse"]] & arguments[["help"]]) { data.table::fwrite(parsed_mbox, save_path) cli::cli_alert_success(paste0("Parsed mbox file was saved at: ", save_path)) -} else if (arguments[["download"]] & arguments[["modmbox"]] & arguments[["help"]]) { - cli::cli_alert_info("Downloads mailing list archives from mod_mbox using download_mod_mbox().") +} else if (arguments[["refresh"]] & arguments[["modmbox"]] & arguments[["help"]]) { + cli::cli_alert_info("Refreshes mailing list archives from mod_mbox using refresh_mod_mbox().") -} else if (arguments[["download"]] & arguments[["modmbox"]]) { +} else if (arguments[["refresh"]] & arguments[["modmbox"]]) { conf_path <- arguments[[""]] project_key <- arguments[[""]] @@ -78,11 +78,11 @@ if (arguments[["parse"]] & arguments[["help"]]) { verbose = TRUE ) - cli::cli_alert_success(paste0("Downloaded mailing 
list archives were saved at: ", save_folder_path)) + cli::cli_alert_success(paste0("Refreshed mailing list archives were saved at: ", save_folder_path)) -} else if (arguments[["download"]] & arguments[["pipermail"]] & arguments[["help"]]) { - cli::cli_alert_info("Downloads mailing list archives from pipermail using download_pipermail().") -} else if (arguments[["download"]] & arguments[["pipermail"]]) { +} else if (arguments[["refresh"]] & arguments[["pipermail"]] & arguments[["help"]]) { + cli::cli_alert_info("Refreshes mailing list archives from pipermail using refresh_pipermail().") +} else if (arguments[["refresh"]] & arguments[["pipermail"]]) { conf_path <- arguments[[""]] project_key <- arguments[[""]] @@ -99,7 +99,7 @@ if (arguments[["parse"]] & arguments[["help"]]) { verbose = TRUE ) - cli::cli_alert_success(paste0("Downloaded mailing list archives were saved at: ", save_folder_path)) + cli::cli_alert_success(paste0("Refreshed mailing list archives were saved at: ", save_folder_path)) } else if (arguments[["-h"]] || arguments[["--help"]]) { cli::cli_alert_info(doc) From a89b983a66ed31dc07d88b4c0b6254f98f7c0afb Mon Sep 17 00:00:00 2001 From: Carlos Paradis Date: Sat, 7 Dec 2024 16:39:50 -0800 Subject: [PATCH 64/80] Reverse github actions to match master Signed-off-by: Carlos Paradis --- .github/workflows/R-CMD-check.yml | 7 +++---- .github/workflows/test-coverage.yml | 7 +++---- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/.github/workflows/R-CMD-check.yml b/.github/workflows/R-CMD-check.yml index a94258e3..84ce2ac6 100644 --- a/.github/workflows/R-CMD-check.yml +++ b/.github/workflows/R-CMD-check.yml @@ -67,10 +67,9 @@ jobs: run: | brew tap homebrew/core brew install --HEAD universal-ctags - brew_prefix=$(brew --prefix) - utags_head=$(ls $brew_prefix/Cellar/universal-ctags | tail -n 1) - sed -i -e "s|utags: .*|utags: $brew_prefix/Cellar/universal-ctags/${utags_head}/bin/ctags|g" tools.yml - sed -i -e "s|utags: .*|utags: 
$brew_prefix/Cellar/universal-ctags/${utags_head}/bin/ctags|g" tests/testthat/testdata/tools.yml + utags_head=$(ls /usr/local/Cellar/universal-ctags | tail -n 1) + sed -i -e "s|utags: \/usr\/local\/Cellar\/universal-ctags\/HEAD-62f0144\/bin\/ctags|utags: \/usr\/local\/Cellar\/universal-ctags\/${utags_head}\/bin\/ctags|g" tools.yml + sed -i -e "s|utags: \/usr\/local\/Cellar\/universal-ctags\/HEAD-62f0144\/bin\/ctags|utags: \/usr\/local\/Cellar\/universal-ctags\/${utags_head}\/bin\/ctags|g" tests/testthat/testdata/tools.yml shell: bash - name: Check diff --git a/.github/workflows/test-coverage.yml b/.github/workflows/test-coverage.yml index 24273170..01ab743a 100644 --- a/.github/workflows/test-coverage.yml +++ b/.github/workflows/test-coverage.yml @@ -59,10 +59,9 @@ jobs: run: | brew tap homebrew/core brew install --HEAD universal-ctags - brew_prefix=$(brew --prefix) - utags_head=$(ls $brew_prefix/Cellar/universal-ctags | tail -n 1) - sed -i -e "s|utags: .*|utags: $brew_prefix/Cellar/universal-ctags/${utags_head}/bin/ctags|g" tools.yml - sed -i -e "s|utags: .*|utags: $brew_prefix/Cellar/universal-ctags/${utags_head}/bin/ctags|g" tests/testthat/testdata/tools.yml + utags_head=$(ls /usr/local/Cellar/universal-ctags | tail -n 1) + sed -i -e "s|utags: \/usr\/local\/Cellar\/universal-ctags\/HEAD-62f0144\/bin\/ctags|utags: \/usr\/local\/Cellar\/universal-ctags\/${utags_head}\/bin\/ctags|g" tools.yml + sed -i -e "s|utags: \/usr\/local\/Cellar\/universal-ctags\/HEAD-62f0144\/bin\/ctags|utags: \/usr\/local\/Cellar\/universal-ctags\/${utags_head}\/bin\/ctags|g" tests/testthat/testdata/tools.yml shell: bash - name: Test coverage From 4aa2af2bc3e682239fd60bb7563636ac034df676 Mon Sep 17 00:00:00 2001 From: Carlos Paradis Date: Sat, 7 Dec 2024 16:41:17 -0800 Subject: [PATCH 65/80] Remove git.R print statements Signed-off-by: Carlos Paradis --- R/git.R | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/R/git.R b/R/git.R index 9d0474f1..e9e8029a 100644 --- 
a/R/git.R +++ b/R/git.R @@ -23,19 +23,10 @@ parse_gitlog <- function(perceval_path,git_repo_path,save_path=NA,perl_regex=NA) git_uri <- git_repo_path save_path <- ifelse(!is.na(save_path),path.expand(save_path),NA) - # DEBUG - print(paste("Perceval path:", perceval_path)) - print(paste("Git repo path:", git_repo_path)) - print(paste("Save path:", save_path)) - print(paste("Perl regex:", perl_regex)) - # Use percerval to parse .git --json line is required to be parsed by jsonlite::fromJSON. # The log will be saved to the /tmp/ folder gitlog_path <- "/tmp/gitlog.log" - # DEBUG - print(paste("Gitlog path:", gitlog_path)) - # Perceval suggested flags perceval_flags <- c( @@ -71,30 +62,18 @@ parse_gitlog <- function(perceval_path,git_repo_path,save_path=NA,perl_regex=NA) } } - # DEBUG - print("Git log call message:") - print(gitlog_call_message) - # Parsed JSON output. perceval_output <- system2(perceval_path, args = c('git', '--git-log',gitlog_path,git_uri,'--json-line'), stdout = TRUE, stderr = FALSE) - # DEBUG - print("Perceval Output:") - cat(perceval_output, sep = "\n") - perceval_parsed <- data.table(jsonlite::stream_in(textConnection(perceval_output),verbose = FALSE)) if(nrow(perceval_parsed) == 0){ stop("The repository specified has no commits.") } - # DEBUG - print("Parsed data structure:") - print(str(perceval_parsed)) - # APR very first commit is a weird single case of commit without files. We filter them here. 
is_commit_with_files <- !!sapply(perceval_parsed$data.files,length) perceval_parsed <- perceval_parsed[is_commit_with_files] From a3665730c2fbc89392d1d2c2fc6f9db259442bb3 Mon Sep 17 00:00:00 2001 From: Carlos Paradis Date: Sat, 7 Dec 2024 16:47:30 -0800 Subject: [PATCH 66/80] Remove prints from mail tests Signed-off-by: Carlos Paradis --- tests/testthat/test-mail.R | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/tests/testthat/test-mail.R b/tests/testthat/test-mail.R index bf190130..fc115566 100644 --- a/tests/testthat/test-mail.R +++ b/tests/testthat/test-mail.R @@ -21,26 +21,14 @@ test_that("Calling parse_mbox with correct perceval and mbox path returns a data tool <- yaml::read_yaml(tools_path) perceval_path <- tool[["perceval"]] - # Debugging output - print("Debugging parse_mbox:") - print(paste("Tools path:", tools_path)) - print(paste("Perceval path:", perceval_path)) - mbox_path <- example_mailing_list_two_threads( folder_path = "/tmp", folder_name = "example_two_threads_mailing_list", file_name = "two_thread_mailing_list" ) - # Debugging output - print(paste("Generated Mbox path:", mbox_path)) - result <- parse_mbox(perceval_path, mbox_path) - # Debugging output - print("Result of parse_mbox:") - print(head(result)) - io_delete_folder(folder_path = "/tmp", folder_name = "example_two_threads_mailing_list") expect_equal(result[reply_from == "John Doe "]$reply_subject, "Subject 1") From 28872329b24bdb2023184e6b6ff3d4427bc0c094 Mon Sep 17 00:00:00 2001 From: Carlos Paradis Date: Sat, 7 Dec 2024 16:49:01 -0800 Subject: [PATCH 67/80] Remove prefix underline Notebooks should be now functional in master, so no longer needed to keep them disabled from pkgdown. 
Signed-off-by: Carlos Paradis --- vignettes/{_motif_analysis.Rmd => motif_analysis.Rmd} | 0 ...ommunication_showcase.Rmd => reply_communication_showcase.Rmd} | 0 .../{_social_smell_showcase.Rmd => social_smell_showcase.Rmd} | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename vignettes/{_motif_analysis.Rmd => motif_analysis.Rmd} (100%) rename vignettes/{_reply_communication_showcase.Rmd => reply_communication_showcase.Rmd} (100%) rename vignettes/{_social_smell_showcase.Rmd => social_smell_showcase.Rmd} (100%) diff --git a/vignettes/_motif_analysis.Rmd b/vignettes/motif_analysis.Rmd similarity index 100% rename from vignettes/_motif_analysis.Rmd rename to vignettes/motif_analysis.Rmd diff --git a/vignettes/_reply_communication_showcase.Rmd b/vignettes/reply_communication_showcase.Rmd similarity index 100% rename from vignettes/_reply_communication_showcase.Rmd rename to vignettes/reply_communication_showcase.Rmd diff --git a/vignettes/_social_smell_showcase.Rmd b/vignettes/social_smell_showcase.Rmd similarity index 100% rename from vignettes/_social_smell_showcase.Rmd rename to vignettes/social_smell_showcase.Rmd From 800fccc0a2837523e8f62e017d86efe1e7945429 Mon Sep 17 00:00:00 2001 From: Carlos Paradis Date: Sat, 7 Dec 2024 16:55:54 -0800 Subject: [PATCH 68/80] Remove additional git prints Signed-off-by: Carlos Paradis --- tests/testthat/test-git.R | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/tests/testthat/test-git.R b/tests/testthat/test-git.R index 740bb6ad..ad3890b2 100644 --- a/tests/testthat/test-git.R +++ b/tests/testthat/test-git.R @@ -31,22 +31,10 @@ test_that("Calling parse_gitlog with correct perceval and correct git log path r tool <- yaml::read_yaml(tools_path) perceval_path <- tool[["perceval"]] - # Debugging output - print("Debugging parse_gitlog:") - print(paste("Tools path:", tools_path)) - print(paste("Perceval path:", perceval_path)) - git_repo_path <- suppressWarnings(git_create_sample_log()) - # Debugging output - 
print(paste("Generated Git repo path:", git_repo_path)) - result <- parse_gitlog(perceval_path, git_repo_path) - # Debugging output - print("Result of parse_gitlog:") - print(head(result)) - expect_is(result, "data.table") suppressWarnings(git_delete_sample_log(git_repo_path)) From 67de9f8cf89f58bb87e5e49e4aeb0d8db6b7f567 Mon Sep 17 00:00:00 2001 From: Carlos Paradis Date: Sat, 7 Dec 2024 16:56:12 -0800 Subject: [PATCH 69/80] Internal api functions should not be displayed Move to internal section of pkgdown so it is not displayed on docs to the user. Signed-off-by: Carlos Paradis --- _pkgdown.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/_pkgdown.yml b/_pkgdown.yml index aa4964d3..f7e4810f 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -99,9 +99,7 @@ reference: - refresh_pipermail - download_mod_mbox - refresh_mod_mbox - - process_gz_to_mbox_in_folder - parse_mbox - - parse_mbox_latest_date - make_mbox_reply - make_mbox_mailing_list - title: __JIRA__ @@ -202,11 +200,6 @@ reference: - is_same_identity - assign_exact_identity - identity_match -- title: __Interval__ - desc: Provides different types of interval windows (e.g. release) for metric functions. -- contents: - - interval_commit_metric - - get_date_from_commit_hash - title: __Metrics__ desc: > Various metrics used to estimate code quality, @@ -354,3 +347,10 @@ reference: - get_window_end_commit - get_window_size - get_window_start_commit +- title: __Interval__ + desc: Provides different types of interval windows (e.g. release) for metric functions. +- contents: + - interval_commit_metric + - get_date_from_commit_hash + - process_gz_to_mbox_in_folder + - parse_mbox_latest_date From ae1ba66e1c3bbb0c481d2f4a5d94a14b2ad87960 Mon Sep 17 00:00:00 2001 From: Carlos Paradis Date: Sat, 7 Dec 2024 17:39:48 -0800 Subject: [PATCH 70/80] Unit tests now pass locally The unit tests rely on a separate copy of thrift.yml.
That file was not updated, so the unit tests were throwing errors for not finding the field. Moreover, the unit tests did not have the new config path, so the parse_mbox() tests were failing. Signed-off-by: Carlos Paradis --- R/mail.R | 22 +---- man/commit_message_id_coverage.Rd | 2 +- man/download_mod_mbox.Rd | 2 +- man/download_pipermail.Rd | 4 +- man/parse_gitlog.Rd | 6 +- man/parse_mbox.Rd | 8 +- man/parse_mbox_latest_date.Rd | 4 +- tests/testthat/test-mail.R | 23 +++-- tests/testthat/testdata/thrift.yml | 132 ++++++++++++++++++++++++----- 9 files changed, 142 insertions(+), 61 deletions(-) diff --git a/R/mail.R b/R/mail.R index afb0f656..6e26f6e5 100644 --- a/R/mail.R +++ b/R/mail.R @@ -589,13 +589,6 @@ parse_mbox <- function(perceval_path, mbox_file_path) { mbox_dir <- dirname(mbox_file_path) # Extract directory path mbox_uri <- mbox_file_path # URI points to the mbox file - - - # Debugging - print(paste("Perceval path:", perceval_path)) - print(paste("Mbox file path:", mbox_file_path)) - print(paste("Mbox directory path:", mbox_dir)) - # Use Perceval to parse the mbox file perceval_output <- tryCatch({ system2(perceval_path, @@ -603,13 +596,11 @@ parse_mbox <- function(perceval_path, mbox_file_path) { stdout = TRUE, stderr = TRUE) }, error = function(e) { - print("Error running Perceval:") - print(e$message) + #print("Error running Perceval:") + #print(e$message) stop("Perceval execution failed.") }) - # Debugging Perceval output - print("Perceval Output:") cat(perceval_output, sep = "\n") @@ -633,10 +624,6 @@ parse_mbox <- function(perceval_path, mbox_file_path) { stop("JSON parsing failed.") }) - # Debugging parsed data - print("Parsed data structure:") - print(str(perceval_parsed)) - columns_of_interest <- c("data.Message.ID", "data.In.Reply.To", "data.Date", "data.From", "data.To", "data.Cc", "data.Subject", "data.body.plain", "data.body") columns_rename <- c("reply_id", "in_reply_to_id", "reply_datetimetz", "reply_from", "reply_to", "reply_cc", 
"reply_subject", "reply_body", "reply_body") @@ -650,11 +637,6 @@ parse_mbox <- function(perceval_path, mbox_file_path) { old = colnames(perceval_parsed), new = columns_rename[is_available_column]) - # Debugging final parsed data - print("Final parsed data:") - print(perceval_parsed) - - return(perceval_parsed) } diff --git a/man/commit_message_id_coverage.Rd b/man/commit_message_id_coverage.Rd index e7f0c6ef..68fad761 100644 --- a/man/commit_message_id_coverage.Rd +++ b/man/commit_message_id_coverage.Rd @@ -22,9 +22,9 @@ Calculates the number of commits from the git log which contains the message id. \code{\link{parse_gitlog}} to obtain additions and deletions from gitlog Other {metrics}: -\code{\link{metric_churn}()}, \code{\link{metric_churn_per_commit_interval}()}, \code{\link{metric_churn_per_commit_per_file}()}, +\code{\link{metric_churn}()}, \code{\link{metric_file_bug_churn}()}, \code{\link{metric_file_bug_frequency}()}, \code{\link{metric_file_churn}()}, diff --git a/man/download_mod_mbox.Rd b/man/download_mod_mbox.Rd index 33715d61..e1835761 100644 --- a/man/download_mod_mbox.Rd +++ b/man/download_mod_mbox.Rd @@ -36,5 +36,5 @@ of kaiaulu_YYYYMM.mbox. The function loops through each month in the range specified by `start_year_month` and `end_year_month`, and constructs the appropriate URL to download each month's data. If any download fails, an warning is issued for the failed months. This means the file could not be found and that month's data may not exist. -At the end, the function summarizes the downloads, indimessageing the range of dates present and any missing months. +At the end, the function summarizes the downloads, indicating the range of dates present and any missing months. } diff --git a/man/download_pipermail.Rd b/man/download_pipermail.Rd index e8c19b83..0244abbd 100644 --- a/man/download_pipermail.Rd +++ b/man/download_pipermail.Rd @@ -36,6 +36,6 @@ overwriting any existing file with the same name. 
The original .gz file is delet The downloaded .mbox files are saved in the specified folder following the naming convention kaiaulu_YYYYMM.mbox. The function only downloads files that fall between the specified start_year_month and end_year_month. -When both formats fail to download, the function issues a warning indimessageing the missing month. -At the end, the function summarizes the downloads, indimessageing the range of dates present and any missing months. +When both formats fail to download, the function issues a warning indicating the missing month. +At the end, the function summarizes the downloads, indicating the range of dates present and any missing months. } diff --git a/man/parse_gitlog.Rd b/man/parse_gitlog.Rd index 7d65786f..d4370808 100644 --- a/man/parse_gitlog.Rd +++ b/man/parse_gitlog.Rd @@ -23,16 +23,16 @@ Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, \code{\link{parse_bugzilla_rest_comments}()}, -\code{\link{parse_bugzilla_rest_issues}()}, \code{\link{parse_bugzilla_rest_issues_comments}()}, +\code{\link{parse_bugzilla_rest_issues}()}, \code{\link{parse_commit_message_id}()}, \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, -\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, -\code{\link{parse_mbox}()}, +\code{\link{parse_jira}()}, \code{\link{parse_mbox_latest_date}()}, +\code{\link{parse_mbox}()}, \code{\link{parse_nvdfeed}()} } \concept{parsers} diff --git a/man/parse_mbox.Rd b/man/parse_mbox.Rd index 349d009c..768ff760 100644 --- a/man/parse_mbox.Rd +++ b/man/parse_mbox.Rd @@ -23,15 +23,15 @@ Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, \code{\link{parse_bugzilla_rest_comments}()}, -\code{\link{parse_bugzilla_rest_issues}()}, 
\code{\link{parse_bugzilla_rest_issues_comments}()}, +\code{\link{parse_bugzilla_rest_issues}()}, \code{\link{parse_commit_message_id}()}, \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, \code{\link{parse_gitlog}()}, -\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, +\code{\link{parse_jira}()}, \code{\link{parse_mbox_latest_date}()}, \code{\link{parse_nvdfeed}()} @@ -39,15 +39,15 @@ Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, \code{\link{parse_bugzilla_rest_comments}()}, -\code{\link{parse_bugzilla_rest_issues}()}, \code{\link{parse_bugzilla_rest_issues_comments}()}, +\code{\link{parse_bugzilla_rest_issues}()}, \code{\link{parse_commit_message_id}()}, \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, \code{\link{parse_gitlog}()}, -\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, +\code{\link{parse_jira}()}, \code{\link{parse_mbox_latest_date}()}, \code{\link{parse_nvdfeed}()} } diff --git a/man/parse_mbox_latest_date.Rd b/man/parse_mbox_latest_date.Rd index eedf9633..486f35fd 100644 --- a/man/parse_mbox_latest_date.Rd +++ b/man/parse_mbox_latest_date.Rd @@ -21,15 +21,15 @@ Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, \code{\link{parse_bugzilla_rest_comments}()}, -\code{\link{parse_bugzilla_rest_issues}()}, \code{\link{parse_bugzilla_rest_issues_comments}()}, +\code{\link{parse_bugzilla_rest_issues}()}, \code{\link{parse_commit_message_id}()}, \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, \code{\link{parse_gitlog}()}, -\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, +\code{\link{parse_jira}()}, \code{\link{parse_mbox}()}, 
\code{\link{parse_nvdfeed}()} } diff --git a/tests/testthat/test-mail.R b/tests/testthat/test-mail.R index fc115566..700bcf30 100644 --- a/tests/testthat/test-mail.R +++ b/tests/testthat/test-mail.R @@ -2,24 +2,29 @@ tools_path <- test_path("testdata", "tools.yml") conf_path <- test_path("testdata", "thrift.yml") test_that("Incorrect perceval path fails parse_mbox", { - conf <- yaml::read_yaml(conf_path) - mbox_path <- conf[["mailing_list"]][["mbox"]] + + conf <- parse_config(conf_path) + key_1_name <- names(get_mbox_key_indexes(conf))[1] + mbox_path <- get_mbox_path(conf,key_1_name) + incorrect_perceval_path <- "/incorrect/path/to/perceval" - expect_error(parse_mbox(incorrect_perceval_path, mbox_path), "error in running command") + expect_error(parse_mbox(incorrect_perceval_path, mbox_path), "Perceval execution failed.") }) test_that("Incorrect mbox path to parse_mbox returns empty table", { - tool <- yaml::read_yaml(tools_path) - perceval_path <- tool[["perceval"]] + + tool <- parse_config(tools_path) + perceval_path <- get_tool_project("perceval",tool) perceval_path <- path.expand(perceval_path) incorrect_mbox_path <- "/incorrect/path/to/mbox" - output <- parse_mbox(perceval_path, incorrect_mbox_path) - expect_equal(nrow(output), 0) + expect_error(parse_mbox(perceval_path, incorrect_mbox_path), "No valid JSON lines found in Perceval output. 
Check the mbox file or Perceval configuration.") }) test_that("Calling parse_mbox with correct perceval and mbox path returns a data table with correct raw data", { tools_path <- file.path(tools_path) - tool <- yaml::read_yaml(tools_path) - perceval_path <- tool[["perceval"]] + + + tool <- parse_config(tools_path) + perceval_path <- get_tool_project("perceval",tool) mbox_path <- example_mailing_list_two_threads( folder_path = "/tmp", diff --git a/tests/testthat/testdata/thrift.yml b/tests/testthat/testdata/thrift.yml index f47062b2..87ea8278 100644 --- a/tests/testthat/testdata/thrift.yml +++ b/tests/testthat/testdata/thrift.yml @@ -34,7 +34,7 @@ project: version_control: # Where is the git log located locally? - log: ../../rawdata/git_repo/thrift/.git # cloned Apache Thrift repo and put path to its .git file + log: ../../rawdata/thrift/git_repo/.git # cloned Apache Thrift repo and put path to its .git file # From where the git log was downloaded? log_url: https://github.com/apache/thrift # List of branches used for analysis @@ -42,28 +42,66 @@ version_control: - master mailing_list: - # Where is the mbox located locally? - mbox: ../../rawdata/mbox/thrift-dev.mbox # Download here: https://cdn.lfdr.de/stmc/ieee_tse_data/mail/thrift-dev.mbox - # What is the domain of the chosen mailing list archive? - domain: http://mail-archives.apache.org/mod_mbox - # Which lists of the domain will be used? - list_key: - - thrift-dev + mod_mbox: + project_key_1: + mailing_list: http://mail-archives.apache.org/mod_mbox/thrift-dev + save_folder_path: ../../rawdata/thrift/mod_mbox/save_mbox_mail/ + # mbox_file_path is for use only with parse_mbox() function. It is the file to parse +# mbox_file_path: ../../rawdata/thrift/mod_mbox/save_mbox_mail/thrift.mbox + project_key_2: + mailing_list: http://mail-archives.apache.org/mod_mbox/thrift-user + save_folder_path: ../../rawdata/thrift/mod_mbox/save_mbox_mail_2/ +# # mbox_file_path is for use only with parse_mbox() function. 
It is the file to parse +# mbox_file_path: ../../rawdata/kaiaulu/mod_mbox/save_mbox_mail_2/kaiaulu.mbox +# pipermail: +# project_key_1: +# mailing_list: https://mta.openssl.org/pipermail/kaiaulu-dev/ +# save_folder_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail/ +# # mbox_file_path is for use only with parse_mbox() function. It is the file to parse +# mbox_file_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail/kaiaulu.mbox +# project_key_2: +# mailing_list: https://mta.openssl.org/pipermail/kaiaulu-users/ +# save_folder_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail_2/ +# # mbox_file_path is for use only with parse_mbox() function. It is the file to parse +# mbox_file_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail_2/kaiaulu.mbox issue_tracker: jira: - # Obtained from the project's JIRA URL - domain: https://issues.apache.org/jira - project_key: THRIFT - # Download using `download_jira_data.Rmd` - issues: ../../rawdata/issue_tracker/thrift_issues.json - issue_comments: ../../rawdata/issue_tracker/thrift_issue_comments.json + project_key_1: + # Obtained from the project's JIRA URL + domain: https://issues.apache.org/jira + project_key: THRIFT + # Download using `download_jira_data.Rmd` + issues: ../../rawdata/thrift/jira/issues/thrift/ + issue_comments: ../../rawdata/thrift/jira/issue_comments/thrift/ github: - # Obtained from the project's GitHub URL - owner: apache - repo: thrift - # Download using `download_github_comments.Rmd` - replies: ../../rawdata/github/thrift/ + project_key_1: + # Obtained from the project's GitHub URL + owner: apache + repo: thrift + # Download using `download_github_comments.Rmd` + issue_or_pr_comment: ../../rawdata/thrift/github/issue_or_pr_comment/apache_thrift/ + issue: ../../rawdata/thrift/github/issue/apache_thrift/ + issue_search: ../../rawdata/thrift/github/issue_search/apache_thrift/ + issue_event: ../../rawdata/thrift/github/issue_event/apache_thrift/ + pull_request: 
../../rawdata/thrift/github/pull_request/apache_thrift/ + commit: ../../rawdata/thrift/github/commit/apache_thrift/ +# project_key_2: +# # Obtained from the project's GitHub URL +# owner: ssunoo2 +# repo: kaiaulu +# # Download using `download_github_comments.Rmd` +# issue_or_pr_comment: ../../rawdata/kaiaulu/github/issue_or_pr_comment/ssunoo2_kaiaulu/ +# issue: ../../rawdata/kaiaulu/github/issue/ssunoo2_kaiaulu/ +# issue_search: ../../rawdata/kaiaulu/github/issue_search/ssunoo2_kaiaulu/ +# issue_event: ../../rawdata/kaiaulu/github/issue_event/ssunoo2_kaiaulu/ +# pull_request: ../../rawdata/kaiaulu/github/pull_request/ssunoo2_kaiaulu/ +# commit: ../../rawdata/kaiaulu/github/commit/ssunoo2_kaiaulu/ +# bugzilla: +# project_key_1: +# project_key: kaiaulu +# issues: ../../rawdata/kaiaulu/bugzilla/issues/kaiaulu/ +# issue_comments: ../../rawdata/kaiaulu/bugzilla/issue_comments/kaiaulu/ #vulnerabilities: # Folder path with nvd cve feeds (e.g. nvdcve-1.1-2018.json) @@ -113,6 +151,32 @@ tool: - Throw - Parameter - Contain + # dv8: + # # The project folder path to store various intermediate + # # files for DV8 Analysis + # # The folder name will be used in the file names. + # folder_path: ../../analysis/junit/dv8/ + # # the architectural flaws thresholds that should be used + # architectural_flaws: + # cliqueDepends: + # - call + # - use + # crossingCochange: 2 + # crossingFanIn: 4 + # crossingFanOut: 4 + # mvCochange: 2 + # uiCochange: 2 + # uihDepends: + # - call + # - use + # uihInheritance: + # - extend + # - implement + # - public + # - private + # - virtual + # uiHistoryImpact: 10 + # uiStructImpact: 0.01 # Uctags allows finer file-file dependency parsing (e.g. functions, classes, structs) uctags: # See https://github.com/sailuh/kaiaulu/wiki/Universal-Ctags for details @@ -132,6 +196,36 @@ tool: - f # functions r: - f # functions + # # srcML allow to parse src code as text (e.g. 
identifiers) + # srcml: + # # The file path to where you wish to store the srcml output of the project + # srcml_path: ../../analysis/junit5/srcml/srcml_junit.xml + # pattern4: + # # The file path to where you wish to store the classes of the pattern4 analysis + # class_folder_path: ../../rawdata/junit5/pattern4/classes/junit-platform-engine/build/classes/java/main/org/junit/platform/engine/ + # # The file path to where you wish to store the output of the pattern4 analysis + # output_filepath: ../../analysis/junit5/pattern4/ + # compile_note: > + # 1. Switch Java version to Java 17: + # https://stackoverflow.com/questions/69875335/macos-how-to-install-java-17 + # 2. Disable VPN to pull modules from Gradle Plugin Portal. + # 3. Use sudo ./gradlew build + # 4. After building, locate the engine class files and specify as the class_folder_path: + # in this case they are in: /path/to/junit5/analysis/junit-platform-engine/build/classes/java/main/org/junit/platform/engine/ + # understand: + # # Accepts one language at a time: ada, assembly, c/c++, c#, fortran, java, jovial, delphi/pascal, python, vhdl, basic, javascript + # code_language: java + # # Specify which types of Dependencies to keep + # keep_dependencies_type: + # - Import + # - Call + # - Create + # - Use + # - Type GenericArgument + # # Where the files to analyze should be stored + # project_path: ../../rawdata/kaiaulu/git_repo/understand/ + # # Where the output for the understands analysis is stored + # output_path: ../../analysis/kaiaulu/understand/ # Analysis Configuration # analysis: From 3697fe3700eb73b72748baf1229ef2fc4bfb152c Mon Sep 17 00:00:00 2001 From: Carlos Paradis Date: Sat, 7 Dec 2024 18:42:35 -0800 Subject: [PATCH 71/80] Remove more prints.. 
Signed-off-by: Carlos Paradis --- R/mail.R | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/R/mail.R b/R/mail.R index 6e26f6e5..08b58b91 100644 --- a/R/mail.R +++ b/R/mail.R @@ -601,11 +601,6 @@ parse_mbox <- function(perceval_path, mbox_file_path) { stop("Perceval execution failed.") }) - cat(perceval_output, sep = "\n") - - - - # Filter JSON lines from Perceval output json_lines <- perceval_output[grepl("^\\{", perceval_output)] # Escape the `{` character @@ -619,8 +614,7 @@ parse_mbox <- function(perceval_path, mbox_file_path) { perceval_parsed <- tryCatch({ data.table(jsonlite::stream_in(textConnection(json_lines), verbose = FALSE)) }, error = function(e) { - print("Error parsing JSON lines:") - print(e$message) + #print(e$message) stop("JSON parsing failed.") }) From 7fc9e4199452729b3e3682f8b1f83e24aba62092 Mon Sep 17 00:00:00 2001 From: Carlos Paradis Date: Sat, 7 Dec 2024 18:42:55 -0800 Subject: [PATCH 72/80] Remove strange mbox file path Signed-off-by: Carlos Paradis --- conf/helix.yml | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/conf/helix.yml b/conf/helix.yml index ce1e48fd..1a8aa367 100644 --- a/conf/helix.yml +++ b/conf/helix.yml @@ -55,24 +55,20 @@ mailing_list: mailing_list: https://lists.apache.org/list.html?announce@apache.org save_folder_path: ../../rawdata/helix/mod_mbox/save_mbox_mail # mbox_file_path is for use only with parse_mbox() function. It is the file to parse - mbox_file_path: ../../rawdata/helix/mod_mbox/save_mbox_mail/kaiaulu_202407.mbox project_key_2: mailing_list: https://lists.apache.org/list.html?dev@felix.apache.org save_folder_path: ../../helix/mod_mbox/save_mbox_mail_2 # mbox_file_path is for use only with parse_mbox() function. 
It is the file to parse - mbox_file_path: ../../helix/kaiaulu/mod_mbox/save_mbox_mail_2/kaiaulu_202407.mbox # If project uses Pipermail - pipermail: - project_key_1: - mailing_list: https://mta.openssl.org/pipermail/openssl-users/ - save_folder_path: ../../rawdata/helix/pipermail/save_mbox_mail +# pipermail: +# project_key_1: +# mailing_list: https://mta.openssl.org/pipermail/openssl-users/ +# save_folder_path: ../../rawdata/helix/pipermail/save_mbox_mail # mbox_file_path is for use only with parse_mbox() function. It is the file to parse - mbox_file_path: ../../rawdata/helix/pipermail/save_mbox_mail/kaiaulu_202407.mbox - project_key_2: - mailing_list: https://mta.openssl.org/pipermail/openssl-project/ - save_folder_path: ../../rawdata/helix/pipermail/save_mbox_mail_2/ +# project_key_2: +# mailing_list: https://mta.openssl.org/pipermail/openssl-project/ +# save_folder_path: ../../rawdata/helix/pipermail/save_mbox_mail_2/ # mbox_file_path is for use only with parse_mbox() function. It is the file to parse - mbox_file_path: ../../rawdata/helix/pipermail/save_mbox_mail_2/kaiaulu_202407.mbox issue_tracker: jira: From 207d0c43275bed158115486d70537e6411c65e7f Mon Sep 17 00:00:00 2001 From: Carlos Paradis Date: Sat, 7 Dec 2024 19:14:03 -0800 Subject: [PATCH 73/80] Fix parse_mbox removing stderr = TRUE the flag caused errors of perceval being unable to parse json files. Signed-off-by: Carlos Paradis --- R/mail.R | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/R/mail.R b/R/mail.R index 08b58b91..d2de794c 100644 --- a/R/mail.R +++ b/R/mail.R @@ -583,18 +583,19 @@ refresh_mod_mbox <- function(mailing_list, start_year_month, save_folder_path, v #' @export #' @family parsers parse_mbox <- function(perceval_path, mbox_file_path) { - # Expand paths + # Expand paths (e.g. 
"~/Desktop" => "/Users/someuser/Desktop") perceval_path <- path.expand(perceval_path) mbox_file_path <- path.expand(mbox_file_path) - mbox_dir <- dirname(mbox_file_path) # Extract directory path - mbox_uri <- mbox_file_path # URI points to the mbox file - # Use Perceval to parse the mbox file + # Remove ".mbox" + mbox_uri <- stri_replace_last(mbox_file_path,replacement="",regex=".mbox") + + # Use percerval to parse mbox_path. --json line is required to be parsed by jsonlite::fromJSON. perceval_output <- tryCatch({ system2(perceval_path, - args = c('mbox', mbox_uri, mbox_dir, '--json-line'), + args = c('mbox', mbox_uri, mbox_file_path, '--json-line'), stdout = TRUE, - stderr = TRUE) + stderr = FALSE) }, error = function(e) { #print("Error running Perceval:") #print(e$message) @@ -609,10 +610,10 @@ parse_mbox <- function(perceval_path, mbox_file_path) { stop("No valid JSON lines found in Perceval output. Check the mbox file or Perceval configuration.") } - # Parse JSON output as a data.table perceval_parsed <- tryCatch({ - data.table(jsonlite::stream_in(textConnection(json_lines), verbose = FALSE)) + # Parsed JSON output as a data.table. 
+ data.table(jsonlite::stream_in(textConnection(perceval_output),verbose=FALSE)) }, error = function(e) { #print(e$message) stop("JSON parsing failed.") From d3dd2329f3ee138b65eea6a32b39cfb3bc32782f Mon Sep 17 00:00:00 2001 From: Carlos Paradis Date: Sat, 7 Dec 2024 20:19:28 -0800 Subject: [PATCH 74/80] Add loop to parse_mbox on notebook Signed-off-by: Carlos Paradis --- vignettes/social_smell_showcase.Rmd | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/vignettes/social_smell_showcase.Rmd b/vignettes/social_smell_showcase.Rmd index 7b454094..2f34629b 100644 --- a/vignettes/social_smell_showcase.Rmd +++ b/vignettes/social_smell_showcase.Rmd @@ -132,10 +132,14 @@ project_mbox <- NULL project_jira <- NULL project_github_replies <- NULL - - +project_mbox <- data.table() if(!is.null(mbox_path)){ - project_mbox <- parse_mbox(perceval_path,mbox_path) + for(mbox_file_path in list.files(mbox_path,full.names=TRUE)){ + #print(mbox_file_path) + project_mbox <- rbind(project_mbox,parse_mbox(perceval_path,mbox_file_path),fill=TRUE) + } + + #project_mbox <- parse_mbox(perceval_path,mbox_path) project_mbox$reply_tz <- sapply(stringi::stri_split(project_git$reply_datetimetz, regex=" "),"[[",6) From f74aff315ee502059f38d354508473275c8e2e11 Mon Sep 17 00:00:00 2001 From: Carlos Paradis Date: Sun, 8 Dec 2024 02:25:57 -0800 Subject: [PATCH 75/80] Documentation pass Simplified some of the notebook language, reduced title of functions, removed some of the sub-headers pound symbols as it was creating too many sections on the code blocks. Added parser tables after downloaders and remove their eval so example tables of what can be downloaded are shown on the generated notebook. Commit passes check, tests, and downloaders, refresh and parsers work. 
Signed-off-by: Carlos Paradis --- R/mail.R | 126 +++++++++++++--------------- conf/openssl.yml | 7 +- man/download_mod_mbox.Rd | 7 +- man/download_pipermail.Rd | 7 +- man/parse_mbox.Rd | 2 +- man/parse_mbox_latest_date.Rd | 4 +- man/process_gz_to_mbox_in_folder.Rd | 2 +- man/refresh_mod_mbox.Rd | 4 +- man/refresh_pipermail.Rd | 4 +- vignettes/download_mail.Rmd | 119 +++++++++----------------- 10 files changed, 120 insertions(+), 162 deletions(-) diff --git a/R/mail.R b/R/mail.R index d2de794c..f2de4df4 100644 --- a/R/mail.R +++ b/R/mail.R @@ -4,18 +4,18 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at https://mozilla.org/MPL/2.0/. -############## Pipermail Downloader ############## +############## Downloader Pipermail ############## -#' Download all pipermail files in an archive as mbox files +#' Pipermail Downloader #' -#' @description This function downloads pipermail archives from a specified pipermail mailing list as .mbox files. +#' This function downloads pipermail archives from a specified pipermail mailing list as .mbox files. #' It begins by downloading an .html file that contains the URLs for monthly archives in .txt or .gz formats. #' The function first attempts to download the .txt file, and if unavailable, it falls back to downloading the .gz file. #' #' When a .gz file is downloaded, the function automatically unzips and converts it into an .mbox file, #' overwriting any existing file with the same name. The original .gz file is deleted after extraction. #' -#' The downloaded .mbox files are saved in the specified folder following the naming convention kaiaulu_YYYYMM.mbox. +#' The downloaded .mbox files are saved in the specified folder following the naming convention YYYYMM.mbox. #' The function only downloads files that fall between the specified start_year_month and end_year_month. #' When both formats fail to download, the function issues a warning indicating the missing month. 
#' At the end, the function summarizes the downloads, indicating the range of dates present and any missing months. @@ -29,8 +29,7 @@ #' @export download_pipermail <- function(mailing_list, start_year_month, end_year_month, save_folder_path, verbose = TRUE) { - ########## Download and Parse Mailing List HTML for Links ########## - + ## Download and Parse Mailing List HTML for Links # Ensure mailing_list url ends with a slash, which is important when constructing links for downloading files, # since the extracted links are relative to the base URL. # e.g.base url: https://mta.openssl.org/pipermail/openssl-announce/ and extracted link: 2024-June.txt.gz @@ -60,7 +59,7 @@ download_pipermail <- function(mailing_list, start_year_month, end_year_month, s # Create an empty vector for storing the links that will be extracted. links <- c() - ########## Extract Date and Links ########## + ## Extract Date and Links # Loop through the data rows and extract the date and link from each row. # The date is in the first column, and the link is in the third column. for (row in data_rows) { @@ -91,10 +90,10 @@ download_pipermail <- function(mailing_list, start_year_month, end_year_month, s } } - ########## Initialize Vector for Failed Months ########## + ## Initialize Vector for Failed Months failed_months <- character() - ########## Use Links to Download Individual Files ########## + ## Use Links to Download Individual Files # Initialize a vector for storing the paths of the downloaded files. downloaded_files <- c() for (i in seq_along(links)) { @@ -136,9 +135,9 @@ download_pipermail <- function(mailing_list, start_year_month, end_year_month, s } # Define the destination file name and path where the downloaded content will be saved as a .mbox file. 
- dest <- file.path(save_folder_path, stringi::stri_c('kaiaulu_', year_month_clean, '.mbox')) + dest <- file.path(save_folder_path, stringi::stri_c(year_month_clean, '.mbox')) - ########## Write Downloaded File to Disk ########## + ## Write Downloaded File to Disk # Print diagnostic info if verbose is TRUE if (verbose) { message("Downloading: ", download_url, "\n") @@ -148,7 +147,7 @@ download_pipermail <- function(mailing_list, start_year_month, end_year_month, s # Write the downloaded file to disk. If the file is a .gz file, it needs to be unzipped and converted to .mbox format. if (grepl("\\.gz$", download_url)) { # Download the .gz file to a temporary lomessageion. - gz_file_path <- file.path(save_folder_path, stringi::stri_c('kaiaulu_', year_month_clean, '.mbox.gz')) + gz_file_path <- file.path(save_folder_path, stringi::stri_c(year_month_clean, '.mbox.gz')) httr::GET(download_url, httr::write_disk(gz_file_path, overwrite = TRUE), httr::timeout(60)) # If a .gz file is downloaded, the function unzips it and converts it into an .mbox file. @@ -175,17 +174,17 @@ download_pipermail <- function(mailing_list, start_year_month, end_year_month, s downloaded_files <- c(downloaded_files, dest) } - ########## Summary of Downloads ########## + ## Summary of Downloads if (length(failed_months) > 0) { warning("The following months could not be downloaded (no data available or other error):\n", paste(failed_months, collapse = ", ")) } # List the files in the save_folder_path. - downloaded_files_in_folder <- list.files(save_folder_path, pattern = "kaiaulu_\\d{6}\\.mbox$", full.names = FALSE) + downloaded_files_in_folder <- list.files(save_folder_path, pattern = "\\d{6}\\.mbox$", full.names = FALSE) # The downloaded .mbox files are saved in the specified folder with the - # naming convention kaiaulu_YYYYMM.mbox, where YYYYMM represents the year and month. + # naming convention YYYYMM.mbox, where YYYYMM represents the year and month. # Extract the YYYYMM from the file names. 
- downloaded_dates <- as.numeric(sub("kaiaulu_(\\d{6})\\.mbox", "\\1", downloaded_files_in_folder)) + downloaded_dates <- as.numeric(sub("(\\d{6})\\.mbox", "\\1", downloaded_files_in_folder)) # Create the expected list of YYYYMM between start_year_month and end_year_month. start_date <- as.Date(paste0(start_year_month, "01"), format = "%Y%m%d") @@ -219,21 +218,19 @@ download_pipermail <- function(mailing_list, start_year_month, end_year_month, s warning("Months missing in the date range: ", paste(missing_months, collapse = ", "), "\n") } - ########## Return List of Downloaded Files ########## + ## Return List of Downloaded Files # Return the list of downloaded .mbox files return(downloaded_files) } -############## Pipermail Refresher ############## - -#' Refresh mbox files downloaded via pipermail +#' Refresh Pipermail #' #' This function refreshes the mailing list files by checking the contents of a specified folder. #' If the folder is empty, it calls \code{\link{download_pipermail}} to download all pipermail files from start_year_month to the current month. #' If the folder contains already-downloaded mbox files, it identifies the most recent month, deletes that file, and redownloads it #' along with all future months up to the current real-life month. #' -#' The naming convention of files is `kaiaulu_YYYYMM.mbox`, and the function uses this pattern to identify the most recent month. +#' The naming convention of files is `YYYYMM.mbox`, and the function uses this pattern to identify the most recent month. #' After deleting the most recent file, the function ensures that the month is redownloaded, along with all subsequent months up to the current month. #' Redownloading the most recent file makes sure that any files added in that month after the latest refresh are included. 
#' @@ -245,10 +242,10 @@ download_pipermail <- function(mailing_list, start_year_month, end_year_month, s #' @export refresh_pipermail <- function(mailing_list, start_year_month, save_folder_path, verbose = TRUE) { - ########## Check if Folder is Empty ########## + ## Check if Folder is Empty # Check the contents of the folder to see if any .mbox files are already present - # The function looks for files that match the naming pattern 'kaiaulu_YYYYMM.mbox' - files_in_folder <- list.files(save_folder_path, pattern = "kaiaulu_\\d{6}\\.mbox$") + # The function looks for files that match the naming pattern 'YYYYMM.mbox' + files_in_folder <- list.files(save_folder_path, pattern = "\\d{6}\\.mbox$") if (length(files_in_folder) == 0) { # If the folder is empty, download all pipermail files starting from the start_year_month @@ -259,23 +256,23 @@ refresh_pipermail <- function(mailing_list, start_year_month, save_folder_path, # Call the download_pipermail function to download files from start_year_month to end_year_month download_pipermail(mailing_list, start_year_month, end_year_month, save_folder_path) } - ########## Identify the Most Recent Month ########## + ## Identify the Most Recent Month ## else { # If the folder is not empty, identify the most recent month based on the filenames - # The filenames follow the pattern 'kaiaulu_YYYYMM.mbox', so we extract the YYYYMM part of the filenames - year_months <- gsub("kaiaulu_(\\d{6})\\.mbox$", "\\1", files_in_folder) + # The filenames follow the pattern 'YYYYMM.mbox', so we extract the YYYYMM part of the filenames + year_months <- gsub("(\\d{6})\\.mbox$", "\\1", files_in_folder) # Find the most recent month by taking the maximum of the extracted YYYYMM values recent_month <- max(year_months) # Delete the most recent file before redownloading it - recent_file <- file.path(save_folder_path, stringi::stri_c("kaiaulu_", recent_month, ".mbox")) + recent_file <- file.path(save_folder_path, stringi::stri_c(recent_month, ".mbox")) if 
(file.exists(recent_file)) { file.remove(recent_file) if (verbose) message("Deleted the most recent file:", recent_file, "\n") } - ########## Redownload from the Most Recent Month ########## + ## Redownload from the Most Recent Month ## # Set the end_year_month to the current month (based on the system date) end_year_month <- format(Sys.Date(), "%Y%m") @@ -285,14 +282,14 @@ refresh_pipermail <- function(mailing_list, start_year_month, save_folder_path, # Call the download_pipermail function to redownload the deleted month and all subsequent months up to the current month download_pipermail(mailing_list, recent_month, end_year_month, save_folder_path) } - ########## Process .gz Files After Refresh ########## + ## Process .gz Files After Refresh ## # Call process_gz_to_mbox_in_folder to ensure all .gz files are converted to .mbox after the refresh if (verbose) message("Processing .gz files in the folder (if any) to convert them to .mbox format...\n") process_gz_to_mbox_in_folder(save_folder_path = save_folder_path, verbose = verbose) } -#' Process .gz files in a folder and convert them to .mbox +#' Gz to Mbox Converter #' #' This function scans a specified folder for any .gz files, unzips them, #' and renames them to the .mbox format. After unzipping, the original .gz files are deleted. 
@@ -320,7 +317,7 @@ process_gz_to_mbox_in_folder <- function(save_folder_path, verbose = TRUE) { # Create a vector to store the names of the converted .mbox files converted_mbox_files <- c() - ########## Process Each .gz File ########## + ## Process Each .gz File ## # Iterate over each .gz file, unzip it, and convert it to .mbox for (gz_file in gz_files) { # Define the corresponding .mbox file path by replacing .gz with .mbox in the file name @@ -357,14 +354,14 @@ process_gz_to_mbox_in_folder <- function(save_folder_path, verbose = TRUE) { } -############## Mod Mbox Downloader ############## +############## Downloader Mod Mbox ############## -#' Download all mod_mbox files in a mailing list as mbox files +#' Download Mod_Mbox #' -#' @description This function downloads mod_mbox archives from a specified Apache Pony Mail mailing list as .mbox files. +#' This function downloads mod_mbox archives from a specified Apache Pony Mail mailing list as .mbox files. #' It constructs the download URLs for each month based on the start and end date range and downloads the mbox files #' in the format "YYYY-MM". The downloaded .mbox files are saved in the specified folder, with a naming convention -#' of kaiaulu_YYYYMM.mbox. +#' of YYYYMM.mbox. #' #' The function loops through each month in the range specified by `start_year_month` and `end_year_month`, #' and constructs the appropriate URL to download each month's data. If any download fails, an warning is issued for the failed months. @@ -381,14 +378,14 @@ process_gz_to_mbox_in_folder <- function(save_folder_path, verbose = TRUE) { #' @export download_mod_mbox <- function(mailing_list, start_year_month, end_year_month, save_folder_path, verbose = TRUE) { - ########## Extract Mailing List Name ########## + ## Extract Mailing List Name ## # Extract the mailing list name from the given URL. This is because the actual list name is # embedded within the URL (after the 'list.html?'). 
# We are using 'sub()' to perform a simple string replacement, extracting everything after 'list.html?'. mailing_list_name <- sub(".*list.html\\?(.+)", "\\1", mailing_list) if (verbose) message("Base list extracted:", mailing_list_name, "\n") - ########## Prepare Year and Month ########## + ## Prepare Year and Month ## # The start_year_month and end_year_month are in the format "YYYYMM". # Split them into year and month for easier looping. # Extract first 4 digits as start year, and last 2 digits as start month. @@ -398,11 +395,11 @@ download_mod_mbox <- function(mailing_list, start_year_month, end_year_month, sa end_year <- as.numeric(substr(end_year_month, 1, 4)) end_month <- as.numeric(substr(end_year_month, 5, 6)) - ########## Initialize Vectors for Failed Months ########## + ## Initialize Vectors for Failed Months ## # Vectors to track failed downloads. failed_months <- character() - ########## Download Loop ########## + ## Download Loop ## # Iterate over the years and months from start_year/month to end_year/month. # This is done by looping over the years, and for each year, looping over the 12 months. for (year in start_year:end_year) { @@ -411,7 +408,7 @@ download_mod_mbox <- function(mailing_list, start_year_month, end_year_month, sa if (year == start_year && month < start_month) next if (year == end_year && month > end_month) break - ######### Construct URL and Save Path ########## + ######### Construct URL and Save Path ## # Construct the month string (e.g., '2023-04') and the full download URL. # Make sure the month has two digits. month_str <- sprintf("%02d", month) @@ -421,8 +418,8 @@ download_mod_mbox <- function(mailing_list, start_year_month, end_year_month, sa # The format for the URL is fixed by Apache's Pony Mail service. download_url <- stringi::stri_c("https://lists.apache.org/api/mbox.lua?list=", mailing_list_name, "&date=", year_month_str) - # Create the file name where the mbox will be saved locally, in the format ''kaiaulu_'YYYYMM.mbox'. 
- file_name <- stringi::stri_c("kaiaulu_", year, month_str, ".mbox") + # Create the file name where the mbox will be saved locally, in the format 'YYYYMM.mbox'. + file_name <- stringi::stri_c(year, month_str, ".mbox") file_path <- file.path(save_folder_path, file_name) if (verbose) { @@ -430,7 +427,7 @@ download_mod_mbox <- function(mailing_list, start_year_month, end_year_month, sa message("Saving to file:", file_path, "\n") } - ########## Download Mbox File ########## + ## Download Mbox File ## # Download the file using httr::GET, saving it directly to the destination file path. response <- httr::GET(download_url, httr::write_disk(file_path, overwrite = TRUE)) # Get the status code to see if the download succeeded. @@ -451,16 +448,16 @@ download_mod_mbox <- function(mailing_list, start_year_month, end_year_month, sa } } - ########## Summary of Failed Downloads ########## + ## Summary of Failed Downloads ## if (length(failed_months) > 0) { warning("The following months could not be downloaded (no data available or other error):\n", paste(failed_months, collapse = ", ")) } # List the files in the save_folder_path - downloaded_files <- list.files(save_folder_path, pattern = "kaiaulu_\\d{6}\\.mbox$", full.names = FALSE) + downloaded_files <- list.files(save_folder_path, pattern = "\\d{6}\\.mbox$", full.names = FALSE) # Extract the YYYYMM from the file names - downloaded_dates <- as.numeric(sub("kaiaulu_(\\d{6})\\.mbox", "\\1", downloaded_files)) + downloaded_dates <- as.numeric(sub("(\\d{6})\\.mbox", "\\1", downloaded_files)) # Find the expected list of YYYYMM between start_year_month and end_year_month start_date <- as.Date(paste0(start_year_month, "01"), format = "%Y%m%d") @@ -494,22 +491,19 @@ download_mod_mbox <- function(mailing_list, start_year_month, end_year_month, sa warning("Months missing in the date range:", paste(missing_months, collapse = ", "), "\n") } - ########## Return Save Path ########## + ## Return Save Path ## # Return the folder path where all 
mbox files were saved. return(save_folder_path) } - -############## Mod Mbox Refresher ############## - -#' Refresh mbox files downloaded via mod_mbox +#' Refresh Mod_Mbox #' #' This function refreshes the mailing list files by checking the contents of a specified folder. #' If the folder is empty, it calls \code{\link{download_mod_mbox}} to download all mod_mbox files from start_year_month to the current month. #' If the folder contains already-downloaded mbox files, it identifies the most recent month, deletes that file, and redownloads it #' along with all future months up to the current real-life month. #' -#' The naming convention of files is `kaiaulu_YYYYMM.mbox`, and the function uses this pattern to identify the most recent month. +#' The naming convention of files is `YYYYMM.mbox`, and the function uses this pattern to identify the most recent month. #' After deleting the most recent file, the function ensures that the month is redownloaded, along with all subsequent months up to the current month. #' Redownloading the most recent file ensures any files added in that month after the latest refresh are included. #' @@ -521,10 +515,10 @@ download_mod_mbox <- function(mailing_list, start_year_month, end_year_month, sa #' @export refresh_mod_mbox <- function(mailing_list, start_year_month, save_folder_path, verbose = TRUE) { - ########## Check if Folder is Empty ########## + ## Check if Folder is Empty ## # Check the contents of the folder to see if any .mbox files are already present. 
- # The function looks for files that match the naming pattern 'kaiaulu_YYYYMM.mbox' - files_in_folder <- list.files(save_folder_path, pattern = "kaiaulu_\\d{6}\\.mbox$") + # The function looks for files that match the naming pattern 'YYYYMM.mbox' + files_in_folder <- list.files(save_folder_path, pattern = "\\d{6}\\.mbox$") if (length(files_in_folder) == 0) { # If the folder is empty, download all mod_mbox files starting from start_year_month @@ -535,23 +529,23 @@ refresh_mod_mbox <- function(mailing_list, start_year_month, save_folder_path, v # Call the download_mod_mbox function to download files from start_year_month to end_year_month download_mod_mbox(mailing_list, start_year_month, end_year_month, save_folder_path, verbose = verbose) } - ########## Identify the Most Recent Month ########## + ## Identify the Most Recent Month ## else { # If the folder is not empty, identify the most recent month based on the filenames - # The filenames follow the pattern 'kaiaulu_YYYYMM.mbox', so we extract the YYYYMM part of the filenames - year_months <- gsub("kaiaulu_(\\d{6})\\.mbox$", "\\1", files_in_folder) + # The filenames follow the pattern 'YYYYMM.mbox', so we extract the YYYYMM part of the filenames + year_months <- gsub("(\\d{6})\\.mbox$", "\\1", files_in_folder) # Find the most recent month by taking the maximum of the extracted YYYYMM values recent_month <- max(year_months) # Delete the most recent file before redownloading it - recent_file <- file.path(save_folder_path, stringi::stri_c("kaiaulu_", recent_month, ".mbox")) + recent_file <- file.path(save_folder_path, stringi::stri_c(recent_month, ".mbox")) if (file.exists(recent_file)) { file.remove(recent_file) if (verbose) message("Deleted the most recent file:", recent_file, "\n") } - ########## Redownload from the Most Recent Month ########## + ## Redownload from the Most Recent Month ## # Set the end_year_month to the current month (based on the system date) end_year_month <- format(Sys.Date(), "%Y%m") @@ 
-566,7 +560,7 @@ refresh_mod_mbox <- function(mailing_list, start_year_month, save_folder_path, v ############## Parsers ############## -#' Parse mbox from Perceval +#' Parse Mbox #' #' Parses an mbox file, which consists of emails in a mailbox, using the Perceval library. #' Note .mbox files do not have a consistent number of fields (e.g. Reply Cc.). Due to that, @@ -635,10 +629,10 @@ parse_mbox <- function(perceval_path, mbox_file_path) { return(perceval_parsed) } -#' Parse mbox latest date +#' Parse Latest Mbox #' #' This function returns the name of the latest mod_mbox file downloaded in the specified folder -#' based on the naming convention `kaiaulu_YYYYMM.mbox`. For example: `kaiaulu_202401.mbox`. +#' based on the naming convention `YYYYMM.mbox`. For example: `202401.mbox`. #' #' @param save_folder_path path to the folder containing the mbox files #' @return `latest_mbox_file` the name of the latest mod_mbox file @@ -646,7 +640,7 @@ parse_mbox <- function(perceval_path, mbox_file_path) { #' @family parsers parse_mbox_latest_date <- function(save_folder_path) { # List all .mbox files in the folder with the expected naming pattern - file_list <- list.files(save_folder_path, pattern = "kaiaulu_\\d{6}\\.mbox$") + file_list <- list.files(save_folder_path, pattern = "\\d{6}\\.mbox$") if (length(file_list) == 0) { warning("No .mbox files found in the folder.") @@ -654,7 +648,7 @@ parse_mbox_latest_date <- function(save_folder_path) { } # Extract the dates from the filenames - date_list <- sub("kaiaulu_(\\d{6})\\.mbox$", "\\1", file_list) + date_list <- sub("(\\d{6})\\.mbox$", "\\1", file_list) # Convert dates to numeric for comparison date_numeric <- as.numeric(date_list) diff --git a/conf/openssl.yml b/conf/openssl.yml index 6d5deaea..91871521 100644 --- a/conf/openssl.yml +++ b/conf/openssl.yml @@ -46,10 +46,9 @@ version_control: mailing_list: pipermail: - mail_key_1: - archive_url: https://mta.openssl.org/pipermail/openssl-dev - pipermail: 
../../rawdata/openssl/pipermail/openssl-dev/ - mailing_list: openssl-dev + project_key_1: + mailing_list: https://mta.openssl.org/pipermail/openssl-users/ + save_folder_path: ../../rawdata/openssl/pipermail/save_mbox_users # issue_tracker: # jira: diff --git a/man/download_mod_mbox.Rd b/man/download_mod_mbox.Rd index e1835761..9347aa22 100644 --- a/man/download_mod_mbox.Rd +++ b/man/download_mod_mbox.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/mail.R \name{download_mod_mbox} \alias{download_mod_mbox} -\title{Download all mod_mbox files in a mailing list as mbox files} +\title{Download Mod_Mbox} \usage{ download_mod_mbox( mailing_list, @@ -31,8 +31,9 @@ Returns `save_folder_path`, the folder path where the mbox files are stored. This function downloads mod_mbox archives from a specified Apache Pony Mail mailing list as .mbox files. It constructs the download URLs for each month based on the start and end date range and downloads the mbox files in the format "YYYY-MM". The downloaded .mbox files are saved in the specified folder, with a naming convention -of kaiaulu_YYYYMM.mbox. - +of YYYYMM.mbox. +} +\details{ The function loops through each month in the range specified by `start_year_month` and `end_year_month`, and constructs the appropriate URL to download each month's data. If any download fails, an warning is issued for the failed months. This means the file could not be found and that month's data may not exist. 
diff --git a/man/download_pipermail.Rd b/man/download_pipermail.Rd index 0244abbd..24f75c83 100644 --- a/man/download_pipermail.Rd +++ b/man/download_pipermail.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/mail.R \name{download_pipermail} \alias{download_pipermail} -\title{Download all pipermail files in an archive as mbox files} +\title{Pipermail Downloader} \usage{ download_pipermail( mailing_list, @@ -30,11 +30,12 @@ Returns `downloaded_files`, a vector of the downloaded files in the current work This function downloads pipermail archives from a specified pipermail mailing list as .mbox files. It begins by downloading an .html file that contains the URLs for monthly archives in .txt or .gz formats. The function first attempts to download the .txt file, and if unavailable, it falls back to downloading the .gz file. - +} +\details{ When a .gz file is downloaded, the function automatically unzips and converts it into an .mbox file, overwriting any existing file with the same name. The original .gz file is deleted after extraction. -The downloaded .mbox files are saved in the specified folder following the naming convention kaiaulu_YYYYMM.mbox. +The downloaded .mbox files are saved in the specified folder following the naming convention YYYYMM.mbox. The function only downloads files that fall between the specified start_year_month and end_year_month. When both formats fail to download, the function issues a warning indicating the missing month. At the end, the function summarizes the downloads, indicating the range of dates present and any missing months. 
diff --git a/man/parse_mbox.Rd b/man/parse_mbox.Rd index 768ff760..775cbca6 100644 --- a/man/parse_mbox.Rd +++ b/man/parse_mbox.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/mail.R \name{parse_mbox} \alias{parse_mbox} -\title{Parse mbox from Perceval} +\title{Parse Mbox} \usage{ parse_mbox(perceval_path, mbox_file_path) } diff --git a/man/parse_mbox_latest_date.Rd b/man/parse_mbox_latest_date.Rd index 486f35fd..dcaf8305 100644 --- a/man/parse_mbox_latest_date.Rd +++ b/man/parse_mbox_latest_date.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/mail.R \name{parse_mbox_latest_date} \alias{parse_mbox_latest_date} -\title{Parse mbox latest date} +\title{Parse Latest Mbox} \usage{ parse_mbox_latest_date(save_folder_path) } @@ -14,7 +14,7 @@ parse_mbox_latest_date(save_folder_path) } \description{ This function returns the name of the latest mod_mbox file downloaded in the specified folder -based on the naming convention `kaiaulu_YYYYMM.mbox`. For example: `kaiaulu_202401.mbox`. +based on the naming convention `YYYYMM.mbox`. For example: `202401.mbox`. 
} \seealso{ Other parsers: diff --git a/man/process_gz_to_mbox_in_folder.Rd b/man/process_gz_to_mbox_in_folder.Rd index 681022f2..a9a96c41 100644 --- a/man/process_gz_to_mbox_in_folder.Rd +++ b/man/process_gz_to_mbox_in_folder.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/mail.R \name{process_gz_to_mbox_in_folder} \alias{process_gz_to_mbox_in_folder} -\title{Process .gz files in a folder and convert them to .mbox} +\title{Gz to Mbox Converter} \usage{ process_gz_to_mbox_in_folder(save_folder_path, verbose = TRUE) } diff --git a/man/refresh_mod_mbox.Rd b/man/refresh_mod_mbox.Rd index f8da91dd..59c7d444 100644 --- a/man/refresh_mod_mbox.Rd +++ b/man/refresh_mod_mbox.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/mail.R \name{refresh_mod_mbox} \alias{refresh_mod_mbox} -\title{Refresh mbox files downloaded via mod_mbox} +\title{Refresh Mod_Mbox} \usage{ refresh_mod_mbox( mailing_list, @@ -30,7 +30,7 @@ If the folder contains already-downloaded mbox files, it identifies the most rec along with all future months up to the current real-life month. } \details{ -The naming convention of files is `kaiaulu_YYYYMM.mbox`, and the function uses this pattern to identify the most recent month. +The naming convention of files is `YYYYMM.mbox`, and the function uses this pattern to identify the most recent month. After deleting the most recent file, the function ensures that the month is redownloaded, along with all subsequent months up to the current month. Redownloading the most recent file ensures any files added in that month after the latest refresh are included. 
} diff --git a/man/refresh_pipermail.Rd b/man/refresh_pipermail.Rd index 0e88851e..ae9d7f1f 100644 --- a/man/refresh_pipermail.Rd +++ b/man/refresh_pipermail.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/mail.R \name{refresh_pipermail} \alias{refresh_pipermail} -\title{Refresh mbox files downloaded via pipermail} +\title{Refresh Pipermail} \usage{ refresh_pipermail( mailing_list, @@ -30,7 +30,7 @@ If the folder contains already-downloaded mbox files, it identifies the most rec along with all future months up to the current real-life month. } \details{ -The naming convention of files is `kaiaulu_YYYYMM.mbox`, and the function uses this pattern to identify the most recent month. +The naming convention of files is `YYYYMM.mbox`, and the function uses this pattern to identify the most recent month. After deleting the most recent file, the function ensures that the month is redownloaded, along with all subsequent months up to the current month. Redownloading the most recent file makes sure that any files added in that month after the latest refresh are included. } diff --git a/vignettes/download_mail.Rmd b/vignettes/download_mail.Rmd index 894657bc..f24af48a 100644 --- a/vignettes/download_mail.Rmd +++ b/vignettes/download_mail.Rmd @@ -11,19 +11,19 @@ vignette: > --- -```{r eval=FALSE} +```{r warning = FALSE, message = FALSE} rm(list = ls()) seed <- 1 set.seed(seed) # Load libraries - require(kaiaulu) - require(data.table) - require(yaml) - require(stringi) - require(XML) - require(httr) - require(gt) +require(kaiaulu) +require(data.table) +require(yaml) +require(stringi) +require(XML) +require(httr) +require(gt) ``` @@ -31,7 +31,7 @@ set.seed(seed) Open source projects require a means for developers to communicate. These may include mailing lists, issue trackers, discord, etc. This notebooks showcases how to download data from mailing list archives. 
Two often used archive types are [mod_mbox](https://httpd.apache.org/mod_mbox/) and [pipermail](https://en.wikipedia.org/wiki/GNU_Mailman#cite_note-9), which Kaiaulu offer functions to download data from. The former is commonly used by the Apache Software Foundation projects. The latter, is more commonly use in GNU related projects, but this can vary. -Each mailing list maintains archives of past messages, often organized by month and year. These archives can be accessed and downloaded for analysis. However, it is important to note that mailing list archives may be split into multiple formats or locations, and not all archives contain the same information. Different archives can differ in completeness, date ranges, and the data they contain. Some archives might lack important fields like "In-Reply-To," which is important for reconstructing message threads. It is, therefore, important the archive being used is carefully selected, since this effects the quality and completeness of analysis. +Each mailing list maintains archives of past messages, often organized by month and year. These archives can be accessed and downloaded for analysis. However, it is important to note that mailing list archives may be split into multiple formats or locations, and not all archives contain the same information. Different archives can differ in completeness, date ranges, and the data they contain. Some archives might lack important fields like "In-Reply-To," which is important for reconstructing message threads. It is, therefore, important the archive being used is carefully inspected, since this effects the quality and completeness of analysis. # Mailing List Organization @@ -64,40 +64,30 @@ mailing_list: project_key_1: mailing_list: https://mta.openssl.org/pipermail/openssl-users/ save_folder_path: ../../rawdata/helix/pipermail/save_mbox_mail - # mbox_file_path is for use only with parse_mbox() function. 
It is the file to parse - mbox_file_path: ../../rawdata/helix/pipermail/save_mbox_mail/kaiaulu_202407.mbox # for mod mbox mod_mbox: project_key_1: mailing_list: https://lists.apache.org/list.html?announce@apache.org save_folder_path: ../../rawdata/helix/mod_mbox/save_mbox_mail - # mbox_file_path is for use only with parse_mbox() function. It is the file to parse - mbox_file_path: ../../rawdata/helix/mod_mbox/save_mbox_mail/kaiaulu_202407.mbox ``` -The most time intensive step you will be required is to locate the URL of the mailing list archive you wish for in the project website. This is specified under `mailing_list`. Note for pipermail this URL should point to the page containing links to the monthly archives (e.g. https://mta.openssl.org/pipermail/openssl-users/), not the top-level mailing list page that contains all the different types of archives (e.g. https://mta.openssl.org/mailman/listinfo/). - +The most manual time intensive step you will be required is to locate the URL of the mailing list archive you wish for in the project website. This is specified under `mailing_list`. Note for pipermail this URL should point to the page containing links to the monthly archives (e.g. https://mta.openssl.org/pipermail/openssl-users/), not the top-level mailing list page that contains all the different types of archives (e.g. https://mta.openssl.org/mailman/listinfo/). -Note: It is important that the paths specified in save_folder_path and mbox_file_path are accurate and do not conflict between projects. -By organizing the configuration in this way, you can manage multiple projects and mailing lists easily. The notebook reads these parameters and uses them to download and process the archives. - -Regardless of which mail archive you choose, the downloaders will store the mail data in monthly files, in a `.mbox` format. This is a simple text file that contains some markings to identify the header of the e-mail containing title, authors, etc. 
You can open any of the .mbox downloaded files with any text editor. +Regardless of which mail archive you choose, the downloaders will store the mail data in monthly files, in a `.mbox` format. This is a simple text file that contains some markings to identify the header of the e-mail containing title, authors, etc. While you can open any of the .mbox downloaded files with any text editor, Kaiaulu parsers will format them into tables, as demonstrated below. ## Tools Configuration In addition to the mailing list configurations, you need to specify the path to the [Perceval](https://github.com/chaoss/grimoirelab-perceval) binary in tools.yml. See the wiki for further details on how to setup third party tools. -Now, you can load the configurations in your R script or notebook using the following code: - -```{r eval=FALSE} +```{r} # Load tools configuration tools <- parse_config("../tools.yml") parse_perceval_path <- get_tool_project("perceval", tools) # Load project configuration conf <- parse_config("../conf/helix.yml") -mbox_file_path <- get_mbox_input_file(conf, "project_key_1") +mbox_file_path <- get_mbox_path(conf, "project_key_1") ``` # Downloaders and Refreshers @@ -113,7 +103,7 @@ For Pipermail, we need to specify the project key, which is used to retrieve the Now, we can use the getter functions to retrieve the configuration parameters for the specified project key. ```{r eval=FALSE} -conf <- parse_config("../conf/helix.yml") +conf <- parse_config("../conf/openssl.yml") pipermail_mailing_list <- get_pipermail_domain(conf, "project_key_1") pipermail_save_folder_path <- get_pipermail_path(conf, "project_key_1") @@ -122,8 +112,6 @@ pipermail_start_year_month <- 202310 pipermail_end_year_month <- 202405 ``` -Note that the date range is not set with a getter. The range for downloads changes often, and should be set manually using the YYYYMM format. - With our configurations loaded, we can proceed to downloading the mailing list archives. 
```{r eval=FALSE} @@ -138,15 +126,23 @@ download_pipermail( ``` -After running this function, the .mbox files will be saved in the specified directory with filenames like kaiaulu_202310.mbox, kaiaulu_202311.mbox, etc. +After running this function, the .mbox files will be saved in the specified directory with filenames like 202310.mbox, 202311.mbox, etc, which can be parsed in a table: -### Mod Mbox Downloader +```{r} +parsed_mail <- parse_mbox( + perceval_path = parse_perceval_path, + mbox_file_path = mbox_file_path +) + +parsed_mail %>% + head(10) %>% + gt() +``` -The download_mod_mbox() function downloads Mod Mbox archives from a specified Apache Pony Mail mailing list over a given date range. The download_mod_mbox() function downloads Mod Mbox archives by constructing URLs based on the mailing list and date range, saving them as .mbox files named kaiaulu_YYYYMM.mbox. -Similarly to Pipermail, we need to specify the project key for Mod Mbox. The project key is used to retrieve the configuration parameters for the specific project. +### Mod Mbox Downloader -Use the getters to extract the parameters: +The download_mod_mbox() function downloads Mod Mbox archives from a specified Apache Pony Mail mailing list over a given date range. We obtain the required parameters from the project configuration file, as done before: ```{r eval=FALSE} conf <- parse_config("../conf/helix.yml") @@ -176,20 +172,23 @@ download_mod_mbox( After running the function, it constructs URLs like: https://lists.apache.org/api/mbox.lua?list=announce@apache.org&date=2024-01 and saves the files in the specified folder. -## Refreshers - -Mailing lists are dynamic, with new emails being added regularly. If you're conducting ongoing analysis or need the most recent data, it's important to refresh your downloaded archives. Manually re-downloading all archives can be time-consuming and inefficient. 
The refresher functions automate this process by updating only the necessary parts of your archives, saving time and ensuring data completeness. These functions will update your archives by downloading new messages without re-downloading all existing data. - -### Pipermail Refresher +```{r} +parsed_mail <- parse_mbox( + perceval_path = parse_perceval_path, + mbox_file_path = mbox_file_path +) -The refresh_pipermail function is designed to keep your local archives up-to-date with the latest messages from the mailing list. Here's how it works: +parsed_mail %>% + head(10) %>% + gt() +``` -First, it checks the save_folder_path to see if there are any existing files. If the folder is empty, it means you have not downloaded any archives yet. In this case, the function will download all available archives from your specified start_year_month up to the current month, so that you have a complete dataset to work with. -If there are already files in the directory, the function takes a smart approach to updating them. It identifies the most recent archive file based on the filenames (which include the date, like kaiaulu_202311.mbox). It then deletes this most recent file because new messages might have been added to that month since your last download. After deleting it, the function re-downloads this file along with any newer archives that have been added to the mailing list. This way, you don't have to re-download all the archives. +## Refreshers -By operating on the directory specified in save_folder_path, the refresh_pipermail function efficiently updates all relevant files, keeping your local archives current without unnecessary downloads. +Kaiaulu offers convenient function to add new e-mails since the last execution of the downloaders. These are defined as "refresh_*" functions. The most recent file timestamp, which captures the latest month, is used as a starting date to download new files. 
The most recent file is deleted and re-downloaded to ensure all e-mails of the last month were downloaded, and subsequent files are then downloaded. +### Pipermail Refresher ```{r eval=FALSE} # Refresh archives @@ -199,54 +198,18 @@ refresh_pipermail( save_folder_path = pipermail_save_folder_path, verbose = TRUE ) - ``` -This function will ensure that the most recent archives are always up-to-date by redownloading the current month's archive and adding any new months that have been added to the mailing list. - ### Mod Mbox Refresher -The behavior is similar to the Pipermail refresher, and makes sure that your Mod Mbox archives are up-to-date. +A similar function is also available for mod_mbox: ```{r eval=FALSE} refresh_mod_mbox( mailing_list = mbox_mailing_list, - start_year_month = mbox_start_year_month, + #start_year_month = mbox_start_year_month, save_folder_path= mbox_save_folder_path, verbose = TRUE ) ``` - -# Parsers - -After downloading the mailing list archives as .mbox files, the next step is to parse these files to extract meaningful information for analysis. The parse_mbox() function utilizes the Perceval library to parse .mbox files and convert them into structured data tables. This enables easier manipulation and analysis of mailing list data. - -## How parse_mbox() Works - -The parse_mbox function makes it easy to transform .mbox files into structured data that you can analyze. - -The function uses the Perceval library to process .mbox files. Mailing list archives often have variations in their structure—different email headers, missing fields, or inconsistent formats. The parser is designed to handle these variations, so you do not have to worry about cleaning up the data. - -As it processes the files, the parser extracts key details from each email, such as the content, sender, recipients and dates. These elements are crucial for understanding communication patterns and building insights. 
- -Finally, to keep things consistent, the function standardizes the column names in the output. Even if the raw data varies from one archive to another, the resulting table will always have predictable and labeled columns, making it easy to work with. - -```{r eval=FALSE} -parsed_mail <- parse_mbox( - perceval_path = parse_perceval_path, - mbox_file_path = mbox_file_path -) -``` - -This will store the parsed data into the parsed_mail variable. You can use the gt package to display the parsed data in a readable format: - -```{r eval=FALSE} -# Display the first 10 rows of the parsed data using gt -# Refer to the gt documentation for more options on displaying tables -parsed_mail %>% - head(10) %>% - gt() -``` - -Note: Displaying the entire dataset may not be practical if it's large. Showing a sample provides a glimpse of the structure. From 557ad1025d7a35c7990d462608857cd182882e84 Mon Sep 17 00:00:00 2001 From: Dao McGill <77309217+daomcgill@users.noreply.github.com> Date: Sun, 8 Dec 2024 13:28:33 -1000 Subject: [PATCH 76/80] i #284 Update refresh functions - Make start_year_month optional - Determine start_year_month from existing files if they exist - Return error if no existing files, and no date specified Signed-off-by: Dao McGill --- exec/mailinglist.R | 2 +- vignettes/download_mail.Rmd | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/exec/mailinglist.R b/exec/mailinglist.R index 0d95f1c8..3f0c3343 100755 --- a/exec/mailinglist.R +++ b/exec/mailinglist.R @@ -47,7 +47,7 @@ if (arguments[["parse"]] & arguments[["help"]]) { tools <- yaml::read_yaml(tools_path) conf <- yaml::read_yaml(conf_path) - perceval_path <- get_tool("perceval", tools) + perceval_path <- get_tool_project("perceval", tools) mbox_file_path <- get_mbox_input_file(conf, project_key) parsed_mbox <- parse_mbox( diff --git a/vignettes/download_mail.Rmd b/vignettes/download_mail.Rmd index f24af48a..05351fe2 100644 --- a/vignettes/download_mail.Rmd +++ 
b/vignettes/download_mail.Rmd @@ -207,8 +207,8 @@ A similar function is also available for mod_mbox: ```{r eval=FALSE} refresh_mod_mbox( mailing_list = mbox_mailing_list, - #start_year_month = mbox_start_year_month, - save_folder_path= mbox_save_folder_path, + # start_year_month = mbox_start_year_month, + save_folder_path = mbox_save_folder_path, verbose = TRUE ) ``` From 1cf86e5e1c4faad3db48052ec36952fb6765c645 Mon Sep 17 00:00:00 2001 From: Dao McGill <77309217+daomcgill@users.noreply.github.com> Date: Sun, 8 Dec 2024 13:29:45 -1000 Subject: [PATCH 77/80] i #284 Missing file in previous commit Signed-off-by: Dao McGill --- R/mail.R | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/R/mail.R b/R/mail.R index f2de4df4..2e697b7c 100644 --- a/R/mail.R +++ b/R/mail.R @@ -240,15 +240,20 @@ download_pipermail <- function(mailing_list, start_year_month, end_year_month, s #' @param verbose if TRUE, prints diagnostic messages. #' @return Returns `downloaded_files`, a vector of the newly downloaded files in the current working directory. #' @export -refresh_pipermail <- function(mailing_list, start_year_month, save_folder_path, verbose = TRUE) { +refresh_pipermail <- function(mailing_list, start_year_month = NULL, save_folder_path, verbose = TRUE) { ## Check if Folder is Empty # Check the contents of the folder to see if any .mbox files are already present # The function looks for files that match the naming pattern 'YYYYMM.mbox' files_in_folder <- list.files(save_folder_path, pattern = "\\d{6}\\.mbox$") + # If the folder is empty if (length(files_in_folder) == 0) { - # If the folder is empty, download all pipermail files starting from the start_year_month + # If start_year_month is not specified, issue an error + if (is.null(start_year_month)) { + stop("No existing data found. 
Please specify a start_year_month.") + } + # Otherwise, download all pipermail files starting from the start_year_month # The end date is set to the current month based on the system date end_year_month <- format(Sys.Date(), "%Y%m") if (verbose) message("Folder is empty. Downloading from", start_year_month, "to", end_year_month, "\n") @@ -513,15 +518,20 @@ download_mod_mbox <- function(mailing_list, start_year_month, end_year_month, sa #' @param verbose if TRUE, prints diagnostic messages. #' @return Returns `downloaded_files`, a vector of the newly downloaded files in the current working directory. #' @export -refresh_mod_mbox <- function(mailing_list, start_year_month, save_folder_path, verbose = TRUE) { +refresh_mod_mbox <- function(mailing_list, start_year_month = NULL, save_folder_path, verbose = TRUE) { ## Check if Folder is Empty ## # Check the contents of the folder to see if any .mbox files are already present. # The function looks for files that match the naming pattern 'YYYYMM.mbox' files_in_folder <- list.files(save_folder_path, pattern = "\\d{6}\\.mbox$") + # If the folder is empty if (length(files_in_folder) == 0) { - # If the folder is empty, download all mod_mbox files starting from start_year_month + # If start_year_month is not specified, issue an error + if (is.null(start_year_month)) { + stop("No existing data found. Please specify a start_year_month.") + } + # Otherwise, download all mod_mbox files starting from start_year_month # The end date is set to the current month based on the system date end_year_month <- format(Sys.Date(), "%Y%m") if (verbose) message("Folder is empty. 
Downloading from", start_year_month, "to", end_year_month, "\n") From f3048a9e8baf8b73855102128c4e7fabb9192d30 Mon Sep 17 00:00:00 2001 From: Dao McGill <77309217+daomcgill@users.noreply.github.com> Date: Sun, 8 Dec 2024 14:28:13 -1000 Subject: [PATCH 78/80] i #284 Edit exec/mailinglist.R parse to take file as arg - Takes file path for mbox file to parse - No longer need to pass project_conf Signed-off-by: Dao McGill --- exec/mailinglist.R | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/exec/mailinglist.R b/exec/mailinglist.R index 3f0c3343..4a621722 100755 --- a/exec/mailinglist.R +++ b/exec/mailinglist.R @@ -15,7 +15,7 @@ require(data.table, quietly = TRUE) doc <- " USAGE: mailinglist.R parse help - mailinglist.R parse + mailinglist.R parse mailinglist.R refresh modmbox help mailinglist.R refresh modmbox mailinglist.R refresh pipermail help @@ -40,16 +40,13 @@ if (arguments[["parse"]] & arguments[["help"]]) { } else if (arguments[["parse"]]) { tools_path <- arguments[[""]] - conf_path <- arguments[[""]] - project_key <- arguments[[""]] + mbox_file_path <- arguments[[""]] save_path <- arguments[[""]] tools <- yaml::read_yaml(tools_path) - conf <- yaml::read_yaml(conf_path) - perceval_path <- get_tool_project("perceval", tools) - mbox_file_path <- get_mbox_input_file(conf, project_key) + cli::cli_alert_info(paste0("Parsing mbox file: ", mbox_file_path)) parsed_mbox <- parse_mbox( perceval_path = perceval_path, mbox_file_path = mbox_file_path From 5ce58302ec66f3c29d0e42632ee9215242f3fff4 Mon Sep 17 00:00:00 2001 From: Dao McGill <77309217+daomcgill@users.noreply.github.com> Date: Sun, 8 Dec 2024 15:29:19 -1000 Subject: [PATCH 79/80] i #284 Use pipermail path for parsing pipermail folder Signed-off-by: Dao McGill --- vignettes/download_mail.Rmd | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/vignettes/download_mail.Rmd b/vignettes/download_mail.Rmd index 05351fe2..b0b4c6f1 100644 --- a/vignettes/download_mail.Rmd +++ 
b/vignettes/download_mail.Rmd @@ -131,7 +131,7 @@ After running this function, the .mbox files will be saved in the specified dire ```{r} parsed_mail <- parse_mbox( perceval_path = parse_perceval_path, - mbox_file_path = mbox_file_path + mbox_file_path = pipermail_save_folder_path ) parsed_mail %>% @@ -139,7 +139,6 @@ parsed_mail %>% gt() ``` - ### Mod Mbox Downloader The download_mod_mbox() function downloads Mod Mbox archives from a specified Apache Pony Mail mailing list over a given date range. We obtain the required parameters from the project configuration file, as done before: @@ -183,7 +182,6 @@ parsed_mail %>% gt() ``` - ## Refreshers Kaiaulu offers convenient function to add new e-mails since the last execution of the downloaders. These are defined as "refresh_*" functions. The most recent file timestamp, which captures the latest month, is used as a starting date to download new files. The most recent file is deleted and re-downloaded to ensure all e-mails of the last month were downloaded, and subsequent files are then downloaded. 
From 4df52d92fdbaae556e39cc56dd0ef6faf752b788 Mon Sep 17 00:00:00 2001 From: Dao McGill <77309217+daomcgill@users.noreply.github.com> Date: Mon, 9 Dec 2024 10:58:44 -1000 Subject: [PATCH 80/80] i #284 Minor fixes Signed-off-by: Dao McGill --- DESCRIPTION | 4 ++-- NAMESPACE | 2 +- man/parse_mbox.Rd | 13 ++++++++----- man/parse_mbox_latest_date.Rd | 9 ++++++--- man/refresh_mod_mbox.Rd | 2 +- man/refresh_pipermail.Rd | 2 +- vignettes/download_mail.Rmd | 2 +- 7 files changed, 20 insertions(+), 14 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 06af4e22..e4632a59 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -23,8 +23,8 @@ Authors@R: c( person('Ian Jaymes', 'Iwata', role = c('ctb')), person('Dao', 'McGill', role = c('ctb')), person('Nicholas', 'Beydler', role = c('ctb')), - person('Mark', 'Burgess', role = c('ctb')) - person('Raven', 'Quiddaoen', role= c('ctb')), + person('Mark', 'Burgess', role = c('ctb')), + person('Raven', 'Quiddaoen', role= c('ctb')) ) Maintainer: Carlos Paradis License: MPL-2.0 | file LICENSE diff --git a/NAMESPACE b/NAMESPACE index 2c8b1fd4..92925ecf 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -190,8 +190,8 @@ export(parse_r_dependencies) export(parse_r_function_definition) export(parse_r_function_dependencies) export(parse_rfile_ast) -export(process_gz_to_mbox_in_folder) export(parse_understand_dependencies) +export(process_gz_to_mbox_in_folder) export(query_src_text) export(query_src_text_class_names) export(query_src_text_namespace) diff --git a/man/parse_mbox.Rd b/man/parse_mbox.Rd index b4226a45..cfc8752b 100644 --- a/man/parse_mbox.Rd +++ b/man/parse_mbox.Rd @@ -25,31 +25,34 @@ Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, \code{\link{parse_bugzilla_rest_comments}()}, -\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_bugzilla_rest_issues}()}, +\code{\link{parse_bugzilla_rest_issues_comments}()}, 
\code{\link{parse_commit_message_id}()}, \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, \code{\link{parse_gitlog}()}, +\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, -\code{\link{parse_jira}()}, \code{\link{parse_mbox_latest_date}()}, -\code{\link{parse_nvdfeed}()} +\code{\link{parse_nvdfeed}()}, +\code{\link{parse_understand_dependencies}()} Other parsers: +\code{\link{build_understand_project}()}, +\code{\link{export_understand_dependencies}()}, \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, \code{\link{parse_bugzilla_rest_comments}()}, -\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_bugzilla_rest_issues}()}, +\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_commit_message_id}()}, \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, \code{\link{parse_gitlog}()}, +\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, -\code{\link{parse_jira}()}, \code{\link{parse_mbox_latest_date}()}, \code{\link{parse_nvdfeed}()}, \code{\link{parse_understand_dependencies}()} diff --git a/man/parse_mbox_latest_date.Rd b/man/parse_mbox_latest_date.Rd index dcaf8305..cf718e6f 100644 --- a/man/parse_mbox_latest_date.Rd +++ b/man/parse_mbox_latest_date.Rd @@ -18,19 +18,22 @@ based on the naming convention `YYYYMM.mbox`. For example: `202401.mbox`. 
} \seealso{ Other parsers: +\code{\link{build_understand_project}()}, +\code{\link{export_understand_dependencies}()}, \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, \code{\link{parse_bugzilla_rest_comments}()}, -\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_bugzilla_rest_issues}()}, +\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_commit_message_id}()}, \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, \code{\link{parse_gitlog}()}, +\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, -\code{\link{parse_jira}()}, \code{\link{parse_mbox}()}, -\code{\link{parse_nvdfeed}()} +\code{\link{parse_nvdfeed}()}, +\code{\link{parse_understand_dependencies}()} } \concept{parsers} diff --git a/man/refresh_mod_mbox.Rd b/man/refresh_mod_mbox.Rd index 59c7d444..8140b782 100644 --- a/man/refresh_mod_mbox.Rd +++ b/man/refresh_mod_mbox.Rd @@ -6,7 +6,7 @@ \usage{ refresh_mod_mbox( mailing_list, - start_year_month, + start_year_month = NULL, save_folder_path, verbose = TRUE ) diff --git a/man/refresh_pipermail.Rd b/man/refresh_pipermail.Rd index ae9d7f1f..60e84ab2 100644 --- a/man/refresh_pipermail.Rd +++ b/man/refresh_pipermail.Rd @@ -6,7 +6,7 @@ \usage{ refresh_pipermail( mailing_list, - start_year_month, + start_year_month = NULL, save_folder_path, verbose = TRUE ) diff --git a/vignettes/download_mail.Rmd b/vignettes/download_mail.Rmd index b0b4c6f1..12fc0417 100644 --- a/vignettes/download_mail.Rmd +++ b/vignettes/download_mail.Rmd @@ -102,7 +102,7 @@ For Pipermail, we need to specify the project key, which is used to retrieve the Now, we can use the getter functions to retrieve the configuration parameters for the specified project key. 
-```{r eval=FALSE} +```{r} conf <- parse_config("../conf/openssl.yml") pipermail_mailing_list <- get_pipermail_domain(conf, "project_key_1") pipermail_save_folder_path <- get_pipermail_path(conf, "project_key_1")