diff --git a/DESCRIPTION b/DESCRIPTION index 6c7a1bec..e4632a59 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -20,10 +20,11 @@ Authors@R: c( person('Nicole', 'Hoess', role = c('ctb')), person('Anthony', 'Lau', role = c('ctb')), person('Sean', 'Sunoo', role = c('ctb')), - person('Ian Jaymes', 'Iwata', role= c('ctb')), - person('Raven', 'Quiddaoen', role= c('ctb')), + person('Ian Jaymes', 'Iwata', role = c('ctb')), + person('Dao', 'McGill', role = c('ctb')), person('Nicholas', 'Beydler', role = c('ctb')), - person('Mark', 'Burgess', role = c('ctb')) + person('Mark', 'Burgess', role = c('ctb')), + person('Raven', 'Quiddaoen', role= c('ctb')) ) Maintainer: Carlos Paradis License: MPL-2.0 | file LICENSE diff --git a/NAMESPACE b/NAMESPACE index 7321af5b..92925ecf 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -6,7 +6,6 @@ export(bipartite_graph_projection) export(build_understand_project) export(commit_message_id_coverage) export(community_oslom) -export(convert_pipermail_to_mbox) export(dependencies_to_sdsmj) export(download_bugzilla_perceval_rest_issue_comments) export(download_bugzilla_perceval_traditional_issue_comments) @@ -17,7 +16,6 @@ export(download_jira_issues) export(download_jira_issues_by_date) export(download_jira_issues_by_issue_key) export(download_mod_mbox) -export(download_mod_mbox_per_month) export(download_pipermail) export(dv8_clsxb_to_clsxj) export(dv8_clsxj_to_clsxb) @@ -186,18 +184,22 @@ export(parse_jira_rss_xml) export(parse_line_metrics) export(parse_line_type_file) export(parse_mbox) +export(parse_mbox_latest_date) export(parse_nvdfeed) export(parse_r_dependencies) export(parse_r_function_definition) export(parse_r_function_dependencies) export(parse_rfile_ast) export(parse_understand_dependencies) +export(process_gz_to_mbox_in_folder) export(query_src_text) export(query_src_text_class_names) export(query_src_text_namespace) export(read_temporary_file) export(recolor_network_by_community) export(refresh_jira_issues) +export(refresh_mod_mbox) 
+export(refresh_pipermail) export(smell_missing_links) export(smell_organizational_silo) export(smell_radio_silence) diff --git a/NEWS.md b/NEWS.md index c9c66cfd..a3f6368a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -3,6 +3,11 @@ __kaiaulu 0.0.0.9700 (in development)__ ### NEW FEATURES + * Refactor of all R/mail.R mailing list functions for downloading and refreshing both pipermail and mod mbox archives. [#284](https://github.com/sailuh/kaiaulu/issues/284) + * `refresh_mod_mbox()` and `refresh_pipermail()` have been added. Both functions download the monthly mbox files that have not already been downloaded, up to the current year and month. [#284](https://github.com/sailuh/kaiaulu/issues/284) + * `parse_mbox_latest_date()` has been added. This function returns the file name of the downloaded mbox file containing the latest date for use by `download_mbox_per_month()` and `download_pipermail()` to implement a refresh capability. [#284](https://github.com/sailuh/kaiaulu/issues/284) + * `refresh_jira_issues()` has been added. It is a wrapper function for the previous downloader and downloads only issues greater than the greatest key already downloaded. + * `download_jira_issues()`, `download_jira_issues_by_issue_key()`, and `download_jira_issues_by_date()` have been added. This allows for downloading of Jira issues without the use of JirAgileR [#275](https://github.com/sailuh/kaiaulu/issues/275) and specification of issue Id and created ranges. It also interacts with `parse_jira_latest_date` to implement a refresh capability. * `build`, `export` `parse` and `transform` functions for Scitools Understand have been added. [#308](https://github.com/sailuh/kaiaulu/issues/308) * The GitHUB API has been expanded to use refresh, along with other functions. `github_api_project_issue_search` has been added that makes the search/issues endpoint API calls. 
`github_api_project_issue_or_pr_comments_by_date` and `github_api_project_issue_by_date` have been added to download issue data and comments by date ranges. `github_parse_search_issues_refresh` has been added that parses the issue data downloaded from the search endpoint in the refresh_issues folder. `github_api_project_issue_refresh` and `github_api_project_issue_or_pr_comment_refresh` were added to download issue data or comments respectively that have not already been downloaded. `format_created_at_from_file` was added to retrieve the greatest date from a JSON file. See the Reference Docs on GitHub section for more details. [#282](https://github.com/sailuh/kaiaulu/issues/282) * `config.R` now contains a set of getter functions used to centralize the gathering of configuration data and these getter functions are used to refactor configuration file information gathering. For example, loading configuration file information with variable assignment is as follows `git_repo_path <- config_file[["version_control"]][["log"]]` but refactoring with a config.R getter function becomes `git_repo_path <- get_git_repo_path(config_file)`. [#230](https://github.com/sailuh/kaiaulu/issues/230) @@ -31,7 +36,9 @@ __kaiaulu 0.0.0.9700 (in development)__ ### MINOR IMPROVEMENTS - * Issue #275, when introducing the concept of refresh on JIRA, affected some notebooks that still relied on data in that format. This issue change either notebook or config file to conform to the new JIRA downloader [#312](https://github.com/sailuh/kaiaulu/issues/312) + * `parse_mbox_latest_date()` now uses the new mbox naming convention for the latest date. [#284](https://github.com/sailuh/kaiaulu/issues/284) + * All mailing list documentation can now be found in `download_mail.Rmd`. [#284](https://github.com/sailuh/kaiaulu/issues/284) + * `download_pipermail()` now downloads all the txt and txt.gz files in the accessed pipermail archive as mbox files. 
[#284](https://github.com/sailuh/kaiaulu/issues/284) * The line metrics notebook now provides further guidance on adjusting the snapshot and filtering. * The R File and R Function parser can now properly parse R folders which contain folders within (not following R package structure). Both `.r` and `.R` files are also now captured (previously only one of the two were specified, but R accepts both). [#235](https://github.com/sailuh/kaiaulu/issues/235) * Refactor GoF Notebook in Graph GoF and Text GoF Notebooks [#224](https://github.com/sailuh/kaiaulu/issues/224) diff --git a/R/example.R b/R/example.R index f70bd5c6..8746ceae 100644 --- a/R/example.R +++ b/R/example.R @@ -550,7 +550,7 @@ example_jira_issue_comments <- function(folder_path = "/tmp", folder_name) { example_mailing_list_two_threads <- function(folder_path = "/tmp", folder_name, file_name) { # Create folder & repo - folder_path <- io_make_folder(folder_path=folder_path, folder_name = folder_name) + folder_path <- io_make_folder(folder_path = folder_path, folder_name = folder_name) # Step 1: Create fake mbox replies and assign them to variables for easy editing thread_1_reply_1 <- make_mbox_reply(mailing_list="test-list", diff --git a/R/mail.R b/R/mail.R index 4a1257e5..2e697b7c 100644 --- a/R/mail.R +++ b/R/mail.R @@ -4,241 +4,573 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at https://mozilla.org/MPL/2.0/. -############## Downloader ############## +############## Downloader Pipermail ############## -#' Download all pipermail files in an archive -#' @param url An url pointing to a pipermail archive -#' @return Returns `destination`, a vector of the downloaded files in the current working directory +#' Pipermail Downloader +#' +#' This function downloads pipermail archives from a specified pipermail mailing list as .mbox files. +#' It begins by downloading an .html file that contains the URLs for monthly archives in .txt or .gz formats. 
+#' The function first attempts to download the .txt file, and if unavailable, it falls back to downloading the .gz file. +#' +#' When a .gz file is downloaded, the function automatically unzips and converts it into an .mbox file, +#' overwriting any existing file with the same name. The original .gz file is deleted after extraction. +#' +#' The downloaded .mbox files are saved in the specified folder following the naming convention YYYYMM.mbox. +#' The function only downloads files that fall between the specified start_year_month and end_year_month. +#' When both formats fail to download, the function issues a warning indicating the missing month. +#' At the end, the function summarizes the downloads, indicating the range of dates present and any missing months. +#' +#' @param mailing_list The URL of the pipermail mailing list archive being downloaded, e.g. "https://mta.openssl.org/pipermail/openssl-announce/" +#' @param start_year_month The year and month of the first file to be downloaded, in the format 'YYYYMM' +#' @param end_year_month The year and month of the last file to be downloaded, in the format 'YYYYMM' (e.g. the current year and month derived from Sys.Date()) +#' @param save_folder_path The folder path in which all the downloaded pipermail files will be stored +#' @param verbose if TRUE, prints diagnostic messages during the download process +#' @return Returns `downloaded_files`, a vector of the paths of the .mbox files downloaded into `save_folder_path` #' @export -download_pipermail <- function(url) { +download_pipermail <- function(mailing_list, start_year_month, end_year_month, save_folder_path, verbose = TRUE) { + + ## Download and Parse Mailing List HTML for Links + # Ensure mailing_list url ends with a slash, which is important when constructing links for downloading files, + # since the extracted links are relative to the base URL. 
+ # e.g.base url: https://mta.openssl.org/pipermail/openssl-announce/ and extracted link: 2024-June.txt.gz + if (!stringi::stri_endswith_fixed(mailing_list, "/")) { + mailing_list <- stringi::stri_c(mailing_list, "/") + } + + # Archive Index Retrieval + # Begins by downloading an HTML page that lists the URLs + # for the monthly archives, which are typically available in .txt or .gz formats. - #Get page - pagedata <- httr::GET(url) + # Sends a GET request to the mailing list’s URL to retrieve contents. This is the main page of the mailing list archive, + # which contains links to individual month files (in .txt or .gz format). + response <- httr::GET(mailing_list, httr::timeout(60)) + if (httr::status_code(response) != 200) { + stop("Failed to access the mailing list page.") + } - #Parse html file into object - tbls_xml <- XML::htmlParse(pagedata) + # The content is parsed as text to extract the rows of data from the table that contains the file links. + parsed_response <- httr::content(response, "text") + doc_obj <- XML::htmlParse(parsed_response, asText = TRUE) + + # Get all table rows in the archive page. These rows contain the links to the individual month files. + rows <- XML::getNodeSet(doc_obj, "//tr") + # Skip the header row, to get to data rows + data_rows <- rows[-1] + # Create an empty vector for storing the links that will be extracted. + links <- c() + + ## Extract Date and Links + # Loop through the data rows and extract the date and link from each row. + # The date is in the first column, and the link is in the third column. + for (row in data_rows) { + # Extract and clean the date, which is in the format "Month Year" (e.g., "June 2024"). + date_extracted <- XML::xpathSApply(row, ".//td[1]", XML::xmlValue) + date_cleaned <- stringi::stri_replace_last_regex(date_extracted, pattern = ":$", replacement = "") + date_cleaned <- stringi::stri_trim_both(date_cleaned) + # Parse the cleaned date into a valid date object. 
This allows us to convert it into the "YYYYMM" format. + date_parsed <- as.Date(stringi::stri_c("01 ", date_cleaned), format = "%d %B %Y") + if (is.na(date_parsed)) { + warning("Date could not be parsed: ", date_cleaned) + next + } + year_month <- format(date_parsed, "%Y%m") + + # Check if the extracted year_month falls within the specified range of start_year_month to end_year_month. + # If it does, proceed to extract the file link from the third column of the row. + if (year_month >= start_year_month & year_month <= end_year_month) { + # Get the link (href) from the third column. This is the link to the .txt or .gz file for that month. + link_nodes <- XML::xpathSApply(row, ".//td[3]/a", XML::xmlGetAttr, 'href') + if (length(link_nodes) == 0) { + warning("No link found in row for date: ", date_cleaned) + next + } + # Store the link in the links vector, for later download. + link <- link_nodes[1] + links <- c(links, link) + } + } - #Extract href tablenodes from html table - tableNodes <- XML::getNodeSet(tbls_xml, "//td/a[@href]") + ## Initialize Vector for Failed Months + failed_months <- character() - #Extract filenames from tablenode content with xmlGetAtrr - hrefs <- sapply(tableNodes, XML::xmlGetAttr, 'href') + ## Use Links to Download Individual Files + # Initialize a vector for storing the paths of the downloaded files. + downloaded_files <- c() + for (i in seq_along(links)) { + link <- links[i] - #Create Vector - files <- vector() + # Extract the base name of the file (without the .txt.gz extension), so we can construct the correct download paths. + base_name <- gsub("\\.txt\\.gz$", "", link) - #Compose download urls for both gunzipped and plain text files - for (i in hrefs ){ - if (endsWith(i, ".txt.gz")){ - i <- paste0(url, i) - files <- c(files, i) - } else if (endsWith(i, ".txt")) { - i <- paste0(url, i) - files <- c(files, i) + # Parse the date from the base name and convert it into "YYYYMM" format for consistency with our file naming. 
+ date_parsed <- as.Date(stringi::stri_c("01-", base_name), format = "%d-%Y-%B") + if (is.na(date_parsed)) { + warning("Could not parse date from link: ", link) + next + } + year_month_clean <- format(date_parsed, "%Y%m") + + # Construct the download URLs for both the .txt and .gz versions of the file. + # The function will first attempt to download the .txt version. + txt_url <- stringi::stri_c(mailing_list, gsub("\\.gz$", "", link)) + gz_url <- stringi::stri_c(mailing_list, link) + + # The function attempts to download the .txt file for each month. + # If the .txt file is unavailable, it falls back to downloading the + # .gz (gzipped) file. + # Attempt to download the .txt file first + download_url <- txt_url + response <- httr::GET(download_url, httr::timeout(60)) + + # If the response status code is not 200, the file is not available. + if (httr::status_code(response) != 200) { + # Fallback to .gz file if .txt is unavailable + download_url <- gz_url + response <- httr::GET(download_url, httr::timeout(60)) + if (httr::status_code(response) != 200) { + warning("Both .txt and .gz downloads failed for link: ", link, "\n") + failed_months <- c(failed_months, year_month_clean) + next + } } - } - destination <- vector() - # File downloading loop - for (i in files){ + # Define the destination file name and path where the downloaded content will be saved as a .mbox file. + dest <- file.path(save_folder_path, stringi::stri_c(year_month_clean, '.mbox')) - #split filename from url and create download destination out of it - splits <- stringi::stri_split_fixed(i, "/") - destination[[i]] <- paste0(splits[[1]][[length(splits[[1]])]]) + ## Write Downloaded File to Disk + # Print diagnostic info if verbose is TRUE + if (verbose) { + message("Downloading: ", download_url, "\n") + message("Saving to: ", dest, "\n") + } + + # Write the downloaded file to disk. If the file is a .gz file, it needs to be unzipped and converted to .mbox format. 
+ if (grepl("\\.gz$", download_url)) { + # Download the .gz file to a temporary location. + gz_file_path <- file.path(save_folder_path, stringi::stri_c(year_month_clean, '.mbox.gz')) + httr::GET(download_url, httr::write_disk(gz_file_path, overwrite = TRUE), httr::timeout(60)) + + # If a .gz file is downloaded, the function unzips it and converts it into an .mbox file. + # The original .gz file is deleted after extraction to save space. + # Unzip the .gz file and save the contents as a .mbox file. + gz_con <- gzfile(gz_file_path, open = "rb") + out_con <- file(dest, open = "wb") + while (TRUE) { + bytes <- readBin(gz_con, what = raw(), n = 1024 * 1024) + if (length(bytes) == 0) break + writeBin(bytes, out_con) + } + close(gz_con) + close(out_con) + + # Remove the .gz file after unzipping to avoid storing duplicate data. + file.remove(gz_file_path) + } else { + # If the .txt file is available, download it directly and save it as a .mbox file. + httr::GET(download_url, httr::write_disk(dest, overwrite = TRUE), httr::timeout(60)) + } - #download file and place it at the destination - httr::GET(i, httr::write_disk(destination[[i]], overwrite=TRUE)) + # Add the downloaded file path to the list of downloaded files. + downloaded_files <- c(downloaded_files, dest) } - #Return filenames - return(destination) + ## Summary of Downloads + if (length(failed_months) > 0) { + warning("The following months could not be downloaded (no data available or other error):\n", paste(failed_months, collapse = ", ")) + } + # List the files in the save_folder_path. + downloaded_files_in_folder <- list.files(save_folder_path, pattern = "\\d{6}\\.mbox$", full.names = FALSE) + + # The downloaded .mbox files are saved in the specified folder with the + # naming convention YYYYMM.mbox, where YYYYMM represents the year and month. + # Extract the YYYYMM from the file names. 
+ downloaded_dates <- as.numeric(sub("(\\d{6})\\.mbox", "\\1", downloaded_files_in_folder)) + + # Create the expected list of YYYYMM between start_year_month and end_year_month. + start_date <- as.Date(paste0(start_year_month, "01"), format = "%Y%m%d") + end_date <- as.Date(paste0(end_year_month, "01"), format = "%Y%m%d") + all_dates <- seq(start_date, end_date, by = "month") + expected_dates <- as.numeric(format(all_dates, "%Y%m")) + + # Identify missing months. + missing_months <- setdiff(expected_dates, downloaded_dates) + + # Determine the earliest and latest dates downloaded. + if (length(downloaded_dates) > 0) { + min_downloaded_date <- min(downloaded_dates) + max_downloaded_date <- max(downloaded_dates) + + if (verbose) { + message("\nSummary of Downloads:\n") + message("save_folder_path contains mail from date ", min_downloaded_date, " to ", max_downloaded_date, "\n") + } + } else { + if (verbose) { + message("No files found in save_folder_path\n") + } + } -} + if (length(missing_months) == 0) { + if (verbose) { + message("No missing months\n") + } + } else { + warning("Months missing in the date range: ", paste(missing_months, collapse = ", "), "\n") + } + ## Return List of Downloaded Files + # Return the list of downloaded .mbox files + return(downloaded_files) +} -#' Convert pipermail archive files (.txt and .txt.gz) into an mbox format for use with \code{\link{parse_mbox}} -#' @param filelist A vector of pipermail archive files from \code{\link{download_pipermail}} -#' @return Returns `output`, the name of the resulting .mbox file in the current working directory +#' Refresh Pipermail +#' +#' This function refreshes the mailing list files by checking the contents of a specified folder. +#' If the folder is empty, it calls \code{\link{download_pipermail}} to download all pipermail files from start_year_month to the current month. 
+#' If the folder contains already-downloaded mbox files, it identifies the most recent month, deletes that file, and redownloads it +#' along with all future months up to the current real-life month. +#' +#' The naming convention of files is `YYYYMM.mbox`, and the function uses this pattern to identify the most recent month. +#' After deleting the most recent file, the function ensures that the month is redownloaded, along with all subsequent months up to the current month. +#' Redownloading the most recent file makes sure that any files added in that month after the latest refresh are included. +#' +#' @param mailing_list The URL of the mailing list being downloaded (e.g., \url{https://mta.openssl.org/pipermail/openssl-announce/}) +#' @param start_year_month The year and month of the first file to be downloaded (format: 'YYYYMM'). +#' @param save_folder_path The folder path in which all the downloaded pipermail files will be stored. +#' @param verbose if TRUE, prints diagnostic messages. +#' @return Returns `downloaded_files`, a vector of the newly downloaded files in the current working directory. #' @export -convert_pipermail_to_mbox <- function(filelist) { +refresh_pipermail <- function(mailing_list, start_year_month = NULL, save_folder_path, verbose = TRUE) { + + ## Check if Folder is Empty + # Check the contents of the folder to see if any .mbox files are already present + # The function looks for files that match the naming pattern 'YYYYMM.mbox' + files_in_folder <- list.files(save_folder_path, pattern = "\\d{6}\\.mbox$") + + # If the folder is empty + if (length(files_in_folder) == 0) { + # If start_year_month is not specified, issue an error + if (is.null(start_year_month)) { + stop("No existing data found. 
Please specify a start_year_month.") + } + # Otherwise, download all pipermail files starting from the start_year_month + # The end date is set to the current month based on the system date + end_year_month <- format(Sys.Date(), "%Y%m") + if (verbose) message("Folder is empty. Downloading from", start_year_month, "to", end_year_month, "\n") - #at to @ replace function - pipermail_atreplacer <- function(string) { + # Call the download_pipermail function to download files from start_year_month to end_year_month + download_pipermail(mailing_list, start_year_month, end_year_month, save_folder_path) + } + ## Identify the Most Recent Month ## + else { + # If the folder is not empty, identify the most recent month based on the filenames + # The filenames follow the pattern 'YYYYMM.mbox', so we extract the YYYYMM part of the filenames + year_months <- gsub("(\\d{6})\\.mbox$", "\\1", files_in_folder) + + # Find the most recent month by taking the maximum of the extracted YYYYMM values + recent_month <- max(year_months) + + # Delete the most recent file before redownloading it + recent_file <- file.path(save_folder_path, stringi::stri_c(recent_month, ".mbox")) + if (file.exists(recent_file)) { + file.remove(recent_file) + if (verbose) message("Deleted the most recent file:", recent_file, "\n") + } - rstring <- sub(" at ", "@", string) + ## Redownload from the Most Recent Month ## + # Set the end_year_month to the current month (based on the system date) + end_year_month <- format(Sys.Date(), "%Y%m") - return(rstring) + # Redownload files from the most recent month (that was just deleted) to the current month + if (verbose) message("Redownloading from", recent_month, "to", end_year_month, "\n") + # Call the download_pipermail function to redownload the deleted month and all subsequent months up to the current month + download_pipermail(mailing_list, recent_month, end_year_month, save_folder_path) } + ## Process .gz Files After Refresh ## + # Call process_gz_to_mbox_in_folder 
to ensure all .gz files are converted to .mbox after the refresh + if (verbose) message("Processing .gz files in the folder (if any) to convert them to .mbox format...\n") + process_gz_to_mbox_in_folder(save_folder_path = save_folder_path, verbose = verbose) +} - output <- "output.mbox" - #Create mbox file and file connection - file.create(output) - fileConn <- file(output, "w+") +#' Gz to Mbox Converter +#' +#' This function scans a specified folder for any .gz files, unzips them, +#' and renames them to the .mbox format. After unzipping, the original .gz files are deleted. +#' If a .mbox file with the same name already exists, it will be overwritten. +#' This makes sure that all the files in the folder are in .mbox format, ready for parsing. +#' +#' @param save_folder_path The path to the folder containing both .gz and .mbox files. +#' @param verbose if TRUE, prints diagnostic messages during processing. +#' @return A list of the .mbox files that were created or updated. +#' @export +process_gz_to_mbox_in_folder <- function(save_folder_path, verbose = TRUE) { + + # Get the list of all files in the folder, including full paths + files <- list.files(save_folder_path, full.names = TRUE) + # Identify .gz files from the list of files + gz_files <- files[grepl("\\.gz$", files)] - #Read lines from downloaded files and write them to mbox file - for (filename in filelist[]){ + # If there are no .gz files, print a message (if verbose is TRUE) and return NULL + if (length(gz_files) == 0) { + if (verbose) message("This folder does not contain any .gz files.\n") + return(invisible(NULL)) + } + + # Create a vector to store the names of the converted .mbox files + converted_mbox_files <- c() - #Open read connection - readCon <- file(filename, "r") + ## Process Each .gz File ## + # Iterate over each .gz file, unzip it, and convert it to .mbox + for (gz_file in gz_files) { + # Define the corresponding .mbox file path by replacing .gz with .mbox in the file name + mbox_file <- 
gsub("\\.gz$", ".mbox", gz_file) - data <- readLines(filename) + if (verbose) message("Processing:", gz_file, " -> ", mbox_file, "\n") - #Find email headers to send to 'at' to @ replacer - for (i in 1:length(data)) { + # Open the .gz file in binary mode for reading + gz_con <- gzfile(gz_file, open = "rb") - data[i] <- sub("From:? \\S+ at \\S+", pipermail_atreplacer(data[i]), data[i]) + # Create a new .mbox file and open it in binary mode for writing + out_con <- file(mbox_file, open = "wb") + # Read the contents of the .gz file and write the chunks to the .mbox file + while (TRUE) { + bytes <- readBin(gz_con, what = raw(), n = 1024 * 1024) + if (length(bytes) == 0) break + writeBin(bytes, out_con) } - #Write files to output - writeLines(data, fileConn) + # Close both the input (gz) and output (mbox) file connections + close(gz_con) + close(out_con) - #Close read connection - close(readCon) + # After successfully converting the file, delete the original .gz file + file.remove(gz_file) - #Delete the file - unlink(filename, force = TRUE) + # Add the newly created .mbox file to the list of converted files + converted_mbox_files <- c(converted_mbox_files, mbox_file) } - #Close connection to mbox file - close(fileConn) - - #return output location - return(output) + # Return the vector of all the .mbox files that were created or updated + return(converted_mbox_files) } -#' Compose mod_mbox archives (.mbox) into a single mbox file for use with \code{\link{parse_mbox}} -#' @param base_url An url pointing to the mod_mbox directory (e.g. "http://mail-archives.apache.org/mod_mbox") without trailing slashes -#' @param mailing_list Name of the project mailing list (e.g. 
apr-dev) in the mod_mbox directory -#' @param from_year First year in the range to be downloaded -#' @param to_year Last year in the range to be downloaded -#' @param save_file_path the full path, including file name and extension to save the file -#' @param is_per_month If TRUE, does not delete monthly files in tmp. (Default = TRUE) -#' @param verbose Prints progress during execution -#' @return Returns the path of the downloaded mbox file. -#' @export -download_mod_mbox <- function(base_url, mailing_list, from_year, to_year, save_file_path,is_per_month=TRUE,verbose=FALSE) { - - - #Initialize variables - counter <- 0 - destination <- list() - - #Open file handle to output file - output <- path.expand(save_file_path) - fileConn <- file(output, "w+") - #Loop through time and compose the mbox file - for (year in (from_year:to_year)) { +############## Downloader Mod Mbox ############## +#' Download Mod_Mbox +#' +#' This function downloads mod_mbox archives from a specified Apache Pony Mail mailing list as .mbox files. +#' It constructs the download URLs for each month based on the start and end date range and downloads the mbox files +#' in the format "YYYY-MM". The downloaded .mbox files are saved in the specified folder, with a naming convention +#' of YYYYMM.mbox. +#' +#' The function loops through each month in the range specified by `start_year_month` and `end_year_month`, +#' and constructs the appropriate URL to download each month's data. If any download fails, a warning is issued for the failed months. +#' This means the file could not be found and that month's data may not exist. +#' At the end, the function summarizes the downloads, indicating the range of dates present and any missing months. +#' +#' @param mailing_list The URL of the Apache Pony Mail list from which mbox files are to be downloaded +#' (e.g., "https://lists.apache.org/list.html?announce@apache.org"). 
+#' @param start_year_month The year and month of the first file to be downloaded (format: 'YYYYMM'). +#' @param end_year_month The year and month of the last file to be downloaded (format: 'YYYYMM'). +#' @param save_folder_path The folder path where all the downloaded mbox files will be stored. +#' @param verbose if TRUE, prints detailed messages during the download process. +#' @return Returns `save_folder_path`, the folder path where the mbox files are stored. +#' @export +download_mod_mbox <- function(mailing_list, start_year_month, end_year_month, save_folder_path, verbose = TRUE) { + + ## Extract Mailing List Name ## + # Extract the mailing list name from the given URL. This is because the actual list name is + # embedded within the URL (after the 'list.html?'). + # We are using 'sub()' to perform a simple string replacement, extracting everything after 'list.html?'. + mailing_list_name <- sub(".*list.html\\?(.+)", "\\1", mailing_list) + if (verbose) message("Base list extracted:", mailing_list_name, "\n") + + ## Prepare Year and Month ## + # The start_year_month and end_year_month are in the format "YYYYMM". + # Split them into year and month for easier looping. + # Extract first 4 digits as start year, and last 2 digits as start month. + start_year <- as.numeric(substr(start_year_month, 1, 4)) + start_month <- as.numeric(substr(start_year_month, 5, 6)) + # Extract first 4 digits as end year, and last 2 digits as end month. + end_year <- as.numeric(substr(end_year_month, 1, 4)) + end_month <- as.numeric(substr(end_year_month, 5, 6)) + + ## Initialize Vectors for Failed Months ## + # Vectors to track failed downloads. + failed_months <- character() + + ## Download Loop ## + # Iterate over the years and months from start_year/month to end_year/month. + # This is done by looping over the years, and for each year, looping over the 12 months. 
+ for (year in start_year:end_year) { for (month in 1:12) { - counter <- counter + 1 - - #Generate file destinations for the monthly files in /tmp/ - destination[[counter]] <- sprintf("%d%02d.mbox", year, month) - - if(verbose){ - print(stringi::stri_c("Downloading:",destination[[counter]],sep = " ")) + # Skip months before the start_month or after the end_month for the start and end year. + if (year == start_year && month < start_month) next + if (year == end_year && month > end_month) break + + ######### Construct URL and Save Path ## + # Construct the month string (e.g., '2023-04') and the full download URL. + # Make sure the month has two digits. + month_str <- sprintf("%02d", month) + # Create a string in the format "YYYY-MM" + year_month_str <- sprintf("%04d-%02d", year, month) + # This constructs the URL from which the mbox for the current year and month will be downloaded. + # The format for the URL is fixed by Apache's Pony Mail service. + download_url <- stringi::stri_c("https://lists.apache.org/api/mbox.lua?list=", mailing_list_name, "&date=", year_month_str) + + # Create the file name where the mbox will be saved locally, in the format 'YYYYMM.mbox'. + file_name <- stringi::stri_c(year, month_str, ".mbox") + file_path <- file.path(save_folder_path, file_name) + + if (verbose) { + message("Constructed URL:", download_url, "\n") + message("Saving to file:", file_path, "\n") } - #Try file download and save result - full_month_url <- stringi::stri_c(base_url, mailing_list, destination[[counter]], sep = "/") - full_tmp_save_path <- file.path('/tmp',destination[[counter]]) - x <- httr::GET(full_month_url, - httr::write_disk(full_tmp_save_path,overwrite=TRUE)) - - #If download was successful, write to mbox file, if not, delete file - if (httr::http_error(x) == FALSE) { + ## Download Mbox File ## + # Download the file using httr::GET, saving it directly to the destination file path. 
+ response <- httr::GET(download_url, httr::write_disk(file_path, overwrite = TRUE)) + # Get the status code to see if the download succeeded. + status_code <- httr::status_code(response) + + # Check for successful download (status code 200). + if (status_code == 200) { + if (verbose) message("Successfully downloaded:", download_url, "\n") + } else { + if (verbose) { + message("Failed to download:", download_url, "\n") + message("HTTP Status Code:", status_code, "\n") + } + # Remove failed download file. + unlink(file_path) + failed_months <- c(failed_months, year_month_str) + } + } + } - #Open read connection - readCon <- file(full_tmp_save_path, "r") + ## Summary of Failed Downloads ## + if (length(failed_months) > 0) { + warning("The following months could not be downloaded (no data available or other error):\n", paste(failed_months, collapse = ", ")) + } - data <- readLines(full_tmp_save_path) + # List the files in the save_folder_path + downloaded_files <- list.files(save_folder_path, pattern = "\\d{6}\\.mbox$", full.names = FALSE) - #Write data to output - writeLines(data, fileConn) + # Extract the YYYYMM from the file names + downloaded_dates <- as.numeric(sub("(\\d{6})\\.mbox", "\\1", downloaded_files)) - #Close read connection - close(readCon) - } + # Find the expected list of YYYYMM between start_year_month and end_year_month + start_date <- as.Date(paste0(start_year_month, "01"), format = "%Y%m%d") + end_date <- as.Date(paste0(end_year_month, "01"), format = "%Y%m%d") + all_dates <- seq(start_date, end_date, by = "month") + expected_dates <- as.numeric(format(all_dates, "%Y%m")) - #Delete the /tmp/ monthly files - if(!is_per_month){ - unlink(full_tmp_save_path, force = TRUE) - } + # Identify missing months + missing_months <- setdiff(expected_dates, downloaded_dates) + # Determine the earliest and latest dates downloaded + if (length(downloaded_dates) > 0) { + min_downloaded_date <- min(downloaded_dates) + max_downloaded_date <- max(downloaded_dates) + 
if (verbose) { + message("\nSummary of Downloads:\n") + message("save_folder_path contains mail from date", min_downloaded_date, "to", max_downloaded_date, "\n") + } + } else { + if (verbose) { + message("No files found in save_folder_path\n") } - } - #Close connection to target mbox file - close(fileConn) + if (length(missing_months) == 0) { + if (verbose) { + message("No missing months\n") + } + } else { + warning("Months missing in the date range:", paste(missing_months, collapse = ", "), "\n") + } - #return output location - return(output) + ## Return Save Path ## + # Return the folder path where all mbox files were saved. + return(save_folder_path) } -#' Compose mod_mbox archives (.mbox) into a single mbox file for use with \code{\link{parse_mbox}} -#' @param base_url An url pointing to the mod_mbox directory (e.g. "http://mail-archives.apache.org/mod_mbox") without trailing slashes -#' @param mailing_list Name of the project mailing list (e.g. apr-dev) in the mod_mbox directory -#' @param from_year First year in the range to be downloaded -#' @param to_year Last year in the range to be downloaded -#' @param save_folder_path the full *folder* path where the monthly downloaded mbox will be stored. -#' @param verbose Prints progress during execution -#' @return Returns the path of the downloaded mbox file. +#' Refresh Mod_Mbox +#' +#' This function refreshes the mailing list files by checking the contents of a specified folder. +#' If the folder is empty, it calls \code{\link{download_mod_mbox}} to download all mod_mbox files from start_year_month to the current month. +#' If the folder contains already-downloaded mbox files, it identifies the most recent month, deletes that file, and redownloads it +#' along with all future months up to the current real-life month. +#' +#' The naming convention of files is `YYYYMM.mbox`, and the function uses this pattern to identify the most recent month. 
+#' After deleting the most recent file, the function ensures that the month is redownloaded, along with all subsequent months up to the current month. +#' Redownloading the most recent file ensures any files added in that month after the latest refresh are included. +#' +#' @param mailing_list The URL of the mailing list being downloaded (e.g., \url{https://lists.apache.org/list.html?announce@apache.org}) +#' @param start_year_month The year and month of the first file to be downloaded (format: 'YYYYMM'). +#' @param save_folder_path The folder path in which all the downloaded mod_mbox files will be stored. +#' @param verbose if TRUE, prints diagnostic messages. +#' @return Returns `downloaded_files`, a vector of the newly downloaded files in the current working directory. #' @export -download_mod_mbox_per_month <- function(base_url, mailing_list, from_year, to_year, save_folder_path,verbose=FALSE) { - - - #Initialize variables - counter <- 0 - destination <- list() - - #Open file handle to output file - output <- path.expand(save_folder_path) - - #Loop through time and compose the mbox file - for (year in (from_year:to_year)) { - - for (month in 1:12) { - counter <- counter + 1 - - #Generate file destinations for the monthly files in /tmp/ - destination[[counter]] <- sprintf("%d%02d.mbox", year, month) - - if(verbose){ - print(stringi::stri_c("Downloading:",destination[[counter]],sep = " ")) - } +refresh_mod_mbox <- function(mailing_list, start_year_month = NULL, save_folder_path, verbose = TRUE) { + + ## Check if Folder is Empty ## + # Check the contents of the folder to see if any .mbox files are already present. + # The function looks for files that match the naming pattern 'YYYYMM.mbox' + files_in_folder <- list.files(save_folder_path, pattern = "\\d{6}\\.mbox$") + + # If the folder is empty + if (length(files_in_folder) == 0) { + # If start_year_month is not specified, issue an error + if (is.null(start_year_month)) { + stop("No existing data found. 
Please specify a start_year_month.") + } + # Otherwise, download all mod_mbox files starting from start_year_month + # The end date is set to the current month based on the system date + end_year_month <- format(Sys.Date(), "%Y%m") + if (verbose) message("Folder is empty. Downloading from", start_year_month, "to", end_year_month, "\n") - #Try file download and save result - full_month_url <- stringi::stri_c(base_url, mailing_list, destination[[counter]], sep = "/") - full_tmp_save_path <- file.path(output,destination[[counter]]) - x <- httr::GET(full_month_url, - httr::write_disk(full_tmp_save_path,overwrite=TRUE)) - - # Remove file if error - # Can only be done post-write, see https://github.com/r-lib/httr/issues/553 - if (httr::http_error(x) && file.exists(full_tmp_save_path)) { - warning(paste0("Unable to download: ",destination[[counter]])) - file.remove(full_tmp_save_path) - } + # Call the download_mod_mbox function to download files from start_year_month to end_year_month + download_mod_mbox(mailing_list, start_year_month, end_year_month, save_folder_path, verbose = verbose) + } + ## Identify the Most Recent Month ## + else { + # If the folder is not empty, identify the most recent month based on the filenames + # The filenames follow the pattern 'YYYYMM.mbox', so we extract the YYYYMM part of the filenames + year_months <- gsub("(\\d{6})\\.mbox$", "\\1", files_in_folder) + + # Find the most recent month by taking the maximum of the extracted YYYYMM values + recent_month <- max(year_months) + + # Delete the most recent file before redownloading it + recent_file <- file.path(save_folder_path, stringi::stri_c(recent_month, ".mbox")) + if (file.exists(recent_file)) { + file.remove(recent_file) + if (verbose) message("Deleted the most recent file:", recent_file, "\n") + } + ## Redownload from the Most Recent Month ## + # Set the end_year_month to the current month (based on the system date) + end_year_month <- format(Sys.Date(), "%Y%m") - } + # Redownload files 
from the most recent month (that was just deleted) to the current month + if (verbose) message("Redownloading from", recent_month, "to", end_year_month, "\n") + # Call the download_mod_mbox function to redownload the deleted month and all subsequent months up to the current month + download_mod_mbox(mailing_list, recent_month, end_year_month, save_folder_path, verbose = verbose) } - - #return output location - return(output) } + ############## Parsers ############## -#' Parse mbox from Perceval +#' Parse Mbox #' #' Parses an mbox file, which consists of emails in a mailbox, using the Perceval library. #' Note .mbox files do not have a consistent number of fields (e.g. Reply Cc.). Due to that, @@ -247,30 +579,58 @@ download_mod_mbox_per_month <- function(base_url, mailing_list, from_year, to_ye #' consistently renamed for clarity. #' #' @param perceval_path path to perceval binary -#' @param mbox_path path to mbox archive file (ends in .mbox) +#' @param mbox_file_path path to mbox archive file (ends in .mbox) +#' @export +#' @family parsers +#' @param perceval_path path to perceval binary +#' @param mbox_file_path path to mbox archive file (ends in .mbox) #' @export #' @family parsers -parse_mbox <- function(perceval_path,mbox_path){ +parse_mbox <- function(perceval_path, mbox_file_path) { # Expand paths (e.g. "~/Desktop" => "/Users/someuser/Desktop") perceval_path <- path.expand(perceval_path) - mbox_path <- path.expand(mbox_path) + mbox_file_path <- path.expand(mbox_file_path) + # Remove ".mbox" - mbox_uri <- stri_replace_last(mbox_path,replacement="",regex=".mbox") + mbox_uri <- stri_replace_last(mbox_file_path,replacement="",regex=".mbox") + # Use percerval to parse mbox_path. --json line is required to be parsed by jsonlite::fromJSON. - perceval_output <- system2(perceval_path, - args = c('mbox',mbox_uri,mbox_path,'--json-line'), - stdout = TRUE, - stderr = FALSE) - # Parsed JSON output as a data.table. 
- perceval_parsed <- data.table(jsonlite::stream_in(textConnection(perceval_output),verbose=FALSE)) - - columns_of_interest <- c("data.Message.ID","data.In.Reply.To","data.Date","data.From","data.To","data.Cc","data.Subject","data.body.plain","data.body") - columns_rename <- c("reply_id","in_reply_to_id","reply_datetimetz","reply_from","reply_to","reply_cc","reply_subject","reply_body","reply_body") + perceval_output <- tryCatch({ + system2(perceval_path, + args = c('mbox', mbox_uri, mbox_file_path, '--json-line'), + stdout = TRUE, + stderr = FALSE) + }, error = function(e) { + #print("Error running Perceval:") + #print(e$message) + stop("Perceval execution failed.") + }) + + # Filter JSON lines from Perceval output + json_lines <- perceval_output[grepl("^\\{", perceval_output)] # Escape the `{` character + + + if (length(json_lines) == 0) { + stop("No valid JSON lines found in Perceval output. Check the mbox file or Perceval configuration.") + } + + # Parse JSON output as a data.table + perceval_parsed <- tryCatch({ + # Parsed JSON output as a data.table. 
+ data.table(jsonlite::stream_in(textConnection(perceval_output),verbose=FALSE)) + }, error = function(e) { + #print(e$message) + stop("JSON parsing failed.") + }) + + + columns_of_interest <- c("data.Message.ID", "data.In.Reply.To", "data.Date", "data.From", "data.To", "data.Cc", "data.Subject", "data.body.plain", "data.body") + columns_rename <- c("reply_id", "in_reply_to_id", "reply_datetimetz", "reply_from", "reply_to", "reply_cc", "reply_subject", "reply_body", "reply_body") is_available_column <- columns_of_interest %in% colnames(perceval_parsed) columns_of_interest <- columns_of_interest[is_available_column] - perceval_parsed <- perceval_parsed[,..columns_of_interest] + perceval_parsed <- perceval_parsed[, ..columns_of_interest] data.table::setnames(x = perceval_parsed, old = colnames(perceval_parsed), @@ -279,6 +639,38 @@ parse_mbox <- function(perceval_path,mbox_path){ return(perceval_parsed) } +#' Parse Latest Mbox +#' +#' This function returns the name of the latest mod_mbox file downloaded in the specified folder +#' based on the naming convention `YYYYMM.mbox`. For example: `202401.mbox`. 
+#' +#' @param save_folder_path path to the folder containing the mbox files +#' @return `latest_mbox_file` the name of the latest mod_mbox file +#' @export +#' @family parsers +parse_mbox_latest_date <- function(save_folder_path) { + # List all .mbox files in the folder with the expected naming pattern + file_list <- list.files(save_folder_path, pattern = "\\d{6}\\.mbox$") + + if (length(file_list) == 0) { + warning("No .mbox files found in the folder.") + return(invisible(NULL)) + } + + # Extract the dates from the filenames + date_list <- sub("(\\d{6})\\.mbox$", "\\1", file_list) + + # Convert dates to numeric for comparison + date_numeric <- as.numeric(date_list) + + # Find the latest date + latest_date <- max(date_numeric, na.rm = TRUE) + + # Find the file corresponding to the latest date + latest_mbox_file <- file_list[date_numeric == latest_date] + + return(latest_mbox_file) +} ############## Fake Generator ############## @@ -307,12 +699,12 @@ make_mbox_reply <- function(mailing_list, reply_from_author, reply_from_email, r # format the date correctly cdate <- format(as.POSIXct(reply_datetime, format = "%Y-%m-%dT%H:%M:%S"), "%a, %e %b %Y %H:%M:%S ") - reply_from_full_info <- paste0(reply_from_author, " <", reply_from_email, ">") - reply_to_full_info <- paste0(reply_to_author, " <", reply_to_email, ">") - reply_cc_full_info <- paste0(reply_cc_author, " <", reply_cc_email, ">") + reply_from_full_info <- stringi::stri_c(reply_from_author, " <", reply_from_email, ">") + reply_to_full_info <- stringi::stri_c(reply_to_author, " <", reply_to_email, ">") + reply_cc_full_info <- stringi::stri_c(reply_cc_author, " <", reply_cc_email, ">") - mbox_content <- paste0( + mbox_content <- stringi::stri_c( "From MAILER-DAEMON Thu Jul 18 13:48:48 2013", "\nPath: example.com!not-for-mail", "\nFrom: ", reply_from_full_info, @@ -331,7 +723,7 @@ make_mbox_reply <- function(mailing_list, reply_from_author, reply_from_email, r "\nX-Accept-Language: en-us ", "\nOriginal-To: ", 
reply_to_full_info, " ", reply_cc_full_info, "\nPrecedence: bulk", - "\nX-Mailing-List: ", paste0(mailing_list, "@example.com"), + "\nX-Mailing-List: ", stringi::stri_c(mailing_list, "@example.com"), "\n\n", reply_body ) @@ -352,16 +744,11 @@ make_mbox_reply <- function(mailing_list, reply_from_author, reply_from_email, r make_mbox_mailing_list <- function(replies, folder_path = "/tmp", file_name) { # Create a unique filename for the mbox file - mbox_filepath <- file.path(folder_path, paste0(file_name, ".mbox")) + mbox_filepath <- file.path(folder_path, stringi::stri_c(file_name, ".mbox")) - # make the file - mbox_body <- stringi::stri_c(replies,collapse = "\n\n") - io_make_file(mbox_filepath,mbox_body) + # Write the mbox content + mbox_body <- stringi::stri_c(replies, collapse = "\n\n") + io_make_file(mbox_filepath, mbox_body) - # Return the path of the created mbox file return(mbox_filepath) } - - - - diff --git a/README.md b/README.md index 5e77998f..b08b2af6 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ I also recommend you download the repo to have some example project configuratio 1. Clone this repo 2. Open `kaiaulu.Rproj` using RStudio - 3. Run the unit tests `devtools::test()`. If any fail, and you are not clear why, feel free to [ask in Discussions](https://github.com/sailuh/kaiaulu/discussions) + 3. Run the unit tests `devtools::test()`. If any fail and you are not clear why, feel free to [ask in Discussions](https://github.com/sailuh/kaiaulu/discussions) 4. Build the documentation `devtools::document(roclets = c('rd', 'collate', 'namespace'))`. 5. Build Kaiaulu (Top right pane in RStudio -> Build tab -> Install and Restart) 6. Run `vignettes/kaiaulu_architecture.Rmd` diff --git a/_pkgdown.yml b/_pkgdown.yml index aede9d0c..e988c7c2 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -95,14 +95,14 @@ reference: - title: __Mail__ desc: > Download, parsing and data generation of mailing lists. 
- See the [Download Mbox](../articles/download_mod_mbox.html) + See the [Download Mail](../articles/download_mail.html) and [Reply](../articles/reply_communication_showcase.html) Notebooks for details. - contents: - download_pipermail - - convert_pipermail_to_mbox + - refresh_pipermail - download_mod_mbox - - download_mod_mbox_per_month + - refresh_mod_mbox - parse_mbox - make_mbox_reply - make_mbox_mailing_list @@ -204,11 +204,6 @@ reference: - is_same_identity - assign_exact_identity - identity_match -- title: __Interval__ - desc: Provides different types of interval windows (e.g. release) for metric functions. -- contents: - - interval_commit_metric - - get_date_from_commit_hash - title: __Metrics__ desc: > Various metrics used to estimate code quality, @@ -356,3 +351,10 @@ reference: - get_window_end_commit - get_window_size - get_window_start_commit +- title: __Interval__ + desc: Provides different types of interval windows (e.g. release) for metric functions. +- contents: + - interval_commit_metric + - get_date_from_commit_hash + - process_gz_to_mbox_in_folder + - parse_mbox_latest_date diff --git a/conf/helix.yml b/conf/helix.yml index aedf0a58..1c84052d 100644 --- a/conf/helix.yml +++ b/conf/helix.yml @@ -48,26 +48,27 @@ version_control: - revert-1685-master mailing_list: + # If projects uses Apache Mod Mbox mod_mbox: + # There can be multiple projects in both the pipermail and mod mbox sections. project_key_1: mailing_list: https://lists.apache.org/list.html?announce@apache.org - save_folder_path: ../../rawdata/helix/mod_mbox/ - + save_folder_path: ../../rawdata/helix/mod_mbox/save_mbox_mail + # mbox_file_path is for use only with parse_mbox() function. It is the file to parse + project_key_2: + mailing_list: https://lists.apache.org/list.html?dev@felix.apache.org + save_folder_path: ../../helix/mod_mbox/save_mbox_mail_2 + # mbox_file_path is for use only with parse_mbox() function. 
It is the file to parse + # If project uses Pipermail +# pipermail: +# project_key_1: +# mailing_list: https://mta.openssl.org/pipermail/openssl-users/ +# save_folder_path: ../../rawdata/helix/pipermail/save_mbox_mail + # mbox_file_path is for use only with parse_mbox() function. It is the file to parse # project_key_2: -# mailing_list: https://lists.apache.org/list.html?dev@felix.apache.org -# save_folder_path: ../../rawdata/helix/mod_mbox/save_mbox_mail_2/ - -# pipermail: -# project_key_1: -# mailing_list: https://mta.openssl.org/pipermail/openssl-users/ -# save_folder_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail/ -# # mbox_file_path is for use only with parse_mbox() function. It is the file to parse -# mbox_file_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail/kaiaulu.mbox -# project_key_2: -# mailing_list: https://mta.openssl.org/pipermail/openssl-project/ -# save_folder_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail_2/ -# # mbox_file_path is for use only with parse_mbox() function. It is the file to parse -# mbox_file_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail_2/kaiaulu.mbox +# mailing_list: https://mta.openssl.org/pipermail/openssl-project/ +# save_folder_path: ../../rawdata/helix/pipermail/save_mbox_mail_2/ + # mbox_file_path is for use only with parse_mbox() function. It is the file to parse issue_tracker: jira: diff --git a/conf/openssl.yml b/conf/openssl.yml index d41cf319..91871521 100644 --- a/conf/openssl.yml +++ b/conf/openssl.yml @@ -45,27 +45,10 @@ version_control: - master mailing_list: - mod_mbox: - project_key_1: - mailing_list: http://mail-archives.apache.org/mod_mbox/kaiaulu-dev - save_folder_path: ../../rawdata/kaiaulu/mod_mbox/save_mbox_mail/ - # mbox_file_path is for use only with parse_mbox() function. 
It is the file to parse - mbox_file_path: ../../rawdata/kaiaulu/mod_mbox/save_mbox_mail/kaiaulu.mbox - project_key_2: - mailing_list: http://mail-archives.apache.org/mod_mbox/kaiaulu-user - save_folder_path: ../../rawdata/kaiaulu/mod_mbox/save_mbox_mail_2/ - # mbox_file_path is for use only with parse_mbox() function. It is the file to parse - mbox_file_path: ../../rawdata/kaiaulu/mod_mbox/save_mbox_mail_2/kaiaulu.mbox - pipermail: + pipermail: project_key_1: - mailing_list: https://mta.openssl.org/pipermail/openssl-dev/ - save_folder_path: ../../rawdata/openssl/pipermail/save_mbox_mail/ - # mbox_file_path is for use only with parse_mbox() function. It is the file to parse -# mbox_file_path: ../../rawdata/openssl/pipermail/save_mbox_mail/openssl.mbox - project_key_2: mailing_list: https://mta.openssl.org/pipermail/openssl-users/ - save_folder_path: ../../rawdata/openssl/pipermail/save_mbox_mail_2/ -# mbox_file_path: ../../rawdata/openssl/pipermail/save_mbox_mail_2/openssl.mbox + save_folder_path: ../../rawdata/openssl/pipermail/save_mbox_users # issue_tracker: # jira: diff --git a/exec/mailinglist.R b/exec/mailinglist.R index 278edbdd..4a621722 100755 --- a/exec/mailinglist.R +++ b/exec/mailinglist.R @@ -6,101 +6,102 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at https://mozilla.org/MPL/2.0/. 
-require(yaml,quietly=TRUE) -require(cli,quietly=TRUE) -require(docopt,quietly=TRUE) -require(kaiaulu,quietly=TRUE) -require(data.table,quietly=TRUE) - - +require(yaml, quietly = TRUE) +require(cli, quietly = TRUE) +require(docopt, quietly = TRUE) +require(kaiaulu, quietly = TRUE) +require(data.table, quietly = TRUE) doc <- " USAGE: - mailinglist.R tabulate help - mailinglist.R tabulate - mailinglist.R download modmbox help - mailinglist.R download modmbox - mailinglist.R download modmboxmonth help - mailinglist.R download modmboxmonth + mailinglist.R parse help + mailinglist.R parse + mailinglist.R refresh modmbox help + mailinglist.R refresh modmbox + mailinglist.R refresh pipermail help + mailinglist.R refresh pipermail mailinglist.R (-h | --help) mailinglist.R --version DESCRIPTION: Provides a suite of functions to interact with Mailing Lists. Please see - Kaiaulu's README.md for instructions on how to create + Kaiaulu's README.md for instructions on how to create and . - OPTIONS: -h --help Show this screen. --version Show version. 
" +arguments <- docopt::docopt(doc, version = 'Kaiaulu 0.0.0.9700') - -arguments <- docopt::docopt(doc, version = 'Kaiaulu 0.0.0.9600') -if(arguments[["tabulate"]] & arguments[["help"]]){ - cli_alert_info("Tabulates a mailing list using parse_mbox().") -}else if(arguments[["tabulate"]]){ +if (arguments[["parse"]] & arguments[["help"]]) { + cli::cli_alert_info("Parses an mbox file using parse_mbox().") +} else if (arguments[["parse"]]) { tools_path <- arguments[[""]] - conf_path <- arguments[[""]] + mbox_file_path <- arguments[[""]] save_path <- arguments[[""]] - tool <- yaml::read_yaml(tools_path) - conf <- yaml::read_yaml(conf_path) - - perceval_path <- path.expand(tool[["perceval"]]) - mbox_path <- path.expand(conf[["mailing_list"]][["mbox"]]) + tools <- yaml::read_yaml(tools_path) + perceval_path <- get_tool_project("perceval", tools) - project_mbox <- parse_mbox(perceval_path,mbox_path) + cli::cli_alert_info(paste0("Parsing mbox file: ", mbox_file_path)) + parsed_mbox <- parse_mbox( + perceval_path = perceval_path, + mbox_file_path = mbox_file_path + ) - cli_alert_success(paste0("Tabulated mailing list was saved at: ",save_path)) + data.table::fwrite(parsed_mbox, save_path) + cli::cli_alert_success(paste0("Parsed mbox file was saved at: ", save_path)) - data.table::fwrite(project_mbox,save_path) -}else if(arguments[["download"]] & arguments[["modmbox"]] & arguments[["help"]]){ - cli_alert_info("Saves a mailing list archive from mod_mbox as a .mbox file - using download_mod_mbox().") -}else if(arguments[["download"]] & arguments[["modmbox"]]){ +} else if (arguments[["refresh"]] & arguments[["modmbox"]] & arguments[["help"]]) { + cli::cli_alert_info("Refreshes mailing list archives from mod_mbox using refresh_mod_mbox().") +} else if (arguments[["refresh"]] & arguments[["modmbox"]]) { conf_path <- arguments[[""]] - save_path <- arguments[[""]] - conf <- yaml::read_yaml(conf_path) - - mod_mbox_url <- conf[["mailing_list"]][["domain"]] - mailing_list <- 
conf[["mailing_list"]][["list_key"]][1] + project_key <- arguments[[""]] + start_year_month <- arguments[[""]] - start_year <- arguments[[""]] - end_year <- arguments[[""]] + conf <- yaml::read_yaml(conf_path) + mailing_list <- get_mbox_domain(conf, project_key) + save_folder_path <- get_mbox_path(conf, project_key) - mbox <- download_mod_mbox(base_url = mod_mbox_url, - mailing_list = mailing_list, - from_year=start_year, - to_year=end_year, - save_file_path = save_path, - verbose = TRUE) + refresh_mod_mbox( + mailing_list = mailing_list, + start_year_month = start_year_month, + save_folder_path = save_folder_path, + verbose = TRUE + ) - cli_alert_success(paste0("Downloaded mailing list was saved at: ",save_path)) -}else if(arguments[["download"]] & arguments[["modmboxmonth"]]){ + cli::cli_alert_success(paste0("Refreshed mailing list archives were saved at: ", save_folder_path)) +} else if (arguments[["refresh"]] & arguments[["pipermail"]] & arguments[["help"]]) { + cli::cli_alert_info("Refreshes mailing list archives from pipermail using refresh_pipermail().") +} else if (arguments[["refresh"]] & arguments[["pipermail"]]) { conf_path <- arguments[[""]] - save_path <- arguments[[""]] - conf <- yaml::read_yaml(conf_path) + project_key <- arguments[[""]] + start_year_month <- arguments[[""]] - mod_mbox_url <- conf[["mailing_list"]][["domain"]] - mailing_list <- conf[["mailing_list"]][["list_key"]][1] - - start_year <- arguments[[""]] - end_year <- arguments[[""]] - - mbox <- download_mod_mbox_per_month(base_url = mod_mbox_url, - mailing_list = mailing_list, - from_year=start_year, - to_year=end_year, - save_folder_path = save_path, - verbose = TRUE) - - cli_alert_success(paste0("Downloaded mailing list was saved at: ",save_path)) + conf <- yaml::read_yaml(conf_path) + mailing_list <- get_pipermail_domain(conf, project_key) + save_folder_path <- get_pipermail_path(conf, project_key) + + refresh_pipermail( + mailing_list = mailing_list, + start_year_month = 
start_year_month, + save_folder_path = save_folder_path, + verbose = TRUE + ) + + cli::cli_alert_success(paste0("Refreshed mailing list archives were saved at: ", save_folder_path)) + +} else if (arguments[["-h"]] || arguments[["--help"]]) { + cli::cli_alert_info(doc) +} else if (arguments[["--version"]]) { + cli::cli_alert_info('Kaiaulu 0.0.0.9700') +} else { + cli::cli_alert_danger("Invalid command or arguments. Use --help for usage information.") } diff --git a/man/convert_pipermail_to_mbox.Rd b/man/convert_pipermail_to_mbox.Rd deleted file mode 100644 index 441b1230..00000000 --- a/man/convert_pipermail_to_mbox.Rd +++ /dev/null @@ -1,17 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/mail.R -\name{convert_pipermail_to_mbox} -\alias{convert_pipermail_to_mbox} -\title{Convert pipermail archive files (.txt and .txt.gz) into an mbox format for use with \code{\link{parse_mbox}}} -\usage{ -convert_pipermail_to_mbox(filelist) -} -\arguments{ -\item{filelist}{A vector of pipermail archive files from \code{\link{download_pipermail}}} -} -\value{ -Returns `output`, the name of the resulting .mbox file in the current working directory -} -\description{ -Convert pipermail archive files (.txt and .txt.gz) into an mbox format for use with \code{\link{parse_mbox}} -} diff --git a/man/download_mod_mbox.Rd b/man/download_mod_mbox.Rd index 3f4ec8e5..9347aa22 100644 --- a/man/download_mod_mbox.Rd +++ b/man/download_mod_mbox.Rd @@ -2,36 +2,40 @@ % Please edit documentation in R/mail.R \name{download_mod_mbox} \alias{download_mod_mbox} -\title{Compose mod_mbox archives (.mbox) into a single mbox file for use with \code{\link{parse_mbox}}} +\title{Download Mod_Mbox} \usage{ download_mod_mbox( - base_url, mailing_list, - from_year, - to_year, - save_file_path, - is_per_month = TRUE, - verbose = FALSE + start_year_month, + end_year_month, + save_folder_path, + verbose = TRUE ) } \arguments{ -\item{base_url}{An url pointing to the mod_mbox 
directory (e.g. "http://mail-archives.apache.org/mod_mbox") without trailing slashes} +\item{mailing_list}{The URL of the Apache Pony Mail list from which mbox files are to be downloaded +(e.g., "https://lists.apache.org/list.html?announce@apache.org").} -\item{mailing_list}{Name of the project mailing list (e.g. apr-dev) in the mod_mbox directory} +\item{start_year_month}{The year and month of the first file to be downloaded (format: 'YYYYMM').} -\item{from_year}{First year in the range to be downloaded} +\item{end_year_month}{The year and month of the last file to be downloaded (format: 'YYYYMM').} -\item{to_year}{Last year in the range to be downloaded} +\item{save_folder_path}{The folder path where all the downloaded mbox files will be stored.} -\item{save_file_path}{the full path, including file name and extension to save the file} - -\item{is_per_month}{If TRUE, does not delete monthly files in tmp. (Default = TRUE)} - -\item{verbose}{Prints progress during execution} +\item{verbose}{if TRUE, prints detailed messages during the download process.} } \value{ -Returns the path of the downloaded mbox file. +Returns `save_folder_path`, the folder path where the mbox files are stored. } \description{ -Compose mod_mbox archives (.mbox) into a single mbox file for use with \code{\link{parse_mbox}} +This function downloads mod_mbox archives from a specified Apache Pony Mail mailing list as .mbox files. +It constructs the download URLs for each month based on the start and end date range and downloads the mbox files +in the format "YYYY-MM". The downloaded .mbox files are saved in the specified folder, with a naming convention +of YYYYMM.mbox. +} +\details{ +The function loops through each month in the range specified by `start_year_month` and `end_year_month`, +and constructs the appropriate URL to download each month's data. If any download fails, an warning is issued for the failed months. +This means the file could not be found and that month's data may not exist. 
+At the end, the function summarizes the downloads, indicating the range of dates present and any missing months. } diff --git a/man/download_mod_mbox_per_month.Rd b/man/download_mod_mbox_per_month.Rd deleted file mode 100644 index 5be3c0a3..00000000 --- a/man/download_mod_mbox_per_month.Rd +++ /dev/null @@ -1,34 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/mail.R -\name{download_mod_mbox_per_month} -\alias{download_mod_mbox_per_month} -\title{Compose mod_mbox archives (.mbox) into a single mbox file for use with \code{\link{parse_mbox}}} -\usage{ -download_mod_mbox_per_month( - base_url, - mailing_list, - from_year, - to_year, - save_folder_path, - verbose = FALSE -) -} -\arguments{ -\item{base_url}{An url pointing to the mod_mbox directory (e.g. "http://mail-archives.apache.org/mod_mbox") without trailing slashes} - -\item{mailing_list}{Name of the project mailing list (e.g. apr-dev) in the mod_mbox directory} - -\item{from_year}{First year in the range to be downloaded} - -\item{to_year}{Last year in the range to be downloaded} - -\item{save_folder_path}{the full *folder* path where the monthly downloaded mbox will be stored.} - -\item{verbose}{Prints progress during execution} -} -\value{ -Returns the path of the downloaded mbox file. 
-} -\description{ -Compose mod_mbox archives (.mbox) into a single mbox file for use with \code{\link{parse_mbox}} -} diff --git a/man/download_pipermail.Rd b/man/download_pipermail.Rd index 9f4db683..24f75c83 100644 --- a/man/download_pipermail.Rd +++ b/man/download_pipermail.Rd @@ -2,16 +2,41 @@ % Please edit documentation in R/mail.R \name{download_pipermail} \alias{download_pipermail} -\title{Download all pipermail files in an archive} +\title{Pipermail Downloader} \usage{ -download_pipermail(url) +download_pipermail( + mailing_list, + start_year_month, + end_year_month, + save_folder_path, + verbose = TRUE +) } \arguments{ -\item{url}{An url pointing to a pipermail archive} +\item{mailing_list}{The name of the mailing list being downloaded e.g. "https://mta.openssl.org/pipermail/openssl-announce/"} + +\item{start_year_month}{The year and month of the first file to be downloaded format: 'YYYYMM'} + +\item{end_year_month}{The year and month of the last file to be downloaded format: 'YYYYMM', or use Sys.Date} + +\item{save_folder_path}{The folder path in which all the downloaded pipermail files will be stored} + +\item{verbose}{if TRUE, prints diagnostic messages during the download process} } \value{ -Returns `destination`, a vector of the downloaded files in the current working directory +Returns `downloaded_files`, a vector of the downloaded files in the current working directory } \description{ -Download all pipermail files in an archive +This function downloads pipermail archives from a specified pipermail mailing list as .mbox files. +It begins by downloading an .html file that contains the URLs for monthly archives in .txt or .gz formats. +The function first attempts to download the .txt file, and if unavailable, it falls back to downloading the .gz file. +} +\details{ +When a .gz file is downloaded, the function automatically unzips and converts it into an .mbox file, +overwriting any existing file with the same name. 
The original .gz file is deleted after extraction. + +The downloaded .mbox files are saved in the specified folder following the naming convention YYYYMM.mbox. +The function only downloads files that fall between the specified start_year_month and end_year_month. +When both formats fail to download, the function issues a warning indicating the missing month. +At the end, the function summarizes the downloads, indicating the range of dates present and any missing months. } diff --git a/man/parse_bugzilla_perceval_rest_issue_comments.Rd b/man/parse_bugzilla_perceval_rest_issue_comments.Rd index 2616e16b..7a498a41 100644 --- a/man/parse_bugzilla_perceval_rest_issue_comments.Rd +++ b/man/parse_bugzilla_perceval_rest_issue_comments.Rd @@ -37,6 +37,7 @@ Other parsers: \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, \code{\link{parse_jira}()}, +\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_mbox}()}, \code{\link{parse_nvdfeed}()}, \code{\link{parse_understand_dependencies}()} diff --git a/man/parse_bugzilla_perceval_traditional_issue_comments.Rd b/man/parse_bugzilla_perceval_traditional_issue_comments.Rd index 8d22b68c..1110ceb6 100644 --- a/man/parse_bugzilla_perceval_traditional_issue_comments.Rd +++ b/man/parse_bugzilla_perceval_traditional_issue_comments.Rd @@ -37,6 +37,7 @@ Other parsers: \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, \code{\link{parse_jira}()}, +\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_mbox}()}, \code{\link{parse_nvdfeed}()}, \code{\link{parse_understand_dependencies}()} diff --git a/man/parse_bugzilla_rest_comments.Rd b/man/parse_bugzilla_rest_comments.Rd index 4c23b9c8..f1640990 100644 --- a/man/parse_bugzilla_rest_comments.Rd +++ b/man/parse_bugzilla_rest_comments.Rd @@ -30,6 +30,7 @@ Other parsers: \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, \code{\link{parse_jira}()}, +\code{\link{parse_mbox_latest_date}()}, 
\code{\link{parse_mbox}()}, \code{\link{parse_nvdfeed}()}, \code{\link{parse_understand_dependencies}()} diff --git a/man/parse_bugzilla_rest_issues.Rd b/man/parse_bugzilla_rest_issues.Rd index 6d1a7086..c1cb667f 100644 --- a/man/parse_bugzilla_rest_issues.Rd +++ b/man/parse_bugzilla_rest_issues.Rd @@ -32,6 +32,7 @@ Other parsers: \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, \code{\link{parse_jira}()}, +\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_mbox}()}, \code{\link{parse_nvdfeed}()}, \code{\link{parse_understand_dependencies}()} diff --git a/man/parse_bugzilla_rest_issues_comments.Rd b/man/parse_bugzilla_rest_issues_comments.Rd index cda90b6c..f4a6f052 100644 --- a/man/parse_bugzilla_rest_issues_comments.Rd +++ b/man/parse_bugzilla_rest_issues_comments.Rd @@ -34,6 +34,7 @@ Other parsers: \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, \code{\link{parse_jira}()}, +\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_mbox}()}, \code{\link{parse_nvdfeed}()}, \code{\link{parse_understand_dependencies}()} diff --git a/man/parse_commit_message_id.Rd b/man/parse_commit_message_id.Rd index caef84ed..afc526d5 100644 --- a/man/parse_commit_message_id.Rd +++ b/man/parse_commit_message_id.Rd @@ -29,6 +29,7 @@ Other parsers: \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, \code{\link{parse_jira}()}, +\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_mbox}()}, \code{\link{parse_nvdfeed}()}, \code{\link{parse_understand_dependencies}()} diff --git a/man/parse_dependencies.Rd b/man/parse_dependencies.Rd index 062bbf5b..ac8e0b7a 100644 --- a/man/parse_dependencies.Rd +++ b/man/parse_dependencies.Rd @@ -38,6 +38,7 @@ Other parsers: \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, \code{\link{parse_jira}()}, +\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_mbox}()}, \code{\link{parse_nvdfeed}()}, 
\code{\link{parse_understand_dependencies}()} diff --git a/man/parse_dv8_clusters.Rd b/man/parse_dv8_clusters.Rd index 524e4d40..9b2500a8 100644 --- a/man/parse_dv8_clusters.Rd +++ b/man/parse_dv8_clusters.Rd @@ -27,6 +27,7 @@ Other parsers: \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, \code{\link{parse_jira}()}, +\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_mbox}()}, \code{\link{parse_nvdfeed}()}, \code{\link{parse_understand_dependencies}()} diff --git a/man/parse_gitlog.Rd b/man/parse_gitlog.Rd index 5a0c8cad..a870cc42 100644 --- a/man/parse_gitlog.Rd +++ b/man/parse_gitlog.Rd @@ -33,6 +33,7 @@ Other parsers: \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, \code{\link{parse_jira}()}, +\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_mbox}()}, \code{\link{parse_nvdfeed}()}, \code{\link{parse_understand_dependencies}()} diff --git a/man/parse_jira.Rd b/man/parse_jira.Rd index b0363181..3520b5ba 100644 --- a/man/parse_jira.Rd +++ b/man/parse_jira.Rd @@ -43,6 +43,7 @@ Other parsers: \code{\link{parse_gitlog}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, +\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_mbox}()}, \code{\link{parse_nvdfeed}()}, \code{\link{parse_understand_dependencies}()} diff --git a/man/parse_jira_latest_date.Rd b/man/parse_jira_latest_date.Rd index e92216bd..c6b37e5d 100644 --- a/man/parse_jira_latest_date.Rd +++ b/man/parse_jira_latest_date.Rd @@ -35,6 +35,7 @@ Other parsers: \code{\link{parse_gitlog}()}, \code{\link{parse_jira_rss_xml}()}, \code{\link{parse_jira}()}, +\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_mbox}()}, \code{\link{parse_nvdfeed}()}, \code{\link{parse_understand_dependencies}()} diff --git a/man/parse_jira_rss_xml.Rd b/man/parse_jira_rss_xml.Rd index 1043f976..933cea1e 100644 --- a/man/parse_jira_rss_xml.Rd +++ b/man/parse_jira_rss_xml.Rd @@ -38,6 +38,7 @@ Other parsers: 
\code{\link{parse_gitlog}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira}()}, +\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_mbox}()}, \code{\link{parse_nvdfeed}()}, \code{\link{parse_understand_dependencies}()} diff --git a/man/parse_mbox.Rd b/man/parse_mbox.Rd index 8b8ad909..cfc8752b 100644 --- a/man/parse_mbox.Rd +++ b/man/parse_mbox.Rd @@ -2,14 +2,14 @@ % Please edit documentation in R/mail.R \name{parse_mbox} \alias{parse_mbox} -\title{Parse mbox from Perceval} +\title{Parse Mbox} \usage{ -parse_mbox(perceval_path, mbox_path) +parse_mbox(perceval_path, mbox_file_path) } \arguments{ \item{perceval_path}{path to perceval binary} -\item{mbox_path}{path to mbox archive file (ends in .mbox)} +\item{mbox_file_path}{path to mbox archive file (ends in .mbox)} } \description{ Parses an mbox file, which consists of emails in a mailbox, using the Perceval library. @@ -25,15 +25,35 @@ Other parsers: \code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, \code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, \code{\link{parse_bugzilla_rest_comments}()}, -\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_bugzilla_rest_issues}()}, +\code{\link{parse_bugzilla_rest_issues_comments}()}, \code{\link{parse_commit_message_id}()}, \code{\link{parse_dependencies}()}, \code{\link{parse_dv8_clusters}()}, \code{\link{parse_gitlog}()}, +\code{\link{parse_jira}()}, \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, +\code{\link{parse_mbox_latest_date}()}, +\code{\link{parse_nvdfeed}()}, +\code{\link{parse_understand_dependencies}()} + +Other parsers: +\code{\link{build_understand_project}()}, +\code{\link{export_understand_dependencies}()}, +\code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, +\code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, +\code{\link{parse_bugzilla_rest_comments}()}, +\code{\link{parse_bugzilla_rest_issues}()}, 
+\code{\link{parse_bugzilla_rest_issues_comments}()}, +\code{\link{parse_commit_message_id}()}, +\code{\link{parse_dependencies}()}, +\code{\link{parse_dv8_clusters}()}, +\code{\link{parse_gitlog}()}, \code{\link{parse_jira}()}, +\code{\link{parse_jira_latest_date}()}, +\code{\link{parse_jira_rss_xml}()}, +\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_nvdfeed}()}, \code{\link{parse_understand_dependencies}()} } diff --git a/man/parse_mbox_latest_date.Rd b/man/parse_mbox_latest_date.Rd new file mode 100644 index 00000000..cf718e6f --- /dev/null +++ b/man/parse_mbox_latest_date.Rd @@ -0,0 +1,39 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/mail.R +\name{parse_mbox_latest_date} +\alias{parse_mbox_latest_date} +\title{Parse Latest Mbox} +\usage{ +parse_mbox_latest_date(save_folder_path) +} +\arguments{ +\item{save_folder_path}{path to the folder containing the mbox files} +} +\value{ +`latest_mbox_file` the name of the latest mod_mbox file +} +\description{ +This function returns the name of the latest mod_mbox file downloaded in the specified folder +based on the naming convention `YYYYMM.mbox`. For example: `202401.mbox`. 
+} +\seealso{ +Other parsers: +\code{\link{build_understand_project}()}, +\code{\link{export_understand_dependencies}()}, +\code{\link{parse_bugzilla_perceval_rest_issue_comments}()}, +\code{\link{parse_bugzilla_perceval_traditional_issue_comments}()}, +\code{\link{parse_bugzilla_rest_comments}()}, +\code{\link{parse_bugzilla_rest_issues}()}, +\code{\link{parse_bugzilla_rest_issues_comments}()}, +\code{\link{parse_commit_message_id}()}, +\code{\link{parse_dependencies}()}, +\code{\link{parse_dv8_clusters}()}, +\code{\link{parse_gitlog}()}, +\code{\link{parse_jira}()}, +\code{\link{parse_jira_latest_date}()}, +\code{\link{parse_jira_rss_xml}()}, +\code{\link{parse_mbox}()}, +\code{\link{parse_nvdfeed}()}, +\code{\link{parse_understand_dependencies}()} +} +\concept{parsers} diff --git a/man/parse_nvdfeed.Rd b/man/parse_nvdfeed.Rd index 7b49c51c..5607266a 100644 --- a/man/parse_nvdfeed.Rd +++ b/man/parse_nvdfeed.Rd @@ -29,6 +29,7 @@ Other parsers: \code{\link{parse_jira_latest_date}()}, \code{\link{parse_jira_rss_xml}()}, \code{\link{parse_jira}()}, +\code{\link{parse_mbox_latest_date}()}, \code{\link{parse_mbox}()}, \code{\link{parse_understand_dependencies}()} } diff --git a/man/process_gz_to_mbox_in_folder.Rd b/man/process_gz_to_mbox_in_folder.Rd new file mode 100644 index 00000000..a9a96c41 --- /dev/null +++ b/man/process_gz_to_mbox_in_folder.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/mail.R +\name{process_gz_to_mbox_in_folder} +\alias{process_gz_to_mbox_in_folder} +\title{Gz to Mbox Converter} +\usage{ +process_gz_to_mbox_in_folder(save_folder_path, verbose = TRUE) +} +\arguments{ +\item{save_folder_path}{The path to the folder containing both .gz and .mbox files.} + +\item{verbose}{if TRUE, prints diagnostic messages during processing.} +} +\value{ +A list of the .mbox files that were created or updated. 
+} +\description{ +This function scans a specified folder for any .gz files, unzips them, +and renames them to the .mbox format. After unzipping, the original .gz files are deleted. +If a .mbox file with the same name already exists, it will be overwritten. +This makes sure that all the files in the folder are in .mbox format, ready for parsing. +} diff --git a/man/refresh_mod_mbox.Rd b/man/refresh_mod_mbox.Rd new file mode 100644 index 00000000..8140b782 --- /dev/null +++ b/man/refresh_mod_mbox.Rd @@ -0,0 +1,36 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/mail.R +\name{refresh_mod_mbox} +\alias{refresh_mod_mbox} +\title{Refresh Mod_Mbox} +\usage{ +refresh_mod_mbox( + mailing_list, + start_year_month = NULL, + save_folder_path, + verbose = TRUE +) +} +\arguments{ +\item{mailing_list}{The URL of the mailing list being downloaded (e.g., \url{https://lists.apache.org/list.html?announce@apache.org})} + +\item{start_year_month}{The year and month of the first file to be downloaded (format: 'YYYYMM').} + +\item{save_folder_path}{The folder path in which all the downloaded mod_mbox files will be stored.} + +\item{verbose}{if TRUE, prints diagnostic messages.} +} +\value{ +Returns `downloaded_files`, a vector of the newly downloaded files in the current working directory. +} +\description{ +This function refreshes the mailing list files by checking the contents of a specified folder. +If the folder is empty, it calls \code{\link{download_mod_mbox}} to download all mod_mbox files from start_year_month to the current month. +If the folder contains already-downloaded mbox files, it identifies the most recent month, deletes that file, and redownloads it +along with all future months up to the current real-life month. +} +\details{ +The naming convention of files is `YYYYMM.mbox`, and the function uses this pattern to identify the most recent month. 
+After deleting the most recent file, the function ensures that the month is redownloaded, along with all subsequent months up to the current month. +Redownloading the most recent file ensures any files added in that month after the latest refresh are included. +} diff --git a/man/refresh_pipermail.Rd b/man/refresh_pipermail.Rd new file mode 100644 index 00000000..60e84ab2 --- /dev/null +++ b/man/refresh_pipermail.Rd @@ -0,0 +1,36 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/mail.R +\name{refresh_pipermail} +\alias{refresh_pipermail} +\title{Refresh Pipermail} +\usage{ +refresh_pipermail( + mailing_list, + start_year_month = NULL, + save_folder_path, + verbose = TRUE +) +} +\arguments{ +\item{mailing_list}{The URL of the mailing list being downloaded (e.g., \url{https://mta.openssl.org/pipermail/openssl-announce/})} + +\item{start_year_month}{The year and month of the first file to be downloaded (format: 'YYYYMM').} + +\item{save_folder_path}{The folder path in which all the downloaded pipermail files will be stored.} + +\item{verbose}{if TRUE, prints diagnostic messages.} +} +\value{ +Returns `downloaded_files`, a vector of the newly downloaded files in the current working directory. +} +\description{ +This function refreshes the mailing list files by checking the contents of a specified folder. +If the folder is empty, it calls \code{\link{download_pipermail}} to download all pipermail files from start_year_month to the current month. +If the folder contains already-downloaded mbox files, it identifies the most recent month, deletes that file, and redownloads it +along with all future months up to the current real-life month. +} +\details{ +The naming convention of files is `YYYYMM.mbox`, and the function uses this pattern to identify the most recent month. +After deleting the most recent file, the function ensures that the month is redownloaded, along with all subsequent months up to the current month. 
+Redownloading the most recent file makes sure that any files added in that month after the latest refresh are included. +} diff --git a/tests/testthat/test-git.R b/tests/testthat/test-git.R index 3659d6dd..ad3890b2 100644 --- a/tests/testthat/test-git.R +++ b/tests/testthat/test-git.R @@ -30,9 +30,13 @@ test_that("Calling parse_gitlog with correct perceval and correct git log path r tools_path <- file.path(tools_path) tool <- yaml::read_yaml(tools_path) perceval_path <- tool[["perceval"]] + git_repo_path <- suppressWarnings(git_create_sample_log()) + result <- parse_gitlog(perceval_path, git_repo_path) + expect_is(result, "data.table") + suppressWarnings(git_delete_sample_log(git_repo_path)) }) diff --git a/tests/testthat/test-mail.R b/tests/testthat/test-mail.R index b7426917..700bcf30 100644 --- a/tests/testthat/test-mail.R +++ b/tests/testthat/test-mail.R @@ -2,34 +2,41 @@ tools_path <- test_path("testdata", "tools.yml") conf_path <- test_path("testdata", "thrift.yml") test_that("Incorrect perceval path fails parse_mbox", { - conf <- yaml::read_yaml(conf_path) - mbox_path <- conf[["mailing_list"]][["mbox"]] + + conf <- parse_config(conf_path) + key_1_name <- names(get_mbox_key_indexes(conf))[1] + mbox_path <- get_mbox_path(conf,key_1_name) + incorrect_perceval_path <- "/incorrect/path/to/perceval" - expect_error(parse_mbox(incorrect_perceval_path, mbox_path), "error in running command") + expect_error(parse_mbox(incorrect_perceval_path, mbox_path), "Perceval execution failed.") }) test_that("Incorrect mbox path to parse_mbox returns empty table", { - tool <- yaml::read_yaml(tools_path) - perceval_path <- tool[["perceval"]] + + tool <- parse_config(tools_path) + perceval_path <- get_tool_project("perceval",tool) perceval_path <- path.expand(perceval_path) incorrect_mbox_path <- "/incorrect/path/to/mbox" - output <- parse_mbox(perceval_path, incorrect_mbox_path) - expect_equal(nrow(output), 0) + expect_error(parse_mbox(perceval_path, incorrect_mbox_path), "No 
valid JSON lines found in Perceval output. Check the mbox file or Perceval configuration.") }) test_that("Calling parse_mbox with correct perceval and mbox path returns a data table with correct raw data", { tools_path <- file.path(tools_path) - tool <- yaml::read_yaml(tools_path) - perceval_path <- tool[["perceval"]] - mbox_path <- example_mailing_list_two_threads(folder_path = "/tmp", - folder_name="example_two_threads_mailing_list", - file_name = "two_thread_mailing_list") - result <- parse_mbox(perceval_path, mbox_path) - io_delete_folder(folder_path="/tmp", folder_name="example_two_threads_mailing_list") + tool <- parse_config(tools_path) + perceval_path <- get_tool_project("perceval",tool) + + mbox_path <- example_mailing_list_two_threads( + folder_path = "/tmp", + folder_name = "example_two_threads_mailing_list", + file_name = "two_thread_mailing_list" + ) + + result <- parse_mbox(perceval_path, mbox_path) + + io_delete_folder(folder_path = "/tmp", folder_name = "example_two_threads_mailing_list") expect_equal(result[reply_from == "John Doe "]$reply_subject, "Subject 1") expect_equal(result[reply_subject == "Re: Subject 1"]$reply_from, "Smithsonian Doe ") - }) diff --git a/tests/testthat/testdata/thrift.yml b/tests/testthat/testdata/thrift.yml index f47062b2..87ea8278 100644 --- a/tests/testthat/testdata/thrift.yml +++ b/tests/testthat/testdata/thrift.yml @@ -34,7 +34,7 @@ project: version_control: # Where is the git log located locally? - log: ../../rawdata/git_repo/thrift/.git # cloned Apache Thrift repo and put path to its .git file + log: ../../rawdata/thrift/git_repo/.git # cloned Apache Thrift repo and put path to its .git file # From where the git log was downloaded? log_url: https://github.com/apache/thrift # List of branches used for analysis @@ -42,28 +42,66 @@ version_control: - master mailing_list: - # Where is the mbox located locally? 
- mbox: ../../rawdata/mbox/thrift-dev.mbox # Download here: https://cdn.lfdr.de/stmc/ieee_tse_data/mail/thrift-dev.mbox - # What is the domain of the chosen mailing list archive? - domain: http://mail-archives.apache.org/mod_mbox - # Which lists of the domain will be used? - list_key: - - thrift-dev + mod_mbox: + project_key_1: + mailing_list: http://mail-archives.apache.org/mod_mbox/thrift-dev + save_folder_path: ../../rawdata/thrift/mod_mbox/save_mbox_mail/ + # mbox_file_path is for use only with parse_mbox() function. It is the file to parse +# mbox_file_path: ../../rawdata/thrift/mod_mbox/save_mbox_mail/thrift.mbox + project_key_2: + mailing_list: http://mail-archives.apache.org/mod_mbox/thrift-user + save_folder_path: ../../rawdata/thrift/mod_mbox/save_mbox_mail_2/ +# # mbox_file_path is for use only with parse_mbox() function. It is the file to parse +# mbox_file_path: ../../rawdata/kaiaulu/mod_mbox/save_mbox_mail_2/kaiaulu.mbox +# pipermail: +# project_key_1: +# mailing_list: https://mta.openssl.org/pipermail/kaiaulu-dev/ +# save_folder_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail/ +# # mbox_file_path is for use only with parse_mbox() function. It is the file to parse +# mbox_file_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail/kaiaulu.mbox +# project_key_2: +# mailing_list: https://mta.openssl.org/pipermail/kaiaulu-users/ +# save_folder_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail_2/ +# # mbox_file_path is for use only with parse_mbox() function. 
It is the file to parse +# mbox_file_path: ../../rawdata/kaiaulu/pipermail/save_mbox_mail_2/kaiaulu.mbox issue_tracker: jira: - # Obtained from the project's JIRA URL - domain: https://issues.apache.org/jira - project_key: THRIFT - # Download using `download_jira_data.Rmd` - issues: ../../rawdata/issue_tracker/thrift_issues.json - issue_comments: ../../rawdata/issue_tracker/thrift_issue_comments.json + project_key_1: + # Obtained from the project's JIRA URL + domain: https://issues.apache.org/jira + project_key: THRIFT + # Download using `download_jira_data.Rmd` + issues: ../../rawdata/thrift/jira/issues/thrift/ + issue_comments: ../../rawdata/thrift/jira/issue_comments/thrift/ github: - # Obtained from the project's GitHub URL - owner: apache - repo: thrift - # Download using `download_github_comments.Rmd` - replies: ../../rawdata/github/thrift/ + project_key_1: + # Obtained from the project's GitHub URL + owner: apache + repo: thrift + # Download using `download_github_comments.Rmd` + issue_or_pr_comment: ../../rawdata/thrift/github/issue_or_pr_comment/apache_thrift/ + issue: ../../rawdata/thrift/github/issue/apache_thrift/ + issue_search: ../../rawdata/thrift/github/issue_search/apache_thrift/ + issue_event: ../../rawdata/thrift/github/issue_event/apache_thrift/ + pull_request: ../../rawdata/thrift/github/pull_request/apache_thrift/ + commit: ../../rawdata/thrift/github/commit/apache_thrift/ +# project_key_2: +# # Obtained from the project's GitHub URL +# owner: ssunoo2 +# repo: kaiaulu +# # Download using `download_github_comments.Rmd` +# issue_or_pr_comment: ../../rawdata/kaiaulu/github/issue_or_pr_comment/ssunoo2_kaiaulu/ +# issue: ../../rawdata/kaiaulu/github/issue/ssunoo2_kaiaulu/ +# issue_search: ../../rawdata/kaiaulu/github/issue_search/ssunoo2_kaiaulu/ +# issue_event: ../../rawdata/kaiaulu/github/issue_event/ssunoo2_kaiaulu/ +# pull_request: ../../rawdata/kaiaulu/github/pull_request/ssunoo2_kaiaulu/ +# commit: 
../../rawdata/kaiaulu/github/commit/ssunoo2_kaiaulu/ +# bugzilla: +# project_key_1: +# project_key: kaiaulu +# issues: ../../rawdata/kaiaulu/bugzilla/issues/kaiaulu/ +# issue_comments: ../../rawdata/kaiaulu/bugzilla/issue_comments/kaiaulu/ #vulnerabilities: # Folder path with nvd cve feeds (e.g. nvdcve-1.1-2018.json) @@ -113,6 +151,32 @@ tool: - Throw - Parameter - Contain + # dv8: + # # The project folder path to store various intermediate + # # files for DV8 Analysis + # # The folder name will be used in the file names. + # folder_path: ../../analysis/junit/dv8/ + # # the architectural flaws thresholds that should be used + # architectural_flaws: + # cliqueDepends: + # - call + # - use + # crossingCochange: 2 + # crossingFanIn: 4 + # crossingFanOut: 4 + # mvCochange: 2 + # uiCochange: 2 + # uihDepends: + # - call + # - use + # uihInheritance: + # - extend + # - implement + # - public + # - private + # - virtual + # uiHistoryImpact: 10 + # uiStructImpact: 0.01 # Uctags allows finer file-file dependency parsing (e.g. functions, classes, structs) uctags: # See https://github.com/sailuh/kaiaulu/wiki/Universal-Ctags for details @@ -132,6 +196,36 @@ tool: - f # functions r: - f # functions + # # srcML allow to parse src code as text (e.g. identifiers) + # srcml: + # # The file path to where you wish to store the srcml output of the project + # srcml_path: ../../analysis/junit5/srcml/srcml_junit.xml + # pattern4: + # # The file path to where you wish to store the classes of the pattern4 analysis + # class_folder_path: ../../rawdata/junit5/pattern4/classes/junit-platform-engine/build/classes/java/main/org/junit/platform/engine/ + # # The file path to where you wish to store the output of the pattern4 analysis + # output_filepath: ../../analysis/junit5/pattern4/ + # compile_note: > + # 1. Switch Java version to Java 17: + # https://stackoverflow.com/questions/69875335/macos-how-to-install-java-17 + # 2. Disable VPN to pull modules from Gradle Plugin Portal. + # 3. 
Use sudo ./gradlew build + # 4. After building, locate the engine class files and specify as the class_folder_path: + # in this case they are in: /path/to/junit5/analysis/junit-platform-engine/build/classes/java/main/org/junit/platform/engine/ + # understand: + # # Accepts one language at a time: ada, assembly, c/c++, c#, fortran, java, jovial, delphi/pascal, python, vhdl, basic, javascript + # code_language: java + # # Specify which types of Dependencies to keep + # keep_dependencies_type: + # - Import + # - Call + # - Create + # - Use + # - Type GenericArgument + # # Where the files to analyze should be stored + # project_path: ../../rawdata/kaiaulu/git_repo/understand/ + # # Where the output for the understands analysis is stored + # output_path: ../../analysis/kaiaulu/understand/ # Analysis Configuration # analysis: diff --git a/vignettes/download_mail.Rmd b/vignettes/download_mail.Rmd new file mode 100644 index 00000000..12fc0417 --- /dev/null +++ b/vignettes/download_mail.Rmd @@ -0,0 +1,213 @@ +--- +title: "Download Mod Mbox and Pipermail Mailing List Archives" +output: + html_document: + toc: true + number_sections: true +vignette: > + %\VignetteEngine{knitr::rmarkdown} + %\VignetteIndexEntry{Download Mod Mbox Mailing List Archives} + %\VignetteEncoding{UTF-8} +--- + + +```{r warning = FALSE, message = FALSE} +rm(list = ls()) +seed <- 1 +set.seed(seed) + +# Load libraries +require(kaiaulu) +require(data.table) +require(yaml) +require(stringi) +require(XML) +require(httr) +require(gt) +``` + + +# Introduction + +Open source projects require a means for developers to communicate. These may include mailing lists, issue trackers, discord, etc. This notebooks showcases how to download data from mailing list archives. Two often used archive types are [mod_mbox](https://httpd.apache.org/mod_mbox/) and [pipermail](https://en.wikipedia.org/wiki/GNU_Mailman#cite_note-9), which Kaiaulu offer functions to download data from. 
The former is commonly used by Apache Software Foundation projects. The latter is more commonly used in GNU-related projects, but this can vary.
Therefore, in order to use Kaiaulu downloaders to obtain mail data, you will need to access the respective open source project, and find out the URL tied to the archive you are interested in. Generally, that is the developer mailing list, if your interest is to understand communication patterns among developers. Alternatively, if the focus of the research is Q&A from the user base, then a user mailing list may make more sense. + +Because project lifetime can go as far as a few decades, to have the full picture of what communication took place in the project you may need to download multiple archives and combine them, after turning them into tables using the Kaiaulu parser. + +The information you need to find out for each open source project is documented in Kaiaulu using a project configuration file format. For pipermail and mod_mbox this is as follows: + +``` +mailing_list: + # for pipermail + pipermail: + project_key_1: + mailing_list: https://mta.openssl.org/pipermail/openssl-users/ + save_folder_path: ../../rawdata/helix/pipermail/save_mbox_mail + # for mod mbox + mod_mbox: + project_key_1: + mailing_list: https://lists.apache.org/list.html?announce@apache.org + save_folder_path: ../../rawdata/helix/mod_mbox/save_mbox_mail +``` + +The most manual time intensive step you will be required is to locate the URL of the mailing list archive you wish for in the project website. This is specified under `mailing_list`. Note for pipermail this URL should point to the page containing links to the monthly archives (e.g. https://mta.openssl.org/pipermail/openssl-users/), not the top-level mailing list page that contains all the different types of archives (e.g. https://mta.openssl.org/mailman/listinfo/). + + +Regardless of which mail archive you choose, the downloaders will store the mail data in monthly files, in a `.mbox` format. This is a simple text file that contains some markings to identify the header of the e-mail containing title, authors, etc. 
While you can open any of the .mbox downloaded files with any text editor, Kaiaulu parsers will format them into tables, as demonstrated below. + +## Tools Configuration + +In addition to the mailing list configurations, you need to specify the path to the [Perceval](https://github.com/chaoss/grimoirelab-perceval) binary in tools.yml. See the wiki for further details on how to setup third party tools. + +```{r} +# Load tools configuration +tools <- parse_config("../tools.yml") +parse_perceval_path <- get_tool_project("perceval", tools) + +# Load project configuration +conf <- parse_config("../conf/helix.yml") +mbox_file_path <- get_mbox_path(conf, "project_key_1") +``` + +# Downloaders and Refreshers + +## Downloaders + +With the configurations loaded, we can proceed to download the mailing list archives. The downloaders are responsible for fetching the archives from the specified mailing lists and saving them locally in .mbox format. + +### Pipermail Downloader + +For Pipermail, we need to specify the project key, which is used to retrieve the configuration parameters for the specific project. The project key is used to identify the project in the configuration file. + +Now, we can use the getter functions to retrieve the configuration parameters for the specified project key. + +```{r} +conf <- parse_config("../conf/openssl.yml") +pipermail_mailing_list <- get_pipermail_domain(conf, "project_key_1") +pipermail_save_folder_path <- get_pipermail_path(conf, "project_key_1") + +# Define the date range +pipermail_start_year_month <- 202310 +pipermail_end_year_month <- 202405 +``` + +With our configurations loaded, we can proceed to downloading the mailing list archives. 
+ +```{r eval=FALSE} +# Download archives +download_pipermail( + mailing_list = pipermail_mailing_list, + start_year_month = pipermail_start_year_month, + end_year_month = pipermail_end_year_month, + save_folder_path = pipermail_save_folder_path, + verbose = TRUE +) + +``` + +After running this function, the .mbox files will be saved in the specified directory with filenames like 202310.mbox, 202311.mbox, etc, which can be parsed in a table: + +```{r} +parsed_mail <- parse_mbox( + perceval_path = parse_perceval_path, + mbox_file_path = pipermail_save_folder_path +) + +parsed_mail %>% + head(10) %>% + gt() +``` + +### Mod Mbox Downloader + +The download_mod_mbox() function downloads Mod Mbox archives from a specified Apache Pony Mail mailing list over a given date range. We obtain the required parameters from the project configuration file, as done before: + +```{r eval=FALSE} +conf <- parse_config("../conf/helix.yml") +mbox_mailing_list <- get_mbox_domain(conf, "project_key_1") +mbox_save_folder_path <- get_mbox_path(conf, "project_key_1") + +# Define the date range +mbox_start_year_month <- 202310 +mbox_end_year_month <- 202405 +``` + + +The `start_year_month` and `end_year_month` time range parameters should be set manually, as with Pipermail. + + +```{r eval=FALSE} +download_mod_mbox( + mailing_list = mbox_mailing_list, + start_year_month = mbox_start_year_month, + end_year_month = mbox_end_year_month, + save_folder_path = mbox_save_folder_path, + verbose = TRUE + ) + +``` + +After running the function, it constructs URLs like: https://lists.apache.org/api/mbox.lua?list=announce@apache.org&date=2024-01 +and saves the files in the specified folder. + +```{r} +parsed_mail <- parse_mbox( + perceval_path = parse_perceval_path, + mbox_file_path = mbox_file_path +) + +parsed_mail %>% + head(10) %>% + gt() +``` + +## Refreshers + +Kaiaulu offers convenient function to add new e-mails since the last execution of the downloaders. 
These are defined as "refresh_*" functions. The most recent file timestamp, which captures the latest month, is used as a starting date to download new files. The most recent file is deleted and re-downloaded to ensure all e-mails of the last month were downloaded, and subsequent files are then downloaded. + +### Pipermail Refresher + +```{r eval=FALSE} +# Refresh archives +refresh_pipermail( + mailing_list = pipermail_mailing_list, + start_year_month = pipermail_start_year_month, + save_folder_path = pipermail_save_folder_path, + verbose = TRUE +) +``` + +### Mod Mbox Refresher + +A similar function is also available for mod_mbox: + +```{r eval=FALSE} +refresh_mod_mbox( + mailing_list = mbox_mailing_list, + # start_year_month = mbox_start_year_month, + save_folder_path = mbox_save_folder_path, + verbose = TRUE +) +``` + diff --git a/vignettes/social_smell_showcase.Rmd b/vignettes/social_smell_showcase.Rmd index 7b454094..2f34629b 100644 --- a/vignettes/social_smell_showcase.Rmd +++ b/vignettes/social_smell_showcase.Rmd @@ -132,10 +132,14 @@ project_mbox <- NULL project_jira <- NULL project_github_replies <- NULL - - +project_mbox <- data.table() if(!is.null(mbox_path)){ - project_mbox <- parse_mbox(perceval_path,mbox_path) + for(mbox_file_path in list.files(mbox_path,full.names=TRUE)){ + #print(mbox_file_path) + project_mbox <- rbind(project_mbox,parse_mbox(perceval_path,mbox_file_path),fill=TRUE) + } + + #project_mbox <- parse_mbox(perceval_path,mbox_path) project_mbox$reply_tz <- sapply(stringi::stri_split(project_git$reply_datetimetz, regex=" "),"[[",6)