diff --git a/Cargo.lock b/Cargo.lock index 60f74e9..ad6b1ce 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3576,7 +3576,7 @@ checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e" [[package]] name = "simpleaf" -version = "0.19.5" +version = "0.20.0" dependencies = [ "af-anndata", "anyhow", diff --git a/bump_and_publish.sh b/bump_and_publish.sh new file mode 100755 index 0000000..f752f0b --- /dev/null +++ b/bump_and_publish.sh @@ -0,0 +1,392 @@ +#!/usr/bin/env bash +set -euo pipefail + +die() { + echo "error: $*" >&2 + exit 1 +} + +usage() { + cat <<'EOF' +Usage: + ./bump_and_publish.sh [--publish] [--dry-run] + ./bump_and_publish.sh [--publish] [--dry-run] + +Options: + --publish Publish to crates.io after bumping, committing, tagging, and pushing + --dry-run Show what would be done without modifying tracked files, creating commits or tags, pushing, or publishing + -h, --help Show this help message +EOF +} + +print_cmd() { + printf '+' + printf ' %q' "$@" + printf '\n' +} + +run() { + print_cmd "$@" + if [[ "$DRY_RUN" == true ]]; then + return 0 + fi + "$@" +} + +is_valid_semver() { + local version="$1" + local semver_regex='^(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)(-((0|[1-9][0-9]*|[0-9A-Za-z-]*[A-Za-z-][0-9A-Za-z-]*)(\.(0|[1-9][0-9]*|[0-9A-Za-z-]*[A-Za-z-][0-9A-Za-z-]*))*))?(\+([0-9A-Za-z-]+(\.[0-9A-Za-z-]+)*))?$' + printf '%s\n' "$version" | grep -Eq "$semver_regex" +} + +split_semver() { + local version="$1" + local core="$version" + local pre="" + local major minor patch + + if [[ "$core" == *+* ]]; then + core="${core%%+*}" + fi + + if [[ "$core" == *-* ]]; then + pre="${core#*-}" + core="${core%%-*}" + fi + + IFS='.' read -r major minor patch <<<"$core" + printf '%s\t%s\t%s\t%s\n' "$major" "$minor" "$patch" "$pre" +} + +compare_prerelease() { + local left="$1" + local right="$2" + + if [[ -z "$left" && -z "$right" ]]; then + echo 0 + return + fi + if [[ -z "$left" ]]; then + echo 1 + return + fi + if [[ -z "$right" ]]; then + echo -1 + return + fi + + local -a lparts=() + local -a rparts=() + IFS='.' read -r -a lparts <<<"$left" + IFS='.' read -r -a rparts <<<"$right" + + local i + local lnum rnum + for ((i = 0; i < ${#lparts[@]} || i < ${#rparts[@]}; i++)); do + if ((i >= ${#lparts[@]})); then + echo -1 + return + fi + if ((i >= ${#rparts[@]})); then + echo 1 + return + fi + + if [[ "${lparts[$i]}" == "${rparts[$i]}" ]]; then + continue + fi + + if [[ "${lparts[$i]}" =~ ^(0|[1-9][0-9]*)$ ]]; then + lnum=1 + else + lnum=0 + fi + if [[ "${rparts[$i]}" =~ ^(0|[1-9][0-9]*)$ ]]; then + rnum=1 + else + rnum=0 + fi + + if ((lnum == 1 && rnum == 1)); then + if ((10#${lparts[$i]} > 10#${rparts[$i]})); then + echo 1 + else + echo -1 + fi + return + fi + if ((lnum == 1 && rnum == 0)); then + echo -1 + return + fi + if ((lnum == 0 && rnum == 1)); then + echo 1 + return + fi + + if [[ "${lparts[$i]}" > "${rparts[$i]}" ]]; then + echo 1 + else + echo -1 + fi + return + done + + echo 0 +} + +compare_semver() { + local left="$1" + local right="$2" + + local lmaj lmin lpat lpre + local rmaj rmin rpat rpre + IFS=$'\t' read -r lmaj lmin lpat lpre <<<"$(split_semver "$left")" + IFS=$'\t' read -r rmaj rmin rpat rpre <<<"$(split_semver "$right")" + + if ((10#$lmaj > 10#$rmaj)); then + echo 1 + return + fi + if ((10#$lmaj < 10#$rmaj)); then + echo -1 + return + fi + + if ((10#$lmin > 10#$rmin)); then + echo 1 + return + fi + if ((10#$lmin < 10#$rmin)); then + echo -1 + return + fi + + if ((10#$lpat > 10#$rpat)); then + echo 1 + return + fi + if ((10#$lpat < 10#$rpat)); then + echo -1 + return + fi + + compare_prerelease "$lpre" "$rpre" +} + +VERSION="" +PUBLISH=false +DRY_RUN=false + +while [[ $# -gt 0 ]]; do + case "$1" in + --publish) + PUBLISH=true + ;; + --dry-run) + DRY_RUN=true + ;; + -h|--help) + usage + exit 0 + ;; + -*) + die "unknown option: $1" + ;; + *) + if [[ -n "$VERSION" ]]; then + die "version specified more than once" + fi + VERSION="$1" + ;; + esac + shift +done + +[[ -n "$VERSION" ]] || { + usage + exit 1 +} + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "$SCRIPT_DIR" + +ROOT_CARGO="Cargo.toml" +LOCKFILE="Cargo.lock" +CRATE_NAME="simpleaf" +TAG="v${VERSION}" +TMP_TARGET_DIR="" +MANIFEST_BACKUP="" +LOCKFILE_BACKUP="" +MANIFEST_UPDATED=false +COMMIT_CREATED=false + +cleanup() { + local status=$? + + if [[ -n "$TMP_TARGET_DIR" && -d "$TMP_TARGET_DIR" ]]; then + rm -rf "$TMP_TARGET_DIR" + fi + + if [[ "$status" -ne 0 && "$DRY_RUN" == false && "$MANIFEST_UPDATED" == true && "$COMMIT_CREATED" == false ]]; then + if [[ -n "$MANIFEST_BACKUP" && -f "$MANIFEST_BACKUP" ]]; then + cp "$MANIFEST_BACKUP" "$ROOT_CARGO" + fi + if [[ -n "$LOCKFILE_BACKUP" && -f "$LOCKFILE_BACKUP" ]]; then + cp "$LOCKFILE_BACKUP" "$LOCKFILE" + fi + echo "restored $ROOT_CARGO and $LOCKFILE after failure" >&2 + fi + + if [[ -n "$MANIFEST_BACKUP" && -f "$MANIFEST_BACKUP" ]]; then + rm -f "$MANIFEST_BACKUP" + fi + if [[ -n "$LOCKFILE_BACKUP" && -f "$LOCKFILE_BACKUP" ]]; then + rm -f "$LOCKFILE_BACKUP" + fi + + return "$status" +} + +trap cleanup EXIT + +[[ -f "$ROOT_CARGO" ]] || die "not found: $ROOT_CARGO" +[[ -f "$LOCKFILE" ]] || die "not found: $LOCKFILE" +git rev-parse --is-inside-work-tree >/dev/null 2>&1 || die "current directory is not a git repository" + +is_valid_semver "$VERSION" || die "version '$VERSION' is not valid SemVer 2.0.0" + +CURRENT_VERSION="$( + awk ' + /^\[package\]/ { in_package=1; next } + in_package && /^\[/ { in_package=0 } + in_package && /^[[:space:]]*version[[:space:]]*=/ { + match($0, /"[^"]+"/) + if (RSTART > 0) { + print substr($0, RSTART + 1, RLENGTH - 2) + exit + } + } + ' "$ROOT_CARGO" +)" +[[ -n "$CURRENT_VERSION" ]] || die "could not determine current crate version from $ROOT_CARGO" +is_valid_semver "$CURRENT_VERSION" || die "current crate version '$CURRENT_VERSION' is not valid SemVer" + +CMP="$(compare_semver "$VERSION" "$CURRENT_VERSION")" +if [[ "$CMP" -le 0 ]]; then + die "new version '$VERSION' must be greater than current version '$CURRENT_VERSION'" +fi + +if git rev-parse "$TAG" >/dev/null 2>&1; then + die "tag $TAG already exists" +fi + +if [[ -n "$(git status --porcelain)" ]]; then + die "working tree is not clean; commit or stash existing changes first" +fi + +if ! git remote get-url origin >/dev/null 2>&1; then + die "git remote 'origin' is not configured" +fi + +echo "Current crate version : $CURRENT_VERSION" +echo "New crate version : $VERSION" +echo "Tag : $TAG" +if [[ "$PUBLISH" == true ]]; then + echo "Publish : yes" +else + echo "Publish : no" +fi +if [[ "$DRY_RUN" == true ]]; then + echo "Dry-run : yes" +else + echo "Dry-run : no" +fi +echo + +echo "Preflight checks before changing version" +cargo check -q +TMP_TARGET_DIR="$(mktemp -d "${TMPDIR:-/tmp}/simpleaf-release-check.XXXXXX")" +CARGO_TARGET_DIR="$TMP_TARGET_DIR" cargo package --offline --allow-dirty --no-verify >/dev/null +rm -rf "$TMP_TARGET_DIR" +TMP_TARGET_DIR="" + +echo "Updating $ROOT_CARGO" +echo " version: $CURRENT_VERSION -> $VERSION" +echo "Updating $LOCKFILE" +echo " package entry version: $CURRENT_VERSION -> $VERSION" + +if [[ "$DRY_RUN" == false ]]; then + MANIFEST_BACKUP="$(mktemp "${TMPDIR:-/tmp}/simpleaf-Cargo.toml.XXXXXX")" + LOCKFILE_BACKUP="$(mktemp "${TMPDIR:-/tmp}/simpleaf-Cargo.lock.XXXXXX")" + cp "$ROOT_CARGO" "$MANIFEST_BACKUP" + cp "$LOCKFILE" "$LOCKFILE_BACKUP" + + sed -i.bak "1,/^version = /s/^version = \".*\"/version = \"${VERSION}\"/" "$ROOT_CARGO" + rm -f "${ROOT_CARGO}.bak" + + sed -i.bak "/^name = \"${CRATE_NAME}\"$/,/^dependencies = \\[$/s/^version = \".*\"/version = \"${VERSION}\"/" "$LOCKFILE" + rm -f "${LOCKFILE}.bak" + + MANIFEST_UPDATED=true +else + echo "Dry-run: would rewrite $ROOT_CARGO and $LOCKFILE" +fi + +UPDATED_VERSION="$CURRENT_VERSION" +UPDATED_LOCK_VERSION="$CURRENT_VERSION" +if [[ "$DRY_RUN" == false ]]; then + UPDATED_VERSION="$( + awk ' + /^\[package\]/ { in_package=1; next } + in_package && /^\[/ { in_package=0 } + in_package && /^[[:space:]]*version[[:space:]]*=/ { + match($0, /"[^"]+"/) + if (RSTART > 0) { + print substr($0, RSTART + 1, RLENGTH - 2) + exit + } + } + ' "$ROOT_CARGO" + )" + UPDATED_LOCK_VERSION="$(sed -n "/^name = \"${CRATE_NAME}\"$/,/^dependencies = \\[$/s/^version = \"\(.*\)\"/\1/p" "$LOCKFILE" | head -1)" + + [[ "$UPDATED_VERSION" == "$VERSION" ]] || die "crate version update failed" + [[ "$UPDATED_LOCK_VERSION" == "$VERSION" ]] || die "lockfile version update failed" +fi + +echo +echo "Post-bump validation" +if [[ "$DRY_RUN" == true ]]; then + echo "Dry-run: would run cargo check and cargo package against the bumped version" +else + cargo check -q + TMP_TARGET_DIR="$(mktemp -d "${TMPDIR:-/tmp}/simpleaf-release-check.XXXXXX")" + CARGO_TARGET_DIR="$TMP_TARGET_DIR" cargo package --offline --allow-dirty --no-verify >/dev/null + rm -rf "$TMP_TARGET_DIR" + TMP_TARGET_DIR="" +fi + +run git add "$ROOT_CARGO" "$LOCKFILE" +run git commit -m "chore(release): bump ${CRATE_NAME} to v${VERSION}" + +if [[ "$DRY_RUN" == false ]]; then + COMMIT_CREATED=true +fi + +run git tag -a "$TAG" -m "Release ${VERSION}" +run git push origin HEAD +run git push origin "$TAG" + +if [[ "$PUBLISH" == true ]]; then + run cargo publish +else + echo "Skipping crates.io publish; re-run with --publish to publish ${CRATE_NAME} v${VERSION}" +fi + +echo +if [[ "$DRY_RUN" == true ]]; then + echo "Dry-run complete" +else + echo "Release bump and publish complete for v${VERSION}" +fi diff --git a/resources/chemistries.json b/resources/chemistries.json index 29b1208..3cb735e 100644 --- a/resources/chemistries.json +++ b/resources/chemistries.json @@ -139,7 +139,61 @@ "version": "0.1.0", "meta": { "cr_filename": "737K-arc-v1.txt", - "barcode_ori": "forward" + "barcode_ori": "forward" + } + }, + "10x-flexv1-gex-3p": { + "geometry": "1{b[16]u[12]x:}2{r[50]x[18]s[8]x:}", + "expected_ori": "both", + "version": "0.1.0", + "plist_name": "9fe0cbace5ad3cf83679a7b082b99df879d3d4c069d68da313b6650dd18fc803", + "remote_url": "https://umd.box.com/shared/static/ocr4rah6xdzan10rfrrggqyat06zea5m", + "meta": { + "cr_filename": "737K-fixed-rna-profiling.txt", + "protocol_type": "flex_gex" + }, + "sample_bc_list": { + "plist_name": "5dc9d109e541b02974bbae15682e7198ebc72d3da4f4a219d73620e2852c89e0", + "remote_url": "https://umd.box.com/shared/static/o1db4o6021a4nblpci8aume0z0lz5hfd" + }, + "probe_sets": { + "human": { + "name": "Chromium_Human_Transcriptome_Probe_Set_v1.1.0_GRCh38-2024-A", + "plist_name": "9ccbd08d70bcab619be45bd9a1b9c76efdd91030517564e5819e5f74b1d36d93", + "remote_url": "https://umd.box.com/shared/static/iygrl8vn798j4dgto9bqop9914wg1n51" + }, + "mouse": { + "name": "Chromium_Mouse_Transcriptome_Probe_Set_v1.1.1_GRCm39-2024-A", + "plist_name": "b4b811412ef75c33e09b5ec1a8f4028a08f3d0fe470ddf524380526f3cbc2e12", + "remote_url": "https://umd.box.com/shared/static/oz6kubjc3rmjeb0e9fi417iui5x9h72b" + } + } + }, + "10x-flexv2-gex-3p": { + "geometry": "1{b[16]u[12]x[10]s[10]}2{r:}", + "expected_ori": "both", + "version": "0.1.0", + "plist_name": "dcb018db2e9d7023ac077eb2477b877163fa5dcef7b8e39e0826bc2560c29cf9", + "remote_url": "https://umd.box.com/shared/static/fei8u0tcve2wu73q4qxrskwl3gs7svj1", + "meta": { + "cr_filename": "737K-flex-v2.txt", + "protocol_type": "flex_gex" + }, + "sample_bc_list": { + "plist_name": "5e7ef950ad43c35525e8d923423a7bb1abdce78ca2a8fe4ef9a2d69c5f22830b", + "remote_url": "https://umd.box.com/shared/static/4ogdlgooo1onjkrny9pfbyz4gdt5vktl" + }, + "probe_sets": { + "human": { + "name": "Chromium_Human_Transcriptome_Probe_Set_v2.0.0_GRCh38-2024-A", + "plist_name": "bfa53e2acde140d6bd90c3b21d65c48c5b04e3064adee7402d3016116db04958", + "remote_url": "https://umd.box.com/shared/static/2fd0tda0638wy0fclgh3ywdgul1g53pi" + }, + "mouse": { + "name": "Chromium_Mouse_Transcriptome_Probe_Set_v2.0.0_GRCm39-2024-A", + "plist_name": "8e0ea460bc5a93df85cf5c4f321c62a3e50ad01b5589a8c0c9559bf554f6f276", + "remote_url": "https://umd.box.com/shared/static/z21ofrtnj3t33fb0mti66dc319cgvnuq" + } } } } diff --git a/src/main.rs b/src/main.rs index e5bcbc8..911e11a 100644 --- a/src/main.rs +++ b/src/main.rs @@ -101,6 +101,11 @@ fn main() -> anyhow::Result<()> { // if we are running mapping and quantification Commands::Quant(map_quant_opts) => map_and_quant(af_home_path.as_path(), map_quant_opts), + // if we are running Flex GEX quantification + Commands::FlexQuant(flex_opts) => { + flex_quant::flex_map_and_quant(af_home_path.as_path(), flex_opts) + } + // indexing for ATAC-seq data Commands::Atac(AtacCommand::Index(index_opts)) => { atac::index::piscem_index(af_home_path.as_path(), &index_opts) diff --git a/src/simpleaf_commands.rs b/src/simpleaf_commands.rs index b226f6d..8c43187 100644 --- a/src/simpleaf_commands.rs +++ b/src/simpleaf_commands.rs @@ -15,6 +15,8 @@ pub use self::indexing::build_ref_and_index; pub mod quant; pub use self::quant::map_and_quant; +pub mod flex_quant; + pub mod workflow; pub use self::workflow::{ get_workflow, list_workflows, patch_manifest_or_template, refresh_protocol_estuary, @@ -578,6 +580,85 @@ pub struct SetPathOpts { macs: Option, } +/// Options for the `flex-quant` subcommand — 10x Flex GEX quantification. +/// +/// This command handles the complete Flex pipeline: probe index building, +/// mapping, barcode correction (cell + sample), collation, and quantification. +#[derive(Args, Clone, Debug)] +#[command(arg_required_else_help = true)] +pub struct FlexQuantOpts { + /// Chemistry name: 10x-flexv1-gex-3p or 10x-flexv2-gex-3p + #[arg(short, long)] + pub chemistry: String, + + /// Target organism for automatic probe set selection + #[arg(long, value_enum)] + pub organism: crate::utils::chem_utils::Organism, + + /// Path to output directory + #[arg(short, long)] + pub output: PathBuf, + + /// Number of threads to use + #[arg(short, long, default_value_t = 16)] + pub threads: u32, + + /// Path to pre-built probe index (overrides auto-build) + #[arg(short = 'i', long, help_heading = "Mapping Options")] + pub index: Option, + + /// Path to probe set CSV or FASTA (overrides auto-download). + /// If a CSV is provided, it is converted to FASTA and a t2g map + /// is generated automatically. + #[arg(long, help_heading = "Probe Set Options")] + pub probe_set: Option, + + /// Path to sample/probe barcode file with rotation mapping + /// (overrides auto-download). 3-column TSV: observed, canonical, sample_name. + #[arg(long, help_heading = "Probe Set Options")] + pub sample_bc_list: Option, + + /// Comma-separated list of R1 FASTQ files + #[arg(short = '1', long, value_delimiter = ',', help_heading = "Mapping Options")] + pub reads1: Vec, + + /// Comma-separated list of R2 FASTQ files + #[arg(short = '2', long, value_delimiter = ',', help_heading = "Mapping Options")] + pub reads2: Vec, + + /// UMI resolution mode + #[arg(short, long, default_value = "cr-like", + help_heading = "Quantification Options", + value_parser = clap::builder::PossibleValuesParser::new([ + "cr-like", "cr-like-em", "parsimony", "parsimony-em", + "parsimony-gene", "parsimony-gene-em" + ]))] + pub resolution: String, + + /// k-mer length for probe index building + #[arg(long, default_value_t = 23, help_heading = "Probe Set Options")] + pub kmer_length: usize, + + /// The skipping strategy to use for k-mer collection + #[arg(long, + default_value = "permissive", + value_parser = clap::builder::PossibleValuesParser::new(["permissive", "strict"]), + help_heading = "Piscem Mapping Options")] + pub skipping_strategy: String, + + /// If piscem >= 0.7.0, enable structural constraints + #[arg(long, help_heading = "Piscem Mapping Options")] + pub struct_constraints: bool, + + /// Maximum cardinality equivalence class to examine + #[arg(long, default_value_t = DefaultParams::MAX_EC_CARD, help_heading = "Piscem Mapping Options")] + pub max_ec_card: u32, + + /// Minimum read count threshold for unfiltered permit list + #[arg(long, default_value_t = 10, help_heading = "Permit List Options")] + pub min_reads: usize, +} + #[derive(Debug, Subcommand)] pub enum Commands { /// build the (expanded) reference index @@ -589,6 +670,8 @@ pub enum Commands { Inspect {}, /// quantify a sample Quant(MapQuantOpts), + /// quantify a 10x Flex GEX sample (probe-based, multiplexed) + FlexQuant(FlexQuantOpts), /// set paths to the programs that simpleaf will use SetPaths(SetPathOpts), /// refreshes version information associated with programs used by simpleaf diff --git a/src/simpleaf_commands/chemistry.rs b/src/simpleaf_commands/chemistry.rs index 91b7924..7fd4027 100644 --- a/src/simpleaf_commands/chemistry.rs +++ b/src/simpleaf_commands/chemistry.rs @@ -406,6 +406,8 @@ pub fn add_chemistry( remote_pl_url: add_opts.remote_url, version, meta, + sample_bc_list: None, + probe_sets: None, }; let mut chem_hm = get_custom_chem_hm(&chem_p)?; diff --git a/src/simpleaf_commands/flex_quant.rs b/src/simpleaf_commands/flex_quant.rs new file mode 100644 index 0000000..ea39299 --- /dev/null +++ b/src/simpleaf_commands/flex_quant.rs @@ -0,0 +1,530 @@ +//! Pipeline orchestration for 10x Flex GEX quantification. +//! +//! Handles the complete Flex pipeline: +//! 1. Resource resolution (probe index, cell BC whitelist, probe barcode file) +//! 2. Mapping with piscem +//! 3. Generate-permit-list (multi-barcode aware) +//! 4. Collate (hierarchical, multi-barcode) +//! 5. Quant + +use crate::core::{context, exec}; +use crate::simpleaf_commands::FlexQuantOpts; +use crate::utils::chem_utils::{CustomChemistry, CustomChemistryMap}; +use crate::utils::constants::CHEMISTRIES_PATH; +use crate::utils::prog_utils; + +use anyhow::{Context, bail}; +use serde_json::json; +use std::io::{BufRead, BufReader, Write}; +use std::path::{Path, PathBuf}; +use std::time::Instant; +use tracing::{info, warn}; + +/// Convert a 10x probe set CSV file to a FASTA file suitable for indexing. +/// +/// Also generates a transcript-to-gene (t2g) map and extracts probe set metadata. +/// Returns (fasta_path, t2g_path, metadata_json). +pub fn convert_probe_csv_to_fasta( + csv_path: &Path, + output_dir: &Path, +) -> anyhow::Result<(PathBuf, PathBuf, serde_json::Value)> { + std::fs::create_dir_all(output_dir)?; + + let file = std::fs::File::open(csv_path) + .with_context(|| format!("couldn't open probe CSV: {}", csv_path.display()))?; + let reader = BufReader::new(file); + + let fasta_path = output_dir.join("probes.fa"); + let t2g_path = output_dir.join("probe_t2g.tsv"); + let meta_path = output_dir.join("probe_set_info.json"); + + let mut fasta_writer = std::io::BufWriter::new(std::fs::File::create(&fasta_path)?); + let mut t2g_writer = std::io::BufWriter::new(std::fs::File::create(&t2g_path)?); + + let mut metadata = serde_json::Map::new(); + let mut num_probes = 0u64; + let mut num_included = 0u64; + let mut num_excluded = 0u64; + let mut genes = std::collections::HashSet::new(); + let mut header_parsed = false; + + for line in reader.lines() { + let line = line?; + let trimmed = line.trim(); + + if let Some(stripped) = trimmed.strip_prefix('#') { + if let Some((key, val)) = stripped.split_once('=') { + metadata.insert(key.to_string(), serde_json::Value::String(val.to_string())); + } + continue; + } + + if !header_parsed { + header_parsed = true; + continue; + } + + let cols: Vec<&str> = trimmed.split(',').collect(); + if cols.len() < 4 { + continue; + } + let gene_id = cols[0]; + let probe_seq = cols[1]; + let probe_id = cols[2]; + let included = cols[3].eq_ignore_ascii_case("true"); + + num_probes += 1; + if included { + num_included += 1; + } else { + num_excluded += 1; + } + // All probes (included and excluded) go into the FASTA and t2g map. + // The index contains all probes, so quant needs a t2g entry for every + // reference. Excluded probes still map to their gene — they simply + // won't contribute meaningful counts in practice. + writeln!(fasta_writer, ">{}", probe_id)?; + writeln!(fasta_writer, "{}", probe_seq)?; + writeln!(t2g_writer, "{}\t{}", probe_id, gene_id)?; + genes.insert(gene_id.to_string()); + } + + fasta_writer.flush()?; + t2g_writer.flush()?; + + metadata.insert("num_probes".to_string(), json!(num_probes)); + metadata.insert("num_included".to_string(), json!(num_included)); + metadata.insert("num_excluded".to_string(), json!(num_excluded)); + metadata.insert("num_genes".to_string(), json!(genes.len())); + metadata.insert( + "source_file".to_string(), + json!(csv_path.file_name().unwrap_or_default().to_string_lossy()), + ); + + let meta_value = serde_json::Value::Object(metadata); + let meta_file = std::fs::File::create(&meta_path)?; + serde_json::to_writer_pretty(meta_file, &meta_value)?; + + info!( + "Converted probe CSV: {} included probes, {} genes, {} excluded", + num_included, + genes.len(), + num_excluded, + ); + + Ok((fasta_path, t2g_path, meta_value)) +} + +/// Main entry point for the flex-quant pipeline. +pub fn flex_map_and_quant(af_home: &Path, opts: FlexQuantOpts) -> anyhow::Result<()> { + let start = Instant::now(); + info!("Starting Flex GEX pipeline"); + + // Load runtime context (program paths) + let rt = context::load_runtime_context(af_home)?; + let piscem_info = rt + .progs + .piscem + .as_ref() + .context("piscem is required for Flex mapping; please run `simpleaf set-paths`")?; + let alevin_fry_info = rt + .progs + .alevin_fry + .as_ref() + .context("alevin-fry is required; please run `simpleaf set-paths`")?; + + // Load chemistry from registry + let chem_path = af_home.join(CHEMISTRIES_PATH); + if !chem_path.exists() { + bail!( + "Chemistry registry not found at {}. Run `simpleaf chemistry refresh` first.", + chem_path.display(), + ); + } + let chem_file = std::fs::File::open(&chem_path)?; + let chem_map: CustomChemistryMap = serde_json::from_reader(chem_file) + .with_context(|| format!("couldn't parse {}", chem_path.display()))?; + + let chem = chem_map + .get(&opts.chemistry) + .ok_or_else(|| { + anyhow::anyhow!( + "Chemistry '{}' not found in registry. Run `simpleaf chemistry refresh`.", + opts.chemistry, + ) + })?; + + if !chem.is_flex_gex() { + bail!( + "Chemistry '{}' is not a Flex GEX protocol. Use `simpleaf quant` for standard chemistries.", + opts.chemistry, + ); + } + + info!( + "Chemistry: {} (geometry: {}, organism: {})", + opts.chemistry, + chem.geometry(), + opts.organism, + ); + + // Create output directory structure + let output_dir = &opts.output; + std::fs::create_dir_all(output_dir)?; + let map_output = output_dir.join("af_map"); + let quant_output = output_dir.join("af_quant"); + std::fs::create_dir_all(&map_output)?; + std::fs::create_dir_all(&quant_output)?; + + // === Step 1: Resolve probe index === + let (index_path, t2g_path) = resolve_probe_index( + af_home, chem, &opts, &piscem_info.exe_path, + )?; + + // === Step 2: Resolve cell barcode whitelist === + // For Flex, the cell BC whitelist is fetched the same way as standard chemistries. + let cell_bc_path = resolve_cell_bc_whitelist(af_home, chem)?; + + // === Step 3: Resolve probe barcode (sample BC) file === + let sample_bc_path = resolve_sample_bc_list(af_home, chem, &opts)?; + + // === Step 4: Map reads with piscem === + info!("Mapping reads with piscem..."); + let mut piscem_cmd = std::process::Command::new(&piscem_info.exe_path); + piscem_cmd + .arg("map-scrna") + .arg("-i").arg(&index_path) + .arg("-g").arg(chem.geometry()) + .arg("-o").arg(&map_output) + .arg("-t").arg(format!("{}", opts.threads)); + + if opts.struct_constraints { + piscem_cmd.arg("--struct-constraints"); + } + piscem_cmd + .arg("--skipping-strategy").arg(&opts.skipping_strategy) + .arg("--max-ec-card").arg(format!("{}", opts.max_ec_card)); + + let r1_str: Vec = opts.reads1.iter().map(|p| p.display().to_string()).collect(); + let r2_str: Vec = opts.reads2.iter().map(|p| p.display().to_string()).collect(); + piscem_cmd.arg("-1").arg(r1_str.join(",")); + piscem_cmd.arg("-2").arg(r2_str.join(",")); + + let map_cmd_str = prog_utils::get_cmd_line_string(&piscem_cmd); + info!("piscem map-scrna cmd: {}", map_cmd_str); + let map_start = Instant::now(); + exec::run_checked(&mut piscem_cmd, "[piscem map-sc]")?; + let map_duration = map_start.elapsed(); + info!("Mapping complete in {:.1}s", map_duration.as_secs_f64()); + + // === Step 5: Generate permit list (multi-barcode) === + info!("Generating permit list..."); + let mut gpl_cmd = std::process::Command::new(&alevin_fry_info.exe_path); + gpl_cmd + .arg("generate-permit-list") + .arg("-i").arg(&map_output) + .arg("-d").arg(chem.expected_ori().as_str()) + .arg("-o").arg(&quant_output) + .arg("-t").arg(format!("{}", opts.threads.min(8))) + .arg("--unfiltered-pl").arg(&cell_bc_path) + .arg("--sample-bc-list").arg(&sample_bc_path) + .arg("--sample-correction-mode").arg("exact") + .arg("--min-reads").arg(format!("{}", opts.min_reads)); + + let gpl_cmd_str = prog_utils::get_cmd_line_string(&gpl_cmd); + info!("generate-permit-list cmd: {}", gpl_cmd_str); + let gpl_start = Instant::now(); + exec::run_checked(&mut gpl_cmd, "[generate permit list]")?; + let gpl_duration = gpl_start.elapsed(); + + // === Step 6: Collate === + info!("Collating..."); + let mut collate_cmd = std::process::Command::new(&alevin_fry_info.exe_path); + collate_cmd + .arg("collate") + .arg("-i").arg(&quant_output) + .arg("-r").arg(&map_output) + .arg("-t").arg(format!("{}", opts.threads)); + + let collate_cmd_str = prog_utils::get_cmd_line_string(&collate_cmd); + info!("collate cmd: {}", collate_cmd_str); + let collate_start = Instant::now(); + exec::run_checked(&mut collate_cmd, "[collate]")?; + let collate_duration = collate_start.elapsed(); + + // === Step 7: Quantify === + info!("Quantifying..."); + let mut quant_cmd = std::process::Command::new(&alevin_fry_info.exe_path); + quant_cmd + .arg("quant") + .arg("-i").arg(&quant_output) + .arg("-o").arg(&quant_output) + .arg("-m").arg(&t2g_path) + .arg("-t").arg(format!("{}", opts.threads)) + .arg("-r").arg(&opts.resolution) + .arg("--use-mtx"); + + let quant_cmd_str = prog_utils::get_cmd_line_string(&quant_cmd); + info!("quant cmd: {}", quant_cmd_str); + let quant_start = Instant::now(); + exec::run_checked(&mut quant_cmd, "[quant]")?; + let quant_duration = quant_start.elapsed(); + + // === Write pipeline metadata === + let meta = json!({ + "chemistry": opts.chemistry, + "organism": opts.organism.to_string(), + "geometry": chem.geometry(), + "resolution": opts.resolution, + "threads": opts.threads, + "kmer_length": opts.kmer_length, + "index_path": index_path.display().to_string(), + "cell_bc_path": cell_bc_path.display().to_string(), + "sample_bc_path": sample_bc_path.display().to_string(), + "t2g_path": t2g_path.display().to_string(), + "map_cmd": map_cmd_str, + "gpl_cmd": gpl_cmd_str, + "collate_cmd": collate_cmd_str, + "quant_cmd": quant_cmd_str, + "map_duration_secs": map_duration.as_secs_f64(), + "gpl_duration_secs": gpl_duration.as_secs_f64(), + "collate_duration_secs": collate_duration.as_secs_f64(), + "quant_duration_secs": quant_duration.as_secs_f64(), + "total_duration_secs": start.elapsed().as_secs_f64(), + "simpleaf_version": env!("CARGO_PKG_VERSION"), + }); + let meta_path = output_dir.join("simpleaf_flex_quant_info.json"); + let meta_file = std::fs::File::create(&meta_path)?; + serde_json::to_writer_pretty(meta_file, &meta)?; + + info!( + "Flex GEX pipeline complete in {:.1}s. Output: {}", + start.elapsed().as_secs_f64(), + output_dir.display(), + ); + + Ok(()) +} + +/// Resolve the probe index: use provided, build from probe set, or auto-download. +fn resolve_probe_index( + af_home: &Path, + chem: &CustomChemistry, + opts: &FlexQuantOpts, + piscem_path: &Path, +) -> anyhow::Result<(PathBuf, PathBuf)> { + // If user provided a pre-built index, use it directly. + if let Some(ref index) = opts.index { + info!("Using user-provided probe index: {}", index.display()); + let t2g_candidate = index.join("probe_t2g.tsv"); + let t2g = if t2g_candidate.exists() { + t2g_candidate + } else if let Some(ref ps) = opts.probe_set { + let conv_dir = opts.output.join("probe_conversion"); + let (_, t2g, _) = convert_probe_csv_to_fasta(ps, &conv_dir)?; + t2g + } else { + bail!( + "When providing --index, a t2g map is also needed. \ + Place probe_t2g.tsv in the index directory, or provide --probe-set." + ); + }; + return Ok((index.clone(), t2g)); + } + + // If user provided a probe set file, build index from it. + if let Some(ref probe_set) = opts.probe_set { + warn!( + "Using user-provided probe set: {}. This overrides the default for '{}'.", + probe_set.display(), + opts.chemistry, + ); + return build_index_from_probe_set(probe_set, opts, piscem_path); + } + + // Auto mode: look up probe set in chemistry registry + let organism_key = opts.organism.to_string(); + let probe_sets = chem.probe_sets.as_ref().ok_or_else(|| { + anyhow::anyhow!( + "Chemistry '{}' has no registered probe sets. Provide --probe-set.", + opts.chemistry, + ) + })?; + let probe_info = probe_sets.get(&organism_key).ok_or_else(|| { + anyhow::anyhow!( + "No probe set for organism '{}' in chemistry '{}'. Provide --probe-set.", + organism_key, + opts.chemistry, + ) + })?; + + // Check cache + let cache_dir = af_home.join("probe_indices"); + std::fs::create_dir_all(&cache_dir)?; + let cache_key = probe_info.plist_name.as_deref().unwrap_or("unknown"); + let cached_index = cache_dir.join(format!("{}_{}", cache_key, opts.kmer_length)); + + if cached_index.join("index.ssi").exists() { + let t2g = cached_index.join("probe_t2g.tsv"); + if t2g.exists() { + info!("Using cached probe index: {}", cached_index.display()); + return Ok((cached_index.join("index"), t2g)); + } + } + + // Download and build + if let Some(ref url) = probe_info.remote_url { + info!("Downloading probe set '{}'...", probe_info.name); + let download_dir = cache_dir.join("downloads"); + std::fs::create_dir_all(&download_dir)?; + let csv_path = download_dir.join(format!("{}.csv", probe_info.name)); + if !csv_path.exists() { + prog_utils::download_to_file(url, &csv_path)?; + info!("Downloaded probe set to {}", csv_path.display()); + } + + // Build index in the cache directory + let mut build_opts = opts.clone(); + build_opts.output = cached_index.clone(); + let result = build_index_from_probe_set(&csv_path, &build_opts, piscem_path)?; + Ok((result.0, result.1)) + } else { + bail!( + "No remote URL for probe set '{}'. Provide --probe-set.", + probe_info.name, + ); + } +} + +/// Build a probe index from a CSV or FASTA file. +fn build_index_from_probe_set( + probe_set: &Path, + opts: &FlexQuantOpts, + piscem_path: &Path, +) -> anyhow::Result<(PathBuf, PathBuf)> { + let index_dir = opts.output.join("probe_index"); + std::fs::create_dir_all(&index_dir)?; + + let ext = probe_set + .extension() + .and_then(|e| e.to_str()) + .unwrap_or(""); + + let (fasta_path, t2g_path) = if ext.eq_ignore_ascii_case("csv") { + info!("Converting probe CSV to FASTA: {}", probe_set.display()); + let (fa, t2g, _meta) = convert_probe_csv_to_fasta(probe_set, &index_dir)?; + (fa, t2g) + } else { + // Assume FASTA — generate identity t2g + let t2g = index_dir.join("probe_t2g.tsv"); + if !t2g.exists() { + let fa_file = std::fs::File::open(probe_set)?; + let reader = BufReader::new(fa_file); + let mut t2g_writer = std::io::BufWriter::new(std::fs::File::create(&t2g)?); + for line in reader.lines() { + let line = line?; + if let Some(name) = line.strip_prefix('>') { + let name = name.split_whitespace().next().unwrap_or(name); + let gene = name.split('|').next().unwrap_or(name); + writeln!(t2g_writer, "{}\t{}", name, gene)?; + } + } + } + (probe_set.to_path_buf(), t2g) + }; + + // Build piscem index + info!("Building probe index with k={}...", opts.kmer_length); + let index_prefix = index_dir.join("index"); + let mut build_cmd = std::process::Command::new(piscem_path); + build_cmd + .arg("build") + .arg("-s").arg(&fasta_path) + .arg("-o").arg(&index_prefix) + .arg("-k").arg(format!("{}", opts.kmer_length)) + .arg("-t").arg(format!("{}", opts.threads)) + .arg("--overwrite"); + + info!("piscem build cmd: {}", prog_utils::get_cmd_line_string(&build_cmd)); + exec::run_checked(&mut build_cmd, "[piscem build]")?; + + // Copy t2g alongside index + let t2g_dest = index_dir.join("probe_t2g.tsv"); + if t2g_path != t2g_dest && t2g_path.exists() { + std::fs::copy(&t2g_path, &t2g_dest)?; + } + + Ok((index_prefix, t2g_dest)) +} + +/// Resolve cell barcode whitelist from the chemistry's plist_name/remote_url. +fn resolve_cell_bc_whitelist(af_home: &Path, chem: &CustomChemistry) -> anyhow::Result { + let plist_dir = af_home.join("plist"); + std::fs::create_dir_all(&plist_dir)?; + + if let Some(ref hash) = chem.plist_name { + let cached = plist_dir.join(hash); + if cached.exists() { + info!("Cell barcode whitelist cached: {}", cached.display()); + return Ok(cached); + } + } + + if let Some(ref url) = chem.remote_pl_url { + let dest = if let Some(ref hash) = chem.plist_name { + plist_dir.join(hash) + } else { + plist_dir.join("cell_bc_whitelist.txt") + }; + info!("Downloading cell barcode whitelist..."); + prog_utils::download_to_file(url, &dest)?; + info!("Downloaded to {}", dest.display()); + Ok(dest) + } else { + bail!("Chemistry '{}' has no cell barcode whitelist URL.", chem.name()); + } +} + +/// Resolve probe barcode (sample BC) file: use provided or auto-fetch. +fn resolve_sample_bc_list( + af_home: &Path, + chem: &CustomChemistry, + opts: &FlexQuantOpts, +) -> anyhow::Result { + if let Some(ref path) = opts.sample_bc_list { + info!("Using user-provided sample barcode list: {}", path.display()); + return Ok(path.clone()); + } + + let sbc_info = chem.sample_bc_list.as_ref().ok_or_else(|| { + anyhow::anyhow!( + "Chemistry '{}' has no sample barcode list. Provide --sample-bc-list.", + opts.chemistry, + ) + })?; + + let plist_dir = af_home.join("plist"); + std::fs::create_dir_all(&plist_dir)?; + + if let Some(ref hash) = sbc_info.plist_name { + let cached = plist_dir.join(hash); + if cached.exists() { + info!("Sample barcode list cached: {}", cached.display()); + return Ok(cached); + } + } + + if let Some(ref url) = sbc_info.remote_url { + let dest = if let Some(ref hash) = sbc_info.plist_name { + plist_dir.join(hash) + } else { + plist_dir.join("sample_bc_list.txt") + }; + info!("Downloading sample barcode list..."); + prog_utils::download_to_file(url, &dest)?; + info!("Downloaded to {}", dest.display()); + Ok(dest) + } else { + bail!("Chemistry '{}' has no sample barcode list URL. Provide --sample-bc-list.", opts.chemistry); + } +} diff --git a/src/utils/af_utils.rs b/src/utils/af_utils.rs index e4fae0a..33826a1 100644 --- a/src/utils/af_utils.rs +++ b/src/utils/af_utils.rs @@ -183,7 +183,7 @@ impl CellFilterMethod { pub enum Chemistry { Rna(RnaChemistry), Atac(AtacChemistry), - Custom(CustomChemistry), + Custom(Box), } impl QueryInRegistry for Chemistry { @@ -312,7 +312,7 @@ impl Chemistry { s, chem.geometry() ); - Chemistry::Custom(chem) + Chemistry::Custom(Box::new(chem)) } else { // we want to bail with an error if the chemistry is *known* but // not supported using this mapper (i.e. if it is a piscem-specific chem @@ -325,12 +325,14 @@ impl Chemistry { index_type.counterpart().as_str() ); } - Chemistry::Custom(CustomChemistry::simple_custom(s).with_context(|| { - format!( - "Could not parse the provided chemistry {}. Please ensure it is a valid chemistry string wrapped by quotes or that it is defined in the custom_chemistries.json file.", - s - ) - })?) + Chemistry::Custom(Box::new( + CustomChemistry::simple_custom(s).with_context(|| { + format!( + "Could not parse the provided chemistry {}. Please ensure it is a valid chemistry string wrapped by quotes or that it is defined in the custom_chemistries.json file.", + s + ) + })?, + )) } } }; @@ -417,6 +419,27 @@ fn is_builtin(s: &str) -> Option<&str> { } } +/// Returns true if the geometry string contains piscem-extended tags +/// that seq_geom_parser doesn't understand: `s[N]` (sample barcode) +/// or numbered barcodes like `b0[N]`, `b1[N]`. +fn has_piscem_extended_tags(geo: &str) -> bool { + // Match s[] — sample barcode tag + // Match b[] — numbered barcode tag (b0[16], b1[8], etc.) + let bytes = geo.as_bytes(); + for i in 0..bytes.len() { + if bytes[i] == b's' && bytes.get(i + 1) == Some(&b'[') { + return true; + } + if bytes[i] == b'b' + && bytes.get(i + 1).is_some_and(|c| c.is_ascii_digit()) + && bytes.get(i + 2) == Some(&b'[') + { + return true; + } + } + false +} + /// determines if a given geometry string is valid; if so /// it returns `Ok(())`, otherwise it returns an `anyhow::Error` describing /// why parsing failed. @@ -429,6 +452,15 @@ pub fn validate_geometry(geo: &str) -> Result<()> { builtin_kwd ); Ok(()) //bail!("The provided geometry is a builtin keyword [{}] (preceeded by \"__\") and so no attempt was made to parse it", builtin_kwd); + } else if has_piscem_extended_tags(geo) { + // Geometries with piscem-only extensions (e.g. s[N] for sample + // barcodes, bN[L] for numbered barcodes) cannot be parsed by + // seq_geom_parser. These are validated by piscem at mapping time. + debug!( + "geometry string contains piscem-extended tags (s[]/b[]), skipping seq_geom_parser validation: {}", + geo + ); + Ok(()) } else { let fg = FragmentGeomDesc::try_from(geo); match fg { diff --git a/src/utils/chem_utils.rs b/src/utils/chem_utils.rs index ef6790b..82c1505 100644 --- a/src/utils/chem_utils.rs +++ b/src/utils/chem_utils.rs @@ -75,6 +75,86 @@ impl ExpectedOri { } } +/// Target organism for probe set selection in Flex protocols. +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] +pub enum Organism { + #[serde(rename = "human")] + Human, + #[serde(rename = "mouse")] + Mouse, + #[serde(untagged)] + Other(String), +} + +impl std::str::FromStr for Organism { + type Err = std::convert::Infallible; + fn from_str(s: &str) -> Result { + Ok(match s.to_lowercase().as_str() { + "human" => Organism::Human, + "mouse" => Organism::Mouse, + other => Organism::Other(other.to_string()), + }) + } +} + +impl fmt::Display for Organism { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Organism::Human => write!(f, "human"), + Organism::Mouse => write!(f, "mouse"), + Organism::Other(s) => write!(f, "{}", s), + } + } +} + +impl clap::ValueEnum for Organism { + fn value_variants<'a>() -> &'a [Self] { + &[Organism::Human, Organism::Mouse] + } + + fn to_possible_value(&self) -> Option { + match self { + Organism::Human => Some(clap::builder::PossibleValue::new("human")), + Organism::Mouse => Some(clap::builder::PossibleValue::new("mouse")), + Organism::Other(_) => None, + } + } +} + +/// Protocol type — distinguishes standard scRNA from Flex, ATAC, etc. +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Default)] +pub enum ProtocolType { + /// Standard single-cell RNA-seq (10x Chromium, etc.) + #[serde(rename = "standard_rna")] + #[default] + StandardRna, + /// 10x Flex gene expression (probe-based, multi-barcode) + #[serde(rename = "flex_gex")] + FlexGex, + /// scATAC-seq + #[serde(rename = "atac")] + Atac, +} + +/// Info for fetching the probe/sample barcode list (for Flex protocols). +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)] +pub struct SampleBcListInfo { + #[serde(skip_serializing_if = "Option::is_none")] + pub plist_name: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub remote_url: Option, +} + +/// Info for a specific probe set (organism-specific, for Flex protocols). +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)] +pub struct ProbeSetInfo { + pub name: String, + #[serde(skip_serializing_if = "Option::is_none")] + pub plist_name: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub remote_url: Option, +} + /// A CustomChemistry is a description of a chemistry that is not /// covered under the different built-in chemistries. It defines the /// relevant information about how a chemistry should be defined including @@ -90,10 +170,16 @@ pub struct CustomChemistry { pub version: String, #[serde(skip_serializing_if = "Option::is_none")] pub plist_name: Option, - #[serde(skip_serializing_if = "Option::is_none")] + #[serde(skip_serializing_if = "Option::is_none", rename = "remote_url")] pub remote_pl_url: Option, #[serde(skip_serializing_if = "Option::is_none")] pub meta: Option, + /// Probe/sample barcode list info (for Flex protocols) + #[serde(skip_serializing_if = "Option::is_none")] + pub sample_bc_list: Option, + /// Probe sets keyed by organism (for Flex protocols) + #[serde(skip_serializing_if = "Option::is_none")] + pub probe_sets: Option>, } /// The key to use to query a custom chemistry @@ -115,6 +201,8 @@ impl CustomChemistry { plist_name: None, remote_pl_url: None, meta: None, + sample_bc_list: None, + probe_sets: None, }) } pub fn geometry(&self) -> &str { @@ -148,6 +236,20 @@ impl CustomChemistry { pub fn meta(&self) -> &Option { &self.meta } + + /// Get the protocol type from meta.protocol_type, defaulting to StandardRna. + pub fn protocol_type(&self) -> ProtocolType { + self.meta + .as_ref() + .and_then(|m| m.get("protocol_type")) + .and_then(|v| serde_json::from_value::(v.clone()).ok()) + .unwrap_or_default() + } + + /// Whether this chemistry is a Flex GEX protocol. + pub fn is_flex_gex(&self) -> bool { + self.protocol_type() == ProtocolType::FlexGex + } } impl fmt::Display for CustomChemistry { diff --git a/tests/snapshots/cli-help/simpleaf___help.txt b/tests/snapshots/cli-help/simpleaf___help.txt index d722166..dd32aa8 100644 --- a/tests/snapshots/cli-help/simpleaf___help.txt +++ b/tests/snapshots/cli-help/simpleaf___help.txt @@ -7,6 +7,7 @@ Commands: chemistry operate on or inspect the chemistry registry inspect inspect the current configuration quant quantify a sample + flex-quant quantify a 10x Flex GEX sample (probe-based, multiplexed) set-paths set paths to the programs that simpleaf will use refresh-prog-info refreshes version information associated with programs used by simpleaf atac run a sub-command dealing with atac-seq data