diff --git a/jcvi/assembly/geneticmap.py b/jcvi/assembly/geneticmap.py index 35fe7ec5..4658e386 100644 --- a/jcvi/assembly/geneticmap.py +++ b/jcvi/assembly/geneticmap.py @@ -10,6 +10,7 @@ from itertools import combinations, groupby from random import sample +from typing import Tuple import numpy as np import seaborn as sns @@ -22,6 +23,7 @@ from ..graphics.base import ( Rectangle, draw_cmap, + normalize_axes, plt, plot_heatmap, savefig, @@ -336,30 +338,12 @@ def dotplot(args): fig.clear() -def heatmap(args): +def read_subsampled_matrix(mstmap: str, subsample: int) -> Tuple[np.ndarray, str, int]: """ - %prog heatmap map - - Calculate pairwise linkage disequilibrium given MSTmap. + Read the subsampled matrix from file if it exists, otherwise calculate it. """ - p = OptionParser(heatmap.__doc__) - p.add_option( - "--subsample", - default=1000, - type="int", - help="Subsample markers to speed up", - ) - opts, args, iopts = p.set_image_options(args, figsize="8x8") - - if len(args) != 1: - sys.exit(not p.print_help()) - - (mstmap,) = args - subsample = opts.subsample data = MSTMap(mstmap) - markerbedfile = mstmap + ".subsample.bed" - ldmatrix = mstmap + ".subsample.matrix" # Take random subsample while keeping marker order if subsample < data.nmarkers: data = [data[x] for x in sorted(sample(range(len(data)), subsample))] @@ -367,6 +351,8 @@ def heatmap(args): logger.debug("Use all markers, --subsample ignored") nmarkers = len(data) + markerbedfile = mstmap + ".subsample.bed" + ldmatrix = mstmap + ".subsample.matrix" if need_update(mstmap, (ldmatrix, markerbedfile)): with open(markerbedfile, "w", encoding="utf-8") as fw: print("\n".join(x.bedline for x in data), file=fw) @@ -389,21 +375,24 @@ def heatmap(args): M = np.fromfile(ldmatrix, dtype="float").reshape(nmarkers, nmarkers) logger.debug("LD matrix `%s` exists (%dx%d).", ldmatrix, nmarkers, nmarkers) - plt.rcParams["axes.linewidth"] = 0 + return M, markerbedfile, nmarkers - fig = plt.figure(1, (iopts.w, iopts.h)) - root = fig.add_axes((0, 0, 1, 1)) - ax = fig.add_axes((0.1, 0.1, 0.8, 0.8)) # the heatmap + +def draw_geneticmap_heatmap(root, ax, mstmap: str, subsample: int): + """ + Draw the heatmap of the genetic map. + """ + M, markerbedfile, nmarkers = read_subsampled_matrix(mstmap, subsample) # Plot chromosomes breaks - bed = Bed(markerbedfile) - xsize = len(bed) + b = Bed(markerbedfile) + xsize = len(b) extent = (0, nmarkers) chr_labels = [] ignore_size = 20 breaks = [] - for seqid, beg, end in bed.get_breaks(): + for seqid, beg, end in b.get_breaks(): ignore = abs(end - beg) < ignore_size pos = (beg + end) / 2 chr_labels.append((seqid, pos, ignore)) @@ -434,11 +423,39 @@ def heatmap(args): m = mstmap.split(".")[0] root.text(0.5, 0.06, f"Linkage Disequilibrium between {m} markers", ha="center") - root.set_xlim(0, 1) - root.set_ylim(0, 1) - root.set_axis_off() + normalize_axes(root) + + +def heatmap(args): + """ + %prog heatmap map + + Calculate pairwise linkage disequilibrium given MSTmap. + """ + p = OptionParser(heatmap.__doc__) + p.add_option( + "--subsample", + default=1000, + type="int", + help="Subsample markers to speed up", + ) + opts, args, iopts = p.set_image_options(args, figsize="8x8") + + if len(args) != 1: + sys.exit(not p.print_help()) + + (mstmap,) = args + + plt.rcParams["axes.linewidth"] = 0 + + fig = plt.figure(1, (iopts.w, iopts.h)) + root = fig.add_axes((0, 0, 1, 1)) + ax = fig.add_axes((0.1, 0.1, 0.8, 0.8)) # the heatmap + + draw_geneticmap_heatmap(root, ax, mstmap, opts.subsample) - image_name = m + ".subsample" + "." + iopts.format + pf = mstmap.split(".")[0] + image_name = pf + ".subsample" + "." + iopts.format savefig(image_name, dpi=iopts.dpi, iopts=iopts) diff --git a/jcvi/assembly/hic.py b/jcvi/assembly/hic.py index ca8cf367..4ad12b37 100644 --- a/jcvi/assembly/hic.py +++ b/jcvi/assembly/hic.py @@ -10,9 +10,11 @@ import os import os.path as op import sys + from collections import defaultdict from functools import partial from multiprocessing import Pool +from typing import List, Optional, Tuple import numpy as np @@ -683,56 +685,21 @@ def generate_groups(groupsfile): yield seqids, color -def heatmap(args): +def read_matrix( + npyfile: str, + header: dict, + contig: Optional[str], + groups: List[Tuple[str, str]], + vmin: int, + vmax: int, + plot_breaks: bool, +): """ - %prog heatmap input.npy genome.json - - Plot heatmap based on .npy data file. The .npy stores a square matrix with - bins of genome, and cells inside the matrix represent number of links - between bin i and bin j. The `genome.json` contains the offsets of each - contig/chr so that we know where to draw boundary lines, or extract per - contig/chromosome heatmap. - - If a 'groups' file is given (with --groups), we will draw squares on the - heatmap. The 'groups' file has the following format, for example: - - seq1,seq2 b - seq1 g - seq2 g - - This will first draw a square around seq1+seq2 with blue color, then seq1 - and seq2 individually with green color. + Read the matrix from the npy file and apply log transformation and thresholding. """ - p = OptionParser(heatmap.__doc__) - p.add_option("--title", help="Title of the heatmap") - p.add_option("--groups", help="Groups file, see doc") - p.add_option("--vmin", default=1, type="int", help="Minimum value in the heatmap") - p.add_option("--vmax", default=6, type="int", help="Maximum value in the heatmap") - p.add_option("--chr", help="Plot this contig/chr only") - p.add_option( - "--nobreaks", - default=False, - action="store_true", - help="Do not plot breaks (esp. if contigs are small)", - ) - opts, args, iopts = p.set_image_options( - args, figsize="11x11", style="white", cmap="coolwarm", format="png", dpi=120 - ) - - if len(args) != 2: - sys.exit(not p.print_help()) - - npyfile, jsonfile = args - contig = opts.chr - groups = list(generate_groups(opts.groups)) if opts.groups else [] - - # Load contig/chromosome starts and sizes - header = json.loads(open(jsonfile).read()) - resolution = header.get("resolution") - assert resolution is not None, "`resolution` not found in `{}`".format(jsonfile) - logger.debug("Resolution set to %d", resolution) # Load the matrix A = np.load(npyfile) + total_bins = header["total_bins"] # Select specific submatrix if contig: @@ -740,6 +707,8 @@ def heatmap(args): contig_size = header["sizes"][contig] contig_end = contig_start + contig_size A = A[contig_start:contig_end, contig_start:contig_end] + else: + A = A[:total_bins, :total_bins] # Convert seqids to positions for each group new_groups = [] @@ -766,40 +735,126 @@ def heatmap(args): B = A.astype("float64") B += 1.0 B = np.log(B) - vmin, vmax = opts.vmin, opts.vmax B[B < vmin] = vmin B[B > vmax] = vmax print(B) logger.debug("Matrix log-transformation and thresholding (%d-%d) done", vmin, vmax) - # Canvas - fig = plt.figure(1, (iopts.w, iopts.h)) - root = fig.add_axes((0, 0, 1, 1)) # whole canvas - ax = fig.add_axes((0.05, 0.05, 0.9, 0.9)) # just the heatmap - breaks = list(header["starts"].values()) - breaks += [header["total_bins"]] # This is actually discarded + breaks += [total_bins] # This is actually discarded breaks = sorted(breaks)[1:] - if contig or opts.nobreaks: + if contig or not plot_breaks: breaks = [] + + return B, new_groups, breaks + + +def draw_hic_heatmap( + root, + ax, + npyfile: str, + jsonfile: str, + contig: Optional[str], + groups_file: str, + title: str, + vmin: int, + vmax: int, + plot_breaks: bool, +): + """ + Draw heatmap based on .npy file. The .npy file stores a square matrix with + bins of genome, and cells inside the matrix represent number of links + between bin i and bin j. The `genome.json` contains the offsets of each + contig/chr so that we know where to draw boundary lines, or extract per + contig/chromosome heatmap. + """ + groups = list(generate_groups(groups_file)) if groups_file else [] + + # Load contig/chromosome starts and sizes + header = json.loads(open(jsonfile, encoding="utf-8").read()) + resolution = header.get("resolution") + assert resolution is not None, "`resolution` not found in `{}`".format(jsonfile) + logger.debug("Resolution set to %d", resolution) + + B, new_groups, breaks = read_matrix( + npyfile, header, contig, groups, vmin, vmax, plot_breaks + ) plot_heatmap(ax, B, breaks, groups=new_groups, binsize=resolution) # Title - pf = npyfile.rsplit(".", 1)[0] - title = opts.title if contig: - title += "-{}".format(contig) + title += f"-{contig}" root.text( 0.5, - 0.98, + 0.96, markup(title), color="darkslategray", - size=18, ha="center", va="center", ) normalize_axes(root) + + +def heatmap(args): + """ + %prog heatmap input.npy genome.json + + Plot heatmap based on .npy data file. The .npy stores a square matrix with + bins of genome, and cells inside the matrix represent number of links + between bin i and bin j. The `genome.json` contains the offsets of each + contig/chr so that we know where to draw boundary lines, or extract per + contig/chromosome heatmap. + + If a 'groups' file is given (with --groups), we will draw squares on the + heatmap. The 'groups' file has the following format, for example: + + seq1,seq2 b + seq1 g + seq2 g + + This will first draw a square around seq1+seq2 with blue color, then seq1 + and seq2 individually with green color. + """ + p = OptionParser(heatmap.__doc__) + p.add_option("--title", help="Title of the heatmap") + p.add_option("--groups", help="Groups file, see doc") + p.add_option("--vmin", default=1, type="int", help="Minimum value in the heatmap") + p.add_option("--vmax", default=6, type="int", help="Maximum value in the heatmap") + p.add_option("--chr", help="Plot this contig/chr only") + p.add_option( + "--nobreaks", + default=False, + action="store_true", + help="Do not plot breaks (esp. if contigs are small)", + ) + opts, args, iopts = p.set_image_options( + args, figsize="11x11", style="white", cmap="coolwarm", dpi=120 + ) + + if len(args) != 2: + sys.exit(not p.print_help()) + + npyfile, jsonfile = args + # Canvas + fig = plt.figure(1, (iopts.w, iopts.h)) + root = fig.add_axes((0, 0, 1, 1)) # whole canvas + ax = fig.add_axes((0.05, 0.05, 0.9, 0.9)) # just the heatmap + + draw_hic_heatmap( + root, + ax, + npyfile, + jsonfile, + contig=opts.chr, + groups_file=opts.groups, + title=opts.title, + vmin=opts.vmin, + vmax=opts.vmax, + plot_breaks=not opts.nobreaks, + ) + + pf = npyfile.rsplit(".", 1)[0] image_name = pf + "." + iopts.format # macOS sometimes has way too verbose output savefig(image_name, dpi=iopts.dpi, iopts=iopts) diff --git a/jcvi/assembly/kmer.py b/jcvi/assembly/kmer.py index b1846eff..9c4b43a5 100644 --- a/jcvi/assembly/kmer.py +++ b/jcvi/assembly/kmer.py @@ -6,32 +6,40 @@ """ import os.path as op import sys -import logging import math -import numpy as np from collections import defaultdict -from more_itertools import chunked from typing import List -from jcvi.graphics.base import ( - plt, +import numpy as np +from more_itertools import chunked + +from ..apps.grid import MakeManager +from ..apps.base import ( + ActionDispatcher, + OptionParser, + PIPE, + Popen, + logger, + need_update, + sh, +) +from ..formats.fasta import Fasta +from ..formats.base import BaseFile, must_open, get_number +from ..graphics.base import ( adjust_spines, asciiplot, - set_human_axis, - savefig, markup, - panel_labels, normalize_axes, + panel_labels, + plt, + savefig, + set_human_axis, set_ticklabels_helvetica, write_messages, ) -from jcvi.formats.fasta import Fasta -from jcvi.formats.base import BaseFile, must_open, get_number -from jcvi.utils.cbook import thousands, percentage -from jcvi.assembly.automaton import iter_project -from jcvi.apps.grid import MakeManager -from jcvi.apps.base import OptionParser, ActionDispatcher, sh, need_update, Popen, PIPE +from ..utils.cbook import thousands, percentage +from .automaton import iter_project KMERYL, KSOAP, KALLPATHS = range(3) @@ -48,7 +56,7 @@ def load_data(self, histfile): self.hist = {} kformat = self.guess_format(histfile) kformats = ("Meryl", "Soap", "AllPaths") - logging.debug("Guessed format: {0}".format(kformats[kformat])) + logger.debug("Guessed format: %s", kformats[kformat]) fp = open(histfile) for rowno, row in enumerate(fp): @@ -270,9 +278,7 @@ def run_optimization(termination=0.999, maxiter=100): copy_num = start if start == end else "{}-{}".format(start, end) g_copies = int(round(g * mid * (end - start + 1))) copy_series.append((mid, copy_num, g_copies, g)) - copy_message = "CN {}: {:.1f} Mb ({:.1f} percent)".format( - copy_num, g_copies / 1e6, g_copies * 100 / genome_size - ) + copy_message = f"CN {copy_num}: {g_copies / 1e6:.1f} Mb ({ g_copies * 100 / genome_size:.1f} percent)" copy_messages.append(copy_message) m += copy_message + "\n" @@ -280,15 +286,13 @@ def run_optimization(termination=0.999, maxiter=100): g_copies = genome_size - inferred_genome_size copy_num = "{}+".format(end + 1) copy_series.append((end + 1, copy_num, g_copies, g_copies / (end + 1))) - m += "CN {}: {:.1f} Mb ({:.1f} percent)\n".format( - copy_num, g_copies / 1e6, g_copies * 100 / genome_size - ) + m += f"CN {copy_num}: {g_copies / 1e6:.1f} Mb ({ g_copies * 100 / genome_size:.1f} percent)\n" # Determine ploidy def determine_ploidy(copy_series, threshold=0.15): counts_so_far = 1 ploidy_so_far = 0 - for mid, copy_num, g_copies, g in copy_series: + for mid, _, g_copies, _ in copy_series: if g_copies / counts_so_far < threshold: break counts_so_far += g_copies @@ -304,7 +308,7 @@ def determine_ploidy(copy_series, threshold=0.15): # Repeat content def calc_repeats(copy_series, ploidy, genome_size): unique = 0 - for mid, copy_num, g_copies, g in copy_series: + for mid, _, g_copies, _ in copy_series: if mid <= ploidy: unique += g_copies else: @@ -355,9 +359,10 @@ def analyze_allpaths(self, ploidy=2, K=23, covmax=1000000): kf_ceil = max(K for (K, c) in data) if kf_ceil > covmax: exceeds = sum(1 for (K, c) in data if K > covmax) - logging.debug( - "A total of {0} distinct K-mers appear > " - "{1} times. Ignored ...".format(exceeds, covmax) + logger.debug( + "A total of %d distinct K-mers appear > %d times. Ignored ...", + exceeds, + covmax, ) kf_ceil = covmax @@ -513,7 +518,7 @@ def write( # Divide indices into batches batches = [] batchsize = (len(self.indices) + batch - 1) // batch - logging.debug("Use batchsize of %d", batchsize) + logger.debug("Use batchsize of %d", batchsize) for i, indices in enumerate(chunked(self.indices, batchsize)): filename_i = filename.format(i + 1) outfile_i = outfile + ".{}".format(i + 1) @@ -660,7 +665,7 @@ def bed(args): KMERS.add(kmer_rc) K = len(kmer) - logging.debug("Imported {} {}-mers".format(len(KMERS), K)) + logger.debug("Imported %d %d-mers", len(KMERS), K) for name, seq in parse_fasta(fastafile): name = name.split()[0] @@ -723,7 +728,7 @@ def kmcop(args): indices = [x for x in indices if x.rsplit(".", 2)[0] not in exclude_ids] after = set(indices) if before > after: - logging.debug( + logger.debug( "Excluded accessions %d → %d (%s)", len(before), len(after), @@ -968,7 +973,7 @@ def count(args): kmers = list(make_kmers(rec.seq, K)) print("\n".join(kmers), file=proc.stdin) proc.stdin.close() - logging.debug(cmd) + logger.debug(cmd) proc.wait() a = bitarray() @@ -979,18 +984,20 @@ def count(args): c = row.strip() a.append(int(c)) a.tofile(fw) - logging.debug("Serialize {0} bits to `{1}`.".format(len(a), binfile)) + logger.debug("Serialize %d bits to `%s`.", len(a), binfile) fw.close() sh("rm {0}".format(t.name)) - logging.debug( - "Shared K-mers (K={0}) between `{1}` and `{2}` written to `{3}`.".format( - K, fastafile, jfdb, binfile - ) + logger.debug( + "Shared K-mers (K=%d) between `%s` and `%s` written to `%s`.", + K, + fastafile, + jfdb, + binfile, ) cntfile = ".".join((fastafile, jfdb, "cnt")) bincount([fastafile, binfile, "-o", cntfile, "-K {0}".format(K)]) - logging.debug("Shared K-mer counts written to `{0}`.".format(cntfile)) + logger.debug("Shared K-mer counts written to `%s`.", cntfile) def bincount(args): @@ -1124,10 +1131,10 @@ def jellyfish(args): gzip = fq.endswith(".gz") hashsize = totalfilesize / coverage - logging.debug( - "Total file size: {0}, hashsize (-s): {1}".format( - human_size(totalfilesize, a_kilobyte_is_1024_bytes=True), hashsize - ) + logger.debug( + "Total file size: %s, hashsize (-s): %d", + human_size(totalfilesize, a_kilobyte_is_1024_bytes=True), + hashsize, ) jfpf = "{0}-K{1}".format(pf, K) @@ -1156,20 +1163,6 @@ def jellyfish(args): sh(cmd) -def merylhistogram(merylfile): - """ - Run meryl to dump histogram to be used in kmer.histogram(). The merylfile - are the files ending in .mcidx or .mcdat. - """ - pf, sf = op.splitext(merylfile) - outfile = pf + ".histogram" - if need_update(merylfile, outfile): - cmd = "meryl -Dh -s {0}".format(pf) - sh(cmd, outfile=outfile) - - return outfile - - def multihistogram(args): """ %prog multihistogram *.histogram species @@ -1190,9 +1183,9 @@ def multihistogram(args): histfiles = args[:-1] species = args[-1] fig = plt.figure(1, (iopts.w, iopts.h)) - root = fig.add_axes([0, 0, 1, 1]) - A = fig.add_axes([0.08, 0.12, 0.38, 0.76]) - B = fig.add_axes([0.58, 0.12, 0.38, 0.76]) + root = fig.add_axes((0, 0, 1, 1)) + A = fig.add_axes((0.08, 0.12, 0.38, 0.76)) + B = fig.add_axes((0.58, 0.12, 0.38, 0.76)) lines = [] legends = [] @@ -1241,39 +1234,141 @@ def multihistogram(args): savefig(imagename, dpi=iopts.dpi, iopts=iopts) +def plot_nbinom_fit(ax, ks: KmerSpectrum, ymax: float, method_info: dict): + """ + Plot the negative binomial fit. + """ + generative_model = method_info["generative_model"] + GG = method_info["Gbins"] + ll = method_info["lambda"] + rr = method_info["rho"] + kf_range = method_info["kf_range"] + stacked = generative_model(GG, ll, rr) + ax.plot( + kf_range, + stacked, + ":", + color="#6a3d9a", + lw=2, + ) + # Plot multiple CN locations, CN1, CN2, ... up to ploidy + cn_color = "#a6cee3" + for i in range(1, ks.ploidy + 1): + x = i * ks.lambda_ + ax.plot((x, x), (0, ymax), "-.", color=cn_color) + ax.text( + x, + ymax * 0.95, + f"CN{i}", + ha="right", + va="center", + color=cn_color, + rotation=90, + ) + + +def draw_ks_histogram( + ax, + histfile: str, + method: str, + coverage: int, + vmin: int, + vmax: int, + species: str, + K: int, + maxiter: int, + peaks: bool, +) -> int: + """ + Draw the K-mer histogram. + """ + ks = KmerSpectrum(histfile) + method_info = ks.analyze(K=K, maxiter=maxiter, method=method) + + Total_Kmers = int(ks.totalKmers) + Kmer_coverage = ks.lambda_ if not coverage else coverage + Genome_size = int(round(Total_Kmers * 1.0 / Kmer_coverage)) + + Total_Kmers_msg = f"Total {K}-mers: {thousands(Total_Kmers)}" + Kmer_coverage_msg = f"{K}-mer coverage: {Kmer_coverage:.1f}x" + Genome_size_msg = f"Estimated genome size: {Genome_size / 1e6:.1f} Mb" + Repetitive_msg = ks.repetitive + SNPrate_msg = ks.snprate + + messages = [ + Total_Kmers_msg, + Kmer_coverage_msg, + Genome_size_msg, + Repetitive_msg, + SNPrate_msg, + ] + for msg in messages: + print(msg, file=sys.stderr) + + x, y = ks.get_xy(vmin, vmax) + title = f"{species} {K}-mer histogram" + + ax.bar(x, y, fc="#b2df8a", lw=0) + + if peaks: # Only works for method 'allpaths' + t = (ks.min1, ks.max1, ks.min2, ks.max2, ks.min3) + tcounts = [(x, y) for x, y in ks.counts if x in t] + if tcounts: + x, y = zip(*tcounts) + tcounts = dict(tcounts) + ax.plot(x, y, "ko", lw=3, mec="k", mfc="w") + ax.text(ks.max1, tcounts[ks.max1], "SNP peak") + ax.text(ks.max2, tcounts[ks.max2], "Main peak") + + _, ymax = ax.get_ylim() + ymax *= 7 / 6 + # Plot the negative binomial fit + if method == "nbinom": + plot_nbinom_fit(ax, ks, ymax, method_info) + messages += [ks.ploidy_message] + ks.copy_messages + + write_messages(ax, messages) + + ax.set_title(markup(title)) + ax.set_xlim((0, vmax)) + ax.set_ylim((0, ymax)) + adjust_spines(ax, ["left", "bottom"], outward=True) + xlabel, ylabel = "Coverage (X)", "Counts" + ax.set_xlabel(xlabel) + ax.set_ylabel(ylabel) + set_human_axis(ax) + + return Genome_size + + def histogram(args): """ %prog histogram meryl.histogram species K - Plot the histogram based on meryl K-mer distribution, species and N are + Plot the histogram based on Jellyfish or meryl K-mer distribution, species and N are only used to annotate the graphic. """ p = OptionParser(histogram.__doc__) p.add_option( "--vmin", dest="vmin", - default=1, + default=2, type="int", help="minimum value, inclusive", ) p.add_option( "--vmax", dest="vmax", - default=100, + default=200, type="int", help="maximum value, inclusive", ) - p.add_option( - "--pdf", - default=False, - action="store_true", - help="Print PDF instead of ASCII plot", - ) p.add_option( "--method", choices=("nbinom", "allpaths"), default="nbinom", - help="'nbinom' - slow but more accurate for het or polyploid genome; 'allpaths' - fast and works for homozygous enomes", + help="'nbinom' - slow but more accurate for het or polyploid genome; " + + "'allpaths' - fast and works for homozygous enomes", ) p.add_option( "--maxiter", @@ -1298,105 +1393,15 @@ def histogram(args): histfile, species, N = args method = opts.method vmin, vmax = opts.vmin, opts.vmax - ascii = not opts.pdf peaks = not opts.nopeaks and method == "allpaths" N = int(N) - if histfile.rsplit(".", 1)[-1] in ("mcdat", "mcidx"): - logging.debug("CA kmer index found") - histfile = merylhistogram(histfile) - - ks = KmerSpectrum(histfile) - method_info = ks.analyze(K=N, maxiter=opts.maxiter, method=method) - - Total_Kmers = int(ks.totalKmers) - coverage = opts.coverage - Kmer_coverage = ks.lambda_ if not coverage else coverage - Genome_size = int(round(Total_Kmers * 1.0 / Kmer_coverage)) - - Total_Kmers_msg = "Total {0}-mers: {1}".format(N, thousands(Total_Kmers)) - Kmer_coverage_msg = "{0}-mer coverage: {1:.1f}x".format(N, Kmer_coverage) - Genome_size_msg = "Estimated genome size: {0:.1f} Mb".format(Genome_size / 1e6) - Repetitive_msg = ks.repetitive - SNPrate_msg = ks.snprate - - for msg in (Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg): - print(msg, file=sys.stderr) - - x, y = ks.get_xy(vmin, vmax) - title = "{0} {1}-mer histogram".format(species, N) - - if ascii: - asciiplot(x, y, title=title) - return Genome_size - - plt.figure(1, (iopts.w, iopts.h)) - plt.bar(x, y, fc="#b2df8a", lw=0) - # Plot the negative binomial fit - if method == "nbinom": - generative_model = method_info["generative_model"] - GG = method_info["Gbins"] - ll = method_info["lambda"] - rr = method_info["rho"] - kf_range = method_info["kf_range"] - stacked = generative_model(GG, ll, rr) - plt.plot( - kf_range, - stacked, - ":", - color="#6a3d9a", - lw=2, - ) - - ax = plt.gca() - - if peaks: # Only works for method 'allpaths' - t = (ks.min1, ks.max1, ks.min2, ks.max2, ks.min3) - tcounts = [(x, y) for x, y in ks.counts if x in t] - if tcounts: - x, y = zip(*tcounts) - tcounts = dict(tcounts) - plt.plot(x, y, "ko", lw=3, mec="k", mfc="w") - ax.text(ks.max1, tcounts[ks.max1], "SNP peak") - ax.text(ks.max2, tcounts[ks.max2], "Main peak") - - ymin, ymax = ax.get_ylim() - ymax = ymax * 7 / 6 - if method == "nbinom": - # Plot multiple CN locations, CN1, CN2, ... up to ploidy - cn_color = "#a6cee3" - for i in range(1, ks.ploidy + 1): - x = i * ks.lambda_ - plt.plot((x, x), (0, ymax), "-.", color=cn_color) - plt.text( - x, - ymax * 0.95, - "CN{}".format(i), - ha="right", - va="center", - color=cn_color, - rotation=90, - ) - - messages = [ - Total_Kmers_msg, - Kmer_coverage_msg, - Genome_size_msg, - Repetitive_msg, - SNPrate_msg, - ] - if method == "nbinom": - messages += [ks.ploidy_message] + ks.copy_messages - write_messages(ax, messages) + fig = plt.figure(1, (iopts.w, iopts.h)) + ax = fig.add_axes((0.1, 0.1, 0.8, 0.8)) - ax.set_title(markup(title)) - ax.set_xlim((0, vmax)) - ax.set_ylim((0, ymax)) - adjust_spines(ax, ["left", "bottom"], outward=True) - xlabel, ylabel = "Coverage (X)", "Counts" - ax.set_xlabel(xlabel) - ax.set_ylabel(ylabel) - set_human_axis(ax) + Genome_size = draw_ks_histogram( + ax, histfile, method, opts.coverage, vmin, vmax, species, N, opts.maxiter, peaks + ) imagename = histfile.split(".")[0] + "." + iopts.format savefig(imagename, dpi=100) diff --git a/jcvi/apps/ks.py b/jcvi/compara/ks.py similarity index 100% rename from jcvi/apps/ks.py rename to jcvi/compara/ks.py diff --git a/jcvi/apps/pedigree.py b/jcvi/compara/pedigree.py similarity index 74% rename from jcvi/apps/pedigree.py rename to jcvi/compara/pedigree.py index 62c9a305..249e71b0 100644 --- a/jcvi/apps/pedigree.py +++ b/jcvi/compara/pedigree.py @@ -12,7 +12,7 @@ import networkx as nx import numpy as np -from ..apps.base import OptionParser, ActionDispatcher, logger, sh +from ..apps.base import OptionParser, ActionDispatcher, logger from ..formats.base import BaseFile from ..graphics.base import set3_n @@ -70,23 +70,44 @@ def __init__(self, pedfile: str): mom = mom if mom != "0" else None s = Sample(name, dad, mom) self[s.name] = s + self._check() - def to_graph(self, inbreeding: Dict[str, SampleInbreeding]) -> nx.DiGraph: + def _check(self): + """ + # Check if all nodes are assigned, including the roots + """ + terminal_nodes = set() + for s in self: + dad, mom = self[s].dad, self[s].mom + if dad and dad not in self: + terminal_nodes.add(dad) + if mom and mom not in self: + terminal_nodes.add(mom) + for s in terminal_nodes: + logger.info("Adding %s to pedigree", s) + self[s] = Sample(s, None, None) + self.terminal_nodes = terminal_nodes + + def to_graph( + self, inbreeding_dict: Dict[str, SampleInbreeding], title: str = "" + ) -> nx.DiGraph: """ Convert the pedigree to a graph. """ - G = nx.DiGraph() + graph_styles = {"labelloc": "b", "label": title, "splines": "curved"} + edge_styles = {"arrowhead": "none", "color": "lightslategray"} + G = nx.DiGraph(**graph_styles) for s in self: dad, mom = self[s].dad, self[s].mom if dad: - G.add_edge(dad, s, color="lightslategray", arrowhead="none") + G.add_edge(dad, s, **edge_styles) if mom: - G.add_edge(mom, s, color="lightslategray", arrowhead="none") + G.add_edge(mom, s, **edge_styles) # Map colors to terminal nodes terminal_nodes = [s for s in self if self[s].is_terminal] colors = dict(zip(terminal_nodes, set3_n(len(terminal_nodes)))) for s in self: - inb = inbreeding[s] + inb = inbreeding_dict[s] label = s if inb.mean_inbreeding > 0.01: label += f"\n(F={inb.mean_inbreeding:.2f})" @@ -98,16 +119,20 @@ def to_graph(self, inbreeding: Dict[str, SampleInbreeding]) -> nx.DiGraph: fillcolor += ":white" else: fillcolor = fillcolor.rsplit(";", 1)[0] - G._node[s]["label"] = label - G._node[s]["shape"] = "circle" - G._node[s]["fixedsize"] = "true" - G._node[s]["width"] = "0.6" - G._node[s]["height"] = "0.6" - G._node[s]["style"] = "wedged" - G._node[s]["fillcolor"] = fillcolor - G._node[s]["color"] = "none" - G._node[s]["fontsize"] = "10" - G._node[s]["fontname"] = "Helvetica" + node_styles = { + "color": "none", + "fillcolor": fillcolor, + "fixedsize": "true", + "fontname": "Helvetica", + "fontsize": "10", + "height": "0.6", + "label": label, + "shape": "circle", + "style": "wedged", + "width": "0.6", + } + for k, v in node_styles.items(): + G._node[s][k] = v return G @@ -206,16 +231,17 @@ def calculate_inbreeding( return results -def inbreeding(args): +def pedigree(args): """ - %prog inbreeding pedfile + %prog pedigree pedfile - Calculate inbreeding coefficients from a pedigree file. + Plot pedigree and calculate pedigree coefficients from a pedigree file. """ - p = OptionParser(inbreeding.__doc__) + p = OptionParser(pedigree.__doc__) p.add_option("--ploidy", default=2, type="int", help="Ploidy") p.add_option("--N", default=10000, type="int", help="Number of samples") - opts, args = p.parse_args(args) + p.add_option("--title", default="", help="Title of the graph") + opts, args, iopts = p.set_image_options(args) if len(args) != 1: sys.exit(not p.print_help()) @@ -227,17 +253,15 @@ def inbreeding(args): for _, v in inb.items(): print(v) - G = ped.to_graph(inb) - dotfile = f"{pedfile}.dot" - nx.nx_agraph.write_dot(G, dotfile) - pdf_file = dotfile + ".pdf" - file_format = pdf_file.split(".")[-1] - sh(f"dot -T{file_format} {dotfile} -o {pdf_file}") - logger.info("Pedigree graph written to `%s`", pdf_file) + G = ped.to_graph(inb, title=opts.title) + A = nx.nx_agraph.to_agraph(G) + image_file = f"{pedfile}.{iopts.format}" + A.draw(image_file, prog="dot") + logger.info("Pedigree graph written to `%s`", image_file) def main(): - actions = (("inbreeding", "calculate inbreeding coefficients"),) + actions = (("pedigree", "Plot pedigree and calculate inbreeding coefficients"),) p = ActionDispatcher(actions) p.dispatch(globals()) diff --git a/jcvi/graphics/base.py b/jcvi/graphics/base.py index ab9d1eed..11f82031 100644 --- a/jcvi/graphics/base.py +++ b/jcvi/graphics/base.py @@ -4,6 +4,7 @@ import copy import os.path as op from os import remove + import sys import logging @@ -12,6 +13,7 @@ logging.getLogger("PIL").setLevel(logging.INFO) from functools import partial +from typing import Optional, List, Tuple, Union import numpy as np import matplotlib as mpl @@ -35,7 +37,6 @@ FancyArrowPatch, FancyBboxPatch, ) -from typing import Optional, List, Tuple, Union from ..apps.base import datadir, glob, listify, logger, sample_N, which from ..formats.base import LineFile @@ -275,6 +276,9 @@ def prettyplot(): def normalize_axes(axes): + """ + Normalize the axes to have the same scale. + """ axes = listify(axes) for ax in axes: ax.set_xlim(0, 1) @@ -282,7 +286,10 @@ def normalize_axes(axes): ax.set_axis_off() -def panel_labels(ax, labels, size=16): +def panel_labels(ax, labels, size: int = 16): + """ + Add panel labels (A, B, ...) to a figure. + """ for xx, yy, panel_label in labels: if rcParams["text.usetex"]: panel_label = r"$\textbf{{{0}}}$".format(panel_label) @@ -420,14 +427,12 @@ def setup_theme( usetex: bool = True, ): try: - import seaborn as sns - extra_rc = { "lines.linewidth": 1, "lines.markeredgewidth": 1, "patch.edgecolor": "k", } - sns.set(context=context, style=style, palette=palette, rc=extra_rc) + sns.set_theme(context=context, style=style, palette=palette, rc=extra_rc) except (ImportError, SyntaxError): pass @@ -549,8 +554,6 @@ def simplify_seqid(seqid): ax.set_xlim(xlim) ax.set_ylim((xlim[1], xlim[0])) # Flip the y-axis so the origin is at the top - ax.set_xticks(ax.get_xticks()) - ax.set_yticks(ax.get_yticks()) ax.set_xticklabels(ax.get_xticks(), family="Helvetica", color="gray") ax.set_yticklabels(ax.get_yticks(), family="Helvetica", color="gray", rotation=90) ax.tick_params(left=True, bottom=True, labelleft=True, labelbottom=True) diff --git a/jcvi/graphics/chromosome.py b/jcvi/graphics/chromosome.py index 5f590283..ef9ce000 100644 --- a/jcvi/graphics/chromosome.py +++ b/jcvi/graphics/chromosome.py @@ -6,9 +6,10 @@ from the script used in the Tang et al. PNAS 2010 paper, sigma figure. """ import sys + from itertools import groupby from math import ceil -from typing import Tuple +from typing import Optional, Tuple import numpy as np @@ -24,6 +25,7 @@ Rectangle, latex, markup, + normalize_axes, plt, savefig, set1_n, @@ -480,7 +482,7 @@ class will get assigned a unique color. `id_mappings` file is optional (if mappingfile = args[1] fig = plt.figure(1, (iopts.w, iopts.h)) - root = fig.add_axes([0, 0, 1, 1]) + root = fig.add_axes((0, 0, 1, 1)) draw_chromosomes( root, @@ -497,9 +499,7 @@ class will get assigned a unique color. `id_mappings` file is optional (if title=opts.title, ) - root.set_xlim(0, 1) - root.set_ylim(0, 1) - root.set_axis_off() + normalize_axes(root) prefix = bedfile.rsplit(".", 1)[0] figname = prefix + "." + opts.format @@ -511,14 +511,14 @@ def draw_chromosomes( bedfile, sizes, iopts, - mergedist, - winsize, - imagemap, - mappingfile=None, - gauge=False, - legend=True, - empty=False, - title=None, + mergedist: int, + winsize: int, + imagemap: bool = False, + mappingfile: Optional[str] = None, + gauge: bool = False, + legend: bool = True, + empty: bool = False, + title: Optional[str] = None, ): bed = Bed(bedfile) prefix = bedfile.rsplit(".", 1)[0] diff --git a/jcvi/graphics/graph.py b/jcvi/graphics/graph.py deleted file mode 100755 index 205fdd07..00000000 --- a/jcvi/graphics/graph.py +++ /dev/null @@ -1,92 +0,0 @@ -#!/usr/bin/env python -# -*- coding: UTF-8 -*- - - -""" -Script to plot diagrams of assembly graph in polyploids. -""" - -from collections import defaultdict -from random import choice, sample - -from brewer2mpl import get_map -from graphviz import Graph -from matplotlib.colors import to_hex -from more_itertools import pairwise - - -def make_sequence(seq, name="S"): - """ - Make unique nodes for sequence graph. - """ - return ["{}_{}_{}".format(name, i, x) for i, x in enumerate(seq)] - - -def sequence_to_graph(G, seq, color="black"): - """ - Automatically construct graph given a sequence of characters. - """ - for x in seq: - if x.endswith("_1"): # Mutation - G.node(x, color=color, width="0.1", shape="circle", label="") - else: - G.node(x, color=color) - for a, b in pairwise(seq): - G.edge(a, b, color=color) - - -def zip_sequences(G, allseqs): - """ - Fuse certain nodes together, if they contain same data except for the - sequence name. - """ - for s in zip(*allseqs): - groups = defaultdict(list) - for x in s: - part = x.split("_", 1)[1] - groups[part].append(x) - for part, g in groups.items(): - with G.subgraph(name="cluster_" + part) as c: - for x in g: - c.node(x) - c.attr(style="invis") - - -def main(): - SIZE = 20 - PLOIDY = 2 - MUTATIONS = 2 - - indices = range(SIZE) - # Build fake data - seqA = list("0" * SIZE) - allseqs = [seqA[:] for x in range(PLOIDY)] # Hexaploid - for s in allseqs: - for i in [choice(indices) for x in range(MUTATIONS)]: - s[i] = "1" - - allseqs = [ - make_sequence(s, name=name) - for (s, name) in zip(allseqs, [str(x) for x in range(PLOIDY)]) - ] - - # Build graph structure - G = Graph("Assembly graph", filename="graph") - G.attr(rankdir="LR", fontname="Helvetica", splines="true") - G.attr(ranksep=".2", nodesep="0.02") - G.attr("node", shape="point") - G.attr("edge", dir="none", penwidth="4") - - colorset = get_map("Set2", "qualitative", 8).mpl_colors - colorset = [to_hex(x) for x in colorset] - colors = sample(colorset, PLOIDY) - for s, color in zip(allseqs, colors): - sequence_to_graph(G, s, color=color) - zip_sequences(G, allseqs) - - # Output graph - G.view() - - -if __name__ == "__main__": - main() diff --git a/jcvi/graphics/landscape.py b/jcvi/graphics/landscape.py index e28b96e6..b0322b6a 100644 --- a/jcvi/graphics/landscape.py +++ b/jcvi/graphics/landscape.py @@ -9,31 +9,32 @@ import os.path as op import sys -import logging + +from collections import Counter, OrderedDict, defaultdict +from typing import List, Optional import numpy as np -from collections import defaultdict, OrderedDict +from ..algorithms.matrix import moving_sum +from ..apps.base import OptionParser, ActionDispatcher, logger +from ..formats.base import BaseFile, DictFile, LineFile, must_open +from ..formats.bed import Bed, bins +from ..formats.sizes import Sizes +from ..utils.cbook import human_size, autoscale -from jcvi.formats.sizes import Sizes -from jcvi.formats.base import BaseFile, DictFile, LineFile, must_open -from jcvi.formats.bed import Bed, bins -from jcvi.algorithms.matrix import moving_sum -from jcvi.graphics.base import ( - plt, - Rectangle, +from .base import ( CirclePolygon, - savefig, - ticker, + Rectangle, + adjust_spines, human_readable_base, latex, markup, - set_human_axis, normalize_axes, - adjust_spines, + plt, + savefig, + set_human_axis, + ticker, ) -from jcvi.utils.cbook import human_size, autoscale -from jcvi.apps.base import OptionParser, ActionDispatcher # Colors picked from Schmutz soybean genome paper using ColorPic @@ -92,7 +93,7 @@ def __init__(self, row, delimiter=","): class ChrInfoFile(BaseFile, OrderedDict): def __init__(self, filename, delimiter=","): super(ChrInfoFile, self).__init__(filename) - with open(filename) as fp: + with open(filename, encoding="utf-8") as fp: for row in fp: if row[0] == "#": continue @@ -113,7 +114,7 @@ def __init__(self, row, delimiter=","): class TitleInfoFile(BaseFile, OrderedDict): def __init__(self, filename, delimiter=","): super(TitleInfoFile, self).__init__(filename) - with open(filename) as fp: + with open(filename, encoding="utf-8") as fp: for row in fp: if row[0] == "#": continue @@ -148,15 +149,13 @@ def parse_distfile(filename): Args: filename (str): Path to the file. """ - from collections import defaultdict, Counter - dists = defaultdict(Counter) with must_open(filename) as fp: for row in fp: chromosome, start, end, depth = row.split() depth = int(float(depth)) dists[chromosome][depth] += 1 - logging.debug("Loaded {} seqids".format(len(dists))) + logger.debug("Loaded %d seqids", len(dists)) return dists @@ -176,7 +175,7 @@ def parse_groupsfile(filename): for row in fp: chrs, colors = row.split() groups.append((chrs.split(","), colors.split(","))) - logging.debug("Loaded {} groups".format(len(groups))) + logger.debug("Loaded %d groups", len(groups)) return groups @@ -221,7 +220,7 @@ def mosdepth(args): # Construct a composite figure with N tracks indicated in the groups fig = plt.figure(1, (iopts.w, iopts.h)) - root = fig.add_axes([0, 0, 1, 1]) + root = fig.add_axes((0, 0, 1, 1)) rows = len(groups) ypad = 0.05 @@ -230,10 +229,10 @@ def mosdepth(args): for group_idx, (chrs, colors) in enumerate(groups): yy -= yinterval - ax = fig.add_axes([0.15, yy, 0.7, yinterval * 0.85]) + ax = fig.add_axes((0.15, yy, 0.7, yinterval * 0.85)) for c, color in zip(chrs, colors): cdata = dists[c].items() - logging.debug("Importing {} records for {}".format(len(cdata), c)) + logger.debug("Importing %d records for %s", len(cdata), c) cx, cy = zip(*sorted(cdata)) ax.plot(cx, cy, "-", color=color) if logscale: @@ -271,14 +270,14 @@ def mosdepth(args): def draw_depth( root, ax, - bed, - chrinfo={}, - defaultcolor="k", - sepcolor="w", - ylim=100, - logscale=False, - title=None, - subtitle=None, + bed: Bed, + chrinfo: dict = {}, + defaultcolor: str = "k", + sepcolor: str = "w", + maxdepth: int = 100, + logscale: bool = False, + title: Optional[str] = None, + subtitle: Optional[str] = None, ): """Draw depth plot on the given axes, using data from bed @@ -289,7 +288,7 @@ def draw_depth( chrinfo (ChrInfoFile): seqid => color, new name defaultcolor (str): matplotlib-compatible color for data points sepcolor (str): matplotlib-compatible color for chromosome breaks - ylim (int): Upper limit of the y-axis (depth) + maxdepth (int): Upper limit of the y-axis (depth) title (str): Title of the figure, to the right of the axis subtitle (str): Subtitle of the figure, just below title """ @@ -334,7 +333,7 @@ def draw_depth( s=8, lw=0, ) - logging.debug("Obtained {} data points with depth data".format(len(data))) + logger.debug("Obtained %d data points with depth data", len(data)) # Per seqid median medians = {} @@ -353,11 +352,11 @@ def draw_depth( alpha=0.5, ) - # vertical lines for all the breaks + # Vertical lines for all the breaks for pos in starts.values(): - ax.plot((pos, pos), (0, ylim), "-", lw=1, color=sepcolor) + ax.plot((pos, pos), (0, maxdepth), "-", lw=1, color=sepcolor) - # beautify the numeric axis + # Beautify the numeric axis for tick in ax.get_xticklines() + ax.get_yticklines(): tick.set_visible(False) @@ -380,6 +379,15 @@ def draw_depth( va="center", ) + # Add an arrow to the right of the plot, indicating these are median depths + root.text( + 0.91, + 0.88, + r"$\leftarrow$median", + color="lightslategray", + va="center", + ) + if title: root.text( 0.95, @@ -405,7 +413,7 @@ def draw_depth( ax.set_xlim(0, xsize) if logscale: ax.set_yscale("log", basey=2) - ax.set_ylim(1 if logscale else 0, ylim) + ax.set_ylim(1 if logscale else 0, maxdepth) ax.set_ylabel("Depth") set_human_axis(ax) @@ -413,6 +421,52 @@ def draw_depth( normalize_axes(root) +def draw_multi_depth( + root, + panel_roots, + panel_axes, + bedfiles: List[str], + chrinfo_file: str, + titleinfo_file: str, + maxdepth: int, + logscale: bool, +): + """ + Draw multiple depth plots on the same canvas. + """ + chrinfo = ChrInfoFile(chrinfo_file) if chrinfo_file else {} + titleinfo = TitleInfoFile(titleinfo_file) if titleinfo_file else {} + npanels = len(bedfiles) + yinterval = 1.0 / npanels + ypos = 1 - yinterval + for bedfile, panel_root, panel_ax in zip(bedfiles, panel_roots, panel_axes): + pf = op.basename(bedfile).split(".", 1)[0] + bed = Bed(bedfile) + + if ypos > 0.001: + root.plot((0.02, 0.98), (ypos, ypos), "-", lw=2, color="lightslategray") + + title = titleinfo.get(bedfile, pf.split("_", 1)[0]) + subtitle = None + if isinstance(title, TitleInfoLine): + subtitle = title.subtitle + title = title.title + + draw_depth( + panel_root, + panel_ax, + bed, + chrinfo=chrinfo, + maxdepth=maxdepth, + logscale=logscale, + title=title, + subtitle=subtitle, + ) + ypos -= yinterval + + normalize_axes(root) + + def depth(args): """ %prog depth *.regions.bed.gz @@ -455,43 +509,31 @@ def depth(args): sys.exit(not p.print_help()) bedfiles = args - chrinfo = ChrInfoFile(opts.chrinfo) if opts.chrinfo else {} - titleinfo = TitleInfoFile(opts.titleinfo) if opts.titleinfo else {} fig = plt.figure(1, (iopts.w, iopts.h)) - root = fig.add_axes([0, 0, 1, 1]) + root = fig.add_axes((0, 0, 1, 1)) npanels = len(bedfiles) yinterval = 1.0 / npanels ypos = 1 - yinterval - for bedfile in bedfiles: - pf = op.basename(bedfile).split(".", 1)[0] - bed = Bed(bedfile) - - panel_root = root if npanels == 1 else fig.add_axes([0, ypos, 1, yinterval]) - panel_ax = fig.add_axes([0.1, ypos + 0.2 * yinterval, 0.8, 0.65 * yinterval]) - if ypos > 0.001: - root.plot((0, 1), (ypos, ypos), "-", lw=2, color="lightslategray") - - title = titleinfo.get(bedfile, pf.split("_", 1)[0]) - subtitle = None - if isinstance(title, TitleInfoLine): - subtitle = title.subtitle - title = title.title - - draw_depth( - panel_root, - panel_ax, - bed, - chrinfo=chrinfo, - ylim=opts.maxdepth, - logscale=opts.logscale, - title=title, - subtitle=subtitle, - ) + panel_roots, panel_axes = [], [] + for _ in range(npanels): + panel_root = root if npanels == 1 else fig.add_axes((0, ypos, 1, yinterval)) + panel_ax = fig.add_axes((0.1, ypos + 0.2 * yinterval, 0.8, 0.65 * yinterval)) + panel_roots.append(panel_root) + panel_axes.append(panel_ax) ypos -= yinterval - normalize_axes(root) + draw_multi_depth( + root, + panel_roots, + panel_axes, + bedfiles, + opts.chrinfo, + opts.titleinfo, + opts.maxdepth, + opts.logscale, + ) if npanels > 1: pf = op.commonprefix(bedfiles) @@ -514,10 +556,11 @@ def check_window_options(opts): shift = opts.shift subtract = opts.subtract assert window % shift == 0, "--window must be divisible by --shift" - logging.debug( - "Line/stack-plot options: window={0} shift={1} subtract={2}".format( - window, shift, subtract - ) + logger.debug( + "Line/stack-plot options: window=%d shift=%d subtract=%d", + window, + shift, + subtract, ) merge = not opts.nomerge @@ -641,7 +684,7 @@ def composite(args): plt.rcParams["ytick.major.size"] = 0 fig = plt.figure(1, (iopts.w, iopts.h)) - root = fig.add_axes([0, 0, 1, 1]) + root = fig.add_axes((0, 0, 1, 1)) root.text(0.5, 0.95, chr, ha="center", color="darkslategray") @@ -649,7 +692,7 @@ def composite(args): xlen = xend - xstart ratio = xlen / clen # Line plots - ax = fig.add_axes([xstart, 0.6, xlen, 0.3]) + ax = fig.add_axes((xstart, 0.6, xlen, 0.3)) lineplot(ax, linebins, nbins, chr, window, shift) # Bar plots @@ -822,7 +865,7 @@ def heatmap(args): clen = Sizes(fastafile).mapping[chr] fig = plt.figure(1, (iopts.w, iopts.h)) - root = fig.add_axes([0, 0, 1, 1]) + root = fig.add_axes((0, 0, 1, 1)) # Gauge ratio = draw_gauge(root, margin, clen, rightmargin=4 * margin) @@ -837,7 +880,7 @@ def heatmap(args): cc = ca[0].upper() + cb root.add_patch(Rectangle((xx, yy), xlen, yinterval - inner, color=gray)) - ax = fig.add_axes([xx, yy, xlen, yinterval - inner]) + ax = fig.add_axes((xx, yy, xlen, yinterval - inner)) nbins = get_nbins(clen, shift) @@ -1028,7 +1071,7 @@ def stack(args): pf = fastafile.rsplit(".", 1)[0] fig = plt.figure(1, (iopts.w, iopts.h)) - root = fig.add_axes([0, 0, 1, 1]) + root = fig.add_axes((0, 0, 1, 1)) # Gauge ratio = draw_gauge(root, margin, maxl) @@ -1049,7 +1092,7 @@ def stack(args): cc = "\n".join((cc, "({0})".format(switch[cc]))) root.add_patch(Rectangle((xx, yy), xlen, yinterval - inner, color=gray)) - ax = fig.add_axes([xx, yy, xlen, yinterval - inner]) + ax = fig.add_axes((xx, yy, xlen, yinterval - inner)) nbins = clen / shift if clen % shift: diff --git a/jcvi/projects/alfalfa.py b/jcvi/projects/alfalfa.py deleted file mode 100644 index 3a67b7c3..00000000 --- a/jcvi/projects/alfalfa.py +++ /dev/null @@ -1,62 +0,0 @@ -#!/usr/bin/env python -# -*- coding: UTF-8 -*- - -""" -Random collection of scripts associated with alfalfa assembly. -""" - -import sys - -from jcvi.formats.bed import Bed, fastaFromBed -from jcvi.graphics.mummerplot import main as mummerplot_main -from jcvi.apps.base import OptionParser, ActionDispatcher, sh - - -def main(): - - actions = (("nucmer", "select specific chromosome region based on MTR mapping"),) - p = ActionDispatcher(actions) - p.dispatch(globals()) - - -def nucmer(args): - """ - %prog nucmer mappings.bed MTR.fasta assembly.fasta chr1 3 - - Select specific chromosome region based on MTR mapping. The above command - will extract chr1:2,000,001-3,000,000. - """ - p = OptionParser(nucmer.__doc__) - opts, args = p.parse_args(args) - - if len(args) != 5: - sys.exit(not p.print_help()) - - mapbed, mtrfasta, asmfasta, chr, idx = args - idx = int(idx) - m1 = 1000000 - bedfile = "sample.bed" - bed = Bed() - bed.add("\t".join(str(x) for x in (chr, (idx - 1) * m1, idx * m1))) - bed.print_to_file(bedfile) - - cmd = "intersectBed -a {0} -b {1} -nonamecheck -sorted | cut -f4".format( - mapbed, bedfile - ) - idsfile = "query.ids" - sh(cmd, outfile=idsfile) - - sfasta = fastaFromBed(bedfile, mtrfasta) - qfasta = "query.fasta" - cmd = "faSomeRecords {0} {1} {2}".format(asmfasta, idsfile, qfasta) - sh(cmd) - - cmd = "nucmer {0} {1}".format(sfasta, qfasta) - sh(cmd) - - mummerplot_main(["out.delta", "--refcov=0"]) - sh("mv out.pdf {0}.{1}.pdf".format(chr, idx)) - - -if __name__ == "__main__": - main() diff --git a/jcvi/projects/jcvi.py b/jcvi/projects/jcvi.py new file mode 100644 index 00000000..dc38dd02 --- /dev/null +++ b/jcvi/projects/jcvi.py @@ -0,0 +1,249 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -*- + +""" +Functions in this script produce figures in the JCVI manuscript. +""" + +import sys + +import networkx as nx + +from ..apps.base import ActionDispatcher, OptionParser, logger +from ..assembly.geneticmap import draw_geneticmap_heatmap +from ..assembly.hic import draw_hic_heatmap +from ..assembly.kmer import draw_ks_histogram +from ..compara.pedigree import Pedigree, calculate_inbreeding +from ..graphics.base import normalize_axes, panel_labels, plt, savefig +from ..graphics.chromosome import draw_chromosomes +from ..graphics.landscape import draw_multi_depth + + +def diversity(args): + """ + %prog diversity pedigree.ped VAR?_srtd.wgs.regions.bed.gz + + Plot diversity composite figure, including: + A. Pedigree + B. Depth distribution across genomes + """ + p = OptionParser(diversity.__doc__) + _, args, iopts = p.set_image_options(args, figsize="14x7") + + if len(args) < 2: + sys.exit(not p.print_help()) + + pedfile, bedfiles = args[0], args[1:] + + fig = plt.figure(1, (iopts.w, iopts.h)) + root = fig.add_axes((0, 0, 1, 1)) + + ax1_root = fig.add_axes((0, 0, 0.25, 1)) + ax2_root = fig.add_axes((0.25, 0, 0.75, 1)) + + # Panel A + logger.info("Plotting pedigree") + ped = Pedigree(pedfile) + pngfile = f"{pedfile}.png" + inb = calculate_inbreeding(ped, ploidy=4, N=10000) + + G = ped.to_graph(inb, title="Pedigree of Variety1") + A = nx.nx_agraph.to_agraph(G) + dpi = 300 + A.draw(pngfile, prog="dot", args=f"-Gdpi={dpi}") + logger.info("Pedigree graph written to `%s`", pngfile) + + # Show the image as is + ax1_root.imshow(plt.imread(pngfile)) + ax1_root.set_axis_off() + + # Panel B + logger.info("Plotting depth distribution across genomes") + npanels = len(bedfiles) + yinterval = 1.0 / npanels + ypos = 1 - yinterval + panel_roots, panel_axes = [], [] + for _ in range(npanels): + panel_root = fig.add_axes((0.25, ypos, 0.75, yinterval)) + panel_ax = fig.add_axes( + (0.25 + 0.1 * 0.75, ypos + 0.2 * yinterval, 0.8 * 0.75, 0.65 * yinterval) + ) + panel_roots.append(panel_root) + panel_axes.append(panel_ax) + ypos -= yinterval + + draw_multi_depth( + ax2_root, + panel_roots, + panel_axes, + bedfiles, + chrinfo_file="chrinfo.txt", + titleinfo_file="titleinfo.txt", + maxdepth=100, + logscale=False, + ) + + labels = ( + (0.02, 0.95, "A"), + (0.25 + 0.25 * 0.1, 0.95, "B"), + ) + panel_labels(root, labels) + normalize_axes([root, ax2_root]) + + image_name = "diversity.pdf" + savefig(image_name, dpi=iopts.dpi, iopts=iopts) + + +def landscape(args): + """ + %prog landscape features.bed athaliana.sizes Fig5.png + + Plot landscape composite figure, including: + A. Example genomic features painted on Arabidopsis genome + B. Landscape of genomic features across the genome + """ + p = OptionParser(landscape.__doc__) + _, args, iopts = p.set_image_options(args, figsize="10x7") + + if len(args) != 3: + sys.exit(not p.print_help()) + + bedfile, sizesfile, pngfile = args + + fig = plt.figure(1, (iopts.w, iopts.h)) + root = fig.add_axes((0, 0, 1, 1)) + aspect_ratio = iopts.w / iopts.h + ax1_root = fig.add_axes((0, 1 / 4, 0.4, 0.5 * aspect_ratio)) + ax2_root = fig.add_axes((0.4, 0.43, 0.54, 0.57)) + ax3_root = fig.add_axes((0.43, -0.13, 0.54, 0.57)) + + # Panel A + logger.info("Plotting example genomic features painted on Arabidopsis genome") + draw_chromosomes( + ax1_root, + bedfile, + sizesfile, + iopts=iopts, + mergedist=0, + winsize=50000, + gauge=True, + legend=True, + empty=False, + title="*Arabidopsis* genome features", + ) + + # Panel B + logger.info("Plotting landscape of genomic features across the genome") + M = plt.imread(pngfile) + width, height = M.shape[1], M.shape[0] + # Split the image into left and right parts + mid = width // 2 - 10 + left_cut = right_cut = 900 + logger.info("Image size: %dx%d", width, height) + logger.info("Splitting image at %d", mid) + + LM, RM = M[:left_cut, :mid], M[:right_cut, mid:] + logger.info("Left image size: %dx%d", LM.shape[1], LM.shape[0]) + logger.info("Right image size: %dx%d", RM.shape[1], RM.shape[0]) + + ax2_root.imshow(LM, extent=(0.4, 1, 0.5, 1), aspect="auto") + ax3_root.imshow(RM, extent=(0.4, 1, 0, 0.5), aspect="auto") + ax2_root.set_axis_off() + ax3_root.set_axis_off() + + labels = ( + (0.02, 0.95, "A"), + (0.42, 0.95, "B"), + ) + panel_labels(root, labels) + normalize_axes([root, ax1_root]) + + image_name = "landscape.pdf" + savefig(image_name, dpi=iopts.dpi, iopts=iopts) + + +def genomebuild(args): + """ + %prog genomebuild reads.histo geneticmap.matrix hic.resolution_500000.npy hic.resolution_500000.json + + Plot genome build composite figure, including: + A. Read kmer histogram + B. Genetic map concordance + C. Hi-C contact map concordance + """ + p = OptionParser(genomebuild.__doc__) + _, args, iopts = p.set_image_options(args, figsize="21x7") + + if len(args) != 4: + sys.exit(not p.print_help()) + + reads_histo, mstmap, hic_matrix, hic_json = args + + fig = plt.figure(1, (iopts.w, iopts.h)) + root = fig.add_axes((0, 0, 1, 1)) + ax1_root = fig.add_axes((0, 0, 1 / 3, 1)) + ax2_root = fig.add_axes((1 / 3, 0, 1 / 3, 1)) + ax3_root = fig.add_axes((2 / 3, 0, 1 / 3, 1)) + ax1 = fig.add_axes((1 / 3 * 0.1, 0.1, 1 / 3 * 0.8, 0.8)) + ax2 = fig.add_axes((1 / 3 * 1.1, 0.1, 1 / 3 * 0.8, 0.8)) + ax3 = fig.add_axes((1 / 3 * 2.1, 0.1, 1 / 3 * 0.8, 0.8)) + + # Panel A + logger.info("Plotting read kmer histogram") + _ = draw_ks_histogram( + ax1, + reads_histo, + method="nbinom", + coverage=0, + vmin=2, + vmax=200, + species="*S. species* ‘Variety 1’", + K=21, + maxiter=100, + peaks=False, + ) + + # Panel B + logger.info("Plotting genetic map concordance") + draw_geneticmap_heatmap(ax2_root, ax2, mstmap, 1000) + + # Panel C + logger.info("Plotting Hi-C contact map concordance") + draw_hic_heatmap( + ax3_root, + ax3, + hic_matrix, + hic_json, + contig=None, + groups_file="groups", + title="*S. species* Hi-C contact map", + vmin=1, + vmax=6, + plot_breaks=True, + ) + + labels = ( + (1 / 3 * 0.1, 0.95, "A"), + (1 / 3 * 1.1, 0.95, "B"), + (1 / 3 * 2.1, 0.95, "C"), + ) + panel_labels(root, labels) + normalize_axes([root, ax1_root, ax2_root, ax3_root]) + + image_name = "genomebuild.pdf" + savefig(image_name, dpi=iopts.dpi, iopts=iopts) + + +def main(): + + actions = ( + ("diversity", "Plot diversity composite figure"), + ("genomebuild", "Plot genome build composite figure"), + ("landscape", "Plot landscape composite figure"), + ) + p = ActionDispatcher(actions) + p.dispatch(globals()) + + +if __name__ == "__main__": + main() diff --git a/jcvi/projects/misc.py b/jcvi/projects/misc.py index 0a287434..7c074ac3 100644 --- a/jcvi/projects/misc.py +++ b/jcvi/projects/misc.py @@ -33,7 +33,6 @@ def main(): ("oropetium", "plot oropetium micro-synteny (requires data)"), # Pomegranate paper (Qin et al., 2017 Plant Journal) ("pomegranate", "plot pomegranate macro- and micro-synteny (requires data)"), - # Unpublished ("birch", "plot birch macro-synteny (requires data)"), ("litchi", "plot litchi micro-synteny (requires data)"), ("utricularia", "plot utricularia micro-synteny (requires data)"), @@ -76,7 +75,7 @@ def waterlilyGOM(args): pf = datafile.rsplit(".", 1)[0] fig = plt.figure(1, (iopts.w, iopts.h)) - root = fig.add_axes([0, 0, 1, 1]) + root = fig.add_axes((0, 0, 1, 1)) margin, rmargin = 0.15, 0.19 # Left and right margin leafinfo = LeafInfoFile("leafinfo.csv").cache diff --git a/jcvi/projects/pistachio.py b/jcvi/projects/pistachio.py deleted file mode 100644 index 83e1ca93..00000000 --- a/jcvi/projects/pistachio.py +++ /dev/null @@ -1,83 +0,0 @@ -#!/usr/bin/env python -# -*- coding: UTF-8 -*- - -""" -Functions related to processing of the pistachio genome. -""" -import sys - -from jcvi.apps.base import OptionParser, ActionDispatcher - - -def main(): - - actions = (("agp", "convert from the table file to agp format"),) - p = ActionDispatcher(actions) - p.dispatch(globals()) - - -def agp(args): - """ - %prog agp Siirt_Female_pistachio_23May2017_table.txt - - The table file, as prepared by Dovetail Genomics, is not immediately useful - to convert gene model coordinates, as assumed by formats.chain.fromagp(). - This is a quick script to do such conversion. The file structure of this - table file is described in the .manifest file shipped in the same package:: - - pistachio_b_23May2017_MeyIy.table.txt - Tab-delimited table describing positions of input assembly scaffolds - in the Hirise scaffolds. The table has the following format: - - 1. HiRise scaffold name - 2. Input sequence name - 3. Starting base (zero-based) of the input sequence - 4. Ending base of the input sequence - 5. Strand (- or +) of the input sequence in the scaffold - 6. Starting base (zero-based) in the HiRise scaffold - 7. Ending base in the HiRise scaffold - - where '-' in the strand column indicates that the sequence is reverse - complemented relative to the input assembly. - - CAUTION: This is NOT a proper AGP format since it does not have gaps in - them. - """ - p = OptionParser(agp.__doc__) - opts, args = p.parse_args(args) - - if len(args) != 1: - sys.exit(not p.print_help()) - - (tablefile,) = args - fp = open(tablefile) - for row in fp: - atoms = row.split() - hr = atoms[0] - scaf = atoms[1] - scaf_start = int(atoms[2]) + 1 - scaf_end = int(atoms[3]) - strand = atoms[4] - hr_start = int(atoms[5]) + 1 - hr_end = int(atoms[6]) - - print( - "\t".join( - str(x) - for x in ( - hr, - hr_start, - hr_end, - 1, - "W", - scaf, - scaf_start, - scaf_end, - strand, - ) - ) - ) - - -if __name__ == "__main__": - main()