Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add JCVI demo script #651

Merged
merged 24 commits into from
Apr 29, 2024
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Refactor into read_matrix()
tanghaibao committed Apr 28, 2024
commit ad86bf63fcd3e455bdd8893991285ea4a376599b
111 changes: 63 additions & 48 deletions jcvi/assembly/hic.py
Original file line number Diff line number Diff line change
@@ -10,9 +10,11 @@
import os
import os.path as op
import sys

from collections import defaultdict
from functools import partial
from multiprocessing import Pool
from typing import List, Tuple

import numpy as np

@@ -683,6 +685,65 @@ def generate_groups(groupsfile):
yield seqids, color


def read_matrix(
npyfile: str, header: dict, contig: str, groups: List[Tuple[str, str]], opts
):
"""
Read the matrix from the npy file and apply log transformation and thresholding.
"""
# Load the matrix
A = np.load(npyfile)
total_bins = header["total_bins"]

# Select specific submatrix
if contig:
contig_start = header["starts"][contig]
contig_size = header["sizes"][contig]
contig_end = contig_start + contig_size
A = A[contig_start:contig_end, contig_start:contig_end]
else:
A = A[:total_bins, :total_bins]

# Convert seqids to positions for each group
new_groups = []
for seqids, color in groups:
seqids = seqids.split(",")
assert all(
x in header["starts"] for x in seqids
), f"{seqids} contain ids not found in starts"
assert all(
x in header["sizes"] for x in seqids
), f"{seqids} contain ids not found in sizes"
start = min(header["starts"][x] for x in seqids)
end = max(header["starts"][x] + header["sizes"][x] for x in seqids)
position_seqids = []
for seqid in seqids:
seqid_start = header["starts"][seqid]
seqid_size = header["sizes"][seqid]
position_seqids.append((seqid_start + seqid_size / 2, seqid))
new_groups.append((start, end, position_seqids, color))

# Several concerns in practice:
# The diagonal counts may be too strong, this can either be resolved by
# masking them. Or perform a log transform on the entire heatmap.
B = A.astype("float64")
B += 1.0
B = np.log(B)
vmin, vmax = opts.vmin, opts.vmax
B[B < vmin] = vmin
B[B > vmax] = vmax
print(B)
logger.debug("Matrix log-transformation and thresholding (%d-%d) done", vmin, vmax)

breaks = list(header["starts"].values())
breaks += [total_bins] # This is actually discarded
breaks = sorted(breaks)[1:]
if contig or opts.nobreaks:
breaks = []

return B, new_groups, breaks


def heatmap(args):
"""
%prog heatmap input.npy genome.json
@@ -727,64 +788,18 @@ def heatmap(args):
groups = list(generate_groups(opts.groups)) if opts.groups else []

# Load contig/chromosome starts and sizes
header = json.loads(open(jsonfile).read())
header = json.loads(open(jsonfile, encoding="utf-8").read())
resolution = header.get("resolution")
assert resolution is not None, "`resolution` not found in `{}`".format(jsonfile)
logger.debug("Resolution set to %d", resolution)
# Load the matrix
A = np.load(npyfile)
total_bins = header["total_bins"]

# Select specific submatrix
if contig:
contig_start = header["starts"][contig]
contig_size = header["sizes"][contig]
contig_end = contig_start + contig_size
A = A[contig_start:contig_end, contig_start:contig_end]
else:
A = A[:total_bins, :total_bins]

# Convert seqids to positions for each group
new_groups = []
for seqids, color in groups:
seqids = seqids.split(",")
assert all(
x in header["starts"] for x in seqids
), f"{seqids} contain ids not found in starts"
assert all(
x in header["sizes"] for x in seqids
), f"{seqids} contain ids not found in sizes"
start = min(header["starts"][x] for x in seqids)
end = max(header["starts"][x] + header["sizes"][x] for x in seqids)
position_seqids = []
for seqid in seqids:
seqid_start = header["starts"][seqid]
seqid_size = header["sizes"][seqid]
position_seqids.append((seqid_start + seqid_size / 2, seqid))
new_groups.append((start, end, position_seqids, color))

# Several concerns in practice:
# The diagonal counts may be too strong, this can either be resolved by
# masking them. Or perform a log transform on the entire heatmap.
B = A.astype("float64")
B += 1.0
B = np.log(B)
vmin, vmax = opts.vmin, opts.vmax
B[B < vmin] = vmin
B[B > vmax] = vmax
print(B)
logger.debug("Matrix log-transformation and thresholding (%d-%d) done", vmin, vmax)
B, new_groups, breaks = read_matrix(npyfile, header, contig, groups, opts)

# Canvas
fig = plt.figure(1, (iopts.w, iopts.h))
root = fig.add_axes((0, 0, 1, 1)) # whole canvas
ax = fig.add_axes((0.05, 0.05, 0.9, 0.9)) # just the heatmap

breaks = list(header["starts"].values())
breaks += [total_bins] # This is actually discarded
breaks = sorted(breaks)[1:]
if contig or opts.nobreaks:
breaks = []
plot_heatmap(ax, B, breaks, groups=new_groups, binsize=resolution)

# Title