Skip to content

Commit

Permalink
cleanup allmaps.py
Browse files Browse the repository at this point in the history
  • Loading branch information
tanghaibao committed Apr 30, 2024
1 parent 64ab819 commit 512b26d
Showing 1 changed file with 57 additions and 68 deletions.
125 changes: 57 additions & 68 deletions jcvi/assembly/allmaps.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,44 +12,45 @@
import numpy as np
import networkx as nx

from cmmodule.utils import read_chain_file
from cmmodule.mapbed import crossmap_bed_file
from collections import Counter, defaultdict
from functools import partial
from itertools import combinations, product
from more_itertools import pairwise
from typing import Optional

from cmmodule.utils import read_chain_file
from cmmodule.mapbed import crossmap_bed_file
from more_itertools import pairwise

from jcvi import __version__ as version
from jcvi.algorithms.formula import reject_outliers, spearmanr
from jcvi.algorithms.lis import (
from ..algorithms.ec import GA_setup, GA_run
from ..algorithms.formula import reject_outliers, spearmanr
from ..algorithms.lis import (
longest_monotonic_subseq_length_loose as lms,
longest_monotonic_subsequence_loose as lmseq,
)
from jcvi.algorithms.tsp import hamiltonian
from jcvi.algorithms.matrix import determine_signs
from jcvi.algorithms.ec import GA_setup, GA_run
from jcvi.formats.agp import AGP, order_to_agp, build as agp_build, reindex
from jcvi.formats.base import DictFile, FileMerger, must_open, read_block
from jcvi.formats.bed import Bed, BedLine, natsorted, sort
from jcvi.formats.chain import fromagp
from jcvi.formats.sizes import Sizes
from jcvi.graphics.landscape import draw_gauge
from jcvi.utils.cbook import human_size, percentage
from jcvi.utils.grouper import Grouper
from jcvi.utils.table import tabulate
from jcvi.apps.base import (
from ..algorithms.matrix import determine_signs
from ..apps.base import (
ActionDispatcher,
OptionGroup,
OptionParser,
SUPPRESS_HELP,
cleanup,
flatten,
get_today,
logger,
mkdir,
need_update,
sh,
)
from ..formats.agp import AGP, order_to_agp, build as agp_build, reindex
from ..formats.base import DictFile, FileMerger, must_open, read_block
from ..formats.bed import Bed, BedLine, natsorted, sort
from ..formats.chain import fromagp
from ..formats.sizes import Sizes
from ..graphics.landscape import draw_gauge
from ..utils.cbook import human_size, percentage
from ..utils.grouper import Grouper
from ..utils.table import tabulate


START, END = "START", "END"
Expand Down Expand Up @@ -193,9 +194,9 @@ def callback(tour, gen, i=0):
return tour

i = 0
best_tour, best_fitness = None, None
_, best_fitness = None, None
while True: # Multiple EC rounds due to orientation fixes
logging.debug("Start EC round {0}".format(i))
logger.debug("Start EC round %d", i)
scaffolds_oo = dict(tour)
scfs, tour, ww = self.prepare_ec(scaffolds, tour, weights)
callbacki = partial(callback, i=i)
Expand All @@ -206,16 +207,14 @@ def callback(tour, gen, i=0):
)
tour = callbacki(tour, "FIN")
if best_fitness and fitness <= best_fitness:
logging.debug(
"No fitness improvement: {0}. Exit EC.".format(best_fitness)
)
logger.debug("No fitness improvement: %s. Exit EC.", best_fitness)
break
tour = self.fix_orientation(tour)
best_tour, best_fitness = tour, fitness
print_tour(
fwtour, self.object, tag, "GA{0}-FIXORI".format(i), tour, recode=True
)
logging.debug("Current best fitness: {0}".format(best_fitness))
logger.debug("Current best fitness: %s", best_fitness)
i += 1

tour = self.fix_tour(tour)
Expand Down Expand Up @@ -285,7 +284,7 @@ def distances_to_tour(self):
continue
G.add_edge(b, a, weight=d)

logging.debug("Graph size: |V|=%d, |E|=%d", len(G), G.size())
logger.debug("Graph size: |V|=%d, |E|=%d", len(G), G.size())

L = dict(nx.all_pairs_dijkstra_path_length(G))
for a, b in combinations(scaffolds, 2):
Expand Down Expand Up @@ -421,7 +420,7 @@ def fix_tour(self, tour):
if score_with > score_without:
keep.add(s)
dropped = len(tour) - len(keep)
logging.debug("Dropped {0} minor scaffolds".format(dropped))
logger.debug("Dropped %d minor scaffolds", dropped)
return [(s, o) for (s, o) in tour if s in keep]

def fix_orientation(self, tour):
Expand Down Expand Up @@ -458,7 +457,7 @@ def fix_orientation(self, tour):
fixed += 1

tour = [(x, orientations[x]) for x in scaffolds]
logging.debug("Fixed orientations for {0} scaffolds.".format(fixed))
logger.debug("Fixed orientations for %d scaffolds.", fixed)
return tour


Expand Down Expand Up @@ -489,7 +488,7 @@ def __init__(self, b):
try:
self.mapname, self.lg = b.accn.split("-", 1)
except ValueError:
logging.error("Malformed marker name: {}".format(b.accn))
logger.error("Malformed marker name: %s", b.accn)
sys.exit(1)
self.cm = float(cm)
self.accn = b.accn
Expand Down Expand Up @@ -535,10 +534,10 @@ def report(self):
self.seqids = sorted(set(x.seqid for x in self))
self.mapnames = sorted(set(x.mapname for x in self))
self.mlgs = sorted(set(x.mlg for x in self))
logging.debug(
"Map contains {0} markers in {1} linkage groups.".format(
self.nmarkers, len(self.mlgs)
)
logger.debug(
"Map contains %d markers in %d linkage groups.",
self.nmarkers,
len(self.mlgs),
)

def extract(self, seqid):
Expand Down Expand Up @@ -582,9 +581,7 @@ def get_bins(self, function, remove_outliers):
s[pair] = cm
original += len(markers)
clean += len(cm)
logging.debug(
"Retained {0} clean markers.".format(percentage(clean, original))
)
logger.debug("Retained %s clean markers.", percentage(clean, original))
return s

def remove_outliers(self, markers, function):
Expand Down Expand Up @@ -648,7 +645,7 @@ def __init__(self, filename, mapnames, cast=int):
self.pivot = pivot
self.ref = ref

logging.debug("Map weights: {0}".format(self.items()))
logger.debug("Map weights: %s", self.items())

def update_maps(self, mapnames, default=1):
keys = list(self.keys())
Expand All @@ -659,15 +656,13 @@ def update_maps(self, mapnames, default=1):
if m in self:
continue
self[m] = default
logging.debug("Weight for `{0}` set to {1}.".format(m, default))
logger.debug("Weight for `%s` set to %d.", m, default)

def get_pivot(self, mapnames):
# Break ties by occurence in file
common_mapnames = set(self.maps) & set(mapnames)
if not common_mapnames:
logging.error(
"No common names found between %s and %s", self.maps, mapnames
)
logger.error("No common names found between %s and %s", self.maps, mapnames)
sys.exit(1)
return max(
(w, -self.maps.index(m), m) for m, w in self.items() if m in common_mapnames
Expand Down Expand Up @@ -719,7 +714,7 @@ def calculate_coords(self, r=0.8, gapsize=0.1):
class GapEstimator(object):
def __init__(self, mapc, agp, seqid, mlg, function=lambda x: x.cm):
mm = mapc.extract_mlg(mlg)
logging.debug("Extracted {0} markers for {1}-{2}".format(len(mm), seqid, mlg))
logger.debug("Extracted %d markers for %s-%s", len(mm), seqid, mlg)
self.mlgsize = max(function(x) for x in mm)

self.agp = [x for x in agp if x.object == seqid]
Expand Down Expand Up @@ -1054,9 +1049,7 @@ def split(args):
start = end = (start + end) / 2
print("\t".join(str(x) for x in (mi.seqid, start - 1, end)))
nbreaks += 1
logging.debug(
"A total of {} breakpoints inferred (--chunk={})".format(nbreaks, nchunk)
)
logger.debug("A total of %d breakpoints inferred (--chunk=%d)", nbreaks, nchunk)


def movie(args):
Expand Down Expand Up @@ -1112,7 +1105,7 @@ def movie(args):
fwagp = must_open(agpfile, "w")
order_to_agp(seqid, tour, sizes, fwagp, gapsize=gapsize, evidence="map")
fwagp.close()
logging.debug("%s written to `%s`.", header, agpfile)
logger.debug("%s written to `%s`.", header, agpfile)
build([inputbed, scaffoldsfasta, "--cleanup"])
pdf_name = plot([inputbed, seqid, "--title={0}".format(label)])
sh("mv {0} {1}".format(pdf_name, image_name))
Expand Down Expand Up @@ -1262,15 +1255,15 @@ def merge(args):
try:
m = CSVMapLine(row, mapname=mapname)
if m.cm < 0:
logging.error("Ignore marker with negative genetic distance")
logger.error("Ignore marker with negative genetic distance")
print(row.strip(), file=sys.stderr)
else:
b.append(BedLine(m.bedline))
except (IndexError, ValueError): # header or mal-formed line
continue

b.print_to_file(filename=outfile, sorted=True)
logging.debug("A total of %d markers written to `%s`.", len(b), outfile)
logger.debug("A total of %d markers written to `%s`.", len(b), outfile)

assert len(maps) == len(mapnames), "You have a collision in map names"
write_weightsfile(mapnames, weightsfile=opts.weightsfile)
Expand Down Expand Up @@ -1309,24 +1302,22 @@ def mergebed(args):
continue

b.print_to_file(filename=outfile, sorted=True)
logging.debug("A total of %d markers written to `%s`.", len(b), outfile)
logger.debug("A total of %d markers written to `%s`.", len(b), outfile)

assert len(maps) == len(mapnames), "You have a collision in map names"
write_weightsfile(mapnames, weightsfile=opts.weightsfile)


def write_weightsfile(mapnames, weightsfile="weights.txt"):
if op.exists(weightsfile):
logging.debug(
"Weights file `{0}` found. Will not overwrite.".format(weightsfile)
)
logger.debug("Weights file `%s` found. Will not overwrite.", weightsfile)
return

fw = open(weightsfile, "w")
for mapname in sorted(mapnames):
weight = 1
print(mapname, weight, file=fw)
logging.debug("Weights file written to `%s`.", weightsfile)
logger.debug("Weights file written to `%s`.", weightsfile)


def best_no_ambiguous(d, label):
Expand Down Expand Up @@ -1463,10 +1454,8 @@ def path(args):
cpus = opts.cpus
seed = opts.seed
if sys.version_info[:2] < (2, 7):
logging.debug(
"Python version: {0}. CPUs set to 1.".format(
sys.version.splitlines()[0].strip()
)
logger.debug(
"Python version: %s. CPUs set to 1.", sys.version.splitlines()[0].strip()
)
cpus = 1

Expand All @@ -1484,7 +1473,7 @@ def path(args):
ref = weights.ref
linkage = opts.linkage
oseqid = opts.seqid
logging.debug("Linkage function: {0}-linkage".format(linkage))
logger.debug("Linkage function: %s-linkage", linkage)
linkage = {
"single": min,
"double": double_linkage,
Expand All @@ -1500,12 +1489,12 @@ def path(args):
C.join(mlg)

if partitionsfile:
logging.debug("Partition LGs based on `{}`".format(partitionsfile))
logger.debug("Partition LGs based on `%s`", partitionsfile)
fp = open(partitionsfile)
for row in fp:
C.join(*row.strip().split(","))
else:
logging.debug("Partition LGs based on {0}".format(ref))
logger.debug("Partition LGs based on %s", ref)
for mapname in mapnames:
if mapname == ref:
continue
Expand Down Expand Up @@ -1561,9 +1550,9 @@ def path(args):
tag = "|".join(lgs)
lgs_maps = set(x.split("-")[0] for x in lgs)
if pivot not in lgs_maps:
logging.debug("Skipping {0} ...".format(tag))
logger.debug("Skipping %s ...", tag)
continue
logging.debug("Working on {0} ...".format(tag))
logger.debug("Working on %s ...", tag)
s = ScaffoldOO(
lgs,
scaffolds,
Expand Down Expand Up @@ -1593,7 +1582,7 @@ def path(args):
)
for i, (c, size) in enumerate(sorted(chrsizes.items(), key=lambda x: -x[1])):
newc = "chr{0}".format(i + 1)
logging.debug("{0}: {1} => {2}".format(c, size, newc))
logger.debug("%s: %d => %d", c, size, newc)
conversion[c] = newc
for s in solutions:
s.object = conversion[s.object]
Expand All @@ -1609,8 +1598,8 @@ def path(args):
order_to_agp(s.object, s.tour, sizes, fwagp, gapsize=gapsize, evidence="map")
fwagp.close()

logging.debug("AGP file written to `%s`.", agpfile)
logging.debug("Tour file written to `%s`.", tourfile)
logger.debug("AGP file written to `%s`.", agpfile)
logger.debug("Tour file written to `%s`.", tourfile)

build([inputbed, fastafile])

Expand Down Expand Up @@ -1638,7 +1627,7 @@ def write_unplaced_agp(agpfile, scaffolds, unplaced_agp):
if s in scaffolds_seen:
continue
order_to_agp(s, [(s, "?")], sizes, fwagp)
logging.debug("Write unplaced AGP to `%s`", unplaced_agp)
logger.debug("Write unplaced AGP to `%s`", unplaced_agp)


def summary(args):
Expand Down Expand Up @@ -1764,7 +1753,7 @@ def build(args):

liftedbed = mapbed.rsplit(".", 1)[0] + ".lifted.bed"
if need_update((mapbed, chainfile), liftedbed):
logging.debug(
logger.debug(
"Lifting markers from positions in `%s` to new positions in `%s`",
mapbed,
liftedbed,
Expand Down Expand Up @@ -1848,8 +1837,8 @@ def plot(args):
s = Scaffold(seqid, cc)
mlgs = [k for k, v in s.mlg_counts.items() if v >= links]
while not mlgs:
links /= 2
logging.error("No markers to plot, --links reset to {0}".format(links))
links //= 2
logger.error("No markers to plot, --links reset to %d", links)
mlgs = [k for k, v in s.mlg_counts.items() if v >= links]

mlgsizes = {}
Expand Down

0 comments on commit 512b26d

Please sign in to comment.