diff --git a/jcvi/formats/fasta.py b/jcvi/formats/fasta.py index 6e1a2546..b794f045 100644 --- a/jcvi/formats/fasta.py +++ b/jcvi/formats/fasta.py @@ -4,13 +4,14 @@ import re import sys -import os import os.path as op import shutil import logging import string import hashlib +from random import choice + from itertools import groupby, zip_longest from more_itertools import grouper, pairwise @@ -19,12 +20,13 @@ from Bio.SeqRecord import SeqRecord from Bio.SeqUtils.CheckSum import seguid -from jcvi.formats.base import BaseFile, DictFile, must_open -from jcvi.formats.bed import Bed -from jcvi.utils.cbook import percentage -from jcvi.utils.console import printf -from jcvi.utils.table import write_csv -from jcvi.apps.base import OptionParser, ActionDispatcher, cleanup, need_update +from ..apps.base import OptionParser, ActionDispatcher, cleanup, need_update +from ..utils.cbook import percentage +from ..utils.console import printf +from ..utils.table import write_csv + +from .base import BaseFile, DictFile, must_open +from .bed import Bed class Fasta(BaseFile, dict): @@ -166,19 +168,17 @@ def sequence(self, f, asstring=True): return seq -""" -Class derived from https://gist.github.com/933737 -Original code written by David Winter (https://github.com/dwinter) - -Code writted to answer this challenge at Biostar: -http://biostar.stackexchange.com/questions/5902/ +class ORFFinder(object): + """ + Class derived from https://gist.github.com/933737 + Original code written by David Winter (https://github.com/dwinter) -(Code includes improvements from Brad Chapman) -""" + Code writted to answer this challenge at Biostar: + http://biostar.stackexchange.com/questions/5902/ + (Code includes improvements from Brad Chapman) -class ORFFinder(object): - """Find the longest ORF in a given sequence + Find the longest ORF in a given sequence "seq" is a string, if "start" is not provided any codon can be the start of and ORF. If muliple ORFs have the longest length the first one encountered is printed @@ -356,51 +356,48 @@ def rc(s): def main(): actions = ( + ("clean", "remove irregular chars in FASTA seqs"), + ("diff", "check if two fasta records contain same information"), ( "extract", - "given fasta file and seq id, retrieve the sequence " + "in fasta format", + "given fasta file and seq id, retrieve the sequence in fasta format", ), - ("longestorf", "find longest orf for CDS fasta"), - ("translate", "translate CDS to proteins"), - ("info", "run `sequence_info` on fasta files"), - ("summary", "report the real no of bases and N's in fasta files"), - ("uniq", "remove records that are the same"), - ("ids", "generate a list of headers"), + ("fastq", "combine fasta and qual to create fastq file"), ( "format", - "trim accession id to the first space or switch id " - + "based on 2-column mapping file", + "trim accession id to the first space or switch id based on 2-column mapping file", ), - ("pool", "pool a bunch of fastafiles together and add prefix"), - ("random", "randomly take some records"), - ("simulate", "simulate random fasta file for testing"), - ("diff", "check if two fasta records contain same information"), - ("identical", "given 2 fasta files, find all exactly identical records"), - ("trim", "given a cross_match screened fasta, trim the sequence"), - ("trimsplit", "split sequences at lower-cased letters"), - ("sort", "sort the records by IDs, sizes, etc."), ("filter", "filter the records by size"), + ("fromtab", "convert 2-column sequence file to FASTA format"), + ("gaps", "print out a list of gap sizes within sequences"), + ("gc", "plot G+C content distribution"), + ("identical", "given 2 fasta files, find all exactly identical records"), + ("ids", "generate a list of headers"), + ("info", "run `sequence_info` on fasta files"), + ("ispcr", "reformat paired primers into isPcr query format"), + ("join", "concatenate a list of seqs and add gaps in between"), + ("longestorf", "find longest orf for CDS fasta"), ("pair", "sort paired reads to .pairs, rest to .fragments"), ( "pairinplace", - "starting from fragment.fasta, find if " - + "adjacent records can form pairs", + "starting from fragment.fasta, find if adjacent records can form pairs", ), - ("fastq", "combine fasta and qual to create fastq file"), - ("tidy", "normalize gap sizes and remove small components in fasta"), + ("pool", "pool a bunch of fastafiles together and add prefix"), + ("qual", "generate dummy .qual file based on FASTA file"), + ("random", "randomly take some records"), ("sequin", "generate a gapped fasta file for sequin submission"), - ("gaps", "print out a list of gap sizes within sequences"), - ("join", "concatenate a list of seqs and add gaps in between"), + ("simulate", "simulate random fasta file for testing"), ( "some", - "include or exclude a list of records (also performs on " - + ".qual file if available)", + "include or exclude a list of records (also performs on .qual file if available)", ), - ("qual", "generate dummy .qual file based on FASTA file"), - ("clean", "remove irregular chars in FASTA seqs"), - ("ispcr", "reformat paired primers into isPcr query format"), - ("fromtab", "convert 2-column sequence file to FASTA format"), - ("gc", "plot G+C content distribution"), + ("sort", "sort the records by IDs, sizes, etc."), + ("summary", "report the real no of bases and N's in fasta files"), + ("tidy", "normalize gap sizes and remove small components in fasta"), + ("translate", "translate CDS to proteins"), + ("trim", "given a cross_match screened fasta, trim the sequence"), + ("trimsplit", "split sequences at lower-cased letters"), + ("uniq", "remove records that are the same"), ) p = ActionDispatcher(actions) p.dispatch(globals()) @@ -410,8 +407,6 @@ def simulate_one(fw, name, size): """ Simulate a random sequence with name and size """ - from random import choice - seq = Seq("".join(choice("ACGT") for _ in range(size))) s = SeqRecord(seq, id=name, description="Fake sequence") SeqIO.write([s], fw, "fasta") diff --git a/jcvi/formats/gff.py b/jcvi/formats/gff.py index 46c99b52..d4ee94bb 100644 --- a/jcvi/formats/gff.py +++ b/jcvi/formats/gff.py @@ -23,7 +23,7 @@ ) from ..annotation.reformat import atg_name from ..utils.cbook import AutoVivification -from ..utils.range import range_minmax +from ..utils.range import Range, range_minmax from ..utils.orderedcollections import DefaultOrderedDict, OrderedDict, parse_qs from .base import DictFile, LineFile, must_open, is_number @@ -484,8 +484,6 @@ def to_range(obj, score=None, id=None, strand=None): """ Given a gffutils object, convert it to a range object """ - from jcvi.utils.range import Range - if score or id: _score = score if score else obj.score _id = id if id else obj.id @@ -503,39 +501,39 @@ def main(): ("addparent", "merge sister features and infer their parent"), ("bed", "parse gff and produce bed file for particular feature type"), ("bed12", "produce bed12 file for coding features"), - ("fromgtf", "convert gtf to gff3 format"), - ("gtf", "convert gff3 to gtf format"), - ("gb", "convert gff3 to genbank format"), - ("sort", "sort the gff file"), + ("chain", "fill in parent features by chaining children"), + ("children", "find all children that belongs to the same parent"), + ("cluster", "cluster transcripts based on shared splicing structure"), + ("extract", "extract contig or features from gff file"), ("filter", "filter the gff file based on Identity and Coverage"), - ("sizes", "calculate sizes of features in gff file"), - ("format", "format the gff file, change seqid, etc."), ( "fixboundaries", "fix boundaries of parent features by range chaining child features", ), - ("chain", "fill in parent features by chaining children"), - ("rename", "change the IDs within the gff3"), - ("uniq", "remove the redundant gene models"), - ("liftover", "adjust gff coordinates based on tile number"), - ("note", "extract certain attribute field for each feature"), - ("load", "extract the feature (e.g. CDS) sequences and concatenate"), - ("extract", "extract contig or features from gff file"), - ("split", "split the gff into one contig per file"), - ("merge", "merge several gff files into one"), - ("parents", "find the parents given a list of IDs"), - ("children", "find all children that belongs to the same parent"), + ( + "fixpartials", + "fix 5/3 prime partial transcripts, locate nearest in-frame start/stop", + ), + ("format", "format the gff file, change seqid, etc."), ("frombed", "convert from bed format to gff3"), + ("fromgtf", "convert gtf to gff3 format"), ("fromsoap", "convert from soap format to gff3"), ("gapsplit", "split alignment GFF3 at gaps based on CIGAR string"), + ("gb", "convert gff3 to genbank format"), + ("gtf", "convert gff3 to gtf format"), + ("liftover", "adjust gff coordinates based on tile number"), + ("load", "extract the feature (e.g. CDS) sequences and concatenate"), + ("merge", "merge several gff files into one"), + ("note", "extract certain attribute field for each feature"), ("orient", "orient the coding features based on translation"), + ("parents", "find the parents given a list of IDs"), + ("rename", "change the IDs within the gff3"), + ("sizes", "calculate sizes of features in gff file"), + ("sort", "sort the gff file"), ("splicecov", "tag gff introns with coverage info from junctions.bed"), + ("split", "split the gff into one contig per file"), ("summary", "print summary stats for features of different types"), - ("cluster", "cluster transcripts based on shared splicing structure"), - ( - "fixpartials", - "fix 5/3 prime partial transcripts, locate nearest in-frame start/stop", - ), + ("uniq", "remove the redundant gene models"), ) p = ActionDispatcher(actions)