grptools

#!/usr/bin/env python

# Copyright (C) 2016  Shengwei Hou
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.


import os
import sys 
import argparse
import numpy as np
from Bio import SeqIO
from collections import Counter
from sys import stdout, stderr


def main_usage(parser):
    """ display usage for main parser
    """
    stderr.write(parser.format_help())

def subparser_usage(argv, parser):
    """ display usage for subparser
    """
    cmd = argv[1]
    found = 0
    for action in parser._actions:
        if isinstance(action, argparse._SubParsersAction):
            for choice, subparser in action.choices.items():
                if cmd == choice:
                    stderr.write(subparser.format_help())
                    found = 1
    if not found:
        stderr.write("\n\nERROR:%s is not a valid command!!!\n\n"%cmd)
        main_usage(parser)

def display_help(argv, parser):
    """ display help information
    """
    if len(argv) == 1:
        main_usage(parser)
        sys.exit(1)
    elif len(argv) == 2:
        subparser_usage(argv, parser)
        sys.exit(1)
    else:
        pass


def _get_genome_size(input_file):
    """ parse fasta records in input_file,
        return the fastaName:genomeSize as a dictionary
    """
    genomeSizeDict = {}
    for fasta in SeqIO.parse(input_file, "fasta"):
        name = fasta.name.strip()
        length = len(fasta.seq)
        genomeSizeDict[name] = length
    return genomeSizeDict


def _get_chromArrayDict(genomeSizeDict, wig):
    """
    :param genomeSizeDict: a dict of chromosome name and length, got from _get_genome_size(input_genome)
    :param wig: fwd or rev wiggle file
    :return: a dict of chromosome:Array
    """
    chromArrayDict = {} # {chrom:Array}
    for chrom in genomeSizeDict.keys():
        chromArrayDict[chrom] = np.zeros(genomeSizeDict[chrom])

    with open(wig, "r") as ih:
        for line in ih:
            if line.startswith("variableStep"):
                chrom = line.strip().split("chrom=")[-1].strip()
                if not chrom in genomeSizeDict:
                    stderr.write("ERROR: chromosome %s in your wig file was not found " \
                                         "in your genome reference! "%chrom)
                    sys.exit(1)
            else:
                line = line.strip().split("\t")
                index = int(line[0])-1
                value = float(line[1])
                if index >= len(chromArrayDict[chrom]):
                    stderr.write("ATTENTION: Mapped position at %d was out of genome length %d, " \
                                         "on chromosome %s"%(index, genomeSizeDict[chrom], chrom))
                    continue
                chromArrayDict[chrom][index] = value

    return chromArrayDict


def wig2grp(args):
    """ convert wig to grp file
    """
    # test args
    if not (args.fwd_wig or args.rev_wig):
        stderr.write("\nERROR: At least one wiggle file need to be given!\n")
        sys.exit(1)

    # wiggle files, prefix and other arguments
    fwd=args.fwd_wig
    rev=args.rev_wig
    prefix=args.prefix
    toInt=args.toInt

    # get genomeSize
    genomeSizeDict = _get_genome_size(args.genomeFasta)

    # only fwd or rev wig
    if (fwd and (not rev)) or (rev and (not fwd)):

        if fwd:
            header = "# BASE fwd_grp\n# colour 0:255:0\n"
            wig = fwd
        else:
            header = "# BASE rev_grp\n# colour 0:0:255\n"
            wig = rev

        chromArrayDict = _get_chromArrayDict(genomeSizeDict, wig)
        for chrom, value_array in chromArrayDict.iteritems():
            pos_array = np.arange(genomeSizeDict[chrom]) + 1
            if prefix:
                outfile = prefix +"_"+ chrom + ".grp"
            else:
                outfile = os.path.basename(wig) +"_"+ chrom +".grp"
            with open(os.path.join(os.getcwd(), outfile), "w") as oh:
                oh.write(header)
                if toInt:
                    for pos, value in zip(pos_array, value_array):
                        oh.write("%d %d\n"%(pos, int(value)))
                else:
                    for pos, value in zip(pos_array, value_array):
                        oh.write("%d %.2f\n"%(pos, value))

    # both fwd and rev wig, integrate into one grp
    else:
        header = "# BASE fwd_grp rev_grp\n# colour 0:255:0 0:0:255\n"
        fwd_chromArrayDict = _get_chromArrayDict(genomeSizeDict, fwd)
        rev_chromArrayDict = _get_chromArrayDict(genomeSizeDict, rev)
        for chrom, fwd_value_array in fwd_chromArrayDict.iteritems():
            pos_array = np.arange(genomeSizeDict[chrom]) + 1
            rev_value_array = rev_chromArrayDict[chrom]
            if prefix:
                outfile = prefix +"_"+ chrom + ".grp"
            else:
                if "Forward" in fwd:
                    outfile = os.path.basename(fwd).replace("Forward", "FwdRev") +"_"+ chrom +".grp"
                else:
                    outfile = os.path.basename(fwd).rsplit(".", 1)[0] +"_FwdRev_"+ chrom +".grp"
            with open(os.path.join(os.getcwd(), outfile), "w") as oh:
                oh.write(header)
                if toInt:
                    for pos, fwd_value, rev_value in zip(pos_array, fwd_value_array, rev_value_array):
                        oh.write("%d %d %d\n"%(pos, int(fwd_value), int(rev_value)))
                else:
                    for pos, fwd_value, rev_value in zip(pos_array, fwd_value_array, rev_value_array):
                        oh.write("%d %.2f %.2f\n"%(pos, fwd_value, rev_value))


def _get_headers(path2grp):
    headers = []
    with open(path2grp, "r") as ih:
        for line in ih:
            if line.startswith("#"):
                headers.append(line.strip())
            else:
                break
    return headers


def _get_ncol(headers):
    ncol = None
    for line in headers:
        line = line.lstrip("# ")
        if not ncol:
            ncol = len(line.strip().split())-1
        else:
            assert ncol == len(line.strip().split())-1, "lines in headers have different number of columns!"
    return ncol


def _read_grp(path2grp):
    """
    :param path2grp: input grp file
    :return:         the headers and a list of grp numpy arrays.

    This function used to read in grp file, the common 2 column input grp file format like this:
        # BASE fwd_grp rev_grp
        # colour 0:204:0 0:0:255
        1 0.00 -0.00
        2 0.00 -0.00
        3 0.00 -0.00

    or the 4 column grp file format like this:
        # BASE fwd_cov fwd_tss rev_cov rev_tss
        # colour 0:204:0 255:0:0 0:0:255 255:0:0
        1 0.00 0.00 -0.00 -0.00
        2 0.00 0.00 -0.00 -0.00
        3 0.00 0.00 -0.00 -0.00
    """
    headers = _get_headers(path2grp)
    ncol = _get_ncol(headers)

    res = []
    for i in range(ncol):
        res.append([])

    with open(path2grp, "r") as ih:
        for line in ih:
            if line.startswith("#"):
                continue
            line = line.strip().split()
            for i, val in enumerate(line[1:]):
                res[i].append(abs(float(val)))

    # convert to np.array
    res = [np.array(arr) for arr in res]

    return headers, res


def _write_grp(filename, headers, arrays, toInt):
    """
    :param filename: output filename
    :param headers:  headers as a list returned by _get_headers()
    :param arrays:   arrays as a list generated by _read_grp()
    :param toInt:    convert to Integer or not
    :return:
    """

    ncol = _get_ncol(headers)

    # make rev to minus
    if ncol == 2:
        arrays[1] *= -1
    else:
        arrays[2] *= -1
        arrays[3] *= -1

    # initialize base array as long as genome length
    pos_array = np.arange(len(arrays[0]))
    pos_array += 1

    # write out grp
    with open(filename, "w") as oh:
        for line in headers:
            oh.write(line+"\n")

        if ncol == 2:
            if toInt:
                for pos, fwd, rev in zip(pos_array, arrays[0], arrays[1]):
                    oh.write(str(pos)+" %d %d\n"%(int(fwd), int(rev)))
            else:
                for pos, fwd, rev in zip(pos_array, arrays[0], arrays[1]):
                    oh.write(str(pos)+" %.2f %.2f\n"%(fwd, rev))
        else:
            if toInt:
                for pos, fwd_cov, fwd_tss, rev_cov, rev_tss in zip(pos_array, arrays[0], arrays[1],
                                                                   arrays[2], arrays[3]):
                    oh.write(str(pos) + " %d %d %d %d\n" % (int(fwd_cov), int(fwd_tss), int(rev_cov), int(rev_tss)))
            else:
                for pos, fwd_cov, fwd_tss, rev_cov, rev_tss in zip(pos_array, arrays[0], arrays[1],
                                                                   arrays[2], arrays[3]):
                    oh.write(str(pos) + " %.2f %.2f %.2f %.2f\n" % (fwd_cov, fwd_tss, rev_cov, rev_tss))


def merge(args):
    """
    given a list of grp files, get the mean or sum of grp file
    """
    grpFile_list = args.input_grps
    prefix = args.prefix
    toInt = args.toInt
    mode = args.mode

    if prefix:
        outfile = prefix+".grp"
    else:
        basename = os.path.basename(grpFile_list[0])
        filestem = os.path.splitext(basename)[0]
        outfile = filestem+"_" + mode + ".grp"


    # number of grp files
    n = len(grpFile_list)

    # read in first one, sum the others, then divide by n (if mode == average)
    headers, grp_arrays = _read_grp(grpFile_list[0])
    ncol = _get_ncol(headers)

    for grp_file in grpFile_list[1:]:
        tmp_headers, tmp_grp_arrays = _read_grp(grp_file)
        assert len(tmp_grp_arrays) == len(grp_arrays), "The input grp files have different number of columns!"
        for i, tmp_arr in enumerate(tmp_grp_arrays):
            grp_arrays[i] += tmp_arr

    if mode == "average":
        for i, arr in enumerate(grp_arrays):
            grp_arrays[i] = arr/n

    _write_grp(outfile, headers, grp_arrays, toInt)


def compareTSS(args):
    """
    given a list of grp files, compare their TSS positions, integrate overlapping
    TSS positions in different grp files into non-redundant positions
    """
    grpFile_list = args.input_grps
    prefix = args.prefix
    toInt = args.toInt
    mode = args.mode           # choices=["average", "max", "sumup"],
    upstream = args.upstream
    downstream = args.downstream

    if prefix:
        outfile = prefix+".grp"
    else:
        basename = os.path.basename(grpFile_list[0])
        filestem = os.path.splitext(basename)[0]
        outfile = filestem+"_compareTSS_" + mode + ".grp"

    # number of grp files
    n = len(grpFile_list)

    # functions corresponding to mode
    func = {"average":lambda x, y: (x+y)/2,
            "max":lambda x, y: x if x > y else y,
            "sumup":lambda x, y: x+y }

    # read in first one
    headers, grp_arrays = _read_grp(grpFile_list[0])
    ncol = _get_ncol(headers)
    assert ncol == 4, "The grp file must have coverage columns in order to compare TSS positions"

    for grp_file in grpFile_list[1:]:
        tmp_headers, tmp_grp_arrays = _read_grp(grp_file)
        assert len(tmp_grp_arrays) == len(grp_arrays), "The input grp files have different number of columns!"

        for i, arr in enumerate(tmp_grp_arrays):
            if i % 2 == 0:  # coverage columns, nothing to do
                continue
            for tmp_pos, tmp_tss in enumerate(arr): # working on tss arrays
                if tmp_tss <= 1:       # only tmp_tss more than 1 will be considered
                    continue
                tmp_cov = tmp_grp_arrays[i-1][tmp_pos]
                if int(tmp_cov) == 0:  # besides, the TSS without any coverage will not be considered
                    continue
                # define coverage region of tmp_tss at tmp_pos
                tmp_up = tmp_pos-upstream if tmp_pos-upstream > 0 else 0
                tmp_down = tmp_pos+downstream+1 if tmp_pos+downstream+1 < len(arr) else len(arr)
                tss = max(grp_arrays[i][tmp_up:tmp_down])
                tss_pos = list(grp_arrays[i][tmp_up:tmp_down]).index(tss)+tmp_up
                tss_up = tss_pos-upstream if tss_pos-upstream > 0 else 0
                tss_down = tss_pos+downstream+1 if tss_pos+downstream+1 < len(arr) else len(arr)
                cov_region = grp_arrays[i-1][tss_up:tss_down]
                tmp_cov_region = tmp_grp_arrays[i-1][tmp_up:tmp_down]

                # average TSS, also coverage of the clustering region
                if mode == "average":
                    grp_arrays[i][tss_pos] = sum(tss, tmp_tss)/2
                    grp_arrays[i-1][tss_up:tss_down] = np.arrays([x+y for x, y in zip(cov_region, tmp_cov_region)])/2

                # take max of TSS, also take the coverage of the sample with higher TSS
                elif mode == "max":
                    if tss >= tmp_tss:                   # tss is larger, no change
                        continue
                    else:                                # tss is smaller, use tmp_tss
                        # set tss count and coverage of old tss region to 0
                        grp_arrays[i][tss_pos] = 0
                        #grp_arrays[i][tmp_up:tmp_down] = 0
                        #grp_arrays[i-1][tss_up:tss_down] = 0
                        # transfer count and coverage of tmp_tss region to tss region
                        grp_arrays[i][tmp_pos] = tmp_tss
                        grp_arrays[i-1][tmp_up:tmp_down] = tmp_cov_region
                # take sum of TSS, also take the sum of the samples for coverage
                else:
                    grp_arrays[i][tss_pos] = sum(tss, tmp_tss)
                    grp_arrays[i-1][tss_up:tss_down] = np.arrays([x+y for x, y in zip(cov_region, tmp_cov_region)])

    _write_grp(outfile, headers, grp_arrays, toInt)


def subtract(args):
    """
    given two grp files, subtract the first one by the second one
    """
    prefix = args.prefix
    first_grp = args.first_grp
    second_grp = args.second_grp
    func = args.transform_func
    base_line = args.base_line
    toInt = args.toInt

    if prefix:
        outfile = prefix+".grp"
    else:
        basename = os.path.basename(first_grp)
        filestem = os.path.splitext(basename)[0]
        outfile = filestem+"_subtracted.grp"

    # read in first one, sum the others, then divide by n (if mode == average)
    first_headers, first_grp_arrays = _read_grp(first_grp)
    ncol = _get_ncol(first_headers)
    second_headers, second_grp_arrays = _read_grp(second_grp)
    assert len(second_grp_arrays) == len(first_grp_arrays), "The input grp files have different number of columns!"

    # transform second grp arrays
    func_dict = {'sqrt':np.sqrt, 'log2':np.log2}
    if func is not None:
        for i, arr in enumerate(second_grp_arrays):
            second_grp_arrays[i] = func_dict[func](arr)

    for i, second_grp_arr in enumerate(second_grp_arrays):
            first_grp_arrays[i] -= second_grp_arr
            first_grp_arrays[i][first_grp_arrays[i] < 0] = base_line

    # if the tss > cov at one position, change it to the coverage
    if ncol == 4:
        cov_arrays = [first_grp_arrays[0], first_grp_arrays[2]]
        tss_arrays = [first_grp_arrays[1], first_grp_arrays[3]]
        for i, (cov, tss) in enumerate(zip(cov_arrays, tss_arrays)):
            for j, (_cov, _tss) in enumerate(zip(cov, tss)):
                if _tss > _cov:
                    first_grp_arrays[i+1][j] = _cov

    _write_grp(outfile, first_headers, first_grp_arrays, toInt)


def intersect(args):
    """
    given two grp files, only keep tss positions shared by them, set tss positions occured in one grp to 0
    """
    prefix = args.prefix
    first_grp = args.first_grp
    second_grp = args.second_grp
    toInt = args.toInt


    if prefix:
        outfile1 = prefix + "_first.grp"
        outfile2 = prefix + "_second.grp"
    else:
        basename1 = os.path.basename(first_grp)
        filestem1 = os.path.splitext(basename1)[0]
        outfile1 = filestem1 + "_intersected.grp"
        basename2 = os.path.basename(second_grp)
        filestem2 = os.path.splitext(basename2)[0]
        outfile2 = filestem2 + "_intersected.grp"

    first_headers, first_grp_arrays = _read_grp(first_grp)
    ncol = _get_ncol(first_headers)
    second_headers, second_grp_arrays = _read_grp(second_grp)
    assert _get_ncol(second_headers) == ncol, "The input grp files have different number of columns!"

    # get tss arrays
    if ncol == 2:
        first_tss_arrays = first_grp_arrays
        second_tss_arrays = second_grp_arrays
    else:
        first_tss_arrays = [first_grp_arrays[1], first_grp_arrays[3]]
        second_tss_arrays = [second_grp_arrays[1], second_grp_arrays[3]]

    # intersect two arrays
    for i, arr in enumerate(first_tss_arrays):
        first_arr = arr
        second_arr = second_tss_arrays[i]

        for j, (v1, v2) in enumerate(zip(first_arr, second_arr)):
            if v1 > 0 and v2 == 0:
                first_tss_arrays[i][j] = 0
            elif v1 == 0 and v2 > 0:
                second_tss_arrays[i][j] = 0

    # setup return arrays
    if ncol == 2:
        res_first_arrays = first_tss_arrays
        res_second_arrays = second_tss_arrays
    else:
        res_first_arrays = [first_grp_arrays[0], first_tss_arrays[0], first_grp_arrays[2], first_tss_arrays[1]]
        res_second_arrays = [second_grp_arrays[0], second_tss_arrays[0], second_grp_arrays[2], second_tss_arrays[1]]

    _write_grp(outfile1, first_headers, res_first_arrays, toInt)
    _write_grp(outfile2, second_headers, res_second_arrays, toInt)


def summarize(args):
    """
    summarize basic contents of given grp file
    """
    input_grp = args.input_grp
    headers, arrays = _read_grp(input_grp)
    ncol = _get_ncol(headers)
    length = len(arrays[0])
    colnames = headers[0].lstrip("# BASE").split()

    print("Input grp file contains %d columns (excluding pos_array), " \
          "each column has %d values"%(ncol, length))
    print("The colnames includes:")
    print("\t", "\t".join(colnames))

    # summarize each arr
    for i, name in enumerate(colnames):
        print("For column %s:"%name)
        arr = arrays[i]
        print("\tSum:\t%.2f"%sum(arr))
        print("\tMin:\t%.2f"%min(arr))
        print("\tMax:\t%.2f"%max(arr))
        print("\tMedian:\t%.2f"%np.median(arr))
        print("\tMean:\t%.2f"%np.mean(arr))


def slice(args):
    """
    slice 4 column grp files into 2 column coverage or tss grp file.
    """
    input_grp = args.input_grp
    type = args.type
    prefix = args.prefix
    toInt = args.toInt

    if prefix:
        outfile = prefix+".grp"
    else:
        basename = os.path.basename(input_grp)
        filestem = os.path.splitext(basename)[0]
        outfile = filestem+"_"+ type + ".grp"

    headers, arrays = _read_grp(input_grp)
    new_headers = []

    if type == "coverage":
        for line in headers:
            line = line.strip().split()
            new_headers.append(" ".join([line[0], line[1], line[2], line[4]]))
        new_arrays = [arrays[0], arrays[2]]
    else:
        for line in headers:
            line = line.strip().split()
            new_headers.append(" ".join([line[0], line[1], line[3], line[5]]))
        new_arrays = [arrays[1], arrays[3]]

    _write_grp(outfile, new_headers, new_arrays, toInt)


def normalize(args):
    """
    normalize input grp file, this function will first sum up all the fwd and rev values in tss columns, then
    divide each value against the summed tss. If a multiplier given, then each value will be multiplied by this
    multiplier,

    The grp file should have 4 columns as below:

    # BASE fwd_cov fwd_tss rev_cov rev_tss
    # colour 0:204:0 255:0:0 0:0:255 255:0:0
    1 0.00 0.00 -0.00 -0.00
    2 0.00 0.00 -0.00 -0.00

    or 2 columns like below:

    # BASE fwd_tss rev_tss
    # colour 0:204:0 0:0:255

    """
    input_grp = args.input_grp
    multiplier = args.multiplier
    prefix = args.prefix
    toInt = args.toInt

    if prefix:
        outfile = prefix + ".grp"
    else:
        basename = os.path.basename(input_grp)
        filestem = os.path.splitext(basename)[0]
        outfile = filestem + "_normalized.grp"

    # do normalization
    headers, arrays = _read_grp(input_grp)
    ncol = _get_ncol(headers)

    if ncol == 4:
        total_tss = sum(arrays[1]) + sum(arrays[3])
    else:
        total_tss = sum(arrays[0]) + sum(arrays[1])

    new_arrays = []
    for arr in arrays:
        arr = multiplier*arr/total_tss
        new_arrays.append(arr)

    _write_grp(outfile, headers, new_arrays, toInt)


def combine(args):
    """
    combine 2 column coverage grp file and 2 column tss grp file into one
    """
    input_cov_grp = args.input_cov_grp
    input_tss_grp = args.input_tss_grp
    prefix = args.prefix
    toInt = args.toInt

    if prefix:
        outfile = prefix + ".grp"
    else:
        basename = os.path.basename(input_cov_grp)
        filestem = os.path.splitext(basename)[0]
        outfile = filestem + "_combined.grp"

    # do combine
    cov_headers, cov_arrays = _read_grp(input_cov_grp)
    tss_headers, tss_arrays = _read_grp(input_tss_grp)

    assert _get_ncol(cov_headers) == _get_ncol(tss_headers) == 2, \
        "combine only supports 2 column grp file, please slice your grp file first!"

    # combine headers
    cov_colnames = cov_headers[0].lstrip("# BASE").split()
    cov_colours = cov_headers[1].lstrip("# colour").split()
    tss_colnames = tss_headers[0].lstrip("# BASE").split()
    tss_colours = tss_headers[1].lstrip("# colour").split()
    combined_colnames = [cov_colnames[0], tss_colnames[0], cov_colnames[1], tss_colnames[1]]
    combined_colours = [cov_colours[0], tss_colours[0], cov_colours[1], tss_colours[1]]
    combined_headers = ["# BASE " + " ".join(combined_colnames), "# colour " + " ".join(combined_colours)]

    # combine arrays
    combined_arrays = [cov_arrays[0], tss_arrays[0], cov_arrays[1], tss_arrays[1]]

    _write_grp(outfile, combined_headers, combined_arrays, toInt)


def _group_tss(tss_arr, before=5, after=5, template_tss_arr=None, set_to_zero=False):
    """
    :param tss_arr: input tss array
    :param before:  upstream region
    :param after:   downstream region
    :param template_tss_arr: used as a template to group tss around positions in the template
    :param set_to_zero: set the other positions to zero, when not in the template
    :return:
    """
    total_length = len(tss_arr)

    # count how many tss we have before grouping
    oldNum = Counter(tss_arr > 0)[True]

    # index array for sorted tss_arr from small to large value
    if template_tss_arr is not None:
        sort_index = np.argsort(template_tss_arr)
    else:
        sort_index = np.argsort(tss_arr)
    # inverse to make high TSS number's index in beginning
    sort_index = sort_index[::-1]


    for index in sort_index:
        #print "current array index is : %d, current guide value is: %d"%(index, template_tss_arr[index])
        # maybe this pos is already merged to a neighboring highly expressed tss
        if tss_arr[index] == 0:
            continue

        else:
            # get leftmost and rightmost pos of this region, rightmost is not inclusive
            fore_index = index - before if index - before > 0 else 0
            back_index = index + after + 1 if index + after + 1 < total_length else total_length

            # get sum of this region, assign it to tss_arr[index]
            tss_arr[index] = tss_arr[index] + \
                             np.sum(tss_arr[fore_index:index]) + \
                             np.sum(tss_arr[index + 1:back_index])

            # set before and after region of tss_arr[index] to 0
            tss_arr[fore_index:index] = 0
            tss_arr[index + 1:back_index] = 0

    if set_to_zero:
        tss_arr[np.where(template_tss_arr == 0)] = 0

    # count how many tss we have after grouping
    newNum = Counter(tss_arr > 0)[True]
    print("Grouped %d raw tss into %d raw tss in region %d upstream and %d downstream" % (oldNum, newNum, before, after))

    return tss_arr


def aggregate(args):
    """
    aggregate tss from surrounding positons, to increase the signal of most likely tss, and remove blurry signals
    """
    input_grp = args.input_grp
    template_grp = args.guide_grp
    set_to_zero = args.set_to_zero
    region = args.region
    prefix = args.prefix
    toInt = args.toInt

    if prefix:
        outfile = prefix + ".grp"
    else:
        basename = os.path.basename(input_grp)
        filestem = os.path.splitext(basename)[0]
        outfile = filestem + "_aggregated.grp"

    # prepare tss_arrays
    headers, arrays = _read_grp(input_grp)
    ncol = _get_ncol(headers)
    if ncol == 4:
        tss_arrays = [arrays[1], arrays[3]]
    else:
        tss_arrays = arrays

    # prepare guide_arrays
    if template_grp:
        guide_headers, guide_arrays = _read_grp(template_grp)
        guide_ncol = _get_ncol(guide_headers)
        if guide_ncol == 4:
            guide_tss = [guide_arrays[1], guide_arrays[3]]
        else:
            guide_tss = guide_arrays

    # group tss
    new_arrays = []
    if template_grp:
        for arr, guide_arr in zip(tss_arrays, guide_tss):
            new_arr = _group_tss(arr, before=region, after=region, template_tss_arr=guide_arr, set_to_zero=set_to_zero)
            new_arrays.append(new_arr)
    else:
        for arr in tss_arrays:
            new_arr = _group_tss(arr, before=region, after=region)
            new_arrays.append(new_arr)

    # prepare result arrays
    if ncol == 4:
        res_arrays = [arrays[0], new_arrays[0], arrays[2], new_arrays[1]]
    else:
        res_arrays = new_arrays

    _write_grp(outfile, headers, res_arrays, toInt)


def main():

    # main parser
    parser = argparse.ArgumentParser(description="A set of subcommands for GRP manipulations")
    parser.add_argument("-v", "--version", action="version", version="%(prog)s 1.0")


    # parent parser, to specify shared arguments, inherited by subparsers
    parent_parser = argparse.ArgumentParser(add_help=False)
    parent_parser.add_argument("-p", "--prefix", required=False, help="output prefix for grp file")
    parent_parser.add_argument("-t", "--toInt", required=False, action="store_true",
                               help="write out integer values")


    # subparsers
    subparsers = parser.add_subparsers(help='available subcommands')

    # ------- #
    # wig2grp #
    # ------- #
    wig2grp_desc="Convert variableStep forward and reverse wig files into fixedStep grp file, to visualize in Artemis"
    parser_wig2grp = subparsers.add_parser('wig2grp', parents=[parent_parser],
                                           help='convert wiggle format to grp format', description=wig2grp_desc)
    parser_wig2grp.add_argument("genomeFasta", help="genome fasta file that used in the mapping")
    parser_wig2grp.add_argument('-f', "--fwd_wig", help="forward wig file generated by RSeQC::bam2wig.py")
    parser_wig2grp.add_argument('-r', "--rev_wig", help="reverse wig file generated by RSeQC::bam2wig.py")
    parser_wig2grp.set_defaults(func=wig2grp)

    # ----- #
    # merge #
    # ----- #
    parser_merge = subparsers.add_parser('merge', parents=[parent_parser],
                                         help='average/sumup individual grp files into one grp file')
    parser_merge.add_argument("input_grps", nargs="+", help="grp files to be averaged/summed up")
    mode_help = "average: sum the values, then take avarage of the sum. \nsumup: sum the values in each grp column"
    parser_merge.add_argument("-m", "--mode", required=False, choices=["average", "sumup"],
                              default="average", help=mode_help)
    parser_merge.set_defaults(func=merge)


    # -------- #
    # subtract #
    # -------- #
    parser_subtract = subparsers.add_parser('subtract', parents=[parent_parser], help='subtract two grp files')
    parser_subtract.add_argument("first_grp", help="first grp file to be subtracted")
    parser_subtract.add_argument('second_grp', help='second grp file used to subtract')
    parser_subtract.add_argument('-f', '--transform_func', choices=[None, 'sqrt', 'log2'], default=None,
                                 help='transform the second grp using this fuction, ')
    baseline_help = 'non-negative value to be used replace negetive values produced by subtraction, default = 0.'
    parser_subtract.add_argument('-b', '--base_line', type=int, default=0, help=baseline_help)
    parser_subtract.set_defaults(func=subtract)


    # --------- #
    # intersect #
    # --------- #
    parser_intersect = subparsers.add_parser('intersect', parents=[parent_parser], help='intersect two grp files, to '
                                                                                        'keep only shared tss positions')
    parser_intersect.add_argument('first_grp', help='first grp file to be intersected')
    parser_intersect.add_argument('second_grp', help='second grp file to be intersected')
    parser_intersect.set_defaults(func=intersect)

    # --------- #
    # summarize #
    # --------- #
    parser_summarize = subparsers.add_parser('summarize', parents=[parent_parser], help='summarize input grp file')
    parser_summarize.add_argument("input_grp", help="input grp file to be summarized")
    parser_summarize.set_defaults(func=summarize)


    # ---------- #
    # compareTSS #
    # ---------- #
    parser_compareTSS = subparsers.add_parser('compareTSS', parents=[parent_parser],
                                         help='compare TSS in each grp files, and write a final grp file')
    parser_compareTSS.add_argument("input_grps", nargs="+", help="grp files to be compared and merged")
    parser_compareTSS.add_argument("-u", "--upstream", type=int, default=5,
                                         help="upstream clustering region, default=5")
    parser_compareTSS.add_argument("-d", "--downstream", type=int, default=5,
                                         help="downstream clustering region, default=5")
    mode_help = "average: sum the counts for TSS positions in each pair of grp, then take average. \n \
                 sumup: sum the counts for TSS positions in each pair of grp. \n \
                 max: take the maximum counts of TSS positions in each pair of grp.\n \
                 default=max."
    parser_compareTSS.add_argument("-m", "--mode", required=False, choices=["average", "max", "sumup"],
                                  default="max", help=mode_help)
    parser_compareTSS.set_defaults(func=compareTSS)


    # ----- #
    # slice #
    # ----- #
    parser_slice = subparsers.add_parser('slice', parents=[parent_parser], help='slice input grp file')
    parser_slice.add_argument("input_grp", help="input grp file to be sliced")
    parser_slice.add_argument("type", choices=['coverage', 'tss'], default='coverage',
                              help='which type of columns to slice')
    parser_slice.set_defaults(func=slice)


    # --------- #
    # normalize #
    # --------- #
    parser_normalize = subparsers.add_parser('normalize', parents=[parent_parser], help='normalize input grp file')
    parser_normalize.add_argument('input_grp', help='input grp file to be normalized')
    parser_normalize.add_argument('multiplier', type=int,
                                  help='the number to be multiplied after library normalization, normally the '
                                       'read count of largest library.')
    parser_normalize.set_defaults(func=normalize)


    # -------- #
    #  combine #
    # -------- #
    parser_combine = subparsers.add_parser('combine', parents=[parent_parser],
                                           help='combine coverage and tss grp into one grp file')
    parser_combine.add_argument('input_cov_grp', help='input coverage grp file')
    parser_combine.add_argument('input_tss_grp', help='input tss grp file')
    parser_combine.set_defaults(func=combine)


    # --------- #
    # aggregate #
    # --------- #
    parser_aggregate = subparsers.add_parser('aggregate', parents=[parent_parser],
                                             help='aggregate tss from surrounding positions')
    parser_aggregate.add_argument('input_grp', help='input grp file')
    parser_aggregate.add_argument('-r', '--region', type=int, default=5, help='the length of upstream or '
                                                                              'downstream region')
    parser_aggregate.add_argument('-g', '--guide_grp', help='template grp file to guide tss aggregation')
    parser_aggregate.add_argument('-s', '--set_to_zero', action='store_true',
                                  help='set tss to zero where positions were zero in template')
    parser_aggregate.set_defaults(func=aggregate)


    # ----------------------- #
    # parse arguments and run #
    # ----------------------- #

    # display help
    display_help(sys.argv, parser)

    # parse args
    args = parser.parse_args()

    # run commands
    args.func(args)

if __name__ == "__main__":
    main()