White_sTF_PROseq/correct_proseq_analysis.sh at main · BU-BMSIP/White_sTF_PROseq · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#!/usr/bin/env bash

echo "=== CORRECTED PRO-seq Analysis: Minus Strand Only ==="

python3 - "pSC146:" "8235" "8949" "10" <<'PY'
import sys, pandas as pd, numpy as np, pathlib
import warnings
warnings.filterwarnings('ignore')

contig = sys.argv[1]
tss, tes, bin_sz = map(int, sys.argv[2:5])

print(f"Processing for contig: '{contig}'")
print(f"TSS: {tss}, TES: {tes}, Bin size: {bin_sz}")
print("*** USING CORRECTED PRO-seq METHODOLOGY ***")
print("*** R1 reads on MINUS strand only (3' end positions) ***")

bed_files = list(pathlib.Path('bed').glob('*.bed'))
if not bed_files:
    print("No BED files found!")
    sys.exit(1)

for bedfile in bed_files:
    print(f"\nProcessing {bedfile.name}...")

    # Read BED file
    try:
        df = pd.read_csv(bedfile, sep='\t', header=None,
                         names=['chr', 'start', 'end', 'name', 'score', 'strand'])
    except Exception as e:
        print(f"Error reading {bedfile}: {e}")
        continue

    print(f"  Total reads: {len(df)}")
    print(f"  Plus strand reads: {len(df[df['strand'] == '+'])}")
    print(f"  Minus strand reads: {len(df[df['strand'] == '-'])}")

    # Filter for target contig (exact match)
    df_contig = df[df['chr'] == contig].copy()
    print(f"  Reads on '{contig}': {len(df_contig)}")

    if len(df_contig) == 0:
        print(f"  No reads found for contig '{contig}'")
        # Show available contigs for debugging
        available_contigs = sorted(df['chr'].unique())[:10]
        print(f"  Available contigs (first 10): {available_contigs}")
        continue

    # *** CRITICAL: PRO-seq uses ONLY minus strand reads ***
    # These represent R1 reads where the original nascent RNA was on the plus strand
    df_minus = df_contig[df_contig['strand'] == '-'].copy()
    print(f"  Minus strand reads on '{contig}': {len(df_minus)}")

    if len(df_minus) == 0:
        print(f"  No minus strand reads found for contig '{contig}'")
        continue

    # For minus strand reads: the 3' end position is the START coordinate
    # This represents where Pol II was when it incorporated the biotin-labeled nucleotide
    df_minus['pol2_position'] = df_minus['start']

    # Count reads at each Pol II position
    counts = df_minus['pol2_position'].value_counts().sort_index()

    if len(counts) == 0:
        print(f"  No valid Pol II positions found")
        continue

    print(f"  Position range: {counts.index.min()} - {counts.index.max()}")
    print(f"  Unique positions: {len(counts)}")
    print(f"  Total signal: {counts.sum()}")

    # Create binned counts
    min_pos = counts.index.min()
    max_pos = counts.index.max()
    bin_edges = range(min_pos - (min_pos % bin_sz), max_pos + bin_sz, bin_sz)

    binned_counts = []
    for i in range(len(bin_edges) - 1):
        bin_start = bin_edges[i]
        bin_end = bin_edges[i + 1]
        bin_count = counts.loc[bin_start:bin_end-1].sum()
        if bin_count > 0:
            binned_counts.append((bin_start, bin_count))

    # Save corrected binned data
    out_base = pathlib.Path('traces') / f"{bedfile.stem}_corrected"

    if binned_counts:
        binned_df = pd.DataFrame(binned_counts, columns=['bin_start', 'count'])
        binned_df.to_csv(f'{out_base}.binned.tsv', sep='\t', header=False, index=False)
        print(f"  Saved corrected binned data: {len(binned_counts)} bins")

    # Calculate pausing index and metrics
    try:
        # Pausing index: ratio of reads near TSS vs downstream gene body
        tss_region = counts.loc[max(0, tss-50):tss+50].sum()
        body_region = counts.loc[tss+50:min(tss+250, max_pos)].sum()

        if body_region > 0:
            pausing_index = tss_region / body_region
        else:
            pausing_index = float('inf')

        # Regional analysis
        regions = [
            ('promoter', max(0, tss-50), tss+50),
            ('gene_body', tss+50, tes-150),
            ('pre_TES', tes-150, tes),
            ('post_TES', tes, tes+150),
            ('downstream', tes+150, min(tes+500, max_pos))
        ]

        metrics = [f'pausing_index\t{pausing_index:.4f}']
        metrics.append(f'total_minus_strand_reads\t{len(df_minus)}')
        metrics.append(f'total_all_reads\t{len(df_contig)}')
        metrics.append(f'minus_strand_fraction\t{len(df_minus)/len(df_contig):.4f}')

        for region_name, start, end in regions:
            if start < end and start <= max_pos:
                region_sum = counts.loc[start:min(end, max_pos)].sum()
                region_length = min(end, max_pos) - start + 1
                region_density = region_sum / region_length if region_length > 0 else 0
                metrics.append(f'{region_name}_density\t{region_density:.4f}')
                metrics.append(f'{region_name}_total\t{region_sum}')

        # Save metrics
        with open(f'{out_base}.metrics.txt', 'w') as f:
            f.write('\n'.join(metrics) + '\n')

        print(f"  Pausing index (corrected): {pausing_index:.4f}")
        print(f"  Minus strand fraction: {len(df_minus)/len(df_contig):.3f}")

    except Exception as e:
        print(f"  Error calculating metrics: {e}")

print("\n*** CORRECTED PRO-seq Analysis Complete! ***")
print("Results saved with '_corrected' suffix:")
print("  traces/*_corrected.binned.tsv   - Corrected binned read counts (minus strand only)")
print("  traces/*_corrected.metrics.txt  - Corrected pausing indices and metrics")
PY

echo ""
echo "=== Comparison with Original Analysis ==="
echo "Compare file sizes - corrected should be smaller (minus strand only):"
echo "Original files:"
ls -la traces/*.binned.tsv | grep -v corrected
echo ""
echo "Corrected files:"
ls -la traces/*_corrected.binned.tsv 2>/dev/null || echo "No corrected files yet"