Add files via upload

2022-12-24
gaoyubang · Dec 24, 2022 · bef62ce · bef62ce
1 parent 4a12e1e
commit bef62ce
Show file tree

Hide file tree

Showing 2 changed files with 29 additions and 12 deletions.
diff --git a/README.md b/README.md
@@ -16,18 +16,18 @@ To use the binary package, simply download the pre-compiled Linux binary from fo
  https://drive.google.com/drive/folders/1Dodt6uJC7lBihSNgT3Mexzpl_uqBagu0?usp=sharing
 
 
-Users can untar nanom6A_2021_10_22.tar.gz, and make sure the binaries in your PATH environment variable.   
+Users can untar nanom6A_2022_12_23.tar.gz and make sure the binaries are in the PATH environment variable.   
 Testing the pre-compiled binary installation:
 
 
 
 ```
-tar -xvzf nanom6A_2021_10_22.tar.gz
-cd nanom6A_2021_10_22
+tar -xvzf nanom6A_2022_12_23.tar.gz
+cd nanom6A_2022_12_23
 sh run_binary.sh
 ```
 
-User may still need sam2tsv in your $PATH (after 2021_10_22 version), you can install it through conda.
+Users may still need sam2tsv in their $PATH (after the 2022_12_23 version); it can be installed through conda.
 
 
 
@@ -85,8 +85,8 @@ apt install -y libxext-dev
 ```
 
 ```
-tar -xvzf nanom6A_2021_10_22.tar.gz
-cd nanom6A_2021_10_22
+tar -xvzf nanom6A_2022_12_23.tar.gz
+cd nanom6A_2022_12_23
 conda env create -f conda.yml #install conda environment
 ```
 
@@ -116,8 +116,8 @@ scikit-learn              |0.22
 Testing the installation (please make sure the dependencies are installed).
 
 ```
-tar -xvzf nanom6A_2021_10_22.tar.gz
-cd nanom6A_2021_10_22
+tar -xvzf nanom6A_2022_12_23.tar.gz
+cd nanom6A_2022_12_23
 sh run_source_code.sh
 ```
 
@@ -135,8 +135,8 @@ Testing the Docker:
 
 
 ```
-tar -xvzf nanom6A_2021_10_22.tar.gz
-cd nanom6A_2021_10_22
+tar -xvzf nanom6A_2022_12_23.tar.gz
+cd nanom6A_2022_12_23
 sudo docker run -it -v `pwd`:/data gaoyubang/nanom6a:v1 /bin/bash
 cd /data/
 sh run_docker.sh
@@ -220,6 +220,8 @@ ggggccacgctgcgggcccgggccatggccgccgccgatgccgagAGACACCTATGGCTGCCGATGAAGGCTCAGCAGAG
 ```
 EHMT1 NM_001354612.2	NM_001354611.2	NM_001145527.2	NM_001354259.2	NM_001354263.2	NM_024757.5
 ```
+(4) The --support parameter: the minimum number of DRS reads supporting a modified m6A site in genomic coordinates per one million DRS reads. The default is 10. Because of the low sequencing depth of DRS reads, quantifying m6A modification in low-abundance genes is difficult. With this option, the pipeline normalizes for library size using the formula: total number of DRS reads / 1,000,000, which gives the 'per million scaling factor'. The 'per million scaling factor' is then multiplied by the value of the --support option to give the cutoff for the number of modified transcripts at each modified m6A site. For example, --support 10 with 2,000,000 total DRS reads gives (2000000/1000000)*10 = 20 as the cutoff. That means that a modified A base supported by at least 20 modified transcripts will be identified as a modified m6A site in genomic coordinates.
+
 
 The main output is the ratio.x.tsv in the output dir.
 The header of ratio.x.tsv.
@@ -303,6 +305,11 @@ Fixed bugs due to samtools depth default 8000 maximum coverage!
 
 Fixed bugs due to overlap genes!
 
+**2022.12.24 19:27 NanYang**
+
+Add --support parameter to normalize sequencing depth.
+
+
 All suggestions are welcome to [email protected] or [email protected]
 
 
diff --git a/predict_sites.py b/predict_sites.py
@@ -385,7 +385,7 @@ def readprediction2(fl):
 def establish_ratio(i,read,readfeature):
 	genename,chro=i.split("|")
 	poss_summary=defaultdict(dict)
-	numlimit=int(FLAGS.support)
+	numlimit=int(FLAGS.factor)
 	poss=[]
 	######
 	# ~ Chr10	21971841	Potri.010G244500.1	GXB01149_20180715_FAH87828_GA10000_sequencing_run_20180715_NPL0183_I1_33361_read_252_ch_88_strand.fast5	162	CTACA
@@ -549,6 +549,15 @@ def dependence_check():
 			sys.exit("please check %s in your $PATH !\n"%(com))
 	#############
 	sys.stderr.write("check finsh!\n")
+	############################################
+	fl1=FLAGS.input+".feature.fa"
+	n=0
+	for i in open(fl1,"r"):
+		if i.startswith(">"):
+			n+=1
+	FLAGS.factor=int(n*int(FLAGS.support)/1000000)
+	# ~ print(FLAGS.factor)
+	############################################
 if __name__ == "__main__":
 	parser = argparse.ArgumentParser(description='Predict to genome sites.')
 	parser.add_argument('-i', '--input', required = True,help="features_extract")
@@ -557,7 +566,7 @@ def dependence_check():
 	parser.add_argument('-r', '--referance', required = True, help="referance transcripts sequence file")
 	parser.add_argument('-b', '--isoform', required = True, help="gene to referance transcripts information")
 	parser.add_argument('--cpu', default=8,help='cpu number usage,default=8')
-	parser.add_argument('--support', default=20,help='one m6A site supported read number,default=20')
+	parser.add_argument('--support', default=10,help='The minimum number of DRS reads supporting a modified m6A site in genomic coordinates from one million DRS reads.  The default is 10.  Due to the low sequencing depth for DRS reads, quantification of m6A modification in low abundance gene is difficult.  With this option, the pipeline will attempt to normalize library using this formula: Total number of DRS reads/1,000, 000 to generate \'per million scaling factor\'.   Then the  \'per million scaling factor\'  multiply reads from -r option to generate the cuttoff for the number of modified transcripts  for each modified m6A site.   For example, the option (-r = 10, total DRS reads=2, 000, 000) will generate (2000000/1000000)*10=20 as cuttoff. Than means that modified A base supported by at least 20 modified transcripts will be identified as modified m6A sites in genomic coordinates.')
 	parser.add_argument('--proba', default=0.5,help='m6A site predict probability,default=0.5')
 	parser.add_argument('--model',required = True, help='model dir')
 	args = parser.parse_args(sys.argv[1:])
@@ -570,3 +579,4 @@ def dependence_check():
 	dependence_check()
 	run_main()
 ###########################################################
+######