-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
mac
authored and
mac
committed
Dec 8, 2020
0 parents
commit 60b2df5
Showing
20 changed files
with
452 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
^.*\.Rproj$ | ||
^\.Rproj\.user$ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
.Rproj.user | ||
.Rhistory | ||
.RData | ||
.Ruserdata |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
Version: 1.0 | ||
|
||
RestoreWorkspace: Default | ||
SaveWorkspace: Default | ||
AlwaysSaveHistory: Default | ||
|
||
EnableCodeIndexing: Yes | ||
UseSpacesForTab: Yes | ||
NumSpacesForTab: 2 | ||
Encoding: UTF-8 | ||
|
||
RnwWeave: Sweave | ||
LaTeX: pdfLaTeX | ||
|
||
AutoAppendNewline: Yes | ||
StripTrailingWhitespace: Yes | ||
|
||
BuildType: Package | ||
PackageUseDevtools: Yes | ||
PackageInstallArgs: --no-multiarch --with-keep.source |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
Package: Biopratice | ||
Type: Package | ||
Title: Biopratice | ||
Version: 0.1.0 | ||
Author: Zijing Xie | ||
Maintainer: Zijing Xie <[email protected]> | ||
Description: This package provides some functions of converting genbank file to | ||
fasta file, doing alignment between two sequences using dynamic programming | ||
algorithm and download genbank files from NCBI. | ||
License: GPL | ||
Encoding: UTF-8 | ||
LazyData: true | ||
RoxygenNote: 7.1.1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
# Generated by roxygen2: do not edit by hand | ||
|
||
S3method(print,AlignmentResult) | ||
S3method(print,readfasta) | ||
export(download_genbank) | ||
export(global_aln) | ||
export(read.fasta) | ||
export(read.genbank) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
#' @title download_genbank | ||
#' @description This function downloads genbank files from NCBI. | ||
#' @param acc The accession number | ||
#' @param db The database, it can be nuccore, nucest, nucgss or popset | ||
#' @param destfile The path where you want to save the genbank files | ||
#' @export | ||
download_genbank <- function(acc, db, destfile){ | ||
db <- db | ||
db <- paste('db=', db, '&id=', sep = '') | ||
accs <- acc | ||
base_url <- 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?' | ||
for (acc in accs){ | ||
mature_url <- paste(base_url, db, acc, '&rettype=gb', sep = '') | ||
gb.file_name <- paste(acc, '.gb', sep = '') | ||
destfile <- paste(destfile, gb.file_name, sep = '/') | ||
utils::download.file(mature_url,destfile = destfile) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
#' @title global_aln | ||
#' @description This function reads the fasta files, extract the sequence from | ||
#' it and calculate the proportion of each kind of base in sequence. | ||
#' @param X A sequence to be aligned | ||
#' @param Y A sequence to be aligned | ||
#' @param Scoring_system A vector containing the score of match, mismatch and | ||
#' inserting a gap | ||
#' @export | ||
global_aln <- function(X, Y, Scoring_system){ | ||
Scoring_system <- Scoring_system | ||
names(Scoring_system) <- c('match', 'mismatch', 'gap') | ||
seq1 <- X | ||
seq2 <- Y | ||
seq1 <- unlist(strsplit(seq1, split = '')) | ||
seq2 <- unlist(strsplit(seq2, split = '')) | ||
seq1.len <- length(seq1) | ||
seq2.len <- length(seq2) | ||
score_matrix <- matrix(0, nrow = seq1.len + 1, ncol = seq2.len + 1) | ||
for (Row in 1:(seq1.len+1)){ | ||
for (Col in 1:(seq2.len+1)){ | ||
if (Row == 1){ | ||
if (Col != 1){score_matrix[Row, Col] <- score_matrix[Row, Col-1] + Scoring_system['gap']} | ||
} | ||
else{ | ||
if (Col == 1){score_matrix[Row, Col] <- score_matrix[Row-1, Col] + Scoring_system['gap'];next;} | ||
else{ | ||
insert1 <- score_matrix[Row, Col] <- score_matrix[Row-1, Col] + Scoring_system['gap'] | ||
insert2 <- score_matrix[Row, Col] <- score_matrix[Row, Col-1] + Scoring_system['gap'] | ||
if (seq1[Row-1] == seq2[Col-1]){mis_or_match <- score_matrix[Row-1, Col-1] + Scoring_system['match']} | ||
else{mis_or_match <- score_matrix[Row-1, Col-1] + Scoring_system['mismatch']} | ||
score_matrix[Row, Col] <- max(insert1, insert2, mis_or_match) | ||
} | ||
} | ||
} | ||
} | ||
trace_back <- function(temp_seq1='', temp_seq2='', Row=seq1.len+1, Col=seq2.len+1, aln_result = list()){ | ||
if (Row == 1&Col == 1){ | ||
result <- list(c(temp_seq1, temp_seq2)) | ||
aln_result <- append(aln_result, result) | ||
aln_result | ||
} | ||
else{ | ||
if(Row>1&Col>1){ | ||
if ((seq1[Row-1] == seq2[Col-1])&(score_matrix[Row-1, Col-1] + Scoring_system['match']==score_matrix[Row, Col])){ | ||
Seq1 <- paste(seq1[Row-1], temp_seq1, sep = '') | ||
Seq2 <- paste(seq2[Col-1], temp_seq2, sep = '') | ||
aln_result1 <- trace_back(Seq1, Seq2, Row = Row-1, Col = Col-1) | ||
aln_result <- append(aln_result, aln_result1) | ||
} | ||
if ((seq1[Row-1] != seq2[Col-1])&(score_matrix[Row-1, Col-1] + Scoring_system['mismatch']==score_matrix[Row, Col])){ | ||
Seq1 <- paste(seq1[Row-1], temp_seq1, sep = '') | ||
Seq2 <- paste(seq2[Col-1], temp_seq2, sep = '') | ||
aln_result2 <- trace_back(Seq1, Seq2, Row = Row-1, Col = Col-1) | ||
aln_result <- append(aln_result, aln_result2) | ||
} | ||
if ((score_matrix[Row-1, Col] + Scoring_system['gap']==score_matrix[Row, Col])){ | ||
Seq1 <- paste(seq1[Row-1], temp_seq1, sep = '') | ||
Seq2 <- paste('-', temp_seq2, sep = '') | ||
aln_result3 <- trace_back(Seq1, Seq2, Row = Row-1, Col = Col) | ||
aln_result <- append(aln_result, aln_result3) | ||
} | ||
if ((score_matrix[Row, Col-1] + Scoring_system['gap']==score_matrix[Row, Col])){ | ||
Seq1 <- paste('-', temp_seq1, sep = '') | ||
Seq2 <- paste(seq2[Col-1], temp_seq2, sep = '') | ||
aln_result4<- trace_back(Seq1, Seq2, Row = Row, Col = Col-1) | ||
aln_result <- append(aln_result, aln_result4) | ||
} | ||
} | ||
else{ | ||
if(Row>1){ | ||
Seq1 <- paste(seq1[Row-1], temp_seq1, sep = '') | ||
Seq2 <- paste('-', temp_seq2, sep = '') | ||
aln_result5 <- trace_back(Seq1, Seq2, Row = Row-1, Col = Col) | ||
aln_result <- append(aln_result, aln_result5) | ||
} | ||
if(Col>1){ | ||
Seq1 <- paste('-', temp_seq1, sep = '') | ||
Seq2 <- paste(seq2[Col-1], temp_seq2, sep = '') | ||
aln_result6 <- trace_back(Seq1, Seq2, Row = Row, Col = Col-1) | ||
aln_result <- append(aln_result, aln_result6) | ||
} | ||
} | ||
} | ||
aln_result | ||
} | ||
aln_result <- trace_back() | ||
structure( | ||
list( | ||
Sequences = c(X, Y), | ||
Scoring_system = Scoring_system, | ||
Score_matrix = score_matrix, | ||
Aln_result = aln_result | ||
), | ||
class = 'AlignmentResult' | ||
) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
#' @docType data | ||
#' @name Biopratice.test | ||
#' @title A dataset containing two sequences and Scoring_system. | ||
NULL | ||
Biopratice.test <- list("ATTCCAAG", "TTCGAGT", c(4, -3, -4)) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
#' @title print.AlignmentResult | ||
#' @description This function prints the object returned by function | ||
#' global_align. | ||
#' @param x The object returned by function global_align | ||
#' @param ... Other parameter | ||
#' @method print AlignmentResult | ||
#' @export | ||
print.AlignmentResult <- function(x, ...){ | ||
cat("Sequence X:\t") | ||
cat(x$Sequences[1]) | ||
cat('\n') | ||
cat("Sequence Y:\t") | ||
cat(x$Sequences[2]) | ||
cat('\n') | ||
cat('Scoring System:\t') | ||
cat(x$Scoring_system[1]) | ||
cat('\t') | ||
cat('for match;') | ||
cat('\t') | ||
cat(x$Scoring_system[2]) | ||
cat('\t') | ||
cat('for mismatch;') | ||
cat('\t') | ||
cat(x$Scoring_system[3]) | ||
cat('\t') | ||
cat('for gap;') | ||
cat('\t\n\n') | ||
cat('Dynamic programing matrix:\n') | ||
print(x$Score_matrix, ...) | ||
cat('\n') | ||
cat('Alignment:\n') | ||
for (i in 1:length(x$Aln_result)){ | ||
temp_result <- x$Aln_result[[i]] | ||
seq1 <- unlist(strsplit(temp_result[1], split = '')) | ||
seq2 <- unlist(strsplit(temp_result[2], split = '')) | ||
bars <- '' | ||
for ( i in 1:length(seq1)){ | ||
if ((seq1[i]==seq2[i])&(seq1[i]!='-')&(seq2[i]!='-')){ | ||
bars <- paste(bars, '|', sep = '') | ||
} | ||
else{ | ||
bars <- paste(bars, ' ', sep = '') | ||
} | ||
} | ||
bars <- paste(bars, '\n', sep = '') | ||
cat(temp_result[1]) | ||
cat('\n') | ||
cat(bars) | ||
cat(temp_result[2]) | ||
cat('\n\n') | ||
} | ||
cat('Optimum alignment mat:\t') | ||
cat(x$Score_matrix[dim(x$Score_matrix)[1], dim(x$Score_matrix)[2]]) | ||
cat('\n') | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
#' @title print.readfasta | ||
#' @description This function prints the object returned by function read.fasta. | ||
#' @param x The object returned by function read.fasta | ||
#' @param ... Other parameter pass to print | ||
#' @method print readfasta | ||
#' @export | ||
print.readfasta <- function(x, ...){ | ||
cat(length(x$seq.list)) | ||
cat(' DNA sequence in binary format stored in a list.\n\n') | ||
if (length(x$seq.list) == 1){ | ||
cat('Sequence length:') | ||
cat(x$len) | ||
cat('\n\n') | ||
cat('Label:\n') | ||
cat(x$Labels) | ||
cat('\n\n') | ||
} | ||
else{ | ||
max_len <- max(x$len) | ||
min_len <- min(x$len) | ||
mean_len <- round(mean(x$len), 2) | ||
cat('Mean sequence length: ') | ||
cat(max_len) | ||
cat('\n\t') | ||
cat('Shortest sequence: ') | ||
cat(min_len) | ||
cat('\n\t') | ||
cat('Longest sequence: ') | ||
cat(max_len) | ||
cat('\n\n') | ||
cat('Label:\n') | ||
for (label in x$Labels){ | ||
Index <- which(x$Labels == label) | ||
if (Index >= 4){ | ||
break | ||
} | ||
else{ | ||
cat(label) | ||
cat('\n') | ||
} | ||
} | ||
} | ||
cat('\nBase composition:\n') | ||
if (dim(x$result)[1] >= 4){ | ||
print(x$result[1:4, ]) | ||
} | ||
else{ | ||
print(x$result) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
#' @title read.fasta | ||
#' @description This function reads the fasta files, extract the sequence from | ||
#' it and calculate the proportion of each kind of base in sequence. | ||
#' @param fa_files A character vector in which all elements are the | ||
#' absolute path of fasta files. | ||
#' @export | ||
read.fasta <- function(fa_files){ | ||
fa_sum <- vector() | ||
for (f in fa_files){ | ||
f_temp <- readLines(f) | ||
fa_sum <- c(fa_sum, f_temp) | ||
} | ||
label.index <- grep('^>', fa_sum) | ||
seq.index <- label.index + 1 | ||
seq.list <- fa_sum[seq.index] | ||
names(seq.list) <- sub('^>(\\w+)\\|?', '\\1', fa_sum[label.index]) | ||
seq.len <- vector() | ||
result <- array(0, dim = c(length(label.index), 17)) | ||
colnames(result) <- c('A', 'T', 'C', 'G', 'U', "R", 'Y', 'K', 'M', 'S', 'W', 'B', 'D', 'H', 'V', 'N', '-') | ||
rownames(result) <- names(seq.list) | ||
for (i in 1:length(seq.list)){ | ||
s <- seq.list[i] | ||
s.list <- unlist(strsplit(toupper(s), split = '')) | ||
s.len <- length(s.list) | ||
names(s.len) <- names(s) | ||
seq.len <- c(seq.len, s.len) | ||
for (b in s.list){ | ||
result[names(s), b] <- result[names(s), b] + 1 | ||
} | ||
result[names(s), ] <- round(result[names(s), ]/s.len, 2) | ||
} | ||
structure( | ||
list(seq.list = seq.list, | ||
result = result, | ||
Labels = sub('>', '', fa_sum[label.index]), | ||
len = seq.len | ||
), | ||
class = "readfasta" | ||
) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
#' @title read.genbank | ||
#' @description This function reads the genbank files, extracts the sequence | ||
#' from it, calculates the proportion of each kind of base in sequence | ||
#' and generates a fasta file which contains the sequence in it. | ||
#' @param gb_files A character vector in which all elements are the | ||
#' absolute path of genbank files | ||
#' @export | ||
read.genbank <- function(gb_files){ | ||
fa_files <- vector() | ||
for (gb_file in gb_files){ | ||
gb <- readLines(gb_file) | ||
acc <- gsub("ACCESSION\\s+", '', gb[grep("ACCESSION", gb)]) | ||
acc_file <- paste(acc, ".fasta", sep = '') | ||
accession <- paste(">", acc, sep = '') | ||
seq <- gb[(grep("ORIGIN", gb)+1):(grep('//', gb)-1)] | ||
seq <- gsub('\\d+', '', seq) | ||
seq <- gsub('\\s+', '', seq) | ||
seq <- paste(seq, collapse = '') | ||
seq.len <- length(unlist(strsplit(seq, split = ''))) | ||
fasta <- c(accession, seq) | ||
gb_path <- unlist(strsplit(gb_file, split = '/')) | ||
fa_path <- paste('/', paste(gb_path[2:(length(gb_path)-1)], collapse = '/'), sep = '') | ||
writeLines(fasta, paste(fa_path, acc_file, sep = '/')) | ||
fa_files <- c(fa_files, paste(fa_path, acc_file, sep = '/')) | ||
} | ||
x <- read.fasta(fa_files) | ||
print(x) | ||
} |
Binary file not shown.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
Oops, something went wrong.