Skip to content

Commit

Permalink
initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
mac authored and mac committed Dec 8, 2020
0 parents commit 60b2df5
Show file tree
Hide file tree
Showing 20 changed files with 452 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
^.*\.Rproj$
^\.Rproj\.user$
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
.Rproj.user
.Rhistory
.RData
.Ruserdata
20 changes: 20 additions & 0 deletions Biopratice.Rproj
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
Version: 1.0

RestoreWorkspace: Default
SaveWorkspace: Default
AlwaysSaveHistory: Default

EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8

RnwWeave: Sweave
LaTeX: pdfLaTeX

AutoAppendNewline: Yes
StripTrailingWhitespace: Yes

BuildType: Package
PackageUseDevtools: Yes
PackageInstallArgs: --no-multiarch --with-keep.source
13 changes: 13 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
Package: Biopratice
Type: Package
Title: Biopratice
Version: 0.1.0
Author: Zijing Xie
Maintainer: Zijing Xie <[email protected]>
Description: This package provides some functions of converting genbank file to
fasta file, doing alignment between two sequences using dynamic programming
algorithm and download genbank files from NCBI.
License: GPL
Encoding: UTF-8
LazyData: true
RoxygenNote: 7.1.1
8 changes: 8 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Generated by roxygen2: do not edit by hand

S3method(print,AlignmentResult)
S3method(print,readfasta)
export(download_genbank)
export(global_aln)
export(read.fasta)
export(read.genbank)
18 changes: 18 additions & 0 deletions R/download_genbank.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#' @title download_genbank
#' @description This function downloads genbank files from NCBI.
#' @param acc The accession number
#' @param db The database, it can be nuccore, nucest, nucgss or popset
#' @param destfile The path where you want to save the genbank files
#' @export
download_genbank <- function(acc, db, destfile){
db <- db
db <- paste('db=', db, '&id=', sep = '')
accs <- acc
base_url <- 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?'
for (acc in accs){
mature_url <- paste(base_url, db, acc, '&rettype=gb', sep = '')
gb.file_name <- paste(acc, '.gb', sep = '')
destfile <- paste(destfile, gb.file_name, sep = '/')
utils::download.file(mature_url,destfile = destfile)
}
}
96 changes: 96 additions & 0 deletions R/global_align.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
#' @title global_aln
#' @description This function reads the fasta files, extract the sequence from
#' it and calculate the proportion of each kind of base in sequence.
#' @param X A sequence to be aligned
#' @param Y A sequence to be aligned
#' @param Scoring_system A vector containing the score of match, mismatch and
#' inserting a gap
#' @export
global_aln <- function(X, Y, Scoring_system){
Scoring_system <- Scoring_system
names(Scoring_system) <- c('match', 'mismatch', 'gap')
seq1 <- X
seq2 <- Y
seq1 <- unlist(strsplit(seq1, split = ''))
seq2 <- unlist(strsplit(seq2, split = ''))
seq1.len <- length(seq1)
seq2.len <- length(seq2)
score_matrix <- matrix(0, nrow = seq1.len + 1, ncol = seq2.len + 1)
for (Row in 1:(seq1.len+1)){
for (Col in 1:(seq2.len+1)){
if (Row == 1){
if (Col != 1){score_matrix[Row, Col] <- score_matrix[Row, Col-1] + Scoring_system['gap']}
}
else{
if (Col == 1){score_matrix[Row, Col] <- score_matrix[Row-1, Col] + Scoring_system['gap'];next;}
else{
insert1 <- score_matrix[Row, Col] <- score_matrix[Row-1, Col] + Scoring_system['gap']
insert2 <- score_matrix[Row, Col] <- score_matrix[Row, Col-1] + Scoring_system['gap']
if (seq1[Row-1] == seq2[Col-1]){mis_or_match <- score_matrix[Row-1, Col-1] + Scoring_system['match']}
else{mis_or_match <- score_matrix[Row-1, Col-1] + Scoring_system['mismatch']}
score_matrix[Row, Col] <- max(insert1, insert2, mis_or_match)
}
}
}
}
trace_back <- function(temp_seq1='', temp_seq2='', Row=seq1.len+1, Col=seq2.len+1, aln_result = list()){
if (Row == 1&Col == 1){
result <- list(c(temp_seq1, temp_seq2))
aln_result <- append(aln_result, result)
aln_result
}
else{
if(Row>1&Col>1){
if ((seq1[Row-1] == seq2[Col-1])&(score_matrix[Row-1, Col-1] + Scoring_system['match']==score_matrix[Row, Col])){
Seq1 <- paste(seq1[Row-1], temp_seq1, sep = '')
Seq2 <- paste(seq2[Col-1], temp_seq2, sep = '')
aln_result1 <- trace_back(Seq1, Seq2, Row = Row-1, Col = Col-1)
aln_result <- append(aln_result, aln_result1)
}
if ((seq1[Row-1] != seq2[Col-1])&(score_matrix[Row-1, Col-1] + Scoring_system['mismatch']==score_matrix[Row, Col])){
Seq1 <- paste(seq1[Row-1], temp_seq1, sep = '')
Seq2 <- paste(seq2[Col-1], temp_seq2, sep = '')
aln_result2 <- trace_back(Seq1, Seq2, Row = Row-1, Col = Col-1)
aln_result <- append(aln_result, aln_result2)
}
if ((score_matrix[Row-1, Col] + Scoring_system['gap']==score_matrix[Row, Col])){
Seq1 <- paste(seq1[Row-1], temp_seq1, sep = '')
Seq2 <- paste('-', temp_seq2, sep = '')
aln_result3 <- trace_back(Seq1, Seq2, Row = Row-1, Col = Col)
aln_result <- append(aln_result, aln_result3)
}
if ((score_matrix[Row, Col-1] + Scoring_system['gap']==score_matrix[Row, Col])){
Seq1 <- paste('-', temp_seq1, sep = '')
Seq2 <- paste(seq2[Col-1], temp_seq2, sep = '')
aln_result4<- trace_back(Seq1, Seq2, Row = Row, Col = Col-1)
aln_result <- append(aln_result, aln_result4)
}
}
else{
if(Row>1){
Seq1 <- paste(seq1[Row-1], temp_seq1, sep = '')
Seq2 <- paste('-', temp_seq2, sep = '')
aln_result5 <- trace_back(Seq1, Seq2, Row = Row-1, Col = Col)
aln_result <- append(aln_result, aln_result5)
}
if(Col>1){
Seq1 <- paste('-', temp_seq1, sep = '')
Seq2 <- paste(seq2[Col-1], temp_seq2, sep = '')
aln_result6 <- trace_back(Seq1, Seq2, Row = Row, Col = Col-1)
aln_result <- append(aln_result, aln_result6)
}
}
}
aln_result
}
aln_result <- trace_back()
structure(
list(
Sequences = c(X, Y),
Scoring_system = Scoring_system,
Score_matrix = score_matrix,
Aln_result = aln_result
),
class = 'AlignmentResult'
)
}
5 changes: 5 additions & 0 deletions R/packagename-data.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#' @docType data
#' @name Biopratice.test
#' @title A dataset containing two sequences and Scoring_system.
NULL
Biopratice.test <- list("ATTCCAAG", "TTCGAGT", c(4, -3, -4))
55 changes: 55 additions & 0 deletions R/print.AlignmentResult.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#' @title print.AlignmentResult
#' @description This function prints the object returned by function
#' global_align.
#' @param x The object returned by function global_align
#' @param ... Other parameter
#' @method print AlignmentResult
#' @export
print.AlignmentResult <- function(x, ...){
cat("Sequence X:\t")
cat(x$Sequences[1])
cat('\n')
cat("Sequence Y:\t")
cat(x$Sequences[2])
cat('\n')
cat('Scoring System:\t')
cat(x$Scoring_system[1])
cat('\t')
cat('for match;')
cat('\t')
cat(x$Scoring_system[2])
cat('\t')
cat('for mismatch;')
cat('\t')
cat(x$Scoring_system[3])
cat('\t')
cat('for gap;')
cat('\t\n\n')
cat('Dynamic programing matrix:\n')
print(x$Score_matrix, ...)
cat('\n')
cat('Alignment:\n')
for (i in 1:length(x$Aln_result)){
temp_result <- x$Aln_result[[i]]
seq1 <- unlist(strsplit(temp_result[1], split = ''))
seq2 <- unlist(strsplit(temp_result[2], split = ''))
bars <- ''
for ( i in 1:length(seq1)){
if ((seq1[i]==seq2[i])&(seq1[i]!='-')&(seq2[i]!='-')){
bars <- paste(bars, '|', sep = '')
}
else{
bars <- paste(bars, ' ', sep = '')
}
}
bars <- paste(bars, '\n', sep = '')
cat(temp_result[1])
cat('\n')
cat(bars)
cat(temp_result[2])
cat('\n\n')
}
cat('Optimum alignment mat:\t')
cat(x$Score_matrix[dim(x$Score_matrix)[1], dim(x$Score_matrix)[2]])
cat('\n')
}
50 changes: 50 additions & 0 deletions R/print.readfasta.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
#' @title print.readfasta
#' @description This function prints the object returned by function read.fasta.
#' @param x The object returned by function read.fasta
#' @param ... Other parameter pass to print
#' @method print readfasta
#' @export
print.readfasta <- function(x, ...){
cat(length(x$seq.list))
cat(' DNA sequence in binary format stored in a list.\n\n')
if (length(x$seq.list) == 1){
cat('Sequence length:')
cat(x$len)
cat('\n\n')
cat('Label:\n')
cat(x$Labels)
cat('\n\n')
}
else{
max_len <- max(x$len)
min_len <- min(x$len)
mean_len <- round(mean(x$len), 2)
cat('Mean sequence length: ')
cat(max_len)
cat('\n\t')
cat('Shortest sequence: ')
cat(min_len)
cat('\n\t')
cat('Longest sequence: ')
cat(max_len)
cat('\n\n')
cat('Label:\n')
for (label in x$Labels){
Index <- which(x$Labels == label)
if (Index >= 4){
break
}
else{
cat(label)
cat('\n')
}
}
}
cat('\nBase composition:\n')
if (dim(x$result)[1] >= 4){
print(x$result[1:4, ])
}
else{
print(x$result)
}
}
40 changes: 40 additions & 0 deletions R/read.fasta.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#' @title read.fasta
#' @description This function reads the fasta files, extract the sequence from
#' it and calculate the proportion of each kind of base in sequence.
#' @param fa_files A character vector in which all elements are the
#' absolute path of fasta files.
#' @export
read.fasta <- function(fa_files){
fa_sum <- vector()
for (f in fa_files){
f_temp <- readLines(f)
fa_sum <- c(fa_sum, f_temp)
}
label.index <- grep('^>', fa_sum)
seq.index <- label.index + 1
seq.list <- fa_sum[seq.index]
names(seq.list) <- sub('^>(\\w+)\\|?', '\\1', fa_sum[label.index])
seq.len <- vector()
result <- array(0, dim = c(length(label.index), 17))
colnames(result) <- c('A', 'T', 'C', 'G', 'U', "R", 'Y', 'K', 'M', 'S', 'W', 'B', 'D', 'H', 'V', 'N', '-')
rownames(result) <- names(seq.list)
for (i in 1:length(seq.list)){
s <- seq.list[i]
s.list <- unlist(strsplit(toupper(s), split = ''))
s.len <- length(s.list)
names(s.len) <- names(s)
seq.len <- c(seq.len, s.len)
for (b in s.list){
result[names(s), b] <- result[names(s), b] + 1
}
result[names(s), ] <- round(result[names(s), ]/s.len, 2)
}
structure(
list(seq.list = seq.list,
result = result,
Labels = sub('>', '', fa_sum[label.index]),
len = seq.len
),
class = "readfasta"
)
}
28 changes: 28 additions & 0 deletions R/read.genbank.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#' @title read.genbank
#' @description This function reads the genbank files, extracts the sequence
#' from it, calculates the proportion of each kind of base in sequence
#' and generates a fasta file which contains the sequence in it.
#' @param gb_files A character vector in which all elements are the
#' absolute path of genbank files
#' @export
read.genbank <- function(gb_files){
fa_files <- vector()
for (gb_file in gb_files){
gb <- readLines(gb_file)
acc <- gsub("ACCESSION\\s+", '', gb[grep("ACCESSION", gb)])
acc_file <- paste(acc, ".fasta", sep = '')
accession <- paste(">", acc, sep = '')
seq <- gb[(grep("ORIGIN", gb)+1):(grep('//', gb)-1)]
seq <- gsub('\\d+', '', seq)
seq <- gsub('\\s+', '', seq)
seq <- paste(seq, collapse = '')
seq.len <- length(unlist(strsplit(seq, split = '')))
fasta <- c(accession, seq)
gb_path <- unlist(strsplit(gb_file, split = '/'))
fa_path <- paste('/', paste(gb_path[2:(length(gb_path)-1)], collapse = '/'), sep = '')
writeLines(fasta, paste(fa_path, acc_file, sep = '/'))
fa_files <- c(fa_files, paste(fa_path, acc_file, sep = '/'))
}
x <- read.fasta(fa_files)
print(x)
}
Binary file added data/Biopratice.test.RData
Binary file not shown.
9 changes: 9 additions & 0 deletions man/Biopratice.test.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

18 changes: 18 additions & 0 deletions man/download_genbank.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 60b2df5

Please sign in to comment.