initial commit

xzj-14035 · Dec 8, 2020 · 60b2df5 · 60b2df5
commit 60b2df5
Show file tree

Hide file tree

Showing 20 changed files with 452 additions and 0 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -0,0 +1,2 @@
+^.*\.Rproj$
+^\.Rproj\.user$
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,4 @@
+.Rproj.user
+.Rhistory
+.RData
+.Ruserdata
diff --git a/Biopratice.Rproj b/Biopratice.Rproj
@@ -0,0 +1,20 @@
+Version: 1.0
+
+RestoreWorkspace: Default
+SaveWorkspace: Default
+AlwaysSaveHistory: Default
+
+EnableCodeIndexing: Yes
+UseSpacesForTab: Yes
+NumSpacesForTab: 2
+Encoding: UTF-8
+
+RnwWeave: Sweave
+LaTeX: pdfLaTeX
+
+AutoAppendNewline: Yes
+StripTrailingWhitespace: Yes
+
+BuildType: Package
+PackageUseDevtools: Yes
+PackageInstallArgs: --no-multiarch --with-keep.source
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -0,0 +1,13 @@
+Package: Biopratice
+Type: Package
+Title: Biopratice
+Version: 0.1.0
+Author: Zijing Xie
+Maintainer: Zijing Xie <[email protected]>
+Description: This package provides some functions of converting genbank file to
+  fasta file, doing alignment between two sequences using dynamic programming 
+  algorithm and download genbank files from NCBI.
+License: GPL
+Encoding: UTF-8
+LazyData: true
+RoxygenNote: 7.1.1
diff --git a/NAMESPACE b/NAMESPACE
@@ -0,0 +1,8 @@
+# Generated by roxygen2: do not edit by hand
+
+S3method(print,AlignmentResult)
+S3method(print,readfasta)
+export(download_genbank)
+export(global_aln)
+export(read.fasta)
+export(read.genbank)
diff --git a/R/download_genbank.R b/R/download_genbank.R
@@ -0,0 +1,18 @@
+#' @title  download_genbank
+#' @description This function downloads genbank files from NCBI.
+#' @param acc The accession number
+#' @param db The database, it can be nuccore, nucest, nucgss or popset
+#' @param destfile The path where you want to save the genbank files
+#' @export
+download_genbank <- function(acc, db, destfile){
+  db <- db
+  db <- paste('db=', db, '&id=', sep = '')
+  accs <- acc
+  base_url <- 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?'
+  for (acc in accs){
+    mature_url <- paste(base_url, db, acc, '&rettype=gb', sep = '')
+    gb.file_name <- paste(acc, '.gb', sep = '')
+    destfile <- paste(destfile, gb.file_name, sep = '/')
+    utils::download.file(mature_url,destfile = destfile)
+  }
+}
diff --git a/R/global_align.R b/R/global_align.R
@@ -0,0 +1,96 @@
+#' @title  global_aln
+#' @description This function reads the fasta files, extract the sequence from
+#'     it and calculate the proportion of each kind of base in sequence.
+#' @param X A sequence to be aligned
+#' @param Y A sequence to be aligned
+#' @param Scoring_system A vector containing the score of match, mismatch and
+#' inserting a gap
+#' @export
+global_aln <- function(X, Y, Scoring_system){
+  Scoring_system <- Scoring_system
+  names(Scoring_system) <- c('match', 'mismatch', 'gap')
+  seq1 <- X
+  seq2 <- Y
+  seq1 <- unlist(strsplit(seq1, split = ''))
+  seq2 <- unlist(strsplit(seq2, split = ''))
+  seq1.len <- length(seq1)
+  seq2.len <- length(seq2)
+  score_matrix <- matrix(0, nrow = seq1.len + 1, ncol = seq2.len + 1)
+  for (Row in 1:(seq1.len+1)){
+    for (Col in 1:(seq2.len+1)){
+      if (Row == 1){
+        if (Col != 1){score_matrix[Row, Col] <- score_matrix[Row, Col-1] + Scoring_system['gap']}
+      }
+      else{
+        if (Col == 1){score_matrix[Row, Col] <- score_matrix[Row-1, Col] + Scoring_system['gap'];next;}
+        else{
+          insert1 <- score_matrix[Row, Col] <- score_matrix[Row-1, Col] + Scoring_system['gap']
+          insert2 <- score_matrix[Row, Col] <- score_matrix[Row, Col-1] + Scoring_system['gap']
+          if (seq1[Row-1] == seq2[Col-1]){mis_or_match <- score_matrix[Row-1, Col-1] + Scoring_system['match']}
+          else{mis_or_match <- score_matrix[Row-1, Col-1] + Scoring_system['mismatch']}
+          score_matrix[Row, Col] <- max(insert1, insert2, mis_or_match)
+        }
+      }
+    }
+  }
+  trace_back <- function(temp_seq1='', temp_seq2='', Row=seq1.len+1, Col=seq2.len+1, aln_result = list()){
+    if (Row == 1&Col == 1){
+      result <- list(c(temp_seq1, temp_seq2))
+      aln_result <- append(aln_result, result)
+      aln_result
+    }
+    else{
+      if(Row>1&Col>1){
+        if ((seq1[Row-1] == seq2[Col-1])&(score_matrix[Row-1, Col-1] + Scoring_system['match']==score_matrix[Row, Col])){
+          Seq1 <- paste(seq1[Row-1], temp_seq1, sep = '')
+          Seq2 <- paste(seq2[Col-1], temp_seq2, sep = '')
+          aln_result1 <- trace_back(Seq1, Seq2, Row = Row-1, Col = Col-1)
+          aln_result <- append(aln_result, aln_result1)
+        }
+        if ((seq1[Row-1] != seq2[Col-1])&(score_matrix[Row-1, Col-1] + Scoring_system['mismatch']==score_matrix[Row, Col])){
+          Seq1 <- paste(seq1[Row-1], temp_seq1, sep = '')
+          Seq2 <- paste(seq2[Col-1], temp_seq2, sep = '')
+          aln_result2 <- trace_back(Seq1, Seq2, Row = Row-1, Col = Col-1)
+          aln_result <- append(aln_result, aln_result2)
+        }
+        if ((score_matrix[Row-1, Col] + Scoring_system['gap']==score_matrix[Row, Col])){
+          Seq1 <- paste(seq1[Row-1], temp_seq1, sep = '')
+          Seq2 <- paste('-', temp_seq2, sep = '')
+          aln_result3 <- trace_back(Seq1, Seq2, Row = Row-1, Col = Col)
+          aln_result <- append(aln_result, aln_result3)
+        }
+        if ((score_matrix[Row, Col-1] + Scoring_system['gap']==score_matrix[Row, Col])){
+          Seq1 <- paste('-', temp_seq1, sep = '')
+          Seq2 <- paste(seq2[Col-1], temp_seq2, sep = '')
+          aln_result4<- trace_back(Seq1, Seq2, Row = Row, Col = Col-1)
+          aln_result <- append(aln_result, aln_result4)
+        }
+      }
+      else{
+        if(Row>1){
+          Seq1 <- paste(seq1[Row-1], temp_seq1, sep = '')
+          Seq2 <- paste('-', temp_seq2, sep = '')
+          aln_result5 <- trace_back(Seq1, Seq2, Row = Row-1, Col = Col)
+          aln_result <- append(aln_result, aln_result5)
+        }
+        if(Col>1){
+          Seq1 <- paste('-', temp_seq1, sep = '')
+          Seq2 <- paste(seq2[Col-1], temp_seq2, sep = '')
+          aln_result6 <- trace_back(Seq1, Seq2, Row = Row, Col = Col-1)
+          aln_result <- append(aln_result, aln_result6)
+        }
+      }
+    }
+    aln_result
+  }
+  aln_result <- trace_back()
+  structure(
+    list(
+      Sequences = c(X, Y),
+      Scoring_system = Scoring_system,
+      Score_matrix = score_matrix,
+      Aln_result = aln_result
+    ),
+    class = 'AlignmentResult'
+  )
+}
diff --git a/R/packagename-data.R b/R/packagename-data.R
@@ -0,0 +1,5 @@
+#' @docType data
+#' @name Biopratice.test
+#' @title A dataset containing two sequences and Scoring_system.
+NULL
+Biopratice.test <- list("ATTCCAAG", "TTCGAGT", c(4, -3, -4))
diff --git a/R/print.AlignmentResult.R b/R/print.AlignmentResult.R
@@ -0,0 +1,55 @@
+#' @title print.AlignmentResult
+#' @description This function prints the object returned by function
+#' global_align.
+#' @param x The object returned by function global_align
+#' @param ... Other parameter
+#' @method print AlignmentResult
+#' @export
+print.AlignmentResult <- function(x, ...){
+  cat("Sequence X:\t")
+  cat(x$Sequences[1])
+  cat('\n')
+  cat("Sequence Y:\t")
+  cat(x$Sequences[2])
+  cat('\n')
+  cat('Scoring System:\t')
+  cat(x$Scoring_system[1])
+  cat('\t')
+  cat('for match;')
+  cat('\t')
+  cat(x$Scoring_system[2])
+  cat('\t')
+  cat('for mismatch;')
+  cat('\t')
+  cat(x$Scoring_system[3])
+  cat('\t')
+  cat('for gap;')
+  cat('\t\n\n')
+  cat('Dynamic programing matrix:\n')
+  print(x$Score_matrix, ...)
+  cat('\n')
+  cat('Alignment:\n')
+  for (i in 1:length(x$Aln_result)){
+    temp_result <- x$Aln_result[[i]]
+    seq1 <- unlist(strsplit(temp_result[1], split = ''))
+    seq2 <- unlist(strsplit(temp_result[2], split = ''))
+    bars <- ''
+    for ( i in 1:length(seq1)){
+      if ((seq1[i]==seq2[i])&(seq1[i]!='-')&(seq2[i]!='-')){
+        bars <- paste(bars, '|', sep = '')
+      }
+      else{
+        bars <- paste(bars, ' ', sep = '')
+      }
+    }
+    bars <- paste(bars, '\n', sep = '')
+    cat(temp_result[1])
+    cat('\n')
+    cat(bars)
+    cat(temp_result[2])
+    cat('\n\n')
+  }
+  cat('Optimum alignment mat:\t')
+  cat(x$Score_matrix[dim(x$Score_matrix)[1], dim(x$Score_matrix)[2]])
+  cat('\n')
+}
diff --git a/R/print.readfasta.R b/R/print.readfasta.R
@@ -0,0 +1,50 @@
+#' @title print.readfasta
+#' @description This function prints the object returned by function read.fasta.
+#' @param x The object returned by function read.fasta
+#' @param ... Other parameter pass to print
+#' @method print readfasta
+#' @export
+print.readfasta <- function(x, ...){
+  cat(length(x$seq.list))
+  cat(' DNA sequence in binary format stored in a list.\n\n')
+  if (length(x$seq.list) == 1){
+    cat('Sequence length:')
+    cat(x$len)
+    cat('\n\n')
+    cat('Label:\n')
+    cat(x$Labels)
+    cat('\n\n')
+  }
+  else{
+    max_len <- max(x$len)
+    min_len <- min(x$len)
+    mean_len <- round(mean(x$len), 2)
+    cat('Mean sequence length: ')
+    cat(max_len)
+    cat('\n\t')
+    cat('Shortest sequence: ')
+    cat(min_len)
+    cat('\n\t')
+    cat('Longest sequence: ')
+    cat(max_len)
+    cat('\n\n')
+    cat('Label:\n')
+    for (label in x$Labels){
+      Index <- which(x$Labels == label)
+      if (Index >= 4){
+        break
+      }
+      else{
+        cat(label)
+        cat('\n')
+      }
+    }
+  }
+  cat('\nBase composition:\n')
+  if (dim(x$result)[1] >= 4){
+    print(x$result[1:4, ])
+  }
+  else{
+    print(x$result)
+  }
+}
diff --git a/R/read.fasta.R b/R/read.fasta.R
@@ -0,0 +1,40 @@
+#' @title  read.fasta
+#' @description This function reads the fasta files, extract the sequence from
+#' it and calculate the proportion of each kind of base in sequence.
+#' @param fa_files A character vector in which all elements are the
+#'     absolute path of fasta files.
+#' @export
+read.fasta <- function(fa_files){
+  fa_sum <- vector()
+  for (f in fa_files){
+    f_temp <- readLines(f)
+    fa_sum <- c(fa_sum, f_temp)
+  }
+  label.index <- grep('^>', fa_sum)
+  seq.index <- label.index + 1
+  seq.list <- fa_sum[seq.index]
+  names(seq.list) <- sub('^>(\\w+)\\|?', '\\1', fa_sum[label.index])
+  seq.len <- vector()
+  result <- array(0, dim = c(length(label.index), 17))
+  colnames(result) <- c('A', 'T', 'C', 'G', 'U', "R", 'Y', 'K', 'M', 'S', 'W', 'B', 'D', 'H', 'V', 'N', '-')
+  rownames(result) <- names(seq.list)
+  for (i in 1:length(seq.list)){
+    s <- seq.list[i]
+    s.list <- unlist(strsplit(toupper(s), split = ''))
+    s.len <- length(s.list)
+    names(s.len) <- names(s)
+    seq.len <- c(seq.len, s.len)
+    for (b in s.list){
+      result[names(s), b] <- result[names(s), b] + 1
+    }
+    result[names(s), ] <- round(result[names(s), ]/s.len, 2)
+  }
+  structure(
+    list(seq.list = seq.list,
+         result = result,
+         Labels = sub('>', '', fa_sum[label.index]),
+         len = seq.len
+    ),
+    class = "readfasta"
+  )
+}
diff --git a/R/read.genbank.R b/R/read.genbank.R
@@ -0,0 +1,28 @@
+#' @title  read.genbank
+#' @description This function reads the genbank files, extracts the sequence
+#' from it, calculates the proportion of each kind of base in sequence
+#' and generates a fasta file which contains the sequence in it.
+#' @param gb_files A character vector in which all elements are the
+#' absolute path of genbank files
+#' @export
+read.genbank <- function(gb_files){
+  fa_files <- vector()
+  for (gb_file in gb_files){
+    gb <- readLines(gb_file)
+    acc <- gsub("ACCESSION\\s+", '', gb[grep("ACCESSION", gb)])
+    acc_file <- paste(acc, ".fasta", sep = '')
+    accession <- paste(">", acc, sep = '')
+    seq <- gb[(grep("ORIGIN", gb)+1):(grep('//', gb)-1)]
+    seq <- gsub('\\d+', '', seq)
+    seq <- gsub('\\s+', '', seq)
+    seq <- paste(seq, collapse = '')
+    seq.len <- length(unlist(strsplit(seq, split = '')))
+    fasta <- c(accession, seq)
+    gb_path <- unlist(strsplit(gb_file, split = '/'))
+    fa_path <- paste('/', paste(gb_path[2:(length(gb_path)-1)], collapse = '/'), sep = '')
+    writeLines(fasta, paste(fa_path, acc_file, sep = '/'))
+    fa_files <- c(fa_files, paste(fa_path, acc_file, sep = '/'))
+  }
+  x <- read.fasta(fa_files)
+  print(x)
+}
diff --git a/data/Biopratice.test.RData b/data/Biopratice.test.RData
diff --git a/man/Biopratice.test.Rd b/man/Biopratice.test.Rd
diff --git a/man/download_genbank.Rd b/man/download_genbank.Rd