From 60b2df5a70bec5d47907cf5400a003d3ea11f1f9 Mon Sep 17 00:00:00 2001
From: mac <mac@macdeMacBook-Pro.local>
Date: Tue, 8 Dec 2020 16:33:30 +0800
Subject: [PATCH] initial commit

---
 .Rbuildignore                |   2 +
 .gitignore                   |   4 ++
 Biopratice.Rproj             |  20 ++++++++
 DESCRIPTION                  |  13 +++++
 NAMESPACE                    |   8 +++
 R/download_genbank.R         |  18 +++++++
 R/global_align.R             |  96 +++++++++++++++++++++++++++++++++++
 R/packagename-data.R         |   5 ++
 R/print.AlignmentResult.R    |  55 ++++++++++++++++++++
 R/print.readfasta.R          |  50 ++++++++++++++++++
 R/read.fasta.R               |  40 +++++++++++++++
 R/read.genbank.R             |  28 ++++++++++
 data/Biopratice.test.RData   | Bin 0 -> 119 bytes
 man/Biopratice.test.Rd       |   9 ++++
 man/download_genbank.Rd      |  18 +++++++
 man/global_aln.Rd            |  20 ++++++++
 man/print.AlignmentResult.Rd |  17 +++++++
 man/print.readfasta.Rd       |  16 ++++++
 man/read.fasta.Rd            |  16 ++++++
 man/read.genbank.Rd          |  17 +++++++
 20 files changed, 452 insertions(+)
 create mode 100644 .Rbuildignore
 create mode 100644 .gitignore
 create mode 100644 Biopratice.Rproj
 create mode 100644 DESCRIPTION
 create mode 100644 NAMESPACE
 create mode 100644 R/download_genbank.R
 create mode 100644 R/global_align.R
 create mode 100644 R/packagename-data.R
 create mode 100644 R/print.AlignmentResult.R
 create mode 100644 R/print.readfasta.R
 create mode 100644 R/read.fasta.R
 create mode 100644 R/read.genbank.R
 create mode 100644 data/Biopratice.test.RData
 create mode 100644 man/Biopratice.test.Rd
 create mode 100644 man/download_genbank.Rd
 create mode 100644 man/global_aln.Rd
 create mode 100644 man/print.AlignmentResult.Rd
 create mode 100644 man/print.readfasta.Rd
 create mode 100644 man/read.fasta.Rd
 create mode 100644 man/read.genbank.Rd

diff --git a/.Rbuildignore b/.Rbuildignore
new file mode 100644
index 0000000..91114bf
--- /dev/null
+++ b/.Rbuildignore
@@ -0,0 +1,2 @@
+^.*\.Rproj$
+^\.Rproj\.user$
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..5b6a065
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+.Rproj.user
+.Rhistory
+.RData
+.Ruserdata
diff --git a/Biopratice.Rproj b/Biopratice.Rproj
new file mode 100644
index 0000000..497f8bf
--- /dev/null
+++ b/Biopratice.Rproj
@@ -0,0 +1,20 @@
+Version: 1.0
+
+RestoreWorkspace: Default
+SaveWorkspace: Default
+AlwaysSaveHistory: Default
+
+EnableCodeIndexing: Yes
+UseSpacesForTab: Yes
+NumSpacesForTab: 2
+Encoding: UTF-8
+
+RnwWeave: Sweave
+LaTeX: pdfLaTeX
+
+AutoAppendNewline: Yes
+StripTrailingWhitespace: Yes
+
+BuildType: Package
+PackageUseDevtools: Yes
+PackageInstallArgs: --no-multiarch --with-keep.source
diff --git a/DESCRIPTION b/DESCRIPTION
new file mode 100644
index 0000000..341c061
--- /dev/null
+++ b/DESCRIPTION
@@ -0,0 +1,13 @@
+Package: Biopratice
+Type: Package
+Title: Biopratice
+Version: 0.1.0
+Author: Zijing Xie
+Maintainer: Zijing Xie <xzj14035@outlook.com>
+Description: This package provides some functions of converting genbank file to
+  fasta file, doing alignment between two sequences using dynamic programming 
+  algorithm and download genbank files from NCBI.
+License: GPL
+Encoding: UTF-8
+LazyData: true
+RoxygenNote: 7.1.1
diff --git a/NAMESPACE b/NAMESPACE
new file mode 100644
index 0000000..eff168e
--- /dev/null
+++ b/NAMESPACE
@@ -0,0 +1,8 @@
+# Generated by roxygen2: do not edit by hand
+
+S3method(print,AlignmentResult)
+S3method(print,readfasta)
+export(download_genbank)
+export(global_aln)
+export(read.fasta)
+export(read.genbank)
diff --git a/R/download_genbank.R b/R/download_genbank.R
new file mode 100644
index 0000000..b28a60e
--- /dev/null
+++ b/R/download_genbank.R
@@ -0,0 +1,18 @@
+#' @title  download_genbank
+#' @description This function downloads genbank files from NCBI.
+#' @param acc The accession number
+#' @param db The database, it can be nuccore, nucest, nucgss or popset
+#' @param destfile The path where you want to save the genbank files
+#' @export
+download_genbank <- function(acc, db, destfile){
+  db <- db
+  db <- paste('db=', db, '&id=', sep = '')
+  accs <- acc
+  base_url <- 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?'
+  for (acc in accs){
+    mature_url <- paste(base_url, db, acc, '&rettype=gb', sep = '')
+    gb.file_name <- paste(acc, '.gb', sep = '')
+    destfile <- paste(destfile, gb.file_name, sep = '/')
+    utils::download.file(mature_url,destfile = destfile)
+  }
+}
diff --git a/R/global_align.R b/R/global_align.R
new file mode 100644
index 0000000..bcc2fb4
--- /dev/null
+++ b/R/global_align.R
@@ -0,0 +1,96 @@
+#' @title  global_aln
+#' @description This function reads the fasta files, extract the sequence from
+#'     it and calculate the proportion of each kind of base in sequence.
+#' @param X A sequence to be aligned
+#' @param Y A sequence to be aligned
+#' @param Scoring_system A vector containing the score of match, mismatch and
+#' inserting a gap
+#' @export
+global_aln <- function(X, Y, Scoring_system){
+  Scoring_system <- Scoring_system
+  names(Scoring_system) <- c('match', 'mismatch', 'gap')
+  seq1 <- X
+  seq2 <- Y
+  seq1 <- unlist(strsplit(seq1, split = ''))
+  seq2 <- unlist(strsplit(seq2, split = ''))
+  seq1.len <- length(seq1)
+  seq2.len <- length(seq2)
+  score_matrix <- matrix(0, nrow = seq1.len + 1, ncol = seq2.len + 1)
+  for (Row in 1:(seq1.len+1)){
+    for (Col in 1:(seq2.len+1)){
+      if (Row == 1){
+        if (Col != 1){score_matrix[Row, Col] <- score_matrix[Row, Col-1] + Scoring_system['gap']}
+      }
+      else{
+        if (Col == 1){score_matrix[Row, Col] <- score_matrix[Row-1, Col] + Scoring_system['gap'];next;}
+        else{
+          insert1 <- score_matrix[Row, Col] <- score_matrix[Row-1, Col] + Scoring_system['gap']
+          insert2 <- score_matrix[Row, Col] <- score_matrix[Row, Col-1] + Scoring_system['gap']
+          if (seq1[Row-1] == seq2[Col-1]){mis_or_match <- score_matrix[Row-1, Col-1] + Scoring_system['match']}
+          else{mis_or_match <- score_matrix[Row-1, Col-1] + Scoring_system['mismatch']}
+          score_matrix[Row, Col] <- max(insert1, insert2, mis_or_match)
+        }
+      }
+    }
+  }
+  trace_back <- function(temp_seq1='', temp_seq2='', Row=seq1.len+1, Col=seq2.len+1, aln_result = list()){
+    if (Row == 1&Col == 1){
+      result <- list(c(temp_seq1, temp_seq2))
+      aln_result <- append(aln_result, result)
+      aln_result
+    }
+    else{
+      if(Row>1&Col>1){
+        if ((seq1[Row-1] == seq2[Col-1])&(score_matrix[Row-1, Col-1] + Scoring_system['match']==score_matrix[Row, Col])){
+          Seq1 <- paste(seq1[Row-1], temp_seq1, sep = '')
+          Seq2 <- paste(seq2[Col-1], temp_seq2, sep = '')
+          aln_result1 <- trace_back(Seq1, Seq2, Row = Row-1, Col = Col-1)
+          aln_result <- append(aln_result, aln_result1)
+        }
+        if ((seq1[Row-1] != seq2[Col-1])&(score_matrix[Row-1, Col-1] + Scoring_system['mismatch']==score_matrix[Row, Col])){
+          Seq1 <- paste(seq1[Row-1], temp_seq1, sep = '')
+          Seq2 <- paste(seq2[Col-1], temp_seq2, sep = '')
+          aln_result2 <- trace_back(Seq1, Seq2, Row = Row-1, Col = Col-1)
+          aln_result <- append(aln_result, aln_result2)
+        }
+        if ((score_matrix[Row-1, Col] + Scoring_system['gap']==score_matrix[Row, Col])){
+          Seq1 <- paste(seq1[Row-1], temp_seq1, sep = '')
+          Seq2 <- paste('-', temp_seq2, sep = '')
+          aln_result3 <- trace_back(Seq1, Seq2, Row = Row-1, Col = Col)
+          aln_result <- append(aln_result, aln_result3)
+        }
+        if ((score_matrix[Row, Col-1] + Scoring_system['gap']==score_matrix[Row, Col])){
+          Seq1 <- paste('-', temp_seq1, sep = '')
+          Seq2 <- paste(seq2[Col-1], temp_seq2, sep = '')
+          aln_result4<- trace_back(Seq1, Seq2, Row = Row, Col = Col-1)
+          aln_result <- append(aln_result, aln_result4)
+        }
+      }
+      else{
+        if(Row>1){
+          Seq1 <- paste(seq1[Row-1], temp_seq1, sep = '')
+          Seq2 <- paste('-', temp_seq2, sep = '')
+          aln_result5 <- trace_back(Seq1, Seq2, Row = Row-1, Col = Col)
+          aln_result <- append(aln_result, aln_result5)
+        }
+        if(Col>1){
+          Seq1 <- paste('-', temp_seq1, sep = '')
+          Seq2 <- paste(seq2[Col-1], temp_seq2, sep = '')
+          aln_result6 <- trace_back(Seq1, Seq2, Row = Row, Col = Col-1)
+          aln_result <- append(aln_result, aln_result6)
+        }
+      }
+    }
+    aln_result
+  }
+  aln_result <- trace_back()
+  structure(
+    list(
+      Sequences = c(X, Y),
+      Scoring_system = Scoring_system,
+      Score_matrix = score_matrix,
+      Aln_result = aln_result
+    ),
+    class = 'AlignmentResult'
+  )
+}
diff --git a/R/packagename-data.R b/R/packagename-data.R
new file mode 100644
index 0000000..9b2f504
--- /dev/null
+++ b/R/packagename-data.R
@@ -0,0 +1,5 @@
+#' @docType data
+#' @name Biopratice.test
+#' @title A dataset containing two sequences and Scoring_system.
+NULL
+Biopratice.test <- list("ATTCCAAG", "TTCGAGT", c(4, -3, -4))
diff --git a/R/print.AlignmentResult.R b/R/print.AlignmentResult.R
new file mode 100644
index 0000000..82680c8
--- /dev/null
+++ b/R/print.AlignmentResult.R
@@ -0,0 +1,55 @@
+#' @title print.AlignmentResult
+#' @description This function prints the object returned by function
+#' global_align.
+#' @param x The object returned by function global_align
+#' @param ... Other parameter
+#' @method print AlignmentResult
+#' @export
+print.AlignmentResult <- function(x, ...){
+  cat("Sequence X:\t")
+  cat(x$Sequences[1])
+  cat('\n')
+  cat("Sequence Y:\t")
+  cat(x$Sequences[2])
+  cat('\n')
+  cat('Scoring System:\t')
+  cat(x$Scoring_system[1])
+  cat('\t')
+  cat('for match;')
+  cat('\t')
+  cat(x$Scoring_system[2])
+  cat('\t')
+  cat('for mismatch;')
+  cat('\t')
+  cat(x$Scoring_system[3])
+  cat('\t')
+  cat('for gap;')
+  cat('\t\n\n')
+  cat('Dynamic programing matrix:\n')
+  print(x$Score_matrix, ...)
+  cat('\n')
+  cat('Alignment:\n')
+  for (i in 1:length(x$Aln_result)){
+    temp_result <- x$Aln_result[[i]]
+    seq1 <- unlist(strsplit(temp_result[1], split = ''))
+    seq2 <- unlist(strsplit(temp_result[2], split = ''))
+    bars <- ''
+    for ( i in 1:length(seq1)){
+      if ((seq1[i]==seq2[i])&(seq1[i]!='-')&(seq2[i]!='-')){
+        bars <- paste(bars, '|', sep = '')
+      }
+      else{
+        bars <- paste(bars, ' ', sep = '')
+      }
+    }
+    bars <- paste(bars, '\n', sep = '')
+    cat(temp_result[1])
+    cat('\n')
+    cat(bars)
+    cat(temp_result[2])
+    cat('\n\n')
+  }
+  cat('Optimum alignment mat:\t')
+  cat(x$Score_matrix[dim(x$Score_matrix)[1], dim(x$Score_matrix)[2]])
+  cat('\n')
+}
diff --git a/R/print.readfasta.R b/R/print.readfasta.R
new file mode 100644
index 0000000..bf1475f
--- /dev/null
+++ b/R/print.readfasta.R
@@ -0,0 +1,50 @@
+#' @title print.readfasta
+#' @description This function prints the object returned by function read.fasta.
+#' @param x The object returned by function read.fasta
+#' @param ... Other parameter pass to print
+#' @method print readfasta
+#' @export
+print.readfasta <- function(x, ...){
+  cat(length(x$seq.list))
+  cat(' DNA sequence in binary format stored in a list.\n\n')
+  if (length(x$seq.list) == 1){
+    cat('Sequence length:')
+    cat(x$len)
+    cat('\n\n')
+    cat('Label:\n')
+    cat(x$Labels)
+    cat('\n\n')
+  }
+  else{
+    max_len <- max(x$len)
+    min_len <- min(x$len)
+    mean_len <- round(mean(x$len), 2)
+    cat('Mean sequence length: ')
+    cat(max_len)
+    cat('\n\t')
+    cat('Shortest sequence: ')
+    cat(min_len)
+    cat('\n\t')
+    cat('Longest sequence: ')
+    cat(max_len)
+    cat('\n\n')
+    cat('Label:\n')
+    for (label in x$Labels){
+      Index <- which(x$Labels == label)
+      if (Index >= 4){
+        break
+      }
+      else{
+        cat(label)
+        cat('\n')
+      }
+    }
+  }
+  cat('\nBase composition:\n')
+  if (dim(x$result)[1] >= 4){
+    print(x$result[1:4, ])
+  }
+  else{
+    print(x$result)
+  }
+}
diff --git a/R/read.fasta.R b/R/read.fasta.R
new file mode 100644
index 0000000..95972c6
--- /dev/null
+++ b/R/read.fasta.R
@@ -0,0 +1,40 @@
+#' @title  read.fasta
+#' @description This function reads the fasta files, extract the sequence from
+#' it and calculate the proportion of each kind of base in sequence.
+#' @param fa_files A character vector in which all elements are the
+#'     absolute path of fasta files.
+#' @export
+read.fasta <- function(fa_files){
+  fa_sum <- vector()
+  for (f in fa_files){
+    f_temp <- readLines(f)
+    fa_sum <- c(fa_sum, f_temp)
+  }
+  label.index <- grep('^>', fa_sum)
+  seq.index <- label.index + 1
+  seq.list <- fa_sum[seq.index]
+  names(seq.list) <- sub('^>(\\w+)\\|?', '\\1', fa_sum[label.index])
+  seq.len <- vector()
+  result <- array(0, dim = c(length(label.index), 17))
+  colnames(result) <- c('A', 'T', 'C', 'G', 'U', "R", 'Y', 'K', 'M', 'S', 'W', 'B', 'D', 'H', 'V', 'N', '-')
+  rownames(result) <- names(seq.list)
+  for (i in 1:length(seq.list)){
+    s <- seq.list[i]
+    s.list <- unlist(strsplit(toupper(s), split = ''))
+    s.len <- length(s.list)
+    names(s.len) <- names(s)
+    seq.len <- c(seq.len, s.len)
+    for (b in s.list){
+      result[names(s), b] <- result[names(s), b] + 1
+    }
+    result[names(s), ] <- round(result[names(s), ]/s.len, 2)
+  }
+  structure(
+    list(seq.list = seq.list,
+         result = result,
+         Labels = sub('>', '', fa_sum[label.index]),
+         len = seq.len
+    ),
+    class = "readfasta"
+  )
+}
diff --git a/R/read.genbank.R b/R/read.genbank.R
new file mode 100644
index 0000000..8c21ea7
--- /dev/null
+++ b/R/read.genbank.R
@@ -0,0 +1,28 @@
+#' @title  read.genbank
+#' @description This function reads the genbank files, extracts the sequence
+#' from it, calculates the proportion of each kind of base in sequence
+#' and generates a fasta file which contains the sequence in it.
+#' @param gb_files A character vector in which all elements are the
+#' absolute path of genbank files
+#' @export
+read.genbank <- function(gb_files){
+  fa_files <- vector()
+  for (gb_file in gb_files){
+    gb <- readLines(gb_file)
+    acc <- gsub("ACCESSION\\s+", '', gb[grep("ACCESSION", gb)])
+    acc_file <- paste(acc, ".fasta", sep = '')
+    accession <- paste(">", acc, sep = '')
+    seq <- gb[(grep("ORIGIN", gb)+1):(grep('//', gb)-1)]
+    seq <- gsub('\\d+', '', seq)
+    seq <- gsub('\\s+', '', seq)
+    seq <- paste(seq, collapse = '')
+    seq.len <- length(unlist(strsplit(seq, split = '')))
+    fasta <- c(accession, seq)
+    gb_path <- unlist(strsplit(gb_file, split = '/'))
+    fa_path <- paste('/', paste(gb_path[2:(length(gb_path)-1)], collapse = '/'), sep = '')
+    writeLines(fasta, paste(fa_path, acc_file, sep = '/'))
+    fa_files <- c(fa_files, paste(fa_path, acc_file, sep = '/'))
+  }
+  x <- read.fasta(fa_files)
+  print(x)
+}
diff --git a/data/Biopratice.test.RData b/data/Biopratice.test.RData
new file mode 100644
index 0000000000000000000000000000000000000000..68a76c87d3db3686aab23b4072ca7ddcb28cff89
GIT binary patch
literal 119
zcmb2|=3oE=X6~X+gJ)e2k`fXU(h?FAk`mHbjU*$So$r+BN=QV^XpnJRz@k3kS^23m
z`Wjw3Cw=^`YJ2MHF>-M>e>F7VP|$o_z{6QsR#sBtE}YKJwcC1EJHyNxg<L)bPX&tu
Uj7t~RDEwe&i0+Q#o(8lJ0N5cWC;$Ke

literal 0
HcmV?d00001

diff --git a/man/Biopratice.test.Rd b/man/Biopratice.test.Rd
new file mode 100644
index 0000000..4ac8f06
--- /dev/null
+++ b/man/Biopratice.test.Rd
@@ -0,0 +1,9 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/packagename-data.R
+\docType{data}
+\name{Biopratice.test}
+\alias{Biopratice.test}
+\title{A dataset containing two sequences and Scoring_system.}
+\description{
+A dataset containing two sequences and Scoring_system.
+}
diff --git a/man/download_genbank.Rd b/man/download_genbank.Rd
new file mode 100644
index 0000000..dc6e2da
--- /dev/null
+++ b/man/download_genbank.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/download_genbank.R
+\name{download_genbank}
+\alias{download_genbank}
+\title{download_genbank}
+\usage{
+download_genbank(acc, db, destfile)
+}
+\arguments{
+\item{acc}{The accession number}
+
+\item{db}{The database, it can be nuccore, nucest, nucgss or popset}
+
+\item{destfile}{The path where you want to save the genbank files}
+}
+\description{
+This function downloads genbank files from NCBI.
+}
diff --git a/man/global_aln.Rd b/man/global_aln.Rd
new file mode 100644
index 0000000..ff2b62b
--- /dev/null
+++ b/man/global_aln.Rd
@@ -0,0 +1,20 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/global_align.R
+\name{global_aln}
+\alias{global_aln}
+\title{global_aln}
+\usage{
+global_aln(X, Y, Scoring_system)
+}
+\arguments{
+\item{X}{A sequence to be aligned}
+
+\item{Y}{A sequence to be aligned}
+
+\item{Scoring_system}{A vector containing the score of match, mismatch and
+inserting a gap}
+}
+\description{
+This function reads the fasta files, extract the sequence from
+    it and calculate the proportion of each kind of base in sequence.
+}
diff --git a/man/print.AlignmentResult.Rd b/man/print.AlignmentResult.Rd
new file mode 100644
index 0000000..d911810
--- /dev/null
+++ b/man/print.AlignmentResult.Rd
@@ -0,0 +1,17 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/print.AlignmentResult.R
+\name{print.AlignmentResult}
+\alias{print.AlignmentResult}
+\title{print.AlignmentResult}
+\usage{
+\method{print}{AlignmentResult}(x, ...)
+}
+\arguments{
+\item{x}{The object returned by function global_align}
+
+\item{...}{Other parameter}
+}
+\description{
+This function prints the object returned by function
+global_align.
+}
diff --git a/man/print.readfasta.Rd b/man/print.readfasta.Rd
new file mode 100644
index 0000000..8905fa3
--- /dev/null
+++ b/man/print.readfasta.Rd
@@ -0,0 +1,16 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/print.readfasta.R
+\name{print.readfasta}
+\alias{print.readfasta}
+\title{print.readfasta}
+\usage{
+\method{print}{readfasta}(x, ...)
+}
+\arguments{
+\item{x}{The object returned by function read.fasta}
+
+\item{...}{Other parameter pass to print}
+}
+\description{
+This function prints the object returned by function read.fasta.
+}
diff --git a/man/read.fasta.Rd b/man/read.fasta.Rd
new file mode 100644
index 0000000..cfb30ea
--- /dev/null
+++ b/man/read.fasta.Rd
@@ -0,0 +1,16 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/read.fasta.R
+\name{read.fasta}
+\alias{read.fasta}
+\title{read.fasta}
+\usage{
+read.fasta(fa_files)
+}
+\arguments{
+\item{fa_files}{A character vector in which all elements are the
+absolute path of fasta files.}
+}
+\description{
+This function reads the fasta files, extract the sequence from
+it and calculate the proportion of each kind of base in sequence.
+}
diff --git a/man/read.genbank.Rd b/man/read.genbank.Rd
new file mode 100644
index 0000000..c6b5eba
--- /dev/null
+++ b/man/read.genbank.Rd
@@ -0,0 +1,17 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/read.genbank.R
+\name{read.genbank}
+\alias{read.genbank}
+\title{read.genbank}
+\usage{
+read.genbank(gb_files)
+}
+\arguments{
+\item{gb_files}{A character vector in which all elements are the
+absolute path of genbank files}
+}
+\description{
+This function reads the genbank files, extracts the sequence
+from it, calculates the proportion of each kind of base in sequence
+and generates a fasta file which contains the sequence in it.
+}