From 60b2df5a70bec5d47907cf5400a003d3ea11f1f9 Mon Sep 17 00:00:00 2001 From: mac Date: Tue, 8 Dec 2020 16:33:30 +0800 Subject: [PATCH] initial commit --- .Rbuildignore | 2 + .gitignore | 4 ++ Biopratice.Rproj | 20 ++++++++ DESCRIPTION | 13 +++++ NAMESPACE | 8 +++ R/download_genbank.R | 18 +++++++ R/global_align.R | 96 +++++++++++++++++++++++++++++++++++ R/packagename-data.R | 5 ++ R/print.AlignmentResult.R | 55 ++++++++++++++++++++ R/print.readfasta.R | 50 ++++++++++++++++++ R/read.fasta.R | 40 +++++++++++++++ R/read.genbank.R | 28 ++++++++++ data/Biopratice.test.RData | Bin 0 -> 119 bytes man/Biopratice.test.Rd | 9 ++++ man/download_genbank.Rd | 18 +++++++ man/global_aln.Rd | 20 ++++++++ man/print.AlignmentResult.Rd | 17 +++++++ man/print.readfasta.Rd | 16 ++++++ man/read.fasta.Rd | 16 ++++++ man/read.genbank.Rd | 17 +++++++ 20 files changed, 452 insertions(+) create mode 100644 .Rbuildignore create mode 100644 .gitignore create mode 100644 Biopratice.Rproj create mode 100644 DESCRIPTION create mode 100644 NAMESPACE create mode 100644 R/download_genbank.R create mode 100644 R/global_align.R create mode 100644 R/packagename-data.R create mode 100644 R/print.AlignmentResult.R create mode 100644 R/print.readfasta.R create mode 100644 R/read.fasta.R create mode 100644 R/read.genbank.R create mode 100644 data/Biopratice.test.RData create mode 100644 man/Biopratice.test.Rd create mode 100644 man/download_genbank.Rd create mode 100644 man/global_aln.Rd create mode 100644 man/print.AlignmentResult.Rd create mode 100644 man/print.readfasta.Rd create mode 100644 man/read.fasta.Rd create mode 100644 man/read.genbank.Rd diff --git a/.Rbuildignore b/.Rbuildignore new file mode 100644 index 0000000..91114bf --- /dev/null +++ b/.Rbuildignore @@ -0,0 +1,2 @@ +^.*\.Rproj$ +^\.Rproj\.user$ diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..5b6a065 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +.Rproj.user +.Rhistory +.RData +.Ruserdata diff --git a/Biopratice.Rproj b/Biopratice.Rproj new file mode 100644 index 0000000..497f8bf --- /dev/null +++ b/Biopratice.Rproj @@ -0,0 +1,20 @@ +Version: 1.0 + +RestoreWorkspace: Default +SaveWorkspace: Default +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 2 +Encoding: UTF-8 + +RnwWeave: Sweave +LaTeX: pdfLaTeX + +AutoAppendNewline: Yes +StripTrailingWhitespace: Yes + +BuildType: Package +PackageUseDevtools: Yes +PackageInstallArgs: --no-multiarch --with-keep.source diff --git a/DESCRIPTION b/DESCRIPTION new file mode 100644 index 0000000..341c061 --- /dev/null +++ b/DESCRIPTION @@ -0,0 +1,13 @@ +Package: Biopratice +Type: Package +Title: Biopratice +Version: 0.1.0 +Author: Zijing Xie +Maintainer: Zijing Xie +Description: This package provides some functions of converting genbank file to + fasta file, doing alignment between two sequences using dynamic programming + algorithm and download genbank files from NCBI. +License: GPL +Encoding: UTF-8 +LazyData: true +RoxygenNote: 7.1.1 diff --git a/NAMESPACE b/NAMESPACE new file mode 100644 index 0000000..eff168e --- /dev/null +++ b/NAMESPACE @@ -0,0 +1,8 @@ +# Generated by roxygen2: do not edit by hand + +S3method(print,AlignmentResult) +S3method(print,readfasta) +export(download_genbank) +export(global_aln) +export(read.fasta) +export(read.genbank) diff --git a/R/download_genbank.R b/R/download_genbank.R new file mode 100644 index 0000000..b28a60e --- /dev/null +++ b/R/download_genbank.R @@ -0,0 +1,18 @@ +#' @title download_genbank +#' @description This function downloads genbank files from NCBI. +#' @param acc The accession number +#' @param db The database, it can be nuccore, nucest, nucgss or popset +#' @param destfile The path where you want to save the genbank files +#' @export +download_genbank <- function(acc, db, destfile){ + db <- db + db <- paste('db=', db, '&id=', sep = '') + accs <- acc + base_url <- 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?' + for (acc in accs){ + mature_url <- paste(base_url, db, acc, '&rettype=gb', sep = '') + gb.file_name <- paste(acc, '.gb', sep = '') + destfile <- paste(destfile, gb.file_name, sep = '/') + utils::download.file(mature_url,destfile = destfile) + } +} diff --git a/R/global_align.R b/R/global_align.R new file mode 100644 index 0000000..bcc2fb4 --- /dev/null +++ b/R/global_align.R @@ -0,0 +1,96 @@ +#' @title global_aln +#' @description This function reads the fasta files, extract the sequence from +#' it and calculate the proportion of each kind of base in sequence. +#' @param X A sequence to be aligned +#' @param Y A sequence to be aligned +#' @param Scoring_system A vector containing the score of match, mismatch and +#' inserting a gap +#' @export +global_aln <- function(X, Y, Scoring_system){ + Scoring_system <- Scoring_system + names(Scoring_system) <- c('match', 'mismatch', 'gap') + seq1 <- X + seq2 <- Y + seq1 <- unlist(strsplit(seq1, split = '')) + seq2 <- unlist(strsplit(seq2, split = '')) + seq1.len <- length(seq1) + seq2.len <- length(seq2) + score_matrix <- matrix(0, nrow = seq1.len + 1, ncol = seq2.len + 1) + for (Row in 1:(seq1.len+1)){ + for (Col in 1:(seq2.len+1)){ + if (Row == 1){ + if (Col != 1){score_matrix[Row, Col] <- score_matrix[Row, Col-1] + Scoring_system['gap']} + } + else{ + if (Col == 1){score_matrix[Row, Col] <- score_matrix[Row-1, Col] + Scoring_system['gap'];next;} + else{ + insert1 <- score_matrix[Row, Col] <- score_matrix[Row-1, Col] + Scoring_system['gap'] + insert2 <- score_matrix[Row, Col] <- score_matrix[Row, Col-1] + Scoring_system['gap'] + if (seq1[Row-1] == seq2[Col-1]){mis_or_match <- score_matrix[Row-1, Col-1] + Scoring_system['match']} + else{mis_or_match <- score_matrix[Row-1, Col-1] + Scoring_system['mismatch']} + score_matrix[Row, Col] <- max(insert1, insert2, mis_or_match) + } + } + } + } + trace_back <- function(temp_seq1='', temp_seq2='', Row=seq1.len+1, Col=seq2.len+1, aln_result = list()){ + if (Row == 1&Col == 1){ + result <- list(c(temp_seq1, temp_seq2)) + aln_result <- append(aln_result, result) + aln_result + } + else{ + if(Row>1&Col>1){ + if ((seq1[Row-1] == seq2[Col-1])&(score_matrix[Row-1, Col-1] + Scoring_system['match']==score_matrix[Row, Col])){ + Seq1 <- paste(seq1[Row-1], temp_seq1, sep = '') + Seq2 <- paste(seq2[Col-1], temp_seq2, sep = '') + aln_result1 <- trace_back(Seq1, Seq2, Row = Row-1, Col = Col-1) + aln_result <- append(aln_result, aln_result1) + } + if ((seq1[Row-1] != seq2[Col-1])&(score_matrix[Row-1, Col-1] + Scoring_system['mismatch']==score_matrix[Row, Col])){ + Seq1 <- paste(seq1[Row-1], temp_seq1, sep = '') + Seq2 <- paste(seq2[Col-1], temp_seq2, sep = '') + aln_result2 <- trace_back(Seq1, Seq2, Row = Row-1, Col = Col-1) + aln_result <- append(aln_result, aln_result2) + } + if ((score_matrix[Row-1, Col] + Scoring_system['gap']==score_matrix[Row, Col])){ + Seq1 <- paste(seq1[Row-1], temp_seq1, sep = '') + Seq2 <- paste('-', temp_seq2, sep = '') + aln_result3 <- trace_back(Seq1, Seq2, Row = Row-1, Col = Col) + aln_result <- append(aln_result, aln_result3) + } + if ((score_matrix[Row, Col-1] + Scoring_system['gap']==score_matrix[Row, Col])){ + Seq1 <- paste('-', temp_seq1, sep = '') + Seq2 <- paste(seq2[Col-1], temp_seq2, sep = '') + aln_result4<- trace_back(Seq1, Seq2, Row = Row, Col = Col-1) + aln_result <- append(aln_result, aln_result4) + } + } + else{ + if(Row>1){ + Seq1 <- paste(seq1[Row-1], temp_seq1, sep = '') + Seq2 <- paste('-', temp_seq2, sep = '') + aln_result5 <- trace_back(Seq1, Seq2, Row = Row-1, Col = Col) + aln_result <- append(aln_result, aln_result5) + } + if(Col>1){ + Seq1 <- paste('-', temp_seq1, sep = '') + Seq2 <- paste(seq2[Col-1], temp_seq2, sep = '') + aln_result6 <- trace_back(Seq1, Seq2, Row = Row, Col = Col-1) + aln_result <- append(aln_result, aln_result6) + } + } + } + aln_result + } + aln_result <- trace_back() + structure( + list( + Sequences = c(X, Y), + Scoring_system = Scoring_system, + Score_matrix = score_matrix, + Aln_result = aln_result + ), + class = 'AlignmentResult' + ) +} diff --git a/R/packagename-data.R b/R/packagename-data.R new file mode 100644 index 0000000..9b2f504 --- /dev/null +++ b/R/packagename-data.R @@ -0,0 +1,5 @@ +#' @docType data +#' @name Biopratice.test +#' @title A dataset containing two sequences and Scoring_system. +NULL +Biopratice.test <- list("ATTCCAAG", "TTCGAGT", c(4, -3, -4)) diff --git a/R/print.AlignmentResult.R b/R/print.AlignmentResult.R new file mode 100644 index 0000000..82680c8 --- /dev/null +++ b/R/print.AlignmentResult.R @@ -0,0 +1,55 @@ +#' @title print.AlignmentResult +#' @description This function prints the object returned by function +#' global_align. +#' @param x The object returned by function global_align +#' @param ... Other parameter +#' @method print AlignmentResult +#' @export +print.AlignmentResult <- function(x, ...){ + cat("Sequence X:\t") + cat(x$Sequences[1]) + cat('\n') + cat("Sequence Y:\t") + cat(x$Sequences[2]) + cat('\n') + cat('Scoring System:\t') + cat(x$Scoring_system[1]) + cat('\t') + cat('for match;') + cat('\t') + cat(x$Scoring_system[2]) + cat('\t') + cat('for mismatch;') + cat('\t') + cat(x$Scoring_system[3]) + cat('\t') + cat('for gap;') + cat('\t\n\n') + cat('Dynamic programing matrix:\n') + print(x$Score_matrix, ...) + cat('\n') + cat('Alignment:\n') + for (i in 1:length(x$Aln_result)){ + temp_result <- x$Aln_result[[i]] + seq1 <- unlist(strsplit(temp_result[1], split = '')) + seq2 <- unlist(strsplit(temp_result[2], split = '')) + bars <- '' + for ( i in 1:length(seq1)){ + if ((seq1[i]==seq2[i])&(seq1[i]!='-')&(seq2[i]!='-')){ + bars <- paste(bars, '|', sep = '') + } + else{ + bars <- paste(bars, ' ', sep = '') + } + } + bars <- paste(bars, '\n', sep = '') + cat(temp_result[1]) + cat('\n') + cat(bars) + cat(temp_result[2]) + cat('\n\n') + } + cat('Optimum alignment mat:\t') + cat(x$Score_matrix[dim(x$Score_matrix)[1], dim(x$Score_matrix)[2]]) + cat('\n') +} diff --git a/R/print.readfasta.R b/R/print.readfasta.R new file mode 100644 index 0000000..bf1475f --- /dev/null +++ b/R/print.readfasta.R @@ -0,0 +1,50 @@ +#' @title print.readfasta +#' @description This function prints the object returned by function read.fasta. +#' @param x The object returned by function read.fasta +#' @param ... Other parameter pass to print +#' @method print readfasta +#' @export +print.readfasta <- function(x, ...){ + cat(length(x$seq.list)) + cat(' DNA sequence in binary format stored in a list.\n\n') + if (length(x$seq.list) == 1){ + cat('Sequence length:') + cat(x$len) + cat('\n\n') + cat('Label:\n') + cat(x$Labels) + cat('\n\n') + } + else{ + max_len <- max(x$len) + min_len <- min(x$len) + mean_len <- round(mean(x$len), 2) + cat('Mean sequence length: ') + cat(max_len) + cat('\n\t') + cat('Shortest sequence: ') + cat(min_len) + cat('\n\t') + cat('Longest sequence: ') + cat(max_len) + cat('\n\n') + cat('Label:\n') + for (label in x$Labels){ + Index <- which(x$Labels == label) + if (Index >= 4){ + break + } + else{ + cat(label) + cat('\n') + } + } + } + cat('\nBase composition:\n') + if (dim(x$result)[1] >= 4){ + print(x$result[1:4, ]) + } + else{ + print(x$result) + } +} diff --git a/R/read.fasta.R b/R/read.fasta.R new file mode 100644 index 0000000..95972c6 --- /dev/null +++ b/R/read.fasta.R @@ -0,0 +1,40 @@ +#' @title read.fasta +#' @description This function reads the fasta files, extract the sequence from +#' it and calculate the proportion of each kind of base in sequence. +#' @param fa_files A character vector in which all elements are the +#' absolute path of fasta files. +#' @export +read.fasta <- function(fa_files){ + fa_sum <- vector() + for (f in fa_files){ + f_temp <- readLines(f) + fa_sum <- c(fa_sum, f_temp) + } + label.index <- grep('^>', fa_sum) + seq.index <- label.index + 1 + seq.list <- fa_sum[seq.index] + names(seq.list) <- sub('^>(\\w+)\\|?', '\\1', fa_sum[label.index]) + seq.len <- vector() + result <- array(0, dim = c(length(label.index), 17)) + colnames(result) <- c('A', 'T', 'C', 'G', 'U', "R", 'Y', 'K', 'M', 'S', 'W', 'B', 'D', 'H', 'V', 'N', '-') + rownames(result) <- names(seq.list) + for (i in 1:length(seq.list)){ + s <- seq.list[i] + s.list <- unlist(strsplit(toupper(s), split = '')) + s.len <- length(s.list) + names(s.len) <- names(s) + seq.len <- c(seq.len, s.len) + for (b in s.list){ + result[names(s), b] <- result[names(s), b] + 1 + } + result[names(s), ] <- round(result[names(s), ]/s.len, 2) + } + structure( + list(seq.list = seq.list, + result = result, + Labels = sub('>', '', fa_sum[label.index]), + len = seq.len + ), + class = "readfasta" + ) +} diff --git a/R/read.genbank.R b/R/read.genbank.R new file mode 100644 index 0000000..8c21ea7 --- /dev/null +++ b/R/read.genbank.R @@ -0,0 +1,28 @@ +#' @title read.genbank +#' @description This function reads the genbank files, extracts the sequence +#' from it, calculates the proportion of each kind of base in sequence +#' and generates a fasta file which contains the sequence in it. +#' @param gb_files A character vector in which all elements are the +#' absolute path of genbank files +#' @export +read.genbank <- function(gb_files){ + fa_files <- vector() + for (gb_file in gb_files){ + gb <- readLines(gb_file) + acc <- gsub("ACCESSION\\s+", '', gb[grep("ACCESSION", gb)]) + acc_file <- paste(acc, ".fasta", sep = '') + accession <- paste(">", acc, sep = '') + seq <- gb[(grep("ORIGIN", gb)+1):(grep('//', gb)-1)] + seq <- gsub('\\d+', '', seq) + seq <- gsub('\\s+', '', seq) + seq <- paste(seq, collapse = '') + seq.len <- length(unlist(strsplit(seq, split = ''))) + fasta <- c(accession, seq) + gb_path <- unlist(strsplit(gb_file, split = '/')) + fa_path <- paste('/', paste(gb_path[2:(length(gb_path)-1)], collapse = '/'), sep = '') + writeLines(fasta, paste(fa_path, acc_file, sep = '/')) + fa_files <- c(fa_files, paste(fa_path, acc_file, sep = '/')) + } + x <- read.fasta(fa_files) + print(x) +} diff --git a/data/Biopratice.test.RData b/data/Biopratice.test.RData new file mode 100644 index 0000000000000000000000000000000000000000..68a76c87d3db3686aab23b4072ca7ddcb28cff89 GIT binary patch literal 119 zcmb2|=3oE=X6~X+gJ)e2k`fXU(h?FAk`mHbjU*$So$r+BN=QV^XpnJRz@k3kS^23m z`Wjw3Cw=^`YJ2MHF>-M>e>F7VP|$o_z{6QsR#sBtE}YKJwcC1EJHyNxg