Skip to content

Commit

Permalink
Pushing code that I forgot to push sooner
Browse files Browse the repository at this point in the history
  • Loading branch information
Kris Alavattam committed May 2, 2023
1 parent 99a75e2 commit 5d370c5
Show file tree
Hide file tree
Showing 2 changed files with 613 additions and 0 deletions.
129 changes: 129 additions & 0 deletions results/2023-0215/tutorial_extract-non-overlapping-regions.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
#!/bin/env Rscript

# tutorial_extract-non-overlapping-regions.R
# AntoniosK, me
# 2015-0812

# - See stackoverflow.com/questions/31968890/how-to-extract-non-overlapping-ranges-of-field-region-using-r
# - See stackoverflow.com/questions/16957293/collapse-intersecting-regions


# Initialize functions -------------------------------------------------------
`%+=%` <- function(x, y) eval.parent(substitute(x <- x + y))
# stackoverflow.com/questions/5738831/r-plus-equals-and-plus-plus-equivalent-from-c-c-java-etc


detect_overlap <- function(x, y) {
# Detect overlaps
# :param x: row number <numeric>
# :param y: range start and end columns/vectors <numeric>
# :return z: Boolean <0 or 1>
z <- ifelse(x[2] >= y[1], 1, 0)
return(z)
}


# delete_overlap <- function(x, y) {
# # Delete row that overlaps previous row from dataframe/tibble
# # :param x: dataframe/tibble
# # :param y: range start and end columns/vectors <numeric>
# # :return x: dataframe/tibble sans rows that overlapped previous row
# i <- 2
# while(i <= nrow(x)) {
# # i <- 2
# tmp <- x
# if(detect_overlap(tmp[i - 1, y], tmp[i, y]) == 1) {
# tmp[i, ] <- NA
# x <- tmp[complete.cases(tmp), ]
# } else {
# x <- tmp
# i %+=% 1
# }
# }
#
# return(x)
# }


# Initialize example dataframe -----------------------------------------------
df <- data.frame(
seqname = c(rep("I", 4), rep("II", 5)),
start = c(1, 20, 101, 220, 1, 100, 130, 300, 301),
end = c(100, 120, 200, 280, 50, 150, 180, 400, 350)
)

# order data based on minimum value of range (in case you don't have an order already)
df <- df[order(df$seqname, df$start, df$end), ]
df

# region start end
# 1 I 1 100
# 2 I 20 120
# 3 I 101 200
# 4 I 220 280
# 5 II 1 50
# 6 II 100 150
# 7 II 130 180
# 8 II 300 400
# 9 II 301 350


# Compares each row with the previous one, deleting rows that overlap --------
# Without accounting for chr/seqname ---------------------
# i <- 2 # Set starting row
# while(i <= nrow(df)) {
# # i <- 4
# df_tmp <- df
# if(overlap(df_tmp[i - 1, 2:3], df_tmp[i, 2:3]) == 1) {
# df_tmp[i, ] <- NA
# df <- df_tmp[complete.cases(df_tmp), ]
# } else {
# df <- df_tmp
# i %+=% 1
# }
# }
# df

# With accounting for chr/seqname ------------------------
out <- plyr::ddply(
df,
"seqname",
function(x) {
i <- 2
while(i <= nrow(x)) {
# i <- 2
tmp <- x
if(detect_overlap(tmp[i - 1, 2:3], tmp[i, 2:3]) == 1) {
tmp[i, ] <- NA
x <- tmp[complete.cases(tmp), ]
} else {
x <- tmp
i %+=% 1
}
}

return(x)
}
)


out
# seqname start end
# 1 I 1 100
# 2 I 101 200
# 3 I 220 280
# 4 II 1 50
# 5 II 100 150
# 6 II 300 400

df
# seqname start end
# 1 I 1 100
# 2 I 20 120
# 3 I 101 200
# 4 I 220 280
# 5 II 1 50
# 6 II 100 150
# 7 II 130 180
# 8 II 300 400
# 9 II 301 350
Loading

0 comments on commit 5d370c5

Please sign in to comment.