forked from tm4ss/tm4ss.github.io
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcalculateLogLikelihood.R
30 lines (23 loc) · 1.01 KB
/
calculateLogLikelihood.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
#' Signed log-likelihood score per term, target counts vs. comparison counts
#'
#' For every term named in `termCountsTarget`, computes
#' `2 * (a * log(a / E1) + b * log(b / E2))`, where `a` and `b` are the term's
#' counts in the target and comparison vectors and `E1`, `E2` are the counts
#' expected if the term were spread proportionally to the two vector totals.
#' Terms whose relative frequency is lower in the target than in the
#' comparison get a negative sign.
#'
#' @param termCountsTarget Named numeric vector of term counts (target).
#' @param termCountsComparison Named numeric vector of term counts
#'   (comparison). Terms missing here are treated as having count 0.
#' @param minSignificance Scores whose absolute value is below this threshold
#'   are set to 0. The default 6.63 is presumably the chi-squared critical
#'   value for p < 0.01 at one degree of freedom -- confirm against the
#'   method reference before relying on that interpretation.
#' @return Named numeric vector with one score per target term: positive for
#'   overused terms, negative for underused terms, 0 for terms below the
#'   threshold.
calculateLogLikelihood <- function(termCountsTarget, termCountsComparison, minSignificance = 6.63) {
  # Terms that occur only in the target get an explicit zero count in the
  # comparison so both vectors can be indexed by the same names.
  uniqueTerms <- setdiff(names(termCountsTarget), names(termCountsComparison))
  zeroCounts <- rep(0, length(uniqueTerms))
  names(zeroCounts) <- uniqueTerms
  termCountsComparison <- c(termCountsComparison, zeroCounts)

  termsToCompare <- intersect(names(termCountsTarget), names(termCountsComparison))

  observedTarget <- termCountsTarget[termsToCompare]
  observedComparison <- termCountsComparison[termsToCompare]
  totalTarget <- sum(termCountsTarget)
  totalComparison <- sum(termCountsComparison)
  grandTotal <- totalTarget + totalComparison

  expectedTarget <- totalTarget * (observedTarget + observedComparison) / grandTotal
  expectedComparison <- totalComparison * (observedTarget + observedComparison) / grandTotal

  # observed * log(observed / expected), with the convention 0 * log(0) = 0.
  # Forcing the ratio to 1 for zero counts also covers expected == 0, where
  # 0 / 0 would otherwise produce NaN.
  llTerm <- function(observed, expected) {
    ratio <- observed / expected
    ratio[observed == 0] <- 1
    observed * log(ratio)
  }

  logLikelihood <- 2 * (
    llTerm(observedTarget, expectedTarget) +
      llTerm(observedComparison, expectedComparison)
  )

  # Compare relative frequencies to mark underuse. which() drops NA
  # comparisons (e.g. an empty comparison vector gives 0 / 0), which would
  # otherwise make the subscripted assignment fail.
  relTarget <- observedTarget / totalTarget
  relComparison <- observedComparison / totalComparison
  underused <- which(relTarget < relComparison)
  logLikelihood[underused] <- -logLikelihood[underused]

  # Threshold on magnitude so that significantly underused (negative) scores
  # are kept instead of being wiped out together with the insignificant ones.
  logLikelihood[abs(logLikelihood) < minSignificance] <- 0

  logLikelihood
}