Resolve conflict

Merge branch 'add-wordvector' of https://github.com/koheiw/LSX into add-wordvector # Conflicts: # NEWS.md
koheiw · Dec 13, 2024 · e118750 · e118750
2 parents 614b62f + 6600d4a
commit e118750
Show file tree

Hide file tree

Showing 6 changed files with 25 additions and 10 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: LSX
 Type: Package
 Title: Semi-Supervised Algorithm for Document Scaling
-Version: 1.4.1
+Version: 1.4.2
 Authors@R: person("Kohei", "Watanabe", email = "[email protected]", role = c("aut", "cre", "cph"))
 Description: A word embeddings-based semi-supervised model for document scaling Watanabe (2020) <doi:10.1080/19312458.2020.1832976>.
     LSS allows users to analyze large and complex corpora on arbitrary dimensions with seed words exploiting efficiency of word embeddings (SVD, Glove).

diff --git a/NEWS.md b/NEWS.md
@@ -2,6 +2,8 @@
 
 * Add `as.textmodel_lss()` for objects from the **wordvector** package.
 * Reduce dependent packages by moving **rsparse**, **irlba** and **rsvd** to Suggests.
+* Fix handling of phrasal patterns in `textplot_terms()`.
+* Improve objects created by `as.textmodel_lss.textmodel_lss()`.
 
 ## Changes in v1.4.1
 

diff --git a/R/as.textmodel.R b/R/as.textmodel.R
@@ -63,7 +63,8 @@ as.textmodel_lss.matrix <- function(x, seeds,
         seeds_weighted = seed,
         embedding = x,
         similarity = simil$seed,
-        call = try(match.call(sys.function(-1), call = sys.call(-1)), silent = TRUE)
+        call = try(match.call(sys.function(-1), call = sys.call(-1)), silent = TRUE),
+        version = utils::packageVersion("LSX")
     )
     return(result)
 }
@@ -93,6 +94,7 @@ as.textmodel_lss.textmodel_lss <- function(x, ...) {
     if (is.null(x$embedding))
         stop("x must be a valid textmodel_lss object")
     result <- as.textmodel_lss(x$embedding, ...)
+    result$concatenator <- x$concatenator
     result$data <- x$data
     result$frequency <- x$frequency[names(result$beta)]
     return(result)

diff --git a/R/textplot.R b/R/textplot.R
@@ -98,8 +98,19 @@ textplot_terms.textmodel_lss <- function(x, highlighted = NULL,
             case_insensitive = TRUE,
             concatenator = concatenator
         )
+
+        # flag nested patterns (see quanteda::dfm_lookup)
+        if (length(ids)) {
+          m <- factor(names(ids), levels = unique(names(ids)))
+          dup <- unlist(lapply(split(ids, m), duplicated), use.names = FALSE)
+        } else {
+          dup <- logical()
+        }
+
         key <- attr(ids, "key")
+        ids <- ids[lengths(ids) == 1 & !dup] # drop phrasal and nested patterns
         id <- unlist(ids)
+
         if (!is.null(key) && !is.null(id)) {
             temp$group <- factor(names(id[match(temp$id, id)]), levels = key)
         } else {

diff --git a/tests/testthat/test-as.textmodel.R b/tests/testthat/test-as.textmodel.R
@@ -75,7 +75,7 @@ test_that("as.textmodel_lss works with textmodel_lss", {
     expect_equal(lss$embedding, lss_test$embedding)
     expect_identical(lss$data, lss_test$data)
     expect_identical(lss$frequency, lss_test$frequency)
-    expect_identical(names(lss$frequency), names(lss$frequency))
+    expect_identical(lss$concatenator, lss_test$concatenator)
 
     expect_error(
         as.textmodel_lss(lss_test, seed, slice = 100),

diff --git a/tests/testthat/test-textplot.R b/tests/testthat/test-textplot.R
@@ -100,18 +100,18 @@ test_that("textplot_terms works even when frequency has zeros (#85)", {
 })
 
 test_that("textplot_terms works with dictionary", {
-    toks <- tokens_compound(toks_test, data_dictionary_LSD2015)
-    dfmt <- dfm(toks) %>%
-        dfm_subset(Year > 2000)
+
+    dict <- dictionary(list("american" = c("american *"),
+                            "president" = c("president *")))
+    toks <- tokens_subset(toks_test, Year > 2000) %>%
+      tokens_compound(dict)
+    dfmt <- dfm(toks)
     seed <- c("nice*" = 1, "positive*" = 1, "bad*" = -1, "negative*" = -1)
     suppressWarnings(
         lss <- textmodel_lss(dfmt, seed, k = 10)
     )
     expect_silent(print(
-        textplot_terms(lss, data_dictionary_LSD2015, max_highlighted = 10)
-    ))
-    expect_silent(print(
-        textplot_terms(lss, dictionary(list(phrase = "hard work")))
+        textplot_terms(lss, dict, max_highlighted = 10)
     ))
     expect_silent(print(
         textplot_terms(lss, dictionary(list(none = "xxxxx")))