Merge pull request #63 from ncn-foreigners/dev

merge from DEV before CRAN submission
ncn-foreigners · Jan 11, 2025 · 14e9dc7 · 14e9dc7
2 parents d98bc0f + e02d28d
commit 14e9dc7
Show file tree

Hide file tree

Showing 42 changed files with 2,898 additions and 2,631 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -45,7 +45,8 @@ Imports:
     nleqslv,
     doParallel,
     foreach,
-    parallel
+    parallel,
+    formula.tools
 LinkingTo: 
     Rcpp,
     RcppArmadillo

diff --git a/NAMESPACE b/NAMESPACE
@@ -8,8 +8,10 @@ S3method(deviance,nonprobsvy)
 S3method(hatvalues,nonprobsvy)
 S3method(logLik,nonprobsvy)
 S3method(nobs,nonprobsvy)
+S3method(nonprobsvycheck,nonprobsvy)
 S3method(pop.size,nonprobsvy)
 S3method(print,nonprobsvy)
+S3method(print,nonprobsvycheck)
 S3method(print,summary_nonprobsvy)
 S3method(residuals,nonprobsvy)
 S3method(summary,nonprobsvy)
@@ -21,6 +23,7 @@ export(controlSel)
 export(genSimData)
 export(logit_model_nonprobsvy)
 export(nonprob)
+export(nonprobsvycheck)
 export(pop.size)
 export(probit_model_nonprobsvy)
 import(Rcpp)
@@ -34,13 +37,16 @@ importFrom(Rcpp,sourceCpp)
 importFrom(doParallel,registerDoParallel)
 importFrom(foreach,"%dopar%")
 importFrom(foreach,foreach)
+importFrom(formula.tools,lhs.vars)
+importFrom(formula.tools,rhs.vars)
 importFrom(maxLik,maxLik)
 importFrom(ncvreg,cv.ncvreg)
 importFrom(nleqslv,nleqslv)
 importFrom(parallel,makeCluster)
 importFrom(parallel,stopCluster)
 importFrom(stats,AIC)
 importFrom(stats,BIC)
+importFrom(stats,aggregate)
 importFrom(stats,as.formula)
 importFrom(stats,binomial)
 importFrom(stats,coef)
@@ -63,11 +69,13 @@ importFrom(stats,model.frame)
 importFrom(stats,model.matrix)
 importFrom(stats,model.response)
 importFrom(stats,nobs)
+importFrom(stats,plogis)
 importFrom(stats,pnorm)
 importFrom(stats,predict)
 importFrom(stats,predict.glm)
 importFrom(stats,printCoefmat)
 importFrom(stats,pt)
+importFrom(stats,qlogis)
 importFrom(stats,qnorm)
 importFrom(stats,rbinom)
 importFrom(stats,rchisq)
@@ -77,6 +85,7 @@ importFrom(stats,rexp)
 importFrom(stats,rnorm)
 importFrom(stats,runif)
 importFrom(stats,sd)
+importFrom(stats,setNames)
 importFrom(stats,summary.glm)
 importFrom(stats,terms)
 importFrom(stats,uniroot)
@@ -85,6 +94,7 @@ importFrom(stats,var)
 importFrom(stats,vcov)
 importFrom(stats,weighted.mean)
 importFrom(survey,as.svrepdesign)
+importFrom(survey,svyby)
 importFrom(survey,svymean)
 importFrom(survey,svyrecvar)
 importFrom(survey,svytotal)

diff --git a/NEWS.md b/NEWS.md
@@ -4,6 +4,10 @@
 
 ### Features
 - two additional datasets have been included: `jvs` (Job Vacancy Survey; a probability sample survey) and `admin` (Central Job Offers Database; a non-probability sample survey). The units and auxiliary variables have been aligned in a way that allows the data to be integrated using the methods implemented in this package.
+- a `nonprobsvycheck` function was added to check the balance in the totals of the variables based on the weighted weights between the non-probability and probability samples.
+
+### Bugfixes
+- basic methods and functions related to variance estimation, weights and probability linking methods have been rewritten in a more optimal and readable way.
 
 # nonprobsvy 0.1.1
 

diff --git a/R/beta_funcs.R b/R/beta_funcs.R
diff --git a/R/bias_correction_ipw.R b/R/bias_correction_ipw.R
@@ -32,50 +32,6 @@ mm <- function(X,
     par0 <- start
   }
   prior_weights <- c(weights_rand, weights)
-
-  # MI - bias correction #########
-  # multiroot <- nleqslv::nleqslv( # TODO to fix "Jacobian is completely unusable (all zero entries?)"
-  #   x = rep(0, p), # TODO add user-specified parameters to control functions
-  #   fn = u_beta_mi,# TODO algorithm did not converge in maxit iterations for cloglog
-  #   R = R,
-  #   X = X,
-  #   y = y,
-  #   weights = prior_weights,
-  #   family_nonprobsvy = family
-  # )
-  # print(multiroot$x)
-  ##########
-
-  # IPW - bias correction #########
-  # multiroot <- nleqslv::nleqslv(
-  #   x = rep(0, p), # TODO add user-specified parameters to control functions
-  #   fn = u_theta_ipw,
-  #   method = "Newton", # TODO consider the method Broyden
-  #   global = "qline", # c("dbldog", "pwldog", cline", "qline", "gline", "hook", "none")
-  #   xscalm = "fixed", # c("fixed","auto")
-  #   jacobian = TRUE,
-  #   control = list(scalex = rep(1, length(rep(0, p)))), # TODO algorithm did not converge in maxit iterations for cloglog
-  #   R = R,
-  #   X = X,
-  #   y = y,
-  #   weights = weights,
-  #   method_selection = method_selection
-  # )
-  # print(multiroot$x)
-  ##########
-
-  ######### BB
-  # multiroot <- nleqslv::nleqslv(
-  #   par = par0, # TODO add user-specified parameters to control functions
-  #   fn = u_theta_beta_dr,
-  #   R = R,
-  #   X = X,
-  #   y = y,
-  #   weights = prior_weights,
-  #   method_selection = method_selection,
-  #   family_nonprobsvy = family
-  # )
-  # par_sel <- multiroot$par
   ######### NLESQLV
   multiroot <- nleqslv::nleqslv(
     x = par0,

diff --git a/R/boot_ipw.R b/R/boot_ipw.R
@@ -280,14 +280,6 @@ bootIPW_multicore <- function(X_rand,
         ps_nons <- est_method_obj$ps_nons
         weights_nons <- 1 / ps_nons
         N_est_nons <- ifelse(is.null(pop_size), sum(weights[strap_nons] * weights_nons), pop_size)
-
-        # mu_hat_boot <- mu_hatIPW(
-        #   y = y[strap_nons],
-        #   weights = weights[strap_nons],
-        #   weights_nons = weights_nons,
-        #   N = N_est_nons
-        # ) # IPW estimator
-
         mu_hats_this_boot <- numeric(mu_len)
 
         for (l in 1:mu_len) {
@@ -321,13 +313,6 @@ bootIPW_multicore <- function(X_rand,
 
         weights_nons <- 1 / ps_nons
         N_est_nons <- ifelse(is.null(pop_size), sum(weights_strap * weights_nons), pop_size)
-
-        # mu_hat_boot <- mu_hatIPW(
-        #   y = y[strap],
-        #   weights = weights_strap,
-        #   weights_nons = weights_nons,
-        #   N = N_est_nons
-        # ) # IPW estimator
         for (l in 1:mu_len) {
           mu_hats_boot[k, l] <- mu_hatIPW(
             y = ys[[l]][strap],
@@ -341,8 +326,6 @@ bootIPW_multicore <- function(X_rand,
     }
   )
   mu_hats_boot <- matrix(mu_hats_boot, nrow = num_boot, ncol = mu_len, byrow = TRUE)
-  # mu_hats_boot_means <- colMeans(mu_hats_boot)
-  # boot_var <- 1 / (num_boot - 1) * sum((mu_hats - mu_hat_boot)^2)
   for (l in 1:mu_len) {
     boot_vars[l] <- 1 / (num_boot - 1) * sum((mu_hats_boot[, l] - mu_hats[l])^2)
   }

diff --git a/R/boot_mi.R b/R/boot_mi.R
@@ -163,12 +163,6 @@ bootMI <- function(X_rand,
             weights_strap <- weights[strap]
             X_nons_strap <- X_nons[strap, , drop = FALSE]
             y_strap <- y[strap]
-
-            # strap_rand <- sample.int(replace = TRUE, n = n_rand, prob = 1/weights_rand)
-            # weights_rand_strap <- weights_rand[strap_rand]
-            # X_rand_strap <- X_rand[strap_rand, , drop = FALSE]
-            # N_strap <- sum(weights_rand_strap)
-
             # using svy package
             strap_rand_svy <- which(rep_weights[, k] != 0)
             weights_rand_strap_svy <- rep_weights[, k] * weights_rand
@@ -276,8 +270,6 @@ bootMI <- function(X_rand,
             mu_hat_boot <- as.vector(y_strap_rand)
             mu_hats[k] <- mu_hat_boot
             if (verbose) {
-              # info <- paste("iteration ", k, "/", num_boot, ", estimated mean = ", mu_hat_boot, sep = "")
-              # print(info)
               setTxtProgressBar(pb, k)
             }
             k <- k + 1
@@ -309,8 +301,6 @@ bootMI <- function(X_rand,
             mu_hat_boot <- mean(y_strap[model_rand$nn.idx])
             mu_hats[k] <- mu_hat_boot
             if (verbose) {
-              # info <- paste("iteration ", k, "/", num_boot, ", estimated mean = ", mu_hat_boot, sep = "")
-              # print(info)
               utils::setTxtProgressBar(pb, k)
             }
             k <- k + 1
@@ -367,21 +357,9 @@ bootMI <- function(X_rand,
                 )
               }
             )
-            #
-            # model_rand <- nonprobMI_nn(
-            #   data = y_strap_nons,
-            #   query = y_strap_rand,
-            #   k = control_outcome$k,
-            #   treetype = control_outcome$treetype,
-            #   searchtype = control_outcome$searchtype
-            # )
-
-
             mu_hat_boot <- mean(y_strap[model_rand$nn.idx])
             mu_hats[k] <- mu_hat_boot
             if (verbose) {
-              # info <- paste("iteration ", k, "/", num_boot, ", estimated mean = ", mu_hat_boot, sep = "")
-              # print(info)
               utils::setTxtProgressBar(pb, k)
             }
             k <- k + 1
@@ -396,7 +374,6 @@ bootMI <- function(X_rand,
       }
     }
   }
-  # mu_hat_boot <- mean(mu_hats)
   if (method == "pmm") {
     if (nn_exact_se) {
       comp2 <- pmm_exact(pi_ij,
@@ -499,7 +476,6 @@ bootMI_multicore <- function(X_rand,
           strap_rand_svy <- which(rep_weights[, k] != 0)
           weights_rand_strap_svy <- rep_weights[, k] * weights_rand
           N_strap <- sum(weights_rand_strap_svy)
-          # X_rand_strap <- X_rand[which(rep_weights[,k] != 0),]
 
           model_strap <- stats::glm.fit(
             x = X_nons_strap,
@@ -525,11 +501,6 @@ bootMI_multicore <- function(X_rand,
           X_nons_strap <- X_nons[strap, , drop = FALSE]
           y_strap <- y[strap]
 
-          # strap_rand <- sample.int(replace = TRUE, n = n_rand, prob = 1/weights_rand)
-          # weights_rand_strap <- weights_rand[strap_rand]
-          # X_rand_strap <- X_rand[strap_rand, , drop = FALSE]
-          # N_strap <- sum(weights_rand_strap)
-
           # using svy package
           strap_rand_svy <- which(rep_weights[, k] != 0)
           weights_rand_strap_svy <- rep_weights[, k] * weights_rand
@@ -546,7 +517,6 @@ bootMI_multicore <- function(X_rand,
           )
           y_rand_strap <- apply(model_rand$nn.idx, 1,
             FUN = \(x) mean(y_strap[x])
-            # FUN=\(x) mean(sample_nonprob$short_[x])
           )
           weighted.mean(x = y_rand_strap, w = weights_strap_rand)
         }
@@ -561,11 +531,6 @@ bootMI_multicore <- function(X_rand,
           X_nons_strap <- X_nons[strap, , drop = FALSE]
           y_strap <- y[strap]
 
-          # strap_rand <- sample.int(replace = TRUE, n = n_rand, prob = 1/weights_rand)
-          # weights_rand_strap <- weights_rand[strap_rand]
-          # X_rand_strap <- X_rand[strap_rand, , drop = FALSE]
-          # N_strap <- sum(weights_rand_strap)
-
           # using svy package
           strap_rand_svy <- which(rep_weights[, k] != 0)
           weights_rand_strap_svy <- rep_weights[, k] * weights_rand
@@ -612,7 +577,6 @@ bootMI_multicore <- function(X_rand,
 
           y_rand_strap <- apply(model_rand$nn.idx, 1,
             FUN = \(x) mean(y_strap[x])
-            # FUN=\(x) mean(sample_nonprob$short_[x])
           )
           weighted.mean(x = y_rand_strap, w = weights_strap_rand)
         }
@@ -640,8 +604,6 @@ bootMI_multicore <- function(X_rand,
           beta <- model_strap$coefficients
           eta <- pop_totals %*% beta / N
           y_strap_rand <- family_nonprobsvy$linkinv(eta)
-
-          # mu_hat_boot <- mu_hatMI(ystrap_rand, weights_rand_strap_svy, N_strap)
           as.vector(y_strap_rand)
         }
       )