ncn-foreigners / nonprobsvy Goto Github PK
View Code? Open in Web Editor NEWAn R package for modern methods for non-probability surveys
Home Page: https://ncn-foreigners.github.io/nonprobsvy/
License: Other
An R package for modern methods for non-probability surveys
Home Page: https://ncn-foreigners.github.io/nonprobsvy/
License: Other
In this issue I provide links for empirical likelihood method that can be implemented in the package. One may consider using own code or momentfit:: Wu_lam
or other.
Issues to add/consider for the next version of the package:
xgboost
model.svrep
(bootstrap
weighting) to the functionality of the package.DR
and MI
estimator (when NN
method for mass imputation)nonprobsvy
object).pop_totals
.set.seed(2023-10-19)
n_reps <- 500
N <- 100000
n <- 1000
x1 <- rnorm(N,1,1)
x2 <- rexp(N,1)
alp <- rnorm(N)
epsilon <- rnorm(N)
y11 <- 1 + x1 + x2 + alp + epsilon
y12 <- 0.5*(x1-1.5)^2 + x2^2 + alp + epsilon
y21 <- rbinom(N,1,plogis(1 + x1 + x2 + alp))
y22 <- rbinom(N,1,plogis(0.5*(x1-1.5)^2 + x2^2 + alp))
p1 <- plogis(x2)
p2 <- plogis(-3+(x1-1.5)^2+(x2-2)^2)
pop_data <- data.frame(x1,x2,y11,y12,y21,y22,p1,p2) |> setDT()
pop_data[, x1k := x1/N]
pop_data[, x2k := x2/N]
p_quantiles1 <- seq(0.25, 0.75, 0.25)
p_quantiles2 <- seq(0.1, 0.9, 0.1)
sample_prob <- pop_data[sample(1:N, n),]
sample_prob$w <- N/n
sample_prob_svy <- svydesign(ids=~1, weights = ~w, data = sample_prob)
## match population totals (should we calibrate x1_* and x2_* so they exactly match 0.25)
sample_prob_svy_cal <- calibrate(sample_prob_svy, ~ x1 + x2,
c(`(Intercept)` = N, x1=sum(x1), x2=sum(x2)))
sample_bd1 <- pop_data[rbinom(N,1,pop_data$p1)==1, ]
target
argument.bd1_ipw_0 <- nonprob(selection = ~ x1 + x2,
target = ~ y11 + y12 + y21 + y22,
svydesign = sample_prob_svy,
data = sample_bd1,
se = TRUE,
control_inference = controlInf(var_method = "bootstrap", num_boot = 10),
verbose = T,
control_selection = controlSel(est_method_sel = "gee", h = 1, start_type = "naive"))
Results
[1] "iteration 1/10, estimated mean = 2.85205174317731"
[1] "iteration 2/10, estimated mean = 2.96401796345292"
[1] "iteration 3/10, estimated mean = 2.94871299846419"
[1] "iteration 4/10, estimated mean = 3.01392624049592"
[1] "iteration 5/10, estimated mean = 2.94170984330577"
[1] "iteration 6/10, estimated mean = 2.9224964117775"
[1] "iteration 7/10, estimated mean = 2.93534474976417"
[1] "iteration 8/10, estimated mean = 2.93058020856146"
[1] "iteration 9/10, estimated mean = 2.90403964012494"
[1] "iteration 10/10, estimated mean = 2.93133613697437"
[1] "iteration 1/10, estimated mean = 2.67422366432402"
[1] "iteration 2/10, estimated mean = 2.62639389897099"
[1] "iteration 3/10, estimated mean = 2.55918335738336"
If I provide more than one y
in MI or DR than the summary is different and the resulting object outcome
contains only one model (I don't know which one).
library(data.table)
library(nonprobsvy)
set.seed(2023-10-19)
n_reps <- 50
N <- 50000
n <- 1000
x1 <- rnorm(N,1,1)
x2 <- rexp(N,1)
alp <- rnorm(N)
epsilon <- rnorm(N)
y11 <- 1 + x1 + x2 + alp + epsilon
y12 <- 0.5*(x1-1.5)^2 + x2^2 + alp + epsilon
y21 <- rbinom(N,1,plogis(1 + x1 + x2 + alp))
y22 <- rbinom(N,1,plogis(0.5*(x1-1.5)^2 + x2^2 + alp))
p1 <- plogis(x2)
p2 <- plogis(-3+(x1-1.5)^2+(x2-2)^2)
pop_data <- data.frame(x1,x2,y11,y12,y21,y22,p1,p2) |> setDT()
sample_prob <- pop_data[sample(1:N, n),]
sample_prob$w <- N/n
sample_prob_svy <- svydesign(ids=~1, weights = ~w, data = sample_prob)
sample_bd1 <- pop_data[rbinom(N,1,pop_data$p1)==1, ]
sample_bd1$w_naive <- N/nrow(sample_bd1)
sample_bd2 <- pop_data[rbinom(N,1,pop_data$p2)==1, ]
sample_bd2$w_naive <- N/nrow(sample_bd2)
m1 <- nonprob(outcome = y11 + y12 ~ x1 + x2, data=sample_bd2, svydesign=sample_prob_svy)
m1$outcome
In case of DR is the same. Only one set of selection and outcome coefficients is reported.
m2 <- nonprob(selection = ~ x1 + x2, outcome = y11 + y12 ~ x1 + x2, data=sample_bd2, svydesign=sample_prob_svy)
maxit
is not passed to nleqslv:: nleqslv
nleqslv
and other functions.An example code for NN in readme is:
nonprob(
outcome = y ~ x1 + x2 + ... + xk,
data = nonprob,
svydesign = prob,
method_outcome = "nn",
family_outcome = "gaussian",
control_outcome = controlOutcome(k = 2)
)
with controlOutcome
function instead of controlOut
.
Problem: summary
does not work correctly for pmm
and bootstrap
returns different point estimates than non-bootstrap approach.
Code:
library(data.table)
library(nonprobsvy)
set.seed(2023-10-19)
n_reps <- 50
N <- 50000
n <- 1000
x1 <- rnorm(N,1,1)
x2 <- rexp(N,1)
alp <- rnorm(N)
epsilon <- rnorm(N)
y11 <- 1 + x1 + x2 + alp + epsilon
y12 <- 0.5*(x1-1.5)^2 + x2^2 + alp + epsilon
y21 <- rbinom(N,1,plogis(1 + x1 + x2 + alp))
y22 <- rbinom(N,1,plogis(0.5*(x1-1.5)^2 + x2^2 + alp))
p1 <- plogis(x2)
p2 <- plogis(-3+(x1-1.5)^2+(x2-2)^2)
pop_data <- data.frame(x1,x2,y11,y12,y21,y22,p1,p2) |> setDT()
sample_prob <- pop_data[sample(1:N, n),]
sample_prob$w <- N/n
sample_prob_svy <- svydesign(ids=~1, weights = ~w, data = sample_prob)
sample_bd1 <- pop_data[rbinom(N,1,pop_data$p1)==1, ]
sample_bd1$w_naive <- N/nrow(sample_bd1)
sample_bd2 <- pop_data[rbinom(N,1,pop_data$p2)==1, ]
sample_bd2$w_naive <- N/nrow(sample_bd2)
m1 <- nonprob(outcome = y11 ~ x1 + x2, data=sample_bd2, svydesign=sample_prob_svy, method_outcome = "pmm")
m2 <- nonprob(outcome = y11 ~ x1 + x2, data=sample_bd2, svydesign=sample_prob_svy, method_outcome = "pmm",
control_inference = controlInf(var_method = "bootstrap", num_boot = 20))
Errors regarding summary
> summary(m1)
Error in if (attr(k, "glm")) { : argument is of length zero
> summary(m2)
Error in if (attr(k, "glm")) { : argument is of length zero
Potential errors with bootstrap
> m1$output
mean SE
y11 2.912592 0.04656599
> m2$output
mean SE
y11 2.592961 0.03303442
> mean(sample_bd2$y11)
[1] 2.592326
Error comes from
y_rand_pred <- stats::predict.glm(model_out$glm, newdata = model_frame, type = "response")
- line 40 glm.R
Reason - model_frame stores vector
instead of data.frame
.
File tree of the generated error -> glm.R#40 - nonprobDR.R#454 - nonprob.R#77.
Potential solution is to transform output of model_frame
function in data_manip.R file
Generate data
set.seed(123)
N <- 10000
n <- 1000
x1 <- rnorm(n = N, mean = 1, sd = 1)
x2 <- rexp(n = N, rate = 1)
epsilon <- rnorm(n = N) # rnorm(N)
y1 <- 1 + x1 + x2 + epsilon
y2 <- 0.5*(x1 - 0.5)^2 + x2 + epsilon
p1 <- exp(x2)/(1+exp(x2))
p2 <- exp(-0.5+0.5*(x2-2)^2)/(1+exp(-0.5+0.5*(x2-2)^2))
population <- data.frame(x1,x2,y1,y2,p1,p2)
flag_bd1 <- rbinom(n = N, size = 1, prob = population$p1)
flag_srs <- sample(x = 1:N, size = n)
source_nonprob <- population[flag_bd1 == 1, ]
source_prob <- svydesign(ids = ~ 1, data = population[flag_srs, ], weights = rep(N/n, n))
source_prob_cal <- calibrate(source_prob, formula = ~x1+x2, c(`(Intercept)`= N, x1 = sum(populacja$x1), x2 = sum(populacja$x2))) ## calibration
Using source_prob
works smoothly
test1a <- nonprob(selection = ~ x1+ x2,
target = ~ y1,
data = source_nonprob,
svydesign = source_prob)
but using source_prob_cal
causes the following error
test1b <- nonprob(selection = ~ x1+ x2,
target = ~ y1,
data = source_prob_cal,
svydesign = source_prob)
Error in as.data.frame.default(x[[i]], optional = TRUE, stringsAsFactors = stringsAsFactors) :
cannot coerce class 'c("survey.design2", "survey.design")' to a data.frame
For the IPW estimators (MLE, GEE) as well as DR we should also add information on the estimated totals based on the IPW weights, pop_totals (estimated or provided) and maybe diff between? Something that may give information on the quality of the weights.
For instance:
library(survey)
library(nonprobsvy)
set.seed(1234567890)
N <- 1e6 ## 1000000
n <- 1000
x1 <- rnorm(n = N, mean = 1, sd = 1)
x2 <- rexp(n = N, rate = 1)
epsilon <- rnorm(n = N) # rnorm(N)
y1 <- 1 + x1 + x2 + epsilon
y2 <- 0.5*(x1 - 0.5)^2 + x2 + epsilon
p1 <- exp(x2)/(1+exp(x2))
p2 <- exp(-0.5+0.5*(x2-2)^2)/(1+exp(-0.5+0.5*(x2-2)^2))
flag_bd1 <- rbinom(n = N, size = 1, prob = p1)
flag_srs <- as.numeric(1:N %in% sample(1:N, size = n))
base_w_srs <- N/n
population <- data.frame(x1,x2,y1,y2,p1,p2,base_w_srs, flag_bd1, flag_srs)
base_w_bd <- N/sum(population$flag_bd1)
sample_prob <- svydesign(ids= ~1, weights = ~ base_w_srs,
data = subset(population, flag_srs == 1))
result_ipw <- nonprob(
selection = ~ x1+x2,
target = ~y1,
data = subset(population, flag_bd1 == 1),
svydesign = sample_prob
)
## estimated totals from the IPW
ipw_totals <- colSums(result_ipw$X[(n+1):nrow(result_ipw$X),]*result_ipw$weights)
## estimated totals from the population based on sample
prob_totals <- svytotal(~x1+x2, sample_prob)
##diff
ipw_totals-c(sum(weights(sample_prob)), prob_totals)
which gives
> ipw_totals-c(sum(weights(sample_prob)), prob_totals)
(Intercept) x1 x2
25687.1268 10626.6271 -397.9258
and calibrated IPW
## calibrated
result_ipw_gee <- nonprob(
selection = ~ x1+x2,
target = ~y1,
data = subset(population, flag_bd1 == 1),
svydesign = sample_prob,
control_selection = controlSel(est_method_sel = "gee", h = 1)
)
## estimated totals from the IPW
ipw_gee_totals <- colSums(result_ipw_gee$X[(n+1):nrow(result_ipw_gee$X),]*result_ipw_gee$weights)
## estimated totals from the population based on sample
prob_totals <- svytotal(~x1+x2, sample_prob)
##diff
ipw_gee_totals-c(sum(weights(sample_prob)), prob_totals)
gives
> ipw_gee_totals-c(sum(weights(sample_prob)), prob_totals)
(Intercept) x1 x2
2.211891e-09 -6.845221e-08 1.979060e-09
Here will be the list of papers, working papers etc that cites the software.
I suggest that:
nonprob
,summary
one may add distribution of weights (eg. using summary
) to provide more information about the quality of weights,X
from the survey or population totals.TBA
Code to generate data
library(nonprobsvy)
library(survey)
set.seed(123)
N <- 1e5
n <- 1000
x1 <- rnorm(N)
x2 <- rnorm(N)
x3 <- rnorm(N)
y <- 1 + x1 + x2 + rnorm(N)
rho <- plogis(exp(-1 + x2 + x3)/ (1 + exp(-1 + x2 + x3)))
br <- rbinom(N, size = 1, prob = rho)
random_s <- sample(1:N, size = n)
pop_df <- data.frame(x1,x2,x3,y,rho,br, random_s=1:N %in% random_s, random_p = n/N)
nonprob_df<-subset(pop_df, br == 1)
prob_df<-subset(pop_df, random_s == 1)
prob_df_svy <- svydesign(ids = ~1, probs = ~random_p, data = prob_df)
MI works but bootstrap is applied even if not specified. Moreover, bootstrap variance (1.682901e-05
) significantly differs from the analytic version (0.01837757
).
res_mi <- nonprobsvy::nonprob(outcome = y ~ x1 + x2,
data = nonprob_df,
svydesign = prob_df_svy,
family_outcome = "gaussian",
control_inference = controlInf(est_method = "likelihood",
var_method = "analytic"))
Output
$population_mean
[1] 1.001379
$variance
[,1]
[1,] 0.01837757
$standard_error
[,1]
[1,] 0.1355639
$CI
[1] 0.7356787 1.2670794
$beta
[,1]
(Intercept) 0.9991237
x1 1.0019048
x2 0.9890786
$boot_variance
[1] 1.682901e-05
I suggest:
variance
and standard_error
should be scalarsbootstrap
should be conducted only when it is selected as a inference method in controlInf
.standard_error
into SE from nonprob and prob surveysurvey
(code below) with nonprobsvy::nonprob
m1 <- lm(y~x1 +x2, nonprob_df)
prob_df_svy <- update(prob_df_svy, y= predict(m1, prob_df_svy$variables))
svymean(~y, prob_df_svy)
mean SE
y 1.0014 0.0437
When I run the code
res_ps <- nonprobsvy::nonprob(selection = br ~ x2 + x3,
data = nonprob_df,
svydesign = prob_df_svy,
family_selection = "binomial",
control_inference = controlInf(est_method = "likelihood",
var_method = "analytic"))
res_ps
I got the following error
Error in rbind(X_nons, X_rand) :
number of columns of matrices must match (see arg 2)
When I run the code
res_ps <- nonprobsvy::nonprob(selection = br ~ x2 + x3,
outcome = y ~ x1 + x2,
data = nonprob_df,
svydesign = prob_df_svy,
family_outcome = "gaussian",
family_selection = "binomial",
control_inference = controlInf(est_method = "likelihood",
var_method = "analytic"))
I got an error which may be connected with the situation when the selection variable (br
) in nonprob_df
dataset is equal to 1
.
Error in if (is.null(newVal) && ((sum(f1) - sum(f0)) < slot(control, "tol"))) { :
missing value where TRUE/FALSE needed
By default when calling controlOut
epsilon value is set to 1e-6
which is a sensible choice FOR stats::glm
but this epsilon
value is also used for fitting RANN::nn2()
. The default eps
in RANN::nn2()
is set to 0
, this is because when eps = 0
RANN::nn2
performs exact nearest neighbour search but when eps > 0
and appropriate solution is used.
This leads to worse results (as presented in code bellow) in predictive mean matching (ditto for kNN imputation) and can be confusing and users would probably expect the default to be set to an exact solution
library(nonprobsvy)
set.seed(1234567890)
N <- 1e5 ## 1000000
n <- 100
x1 <- rnorm(n = N, mean = 1, sd = 1)
x2 <- rexp(n = N, rate = 1)
epsilon <- rnorm(n = N) # rnorm(N)
population <- data.frame(
x1,
x2,
y1 = 1 + x1 + x2 + epsilon,
y2 = 0.5*(x1 - 0.5)^2 + x2 + epsilon,
p1 = exp(x2)/(1+exp(x2)),
p2 = exp(-0.5+0.5*(x2-2)^2)/(1+exp(-0.5+0.5*(x2-2)^2)),
base_w_srs = N/n,
flag_bd1 = rbinom(n = N, size = 1, prob = p1),
flag_srs = as.numeric(1:N %in% sample(1:N, size = n))
)
base_w_bd <- N/sum(population$flag_bd1)
sample_prob <- svydesign(ids= ~1, weights = ~ base_w_srs,
data = subset(population, flag_srs == 1))
pmm1 <- nonprob(
outcome = y1 ~ x1 + x2,
data = population[population$flag_bd1 == 1, , drop = FALSE],
svydesign = sample_prob,
method_outcome = "pmm",
family_outcome = "gaussian",
control_outcome = controlOut(k=1)
)
pmm2 <- nonprob(
outcome = y2 ~ x1 + x2,
data = population[population$flag_bd1 == 1, , drop = FALSE],
svydesign = sample_prob,
method_outcome = "pmm",
family_outcome = "gaussian",
control_outcome = controlOut(k=1)
)
# manually
df_prob <- population[population$flag_srs == 1, ]
df_non <- population[population$flag_bd1 == 1, ]
lm1 <- lm(y1 ~ x1 + x2, data = df_non)
lm2 <- lm(y2 ~ x1 + x2, data = df_non)
pred1 <- predict(lm1, newdata = df_prob)
pred2 <- predict(lm2, newdata = df_prob)
nn1 <- sapply(pred1, FUN = function(x) {which.min(abs(df_non$y1 - x))})
nn2 <- sapply(pred2, FUN = function(x) {which.min(abs(df_non$y2 - x))})
(est1 <- weighted.mean(x = df_non$y1[nn1],
w = df_prob$base_w_srs))
pmm1$output$mean
(est2 <- weighted.mean(x = df_non$y2[nn2],
w = df_prob$base_w_srs))
pmm2$output$mean
population[, c("y1", "y2"), drop = FALSE] |> colMeans()
xx1 <- dbscan::kNN(k = 1, x = cbind(df_non$y1), query = cbind(pred1))
# the same
xx11 <- dbscan::kNN(k = 1, x = cbind(df_non$y1), query = cbind(pred1), search = "linear")
xx111 <- RANN::nn2(data = cbind(df_non$y1), query = cbind(pred1), k = 1, eps = 0)
cbind(nn1, xx1$id, pmm1$outcome$y1$model_rand$nn.idx, xx111$nn.idx)
library(survey)
library(nonprobsvy)
### generate data under simple random sampling
set.seed(123)
N <- 10000
n <- 1000
x1 <- rnorm(n = N, mean = 1, sd = 1)
x2 <- rexp(n = N, rate = 1)
epsilon <- rnorm(n = N) # rnorm(N)
y1 <- 1 + x1 + x2 + epsilon
y2 <- 0.5*(x1 - 0.5)^2 + x2 + epsilon
p1 <- exp(x2)/(1+exp(x2))
p2 <- exp(-0.5+0.5*(x2-2)^2)/(1+exp(-0.5+0.5*(x2-2)^2))
populacja <- data.frame(x1,x2,y1,y2,p1,p2)
flag_p1 <- rbinom(n = N, size = 1, prob = populacja$p1)
flag_p2 <- rbinom(n = N, size = 1, prob = populacja$p2)
flag_srs <- sample(x = 1:N, size = n)
source_nonprob_p1 <- populacja[flag_p1 == 1, ]
source_nonprob_p2 <- populacja[flag_p2 == 1, ]
source_prob <- svydesign(ids = ~ 1, data = populacja[flag_srs, ], weights = rep(N/n, n))
suppressWarnings(
source_prob_no_weights <- svydesign(ids = ~ 1, data = populacja[flag_srs, ])
)
Running this code results
test1b <- nonprob(selection = ~ x1+ x2,
target = ~ y1,
data = source_nonprob_p1,
svydesign = source_prob_no_weights)
results with the following errors
Error in ps_method(X_nons, log_like, gradient, hessian, start, optim_method) :
Inifinite value of log_like in fitting ps_est by maxLik, error code 5
Traceback
6. stop("Inifinite value of log_like in fitting ps_est by maxLik, error code 5") at logitModel.R#57
5. ps_method(X_nons, log_like, gradient, hessian, start, optim_method) at EstimationMethods.R#105
4. estimation_method$model_selection(X, X_nons, X_rand, weights,
weights_rand, R, method_selection, optim_method, h = h, est_method,
maxit, varcov, ...) at internals.R#25
3. internal_selection(X = X, X_nons = X_nons, X_rand = X_rand, weights = weights,
weights_rand = weights_rand, R = R, method_selection = method_selection,
optim_method = optim_method, h = h, est_method = est_method,
maxit = maxit, varcov = TRUE) at nonprobIPW.R#104
2. nonprobIPW(selection, target, data, svydesign, pop_totals, pop_means,
pop_size, method_selection, family_selection, subset, strata,
weights, na_action, control_selection, control_inference,
start, verbose, contrasts, model, x, y, ...) at nonprob.R#116
1. nonprob(selection = ~x1 + x2, target = ~y1, data = source_nonprob_p1,
svydesign = source_prob_no_weights)
nonprobsvy/R/nonprobVariablesSelection.R
Line 272 in 4690071
For some examples this line causes problems
Error in names(beta) <- colnames(X_design) :
'names' attribute [20] must be the same length as the vector [9]
Models and methods that will be available in the 0.1.0 version of the nonprobsvy
package
So, if we would like to add calibration for quantiles for all of the models we need the following changes:
nonprob(..., pop_quantiles=...)
and
controlSel(quantile=...)
controlOut(quantile=...)
where pop_quantiles
are list of quantiles given by the user (known) or quantile
argument that works as in WeightIt.
I do not want to include quantile
argument in the main function as it may suggest that we want to estimate given quantile for the target / output variable(s). Any comments are welcome.
I would like to propose a breaking change for the PMM estimator. Currently, the control outcome takes the
controlOut(predictive_match = 1)
but I suggest to change to the most popular alternative i.e. the predictive_match = 2
).
If se=FALSE
is provided in the nonprob
function the resulting output
and confidence_interval
should be the same data.frames
as if se=FALSE
but it should contain NAs
.
Current output:
> nonprob_example[c("output", "confidence_interval")]
$output
mean
Y_11 1.860623
Y_12 6.945413
$confidence_interval
NULL
Desired output:
> nonprob_example[c("output", "confidence_interval")]
$output
mean SE
Y_11 1.860623 NA
Y_12 6.945413 NA
$confidence_interval
lower_bound upper_bound
Y_11 NA NA
Y_12 NA NA
Issues to consider for the current version of the package.
To develop (SHORT TERM):
pop_totals
/ pop_means
for variable selection methodgee
/ mle
provided after variable selectionmodel.frame
call for all models and one function for models with and without variable selectiony1 + y2 + ... + yk ~ x1 + x2 + x3 + ... + xn
control
function or consider another way to define itOutcomeMethods
BIC.nonprobsvy
in summary
gee
with h functions in control_selection
error
message in case of duplicates of outcome variables in formulaerror
message in case of badly defined formulasxgboost
model.svrep
(bootstrap weighting) to the functionality of the package.div
to variable selection modelsTo develop (LONG TERM):
nonprobsvy
object).To fix:
weights
for non-probability sample - not stable algorithm during estimation (overestimation of propensity weights or errors in maxLik
model)Currently, the nonprobSel
function applied to IPW estimator iterates over target
parameter, i.e. variable selection is applied as many times as the number of variables specified in the target
parameter. Here's an example
library(nonprobsvy)
library(sampling)
set.seed(2023-20-22)
N <- 10000
n_A <- 500
p <- 50
alpha_vec1 <- c(-2, 1, 1, 1,1, rep(0, p-5))
alpha_vec2 <- c(0,0,0,3,3,3,3, rep(0, p-7))
beta_vec <- c(1,0,0,1,1,1,1, rep(0, p-7))
X <- cbind("(Intercept)"=1, matrix(rnorm(N*(p-1)), nrow=N, byrow=T,
dimnames = list(NULL, paste0("X",1:(p-1)))))
X_formula <- as.formula(paste("~", paste0("X",1:(p-1), collapse = " + ")))
X_totals <- colSums(X)
X_means <- colMeans(X[,-1])
Y_11 <- 1 + as.numeric(X %*% beta_vec) + rnorm(N) ## OM I: linear model
Y_12 <- 1 + exp(3*sin(as.numeric(X %*% beta_vec))) + X[, "X5"] + X[, "X6"] + rnorm(N) ## OM II: nonlinear model
pi_Y_21 <- plogis(as.numeric(X %*% beta_vec)) ## OM III: linear model for binary Y
pi_Y_22 <- plogis(2 - log((X %*% beta_vec)^2) - 2*X[,"X5"] + 2*X[, "X6"]) ## OM IV: nonlinear model for binary Y
Y_21 <- rbinom(N, 1, prob = pi_Y_21)
Y_22 <- rbinom(N, 1, prob = pi_Y_22)
pi_A <- inclusionprobabilities(0.25 + abs(X[, "X1"]) + 0.03*abs(Y_11), n_A) ## inclusion based on Y_11 only
pi_B1 <- plogis(as.numeric(X %*% alpha_vec1)) ## PSM I: linear probability
pi_B2 <- plogis(3.5 + as.numeric(log(X^2) %*% alpha_vec2) -
sin(X[, "X3"] + X[, "X4"]) - X[,"X5"] + X[, "X6"]) ## PSM II: nonlinear
flag_B1 <- rbinom(N, 1, prob = pi_B1)
flag_B2 <- rbinom(N, 1, prob = pi_B2)
flag_A <- UPpoisson(pik = pi_A)
pop_data <- data.frame(pi_A, flag_A, flag_B1, flag_B2, Y_11, Y_12, Y_21, Y_22, X[, 2:p])
sample_B1 <- subset(pop_data, flag_B1 == 1)
sample_B2 <- subset(pop_data, flag_B2 == 1)
sample_A <- subset(pop_data, flag_A == 1)
X_totals <- colSums(X)
X_means <- colMeans(X[,-1])
sample_A_svy <- svydesign(ids = ~ 1,
probs = ~ pi_A,
pps = poisson_sampling(sample_A$pi_A),
data = sample_A)
sample_A_svy_cal <- calibrate(sample_A_svy,
formula = as.formula(paste0("~", paste(names(X_totals)[2:p], collapse = "+"))),
population = X_totals,
calfun = cal.raking)
## ipw estimator
y11_corr_one <- nonprob(selection = ~ X1 + X2 + X3 + X4,
target = ~ Y_11 + Y_12,
data = sample_B1,
svydesign = sample_A_svy_cal,
method_selection = "logit",
control_inference = controlInf(vars_selection = TRUE),
verbose = T)
## verbose
# Starting CV fold #1
# Starting CV fold #2
# Starting CV fold #3
# Starting CV fold #4
# Starting CV fold #5
# Starting CV fold #6
# Starting CV fold #7
# Starting CV fold #8
# Starting CV fold #9
# Starting CV fold #10
# Starting CV fold #1
# Starting CV fold #2
# Starting CV fold #3
# Starting CV fold #4
# Starting CV fold #5
# Starting CV fold #6
# Starting CV fold #7
# Starting CV fold #8
# Starting CV fold #9
# Starting CV fold #10
The initial version of the package should have:
gaussian
, binomial
, Gamma
, inverse.gaussian
, poisson
, quasi*
and MASS::negative.binomial
.stats::glm
and MASS::glm.nb
)RANN::nn2
)Literature:
binomial()
with logit
, probit
and cloglog
linksLiterature:
Literature:
If I generate a large non-probability sample I have problem with estimating models.
library(data.table)
library(nonprobsvy)
set.seed(2023-10-19)
n_reps <- 50
N <- 100000
n <- 1000
x1 <- rnorm(N,1,1)
x2 <- rexp(N,1)
alp <- rnorm(N)
epsilon <- rnorm(N)
y11 <- 1 + x1 + x2 + alp + epsilon
y12 <- 0.5*(x1-1.5)^2 + x2^2 + alp + epsilon
y21 <- rbinom(N,1,plogis(1 + x1 + x2 + alp))
y22 <- rbinom(N,1,plogis(0.5*(x1-1.5)^2 + x2^2 + alp))
p1 <- plogis(x2)
p2 <- plogis(-3+(x1-1.5)^2+(x2-2)^2)
pop_data <- data.frame(x1,x2,y11,y12,y21,y22,p1,p2) |> setDT()
sample_prob <- pop_data[sample(1:N, n),]
sample_prob$w <- N/n
sample_prob_svy <- svydesign(ids=~1, weights = ~w, data = sample_prob)
sample_bd1 <- pop_data[rbinom(N,1,pop_data$p1)==1, ]
sample_bd1$w_naive <- N/nrow(sample_bd1)
sample_bd2 <- pop_data[rbinom(N,1,pop_data$p2)==1, ]
sample_bd2$w_naive <- N/nrow(sample_bd2)
IPW (and thus DR)
## ipw h=1
test1 <- nonprob(selection = ~ x1 + x2,
target = ~ y11 + y12 + y21 + y22,
svydesign = sample_prob_svy,
data = sample_bd1,
control_selection = controlSel(est_method_sel = "gee", h = 1))
> test1 <- nonprob(selection = ~ x1 + x2,
+ target = ~ y11 + y12 + y21 + y22,
+ svydesign = sample_prob_svy,
+ data = sample_bd1,
+ control_selection = controlSel(est_method_sel = "gee", h = 1))
Error in nleqslv::nleqslv(x = start0, fn = u_theta, method = "Newton", :
initial value of fn function contains non-finite values (starting at index=1)
Check initial x and/or correctness of function
In addition: Warning messages:
1: glm.fit: algorithm did not converge
2: glm.fit: fitted probabilities numerically 0 or 1 occurred
## ipw mle
test1 <- nonprob(selection = ~ x1 + x2,
target = ~ y11 + y12 + y21 + y22,
svydesign = sample_prob_svy,
data = sample_bd1)
> test1 <- nonprob(selection = ~ x1 + x2,
+ target = ~ y11 + y12 + y21 + y22,
+ svydesign = sample_prob_svy,
+ data = sample_bd1)
Error in solve.default(-hess) :
Lapack routine dgesv: system is exactly singular: U[1,1] = 0
In addition: Warning messages:
1: glm.fit: algorithm did not converge
2: glm.fit: fitted probabilities numerically 0 or 1 occurred
A declarative, efficient, and flexible JavaScript library for building user interfaces.
🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.
TypeScript is a superset of JavaScript that compiles to clean JavaScript output.
An Open Source Machine Learning Framework for Everyone
The Web framework for perfectionists with deadlines.
A PHP framework for web artisans
Bring data to life with SVG, Canvas and HTML. 📊📈🎉
JavaScript (JS) is a lightweight interpreted programming language with first-class functions.
Some thing interesting about web. New door for the world.
A server is a program made to process requests and deliver data to clients.
Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.
Some thing interesting about visualization, use data art
Some thing interesting about game, make everyone happy.
We are working to build community through open source technology. NB: members must have two-factor auth.
Open source projects and samples from Microsoft.
Google ❤️ Open Source for everyone.
Alibaba Open Source for everyone
Data-Driven Documents codes.
China tencent open source team.