Impute

alternatively use

dd <- prolfqua::sim_lfq_data_protein_config(Nprot = 100,weight_missing = 2)

## creating sampleName from fileName column

## completing cases

## completing cases done

## setup done

dd$data$abundance |> is.na() |> sum()

## [1] 552

Contrasts <- c("dilution.b-a" = "group_A - group_B", "dilution.c-e" = "group_A - group_Ctrl")
mh1 <- prolfqua::MissingHelpers$new(dd$data, dd$config, prob = 0.5,weighted = TRUE)
imputed <- mh1$get_contrasts(Contrasts = Contrasts)

## completing cases

## dilution.b-a=group_A - group_B

## dilution.c-e=group_A - group_Ctrl

## dilution.b-a=group_A - group_B

## dilution.c-e=group_A - group_Ctrl

## dilution.b-a=group_A - group_B

## dilution.c-e=group_A - group_Ctrl

mh2 <- prolfqua::MissingHelpers$new(dd$data, dd$config, prob = 0.5,weighted = FALSE)
imputed2 <- mh2$get_contrasts(Contrasts = Contrasts)

## completing cases

## dilution.b-a=group_A - group_B

## dilution.c-e=group_A - group_Ctrl

## dilution.b-a=group_A - group_B

## dilution.c-e=group_A - group_Ctrl

## dilution.b-a=group_A - group_B

## dilution.c-e=group_A - group_Ctrl

plot(imputed$estimate, imputed2$estimate)
abline(0 , 1 , col=2 , lwd=2)

mh1$get_LOD()

##      50% 
## 18.13636

plot( imputed$estimate, -log10(imputed$p.value), pch = "*" )
points(imputed2$estimate, -log10(imputed2$p.value), col = 2, pch = "x")

modelling with linear models

Model with missing data

modelName <- "f_condtion_r_peptide"
formula_Protein <-
  prolfqua::strategy_lm("abundance  ~ group_",
              model_name = modelName)


mod <- prolfqua::build_model(
  dd$data,
  formula_Protein,
  modelName = modelName,
  subject_Id = dd$config$table$hierarchy_keys_depth())

## Warning: There were 11 warnings in `dplyr::mutate()`.
## The first warning was:
## ℹ In argument: `linear_model = purrr::map(data, model_strategy$model_fun, pb =
##   pb)`.
## ℹ In group 18: `protein_Id = "DoWup2~8058"`.
## Caused by warning in `value[[3L]]()`:
## ! WARN :Error in `contrasts<-`(`*tmp*`, value = contr.funs[1 + isOF[nn]]): contrasts can be applied only to factors with 2 or more levels
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 10 remaining warnings.

## Joining with `by = join_by(protein_Id)`

mod$modelDF

## # A tibble: 100 × 9
## # Groups:   protein_Id [100]
##    protein_Id  data     linear_model exists_lmer isSingular df.residual sigma
##    <chr>       <list>   <list>       <lgl>       <lgl>            <dbl> <dbl>
##  1 0EfVhX~3967 <tibble> <lm>         TRUE        FALSE                6 1.22 
##  2 0m5WN4~6025 <tibble> <lm>         TRUE        FALSE                4 0.447
##  3 0YSKpy~2865 <tibble> <lm>         TRUE        TRUE                 1 1.55 
##  4 3QLHfm~8938 <tibble> <lm>         TRUE        FALSE                7 0.923
##  5 3QYop0~7543 <tibble> <lm>         TRUE        FALSE                7 0.808
##  6 76k03k~7094 <tibble> <lm>         TRUE        FALSE                8 1.09 
##  7 7cbcrd~7351 <tibble> <lm>         TRUE        FALSE                9 0.889
##  8 7QuTub~1867 <tibble> <lm>         TRUE        FALSE                6 0.644
##  9 7soopj~5352 <tibble> <lm>         TRUE        FALSE                3 0.920
## 10 7zeekV~7127 <tibble> <lm>         TRUE        FALSE                6 1.06 
## # ℹ 90 more rows
## # ℹ 2 more variables: nrcoef <int>, nrcoeff_not_NA <int>

mod$modelDF$nrcoeff_not_NA |> table()

## 
##  2  3 
## 15 74

mod$modelDF$isSingular |> table()

## 
## FALSE  TRUE 
##    69    20

mod$modelDF |> nrow()

## [1] 100

mod$get_anova()

## # A tibble: 69 × 10
##    protein_Id  isSingular nrcoef factor    Df Sum.Sq Mean.Sq F.value  p.value
##    <chr>       <lgl>       <int> <chr>  <int>  <dbl>   <dbl>   <dbl>    <dbl>
##  1 0EfVhX~3967 FALSE           3 group_     2 16.1     8.04    5.39  0.0457  
##  2 0m5WN4~6025 FALSE           3 group_     2 37.9    18.9    94.9   0.000426
##  3 3QLHfm~8938 FALSE           3 group_     2  3.90    1.95    2.29  0.172   
##  4 3QYop0~7543 FALSE           3 group_     2  4.44    2.22    3.40  0.0932  
##  5 76k03k~7094 FALSE           3 group_     2  2.11    1.05    0.891 0.448   
##  6 7cbcrd~7351 FALSE           3 group_     2 21.0    10.5    13.3   0.00205 
##  7 7QuTub~1867 FALSE           3 group_     2 14.1     7.04   17.0   0.00338 
##  8 7zeekV~7127 FALSE           3 group_     2  0.425   0.212   0.189 0.832   
##  9 9VUkAq~9664 FALSE           3 group_     2  7.50    3.75   11.8   0.0128  
## 10 At886V~1021 FALSE           3 group_     2 46.3    23.2    49.4   0.00151 
## # ℹ 59 more rows
## # ℹ 1 more variable: FDR <dbl>

prolfqua::model_summary(mod)

## $exists
## 
## FALSE  TRUE 
##    11    89 
## 
## $isSingular
## 
## FALSE  TRUE 
##    69    20

maxcoef <- max(mod$modelDF$nrcoeff_not_NA, na.rm = TRUE)
goodmods <- mod$modelDF |> dplyr::filter(isSingular == FALSE, exists_lmer == TRUE, nrcoeff_not_NA == maxcoef)

dim(goodmods)

## [1] 61  9

xx <- lapply(goodmods$linear_model, vcov)
nr <- sapply(xx, nrow)
nr |> table()

## nr
##  3 
## 61

nr <- sapply(xx, ncol)
nr |> table()

## nr
##  3 
## 61

sum_matrix <- Reduce(`+`, xx)
sum_matrix/length(xx)

##             (Intercept)    group_B group_Ctrl
## (Intercept)   0.4317112 -0.4317112 -0.4317112
## group_B      -0.4317112  0.7569259  0.4317112
## group_Ctrl   -0.4317112  0.4317112  0.8485382

Model with lod imputation

loddata <- dd$data

loddata <- loddata |> dplyr::mutate(abundance = ifelse(is.na(abundance), mh1$get_LOD(), abundance))
modI <- prolfqua::build_model(
  loddata,
  formula_Protein,
  modelName = modelName,
  subject_Id = dd$config$table$hierarchy_keys_depth())

## Joining with `by = join_by(protein_Id)`

modI$modelDF$nrcoeff_not_NA |> table()

## 
##   3 
## 100

modI$modelDF$isSingular |> table()

## 
## FALSE 
##   100

modI$modelDF |> nrow()

## [1] 100

allModels <- modI$modelDF$linear_model

xx <- lapply(allModels, vcov)
sum_matrix <- Reduce(`+`, xx)
sum_matrix/length(xx)

##             (Intercept)    group_B group_Ctrl
## (Intercept)   0.4759103 -0.4759103 -0.4759103
## group_B      -0.4759103  0.9518206  0.4759103
## group_Ctrl   -0.4759103  0.4759103  0.9518206

m <- (modI$modelDF$linear_model[[1]])
df.residual(m)

## [1] 9

sigma(m)

## [1] 1.222678

vcov(m)

##             (Intercept)    group_B group_Ctrl
## (Intercept)   0.3737352 -0.3737352 -0.3737352
## group_B      -0.3737352  0.7474704  0.3737352
## group_Ctrl   -0.3737352  0.3737352  0.7474704

2024-04-22

modelling with linear models