workflow format update

cumc · Sep 13, 2024 · b15327f · b15327f
1 parent 2c6697a
commit b15327f
Show file tree

Hide file tree

Showing 11 changed files with 531 additions and 560 deletions.
diff --git a/code/association_scan/quantile_twas/quantile_twas.ipynb b/code/association_scan/quantile_twas/quantile_twas.ipynb
@@ -121,7 +121,7 @@
     "- qvalue_qr: the q-value of p_qr. \n",
     "- qvalue_qr_0.05 to qvalue_qr_0.95: quantile-specific QR q-values for the quantile levels 0.05, 0.1, ..., 0.95. \n",
     "- coef_qr_0.05 to coef_qr_0.95: quantile-specific QR coefficients for the quantile levels 0.05, 0.1, ..., 0.95. \n",
-    "- se_qr_0.05 to coef_qr_0.95: quantile-specific QR standard errors for the quantile levels 0.05, 0.1, ..., 0.95. \n",
+    "- se_qr_0.05 to se_qr_0.95: quantile-specific QR standard errors for the quantile levels 0.05, 0.1, ..., 0.95. \n",
     "\n",
     "\n",
     "Additionally, the matrix of quantile TWAS weights and pseudo R-squares is calculated and saved using Koenker and Machado's Pseudo R² method, as described in their [Koenker and Machado, 1999: Inference in Quantile Regression](https://www.maths.usyd.edu.au/u/jchan/GLM/Koenker&Machado1999InferenceQuantileReg.pdf)"
@@ -1075,19 +1075,24 @@
     "            print('qr_beta_R2_results_time:')\n",
     "            print(qr_beta_R2_results_time)\n",
     "            beta.mat = qr_beta_R2_results$beta_mat\n",
+    "            rownames_beta <- rownames(beta.mat)\n",
+    "            rownames(beta.mat) <- gsub(\"^X.filter\", \"\", rownames_beta)            \n",
     "            pseudo_R2 = qr_beta_R2_results$pseudo_R2\n",
     "\n",
     "            # define twas weight\n",
     "            ## FIXME: so far we use beta as twas weight, and just calculate pseudo R^2 without actual filtering out.\n",
     "            twas_weight = beta.mat\n",
     "\n",
+    "            quantile_twas_prediction =  ExprData$X.filter%*%beta.mat\n",
+    "\n",
     "            # save info of ExprData$X.filter, qr_nominal_result, twas_weight, pseudo_R2 \n",
     "            # Store results in the fitted list for this iteration\n",
     "            fitted[[r]] <- list(\n",
-    "                X_filter = ExprData$X.filter,\n",
+    "                variant_names <- colnames(ExprData$X.filter),\n",
     "                qr_nominal_result = qr_nominal_result,\n",
     "                twas_weight = twas_weight,\n",
-    "                pseudo_R2 = pseudo_R2\n",
+    "                pseudo_R2 = pseudo_R2,\n",
+    "                quantile_twas_prediction = quantile_twas_prediction\n",
     "            )        \n",
     "\n",
     "            condition_names <- c(condition_names, new_names)\n",

diff --git a/code/mnm_analysis/mnm_regression.ipynb b/code/mnm_analysis/mnm_regression.ipynb
diff --git a/code/mnm_analysis/rss_analysis.ipynb b/code/mnm_analysis/rss_analysis.ipynb
@@ -378,6 +378,10 @@
     "parameter: finemapping_method = \"single_effect\"\n",
     "# compute LD from genotype\n",
     "parameter: compute_LD_from_genotype = False\n",
+    "# remove a variant if it has more than imiss missing individual level data\n",
+    "parameter: imiss = 1.0\n",
+    "# MAF cutoff\n",
+    "parameter: maf = 0.0025\n",
     "\n",
     "depends: sos_variable(\"regional_data\")\n",
     "regions = list(regional_data['regions'].keys())\n",
@@ -386,6 +390,7 @@
     "task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output:bn}'\n",
     "R: expand = '${ }', stdout = f\"{_output:n}.stdout\", stderr = f\"{_output:n}.stderr\", container = container, entrypoint = entrypoint\n",
     "    library(pecotmr)\n",
+    "    library(dplyr)\n",
     "    library(data.table)\n",
     "    skip_region = c(${', '.join(['\"{}\"'.format(item) for item in skip_regions])})\n",
     "    studies = c(${', '.join(['\"{}\"'.format(item) for item in regional_data[\"GWAS\"].keys()])})\n",
@@ -404,6 +409,11 @@
     "                                        '${\"chr%s:%s-%s\" % (regional_data['regions'][_regions][0], regional_data['regions'][_regions][1], regional_data['regions'][_regions][2])}'),\n",
     "                            missing_rate_thresh = ${imiss}, maf_thresh=${maf})%>%\n",
     "                            cor\n",
+    "\n",
+    "    ## When we compute LD from genotype adhoc, there could be snp with conventional Ref/Alt allele like  4:185875956:C:<INS:ME:ALU>, which will break the allele_qc function in the following steps.\n",
+    "    ## The additional filtering remove these unconventional allele and keep only SNP that wont break the allele_qc function, which depends on the fact that there is exactly three \":\" in the snp ID.\n",
+    "    correct_variants <-  rownames(LD_data)[sapply( rownames(LD_data), function(x) sum(strsplit(x, \"\", fixed = TRUE)[[1]] == \":\") == 3)]\n",
+    "    LD_data = LD_data[correct_variants,correct_variants]\n",
     "    LD_data = list(combined_LD_matrix  = LD_data, combined_LD_variants  = rownames(LD_data) )\n",
     "     } else { \n",
     "    LD_data = load_LD_matrix(\"${ld_meta_data}\", '${\"chr%s:%s-%s\" % (regional_data['regions'][_regions][0], regional_data['regions'][_regions][1], regional_data['regions'][_regions][2])}')\n",

diff --git a/code/mnm_analysis/univariate_fine_mapping_twas_vignette.ipynb b/code/mnm_analysis/univariate_fine_mapping_twas_vignette.ipynb
@@ -130,26 +130,31 @@
    "id": "199a2637-cc0c-4aff-b4ea-a2546936f073",
    "metadata": {},
    "source": [
-    "`ROSMAP_mega_eQTL.ENSG00000073921.univariate_susie_twas_weights.rds`:\n",
+    "`ROSMAP_mega_eQTL.chr11_ENSG00000073921.univariate_bvrs.rds`:\n",
     "* For each gene of interest and phenotype, this file contains: \n",
-    "    1. variant names\n",
-    "    2. analysis script\n",
-    "    3. other quantities - information on dropped samples, if any\n",
-    "    4. summary statistics (beta and se) for each variant\n",
-    "    5. sample names\n",
-    "    6. summary statistics for top loci\n",
-    "    7. susie results trimmed - includes pip, sets, cs_corr, sets_secondary, alpha, lbf_variable, V, niter, X_column_scale_factors, mu, mu2\n",
-    "    8. preset variant results - results from linkage disequilibrium variants\n",
-    "    9. twas weights - for each variant (for enet, lasso, mrash and susie methods)\n",
-    "    10. twas predictions - for each sample (for enet, lasso, mrash and susie methods)\n",
-    "    11. twas cross validation results - information on the best method. Data is split into five parts\n",
-    "    12. region info - information on the region of the specific gene \n",
-    "    13. total time elapsed \n",
+    "    1. susie_fitted\n",
+    "    2. variant_names\n",
+    "    3. analysis script\n",
+    "    4. other quantities - information on dropped samples, if any\n",
+    "    5. summary statistics (beta and se) for each variant\n",
+    "    6. sample names\n",
+    "    7. summary statistics for top loci\n",
+    "    8. susie results trimmed - includes pip, sets, cs_corr, sets_secondary, alpha, lbf_variable, V, niter, X_column_scale_factors, mu, mu2\n",
+    "    9. total time elapsed\n",
+    "    10. region info\n",
     "\n",
-    "`ROSMAP_mega_eQTL.ENSG00000073921.univariate.rds`:\n",
+    "`ROSMAP_mega_eQTL.chr11_ENSG00000073921.univariate_data.rds`:\n",
     "* For each gene of interest, contains residuals for each sample and phenotype\n",
     "* see [pecotmr code](https://github.com/cumc/pecotmr/blob/68d87ca1d0a059022bf4e55339621cbddc8993cc/R/file_utils.R#L461) for description at `load_regional_association_data` function"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "894acfcd-54f2-416e-adf7-9432a321ec6c",
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
@@ -168,7 +173,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.2"
+   "version": "3.12.5"
   }
  },
  "nbformat": 4,