Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix VCF processing #8

Merged
merged 3 commits into from
Jul 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ Authors@R:
Description: This package provides convenience functions for reading real-world evidence data provided by Personalis into Bioconductor MultiAssayExperiment objects.
Encoding: UTF-8
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.3.1
RoxygenNote: 7.3.2
Imports:
dplyr,
SummarizedExperiment,
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ importFrom(rvest,html_nodes)
importFrom(rvest,html_table)
importFrom(rvest,html_text)
importFrom(rvest,read_html)
importFrom(stringr,str_replace)
importFrom(stringr,str_split_i)
importFrom(stringr,str_to_title)
importFrom(tibble,as_tibble)
Expand Down
33 changes: 20 additions & 13 deletions R/personalis.R
Original file line number Diff line number Diff line change
Expand Up @@ -331,11 +331,15 @@ read_personalis_variant_calling_summary_statistics <- function(sample_folder, mo
html_section <- if_else(sample_type == "somatic", "#concordance", sprintf("#%s_%s", str_to_title(sample_type), modality))
table_number <- if_else(sample_type == "somatic", 1, 2)
columns_to_fix <- if (sample_type == "somatic") c() else c("SNVs", "Indels", "Total")

tables <- read_html(html_file) |>
html_elements(html_section) |>
html_elements("table") |>
html_table(na.strings = "N/A")

if (!length(tables)) {
return(tibble())
} else {
tes <- tables[table_number] |>
lapply(function(df) {
colnames(df) <- make.names(colnames(df))
Expand All @@ -350,6 +354,7 @@ read_personalis_variant_calling_summary_statistics <- function(sample_folder, mo
select(sample, var_name, value) |>
pivot_wider(id_cols = sample, names_from = "var_name", values_from = "value") |>
mutate(across(contains("Number"), fix_thousands_separator))
}
}

#' Read in (unfiltered) small variant data from VCF files from personalis folders
Expand Down Expand Up @@ -379,15 +384,16 @@ read_personalis_vcf_files <- function(sample_paths, modality, sample_type) {
modality = modality,
sample_type = sample_type
)

if (!length(variant_list)) {
return(NULL)
}

col_data <- map(variant_list, "summary_stats") |>
bind_rows() |>
tibble::column_to_rownames("sample")

col_data <- bind_rows(map(variant_list, "summary_stats"))
if (nrow(col_data)) {
col_data <- col_data |>
tibble::column_to_rownames("sample")
}

all_variants <- map(variant_list, "vcf_data") |> bind_rows()
row_data <- all_variants |>
select(
Expand Down Expand Up @@ -440,19 +446,20 @@ read_personalis_vcf_files_sample <- function(sample_folder, modality, sample_typ
# even though they are all in the K13T folder.
tmp_sample_name <- if_else(sample_type == "normal", sub("T$", "N", sample_name), sample_name)

# tumor DNA VCF files are gzipped, not totally clear if this rule applies to all cases
file_ending <- if_else(sample_type == "tumor" & modality == "DNA", "vcf.gz", "vcf")

variant_table <- parse_vcf_to_df(
file.path(
sample_folder,
sprintf("%s_Pipeline", modality),
"Variants",
sprintf("%s_%s_%s_%s.%s", modality, tmp_sample_name, sample_type, tolower(modality), file_ending)
sprintf("%s_%s_%s_%s.%s", modality, tmp_sample_name, sample_type, tolower(modality), "vcf.gz")
)
) |>
mutate(sample = sample_name) |>
mutate(mut_id = sprintf("%s_%s_%s_%s", CHROM, POS, REF, ALT))
)

if (nrow(variant_table)) {
variant_table <- variant_table |>
mutate(sample = sample_name) |>
mutate(mut_id = sprintf("%s_%s_%s_%s", CHROM, POS, REF, ALT))
}

variant_table
}
Expand Down
18 changes: 12 additions & 6 deletions R/util.R
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@ catch_file_not_found <- function(path, callback, ...) {
if (
# List of erorr messages to catch
grepl("`path` does not exist: .*", conditionMessage(e)) || # from read_excel
grepl("'.*?' does not exist.", conditionMessage(e)) # from read_html
grepl("'.*?' does not exist.", conditionMessage(e)) || # from read_html
grepl("cannot open the connection", conditionMessage(e)) # from read.vcfR
) {
return(NULL)
} else {
Expand Down Expand Up @@ -122,22 +123,27 @@ add_dummy_entry <- function(df, col_data, sample_col = "sample") {
#' @return {tibble} new data frame with all variants (fixed field and genotype information)
#' @importFrom dplyr mutate left_join
#' @importFrom vcfR read.vcfR vcfR2tidy
#' @importFrom stringr str_split_i
#' @importFrom stringr str_split_i str_replace
#' @importFrom tibble as_tibble
parse_vcf_to_df <- function(path) {
# parse VCF file
vcf_content <- read.vcfR(path)

vcf_content <- tryCatch({
read.vcfR(path)
}, error = function(e) {
read.vcfR(str_replace(path, "vcf.gz", "vcf"))
}
)

# fixed field content to data frame
fixed_df <- vcfR2tidy(vcf_content)$fix

# GT content to data frame
gt_df <- vcfR2tidy(vcf_content)$gt

# create addition column with observed nucleotides in order to avoid collisions when we do the left_join
gt_df <- gt_df |>
dplyr::mutate(ALT = str_split_i(gt_GT_alleles, "/", 2))

# next use ChromKey, POS and ALT for joining vcf content data frames
joined_vcf_df <- fixed_df |>
dplyr::left_join(gt_df, by = c("ChromKey", "POS", "ALT"))
Expand Down
Loading