From ebede8885267c2608ee4c0bdf1e72ba8828272fd Mon Sep 17 00:00:00 2001 From: Chun-Hui Gao Date: Wed, 27 Jul 2022 21:26:43 +0800 Subject: [PATCH 1/2] add GO mapper --- DESCRIPTION | 2 +- NAMESPACE | 1 + R/gson.R | 84 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 86 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 2f246209..9637b9a9 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -50,4 +50,4 @@ Packaged: NA biocViews: Annotation, Clustering, GeneSetEnrichment, GO, KEGG, MultipleComparison, Pathways, Reactome, Visualization Encoding: UTF-8 -RoxygenNote: 7.1.2 +RoxygenNote: 7.2.0 diff --git a/NAMESPACE b/NAMESPACE index 688f27d5..f2e88a13 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -70,6 +70,7 @@ export(gseMKEGG) export(gseWP) export(gseaplot) export(gsfilter) +export(gson_GO_mapper) export(gson_KEGG) export(heatplot) export(idType) diff --git a/R/gson.R b/R/gson.R index 802787c9..251d1145 100644 --- a/R/gson.R +++ b/R/gson.R @@ -54,3 +54,87 @@ gson_GO <- function(OrgDb, keytype = 'ENTREZID', ont = "BP") { accessed_date = as.character(Sys.Date()) ) } + + +#' Build a gson object that annotates Gene Ontology +#' +#' @param data a two-column data.frame of original GO annotation. The columns are "gene_id" and "go_id". +#' @param ont type of GO annotation, which is "ALL", "BP", "MF", or "CC". default: "ALL". +#' @param species name of species. Default: NULL. +#' @param ... pass to `gson::gson()` constructor. +#' +#' @return a `gson` instance +#' @export +#' +#' @examples +#' data = data.frame(gene_id = "gene1", +#' go_id = c("GO:0035492", "GO:0009764", "GO:0031063", "GO:0033714", "GO:0036349")) +#' gson_GO_mapper(data, species = "E. 
coli")
+gson_GO_mapper <- function(data,
+                           ont = c("ALL", "BP", "CC", "MF"),
+                           species = NULL,
+                           ...) {
+  ont <- match.arg(ont)  # an unset `ont` resolves to the first choice, "ALL"
+
+  data <- unique(data)  # drop duplicated rows
+  if (nrow(data) == 0) {
+    stop("Data is empty in this call.", call. = FALSE)  # simpleError() only built the condition, it never raised it
+  }
+
+  # resources from `GO.db`: per-term ontology, per-term name, source date, per-term ancestors
+  goterms <- AnnotationDbi::Ontology(GO.db::GOTERM)
+  termname <- AnnotationDbi::Term(GO.db::GOTERM)
+  go.db_info <- GO.db::GO_dbInfo()
+  go.db_source_date <- go.db_info[go.db_info$name == "GOSOURCEDATE", "value"]
+  ancestor_map <- lapply(c(GO.db::GOBPANCESTOR,
+                           GO.db::GOCCANCESTOR,
+                           GO.db::GOMFANCESTOR),
+                         as.list) %>%
+    unlist(recursive = FALSE)
+
+  # filter GO terms: rows whose `go_id` has no ontology in `GO.db` are reported and removed
+  data[["ontology"]] <- goterms[data[["go_id"]]]
+  n_na_ont <- sum(is.na(data[["ontology"]]))
+  if (n_na_ont > 0) {
+    warning(sprintf("%s GO term(s) are too new for current `GO.db` [source date: %s],\n and are to be dropped. Consider to update `GO.db` if possible.",
+                    n_na_ont, go.db_source_date))
+    data <- data[!is.na(data[["ontology"]]), , drop = FALSE]  # actually drop them, as the warning announces
+  }
+
+  # map to GO ancestor; list names are gene ids so the flattened vector keeps the gene of each ancestor
+  ancestor_list <- ancestor_map[data$go_id]
+  names(ancestor_list) <- data$gene_id
+  ancestor_go <- AnnotationDbi::unlist2(ancestor_list)
+
+  # gsid2gene: every gene paired with its annotated terms and with the ancestors of those terms
+  gsid2gene <- data.frame(
+    gsid = c(ancestor_go, data$go_id),
+    gene = c(names(ancestor_go), data$gene_id),
+    ontology = goterms[c(ancestor_go, data$go_id)]
+  ) %>%
+    dplyr::filter(.data$gsid != "all") %>%  # remove the "all" entries found in the ancestor maps
+    unique()
+
+  if (ont != "ALL") {
+    gsid2gene <- gsid2gene %>%
+      dplyr::filter(.data$ontology == ont)  # keep only the requested ontology
+  }
+
+  # gsid2name: names of the retained terms, looked up in `GO.db`
+  uniq_gsid <- unique(gsid2gene$gsid) %>% as.character()
+  gsid2name <- data.frame(
+    gsid = uniq_gsid,
+    name = termname[uniq_gsid] %>% as.character()
+  )
+
+  # construct `gson` object; `...` is forwarded to the constructor
+  gson::gson(
+    gsid2gene = gsid2gene,
+    gsid2name = gsid2name,
+    species = species,
+    gsname = paste0("Gene Ontology: ", ont),
+    version = sprintf("[GO.db source date: %s]", go.db_source_date),
+    accessed_date = as.character(Sys.Date()),
+    ...
+  )
+}
\ No newline at end of file
From 8a233504c4074941931bd35de4a93142a4b59ff1 Mon Sep 17 00:00:00 2001 From: Chun-Hui Gao Date: Wed, 27 Jul 2022 21:32:21 +0800 Subject: [PATCH 2/2] update author and version --- DESCRIPTION | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 9637b9a9..84744caa 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: clusterProfiler Type: Package Title: A universal enrichment tool for interpreting omics data -Version: 4.5.1.902 +Version: 4.5.1.904 Authors@R: c( person(given = "Guangchuang", family = "Yu", email = "guangchuangyu@gmail.com", role = c("aut", "cre", "cph"), comment = c(ORCID = "0000-0002-6485-8781")), person(given = "Li-Gen", family = "Wang", email = "reeganwang020@gmail.com", role = "ctb"), @@ -9,7 +9,8 @@ Authors@R: c( person(given = "Xiao", family = "Luo", email = "l77880853349@163.com", role = "ctb"), person(given = "Meijun", family = "Chen", email = "mjchen1996@outlook.com", role = "ctb"), person(given = "Giovanni", family = "Dall'Olio", email = "giovanni.dallolio@upf.edu", role = "ctb"), - person(given = "Wanqian", family = "Wei", email = "altair_wei@outlook.com", role = "ctb") + person(given = "Wanqian", family = "Wei", email = "altair_wei@outlook.com", role = "ctb"), + person(given = "Chun-Hui", family = "Gao", email = "gaospecial@gmail.com", role = "ctb", comment = c(ORCID = "0000-0002-1445-7939")) ) Maintainer: Guangchuang Yu Description: This package supports functional characteristics of both coding and non-coding genomics data for thousands of species with up-to-date gene annotation. It provides a univeral interface for gene functional annotation from a variety of sources and thus can be applied in diverse scenarios. It provides a tidy interface to access, manipulate, and visualize enrichment results to help users achieve efficient data interpretation. 
Datasets obtained from multiple treatments and time points can be analyzed and compared in a single run, easily revealing functional consensus and differences among distinct conditions.