library(dplyr)
library(clusterProfiler)
library(enrichplot)Enrichment Analysis
本文包括ORA分析和GSEA分析,均以最常用的KEGG为例。
过代表分析(ORA)
## choose organism ----
Organism <- 'Mouse' # 'Human' or 'Mouse'
ifelse(Organism == 'Human', library(org.Hs.eg.db), library(org.Mm.eg.db))
## prepare data ----
gene_enrich <- gene_diff_sig |>
dplyr::filter(signif == 'up') |>
dplyr::select(gene_id) |>
pull(gene_id)
## KEGG enrich ----
kk <- enrichKEGG(
gene = gene_enrich,
organism = ifelse(Organism == 'Human', 'hsa', 'mmu'), # human: hsa, mouse: mmu
pvalueCutoff = 0.05,
# qvalueCutoff = 2,
)
kk_result_p <- subset(kk@result, pvalue < 0.05)
View(kk_result_p)
# write.csv(kk_result_p, file = '_.csv')
# kk_result_p <- read.csv(file = '_.csv', row.names = 1)
# KEGG pathway classification ----
KEGG_class <- readr::read_rds('./KEGG_classification_25_07_22.rds')
kk_result_p <- dplyr::inner_join(KEGG_class, kk_result_p, by = c("Pathway" = "Description"))
kk_result_p$Class2 <- factor(kk_result_p$Class2, levels = unique(kk_result_p$Class2))
## calculate fold enrichment (enrichment score) ----
# fold_enrichment = GeneRatio / BgRatio.
kk_result_p$fold_enrichment <-
apply(kk_result_p, 1, function(x) {
GeneRatio <- eval(parse(text = x['GeneRatio']))
BgRatio <- eval(parse(text = x['BgRatio']))
return(round(GeneRatio / BgRatio, 2))
})
# delete suffix ' - Mus musculus (house mouse)' when organism is mouse
if(Organism == 'Mouse') {
kk_result_p$Description <- kk_result_p$Description |> stringr::str_sub(end = -30L)
}
## remove disease-related terms ----
tmp <- kk_result_p[stringr::str_detect(kk_result_p$Class2, 'disease', negate = T), ]
tmp <- tmp |> dplyr::arrange(fold_enrichment) |>
filter(Descroption != 'Alcoholism')
## bar plot ----
# w/o class2
ggplot(tmp,
aes(
x = reorder(Descroption, -pvalue),
y = -log10(pvalue),
fill = fold_enrichment
)) +
geom_bar(stat = "identity", width = 0.8) +
scale_fill_gradient(low = "#95cefa",
high = "#234a6d",
name = 'fold_enrichment') +
coord_flip() + theme_bw() +
theme(panel.grid = element_blank())
if(F) {
ggsave(
filename = '_.pdf',
width = 200,
height = 100,
units = 'mm',
dpi = 300
)
}基因集富集分析(GSEA)
GSEA需要一个值为log2FC,names属性为gene id的向量,而且log2FC为降序排列。
# 4. GSEA ----
f <- file.path('_.xls')
gene_diff <- readr::read_tsv(f)
gene_diff
## choose organism ----
Organism <- 'Mouse' # 'Human' or 'Mouse'
ifelse(Organism == 'Human', library(org.Hs.eg.db), library(org.Mm.eg.db))
## select columns for GSEA ----
gene_gsea <- gene_diff |>
dplyr::select(gene_id, gene_symbol, log2FC) |>
dplyr::arrange(desc(log2FC))
gene_list <- gene_gsea$log2FC
names(gene_list) <- gene_gsea$gene_id
## GSEA KEGG ----
res_KEGG <- gseKEGG(
gene_list,
organism = ifelse(Organism == 'Human', 'hsa', 'mmu'),
pvalueCutoff = 0.05,
pAdjustMethod = "BH"
)
GSEA_result <- subset(res_KEGG@result, pvalue < 0.05)
View(GSEA_result)
# readr::write_csv(GSEA_result, file = '_.csv')
## ridge plot ----
ridgeplot(
res_KEGG,
showCategory = 20,
fill = "p.adjust",
#填充色 "pvalue", "p.adjust", "qvalue"
core_enrichment = TRUE,
#是否只使用 core_enriched gene
label_format = 30,
orderBy = "NES",
decreasing = FALSE
) +
theme(axis.text.y = element_text(size = 8))
## line chart & hits plot ----
# select a specific term
n <- 1
gsea_description <- res_KEGG@result$Description[n]
if(Organism == 'Mouse') {
gsea_description <- gsea_description |> stringr::str_sub(end = -30L)
}
gseaplot2(
res_KEGG,
geneSetID = n,
title = gsea_description,
subplots = 1:2, # choose subplots, including 1 line chart, 2 hits plot and 3 ?
base_size = 10
)
if(F) {
ggsave(
filename = paste0('_', gsea_description, '.pdf'),
width = 100,
height = 80,
units = 'mm',
dpi = 300
)
}