library(dplyr)
library(clusterProfiler)
library(enrichplot)
Enrichment Analysis
本文包括ORA分析和GSEA分析,均以最常用的KEGG为例。
过代表分析(ORA)
## choose organism ----
<- 'Mouse' # 'Human' or 'Mouse'
Organism ifelse(Organism == 'Human', library(org.Hs.eg.db), library(org.Mm.eg.db))
## prepare data ----
<- gene_diff_sig |>
gene_enrich ::filter(signif == 'up') |>
dplyr::select(gene_id) |>
dplyrpull(gene_id)
## KEGG enrich ----
<- enrichKEGG(
kk gene = gene_enrich,
organism = ifelse(Organism == 'Human', 'hsa', 'mmu'), # human: hsa, mouse: mmu
pvalueCutoff = 0.05,
# qvalueCutoff = 2,
)<- subset(kk@result, pvalue < 0.05)
kk_result_p View(kk_result_p)
# write.csv(kk_result_p, file = '_.csv')
# kk_result_p <- read.csv(file = '_.csv', row.names = 1)
# KEGG pathway classification ----
<- readr::read_rds('./KEGG_classification_25_07_22.rds')
KEGG_class <- dplyr::inner_join(KEGG_class, kk_result_p, by = c("Pathway" = "Description"))
kk_result_p $Class2 <- factor(kk_result_p$Class2, levels = unique(kk_result_p$Class2))
kk_result_p
## calculate fold enrichment (enrichment score) ----
# fold_enrichment = GeneRatio / BgRatio.
$fold_enrichment <-
kk_result_papply(kk_result_p, 1, function(x) {
<- eval(parse(text = x['GeneRatio']))
GeneRatio <- eval(parse(text = x['BgRatio']))
BgRatio return(round(GeneRatio / BgRatio, 2))
})
# delete suffix ' - Mus musculus (house mouse)' when organism is mouse
if(Organism == 'Mouse') {
$Description <- kk_result_p$Description |> stringr::str_sub(end = -30L)
kk_result_p
}
## remove disease-related terms ----
<- kk_result_p[stringr::str_detect(kk_result_p$Class2, 'disease', negate = T), ]
tmp <- tmp |> dplyr::arrange(fold_enrichment) |>
tmp filter(Descroption != 'Alcoholism')
## bar plot ----
# w/o class2
ggplot(tmp,
aes(
x = reorder(Descroption, -pvalue),
y = -log10(pvalue),
fill = fold_enrichment
+
)) geom_bar(stat = "identity", width = 0.8) +
scale_fill_gradient(low = "#95cefa",
high = "#234a6d",
name = 'fold_enrichment') +
coord_flip() + theme_bw() +
theme(panel.grid = element_blank())
if(F) {
ggsave(
filename = '_.pdf',
width = 200,
height = 100,
units = 'mm',
dpi = 300
) }
基因集富集分析(GSEA)
GSEA需要一个值为log2FC,names属性为gene id的向量,而且log2FC为降序排列。
# 4. GSEA ----
<- file.path('_.xls')
f <- readr::read_tsv(f)
gene_diff
gene_diff
## choose organism ----
<- 'Mouse' # 'Human' or 'Mouse'
Organism ifelse(Organism == 'Human', library(org.Hs.eg.db), library(org.Mm.eg.db))
## select columns for GSEA ----
<- gene_diff |>
gene_gsea ::select(gene_id, gene_symbol, log2FC) |>
dplyr::arrange(desc(log2FC))
dplyr
<- gene_gsea$log2FC
gene_list names(gene_list) <- gene_gsea$gene_id
## GSEA KEGG ----
<- gseKEGG(
res_KEGG
gene_list,organism = ifelse(Organism == 'Human', 'hsa', 'mmu'),
pvalueCutoff = 0.05,
pAdjustMethod = "BH"
)
<- subset(res_KEGG@result, pvalue < 0.05)
GSEA_result View(GSEA_result)
# readr::write_csv(GSEA_result, file = '_.csv')
## ridge plot ----
ridgeplot(
res_KEGG,showCategory = 20,
fill = "p.adjust",
#填充色 "pvalue", "p.adjust", "qvalue"
core_enrichment = TRUE,
#是否只使用 core_enriched gene
label_format = 30,
orderBy = "NES",
decreasing = FALSE
+
) theme(axis.text.y = element_text(size = 8))
## line chart & hits plot ----
# select a specific term
<- 1
n <- res_KEGG@result$Description[n]
gsea_description
if(Organism == 'Mouse') {
<- gsea_description |> stringr::str_sub(end = -30L)
gsea_description
}
gseaplot2(
res_KEGG,geneSetID = n,
title = gsea_description,
subplots = 1:2, # choose subplots, including 1 line chart, 2 hits plot and 3 ?
base_size = 10
)
if(F) {
ggsave(
filename = paste0('_', gsea_description, '.pdf'),
width = 100,
height = 80,
units = 'mm',
dpi = 300
) }