Enrichment Analysis

本文包括ORA分析和GSEA分析,均以最常用的KEGG为例。参考clusterProfiler说明文档

过代表分析(ORA)

ORA需输入一个基因ID向量。KEGG支持的ID包括Entrez ID等,如果使用蛋白质谱数据则需转换ID。

library(clusterProfiler)
library(dplyr)

## choose organism ----
Organism <- 'Human' # 'Human' or 'Mouse'
ifelse(Organism == 'Human', library(org.Hs.eg.db), library(org.Mm.eg.db))

## prepare data ----
data(geneList, package="DOSE")
gene <- names(geneList)[abs(geneList) > 2]

## KEGG enrich ----
kk <- enrichKEGG(
  gene = gene,
  organism = ifelse(Organism == 'Human', 'hsa', 'mmu'), # human: hsa, mouse: mmu
  pvalueCutoff = 0.05,
  # qvalueCutoff = 2,
)
kk_result_p <- subset(kk@result, pvalue < 0.05)
View(kk_result_p)
# write.csv(kk_result_p, file = '_.csv')
# kk_result_p <- read.csv(file = '_.csv', row.names = 1)

# delete suffix ' - Mus musculus (house mouse)' when organism is mouse
if(Organism == 'Mouse') {
  kk_result_p$Description <- kk_result_p$Description |> stringr::str_sub(end = -30L)
}

## remove specific terms ----
## remove disease-related terms
tmp <- kk_result_p[stringr::str_detect(kk_result_p$category, 'disease', negate = T), ]
## remove 'Alcoholism'
tmp <- tmp |> dplyr::arrange(fold_enrichment) |> filter(Description != 'Alcoholism')

## bar plot ----
ggplot(tmp,
       aes(
         x = reorder(Descroption, -pvalue),
         y = -log10(pvalue),
         fill = FoldEnrichment
       )) +
  geom_bar(stat = "identity", width = 0.8) +
  scale_fill_gradient(low = "#95cefa",
                      high = "#234a6d",
                      name = 'FoldEnrichment') +
  coord_flip() + theme_bw() +
  theme(panel.grid = element_blank())

if(F) {
  ggsave(
    filename = '_.pdf',
    width = 200,
    height = 100,
    units = 'mm',
    dpi = 300
  )
}

使用setReadable()函数将富集结果的GeneID列转换为Gene Symbol.

kk <- setReadable(kk, OrgDb = org.Hs.eg.db, keyType="ENTREZID")
## The geneID column is translated to symbol
head(kk, 3)

基因集富集分析(GSEA)

GSEA需要一个值为log2FC,names属性为gene id的向量,而且log2FC为降序排列。

library(clusterProfiler)
library(dplyr)
library(enrichplot)

# GSEA ----
f <- file.path('_.xls')
gene_diff <- readr::read_tsv(f)
gene_diff

## choose organism ----
Organism <- 'Mouse' # 'Human' or 'Mouse'
ifelse(Organism == 'Human', library(org.Hs.eg.db), library(org.Mm.eg.db))

## select columns for GSEA ----
gene_gsea <- gene_diff |>
  dplyr::select(gene_id, gene_symbol, log2FC) |> 
  dplyr::arrange(desc(log2FC))

gene_list <- gene_gsea$log2FC
names(gene_list) <- gene_gsea$gene_id

## GSEA KEGG ----
res_KEGG <- gseKEGG(
  gene_list,
  organism = ifelse(Organism == 'Human', 'hsa', 'mmu'),
  pvalueCutoff = 0.05,
  pAdjustMethod = "BH"
)

GSEA_result <- subset(res_KEGG@result, pvalue < 0.05)
View(GSEA_result)
# readr::write_csv(GSEA_result, file = '_.csv')

## ridge plot ----
ridgeplot(
  res_KEGG,
  showCategory = 20,
  fill = "p.adjust",
  #填充色 "pvalue", "p.adjust", "qvalue"
  core_enrichment = TRUE,
  #是否只使用 core_enriched gene
  label_format = 30,
  orderBy = "NES",
  decreasing = FALSE
) +
  theme(axis.text.y = element_text(size = 8))

## line chart & hits plot ----
# select a specific term
n <- 1
gsea_description <- res_KEGG@result$Description[n]

if(Organism == 'Mouse') {
  gsea_description <- gsea_description |> stringr::str_sub(end = -30L)
}

gseaplot2(
  res_KEGG,
  geneSetID = n,
  title = gsea_description,
  subplots = 1:2, # choose subplots, including 1 line chart, 2 hits plot and 3 ?
  base_size = 10
)

if(F) {
  ggsave(
    filename = paste0('_', gsea_description, '.pdf'),
    width = 100,
    height = 80,
    units = 'mm',
    dpi = 300
  )
}