Enrichment Analysis

本文包括ORA分析和GSEA分析,均以最常用的KEGG为例。

library(dplyr)
library(clusterProfiler)
library(enrichplot)

过代表分析(ORA)

## choose organism ----
Organism <- 'Mouse' # 'Human' or 'Mouse'
ifelse(Organism == 'Human', library(org.Hs.eg.db), library(org.Mm.eg.db))

## prepare data ----
gene_enrich <- gene_diff_sig |> 
  dplyr::filter(signif == 'up') |> 
  dplyr::select(gene_id) |> 
  pull(gene_id)

## KEGG enrich ----
kk <- enrichKEGG(
  gene = gene_enrich,
  organism = ifelse(Organism == 'Human', 'hsa', 'mmu'), # human: hsa, mouse: mmu
  pvalueCutoff = 0.05,
  # qvalueCutoff = 2,
)
kk_result_p <- subset(kk@result, pvalue < 0.05)
View(kk_result_p)
# write.csv(kk_result_p, file = '_.csv')
# kk_result_p <- read.csv(file = '_.csv', row.names = 1)

# KEGG pathway classification ----
KEGG_class <- readr::read_rds('./KEGG_classification_25_07_22.rds')
kk_result_p <- dplyr::inner_join(KEGG_class, kk_result_p, by = c("Pathway" = "Description"))
kk_result_p$Class2 <- factor(kk_result_p$Class2, levels = unique(kk_result_p$Class2))

## calculate fold enrichment (enrichment score) ----
# fold_enrichment = GeneRatio / BgRatio. 
kk_result_p$fold_enrichment <-
  apply(kk_result_p, 1, function(x) {
    GeneRatio <- eval(parse(text = x['GeneRatio']))
    BgRatio <- eval(parse(text = x['BgRatio']))
    return(round(GeneRatio / BgRatio, 2))
  })

# delete suffix ' - Mus musculus (house mouse)' when organism is mouse
if(Organism == 'Mouse') {
  kk_result_p$Description <- kk_result_p$Description |> stringr::str_sub(end = -30L)
}

## remove disease-related terms ----
tmp <- kk_result_p[stringr::str_detect(kk_result_p$Class2, 'disease', negate = T), ]
tmp <- tmp |> dplyr::arrange(fold_enrichment) |>
  filter(Descroption != 'Alcoholism')

## bar plot ----
# w/o class2
ggplot(tmp,
       aes(
         x = reorder(Descroption, -pvalue),
         y = -log10(pvalue),
         fill = fold_enrichment
       )) +
  geom_bar(stat = "identity", width = 0.8) +
  scale_fill_gradient(low = "#95cefa",
                      high = "#234a6d",
                      name = 'fold_enrichment') +
  coord_flip() + theme_bw() +
  theme(panel.grid = element_blank())

if(F) {
  ggsave(
    filename = '_.pdf',
    width = 200,
    height = 100,
    units = 'mm',
    dpi = 300
  )
}

基因集富集分析(GSEA)

GSEA需要一个值为log2FC,names属性为gene id的向量,而且log2FC为降序排列。

# 4. GSEA ----
f <- file.path('_.xls')
gene_diff <- readr::read_tsv(f)
gene_diff

## choose organism ----
Organism <- 'Mouse' # 'Human' or 'Mouse'
ifelse(Organism == 'Human', library(org.Hs.eg.db), library(org.Mm.eg.db))

## select columns for GSEA ----
gene_gsea <- gene_diff |>
  dplyr::select(gene_id, gene_symbol, log2FC) |> 
  dplyr::arrange(desc(log2FC))

gene_list <- gene_gsea$log2FC
names(gene_list) <- gene_gsea$gene_id

## GSEA KEGG ----
res_KEGG <- gseKEGG(
  gene_list,
  organism = ifelse(Organism == 'Human', 'hsa', 'mmu'),
  pvalueCutoff = 0.05,
  pAdjustMethod = "BH"
)

GSEA_result <- subset(res_KEGG@result, pvalue < 0.05)
View(GSEA_result)
# readr::write_csv(GSEA_result, file = '_.csv')

## ridge plot ----
ridgeplot(
  res_KEGG,
  showCategory = 20,
  fill = "p.adjust",
  #填充色 "pvalue", "p.adjust", "qvalue"
  core_enrichment = TRUE,
  #是否只使用 core_enriched gene
  label_format = 30,
  orderBy = "NES",
  decreasing = FALSE
) +
  theme(axis.text.y = element_text(size = 8))

## line chart & hits plot ----
# select a specific term
n <- 1
gsea_description <- res_KEGG@result$Description[n]

if(Organism == 'Mouse') {
  gsea_description <- gsea_description |> stringr::str_sub(end = -30L)
}

gseaplot2(
  res_KEGG,
  geneSetID = n,
  title = gsea_description,
  subplots = 1:2, # choose subplots, including 1 line chart, 2 hits plot and 3 ?
  base_size = 10
)

if(F) {
  ggsave(
    filename = paste0('_', gsea_description, '.pdf'),
    width = 100,
    height = 80,
    units = 'mm',
    dpi = 300
  )
}