## read GEO files and create a seurat object
if(FALSE) {
# read given counts
f <- file.path('~/data/GSE165318/GSE165318_raw_count.single.cell.txt.gz')
raw_counts <- read.delim(f)
sce <- CreateSeuratObject(counts = raw_counts)
# read given normalized data
f <- file.path('~/data/GSE165318/GSE165318_normalized.single.cell.txt.gz')
norm_data <- read.delim(f)
sce <- SetAssayData(object = sce, assay = "RNA", layer = "data", new.data = norm_data)
# read given metadata
f <- file.path('~/data/GSE165318/GSE165318_meta.data.single.cell.csv.gz')
metadata <- data.table::fread(f)
head(metadata)
colnames(metadata) <- c('cell_id', 'nGene', 'nUMI', 'sample', 'mito_pct', 'trt_grp', 'cluster_55')
head(metadata)
sce <- AddMetaData(object = sce, metadata = metadata)
# delete used variables and free up memory
rm(raw_counts, norm_data, metadata); gc()
}Single Cell Analysis
读取非标准文件制作 Seurat 对象
既提供 raw counts 又提供 normalized data 时可将二者合并到同一个 Seurat 对象中,先用CreateSeuratObject()指定 counts 创建一个 Seurat 变量,再用SetAssayData()向其加入新数据并指定存放的 layer。metadata 可在创建 Seurat 变量时指定,也可用AddMetaData()加入。以GSE165318为例。
有时10X标准文件被放在同一个文件夹里,没有分开存放于独立的文件夹,需要提取样本名依次读取,再进行合并。以GSE291670为例。
f <- file.path('~/data/GSE291670/')
if(F) {
setwd(f)
files <- list.files(f)
samples <- files |> stringr::str_sub(start = 1L, end = 10L) |> base::unique()
tictoc::tic()
seu_list <- list()
for (sample in samples) {
print(paste0('Reading ', sample))
barcodes <- files |> stringr::str_subset(sample) |> stringr::str_subset('barcodes')
features <- files |> stringr::str_subset(sample) |> stringr::str_subset('features')
matrix <- files |> stringr::str_subset(sample) |> stringr::str_subset('matrix')
Mtx <- ReadMtx(
cells = barcodes,
features = features,
mtx = matrix,
)
seu <- CreateSeuratObject(counts = Mtx, project = sample)
seu_list <- append(seu_list, seu)
}
tictoc::toc()
seu <- merge(x=seu_list[[1]], y=seu_list[-1])
rm(seu_list, Mtx, barcodes, features, matrix, files, samples)
gc()
seu <- JoinLayers(seu)
readr::write_rds(seu, file = file.path(f, 'GSE291670.rds'))
}
seu <- readr::read_rds(file = file.path(f, 'GSE291670.rds'))Doublet Finder 双细胞检测
DoubletFinder can only be applied to a single sample and before batch effect correcting. Existing doublets will have a negative impact on sample integrity.
There are two R packages named DoubletFinder and chris-mcginnis-ucsf/DoubletFinder is preferred.
doubletFinder() must be performed after FindClusters().
# DoubletFinder ----
sweep.res <- paramSweep(seu, PCs = 1:10, sct = FALSE)
sweep.stats <- summarizeSweep(sweep.res, GT = FALSE)
bcmvn <- find.pK(sweep.stats)
## choose the pK with the highest BCmetric
best.pK <- bcmvn$pK[which.max(bcmvn$BCmetric)] |> as.character() |> as.numeric()
## determine the number of doublets
nExp <- round(ncol(seu) * 0.08)
## estimate the proportion of homotypic doublet
homotypic.prop <- modelHomotypic(seu$seurat_clusters)
nExp.adj <- round(nExp * (1 - homotypic.prop))
## run DoubletFinder first time
seu <- doubletFinder(seu, PCs = 1:10, pN = 0.25, pK = best.pK, nExp = nExp,
reuse.pANN = NULL, sct = FALSE)
## second run
seu <- doubletFinder(seu, PCs = 1:10, pN = 0.25, pK = best.pK, nExp = nExp.adj,
reuse.pANN = "pANN_0.25_0.02_2325", sct = FALSE)
DimPlot(seu, group.by = "DF.classifications_0.25_0.02_2094", reduction = "umap")
seu_singlet <- subset(seu,
subset = DF.classifications_0.25_0.02_2094 == "Singlet")Correct Batch Effect 矫正批次效应
批次效应产生的原因由很多种,如不同的实验者,不同的实验仪器、实验环境等等。这些差异不能被纳入细胞与细胞间的生物学差异,所以应当被矫正(“去除批次效应”的说法是不科学的)。有时批次效应的矫正亦称数据整合,不能将此简单理解为数据合并。
矫正批次效应的目标是尽可能使不同的样本均匀地分布到各个细胞类型中,也就是说不存在某个类型的细胞只含有某些样本而不含有其它样本。如果有一个由5个患者组织组成的数据集,分群后发现fibroblast只来自1号病人而不来自其他4个病人,显然是不合理的。批次效应的矫正就是为了去除这种不合理性。
但是需要注意,肿瘤细胞本身具有异质性,即使是矫正批次效应后仍然会存在某群肿瘤细胞只包含部分患者来源的细胞,另一群肿瘤细胞包含其余患者来源的细胞。这种情况不能认为是批次效应矫正不充分,也不能加大矫正力度强行使肿瘤细胞群涵盖所有患者,否则会掩盖肿瘤异质性,造成矫正过度。
SCTransform
Tumor vasculature at single-cell resolution, Nature, July 2024
Molecular programs of regional specification and neural stem cell fate progression in macaque telencephalon, Science, October 2023
两篇文章使用SCTransform进行Normalize,随后不再单独进行FindVariable。也就是使用SCTransform可以直接进行归一化(NormalizeData)、计算高变基因(FindVariableFeatures)和标准化(ScaleData)。
SCTransform使用3000个HVGs和30个PCs,比FindVariableFeatures默认的2000个HVGs更多,因此更能提取出单个细胞间微弱的生物学差异。两者没有绝对的优劣之分,建议两者都尝试。
锚定点CCA算法
Satija课题组于2019年在Cell发表。Seurat官网
可以按照样本的来源矫正批次效应,也可以按照处理组或者数据的文章来源矫正批次效应。根据整合的结果进行更改。常用按照样本进行矫正。
## CCA integrate
ifnb <- IntegrateLayers(
object = ifnb,
method = CCAIntegration,
orig.reduction = "pca",
new.reduction = "integrated.cca",
verbose = FALSE)基因集评分
Seurat::AddModuleScore()runs on log-normalized data and requires ‘data’ layer in the active assay.
Taking p53 pathway as an example:
library(msigdbr)
# download all hallmark genes from MSigDB
if(F) {
MSigDB <- msigdbr(species = "mouse", collection = "H")
readr::write_rds(MSigDB, file = '~/data/MSigDB/species_mouse_collection_H.rds')
}
MSigDB <- readr::read_rds(file = '~/data/MSigDB/species_mouse_collection_H.rds')
## retrieve p53 pathway gene list
genes <- MSigDB |> filter(gs_name == 'HALLMARK_P53_PATHWAY') |>
select(gene_symbol) |> unique()
## score
## NOTION: AddModuleScore() runs on normalized data and requires 'data' layer in the active assay
stem_cell <- AddModuleScore(stem_cell, features = list(genes), name = "p53_Hallmark")
# boxplot
stem_cell[[]] |>
ggplot(aes(x = group, y = p53_Hallmark1, fill = group)) +
geom_boxplot(outlier.size = 0.5) +
theme_classic() +
labs(y = "p53 pathway score", title = 'CBC')scanpy.tl.score_genes()implements the same functionality as Seurat::AddModuleScore() and runs on log-normalized data as well. It will add a new column to adata.obs, with the name specified by score_name.
adata.layers["counts"] = adata.X.copy()
sc.pp.normalize_total(adata)
sc.pp.log1p(adata)
gene_sets = [
"KRT86", "ENTPD1", "TNFRSF18"
]
sc.tl.score_genes(
adata,
gene_list=gene_sets,
score_name="gene_set_score",
ctrl_size=50,
use_raw=False
# 'use_raw' must be explicitly declared as false
)
adataPseudobulk Analysis
使用pseudobulk法将counts数按照患者为单位进行汇总,再根据分组信息计算差异基因。
在 Seurat 体系下有多个包可以做 pseudobulk 分析,也可使用 R 自带的 rowSum() 函数汇总后用 DESeq2 包计算差异基因。
注意此处的基因过滤标准:exprSet <- exprSet[apply(exprSet, 1, function(x) sum(x > 1) > 1), ] 目的是去掉几乎不表达的基因,只保留那些在 ≥2 个 pseudobulk 样本中表达量 >1 的基因。此处过滤基因而不是样本,有些 pseudobulk 样本可能因为 counts 含量太少也应当去除,但此处暂不予考虑。
# pseudobulk
# calculate DEGs between E and NE in iCAF
iCAF <- subset(CAF, idents = c(3, 5))
## rowSum() method
bs <- split(colnames(iCAF), iCAF$patient_id)
names(bs)
ct <- do.call(cbind, lapply(names(bs), function(x) {
# x=names(bs)[[1]]
kp = colnames(iCAF) %in% bs[[x]]
rowSums(as.matrix(iCAF@assays$RNA$counts[, kp]))
}))
colnames(ct) <- names(bs)
exprSet <- exprSet[apply(ct, 1, function(x)
sum(x > 1) > 1), ]
dim(exprSet)
head(exprSet)
## DESeq2::DESeq()
phe <- unique(iCAF@meta.data[, c('patient_id', 'expansion')])
group_list <- phe[match(names(bs), phe$patient_id), 'expansion']
table(group_list)
# Step 1: create DESeq2 object
colData <- data.frame(row.names = colnames(exprSet), group_list = group_list)
dds <- DESeqDataSetFromMatrix(countData = exprSet, colData = colData, design = ~ group_list)
# Step 2: perform differential expression analysis
dds2 <- DESeq(dds)
# Step 3: 提取差异分析结果,E组比NE组
# 注意contrast顺序,此写法得到的结果中log2fc>0代表在E组高表达
tmp <- results(dds2, contrast = c("group_list", "E", "NE"))
DEG_DESeq2 <- as.data.frame(tmp[order(tmp$pvalue), ])
head(DEG_DESeq2)在 scanpy 体系下可以使用 decoupler 模块做 pseudobulk 分析。同样使用 DESeq2 计算差异基因。dc.pp.filter_samples(pdata, min_cells=10, min_counts=1000) 表明剔除细胞数少于 10 和 counts 数小于 1000 的样本。注意此处剔除样本而不是基因。
import decoupler as dc
from pydeseq2.dds import DeseqDataSet, DefaultInference
from pydeseq2.ds import DeseqStats
import seaborn as sns
import matplotlib.pyplot as plt
pdata = dc.pp.pseudobulk(
target_cells,
sample_col='cohort',
groups_col=['treatment'],
mode='sum'
)
# 剔除细胞数少于 10 和 counts 数小于 1000 的样本
dc.pp.filter_samples(pdata, min_cells=10, min_counts=1000)
pdata
dc.pl.filter_samples(
adata=pdata,
groupby=["response_group_binary"],
min_cells=10,
min_counts=1000,
figsize=(5, 3),
)计算差异基因
def run_deseq2_R_NR(pdata_sub):
inference = DefaultInference(n_cpus=8)
dds = DeseqDataSet(
adata=pdata_sub,
design="~response_group_binary",
refit_cooks=True,
inference=inference,
quiet=True
)
# Compute LFCs
dds.deseq2()
# Extract contrast between conditions
stat_res = DeseqStats(dds, contrast=["response_group_binary", "R", "NR"], inference=inference)
# Compute Wald test
stat_res.summary()
results_df = stat_res.results_df
return dds, results_df
treatments = pdata.obs['treatment'].unique()
treatments
dds_list = {}
de_results = {}
for t in treatments:
pdata_sub = pdata[pdata.obs['treatment'] == t].copy()
print(f"Running DE for treatment: {t}")
dds_list[t], de_results[t] = run_deseq2_R_NR(pdata_sub)
print(f'Treatment: {t} completed.')查看目的基因
for t in treatments:
print(f'-------- {t} --------')
print(de_results[t].loc[['GENE']])
print('')绘制箱线图
plot_df_list = {}
for t in treatments:
# 提取所有基因在每个患者中的 DESeq2 标准化表达量
norm_counts = pd.DataFrame(
dds_list[t].layers["normed_counts"],
index=dds_list[t].obs_names,
columns=dds_list[t].var_names
)
# 提取目的基因表达量
gene_expr = norm_counts[['GENE']].copy()
meta = dds_list[t].obs[['response_group_binary', 'treatment', 'cohort']]
# 合并分组信息用于绘图
plot_df_list[t] = (
gene_expr
.join(meta)
.reset_index()
.rename(columns={'index': 'sample'})
)
# 绘图
plt.figure()
sns.boxplot(
data=plot_df_list[t],
x='response_group_binary',
y='GENE',
showfliers=False
)
sns.stripplot(
data=plot_df_list[t],
x='response_group_binary',
y='GENE',
color='black',
size=5,
jitter=True
)
plt.ylabel('Normalized expression')
plt.xlabel(t)
plt.title('GENE')
fc = de_results[t].loc['GENE', 'log2FoldChange']
pvalue = de_results[t].loc['GENE', 'pvalue']
plt.text(
x=0.5,
y=plot_df_list[t]['GENE'].max() * 1.1,
s=f'log2FC = {fc:.3f}\npvalue = {pvalue:.3f}',
ha='center'
)
# plt.tight_layout()
print(f'-------- {t} --------')
plt.show()