library(survival)
library(survivalROC)
library(survminer)
library(dplyr)
td
# columns: 4
# Patient_ID, Gene, Status, Time
sROC <- survivalROC(
Stime = td$Time,
status = td$Status,
marker = td$Gene,
predict.time = 24,
# 想要考察的时间段
method = "KM"
)
## calculate Yoden index
result <- sROC |>
mutate(Youden = TP - FP) |>
slice_max(order_by = Youden, n = 1, with_ties = FALSE) |>
mutate(
max_Youden = Youden,
cut.value = cut.values,
.keep = "none"
)
print(
paste0(
"最大约登指数 ", result$max_Youden,
",此时基因表达量为 ", result$cut.value
)
)
td <- td |> mutate(LowerThanCutOff = Gene <= result$cut.value) |> count(LowerThanCutOff)
fit <- survfit(data = td , Surv(time, Status) ~ LowerThanCutOff)
# 查看生存率50%对应的时间
# summary(fit)$table[, "median"]
p_value <- pairwise_survdiff(data = td, Surv(time, Status) ~ LowerThanCutOff)
p_value$p.value
# KM plot ----
p <- ggsurvplot(
fit,
data = td,
pval = TRUE,
pval.method = F,
pval.coord = c(0.05, 0.05), # p value location
pval.size = 5,
conf.int = TRUE, # 置信区间
risk.table = F, # 添加风险表
legend = c(0.8, 0.9), # legend location
legend.title = "Expression",
legend.labs = c("High", "Low"),
font.legend = 14,
font.main = c(16, "bold", "darkblue"),
font.tickslab = 12,
ylab = "Survival",
font.x = 16,
font.y = 16,
ggtheme = theme_survminer(), # 更改ggplot2的主题
palette = c("red", "cyan3") #定义颜色
)
p
## save plot ----
# KM plot
filename <- paste0( format(Sys.time(), "%H_%M_%S"), '_KM_plot.pdf')
ggsave(plot = p$plot, filename = filename, width = 225, height = 200, units = 'mm')
# risk table
filename <- paste0(format(Sys.time(), "%H_%M_%S"), '_risk_table.pdf')
ggsave(plot = p$table, filename = filename, width = 225, height = 50, units = 'mm')Statistics
生存分析
现有患者某基因表达量、患者生存状态和生存时间,要求用约登指数最大时的基因表达量作为cutoff值进行生存分析。
四格表卡方检验
有任意一个期望频数小于5时应使用Fisher精确检验。
不推荐用Excel做卡方检验,因为需要手动计算理论频数和判断是否需要进行连续性矫正。
df <- matrix(c(28, 34, 9, 3), nrow = 2, ncol = 2)
result <- chisq.test(df)
if (any(result$expected < 5)) {
print('Fisher 精确检验')
fisher.test(df) # fisher 精确检验
} else {
result
}方差分析
当有来自三个或以上总体的样本时,通常使用方差分析方法以检验总体均值是否相等。根据基础统计学第 14 版第 12 章的汽车碰撞数据,判断越大的车是否越安全。表中列出了四种不同车型对头部损伤测量的结果。
compact <- c(253, 143, 124, 301, 422, 324, 258, 271, 467, 298, 315, 304)
mid <- c(117, 121, 204, 195, 186, 178, 157, 203, 132, 212, 229, 235)
full <- c(249, 90, 178, 114, 183, 87, 180, 103, 154, 129, 266, 338)
SUV <- c(121, 112, 261, 145, 198, 193, 193, 111, 276, 156, 213, 143)
HIC <- tibble::lst(compact, mid, full, SUV) |>
tibble::enframe(name = 'size', value = 'HIC') |>
tidyr::unnest_longer(HIC)
HIC# A tibble: 48 × 2
size HIC
<chr> <dbl>
1 compact 253
2 compact 143
3 compact 124
4 compact 301
5 compact 422
6 compact 324
7 compact 258
8 compact 271
9 compact 467
10 compact 298
# ℹ 38 more rows
进行单因素方差分析。单因素指的是用一个因素(因子、处理)对数据进行分类,这里是车型。
model <- aov(HIC ~ size, data = HIC)
summary(model) Df Sum Sq Mean Sq F value Pr(>F)
size 3 115887 38629 7.685 0.000309 ***
Residuals 44 221159 5026
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
可以发现 p<0.05。但是这仅能拒绝总体均值不全相等的假设,但不能得出具体哪一个总体的均值与其它均值不同的结论。也就说目前还不知道哪种车型的 HIC 与其它不同。
既然我们想知道哪种车型与其它车型有差异,为什么不对样本两两比较?因为每次检验都使用 0.05 的显著性水平,那么 6 次比较之后总体置信水平就低至 0.056=0.735。也就是说增加了犯第一类风险的错误,明明没有差异,但我们认为其中一次检验有差异。而方差分析方法通过仅使用一次检验来避免该风险。
但是我们仍然想知道究竟哪一组均值与其它均值不同,此时可以使用多重比较检验,也就是仍然两两比较但是对其进行校正,以克服显著性水平随着检验次数增加而增加的问题。校正方法有多种,以邦费罗尼校正(Bonferroni Correction)为例。
pairwise.t.test(HIC$HIC, HIC$size, p.adjust.method = "bonferroni")
Pairwise comparisons using t tests with pooled SD
data: HIC$HIC and HIC$size
compact full mid
full 0.0012 - -
mid 0.0029 1.0000 -
SUV 0.0019 1.0000 1.0000
P value adjustment method: bonferroni
也可以使用更专业的统计分析包进行计算。
library(emmeans)Warning: package 'emmeans' was built under R version 4.5.3
Welcome to emmeans.
Caution: You lose important information if you filter this package's results.
See '? untidy'
model <- lm(HIC ~ size, data = HIC)
marginal_means <- emmeans(model, ~ size)
pairs(marginal_means, adjust = "bonferroni") contrast estimate SE df t.ratio p.value
compact - full 117.42 28.9 44 4.057 0.0012
compact - mid 109.25 28.9 44 3.775 0.0029
compact - SUV 113.17 28.9 44 3.910 0.0019
full - mid -8.17 28.9 44 -0.282 1.0000
full - SUV -4.25 28.9 44 -0.147 1.0000
mid - SUV 3.92 28.9 44 0.135 1.0000
P value adjustment: bonferroni method for 6 tests
非参数检验
“非参数检验”是一个误导性的术语,会让人误以为该检验不需要参数,但实际上有些非参数检验还是需要用到参数的,比如中位数。参数检验要求样本服从某一特定的总体分布,而非参数检验则不涉及有关总体分布的参数。因此,非参数检验又被称为“不受分布限制检验”。此处介绍符号秩检验和秩和检验。
Wilcoxon 符号秩检验与 Mann-Whitney U 检验(又称 Wilcoxon 秩和检验)的主要区别在于数据结构:前者适用于配对样本(related / paired samples)分析前后差异,后者适用于独立样本(independent samples)比较组间差异。两者均是非参数检验,不要求数据满足正态分布。
这两种检验均通过 wilcox.test() 函数实现。
wilcox.test(value ~ group, data = my_data, paired = _)IC50
library(ggplot2)
### this is the data ###
# data from four experiments
conc <- c(5.00E-07, 1.00E-06, 1.00E-05,
5.00E-07, 1.00E-06, 5.00E-06, 1.00E-05, 2.00E-05,
5.00E-07, 1.00E-06, 2.50E-06, 5.00E-06, 1.00E-05,
5.00E-07, 1.00E-06, 2.50E-06, 5.00E-06, 1.00E-05)
dead.cells <- c(34.6, 47.7, 81.7,
37.6, 55.7, 89.1, 84.3, 85.2,
34.4, 46.1, 76.2, 84.3, 84.1,
24.5, 26.1, 60.6, 82.7, 87)
# transform the data to make it postive and put into a data frame for fitting
data <- as.data.frame(conc) # create the data frame
data$dead.cells <- dead.cells
data$nM <- data$conc * 1000000000
data$log.nM <- log10(data$nM)
### fit the data ###
# make sure logconc remains positive, otherwise multiply to keep positive values
# (such as in this example where the iconc is multiplied by 1000
fit <- nls(
dead.cells ~ bot + (top - bot) / (1 + (log.nM / LD50) ^ slope),
data = data,
start = list(
bot = 20,
top = 95,
LD50 = 3,
slope = -12
)
)
m <- coef(fit)
val <- format((10 ^ m[3]), dig = 1)
### ggplot the results ###
p <- ggplot(data = data, # specify the data frame with data
aes(x = nM, y = dead.cells)) + # specify x and y
geom_point() + # make a scatter plot
scale_x_log10(breaks = c(500, 1000, 2500, 5000, 10000, 20000)) +
xlab("Drug concentration (nM)") + # label x-axis
ylab("Dead cells (% of cells)") + # label y-axis
ggtitle("Drug Dose Response and LD50") + # add a title
theme_bw() + # a simple theme
expand_limits(y = c(20, 100)) # customise the y-axis
# Add the line to graph using methods.args (New: Jan 2016)
p <- p + geom_smooth(
method = "nls",
method.args = list(
formula = y ~ bot + (top - bot) / (1 + (x / LD50) ^ slope),
start = list(
bot = 20,
top = 95,
LD50 = 3,
slope = -12
)
),
se = FALSE
)
# Add LD50 to the graph.
p <- p +
annotate(geom = "text", x = 7000, y = 60, label = "LD50(nM): ", color = "red") +
annotate(geom = "text", x = 9800, y = 60, label = val, color = "red")
p # show the plot