1.文件准备 B1.txt B2.txt(群体的分类文件,后续的分析结果是B2相对于B1的选择结果位点)
2.pi结果计算(linux):
#输入文件
vcftools --vcf clean.vcf \
#指定窗口 步长
--window-pi 100000 --window-pi-step 10000 \
#使用的参考文件
--keep B1.txt \
#输出文件
--out piPi.B1
#同上
vcftools --vcf clean.vcf \
--window-pi 100000 --window-pi-step 10000 \
--keep B2.txt \
--out piPi.B2
3.Fst结果计算(linux):
#Fst 群体间多样性差异
vcftools --vcf clean.vcf \
--fst-window-size 100000 \
--fst-window-step 10000 \
--weir-fst-pop B1.txt \
--weir-fst-pop B2.txt \
--out Fst.B1-B2
(窗口与步长)
4. "reference.fai"——(作图需要)
可以直接从上一步的结果文件pi.B2.windowed.pi里提取
# 读取pi数据文件
input_file <- "pi.B2.windowed.pi"
data <- read.table(input_file, header = TRUE)
# 获取染色体信息
chr_info <- aggregate(BIN_END ~ CHROM, data = data, max)
chr_info <- chr_info[order(chr_info$CHROM), ]
# 创建FAI文件内容
fai_content <- data.frame(
CHROM = chr_info$CHROM,
LENGTH = chr_info$BIN_END,
OFFSET = 0, # 这个值在这里不重要,我们设为0
LINEBASES = 0, # 这个值在这里不重要,我们设为0
LINEWIDTH = 0 # 这个值在这里不重要,我们设为0
)
# 写入FAI文件
write.table(fai_content, file = "reference.fai", quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE)
5.Pi单独作图
setwd('E:/pyProjects/zc/zc_select_sweep1')
# Load required libraries
library(ggplot2)
# Set parameters
input_file <- "piPi.B2.windowed.pi"
fai_file <- "reference.fai"
output_prefix <- "pi.B2"
height <- 8
width <- 24
lab_size <- 16
axis_size <- 16
title_size <- 24
# Other fixed parameters
threshold <- 0.05
point_size <- 1
line_size <- 0.2
# Read data
rawdata <- read.table(input_file, header = TRUE)
rawdata <- rawdata[, c("CHROM", "BIN_START", "BIN_END", "PI")]
colnames(rawdata) <- c("chr", "start", "end", "value")
# Read FAI file
refl <- read.table(fai_file, header = FALSE)
colnames(refl) <- c("chr", "length", "offset", "linebases", "linewidth")
# Filter data based on chromosomes in FAI file
rawdata <- rawdata[rawdata$chr %in% refl$chr, ]
# Calculate chromosome positions
chrlen <- refl$length
names(chrlen) <- refl$chr
blank <- 0.005 * sum(chrlen)
chrpos <- cumsum(c(0, chrlen[-length(chrlen)] + blank))
names(chrpos) <- names(chrlen)
# Prepare plot data
plot.data <- data.frame(
chr = as.character(rawdata$chr),
pos = rawdata$start + chrpos[rawdata$chr],
value = rawdata$value
)
plot.data$chr <- factor(plot.data$chr, levels = refl$chr, ordered = TRUE)
# Calculate threshold
threshold_value <- quantile(rawdata$value, threshold)
# Create plot
p <- ggplot(plot.data, aes(x = pos, y = value, colour = chr)) +
geom_point(size = point_size) +
scale_colour_manual(values = rep(c("#eb65a0", "#22c2e4", "#4abb6b", "#f28d21"), 100)) +
labs(x = "Chromosome", y = expression(theta[pi])) +
scale_x_continuous(
expand = c(0.001, 0),
breaks = sapply(split(plot.data$pos, plot.data$chr), mean),
labels = levels(plot.data$chr)
) +
geom_hline(yintercept = threshold_value, linetype = 2, size = 0.2) +
theme_classic() +
theme(
axis.text = element_text(size = axis_size),
axis.text.x = element_text(angle = 65, vjust = 1, hjust = 1, color = "black"),
axis.text.y = element_text(color = "black"),
axis.title = element_text(size = lab_size),
plot.title = element_text(size = title_size, hjust = 0.5),
legend.position = 'none'
) +
scale_y_continuous(expand = c(0, 0))
# Save plots
ggsave(paste0(output_prefix, "_manhattan.png"), p, height = height, width = width, dpi = 300, units = "in")
# Save selected regions
topdata <- subset(rawdata, value <= threshold_value)
write.table(topdata, file = paste0(output_prefix, "_selected_region_top", threshold, ".txt"),
quote = FALSE, sep = "\t", row.names = FALSE)
print("Plot and data files have been saved.")
6.Fst+Pi联合作图与分析
setwd("E:/pyProjects/zc/zc_select_sweep1/analysis_w500000_s50000")
library(RColorBrewer)
# Set parameters
#前三步的结果文件
fst_file <- "results_Fst.B1-B2.windowed.weir.fst"
pi1_file <- "results_piPi.B1.windowed.pi"
pi2_file <- "results_piPi.B2.windowed.pi"
pop1_name <- "wild"
pop2_name <- "cultivated"
threshold <- 0.05
use_zscore <- TRUE
use_log2 <- TRUE
output_prefix <- "fst-pi.B1-B2"
output_format <- "png"
# Read data
fst <- read.table(fst_file, sep="\t", header=TRUE, comment.char="")
fst <- data.frame(CHROM=fst$CHROM, BIN_START=fst$BIN_START, BIN_END=fst$BIN_END, FST=fst$WEIGHTED_FST)
pop1 <- read.table(pi1_file, sep="\t", header=TRUE, comment.char="")
pop1 <- data.frame(CHROM=pop1$CHROM, BIN_START=pop1$BIN_START, BIN_END=pop1$BIN_END, PIA=pop1$PI)
pop2 <- read.table(pi2_file, sep="\t", header=TRUE, comment.char="")
pop2 <- data.frame(CHROM=pop2$CHROM, BIN_START=pop2$BIN_START, BIN_END=pop2$BIN_END, PIB=pop2$PI)
# Merge data
d <- merge(fst, pop1, by=c("CHROM", "BIN_START", "BIN_END"))
data <- merge(d, pop2, by=c("CHROM", "BIN_START", "BIN_END"))
data$PIR <- data$PIA / data$PIB
# Set labels
xlab <- bquote(pi[.(pop1_name)]/pi[.(pop2_name)])
ylab <- expression(F[st])
# Calculate cutoffs
fst_cut <- 0
pi_up_cut <- 0
pi_down_cut <- 0
if(use_zscore) {
M <- mean(data$FST)
S <- sd(data$FST)
data$FST <- (data$FST - M) / S
ylab <- expression(z(F[st]))
cat("z Fst quantile:\n", quantile(data$FST, c(0.8, 0.90, 0.95, 0.99)), "\n")
fst_cut <- quantile(data$FST, 1 - threshold)
fst_legend <- c(bquote(z(F[st]) < .(round(fst_cut, 2))),
bquote(z(F[st]) >= .(round(fst_cut, 2))),
expression("Cumulative (%)"))
} else {
cat("Fst quantile:\n", quantile(data$FST, c(0.8, 0.90, 0.95, 0.99)), "\n")
fst_cut <- quantile(data$FST, 1 - threshold)
fst_legend <- c(bquote(F[st] < .(round(fst_cut, 2))),
bquote(F[st] >= .(round(fst_cut, 2))),
expression("Cumulative (%)"))
}
if(use_log2) {
xlab <- bquote(log[2](pi[.(pop1_name)]/pi[.(pop2_name)]))
data$PIR <- log2(data$PIR)
cat("log2 Pi ratio(popa/popb) quantile:\n", quantile(data$PIR, c(0.8, 0.90, 0.95, 0.99)), "\n")
pi_up_cut <- quantile(data$PIR, 1 - threshold)
pi_down_cut <- quantile(data$PIR, threshold)
pi_legend <- c(substitute(paste(down, "<", log2(pi[ratio]), "<", up),
list(up=round(pi_up_cut, 2), down=round(pi_down_cut, 2))),
bquote(log2(pi[ratio]) <= .(round(pi_down_cut, 2))),
bquote(log2(pi[ratio]) >= .(round(pi_up_cut, 2))),
expression("Cumulative (%)"))
} else {
cat("Pi ratio(popa/popb) quantile:\n", quantile(data$PIR, c(0.8, 0.90, 0.95, 0.99)), "\n")
pi_up_cut <- quantile(data$PIR, 1 - threshold)
pi_down_cut <- quantile(data$PIR, threshold)
pi_legend <- c(substitute(paste(down, "<", pi[ratio], "<", up),
list(up=round(pi_up_cut, 2), down=round(pi_down_cut, 2))),
bquote(pi[ratio] <= .(round(pi_down_cut, 2))),
bquote(pi[ratio] >= .(round(pi_up_cut, 2))),
expression("Cumulative (%)"))
}
# Set colors
mycol <- brewer.pal(8, "Set1")
palette(c(mycol[3], "#000000BB", mycol[2], "grey", mycol[5]))
# Identify selected regions
data$is_select <- NA
data$is_select[data$FST >= fst_cut & data$PIR >= pi_up_cut] <- 1 # pop B selected region
data$is_select[data$FST >= fst_cut & data$PIR <= pi_down_cut] <- 3 # pop A selected region
data$is_select[is.na(data$is_select)] <- 2
# Output selected regions
out.df <- subset(data, is_select == 1 | is_select == 3)
out.df$is_select[out.df$is_select == 1] <- paste0(pop2_name, " selected region")
out.df$is_select[out.df$is_select == 3] <- paste0(pop1_name, " selected region")
out.df$is_select[out.df$is_select == 2] <- "not selected region"
write.table(out.df, file=paste0(output_prefix, ".selected.region.txt"),
sep="\t", row.names=FALSE, quote=FALSE)
write.table(data, file=paste0(output_prefix, ".merged.all.txt"),
sep="\t", row.names=FALSE, quote=FALSE)
# Create plot
if (output_format == "pdf") {
pdf(file=paste0(output_prefix, ".pdf"), width=8, height=8)
} else {
png(filename=paste0(output_prefix, ".png"), width=2400, height=2400, res=300)
}
# Set up layout
layout(matrix(c(2,0,1,3), 2, 2, byrow=TRUE), widths=c(5,2), heights=c(2,5), TRUE)
# Main scatter plot
par(mar=c(5.1, 4.1, 0, 0))
plot(data$FST ~ data$PIR, pch=21,
col=data$is_select, bg=data$is_select,
xlim=c(floor(min(data$PIR)), ceiling(max(data$PIR))),
ylim=c(floor(min(data$FST)), ceiling(max(data$FST))),
xlab=xlab,
ylab=ylab,
cex=0.5)
legend("bottomright", legend=c(paste0(" Selected region (", pop1_name, ")"),
paste0(" Selected region (", pop2_name, ")")),
pch=21, col=c(3,1), pt.bg=c(3,1), bty="n", pt.cex=1.5, adj=c(0, 0.5))
abline(h=fst_cut, col=2, lty=2, lwd=1.5)
abline(v=pi_up_cut, col=2, lty=2, lwd=1.5)
abline(v=pi_down_cut, col=2, lty=2, lwd=1.5)
# Upper PI histogram
par(mar=c(0, 4.1, 3, 0))
xhist <- hist(data$PIR, breaks=seq(from=floor(min(data$PIR)), to=ceiling(max(data$PIR)), length.out=100),
plot=FALSE)
xbar.col <- rep(4, 100)
xbar.col[xhist$mids < pi_down_cut] <- 3
xbar.col[xhist$mids > pi_up_cut] <- 1
hist(data$PIR, breaks=seq(from=floor(min(data$PIR)), to=ceiling(max(data$PIR)), length.out=100),
ann=FALSE, axes=TRUE, xaxt="n", col=xbar.col, border=xbar.col)
ec <- ecdf(data$PIR)
lines(x=xhist$mids, y=ec(xhist$mids)*max(xhist$counts), col='black', lwd=2)
axis(4, at=seq(from=0, to=max(xhist$counts), length.out=6), labels=seq(0, 100, 20), col='black', col.axis='black')
mtext(side=4, line=3, 'Cumulative (%)', col='black', cex=0.8, adj=1)
mtext(side=2, line=3, 'Counts', col='black', cex=0.8)
abline(v=pi_up_cut, col=2, lty=2, lwd=1.5)
abline(v=pi_down_cut, col=2, lty=2, lwd=1.5)
legend("bottomright", legend=pi_legend, lty=1, lwd=1.5, col=c(4,3,1,"black"), bty="n")
# Right FST histogram
par(mar=c(5.1, 0, 0, 3))
yhist <- hist(data$FST, breaks=seq(from=floor(min(data$FST)), to=ceiling(max(data$FST)), length.out=100),
plot=FALSE)
ybar.col <- rep(4, 100)
ybar.col[yhist$mids > fst_cut] <- 5
barplot(yhist$counts, horiz=TRUE, space=0, axes=TRUE, col=ybar.col, main="", border=ybar.col)
axis(3, at=seq(from=0, to=max(yhist$counts), length.out=6), labels=seq(0, 100, 20), col='black', col.axis='black')
ec <- ecdf(data$FST)
lines(y=(yhist$mids-min(data$FST))/(max(data$FST)-min(data$FST))*100,
x=ec(yhist$mids)*max(yhist$counts), col='black', lwd=2)
mtext(side=3, line=3, 'Cumulative (%)', col='black', cex=0.8, padj=1, adj=1)
mtext(side=1, line=3, 'Counts', col='black', cex=0.8)
abline(h=(fst_cut-min(data$FST))/(max(data$FST)-min(data$FST))*100-0.5, col=2, lty=2, lwd=1.5)
legend("bottomright", legend=fst_legend, lty=1, lwd=1.5, col=c(4,5,"black"), bty="n")
dev.off()
cat("Analysis complete. Output files have been generated.\n")
重合高执行度选中区间文件:
fst-pi.B1-B2.selected.region.txt