Pi+Fst群体进化与选择分析

zky___

已于 2024-09-25 10:19:17 修改

阅读量82

点赞数

文章标签：数据库 linux 前端

于 2024-09-21 12:40:01 首次发布

本文链接：https://blog.csdn.net/m0_69569041/article/details/142414738

版权

1.文件准备 B1.txt B2.txt（群体的分类文件，后续的分析结果是B2相对于B1的选择结果位点）

2.pi结果计算（linux）：


#输入文件
vcftools --vcf clean.vcf \
#指定窗口 步长
--window-pi 100000 --window-pi-step 10000 \
#使用的参考文件
--keep B1.txt \
#输出文件
--out piPi.B1

#同上
vcftools --vcf clean.vcf \
--window-pi 100000 --window-pi-step 10000 \
--keep B2.txt \
--out piPi.B2

3.Fst结果计算（linux）：


#Fst  群体间多样性差异
vcftools  --vcf clean.vcf \
--fst-window-size 100000 \
--fst-window-step 10000 \
--weir-fst-pop B1.txt \
--weir-fst-pop B2.txt \
--out  Fst.B1-B2

（窗口与步长）

4. "reference.fai"——（作图需要）

可以直接从上一步的结果文件pi.B2.windowed.pi里提取

# 读取pi数据文件
input_file <- "pi.B2.windowed.pi"
data <- read.table(input_file, header = TRUE)

# 获取染色体信息
chr_info <- aggregate(BIN_END ~ CHROM, data = data, max)
chr_info <- chr_info[order(chr_info$CHROM), ]

# 创建FAI文件内容
fai_content <- data.frame(
  CHROM = chr_info$CHROM,
  LENGTH = chr_info$BIN_END,
  OFFSET = 0,  # 这个值在这里不重要，我们设为0
  LINEBASES = 0,  # 这个值在这里不重要，我们设为0
  LINEWIDTH = 0  # 这个值在这里不重要，我们设为0
)

# 写入FAI文件
write.table(fai_content, file = "reference.fai", quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE)

5.Pi单独作图



setwd('E:/pyProjects/zc/zc_select_sweep1')
# Load required libraries
library(ggplot2)

# Set parameters
input_file <- "piPi.B2.windowed.pi"
fai_file <- "reference.fai"
output_prefix <- "pi.B2"
height <- 8
width <- 24
lab_size <- 16
axis_size <- 16
title_size <- 24

# Other fixed parameters
threshold <- 0.05
point_size <- 1
line_size <- 0.2

# Read data
rawdata <- read.table(input_file, header = TRUE)
rawdata <- rawdata[, c("CHROM", "BIN_START", "BIN_END", "PI")]
colnames(rawdata) <- c("chr", "start", "end", "value")

# Read FAI file
refl <- read.table(fai_file, header = FALSE)
colnames(refl) <- c("chr", "length", "offset", "linebases", "linewidth")

# Filter data based on chromosomes in FAI file
rawdata <- rawdata[rawdata$chr %in% refl$chr, ]

# Calculate chromosome positions
chrlen <- refl$length
names(chrlen) <- refl$chr
blank <- 0.005 * sum(chrlen)
chrpos <- cumsum(c(0, chrlen[-length(chrlen)] + blank))
names(chrpos) <- names(chrlen)

# Prepare plot data
plot.data <- data.frame(
  chr = as.character(rawdata$chr),
  pos = rawdata$start + chrpos[rawdata$chr],
  value = rawdata$value
)
plot.data$chr <- factor(plot.data$chr, levels = refl$chr, ordered = TRUE)

# Calculate threshold
threshold_value <- quantile(rawdata$value, threshold)

# Create plot
p <- ggplot(plot.data, aes(x = pos, y = value, colour = chr)) +
  geom_point(size = point_size) +
  scale_colour_manual(values = rep(c("#eb65a0", "#22c2e4", "#4abb6b", "#f28d21"), 100)) +
  labs(x = "Chromosome", y = expression(theta[pi])) +
  scale_x_continuous(
    expand = c(0.001, 0),
    breaks = sapply(split(plot.data$pos, plot.data$chr), mean),
    labels = levels(plot.data$chr)
  ) +
  geom_hline(yintercept = threshold_value, linetype = 2, size = 0.2) +
  theme_classic() +
  theme(
    axis.text = element_text(size = axis_size),
    axis.text.x = element_text(angle = 65, vjust = 1, hjust = 1, color = "black"),
    axis.text.y = element_text(color = "black"),
    axis.title = element_text(size = lab_size),
    plot.title = element_text(size = title_size, hjust = 0.5),
    legend.position = 'none'
  ) +
  scale_y_continuous(expand = c(0, 0))

# Save plots
ggsave(paste0(output_prefix, "_manhattan.png"), p, height = height, width = width, dpi = 300, units = "in")

# Save selected regions
topdata <- subset(rawdata, value <= threshold_value)
write.table(topdata, file = paste0(output_prefix, "_selected_region_top", threshold, ".txt"),
            quote = FALSE, sep = "\t", row.names = FALSE)

print("Plot and data files have been saved.")

6.Fst+Pi联合作图与分析



setwd("E:/pyProjects/zc/zc_select_sweep1/analysis_w500000_s50000")
library(RColorBrewer)

# Set parameters
#前三步的结果文件
fst_file <- "results_Fst.B1-B2.windowed.weir.fst"
pi1_file <- "results_piPi.B1.windowed.pi"
pi2_file <- "results_piPi.B2.windowed.pi"

pop1_name <- "wild"
pop2_name <- "cultivated"
threshold <- 0.05
use_zscore <- TRUE
use_log2 <- TRUE
output_prefix <- "fst-pi.B1-B2"
output_format <- "png"

# Read data
fst <- read.table(fst_file, sep="\t", header=TRUE, comment.char="")
fst <- data.frame(CHROM=fst$CHROM, BIN_START=fst$BIN_START, BIN_END=fst$BIN_END, FST=fst$WEIGHTED_FST)

pop1 <- read.table(pi1_file, sep="\t", header=TRUE, comment.char="")
pop1 <- data.frame(CHROM=pop1$CHROM, BIN_START=pop1$BIN_START, BIN_END=pop1$BIN_END, PIA=pop1$PI)

pop2 <- read.table(pi2_file, sep="\t", header=TRUE, comment.char="")
pop2 <- data.frame(CHROM=pop2$CHROM, BIN_START=pop2$BIN_START, BIN_END=pop2$BIN_END, PIB=pop2$PI)

# Merge data
d <- merge(fst, pop1, by=c("CHROM", "BIN_START", "BIN_END"))
data <- merge(d, pop2, by=c("CHROM", "BIN_START", "BIN_END"))
data$PIR <- data$PIA / data$PIB

# Set labels
xlab <- bquote(pi[.(pop1_name)]/pi[.(pop2_name)])
ylab <- expression(F[st])

# Calculate cutoffs
fst_cut <- 0
pi_up_cut <- 0
pi_down_cut <- 0

if(use_zscore) {
    M <- mean(data$FST)
    S <- sd(data$FST)
    data$FST <- (data$FST - M) / S
    ylab <- expression(z(F[st]))
    cat("z Fst quantile:\n", quantile(data$FST, c(0.8, 0.90, 0.95, 0.99)), "\n")
    fst_cut <- quantile(data$FST, 1 - threshold)
    fst_legend <- c(bquote(z(F[st]) < .(round(fst_cut, 2))),
                    bquote(z(F[st]) >= .(round(fst_cut, 2))),
                    expression("Cumulative (%)"))
} else {
    cat("Fst quantile:\n", quantile(data$FST, c(0.8, 0.90, 0.95, 0.99)), "\n")
    fst_cut <- quantile(data$FST, 1 - threshold)
    fst_legend <- c(bquote(F[st] < .(round(fst_cut, 2))),
                    bquote(F[st] >= .(round(fst_cut, 2))),
                    expression("Cumulative (%)"))
}

if(use_log2) {
    xlab <- bquote(log[2](pi[.(pop1_name)]/pi[.(pop2_name)]))
    data$PIR <- log2(data$PIR)
    cat("log2 Pi ratio(popa/popb) quantile:\n", quantile(data$PIR, c(0.8, 0.90, 0.95, 0.99)), "\n")
    pi_up_cut <- quantile(data$PIR, 1 - threshold)
    pi_down_cut <- quantile(data$PIR, threshold)
    pi_legend <- c(substitute(paste(down, "<", log2(pi[ratio]), "<", up),
                              list(up=round(pi_up_cut, 2), down=round(pi_down_cut, 2))),
                   bquote(log2(pi[ratio]) <= .(round(pi_down_cut, 2))),
                   bquote(log2(pi[ratio]) >= .(round(pi_up_cut, 2))),
                   expression("Cumulative (%)"))
} else {
    cat("Pi ratio(popa/popb) quantile:\n", quantile(data$PIR, c(0.8, 0.90, 0.95, 0.99)), "\n")
    pi_up_cut <- quantile(data$PIR, 1 - threshold)
    pi_down_cut <- quantile(data$PIR, threshold)
    pi_legend <- c(substitute(paste(down, "<", pi[ratio], "<", up),
                              list(up=round(pi_up_cut, 2), down=round(pi_down_cut, 2))),
                   bquote(pi[ratio] <= .(round(pi_down_cut, 2))),
                   bquote(pi[ratio] >= .(round(pi_up_cut, 2))),
                   expression("Cumulative (%)"))
}

# Set colors
mycol <- brewer.pal(8, "Set1")
palette(c(mycol[3], "#000000BB", mycol[2], "grey", mycol[5]))

# Identify selected regions
data$is_select <- NA
data$is_select[data$FST >= fst_cut & data$PIR >= pi_up_cut] <- 1    # pop B selected region
data$is_select[data$FST >= fst_cut & data$PIR <= pi_down_cut] <- 3  # pop A selected region
data$is_select[is.na(data$is_select)] <- 2

# Output selected regions
out.df <- subset(data, is_select == 1 | is_select == 3)
out.df$is_select[out.df$is_select == 1] <- paste0(pop2_name, " selected region")
out.df$is_select[out.df$is_select == 3] <- paste0(pop1_name, " selected region")
out.df$is_select[out.df$is_select == 2] <- "not selected region"

write.table(out.df, file=paste0(output_prefix, ".selected.region.txt"),
            sep="\t", row.names=FALSE, quote=FALSE)

write.table(data, file=paste0(output_prefix, ".merged.all.txt"),
            sep="\t", row.names=FALSE, quote=FALSE)

# Create plot
if (output_format == "pdf") {
    pdf(file=paste0(output_prefix, ".pdf"), width=8, height=8)
} else {
    png(filename=paste0(output_prefix, ".png"), width=2400, height=2400, res=300)
}

# Set up layout
layout(matrix(c(2,0,1,3), 2, 2, byrow=TRUE), widths=c(5,2), heights=c(2,5), TRUE)

# Main scatter plot
par(mar=c(5.1, 4.1, 0, 0))
plot(data$FST ~ data$PIR, pch=21,
     col=data$is_select, bg=data$is_select,
     xlim=c(floor(min(data$PIR)), ceiling(max(data$PIR))),
     ylim=c(floor(min(data$FST)), ceiling(max(data$FST))),
     xlab=xlab,
     ylab=ylab,
     cex=0.5)
legend("bottomright", legend=c(paste0(" Selected region (", pop1_name, ")"),
                               paste0(" Selected region (", pop2_name, ")")),
       pch=21, col=c(3,1), pt.bg=c(3,1), bty="n", pt.cex=1.5, adj=c(0, 0.5))
abline(h=fst_cut, col=2, lty=2, lwd=1.5)
abline(v=pi_up_cut, col=2, lty=2, lwd=1.5)
abline(v=pi_down_cut, col=2, lty=2, lwd=1.5)

# Upper PI histogram
par(mar=c(0, 4.1, 3, 0))
xhist <- hist(data$PIR, breaks=seq(from=floor(min(data$PIR)), to=ceiling(max(data$PIR)), length.out=100),
              plot=FALSE)
xbar.col <- rep(4, 100)
xbar.col[xhist$mids < pi_down_cut] <- 3
xbar.col[xhist$mids > pi_up_cut] <- 1
hist(data$PIR, breaks=seq(from=floor(min(data$PIR)), to=ceiling(max(data$PIR)), length.out=100),
     ann=FALSE, axes=TRUE, xaxt="n", col=xbar.col, border=xbar.col)
ec <- ecdf(data$PIR)
lines(x=xhist$mids, y=ec(xhist$mids)*max(xhist$counts), col='black', lwd=2)
axis(4, at=seq(from=0, to=max(xhist$counts), length.out=6), labels=seq(0, 100, 20), col='black', col.axis='black')
mtext(side=4, line=3, 'Cumulative  (%)', col='black', cex=0.8, adj=1)
mtext(side=2, line=3, 'Counts', col='black', cex=0.8)
abline(v=pi_up_cut, col=2, lty=2, lwd=1.5)
abline(v=pi_down_cut, col=2, lty=2, lwd=1.5)
legend("bottomright", legend=pi_legend, lty=1, lwd=1.5, col=c(4,3,1,"black"), bty="n")

# Right FST histogram
par(mar=c(5.1, 0, 0, 3))
yhist <- hist(data$FST, breaks=seq(from=floor(min(data$FST)), to=ceiling(max(data$FST)), length.out=100),
              plot=FALSE)
ybar.col <- rep(4, 100)
ybar.col[yhist$mids > fst_cut] <- 5
barplot(yhist$counts, horiz=TRUE, space=0, axes=TRUE, col=ybar.col, main="", border=ybar.col)
axis(3, at=seq(from=0, to=max(yhist$counts), length.out=6), labels=seq(0, 100, 20), col='black', col.axis='black')
ec <- ecdf(data$FST)
lines(y=(yhist$mids-min(data$FST))/(max(data$FST)-min(data$FST))*100,
      x=ec(yhist$mids)*max(yhist$counts), col='black', lwd=2)
mtext(side=3, line=3, 'Cumulative (%)', col='black', cex=0.8, padj=1, adj=1)
mtext(side=1, line=3, 'Counts', col='black', cex=0.8)
abline(h=(fst_cut-min(data$FST))/(max(data$FST)-min(data$FST))*100-0.5, col=2, lty=2, lwd=1.5)
legend("bottomright", legend=fst_legend, lty=1, lwd=1.5, col=c(4,5,"black"), bty="n")

dev.off()

cat("Analysis complete. Output files have been generated.\n")