功能介绍
以下脚本的功能是,针对某个特定的线粒体单倍群(例如,M9),分区域(地级市或者省)统计该单倍群以及该单倍群下游的所有单倍群的数量,然后把下游的单倍群累加到上游去,从而计算该地区的单倍群频率。
计算公式如下,以 M9单倍群的某个地区为例:
M
9
单倍群在四川省的频率
=
M
9
单倍群在四川省的数量
+
M
9
单倍群下游单倍群在四川省的数量
四川省所有单倍群的数量
M9单倍群在四川省的频率 = \frac{M9单倍群在四川省的数量+M9单倍群下游单倍群在四川省的数量}{四川省所有单倍群的数量}
M9单倍群在四川省的频率=四川省所有单倍群的数量M9单倍群在四川省的数量+M9单倍群下游单倍群在四川省的数量
效果展示
以省
以地级市
额外功能
该脚本会自动检查你的原始数据中不存在但是在完整的系统发育树中存在的单倍群,并打印到终端,方便快速检查。
例如,当我们想绘制 M9
单倍群的时候,针对我的数据,会打印:
The following haplogroups were not found in the dataset: E1, E2b1, E2a1, E2a2, E2a1a, (C16261T), E1a2a, E1a2a1, E1a2a2, E1a2a3, E1a2a4, E1a1b, E1a1c, E1a1b1, E1a1b2, E1a1b4, E1a1a1c, E1a1a1b1, E1a1a1b2, C150T, M9a1b1b, M9a1b1a1, M9a1a1c1b2, M9a1a1c1b1a1
准备工作
- 原始数据,格式如下所示:
Province,City,Haplogroup
四川省,德阳市,B4h
......
- 地图文件:包括
shp
和附带的数据文件,例如:
市(等积投影)_修正.shx
市(等积投影)_修正.prj
市(等积投影)_修正.dbf
市(等积投影)_修正.cpg
- 线粒体单倍群系统发育树,在文末给出。
代码1:以省为单位绘制
import pandas as pd
import matplotlib.pyplot as plt
import geopandas as gpd
from pypinyin import pinyin, Style
# 读取并解析单倍群列表文件
def parse_haplogroup_file(file_path):
with open(file_path, 'r', encoding='utf-8') as f:
haplogroup_lines = f.readlines()
haplogroup_tree = {}
stack = []
for line in haplogroup_lines:
stripped_line = line.strip()
if not stripped_line:
continue
indent_level = len(line) - len(stripped_line)
while len(stack) > 0 and stack[-1][1] >= indent_level:
stack.pop()
if stack:
parent = stack[-1][0]
haplogroup_tree.setdefault(parent, []).append(stripped_line)
stack.append((stripped_line, indent_level))
return haplogroup_tree
# 查找特定单倍群的所有下游单倍群
def find_all_descendants(haplogroup, haplogroup_tree):
descendants = []
stack = [haplogroup]
while stack:
current = stack.pop()
if current in haplogroup_tree:
children = haplogroup_tree[current]
descendants.extend(children)
stack.extend(children)
return descendants
# 替换下游单倍群为上游单倍群,并记录未找到的单倍群
def replace_haplogroups(data, haplogroup_name, haplogroup_tree):
descendants = find_all_descendants(haplogroup_name, haplogroup_tree)
found_haplogroups = set(data['Haplogroup'].unique())
missing_haplogroups = [hap for hap in descendants if hap not in found_haplogroups]
if missing_haplogroups:
print(f"The following haplogroups were not found in the dataset: {', '.join(missing_haplogroups)}")
data['Haplogroup'] = data['Haplogroup'].apply(lambda x: haplogroup_name if x in descendants else x)
return data
# 加载数据
file_path = '数据清洗整理.csv'
data = pd.read_csv(file_path)
# 加载并解析单倍群列表文件
haplogroup_file_path = '线粒体单倍群phylotree(version17).txt'
haplogroup_tree = parse_haplogroup_file(haplogroup_file_path)
# 获取单倍群在各省的数量
haplogroup_name = 'M9'
data_replaced = replace_haplogroups(data, haplogroup_name, haplogroup_tree)
# 过滤指定单倍群并统计各省数量
def count_haplogroup_by_province(haplogroup_name, data, haplogroup_tree):
descendants = find_all_descendants(haplogroup_name, haplogroup_tree)
filtered_data = data[data['Haplogroup'] == haplogroup_name]
province_counts = filtered_data['Province'].value_counts().reset_index()
province_counts.columns = ['Province', 'Count']
other_counts = data[~data['Haplogroup'].isin(descendants)].groupby('Province').size().reset_index(name='Total_Other')
province_counts = province_counts.merge(other_counts, on='Province', how='left')
return province_counts
province_counts = count_haplogroup_by_province(haplogroup_name, data_replaced, haplogroup_tree)
# 计算各省的频率
province_counts['Frequency'] = province_counts['Count'] / province_counts['Total_Other']
# 将中文省份名称转换为拼音
province_counts['Province_pinyin'] = province_counts['Province'].apply(lambda x: ''.join([item[0] for item in pinyin(x, style=Style.NORMAL)]))
# 加载中国省份地理数据
china_map = gpd.read_file('省面/省面.shp')
# 确认地理数据中的省份名称列
province_column = '省全名'
# 将地理数据中的省份名称转换为拼音
china_map['Province_pinyin'] = china_map[province_column].apply(lambda x: ''.join([item[0] for item in pinyin(x, style=Style.NORMAL)]))
# 合并统计数据和地理数据
china_map = china_map.merge(province_counts, left_on='Province_pinyin', right_on='Province_pinyin', how='left')
china_map = china_map.infer_objects() # 确保数据类型正确
china_map = china_map.fillna(0) # 填充缺失值
# 加载其他地理数据
buffer_map = gpd.read_file('省面/0-15KM缓冲国界.shp')
coastline_map = gpd.read_file('省面/海岸.shp')
south_china_sea_box_map = gpd.read_file('省面/南海附图框.shp')
#south_china_sea_box_small_map = gpd.read_file('省面/南海附图框南海诸岛小框.shp')
reef_map = gpd.read_file('省面/珊瑚礁.shp')
# 检查province_counts是否为空
if province_counts.empty:
print("Error: The haplogroup data is empty. Please check the input data and haplogroup name.")
else:
# 获取频率的最大值和最小值
max_frequency = province_counts['Frequency'].max()
min_frequency = province_counts['Frequency'].min()
# 绘制热图
fig, ax = plt.subplots(1, 1, figsize=(10, 18)) # 调整图形高度
china_map.boundary.plot(ax=ax, linewidth=0.03, color='#8B8B8B')
china_map.plot(column='Frequency', ax=ax, legend=True,
cmap='Spectral_r', # 使用配色
legend_kwds={'label': f"Haplogroup {haplogroup_name} Frequency by Province",
'orientation': "vertical"},
vmin=min_frequency, vmax=max_frequency) # 设置颜色条范围
# 绘制其他shp文件
buffer_map.plot(ax=ax, color='#8B8B7B', linewidth=0.5)
coastline_map.plot(ax=ax, color='#8B8B6B', linewidth=0.5)
south_china_sea_box_map.plot(ax=ax, color='#8B8B5B', linewidth=0.5)
#south_china_sea_box_small_map.plot(ax=ax, color='#8B8B4B', linewidth=0.5)
reef_map.plot(ax=ax, color='#8B8B3B', linewidth=0.5)
# 隐藏地图外框的横轴、纵轴的数字标尺
ax.set_xticks([])
ax.set_yticks([])
plt.title(f'Distribution of Haplogroup {haplogroup_name} in China by Province')
# 添加表格
table_data = province_counts[['Province_pinyin', 'Count', 'Frequency']].values
column_labels = ['Province', 'Count', 'Frequency']
table = plt.table(cellText=table_data,
colLabels=column_labels,
cellLoc='center',
loc='bottom',
bbox=[0, -1.4, 1, 1.2]) # 调整表格位置和大小
table.auto_set_font_size(False)
table.set_fontsize(8) # 调整字体大小
# 调整单元格的高度
cell_dict = table.get_celld()
for i in range(len(table_data) + 1): # 包括标题行
for j in range(len(column_labels)):
cell_dict[(i, j)].set_height(0.15) # 调整单元格高度
plt.subplots_adjust(left=0.2, bottom=0.4) # 调整图形布局
plt.show()
代码2:以地级市为单位绘制
其实逻辑实现基本一致,只是把省替换为市罢了。
import pandas as pd
import matplotlib.pyplot as plt
import geopandas as gpd
from pypinyin import pinyin, Style
# 读取并解析单倍群列表文件
def parse_haplogroup_file(file_path):
with open(file_path, 'r', encoding='utf-8') as f:
haplogroup_lines = f.readlines()
haplogroup_tree = {}
stack = []
for line in haplogroup_lines:
stripped_line = line.strip()
if not stripped_line:
continue
indent_level = len(line) - len(stripped_line)
while len(stack) > 0 and stack[-1][1] >= indent_level:
stack.pop()
if stack:
parent = stack[-1][0]
haplogroup_tree.setdefault(parent, []).append(stripped_line)
stack.append((stripped_line, indent_level))
return haplogroup_tree
# 查找特定单倍群的所有下游单倍群
def find_all_descendants(haplogroup, haplogroup_tree):
descendants = []
stack = [haplogroup]
while stack:
current = stack.pop()
if current in haplogroup_tree:
children = haplogroup_tree[current]
descendants.extend(children)
stack.extend(children)
return descendants
# 替换下游单倍群为上游单倍群,并记录未找到的单倍群
def replace_haplogroups(data, haplogroup_name, haplogroup_tree):
descendants = find_all_descendants(haplogroup_name, haplogroup_tree)
found_haplogroups = set(data['Haplogroup'].unique())
missing_haplogroups = [hap for hap in descendants if hap not in found_haplogroups]
if missing_haplogroups:
print(f"The following haplogroups were not found in the dataset: {', '.join(missing_haplogroups)}")
data['Haplogroup'] = data['Haplogroup'].apply(lambda x: haplogroup_name if x in descendants else x)
return data
# 加载数据
file_path = '数据清洗整理_最终修改.csv'
data = pd.read_csv(file_path)
# 加载并解析单倍群列表文件
haplogroup_file_path = '线粒体单倍群phylotree(version17).txt'
haplogroup_tree = parse_haplogroup_file(haplogroup_file_path)
# 获取单倍群在各市的数量
haplogroup_name = 'M7a1a1'
data_replaced = replace_haplogroups(data, haplogroup_name, haplogroup_tree)
# 过滤指定单倍群并统计各市数量
def count_haplogroup_by_city(haplogroup_name, data, haplogroup_tree):
descendants = find_all_descendants(haplogroup_name, haplogroup_tree)
filtered_data = data[data['Haplogroup'] == haplogroup_name]
city_counts = filtered_data['City'].value_counts().reset_index()
city_counts.columns = ['City', 'Count']
other_counts = data[~data['Haplogroup'].isin(descendants)].groupby('City').size().reset_index(name='Total_Other')
city_counts = city_counts.merge(other_counts, on='City', how='left')
return city_counts
city_counts = count_haplogroup_by_city(haplogroup_name, data_replaced, haplogroup_tree)
# 计算各市的频率
city_counts['Frequency'] = city_counts['Count'] / city_counts['Total_Other']
# 将中文城市名称转换为拼音
city_counts['City_pinyin'] = city_counts['City'].apply(lambda x: ''.join([item[0] for item in pinyin(x, style=Style.NORMAL)]))
# 加载中国地级市地理数据
china_map = gpd.read_file('雄鸡地级市/市(等积投影)_修正.shp')
# 确认地理数据中的城市名称列
city_column = '市'
# 将地理数据中的城市名称转换为拼音
china_map['City_pinyin'] = china_map[city_column].apply(lambda x: ''.join([item[0] for item in pinyin(x, style=Style.NORMAL)]))
# 合并统计数据和地理数据
china_map = china_map.merge(city_counts, left_on='City_pinyin', right_on='City_pinyin', how='left')
china_map = china_map.infer_objects() # 确保数据类型正确
china_map = china_map.fillna(0) # 填充缺失值
# 加载其他地理数据
multilinestring_map = gpd.read_file('省面/海岸.shp')
linestring_map = gpd.read_file('雄鸡地级市/中国轮廓线.shp')
# 检查city_counts是否为空
if city_counts.empty:
print("Error: The haplogroup data is empty. Please check the input data and haplogroup name.")
else:
# 获取频率的最大值和最小值
max_frequency = city_counts['Frequency'].max()
min_frequency = city_counts['Frequency'].min()
# 绘制热图
fig, ax = plt.subplots(1, 1, figsize=(10, 18)) # 调整图形高度
china_map.boundary.plot(ax=ax, linewidth=0.03, color='#8B8B8B')
china_map.plot(column='Frequency', ax=ax, legend=True,
cmap='Spectral_r', # 使用配色
legend_kwds={'label': f"Haplogroup {haplogroup_name} Frequency by City",
'orientation': "vertical"}) # 删除vmin和vmax,使用默认值
# 绘制其他shp文件
#multilinestring_map.plot(ax=ax, color='#8B8B8B', linewidth=0.5)
linestring_map.plot(ax=ax, color='#8B8B8B', linewidth=0.5)
# 隐藏地图外框的横轴、纵轴的数字标尺
ax.set_xticks([])
ax.set_yticks([])
plt.title(f'Distribution of Haplogroup {haplogroup_name} in China by City')
# 添加表格
table_data = city_counts[['City_pinyin', 'Count', 'Frequency']].values
column_labels = ['City', 'Count', 'Frequency']
table = plt.table(cellText=table_data,
colLabels=column_labels,
cellLoc='center',
loc='bottom',
bbox=[0, -5, 1, 5]) # 调整表格位置和大小
table.auto_set_font_size(False)
table.set_fontsize(5) # 调整字体大小
# 调整单元格的高度
cell_dict = table.get_celld()
for i in range(len(table_data) + 1): # 包括标题行
for j in range(len(column_labels)):
cell_dict[(i, j)].set_height(0.15) # 调整单元格高度
plt.subplots_adjust(left=0.2, bottom=0.4) # 调整图形布局
plt.show()
线粒体单倍群系统发育树
该发育树通过 phylotree2016年版本构建
。
下载地址(有效期至2025年6月):
https://scientific-attach.oss-cn-chengdu.aliyuncs.com/%E9%99%84%E4%BB%B6/%E7%BA%BF%E7%B2%92%E4%BD%93%E5%8D%95%E5%80%8D%E7%BE%A4phylotree(version17).txt?OSSAccessKeyId=LTAI5tELp2DoTHGGZeQkMQmK&Expires=1749692016&Signature=uUB7fAEOfW2cC1TdPlLfMEa2wlM%3D