# -*- coding: utf-8 -*-
# @Time : 2021/5/19
# @Author : Henrette_L
import os
import pandas as pd
import time
def _read_gzip_table(file_path, chunk_size):
    """Read a gzip-compressed, tab-separated table in chunks.

    Args:
        file_path: Path of the gzip file to read; the first row is the header.
        chunk_size: Number of rows parsed per chunk.

    Returns:
        A single DataFrame holding all rows, with a fresh 0..n-1 index.
    """
    reader = pd.read_table(
        file_path, sep='\t', compression='gzip', header=0, chunksize=chunk_size
    )
    try:
        chunks = list(reader)
    finally:
        # Release the underlying file handle even if parsing fails midway.
        reader.close()
    return pd.concat(chunks, ignore_index=True)


def _count_homozygous(values, allele):
    """Count the cells in *values* equal to the doubled allele ('A' -> 'AA').

    A non-string allele (for example NaN from an empty cell) cannot be
    doubled into a genotype string, so it counts as zero occurrences.
    """
    if not isinstance(allele, str):
        return 0
    return values.count(allele * 2)


def _add_site_ratios(df):
    """Append the four computed columns to *df* and return it.

    For every row, the cells from column 'REF' through column 'L9' are
    scanned and the following columns are added:

    * ``ref_number``   - cells equal to REF doubled (e.g. 'AA')
    * ``alt_number``   - cells equal to ALT doubled
    * ``低频位点比例``  - min(ref, alt) / (ref + alt), the low-frequency ratio
    * ``正常位点比例``  - (ref + alt) / number of sample columns

    Raises:
        KeyError: if the 'REF', 'ALT' or 'L9' column is missing.
    """
    region = df.loc[:, 'REF':'L9']  # columns that take part in the counting
    # The slice contains REF and ALT themselves, hence the "- 2".
    # NOTE(review): this assumes ALT lies between REF and L9 - confirm
    # against the real input layout.
    sample_count = region.shape[1] - 2

    ref_counts = []
    alt_counts = []
    low_freq_ratios = []
    normal_ratios = []
    rows = zip(df['REF'], df['ALT'], region.values.tolist())
    for ref, alt, values in rows:
        ref_number = _count_homozygous(values, ref)
        alt_number = _count_homozygous(values, alt)
        called = ref_number + alt_number
        ref_counts.append(ref_number)
        alt_counts.append(alt_number)
        # A row without any homozygous REF/ALT call has no defined ratio;
        # use 0.0 so the row is dropped by the later "> 0.05" filter
        # instead of aborting the run with a ZeroDivisionError.
        low_freq_ratios.append(
            min(ref_number, alt_number) / called if called else 0.0
        )
        normal_ratios.append(
            called / sample_count if sample_count > 0 else 0.0
        )

    df['ref_number'] = ref_counts
    df['alt_number'] = alt_counts
    df['低频位点比例'] = low_freq_ratios
    df['正常位点比例'] = normal_ratios
    return df


def main(path=None, filename=None):
    """Scan a directory of gzip TSV files and save the low-frequency sites.

    Every regular, non-hidden file in *path* is read as a gzip-compressed,
    tab-separated table. The four ratio columns are computed per row, rows
    whose low-frequency ratio is above 0.05 are kept, and the kept rows of
    all files are written to ``<filename>.csv`` (UTF-8 with BOM).

    Args:
        path: Directory holding the input files. Prompted for when None.
        filename: Output file name without the ".csv" suffix. Prompted for
            when None.
    """
    start = time.time()
    if path is None:
        path = input("请输入文件路径:")
    if filename is None:
        filename = input("请输入文件保存名称:")

    # Skip hidden entries (e.g. .DS_Store) and sub-directories, which are not
    # gzip tables; sort so the processing order is deterministic.
    file_list = sorted(
        name for name in os.listdir(path)
        if not name.startswith('.') and os.path.isfile(os.path.join(path, name))
    )

    # Smoke test: make sure the first file can be read chunk by chunk.
    print("**********测试读取大文件数据*********")
    for name in file_list[:1]:
        _read_gzip_table(os.path.join(path, name), 500)
        print("Iteration is stopped.")

    print("*****测试结束,开始正式读取******")
    selected = []
    for name in file_list:
        df = _read_gzip_table(os.path.join(path, name), 1000)
        print("**********开始计算四列数据***********")
        df = _add_site_ratios(df)
        print("************计算结束,开始筛选**************")
        # Keep only the rows whose low-frequency ratio is above 0.05.
        selected.append(df[df['低频位点比例'] > 0.05])

    # Accumulate the hits of every file (not just the last one). Empty
    # frames are left out of the concat; if nothing matched at all, the
    # first empty frame is kept so the CSV still carries the column header.
    non_empty = [frame for frame in selected if not frame.empty]
    if non_empty:
        res_df = pd.concat(non_empty, ignore_index=True)
    elif selected:
        res_df = selected[0]
    else:
        res_df = pd.DataFrame()

    print("*******正在保存数据******")
    res_df.to_csv(filename + ".csv", index=False, encoding='utf_8_sig')
    print("此次查找花费的时间是:{:.5f}秒".format(time.time() - start))
    print("结束")


if __name__ == "__main__":
    main()
# Source notebook: 数据分析处理.ipynb
# (Web page residue) Latest recommended article published 2021-06-26 12:17:39