发现美国毒株当中比中国多了16个基因片段,M片段未找到其作用,但是百度后发现在有些病毒中M片段可以降低毒性。另一个不知道怎么回事的问题是,当使用滑动窗口观察皮尔逊数值在不同切片长度变化的过程中,发现了随着长度加长,中美两株样例毒株的向量编码的标准差相关系数会逐渐趋于相关系数为0。
且在约200下标长度的时候,开始接近相关系数为0。
或者说,毒株的编码向量短时间内就出现了大幅的变动……
不知道该说什么……只有期望生物或者数学大佬能够有新的发现了。
后面将要尝试将信息编码以信息分析的方式进行探索。
import numpy as np
import matplotlib.pyplot as plt
import scipy
from scipy import stats as sst
path1 = '/Users/manmanzhang/Library/Mobile Documents/com~apple~CloudDocs/MyProject/InferenceSystem/src/I5_algorithm/武汉病毒基因组序列数据/MT259251.genome.fasta'
path2 = '/Users/manmanzhang/Library/Mobile Documents/com~apple~CloudDocs/MyProject/InferenceSystem/src/I5_algorithm/武汉病毒基因组序列数据/sequence(武汉病毒所发布的新冠状病毒全基因组序列).fasta'
def loadfile(path):
    """Read a FASTA file and return its lines with newline characters removed.

    The original lambda left the file handle open until garbage collection;
    a context manager closes it deterministically.
    """
    with open(path, 'r') as fh:
        return [line.replace('\n', '') for line in fh]
# %% [markdown]
# ## Initial info
# %%
# Read both FASTA files: first element is the header line, the rest are sequence lines.
info1 , info2 = loadfile(path1) , loadfile(path2)
# %% [markdown]
# ## Initial data
# %%
# Drop the FASTA header (">...") line, keeping only the sequence lines.
data1 , data2 = info1[1:],info2[1:]
# %%
# Length of the first sequence line of each genome (notebook-style inspection).
len(data1[0]),len(data2[0])
# %% [markdown]
# ## 字母转码
# %%
def data_to_number(data):
    """Encode each sequence string as a row of character code points."""
    encoded_rows = [[ord(ch) for ch in row] for row in data]
    return np.array(encoded_rows)
# %%
# Encode both genomes' sequence lines as code-point tables.
data_to_number1,data_to_number2 = data_to_number(data1),data_to_number(data2)
# Peek at the first ten code points of the first row of each genome.
data_to_number1[0][:10],data_to_number2[0][:10]
# %% [markdown]
# ## 循环皮尔逊系数
# %% [markdown]
# ## 并单组信息行并降维
# %%
def reduce_the_dimension_of_the_data(data_table):
    """Flatten a table of encoded rows into one 1-D array, row-major order."""
    flat = [value for row in data_table for value in row]
    return np.array(flat)
# %%
# Flatten each genome's table into one long 1-D code-point sequence.
reduce_dim1 , reduce_dim2 = reduce_the_dimension_of_the_data(data_to_number1) ,reduce_the_dimension_of_the_data(data_to_number2)
reduce_dim1[:10] , reduce_dim2[:10]
# %% [markdown]
# ## Information difference
# %%
# Symmetric difference of the two alphabets: characters present in exactly one
# genome, mapped from code point back to letter (e.g. the extra 'M').
info_cur_one_dim = {chr(i):i for i in set(reduce_dim1)^set(reduce_dim2)}
info_cur_one_dim
# %% [markdown]
# ## Information vector
# %%
# Union vocabulary of both genomes; order is set-iteration order, so it is
# stable within a run but not guaranteed across runs.
Uset = list(set(list(reduce_dim1))|set(list(reduce_dim2)))
def vector_of_info(reduce_dim):
    """Map each code point to its index in the global vocabulary `Uset`.

    Builds a value->index dict once per call instead of calling
    `Uset.index` per element, which made the encoding O(n*k) over the
    ~30k-element genomes. `Uset` comes from a set, so values are unique
    and the enumerate index equals `list.index`.
    """
    position = {value: index for index, value in enumerate(Uset)}
    return np.array([position[code] for code in reduce_dim])
# %%
# Encode both flattened genomes against the shared vocabulary.
vector_of_info1 ,vector_of_info2 = vector_of_info(reduce_dim1) ,vector_of_info(reduce_dim2)
# %%
# m1/m2 are shape tuples; maxm is the shorter genome's length
# (`cond and a or b` idiom — safe here because the lengths are nonzero).
m1 , m2 = vector_of_info1.shape , vector_of_info2.shape
maxm = m1<m2 and m1[0] or m2[0]
maxm,m1[0]-m2[0]
# %% [markdown]
# ## 基因片段对比结果美国基因片段的总数量要比中国多出16个片段
# %%
def data_to_mat(data):
    """Encode sequences and pad shorter rows with NaN into a rectangular matrix.

    Replaces the original string-building + eval('np.nan') hack with direct
    NaN padding. As before, the result dtype is float as soon as any row is
    padded, and int when all rows already have equal length.
    """
    numberList = data_to_number(data)
    maxrows = max(len(row) for row in numberList)
    padded = [list(row) + [np.nan] * (maxrows - len(row)) for row in numberList]
    return np.array(padded)
# %%
# Rectangular (NaN-padded) code matrices for both genomes.
mat1,mat2= data_to_mat(data1),data_to_mat(data2)
# %%
mat1.shape,mat2.shape
# %% [markdown]
# ## 均值函数
# %%
def myavg(vector):
    """Mean of a 1-D vector, computed as a dot product with the ones vector."""
    length = vector.shape[0]
    ones = np.ones(length)
    return np.dot(ones, vector) / length
# %%
# Sanity check: custom mean agrees with np.mean.
myavg(vector_of_info1),np.mean(vector_of_info1,axis=0),
# %% [markdown]
# ## 平方和函数
# %%
def myporw2(vector):
    """Sum of squares of a 1-D vector (self inner product)."""
    return vector @ vector
# %%
# Sum of squares of the first encoded genome.
myporw2(vector_of_info1)
# %% [markdown]
# ## 方差函数
# %%
def myvar(vector):
    """Population variance (ddof=0) of a 1-D vector.

    The mean is computed inline exactly as `myavg` does (dot with ones).
    """
    length = vector.shape[0]
    mean = np.dot(vector, np.ones(length)) / length
    centered = vector - mean
    return np.dot(centered, centered.T) / length
# %%
# Sanity check: custom variance agrees with np.var (population, ddof=0).
myvar(vector_of_info1),np.var(vector_of_info1)
# %% [markdown]
# ## 标准差函数
# %%
def mystd1(vector):
    """Population standard deviation: square root of myvar (ddof=0).

    The original also computed `m = vector.shape[0]` and never used it;
    the dead local is removed.
    """
    return np.sqrt(myvar(vector))
# %%
# Sanity check: custom std agrees with np.std (population, ddof=0).
mystd1(vector_of_info1),np.std(vector_of_info1)
# %% [markdown]
#
# %%
# Difference between the two genomes' standard deviations.
mystd1(vector_of_info1)-mystd1(vector_of_info2)
# %% [markdown]
# ## Unbiased standard deviation function
# %%
def mystd2(vector):
    # NOTE(review): despite the "unbiased" heading, this is identical to
    # mystd1 — myvar divides by m, not m-1, so no Bessel correction is
    # applied. Confirm whether ddof=1 was intended. `m` is computed but unused.
    m = vector.shape[0]
    return np.sqrt(myvar(vector))
# %%
mystd2(vector_of_info1),np.std(vector_of_info1,axis=0)
# %% [markdown]
# ## 协方差函数
# %%
def mycov(vector1, vector2):
    """Sample covariance (ddof=1) of two equal-length 1-D vectors.

    Each mean is computed inline exactly as `myavg` does (dot with ones).
    """
    n = vector1.shape[0]
    mean1 = np.dot(vector1, np.ones(n)) / n
    n2 = vector2.shape[0]
    mean2 = np.dot(vector2, np.ones(n2)) / n2
    return np.dot(vector1 - mean1, vector2 - mean2) / (n - 1)
# %%
# %%
# Custom sample covariance of the two genomes truncated to equal length.
mycov(vector_of_info1[:maxm],vector_of_info2[:maxm])
# %% [markdown]
# ## scipy returns the covariance matrix
# %%
# NOTE(review): scipy.cov was a deprecated alias of np.cov and has been
# removed from modern SciPy; this line only runs on old SciPy versions.
scipy.cov(vector_of_info1[:maxm],vector_of_info2[:maxm])
# %% [markdown]
# ## numpy returns the covariance matrix
# %%
np.cov(vector_of_info1[:10],vector_of_info2[:10])
# %% [markdown]
# ## 变异系数
# %%
def mycoe(vector):
    """Coefficient of variation: standard deviation divided by the mean."""
    spread = mystd2(vector)
    center = myavg(vector)
    return spread / center
# %%
# Coefficient of variation for each genome.
mycoe(vector_of_info1),mycoe(vector_of_info2)
# %% [markdown]
# ## 标准化/标准分数
# %%
def zscore(vector):
    """Standard score: center by the mean, scale by the standard deviation."""
    centered = vector - myavg(vector)
    return centered / mystd2(vector)
# %%
# Standard scores of the first genome.
zscore(vector_of_info1)
# %% [markdown]
# ## 相关系数
# %%
def pearson(vector1, vector2):
    """Pearson correlation coefficient via the computational (sum) formula."""
    n = vector1.shape[0]
    mean_product = vector1.sum() * vector2.sum() / n
    covariance_term = np.dot(vector1, vector2) - mean_product
    var_term1 = np.dot(vector1, vector1) - vector1.sum() ** 2 / n
    var_term2 = np.dot(vector2, vector2) - vector2.sum() ** 2 / n
    return covariance_term / np.sqrt(var_term1 * var_term2)
# %%
# Trim 17 vs 1 elements so both vectors end up the same length
# (the flattened genomes differ in length by 16).
vector_of_info1[:-17-1].shape,vector_of_info2[:-1-1].shape
# %% [markdown]
# ## 均方误差
# %%
def MSE(yhat, y):
    """Mean squared error between predictions `yhat` and targets `y`.

    The original computed np.dot(yhat - myavg(y), yhat - myavg(y)) —
    comparing predictions against the *mean* of y and omitting the 1/n
    normalisation — which is not the mean squared error that the name and
    the "均方误差" heading claim.
    """
    residual = yhat - y
    return np.dot(residual, residual) / y.shape[0]
# %% [markdown]
# ### $ 回归方程( X^{T}X )^{-1} X^{T} Y $
# %%
def EquationRegression(X, Y, predict=False):
    """Ordinary least squares via the normal equations (X'X) b = X'Y.

    :param X: (m, k) design matrix; an intercept column of ones is prepended.
    :param Y: (m, 1) target column vector (must be 2-D).
    :param predict: optional feature row (without intercept); when truthy,
                    also returns the prediction for that row.
    :return: fit coefficients, or (fit, prediction) when `predict` is given.

    Uses np.linalg.solve instead of forming the explicit inverse — same
    solution, better numerical behaviour.
    """
    xm, xn = X.shape
    ym, yn = Y.shape  # also validates that Y is 2-D
    newX = np.c_[np.ones(xm), X]
    fit = np.linalg.solve(np.dot(newX.T, newX), np.dot(newX.T, Y))
    if predict:
        predictX = np.dot(np.r_[np.ones(1), np.array(predict)], fit)
        return fit, predictX
    else:
        return fit
# %%
# Toy housing dataset: features (area, rooms) -> price, used to sanity-check
# the regression implementation above.
x = np.array([[62.47, 2.0], [65.78, 3.0], [58.05, 2.0], [52.09, 2.0], [74.98, 3.0], [55.87, 2.0], [90.66, 3.0], [113.68, 3.0], [97.92, 2.0], [46.33, 2.0], [134.55, 3.0], [151.15, 3.0], [63.01, 2.0], [65.66, 2.0], [108.81, 3.0], [66.19, 3.0], [54.1, 2.0], [73.44, 2.0], [51.78, 2.0], [92.42, 3.0], [59.13, 2.0], [49.49, 1.0], [51.68, 2.0], [52.87, 2.0], [69.46, 2.0], [76.41, 2.0], [63.1, 2.0], [197.37, 5.0], [93.53, 3.0], [91.35, 3.0], [103.49, 3.0], [45.12, 2.0], [59.59, 2.0], [174.66, 4.0], [35.8, 1.0], [91.35, 3.0], [55.07, 2.0], [119.44, 3.0], [65.85, 2.0], [72.05, 3.0], [85.98, 3.0], [103.29, 4.0], [184.05, 5.0], [90.87, 3.0], [38.83, 1.0], [51.65, 1.0], [50.14, 1.0]])
y = np.array([[213.0], [226.0], [179.0], [188.0], [215.0], [152.0], [290.0], [375.0], [305.0], [166.0], [385.0], [500.0], [195.0], [200.0], [310.0], [205.0], [158.0], [270.0], [150.0], [310.0], [180.0], [200.0], [155.0], [178.0], [303.0], [250.0], [218.0], [630.0], [326.0], [310.0], [530.0], [138.0], [230.0], [560.0], [115.0], [400.0], [140.0], [547.0], [240.0], [250.0], [315.0], [330.0], [680.0], [302.0], [130.0], [162.0], [140.0]])
# %%
EquationRegression(x,y)
# %% [markdown]
# ## 滑动皮尔逊系数
# %%
# Sliding Pearson over suffix-trimmed windows; trimming 17 vs 1 keeps the two
# vectors the same length for every offset i.
[pearson(vector_of_info1[:-17-i],vector_of_info2[:-1-i]) for i in range(vector_of_info1.shape[0] - vector_of_info2.shape[0])]
# %% [markdown]
# ## 滑动皮尔逊函数
# %%
def pearson_arr(alpha):
    """Pearson over growing prefixes: one value per step of size `alpha`."""
    window_count = vector_of_info2.shape[0] // alpha
    return np.array([pearson(vector_of_info1[:i + alpha], vector_of_info2[:i + alpha])
                     for i in range(window_count)])
# %%
# One growing-prefix Pearson series for every step size from 2 to 19999.
# NOTE(review): this is O(steps * genome_length) and very slow.
movepearson = [pearson_arr(i) for i in range(2,20000)]
# %% [markdown]
# ## Number of windows per sliding step
# %%
plt.plot([i.shape[0] for i in movepearson])
# %% [markdown]
# ## As the sliding step grows,
#
# %%
plt.figure(figsize=(16,9),facecolor='g')
plt.plot([mystd1(i) for i in movepearson],c='g')
# %%
# Number of sliding-window Pearson series computed above.
m = len(movepearson)
def gradient(alpha):
    """alpha minus std(series)^2 / m, for every sliding-Pearson series."""
    return [alpha - (mystd1(series) / m) * mystd1(series) for series in movepearson]
# %%
gradMovePreason = gradient(0.1)
# %%
plt.figure(figsize=(16,9))
plt.plot(gradMovePreason,c='r')
# Highlight values close to zero (threshold 0.01).
plt.plot([i for i in gradMovePreason if (i-0)<0.01 ] , c='y')
# %% [markdown]
# ## 拉动皮尔逊函数
# %%
def person_same(alpha):
    """Pearson over fixed-length windows of size `alpha`, shifted by start index."""
    count = vector_of_info2.shape[0] // alpha
    return np.array([pearson(vector_of_info1[start:start + alpha],
                             vector_of_info2[start:start + alpha])
                     for start in range(0, count)])
# %%
# Fixed-length window Pearson for every window length from 3 to 19999 (slow).
the_some_length = [person_same(i) for i in range(3,20000)]
# %%
plt.plot([mystd2(i) for i in the_some_length])
# %%
# comp = length of the shorter encoded genome (`and-or` conditional idiom).
comp = vector_of_info2.shape[0]<vector_of_info1.shape[0] and vector_of_info2.shape[0] or vector_of_info1.shape[0]
comp
# %%
def person_longht(alpha):
    """Pearson of growing prefixes, for every length from `alpha` up to `comp`."""
    return np.array([pearson(vector_of_info1[:length], vector_of_info2[:length])
                     for length in range(alpha, comp)])
# %%
# Whole-overlap Pearson: custom implementation vs scipy.stats.pearsonr.
pearson(vector_of_info1[:maxm],vector_of_info2[:maxm])
# %%
sst.pearsonr(vector_of_info1[:maxm],vector_of_info2[:maxm])
# %% [markdown]
# ## 相关系数矩阵
# %%
def myprson(vector):
    """Column-standardise a matrix, then return its row-wise correlation matrix."""
    col_mean = vector.mean(axis=0)
    col_std = vector.std(axis=0)
    standardized = (vector - col_mean) / col_std
    return np.corrcoef(standardized)
# %%
# Row-wise correlation matrix of the US genome matrix (last row dropped).
myprson(mat1[:-1])
# %% [markdown]
# ## k-means 函数
# %%
# NOTE: these re-imports are redundant (already imported above) but harmless.
import matplotlib.pyplot as plt
import numpy as np
import sys
sys.setrecursionlimit(10000)  # deep recursion needed by the recursive K_means below
np.set_printoptions(suppress=True)
def quarterback_method(vector):
    '''
    Quartile method: summarise and partition `vector` by its quartiles.
    :param vector: 1-D numeric array
    :return: ([lower_q, median, higher_q, IQR], [values below each cut, remaining top 25%])
    '''
    # NOTE(review): `interpolation=` is deprecated in NumPy >= 1.22 (renamed
    # to `method=`); kept as-is for compatibility with the NumPy version in use.
    lower_q = np.quantile(vector, 0.25, interpolation='lower') # lower quartile
    higher_q = np.quantile(vector, 0.75, interpolation='higher') # upper quartile
    middle_q = np.median(vector) # median
    int_r = higher_q - lower_q # interquartile range
    numbers = [lower_q, middle_q, higher_q, int_r] # quartile summary
    # Values strictly below each of the first three cut points (the IQR cut is dropped).
    data = [vector[vector < i] for i in numbers][:-1]
    # Remaining values: everything not below the median or the upper quartile.
    new_data = np.setdiff1d(vector, np.hstack((data[1], data[2])))
    return numbers, data + [new_data] # quartile stats and the partitioned data
def K_point(number):
    '''
    Pick `number` random starting centroids.
    NOTE(review): `data` is a free (global) variable that is never assigned at
    module level in this file — presumably bound elsewhere before this is
    called; verify against the caller.
    :param number: how many centroids to draw
    :return: list of centroid points
    '''
    K = lambda: data[np.random.randint(len(data))]
    return [np.array(K())[0] for i in range(number)]
def random_point(number):
    '''
    Pick `number` random starting centroids.
    NOTE(review): byte-identical duplicate of K_point above (same free-variable
    caveat about the global `data`); one of the two is likely redundant.
    :param number: how many centroids to draw
    :return: list of centroid points
    '''
    K = lambda: data[np.random.randint(len(data))]
    return [np.array(K())[0] for i in range(number)]
def euclidean_distance(pointA, pointB):
    '''
    Euclidean distance between two 2-D points.
    NOTE(review): the trailing `[0]` implies the operands are np.matrix rows
    (a matrix expression yields a 1xN result); plain 1-D ndarrays would make
    the final indexing fail on a 0-d array — confirm callers pass matrices.
    :param pointA: point (matrix row)
    :param pointB: point (matrix row)
    :return: scalar distance
    '''
    return np.array(np.sqrt(np.dot(np.power((pointA - pointB), 2), np.ones(2))))[0]
def mean_point(dataset):
    '''
    Centroid (column-wise mean) of an Nx2 np.mat dataset.
    :param dataset: Nx2 matrix of points
    :return: np.array([x_mean, y_mean])
    '''
    # Divide each coordinate by N first, then sum — algebraically the mean.
    x_mean = (np.squeeze(np.array(dataset[:,0:1]))/len(dataset)).sum()
    y_mean = (np.squeeze(np.array(dataset[:,-1:]))/len(dataset)).sum()
    return np.array([x_mean, y_mean])
def error_sum_of_squares(new_K, old_K):
    '''
    Sum of squared component differences between two centroid sets.
    :param new_K: updated centroids
    :param old_K: previous centroids
    :return: scalar squared-error
    '''
    squared_diff = (old_K - new_K) ** 2
    return squared_diff.sum()
def K_means(dataset, K_point, restart=0):
    '''Recursive K-means, squared-error-sum version.

    NOTE(review): the parameter `K_point` shadows the module-level function of
    the same name; the loop variable `kindex` is immediately rebound to a
    list; and the local `error_sum_of_squares` shadows the module-level
    helper. Termination requires the centroids to stop moving *exactly*
    (squared error == 0), hence the high recursion limit set above.
    '''
    #print('restart', restart)
    # For every point: (its coordinates, distances to each current centroid).
    expr1 = [(np.array(i)[0], [euclidean_distance(j, np.array(i)) for j in K_point]) for i in dataset]
    NK = -1
    classified_of_data = []
    new_K_point = []
    for kindex in range(len(K_point)):
        NK += 1
        kindex = []
        for i in range(len(expr1)):
            Name = expr1[i][-1].index(min(expr1[i][-1]))
            data = list(np.array(dataset[i])[0]) # index of the nearest centroid
            if Name == NK:
                kindex.append(data)
            else:
                pass
        new_K_point.append(mean_point(np.mat(kindex))) # new centroid for this cluster
        classified_of_data.append(np.mat(kindex))
    error_sum_of_squares = sum([((mean_point(i)-j)**2).sum() for i,j in zip(classified_of_data,K_point)]) # squared movement of the centroids
    if error_sum_of_squares == 0 : # zero movement -> converged; otherwise recurse
        print('Ok , K-means 平方误差和版')
        return np.array(new_K_point), classified_of_data
    else:
        restart += 1
        return K_means(dataset, new_K_point, restart)
def K_means_run(class_number, all_datas, drawing=False):
    '''
    Run K-means (squared-error version) and optionally plot the clusters.
    :param class_number: number of clusters K
    :param all_datas: dataset (matrix of points)
    :param drawing: when True, scatter-plot each cluster and its centroid
    :return: (centroids, list of per-cluster matrices)
    '''
    k_of_constant = K_point(class_number)
    result = K_means(all_datas, k_of_constant, 0)
    if drawing == True:
        for data, k in zip(result[-1], result[0]):
            # x/y columns of this cluster's points.
            x, y = np.squeeze(np.array(data[:, 0:1])), np.squeeze(np.array(data[:, -1:])) # ,
            plt.scatter(x, y)
            plt.scatter(k[0], k[-1], c='black', marker='x', alpha=1.0)
        plt.show()
        plt.close()
    else:
        pass
    return result
# %% [markdown]
# ## 数据标准化
# %%
# One summary value per sequence row: mean / (biased) std, paired with the
# row index, as (x, y) points for clustering.
y1 = np.array([myavg(i)/mystd2(i) for i in mat1])
x1 = np.arange(y1.shape[0])
data11 = np.mat(np.c_[x1,y1])[:-1]
data11[-10:]
# %%
plt.scatter(x1,y1,c='r')
# %%
y2 = np.array([myavg(i)/mystd2(i) for i in mat2])
x2 = np.arange(y2.shape[0])
data22 = np.mat(np.c_[x2,y2])[:-2]
data22[-10:]
# %%
plt.scatter(x2,y2,c='g')
# %% [markdown]
# ## 标准化后聚类
# %%
from sklearn.cluster import KMeans
# %%
def kmeans_building_from_sklearn(data,types_num,types_init = "k-means++"):
    """Cluster `data` with sklearn KMeans and regroup the rows by label.

    :param data: 2-D points to cluster
    :param types_num: number of clusters
    :param types_init: sklearn init strategy (default "k-means++")
    :return: (rows per cluster, centroids, boolean masks, inertia, n_iter)
    """
    kmeans_model = KMeans(n_clusters=types_num, init=types_init).fit(data)
    # Prepend the assigned label as column 0 of each row.
    newdata = np.c_[kmeans_model.labels_,np.array(data)]
    # One boolean mask per cluster: label column (col 0) equals i.
    slice_ = [np.squeeze((newdata==i)[:,0:1]) for i in range(types_num)]
    return [newdata[i] for i in slice_],kmeans_model.cluster_centers_,slice_,kmeans_model.inertia_,kmeans_model.n_iter_
# %%
def to_array_2(data):
    """Collapse an np.matrix slice to a plain 1-D ndarray (first row after squeeze)."""
    squeezed = np.squeeze(data)
    return np.array(squeezed)[0]
# %% [markdown]
# ## 第一株
# %%
result = kmeans_building_from_sklearn(data11, 5) # cluster the US summary points into 5 groups
x = [to_array_2(data11[i][:,0:1]) for i in result[2]]
y = [to_array_2(data11[i][:,-1:]) for i in result[2]]
for i,j,Kp in zip(x,y,result[1]):
    plt.scatter(i,j)
    plt.scatter(Kp[0],Kp[-1],c='black', marker='x', alpha=1.0)
plt.title('USA')
plt.show()
plt.close()
# %%
result = kmeans_building_from_sklearn(data22, 5) # cluster the Wuhan summary points into 5 groups
x = [to_array_2(data22[i][:,0:1]) for i in result[2]]
y = [to_array_2(data22[i][:,-1:]) for i in result[2]]
for i,j,Kp in zip(x,y,result[1]):
    plt.scatter(i,j)
    plt.scatter(Kp[0],Kp[-1],c='black', marker='x', alpha=1.0)
plt.title('WuHan')
plt.show()
plt.close()
# %%
import jieba
import math
from scipy import spatial
import numpy as np
class cosine_similarity_of_text:
    """Cosine similarity between two documents via jieba token counts.

    NOTE(review): the default model='fitnumpy' matches neither the 'numpy'
    nor the 'python' branch below, so `self.fit` is never set unless a model
    is passed explicitly — confirm the intended default.
    """
    def __init__(self,strings1=None,strings2=None,model='fitnumpy'):
        self.strings1 , self.strings2 = strings1 , strings2
        self.strings1_cut, self.strings2_cut = self.cut(strings1), self.cut(strings2)
        # Generators of vocabulary indices (never consumed — see frequency_of_word).
        self.strings1_cut_code ,self.strings2_cut_code = self.cut_code(self.strings1_cut) , self.cut_code(self.strings2_cut)
        self.frequency_of_word1 = self.frequency_of_word(self.strings1_cut,self.strings1_cut_code)
        self.frequency_of_word2 = self.frequency_of_word(self.strings2_cut,self.strings2_cut_code)
        if model == 'numpy':
            # NOTE(review): spatial.distance.cosine is a *distance* (1 - similarity).
            self.fit = self.full_npcosine(np.array(self.frequency_of_word1),np.array(self.frequency_of_word2))
        elif model == 'python':
            self.fit =self.full_pycosine(self.frequency_of_word1, self.frequency_of_word2).__next__()
    def cut(self,strings):
        '''
        Tokenise a string with jieba (full mode), dropping empty tokens.
        :param strings: input text
        :return: list of tokens
        '''
        return [i for i in jieba.cut(strings, cut_all=True) if i != '']
    def word_set(self):
        '''
        Vocabulary: union of both documents' tokens.
        :return: set of tokens
        '''
        return set(self.strings1_cut) | (set(self.strings2_cut))
    def word_dict(self):
        '''
        Token -> index dictionary (rebuilt on every call).
        :return: dict
        '''
        return {tuple(self.word_set())[i]: i for i in range(len(self.word_set()))}
    def cut_code(self,cut):
        '''
        Vocabulary index codes for one document's tokens.
        :param cut: token list
        :return: generator of indices
        '''
        return (self.word_dict()[word] for word in cut)
    def frequency_of_word(self,string_cut,string_cut_code):
        '''
        Term-frequency vector over the shared vocabulary.
        NOTE(review): the `string_cut_code` argument is immediately overwritten
        with a zero vector and never actually used.
        :param string_cut: token list
        :param string_cut_code: effectively ignored
        :return: list of counts, one per vocabulary entry
        '''
        dict_ = self.word_dict()
        string_cut_code = string_cut_code
        string_cut_code = [0] * len(dict_)
        for word in string_cut:
            string_cut_code[dict_[word]] += 1
        return (string_cut_code)
    def full_pycosine(self,vector1,vector2):
        '''
        Cosine similarity with plain Python loops (generator; caller advances once).
        NOTE(review): on a zero vector the division raises *before* the yield;
        the except clause then returns 0.0 from the generator, so __next__()
        raises StopIteration instead of producing a value — confirm this is
        the intended failure mode.
        :param vector1: count vector
        :param vector2: count vector
        :return: yields the similarity rounded to 2 decimals
        '''
        sum = 0
        sqrt1 = 0
        sqrt2 = 0
        for i in range(len(vector1)):
            sum += vector1[i] * vector2[i]
            sqrt1 += pow(vector1[i], 2)
            sqrt2 += pow(vector2[i], 2)
        try:
            result = yield round(float(sum) / (math.sqrt(sqrt1) * math.sqrt(sqrt2)), 2)
        except ZeroDivisionError:
            result = 0.0
        return result
    def full_npcosine(self,vector1,vector2):
        '''
        Cosine *distance* via scipy.spatial (equals 1 - cosine similarity).
        :param vector1: count vector
        :param vector2: count vector
        :return: float distance
        '''
        return spatial.distance.cosine(vector1,vector2)
    def __del__(self):
        pass
# %%
# %%
def hamming(x, y):
    """Normalized Hamming distance: fraction of positions where x and y differ."""
    n = len(x)
    mismatch_count = sum(x != y)
    return mismatch_count / n
# %% [markdown]
# ## 整体聚类
# %% [markdown]
# ## 直接聚类
# %% [markdown]
# ## 找到信息元素差的位置
# %%
def check(one_dim1, one_dim2, info_cur):
    """Locate the flat positions of the characters unique to one genome.

    :param one_dim1: flattened code-point sequence of genome 1
    :param one_dim2: flattened code-point sequence of genome 2
    :param info_cur: {letter: code point} of the alphabet symmetric difference
    :return: {'one_dim1' or 'one_dim2': [{letter: position, ...}, ...]},
             or the string 'nothing' when neither sequence contains any of them.

    Fixes a copy-paste bug: the second branch previously indexed
    `one_dim1[i]` while scanning `one_dim2`, and labelled its result
    'one_dim1'.
    """
    compare = lambda seq: [ch for ch in info_cur if info_cur[ch] in seq]
    if len(compare(one_dim1)) > 0:
        return {'one_dim1': [{chr(one_dim1[i]): i for i in range(len(one_dim1)) if one_dim1[i] == info_cur[j]} for j in info_cur]}
    elif len(compare(one_dim2)) > 0:
        return {'one_dim2': [{chr(one_dim2[i]): i for i in range(len(one_dim2)) if one_dim2[i] == info_cur[j]} for j in info_cur]}
    else:
        return 'nothing'
# %%
# Locate the position of each character unique to one genome (the extra 'M').
check(reduce_dim1,reduce_dim2,info_cur_one_dim)
# %% [markdown]
# ## Computed result: the first genome has one extra 'M', at flat index 29837
# %% [markdown]
# ## 单组元素信息表
# %%
def dict_of_info(data, data_to_number):
    """Build an index -> (letter, code point) table for one genome.

    Flattens each input exactly once; the original lambda flattened the
    number table twice (once for values, once for its length) and the
    letter table once, tripling the flattening work. Like the original zip,
    the table is truncated to the shorter of the two flattened sequences.
    """
    letters = reduce_the_dimension_of_the_data(data)
    numbers = reduce_the_dimension_of_the_data(data_to_number)
    return {index_of_dict: (letter, number)
            for index_of_dict, (letter, number) in enumerate(zip(letters, numbers))}
# %% [markdown]
# ## 分组元素字典
# %%
# Per-genome lookup tables: flat index -> (letter, code point).
dict1 ,dict2 = dict_of_info(data1,data_to_number1),dict_of_info(data2,data_to_number2)
dict1[1],dict2[1]
# %% [markdown]
# ## Positional difference
# %%
# Code-point difference at every shared flat index (0 where the genomes agree).
info_cur_of_number = [dict1[i][-1]-dict2[i][-1] for i in range(min([len(dict2),len(dict1)]))]
# %% [markdown]
# ## Union of the two genomes' lines
# %%
unindata = (set(data1)|set(data2))
# %% [markdown]
# ## Line dictionary
# %%
dict_of_set = {tuple(unindata)[i]:i for i in range(len(unindata))}
# %% [markdown]
# ## Observation: the Wuhan genome has fewer lines than the US genome
# %%
len(data1),len(data2),info1[0],info2[0]
# %% [markdown]
# ## 两组毒株不存在行向量的包含关系
# %%
# Neither genome's line set contains the other (no subset relation).
(set(data1)>set(data2)),(set(data1)<set(data2))
# %% [markdown]
#
# %%
# Lines shared by both genomes.
(set(data1)&set(data2))
# %% [markdown]
# ## Per-line difference is complete
# %%
# |symmetric difference| - |union| = negative of the shared-line count.
len(set(data1)^set(data2))-len((set(data1)|set(data2)))