Materials Project预测晶系

import pymatgen as mg
from pymatgen import MPRester
from pymatgen import Composition, Element
import itertools
import IPython
import random
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split


API_KEY = 'xxxxxxxxxx' ##Materialsproject自己号带的

def que_p(target):
    """Fetch VASP calculation data from the Materials Project for a formula.

    `target` is a formula string such as "Fe2O3".
    """
    rester = MPRester(API_KEY)
    return rester.get_data(target, data_type='vasp')

# Candidate elements used to build binary chemical systems.
elements = ["H", "Li", "Be", "C", "N", "O", "Na",
            "Mg", "Al", "Si", "P", "S", "K", "Ca", "Sc", "Ti",
            "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn", "Ga", "Ge", "As",
            "Se", "Br", "Rb", "Sr", "Y", "Zr", "Nb", "Mo", "Tc", "Ru",
            "Rh", "Pd", "Ag", "Cd", "In", "Sn", "Sb", "Te", "Cs",
            "Ba", "La", "Ta", "W", "Re", "Os"]

# All unordered element pairs: C(52, 2) = 1326 combinations.
# (NOTE: the name shadows the builtin `complex`; kept for compatibility.)
complex = list(itertools.combinations(elements, 2))
# Randomly sample 300 of those pairs to query.
complex_picked = random.sample(complex, 300)

# No explicit stoichiometry is needed: querying a chemical system such as
# "Fe-O" makes pymatgen return every compound of those elements
# (Fe2O3, Fe3O4, ...). Only the formula and the crystal system are used below.

#把元素组合整理成pymatgen 能接受的形式之后丢进去就行。我这里只用了结构式和晶系这两个数据。
# Join each element pair into the "El1-El2" chemical-system string that
# pymatgen's query API accepts.
system = [pair[0] + '-' + pair[1] for pair in complex_picked]


def que(x):
    """Query the Materials Project for every compound in chemical system `x`
    ("Fe-O" style); returns formula and space-group data per entry."""
    rester = MPRester(API_KEY)
    return rester.query(criteria=x,
                        properties=["unit_cell_formula", "pretty_formula", "spacegroup"])

# Query every chemical system and keep the non-empty results.
# Fix: use enumerate instead of system.index(i) — index() rescans the whole
# list on every hit (O(n) each time) and reports the wrong position if a
# system string ever appears twice.
data = []
valid_num = []
for idx, sys_name in enumerate(system):
    res = que(sys_name)
    if res:
        data.extend(res)
        valid_num.append(idx)
        print("len(res):", len(res), "  system.index(i):", idx, "  res:", res)

# Descriptor computation: simple composition-derived features.
length = np.arange(len(data))
formula = list(map(lambda x: data[x]['pretty_formula'], length))


def mat_descriptor_calculation(material):
    """Build a numeric descriptor for a formula string.

    For a binary compound the descriptor has 7 entries: the two atomic
    numbers, the two electronegativities, the two group numbers, and the
    ratio of the largest to the second-largest atomic fraction.

    Bug fix: the original stored the result of `fraction_list.sort(...)` —
    but list.sort() returns None, so the ratio branch never ran and
    `fraction` was always 1. sorted() is used instead.
    """
    mat = Composition(material)
    fraction_list = sorted((mat.get_atomic_fraction(el) for el in mat), reverse=True)
    if len(fraction_list) > 1:
        fraction = fraction_list[0] / fraction_list[1]
    else:
        # Single-element composition: no second fraction to compare against.
        fraction = 1
    atomic_number = [el.Z for el in mat]
    electronegativity = [el.X for el in mat]
    element_group = [el.group for el in mat]
    descriptor = atomic_number + electronegativity + element_group
    descriptor.append(fraction)
    return descriptor

# Assemble the descriptor matrix and the crystal-system targets.
mat_descriptor = list(map(lambda x: mat_descriptor_calculation(data[x]['pretty_formula']), length))
crystal_system = list(map(lambda x: data[x]['spacegroup']['crystal_system'], length))
# Deduplicate the crystal systems. sorted() makes the name -> class-index
# mapping reproducible across runs (plain set iteration order varies
# between interpreter sessions).
crystal = sorted(set(crystal_system))

# Seven crystal systems -> a multi-class problem. Encode each system name as
# its index in `crystal`. (Replaces a fragile nested loop that rebound its
# own loop variable mid-iteration.)
label = [crystal.index(name) for name in crystal_system]

# Split the data set and classify with Gaussian naive Bayes
# (scaling + model wrapped in a single pipeline).
X_train, X_test, y_train, y_test = train_test_split(
    mat_descriptor, label, test_size=0.1, random_state=33)

pipeline = Pipeline(steps=[
    ('autoscale', StandardScaler()),
    ('NB', GaussianNB()),
])
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

from sklearn.metrics import accuracy_score
y_pred_score = accuracy_score(y_test, y_pred)
print(y_pred_score)  # ~0.3 on a typical sample

Version 2 — more models, with data fetching separated from model training (模型增加+数据获取分离):

Script A: fetch the data and write it to CSV (remember to fill in your own API_KEY):

from pymatgen import MPRester
from pymatgen import Composition, Element
import itertools
import random
import numpy as np
import pandas as pd

API_KEY = ''  # your personal Materials Project API key


def que_p(target):
    """Fetch VASP calculation data for a single formula such as "Fe2O3"."""
    rester = MPRester(API_KEY)
    return rester.get_data(target, data_type='vasp')


# Candidate elements used to build binary chemical systems.
elements = ["H", "Li", "Be", "C", "N", "O", "Na",
            "Mg", "Al", "Si", "P", "S", "K", "Ca", "Sc", "Ti",
            "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn", "Ga", "Ge", "As",
            "Se", "Br", "Rb", "Sr", "Y", "Zr", "Nb", "Mo", "Tc", "Ru",
            "Rh", "Pd", "Ag", "Cd", "In", "Sn", "Sb", "Te", "Cs",
            "Ba", "La", "Ta", "W", "Re", "Os"]

# All unordered element pairs: C(52, 2) = 1326 combinations.
complex = list(itertools.combinations(elements, 2))
# Ask the user how many pairs to sample and query.
count = input("Please input complex-sample count:")
complex_picked = random.sample(complex, int(count))

# 这里不需要给到具体的式量,
# pymatgen 会直接爬由这两个元素组成的所有分子式量组合的化合物
# 比如你输入FeO会自动把Fe2O3Fe3O4之类的一系列东西全爬下来。

# Join each pair into the "El1-El2" chemical-system string pymatgen accepts;
# only the formula and the space-group data are requested from the API.
system = [pair[0] + '-' + pair[1] for pair in complex_picked]
cnt = 0  # progress counter for the query loop (the original's 0*1 is just 0)


def que(x):
    """Query the Materials Project for every compound in chemical system `x`."""
    rester = MPRester(API_KEY)
    return rester.query(criteria=x, properties=["unit_cell_formula", "pretty_formula", "spacegroup"])


# Query every chemical system and keep the non-empty results.
# Fix: enumerate replaces system.index(i), which rescanned the list on every
# hit and reports the wrong position for duplicate entries; it also drives
# the progress counter directly.
data = []
valid_num = []
for cnt, sys_name in enumerate(system, start=1):
    res = que(sys_name)
    if res:
        data.extend(res)
        valid_num.append(cnt - 1)
        print('num:', cnt, "  len(res):", len(res), "  system.index(i):", cnt - 1, "  res:", res)

# Descriptor computation: simple composition-derived features.
length = np.arange(len(data))
formula = list(map(lambda x: data[x]['pretty_formula'], length))


def mat_descriptor_calculation(material):
    """Build a numeric descriptor for a formula string.

    For a binary compound the descriptor has 7 entries: the two atomic
    numbers, the two electronegativities, the two group numbers, and the
    ratio of the largest to the second-largest atomic fraction.

    Bug fix: the original stored the result of `fraction_list.sort(...)` —
    but list.sort() returns None, so the `sort_frac is not None` branch never
    ran and `fraction` was always 1. sorted() is used instead.
    """
    # Parse the formula into its elements.
    mat = Composition(material)
    # Atomic fraction of each element, largest first.
    fraction_list = sorted((mat.get_atomic_fraction(el) for el in mat), reverse=True)
    if len(fraction_list) > 1:
        fraction = fraction_list[0] / fraction_list[1]
    else:
        # Single-element composition: no second fraction to compare against.
        fraction = 1
    atomic_number = [el.Z for el in mat]        # atomic numbers
    electronegativity = [el.X for el in mat]    # electronegativities
    element_group = [el.group for el in mat]    # periodic-table groups
    # Concatenate the per-element features, then append the fraction ratio.
    descriptor = atomic_number + electronegativity + element_group
    descriptor.append(fraction)
    return descriptor

print(data)
# Assemble the descriptor matrix and the crystal-system targets.
mat_descriptor = list(map(lambda x: mat_descriptor_calculation(data[x]['pretty_formula']), length))
crystal_system = list(map(lambda x: data[x]['spacegroup']['crystal_system'], length))
# Deduplicate; sorted() keeps the name -> class-index mapping stable across
# runs (plain set iteration order varies between interpreter sessions).
crystal = sorted(set(crystal_system))
print(crystal_system)
print(crystal)


# Seven crystal systems -> multi-class labels: the index of each name in
# `crystal`. (Replaces a fragile nested loop that rebound its loop variable.)
label = [crystal.index(name) for name in crystal_system]

print(mat_descriptor)
print("====" * 20)

print(label)
print("====" * 20)

# Persist descriptors + labels to CSV, named after the sample count.
# NOTE(review): the column names keep the original misspellings
# ("elecgativity1"/"elegativity2") so previously written CSVs stay compatible.
df = pd.DataFrame(mat_descriptor)
df.columns = ["num1", "num2", "elecgativity1", "elegativity2", "ele_group1", "ele_group2", "fraction"]
df["label"] = label
csvname = "data" + '-' + count + '.csv'
df.to_csv(csvname, columns=["num1", "num2", "elecgativity1", "elegativity2", "ele_group1", "ele_group2", "fraction", 'label'], index=False)

Script B: process the data — update the filename passed to pd.read_csv to match the CSV written by Script A:

from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import numpy as np

# Load the descriptor CSV written by the fetch script
# (change the filename to match your own run).
data = pd.read_csv("data-1326.csv")
label = data[["label"]]
mat_descriptor = data.drop(columns="label")
# Model names and accuracy scores, collected for the final bar chart.
Barname = []
Barval = []

# One shared train/test split reused by every model below.
X_train, X_test, y_train, y_test = train_test_split(
    mat_descriptor, label, test_size=0.5, random_state=44)
# === Gaussian naive Bayes ===
# Scaling + model wrapped in a pipeline; same pattern for every model below.
pipeline = Pipeline(steps=[
    ('autoscale', StandardScaler()),
    ('NB', GaussianNB()),
])
pipeline.fit(X_train, y_train.values.ravel())
y_bayes_pred = pipeline.predict(X_test)
y_bayes_pred_score = accuracy_score(y_test, y_bayes_pred)
print("y_bayes_pred_score:", y_bayes_pred_score)
Barname.append("bayes")
Barval.append(y_bayes_pred_score)


# === K nearest neighbours ===
from sklearn.neighbors import KNeighborsClassifier
pipeline2 = Pipeline(steps=[
    ("autoscale", StandardScaler()),
    ("KNN", KNeighborsClassifier()),
])
pipeline2.fit(X_train, y_train.values.ravel())
y_KNN_pred = pipeline2.predict(X_test)
y_KNN_pred_score = accuracy_score(y_test, y_KNN_pred)
print("y_KNN_pred_score:", y_KNN_pred_score)
Barname.append("KNN")
Barval.append(y_KNN_pred_score)


# === Support vector classifier ===
pipeline3 = Pipeline(steps=[
    ("autoscale", StandardScaler()),
    ("SVC", SVC(probability=True)),
])
pipeline3.fit(X_train, y_train.values.ravel())
y_SVC_pred = pipeline3.predict(X_test)
y_SVC_pred_score = accuracy_score(y_test, y_SVC_pred)
print("y_SVC_pred_score:", y_SVC_pred_score)
Barname.append("SVC")
Barval.append(y_SVC_pred_score)

# === Bagging ensemble over KNN base estimators ===
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
bagging1 = BaggingClassifier(KNeighborsClassifier(), n_estimators=10,
                             max_samples=0.5, max_features=0.5)
pipeline4 = Pipeline(steps=[
    ("autoscale", StandardScaler()),
    ("bag", bagging1),
])
pipeline4.fit(X_train, y_train.values.ravel())
y_bag1_pred = pipeline4.predict(X_test)
y_bag1_pred_score = accuracy_score(y_test, y_bag1_pred)
print("y_bagKNN_pred_score:", y_bag1_pred_score)
Barname.append("Bagging+KNN")
Barval.append(y_bag1_pred_score)


# === Bagging ensemble over decision trees ===
from sklearn.tree import DecisionTreeClassifier
bagging2 = BaggingClassifier(DecisionTreeClassifier(),
                             n_estimators=100, max_features=0.5)
pipeline9 = Pipeline(steps=[
    ("autoscale", StandardScaler()),
    ("bag", bagging2),
])
pipeline9.fit(X_train, y_train.values.ravel())
y_bag2_pred = pipeline9.predict(X_test)
y_bag2_pred_score = accuracy_score(y_test, y_bag2_pred)
print("y_bagTree_pred_score:", y_bag2_pred_score)
Barname.append("Bagging+Tree")
Barval.append(y_bag2_pred_score)


# === PCA + logistic regression ===
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
pipeline5 = Pipeline(steps=[
    ('sc', StandardScaler()),
    ('pca', PCA(n_components=2)),
    # Fixed random seed so the result is reproducible.
    ('clf', LogisticRegression(random_state=666)),
])
pipeline5.fit(X_train, y_train.values.ravel())
y_LR_pred = pipeline5.predict(X_test)
y_LR_pred_score = accuracy_score(y_test, y_LR_pred)
print("y_LR_pred_score:", y_LR_pred_score)
Barname.append("LogisticRegression")
Barval.append(y_LR_pred_score)


# === PCA + random forest ===
pipeline6 = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('rf', RandomForestClassifier()),
])
pipeline6.fit(X_train, y_train.values.ravel())
y_rf_pred = pipeline6.predict(X_test)
y_rf_pred_score = accuracy_score(y_test, y_rf_pred)
print("y_rf_pred_score:", y_rf_pred_score)
Barname.append("RandomForest")
Barval.append(y_rf_pred_score)


# === Grid search over KNN hyperparameters ===
pipeline7 = Pipeline([
    ('scaler', StandardScaler()),
    ("KNN", KNeighborsClassifier()),
])

# Two sub-grids: uniform weighting (k only) and distance weighting
# (k plus the Minkowski exponent p).
param_grid = [
    {
        'KNN__weights': ['uniform'],
        'KNN__n_neighbors': list(range(1, 11)),
    },
    {
        'KNN__weights': ['distance'],
        'KNN__n_neighbors': list(range(1, 11)),
        'KNN__p': list(range(1, 6)),
    },
]

GKNN = GridSearchCV(pipeline7, param_grid, cv=4, verbose=1)
GKNN.fit(X_train, y_train.values.ravel())
GKNN_pred = GKNN.predict(X_test)
GKNN_pred_score = accuracy_score(y_test, GKNN_pred)
print("GridSearchCV-KNN_best score:", GKNN.best_score_)
print("normal KNN:", GKNN_pred_score)
Barname.append("GKNN.best")
Barval.append(GKNN.best_score_)
Barname.append("GKNN.pred")
Barval.append(GKNN_pred_score)


# === Grid search: bagging over decision trees ===
bagging2 = BaggingClassifier(DecisionTreeClassifier(),
                             n_estimators=100, max_features=0.5)
pipeline10 = Pipeline([
    ("autoscale", StandardScaler()),
    ("bag2", bagging2),
])
# Tune the base tree's depth and the per-estimator sample fraction.
param_grid = {
    'bag2__base_estimator__max_depth': [1, 2, 3, 4, 5],
    'bag2__max_samples': [0.05, 0.1, 0.2, 0.5],
}
Gbag2 = GridSearchCV(pipeline10, param_grid,
                     refit=True,
                     return_train_score=True,
                     cv=5)
Gbag2.fit(X_train, y_train.values.ravel())
Gbag2_pred = Gbag2.predict(X_test)
y_Gbag2_pred_score = accuracy_score(y_test, Gbag2_pred)
print("GridSearchCV-Gbag2_best score:", Gbag2.best_score_)
print("normal Gbag2", y_Gbag2_pred_score)
Barname.append("GbagTree.best")
Barval.append(Gbag2.best_score_)
Barname.append("GbagTree.pred")
Barval.append(y_Gbag2_pred_score)


# === Grid search: bagging over KNN ===
bagging = BaggingClassifier(KNeighborsClassifier())
pipeline11 = Pipeline([
    ("autoscale", StandardScaler()),
    ("bag", bagging),
])
# Same KNN sub-grids as before, addressed through the bagging wrapper.
param_grid = [
    {
        'bag__base_estimator__weights': ['uniform'],
        'bag__base_estimator__n_neighbors': list(range(1, 11)),
    },
    {
        'bag__base_estimator__weights': ['distance'],
        'bag__base_estimator__n_neighbors': list(range(1, 11)),
        'bag__base_estimator__p': list(range(1, 6)),
    },
]
Gbag = GridSearchCV(pipeline11, param_grid, cv=5)
Gbag.fit(X_train, y_train.values.ravel())
Gbag_pred = Gbag.predict(X_test)
y_Gbag_pred_score = accuracy_score(y_test, Gbag_pred)
print("GridSearchCV-Gbag_best score:", Gbag.best_score_)
print("normal Gbag", y_Gbag_pred_score)
Barname.append("GbagKNN.best")
Barval.append(Gbag.best_score_)
Barname.append("GbagKNN.pred")
Barval.append(y_Gbag_pred_score)




# Bar chart of every model's accuracy.
# Fix: the original called plt.figure(figsize=...) AFTER drawing and just
# before plt.show(), which opened a second, blank figure; the figure must be
# sized before seaborn draws into it.
plt.figure(figsize=(5, 30))
fig = sns.barplot(x=Barname, y=Barval)
for p in fig.patches:
    height = p.get_height()
    print(height)
    # Annotate each bar with its value, centered just above the bar top.
    fig.text(x=p.get_x() + (p.get_width() / 2), y=height + 0.01,
             s='{:.4f}'.format(height), ha='center')
fig.set_title("data_300/test_size=0.5")
plt.xticks(rotation=30, fontsize=8)
plt.yticks([0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4], rotation=0, fontsize="10", va="center")
plt.tight_layout(w_pad=3.0, h_pad=3.0)
plt.show()

  • 0
    点赞
  • 39
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

萌新待开发

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值