使用 Materials Project 的 API 接口下载材料数据集（按 formula 检索）

最新推荐文章于 2025-04-18 14:22:57 发布

成贤往事

最新推荐文章于 2025-04-18 14:22:57 发布

阅读量1.2k

点赞数 5

分类专栏： ai4sci AI for Materials 文章标签：深度学习 python gpt-3 回归人工智能机器学习数据挖掘

本文链接：https://blog.csdn.net/qq_45751961/article/details/139513106

版权

ai4sci 同时被 2 个专栏收录

2 篇文章

订阅专栏

AI for Materials

2 篇文章

订阅专栏

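由于毕设需要收集一些热电材料的数据集，我手头有一份包含结构化学式（formula）的 csv 文件，但是其中的 ICSD 编号对不上，因此希望通过 formula + spacegroup + natoms + volume + band_gap 来确定想要的结构。
代码如下（注意：代码中用到的 MPRester 和 Poscar 需要自行导入，通常分别来自 mp_api.client 和 pymatgen.io.vasp）：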

import json
import os
from tqdm import tqdm
import pandas as pd
# Log in at https://next-gen.materialsproject.org/api to obtain the new-style
# 32-character API key (not the 16-character one). The key is issued when the
# account is registered and is shown under "My Dashboard" at the top-right of
# the Materials Project website.
# The key is read from the MP_API_KEY environment variable when it is set, so
# the secret does not have to be stored in the script; the placeholder below is
# only a fallback and must be replaced with a real key if the variable is unset.
API_KEY = os.environ.get("MP_API_KEY", "xxx32位，而不是16位")
# NOTE(review): MPRester is not imported in the visible code; presumably
# `from mp_api.client import MPRester` -- confirm and add the import.
mpr = MPRester(api_key=API_KEY)  # authenticated client object used for the searches below

def _write_poscar(structure, root_path, formula, sg, seen_formulas):
    """Write *structure* as a POSCAR file under *root_path* and return its file name.

    The first structure saved for a formula is named ``<formula>-POSCAR``;
    any later one for the same formula is named ``<formula>-<sg>-POSCAR``.
    *seen_formulas* is updated in place once the first file has been written.
    """
    first_for_formula = formula not in seen_formulas
    if first_for_formula:
        crys_name = formula + "-POSCAR"
    else:
        crys_name = formula + "-" + str(sg) + "-POSCAR"
    # NOTE(review): Poscar is not imported in the visible code; presumably
    # `from pymatgen.io.vasp import Poscar` -- confirm and add the import.
    Poscar(structure).write_file(os.path.join(root_path, crys_name))
    if first_for_formula:
        seen_formulas.append(formula)
    return crys_name


def download_te_data(root_path, csv_name, ltc_dataset, tc_column="K"):
    """Fetch matching structures from the Materials Project and save them as POSCARs.

    For every row of the input CSV whose ``compound`` is not already a key of
    the JSON file *ltc_dataset*, the module-level ``mpr`` client is queried by
    formula. A returned entry is accepted when

    * it lists ICSD ids and the row's ``icsd-<icsd>`` tag is among them, and
      the space-group number and number of sites equal the row's ``sg`` and
      ``natoms``; or
    * it exposes no ICSD ids, and the space-group number and number of sites
      match, the band gap is within 1 of ``Eg (eV)`` and the volume is within
      10 of ``volume``.

    The first accepted entry per row is written as a POSCAR file in
    *root_path* and recorded as ``poscar_name,tc`` in
    ``TEDesignLab_dataset.csv`` (created in the current working directory).

    Args:
        root_path: Directory containing *csv_name*; POSCAR files go here too.
        csv_name: Name of the input CSV. It must provide the columns
            ``compound``, ``icsd``, ``sg``, ``natoms``, ``Eg (eV)``,
            ``volume`` and *tc_column*.
        ltc_dataset: Path to a JSON object; rows whose formula is one of its
            keys are skipped.
        tc_column: Name of the CSV column holding the value written to the
            ``tc`` output column. The original code referenced an undefined
            name ``K`` at this point, so the default assumes the column is
            called ``"K"`` -- TODO confirm against the real CSV header.

    Raises:
        KeyError: If *tc_column* is not a column of the input CSV.
    """
    with open(ltc_dataset, "r", encoding="utf-8") as f:
        ltc_data = json.load(f)
    df = pd.read_csv(os.path.join(root_path, csv_name))
    # Fail before any network traffic rather than in the middle of the run.
    if tc_column not in df.columns:
        raise KeyError(
            f"column {tc_column!r} not found in {csv_name!r}; "
            "pass tc_column=<name of the thermal-conductivity column>"
        )

    formula_list = []  # formulas that already have a plain "<formula>-POSCAR" file
    error_list = []    # one entry per returned document that had no usable ICSD ids
    with open("TEDesignLab_dataset.csv", "w", encoding="utf-8") as f:
        f.write("poscar_name,tc\n")
        for _, tgt_crys in tqdm(df.iterrows(), total=len(df)):
            formula = tgt_crys["compound"]
            if formula in ltc_data:
                continue
            icsd_tag = "icsd-" + str(tgt_crys["icsd"])
            sg = tgt_crys["sg"]
            natoms = tgt_crys["natoms"]
            eg = tgt_crys["Eg (eV)"]
            vol = tgt_crys["volume"]
            tc = tgt_crys[tc_column]
            docs = mpr.summary.search(formula=formula)  # search by formula only
            for crys in docs:
                same_cell = sg == crys.symmetry.number and natoms == crys.nsites
                # Only the ICSD lookup is guarded; any other failure should
                # surface instead of being silently treated as "no ICSD id".
                try:
                    icsd_ids = crys.database_IDs["icsd"]
                except (KeyError, TypeError, AttributeError):
                    icsd_ids = None

                if icsd_ids is None:
                    # No ICSD ids available: fall back to space group, atom
                    # count, band gap and volume to identify the structure.
                    error_list.append(formula)
                    matched = (
                        same_cell
                        and abs(crys.band_gap - eg) <= 1
                        and abs(crys.volume - vol) <= 10
                    )
                else:
                    # Identify the structure by ICSD id, space group and atom count.
                    matched = icsd_tag in icsd_ids and same_cell

                if matched:
                    crys_name = _write_poscar(
                        crys.structure, root_path, formula, sg, formula_list
                    )
                    f.write(f"{crys_name},{tc}\n")
                    break
        print(error_list)

def convert_csv_to_json(root_path, csv_file1, csv_file2):
    """Merge two ``poscar_name``/``tc`` CSV files into ``total_dataset.json``.

    Both CSV files are read from *root_path*. Each row contributes one
    ``poscar_name -> tc`` entry; entries from *csv_file2* are applied after
    those from *csv_file1*, so they win when a name appears in both. The
    resulting mapping is written as JSON to ``total_dataset.json`` in the
    current working directory.
    """
    first_table = pd.read_csv(os.path.join(root_path, csv_file1))
    second_table = pd.read_csv(os.path.join(root_path, csv_file2))

    name_to_tc = {}
    # Only the first table is wrapped in a progress bar.
    for _, record in tqdm(first_table.iterrows()):
        name_to_tc[record["poscar_name"]] = record["tc"]
    name_to_tc.update(
        (record["poscar_name"], record["tc"])
        for _, record in second_table.iterrows()
    )

    # Serialize before opening the output so a failure leaves no truncated file.
    serialized = json.dumps(name_to_tc)
    with open("total_dataset.json", "w") as out_file:
        out_file.write(serialized)

    # for name in column_names:
    #     print(name)
    # for id, row in df.iterrows():
    #     el_name = row["Symbol"]

if __name__ =="__main__":
    # Directory holding the input CSV files; download_te_data also writes the
    # POSCAR files into it.
    root_path = "new_tc_dataset"
    # Input CSV for download_te_data (looked up inside root_path).
    csv_name = "example_dataset.csv"
    # JSON file of entries to skip; opened as given, not joined onto root_path.
    ltc_dataset = "total_dataset.json"
    # Both steps are commented out, so running the script as-is does nothing;
    # uncomment the step you want to run.
    #convert_csv_to_json(root_path, "datasets.csv", "TE-dataset.csv")
    #download_te_data(root_path, csv_name, ltc_dataset)

代码是临时写的，借助 GPT 稍微读一下应该就能看懂；这里就不补注释、也不优化代码逻辑了，能跑就行。