rcsb爬虫

# -*- coding:utf-8 -*-
import os
import time
import requests
import re
import pandas as pd
import numpy as np
# retval = os.getcwd()
# os.chdir(retval+"/temp")
# Sample PDB entry ID used to build the default `url` below.
filename='1R4L'
# filename = '1RL'
# Download URL of the sample entry on the RCSB file server.
url=f"http://files.rcsb.org/download/{filename}.pdb"
# Browser-like User-Agent header; save_file() sends it with every request.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36 Edg/87.0.664.47'}
# Timeout value, presumably seconds for HTTP requests -- TODO confirm at the point of use.
timeout=5





def get_correctfile(path):
    """Return the base names of the ``.pdb`` files located directly in *path*.

    Args:
        path: Directory to scan (not recursive).

    Returns:
        A list with the file names stripped of their ``.pdb`` extension,
        ordered by file name. Sub-directories are skipped, even when their
        name ends in ``.pdb``.

    Raises:
        OSError: If *path* cannot be listed.
    """
    names = []
    for entry in sorted(os.listdir(path)):
        # Join with the directory before testing: a bare `path + entry`
        # concatenation never points at the real entry, so directories
        # would slip through the check.
        if os.path.isdir(os.path.join(path, entry)):
            continue
        stem, ext = os.path.splitext(entry)
        # Require a real ".pdb" extension (case-sensitive, matching what
        # save_file() writes) instead of any name that merely ends in "pdb".
        if ext == ".pdb":
            names.append(stem)
    return names

def _flatten_cells(frame):
    """Return every non-missing cell of *frame*, row by row, as a flat list."""
    rows = np.array(frame).tolist()
    # pd.notna() drops the NaN placeholders pandas inserts for empty cells.
    return [cell for row in rows for cell in row if pd.notna(cell)]


def get_file(path):
    """Read the list of PDB entry IDs from a ``.txt``, ``.xlsx`` or ``.csv`` file.

    Args:
        path: Path of the ID list. The format is chosen from the file
            extension (case-insensitive).

    Returns:
        A list of IDs in file order.
        * ``.txt``: every run of four alphanumeric characters on each line.
        * ``.csv``: every non-empty cell, always as ``str``.
        * ``.xlsx``: every non-empty cell, with whatever type pandas infers.
        * any other extension: an empty list.
    """
    ext = os.path.splitext(path)[1].lower()

    if ext == ".txt":
        ids = []
        # `with` guarantees the handle is closed once the file is consumed.
        with open(path, "r", encoding='UTF-8') as handle:
            for line in handle:
                ids.extend(re.findall('[0-9A-Za-z]{4}', line))
        return ids

    if ext == ".xlsx":
        # NOTE(review): numeric-looking cells come back as numbers here;
        # confirm whether the spreadsheet stores IDs as text.
        return _flatten_cells(pd.read_excel(path, header=None))

    if ext == ".csv":
        # dtype=str keeps IDs such as "1E12" verbatim; with type inference
        # pandas would parse them as floats in scientific notation.
        return _flatten_cells(pd.read_csv(path, header=None, dtype=str))

    return []


def save_file(fileurl,filename):
    """Download *fileurl* and store it as ``<filename>.pdb`` in the current directory.

    On success the module-level counter ``total`` is incremented and a
    progress line is printed. On any failure (network error or a non-200
    HTTP status) a short message is printed and no file is written, so the
    caller can detect the missing file and retry.

    Args:
        fileurl: Full URL of the file to download.
        filename: Base name (without extension) of the file to create.

    Raises:
        NameError: If the module-level ``total`` counter has not been
            initialised before the first successful download.
    """
    global total
    try:
        # Bound the request so a stalled connection cannot hang the run.
        response = requests.get(fileurl, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        print(f"{filename}: {exc}")
        return

    # Only a 200 response carries the file; saving an error page would make
    # the entry look like a successful download.
    if response.status_code != 200:
        print(response.status_code)
        return

    # Write the raw bytes as received instead of decoding and re-encoding.
    with open(f"{filename}.pdb", "wb") as f:
        f.write(response.content)
    total += 1
    print(f"保存第{total}张图片")

if __name__ == '__main__':
    # Script entry point: read the ID list, download every entry into
    # <work dir>/temp, retry the missing ones, then write the IDs that still
    # failed to AAAfalselist.txt inside that directory.

    # save_file() increments this module-level counter on every saved file.
    total = 0
    max_retries = 5

    # Resolve both inputs to absolute paths *before* os.chdir(); otherwise a
    # relative path would no longer resolve after the directory change.
    path = os.path.abspath(input("输入文件选择的文件目录带文件类型:").strip())
    workpath = os.path.join(os.path.abspath(input("输入工作目录:").strip()), "temp")

    # exist_ok lets the script be re-run against the same working directory.
    os.makedirs(workpath, exist_ok=True)
    os.chdir(workpath)

    # Normalise to str and drop duplicates (order preserved) so each entry
    # is requested once and the counts below are consistent.
    pdb_ids = [*dict.fromkeys(str(item).strip() for item in get_file(path))]

    for pdb_id in pdb_ids:
        save_file(f"http://files.rcsb.org/download/{pdb_id}.pdb", pdb_id)

    for _ in range(max_retries):
        downloaded = set(get_correctfile(workpath))
        falselist = [pdb_id for pdb_id in pdb_ids if pdb_id not in downloaded]
        print(f"共计{len(pdb_ids)}个文件,已成功下载{len(pdb_ids) - len(falselist)}个")
        if not falselist:
            break  # nothing left to retry
        for pdb_id in falselist:
            save_file(f"http://files.rcsb.org/download/{pdb_id}.pdb", pdb_id)

    # Re-scan after the last retry round so the report and the failure list
    # reflect the final state on disk.
    downloaded = set(get_correctfile(workpath))
    falselist = [pdb_id for pdb_id in pdb_ids if pdb_id not in downloaded]
    print(f"请求结束共计{len(pdb_ids)}个文件,已成功下载{len(pdb_ids) - len(falselist)}个,失败文件已输出")

    with open('AAAfalselist.txt', 'w', encoding='utf-8') as report:
        report.writelines(f"{pdb_id}\n" for pdb_id in falselist)

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
04-11 4544
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值