中药 爬虫

# -*- coding:utf-8 -*-
# coding = GBK
import json

from bs4 import BeautifulSoup as bs
import urllib.request
import re
from selenium import webdriver
import requests
import os


# html_doc = "https://old.tcmsp-e.com/browse.php?qc=herbs"
# req = urllib.request.Request(html_doc)
# webpage = urllib.request.urlopen(req)
# html = webpage.read()
# # reg=r'data:[\s\S]*]'
# # resule=re.compile(reg)
# # img=re.findall(resule,html.decode('utf-8'))
# # print(img)
# reg=r'"herb_pinyin":"Aidicha","herb_en_name":"Ardisiae Japonicae Herba"'
# reg=r'"herb_pinyin":(.*?),"child'
# resule=re.compile(reg)
# img=re.findall(reg,html.decode('utf-8'))
# cnname=[]
# enname=[]
# for i in img:
#     cnname.append(re.findall(r'"(.*?)",',i)[0])
#     enname.append(re.findall(r'"herb_en_name":"(.*)"', i)[0])
# print(cnname)
# print(enname)
# print(enname[0])
# Load the herb names to process. Each useful line of the input file is
# expected to contain a '-' separator: the text before the last '-' is stored
# in cnname and the text after the first '-' is stored in enname.
enname=[]
cnname=[]
# The context manager closes the file even if parsing fails midway.
with open(r'E:\zzztest\zzzzDEMO\che\temp\work\finalquestion.txt', "r") as names_file:
    for line in names_file:
        # Lines without a separator (e.g. a trailing blank line) would leave
        # both regex results empty and crash on [0]; skip them instead.
        if '-' not in line:
            continue
        en=re.findall(r'-(.*)',line)
        cn=re.findall(r'(.*)-',line)
        enname.append(en[0])
        cnname.append(cn[0])
print(enname)
print(cnname)
# ss is the list of English names used to build the search queries below.
ss=enname
# Index of the herb currently being processed; advanced at the end of each round.
count=0
for i in cnname:
    print('当前轮次'+str(count))

    # Build the search query from the English herb name: the words are joined
    # with an URL-encoded space. Joining by position (instead of comparing
    # each word with the first/last word by value) keeps names that repeat a
    # word intact.
    print(ss[count])
    name=ss[count].split(' ')
    print(name)
    newname='%20'.join(name)
    print(newname)
    # NOTE(review): the token is hard-coded; presumably it is tied to a site
    # session and may expire - confirm before a long run.
    url=f"https://old.tcmsp-e.com/tcmspsearch.php?qr={newname}&qsr=herb_en_name&token=37e7ba346324682a73bb9b3b160faa6b"

    # No browser is started here: the code below opens its own Chrome
    # session for this URL, and a second instance created at this point would
    # never be closed.

    # Timeout in seconds for each mol2 download request (read by save_file).
    timeout=5
    def get_correctfile(path):
        """Return the names of the ``.mol2`` files found directly in *path*.

        The names are returned without the ``.mol2`` extension, sorted by
        file name. Sub-directories are ignored, even when their name ends in
        ``.mol2``.

        Raises:
            OSError: if *path* cannot be listed.
        """
        stems = []
        for entry in sorted(os.listdir(path)):
            stem, ext = os.path.splitext(entry)
            # Join with the proper separator before testing for a directory;
            # plain concatenation of path and entry never names a real entry.
            if ext == '.mol2' and not os.path.isdir(os.path.join(path, entry)):
                stems.append(stem)
        return stems
    def save_file(fileurl,filename):
        """Download *fileurl* and store it as ``<filename>.mol2`` in the current directory.

        The file is written only when the server answers with HTTP 200; any
        other status code is printed and nothing is saved, so error pages are
        never stored as molecule files. Connection failures and timeouts are
        reported and swallowed so that the caller can retry later.

        Uses the module-level ``timeout`` (seconds) and increments the
        module-level ``total`` counter after every saved file.
        """
        global total
        try:
            # NOTE(review): verify=False disables TLS certificate checking.
            # Kept from the original; confirm whether the site really needs it.
            response = requests.get(fileurl,timeout=timeout,verify=False)
        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
            # Read timeouts are not ConnectionError subclasses, so both
            # exception types have to be listed here.
            print('超时')
            return

        if response.status_code != 200:
            print(response.status_code)
            return

        with open(filename+'.mol2', "wb") as f:
            f.write(response.text.encode("utf-8"))
        total += 1
        print(f"保存第{total}个文件")

    # ---- Output folder -----------------------------------------------------
    # save_file() writes to relative paths, so the herb folder has to be the
    # working directory before any download starts.
    workpath=r'E:\zzztest\zzzzDEMO\che\temp\work\final'
    folder_name=cnname[count]+r'-'+ss[count]
    nowpath=workpath+r'/'+folder_name
    print(nowpath)
    os.makedirs(nowpath, exist_ok=True)
    os.chdir(nowpath)

    # ---- Collect the molecule ids from the paged result grid ----------------
    mol_ids=[]
    expected_total='0'  # item total reported by the pager, kept as a string
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        flag=1
        while flag:
            # NOTE(review): the find_elements_by_* helpers were removed in
            # Selenium 4.3; they are kept because the installed Selenium
            # version is not visible from this file.
            pager_infos = driver.find_elements_by_class_name('k-pager-info')
            for row in driver.find_elements_by_tag_name('tr'):
                cells = row.find_elements_by_tag_name('td')
                for col, cell in enumerate(cells):
                    # Only every 13th cell (starting with the first) is used.
                    if col % 13:
                        continue
                    # Keep the first line of the cell text only.
                    lines = cell.text.splitlines()
                    if lines and lines[0]:
                        mol_ids.append(lines[0])
            print(mol_ids)
            for info in pager_infos:
                text = info.text
                if text:
                    newtext=re.findall(r"\d+\.?\d*",text)
                    print(newtext)
                    if len(newtext) < 3:
                        # The pager shows no "from - to of total" numbers:
                        # there is nothing to page through.
                        flag=0
                    elif newtext[1]==newtext[2]:
                        # Last visible item equals the total: final page.
                        expected_total=newtext[2]
                        flag=0
            for link in driver.find_elements_by_class_name('k-link'):
                if link.text == 'Go to the next page':
                    link.click()
    finally:
        # The browser is only needed for scraping; quit() also closes the
        # window, and doing it here prevents leaked Chrome processes on error.
        driver.quit()
    print(mol_ids)

    # ---- Download every molecule, then retry the missing ones twice ---------
    total=0
    filename=''  # defined even when no id was found (used for the report name)
    for filename in mol_ids:
        down=f"https://old.tcmsp-e.com/tcmspmol/{filename}.mol2"
        save_file(down,filename)

    for _ in range(2):
        correctlist = get_correctfile(nowpath)
        downloaded = set(correctlist)
        falselist = [name for name in mol_ids if name not in downloaded]
        print(falselist)
        print(f"共计{len(mol_ids)}个文件,已成功下载{len(correctlist)}个")
        for filename in falselist:
            url = f"https://old.tcmsp-e.com/tcmspmol/{filename}.mol2"
            save_file(url, filename)
    # Re-scan after the last retry so the reported count includes its downloads.
    correctlist = get_correctfile(nowpath)
    print(f"请求结束共计{len(mol_ids)}个文件,已成功下载{len(correctlist)}个,失败文件已输出")

    # ---- Write the per-herb report -----------------------------------------
    # First line: summary; following lines: one molecule id per line.
    with open(f"'total'+{filename}.txt", 'w') as file:
        if mol_ids:
            succeeded = str(len(correctlist))
            file.write(folder_name+':共计'+expected_total+'个文件成功'+succeeded+'个'+'\n')
            print(folder_name+'+共计'+expected_total+'个文件成功'+succeeded+'个')
            if expected_total != succeeded:
                # Record incomplete herbs so they can be processed again.
                with open(r"E:\zzztest\zzzzDEMO\che\temp\work\question.txt", "a+") as f:
                    f.write(f'第{count}轮'+folder_name+':共计'+expected_total+'个文件成功'+succeeded+'个'+'\n')
        file.writelines(name + '\n' for name in mol_ids)

    print('count'+cnname[count]+'over')
    count=count+1









# html = driver.page_source
# # response = bs(requests.get(url).content, 'html.parser')
# html = driver.execute_script("return document.documentElement.outerHTML")
# print(html)

# jsonResponse = json.loads(html.decode('utf-8'))

# soup = BeautifulSoup(html, 'html.parser')
# # data=(str(soup)).data
#
# s=soup.findAll(r"/ul")
# # times = soup.select('data')
# print(s)
# f = open(r'soup.txt','w',encoding='utf-8')  #文件路径、操作模式、编码  # r''
# f.write(str(soup))
# f.close()
# for line in open(r'E:\zzztest\zzzzDEMO\che\temp\work\resultsoup.txt',"r",encoding='utf-8'):
#     print(line)
# with open(r'E:\zzztest\zzzzDEMO\che\temp\work\resultsoup.txt', "r",encoding='utf-8') as f:  # 打开文件
#     data = f.read()  # 读取文件
#     print(data)
# f = open(r'E:\zzztest\zzzzDEMO\che\temp\work\resultsoup.txt',encoding='utf-8')
# print(f.read())
# regex = 'data:[.*?]'
# for line in f:
#     s=re.findall(regex,f)
#     print('ok')
# # print(type(f))
#
# f.close()

校验

# -*- coding:utf-8 -*-
# coding = GBK
import json

from bs4 import BeautifulSoup as bs
import urllib.request
import re
from selenium import webdriver
import requests
import os


# Download the herb browse page and extract, for every herb entry, the value
# of "herb_pinyin" (stored in cnname) and of "herb_en_name" (stored in enname).
html_doc = "https://old.tcmsp-e.com/browse.php?qc=herbs"
req = urllib.request.Request(html_doc)
# The context manager closes the HTTP response once the body has been read.
with urllib.request.urlopen(req) as webpage:
    html = webpage.read()
# Each match is the text between '"herb_pinyin":' and ',"child'.
reg=r'"herb_pinyin":(.*?),"child'
resule=re.compile(reg)
img=resule.findall(html.decode('utf-8'))
cnname=[]
enname=[]
for entry in img:
    pinyin = re.findall(r'"(.*?)",',entry)
    english = re.findall(r'"herb_en_name":"(.*)"', entry)
    # Skip entries missing either field so both lists stay index-aligned.
    if not pinyin or not english:
        continue
    cnname.append(pinyin[0])
    enname.append(english[0])
print(cnname)
print(enname)
if enname:
    print(enname[0])
# ss is the list of English names used to build the search queries below.
ss=enname
# Index of the herb currently being processed; advanced at the end of each round.
count=0
for i in cnname:
    print('当前轮次'+str(count))

    # The former debugging trap "while(count==1): print(1)" never terminated
    # and stopped the run at the second herb; it has been removed.

    # Build the search query from the English herb name: the words are joined
    # with an URL-encoded space. Joining by position (instead of comparing
    # each word with the first/last word by value) keeps names that repeat a
    # word intact, and also covers one-character names.
    name=ss[count].split(' ')
    print(name)
    newname='%20'.join(name)
    print(newname)

    # NOTE(review): the token is hard-coded; presumably it is tied to a site
    # session and may expire - confirm before a long run.
    url=f"https://old.tcmsp-e.com/tcmspsearch.php?qr={newname}&qsr=herb_en_name&token=37e7ba346324682a73bb9b3b160faa6b"

    # No browser is started here: the code below opens its own Chrome
    # session for this URL, and a second instance created at this point would
    # never be closed.

    # Timeout in seconds for each mol2 download request (read by save_file).
    timeout=5
    def get_correctfile(path):
        """Return the names of the ``.mol2`` files found directly in *path*.

        The names are returned without the ``.mol2`` extension, sorted by
        file name. Sub-directories are ignored, even when their name ends in
        ``.mol2``.

        Raises:
            OSError: if *path* cannot be listed.
        """
        stems = []
        for entry in sorted(os.listdir(path)):
            stem, ext = os.path.splitext(entry)
            # Join with the proper separator before testing for a directory;
            # plain concatenation of path and entry never names a real entry.
            if ext == '.mol2' and not os.path.isdir(os.path.join(path, entry)):
                stems.append(stem)
        return stems
    def save_file(fileurl,filename):
        """Download *fileurl* and store it as ``<filename>.mol2`` in the current directory.

        The file is written only when the server answers with HTTP 200; any
        other status code is printed and nothing is saved, so error pages are
        never stored as molecule files. Connection failures and timeouts are
        reported and swallowed so that the caller can retry later.

        Uses the module-level ``timeout`` (seconds) and increments the
        module-level ``total`` counter after every saved file.
        """
        global total
        try:
            # NOTE(review): verify=False disables TLS certificate checking.
            # Kept from the original; confirm whether the site really needs it.
            response = requests.get(fileurl,timeout=timeout,verify=False)
        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
            # Read timeouts are not ConnectionError subclasses, so both
            # exception types have to be listed here.
            print('超时')
            return

        if response.status_code != 200:
            print(response.status_code)
            return

        with open(filename+'.mol2', "wb") as f:
            f.write(response.text.encode("utf-8"))
        total += 1
        print(f"保存第{total}个文件")

    # ---- Herb folder ---------------------------------------------------------
    workpath=r'E:\zzztest\zzzzDEMO\che\temp\work\final'
    folder_name=cnname[count]+r'-'+ss[count]
    nowpath=workpath+r'/'+folder_name
    print(nowpath)
    os.makedirs(nowpath, exist_ok=True)
    os.chdir(nowpath)

    # ---- Read the item total shown by the result pager -----------------------
    newtext=[]  # reset every round so a previous herb's numbers are never reused
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        # NOTE(review): the find_elements_by_* helpers were removed in
        # Selenium 4.3; kept because the installed Selenium version is not
        # visible from this file.
        for info in driver.find_elements_by_class_name('k-pager-info'):
            text = info.text
            if text:
                newtext=re.findall(r"\d+\.?\d*",text)
                print(newtext)
    finally:
        # quit() also closes the window; running it in finally prevents leaked
        # Chrome processes when the page lookup fails.
        driver.quit()

    # Nothing is downloaded in this validation pass; the counter is still
    # reset because save_file() expects it to exist.
    total=0
    # The third number of the pager text is the item total; fall back to '0'
    # when the pager shows no such number.
    expected_total = newtext[2] if len(newtext) >= 3 else '0'

    # ---- Compare with the files already on disk and write the summary --------
    correctlist = get_correctfile(nowpath)
    succeeded = str(len(correctlist))
    with open("'total'.txt", 'w') as file:
        file.write(folder_name+':共计'+expected_total+'个文件成功'+succeeded+'个'+'\n')
    print(folder_name+'+共计'+expected_total+'个文件成功'+succeeded+'个')

    print('count'+cnname[count]+'over')
    count=count+1









# html = driver.page_source
# # response = bs(requests.get(url).content, 'html.parser')
# html = driver.execute_script("return document.documentElement.outerHTML")
# print(html)

# jsonResponse = json.loads(html.decode('utf-8'))

# soup = BeautifulSoup(html, 'html.parser')
# # data=(str(soup)).data
#
# s=soup.findAll(r"/ul")
# # times = soup.select('data')
# print(s)
# f = open(r'soup.txt','w',encoding='utf-8')  #文件路径、操作模式、编码  # r''
# f.write(str(soup))
# f.close()
# for line in open(r'E:\zzztest\zzzzDEMO\che\temp\work\resultsoup.txt',"r",encoding='utf-8'):
#     print(line)
# with open(r'E:\zzztest\zzzzDEMO\che\temp\work\resultsoup.txt', "r",encoding='utf-8') as f:  # 打开文件
#     data = f.read()  # 读取文件
#     print(data)
# f = open(r'E:\zzztest\zzzzDEMO\che\temp\work\resultsoup.txt',encoding='utf-8')
# print(f.read())
# regex = 'data:[.*?]'
# for line in f:
#     s=re.findall(regex,f)
#     print('ok')
# # print(type(f))
#
# f.close()

test

# -*- coding:utf-8 -*-
import os
import re
# Root folder that is scanned below (hard-coded Windows path); presumably it
# holds one sub-folder per herb - verify against the crawler that fills it.
workpath = r'E:\zzztest\zzzzDEMO\che\temp\work\final'
# Debug output: number of characters in the root path.
print(len(workpath))
# Make the root folder the current working directory.
os.chdir(workpath)

def get_file(path):
    """Return the full paths of all entries (files and folders) in *path*.

    The entries are sorted by name and joined to *path* with the platform's
    path separator.

    Raises:
        OSError: if *path* cannot be listed.
    """
    return [os.path.join(path, entry) for entry in sorted(os.listdir(path))]

def get(path):
    """Return the full paths of the ``.mol2`` files located directly in *path*.

    The result is sorted by file name. Sub-directories are skipped, even
    when their name ends in ``.mol2``.

    Raises:
        OSError: if *path* cannot be listed.
    """
    mol2_paths = []
    for entry in sorted(os.listdir(path)):
        full_path = os.path.join(path, entry)
        # The directory test needs the joined path; plain concatenation of
        # path and entry never names a real entry.
        if os.path.splitext(entry)[1] == '.mol2' and not os.path.isdir(full_path):
            mol2_paths.append(full_path)
    return mol2_paths
def get_list(path):
    """Return the full paths of the ``.txt`` files located directly in *path*.

    The result is sorted by file name. Sub-directories are skipped, even
    when their name ends in ``.txt``.

    Raises:
        OSError: if *path* cannot be listed.
    """
    txtlist = []
    for entry in sorted(os.listdir(path)):
        full_path = os.path.join(path, entry)
        # The directory test needs the joined path; plain concatenation of
        # path and entry never names a real entry.
        if os.path.splitext(entry)[1] == '.txt' and not os.path.isdir(full_path):
            txtlist.append(full_path)
    return txtlist

# For every herb folder, read its first .txt report. The first line carries
# two numbers (expected and downloaded file counts); when they differ, the
# remaining lines are treated as molecule ids and compared with the .mol2
# files that are actually present in the folder.
folders=get_file(workpath)
errorlist=[]
for folder in folders:
    # get_file() returns every entry; only folders can contain reports.
    if not os.path.isdir(folder):
        continue
    txtlist=get_list(folder)
    if not txtlist:
        print(folder+': no .txt report found, skipped')
        continue

    errlist=[]
    with open(txtlist[0], "r") as report:
        header = report.readline()
        result = re.findall(r"\d+\.?\d*",header)
        # A header with fewer than two numbers cannot be compared; treat it
        # as "no mismatch" rather than crashing.
        if len(result) >= 2 and result[0]!=result[1]:
            print(result[0]+'!='+result[1])
            # The first 9 characters of each following line are the id.
            errlist = [line[0:9] for line in report]
    print(errlist)

    newlist=[]
    if errlist:
        # Characters -14..-5 of a mol2 path are the 9-character id that
        # precedes the ".mol2" extension.
        newlist = [mol2_path[-14:-5] for mol2_path in get(folder)]
    print(newlist)

    if errlist:
        present = set(newlist)
        cnt = sum(1 for mol_id in errlist if mol_id in present)
        print(len(errlist))
        print(len(newlist))
        print(cnt)
# f = open(r"E:\zzztest\zzzzDEMO\che\temp\work\finalquestion.txt", "a+")
# for i in errorlist:
#     f.write((i)+'\n')
# f.close()













  • 0
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值