# -*- coding:utf-8 -*-
# coding = GBK
import json
from bs4 import BeautifulSoup as bs
import urllib.request
import re
from selenium import webdriver
import requests
import os
# html_doc = "https://old.tcmsp-e.com/browse.php?qc=herbs"
# req = urllib.request.Request(html_doc)
# webpage = urllib.request.urlopen(req)
# html = webpage.read()
# # reg=r'data:[\s\S]*]'
# # resule=re.compile(reg)
# # img=re.findall(resule,html.decode('utf-8'))
# # print(img)
# reg=r'"herb_pinyin":"Aidicha","herb_en_name":"Ardisiae Japonicae Herba"'
# reg=r'"herb_pinyin":(.*?),"child'
# resule=re.compile(reg)
# img=re.findall(reg,html.decode('utf-8'))
# cnname=[]
# enname=[]
# for i in img:
# cnname.append(re.findall(r'"(.*?)",',i)[0])
# enname.append(re.findall(r'"herb_en_name":"(.*)"', i)[0])
# print(cnname)
# print(enname)
# print(enname[0])
# --- Script 1: build a search URL for every herb listed in finalquestion.txt ---
# NOTE(review): the indentation of this file has been lost, so the loop nesting
# described in these comments is inferred from statement order - confirm it
# against the original script before relying on it.
# Parallel lists filled from the input file: text after / before the '-'.
enname=[]
cnname=[]
# The file object opened here is never closed explicitly.
for line in open(r'E:\zzztest\zzzzDEMO\che\temp\work\finalquestion.txt', "r"):
# '-(.*)' captures everything after the first '-' on the line;
# '(.*)-' is greedy and captures everything before the last '-'.
en=re.findall(r'-(.*)',line)
cn=re.findall(r'(.*)-',line)
# en[0] / cn[0] raise IndexError for a line that contains no '-'.
enname.append(en[0])
cnname.append(cn[0])
print(enname)
print(cnname)
# The empty list is discarded immediately: ss becomes an alias of enname.
ss=[]
ss=enname
# A global statement at module level has no effect.
global newname
# count is used as the index into cnname / ss for the current round.
count=0
for i in cnname:
print('当前轮次'+str(count))
# print(cnname[count])
# print(ss[count])
# while(count==389):
# print(1)
# newname='Santalum%20Album%20L.'
# ss[count]='Santalum Album L.'
# Placeholder values; newname is rebuilt in the loop below and url is
# assigned again after it.
# NOTE(review): the token query parameter is hard-coded - presumably it has to
# match what the site currently expects; verify.
newname='a'
url=f"https://old.tcmsp-e.com/tcmspsearch.php?qr={newname}&qsr=herb_en_name&token=37e7ba346324682a73bb9b3b160faa6b"
# if(len(ss[count])>1):
print(ss[count])
name=ss[count].split(' ')
# else:name=ss[count]
# count=count+1
print(name)
# Join the words of the name with '%20'. This loop reuses the name i of the
# outer loop variable.
for i in name:
# Bare expression: evaluates newname and discards the result.
newname
# Words are compared by value, so (assuming the nesting suggested by the
# statement order) a later word equal to the first word restarts newname.
if i==name[0]:
newname=i
# print(newname)
# The next two conditions perform the same append; together they cover every
# word that differs from the first one.
if i!=name[0] and i!=name[len(name)-1]:
newname=newname+'%20'+i
if i!=name[0] and i==name[len(name)-1]:
newname=newname+'%20'+i
print(newname)
# while(1):
# print(1)
url=f"https://old.tcmsp-e.com/tcmspsearch.php?qr={newname}&qsr=herb_en_name&token=37e7ba346324682a73bb9b3b160faa6b"
# Opens a Chrome window on the search URL. The name driver is bound again to a
# second browser further down, and this first instance is never closed in the
# visible code.
driver = webdriver.Chrome()
driver.get(url)
# Read by save_file() as the timeout argument of requests.get (seconds).
timeout=5
def get_correctfile(path):
    """Return the base names of the ``.mol2`` files directly inside *path*.

    Sub-directories are skipped. Names are returned in sorted order with the
    ``.mol2`` extension removed, so they can be compared with the names the
    files were saved under.

    Args:
        path: Directory to scan.

    Returns:
        list[str]: File names without the ``.mol2`` suffix; empty when the
        directory holds no such file.

    Raises:
        OSError: If *path* cannot be listed.
    """
    suffix = '.mol2'
    names = []
    for entry in sorted(os.listdir(path)):
        # Join with a real separator: the previous ``path + file`` check
        # pointed at a non-existent path, so directories were never skipped.
        if os.path.isdir(os.path.join(path, entry)):
            continue
        # Require the dot as well, so a name that merely ends in "mol2"
        # is not reported with a truncated base name.
        if entry.endswith(suffix):
            names.append(entry[:-len(suffix)])
    return names
def save_file(fileurl,filename):
    """Download *fileurl* and store it as ``<filename>.mol2`` in the current directory.

    Relies on two module-level names: ``timeout`` (passed to ``requests.get``)
    and the counter ``total``, which is incremented after every saved file.

    Args:
        fileurl: URL of the file to download.
        filename: Target file name without the ``.mol2`` extension.

    Returns:
        None. Progress and failures are reported with ``print``.
    """
    global total
    try:
        # NOTE: verify=False disables TLS certificate verification.
        response = requests.get(fileurl,timeout=timeout,verify=False)
    except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
        # Read timeouts are not ConnectionError subclasses; catch them too so
        # one slow download does not abort the whole run.
        print('超时')
        return
    if response.status_code != 200:
        # Only a successful response is saved; otherwise an error page would
        # be written as a .mol2 file and counted as a successful download.
        print(response.status_code)
        return
    with open(filename+'.mol2', "wb") as f:
        f.write(response.text.encode("utf-8"))
    total += 1
    print(f"保存第{total}个文件")
# --- Script 1, continued: read the result table, download the files, report ---
# NOTE(review): indentation has been lost; the nesting described below is
# inferred from statement order - confirm against the original script.
# Second browser instance; rebinding driver drops the reference to the one
# created earlier without closing it.
driver = webdriver.Chrome()
driver.get(url)
# A global statement at module level has no effect. The assignment below
# shadows the built-in list for the rest of the module.
global list
list= []
newlist = []
# Loop flag for the paging loop below; set to 0 to stop.
flag=1
workpath=r'E:\zzztest\zzzzDEMO\che\temp\work\final'
os.chdir(workpath)
# print(cnname)
# print(enname)
# Per-herb output folder: <workpath>/<cnname>-<enname>.
nowpath=workpath+r'/'+cnname[count]+r'-'+ss[count]
print(nowpath)
if not os.path.exists(nowpath):
os.makedirs(nowpath)
# save_file() writes relative to the current directory, so switch into the folder.
os.chdir(nowpath)
global newtext
# Paging loop: collect cell text from the table, then try to move to the next page.
# NOTE(review): the find_elements_by_* helpers were removed in Selenium 4.3 -
# confirm the Selenium version this script is pinned to.
while(flag):
k = driver.find_elements_by_class_name('k-pager-info')
if(flag):
row = driver.find_elements_by_tag_name('tr')
for i in row:
j = i.find_elements_by_tag_name('td')
# i is rebound from the row element to a cell counter.
i = 0
for item in j:
text = item.text
# r'\n' is a backslash followed by 'n', not a newline character, so this
# splits only on that literal two-character sequence.
aa = text.split(r'\n')
text = aa[0]
# Keeps the non-empty text of every 13th cell (indices 0, 13, ...) -
# presumably the first column of a 13-column table; confirm.
if i % 13 == 0 and text:
# print(text)
list.append(text)
# newlist.append(list[0+i*13])
i = i + 1
# print(newlist)
print(list)
for item in k:
text = item.text
# print(text)
if(text):
# All numbers found in the pager text. Presumably of the form
# "<first> - <last> of <total> items", so [1] == [2] would mean the last
# page has been reached - confirm. IndexError if fewer than three numbers.
newtext=re.findall(r"\d+\.?\d*",text)
print(newtext)
if(newtext[1]==newtext[2]):
flag=0
j = driver.find_elements_by_class_name('k-link')
for item in j:
text = item.text
if (text == 'Go to the next page'):
item.click()
print(list)
# Counter incremented by save_file() for each saved file.
total=0
for i in list:
filename=i
down=f"https://old.tcmsp-e.com/tcmspmol/{filename}.mol2"
save_file(down,filename)
global falselist
# "coorectlist" does not match the name correctlist used below.
global coorectlist
falselist=[]
correctlist=[]
# Two verification passes: list what is on disk, then download again whatever is missing.
flag = 2
while (flag):
flag = flag - 1
falselist = []
correctlist = get_correctfile(nowpath)
# print(correctlist)
for name in list:
if (name not in correctlist):
falselist.extend([name])
print(falselist)
# print(list)
print(f"共计{len(list)}个文件,已成功下载{len(correctlist)}个")
for filename in falselist:
url = f"https://old.tcmsp-e.com/tcmspmol/{filename}.mol2"
save_file(url, filename)
# correctlist is not recomputed after the last retry, so the count printed here
# and written below is the one taken before that retry.
print(f"请求结束共计{len(list)}个文件,已成功下载{len(correctlist)}个,失败文件已输出")
# The quotes and the '+' are literal characters of the file name; filename is
# whatever value the download / retry loops assigned last.
file = open(f"'total'+{filename}.txt", 'w')
for name in list:
# Summary line, written when name equals the first entry of list.
if(name==list[0]):
# nowpath[40:] drops the 39-character workpath plus the '/' separator,
# leaving the "<cnname>-<enname>" folder name.
file.write(nowpath[40:]+':共计'+newtext[2]+'个文件成功'+str(len(correctlist))+'个'+'\n')
print(nowpath[40:]+'+共计'+newtext[2]+'个文件成功'+str(len(correctlist))+'个')
# When the pager total and the number of files on disk differ, append a line
# to question.txt. 'count' inside the string is literal text; the value of
# count is not interpolated.
if(newtext[2]!=str(len(correctlist))):
f = open(r"E:\zzztest\zzzzDEMO\che\temp\work\question.txt", "a+")
f.write('第count轮'+nowpath[40:]+':共计'+newtext[2]+'个文件成功'+str(len(correctlist))+'个'+'\n')
f.close()
file.write(name + '\n')
file.close()
print('count'+cnname[count]+'over')
count=count+1
# close() closes the current window, quit() ends the browser session.
driver.close()
driver.quit()
# NOTE(review): second quit() on the same driver - presumably redundant; confirm.
driver.quit()
# html = driver.page_source
# # response = bs(requests.get(url).content, 'html.parser')
# html = driver.execute_script("return document.documentElement.outerHTML")
# print(html)
# jsonResponse = json.loads(html.decode('utf-8'))
# soup = BeautifulSoup(html, 'html.parser')
# # data=(str(soup)).data
#
# s=soup.findAll(r"/ul")
# # times = soup.select('data')
# print(s)
# f = open(r'soup.txt','w',encoding='utf-8') #文件路径、操作模式、编码 # r''
# f.write(str(soup))
# f.close()
# for line in open(r'E:\zzztest\zzzzDEMO\che\temp\work\resultsoup.txt',"r",encoding='utf-8'):
# print(line)
# with open(r'E:\zzztest\zzzzDEMO\che\temp\work\resultsoup.txt', "r",encoding='utf-8') as f: # 打开文件
# data = f.read() # 读取文件
# print(data)
# f = open(r'E:\zzztest\zzzzDEMO\che\temp\work\resultsoup.txt',encoding='utf-8')
# print(f.read())
# regex = 'data:[.*?]'
# for line in f:
# s=re.findall(regex,f)
# print('ok')
# # print(type(f))
#
# f.close()
# ===== Script 2: verification ("校验") =====
# -*- coding:utf-8 -*-
# coding = GBK
import json
from bs4 import BeautifulSoup as bs
import urllib.request
import re
from selenium import webdriver
import requests
import os
# --- Script 2: take the herb names from the browse page and build a search URL ---
# NOTE(review): the indentation of this file has been lost, so the loop nesting
# described in these comments is inferred from statement order - confirm it
# against the original script.
html_doc = "https://old.tcmsp-e.com/browse.php?qc=herbs"
req = urllib.request.Request(html_doc)
# The response object is not closed explicitly.
webpage = urllib.request.urlopen(req)
html = webpage.read()
# reg=r'data:[\s\S]*]'
# resule=re.compile(reg)
# img=re.findall(resule,html.decode('utf-8'))
# print(img)
# The first pattern is overwritten by the next line and never used.
reg=r'"herb_pinyin":"Aidicha","herb_en_name":"Ardisiae Japonicae Herba"'
# Captures the text between '"herb_pinyin":' and ',"child' (non-greedy).
reg=r'"herb_pinyin":(.*?),"child'
# The compiled pattern is not used; findall below is called with the string reg.
resule=re.compile(reg)
img=re.findall(reg,html.decode('utf-8'))
cnname=[]
enname=[]
for i in img:
# First double-quoted value that is followed by a comma. The captured text
# starts right after "herb_pinyin":, so this is the pinyin value.
cnname.append(re.findall(r'"(.*?)",',i)[0])
# Value of "herb_en_name" (greedy, up to the last double quote of the capture).
enname.append(re.findall(r'"herb_en_name":"(.*)"', i)[0])
print(cnname)
print(enname)
# IndexError if the page yielded no match.
print(enname[0])
# The empty list is discarded immediately: ss becomes an alias of enname.
ss=[]
ss=enname
# A global statement at module level has no effect.
global newname
# count is used as the index into cnname / ss for the current round.
count=0
for i in cnname:
print('当前轮次'+str(count))
# print(cnname[count])
# print(ss[count])
# NOTE(review): if print(1) is the body of this loop, as the statement order
# suggests, the script spins here forever once count reaches 1 - presumably a
# leftover debugging stop; confirm.
while(count==1):
print(1)
# newname='Santalum%20Album%20L.'
# ss[count]='Santalum Album L.'
# Placeholder values; newname is rebuilt in the loop below and url is
# assigned again after it.
newname='a'
url=f"https://old.tcmsp-e.com/tcmspsearch.php?qr={newname}&qsr=herb_en_name&token=37e7ba346324682a73bb9b3b160faa6b"
# For a name of at most one character, name stays a str, so the loop below
# iterates over its characters instead of over a list of words.
if(len(ss[count])>1):
name=ss[count].split(' ')
else:name=ss[count]
# count=count+1
print(name)
# Join the words of the name with '%20'. This loop reuses the name i of the
# outer loop variable.
for i in name:
# Bare expression: evaluates newname and discards the result.
newname
if i==name[0]:
newname=i
# print(newname)
# The next two conditions perform the same append; together they cover every
# word that differs from the first one.
if i!=name[0] and i!=name[len(name)-1]:
newname=newname+'%20'+i
if i!=name[0] and i==name[len(name)-1]:
newname=newname+'%20'+i
print(newname)
url=f"https://old.tcmsp-e.com/tcmspsearch.php?qr={newname}&qsr=herb_en_name&token=37e7ba346324682a73bb9b3b160faa6b"
# Opens a Chrome window on the search URL. The name driver is bound again to a
# second browser further down, and this first instance is never closed in the
# visible code.
driver = webdriver.Chrome()
driver.get(url)
# Read by save_file() as the timeout argument of requests.get (seconds).
timeout=5
def get_correctfile(path):
    """Return the base names of the ``.mol2`` files directly inside *path*.

    Sub-directories are skipped. Names are returned in sorted order with the
    ``.mol2`` extension removed.

    Args:
        path: Directory to scan.

    Returns:
        list[str]: File names without the ``.mol2`` suffix; empty when the
        directory holds no such file.

    Raises:
        OSError: If *path* cannot be listed.
    """
    suffix = '.mol2'
    names = []
    for entry in sorted(os.listdir(path)):
        # Join with a real separator: the previous ``path + file`` check
        # pointed at a non-existent path, so directories were never skipped.
        if os.path.isdir(os.path.join(path, entry)):
            continue
        # Require the dot as well, so a name that merely ends in "mol2"
        # is not reported with a truncated base name.
        if entry.endswith(suffix):
            names.append(entry[:-len(suffix)])
    return names
def save_file(fileurl,filename):
    """Download *fileurl* and store it as ``<filename>.mol2`` in the current directory.

    Relies on two module-level names: ``timeout`` (passed to ``requests.get``)
    and the counter ``total``, which is incremented after every saved file.

    Args:
        fileurl: URL of the file to download.
        filename: Target file name without the ``.mol2`` extension.

    Returns:
        None. Progress and failures are reported with ``print``.
    """
    global total
    try:
        # NOTE: verify=False disables TLS certificate verification.
        response = requests.get(fileurl,timeout=timeout,verify=False)
    except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
        # Read timeouts are not ConnectionError subclasses; catch them too so
        # one slow download does not abort the whole run.
        print('超时')
        return
    if response.status_code != 200:
        # Only a successful response is saved; otherwise an error page would
        # be written as a .mol2 file and counted as a successful download.
        print(response.status_code)
        return
    with open(filename+'.mol2', "wb") as f:
        f.write(response.text.encode("utf-8"))
    total += 1
    print(f"保存第{total}个文件")
# --- Script 2, continued: read the pager totals and write a summary file ---
# NOTE(review): indentation has been lost; the nesting described below is
# inferred from statement order - confirm against the original script.
# Second browser instance; rebinding driver drops the reference to the one
# created earlier without closing it.
driver = webdriver.Chrome()
driver.get(url)
# A global statement at module level has no effect. The assignment below
# shadows the built-in list for the rest of the module.
global list
list= []
newlist = []
flag=1
workpath=r'E:\zzztest\zzzzDEMO\che\temp\work\final'
os.chdir(workpath)
# print(cnname)
# print(enname)
# Per-herb folder: <workpath>/<cnname>-<enname>.
nowpath=workpath+r'/'+cnname[count]+r'-'+ss[count]
print(nowpath)
if not os.path.exists(nowpath):
os.makedirs(nowpath)
os.chdir(nowpath)
global newtext
# The paging loop is disabled in this variant; only the pager text of the page
# that is currently loaded is read.
# NOTE(review): the find_elements_by_* helpers were removed in Selenium 4.3 -
# confirm the Selenium version this script is pinned to.
# while(flag):
k = driver.find_elements_by_class_name('k-pager-info')
# if(flag):
# row = driver.find_elements_by_tag_name('tr')
# for i in row:
# j = i.find_elements_by_tag_name('td')
# i = 0
# for item in j:
# text = item.text
# aa = text.split(r'\n')
# text = aa[0]
# if i % 13 == 0 and text:
# # print(text)
# list.append(text)
# # newlist.append(list[0+i*13])
# i = i + 1
# # print(newlist)
# print(list)
for item in k:
text = item.text
# print(text)
if(text):
# All numbers found in the pager text; newtext[2] is used below as the total.
# If no pager element has text, newtext keeps its value from a previous round
# (or is undefined in the first round).
newtext=re.findall(r"\d+\.?\d*",text)
print(newtext)
# if(newtext[1]==newtext[2]):
# flag=0
# j = driver.find_elements_by_class_name('k-link')
# for item in j:
# text = item.text
# if (text == 'Go to the next page'):
# item.click()
print(list)
# Counter incremented by save_file() for each saved file.
total=0
# list is never filled in this variant (the collecting code above is commented
# out), so this loop does not download anything.
for i in list:
filename=i
down=f"https://old.tcmsp-e.com/tcmspmol/{filename}.mol2"
save_file(down,filename)
global falselist
# "coorectlist" does not match the name correctlist used below.
global coorectlist
falselist=[]
correctlist=[]
flag = 2
# while (flag):
# flag = flag - 1
# falselist = []
# Base names of the .mol2 files already present in the herb folder.
correctlist = get_correctfile(nowpath)
# # print(correctlist)
# for name in list:
# if (name not in correctlist):
# falselist.extend([name])
# print(falselist)
# # print(list)
# print(f"共计{len(list)}个文件,已成功下载{len(correctlist)}个")
# for filename in falselist:
# url = f"https://old.tcmsp-e.com/tcmspmol/{filename}.mol2"
# save_file(url, filename)
# print(f"请求结束共计{len(list)}个文件,已成功下载{len(correctlist)}个,失败文件已输出")
# f-string without placeholders: the file name is literally 'total'.txt,
# quotes included, created in the current directory (nowpath).
file = open(f"'total'.txt", 'w')
# for name in list:
# if(name==list[0]):
# nowpath[40:] drops the 39-character workpath plus the '/' separator,
# leaving the "<cnname>-<enname>" folder name.
file.write(nowpath[40:]+':共计'+newtext[2]+'个文件成功'+str(len(correctlist))+'个'+'\n')
print(nowpath[40:]+'+共计'+newtext[2]+'个文件成功'+str(len(correctlist))+'个')
# if(newtext[2]!=str(len(correctlist))):
# f = open(r"E:\zzztest\zzzzDEMO\che\temp\work\question.txt", "a+")
# f.write('第count轮'+nowpath[40:]+':共计'+newtext[2]+'个文件成功'+str(len(correctlist))+'个'+'\n')
# f.close()
# file.write(name + '\n')
file.close()
print('count'+cnname[count]+'over')
count=count+1
# close() closes the current window, quit() ends the browser session.
driver.close()
driver.quit()
# NOTE(review): second quit() on the same driver - presumably redundant; confirm.
driver.quit()
# html = driver.page_source
# # response = bs(requests.get(url).content, 'html.parser')
# html = driver.execute_script("return document.documentElement.outerHTML")
# print(html)
# jsonResponse = json.loads(html.decode('utf-8'))
# soup = BeautifulSoup(html, 'html.parser')
# # data=(str(soup)).data
#
# s=soup.findAll(r"/ul")
# # times = soup.select('data')
# print(s)
# f = open(r'soup.txt','w',encoding='utf-8') #文件路径、操作模式、编码 # r''
# f.write(str(soup))
# f.close()
# for line in open(r'E:\zzztest\zzzzDEMO\che\temp\work\resultsoup.txt',"r",encoding='utf-8'):
# print(line)
# with open(r'E:\zzztest\zzzzDEMO\che\temp\work\resultsoup.txt', "r",encoding='utf-8') as f: # 打开文件
# data = f.read() # 读取文件
# print(data)
# f = open(r'E:\zzztest\zzzzDEMO\che\temp\work\resultsoup.txt',encoding='utf-8')
# print(f.read())
# regex = 'data:[.*?]'
# for line in f:
# s=re.findall(regex,f)
# print('ok')
# # print(type(f))
#
# f.close()
# ===== Script 3: test =====
# -*- coding:utf-8 -*-
import os
import re
# --- Script 3: check the summary files against the downloaded .mol2 files ---
# Directory whose entries are scanned below.
workpath = r'E:\zzztest\zzzzDEMO\che\temp\work\final'
# Prints the length of the path - presumably to check the fixed slice offsets
# used on paths elsewhere; confirm.
print(len(workpath))
os.chdir(workpath)
def get_file(path):
    """Return the full path of every entry in *path*, sorted by entry name.

    Files and directories are both included.

    Args:
        path: Directory to list.

    Returns:
        list[str]: ``os.path.join(path, name)`` for each entry.

    Raises:
        OSError: If *path* cannot be listed.
    """
    # os.path.join replaces the hard-coded backslash separator.
    return [os.path.join(path, entry) for entry in sorted(os.listdir(path))]
def get(path):
    """Return the full paths of the ``.mol2`` files directly inside *path*.

    Sub-directories are skipped; results are sorted by file name.

    Args:
        path: Directory to scan.

    Returns:
        list[str]: ``os.path.join(path, name)`` for each ``.mol2`` file.

    Raises:
        OSError: If *path* cannot be listed.
    """
    found = []
    for entry in sorted(os.listdir(path)):
        full_path = os.path.join(path, entry)
        # The previous ``path + file`` check had no separator, so it never
        # recognised a directory.
        if os.path.isdir(full_path):
            continue
        if entry.endswith('.mol2'):
            found.append(full_path)
    return found
def get_list(path):
    """Return the full paths of the ``.txt`` files directly inside *path*.

    Sub-directories are skipped; results are sorted by file name.

    Args:
        path: Directory to scan.

    Returns:
        list[str]: ``os.path.join(path, name)`` for each ``.txt`` file.

    Raises:
        OSError: If *path* cannot be listed.
    """
    txtlist = []
    for entry in sorted(os.listdir(path)):
        full_path = os.path.join(path, entry)
        # The previous ``path + file`` check had no separator, so it never
        # recognised a directory.
        if os.path.isdir(full_path):
            continue
        if entry.endswith('.txt'):
            txtlist.append(full_path)
    return txtlist
# NOTE(review): the indentation of this file has been lost, so the loop nesting
# described in these comments is inferred from statement order - confirm it
# against the original script.
# Every entry of workpath; the loop below passes each one to get_list() / get()
# as a directory. The assignment shadows the built-in list.
list=get_file(workpath)
# print('共'+str(len(list))+'文件夹')
# print(list)
# Never filled in the visible code; only referenced by the commented-out lines
# that follow this block.
errorlist=[]
for i in list:
# print(i[40:])
# re.match(r'',i[])
index=0
txtlist=get_list(i)
# print(txtlist)
# count marks "still on the first line"; cnt switches on collecting the
# remaining lines of the file.
count=1
cnt=0
errlist=[]
# Reads the first .txt file of the folder; IndexError if there is none. The
# file object is never closed explicitly.
for line in open(txtlist[0], "r"):
# From the second line on (only when the first line's numbers differed), keep
# the first 9 characters of each line.
if cnt:
errlist.append(line[0:9])
if count:
# print(line)
# First line: compare the first two numbers found in it.
result = re.findall(r"\d+\.?\d*",line)
# print(txtlist[0])
# print(result)
count=0
if(result[0]!=result[1]):
print(result[0]+'!='+result[1])
cnt=1
print(errlist)
newlist=[]
finallist=[]
if(errlist):
ss=get(i)
# print(ss)
# This loop reuses the name i of the outer loop variable.
for i in ss:
# print(i)
# The 9 characters in front of the 5-character ".mol2" extension.
newlist.append(i[-14:-5])
print(newlist)
# cnt is reused here as the number of collected names that have a .mol2 file.
cnt=0
for i in errlist:
# print(len(errlist))
# print(len(newlist))
if i in newlist:
cnt=cnt+1
# print(i)
if errlist:
print(len(errlist))
print(len(newlist))
print(cnt)
# index is set to 0 earlier in this block and is not read anywhere in the
# visible code.
index=index+1
# f = open(r"E:\zzztest\zzzzDEMO\che\temp\work\finalquestion.txt", "a+")
# for i in errorlist:
# f.write((i)+'\n')
# f.close()