import requests
import re
import threading
from bs4 import BeautifulSoup
def getPicUrl1(url):  # extract the direct .jpg link from a photo page
    html = requests.get(url)
    html.encoding = 'GBK'
    html = html.text
    try:
        jpg = re.search(r'http://t1.mmonly.cc/uploads/tu/\d{6}/\d{4}/.+jpg', html)
        src = jpg.group(0)
        print("success")
        return src
    except AttributeError:  # re.search returned None: no image link on this page
        print("failed")
def getPic(src):
    for i in range(50):
        if i == 0:
            startSrc = src  # remember the first page URL of the album
        elif i == 1:
            continue  # page 2 is '_2.html'; there is no '_1.html'
        else:
            src = src.replace(".html", '') + '_' + str(i) + '.html'
        print(src)
        html = requests.get(src)
        if html.status_code != 200:  # past the last page of this album
            break
        html.encoding = 'GBK'
        html = html.text
        jpgUrl = getPicUrl1(src)
        jpg = requests.get(jpgUrl)
        soup = BeautifulSoup(html, 'lxml')
        h1 = soup.find_all('h1')
        strH1 = str(h1)
        name = re.search(r'[\u4e00-\u9fa5]+', strH1)  # Chinese title from the <h1> tag
        name = name.group()
        name = name + str(i) + '.jpg'
        # path = r'E/pics/%s' % name
        print(name)
        with open(name, 'wb') as f:
            f.write(jpg.content)
            print("%s written successfully" % name)
        # print("%s write failed" % name)
        src = startSrc  # reset to the first page URL before building the next one
def main():
    ths = []
    mainUrl = 'http://www.mmonly.cc/tag/cs/'
    html = requests.get(mainUrl)
    html.encoding = 'GBK'
    html = html.text
    urls = re.findall(r'http:.+/mmtp.+html', html)
    for url in urls:
        getPic(url)
    # Threaded alternative to the serial loop above:
    """
    th = threading.Thread(target=getPic, args=[url])
    th.start()
    ths.append(th)
    for i in ths:
        i.join()
    """

main()
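
The title advertises multi-threading, but the threaded block above is left commented out, so the albums are downloaded one after another. Below is a minimal sketch of a threaded entry point that reuses the getPic function defined above; it swaps the raw threading.Thread calls for concurrent.futures.ThreadPoolExecutor, and the worker count of 4 is an assumption chosen only to avoid flooding the site.

from concurrent.futures import ThreadPoolExecutor

def main_threaded():
    # Same listing page and regex as main() above.
    mainUrl = 'http://www.mmonly.cc/tag/cs/'
    html = requests.get(mainUrl)
    html.encoding = 'GBK'
    urls = re.findall(r'http:.+/mmtp.+html', html.text)
    # Four workers download several albums in parallel;
    # pool.map blocks until every getPic call has finished.
    with ThreadPoolExecutor(max_workers=4) as pool:
        pool.map(getPic, urls)

The pool handles starting and joining the threads, which replaces the manual th.start() / th.join() bookkeeping in the commented-out block.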
Scraping photos from 美图网 (mmonly.cc) with multiple threads