PY3.6爬取图片,附带断点续传
在这里插入代码片
```python
import urllib.request
import bs4
import re
import os
from urllib import request
def getHtml(url):
    """Fetch *url* and return the raw response body.

    The request carries a desktop-Chrome ``User-Agent`` header instead of
    urllib's default one.

    Args:
        url: Address to fetch, passed unchanged to ``urllib.request.Request``.

    Returns:
        The undecoded response body as ``bytes``.

    Raises:
        Nothing is caught here: any exception raised by ``urlopen`` or
        ``read`` propagates to the caller.
    """
    user_agent = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/51.0.2704.103 Safari/537.36"
    )
    # Named ``req`` so the module-level ``request`` import is not shadowed.
    req = urllib.request.Request(url, headers={"User-Agent": user_agent})
    # The context manager closes the connection even if read() fails.
    with urllib.request.urlopen(req) as response:
        return response.read()
def parse(url):
    """Download *url* with ``getHtml`` and return it as a BeautifulSoup tree.

    The bytes are parsed with the built-in ``html.parser`` backend and
    declared to be UTF-8 encoded.
    """
    return bs4.BeautifulSoup(
        getHtml(url),
        'html.parser',
        from_encoding="utf-8",
    )
def get_ok(uu, page_count):
    """Crawl the listing pages of one category.

    Args:
        uu: Base URL of the category. It is fetched as-is for the first
            page and used as the prefix of ``<n>.html`` for later pages.
        page_count: Number of listing pages to crawl. Nothing is fetched
            when it is not positive.
    """
    if page_count <= 0:
        return

    # The first page is the bare category URL; "1.html" is never requested.
    get_picmm(uu)

    # Later pages are "<base>2.html", "<base>3.html", ...
    # NOTE(review): assumes the last page is "<page_count>.html" - confirm
    # against the site's pagination.
    for page in range(2, page_count + 1):
        get_picmm(uu + str(page) + ".html")
def get_picmm(x_url):
    """Parse one listing page and hand its album list to ``cre_urll``.

    Prints the URL, then locates the first ``<ul>`` inside the first
    ``<div class="boxs">`` of the parsed page.
    """
    print('x_url', x_url)
    soup = parse(x_url)
    box = soup.find("div", "boxs")
    album_list = box.find("ul")
    cre_urll(album_list)
def cre_urll(name):
    """Start a download for every album entry found in *name*.

    Args:
        name: Iterable of list children; ``get_picmm`` passes the ``<ul>``
            element of a listing page.
    """
    for entry in name:
        # Children whose length is 10 or less are skipped
        # (presumably whitespace / non-album nodes - confirm).
        if len(entry) <= 10:
            continue

        link = entry.find('a')
        # Album id: the item URL without its site prefix and ".html" suffix.
        p_href = link.get('href')
        p_href = p_href.replace('https://www.meitulu.com/item/', '')
        p_href = p_href.replace('.html', '')

        p_title = link.find('img').get('alt')
        print('p_title', p_title)

        # The image count is read from the last four characters of the
        # title after removing the bracket / label characters.
        p_url_list = p_title[-4:]
        for unwanted in ('[', ']', '图', '列'):
            p_url_list = p_url_list.replace(unwanted, '')

        get_foot_page(int(p_url_list), p_href, p_title)
def get_foot_page(p_url_list, p_href, p_title,
                  log_path="D:\\XX\\pic_nima\\muluguochan.txt"):
    """Download every image of one album unless the album is already logged.

    The log file at *log_path* holds one finished album URL per line. An
    album whose URL is already present is skipped; otherwise its images
    are saved with ``save_pic`` and the URL is appended to the log.

    Args:
        p_url_list: Number of images in the album; images are numbered
            from 1 to this value.
        p_href: Album identifier used to build the image base URL.
        p_title: Album title, forwarded to ``save_pic``.
        log_path: Progress log file. Defaults to the original fixed
            location.
    """
    pic_url = 'https://mtl.gzhuibei.com/images/img/' + str(p_href) + "/"

    # A missing log simply means nothing has been downloaded yet.
    try:
        with open(log_path, mode='r', encoding='utf-8') as log_file:
            finished = log_file.read()
    except FileNotFoundError:
        finished = ''

    # Membership test: an album logged more than once is still skipped.
    if pic_url in finished:
        print('已下载,跳过', pic_url)
        return

    print('当前下载', pic_url)
    for i in range(1, p_url_list + 1):
        save_pic(p_title, pic_url + str(i) + '.jpg', i)

    # Record the album only after all of its images were saved, so an
    # interrupted album is retried on the next run.
    log_dir = os.path.dirname(log_path)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)
    with open(log_path, mode='a', encoding='utf-8') as log_file:
        log_file.write(pic_url + ",\n")
def save_pic(title, pic_url, i, base_dir="D:\\XX\\pic_nima\\国产\\"):
    """Download one image into the album's directory as ``<i>.jpg``.

    Args:
        title: Album title; used (after sanitising) as the directory name.
        pic_url: URL of the image to download.
        i: Image number, used as the file name.
        base_dir: Directory under which album directories are created.
            Defaults to the original fixed location.
    """
    # The title comes from scraped HTML: replace characters that Windows
    # rejects in file names (and path separators), and drop trailing
    # dots/spaces, so the title always maps to one directory level.
    safe_title = re.sub(r'[<>:"/\\|?*]', '_', title).rstrip('. ') or '_'
    our_dir = os.path.join(base_dir, safe_title)

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(our_dir, exist_ok=True)

    target = os.path.join(our_dir, str(i) + ".jpg")
    urllib.request.urlretrieve(pic_url, filename=target)
if __name__ == '__main__':
    # Script entry point: crawl the "guochan" category, declared as 199 pages.
    x_url = "https://www.meitulu.com/guochan/"
    get_ok(uu=x_url, page_count=199)