#单线程下载实例
# https://www.doutula.com/?page=0 第一页
# https://www.doutula.com/article/list/?page=2 第二页
# https://www.doutula.com/article/list/?page=3
import requests
from lxml import etree
from time import *
def get_url_data(url):
    """Download every image referenced by one listing page.

    Fetches ``url``, parses the returned HTML, collects the
    ``data-original`` attribute of each ``<img>`` found under a
    ``<div class="row">``, downloads each of those image URLs and passes
    the raw bytes to ``save_data``.

    Args:
        url: Address of the listing page to scrape.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6788.400 QQBrowser/10.3.2816.400'
    }
    page = requests.get(url, headers=headers)
    page.encoding = 'utf-8'
    html_element = etree.HTML(page.text)

    # Selecting the attribute node directly yields the URL strings.  The
    # equivalent alternative is to select the <img> elements themselves
    # ('//div[@class="row"]//img[@data-original]') and call
    # .get('data-original') on each one.
    img_herfs = html_element.xpath('//div[@class="row"]//img/@data-original')
    for img_herf in img_herfs:
        # headers must be passed by keyword: the second positional
        # parameter of requests.get is ``params``, so passing the dict
        # positionally would append it to the query string and send the
        # request without the custom User-Agent.
        img_resp = requests.get(img_herf, headers=headers)

        # The last 10 characters of the image URL are used as the file name.
        file_name = img_herf[-10:]
        print(file_name)

        # .content is bytes, which the binary write in save_data needs;
        # .text is str and fails there with
        # "TypeError: a bytes-like object is required, not 'str'".
        save_data(file_name, img_resp.content)
def save_data(file_name, data, save_dir=r"D:\Python_study\new_python\venv\img"):
    """Write binary ``data`` to ``file_name`` inside ``save_dir``.

    Args:
        file_name: Name of the file to create inside ``save_dir``.
        data: Bytes-like payload; the file is opened in binary mode.
        save_dir: Target directory. Defaults to the image directory this
            script was originally hard-wired to, so existing two-argument
            calls keep writing to the same place.
    """
    # Imported locally so the function does not depend on the position of
    # any module-level ``import os`` in the file.
    import os

    # Create the directory on first use instead of failing with
    # FileNotFoundError when it does not exist yet.
    os.makedirs(save_dir, exist_ok=True)
    file = os.path.join(save_dir, file_name)
    with open(file, 'wb') as f:
        f.write(data)
if __name__ == '__main__':
    # Single-threaded driver: process the listing pages one after another
    # and report how long the whole run took.
    began_at = time()
    page_template = 'https://www.doutula.com/?page={}'
    # map() is lazy, so each page URL is formatted right before it is used.
    for page_url in map(page_template.format, range(2)):
        print(page_url + '---------------开始处理---------------')
        get_url_data(page_url)
        print(page_url + '---------------处理完成---------------')
    elapsed = time() - began_at
    print('=============总耗时' + str(elapsed) + "=============")
    # One recorded run reported a total of 32.88343548774719 seconds.
# 多线程爬取图片 (Multithreaded image crawling)
# 附录 (Appendix)
import re
import os
# os.path.splitext splits a string at its last dot, which makes it a handy
# way to separate the extension from the rest of a URL.
url_str = 'http://img.doutula.com/production/uploads/image/2020/06/13/20200613032628_MUgIEA.jpg'
root, suffix = os.path.splitext(url_str)
# root   -> http://img.doutula.com/production/uploads/image/2020/06/13/20200613032628_MUgIEA
# suffix -> .jpg

# Raw string: the backslash belongs to the regex (a literal dot); in a
# normal string literal '\.' is an invalid escape sequence and triggers a
# warning on recent Python versions.
s_str = re.split(r'\.', url_str)
print(s_str)
# ['http://img', 'doutula', 'com/production/uploads/image/2020/06/13/20200613032628_MUgIEA', 'jpg']
print(s_str[-1])  # jpg
print(s_str[-2])  # com/production/uploads/image/2020/06/13/20200613032628_MUgIEA