爬取校花网图片
1.所用工具总结
- python3.7
- requests
- urllib
- lxml
2.代码部分
utils/download.py
import requests
from urllib.parse import urljoin
import lxml.html
# 解决lxml 中读取不到etree 模块的问题
etree = lxml.html.etree
# Download 类
class Download:
    """Fetch listing pages from a site and save the images they reference.

    Attributes:
        baseUrl: Root URL that relative page and image paths are joined onto.
        filePath: String prefix concatenated in front of every image file
            name, so it should end with a path separator.
        timeout: Seconds passed as ``timeout`` to every ``requests.get`` call.
    """

    def __init__(self, baseUrl, filePath, timeout=10):
        """Store the site root, the output path prefix and the HTTP timeout."""
        self.baseUrl = baseUrl
        self.filePath = filePath
        # Without a timeout a stalled connection would block the crawl forever.
        self.timeout = timeout

    def getHtmlText(self, pagePath):
        """Return the response body of ``pagePath`` as text.

        ``pagePath`` is resolved against ``baseUrl``. Exceptions raised by
        ``requests`` (including timeouts) propagate to the caller.
        """
        url = urljoin(self.baseUrl, pagePath)
        response = requests.get(url, timeout=self.timeout)
        return response.text

    def getImgList(self, pageText):
        """Extract image descriptors from the HTML of a listing page.

        Args:
            pageText: HTML source of the page.

        Returns:
            A list of dicts, one per ``<img>`` found under
            ``div#list_img div.img``, with the keys ``name`` (the alt text),
            ``src`` (URL joined onto ``baseUrl``) and ``filePath``
            (``self.filePath + name + '.jpg'``). Images that lack an ``alt``
            or ``src`` attribute are skipped.
        """
        html = etree.HTML(pageText)
        # The parser may hand back None for an empty document; treat that as
        # a page without images instead of failing on ``None.xpath``.
        if html is None:
            return []
        img_list = []
        for node in html.xpath('//div[@id="list_img"]//div[@class="img"]//img'):
            alts = node.xpath('./@alt')
            srcs = node.xpath('./@src')
            # Indexing [0] on an empty result would raise IndexError and
            # abort the whole page because of one incomplete tag.
            if not alts or not srcs:
                continue
            img_name = alts[0]
            img_list.append({
                'name': img_name,
                'src': urljoin(self.baseUrl, srcs[0]),
                'filePath': self.filePath + img_name + '.jpg',
            })
        return img_list

    def saveImg(self, imgDict):
        """Download one image and write it to disk, best effort.

        Args:
            imgDict: Dict with at least ``src`` (image URL) and ``filePath``
                (destination file), as produced by ``getImgList``.

        Failures (missing keys, network/HTTP errors, file errors) are printed
        and the image is skipped; nothing is raised for them.
        """
        src = imgDict.get('src')
        filePath = imgDict.get('filePath')
        if not src or not filePath:
            print('skip image, missing src or filePath: %r' % (imgDict,))
            return
        try:
            response = requests.get(src, timeout=self.timeout)
            # Do not store an HTTP error page under a .jpg name.
            response.raise_for_status()
        except requests.RequestException as error:
            print(error)
            return
        try:
            with open(filePath, 'wb') as f:
                f.write(response.content)
        except OSError as error:
            # IOError is an alias of OSError in Python 3, so one clause
            # covers both.
            print(error)
main.py
from utils.download import Download
import time
# Site root and the local path prefix the images are written under.
baseUrl = 'http://www.xiaohuar.com/'
filePath = 'D:\\test\\'
# Number of listing pages to crawl (the category had 47 pages when written).
pageCount = 47
# Build the downloader.
dd = Download(baseUrl=baseUrl, filePath=filePath)
# Page URLs have the form http://www.xiaohuar.com/list-1-<index>.html
for index in range(pageCount):
    # Relative path of the current listing page.
    pagePath = 'list-1-%s.html' % index
    # Fetch the page source.
    pageText = dd.getHtmlText(pagePath=pagePath)
    # Collect the image descriptors found on the page.
    img_list = dd.getImgList(pageText)
    # Download every image of the page.
    for item in img_list:
        dd.saveImg(item)
    # index is zero-based, so index + 1 pages are finished at this point;
    # using the bare index reported 0.00 after the first page and never
    # reached 100.
    percent = (index + 1) / pageCount * 100
    print('完成率%.2f' % percent)
    # Pause one second before requesting the next page.
    time.sleep(1)
3. 运行结果
4.坑的总结
lxml 较新的版本没法直接读取etree模块,修改如下
import lxml.html
# 解决lxml 中读取不到etree 模块的问题
etree = lxml.html.etree