1、pyquery用法类似于jQuery选择器。
2、fake_useragent库的UserAgent类可以自动生成随机的User-Agent请求头。
3、代码块。
# coding=gbk
import requests
import time
from pyquery import PyQuery as pq
from fake_useragent import UserAgent
# Site root; the href/src values scraped from the pages are appended to it.
BASE_URL = "http://pic.netbian.com"
# Number of listing pages to walk.
PAGE_COUNT = 10
# Seconds to wait for each HTTP response so a stalled server cannot hang the run.
REQUEST_TIMEOUT = 10
# Session cookie sent with every request.
COOKIE = "__cfduid=d32456ad91a005b1632715ead68484c4b1595580134; Hm_lvt_526caf4e20c21f06a4e9209712d6a20e=1595580139; zkhanecookieclassrecord=%2C66%2C54%2C; Hm_lpvt_526caf4e20c21f06a4e9209712d6a20e=1595582860"
# Characters replaced in file names: the path separators plus the characters
# Windows rejects in a file name.
INVALID_FILENAME_CHARS = '\\/:*?"<>|'


def build_page_url(page):
    """Return the listing URL for the zero-based *page* index.

    Page 0 is ``index.html``; every later page is ``index_<page + 1>.html``.
    """
    if page == 0:
        return f"{BASE_URL}/4kfengjing/index.html"
    # The f-prefix matters: without it the braces are sent literally.
    return f"{BASE_URL}/4kfengjing/index_{page + 1}.html"


def absolute_url(path):
    """Return *path* as an absolute URL.

    Values that already carry an http(s) scheme are returned unchanged;
    anything else is appended to ``BASE_URL``.
    """
    if path.startswith(("http://", "https://")):
        return path
    return BASE_URL + path


def safe_filename(title, fallback):
    """Return a file name stem built from *title*.

    Characters listed in ``INVALID_FILENAME_CHARS`` are replaced with ``_``.
    When *title* is ``None`` or nothing but whitespace, *fallback* is
    returned instead.
    """
    cleaned = "".join(
        "_" if ch in INVALID_FILENAME_CHARS else ch for ch in (title or "")
    ).strip()
    return cleaned or fallback


def fetch_document(url, headers):
    """Download *url* and return its HTML loaded into a PyQuery document."""
    response = requests.get(
        url, headers=headers, verify=False, timeout=REQUEST_TIMEOUT
    )
    # The site is served as GBK; follow the actual site encoding.
    response.encoding = "gbk"
    return pq(response.text)


def main():
    """Walk the listing pages and save every detail-page image as a .jpg."""
    # Build the request headers once; the User-Agent is picked at random.
    ua = UserAgent(verify_ssl=False)
    headers = {
        "Cookie": COOKIE,
        "User-Agent": ua.random,
    }
    for page in range(PAGE_COUNT):
        page_doc = fetch_document(build_page_url(page), headers)
        # Select the thumbnail links (jQuery-style selector).
        for anchor in page_doc.find('.clearfix li a').items():
            href = anchor.attr('href')
            if not href:
                # Nothing to follow; concatenating None would raise TypeError.
                continue
            print("正在进入html......")
            details_doc = fetch_document(absolute_url(href), headers)
            for img in details_doc.find('#img img').items():
                src = img.attr('src')
                if not src:
                    continue
                # Read the attribute by its str name; the title becomes the
                # file name, with the src base name as a fallback.
                title = img.attr('title')
                fallback = src.rsplit("/", 1)[-1].rsplit(".", 1)[0] or "image"
                # Fetch the image bytes.
                image_response = requests.get(
                    absolute_url(src),
                    headers=headers,
                    verify=False,
                    timeout=REQUEST_TIMEOUT,
                )
                # Write the image to the current working directory.
                with open(f"{safe_filename(title, fallback)}.jpg", "wb") as f:
                    f.write(image_response.content)
                print(f"{src}保存成功")


if __name__ == "__main__":
    main()