从今天开始就要撸起袖子,直接写Python爬虫了,学习语言最好的办法就是有目的的进行,所以,本着美女是学习动力的第一原则,啊哈哈。写个程序把妹子们都下载下来吧。
福利时刻
妹子图片质量整体上还是不错呦,放两张不同风格的图大家感受下,O(∩_∩)O哈哈~
爬取结果
完整代码如下:
import re, os, requests, time
from lxml import etree
from mytool.randomUA import get_ua
from mytool.requests_plus import request_ssr
def mian(start_page=1, end_page=2):
    """Crawl the list pages and hand each page's albums to ``detail``.

    NOTE: the name ``mian`` (sic) is kept because the ``__main__`` guard
    calls it by this spelling.

    Args:
        start_page: first list page to fetch (inclusive). Defaults to 1,
            matching the original hard-coded behavior.
        end_page: page number to stop at (exclusive, as in ``range``).
    """
    for page in range(start_page, end_page):
        url = f"https://www.mmonly.cc/mmtp/qcmn/list_16_{page}.html"
        response = request_ssr(url, headers=get_ua())
        if not response:
            # Request failed; move on to the next list page.
            continue
        # The site serves GBK-encoded HTML, not UTF-8.
        root = etree.HTML(response.content.decode("gbk"))
        div_list = root.xpath(
            "//div[@id='infinite_scroll']/div[@class='item masonry_brick masonry-brick']"
        )
        print(len(div_list))
        # One dict per album: its detail-page URL and its display title.
        data_list = [
            {
                "detail_url": div.xpath(".//div[@class='ABox']/a/@href")[0],
                "title": div.xpath(".//div[@class='title']/span/a/text()")[0],
            }
            for div in div_list
        ]
        detail(page, data_list)
# Second layer: fetch each album's detail pages and download its images.
def detail(page, data_list):
    """Download every image of every album listed in ``data_list``.

    Creates one directory per album (named after the sanitized title) and
    saves images as ``1.jpg``, ``2.jpg``, ... inside it.

    Args:
        page: list-page number (kept for interface compatibility; unused here).
        data_list: dicts with ``detail_url`` and ``title`` keys, as built
            by the list-page crawler.
    """
    # Loop-invariant: one fixed User-Agent for all detail/image requests.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"
    }
    for data in data_list:
        detail_url = data["detail_url"]
        # Strip characters that are illegal/awkward in directory names
        # (slashes, backslashes, quotes, whitespace).
        title = re.sub(r'[/\\"\s]', "", data["title"])
        os.makedirs(f"./{title}", exist_ok=True)
        print("创建目录:", f"./{title}")
        response = request_ssr(detail_url, headers=headers)
        if not response:
            continue
        # Detail pages are GBK-encoded as well.
        root = etree.HTML(response.content.decode("gbk"))
        img_url = root.xpath("//div[@id='big-pic']//img/@src")[0]
        total_page_num = int(root.xpath("//span[@class='totalpage']/text()")[0])
        _save_image(img_url, f"./{title}/1.jpg", headers)
        # Page i of the album lives at <detail_url minus ".html"> + "_i.html".
        for i in range(2, total_page_num + 1):
            next_url = detail_url[:-5] + f"_{i}.html"
            next_response = request_ssr(next_url, headers=headers)
            if not next_response:
                continue
            next_root = etree.HTML(next_response.content.decode("gbk"))
            img_url = next_root.xpath("//div[@id='big-pic']//img/@src")[0]
            _save_image(img_url, f"./{title}/{i}.jpg", headers)


def _save_image(img_url, path, headers):
    """Fetch ``img_url`` and write its bytes to ``path``.

    Requests FIRST, and only opens the file on success — the original code
    opened the file before checking the response, which left an empty
    0-byte file behind whenever the download failed.
    """
    print("当前下载链接:", img_url)
    response = request_ssr(img_url, headers=headers)
    if response:
        with open(path, "wb") as f:
            f.write(response.content)
# Script entry point: run the list-page crawler when executed directly.
if __name__ == "__main__":
    mian()
好了,今天这个简单的爬虫就完成了。