自己动手编写了一个爬虫,用来爬取某网站的妹纸图片。
先介绍下思路吧
首先:获取每个分页的 HTML 代码,并找出各分页 URL 的后缀规律;
其次:找到能定位每张图片地址(img 标签的 src 属性)的 XPath 表达式;
最后:把图片下载下来,保存到本地即可。
多的不说,直接上代码。有什么不明白的地方,欢迎留言。
当然了,如果能资助一波,我会感激涕零的。
import requests
from lxml import etree
class Get_image():
    """Download the images linked from a sequence of gallery pages.

    Each page is fetched, its image URLs are extracted with an XPath query,
    and every image is saved as ``<output_dir>/<n>.jpg``. ``n`` is a 1-based
    number derived from the page position and the image position on that
    page, so file names stay stable even if a page yields fewer images than
    ``per_page``.
    """

    def __init__(self, url, per_page=15, output_dir="girl"):
        """Store the crawl configuration.

        Args:
            url: Iterable of page URLs to crawl, in order.
            per_page: Maximum number of images saved from one page; also the
                numbering stride between consecutive pages.
            output_dir: Directory the ``.jpg`` files are written to.
        """
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"
        }
        self.urls = url
        self.per_page = per_page
        self.output_dir = output_dir

    def parse_multi_page(self):
        """Fetch every page in ``self.urls`` and save the images it references.

        Images on page ``p`` (1-based) are numbered from
        ``(p - 1) * per_page + 1``; at most ``per_page`` images are saved per
        page so the numbers of consecutive pages never collide. A progress
        line is printed after each page.
        """
        # Imported locally so this block does not depend on the file header.
        import os

        # open() does not create missing directories; make sure it exists.
        os.makedirs(self.output_dir, exist_ok=True)

        for page, url in enumerate(self.urls, start=1):
            # headers must be passed by keyword: the second positional
            # argument of requests.get is ``params`` (the query string).
            page_res = requests.get(url, headers=self.headers)
            html = etree.HTML(page_res.content)
            image_urls = html.xpath("//div[@class='pic']//li//a/img/@src")

            # One number per image; the slice enforces the per-page cap.
            first_num = (page - 1) * self.per_page + 1
            for num, image_url in enumerate(image_urls[:self.per_page], start=first_num):
                image_res = requests.get(image_url, headers=self.headers)
                target = os.path.join(self.output_dir, "{}.jpg".format(num))
                # "wb" (not "ab"): a re-run must replace the file instead of
                # appending a second copy of the image to it.
                with open(target, "wb") as f:
                    f.write(image_res.content)
            print("第{}页抓取完毕".format(page))
def main():
    """Build the list of page URLs and download every image they reference."""
    page_template = "http://www.hunter-its.com/m/{}.html"
    # Pages are numbered from 1; range(1, 5) covers pages 1 through 4.
    page_urls = [page_template.format(page_no) for page_no in range(1, 5)]
    Get_image(page_urls).parse_multi_page()


# Run the crawl only when executed as a script, not when imported.
if __name__ == "__main__":
    main()