#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2020/12/1 14:34
# @Author : huni
# @File : 全站爬取.py
# @Software: PyCharm
import requests
from lxml import etree
import os
# Number of galleries the site is assumed to list on one page of search
# results; the page-count arithmetic depends on it.
RESULTS_PER_PAGE = 28
# Seconds to wait on any single HTTP request; without a timeout a stalled
# connection would hang the crawl forever.
REQUEST_TIMEOUT = 15
# Characters that are illegal in Windows file names (plus line breaks and
# tabs), each mapped to an underscore for use with str.translate().
_UNSAFE_NAME_CHARS = {ord(ch): '_' for ch in '\\/:*?"<>|\r\n\t'}


def parse_result_count(label):
    """Return the hit count contained in a label such as ``'57条'``.

    Only the decimal digits of *label* are used; a label without any digit
    yields 0, which callers treat as "nothing found".
    """
    digits = ''.join(ch for ch in label if ch.isdecimal())
    return int(digits) if digits else 0


def page_count(total, per_page=RESULTS_PER_PAGE):
    """Return how many result pages are needed to show *total* items.

    Uses ceiling division so an exact multiple of *per_page* does not
    produce a trailing empty page.
    """
    return (total + per_page - 1) // per_page


def gallery_page_url(detail_url, page):
    """Return the URL of page number *page* of the gallery at *detail_url*.

    The part after the final ``-`` of *detail_url* is replaced by
    ``'<page>.html'``. Only that trailing part is touched, so an identical
    substring earlier in the URL is left alone.

    Raises:
        ValueError: if *detail_url* contains no ``-`` to split on.
    """
    head, dash, _ = detail_url.rpartition('-')
    if not dash:
        raise ValueError(f'unexpected gallery URL: {detail_url!r}')
    return f'{head}-{page}.html'


def last_page_number(link_texts):
    """Return the gallery's page count from its pagination link texts.

    *link_texts* holds the texts of the pager links in document order; the
    second-to-last one is read as the highest page number.
    NOTE(review): falling back to 1 assumes a gallery without a usable pager
    has a single page - confirm against the site.
    """
    if len(link_texts) >= 2:
        candidate = link_texts[-2].strip()
        if candidate.isdecimal():
            return int(candidate)
    return 1


def safe_name(text):
    """Return *text* made usable as a single file or directory name.

    Path separators and other characters Windows rejects become ``_``,
    surrounding whitespace and trailing dots are dropped, and an empty
    result is replaced by ``'_'``.
    """
    cleaned = text.translate(_UNSAFE_NAME_CHARS).strip().rstrip('. ')
    return cleaned or '_'


def fetch(url, headers, params=None):
    """GET *url* and return the response.

    Raises:
        requests.RequestException: on connection problems, timeouts, or a
            4xx/5xx status (so error pages are never saved as content).
    """
    response = requests.get(
        url, headers=headers, params=params, timeout=REQUEST_TIMEOUT
    )
    response.raise_for_status()
    return response


def download_gallery(detail_url, search_path, headers):
    """Download every image of one gallery into a folder under *search_path*.

    The folder is named after the gallery page's ``<title>``. Each gallery
    page is fetched in turn and every image found on it is written to disk
    under the last path segment of its ``src``.

    Raises:
        requests.RequestException: if a gallery page cannot be fetched.
        ValueError: if the page has no title or the URL has an unexpected
            shape.
    """
    detail_tree = etree.HTML(fetch(detail_url, headers).text)
    titles = detail_tree.xpath('//title/text()')
    if not titles:
        raise ValueError(f'no <title> found at {detail_url!r}')
    title = titles[0]

    pager_links = detail_tree.xpath('//div[@id="pages"]/a')
    pagenum = last_page_number(
        [''.join(link.xpath('./text()')) for link in pager_links]
    )

    title_path = os.path.join(search_path, safe_name(title))
    os.makedirs(title_path, exist_ok=True)

    for j in range(1, pagenum + 1):
        tree = etree.HTML(fetch(gallery_page_url(detail_url, j), headers).text)
        # Selecting @src directly skips <img> tags that carry no src.
        for src in tree.xpath('//div[@class="content"]/center/img/@src'):
            jpgname = src.split('/')[-1]
            try:
                jpgdata = fetch(src, headers).content
            except requests.RequestException as exc:
                # One broken image should not abort the rest of the gallery.
                print(jpgname, '下载失败,已跳过:', exc)
                continue
            with open(os.path.join(title_path, jpgname), 'wb') as fp:
                fp.write(jpgdata)
            print(jpgname, '保存完成!')
        print(title, '的第%s页图保存完毕' % j)


def main():
    """Prompt for a search term and download every gallery it matches.

    The prompt repeats until a search returns at least one hit; that
    search's results are then crawled page by page and the function returns.
    """
    headers = {
        'Referer': 'http://www.xiannvku.com/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36',
    }
    base_url = 'http://www.xiannvku.com/index.php/pic/search'
    while True:
        print('输入要搜索的内容:')
        key = input()
        search_page = requests.post(
            url=base_url,
            headers=headers,
            data={'key': key},
            timeout=REQUEST_TIMEOUT,
        ).text
        search_tree = etree.HTML(search_page)
        labels = search_tree.xpath('//div[@class="text-c"]/a[1]/text()')
        # A page without the counter link is treated like an empty result.
        search_num = labels[0] if labels else '0条'
        print('搜索到:', search_num, '内容')
        total = parse_result_count(search_num)
        if total == 0:
            continue

        search_path = os.path.join('./XXX', safe_name(key))
        # makedirs also creates the './XXX' parent on the first run.
        os.makedirs(search_path, exist_ok=True)
        for i in range(1, page_count(total) + 1):
            try:
                # params= lets requests URL-encode the search term.
                every_page = fetch(
                    'http://www.xiannvku.com/pic/search',
                    headers,
                    params={'key': key, 'page': i},
                ).text
            except requests.RequestException as exc:
                print('第%s页' % i, '请求失败,已跳过:', exc)
                continue
            every_tree = etree.HTML(every_page)
            for li in every_tree.xpath('//ul[@class="img"]/li'):
                hrefs = li.xpath('./a[1]/@href')
                if not hrefs:
                    continue
                try:
                    download_gallery(hrefs[0], search_path, headers)
                except (requests.RequestException, ValueError) as exc:
                    # Skip a gallery that fails instead of ending the crawl.
                    print(hrefs[0], '下载失败,已跳过:', exc)
            print('第%s页保存完毕' % i)
        break


if __name__ == '__main__':
    main()
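# NOTE: the two lines below are not code. They appear to be leftover text from
# the web page this script was copied from, and are kept here as comments.
# python 爬虫 论一个爬虫的自我修养  (tags: Python, web crawler; "On the Self-Cultivation of a Crawler")
# 最新推荐文章于 2020-12-09 10:18:56 发布  (latest recommended article published 2020-12-09 10:18:56)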