Python crawler: On the Self-Cultivation of a Crawler

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2020/12/1 14:34
# @Author  : huni
# @File    : 全站爬取.py
# @Software: PyCharm
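# Searches xiannvku.com for a keyword, then walks every results page and
# downloads each gallery's images into ./XXX/<keyword>/<gallery title>/.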

import requests
from lxml import etree
import os

if __name__ == '__main__':
    headers = {
        'Referer': 'http://www.xiannvku.com/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36'
    }
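    # Every request below reuses these headers; the Referer is presumably
    # included because image hosts often check it to block hotlinking.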

    while True:
        key = input('Enter a search term: ')

        base_url = 'http://www.xiannvku.com/index.php/pic/search'

        # The search form posts the keyword under the "key" field
        form_data = {
            'key': key
        }
        search_page = requests.post(url=base_url, headers=headers, data=form_data).text
        search_tree = etree.HTML(search_page)
        # The counter text looks like "123条" ("条" is the Chinese measure word for items)
        search_num = search_tree.xpath('//div[@class="text-c"]/a[1]/text()')[0]
        print('Search returned:', search_num)

        if search_num == '0条':    # "0条" means zero results; ask again
            continue
        else:
            search_path = './XXX' + f'/{key}'
            if not os.path.exists(search_path):
                os.makedirs(search_path)    # makedirs also creates the './XXX' parent
            # 28 thumbnails per results page, so take the ceiling of total / 28
            total = int(search_num.replace('条', ''))
            page_count = (total + 27) // 28
            for i in range(1, page_count + 1):
                every_url = f'http://www.xiannvku.com/pic/search?key={key}&page={i}'
                every_page = requests.get(url=every_url, headers=headers).text
                every_tree = etree.HTML(every_page)

                # Each <li> under ul.img is one gallery thumbnail
                li_list = every_tree.xpath('//ul[@class="img"]/li')
                for li in li_list:
                    detail_url = li.xpath('./a[1]/@href')[0]
                    detail_page = requests.get(detail_url, headers=headers).text
                    detail_tree = etree.HTML(detail_page)
                    # The second-to-last pager link holds the gallery's last page number
                    page_num = int(detail_tree.xpath('//div[@id="pages"]/a')[-2].xpath('./text()')[0])

                    title = detail_tree.xpath('//title/text()')[0]
                    title_path = search_path + f'/{title}'
                    if not os.path.exists(title_path):
                        os.mkdir(title_path)

                    for j in range(1, page_num + 1):
                        # Gallery pages are numbered "...-1.html", "...-2.html", ...
                        rep = str(j) + '.html'
                        href = detail_url.replace(detail_url.split('-')[-1], rep)

                        page = requests.get(url=href, headers=headers).text
                        tree = etree.HTML(page)

                        img_list = tree.xpath('//div[@class="content"]/center/img')
                        for img in img_list:
                            src = img.xpath('./@src')[0]
                            jpg_name = src.split('/')[-1]
                            jpg_path = title_path + '/' + jpg_name
                            jpg_data = requests.get(url=src, headers=headers).content

                            with open(jpg_path, 'wb') as fp:
                                fp.write(jpg_data)
                            print(jpg_name, 'saved')
                        print(f'Page {j} of "{title}" done')
                print(f'Results page {i} done')

            break
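
One caveat before running this as-is: the script uses the gallery's <title> text verbatim as a directory name, and those titles can contain characters such as ? or : that Windows rejects. A minimal sanitizing sketch (safe_dirname is a hypothetical helper, not part of the original script):

import re

def safe_dirname(name):
    # Hypothetical helper: replace the characters Windows forbids in
    # file and directory names, and fall back if nothing survives.
    return re.sub(r'[\\/:*?"<>|]', '_', name).strip() or 'untitled'

With that in place, title_path = search_path + '/' + safe_dirname(title) would keep os.mkdir from failing on an unlucky title.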



