Python 爬虫:通过搜索引擎搜索好看的图片进行爬取(解决页面广告链接问题)

这里仅展示单线程的代码,多线程可以自行探索,不再过多赘述。

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2020/12/31 12:02
# @Author  : huni
# @File    : xxx单函数.py
# @Software: PyCharm
import requests
from lxml import etree
from urllib import parse
import os
# Root of the site being crawled; every relative href is resolved against it.
SITE_ROOT = 'https://xchina.co/'
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36'
}
# Seconds to wait for a response; without a timeout a stalled connection
# would block the single-threaded crawl forever.
REQUEST_TIMEOUT = 15
# Characters that are not allowed in a file or directory name on Windows
# (a superset of what POSIX forbids), plus whitespace control characters.
_UNSAFE_NAME_CHARS = str.maketrans({ch: '_' for ch in '\\/:*?"<>|\r\n\t'})


def absolute_url(href):
    """Resolve *href* against the site root.

    Root-relative, page-relative and already-absolute hrefs all produce a
    well-formed URL, unlike plain string concatenation with the site root.
    """
    return parse.urljoin(SITE_ROOT, href)


def safe_dir_name(title):
    """Turn a page title into a name that is safe to use as a directory.

    Path separators and other reserved characters become ``_``; surrounding
    spaces and dots are removed so the result can neither be ``..`` nor end
    in a character Windows rejects. Falls back to ``'untitled'`` when nothing
    usable is left.
    """
    cleaned = title.translate(_UNSAFE_NAME_CHARS).strip(' .')
    return cleaned or 'untitled'


def image_file_name(src):
    """Return the last path segment of *src*, ignoring query and fragment."""
    return parse.urlsplit(src).path.rsplit('/', 1)[-1]


def fetch(url):
    """GET *url* and return the response, or ``None`` if the request failed.

    Network errors and non-2xx statuses are reported and skipped so that one
    bad link does not abort the whole crawl (and an error page is never
    saved as if it were an image).
    """
    try:
        response = requests.get(url=url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as err:
        print(f'request failed, skipping {url}: {err}')
        return None
    return response


def fetch_tree(url):
    """Fetch *url* and parse it as HTML; return ``None`` when unavailable."""
    response = fetch(url)
    if response is None or not response.text.strip():
        # Nothing to parse: treat an empty body like a failed request.
        return None
    return etree.HTML(response.text)


def download_album(album_url, root_dir):
    """Download every image of the album at *album_url* into *root_dir*.

    A sub-directory named after the album page's ``<title>`` is created under
    *root_dir*; each image is stored there under the file name taken from
    its URL.
    """
    album_tree = fetch_tree(album_url)
    if album_tree is None:
        return

    titles = album_tree.xpath('/html/head/title/text()')
    title_path = os.path.join(root_dir, safe_dir_name(titles[0] if titles else ''))
    os.makedirs(title_path, exist_ok=True)

    # The page count is read from the fourth-from-last text node of the pager
    # block. If the pager is missing or not numeric, only page 1 is tried.
    pager_texts = album_tree.xpath('/html/body/div[5]/div[1]/div/div[2]/div[3]/div[1]//text()')
    try:
        page_num = int(pager_texts[-4])
    except (IndexError, ValueError):
        page_num = 1

    for j in range(1, page_num + 1):
        # Page j of ".../name.html" lives at ".../name/j.html".
        page_tree = fetch_tree(album_url.replace('.html', f'/{j}.html'))
        if page_tree is None:
            continue
        for a in page_tree.xpath('/html/body/div[5]/div[1]/div/div[2]/div[3]/div[2]/a'):
            links = a.xpath('./@href')
            if not links:
                continue
            src = absolute_url(links[0])
            jpg_name = image_file_name(src)
            if not jpg_name:
                continue
            image = fetch(src)
            if image is None:
                continue
            jpg_path = os.path.join(title_path, jpg_name)
            with open(jpg_path, 'wb') as fp:
                fp.write(image.content)
            print(jpg_name, '下载完成')


def main():
    """Search the site for a keyword and download every album in the results."""
    m_path = './xxx'
    os.makedirs(m_path, exist_ok=True)

    kw = '陆萱萱'
    keyword = parse.quote(kw, encoding='utf-8')
    search_tree = fetch_tree(f'https://xchina.co/search/keyword-{keyword}.html')
    if search_tree is None:
        return

    href_parts = search_tree.xpath('/html/body/div[5]/div[1]/div/div[2]/div[3]/div[1]/div[1]//@href')
    # dict.fromkeys removes duplicate links while keeping page order, so the
    # crawl is deterministic (a set would visit them in arbitrary order).
    for href_part in dict.fromkeys(href_parts):
        listing_tree = fetch_tree(absolute_url(href_part))
        if listing_tree is None:
            continue
        for div in listing_tree.xpath('/html/body/div[5]/div[1]/div/div[2]/div[3]/div[2]/div'):
            links = div.xpath('./a[1]/@href')
            if not links:
                # Entry without a link in its first <a>; nothing to follow.
                continue
            download_album(absolute_url(links[0]), m_path)


if __name__ == '__main__':
    main()

------写在后面:
大家如果觉得小编的代码有用,可以多多关注小编,
同时小编的公众号也开通了,大家可以关注下,后续进行粉丝回馈,大家一起学习python叭
在这里插入图片描述
打赏小编点这里哦
在这里插入图片描述

评论 7
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值