爬虫—图片下载

最新推荐文章于 2024-04-30 10:57:33 发布

liuwangleoooO

最新推荐文章于 2024-04-30 10:57:33 发布

阅读量3.1k

点赞数

分类专栏： # python爬虫 | 文章标签： xpath python 爬虫

本文链接：https://blog.csdn.net/qq_36581961/article/details/110733949

版权

python爬虫专栏收录该内容

4 篇文章 0 订阅

订阅专栏

图片下载

代码附上

import random

import requests
from lxml import etree


def getHtml(url):
    """Fetch *url* with a randomly picked User-Agent and a 5 second timeout.

    Returns:
        The raw response body (bytes) when the status code is 200.
        The integer status code for any other status.
        The caught exception object if anything raised during the request;
        errors are returned, not raised, so callers must check the type of
        the result before using it.
    """
    user_agents = ['Mozilla/5.0', 'Chrome/78.0.3904.97', 'Safari/537.36']
    try:
        picked = user_agents[random.randint(0, 2)]
        response = requests.get(
            url=url,
            headers={'user-agent': picked},
            timeout=5,
        )
        if response.status_code != 200:
            return response.status_code
        return response.content
    except Exception as e:
        return e


def htmlToTree(html):
    """Parse an HTML document with ``etree.HTML`` and return the result."""
    return etree.HTML(html)


def downloadImg(url, desc):
    """Download the image at *url* and save it as ``source/img/<desc>.gif``.

    Args:
        url: Address of the image to fetch (passed to ``getHtml``).
        desc: Text used as the file name (without extension). Path
            separators in it are replaced with ``_`` so the file always
            lands directly inside ``source/img``.

    Returns:
        None. When the download fails, a message is printed and no file is
        created.
    """
    import os  # function-local so this block does not depend on file-level imports

    payload = getHtml(url)
    # getHtml reports failure by returning a status code or an exception
    # object instead of bytes. Writing either would raise TypeError only
    # after open() had already created/truncated the target file.
    if not isinstance(payload, bytes):
        print("download failed for {}: {!r}".format(url, payload))
        return

    target_dir = "source/img"
    # open() does not create missing parent directories.
    os.makedirs(target_dir, exist_ok=True)

    safe_name = str(desc).replace("/", "_").replace("\\", "_")
    path = "{}/{}.gif".format(target_dir, safe_name)
    with open(path, 'wb') as f:
        f.write(payload)


def parseHtml(html):
    """Find the image links in a listing page and download each image.

    For every matching ``<a>`` element the image address is read from the
    child ``<img>``'s ``data-src`` attribute and the description from the
    child ``<p>``'s text; the pair is passed to ``downloadImg`` and then
    printed.

    Args:
        html: Page content as returned by ``getHtml``. Because ``getHtml``
            returns a status code or an exception object on failure, any
            value that is not non-empty bytes/str is reported and ignored.

    Returns:
        None.
    """
    if not isinstance(html, (bytes, str)) or not html:
        print("nothing to parse: {!r}".format(html))
        return

    tree = htmlToTree(html=html)
    if tree is None:
        # NOTE(review): etree.HTML appears to return None for some
        # degenerate documents; guard so .xpath below cannot fail on None.
        print("nothing to parse: document produced no tree")
        return

    anchors = tree.xpath('//div[@class="main"]//div[@class="Left_bar"]//div[@class="tab_tj"]/'
                         'div[@class="tab_box"]/div/ul[@class="clearfix"]/li/a')
    for anchor in anchors:
        # Query relative to the element itself; no need to serialise the
        # element and parse it again as a separate document.
        sources = anchor.xpath("./img/@data-src")
        titles = anchor.xpath("./p/text()")
        if not sources or not titles:
            # Skip entries lacking an image address or a description rather
            # than aborting the whole run with IndexError.
            continue
        img_url, img_desc = sources[0], titles[0]
        downloadImg(img_url, img_desc)
        print(img_url, img_desc)


if __name__ == '__main__':
    # Script entry point: fetch the listing page, then parse it and
    # download every image it links to.
    start_url = 'http://www.win4000.com/meinvtag12327.html'
    parseHtml(html=getHtml(url=start_url))