爬取百度贴吧（搞笑吧）的图片

最新推荐文章于 2020-04-22 23:23:35 发布

qq_42553082

最新推荐文章于 2020-04-22 23:23:35 发布

阅读量168

点赞数

本文链接：https://blog.csdn.net/qq_42553082/article/details/82220844

版权

import requests
from lxml import etree
import os

# Crawl the thread list pages of a Baidu Tieba forum, open every thread found
# there, and download each in-post image into a local directory as
# 1.jpg, 2.jpg, ...

url = "https://tieba.baidu.com/f?ie=utf-8"

# Seconds to wait on any single HTTP request, so that one stalled connection
# cannot hang the whole crawl.
REQUEST_TIMEOUT = 10

# Directory the downloaded images are written to.
path = './imagesll3/'


def _fetch(target, **kwargs):
    """GET *target*; return the response, or None if the request failed.

    Network errors and non-2xx statuses are reported on stdout instead of
    aborting the crawl.
    """
    try:
        response = requests.get(target, timeout=REQUEST_TIMEOUT, **kwargs)
        response.raise_for_status()
    except requests.RequestException as exc:
        print("请求失败：", target, exc)
        return None
    return response


def _parse_html(content):
    """Parse raw HTML bytes; return the root element or None if unusable."""
    if not content or not content.strip():
        return None
    # NOTE(review): etree.HTML may itself return None when the document has
    # no parseable markup, so callers must check the result before .xpath().
    return etree.HTML(content)


def _save_image(img_url, filename):
    """Stream the image at *img_url* into *filename*; return True on success."""
    try:
        # stream=True makes iter_content read from the socket in chunks
        # instead of slicing a body that is already fully in memory; the
        # context manager releases the connection afterwards.
        with requests.get(img_url, stream=True,
                          timeout=REQUEST_TIMEOUT) as response:
            if response.status_code != 200:
                print("下载失败：", img_url, response.status_code)
                return False
            with open(filename, 'wb') as f:
                for block in response.iter_content(1024):
                    if block:
                        f.write(block)
    except (requests.RequestException, OSError) as exc:
        print("下载失败：", img_url, exc)
        return False
    return True


# Ask the user which forum to crawl.
kw = input("请输入您要爬取贴吧的名称：")
params = {"kw": kw}

# First and last list page to crawl (1-based, inclusive).
start = int(input("请输入您要爬取起始页（从1开始）:"))
end = int(input("请输入爬取的截止页："))

# A start page below 1 would produce a negative "pn" offset.
if start < 1:
    raise SystemExit("起始页必须从1开始")

# Running counter used as the file name of the next image.
name = 1

for n in range(start, end + 1):
    # "pn" is the offset query parameter; list pages advance in steps of 50.
    pn = (n - 1) * 50
    page_response = _fetch(url, params={**params, "pn": pn})
    if page_response is None:
        continue

    print("获取第", n, "页的帖子链接。。。")
    print(page_response.url)

    list_html = _parse_html(page_response.content)
    if list_html is None:
        continue
    tieba_urls = list_html.xpath('//div[@class="threadlist_lz clearfix"]/div/a/@href')

    print("tieba_urls===", )
    print(tieba_urls)

    for tieba_url in tieba_urls:
        # The hrefs are concatenated onto the site root to form a full URL.
        tieba_url = 'https://tieba.baidu.com' + tieba_url

        print('具體貼吧tieba_url',tieba_url)

        thread_response = _fetch(tieba_url)
        if thread_response is None:
            continue

        thread_html = _parse_html(thread_response.content)
        if thread_html is None:
            continue
        img_urls = thread_html.xpath('//div[@class="d_post_content j_d_post_content "]/img[@class="BDE_Image"]/@src')
        if not img_urls:
            continue

        # Create the output directory once per thread instead of once per
        # image; exist_ok avoids the check-then-create race.
        os.makedirs(path, exist_ok=True)

        for img_url in img_urls:
            print(img_url)
            print("正在下载图片：", img_url)

            _save_image(img_url, path + str(name) + '.jpg')

            # The counter advances even when a download fails, as in the
            # original numbering scheme.
            name += 1

qq_42553082

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
爬取百度贴吧（搞笑吧）的图片

import requests
from lxml import etree
import os
url = "https://tieba.baidu.com/f?ie=utf-8"
# 用户输入要爬取的贴吧名称
kw = input("请输入您要爬取贴吧的名称：")
params = {"kw": kw}
# 起始页
start = int(input("请输入您要爬取起始页（从1开始）:...
复制链接

扫一扫