Funny Jokes + Image Scraper

A small requests + re spider that crawls the first pages of budejie.com, writes each joke (author, publish time, text, image URL) as one line of JSON, and downloads any attached image.

import requests
import re
import json
import os

class NeiHanSpider:

    def __init__(self):
        self.start_url = "http://www.budejie.com/"
        # A desktop-browser User-Agent so the site serves the regular HTML page.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
        }


    def get_html(self, url):
        # Fetch a page and return the decoded HTML, or None on a non-200 response.
        response = requests.get(url, headers=self.headers, timeout=10)
        if response.status_code == 200:
            return response.content.decode()
        return None


    def parse_content(self, html_str):
        # The jokes sit in two "j-r-list" blocks; join all matches instead of
        # assuming exactly two, then split the blocks into <li> items.
        div = re.findall(r'"j-r-list"(.*?)j-r-wrst gud-put  index-wrst', html_str, re.S)
        div_html = "".join(div)
        li = re.findall(r'<li>(.*?)<!--操作工具条-->', div_html, re.S)
        data = []
        for l in li:
            author_name = re.findall(r'class="u-user-name".*>(.*)</a>', l)[0]
            pub_time = re.findall(r'class="u-time  f-ib f-fr">(.*)</span>', l)[0]
            content = re.findall(r'class="j-r-list-c-desc">.*?<a.*?>(.*?)</a>.*</div>', l, re.S)[0]
            # Not every joke has a picture; data-original holds the lazy-loaded image URL.
            img_src = re.findall(r'class="j-r-list-c-img">.*data-original="(.*)" title', l, re.S)
            img_src = img_src[0] if img_src else None
            item = {
                "author_name": author_name,
                "pub_time": pub_time,
                "content": content,
                "img_src": img_src
            }
            data.append(item)

        return data


    def save_img(self, url, content):
        # Strip characters that are illegal in Windows filenames, plus newlines,
        # and reuse the image URL's file extension.
        content = re.sub(r'[\\/:*?"<>|\n]', "", content)
        end_name = os.path.splitext(url)[1]
        filename = content + end_name
        with open(os.path.join("./data/imgs", filename), "wb") as fp:
            fp.write(requests.get(url, headers=self.headers, timeout=10).content)
        print(filename + " saved")



    def run(self):
        # Make sure the output directories exist before writing anything.
        os.makedirs("./data/imgs", exist_ok=True)
        # budejie paginates as /1, /2, ...; crawl the first 10 pages.
        for index, url in enumerate(self.start_url + str(i + 1) for i in range(10)):
            print("Crawling page {}: {}".format(index + 1, url))
            html_str = self.get_html(url)
            if not html_str:
                continue
            data = self.parse_content(html_str)
            # Append one JSON object per line (JSON Lines), then fetch any image.
            with open("./data/duanzi.json", "a", encoding="utf-8") as f:
                for d in data:
                    json.dump(d, f, ensure_ascii=False)
                    f.write("\n")
                    if d["img_src"]:
                        self.save_img(d["img_src"], d["content"])



if __name__ == '__main__':
    nhs = NeiHanSpider()
    nhs.run()
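
The regex parsing above is tightly coupled to the exact page markup. For comparison, here is a minimal regex-free sketch of parse_content using BeautifulSoup; the CSS selectors are guesses derived from the class names matched by the regexes (j-r-list, u-user-name, u-time, j-r-list-c-desc, j-r-list-c-img), and the bs4 package is assumed to be installed, so treat it as illustrative rather than tested against the live site:

from bs4 import BeautifulSoup

def parse_content_bs(html_str):
    # Hypothetical alternative to NeiHanSpider.parse_content; the selectors
    # are assumptions inferred from the class names used by the regex version.
    soup = BeautifulSoup(html_str, "html.parser")
    data = []
    for li in soup.select("div.j-r-list li"):
        author = li.select_one(".u-user-name")
        pub_time = li.select_one(".u-time")
        desc = li.select_one(".j-r-list-c-desc a")
        img = li.select_one(".j-r-list-c-img img")
        if not (author and pub_time and desc):
            continue  # skip list items that are not joke entries
        data.append({
            "author_name": author.get_text(strip=True),
            "pub_time": pub_time.get_text(strip=True),
            "content": desc.get_text(strip=True),
            # data-original carries the lazy-loaded image URL, as in the regex version
            "img_src": img.get("data-original") if img else None,
        })
    return data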

A sample run prints one "Crawling page ..." line per page and produces data/duanzi.json plus the downloaded pictures under data/imgs/ (the original screenshots are omitted here).
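
Because run appends one JSON object per line, data/duanzi.json is in JSON Lines format and can be read back record by record; a small sketch:

import json

with open("./data/duanzi.json", encoding="utf-8") as f:
    for line in f:
        item = json.loads(line)
        print(item["author_name"], item["pub_time"], item["img_src"])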
