A Simple Crawler for the 内涵段子 (Neihan Duanzi) Site

# A small crawler for 内涵段子

The spider below fetches the homepage, pulls the first page of jokes out of the HTML with a regex, then follows the site's JSON pagination API (keyed by max_time) until has_more comes back false.

# coding=utf-8
import requests
from retrying import retry
import re
import json


class NeihanSpider(object):
    def __init__(self):
        self.start_url = "https://neihanshequ.com/"
        self.headers = {
            "User-Agent": "Mozilla/5.0(Macintosh; Intel Mac OS X 10_12_4) AppleWebKit / 537.36(KHTML, likeGecko) Chrome / 60.0.3112.113Safari / 537.36"
        }
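        # Pagination endpoint template; {} is filled with the max_time value
        # returned by the previous response.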
        self.next_url_temp = "https://neihanshequ.com/joke/?is_json=1&app_name=neihanshequ_web&max_time={}"

    @retry(stop_max_attempt_number=3)
    def _parse_url(self, url):
        response = requests.get(url, headers=self.headers, timeout=3)
        assert response.status_code == 200
        return response.content.decode()
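
    # Why two methods: @retry re-runs _parse_url up to 3 times when it raises
    # (the assert on a non-200 status, or a requests timeout). If all three
    # attempts fail, the last exception propagates and parse_url catches it,
    # logging the error and returning None instead of crashing the spider.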

    def parse_url(self, url):
        print("now parseing", url)
        try:
            return self._parse_url(url)
        except Exception as e:
            print(e)
            return None
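
    # The homepage ships the first page of jokes inline in the HTML, so a
    # regex is enough; the page also embeds a max_time timestamp in an inline
    # script, which run() uses to build the first JSON pagination URL.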

    def get_content_list(self, html_str):
        content_list = re.findall(r"<h1 class=\"title\">.*?<p>(.*?)</p>", html_str, re.S)
        max_time = re.findall(r"max_time: '(.*?)',", html_str)

        return content_list, max_time[0]
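
    # Note: opening the file in append mode ("a") keeps jokes across runs,
    # but re-running the spider will write duplicates into duanzi.txt.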

    def save_content_list(self, content_list):
        file_path = "./duanzi.txt"
        with open(file_path, "a", encoding='utf-8')as f:
            for content in content_list:
                f.write(content)
                f.write("\n")

    def get_next_page_content(self, url):
        json_response = self.parse_url(url)
        dict_response = json.loads(json_response)
        content_list = dict_response["data"]["data"]
        content_list = [i["group"]["text"] for i in content_list]
        max_time = dict_response["data"]["max_time"]
        has_more = dict_response["data"]["has_more"]
        return content_list, max_time, has_more

    def run(self):
        # 1. start_url
        # 2. send the request, get the response
        html_str = self.parse_url(self.start_url)
        # 3. extract the data
        if html_str is not None:
            content_list, max_time = self.get_content_list(html_str)

            # 4. save the content
            self.save_content_list(content_list)
            # 5. fetch the next page's content
            has_more = True
            while has_more:
                next_url = self.next_url_temp.format(max_time)
                content_list, max_time, has_more = self.get_next_page_content(next_url)
                self.save_content_list(content_list)


if __name__ == '__main__':
    neihan = NeihanSpider()
    neihan.run()
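
One refinement worth knowing: the retrying library used above can also pause between attempts instead of retrying immediately, which is gentler on the server. A minimal standalone sketch (the wait_fixed value and the fetch function here are illustrative assumptions, not part of the spider above):

# coding=utf-8
import requests
from retrying import retry


# Retry up to 3 times, sleeping 2 seconds between attempts
# (wait_fixed is given in milliseconds).
@retry(stop_max_attempt_number=3, wait_fixed=2000)
def fetch(url):
    response = requests.get(url, timeout=3)
    assert response.status_code == 200
    return response.content.decode()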