Scraping jokes from Neihan8 with requests

  • Use the retrying module to handle timeouts with retries (a small stand-alone sketch follows this list)
  • Use a random User-Agent and random proxy servers as simple anti-anti-crawling measures
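
A minimal stand-alone sketch of how retrying and a request timeout cooperate (the wait_fixed value and the fetch_once helper are illustrative assumptions, not part of the module below):

import requests
from retrying import retry

# illustrative sketch: give up after 3 attempts, waiting 1 second between them
@retry(stop_max_attempt_number=3, wait_fixed=1000)
def fetch_once(url):
    # a short timeout turns a stalled connection into an exception, which triggers the next retry
    return requests.get(url, timeout=2).text

myreq.py below applies the same idea, and adds a random User-Agent and random proxies on top of it.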

"""
myreq.py  
模块功能:
parse_url 方法给 url 就返回 html

简单反反爬
1.随机 User-Agent
2.随机代理服务器

容错处理
1.重试
2.超时
"""

import requests
import random
from retrying import retry

# Random User-Agent pool
USER_AGENT_LIST = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.1 Safari/605.1.15"
]

# Proxy servers
# In a real project these two proxy lists would likely come from a third-party provider
HTTP_PROXIES = [
    'http://115.223.197.119:9000',
]
HTTPS_PROXIES = [
    'https://218.60.8.99:3129'
]

@retry(stop_max_attempt_number=5)
def __parse_url(url,method='get',data={},proxies={}):
    print("***请求中***")
    headers = {
        'User-Agent': random.choice(USER_AGENT_LIST)
    }

    if method=='get':
        response = requests.get(
            url,
            headers=headers,
            proxies=proxies,
            timeout=2,
            params=data
        )
    else:
        response = requests.post(
            url,
            headers=headers,
            proxies=proxies,
            timeout=2,
            data=data  # for POST, the payload belongs in the request body, not the query string
        )
    response.encoding = 'utf-8'
    return response.text


def parse_url(url, method='get', data={}):
    """
    Request the given URL and return the HTML text.
    A random proxy is chosen for the scheme of the URL.
    :param url: target URL
    :param method: 'get' or 'post'
    :param data: query parameters (GET) or form data (POST)
    :return: HTML string, or None if the request failed
    """
    html = None
    proxies = {
        "http": random.choice(HTTP_PROXIES) if len(HTTP_PROXIES) > 0 else None,
        "https": random.choice(HTTPS_PROXIES) if len(HTTPS_PROXIES) > 0 else None
    }

    try:
        html = __parse_url(url, method=method, proxies=proxies, data=data)
    except Exception:
        # TODO: log the failure here instead of swallowing it silently
        html = None

    # Drop the proxy that just failed so it will not be picked again
    if html is None:
        scheme = requests.utils.urlparse(url).scheme
        if scheme == 'http' and proxies['http'] in HTTP_PROXIES:
            print("Invalid proxy:", proxies['http'])
            HTTP_PROXIES.remove(proxies['http'])
        elif scheme == 'https' and proxies['https'] in HTTPS_PROXIES:
            print("Invalid proxy:", proxies['https'])
            HTTPS_PROXIES.remove(proxies['https'])
    # print(HTTP_PROXIES)
    # print(HTTPS_PROXIES)
    return html

if __name__ == '__main__':
    print(parse_url("https://www.baidu.com"))
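
Since __parse_url falls back to POST whenever method is not 'get', a form submission can be exercised the same way. This is only an illustrative call (httpbin.org is used here as a convenient echo service); depending on whether the hard-coded proxies are still alive, it prints either the response body or None:

print(parse_url("https://httpbin.org/post", method='post', data={"q": "neihan"}))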

  1. Crawl the list of detail-page URLs from each list page (a small extraction sketch follows this list)
  2. Crawl each detail page from that URL list
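
A tiny stand-alone sketch of the link extraction in step 1; the HTML snippet is a made-up example of what one list-page entry might look like, and the real markup may differ:

import re

sample = '<li><a href="/detail/123456.html" class="title" title="example joke">example joke</a></li>'
link_pattern = re.compile(r'<a href="(.*?)" class="title" title')   # non-greedy: one link at a time
print(link_pattern.findall(sample))   # ['/detail/123456.html']

The spider below applies the same anchor pattern to the full list page returned by parse_url.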
"""
https://www.neihan8.com/e/action/ListInfo/?classid=11&page=1093
"""
import re
from html import unescape

from myreq import parse_url


DEBUG = False  # True: limit the crawl to a single pass; False: crawl everything
class Neihan8Spider(object):
    def __init__(self):
        self.base_detail_url = 'https://www.neihan8.com'

    def save_content(self, content):
        """Save the extracted content (for now, just print it)"""
        print("*" * 50)
        print(content)
    def run(self):
        # 1. Build the list of list-page URLs to crawl
        url_list = []

        # regex that pulls the detail-page links out of a list page
        detail_link_pattern = re.compile(r'<a href="(.*)" class="title" title')
        # regex that roughly isolates the content block of a detail page
        detail_content_pattern = re.compile(r'<div class="detail">(.*)<div class="ad610">', re.S)
        # regex that extracts the individual <p> paragraphs from that block
        detail_part_pattern = re.compile(r'<p>(.*?)</p>')
        for i in range(0, 1094):
            url = 'https://www.neihan8.com/e/action/ListInfo/?classid=11&page={}'.format(i)
            url_list.append(url)
        for url in url_list:
            html = parse_url(url)
            if html is None:
                continue
            detail_url_list = detail_link_pattern.findall(html)
            # print(detail_url_list)

            # 2. Crawl each detail page from the extracted URL list
            for detail_url in detail_url_list:
                detail_html = parse_url(self.base_detail_url + detail_url)
                if detail_html is None:
                    continue
                # rough extraction: grab the whole content block
                detail_content = detail_content_pattern.findall(detail_html)
                # print(detail_content)
                if len(detail_content) > 0:
                    detail_content = detail_content[0]
                else:
                    continue

                # fine extraction: the text inside each <p> tag
                content = ''
                detail_part_list = detail_part_pattern.findall(detail_content)
                # print(detail_part_list)
                # 3. Clean the data: decode HTML entities and strip whitespace
                for part in detail_part_list:
                    part = unescape(part)
                    part = part.strip()
                    content += part + '\n'

                # Save the data
                self.save_content(content)
                if DEBUG: break
            # with DEBUG on, stop after the first list page as well
            if DEBUG: break

if __name__ == '__main__':
    spider = Neihan8Spider()
    spider.run()
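
save_content currently only prints each joke. If persistence were needed, a hypothetical helper like the one below (the file name is an assumption, not part of the original spider) could append every joke to a UTF-8 text file; wiring it in would only require calling it from Neihan8Spider.save_content:

def save_content_to_file(content, path='neihan8.txt'):
    # append the extracted joke to a text file, separated by a divider line
    with open(path, 'a', encoding='utf-8') as f:
        f.write(content)
        f.write('*' * 50 + '\n')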