python3 爬虫内涵段子

最新推荐文章于 2024-08-16 17:41:41 发布

a2798003474

最新推荐文章于 2024-08-16 17:41:41 发布

阅读量138

点赞数

文章标签：爬虫 python

原文链接：http://www.cnblogs.com/Bighua123/p/8418968.html

版权

import re
from urllib import request
class Sprder:
    """Interactive scraper for the joke listing pages on neihan8.com.

    Each listing page is downloaded, the joke bodies are pulled out with a
    regular expression, a handful of HTML fragments are stripped, and the
    remaining text is appended to a local file.

    Attributes:
        page: Number of the listing page that ``loadPage`` fetches next
            (starts at 1).
        switch: Loop flag read by ``startWork``; False stops the loop.
    """

    # Compiled once for the class instead of once per downloaded page.
    # re.S lets "." span newlines because a joke body covers several lines.
    _CONTENT_PATTERN = re.compile(r'<div\sclass="f18 mb20">(.*?)</div>', re.S)

    # Fragments removed from every joke, applied in exactly this order.
    _STRIP_TOKENS = ("<p>", "</p>", "<br>", "<br />", "&ldquo;")

    # File that receives the scraped jokes (relative to the working directory).
    _OUTPUT_FILE = "段子.txt"

    def __init__(self):
        self.page = 1
        self.switch = True

    def loadPage(self):
        """Download the current listing page and store the jokes found on it.

        The page number is taken from ``self.page``. The response body is
        decoded as GBK, every ``<div class="f18 mb20">`` body is extracted and
        the resulting list is handed to ``dealPage``.

        Raises:
            urllib.error.URLError: If the request fails.
            UnicodeDecodeError: If the response body is not valid GBK.
        """
        url = "http://www.neihan8.com/article/list_5_" + str(self.page) + ".html"
        user_agent = 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT6.1; Trident / 5.0'
        headers = {'User-Agent': user_agent}
        request1 = request.Request(url, headers=headers)
        # The context manager closes the connection even if read/decode fails.
        with request.urlopen(request1) as response:
            html = response.read().decode("gbk")
        content_list = self._CONTENT_PATTERN.findall(html)

        self.dealPage(content_list)

    def dealPage(self, content_list):
        """Strip the known HTML fragments from each joke and persist it.

        Args:
            content_list: Iterable of raw joke strings as extracted from the
                page. An empty iterable is a no-op.
        """
        for item in content_list:
            for token in self._STRIP_TOKENS:
                item = item.replace(token, "")
            self.writePage(item)

    def writePage(self, item):
        """Append one joke to the output file.

        Args:
            item: Text to append. It is written as-is, with no separator
                added before or after it.
        """
        # An explicit encoding keeps the write independent of the platform
        # locale; without it, Chinese text can fail to encode on systems
        # whose default codec cannot represent it.
        with open(self._OUTPUT_FILE, "a", encoding="utf-8") as f:
            f.write(item)

    def startWork(self):
        """Run the interactive crawl loop.

        Fetches one page per iteration, then prompts on stdin: an empty line
        (or anything other than ``quit``) continues with the next page, while
        ``quit`` clears ``self.switch`` and ends the loop.
        """
        while self.switch:
            self.loadPage()
            command = input("如果继续按回车（退出输入quit）")
            if command == "quit":
                self.switch = False

            # Advanced unconditionally, so the counter also moves after "quit".
            self.page += 1
if __name__ == '__main__':
    # Script entry point: build the spider and hand control to its
    # interactive crawl loop.
    spider = Sprder()
    spider.startWork()

转载于:https://www.cnblogs.com/Bighua123/p/8418968.html

a2798003474

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
python3 爬虫内涵段子

import re from urllib import request class Sprder: def __init__(self): self.page=1 self.switch=True def loadPage(self): """" 下载页面 """ url="http://www...
复制链接

扫一扫