neihan8段子爬取

# coding:utf-8
import urllib2
import re
import time


class Spider(object):
    """Scrape the "desc" snippets from neihan8.com article list pages.

    The user is prompted for a page range; every page in that range is
    downloaded, the inner HTML of each ``<div class="desc">`` element is
    extracted, and the snippets are appended to ``duanzi.txt`` in the
    current working directory.
    """

    # Browser-like User-Agent sent with every request.
    HEADERS = {
        "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"
    }

    # Captures the inner HTML of each <div class="desc"> element.  re.S lets
    # "." match newlines so multi-line snippets are captured whole.  Compiled
    # once here instead of once per downloaded page.
    DESC_PATTERN = re.compile(r'<div\sclass="desc">(.*?)</div>', re.S)

    def __init__(self):
        pass

    def loadPage(self):
        """Prompt for a page range, then download and store every page in it.

        Both bounds are read from standard input and are inclusive.  Page 1
        is served from ``index.html``; every other page from
        ``index_<n>.html``.

        Raises:
            ValueError: if either answer is not a valid integer.
        """
        startNum = int(raw_input("请输入起始页号:"))
        endNum = int(raw_input("请输入结束页号:"))

        for num in range(startNum, endNum + 1):
            if num == 1:
                url = "http://www.neihan8.com/article/index.html"
            else:
                url = "http://www.neihan8.com/article/index_%s.html" % num

            # Single-argument print(...) behaves the same as the print
            # statement on Python 2 and also parses on Python 3.
            print(url)

            request = urllib2.Request(url, headers=self.HEADERS)
            response = urllib2.urlopen(request)
            try:
                html = response.read()
            finally:
                # Release the connection even if reading the body fails.
                response.close()

            self.writePage(self.DESC_PATTERN.findall(html))

    def writePage(self, content_list):
        """Append every snippet in ``content_list`` to ``duanzi.txt``.

        Each snippet is followed by a blank line (``"\\r\\n\\r\\n"``).  The
        file is opened in append mode, so earlier output is preserved and
        the file is created when it does not exist yet.
        """
        with open("duanzi.txt", "a") as f:
            for content in content_list:
                f.write(content + "\r\n\r\n")


if __name__ == "__main__":
    # Script entry point: build a spider and start the interactive crawl.
    spider = Spider()
    spider.loadPage()
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值