Python爬虫对象版

注意:类的方法声明时,必须加上self变量(不管方法中有没有用到,如下例中的方法getHtml())
脚本如下:
#-*- coding:utf-8 -*-
#import urllib
import urllib2
import re
import sys
import cookielib


# Python 2 only: sys.setdefaultencoding is removed from the sys module at
# interpreter start-up, so sys is reloaded to get it back, and the default
# encoding for implicit str <-> unicode conversion is then set to UTF-8.
# NOTE(review): presumably needed because the code below concatenates decoded
# (unicode) page text with UTF-8 byte-string literals and writes the result to
# a plain file -- confirm before removing.
reload(sys)
sys.setdefaultencoding("utf-8")

class getTheLine(object):
    """Crawl a paginated listing and save the well-liked entries to a file.

    Each page is fetched from ``url + '?start=<offset>'``, matched against
    ``re_compile`` and every match whose like count reaches ``min_likes`` is
    written to ``filePath``.

    Args:
        url: Base URL of the listing, without the ``?start=`` query.
        re_compile: Pattern (compiled or string) handed to ``re.findall``.
            It must have three groups, used in this order: link, title,
            like count.
        filePath: Path of the output text file; it is overwritten by the
            first page and appended to by the following ones.
        pages: Number of pages to fetch.
        min_likes: Minimum like count an entry needs to be written.
        page_size: Number of entries per page, i.e. the step of ``start``.
    """

    def __init__(self, url, re_compile, filePath, pages, min_likes=1500, page_size=15):
        self.url = url
        self.re_compile = re_compile
        self.filePath = filePath
        self.pages = pages
        self.min_likes = min_likes
        self.page_size = page_size

    def getHtml(self, html_url):
        """Fetch ``html_url`` with cookie support and return the body decoded as UTF-8."""
        cookie_support = urllib2.HTTPCookieProcessor(cookielib.CookieJar())
        opener = urllib2.build_opener(cookie_support)
        # The header value has to be a plain string: a list would not be sent
        # as the intended User-Agent text.
        user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36'
        opener.addheaders = [("User-agent", user_agent), ("Accept", "*/*"), ('Referer', 'http://www.douban.com')]
        response = opener.open(html_url)
        try:
            return response.read().decode("utf-8")
        finally:
            # Release the connection even when reading or decoding fails.
            response.close()

    def getTitle(self, html_url):
        """Return every ``re_compile`` match found in the page at ``html_url``."""
        return re.findall(self.re_compile, self.getHtml(html_url))

    @staticmethod
    def _likeCount(text):
        """Return ``text`` parsed as an integer, or None when it is not a number."""
        try:
            return int(text)
        except (TypeError, ValueError):
            return None

    def howManyPages(self):
        """Fetch ``pages`` pages and write the entries with enough likes to ``filePath``."""
        for page_index in range(self.pages):
            page_num = page_index + 1
            html_url = self.url + '?start=' + str(page_index * self.page_size)
            # Fetch before opening the file so a failed download of the first
            # page does not truncate an existing output file.
            Contents = self.getTitle(html_url)
            # The first page starts a fresh file, later pages extend it.
            mode = 'w' if page_num == 1 else 'a'
            with open(self.filePath, mode) as files_work:
                for Content in Contents:
                    likes = self._likeCount(Content[2])
                    # Skip entries whose like count could not be parsed
                    # instead of aborting the whole crawl.
                    if likes is None or likes < self.min_likes:
                        continue
                    files_work.write(Content[2] + '人喜欢' + '\r')
                    files_work.write('Title:' + Content[1] + '\r')
                    files_work.write('Link:' + Content[0] + '\r')
                    files_work.write('from the ' + str(page_num) + ' page' + '\r\n')
            print('Read the ' + str(page_num) + ' page successful...')


# Pattern applied to each listing page. Its three groups are, in order: the
# text between href= and class="title" (the link), the anchor text (the
# title), and the text in front of the like label inside the <span>.
reg = u'class="note-item">.*?<a.*?href=(.*?)class="title".*?target="_blank">(.*?)</a>.*?<span>(.*?)喜欢</span>'
# re.S lets '.' span line breaks, so one entry may stretch over several lines.
titleRe = re.compile(reg,re.S)
filePath = r'C:\Users\Administrator\tmp\article.txt'
html_url = 'https://www.douban.com/tag/%E8%A3%85%E4%BF%AE/article'
pages = 10

if __name__ == '__main__':
    # Only crawl when run as a script, so importing this module neither hits
    # the network nor writes the output file.
    getTheLine(html_url,titleRe,filePath,pages).howManyPages()
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值