Python爬虫对象版

最新推荐文章于 2024-02-22 11:06:57 发布

小桔帽

最新推荐文章于 2024-02-22 11:06:57 发布

阅读量688

点赞数

分类专栏： python

本文链接：https://blog.csdn.net/u010895119/article/details/54927029

版权

python 专栏收录该内容

27 篇文章 0 订阅

订阅专栏

注意：类的方法声明时，必须加上self参数（不管方法中有没有用到，如下例中的方法getHtml()）

脚本如下：

#-*- coding:utf-8 -*-
#import urllib
import urllib2
import re
import sys
import cookielib


# Python 2 only: re-import ``sys`` and switch the interpreter-wide default
# encoding to UTF-8, so that implicit str <-> unicode conversions (for example
# the byte-string literals concatenated with decoded page text further down in
# this script, and the subsequent file writes) use UTF-8 instead of ASCII.
# NOTE(review): presumably reload() is needed because setdefaultencoding is
# not reachable on a freshly imported ``sys`` -- confirm against the Python 2
# ``site`` module documentation. This is a process-global setting.
reload(sys)
sys.setdefaultencoding("utf-8")

class getTheLine(object):
    """Crawl a paginated article listing and save the most-liked entries.

    Each listing page is fetched from ``url + '?start=<offset>'``, matched
    against ``re_compile`` and every match whose like count reaches
    ``min_likes`` is written to ``filePath``.

    Args:
        url: Base URL of the listing (without the ``?start=`` query).
        re_compile: Pattern (compiled or string) handed to ``re.findall``.
            It must yield 3-tuples of ``(link, title, like_count)``.
        filePath: Path of the text file that receives the results. It is
            truncated when the first page is written and appended to
            afterwards.
        pages: Number of listing pages to read.
        page_size: Increment of the ``start`` offset between two pages.
        min_likes: Minimum like count an entry needs to be saved.
    """

    # The User-agent must be a plain string: a list here would be sent to the
    # server as the textual repr of that list instead of the browser string.
    USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36'
    # Request headers installed on every opener built by getHtml().
    HEADERS = [("User-agent", USER_AGENT), ("Accept", "*/*"), ('Referer', 'http://www.douban.com')]

    def __init__(self, url, re_compile, filePath, pages, page_size=15, min_likes=1500):
        self.url = url
        self.re_compile = re_compile
        self.filePath = filePath
        self.pages = pages
        self.page_size = page_size
        self.min_likes = min_likes

    def getHtml(self, html_url):
        """Download ``html_url`` and return its body decoded as UTF-8.

        A fresh cookie-aware opener is built for every call.

        Raises:
            urllib2.URLError: If the request fails.
            UnicodeDecodeError: If the body is not valid UTF-8.
        """
        cookie_support = urllib2.HTTPCookieProcessor(cookielib.CookieJar())
        opener = urllib2.build_opener(cookie_support)
        # Copy so that the shared class-level list is never mutated.
        opener.addheaders = list(self.HEADERS)
        response = opener.open(html_url)
        try:
            return response.read().decode("utf-8")
        finally:
            # Release the connection even when read()/decode() raises.
            response.close()

    def getTitle(self, html_url):
        """Return every ``re_compile`` match found in the page at ``html_url``."""
        return re.findall(self.re_compile, self.getHtml(html_url))

    def howManyPages(self):
        """Read ``pages`` listing pages and write the popular entries to disk.

        Raises:
            ValueError: If a captured like count is not an integer.
        """
        for page_index in range(self.pages):
            page_num = page_index + 1
            html_url = self.url + '?start=' + str(page_index * self.page_size)
            # Fetch before touching the file so that a failed download of the
            # first page does not truncate results from an earlier run.
            contents = self.getTitle(html_url)
            # The first page starts a fresh file, later pages append to it.
            mode = 'w' if page_index == 0 else 'a'
            with open(self.filePath, mode) as files_work:
                for content in contents:
                    if int(content[2]) >= self.min_likes:
                        # Fields are separated by a bare '\r' and a record is
                        # terminated by '\r\n'; kept as-is so the output
                        # format does not change.
                        files_work.write(content[2] + '人喜欢' + '\r')
                        files_work.write('Title:' + content[1] + '\r')
                        files_work.write('Link:' + content[0] + '\r')
                        files_work.write('from the ' + str(page_num) + ' page' + '\r\n')
            # Single-argument print() behaves the same on Python 2 and 3.
            print('Read the ' + str(page_num) + ' page successful...')


# --- Script configuration -------------------------------------------------
# Listing to crawl and how many of its pages to read.
html_url = 'https://www.douban.com/tag/%E8%A3%85%E4%BF%AE/article'
pages = 10

# Text file that receives the selected articles.
filePath = r'C:\Users\Administrator\tmp\article.txt'

# One match per article item: (link, title, like count). re.S lets '.' span
# line breaks so a single item can stretch over several lines of HTML.
reg = u'class="note-item">.*?<a.*?href=(.*?)class="title".*?target="_blank">(.*?)</a>.*?<span>(.*?)喜欢</span>'
titleRe = re.compile(reg, flags=re.S)

# --- Run the crawl --------------------------------------------------------
getTheLine(
    url=html_url,
    re_compile=titleRe,
    filePath=filePath,
    pages=pages,
).howManyPages()