[Crawler] Scraping the Renrenche forum and saving the content to MongoDB

import re

import pymongo
import requests


class RRCSpider(object):
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'}
        # Establish the pymongo connection
        self.client = pymongo.MongoClient('localhost')
        self.db = self.client['rrc']
        print('def __init__(self):')

    def DB(self, info, detail_list):
        print('def DB(self,info,detail_list):')
        # Fields (in order): topic, title, author, post time, replies, views, last replier, last reply time, plus the post bodies
        data_dict = {'主题': info[0], '标题': info[1], '建贴人': info[2], '建贴时间': info[3], '回复': info[4], '查看': info[5], '最后回复人': info[6], '最后回复时间': info[7], 'detail': detail_list}
        self.db['rrcluntan'].insert_one(data_dict)


    def get_article_list(self):
        """
        Request the Renrenche forum list pages and collect thread info and detail URLs.
        :return: (info, url)
        """
        print('def get_article_list(self):')
        info = []
        url = []
        i = 1
        while i <= 600:
            article_url = 'https://bbs.renrenche.com/forum.php?mod=forumdisplay&fid=51&page={}'.format(i)
            response = requests.get(article_url, headers=self.headers)
            # Each matched tuple: topic, title, author, post time, replies, views, last replier, last reply time
            info += re.findall(re.compile(r'<a href="forum.php\?mod=forumdisplay&fid=51&amp;filter=typeid&amp;typeid=.*?">(.*?)</a>]</em>.*?<a href="forum.php\?mod=viewthread&amp;tid=.*?&amp;.*?class="s xst">(.*?)</a>.*?<a href="home.php\?mod=space&amp;.*?">(.*?)</a></cite>.*?<em><span>(.*?)</span></em>.*?<a href="forum.php\?mod=viewthread&amp.*? class="xi2">(.*?)</a><em>(.*?)</em></td>.*?<cite><a href="home.php\?mod=space&.*?c="1">(.*?)</a></cite>.*?<a href="forum.php\?mod=redirect.*?>(.*?)</a></em>', re.S), response.text)
            # Thread detail URLs; restore the HTML-escaped '&' before requesting them
            url1 = re.findall(re.compile(r'</a>]</em> <a href="(.*?)".*?>', re.S), response.text)
            for x in url1:
                url.append('https://bbs.renrenche.com/' + x.replace('amp;', ''))
            i = i + 1
        return info, url

    def get_article_detail(self, url):
        print('Fetching detail pages for {}'.format(url))
        detail_list = []
        # Request the first page to find out how many pages the thread has
        response = requests.get(url + "&page={}".format(1), headers=self.headers)
        p = re.findall(re.compile(r'<span title=".*?"> / (.*?) 页</span></label>', re.S), response.text)
        if len(p) == 0:
            p = 1
        else:
            p = int(p[0])
        print('{} page(s) in total'.format(p))
        i = 1
        while i <= p:
            print('Fetching page {} of {}'.format(i, url))
            response = requests.get(url + "&page={}".format(i), headers=self.headers)
            # Each match is the HTML body of one post on this page
            lists = re.findall(re.compile(r'<table cellspacing=".*?" cellpadding=".*?"><tr><td class=".*?" id=".*?" style=".*?">(.*?)</td></tr></table>', re.S), response.text)
            i = i + 1
            for x in lists:
                detail_list.append(x)
        return detail_list


if __name__ == '__main__':
    rrc = RRCSpider()
    info, url = rrc.get_article_list()
    i = 0
    while i < min(len(info), len(url)):
        rrc.DB(info[i], rrc.get_article_detail(url[i]))
        i = i + 1

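Once the crawl finishes, a quick sanity check is to read the stored data back out of MongoDB. The snippet below is a minimal sketch assuming the same local MongoDB instance and the 'rrc' database / 'rrcluntan' collection names used above; it only prints a document count and one sample record.

import pprint

import pymongo

client = pymongo.MongoClient('localhost')
collection = client['rrc']['rrcluntan']

# How many threads were saved, and what one stored document looks like
print('documents stored:', collection.count_documents({}))
pprint.pprint(collection.find_one())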
