[Crawler] Scraping the Renrenche forum and saving the content to MongoDB

import re

import pymongo
import requests


class RRCSpider(object):
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'}
        # Establish the pymongo connection
        self.client = pymongo.MongoClient('localhost')
        self.db = self.client['rrc']
        print('def __init__(self):')

    def DB(self, info, detail_list):
        print('def DB(self,info,detail_list):')
        # Fields (in order): topic, title, author, post time, replies, views, last replier, last reply time, plus the post bodies
        data_dict = {'主题': info[0], '标题': info[1], '建贴人': info[2], '建贴时间': info[3], '回复': info[4], '查看': info[5], '最后回复人': info[6], '最后回复时间': info[7], 'detail': detail_list}
        self.db['rrcluntan'].insert_one(data_dict)


    def get_article_list(self):
        """
        Request the Renrenche forum list pages and collect thread info and detail URLs.
        :return: (info, url)
        """
        print('def get_article_list(self):')
        info = []
        url = []
        i = 1
        while i <= 600:
            article_url = 'https://bbs.renrenche.com/forum.php?mod=forumdisplay&fid=51&page={}'.format(i)
            response = requests.get(article_url, headers=self.headers)
            # Each matched tuple: topic, title, author, post time, replies, views, last replier, last reply time
            info += re.findall(re.compile(r'<a href="forum.php\?mod=forumdisplay&fid=51&amp;filter=typeid&amp;typeid=.*?">(.*?)</a>]</em>.*?<a href="forum.php\?mod=viewthread&amp;tid=.*?&amp;.*?class="s xst">(.*?)</a>.*?<a href="home.php\?mod=space&amp;.*?">(.*?)</a></cite>.*?<em><span>(.*?)</span></em>.*?<a href="forum.php\?mod=viewthread&amp.*? class="xi2">(.*?)</a><em>(.*?)</em></td>.*?<cite><a href="home.php\?mod=space&.*?c="1">(.*?)</a></cite>.*?<a href="forum.php\?mod=redirect.*?>(.*?)</a></em>', re.S), response.text)
            # Thread detail URLs; restore the HTML-escaped '&' before requesting them
            url1 = re.findall(re.compile(r'</a>]</em> <a href="(.*?)".*?>', re.S), response.text)
            for x in url1:
                url.append('https://bbs.renrenche.com/' + x.replace('amp;', ''))
            i = i + 1
        return info, url

    def get_article_detail(self, url):
        print('Fetching detail pages for {}'.format(url))
        detail_list = []
        # Request the first page to find out how many pages the thread has
        response = requests.get(url + "&page={}".format(1), headers=self.headers)
        p = re.findall(re.compile(r'<span title=".*?"> / (.*?) 页</span></label>', re.S), response.text)
        if len(p) == 0:
            p = 1
        else:
            p = int(p[0])
        print('{} page(s) in total'.format(p))
        i = 1
        while i <= p:
            print('Fetching page {} of {}'.format(i, url))
            response = requests.get(url + "&page={}".format(i), headers=self.headers)
            # Each match is the HTML body of one post on this page
            lists = re.findall(re.compile(r'<table cellspacing=".*?" cellpadding=".*?"><tr><td class=".*?" id=".*?" style=".*?">(.*?)</td></tr></table>', re.S), response.text)
            i = i + 1
            for x in lists:
                detail_list.append(x)
        return detail_list


if __name__ == '__main__':
    rrc = RRCSpider()
    info, url = rrc.get_article_list()
    i = 0
    while i < min(len(info), len(url)):
        rrc.DB(info[i], rrc.get_article_detail(url[i]))
        i = i + 1

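Once the crawl finishes, a quick sanity check is to read the stored data back out of MongoDB. The snippet below is a minimal sketch assuming the same local MongoDB instance and the 'rrc' database / 'rrcluntan' collection names used above; it only prints a document count and one sample record.

import pprint

import pymongo

client = pymongo.MongoClient('localhost')
collection = client['rrc']['rrcluntan']

# How many threads were saved, and what one stored document looks like
print('documents stored:', collection.count_documents({}))
pprint.pprint(collection.find_one())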
