import json, requests, re
import pymongo
class RRCSpider(object):
    """Spider for the RenRenChe forum (bbs.renrenche.com, forum fid=51).

    Scrapes the thread listing pages, fetches every thread's detail
    pages, and stores one document per thread in MongoDB (database
    ``rrc``, collection ``rrcluntan``).
    """

    def __init__(self):
        # Browser-like User-Agent so the forum serves normal HTML.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'}
        # Open the MongoDB connection (default port on localhost).
        self.client = pymongo.MongoClient('localhost')
        self.db = self.client['rrc']
        print('def __init__(self):')

    def DB(self, info, list):
        """Insert one thread document into collection ``rrcluntan``.

        :param info: 8-tuple from :meth:`get_article_list` —
            (topic, title, author, post time, replies, views,
            last replier, last reply time).
        :param list: post bodies from :meth:`get_article_detail`.
            NOTE(review): this parameter shadows the builtin ``list``;
            the name is kept unchanged for interface compatibility.
        """
        print('def DB(self,info,list):')
        data_dict = {'主题': info[0], '标题': info[1], '建贴人': info[2],
                     '建贴时间': info[3], '回复': info[4], '查看': info[5],
                     '最后回复人': info[6], '最后回复时间': info[7],
                     'detail': list}
        self.db['rrcluntan'].insert_one(data_dict)

    def get_article_list(self, max_pages=600):
        """Scrape the forum's thread-listing pages.

        :param max_pages: number of listing pages to fetch (default 600,
            the limit that was previously hard-coded).
        :return: tuple ``(info, url)`` — ``info`` is a list of 8-tuples
            (topic, title, author, post time, replies, views, last
            replier, last reply time) and ``url`` the matching absolute
            thread URLs, accumulated over ALL pages.
        """
        print('def get_article_list(self):')
        # Compile the patterns once, outside the page loop.
        row_re = re.compile(r'<a href="forum.php\?mod=forumdisplay&fid=51&filter=typeid&typeid=.*?">(.*?)</a>]</em>.*?<a href="forum.php\?mod=viewthread&tid=.*?&.*?class="s xst">(.*?)</a>.*?<a href="home.php\?mod=space&.*?">(.*?)</a></cite>.*?<em><span>(.*?)</span></em>.*?<a href="forum.php\?mod=viewthread&.*? class="xi2">(.*?)</a><em>(.*?)</em></td>.*?<cite><a href="home.php\?mod=space&.*?c="1">(.*?)</a></cite>.*?<a href="forum.php\?mod=redirect.*?>(.*?)</a></em>', re.S)
        link_re = re.compile(r'</a>]</em> <a href="(.*?)".*?>', re.S)
        all_info = []
        all_urls = []
        for page in range(1, max_pages + 1):
            article_url = 'https://bbs.renrenche.com/forum.php?mod=forumdisplay&fid=51&page={}'.format(page)
            response = requests.get(article_url, headers=self.headers)
            # BUG FIX: the original overwrote info/url on every iteration
            # and so returned only the LAST page; accumulate instead.
            all_info.extend(row_re.findall(response.text))
            for href in link_re.findall(response.text):
                # Strip the 'amp;' HTML-escape artifact from the hrefs.
                all_urls.append('https://bbs.renrenche.com/' + href.replace('amp;', ''))
        return all_info, all_urls

    def get_article_detail(self, url):
        """Fetch every page of one thread and return its post bodies.

        :param url: absolute thread URL (no ``&page=`` suffix).
        :return: list of raw HTML post-body strings across all pages.
        """
        print('准备抓def get_article_detail({})'.format(url))
        posts = []
        response = requests.get(url + "&page={}".format(1), headers=self.headers)
        # The pager text "/ N 页" is present only on multi-page threads;
        # when absent the thread has a single page.
        page_counts = re.findall(re.compile(r'<span title=".*?"> / (.*?) 页</span></label>', re.S), response.text)
        total_pages = int(page_counts[0]) if page_counts else 1
        print('共{}页'.format(total_pages))
        post_re = re.compile(r'<table cellspacing=".*?" cellpadding=".*?"><tr><td class=".*?" id=".*?" style=".*?">(.*?)</td></tr></table>', re.S)
        for page in range(1, total_pages + 1):
            print('正在抓取 def get_article_detail({})第{}页'.format(url, page))
            response = requests.get(url + "&page={}".format(page), headers=self.headers)
            posts.extend(post_re.findall(response.text))
        return posts
if __name__ == '__main__':
    # Scrape the listing once, then fetch and persist each thread.
    rrc = RRCSpider()
    info, url = rrc.get_article_list()
    # zip() stops at the shorter list, matching min(len(info), len(url)).
    for row, link in zip(info, url):
        rrc.DB(row, rrc.get_article_detail(link))
# [Crawler] Scrape the RenRenChe forum content and save it to MongoDB.