Using 抽屉网 (dig.chouti.com) as an example: crawl a user's comments and store them in a MongoDB database.
"""
链接mongoDB后,导入数据
"""
import urllib.request
import re
from bs4 import BeautifulSoup
import pymongo
from datetime import datetime, timedelta
def transTime(rtime):  # parse a relative-time string (e.g. "3天2小时前") into (days, hours, minutes)
    if "天" in rtime:  # has a day part: match days and hours first
        res = re.search(r"(.*)天(.*)小时", rtime)
        days = res[1]
        hours = res[2]
        return days, hours, 0
    elif "小时" in rtime:  # has an hour part: match hours and minutes
        res = re.search(r"(.*)小时(.*)分钟", rtime)
        hours = res[1]
        minutes = res[2]
        return 0, hours, minutes
    elif "小于" in rtime:  # "小于1分钟", i.e. less than one minute ago
        return 0, 0, 0
    else:  # only a minute part
        minutes = re.search(r"^(.*)分", rtime).group(1)
        return 0, 0, minutes
def transDatetime(days, hours, minutes):  # convert the relative offsets into an absolute datetime
    now = datetime.now()
    d1 = now - timedelta(minutes=int(minutes))
    if hours != 0:
        d1 = d1 - timedelta(hours=int(hours))
    if days != 0:
        d1 = d1 - timedelta(days=int(days))
    return d1
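# A quick sanity check of the two helpers above (illustrative; the exact
# relative-time strings the site renders are an assumption inferred from the
# patterns matched in transTime):
# transTime("3天2小时前")    -> ('3', '2', 0)
# transTime("5小时20分钟前") -> (0, '5', '20')
# transTime("小于1分钟前")   -> (0, 0, 0)
# transDatetime('3', '2', 0) -> datetime.now() minus 3 days and 2 hours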
def saveComments(n):
    # Store the data in MongoDB; reference: https://www.jianshu.com/p/7d14c3ad810f
    client = pymongo.MongoClient("localhost", 27017)  # connect to the local mongodb instance ("localhost" or "127.0.0.1"); 27017 is the default port
    db = client['mydb']  # database to use
    collection = db['saveComments_1']  # collection to use
    # Crawl the data page by page by looping over the page numbers
    # References: https://www.cnblogs.com/dudududu/p/8823871.html https://blog.csdn.net/weixin_41032076/article/details/80171640
    for page in range(1, n + 1):  # the page URLs are 1-based (see the sample URL below)
        url = 'https://dig.chouti.com/user/cocolary/comments/' + str(page)
        headers2 = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                                  'Chrome/51.0.2704.63 Safari/537.36'}
        request = urllib.request.Request(url, headers=headers2)
        answer = urllib.request.urlopen(request)
        html_text = answer.read()
        soup = BeautifulSoup(html_text.decode('utf-8'), 'html.parser')
        # Find the div whose class attribute is content-list
        news_list = soup.find('div', {'class': 'content-list'})
        # Find all div tags under news_list
        news = news_list.find_all('div')
        # Iterate over news
        for item in news:
            try:
                ctime = item.find('div', {'class': 'comment-time'}).get_text().strip()  # comment time
                com = item.find('span', {'class': 'text-comment-con'}).get_text().strip()  # comment text
                source = item.find('span', {'class': 'content-source'}).get_text().strip()  # source
                Section = item.find('span', {'class': 'content-kind'}).get_text().strip()  # source section
                ding_num = item.find('span', {'class': 'ding-num'}).get_text().strip()  # upvote count
                cai_num = item.find('span', {'class': 'cai-num'}).get_text().strip()  # downvote count
                title_content = item.find('div', {'class': 'comment-title'}).find('a').get_text().strip()  # title of the commented news item
                title_href = item.find('div', {'class': 'comment-title'}).find('a').get('href')  # href of the commented news item
                state_href = item.find('div', {'class': 'comment-state'}).find('a').get('href')  # href of the comment state
                days, hours, minutes = transTime(ctime)
                ctime = transDatetime(days, hours, minutes)
                # The state href carries the news ID followed by the comment ID
                ids = re.findall(r"\d+\.?\d*", state_href)
                title_id = ids[0]    # news ID
                comment_id = ids[1]  # comment ID
                data = {}
                data['CommentTime'] = ctime.strftime("%Y-%m-%d %H:%M:%S")  # comment time
                data['com_content'] = com  # comment text
                source = source[1:]  # drop the dash in front of the source (Weibo, WeChat, ...)
                data['Source'] = source
                data['Section'] = Section
                data['news_content'] = title_content  # news title
                # Strip the square brackets
                ding_num = ding_num.replace('[', '').replace(']', '')
                cai_num = cai_num.replace('[', '').replace(']', '')
                data['Ups'] = int(ding_num)
                data['Downs'] = int(cai_num)
                data['NID'] = int(title_id)
                data['CID'] = int(comment_id)
                collection.insert_one(data)  # insert the record (insert() is deprecated/removed in pymongo 3+/4)
            except (AttributeError, IndexError):
                continue  # skip divs that do not contain a full comment block
        answer.close()
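# Illustrative check of the ID extraction above, assuming a state href of the
# form "/link/<news-id>#comment-<comment-id>" (the exact format is an
# assumption inferred from the parsing code; the numbers are made up):
# re.findall(r"\d+\.?\d*", "/link/12345678#comment-87654321")
# -> ['12345678', '87654321']  # news ID, then comment ID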
def saveInfo(soup, num):
    # Store the data in MongoDB; reference: https://www.jianshu.com/p/7d14c3ad810f
    client = pymongo.MongoClient("localhost", 27017)  # connect to the local mongodb instance; 27017 is the default port
    db = client['mydb']  # database to use
    collection = db['saveInfo_1']  # collection to use
    name = soup.find('div', {'class': "tu"}).get_text().strip()  # username
    eare = soup.find('div', {'class': "tu-m"}).find_all('span')  # region spans
    # servetime = soup.find('div', {'class': "medal"}).get_text().strip()  # rendered by js, not in the static html
    signNature = soup.find('div', {'class': "tu-b"}).get_text().strip()  # signature
    score = soup.find('div', {'class': "profile-B_2"}).find('span').get_text().strip()
    score = int(score)
    # The first two spans hold the region info
    eare_one = eare[0].get_text().strip()
    eare_two = eare[1].get_text().strip()
    # sex = soup.find('div', {'class': "tum_sex"}).get_text()  # rendered by js, not in the static html
    posts = int(soup.find(id='shu_fa').get_text().strip())  # number of posts
    recommend = int(soup.find(id='shu_digg').get_text().strip())  # number of recommendations
    all_comments_num = num  # total number of comments
    Info = {}
    Info["Nick"] = name
    Info["posts"] = posts
    Info["eare_one"] = eare_one
    Info["eare_two"] = eare_two
    Info["jifen"] = score
    Info["recommend"] = recommend
    Info["all_comments_num"] = all_comments_num
    Info["signNature"] = signNature
    collection.insert_one(Info)  # insert the record
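# Quick read-back to verify the profile insert (assumes the same local
# MongoDB instance and the collection names used above):
# pymongo.MongoClient("localhost", 27017)['mydb']['saveInfo_1'].find_one()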
url = 'https://dig.chouti.com/user/cocolary/comments/1'
headers2 = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/51.0.2704.63 Safari/537.36'}
request = urllib.request.Request(url, headers=headers2)
answer = urllib.request.urlopen(request)
html_text = answer.read()
soup = BeautifulSoup(html_text.decode('utf-8'), 'html.parser')
# Work out the number of comment pages (15 comments per page)
comments = soup.find(id="shu_comment")
num = int(comments.text)
n = num // 15 + 1
# Crawl the comments and save them to MongoDB (e.g. the first 2 pages; use saveComments(n) for all pages)
# saveComments(2)
# Crawl the profile information and save it to MongoDB
saveInfo(soup, num)
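# Optional hardening (my addition, not part of the original flow): a unique
# index on CID lets the crawler be re-run without inserting duplicate comment
# documents; insert_one() then raises DuplicateKeyError on repeats.
# pymongo.MongoClient("localhost", 27017)['mydb']['saveComments_1'].create_index("CID", unique=True)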