Hands-On Web Scraping for Static Pages (Part 2)

Using chouti.com (抽屉网) as an example, this post scrapes a given user's comments and stores them in a MongoDB database.

"""
链接mongoDB后,导入数据
"""

import re
import urllib.request
from bs4 import BeautifulSoup
import pymongo
from datetime import datetime, timedelta

def transTime(rtime):  # parse a relative time string into (days, hours, minutes)
    if "天" in rtime:  # contains days, e.g. "3天2小时"
        res = re.search(r"(.*)天(.*)小时", rtime)
        days = res.group(1)
        hours = res.group(2)
        return days, hours, 0
    elif "小时" in rtime:  # contains hours, e.g. "2小时30分钟"
        res = re.search(r"(.*)小时(.*)分钟", rtime)
        hours = res.group(1)
        minutes = res.group(2)
        return 0, hours, minutes
    elif "小于" in rtime:  # "less than 1 minute"
        return 0, 0, 0
    else:  # only minutes, e.g. "5分钟"
        minutes = re.search(r"^(.*)分", rtime).group(1)
        return 0, 0, minutes

def transDatetime(days, hours, minutes):  # convert the relative offset into an absolute datetime
    now = datetime.now()
    d1 = now - timedelta(minutes=int(minutes))
    if hours != 0:
        d1 = d1 - timedelta(hours=int(hours))
        if days != 0:
            d1 = d1 - timedelta(days=int(days))
    return d1
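
# A quick sanity check of the two helpers above (hypothetical input, kept commented out):
# a comment posted "3天2小时前" (3 days and 2 hours ago) should map to roughly now - 74 hours.
# d, h, m = transTime("3天2小时前")   # -> ('3', '2', 0)
# print(transDatetime(d, h, m))       # -> a datetime about 3 days and 2 hours in the past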

def saveComments(n):
    # Store the scraped comments in MongoDB. Reference: https://www.jianshu.com/p/7d14c3ad810f
    client = pymongo.MongoClient("localhost", 27017)  # connect to the local MongoDB instance ("localhost" or "127.0.0.1", default port 27017)
    db = client['mydb']  # database to use
    collection = db['saveComments_1']  # collection to write to

    # Loop over the comment pages (page numbers are 1-based, n pages in total)
    # References: https://www.cnblogs.com/dudududu/p/8823871.html   https://blog.csdn.net/weixin_41032076/article/details/80171640
    for page in range(1, n + 1):
        url = 'https://dig.chouti.com/user/cocolary/comments/' + str(page)
        headers2 = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                                  'Chrome/51.0.2704.63 Safari/537.36'}

        request = urllib.request.Request(url, headers=headers2)
        answer = urllib.request.urlopen(request)
        html_text = answer.read()

        soup = BeautifulSoup(html_text.decode('utf-8'), 'html.parser')

        # Find the div whose class is content-list
        news_list = soup.find('div', {'class': 'content-list'})
        # Find all div tags under news_list
        news = news_list.find_all('div')

        # Iterate over the comment blocks
        for i in news:
            try:
                ctime = i.find('div', {'class': 'comment-time'}).get_text().strip()  # relative time of the comment
                com = i.find('span', {'class': 'text-comment-con'}).get_text().strip()  # comment text
                source = i.find('span', {'class': 'content-source'}).get_text().strip()  # source site (e.g. Weibo / WeChat)
                Section = i.find('span', {'class': 'content-kind'}).get_text().strip()  # section the news item belongs to
                ding_num = i.find('span', {'class': 'ding-num'}).get_text().strip()  # number of upvotes
                cai_num = i.find('span', {'class': 'cai-num'}).get_text().strip()  # number of downvotes
                title_content = i.find('div', {'class': 'comment-title'}).find('a').get_text().strip()  # title of the news item being commented on
                title_href = i.find('div', {'class': 'comment-title'}).find('a').get('href')  # href of the news item
                state_href = i.find('div', {'class': 'comment-state'}).find('a').get('href')  # href of the comment state link

                days, hours, minutes = transTime(ctime)
                ctime = transDatetime(days, hours, minutes)

                # The state href contains two numbers: the news id followed by the comment id
                bianhao = re.findall(r"\d+\.?\d*", state_href)
                k = 0
                for number in bianhao:
                    if k == 0:
                        title_id = number    # news id
                        k = 1
                    else:
                        comment_id = number  # comment id
                        k = 0

                data = {}
                data['CommentTime'] = ctime.strftime("%Y-%m-%d %H:%M:%S")  # comment time
                data['com_content'] = com  # comment text
                source = source[1:]  # drop the leading dash before the source name
                data['Source'] = source
                data['Section'] = Section
                data['news_content'] = title_content  # news title

                # strip the surrounding brackets, e.g. "[3]" -> "3"
                ding_num = ding_num.replace('[', '').replace(']', '')
                cai_num = cai_num.replace('[', '').replace(']', '')

                data['Ups'] = int(ding_num)
                data['Downs'] = int(cai_num)
                data['NID'] = int(title_id)
                data['CID'] = int(comment_id)

                collection.insert_one(data)  # insert the record (insert_one replaces the deprecated insert())

            except AttributeError:
                continue

        answer.close()
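
# A minimal read-back check of what saveComments() wrote (same database and collection names as above):
# client = pymongo.MongoClient("localhost", 27017)
# for doc in client['mydb']['saveComments_1'].find().limit(5):
#     print(doc['CommentTime'], doc['com_content'])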

def saveInfo(soup, num):
    # Store the scraped profile information in MongoDB. Reference: https://www.jianshu.com/p/7d14c3ad810f
    client = pymongo.MongoClient("localhost", 27017)  # connect to the local MongoDB instance ("localhost" or "127.0.0.1", default port 27017)
    db = client['mydb']  # database to use
    collection = db['saveInfo_1']  # collection to write to

    name = soup.find('div', {'class': "tu"}).get_text().strip()  # username
    eare = soup.find('div', {'class': "tu-m"}).find_all('span')  # profile area spans (stored as eare_one / eare_two below)
    # servetime = soup.find('div', {'class': "medal"}).get_text().strip()   # rendered by JavaScript, not in the static HTML
    signNature = soup.find('div', {'class': "tu-b"}).get_text().strip()  # user signature
    score = soup.find('div', {'class': "profile-B_2"}).find('span').get_text().strip()
    score = int(score)  # user score
    k = 0
    for i in eare:
        if k == 0:
            eare_one = i.get_text().strip()
            k = 1
        else:
            eare_two = i.get_text().strip()
            break
    # sex = soup.find('div', {'class': "tum_sex"}).get_text()   # rendered by JavaScript, not in the static HTML
    posts = int(soup.find(id='shu_fa').get_text().strip())  # number of posts
    recommend = int(soup.find(id='shu_digg').get_text().strip())  # number of recommendations
    all_comments_num = num

    Info = {}
    Info["Nick"] = name
    Info["posts"] = posts
    Info["eare_one"] = eare_one
    Info["eare_two"] = eare_two
    Info["jifen"] = score
    Info["recommend"] = recommend
    Info["all_comments_num"] = all_comments_num
    Info["signNatur"] = signNature

    collection.insert_one(Info)  # insert the record (insert_one replaces the deprecated insert())


url = 'https://dig.chouti.com/user/cocolary/comments/1'
headers2 = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                              'Chrome/51.0.2704.63 Safari/537.36'}
request = urllib.request.Request(url, headers=headers2)
answer = urllib.request.urlopen(request)
html_text = answer.read()
soup = BeautifulSoup(html_text.decode('utf-8'), 'html.parser')

# Work out the number of comment pages (15 comments per page)
comments = soup.find(id="shu_comment")
num = int(comments.text)   # total number of comments
n = num // 15 + 1          # number of pages
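# Worked example (hypothetical count): 32 comments at 15 per page -> 32 // 15 + 1 = 3 pages.
# (When the count is an exact multiple of 15, this formula requests one extra, empty page.)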

# Save the scraped comments to MongoDB (commented out here; use saveComments(n) to cover every page)
# saveComments(2)

# Save the scraped profile information to MongoDB
saveInfo(soup, num)









