爬取虎扑步行街论坛数据保存到MySQL、MongoDB

最新推荐文章于 2023-07-06 15:54:45 发布

拾清心

最新推荐文章于 2023-07-06 15:54:45 发布

阅读量1.5k

点赞数 3

分类专栏：网络爬虫　文章标签：python、爬虫、数据挖掘

本文链接：https://blog.csdn.net/My_daily_life/article/details/121378369

版权

网络爬虫专栏收录该内容

4 篇文章 0 订阅

订阅专栏

本文围绕获取虎扑步行街论坛上的帖子数据展开（示例代码爬取列表页的前10页），爬取的内容包括帖子名称、帖子链接、作者、作者链接、创建时间、回复数、浏览数、最后回复用户和最后回复时间，并将爬取的数据分别保存到MongoDB和MySQL中。
网页地址： https://bbs.hupu.com/bxj
在这里插入图片描述
在运行代码前，请先启动MySQL和MongoDB，并在MySQL中创建代码所用到的库与表。
在MySQL中可能会用到的命令如下：
创建mypython这个库

create database mypython;

使用mypython

use mypython;

在这里插入图片描述

创建数据表hupu

-- One row per crawled forum post; every field is stored as nullable text.
create table hupu(
    name varchar(100) null,
    url varchar(50) null,
    author varchar(100) null,
    author_href varchar(50) null,
    forum_time varchar(30) null,
    reply_counts varchar(10) null,
    browse_counts varchar(10) null,
    endreplyname varchar(100) null,
    endreplytime varchar(30) null);

在这里插入图片描述

具体python代码如下：

import requests
import time
import pymysql
import pymongo
from lxml import etree

# Open a MongoDB connection to the local machine on port 27017.
client = pymongo.MongoClient('localhost', 27017)
# Select the "pythonwork" database.
mydb = client['pythonwork']
# Select the "hupu" collection that receives one document per post.
dataline = mydb['hupu']

# Open the MySQL connection. `password`/`database` replace the deprecated
# `passwd`/`db` aliases, and utf8mb4 is used because MySQL's "utf8" is a
# 3-byte encoding that rejects 4-byte characters such as emoji in titles.
conn = pymysql.connect(
    host='localhost',
    user='root',
    password='123456',
    database='mypython',
    port=3306,
    charset='utf8mb4',
)
# Cursor shared by every insert statement.
cursor = conn.cursor()
# Request headers: send a desktop browser user-agent with every request.
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36'}
#获取各页全部文章的链接
def get_href(url):
    """Crawl one listing page and scrape every post it links to.

    Args:
        url: Address of a board listing page.

    Returns:
        None. Each link found under a ``post-title`` div is prefixed with
        the forum origin and handed to ``get_forum``. A listing page that
        cannot be fetched is reported on stdout and skipped.
    """
    try:
        # Without a timeout a stalled connection would block the crawl forever.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as exc:
        # One unreachable listing page should not abort the remaining pages.
        print('跳过：列表页请求失败 {}（{!r}）'.format(url, exc))
        return
    selector = etree.HTML(response.text)
    hrefs = selector.xpath('//ul/li/div/div[@class="post-title"]/a/@href')
    for href in hrefs:
        # The extracted links are treated as site-relative paths.
        get_forum("https://bbs.hupu.com" + href)
      
def get_forum(url):
    """Scrape a single post page and store one record in MongoDB and MySQL.

    Collects the title, author name and link, creation time, reply count,
    view count, and the name and time of the last reply. When the post has
    more than 20 replies, the last reply is read from the last reply page,
    whose address is built from ``url`` and the final pagination label.

    Args:
        url: Address of the post page. It is expected to end in ``.html``,
            because the last-page address is built by cutting off the final
            five characters.

    Returns:
        None. A post that cannot be fetched, parsed or stored is reported on
        stdout and skipped so that the rest of the crawl continues.
    """
    # Absolute XPath prefixes shared by several fields. They mirror the page
    # layout, so they stop matching as soon as the layout changes.
    title_box = '//*[@id="container"]/div/div[3]/div[2]/div[1]/div/div/div[2]/div'
    author_box = '//*[@id="container"]/div/div[3]/div[2]/div[1]/div/div/div[4]/div/div[1]/div/div[1]/div'
    last_reply_box = '//div[@class="bbs-post-wrapper-content"]/div/div[last()]/div/div[2]/div[1]/div/div[1]'
    try:
        # Without a timeout a stalled connection would block the crawl forever.
        html = requests.get(url, headers=headers, timeout=10)
        result = etree.HTML(html.text)

        name = result.xpath(title_box + '/h1/text()')[0]                # post title
        author = result.xpath(author_box + '/a/text()')[0]              # author name
        author_href = result.xpath(author_box + '/a/@href')[0]          # author link
        forum_time = result.xpath(author_box + '/span[3]/text()')[0]    # creation time
        reply_counts = result.xpath(title_box + '/span[1]/text()')[0]   # reply count
        browse_counts = result.xpath(title_box + '/span[3]/text()')[0]  # view count

        # With 20 replies or fewer the last reply is on the page already
        # loaded; otherwise it has to be read from the last reply page.
        last_page = result
        if int(reply_counts) > 20:
            # The last pagination label is used as the number of the last page.
            endpage = result.xpath('//ul[@class="hupu-rc-pagination"]/li/a/text()')[-1]
            # Turn ".../<id>.html" into ".../<id>-<page>.html".
            endurl = url[:-5] + '-' + endpage + '.html'
            endhtml = requests.get(endurl, headers=headers, timeout=10)
            last_page = etree.HTML(endhtml.text)

        # Name and time of the last reply on the selected page.
        endreplyname = last_page.xpath(last_reply_box + '/a/text()')[0]
        endreplytime = last_page.xpath(last_reply_box + '/span/text()')[0]

        # Record stored in MongoDB; MySQL gets the same values as a row.
        info = {
            'name': name,
            'url': url,      # post link
            'author': author,
            'author_href': author_href,
            'forum_time': forum_time,
            'reply_counts': reply_counts,
            'browse_counts': browse_counts,
            'endreplyname': endreplyname,
            'endreplytime': endreplytime,
        }
        dataline.insert_one(info)
        cursor.execute(
            "insert into hupu(name,url,author,author_href,forum_time,reply_counts,browse_counts,endreplyname,endreplytime) values(%s,%s,%s,%s,%s,%s,%s,%s,%s)",
            (name, url, author, author_href, forum_time, reply_counts, browse_counts, endreplyname, endreplytime),
        )
    except (IndexError, ValueError) as exc:
        # An XPath matched nothing, or the reply count was not a number.
        print('跳过：页面结构不符合预期 {}（{!r}）'.format(url, exc))
    except Exception as exc:
        # Best-effort crawl: a network or database error on one post must not
        # stop the run, but it is reported instead of silently discarded.
        print('跳过：帖子抓取或保存失败 {}（{!r}）'.format(url, exc))
    else:
        # Announce success only after the data has been handed to both stores.
        print('恭喜：该条帖子爬取成功！')

if __name__ == '__main__':
    # Crawl listing pages 1 through 10 of the board.
    urls = ['https://bbs.hupu.com/bxj-{}'.format(i) for i in range(1, 11)]
    try:
        for url in urls:
            get_href(url)
            # Commit after every listing page so that a failure on a later
            # page does not discard the rows already inserted.
            conn.commit()
            # Pause between listing pages.
            time.sleep(1)
    finally:
        # Release the database connections even when the crawl is aborted.
        cursor.close()
        conn.close()
        client.close()