1. Requirements
For my graduation project I built a blog site, but toward the end it had no real data, so I decided to scrape some posts from CSDN.
The scraping task: collect the posts and author information for a given category, save them to a MySQL database, and then filter and de-duplicate the data.
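The filtering/de-duplication step is not part of the crawler code below; here is a minimal sketch of one way it could be done, assuming we de-duplicate on the article id (stored as w_uid later) before inserting:

seen_ids = set()

def is_new_article(article_id):
    # Return True the first time an article id appears, False afterwards,
    # so duplicate articles can be skipped before the database insert.
    # (A sketch only; is_new_article is not part of the original code.)
    if article_id in seen_ids:
        return False
    seen_ids.add(article_id)
    return True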
2. Core Implementation
1. Approach
Inspecting the CSDN home page shows that each pull-down of the feed fires an XHR request. The JSON it returns is easy to process, but the request has to carry a timestamp parameter, otherwise the returned data is inaccurate (stale or duplicated). So I made the fetch method iterate, updating the timestamp on every round, as the sketch below illustrates.
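A minimal sketch of one request/response cycle (the field names match what getData2 below consumes; shown_offset=0 is a placeholder, since the real code reads the initial value from the page itself, and a real request also sends the cookie/user-agent headers set up in the CSDN class):

import requests

# One page of the 'cloud' category feed
resp = requests.get('https://blog.csdn.net/api/articles'
                    '?type=more&category=cloud&shown_offset=0').json()
articles = resp['articles']
# The last article's shown_offset is the timestamp for the next request
next_offset = articles[-1]['shown_offset']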
2. Core Code
import re
import time  # used to convert timestamps into date strings

import requests
from lxml import etree
from pymysql import connect
from requests.exceptions import RequestException
# Database helper class
class DbUtil:
    def __init__(self):
        self.db = None

    # Open the MySQL connection (reused by inserttable and closeDB)
    def connectDB(self):
        if self.db is None:
            host = "*********"
            dbName = "******"
            user = "*******"
            password = "*******"
            # charset='utf8mb4' must match the database's encoding so that
            # Chinese text is stored and displayed correctly
            self.db = connect(host=host, user=user, password=password,
                              database=dbName, charset='utf8mb4')
        return self.db

    # Insert one article into the given table, e.g.
    # INSERT INTO weibo (wcontent, wdate, ...) VALUES (...)
    def inserttable(self, insertTable, w_uid, wcontent, wdate, wtitle, wremarks, wrootnickname):
        try:
            insertContentSql = ("INSERT INTO " + insertTable +
                                " (wcontent, wdate, wtitle, w_uid, wremarks, wrootnickname)"
                                " VALUES (%s, %s, %s, %s, %s, %s)")
            cursor_insert = self.connectDB().cursor()
            cursor_insert.execute(insertContentSql, (wcontent, wdate, wtitle, w_uid, wremarks, wrootnickname))
            self.db.commit()
            print('insert contents to ' + insertTable + ' successfully')
        except Exception as e:
            # On failure, report the error and move on to the next row
            print("insert failed:", e)

    def closeDB(self):
        if self.db is not None:
            self.db.close()
            self.db = None
class CSDN():
    def __init__(self):
        # Feed URL for a category tag, e.g. java / python / web ...
        # https://www.csdn.net/nav/web
        # https://www.csdn.net/nav/db
        # https://www.csdn.net/nav/cloud
        url = 'https://www.csdn.net/nav/cloud'
        self.headers = {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
            'accept-encoding': 'gzip, deflate',  # 'br' dropped: requests cannot decode Brotli without the brotli package
            'accept-language': 'zh-CN,zh;q=0.9',
            'cache-control': 'max-age=0',
            'cookie': '<your own cookie>',
            'user-agent': '<your own user agent>',
            'x-tingyun-id': ''
        }
        # self.lst stores the fetched pages: list[articles[article]]
        self.lst = []
        # Page counter used to stop the iteration
        self.num = 0
        content = requests.get(url, headers=self.headers).text
        content = etree.HTML(content)
        self.parse(content)
    def parse(self, content):
        # Read the initial shown-offset attribute from the feed list on the page
        shown_offset = content.xpath('//ul[@id="feedlist_id"]/@shown-offset')[0]
        # Pass it on to start paging
        self.getData('cloud', shown_offset)

    def getData(self, word, shown_offset):
        # Build the XHR URL that the page fires on pull-down
        url = 'https://blog.csdn.net/api/articles?type=more&category={}&shown_offset={}'.format(word, shown_offset)
        self.getData2(url, word)

    def getData2(self, url, word):
        # Each call saves one page of data; stop after 50 pages
        if self.num < 50:
            content = requests.get(url, headers=self.headers).json()
            print('==================={}====================='.format(self.num))
            # Save this page's articles
            self.lst.append(content['articles'])
            # Advance shown_offset to the timestamp of the last article on the page
            shown_offset = content['articles'][-1]['shown_offset']
            url = 'https://blog.csdn.net/api/articles?type=more&category={}&shown_offset={}'.format(word, shown_offset)
            self.num += 1
            # Recurse to fetch the next page
            self.getData2(url, word)
# Convert a Unix timestamp into a date string, e.g. '2019-11-05 10:30:00'
def stamp_to_time(stamp, strformat="%Y-%m-%d %H:%M:%S"):
    stamp = int(stamp)
    ltime = time.localtime(stamp)
    return time.strftime(strformat, ltime)

# Fetch one article page; return None on any request failure
def get_one_page(url):
    try:
        response = requests.get(url=url)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None
if __name__ == '__main__':
    c = CSDN()
    print(c.lst)
    db = DbUtil()
    db.connectDB()
    for articles in c.lst:
        for article in articles:
            # Blog title
            wtitle = article['title']
            # Author nickname
            wrootnickname = article['nickname']
            # Convert the shown_time timestamp into the wdate column value
            wdate = stamp_to_time(stamp=article['shown_time'])
            # Article id, stored as w_uid
            w_uid = article['id']
            text = get_one_page(article['url'])
            if text is None:
                # Skip articles whose page could not be fetched
                continue
            html = etree.HTML(text)
            # Extract the article body (wcontent)
            div_content = html.xpath('//*[@id="content_views"]')
            if not div_content:
                continue
            div_str = etree.tostring(div_content[0], encoding='utf-8').decode('utf-8')
            # Strip the trailing </div> and the opening <div ...> tag
            content1 = div_str.strip()[:-6]
            wcontent = re.sub('<div(.*?)>', '', content1, count=1)
            # Insert the row into the database
            db.inserttable("weibo", w_uid, wcontent, wdate, wtitle, "Cloud", wrootnickname)
            print("downloaded:", wdate, wtitle, wrootnickname)
    db.closeDB()
3. Closing Notes
The crawl turned out to be fairly slow, so I later improved the getData2 method of the CSDN class: I turned it into a generator, so each page can be written to the database as soon as it arrives instead of being buffered in self.lst. A sketch of what that version might look like (a reconstruction, not the exact final code):
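    def getData2(self, url, word):
        # Drop-in generator replacement for the recursive method above:
        # yield each page of articles instead of appending to self.lst,
        # so the caller can write rows to MySQL as pages arrive.
        while self.num < 50:
            content = requests.get(url, headers=self.headers).json()
            yield content['articles']
            # Advance shown_offset to the last article's timestamp, as before
            shown_offset = content['articles'][-1]['shown_offset']
            url = ('https://blog.csdn.net/api/articles'
                   '?type=more&category={}&shown_offset={}'.format(word, shown_offset))
            self.num += 1

The caller then iterates the generator instead of reading self.lst afterwards, for example:

    # getData would hand the generator to the caller, which consumes it page by page
    for articles in self.getData2(url, word):
        for article in articles:
            pass  # fetch the article page and insert it, as in the main block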
This is my first blog post; comments and discussion are welcome.