Following 擦哥 and 擦姐's series of articles, one step at a time. It has been a while since I last read their posts... they really are well written~
This is just a simple scraping job, and it bumps my original-article count by one.
- Visit the blogger's homepage
- Scrape the article list and save it to Excel, with each article title linked directly to its URL
In short, it simply turns the content of the screenshot into an Excel sheet, haha.
OK, here is the picture.
- Get the article-list URLs from 擦姐's blog homepage:
https://dream.blog.csdn.net/article/list/1 (for page one the '/article/list/1' suffix is optional)
https://dream.blog.csdn.net/article/list/2
https://dream.blog.csdn.net/article/list/3
https://dream.blog.csdn.net/article/list/4 - clearly the only thing that changes is the page number at the end (see the quick sketch after this list)
- Then press F12 and work out which nodes to grab from the page.
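A minimal sketch of how those list-page URLs can be generated from the pattern above (blog_url and pages are just the example values used later in the script):

blog_url = 'https://dream.blog.csdn.net'
pages = 4
# one URL per list page: .../article/list/1 through .../article/list/4
page_urls = [f'{blog_url}/article/list/{i}' for i in range(1, pages + 1)]
for u in page_urls:
    print(u)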
Straight to the source code; it is all quite simple, nothing fancy to see.
import sys, os
# make the sibling util/excel modules importable no matter where the script is run from
__dir__ = os.path.dirname(os.path.abspath(__file__))
sys.path.append(__dir__)
sys.path.append(os.path.abspath(os.path.join(__dir__, '../common')))
# print(__dir__)
from util import get_html
from excel import excel
# scrape a CSDN blogger's article list and save it
import time
from bs4 import BeautifulSoup
# parse the article list
def get_ats(html, ats):
    """
    :param html: page source
    :param ats: rows that will be exported to Excel
    """
    soup = BeautifulSoup(html, 'html.parser')
    box = soup.select('div .article-item-box')
    if box:
        for i in range(len(box)):
            item = box[i]
            at = []
            # title
            a = item.select('a')[0]
            # drop \t\n and swap the ASCII comma for a full-width one,
            # so the title cannot break the CSV column split
            title = a.text.strip().replace('\n', '').replace(',', ',')
            # the first token is the "原创" badge, the rest is the title itself
            titles = title.split(maxsplit=1)
            at.append(titles[0])
            at.append(titles[1])
            at.append(a['href'])
            # print(titles)
            ps = item.select('p')
            # article summary, date, reads, comments
            for k in range(len(ps)):
                if k == 1:
                    for j in ps[k].select('span'):
                        at.append(j.text)
                else:
                    at.append(ps[k].text.strip().replace(',', ','))
            # print(at)
            ats.append(at)
    return ats
# save as csv
def wrt(ats):
    name = str(round(time.time()))
    os.makedirs('./files', exist_ok=True)  # make sure the output folder exists
    with open(f'./files/{name}.csv', 'a+', encoding='utf-8') as f:
        for i in range(len(ats)):
            f.write(','.join(ats[i]))
            f.write('\n')
            # print(','.join(ats[i]))
if __name__ == '__main__':
    # header row of the export
    ats = [['创作','标题','链接','文章头','时间','阅读','评论']]
    # blogger homepage; 擦姐 has a custom "dream" domain, a regular account is simply https://blog.csdn.net/博主名称
    blog_url = 'https://dream.blog.csdn.net'
    # blog_url = 'https://blog.csdn.net/博主名称'
    # check the blogger's homepage yourself for the number of list pages
    pages = 2
    # fetch each page in turn
    for i in range(1, pages+1):
        url = f'{blog_url}/article/list/{i}'
        print(f'crawling... {url}')
        html = get_html(url)
        ats = get_ats(html, ats)
    # save as csv; if you then open the csv with Excel, open it in Notepad first and
    # re-save it as ANSI, otherwise the Chinese text is garbled
    # wrt(ats)
    # save as Excel
    ex = excel()
    # with_url: index of the column whose content becomes a hyperlink; in every ats row
    # the URL must sit in the column immediately after that one
    ex.write_row(ats, with_url=1)
    # ex.wbac.cell(row=1, column=1).value = '=HYPERLINK("{}", "{}")'.format('https://www.baidu.com', "Link Name")
    ex.save()
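For reference, a sketch of what one data row in ats looks like and what write_row turns the title cell into; the values below are made-up placeholders, not real scraped data:

# placeholder row matching the header ['创作','标题','链接','文章头','时间','阅读','评论']
row = ['原创', '某篇文章标题', 'https://dream.blog.csdn.net/article/details/123456', '文章摘要', '2021-10-01', '阅读 100', '评论 2']
# with with_url=1, the title cell is rewritten to an Excel formula such as
# =HYPERLINK("https://dream.blog.csdn.net/article/details/123456", "某篇文章标题")
# so clicking the title in Excel opens the article, while the raw link stays in the next column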
Excel. Plenty of wrappers out there wrap basically nothing, so here is my own attempt at one.
from openpyxl import Workbook, load_workbook
import time
class excel():
    def __init__(self) -> None:
        self.wb = Workbook()
        self.wbac = self.wb.active
    def get_sheet_names(self):
        # wb.get_sheet_names() was removed in openpyxl 3.x; use the sheetnames property
        return self.wb.sheetnames
    # write a list of rows into a sheet
    def write_row(self, rows, sheet_name='Sheet', with_url=None):
        # wb.get_sheet_by_name() was removed in openpyxl 3.x; index the workbook instead
        sheet = self.wb[sheet_name]
        if sheet:
            for i in range(len(rows)):
                row = rows[i]
                for y in range(len(row)):
                    content = row[y]
                    if with_url == y and i > 0:
                        # turn the cell into a hyperlink formula; the URL is expected
                        # in the very next column of the same row
                        rows[i][y] = '=HYPERLINK("{}", "{}")'.format(row[y+1], content)
                sheet.append(row)
    def new_sheet(self, name):
        self.wb.create_sheet(name)
    def save(self, path=None):
        # build the timestamped default path at call time, not at definition time
        if path is None:
            path = f"./files/Excel{round(time.time()*1000)}.xlsx"
        self.wb.save(path)
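A quick usage sketch of the wrapper; the rows here are made-up placeholders, and the first row is treated as the header (with_url only rewrites the data rows):

ex = excel()
rows = [
    ['标题', '链接'],                                       # header row, left untouched
    ['示例文章', 'https://blog.csdn.net/example/article'],  # placeholder data row
]
ex.write_row(rows, with_url=0)  # column 0 becomes =HYPERLINK(column 1, column 0)
ex.save('demo.xlsx')            # or ex.save() for a timestamped file under ./files/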
The wrapped utility module; drop any imports you don't actually use.
import os, requests, random
__dir__ = os.path.dirname(os.path.abspath(__file__))
# print(__dir__)
def get_headers(localhost=True, refer="https://www.baidu.com", host=None):
    # default desktop UA; when localhost is False, pick a random UA from the pool below
    ua = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36"
    if not localhost:
        uas = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36",
            "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
            "Mozilla/5.0 (compatible; Baiduspider-render/2.0; +http://www.baidu.com/search/spider.html)",
            "Baiduspider-image+(+http://www.baidu.com/search/spider.htm)",
            "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
            "Mozilla/5.0 (compatible; Googlebot-Image/1.0; +http://www.google.com/bot.html)",
            "Sogou web spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
            "Sogou News Spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);",
            "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)",
            "Sosospider+(+http://help.soso.com/webspider.htm)",
            "Mozilla/5.0 (compatible; Yahoo! Slurp China; http://misc.yahoo.com.cn/help.html)"
        ]
        ua = random.choice(uas)
    headers = {
        "User-Agent": ua,
        "Referer": refer,
        "Host": host  # requests drops headers whose value is None
    }
    return headers
def get_html(url, ret_type="text", timeout=50, encoding="utf-8"):
    headers = get_headers()
    res = requests.get(url, headers=headers, timeout=timeout)
    res.encoding = encoding
    # print(res.status_code)
    # print(res.text)
    if ret_type == "text":
        return res.text
    elif ret_type == "image":
        return res.content
    elif ret_type == "json":
        return res.json()
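A quick sanity check of get_html, assuming the module is saved as util.py as imported above; the URL is just page 1 of the example blog:

if __name__ == '__main__':
    # fetch one list page and confirm the article-item markup is actually in the response
    html = get_html('https://dream.blog.csdn.net/article/list/1')
    print(len(html), 'article-item-box' in html)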
OK, mm-hmm, that's about it... go try scraping your own article list!! Just change this part:
# blogger homepage link
blog_url = 'https://blog.csdn.net/博主名称'
# check the blogger's homepage yourself for the number of list pages
pages = 2
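If you would rather not count the pages by hand, one possible variation (my own assumption, not part of the original script) is to keep requesting pages until a page yields no new article rows:

page = 1
while True:
    html = get_html(f'{blog_url}/article/list/{page}')
    before = len(ats)
    ats = get_ats(html, ats)
    if len(ats) == before:  # no new rows parsed: assume we went past the last page
        break
    page += 1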
One more thing: when you save as CSV and then open it with Office Excel, the Chinese text comes out garbled.
Don't worry~~ open the file in Notepad first, re-save it with the encoding set to ANSI, and it will display fine.
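The garbling happens because Excel does not assume UTF-8 for a CSV without a byte-order mark. As an alternative to the Notepad round-trip, here is a small sketch of the same write using the utf-8-sig encoding (the file name is just a placeholder), which lets Excel detect the encoding directly:

with open('./files/articles.csv', 'w', encoding='utf-8-sig') as f:  # placeholder file name
    for row in ats:
        f.write(','.join(row) + '\n')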