For work I recently needed to scrape the lists of posts published on forums such as oschina, but for a site like cnblog a browser plugin such as Web Scraper is of little use: cnblog's front end appears to apply anti-scraping measures to its static resources, and in any case the scraped output was hard to turn into usable data. So I wrote a few Python scripts to pull the data myself.
Input:
An Excel file with the article details, including titles, links, and so on; in practice a single column (the article links) is all the script needs.
Output:
The article titles and view counts. Other article fields could be pulled out as well, but I didn't need them, so I only scraped these two.
Rough approach:
- Work out regex patterns that match the title and view count in oschina's front-end pages, and keep them for later (a minimal sketch follows this list).
- Read the article URLs into a list, iterate over it, use Python's urllib.request module to build a request with headers, fetch the page, and run the title and view-count regexes against it to extract the actual data.
- Use Python's xlwt module to create a new Excel file and save the scraped titles and view counts as columns.
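Before the full script, here is a minimal sketch of that regex step, assuming a made-up HTML snippet standing in for the real oschina page source:

import re

# Made-up HTML standing in for the real page source (assumption)
html = '<div class="item lm"> 1024 </div>'

readNum_pattern = r'<div class="item lm">([\s\S]*?)</div>'
matches = re.findall(readNum_pattern, html)  # findall returns the captured groups
print(matches[0].strip())  # -> 1024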
The code:
import re
from urllib.error import HTTPError, URLError
from urllib.request import Request, urlopen

import xlrd
import xlwt

class Spider():
    """
    Scrape article titles and view counts from oschina.
    """
    # url = f'https://blog.csdn.net/weixin_44708240/article/details/116270210'
    title_root_pattern = r'<h1 class="article-box__title">([\s\S]*?)</h1>'
    title_pattern = r'<a href="([\s\S]*?)" target="_blank">([\s\S]*?)</a>'
    # num_root_pattern = r'<div class="bar-content">([\s\S]*?)</div>'
    readNum_pattern = r'<div class="item lm">([\s\S]*?)</div>'

    def __fetch_content(self, url):
        """
        Build a request with a browser User-Agent and return the page HTML.
        """
        try:
            headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'}
            # Build the request; without a User-Agent oschina answers with 403
            req = Request(url, headers=headers)
            htmls = urlopen(req).read()
            htmls = str(htmls, encoding='utf-8')
            return htmls
        except (HTTPError, URLError) as e:
            print(str(e) + ', the failing url (likely 403) is ' + str(url))
            return None

    def __getTitle(self, htmls):
        """
        Extract the (link, title) pairs from the article header block.
        """
        titles = []
        root_html = re.findall(Spider.title_root_pattern, htmls)
        for html in root_html:
            title = re.findall(Spider.title_pattern, html)
            data = {'title': title}
            titles.append(data)
            print(title)
        return titles

    def __getReadNum(self, htmls):
        """
        Extract the view-count blocks from the page.
        """
        nums = []
        num = re.findall(Spider.readNum_pattern, htmls)
        data = {'num': num}
        nums.append(data)
        return nums

    def __getUrl(self):
        """
        Read the article URLs from the source Excel file.
        """
        worksheet = xlrd.open_workbook(r'D:\sheet.xls')
        sheet_names = worksheet.sheet_names()
        print(sheet_names)
        sheet = worksheet.sheet_by_index(1)
        # Read the URL column of this sheet; in my file it is column 3
        rows = sheet.nrows
        all_content = []
        for i in range(rows):
            cell = str(sheet.cell_value(i, 3))
            all_content.append(cell)
        return all_content

    def go(self):
        """
        Fetch each page, regex-match the wanted fields, and write them to Excel.
        """
        csdn = xlwt.Workbook(encoding='utf-8')  # create the output workbook
        sheet = csdn.add_sheet('csdn', cell_overwrite_ok=True)  # create the sheet
        url = self.__getUrl()
        row = 0
        for i in url:
            try:
                htmls = self.__fetch_content(i)
                if htmls is None:
                    continue
                title = self.__getTitle(htmls)
                num = self.__getReadNum(htmls)
                # The split("'")[5] indexing depends on the exact shape of the
                # regex matches on the pages I scraped; adjust it if yours differ
                print('Post: ' + str(title).split("'")[5] + ' views: ' + str(num).split("'")[5])
                sheet.write(row, 0, str(title).split("'")[5])
                sheet.write(row, 1, str(num).split("'")[5])
                # Save after every row; the output path needs to be customised
                csdn.save('D:\\pycharmProject\\oschinaData.xls')
                row += 1
            except Exception as e:
                print(str(e) + ', the failing url is ' + str(i))


# Run the spider
spider = Spider()
spider.go()
Note: when scraping oschina you have to build headers and pass them into the request; other forums such as cnblog don't need them, and a plain urlopen("url") works directly.
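A minimal sketch of the difference (the two URLs here are only placeholders):

from urllib.request import Request, urlopen

# cnblog and similar sites: a plain urlopen is enough
html = urlopen('https://www.cnblogs.com/').read().decode('utf-8')

# oschina: build headers with a browser User-Agent, or the request is rejected
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
req = Request('https://www.oschina.net/', headers=headers)
html = urlopen(req).read().decode('utf-8')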
The scraped results are as follows: