创建工具类文件 tools.py:内含处理解析数据时出现的特殊字符的方法、SQL 动态存储数据的方法、Excel 表格动态存储数据的方法。
工具文件创建好之后,可以在不同的爬虫项目中引用,在以后编写爬虫项目时省去大量的时间精力。
# -*- coding: utf-8 -*-
__author__ = '木之易'
__date__ = '2018/8/16 9:40'
import re
import sqlite3
import xlwt
class ProcessStr(object):
    """Utilities for cleaning text fragments scraped from article pages."""

    @staticmethod
    def process_date(string):
        """Strip whitespace from *string* and split it into date and tags.

        The input looks like "2018/08/16 ·tag1 ·tag2": the first
        '·'-separated piece is the date, the remaining pieces are tags.

        :param string: raw text from the article meta line
        :return: (date, tags) tuple, tags joined with ';'
        """
        # Drop carriage returns, newlines and spaces in a single pass.
        cleaned = re.sub(re.compile(r'\r|\n| ', re.S), '', string)
        parts = cleaned.split('·')
        # First piece is the date; everything after it is a tag.
        return parts[0], ';'.join(parts[1:])
class SaveModel(object):
    """Dynamic sqlite3 persistence helper.

    Configure the class attributes (``dbname``, ``tablename``, ``fileds``)
    before use; each instance carries one row of values to insert.
    """

    # Database file name (passed straight to sqlite3.connect).
    dbname = ''
    # Name of the table to create / insert into.
    tablename = ''
    # Column names. NOTE(review): the misspelling 'fileds' is kept because
    # external callers assign to this attribute by name.
    fileds = []

    def __init__(self, *data):
        """Store one row of values, in the same order as ``fileds``."""
        self.conn = None
        self.cursor = None
        self.data = data

    def connect_sql(self):
        """Open the sqlite3 connection and cursor (creates the db file)."""
        self.conn = sqlite3.connect(self.dbname)
        self.cursor = self.conn.cursor()

    def close_sql(self):
        """Commit and release the cursor and connection."""
        self.conn.commit()
        self.cursor.close()
        self.conn.close()

    def create_table(self):
        """Create the table: an id primary key plus one CHAR column per field."""
        self.connect_sql()
        # try/finally so the connection is released even if execute() raises
        # (the original leaked it on error).
        try:
            # Table/column names come from trusted class configuration, so
            # plain string formatting is acceptable here.
            columns = ''.join(', {} CHAR'.format(f) for f in self.fileds)
            sql = 'CREATE TABLE IF NOT EXISTS {}(id INTEGER PRIMARY KEY{})'.format(
                self.tablename, columns)
            self.cursor.execute(sql)
        finally:
            self.close_sql()

    def save(self):
        """Insert ``self.data`` as one row.

        Uses '?' placeholders so sqlite3 handles quoting/escaping; the
        original interpolated the values directly into the SQL string,
        which broke on any value containing a double quote.
        """
        self.connect_sql()
        try:
            names = ', '.join(self.fileds)
            placeholders = ', '.join('?' for _ in self.fileds)
            sql = 'INSERT INTO {}({}) VALUES({})'.format(
                self.tablename, names, placeholders)
            self.cursor.execute(sql, self.data)
        finally:
            self.close_sql()
class SaveExcel(object):
    """Dynamic .xls writer.

    Configure the class attributes, call init_work() once to create the
    workbook and header row, then save() once per data row.
    """

    # Output .xls file name.
    filename = ''
    # Worksheet name.
    sheetname = ''
    # Column headers, written to row 0 by init_work().
    fields = []

    @classmethod
    def init_work(cls):
        """Create the workbook and sheet, then write the header row."""
        cls.workbook = xlwt.Workbook(encoding='utf-8')
        cls.sheet = cls.workbook.add_sheet(cls.sheetname)
        for col, name in enumerate(cls.fields):
            cls.sheet.write(0, col, name)

    @classmethod
    def save(cls, count, *data):
        """Write *data into row ``count`` and flush the workbook to disk."""
        print('正在写入第{}条数据,请稍后....'.format(count))
        for col, value in enumerate(data):
            cls.sheet.write(count, col, value)
        # Re-save after every row so a crash loses at most the current row.
        cls.workbook.save(cls.filename)
if __name__ == '__main__':
    # Smoke test: write one row through each storage helper.
    SaveModel.dbname = 'article.db'
    SaveModel.tablename = 'article'
    SaveModel.fileds = ['title', 'img_src', 'comment', 'commends', 'markbook', 'contents']
    model = SaveModel('1', '2', '3', '4', '5', '6')
    model.create_table()
    model.save()

    SaveExcel.filename = '伯乐在线.xls'
    SaveExcel.sheetname = 'article'
    SaveExcel.fields = ['title', 'commends', 'bookmarks']
    SaveExcel.init_work()
    SaveExcel.save(1, 'Mysql5.7', '2', '10')
创建功能类文件:用来爬取并解析网页内部的信息。
# -*- coding: utf-8 -*-
__author__ = '木之易'
__date__ = '2018/8/16 9:03'
import requests
from bs4 import BeautifulSoup
import xlwt
import sqlite3
import re
from tools import ProcessStr, SaveModel, SaveExcel
class BoleSpider(object):
    """Crawler for blog.jobbole.com.

    Walks the article-list pages, fetches every article's detail page and
    persists each article through SaveModel (sqlite) and SaveExcel (.xls).
    """

    def __init__(self):
        # Current list-page address; advanced as pagination is followed.
        self.url = 'http://blog.jobbole.com/all-posts/'
        self.html = ''
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0'
        }
        # Number of articles crawled so far (also the excel row index).
        self.count = 0
        # Consecutive failure count for the request in flight.
        self.retry = 0
        # BeautifulSoup tree of the most recently fetched page.
        self.bs = None

    def get_html(self, url):
        """
        Fetch *url*, storing the raw text in self.html and the parsed
        tree in self.bs. Retries up to 3 times before giving up.

        :param url: page address
        :return: None
        """
        try:
            self.retry += 1
            response = requests.get(url=url, headers=self.headers)
            self.html = response.text
            self.bs = BeautifulSoup(self.html, 'lxml')
        except Exception:
            if self.retry > 3:
                print('该页面请求失败,地址:{}'.format(url))
                # Reset the counter so the NEXT url gets its own 3 retries;
                # the original left it >3, making every later request give
                # up after a single failure.
                self.retry = 0
                return
            print('请求失败,正在重试第{}次请求....'.format(self.retry))
            self.get_html(url)
        else:
            # Success: clear the failure counter.
            self.retry = 0

    def parse_index(self):
        """Parse list pages: visit every article, then follow pagination.

        Iterative: the original recursed once per page, which risks
        RecursionError on deep pagination.
        """
        while True:
            # Cover-image anchors inside the article list.
            links = self.bs.select('#archive .post-thumb > a')
            for a in links:
                # Detail-page address and cover image URL.
                href = a.attrs['href']
                img = a.find('img')
                img_src = img.attrs['src']
                # Fetch the detail page, then extract and store its data.
                self.get_html(href)
                self.parse_detail(img_src)
            # Return to the current list page to locate the "next" link.
            self.get_html(self.url)
            next_link = self.bs.find(class_='next')
            if not next_link:
                print('没有下一页数据了,爬虫结束!')
                return
            href = next_link.attrs['href']
            print(href)
            # Advance to the next list page.
            self.url = href
            self.get_html(href)

    def parse_detail(self, img_src):
        """Extract one article's fields from self.bs and persist them.

        :param img_src: cover image URL taken from the list page
        """
        self.count += 1
        print('正在爬取第{}篇文章,请稍后....'.format(self.count))
        # Title — guard the header lookup (the original raised
        # AttributeError when .entry-header was missing).
        header = self.bs.find(class_='entry-header')
        h1 = header.find('h1') if header else None
        title = h1.text if h1 else '无标题'
        # Publish date and tags from the meta line.
        p = self.bs.find(class_="entry-meta-hide-on-mobile")
        date, tags = ProcessStr.process_date(p.text) if p else ('', '')
        # Original / translation source links.
        from_where = self.bs.find(class_='copyright-area')
        source_path = '未知'
        trans_path = '未知'
        if not from_where:
            # Fall back to the first paragraph of the article body.
            entry = self.bs.find(class_="entry")
            from_where = entry.find('p') if entry else None
        if from_where:
            links = from_where.select('a')
            if len(links) == 2:
                source_path = links[0].attrs['href']
                trans_path = links[1].attrs['href']
            elif len(links) == 1:
                source_path = links[0].attrs['href']
        # Article body text; double quotes are replaced with single quotes
        # (kept for backward-compatible stored output).
        entry_node = self.bs.find(class_='entry')
        contents = entry_node.text if entry_node else ''
        contents = re.sub(re.compile(r'"', re.S), "'", contents)
        # Upvotes ("赞") — the <h10> node may be absent; the original
        # crashed with AttributeError in that case.
        h10 = self.bs.find('h10')
        commends = h10.text if h10 else ''
        if commends == '':
            commends = 0
        # Bookmarks: text like "3 收藏" -> keep the leading number.
        btn = self.bs.find(class_='bookmark-btn')
        bookmarks = btn.text.strip().split(' ') if btn else []
        if len(bookmarks) >= 2:
            bookmarks = bookmarks[0]
        else:
            bookmarks = 0
        # Comment count — default to 0; the original left the empty result
        # list in `comment` when the selector matched nothing.
        comment = 0
        nodes = self.bs.select('a[href="#article-comment"] > span')
        if nodes:
            coms = nodes[0].text.strip().split(' ')
            comment = coms[0] if len(coms) >= 2 else 0
        # Persist to sqlite and to the excel sheet.
        m = SaveModel(title, img_src, date, tags, source_path, trans_path,
                      contents, commends, bookmarks, comment)
        m.save()
        SaveExcel.save(self.count, title, img_src, date, tags, source_path,
                       trans_path, contents, commends, bookmarks, comment)

    def run(self):
        """Crawler life-cycle: fetch the first list page, then parse it."""
        self.get_html(self.url)
        self.parse_index()
if __name__ == '__main__':
    # Database name, table name and column names for SaveModel.
    SaveModel.dbname = 'bole.db'
    SaveModel.tablename = 'bole'
    SaveModel.fileds = ['title', 'img_src', 'date', 'tags', 'source_path', 'trans_path', 'contents', 'commends', 'bookmarks', 'comment']
    # Create the table once up front.
    SaveModel().create_table()
    # Excel output configuration.
    SaveExcel.filename = '伯乐在线.xls'
    SaveExcel.sheetname = 'article'
    SaveExcel.fields = ['title', 'img_src', 'date', 'tags', 'source_path', 'trans_path', 'contents', 'commends', 'bookmarks', 'comment']
    SaveExcel.init_work()
    # Launch the crawler.
    BoleSpider().run()