导入文件
import requests
from lxml import etree
import MySQLdb
import datetime
初始化爬取网页和代理头
def __init__(self):
self.url="https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=0&type=T"
self.headers={
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
}
}
获取网页内容
def getData(self,url):#获取网页内容
response=requests.get(url,headers=self.headers)
response=response.content.decode()
return response
解析处理数据
def parseData(self,data,i):
html=etree.HTML(data)
el_list=html.xpath('//*[@id="subject_list"]/ul/li/div[2]')
for el in el_list:
data_dict={}
data_dict['title']=el.xpath('./h2/a/text()')[0].strip()
data_dict['author']=el.xpath('./div[1]/text()')[0].strip().split(" / ")[0]
data_dict['publisher']=el.xpath('./div[1]/text()')[0].strip().split(" / ")[-3]
#处理日期数据
date=el.xpath('./div[1]/text()')[0].strip().split(" / ")[-2].replace("年", "-").replace("日", "-").replace("月", "-").replace(".", "-").strip("-")
date=date.split("-")
if(len(date)<=2):
date=datetime.datetime(int(date[0]),int(date[1]),1)
else:
date=datetime.datetime(int(date[0]),int(date[1]),int(date[2]))
data_dict['publish_time']=date
#处理数字数据
data_dict['price']=el.xpath('./div[1]/text()')[0].strip().split(" / ")[-1].strip()
rating=el.xpath('./div[2]/span[2]/text()')[0].strip()
data_dict['rating']=float(rating)
data_dict['comments']=el.xpath('./div[2]/span[3]/text()')[0].strip()
#处理简介
if(el.xpath('./p/text()')):
data_dict['introduction']=el.xpath('./p/text()')[0].strip()
else:
data_dict['introduction']=""
data_list.append(data_dict)
#翻页
self.url="https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start={}&type=T".format(i*20)
数据爬取用xml定位数据
其中日期类型格式样式太乱,需要处理转成datetime类型
将爬出数据保存到mysql
def savedb(self):
db = MySQLdb.connect("localhost", "root", "root", "db1", charset='utf8mb4' )
cursor = db.cursor()
#创建表
try:
# 执行sql语句
cursor.execute('''CREATE TABLE novel(
id INT PRIMARY KEY AUTO_INCREMENT,
title VARCHAR(32) NOT NULL,
author VARCHAR(32) NOT NULL,
publisher VARCHAR(32) NOT NULL,
publish_time DATE,
price varchar(32),
rating DOUBLE,
comments VARCHAR(32),
introduction VARCHAR(128)
);''')
for i in range(len(data_list)):
sql = "INSERT INTO novel(title,author,publisher,publish_time,price,rating,comments,introduction) VALUES ( %s, %s, %s, %s, %s, %s, %s, %s)"
cursor.execute(sql,[data_list[i]['title'],data_list[i]['author'],data_list[i]['publisher'],data_list[i]['publish_time'],data_list[i]['price'],data_list[i]['rating'],data_list[i]['comments'],data_list[i]['introduction']])
db.commit()
except:
# Rollback in case there is any error
db.rollback()
# 关闭数据库连接
db.close()
保存数据时遇到emoj表情数据,mysql无法保存
解决办法:将数据库字符集改为utf8mb4 注:需mysql5.53以上版本
保存结果展示:
查询数据库内容
def query(self,sql):
db = MySQLdb.connect("localhost", "root", "root", "db1", charset='utf8mb4' )
# 使用cursor()方法获取操作游标
cursor = db.cursor()
try:
# 执行sql语句
cursor.execute(sql)
results = cursor.fetchall()
for row in results:
title=row[1]
author=row[2]
publisher=row[3]
publish_time=row[4]
price=row[5]
rating=row[6]
comments=row[7]
introduction=row[8]
print("书名:"+title+",作者:"+author+",出版社:"+publisher+",出版时间:"+str(publish_time)+",价格:"+price+",评分:"+str(rating)+",评价人数:"+comments)
print("简介\n"+introduction)
except:
# Rollback in case there is any error
db.rollback()
# 关闭数据库连接
db.close()
运行函数
def run(self):
for i in range(1,11):
data=self.getData(self.url)
self.parseData(data,i)
self.savedb()
主函数
if __name__ =='__main__':
douban=Douban()
douban.run()
douban.query("SELECT * FROM novel WHERE id=1;")
查询结果展示