Steps:
1. Connect to the database, decide on the table schema, and create the table.
(init_db.py)
# -*- coding: utf-8 -*-
"""
@Time : 2021/1/31 15:00
@Auth : 牟晋卓
@File :init_db.py
@IDE :PyCharm
"""
import sqlite3
conn = sqlite3.connect("qidian.db")
print("Connected to the database!")
c = conn.cursor()
# Define the columns to store and their types
sql = '''
create table novel
(id INTEGER not null PRIMARY KEY AUTOINCREMENT,
novel_title text,   -- novel title
novel_href text,    -- novel URL
novel_img text,     -- novel cover image
novel_date text,    -- novel date
novel_writer text,  -- novel author
novel_instru text   -- novel synopsis
);
'''
c.execute(sql)
conn.commit()
conn.close()
print("Table created!")
2. Crawl and scrape the page data.
(main.py)
# -*- coding: utf-8 -*-
"""
@Time : 2021/1/31 10:51
@Auth : 牟晋卓
@File :main.py
@IDE :PyCharm
"""
import requests
import time
import random
from lxml import etree
import sqlite3

def random_user_agent():
    ua_list = ['Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
               'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML like Gecko) Chrome/44.0.2403.155 Safari/537.36',
               'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36',
               'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36',
               'Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36',
               'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36']
    # pick a random User-Agent to make requests look less uniform
    return random.choice(ua_list)

# Build the request headers
headers = {
    'User-Agent': random_user_agent(),
}
# Crawl the listing pages and each novel's detail page
def getData(url, savePath):
    print("Crawling started...")
    for i in range(1, 31):
        novel_list = []
        print('Page %d' % i)
        time.sleep(random.randint(1, 3))  # wait 1-3 seconds between pages
        # refresh the User-Agent each page so the headers actually rotate
        headers['User-Agent'] = random_user_agent()
        relUrl = url + str(i)
        try:
            # send the headers as HTTP headers, not as query parameters
            response = requests.get(url=relUrl, headers=headers)
            if response.status_code != 200:
                continue
            html = response.text
        except Exception as e:
            if hasattr(e, "code"):
                print(e.code)
            if hasattr(e, "reason"):
                print(e.reason)
            continue
        selector = etree.HTML(html)  # parse the listing page
        items = selector.xpath('//ul[@class="cf"]//li')
        for item in items:
            novel_img = 'https:' + item.xpath('./div[@class="focus-img"]//a//img/@src')[0]
            novel_href = item.xpath('./div[@class="info"]//p//a/@href')[0]
            novel_date = item.xpath('./div[@class="info"]//span/text()')[0]
            novel_title = item.xpath('./div[@class="info"]//p//a/text()')[0]
            try:
                time.sleep(1)
                response = requests.get(url=novel_href, headers=headers)
                if response.status_code != 200:
                    continue
                info = response.text
            except Exception as e:
                if hasattr(e, "code"):
                    print(e.code)
                if hasattr(e, "reason"):
                    print(e.reason)
                print("Page %d: detail page %s could not be fetched!" % (i, novel_href))
                continue
            sel = etree.HTML(info)
            try:
                novel_writer = sel.xpath('//a[@class="writer"]/text()')[0]
            except Exception:
                novel_writer = ' '
            try:
                novel_instru = sel.xpath('//p[@class="intro"]/text()')[0]
            except Exception:
                novel_instru = ' '
            novel_list.append([novel_title, novel_href, novel_img, novel_date,
                               novel_writer, novel_instru])
        saveSql(novel_list, savePath)
    print("Crawling finished...")
    return "Program finished!"
def saveSql(novel_list, savePath):
    conn = sqlite3.connect(savePath)
    c = conn.cursor()
    print("Saving started!")
    # Use a parameterized query: wrapping values in quotes by hand breaks
    # on titles that contain quotes and invites SQL injection
    sqls = '''
    insert into novel (novel_title,novel_href,novel_img,novel_date,novel_writer,novel_instru)
    VALUES (?,?,?,?,?,?)
    '''
    for data in novel_list:
        c.execute(sqls, data)
    conn.commit()
    conn.close()
    print("Saving finished!")
if __name__ == "__main__":
    # getData appends the page number itself, so the URL ends with "page="
    baseurl = r"https://www.qidian.com/book/coverrec?page="
    savePath = r"qidian.db"
    getData(baseurl, savePath)
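Once a run completes, the stored rows can be spot-checked directly. A minimal sketch, assuming the qidian.db file and novel table defined in step 1:

import sqlite3

conn = sqlite3.connect("qidian.db")
c = conn.cursor()
# print the first few saved novels to verify the scrape landed in the table
c.execute("select id, novel_title, novel_writer from novel limit 5")
for row in c.fetchall():
    print(row)
conn.close()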
3. Summary
1. Give the crawler several request headers and rotate through them (to avoid anti-crawling measures).
2. If budget allows, set up an IP proxy pool and anti-crawling largely stops being a concern (it costs money, but that is fine when the company pays); see the sketch after this list.
3. For hobby crawling, avoid concurrency and multithreading to reduce the load on the target site (and to lower the risk of an IP ban).
4. That's all; an abrupt ending!!!
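For point 2, requests accepts a proxies mapping per request, so rotating through a pool just means choosing an address before each call. A minimal sketch; the pool addresses and the get_with_proxy helper are hypothetical placeholders, not a real provider's endpoints:

import random
import requests

# hypothetical proxy addresses -- substitute the ones your provider gives you
proxy_pool = [
    'http://127.0.0.1:8001',
    'http://127.0.0.1:8002',
]

def get_with_proxy(url, headers):
    proxy = random.choice(proxy_pool)
    # route both http and https traffic through the chosen proxy
    return requests.get(url, headers=headers,
                        proxies={'http': proxy, 'https': proxy},
                        timeout=10)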