# Scraping sci-fi novels with a regular expression
import urllib.request
from urllib.request import ProxyHandler,build_opener
import random,re
url = 'https://www.qidian.com/free/all/chanId10/'
# Build a pool of random User-Agent strings
# Purpose of the request header: disguise the request as coming from a browser when fetching data
user_agent_list = [
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36'
]
# Build a pool of proxy IPs
ip_list = [
    {'http': 'http://115.218.3.11:9000'},
    {'http': 'http://220.248.188.75:808'},
    {'http': 'http://220.196.229.131:8118'}
]
# Pick a random request header
headers = {
    'User-Agent': random.choice(user_agent_list)
}
# Build the request object
request = urllib.request.Request(url,headers=headers,method='GET')
# Create a proxy handler with a randomly chosen proxy
proxy_handler = ProxyHandler(random.choice(ip_list))
# Build an opener that sends requests through the proxy handler
opener = build_opener(proxy_handler)
# Use the opener to send the request to the target site
response = opener.open(request)
resp = response.read().decode('utf-8')
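# Note: the free proxies in ip_list are public addresses and may well be offline.
# A minimal defensive sketch (an assumption, not part of the original flow):
# wrap the request in try/except and give it a timeout.
#
# from urllib.error import URLError
# try:
#     response = opener.open(request, timeout=10)
#     resp = response.read().decode('utf-8')
# except URLError as e:
#     print('Request through the proxy failed:', e)
#     raise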
# Compile the regular expression that matches one book entry
re_expression = r'<div class="book-mid-info">.*? title=".*?">(.*?)</a>.*? target="_blank">(.*?)</a>.*?data-eid="qd_B60">(.*?)</a>.*?data-eid="qd_B61">(.*?)</a>.*?<p class="intro"> (.*?)</p>'
pattern_obj = re.compile(re_expression, re.S)  # re.S lets .*? span the newlines inside each book's HTML block
# Capture groups, in order: (1) book title, (2) author,
# (3) major category, (4) sub-category, (5) synopsis
res = re.findall(pattern_obj,resp)
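# Sanity check (a sketch, commented out): an empty result list usually means the
# page structure changed or the request was blocked, so the regex matched nothing.
#
# if not res:
#     raise RuntimeError('No books matched; inspect resp and the regular expression')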
for u in res:
    novel_name = u[0]
    # print(novel_name)
    novel_author = u[1]
    # print(novel_author)
    # Get the major category
    novel_big_class = u[2]
    # print(novel_big_class)
    # Get the sub-category
    novel_small_class = u[3]
    # print(novel_small_class)
    # Get the synopsis
    novel_intro = u[4]
    # print(novel_intro)
    # Save the data
    file = open('起点中文之武悬疑.csv', 'a', encoding='gb18030')
    file.write(novel_name + ',' + novel_author + ',' + novel_big_class + ',' + novel_small_class + ',' + novel_intro)
    file.write('\n')
    file.close()
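# A safer alternative for saving (a sketch, assuming the same field order): the
# synopsis text can itself contain commas, which would break a hand-built CSV line.
# Python's built-in csv module quotes such fields automatically.
#
# import csv
# with open('起点中文之武悬疑.csv', 'w', newline='', encoding='gb18030') as f:
#     writer = csv.writer(f)
#     writer.writerow(['name', 'author', 'big_class', 'small_class', 'intro'])
#     for name, author, big_class, small_class, intro in res:
#         writer.writerow([name, author, big_class, small_class, intro.strip()])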