novels

# 正则爬取科幻小说

import csv
import random
import re
import urllib.request
from urllib.request import ProxyHandler, build_opener


# Target page: the Qidian free-book listing that this script scrapes.
url = 'https://www.qidian.com/free/all/chanId10/'

# Pool of User-Agent strings. One is picked at random below so that the
# request presents itself as an ordinary browser instead of a script.
user_agent_list = [
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
]

# Pool of proxy mappings, each in the {scheme: proxy_url} form.
ip_list = [
    {'http': 'http://115.218.3.11:9000'},
    {'http': 'http://220.248.188.75:808'},
    {'http': 'http://220.196.229.131:8118'},
]

# Request headers carrying the randomly chosen User-Agent.
headers = {'User-Agent': random.choice(user_agent_list)}
# Describe the request: a GET for the listing page that carries the
# spoofed User-Agent header.
request = urllib.request.Request(url, headers=headers, method='GET')

# Wrap one randomly chosen proxy mapping in a handler and build an opener
# that routes requests through it.
# NOTE(review): every mapping in ip_list only covers the 'http' scheme while
# url is https, so the proxy is presumably not applied to this request --
# confirm, and add 'https' entries if proxying is actually required.
proxy_handler = ProxyHandler(random.choice(ip_list))
opener = build_opener(proxy_handler)

# Send the request. The response object owns the underlying connection, so
# it is opened in a with-block to guarantee it is closed once the body has
# been read (or if reading/decoding raises).
with opener.open(request) as response:
    # Page source as text; the body is decoded as UTF-8.
    resp = response.read().decode('utf-8')

# Regular expression for one book card on the listing page. Its five capture
# groups are, in order: title, author, main category, sub-category and intro
# text (the names the saving step gives to fields 0-4 of each match).
# NOTE(review): no re.DOTALL flag is passed, so '.' will not cross a '\n' --
# confirm that each card's markup contains no '\n' between these anchors.
re_expression = r'<div class="book-mid-info">.*? title=".*?">(.*?)</a>.*? target="_blank">(.*?)</a>.*?data-eid="qd_B60">(.*?)</a>.*?data-eid="qd_B61">(.*?)</a>.*?<p class="intro"> (.*?)</p>'
pattern_obj = re.compile(re_expression)

# All matches in the page source; each entry is a 5-tuple of the captured
# groups in pattern order.
res = pattern_obj.findall(resp)
# Persist every scraped record as one CSV row, appended to the output file.
#
# Each entry of res is already a tuple in column order -- title, author,
# main category, sub-category, intro -- so the tuples are written as-is.
# This puts the author in the second column (it was previously dropped and
# the intro written twice), and csv.writer quotes any field containing a
# comma, quote or line break so such text cannot split or corrupt a row.
if res:  # nothing matched: do not create or touch the output file
    # The file is opened once for the whole batch and closed by the
    # with-block even if a write fails. newline='' hands line-ending control
    # to the csv module, which terminates rows with '\r\n' by default.
    with open('起点中文之武悬疑.csv', 'a', encoding='gb18030', newline='') as csv_file:
        csv.writer(csv_file).writerows(res)
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值