I'm still learning; if anything here is off, corrections and criticism are welcome (love you, finger heart).
Main file, filename: qimao_paihangbang.py (all six files can be copied and run as-is; don't forget to install the required modules).
from deal_request_data import deal_data, deal_data_json
from in_mysql import in_mysql as mq
from download_book import get_chapter
from to_excel import to_excel
import openpyxl
import os
if __name__ == '__main__':
# baseurl = 'https://www.qimao.com/paihang/boy/hot/date/'
# deal_data(baseurl)
json_url = 'https://www.qimao.com/api/rank/book-list?is_girl=0&rank_type=1&date_type=1&date=202408&page=1'
# mq(json_url,'hot_data_list')
print('-------------------------------------------------------------')
print('-------------------------------------------------------------')
print('-------------------------------------------------------------')
    is_girl = int(input("Choose a list and enter its number (1 = boys, 2 = girls): ")) - 1
print('-------------------------------------------------------------')
    rank_type_list = ['1. Hot list', '2. New-book list', '3. Completed list', '4. Favorites list', '5. Update list']
table_name = ['hot_list', 'new_list', 'over_list', 'collect_list', 'update_list']
    top_name_num = int(input(f"Enter the number of a rank type {rank_type_list}: "))
    # Rank types 4 and 5 differ from the first three on the API side, so they need extra handling
    if 1 <= top_name_num <= 5:  # int(input()) already guarantees a whole number
if top_name_num == 5:
            rank_type = top_name_num + 1  # the API appears to use rank_type=6 for the update list
else:
rank_type = top_name_num
        if top_name_num in (1, 2, 3):  # only the first three lists take a date
            date_type = input("Choose a period and enter its number (1 = daily, 2 = monthly): ")  # kept as a string; it is only interpolated into the URL
            data = '202408'  # the month for the monthly list is hardcoded here
else:
date_type = 1
data = ''
print('-------------------------------------------------------------')
print('-------------------------------------------------------------')
        start_page = int(input("Enter the first page to scrape: "))
        end_page = int(input("Enter the last page to scrape (the rankings only hold five pages): "))
        # header row, written with openpyxl
        head = ['Book ID', 'Detail URL', 'Cover URL', 'Rank', 'Title', 'Author', 'Genre', 'Status',
                'Word count', 'Synopsis', 'Latest chapter', 'Last updated', 'Popularity']
        # create the output folder
        path = './download/excels/'
        print('Checking whether the folder exists...')
        os.makedirs(path, exist_ok=True)  # also creates ./download if it is missing
openpyxl_data = openpyxl.Workbook()
table = openpyxl_data.active
        for col, h in enumerate(head, start=1):  # write the header row
            table.cell(1, col, h)
openpyxl_data.save(f'{path}七猫小说数据排行榜_openpyxl_{table_name[top_name_num-1]}.xlsx')
        # Page range: e.g. 1-3 scrapes pages 1 through 3 inclusive (hence end_page + 1 in range); the rankings hold at most five pages of data
for i in range(start_page,end_page+1):
page = i
json_url = f'https://www.qimao.com/api/rank/book-list?is_girl={is_girl}&rank_type={rank_type}&date_type={date_type}&date={data}&page={page}'
            #deal_data_json(url=json_url)  # process the data
            #mq(json_url,table_name[top_name_num-1])  # write to the database
            #get_chapter(json_url)  # download the books locally
            to_excel(json_url,table_name[top_name_num-1])  # save to Excel
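One thing the main script takes on faith is that the user types valid integers: a mistyped answer makes int(input()) raise ValueError. A minimal guard could look like this (a sketch; ask_int is a hypothetical helper, not one of the six files):

def ask_int(prompt, low, high):
    # re-prompt until the user enters an integer in [low, high]
    while True:
        try:
            value = int(input(prompt))
        except ValueError:
            print('Please enter a number.')
            continue
        if low <= value <= high:
            return value
        print(f'Please enter a number between {low} and {high}.')

# e.g.: top_name_num = ask_int(f"Enter the number of a rank type {rank_type_list}: ", 1, 5)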
1. Send the request and get the JSON data. Filename: tongyong_request.py
The request is sent with the requests library; the header block can be generated quickly and conveniently via the link: 地址.
As shown in the screenshot below, paste the copied request straight into that site (mind the request method):
import requests
def tx_request(url):
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
'If-None-Match': '"19731-kcGg+jGeeycY5mZfXMVsNx2x3pI"',
'Referer': 'https://www.qimao.com/',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'same-origin',
'Sec-Fetch-User': '?1',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36',
'sec-ch-ua': '"Chromium";v="128", "Not;A=Brand";v="24", "Google Chrome";v="128"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"'
}
response = requests.get(url=url, headers=headers)
response.encoding = 'UTF-8'
    # The commented-out branch below dates from when data came in two ways (HTML page vs. JSON API);
    # with the JSON API alone it is unnecessary
    '''if 'page' in url:
        result = response.json()
    else:
        result = response.text'''
    ############################# the commented block above can be ignored #############################
    result = response.json()
return result
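A quick smoke test for this module (a sketch; it assumes the rank API is reachable and that the response carries the data -> page_data / table_data fields the later steps consume). One caveat: the hardcoded If-None-Match header can make the server answer 304 Not Modified with an empty body, which breaks response.json(); drop that header if this happens.

if __name__ == '__main__':
    test_url = 'https://www.qimao.com/api/rank/book-list?is_girl=0&rank_type=1&date_type=1&date=202408&page=1'
    result = tx_request(test_url)
    print(result['data']['page_data'])        # pagination info
    print(len(result['data']['table_data']))  # number of books on this page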
2. Process the returned JSON data. Filename: deal_request_data.py
There is not much to explain about this part: it just walks the response and pulls out the fields we want.
If you work straight from the JSON data, use the deal_data_json function below.
from bs4 import BeautifulSoup
# everything is split across files, so import step 1 here
from tongyong_request import tx_request as t_request
#### this part extracts the data from the HTML page
def deal_data(url):
response_html = t_request(url)
soup = BeautifulSoup(response_html, 'html.parser')
txt_list_html = soup.find_all('li', 'rank-list-item')
for items in txt_list_html:
        # detail-page URL, cover image, and rank
book_detail = items.find('div', 'pic')('a')[0]['href']
book_pic = items.find('img')['src']
book_paiming = items.find('span').text
        # book info: title, author, genre, status, word count, synopsis, latest chapter, last update time
book_info = items.find('div', 'txt')
        book_name = book_info('a')[0].text  # title
        book_author = book_info('a')[1].text  # author
        book_type = book_info('a')[2].text + ':' + book_info('a')[3].text  # genre
        book_line = book_info.find_all('em')
        book_state = book_line[3].text  # status
        book_word_count = book_line[5].text  # word count
        book_introduce = book_info.find('span', 's-book-intro').text  # synopsis
        book_update_chapter = book_info.find('span', 's-book-update')('a')[0].text  # latest chapter
        book_update_time = book_info.find('span', 's-book-update')('em')[0].text  # last update time
        # book popularity
        book_rank = items.find('span', 'rank-change-num')('em')
        book_rank_num = book_rank[2].text + ':' + book_rank[0].text + book_rank[1].text  # popularity
# this part processes the returned JSON data
def deal_data_json(url):
res_json = t_request(url)['data']
    paiming = (int(res_json['page_data']['page'])-1) * int(res_json['page_data']['page_size'])  # rank offset: number of books on the preceding pages
book_list = []
j = 1
for book in res_json['table_data']:
data = []
book_id = book['book_id']
book_detail = book['book_url']
book_pic = book['image_link']
book_paihang = int(paiming) + j
j += 1
book_name = book['title']
book_author = book['author']
book_type = book['category1_name']+':'+book['category2_name']
        if book['is_over'] == '0':
            book_state = 'Ongoing'
        elif book['is_over'] == '1':
            book_state = 'Completed'
        else:
            book_state = 'Unknown'  # guard so book_state is always bound
book_word_count = book['words_num']
book_introduce = book['intro'].replace('\n','')
        book_update_chapter = 'Latest update: ' + book['latest_chapter_title']
book_update_time = book['update_time']
        book_rank_num = 'Popularity: ' + book['number'] + book['unit']
data.extend([book_id,book_detail,book_pic,book_paihang,
book_name,book_author,book_type,book_state,
book_word_count,book_introduce, book_update_chapter,book_update_time ,book_rank_num
])
book_list.append(data)
return book_list
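A quick way to eyeball the parsed rows (a sketch, under the same assumption that the API responds):

if __name__ == '__main__':
    from pprint import pprint
    test_url = 'https://www.qimao.com/api/rank/book-list?is_girl=0&rank_type=1&date_type=1&date=202408&page=1'
    rows = deal_data_json(test_url)
    # each row: [book_id, detail URL, cover URL, rank, title, author, genre,
    #            status, word count, synopsis, latest chapter, update time, popularity]
    pprint(rows[0] if rows else 'no data returned')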
3. Write the processed data into the database. Filename: in_mysql.py
import pymysql
# the processed JSON data, i.e. step 2
from deal_request_data import deal_data_json as json_data
# this import can be ignored; it is the HTML-page parser
from deal_request_data import deal_data as html_data
# table_name and url are both passed in from the main script
def in_mysql(url, table_name):
    # connect to the database
conn = pymysql.connect(
host='127.0.0.1',
port=3306,
user='root',
password='xxxxx'
)
    # get a cursor
cursor = conn.cursor()
    # check whether the database already exists
cursor.execute('show databases')
    dbs = cursor.fetchall()  # fetchall returns one-element tuples, hence the unpacking below
db_list = []
for db in dbs:
        db_list.append(db[0])  # collect the plain database names into a new list
    if 'txt' not in db_list:  # create the database if it does not exist yet
        db_create_sql = """create database txt default character set utf8 collate utf8_general_ci"""
        cursor.execute(db_create_sql)
    # switch to the database
cursor.execute('use txt')
    # same existence check for the table
cursor.execute('show tables')
tables = cursor.fetchall()
tables_list = []
for table in tables:
tables_list.append(table[0])
if table_name not in tables_list:
        # create the table
table_create = f"""create table {table_name}(
id int not null primary key auto_increment,
book_id varchar(255),
book_detail varchar(255),
book_pic varchar(255),
book_paihang varchar(255),
book_name varchar(255),
book_author varchar(255),
book_type varchar(255),
book_state varchar(255),
book_word_count varchar(255),
book_introduce varchar(1000),
book_update_chapter varchar(255),
book_update_time varchar(255),
book_rank_num varchar(255)
)"""
cursor.execute(table_create)
conn.commit()
    # insert statement
sql_insert_info = f"""insert into {table_name}
(book_id,book_detail,book_pic,book_paihang,book_name,book_author,
book_type,book_state,book_word_count,book_introduce,
book_update_chapter,book_update_time ,book_rank_num)
values
(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""
    # update statement
sql_update = f"""update {table_name} set
book_detail=%s,book_pic=%s,book_paihang=%s,book_name=%s,
book_author=%s,book_type=%s,book_state=%s,book_word_count=%s,
book_introduce=%s, book_update_chapter=%s,book_update_time =%s,book_rank_num=%s
where book_id=%s"""
    # this if-check can be safely ignored (every URL built by the main script contains 'page')
if 'page' in url:
book_data = json_data(url)
for items in book_data:
            # check whether the row already exists: update it if so, insert it otherwise
            cursor.execute(f'select * from {table_name} where book_id=%s', (items[0],))
line_data = cursor.fetchall()
if len(line_data) > 0:
values = (items[1], items[2], items[3], items[4], items[5], items[6], items[7], items[8], items[9],
items[10], items[11], items[12], items[0])
cursor.execute(sql_update,values)
conn.commit()
                print(f'Row updated. Book rank / title: {items[3]} / {items[4]}')
else:
cursor.execute(sql_insert_info, (
items[0], items[1], items[2], items[3], items[4], items[5], items[6], items[7], items[8], items[9],
items[10], items[11], items[12]))
conn.commit()
                print(f'Book {items[3]} ({items[4]}) saved; check the {table_name} table in the database!')
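The select-then-insert/update round trip above works, but MySQL can do the same in a single statement. A sketch of that alternative, assuming a unique index is added on book_id (the original table_create does not define one):

def upsert_books(cursor, table_name, book_data):
    # one statement per row instead of select + insert/update; requires
    # (hypothetical, not in the original schema):
    #     alter table <table_name> add unique (book_id);
    sql_upsert = f"""insert into {table_name}
        (book_id,book_detail,book_pic,book_paihang,book_name,book_author,
         book_type,book_state,book_word_count,book_introduce,
         book_update_chapter,book_update_time,book_rank_num)
        values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
        on duplicate key update
            book_paihang=values(book_paihang),
            book_update_chapter=values(book_update_chapter),
            book_update_time=values(book_update_time),
            book_rank_num=values(book_rank_num)"""
    for items in book_data:
        cursor.execute(sql_upsert, tuple(items))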
4. Write the processed data into an Excel file. Filename: to_excel.py
from deal_request_data import deal_data_json as json_data
import openpyxl
def to_excel(url,table_name):
txt_list = json_data(url)
    path = './download/excels/'
    # openpyxl: load the workbook and append rows
open_data = openpyxl.load_workbook(f'{path}七猫小说数据排行榜_openpyxl_{table_name}.xlsx')
    # the active sheet is used by default
open_table = open_data.active
    for row in txt_list:  # openpyxl's append() writes each list as the next row
        open_table.append(row)
    # save
open_data.save(f'{path}七猫小说数据排行榜_openpyxl_{table_name}.xlsx')
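To confirm the rows actually landed in the sheet, a small check (a sketch; it reopens the same file to_excel just saved):

import openpyxl

def count_saved_rows(table_name):
    # reopen the workbook and count data rows (max_row minus the header row)
    path = './download/excels/'
    wb = openpyxl.load_workbook(f'{path}七猫小说数据排行榜_openpyxl_{table_name}.xlsx')
    return wb.active.max_row - 1

# e.g.: print(count_saved_rows('hot_list'))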
5. Download the non-VIP chapters of the books. Filename: download_book.py
from deal_request_data import deal_data_json  # the processed data from step 2
import requests
from bs4 import BeautifulSoup
import os
def get_chapter(url):
book_url = deal_data_json(url)
for item in book_url:
        book_id_num = item[0]  # the book_id
        book_name = item[4]  # the book title
        # make sure the book's download folder exists
        path = f'./download/{book_name}/'
        os.makedirs(path, exist_ok=True)  # also creates ./download if it is missing
        # fetch every chapter's metadata, then request each chapter page and pull out the text
all_chapter_url = f'https://www.qimao.com/api/book/chapter-list?book_id={book_id_num}'
response = requests.get(url=all_chapter_url).json()
result = response['data']['chapters']
for chapter in result:
            # chapter ID
chapter_id = chapter['id']
            # chapter title
chapter_title = chapter['title']
chapter_index = chapter['index']
            # is the chapter VIP-only?
chapter_vip = chapter['is_vip']
            if chapter_vip == '0':
                chapter_is_vip = 'no VIP needed'
                # build the single-chapter URL
                chapter_url = f'https://www.qimao.com/shuku/{book_id_num}-{chapter_id}/'  # chapter page URL
            elif chapter_vip == '1':
                print(f'{book_name}: the remaining chapters require VIP and are not supported for scraping')
                break  # stop processing this book
            # parse the chapter HTML
            chapter_request_html = requests.get(url=chapter_url).text
            soup = BeautifulSoup(chapter_request_html, 'html.parser')
chapter_title_content = soup.find('div', 'chapter-detail-wrap-info').text
chapter_content = soup.find('div', 'article').text
content = chapter_title_content + '\n' + chapter_content
            # save the chapter as UTF-8 text (no need to go through bytes)
            with open(path + chapter_title + '.txt', 'w', encoding='utf-8') as f:
                f.write(content)
            print_info = f'{book_name}: {chapter_title} --------- downloaded!'
            print(print_info)
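Two easy hardening steps for the downloader (a sketch; safe_filename is a hypothetical helper, not part of the original six files): pause between chapter requests so the site is not hammered, and strip characters that Windows forbids in filenames before using chapter_title as one.

import re
import time

def safe_filename(name):
    # replace characters that are invalid in Windows filenames
    return re.sub(r'[\\/:*?"<>|]', '_', name)

# inside the chapter loop, before saving:
#     time.sleep(1)  # be polite: roughly one request per second
#     with open(path + safe_filename(chapter_title) + '.txt', 'w', encoding='utf-8') as f:
#         f.write(content)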