整体实现描述
python爬取千千小说网 - 获取排行榜小说列表 并存入mysql数据库
PS:爬取数据过程中请遵守法律法规和道德规范,避免频繁访问单个网站或大量访问同一网站,并尊重网站的反爬机制。
存储小说列表的mysql数据表创建
-- Table that stores the scraped novel ranking list ("total clicks" board).
-- The original note warned about encoding problems; declaring the charset on
-- the table (utf8mb4, MySQL's full UTF-8) fixes that instead of relying on
-- session defaults, so Chinese titles/authors round-trip correctly.
drop table if exists chengt.spider_novel;
create table if not exists chengt.spider_novel(
    id       int,            -- 1-based rank position on the page
    type     varchar(200),   -- novel category label (span class "s1")
    book_id  varchar(200),   -- id extracted from the ".../book_<id>/" href
    name     varchar(200),   -- novel title
    new_part varchar(500),   -- title of the latest chapter
    new_url  varchar(500),   -- URL of the latest chapter
    author   varchar(200),   -- author name
    uptime   varchar(200),   -- last-update time as displayed on the site
    rksj     datetime        -- timestamp when the row was inserted by the spider
) default charset = utf8mb4;
-- truncate table chengt.spider_novel;  -- uncomment to clear before a re-run
select * from chengt.spider_novel;
python代码
# -*- coding: utf-8 -*-
# @Time : 2023/8/9 10:16
# @Author : chengt
# @File : myspidernovel.py
# @Software: PyCharm
# @Desc : 自己写的spider爬取小说目录的demo,获取千千小说网 排行榜单-总点击榜 下的小说目录列表,然后保存到mysql库中
import json
import pymysql
from lxml import etree
import datetime
import requests
import urllib3
http = urllib3.PoolManager(cert_reqs='CERT_NONE')
def conn_db(sql, params=None):
    """Execute a single SQL statement against the chengt MySQL database.

    Args:
        sql: the SQL statement to execute. May contain %s placeholders.
        params: optional sequence of values bound to the placeholders by
            pymysql. Binding values this way (instead of formatting them
            into `sql`) avoids SQL injection; default None keeps the old
            one-argument call style working.

    Returns:
        The number of rows affected, or -1 if the statement failed.
    """
    conn = None
    cursor = None
    affected = -1
    try:
        conn = pymysql.Connect(
            host="192.168.xx.xx",
            port=3306,
            database="chengt",
            user="root",
            password="xxxxxxx",
            charset="utf8",
        )
        cursor = conn.cursor()
        cursor.execute(sql, params)
        # pymysql does not autocommit by default, so commit explicitly.
        conn.commit()
        affected = cursor.rowcount
        print("sql execute_satus:", affected)
    except pymysql.Error as e:
        print(">>> An error occurred:", e)
    finally:
        # Only close what was actually created: if Connect() itself raised,
        # the original code hit a NameError here on `mycursor`/`conn`, and
        # `return execute_satus` raised UnboundLocalError on any error path.
        if cursor is not None:
            cursor.close()
        if conn is not None:
            conn.close()
    return affected
def main():
    """Scrape the Qianqian novels "all visits" ranking page and insert every
    listed novel into chengt.spider_novel via conn_db()."""
    url = "https://www.qqxsw.tv/top/allvisit/"
    # A timeout keeps the script from hanging forever on a stalled server.
    response = requests.get(url, timeout=30)
    print("response:", response)
    print("response.content:", response.content)
    # Timestamp written to the rksj column for every row of this run.
    current_time = datetime.datetime.now()
    formatted_time = current_time.strftime("%Y-%m-%d %H:%M:%S")
    print("formatted_time:", formatted_time)

    # Check the HTTP status BEFORE parsing — the original parsed the body first.
    if response.status_code != 200:
        print("请求失败:", response.status_code)
        return

    html = etree.HTML(response.content)
    types = html.xpath('//div[@class="novelslistss"]/li/span[@class="s1"]')
    book_ids = html.xpath('//div[@class="novelslistss"]/li/span[@class="s2"]/a/@href')
    names = html.xpath('//div[@class="novelslistss"]/li/span[@class="s2"]/a/@title')
    new_parts = html.xpath('//div[@class="novelslistss"]/li/span[@class="s3"]/a/@title')
    new_urls = html.xpath('//div[@class="novelslistss"]/li/span[@class="s3"]/a/@href')
    authors = html.xpath('//div[@class="novelslistss"]/li/span[@class="s4"]')
    uptimes = html.xpath('//div[@class="novelslistss"]/li/span[@class="s5"]')

    def esc(value):
        # Minimal SQL-literal escaping for scraped text (titles may contain
        # quotes/backslashes, which broke the original f-string SQL).
        return str(value).replace("\\", "\\\\").replace("'", "''")

    for index in range(len(types)):
        rank = index + 1
        category = str(types[index].text)
        # The book id sits between "book_" and the following "/" in the href.
        href = book_ids[index]
        start_index = href.find("book_") + len("book_")
        end_index = href.find("/", start_index)
        book_id = href[start_index:end_index]
        # BUG FIX: the insert originally targeted zhisheng.spider_novel, but
        # the table is created in the chengt schema (see the DDL above).
        sql = (
            f"\ninsert into chengt.spider_novel values ({rank},'{esc(category)}',"
            f"'{esc(book_id)}','{esc(names[index])}','{esc(new_parts[index])}',"
            f"'{esc(new_urls[index])}','{esc(authors[index].text)}',"
            f"'{esc(uptimes[index].text)}','{formatted_time}')"
        )
        print(sql)
        conn_db(sql)
        # (the original's `index += 1` was a no-op inside a for-loop; removed)
# Run the scraper only when executed as a script (not when imported).
if __name__ == '__main__':
    main()
最终效果: