"""Scrape Douban's history-tagged book listing and store the rows in MySQL.

Each listing page is downloaded, every book entry is reduced to a
``(name, publisher, rating)`` tuple with regular expressions, and the tuples
are inserted into the ``books`` table of the local ``books`` database.
"""
import re
import sys

# Listing URL for the tag (the path segment is the percent-encoded tag name).
BASE_URL = 'https://book.douban.com/tag/%E5%8E%86%E5%8F%B2'
PAGE_SIZE = 20          # the `start` offset advances by this much per page
PAGE_COUNT = 10         # number of listing pages to fetch
REQUEST_TIMEOUT = 10    # seconds; without a timeout a stalled socket hangs forever

HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0'
}

# Create the destination table if it does not exist yet.
SQL_CREATE_TABLE = '''
CREATE TABLE IF NOT EXISTS books (
    id INT AUTO_INCREMENT PRIMARY KEY,
    name VARCHAR(255),
    publisher VARCHAR(255),
    rating_nums FLOAT
)
'''
SQL_INSERT = 'INSERT INTO books (name, publisher, rating_nums) VALUES (%s, %s, %s)'

# Patterns are compiled once at import time instead of once per page.
# NOTE: the attribute is spelled with a Latin "o" ("onclick"); a look-alike
# Greek omicron in this position makes the pattern match nothing.
TITLE_RE = re.compile(
    r'<h2 class="">.*?<a.*?title="(?P<name>.*?)".*?onclick=".*?">.*?</a>.*?</h2>',
    re.S,
)
PUB_RE = re.compile(r'<div class="pub">(?P<pub>.*?)</div>', re.S)
RATING_RE = re.compile(r'<span class="rating_nums">(?P<rating_nums>\d+\.\d*)</span>')


def build_url(page):
    """Return the listing URL for the zero-based page index *page*."""
    return f'{BASE_URL}?start={page * PAGE_SIZE}&type=T'


def parse_books(html):
    """Extract ``(name, publisher, rating)`` tuples from one listing page.

    Every title match opens an entry; the publisher and rating are searched
    only in the text between that title and the next one, so a book without
    a rating can no longer pick up the rating of the book that follows it.

    Args:
        html: Decoded HTML of a listing page.

    Returns:
        A list of ``(name, publisher, rating)`` tuples in page order.
        ``publisher`` is the stripped text of the ``pub`` block, or ``''``
        when the entry has none; ``rating`` is a float, ``0.0`` when the
        entry has no rating.
    """
    titles = list(TITLE_RE.finditer(html))
    # Each entry's slice ends where the next title begins (or at end of page).
    slice_ends = [match.start() for match in titles[1:]] + [len(html)]

    books = []
    for title, end in zip(titles, slice_ends):
        entry = html[title.end():end]
        pub = PUB_RE.search(entry)
        rating = RATING_RE.search(entry)
        books.append((
            title.group('name'),
            pub.group('pub').strip() if pub else '',
            # The pattern only admits digits and a dot, so float() cannot fail.
            float(rating.group('rating_nums')) if rating else 0.0,
        ))
    return books


def fetch_page(session, page):
    """Download listing page *page* through *session* and return its text.

    Raises:
        requests.RequestException: on connection errors, timeouts, or a
            non-2xx HTTP status.
    """
    # `with` releases the connection back to the pool even if decoding fails.
    with session.get(build_url(page), timeout=REQUEST_TIMEOUT) as response:
        response.raise_for_status()
        response.encoding = 'utf-8'
        return response.text


def main():
    """Fetch every listing page and insert the parsed books in one transaction."""
    # Imported here so the parsing helpers stay importable without the
    # network/database packages installed.
    import pymysql
    import requests

    # NOTE(review): `passwd`/`db` are kept exactly as in the original call;
    # newer PyMySQL releases prefer `password`/`database` - confirm and migrate.
    conn = pymysql.connect(host='localhost', port=3306, user='root', passwd=None, db='books')
    try:
        with conn.cursor() as cursor, requests.Session() as session:
            session.headers.update(HEADERS)
            cursor.execute(SQL_CREATE_TABLE)

            for page in range(PAGE_COUNT):
                try:
                    html = fetch_page(session, page)
                except requests.RequestException as exc:
                    # Best effort: one bad page must not discard the others.
                    print(f'skipping page {page}: {exc}', file=sys.stderr)
                    continue

                rows = parse_books(html)
                if rows:
                    cursor.executemany(SQL_INSERT, rows)

        # Single commit at the end, as before: all pages land together.
        conn.commit()
    finally:
        conn.close()

    print("完成!")


if __name__ == '__main__':
    main()
爬取豆瓣读书
最新推荐文章于 2025-05-04 19:41:26 发布