import re
import requests
import pymysql
# Scraper for the zongheng.com book store listing: collects the cover URL,
# title and author of every book found on listing pages 1-9 and inserts one
# row per book into the MySQL table `book`.

LISTING_URL = 'https://book.zongheng.com/store/c0/c0/b0/u4/p{page}/v0/s9/t0/u0/i1/ALL.html'
COVER_PREFIX = 'https://static.zongheng.com/upload/cover/'
LINK_TEXT_MARKER = 'target="_blank">'
REQUEST_TIMEOUT = 10  # seconds; keeps a stalled server from hanging the run

# Compiled once because the same patterns are applied to every page.
COVER_RE = re.compile('<img src="https://static.zongheng.com/upload/cover/(.*?)"')
TITLE_RE = re.compile('<a href="https://book.zongheng.com/book/(.*?)</a>')
AUTHOR_RE = re.compile('<a href="https://home.zongheng.com/show/userInfo(.*?)</a>')

# Accumulated across all pages; element i of each list is meant to describe
# the same book.
img_li = []
title_li = []
author_li = []


def anchor_text(fragment):
    """Return the link text that follows ``target="_blank">`` in *fragment*.

    *fragment* is what TITLE_RE / AUTHOR_RE capture: the tail of the href
    attribute, any further attributes, and the link text.  Everything after
    the first marker is returned unchanged, so text containing the digit
    ``1`` is no longer cut short.  Returns ``None`` when the marker is absent.
    """
    _, marker, text = fragment.partition(LINK_TEXT_MARKER)
    return text if marker else None


def parse_listing_page(content):
    """Extract cover URLs, titles and author names from one page of HTML.

    Args:
        content: decoded HTML of a listing page.

    Returns:
        A tuple ``(imgs, titles, authors)`` of three lists of ``str``.
        Anchors without the ``target="_blank">`` marker are skipped instead
        of raising ``IndexError``.
    """
    imgs = [COVER_PREFIX + path for path in COVER_RE.findall(content)]
    titles = [
        text
        for text in map(anchor_text, TITLE_RE.findall(content))
        if text is not None
    ]
    # Each author is stored as a plain string (not a one-element list) so it
    # can be bound directly to a %s placeholder in the INSERT statement.
    authors = [
        text
        for text in map(anchor_text, AUTHOR_RE.findall(content))
        if text is not None
    ]
    return imgs, titles, authors


def fetch_listing_page(page):
    """Download listing page number *page* and return its decoded HTML.

    Raises:
        requests.RequestException: on timeout, connection failure or a
            4xx/5xx response.
    """
    response = requests.get(url=LISTING_URL.format(page=page), timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response.content.decode()


def save_books(rows):
    """Insert *rows*, a list of ``(title, author, img)`` tuples, into `book`.

    The connection is always closed, even when the insert fails.
    """
    insert_query = ("INSERT INTO book (title, author, img) VALUES (%s, %s, %s)")
    con = pymysql.connect(
        host="127.0.0.1",
        user="root",
        password="******",
        port=3306,
        database="db_******"
    )
    try:
        con.autocommit(True)
        with con.cursor() as cursor:
            cursor.executemany(insert_query, rows)
    finally:
        con.close()


def main():
    """Scrape listing pages 1-9 and store every book found."""
    for page in range(1, 10):
        try:
            content = fetch_listing_page(page)
        except requests.RequestException as exc:
            # One unreachable page should not discard the pages already read.
            print(f'page {page}: request failed, skipped ({exc})')
            continue
        imgs, titles, authors = parse_listing_page(content)
        img_li.extend(imgs)
        title_li.extend(titles)
        author_li.extend(authors)
        print(f'page {page}: {len(titles)} titles, {len(authors)} authors, {len(imgs)} covers')

    if not (len(title_li) == len(author_li) == len(img_li)):
        # zip() below stops at the shortest list; make the truncation visible.
        print('warning: title/author/cover counts differ; surplus entries are dropped')

    rows = list(zip(title_li, author_li, img_li))
    if not rows:
        print('nothing scraped; database left untouched')
        return
    save_books(rows)


if __name__ == '__main__':
    main()
# 缺点总结:
# 1. titles 和 authors 的正则表达式可能无法正确匹配到所需的内容:缺少了结束标签的闭合部分,
#    而且也没有提取出实际的标题和作者名字。
# 2. execute 方法需要单个元组或列表,而不是字典。result["title"]、result["author"] 和
#    result["img"] 是列表,直接传递给 execute 方法会导致错误,因为 execute 期望单个值或者
#    元组/列表。