import requests
import urllib.request
from bs4 import BeautifulSoup
import csv
'''
目标数据:
书名
作者
日期
价格
评分
评价人数
'''
# HTTP headers sent with every request; the value is a desktop Chrome
# user-agent string.
headers = {
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/80.0.3987.132 Safari/537.36"
    ),
}

# Tag to search for, and its percent-encoded form for use inside the URL.
key = "小说"
key_ASCII = urllib.request.quote(key)

# Parallel accumulators: index i of every list describes the same book.
book_name_list = []      # titles
user_name = []           # authors
book_time = []           # publication dates
book_pice = []           # prices
pm_number_list = []      # rating scores
people_number_list = []  # number of people who rated
all_book = []            # only referenced by commented-out code in this file
# Scrape the Douban tag listing and fill the parallel result lists.
#
# Only the first 5 result pages are fetched. According to the original
# author's notes the complete listing is start=0..1000 in steps of 20
# (range(0, 51) pages); the crawl is deliberately kept small.

_MISSING = "空"          # marker stored when a field is absent from the page
_REQUEST_TIMEOUT = 10    # seconds; without a timeout requests.get can block forever


def _squash(text):
    """Return *text* with every newline and space character removed."""
    return text.replace("\n", "").replace(" ", "")


def _split_pub_info(raw):
    """Split the text of a ``div.pub`` element into (author, date, price).

    The text is a "/"-separated line. The first field is taken as the
    author; when at least four fields are present the last two are taken as
    publication date and price (with the "元" currency suffix removed).
    Shorter lines yield the missing-value marker for date and price.
    """
    parts = _squash(raw).replace("元", "").split("/")
    if len(parts) >= 4:
        return parts[0], parts[-2], parts[-1]
    return parts[0], _MISSING, _MISSING


for page in range(0, 5):  # 5 pages, 20 items requested per page
    link = "https://book.douban.com/tag/{}?start={}&type=T".format(key_ASCII, page * 20)
    try:
        resp = requests.get(url=link, headers=headers, timeout=_REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        # One failed page must not discard the pages already collected.
        print("\n第" + str(page) + "页请求失败:", exc)
        continue
    print("\n第" + str(page) + "页状态响应码:", resp.status_code)
    if resp.status_code != 200:
        # An error page contains no book entries; skip parsing it.
        continue

    soup = BeautifulSoup(resp.text, 'html.parser')

    # Walk the items themselves instead of probing li:nth-child(1..20): this
    # copes with pages holding fewer than 20 entries and guarantees that each
    # book contributes exactly one value to every list, so the lists can
    # never drift out of alignment.
    for item in soup.select("#subject_list > ul > li"):
        title_tag = item.select_one("div.info > h2 > a")
        if title_tag is None:
            # Not a book entry; nothing to record.
            continue
        pub_tag = item.select_one("div.info > div.pub")
        rating_tag = item.select_one("div.info > div.star.clearfix > span.rating_nums")
        votes_tag = item.select_one("div.info > div.star.clearfix > span.pl")

        if pub_tag is not None:
            author, published, price = _split_pub_info(pub_tag.get_text())
        else:
            author, published, price = _MISSING, _MISSING, _MISSING

        if rating_tag is not None:
            rating = rating_tag.get_text()
        else:
            rating = _MISSING

        if votes_tag is not None:
            # Reduce e.g. "(12345人评价)" to the bare count.
            votes = _squash(votes_tag.get_text()).replace("(", "").replace("人评价)", "")
        else:
            votes = _MISSING

        book_name_list.append(_squash(title_tag.get_text()))
        user_name.append(author)
        book_time.append(published)
        book_pice.append(price)
        pm_number_list.append(rating)
        people_number_list.append(votes)
# Write the collected records to a CSV file, one row per book.
#
# csv.DictWriter takes each row as a dict keyed by the header names, so the
# rows are built directly from the parallel lists; no intermediate list of
# dicts (all_book) is needed.
file_path = "C:/Users/DELL/Desktop/python_wd/文本信息/豆瓣小说.csv"
file_value_names = ["书名","作者","出版日期","价格(元)","评分","评价人数"]  # header row

# Columns in the same order as file_value_names.
_columns = (
    book_name_list,
    user_name,
    book_time,
    book_pice,
    pm_number_list,
    people_number_list,
)

# Indexing every list up to len(user_name) raises IndexError part-way
# through the file when any list is shorter. zip() stops at the shortest
# list instead; report the mismatch so truncation is never silent.
if len({len(column) for column in _columns}) > 1:
    print("\n警告:各字段列表长度不一致,仅写入前", min(len(column) for column in _columns), "行")

with open(file_path, "w", newline="", encoding="utf-8") as f:
    f_csv = csv.DictWriter(f, fieldnames=file_value_names)
    f_csv.writeheader()
    for record in zip(*_columns):
        f_csv.writerow(dict(zip(file_value_names, record)))

print("\n数据保存完毕。。。。。。")
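# NOTE: the two lines below are not part of the program; they appear to be
# residue from the web page this script was copied from (a page heading and
# a "latest recommended article published at ..." footer).
# 豆瓣电子书
# 最新推荐文章于 2024-04-20 09:51:26 发布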