# https://search.dangdang.com/?key=%B1%E0%B3%CC&act=input&page_index=1
# Import the libraries used throughout the script
import requests
import time
from bs4 import BeautifulSoup
import mysql.connector
import csv
# Container that accumulates every scraped row
allContainer = []
for i in range(1, 36):
    # Build the search URL for the current page (the same pattern works for page 1)
    url = f"https://search.dangdang.com/?key=%B1%E0%B3%CC&act=input&page_index={i}"
    print(f"Fetching page {i} of 35")
    # Sleep between requests to reduce the chance of anti-bot detection
    time.sleep(1)
    # Request headers (the Cookie below was captured from a real browser
    # session and will eventually expire)
    header = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "Accept-Encoding": "gzip, deflate, br, zstd",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
        "Cache-Control": "max-age=0",
        "Connection": "keep-alive",
        "Cookie": "ddscreen=2; __permanent_id=20250609184530979156760224679480468; __visit_id=20250609184530993130404124438448889; __out_refer=1749465931%7C!%7Cwww.baidu.com%7C!%7C; dest_area=country_id%3D9000%26province_id%3D111%26city_id%3D0%26district_id%3D0%26town_id%3D0; __rpm=s_112100.155956512835%2C155956512836..1749466159510%7Cs_112100.155956512835%2C155956512836..1749466166450; search_passback=1e0bf85a587c99ab37bc4668fc0100003945670025bc4668; __trace_id=20250609184927332100480187110221325",
        "Host": "search.dangdang.com",
        "Referer": "https://search.dangdang.com/?key=%B1%E0%B3%CC&act=input&page_index=2",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "same-origin",
        "Sec-Fetch-User": "?1",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36 Edg/137.0.0.0",
    }
    # Send the request
    response = requests.get(url, headers=header)
    # Auto-detect the response encoding (important: the page is not UTF-8, so a
    # hard-coded response.encoding = 'utf-8' would garble the text)
    response.encoding = response.apparent_encoding
    # During development the response can be saved locally so parsing can be
    # tested against a static page before running the full 35-page crawl:
    # with open('../data/当当网.html', 'w', encoding='utf-8') as f:
    #     f.write(response.text)
    htmlTree = BeautifulSoup(response.text, 'html.parser')
    allulEle = htmlTree.find_all('ul', class_="bigimg")
    for ul in allulEle:
        # Grab only the direct <li> children of the result list
        allw1 = ul.find_all('li', recursive=False)
        for li_tag in allw1:
            rowContainer = []
            # Extract the book title and the detail-page link
            title_tag = li_tag.find_all('p', class_='name')
            if title_tag:
                a_tag = li_tag.find_all('a')
                if a_tag:
                    title = a_tag[0].get('title')
                    href = a_tag[0].get('href')
                    link = f"https:{href}"
                    rowContainer.append(title)
                    rowContainer.append(link)
                else:
                    # Keep the row shape consistent even when nothing is found
                    rowContainer.extend(["", ""])
            else:
                rowContainer.extend(["", ""])
            # Extract the price
            pre_price = li_tag.find_all('span', class_='search_pre_price')
            for p in pre_price:
                PrePrice = p.get_text(strip=True)
                rowContainer.append(PrePrice)
            # Extract the comment count
            comment_count = li_tag.find('a', class_='search_comment_num')
            if comment_count:
                CommentCount = comment_count.get_text(strip=True)
            else:
                CommentCount = '0条评论'
            rowContainer.append(CommentCount)
            # Extract author, publish date, and publisher: each child node of
            # the search_book_author <p> becomes one field of the row
            author_info = li_tag.find('p', class_='search_book_author')
            if author_info:
                for p in author_info:
                    AuthorInfo = p.get_text(strip=True).replace('\\', '').replace('/', '')
                    rowContainer.append(AuthorInfo)
            allContainer.append(rowContainer)
# Quick sanity check: print every scraped row
for row in allContainer:
    print(row)
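# Note: the crawl loop above has no error handling, so one timeout or non-200
# response aborts the whole run. A minimal hedged sketch of a more defensive
# fetch helper (the retry count and timeout values are illustrative assumptions):
def fetch_page(page_url, headers, retries=3, timeout=10):
    """Fetch a URL with retries; return the decoded HTML or None on failure."""
    for attempt in range(retries):
        try:
            resp = requests.get(page_url, headers=headers, timeout=timeout)
            resp.raise_for_status()  # raise on 4xx/5xx status codes
            resp.encoding = resp.apparent_encoding
            return resp.text
        except requests.RequestException as exc:
            print(f"Attempt {attempt + 1} failed for {page_url}: {exc}")
            time.sleep(2)  # brief back-off before retrying
    return None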
# Persist the data to MySQL (mysql.connector was already imported at the top)
# 使用当前库中的内置对象来创建数据库连接
mydb = mysql.connector.connect(
host='localhost', # 当前mysql运行服务的地址
port=3306, # mysql服务的端口号
user='root', # mysql用户名
password='root', # 密码
database='dangdang'
)
# Create a cursor object
mycursor = mydb.cursor()
# Create the books table
create_table_sql = """
CREATE TABLE IF NOT EXISTS books
(
    id            INT AUTO_INCREMENT PRIMARY KEY,
    title         VARCHAR(255) NOT NULL, -- book title
    link          VARCHAR(512),          -- detail-page link
    now_price     VARCHAR(20),           -- current price
    comment_count VARCHAR(50),           -- comment count
    author        VARCHAR(100),          -- author
    publish_date  VARCHAR(20),           -- publish date
    publisher     VARCHAR(100),          -- publisher
    action        VARCHAR(100),          -- availability flag
    unidentified  VARCHAR(20)            -- trailing field of unknown meaning
)
"""
# Execute the CREATE TABLE statement
mycursor.execute(create_table_sql)
# Insert statement
insert_sql = """
INSERT INTO books
(title, link, now_price, comment_count, author, publish_date, publisher, action, unidentified)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
"""
for book in allContainer:
    # Pad rows that came back with only 8 fields so every row has 9 columns
    if len(book) == 8:
        book.insert(4, '')
    mycursor.execute(insert_sql, list(book))
# Commit the transaction
mydb.commit()
print("✅ Insert finished:", len(allContainer), "rows written")
# Close the connection
mycursor.close()
mydb.close()
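# Performance note: executing the INSERT row by row makes one round trip per
# book. A hedged sketch of the batch alternative using executemany(), shown
# commented out because the connection above is already closed at this point
# (the padding/truncation to exactly 9 fields is an assumption):
# padded = [row + [''] * (9 - len(row)) if len(row) < 9 else row[:9]
#           for row in allContainer]
# mycursor.executemany(insert_sql, padded)
# mydb.commit()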
import pandas as pd
# Convert the collected rows into a DataFrame
df = pd.DataFrame(allContainer,
                  columns=["书名", "链接", "现价", "评论数", "作者", "出版时间", "出版社", "可选状态", "未知"])
# Insert a row-number column (starting from 1)
df.insert(0, '序号', range(1, len(df) + 1))
# Save to an Excel file
df.to_excel("../data/当当网.xlsx", index=False)
print("✅ Data saved to Excel!")
import jieba
# Read the Excel file back in
import openpyxl
from wordcloud import WordCloud
# Load the workbook
wb = openpyxl.load_workbook('../data/当当网.xlsx')
# Get the first sheet
sheet = wb.worksheets[0]
# Collect the title column into a list:
# min_row=2 skips the header row, min_col=2/max_col=5 yields columns 2-5 per
# row, and row[0] below is column 2, i.e. the 书名 (title) column
data = []
for row in sheet.iter_rows(min_row=2, min_col=2, max_col=5):
    data.append(row[0].value)  # the value of each cell
# Segment the titles into words; skip empty cells so join() cannot fail on None
seg_list = jieba.cut(''.join(str(d) for d in data if d), cut_all=False)
# Use a bundled font so Chinese characters render correctly in the word cloud
fonts = '../data/AlibabaPuHuiTi-2-65-Medium.ttf'
wc = WordCloud(
    width=1200,   # 1200px wide
    height=600,   # 600px tall
    background_color='white',
    max_font_size=50,
    min_font_size=10,
    font_path=fonts
)
# Join the segmented words with spaces (joining with '' would merge the words
# back together and defeat the segmentation) and feed them to the cloud
wc.generate_from_text(' '.join(seg_list))
# Save the result
wc.to_file("../data/词云图.png")
import numpy as np
# Display options that show the full DataFrame while debugging
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)
newXml = pd.read_excel("../data/当当网.xlsx")
print(newXml.shape)  # row/column counts
print(newXml.info())  # column dtypes and null counts
# Check for duplicate rows
duplicates = newXml.duplicated(keep='first')
print("Duplicate rows:", duplicates.sum())
# Drop duplicate rows
cleaned_data = newXml.drop_duplicates()
print("Shape after dropping duplicates:", cleaned_data.shape)
# Drop rows that contain nulls
dropna_data = newXml.dropna()
print("Shape after dropping nulls:", dropna_data.shape)
# Alternatively, fill nulls instead of dropping:
# filled_data = newXml.fillna({"未知": "默认值"})
# Drop the meaningless trailing column before saving the cleaned file
df = newXml.drop(columns=['未知'])
print(df)
df.to_excel("../data/new当当.xlsx", index=False)
# pandas and numpy are already imported above
from pyecharts import options as opts
from pyecharts.charts import Bar, Line, Scatter, Pie, Radar
from pyecharts.globals import ThemeType
# Read the cleaned file
excel_file = pd.ExcelFile('../data/new当当.xlsx')
# Get the data from the target sheet
df = excel_file.parse('Sheet1')
# Ensure the publisher column is string-typed
df['出版社'] = df['出版社'].astype(str)
# Pull the publisher and title columns as arrays
publishers = df['出版社'].to_numpy()
book_names = df['书名'].to_numpy()
# Unique publishers
unique_publishers = np.unique(publishers)
# Count the books per publisher
book_counts = np.array([np.sum(publishers == publisher) for publisher in unique_publishers])
# Build the result DataFrame
result_df = pd.DataFrame({
    '出版社': unique_publishers,
    '书籍数量': book_counts
})
print(result_df)
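# For comparison, pandas can produce the same tally in one line, avoiding the
# per-publisher numpy scan above:
print(df['出版社'].value_counts().rename('书籍数量'))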
# Re-read the cleaned data
df = pd.read_excel('../data/new当当.xlsx')
# Preprocessing: extract the numeric part of the price column
df['现价'] = df['现价'].str.extract(r'(\d+\.?\d*)').astype(float)
# Extract the numeric part of the comment-count column
df['评论数'] = df['评论数'].str.extract(r'(\d+)').astype(int)
# Extract the publication year; coerce malformed dates to NaT and keep the
# years as a nullable integer so the axis labels stay clean
df['出版年份'] = pd.to_datetime(df['出版时间'], errors='coerce').dt.year.astype('Int64')
# Chart 1: price-distribution histogram
hist, bins = pd.cut(df['现价'], bins=20, retbins=True)
hist_value = hist.value_counts().sort_index()
# pyecharts has no histogram type, so a Bar with zero category gap stands in
histogram = (
Bar(init_opts=opts.InitOpts(theme=ThemeType.LIGHT, width="800px", height="400px"))
.add_xaxis([f"{bins[i]:.2f}-{bins[i + 1]:.2f}" for i in range(len(bins) - 1)])
.add_yaxis("书籍数量", hist_value.tolist(), category_gap=0)
.set_global_opts(
title_opts=opts.TitleOpts(title="价格分布柱状图"),
xaxis_opts=opts.AxisOpts(name="价格区间"),
yaxis_opts=opts.AxisOpts(name="数量"),
)
)
# Chart 2: bar chart of book counts per publisher
publisher_counts = df['出版社'].value_counts()
bar_publisher = (
Bar(init_opts=opts.InitOpts(theme=ThemeType.LIGHT, width="800px", height="400px"))
.add_xaxis(publisher_counts.index.tolist())
.add_yaxis("出版书籍数量", publisher_counts.tolist())
.set_global_opts(
title_opts=opts.TitleOpts(title="不同出版社出版书籍数量柱状图"),
xaxis_opts=opts.AxisOpts(name="出版社", axislabel_opts={"rotate": 90}),
yaxis_opts=opts.AxisOpts(name="出版书籍数量"),
)
)
# Chart 3: line chart of books published per year
yearly_counts = df['出版年份'].value_counts().sort_index()
line_yearly = (
Line(init_opts=opts.InitOpts(theme=ThemeType.LIGHT, width="800px", height="400px"))
.add_xaxis(yearly_counts.index.astype(str).tolist())
.add_yaxis("出版书籍数量", yearly_counts.tolist(), is_smooth=True, symbol="circle")
.set_global_opts(
title_opts=opts.TitleOpts(title="每年出版书籍数量折线图"),
xaxis_opts=opts.AxisOpts(name="出版年份"),
yaxis_opts=opts.AxisOpts(name="出版书籍数量"),
)
)
# Chart 4: bar chart of the five most-commented books
top_5_commented = df.nlargest(5, '评论数')
bar_comment = (
Bar(init_opts=opts.InitOpts(theme=ThemeType.LIGHT, width="800px", height="400px"))
.add_xaxis(top_5_commented['书名'].tolist())
.add_yaxis("评论数", top_5_commented['评论数'].tolist())
.set_global_opts(
title_opts=opts.TitleOpts(title="评论数前五书籍的书名与评论数柱状图"),
xaxis_opts=opts.AxisOpts(name="书名", axislabel_opts={"rotate": 90}),
yaxis_opts=opts.AxisOpts(name="评论数"),
)
)
# Chart 5: pie chart of book counts per price range
# (现价 was already parsed to float above; the str round-trip below simply
# reuses the same extraction regex)
df['现价'] = df['现价'].astype(str)
# Extract the numeric price
df['价格'] = df['现价'].str.extract(r'(\d+\.?\d*)').astype(float)
# Check the price column for missing values
print(f"Missing prices: {df['价格'].isna().sum()}")
# Drop rows with a missing price
df = df.dropna(subset=['价格'])
# Define the price bins
bins = [0, 50, 100, 150, 200, float('inf')]
labels = ['0 - 50', '51 - 100', '101 - 150', '151 - 200', '200以上']
# Bin the prices and count each bin
df['价格区间'] = pd.cut(df['价格'], bins=bins, labels=labels)
price_range_counts = df['价格区间'].value_counts().reset_index(name='数量')
# Draw the pie chart with pyecharts
pie = (
Pie()
.add(
series_name="数量",
data_pair=[list(z) for z in zip(price_range_counts['价格区间'], price_range_counts['数量'])],
radius=["40%", "75%"],
)
.set_global_opts(
title_opts=opts.TitleOpts(title="价格区间与数量的饼状图"),
legend_opts=opts.LegendOpts(orient="vertical", pos_top="15%", pos_left="2%"),
)
.set_series_opts(
label_opts=opts.LabelOpts(formatter="{b}: {d}%")
)
)
# Chart 6: radar chart of the five most-commented books
df['评论数'] = df['评论数'].astype(str)
# Extract the numeric comment count
df['评论数数值'] = df['评论数'].str.extract(r'(\d+\.?\d*)').astype(float)
# Top five books by comment count
top_5_books = df.nlargest(5, '评论数数值', keep='all')[['书名', '评论数数值']]
# Radar indicators: one axis per book, all sharing the same maximum
c_schema = [{"name": book_name, "max": top_5_books['评论数数值'].max()} for book_name in top_5_books['书名']]
# Radar data: a single series holding the five counts
data = [[count for count in top_5_books['评论数数值'].values]]
# Build the radar chart and render it straight to file
(
Radar()
.add_schema(schema=c_schema)
.add(
series_name="评论数",
data=data,
areastyle_opts=opts.AreaStyleOpts(opacity=0.2)
)
.set_global_opts(
title_opts=opts.TitleOpts(title="评论数前五的书籍的书名与评论数雷达图"),
)
.render("../data/radar_chart_top5_books.html")
)
# Chart 7: scatter of the ten publishers with the most books
publisher_book_count = df['出版社'].value_counts().reset_index()
publisher_book_count.columns = ['出版社', '书籍数量']
# Take the top ten publishers by book count
top_10_publisher = publisher_book_count.nlargest(10, '书籍数量')
# Build the scatter chart
scatter = (
Scatter()
.add_xaxis(top_10_publisher['出版社'].tolist())
.add_yaxis(
series_name="书籍数量",
y_axis=top_10_publisher['书籍数量'].tolist(),
symbol_size=10,
label_opts=opts.LabelOpts(is_show=False)
)
.set_global_opts(
title_opts=opts.TitleOpts(title="不同出版社书籍数量前10的散点图"),
xaxis_opts=opts.AxisOpts(
name="出版社",
type_="category",
axislabel_opts=opts.LabelOpts(rotate=45, interval="auto")
),
yaxis_opts=opts.AxisOpts(name="书籍数量"),
)
)
# Render every chart to its own HTML file
histogram.render("../data/price_distribution_histogram.html")
bar_publisher.render("../data/publisher_book_count_bar.html")
line_yearly.render("../data/yearly_book_count_line.html")
bar_comment.render("../data/top_commented_books_bar.html")
pie.render("../data/price_range_pie_chart.html")
scatter.render("../data/scatter_top10_publisher_book_count.html")
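# Each render() call above writes a separate HTML file. pyecharts can also
# stack the charts into one scrollable dashboard; a minimal sketch (the output
# path is an assumption):
from pyecharts.charts import Page
dashboard = Page()
dashboard.add(histogram, bar_publisher, line_yearly, bar_comment, pie, scatter)
dashboard.render("../data/dashboard.html")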
from flask import Flask, request, render_template_string, jsonify
import requests
# Original command-line version of the DeepSeek chat, kept for reference:
#
# # A messages list holds the whole conversation
# messages = [{"role": "system", "content": ""}]
#
# # API key (use your own; a real key should never be committed)
# API_KEY = "sk-..."
# # Request headers
# header = {
#     "Content-Type": "application/json",   # tell the server the payload type
#     "Authorization": f"Bearer {API_KEY}"  # API key
# }
#
# # Request URL
# url = "https://api.deepseek.com/chat/completions"
#
# # Multi-turn dialogue needs a loop that keeps accepting user requests
#
# def DeepSeekChat(userMessage):
#     # 1. Append the user input to the history so it becomes a full dialogue
#     messages.append({"role": "user", "content": userMessage})
#
#     # 2. Build the payload sent to DeepSeek
#     data = {
#         "model": "deepseek-chat",
#         "messages": messages,
#         "stream": False
#     }
#
#     # 3. Send the request
#     response = requests.post(url, json=data, headers=header)
#
#     # 4. Handle the response
#     if response.status_code == 200:
#         result_json = response.json()
#         airesult = result_json['choices'][0]['message']['content']
#         print(f"Book store AI support: {airesult}")
#         # For multi-turn chat, append the reply to the history so the next
#         # turn sees the full conversation context
#         messages.append({"role": "assistant", "content": airesult})
#     else:
#         print(response.text)
#         print("Request failed")
#
# print("Welcome to the book store")
# print("Type exit to quit")
#
# while True:
#     userinput = input("You: ")
#     if userinput == "exit":
#         print("Exiting")
#         break
#     else:
#         # Pass the user input to the function to call the API
#         DeepSeekChat(userinput)
# Author's goal: combine the scraped data with DeepSeek so the data can be
# discussed with the AI on its own web page; a sketch of that follows.
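# A minimal hedged sketch of that integration, assuming the cleaned Excel file
# produced above and the endpoint/model from the commented code. The page
# injects a capped sample of the data into the system prompt so DeepSeek can
# answer questions about the books; DEEPSEEK_API_KEY is a placeholder you must
# supply, and the 50-row cap is an illustrative choice to spare context window.
import pandas as pd

DEEPSEEK_API_KEY = "your-api-key"  # placeholder: supply your own key
DEEPSEEK_URL = "https://api.deepseek.com/chat/completions"
# Build a compact data summary for the system prompt
books_df = pd.read_excel("../data/new当当.xlsx")
data_summary = books_df.head(50).to_csv(index=False)
messages = [{
    "role": "system",
    "content": "你是图书商城的AI客服,请根据以下当当网图书数据回答问题(CSV格式):\n" + data_summary
}]
app = Flask(__name__)
# Bare-bones chat page; a real project would move this into a template file
PAGE = """
<!DOCTYPE html>
<html lang="zh">
<head><meta charset="utf-8"><title>图书数据AI对话</title></head>
<body>
  <h2>图书商城 AI 客服</h2>
  <div id="log" style="white-space: pre-wrap;"></div>
  <input id="box" size="60"><button onclick="send()">发送</button>
  <script>
    async function send() {
      const box = document.getElementById('box');
      const log = document.getElementById('log');
      log.textContent += '你:' + box.value + '\\n';
      const resp = await fetch('/chat', {
        method: 'POST',
        headers: {'Content-Type': 'application/json'},
        body: JSON.stringify({message: box.value})
      });
      const data = await resp.json();
      log.textContent += 'AI:' + data.reply + '\\n';
      box.value = '';
    }
  </script>
</body>
</html>
"""

@app.route("/")
def index():
    return render_template_string(PAGE)

@app.route("/chat", methods=["POST"])
def chat():
    # Append the user message, call DeepSeek, then append the reply so every
    # later turn keeps the full conversation context
    user_message = request.get_json().get("message", "")
    messages.append({"role": "user", "content": user_message})
    resp = requests.post(
        DEEPSEEK_URL,
        headers={"Content-Type": "application/json",
                 "Authorization": f"Bearer {DEEPSEEK_API_KEY}"},
        json={"model": "deepseek-chat", "messages": messages, "stream": False},
    )
    if resp.status_code != 200:
        return jsonify({"reply": f"请求失败:{resp.status_code}"})
    reply = resp.json()["choices"][0]["message"]["content"]
    messages.append({"role": "assistant", "content": reply})
    return jsonify({"reply": reply})

if __name__ == "__main__":
    app.run(debug=True)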