使用BeautifulSoup抓取页面数据,并导出为JSON文件,做个记录。
# Scrape quarterly financial figures with BeautifulSoup and export them to JSON.
import requests
from bs4 import BeautifulSoup
import json

# The full URL for each year is url_pre + year + url_suf.
url_pre = "https://www.baidu.com"
url_suf = "/xx1/x.phtml"
years = ["1", "2", "3", "4", "5"]

# Table rows of interest: (row index within the table, short key in the JSON output).
# 'sr' = revenue (shouru), 'cb' = cost (chengben), 'lr' = profit (lirun).
ROWS = [(3, 'sr'), (5, 'cb'), (18, 'lr')]

data_dict = {}
for year in years:
    # Fetch and parse the page for this year.
    response = requests.get(url_pre + year + url_suf)
    soup = BeautifulSoup(response.text, 'html.parser')
    # Locate the data table and collect its rows.
    table = soup.find('table', id='id1')
    all_tr = table.find_all('tr')
    for row_idx, key in ROWS:
        columns = all_tr[row_idx].find_all('td')
        # Cells 1..4 hold the four quarterly values, so all five cells must
        # exist (the original checked only >= 2 and could raise IndexError).
        if len(columns) >= 5:
            for q in range(1, 5):
                # Strip thousands separators so values parse as numbers later.
                data_dict[f"{year}{key}{q}"] = columns[q].text.strip().replace(',', '')

# Save the collected data as JSON (UTF-8 so non-ASCII content survives).
output_filename = "data.json"
with open(output_filename, 'w', encoding='utf-8') as json_file:
    json.dump(data_dict, json_file, ensure_ascii=False, indent=4)
print(f"数据已保存到{output_filename}")
第二部分:对导出的数据进行处理,按年度汇总并绘制折柱混合图。
# Post-process data.json: sum the four quarterly values per year, then draw a
# combined line + bar chart of total revenue vs. total cost.
import json
import pandas as pd
import matplotlib.pyplot as plt

# Load the scraped quarterly data.
with open('data.json', 'r', encoding='utf-8') as json_file:
    data_dict = json.load(json_file)

# Column layout for the DataFrame built below.
data = {
    "year": [],
    "zongshouru": [],    # total revenue per year
    "zongchengben": []   # total cost per year
}
for year in ["1", "2", "3", "4", "5"]:
    # Keys in data.json look like "<year>sr<quarter>" / "<year>cb<quarter>".
    sr_sum = sum(float(data_dict[f"{year}sr{i}"]) for i in range(1, 5))
    cb_sum = sum(float(data_dict[f"{year}cb{i}"]) for i in range(1, 5))
    data["year"].append(year)
    data["zongshouru"].append(sr_sum)
    data["zongchengben"].append(cb_sum)

df = pd.DataFrame(data)

plt.figure(figsize=(10, 6))
# Line plots on top of the bars. BUGFIX: the original indexed df["zsr"] and
# df["zcb"], which are not columns of this DataFrame and raised KeyError.
plt.plot(df["year"], df["zongshouru"], marker='o', label="zsr", color="blue")
plt.plot(df["year"], df["zongchengben"], marker='o', label="zcb", color="orange")
plt.bar(df["year"], df["zongshouru"], width=0.4, align='center', alpha=0.5, color="blue")
plt.bar(df["year"], df["zongchengben"], width=0.4, align='edge', alpha=0.5, color="orange")
plt.xlabel('year')
plt.ylabel('amount')
plt.title('this is title(unit: 100K)')
plt.legend()
plt.grid(True)
plt.show()
对数据进行处理的部分2:绘制分季度的折线图(当前用matplotlib实现,后续可改用ECharts)。
# From the quarterly report data, compute the company's operating profit over
# the five years and show each year's four quarterly values as one line per
# quarter (year on the x-axis). Goal: switchable per-year line charts.
import json
import pandas as pd
import matplotlib.pyplot as plt

# Load the scraped quarterly data.
with open('data.json', 'r', encoding='utf-8') as json_file:
    data_dict = json.load(json_file)

quarters = ["Q1", "Q2", "Q3", "Q4"]
# "年份" (year) plus one column per quarter.
data = {
    "年份": [],
    "Q1": [],
    "Q2": [],
    "Q3": [],
    "Q4": []
}
for year in ["1", "2", "3", "4", "5"]:
    data["年份"].append(year)
    for quarter in quarters:
        # Profit keys look like "<year>lr<quarter number>"; default missing
        # entries to 0.0 so every column keeps the same length.
        value = data_dict.get(f"{year}lr{quarter[1]}", 0.0)
        data[quarter].append(float(value))

df = pd.DataFrame(data)

# One line per quarter across the five years.
plt.figure(figsize=(10, 6))
for quarter in quarters:
    plt.plot(df["年份"], df[quarter], marker='o', label=quarter)
plt.xlabel('year')
plt.ylabel('lirun')
plt.title('Company 5 years Every Quarters LiRun')
plt.legend()
plt.grid(True)
plt.show()
记录一下Python的学习过程。