使用BeautifulSoup抓取页面数据,并导出为JSON文件,做个记录。
# Scrape quarterly financial figures with BeautifulSoup and export them to JSON.
import requests
from bs4 import BeautifulSoup
import json

# The full URL for each year is url_pre + year + url_suf.
url_pre = "https://www.baidu.com"
url_suf = "/xx1/x.phtml"
years = ["1", "2", "3", "4", "5"]

# Table rows of interest: (row index within the table, short key in the JSON output).
# 'sr' = revenue (shouru), 'cb' = cost (chengben), 'lr' = profit (lirun).
ROWS = [(3, 'sr'), (5, 'cb'), (18, 'lr')]

data_dict = {}
for year in years:
    # Fetch and parse the page for this year.
    response = requests.get(url_pre + year + url_suf)
    soup = BeautifulSoup(response.text, 'html.parser')
    # Locate the data table and collect its rows.
    table = soup.find('table', id='id1')
    all_tr = table.find_all('tr')
    for row_idx, key in ROWS:
        columns = all_tr[row_idx].find_all('td')
        # Cells 1..4 hold the four quarterly values, so all five cells must
        # exist (the original checked only >= 2 and could raise IndexError).
        if len(columns) >= 5:
            for q in range(1, 5):
                # Strip thousands separators so values parse as numbers later.
                data_dict[f"{year}{key}{q}"] = columns[q].text.strip().replace(',', '')

# Save the collected data as JSON (UTF-8 so non-ASCII content survives).
output_filename = "data.json"
with open(output_filename, 'w', encoding='utf-8') as json_file:
    json.dump(data_dict, json_file, ensure_ascii=False, indent=4)
print(f"数据已保存到{output_filename}")
第二部分:对导出的数据进行处理,按年度汇总并绘制折柱混合图。
# Post-process data.json: sum the four quarterly values per year, then draw a
# combined line + bar chart of total revenue vs. total cost.
import json
import pandas as pd
import matplotlib.pyplot as plt

# Load the scraped quarterly data.
with open('data.json', 'r', encoding='utf-8') as json_file:
    data_dict = json.load(json_file)

# Column layout for the DataFrame built below.
data = {
    "year": [],
    "zongshouru": [],    # total revenue per year
    "zongchengben": []   # total cost per year
}
for year in ["1", "2", "3", "4", "5"]:
    # Keys in data.json look like "<year>sr<quarter>" / "<year>cb<quarter>".
    sr_sum = sum(float(data_dict[f"{year}sr{i}"]) for i in range(1, 5))
    cb_sum = sum(float(data_dict[f"{year}cb{i}"]) for i in range(1, 5))
    data["year"].append(year)
    data["zongshouru"].append(sr_sum)
    data["zongchengben"].append(cb_sum)

df = pd.DataFrame(data)

plt.figure(figsize=(10, 6))
# Line plots on top of the bars. BUGFIX: the original indexed df["zsr"] and
# df["zcb"], which are not columns of this DataFrame and raised KeyError.
plt.plot(df["year"], df["zongshouru"], marker='o', label="zsr", color="blue")
plt.plot(df["year"], df["zongchengben"], marker='o', label="zcb", color="orange")
plt.bar(df["year"], df["zongshouru"], width=0.4, align='center', alpha=0.5, color="blue")
plt.bar(df["year"], df["zongchengben"], width=0.4, align='edge', alpha=0.5, color="orange")
plt.xlabel('year')
plt.ylabel('amount')
plt.title('this is title(unit: 100K)')
plt.legend()
plt.grid(True)
plt.show()
对数据进行处理的部分2:绘制分季度的折线图(当前用matplotlib实现,后续可改用ECharts)。
# From the quarterly report data, compute the company's operating profit over
# the five years and show each year's four quarterly values as one line per
# quarter (year on the x-axis). Goal: switchable per-year line charts.
import json
import pandas as pd
import matplotlib.pyplot as plt

# Load the scraped quarterly data.
with open('data.json', 'r', encoding='utf-8') as json_file:
    data_dict = json.load(json_file)

quarters = ["Q1", "Q2", "Q3", "Q4"]
# "年份" (year) plus one column per quarter.
data = {
    "年份": [],
    "Q1": [],
    "Q2": [],
    "Q3": [],
    "Q4": []
}
for year in ["1", "2", "3", "4", "5"]:
    data["年份"].append(year)
    for quarter in quarters:
        # Profit keys look like "<year>lr<quarter number>"; default missing
        # entries to 0.0 so every column keeps the same length.
        value = data_dict.get(f"{year}lr{quarter[1]}", 0.0)
        data[quarter].append(float(value))

df = pd.DataFrame(data)

# One line per quarter across the five years.
plt.figure(figsize=(10, 6))
for quarter in quarters:
    plt.plot(df["年份"], df[quarter], marker='o', label=quarter)
plt.xlabel('year')
plt.ylabel('lirun')
plt.title('Company 5 years Every Quarters LiRun')
plt.legend()
plt.grid(True)
plt.show()
记录一下Python的学习过程。