python selenium爬取东方财富沪深京A股并对其做数据分析

luky！

已于 2024-11-13 09:08:54 修改

阅读量875

点赞数 13

文章标签： python selenium 开发语言

于 2024-11-04 08:43:39 首次发布

本文链接：https://blog.csdn.net/qq_68809241/article/details/143466311

版权

import csv
import random
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Path to the local ChromeDriver executable that Selenium launches.
driver_path = r'C:\Program Files\Google\Chrome\Application\chromedriver.exe'
service = Service(executable_path=driver_path)
options = webdriver.ChromeOptions()
driver = webdriver.Chrome(options=options, service=service)

# Listing page that the scraping loop opens.
url = "https://quote.eastmoney.com/center/gridlist.html#hs_a_board"

# CSV column headers, in the order they are written to the output file.
file_header = [
    '序号', '代码', '名称', '相关链接', '最新价', '涨跌幅', '涨跌额',
    '成交量(手)', '成交额', '振幅', '最高', '最低', '今开', '昨收',
    '量比', '换手率', '市盈率(动态)', '市净率', '加自选',
]

# Name of the CSV file the scraped rows are written to.
_CSV_PATH = '东方财经沪深京A股.csv'

# A data row is indexed up to cells[17]; anything shorter is a header,
# placeholder or "no data" row and is skipped.
_MIN_CELLS = 18


def _span_text(cell):
    """Return the stripped text of the first <span> in *cell*, or '' if absent."""
    span = cell.find('span')
    return span.text.strip() if span else ''


def _link_text(cell):
    """Return the stripped text of the first <a> in *cell*, else the cell text."""
    link = cell.find('a')
    return (link.text if link else cell.text).strip()


def _parse_row(cells):
    """Map the <td> cells of one table row onto the CSV column names.

    The '加自选' column is intentionally left out; the writer fills it with
    an empty string.
    """
    return {
        '序号': cells[0].text.strip(),
        # Store the visible code text, not the link's href.
        '代码': _link_text(cells[1]),
        '名称': _link_text(cells[2]),
        '相关链接': '&nbsp;'.join(a.get_text() for a in cells[3].find_all('a')),
        '最新价': _span_text(cells[4]),
        '涨跌幅': _span_text(cells[5]),
        '涨跌额': _span_text(cells[6]),
        '成交量(手)': cells[7].text.strip(),
        '成交额': cells[8].text.strip(),
        '振幅': cells[9].text.strip(),
        '最高': _span_text(cells[10]),
        '最低': _span_text(cells[11]),
        '今开': _span_text(cells[12]),
        '昨收': cells[13].text.strip(),
        '量比': cells[14].text.strip(),
        '换手率': cells[15].text.strip(),
        '市盈率(动态)': cells[16].text.strip(),
        '市净率': cells[17].text.strip(),
    }


try:
    # Load the listing once. Later pages are reached by clicking the "next"
    # button; re-issuing driver.get() with a "?p=" suffix only alters the URL
    # fragment and is not what advances the table.
    driver.get(url)
    wait = WebDriverWait(driver, 10)
    page = 1

    # Open the output once in write mode so a re-run starts from a clean file
    # with exactly one header row.
    with open(_CSV_PATH, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=file_header, restval='')
        writer.writeheader()

        while True:
            # Random pause: gives the table time to refresh after a click and
            # spaces out successive page loads.
            time.sleep(random.randint(1, 10))

            # Wait until the pagination widget is present in the DOM.
            wait.until(EC.presence_of_element_located((By.ID, 'main-table_paginate')))

            soup = BeautifulSoup(driver.page_source, 'html.parser')
            target_div = soup.find('div', id='table_wrapper', class_='row')
            if target_div:
                print(f"第{page}页内容:")
                list_div = target_div.find('div', class_='listview full')
                table = list_div.find('table') if list_div else None
                table_body = table.find('tbody') if table else None
                if table_body:
                    for row in table_body.find_all('tr'):
                        cells = row.find_all('td')
                        if len(cells) < _MIN_CELLS:
                            continue
                        writer.writerow(_parse_row(cells))
                    # Persist each page as soon as it is parsed.
                    csvfile.flush()

            # Locate the "next page" control in the parsed snapshot.
            pagination_div = soup.find('div', class_='dataTables_paginate paging_input')
            if not pagination_div:
                print("未找到翻页区域，停止翻页。")
                break

            next_page_button = pagination_div.find('a', class_='next paginate_button')
            if not next_page_button or 'disabled' in next_page_button.get('class', []):
                print("已到达最后一页或未找到下一页按钮，停止翻页。")
                break

            # Click the live element through Selenium to advance the table.
            driver.find_element(
                By.XPATH,
                "//a[@class='next paginate_button' and not(contains(@class, 'disabled'))]",
            ).click()
            page += 1

except Exception as e:
    # Top-level boundary of the script: report the failure and fall through
    # to the browser cleanup below.
    print("出现错误:", e)
finally:
    driver.quit()

数据分析

import pandas as pd

# Load the CSV produced by the scraping step. The file name must match the one
# the scraper writes ('东方财经沪深京A股.csv'). Stock codes are read as text so
# leading zeros are not lost to integer parsing.
df = pd.read_csv('东方财经沪深京A股.csv', dtype={'代码': str})

# Print the column names so any mismatch with the expected headers is visible.
print("列名：")
print(df.columns)

# Cleaning helper: convert display strings with Chinese magnitude units to numbers.
def clean_data(value):
    """Convert a display string such as '1.5亿' or '2万' into a float.

    Args:
        value: A cell value. Strings may carry the unit suffixes '亿' (1e8)
            and/or '万' (1e4), thousands separators, and surrounding spaces.

    Returns:
        For strings: the numeric value as a float, or NaN when the text is not
        a number (for example the placeholder '-' or an empty string).
        Non-string values (numbers, NaN, None) are returned unchanged.
    """
    if not isinstance(value, str):
        return value

    text = value.strip().replace(',', '')
    factor = 1.0
    # Apply every unit that appears, so combined suffixes like '万亿' work.
    for unit, multiplier in (('亿', 1e8), ('万', 1e4)):
        if unit in text:
            text = text.replace(unit, '')
            factor *= multiplier

    try:
        return float(text) * factor
    except ValueError:
        # Unparseable text becomes NaN so later float conversion cannot fail.
        return float('nan')

# Clean the columns, convert them to numeric types, and print summary
# statistics. Each statistic is computed only when its source column exists.
try:
    # Turnover and volume use unit suffixes; values that still are not numeric
    # after clean_data (e.g. a '-' placeholder) are coerced to NaN.
    for column in ('成交额', '成交量(手)'):
        if column in df.columns:
            df[column] = pd.to_numeric(df[column].apply(clean_data), errors='coerce')
        else:
            print(f"列名 '{column}' 不存在，请检查CSV文件的列名。")

    # Change percent: strip a trailing '%' and coerce non-numeric text to NaN.
    # astype(str) keeps this working when the column was already parsed as numbers.
    if '涨跌幅' in df.columns:
        df['涨跌幅'] = pd.to_numeric(
            df['涨跌幅'].astype(str).str.rstrip('%'), errors='coerce'
        )
    else:
        print("列名 '涨跌幅' 不存在，请检查CSV文件的列名。")

    has_turnover = '成交额' in df.columns
    has_change = '涨跌幅' in df.columns

    # Total turnover. The column holds plain units after cleaning, so divide
    # by 1e8 before printing with the '亿' suffix.
    if has_turnover:
        total_turnover = df['成交额'].sum()
        print(f"\n总成交额为：{total_turnover / 1e8:.2f}亿")

    if has_change:
        # Mean change percent (NaN values are skipped by pandas).
        average_change_percent = df['涨跌幅'].mean()
        print(f"平均涨跌幅为：{average_change_percent}%")

        # Row with the largest change percent; skipped when every value is NaN.
        if df['涨跌幅'].notna().any():
            max_change_percent_stock = df.loc[df['涨跌幅'].idxmax()]
            print(f"涨跌幅最大的股票是：{max_change_percent_stock['名称']}，涨跌幅为：{max_change_percent_stock['涨跌幅']}%")

    # Row with the highest turnover; skipped when every value is NaN.
    if has_turnover and df['成交额'].notna().any():
        max_turnover_stock = df.loc[df['成交额'].idxmax()]
        print(f"成交额最高的股票是：{max_turnover_stock['名称']}，成交额为：{max_turnover_stock['成交额'] / 1e8:.2f}亿")

except Exception as e:
    # Top-level boundary of the analysis script: report and stop.
    print(f"处理数据时出现错误：{e}")