# Scrape the Eastmoney A-share quote grid page by page with Selenium and
# store every table row in a CSV file.
import csv
import random
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Path to the ChromeDriver executable.
driver_path = r'C:\Program Files\Google\Chrome\Application\chromedriver.exe'

url = "https://quote.eastmoney.com/center/gridlist.html#hs_a_board"
output_path = '东方财经沪深京A股.csv'
file_header = ['序号', '代码', '名称', '相关链接', '最新价', '涨跌幅', '涨跌额',
               '成交量(手)', '成交额', '振幅', '最高', '最低', '今开', '昨收',
               '量比', '换手率', '市盈率(动态)', '市净率', '加自选']

# Cell index -> column name, for cells whose own text is the value.
_PLAIN_CELLS = {0: '序号', 7: '成交量(手)', 8: '成交额', 9: '振幅', 13: '昨收',
                14: '量比', 15: '换手率', 16: '市盈率(动态)', 17: '市净率'}
# Cell index -> column name, for cells whose value is read from a nested <span>.
_SPAN_CELLS = {4: '最新价', 5: '涨跌幅', 6: '涨跌额', 10: '最高', 11: '最低',
               12: '今开'}
# parse_row() reads cells[0] .. cells[17].
_MIN_CELLS = 18

_NEXT_BUTTON_XPATH = ("//a[@class='next paginate_button' "
                      "and not(contains(@class, 'disabled'))]")


def _text(node):
    """Return the stripped text of a parsed node, or '' when the node is None."""
    return node.text.strip() if node is not None else ''


def parse_row(cells):
    """Turn the <td> cells of one table row into a dict keyed by column name.

    Returns None when the row has fewer cells than the columns that are read,
    so header, placeholder or malformed rows are skipped instead of raising
    IndexError and aborting the whole run.
    """
    if len(cells) < _MIN_CELLS:
        return None
    row = {name: cells[index].text.strip()
           for index, name in _PLAIN_CELLS.items()}
    row.update((name, _text(cells[index].find('span')))
               for index, name in _SPAN_CELLS.items())
    # The code column holds the link text, not the link target (href).
    row['代码'] = _text(cells[1].find('a'))
    row['名称'] = _text(cells[2].find('a'))
    row['相关链接'] = ' '.join(link.get_text() for link in cells[3].find_all('a'))
    return row


def parse_page(soup):
    """Extract all data rows from a parsed page.

    Returns None when the table wrapper div is absent, otherwise a (possibly
    empty) list of row dicts.
    """
    wrapper = soup.find('div', id='table_wrapper', class_='row')
    if wrapper is None:
        return None
    listview = wrapper.find('div', class_='listview full')
    table = listview.find('table') if listview is not None else None
    body = table.find('tbody') if table is not None else None
    if body is None:
        return []
    parsed = (parse_row(tr.find_all('td')) for tr in body.find_all('tr'))
    return [row for row in parsed if row is not None]


service = Service(executable_path=driver_path)
options = webdriver.ChromeOptions()
driver = webdriver.Chrome(service=service, options=options)

try:
    # Load the page once and page through it by clicking "next". Text appended
    # after the '#fragment' (the former '?p=N') is part of the fragment, not a
    # query string, so it could not select a page on the server side.
    driver.get(url)
    wait = WebDriverWait(driver, 10)
    page = 1
    # Open in 'w' mode: each run produces one file with exactly one header row
    # (append mode added another header in the middle of the file per re-run).
    with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
        # restval='' leaves columns that are never filled (e.g. '加自选') empty.
        writer = csv.DictWriter(csvfile, fieldnames=file_header, restval='')
        writer.writeheader()
        while True:
            # Random pause between pages, then wait for the pagination bar.
            time.sleep(random.randint(1, 10))
            wait.until(EC.presence_of_element_located(
                (By.ID, 'main-table_paginate')))
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            rows = parse_page(soup)
            if rows is not None:
                print(f"第{page}页内容:")
                writer.writerows(rows)

            # Find the "next page" button in the parsed page and click the
            # live element through Selenium.
            pagination_div = soup.find(
                'div', class_='dataTables_paginate paging_input')
            if pagination_div is None:
                print("未找到翻页区域,停止翻页。")
                break
            next_page_button = pagination_div.find(
                'a', class_='next paginate_button')
            if (next_page_button is None
                    or 'disabled' in next_page_button.get('class', [])):
                print("已到达最后一页或未找到下一页按钮,停止翻页。")
                break
            driver.find_element(By.XPATH, _NEXT_BUTTON_XPATH).click()
            page += 1
except Exception as e:
    # Script boundary: report the failure; the browser is closed below.
    print("出现错误:", e)
finally:
    driver.quit()
# Data analysis (数据分析)
# Load the scraped quote table and print a few summary statistics.
import pandas as pd

# Same file name the scraper writes its output to.
csv_path = '东方财经沪深京A股.csv'

# Unit suffix -> multiplier. '万亿' must be tested before '亿' and '万'.
_UNIT_MULTIPLIERS = (('万亿', 1e12), ('亿', 1e8), ('万', 1e4))


def clean_data(value):
    """Convert a quote-table value such as '1.5亿' or '320万' to a float.

    Args:
        value: Raw cell value (string or number). A trailing '万亿', '亿' or
            '万' scales the number by 1e12, 1e8 or 1e4 respectively.

    Returns:
        The numeric value as a float, or NaN when the text is not a number
        (for example a '-' placeholder or a missing cell).
    """
    text = str(value).strip().replace(',', '')
    scale = 1.0
    for suffix, multiplier in _UNIT_MULTIPLIERS:
        if text.endswith(suffix):
            text = text[:-len(suffix)]
            scale = multiplier
            break
    try:
        return float(text) * scale
    except ValueError:
        return float('nan')


def _clean_columns(df):
    """Convert the turnover, volume and change-percent columns to floats in place."""
    for column in ('成交额', '成交量(手)'):
        if column in df.columns:
            df[column] = df[column].apply(clean_data).astype(float)
        else:
            print(f"列名 '{column}' 不存在,请检查CSV文件的列名。")

    if '涨跌幅' in df.columns:
        # astype(str) keeps this working when pandas already parsed the column
        # as numeric; errors='coerce' maps non-numeric placeholders to NaN.
        df['涨跌幅'] = pd.to_numeric(
            df['涨跌幅'].astype(str).str.rstrip('%'), errors='coerce')
    else:
        print("列名 '涨跌幅' 不存在,请检查CSV文件的列名。")


def _report(df):
    """Print total turnover, mean change and the top stocks by change and turnover."""
    turnover = df['成交额']
    change = df['涨跌幅']

    # clean_data scaled turnover to plain units, so divide by 1e8 before
    # printing it with the '亿' suffix.
    print(f"\n总成交额为:{turnover.sum() / 1e8:.2f}亿")
    print(f"平均涨跌幅为:{change.mean()}%")

    top_change = df.loc[change.idxmax()]
    print(f"涨跌幅最大的股票是:{top_change['名称']},涨跌幅为:{top_change['涨跌幅']}%")

    top_turnover = df.loc[turnover.idxmax()]
    print(f"成交额最高的股票是:{top_turnover['名称']},成交额为:{top_turnover['成交额'] / 1e8:.2f}亿")


def main(path=csv_path):
    """Read the CSV at *path*, clean the numeric columns and print the report."""
    df = pd.read_csv(path)

    # Print the column names so mismatches with the expected names are visible.
    print("列名:")
    print(df.columns)

    try:
        _clean_columns(df)
        _report(df)
    except Exception as e:
        # Script boundary: a missing column or an all-NaN column ends up here.
        print(f"处理数据时出现错误:{e}")


if __name__ == "__main__":
    main()