Fetching East Money (东方财富) Stock Research Report Data with a Web Crawler

East Money's research report data is an important source for data analysis and quantitative strategy work. This article implements a Python web crawler that fetches report release data in bulk. The steps are as follows:

First, open the stock research report listing page: https://data.eastmoney.com/report/stock.jshtml

When the page loads, the site pops up an image.

This image is a mask (overlay) layer that blocks the crawler from reaching the data, so it must be closed first. Inspecting the page source locates the control that dismisses it: a close-button <img> whose src ends in ic_close.png.

The following Python code closes the overlay:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time

# Configuration
driver_path = 'chromedriver.exe'  # note: use the full path to the folder containing chromedriver.exe
service = Service(driver_path)
options = webdriver.ChromeOptions()
# options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.5735.198 Safari/537.36")
# options.add_argument('--disable-blink-features=AutomationControlled')  # bypass the browser's automation detection
driver = webdriver.Chrome(service=service, options=options)

# Target URL
driver.get("https://data.eastmoney.com/report/stock.jshtml")

try:
    # Wait for the close button to appear and become clickable
    close_button = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, "img[src='https://emcharts.dfcfw.com/fullscreengg/ic_close.png']"))
    )
    # Click the close button
    close_button.click()
    print("Mask layer closed successfully")
except Exception as e:
    print(f"Error closing the mask layer: {e}")

Next, extract the published report entries. Each row of the listing table (the div with id stock_table) carries the fields listed below.

The following Python code pulls the report data out of each row:

# Dictionary to hold the scraped data
data = {
    '序号': [],
    '股票代码': [],
    '股票名称': [],
    '详细链接': [],
    '报告标题': [],
    '评级': [],
    '评级变动': [],
    '机构名称': [],
    '近一月个股研报数': [],
    '2024盈利预测收益': [],
    '2024盈利预测市盈率': [],
    '2025盈利预测收益': [],
    '2025盈利预测市盈率': [],
    '行业': [],
    '日期': []
}

# Number of pages to crawl
pages_to_crawl = 100

# Crawl the specified number of pages
for page in range(pages_to_crawl):
    try:
        # Wait for the table to load (at least one data row present)
        WebDriverWait(driver, 60).until(
            EC.presence_of_element_located((By.XPATH, "//div[@id='stock_table']//tbody/tr"))
        )

        # Extract the data
        rows = driver.find_elements(By.XPATH, "//div[@id='stock_table']//tbody/tr")
        for row in rows:
            try:
                # Serial number
                serial = row.find_element(By.XPATH, "./td[1]").text.strip()
                # Stock code
                stock_code = row.find_element(By.XPATH, "./td[2]/a").text.strip()
                # Stock name
                stock_name = row.find_element(By.XPATH, "./td[3]/a/span").get_attribute("title")
                # Detail link
                detail_url = row.find_element(By.XPATH, "./td[4]/a[1]").get_attribute("href")
                # Report title
                report_title = row.find_element(By.XPATH, "./td[5]/a").text.strip()
                # Rating
                rating = row.find_element(By.XPATH, "./td[6]").text.strip()
                # Rating change
                rating_change = row.find_element(By.XPATH, "./td[7]").text.strip()
                # Institution name
                institution = row.find_element(By.XPATH, "./td[8]/a").text.strip()
                # Reports on this stock in the past month
                research_count = row.find_element(By.XPATH, "./td[9]").text.strip()
                # 2024 EPS forecast
                predict_2024_eps = row.find_element(By.XPATH, "./td[10]").text.strip()
                # 2024 forecast P/E
                predict_2024_pe = row.find_element(By.XPATH, "./td[11]").text.strip()
                # 2025 EPS forecast
                predict_2025_eps = row.find_element(By.XPATH, "./td[12]").text.strip()
                # 2025 forecast P/E
                predict_2025_pe = row.find_element(By.XPATH, "./td[13]").text.strip()
                # Industry
                industry = row.find_element(By.XPATH, "./td[14]/a").text.strip()
                # Date
                date = row.find_element(By.XPATH, "./td[15]").text.strip()

                # Append to the dictionary
                data['序号'].append(serial)
                data['股票代码'].append(stock_code)
                data['股票名称'].append(stock_name)
                data['详细链接'].append(detail_url)
                data['报告标题'].append(report_title)
                data['评级'].append(rating)
                data['评级变动'].append(rating_change)
                data['机构名称'].append(institution)
                data['近一月个股研报数'].append(research_count)
                data['2024盈利预测收益'].append(predict_2024_eps)
                data['2024盈利预测市盈率'].append(predict_2024_pe)
                data['2025盈利预测收益'].append(predict_2025_eps)
                data['2025盈利预测市盈率'].append(predict_2025_pe)
                data['行业'].append(industry)
                data['日期'].append(date)

            except Exception as e:
                print(f"解析行时出错: {e}")
                continue

To collect more data, the crawler must page through the listing.

In the pager's page source, the links sit inside a div with class pagerbox, and the next-page link is labeled 下一页 ("next page").

The following Python code clicks that link to turn the page (the snippet lives inside the page loop; its matching except handler appears in the complete listing below):

        # Click "next page"
        if page < pages_to_crawl - 1:  # don't click "next page" on the last page
            next_page_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, "//div[@class='pagerbox']/a[contains(text(), '下一页')]"))
            )
            next_page_button.click()
            time.sleep(2)  # wait for the page to load
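
The fixed time.sleep(2) can be flaky on slow connections. A more robust variant, sketched below on the assumption that East Money re-renders the table rows when the page changes, waits until a row from the old page is detached from the DOM:

        # Sketch: replace the fixed sleep with an explicit staleness wait.
        # 'rows' is the row list extracted above; this assumes the table is
        # re-rendered when the page changes.
        old_first_row = rows[0]
        next_page_button.click()
        WebDriverWait(driver, 10).until(EC.staleness_of(old_first_row))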

Finally, save the scraped data to the file report_data.xlsx.
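
The save step itself is two lines: build a DataFrame from the dictionary and write it out (pandas needs the openpyxl package installed to write .xlsx files):

# Convert to a DataFrame and save as Excel
df = pd.DataFrame(data)
df.to_excel(r".\report_data.xlsx", index=False)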

The complete code follows:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time

# Configuration
driver_path = 'chromedriver.exe'  # note: use the full path to the folder containing chromedriver.exe
service = Service(driver_path)
options = webdriver.ChromeOptions()

# options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.5735.198 Safari/537.36")
# options.add_argument('--disable-blink-features=AutomationControlled')  # bypass the browser's automation detection
driver = webdriver.Chrome(service=service, options=options)

# Target URL
driver.get("https://data.eastmoney.com/report/stock.jshtml")

try:
    # Wait for the close button to appear and become clickable
    close_button = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, "img[src='https://emcharts.dfcfw.com/fullscreengg/ic_close.png']"))
    )
    # Click the close button
    close_button.click()
    print("Mask layer closed successfully")
except Exception as e:
    print(f"Error closing the mask layer: {e}")

# Dictionary to hold the scraped data
data = {
    '序号': [],
    '股票代码': [],
    '股票名称': [],
    '详细链接': [],
    '报告标题': [],
    '评级': [],
    '评级变动': [],
    '机构名称': [],
    '近一月个股研报数': [],
    '2024盈利预测收益': [],
    '2024盈利预测市盈率': [],
    '2025盈利预测收益': [],
    '2025盈利预测市盈率': [],
    '行业': [],
    '日期': []
}

# Number of pages to crawl
pages_to_crawl = 100

# Crawl the specified number of pages
for page in range(pages_to_crawl):
    try:
        # Wait for the table to load (at least one data row present)
        WebDriverWait(driver, 60).until(
            EC.presence_of_element_located((By.XPATH, "//div[@id='stock_table']//tbody/tr"))
        )

        # Extract the data
        rows = driver.find_elements(By.XPATH, "//div[@id='stock_table']//tbody/tr")
        for row in rows:
            try:
                # Serial number
                serial = row.find_element(By.XPATH, "./td[1]").text.strip()
                # Stock code
                stock_code = row.find_element(By.XPATH, "./td[2]/a").text.strip()
                # Stock name
                stock_name = row.find_element(By.XPATH, "./td[3]/a/span").get_attribute("title")
                # Detail link
                detail_url = row.find_element(By.XPATH, "./td[4]/a[1]").get_attribute("href")
                # Report title
                report_title = row.find_element(By.XPATH, "./td[5]/a").text.strip()
                # Rating
                rating = row.find_element(By.XPATH, "./td[6]").text.strip()
                # Rating change
                rating_change = row.find_element(By.XPATH, "./td[7]").text.strip()
                # Institution name
                institution = row.find_element(By.XPATH, "./td[8]/a").text.strip()
                # Reports on this stock in the past month
                research_count = row.find_element(By.XPATH, "./td[9]").text.strip()
                # 2024 EPS forecast
                predict_2024_eps = row.find_element(By.XPATH, "./td[10]").text.strip()
                # 2024 forecast P/E
                predict_2024_pe = row.find_element(By.XPATH, "./td[11]").text.strip()
                # 2025 EPS forecast
                predict_2025_eps = row.find_element(By.XPATH, "./td[12]").text.strip()
                # 2025 forecast P/E
                predict_2025_pe = row.find_element(By.XPATH, "./td[13]").text.strip()
                # Industry
                industry = row.find_element(By.XPATH, "./td[14]/a").text.strip()
                # Date
                date = row.find_element(By.XPATH, "./td[15]").text.strip()

                # Append to the dictionary
                data['序号'].append(serial)
                data['股票代码'].append(stock_code)
                data['股票名称'].append(stock_name)
                data['详细链接'].append(detail_url)
                data['报告标题'].append(report_title)
                data['评级'].append(rating)
                data['评级变动'].append(rating_change)
                data['机构名称'].append(institution)
                data['近一月个股研报数'].append(research_count)
                data['2024盈利预测收益'].append(predict_2024_eps)
                data['2024盈利预测市盈率'].append(predict_2024_pe)
                data['2025盈利预测收益'].append(predict_2025_eps)
                data['2025盈利预测市盈率'].append(predict_2025_pe)
                data['行业'].append(industry)
                data['日期'].append(date)

            except Exception as e:
                print(f"解析行时出错: {e}")
                continue

        # Click "next page"
        if page < pages_to_crawl - 1:  # don't click "next page" on the last page
            next_page_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, "//div[@class='pagerbox']/a[contains(text(), '下一页')]"))
            )
            next_page_button.click()
            time.sleep(2)  # wait for the page to load

    except Exception as e:
        print(f"爬取第 {page + 1} 页时出错: {e}")
        break

# Convert to a DataFrame
df = pd.DataFrame(data)
df.to_excel(r".\report_data.xlsx", index=False)

# Close the browser
driver.quit()
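
As a quick sanity check, the exported file can be read back and the forecast columns converted from text to numbers; errors='coerce' turns placeholders such as '-' into NaN (a sketch using only pandas; reading .xlsx also requires openpyxl):

import pandas as pd

# Reload the exported file and coerce the forecast columns to numeric
df = pd.read_excel("report_data.xlsx")
for col in ['2024盈利预测收益', '2024盈利预测市盈率',
            '2025盈利预测收益', '2025盈利预测市盈率']:
    df[col] = pd.to_numeric(df[col], errors='coerce')
print(df.head())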