可转债代码交流第三期：利用Python获取集思录可转债单独页面数据

最新推荐文章于 2024-12-05 18:55:18 发布

Sevemt

最新推荐文章于 2024-12-05 18:55:18 发布

阅读量2k

点赞数 7

文章标签： python chrome 爬虫

本文链接：https://blog.csdn.net/Sevemt/article/details/128714926

版权

本文介绍了如何使用Python的selenium库自动化登录集思录网站，然后获取可转债详细页面的数据，包括可转债名称、公司名称、行业信息和到期赎回价等。代码示例展示了模拟登录、遍历并打开每个可转债页面、提取所需信息的过程。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

上期内容讲解了集思录基础页面的数据获取方法（包含了三类可转债的筛选标记），本期内容分享集思录可转债单独详细页面的数据获取方法，获取页面如下：

图一

本期内容我们需要获取上图中用红框圈起来的信息。其它信息大家可根据自己需要添加。该文章的内容只是抛砖引玉，大家有好的想法也可以互相交流。

前两期内容传送门：

第一期：可转债代码交流第一期：利用Python获取宁稳网数据

（包含基本的环境搭建与Python编辑器安装方法）

第二期：可转债代码交流第二期：利用Python获取集思录数据

（包含基本的模拟登录方法）

本人并非计算机专业出身，所有python知识均为自学，所写代码如有不规范的地方，还望指正。

废话不多说，直接上代码！

第一步：导入库（导入各个模块，为了让代码成功运行）

ps:所有的库安装好之后先导入下试试，测试下是否安装成功（导入模块跟上期内容一样）

import selenium
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import pandas as pd

第二步：编写数据获取函数Thelogin4()

1.准备工作，配置运行环境（集思录需要登录后才能获取完整的数据，普通的非会员账号即可）

ps:集思录网站第一次登录不需要进行图片验证（省了很多事情），输入账号密码即可


# Options object that carries every Chrome launch setting used below.
chrome_options = Options()
# Your Jisilu account (replace the masked placeholder).
account = '135*****907'
# Your Jisilu password (replace the masked placeholder).
password1 = '1********q'
# Target URL: still the Jisilu convertible-bond list page; the link to each
# bond's own page is collected from it in a later step.
url = 'https://www.jisilu.cn/web/data/cb/list'
# Run headless (do not open a visible browser window).
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
# Drop the "enable-automation" switch so the site is less likely to detect
# that the browser is automated.
chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
# Do not load images when opening pages.
prefs = {
    'profile.default_content_setting_values': {
        'images': 2,
    },
}
chrome_options.add_experimental_option('prefs', prefs)
# Start Chrome with the settings above. The old `chrome_options=` keyword was
# removed in Selenium 4, so the options object is passed as `options=`.
browser = webdriver.Chrome(options=chrome_options)
browser.get(url)  # open the list page
# browser.maximize_window()  # optionally maximise the window
browser.implicitly_wait(5)
time.sleep(1)  # give the page a moment to settle

2.模拟登录，利用selenium自动输入账号密码，勾选同意选项并点击登录按钮

# `find_element(s)_by_xpath` was removed in Selenium 4.3; elements are located
# through `find_element(By.XPATH, ...)` instead.
from selenium.webdriver.common.by import By

# Click the login button to bring up the account/password form.
buttons1 = browser.find_elements(By.XPATH, '//button[@type="button"][1]')
button1 = buttons1[0]
browser.execute_script("arguments[0].click();", button1)
time.sleep(0.5)
# Type the account and the password.
button2 = browser.find_element(By.XPATH, '//input[@name="user_name"]')
browser.execute_script("arguments[0].click();", button2)
button2.send_keys(account)  # enter the account
button3 = browser.find_element(By.XPATH, '//input[@name="password"]')
browser.execute_script("arguments[0].click();", button3)
button3.send_keys(password1)  # enter the password
time.sleep(0.5)
# Tick the "agree" option.
button4 = browser.find_element(By.XPATH, '//span[@class="agree_text"]')
browser.execute_script("arguments[0].click();", button4)
# Submit the login form.
buttons2 = browser.find_elements(By.XPATH, '//a[@class="btn btn-jisilu"]')
button5 = buttons2[0]
browser.execute_script("arguments[0].click();", button5)
browser.implicitly_wait(5)
# NOTE: printed unconditionally; the login result itself is not verified here.
print("登录成功！")
time.sleep(1)  # wait for the post-login page to load

3.获取页面下每一个可转债单独页面的链接

每一个可转债单独页面的链接都存放在列表页的“代码”这一列中，所以我们只需要把这一列里的链接元素获取并保存下来即可。

# `find_elements_by_xpath` was removed in Selenium 4.3; use the By locator API.
from selenium.webdriver.common.by import By

# Collect every element whose href contains "convert" -- per the article these
# are the links that lead to the individual convertible-bond pages.
buttonss2 = browser.find_elements(By.XPATH, '//*[contains(@href,"convert")]')

4.单独打开每一个可转债的链接，获取其中需要的信息（数据较多，整体获取的时间较长，后续可以考虑加入多线程，节约等待时间）


# `find_element(s)_by_xpath` was removed in Selenium 4.3, and only a missing
# element should be treated as "no data" -- hence these two imports.
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By

list_event = []  # event 1 (notice banner)
list_event1 = []  # event 2 (forced-redemption condition met)
list_name = []  # convertible-bond name
list_compay = []  # company name
list_industry1 = []  # industry level 1
list_industry2 = []  # industry level 2
list_price = []  # redemption price at maturity

# Visit every collected link in turn.
for i in buttonss2:
    # Focus the last open window (the list page once the previous detail tab
    # has been closed); the driver's current window would otherwise be gone.
    browser.switch_to.window(browser.window_handles[-1])

    # Open the link.
    browser.execute_script("arguments[0].click();", i)
    browser.implicitly_wait(5)
    time.sleep(0.5)  # wait for the new page to open
    # Focus the page that has just been opened.
    browser.switch_to.window(browser.window_handles[-1])

    # Event banner (marked in figure 1). Not every bond has one, so a missing
    # element simply yields an empty string.
    event = ""
    try:
        button10 = browser.find_element(By.XPATH, '//div[@class="tips-bg"]')
    except NoSuchElementException:
        pass
    else:
        # Known limitation from the original article: the intent is to keep
        # only the key information, but since a site update the banner is one
        # block of text, so long banners are cut down to their last clause.
        event = button10.text
        if len(event) > 25:
            event = event.split('，')[-1]
    list_event.append(event)

    # Forced-redemption note (the "强赎天计数" field in figure 1); also optional.
    event = ""
    try:
        button11 = browser.find_element(By.XPATH, '//span[@class="font_13"]')
    except NoSuchElementException:
        pass
    else:
        event = button11.text
    # The element may hold a day counter instead; only the exact
    # "condition met" message is kept.
    if event == "已满足强赎条件 !":
        list_event1.append(event)
    else:
        list_event1.append("")

    # Bond name and the corresponding company name.
    buttonss5 = browser.find_elements(By.XPATH, '//span[@class="font_18"]')
    name1 = buttonss5[0].text
    list_name.append(name1)
    # Remove a trailing "R" marker. rstrip() only touches the end of the
    # string (strip() would also eat a leading "R") and is safe on "".
    compay = buttonss5[1].text.rstrip("R")
    list_compay.append(compay)

    # Industry label: only the first two "-"-separated levels are kept.
    button7 = browser.find_element(By.XPATH, '//*[contains(@href,"industry")]')
    industry_parts = button7.text.split('-')
    list_industry1.append(industry_parts[0])
    # A label without a second level gives "" instead of raising IndexError;
    # a trailing "Ⅱ" marker is removed.
    if len(industry_parts) > 1:
        industry2 = industry_parts[1].rstrip("Ⅱ")
    else:
        industry2 = ""
    list_industry2.append(industry2)

    # Redemption price at maturity.
    button8 = browser.find_element(By.XPATH, '//td[@id="redeem_price"]')
    list_price.append(button8.text)

    # Close the detail page; each one must be closed once it has been read.
    browser.close()

5.将获取到的数据转换成DataFrame格式并进行整理


# Gather the per-bond lists filled by the scraping loop into one mapping of
# column name -> values.
dict_all = {
    "事件1": list_event,
    "事件2": list_event1,
    "转债名称": list_name,
    "公司名称": list_compay,
    "行业1": list_industry1,
    "行业2": list_industry2,
    "到期赎回价": list_price,
}

# Columns appear in the order given by this list.
list_columns = ["转债名称", "公司名称", "行业1", "行业2", "到期赎回价", "事件1", "事件2"]
df_data = pd.DataFrame(dict_all, columns=list_columns)

# Drop exchangeable bonds (names containing "EB"). astype(bool) keeps the mask
# boolean even when the table has no rows.
is_exchangeable = (
    df_data['转债名称'].str.contains('EB', regex=False, na=False).astype(bool)
)
# Take an explicit copy of the filtered rows: adding a column to a filtered
# slice would otherwise raise pandas' SettingWithCopyWarning.
df_data3 = df_data.loc[~is_exchangeable].copy()
# Merge the two event columns into a single one.
df_data3["事件"] = df_data3["事件1"] + df_data3["事件2"]

6.退出设置


time.sleep(1)#等待
#重新定位到详细页面，不然无法关闭网页
for handle in browser.window_handles:
    browser.switch_to.window(handle)

print("集思录补充页信息已爬取完毕！")    

browser.close()#关闭当前网页
browser.quit()#完全退出浏览器
return df_data3

第三步：调用数据获取函数

# Run the scraper and keep the table it returns.
df_data3 = Thelogin4()

完整代码如下：整体数据较多，获取信息的时间大概在40分钟左右


import selenium
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import pandas as pd

def _shorten_event(text):
    """Return *text*, cut down to its last '，'-separated clause if longer than 25 chars."""
    if len(text) > 25:
        return text.split('，')[-1]
    return text


def _split_industry(industry):
    """Split an "A-B-..." industry label into ``(level1, level2)``.

    ``level2`` is "" when the label has no second level; a trailing "Ⅱ"
    marker is removed from it.
    """
    parts = industry.split('-')
    level2 = parts[1].rstrip("Ⅱ") if len(parts) > 1 else ""
    return parts[0], level2


def Thelogin4(account='13********7', password1='z*******q'):
    """Log in to Jisilu and scrape the detail page of every convertible bond.

    Opens the bond list page in headless Chrome, logs in, follows every link
    whose href contains "convert", and reads a few fields from each page.

    Args:
        account: Jisilu account name. The default is a masked placeholder and
            must be replaced with a real account.
        password1: Jisilu password. The default is a masked placeholder.

    Returns:
        A pandas DataFrame with the columns 转债名称, 公司名称, 行业1, 行业2,
        到期赎回价, 事件1, 事件2 and 事件 (事件1 + 事件2). Rows whose bond name
        contains "EB" (exchangeable bonds) are removed.

    Raises:
        selenium.common.exceptions.WebDriverException: if a required element
            is missing or the browser fails. The browser is shut down first.
        IndexError: if the login button, the submit button or the two title
            spans of a detail page are not found.
    """
    # Imported locally so this function does not depend on extra file-level
    # imports. `find_element(s)_by_xpath` was removed in Selenium 4.3.
    from selenium.common.exceptions import NoSuchElementException
    from selenium.webdriver.common.by import By

    url = 'https://www.jisilu.cn/web/data/cb/list'

    chrome_options = Options()
    # Run headless (do not open a visible browser window).
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    # Drop the "enable-automation" switch so the site is less likely to detect
    # that the browser is automated.
    chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
    # Do not load images when opening pages.
    prefs = {
        'profile.default_content_setting_values': {
            'images': 2,
        },
    }
    chrome_options.add_experimental_option('prefs', prefs)

    # `chrome_options=` was removed in Selenium 4; pass the object as `options=`.
    browser = webdriver.Chrome(options=chrome_options)

    def text_or_empty(xpath):
        """Return the text of the first element matching *xpath*, or "" if there is none."""
        try:
            element = browser.find_element(By.XPATH, xpath)
        except NoSuchElementException:
            return ""
        return element.text

    # try/finally guarantees the browser process is shut down even when a
    # step below raises.
    try:
        browser.get(url)  # open the list page
        browser.maximize_window()
        # The implicit wait lasts for the whole session, so it is set once.
        browser.implicitly_wait(5)
        time.sleep(1)

        # Click the login button to bring up the account/password form.
        login_buttons = browser.find_elements(By.XPATH, '//button[@type="button"][1]')
        browser.execute_script("arguments[0].click();", login_buttons[0])
        time.sleep(0.5)
        # Type the account and the password.
        user_input = browser.find_element(By.XPATH, '//input[@name="user_name"]')
        browser.execute_script("arguments[0].click();", user_input)
        user_input.send_keys(account)
        password_input = browser.find_element(By.XPATH, '//input[@name="password"]')
        browser.execute_script("arguments[0].click();", password_input)
        password_input.send_keys(password1)
        time.sleep(0.5)
        # Tick the "agree" option.
        agree = browser.find_element(By.XPATH, '//span[@class="agree_text"]')
        browser.execute_script("arguments[0].click();", agree)
        # Submit the login form.
        submit_buttons = browser.find_elements(By.XPATH, '//a[@class="btn btn-jisilu"]')
        browser.execute_script("arguments[0].click();", submit_buttons[0])

        # NOTE: printed unconditionally; the login result is not verified.
        print("登录成功！")
        time.sleep(5)  # wait for the post-login list page to load

        # Every element whose href contains "convert".
        links = browser.find_elements(By.XPATH, '//*[contains(@href,"convert")]')

        rows = []
        for link in links:
            # Focus the last open window (the list page once the previous
            # detail tab has been closed), then open the link.
            browser.switch_to.window(browser.window_handles[-1])
            browser.execute_script("arguments[0].click();", link)
            time.sleep(0.5)
            # Focus the page that has just been opened.
            browser.switch_to.window(browser.window_handles[-1])

            # Optional event banner; long banners keep only their last clause.
            event = _shorten_event(text_or_empty('//div[@class="tips-bg"]'))

            # Optional forced-redemption note; only the exact "condition met"
            # message is kept, any other text is discarded.
            redeem_note = text_or_empty('//span[@class="font_13"]')
            if redeem_note != "已满足强赎条件 !":
                redeem_note = ""

            # Bond name and company name share the same span class.
            titles = browser.find_elements(By.XPATH, '//span[@class="font_18"]')
            name1 = titles[0].text
            # rstrip() removes only a trailing "R" marker and is safe on "".
            compay = titles[1].text.rstrip("R")

            industry = browser.find_element(
                By.XPATH, '//*[contains(@href,"industry")]'
            ).text
            industry1, industry2 = _split_industry(industry)

            price = browser.find_element(By.XPATH, '//td[@id="redeem_price"]').text

            rows.append({
                "转债名称": name1,
                "公司名称": compay,
                "行业1": industry1,
                "行业2": industry2,
                "到期赎回价": price,
                "事件1": event,
                "事件2": redeem_note,
            })

            browser.close()  # close the detail page

        list_columns = ["转债名称", "公司名称", "行业1", "行业2", "到期赎回价", "事件1", "事件2"]
        df_data = pd.DataFrame(rows, columns=list_columns)

        # Drop exchangeable bonds (names containing "EB"). astype(bool) keeps
        # the mask boolean even when there are no rows.
        is_exchangeable = (
            df_data['转债名称'].str.contains('EB', regex=False, na=False).astype(bool)
        )
        # Explicit copy: adding a column to a filtered slice would otherwise
        # raise pandas' SettingWithCopyWarning.
        df_data3 = df_data.loc[~is_exchangeable].copy()
        df_data3["事件"] = df_data3["事件1"] + df_data3["事件2"]

        print("集思录补充页信息已爬取完毕！")
        return df_data3
    finally:
        # quit() closes every remaining window and ends the driver session.
        browser.quit()
    
# Run the scraper and keep the DataFrame it returns.
df_data3 = Thelogin4()

显示结果例图：