Selenium + Python 数据采集样例

话不多说,直接上代码。需要说明的是:登录时的图形验证目前还没有找到自动绕过的办法(查到的各种资料也没能看懂),所以运行脚本时需要人工完成验证。
# 公司股权穿透内容采集
#%%
import random
from time import sleep
from time import time

from selenium import webdriver
from selenium.webdriver.common.by import By


# Launch Chrome, open the site and log in with account + password.
# Elements are located with find_element(By.…): the find_element_by_* helpers
# were removed in Selenium 4.3, while this form works on Selenium 3 and 4.
brower = webdriver.Chrome()

src_url = r"https://aiqicha.baidu.com/?from=pz"
brower.maximize_window()
brower.implicitly_wait(random.randint(3, 8))    # implicit wait used by element lookups

brower.get(src_url)     # navigate to the landing page

sleep(random.randint(5, 10))     # pause so a human can solve the graphical verification
brower.find_element(By.CLASS_NAME, 'login').click()

sleep(random.randint(2, 4))
username_input = brower.find_element(By.ID, 'TANGRAM__PSP_4__userName')
password_input = brower.find_element(By.ID, 'TANGRAM__PSP_4__password')

username_input.clear()
password_input.clear()

# The credentials below are masked placeholders; randomised pauses between
# the steps space out the typing and the click.
username_input.send_keys('1851xxxxxx659')
sleep(random.randint(4, 10))
password_input.send_keys('xxxxx')
sleep(random.randint(1, 2))
brower.find_element(By.ID, 'TANGRAM__PSP_4__submit').click()
# Account/password login has been submitted.

# NOTE: an earlier experiment read brower.get_cookies() after login, re-added
# a few of the cookies with brower.add_cookie() and refreshed the page; it is
# not part of the current flow.

# Remember the current page URL; it is used below as the start page for searches.
url_s0 = brower.current_url

# %%
def get_info(brower, comp_list, urls0):
    """Scrape a credit code and upstream entity names for each company.

    For every name in ``comp_list`` the browser loads ``urls0``, submits the
    name through the search box, opens the first search result, reads the text
    at the position where the detail page is expected to show the credit code,
    follows the equity-penetration link and collects the non-empty
    ``branch-name`` texts found under ``upwardNode`` elements.

    Elements are located with ``find_element(By.…)`` / ``find_elements(By.…)``
    because the ``find_element_by_*`` helpers were removed in Selenium 4.3.

    Args:
        brower: Selenium WebDriver used for all navigation and lookups.
        comp_list: Iterable of company names to search for.
        urls0: URL of the page that contains the search box.

    Returns:
        dict: ``{company_name: [credit_code_text, [branch_name_text, ...]]}``;
        empty when ``comp_list`` is empty.

    Raises:
        Lookup failures raised by the driver (e.g. a missing element) are not
        caught here and propagate to the caller.
    """
    # Absolute XPaths tied to the current page layout; they will need updating
    # if the site's DOM changes.
    search_input_xpath = '//*[@id="aqc-search-input"]'
    search_button_xpath = '/html/body/div[1]/div[1]/div/div[2]/div[2]/button'
    first_result_xpath = '/html/body/div[1]/div[1]/div/div[1]/div[2]/div[2]/div/div/div[2]/div/h3/a'
    credit_code_xpath = '/html/body/div[1]/div[1]/div/div[2]/div[1]/div[1]/div[2]/div[4]/div[3]/span[1]'
    equity_link_xpath = '/html/body/div[1]/div[1]/div/div[2]/div[1]/div[1]/div[2]/div[6]/a[2]'

    info_dicts = {}
    for comp in comp_list:
        # Start every search from the same page.
        brower.get(urls0)
        sleep(random.randint(4, 9))
        search_box = brower.find_element(By.XPATH, search_input_xpath)
        search_box.clear()
        search_box.send_keys(comp)
        brower.find_element(By.XPATH, search_button_xpath).click()

        # Open the first company listed in the search results.
        sleep(random.randint(0, 4))
        detail_url = brower.find_element(By.XPATH, first_result_xpath).get_attribute('href')
        brower.get(detail_url)
        credit_code = brower.find_element(By.XPATH, credit_code_xpath).text

        # Follow the equity-penetration link from the detail page.
        sleep(random.randint(0, 3))
        equity_url = brower.find_element(By.XPATH, equity_link_xpath).get_attribute('href')
        brower.get(equity_url)

        # Collect every non-empty branch name under the upward nodes.
        upstream_names = []
        for node in brower.find_elements(By.CLASS_NAME, 'upwardNode'):
            for name_el in node.find_elements(By.CLASS_NAME, 'branch-name'):
                name = name_el.text  # read once: each .text is a driver round-trip
                if name:
                    upstream_names.append(name)

        info_dicts[comp] = [credit_code, upstream_names]
        # Randomised pause between companies.
        sleep(random.randint(5, 11))
    return info_dicts

#%%
# Start scraping: look up each target company and print what was collected.

target_companies = [
    '苏银凯基消费金融有限公司',
    '江苏苏宁银行股份有限公司',
    '江苏银行股份有限公司',
]
scraped = get_info(brower, target_companies, url_s0)
print(scraped)

# %%
# Close the browser and end the WebDriver session.
brower.quit()
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值