Aiqicha (爱企查) scraper with Selenium

I'm posting this hoping someone can help me implement a simulated login. I can already scrape the first page of results, but the second page requires logging in and I don't know how to handle that. I'm also happy to just discuss ideas, or to see a version that logs in first and then scrapes.
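One idea I can offer (untested against Aiqicha's current login flow, so treat it as a sketch): log in manually once in a Selenium-driven window, dump the cookies to disk with pickle, and re-inject them at the start of every later run so the scraper begins already authenticated. The file name aiqicha_cookies.pkl and the input() pause are my own choices, not part of the original script.

import pickle
from selenium import webdriver

def login_and_save_cookies(cookie_file="aiqicha_cookies.pkl"):
    driver = webdriver.Chrome()
    driver.get("https://aiqicha.baidu.com/")
    # Complete the QR-code / password login by hand in the opened window
    input("Finish logging in manually, then press Enter to save the cookies...")
    with open(cookie_file, "wb") as f:
        pickle.dump(driver.get_cookies(), f)
    driver.quit()

def start_logged_in(cookie_file="aiqicha_cookies.pkl"):
    driver = webdriver.Chrome()
    # add_cookie only accepts cookies for the domain that is currently open
    driver.get("https://aiqicha.baidu.com/")
    with open(cookie_file, "rb") as f:
        for cookie in pickle.load(f):
            cookie.pop("expiry", None)  # some stored fields may be rejected by add_cookie
            driver.add_cookie(cookie)
    driver.refresh()  # reload so the restored session takes effect
    return driver

If the Baidu session is tied to more than cookies (localStorage, device fingerprinting), this will not be enough, but it is the simplest thing to try before building a fully simulated login.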

import time
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import openpyxl as op
# Note the inverted naming: `ws` is actually the Workbook and `wb` is the worksheet
ws=op.Workbook()
wb=ws.create_sheet(index=0)
# Header row
wb.cell(row=1,column=1,value='企业名称')
wb.cell(row=1,column=2,value='统一社会信用代码')
wb.cell(row=1,column=3,value='法定代表人')
wb.cell(row=1,column=4,value='经营状态')
wb.cell(row=1,column=5,value='成立日期')
wb.cell(row=1,column=6,value='行政区划')
wb.cell(row=1,column=7,value='注册资本')
wb.cell(row=1,column=8,value='实缴资本')
wb.cell(row=1,column=9,value='企业类型')
wb.cell(row=1,column=10,value='所属行业')
wb.cell(row=1,column=11,value='工商注册号')
wb.cell(row=1,column=12,value='组织机构代码')
wb.cell(row=1,column=13,value='纳税人识别号')
wb.cell(row=1,column=14,value='纳税人资质')
wb.cell(row=1,column=15,value='营业期限')
wb.cell(row=1,column=16,value='核准日期')
wb.cell(row=1,column=17,value='登记机关')
wb.cell(row=1,column=18,value='参保人数')
wb.cell(row=1,column=19,value='曾用名')
wb.cell(row=1,column=20,value='注册地址')
wb.cell(row=1,column=21,value='经营范围')
wb.cell(row=1,column=22,value='主管部门名称')
wb.cell(row=1,column=23,value='持股比例')
wb.cell(row=1,column=24,value='认缴出资额')
wb.cell(row=1,column=25,value='认缴出资日期')

# Make get() return immediately instead of waiting for the full page load.
# Note: these capabilities are never actually passed to the driver below;
# with Selenium 4 the equivalent is options.page_load_strategy = "none".
desired_capabilities = DesiredCapabilities.CHROME
desired_capabilities["pageLoadStrategy"] = "none"

# Configure the Chrome driver
options = webdriver.ChromeOptions()
# Disable image loading in Chrome to speed things up
#options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
# Run without a visible window
#options.add_argument('--headless')
# Create the Chrome driver
driver = webdriver.Chrome(options=options)
# Search keyword
theme = "施工图审查"
# Number of records to scrape
papers_need = 20


# Open the search results page
#driver.get("https://aiqicha.baidu.com/s?q=%E6%96%BD%E5%B7%A5%E5%9B%BE%E5%AE%A1%E6%9F%A5&t=0")
driver.get("https://aiqicha.baidu.com/s?q=%E6%96%BD%E5%B7%A5%E5%9B%BE%E5%AE%A1%E6%9F%A5")
# Type the keyword into the search box
WebDriverWait( driver, 100 ).until( EC.presence_of_element_located( (By.XPATH ,'''//*[@id="aqc-header-search-input"]''') ) ).send_keys(theme)
# Click the search button
WebDriverWait( driver, 100 ).until( EC.presence_of_element_located( (By.XPATH ,"/html/body/div[1]/div[1]/header/div/div[2]/button") ) ).click()
time.sleep(3)

# Record counter; also controls how many entries get scraped
count = 1
# Keep paging through the results while the scraped count is below the target
while count <= papers_need:
    # Give the page a moment to finish loading
    time.sleep(3)

    # Collect the company title links on the current results page
    title_list = WebDriverWait( driver, 10 ).until( EC.presence_of_all_elements_located( (By.CSS_SELECTOR  ,"body > div.base.page-search.has-search-tab > div.aqc-content-wrapper.has-footer > div > div.main > div.list-wrap > div.company-list > div > div:nth-child(-n+10) > div.info > div > h3 > a") ) )

    # Loop over the entries on this page
    for i in range(len(title_list)):
        try:

            # Click the entry; the company detail page opens in a new tab
            title_list[i].click()
            # Get all window handles
            n = driver.window_handles
            # Switch the driver to the newest tab
            driver.switch_to.window(n[-1])
            # Start extracting the detail-page fields
            title = WebDriverWait( driver, 10 ).until( EC.presence_of_element_located((By.XPATH ,"/html/body/div[1]/div[2]/div/div[7]/div[1]/div[3]/table/tbody/tr[1]/td[2]/span") ) ).text
            code = WebDriverWait( driver, 10 ).until( EC.presence_of_element_located((By.XPATH ,"/html/body/div[1]/div[2]/div/div[7]/div[1]/div[3]/table/tbody/tr[1]/td[4]") ) ).text
            fadindaibiao = WebDriverWait( driver, 10 ).until( EC.presence_of_element_located((By.XPATH ,"/html/body/div[1]/div[2]/div/div[7]/div[1]/div[3]/table/tbody/tr[2]/td[2]/div[2]/a[1]") ) ).text
            jingyingzhuangtai = WebDriverWait( driver, 10 ).until( EC.presence_of_element_located((By.XPATH  ,'/html/body/div[1]/div[2]/div/div[7]/div[1]/div[3]/table/tbody/tr[2]/td[4]') ) ).text
            chenglidata = WebDriverWait( driver, 10 ).until( EC.presence_of_element_located((By.XPATH  ,'/html/body/div[1]/div[2]/div/div[7]/div[1]/div[3]/table/tbody/tr[3]/td[2]') ) ).text
            area=  WebDriverWait( driver, 10 ).until( EC.presence_of_element_located((By.XPATH  ,'/html/body/div[1]/div[2]/div/div[7]/div[1]/div[3]/table/tbody/tr[3]/td[4]') ) ).text
            zhuceziben=  WebDriverWait( driver, 10 ).until( EC.presence_of_element_located((By.XPATH  ,'/html/body/div[1]/div[2]/div/div[7]/div[1]/div[3]/table/tbody/tr[4]/td[2]') ) ).text
            shijiaoziben=  WebDriverWait( driver, 10 ).until( EC.presence_of_element_located((By.XPATH  ,'/html/body/div[1]/div[2]/div/div[7]/div[1]/div[3]/table/tbody/tr[4]/td[4]') ) ).text
            qiyeleixing=  WebDriverWait( driver, 10 ).until( EC.presence_of_element_located((By.XPATH  ,'/html/body/div[1]/div[2]/div/div[7]/div[1]/div[3]/table/tbody/tr[5]/td[2]') ) ).text
            suosuhangye=  WebDriverWait( driver, 10 ).until( EC.presence_of_element_located((By.XPATH  ,'/html/body/div[1]/div[2]/div/div[7]/div[1]/div[3]/table/tbody/tr[5]/td[4]') ) ).text
            gongshangzhucehao= WebDriverWait( driver, 10 ).until( EC.presence_of_element_located((By.XPATH  ,'/html/body/div[1]/div[2]/div/div[7]/div[1]/div[3]/table/tbody/tr[6]/td[2]/span') ) ).text
            zuzhijigodaima= WebDriverWait( driver, 10 ).until( EC.presence_of_element_located((By.XPATH  ,'/html/body/div[1]/div[2]/div/div[7]/div[1]/div[3]/table/tbody/tr[6]/td[4]/span') ) ).text
            nasuorenshibiehao= WebDriverWait( driver, 10 ).until( EC.presence_of_element_located((By.XPATH  ,'/html/body/div[1]/div[2]/div/div[7]/div[1]/div[3]/table/tbody/tr[7]/td[2]/span') ) ).text
            nasuorenzizhi= WebDriverWait( driver, 10 ).until( EC.presence_of_element_located((By.XPATH  ,'/html/body/div[1]/div[2]/div/div[7]/div[1]/div[3]/table/tbody/tr[7]/td[4]') ) ).text
            yingyeqingxian= WebDriverWait( driver, 10 ).until( EC.presence_of_element_located((By.XPATH  ,'/html/body/div[1]/div[2]/div/div[7]/div[1]/div[3]/table/tbody/tr[8]/td[2]') ) ).text
            hezhunriqi= WebDriverWait( driver, 10 ).until( EC.presence_of_element_located((By.XPATH  ,'/html/body/div[1]/div[2]/div/div[7]/div[1]/div[3]/table/tbody/tr[8]/td[4]') ) ).text
            dengjijiguan= WebDriverWait( driver, 10 ).until( EC.presence_of_element_located((By.XPATH  ,'/html/body/div[1]/div[2]/div/div[7]/div[1]/div[3]/table/tbody/tr[9]/td[2]') ) ).text
            canbaorenshu= WebDriverWait( driver, 10 ).until( EC.presence_of_element_located((By.XPATH  ,'/html/body/div[1]/div[2]/div/div[7]/div[1]/div[3]/table/tbody/tr[9]/td[4]') ) ).text
            cengyongming= WebDriverWait( driver, 10 ).until( EC.presence_of_element_located((By.XPATH  ,'/html/body/div[1]/div[2]/div/div[7]/div[1]/div[3]/table/tbody/tr[10]/td[2]/p') ) ).text
            address= WebDriverWait( driver, 10 ).until( EC.presence_of_element_located((By.XPATH  ,'/html/body/div[1]/div[2]/div/div[7]/div[1]/div[3]/table/tbody/tr[11]/td[2]/span[1]') ) ).text
            jingyingfanwei= WebDriverWait( driver, 10 ).until( EC.presence_of_element_located((By.XPATH  ,'/html/body/div[1]/div[2]/div/div[7]/div[1]/div[3]/table/tbody/tr[12]/td[2]/div') ) ).text
            gudongmingcheng= WebDriverWait( driver, 10 ).until( EC.presence_of_element_located((By.XPATH  ,'/html/body/div[1]/div[2]/div/div[7]/div[1]/div[4]/div[1]/table/tbody/tr/td[2]/div/div/div[2]/div[1]/div/div[1]/a') ) ).text
            chigubili= WebDriverWait( driver, 10 ).until( EC.presence_of_element_located((By.XPATH  ,'/html/body/div[1]/div[2]/div/div[7]/div[1]/div[4]/div[1]/table/tbody/tr/td[3]/div/div/span') ) ).text
            money=  WebDriverWait( driver, 10 ).until( EC.presence_of_element_located((By.XPATH  ,'/html/body/div[1]/div[2]/div/div[7]/div[1]/div[4]/div[1]/table/tbody/tr/td[4]/div/span') ) ).text
            data=  WebDriverWait( driver, 10 ).until( EC.presence_of_element_located((By.XPATH  ,'/html/body/div[1]/div[2]/div/div[7]/div[1]/div[4]/div[1]/table/tbody/tr/td[5]/div/span') ) ).text
            # Re-read these optional fields; if any lookup fails, fall back to '无' for all of them
            try:

                 zhuceziben=  WebDriverWait( driver, 10 ).until( EC.presence_of_element_located((By.XPATH  ,'/html/body/div[1]/div[2]/div/div[7]/div[1]/div[3]/table/tbody/tr[4]/td[2]') ) ).text
                 shijiaoziben=  WebDriverWait( driver, 10 ).until( EC.presence_of_element_located((By.XPATH  ,'/html/body/div[1]/div[2]/div/div[7]/div[1]/div[3]/table/tbody/tr[4]/td[4]') ) ).text



                 nasuorenshibiehao= WebDriverWait( driver, 10 ).until( EC.presence_of_element_located((By.XPATH  ,'/html/body/div[1]/div[2]/div/div[7]/div[1]/div[3]/table/tbody/tr[7]/td[2]/span') ) ).text
                 nasuorenzizhi= WebDriverWait( driver, 10 ).until( EC.presence_of_element_located((By.XPATH  ,'/html/body/div[1]/div[2]/div/div[7]/div[1]/div[3]/table/tbody/tr[7]/td[4]') ) ).text

                 hezhunriqi= WebDriverWait( driver, 10 ).until( EC.presence_of_element_located((By.XPATH  ,'/html/body/div[1]/div[2]/div/div[7]/div[1]/div[3]/table/tbody/tr[8]/td[4]') ) ).text
                 dengjijiguan= WebDriverWait( driver, 10 ).until( EC.presence_of_element_located((By.XPATH  ,'/html/body/div[1]/div[2]/div/div[7]/div[1]/div[3]/table/tbody/tr[9]/td[2]') ) ).text

                 cengyongming= WebDriverWait( driver, 10 ).until( EC.presence_of_element_located((By.XPATH  ,'/html/body/div[1]/div[2]/div/div[7]/div[1]/div[3]/table/tbody/tr[10]/td[2]/p') ) ).text

                 gudongmingcheng= WebDriverWait( driver, 10 ).until( EC.presence_of_element_located((By.XPATH  ,'/html/body/div[1]/div[2]/div/div[7]/div[1]/div[4]/div[1]/table/tbody/tr/td[2]/div/div/div[2]/div[1]/div/div[1]/a') ) ).text
                 chigubili= WebDriverWait( driver, 10 ).until( EC.presence_of_element_located((By.XPATH  ,'/html/body/div[1]/div[2]/div/div[7]/div[1]/div[4]/div[1]/table/tbody/tr/td[3]/div/div/span') ) ).text
                 money=  WebDriverWait( driver, 10 ).until( EC.presence_of_element_located((By.XPATH  ,'/html/body/div[1]/div[2]/div/div[7]/div[1]/div[4]/div[1]/table/tbody/tr/td[4]/div/span') ) ).text
                 data=  WebDriverWait( driver, 10 ).until( EC.presence_of_element_located((By.XPATH  ,'/html/body/div[1]/div[2]/div/div[7]/div[1]/div[4]/div[1]/table/tbody/tr/td[5]/div/span') ) ).text
            except Exception:

                zhuceziben= '无'
                shijiaoziben= '无'

                nasuorenshibiehao= '无'
                nasuorenzizhi= '无'

                hezhunriqi= '无'
                dengjijiguan= '无'

                cengyongming= '无'

                gudongmingcheng= '无'
                chigubili= '无'
                money= '无'
                data= '无'



            url = driver.current_url

            # Print a summary of the scraped record
            print("企业名称:"+str(title)+"统一社会信用代码:"+str(code)+"法定代表人:"+str(fadindaibiao)+"经营状态:"+str(jingyingzhuangtai)+"成立日期:"+str(chenglidata)+"行政区划:"+str(area)+
                  "注册资本:"+str(zhuceziben)+"实缴资本:"+str(shijiaoziben)+
                  "企业类型:"+str(qiyeleixing)+"所属行业:"+str(suosuhangye)+"工商注册号:"+str(gongshangzhucehao)+"组织机构代码:"+str(zuzhijigodaima)+"纳税人识别号:"+str(nasuorenshibiehao)+
                  "纳税人资质:"+str(nasuorenzizhi)+"营业期限:"+str(yingyeqingxian)+"核准日期:"+str(hezhunriqi)+"登记机关:"+str(dengjijiguan)+"参保人数:"+str(canbaorenshu)+
                  "曾用名:"+str(cengyongming)+"注册地址:"+str(address)+"经营范围:"+str(jingyingfanwei)+
                  "股东:"+str(gudongmingcheng)+"持股比例:"+str(chigubili)+"认缴出资额:"+str(money)+"认缴出资日期:"+str(data))

            # Write the record into the sheet; row 1 holds the header, so data starts at row count+1
            wb.cell(row=count+1,column=1,value=title)
            wb.cell(row=count+1,column=2,value=code)
            wb.cell(row=count+1,column=3,value=fadindaibiao)
            wb.cell(row=count+1,column=4,value=jingyingzhuangtai)
            wb.cell(row=count+1,column=5,value=chenglidata)
            wb.cell(row=count+1,column=6,value=area)
            wb.cell(row=count+1,column=7,value=zhuceziben)
            wb.cell(row=count+1,column=8,value=shijiaoziben)
            wb.cell(row=count+1,column=9,value=qiyeleixing)
            wb.cell(row=count+1,column=10,value=suosuhangye)
            wb.cell(row=count+1,column=11,value=gongshangzhucehao)
            wb.cell(row=count+1,column=12,value=zuzhijigodaima)
            wb.cell(row=count+1,column=13,value=nasuorenshibiehao)
            wb.cell(row=count+1,column=14,value=nasuorenzizhi)
            wb.cell(row=count+1,column=15,value=yingyeqingxian)
            wb.cell(row=count+1,column=16,value=hezhunriqi)
            wb.cell(row=count+1,column=17,value=dengjijiguan)
            wb.cell(row=count+1,column=18,value=canbaorenshu)
            wb.cell(row=count+1,column=19,value=cengyongming)
            wb.cell(row=count+1,column=20,value=address)
            wb.cell(row=count+1,column=21,value=jingyingfanwei)
            wb.cell(row=count+1,column=22,value=gudongmingcheng)
            wb.cell(row=count+1,column=23,value=chigubili)
            wb.cell(row=count+1,column=24,value=money)
            wb.cell(row=count+1,column=25,value=data)




        except Exception:
            # Skip this entry and move on to the next one
            print(f"第 {count} 条爬取失败")
            continue
        finally:
            n2 = driver.window_handles
            if len(n2) > 1:
                driver.close()
                driver.switch_to.window(n2[0])
        # Count this record and stop once we have scraped enough
        count += 1
        if count > papers_need:
            break


    # Save progress after each page; openpyxl can only write the .xlsx format
    ws.save('施工图企业.xlsx')

    # Click through to the next results page (this is where the login wall shows up)
    WebDriverWait( driver, 10 ).until( EC.presence_of_element_located( (By.CSS_SELECTOR ,"body > div.base.page-search.has-search-tab > div.aqc-content-wrapper.has-footer > div > div.main > div.list-wrap > div.company-list > ul > li.ivu-page-next > a > i") ) ).click()
    #WebDriverWait( driver, 10 ).until( EC.presence_of_element_located( (By.CSS_SELECTOR ,"body > div.base.page-search.has-search-tab > div.aqc-content-wrapper.has-footer > div > div.main > div.list-wrap > div.company-list > div > div > div > button > span") ) ).click()
    #time.sleep(50)  # pause here to finish a manual login

# Shut down the browser and the chromedriver process
driver.quit()
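As a side note, the dozens of near-identical WebDriverWait/XPath lookups above, including the inner try/except that falls back to '无', could be collapsed into one small helper. A minimal sketch, with the helper name and the default value being my own choices:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def safe_text(driver, xpath, timeout=5, default='无'):
    # Return the element's text, or `default` if it never shows up
    try:
        element = WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.XPATH, xpath))
        )
        return element.text
    except Exception:
        return default

# Example use, replacing one of the long per-field lines:
# title = safe_text(driver, '/html/body/div[1]/div[2]/div/div[7]/div[1]/div[3]/table/tbody/tr[1]/td[2]/span')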

After the first page is scraped successfully, a login prompt appears (screenshot not included here).

I urgently need someone to solve this, ideally by adding the missing piece on top of my existing code.
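Until someone contributes a real simulated login, a stopgap I would try is to detect the login pop-up right after clicking the next-page button and pause the script so the login can be done by hand; the scraper then carries on with the now-authenticated session. The div.login-wrap selector below is only a guess at how the dialog is named and needs to be checked against the actual page:

from selenium.webdriver.common.by import By

def wait_for_manual_login_if_needed(driver):
    # Hypothetical selector for Aiqicha's login pop-up; inspect the page to confirm it
    boxes = driver.find_elements(By.CSS_SELECTOR, "div.login-wrap")
    if boxes and boxes[0].is_displayed():
        input("Login dialog detected - log in manually in the browser, then press Enter to continue...")

# Call it right after the next-page click inside the while loop:
# wait_for_manual_login_if_needed(driver)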
