python 只使用 selenium 查找企业信息

对 Python 不熟悉,本想用爬虫框架,但看到要安装一堆东西,没那么多时间;看到 selenium 好像可以直接操作浏览器,不用那么麻烦地写代码,想不到用起来还是很麻烦。做法是:电脑上先用 QQ 登录,代码自动完成网页登录,再查找企业信息。过程中会遇到 405 问题。

# -*- coding: utf-8 -*-

from os import system, times

from w3lib.html import remove_tags

from selenium import webdriver

from selenium.webdriver.common.by import By

from selenium.webdriver.common.keys import Keys

from selenium.webdriver.support.ui import WebDriverWait

from selenium.webdriver.support import expected_conditions as EC

from selenium.webdriver.common.action_chains import ActionChains

class Chacha:
    """Drive one Firefox session to log in, search, and print company details.

    Typical use: construct with the site's start URL, call ``start_login()``,
    then ``get_links()`` to fill ``self.links`` with result-page URLs, then
    ``get_text(link, n)`` for each link.

    All CSS/XPath selectors are specific to the target site's markup.
    """

    # Class-level defaults only; the real values are assigned per instance
    # in __init__.
    driver = None
    url = ''

    def __init__(self, url):
        """Start Firefox and remember the start page.

        Args:
            url: Page that ``start_login`` opens first.
        """
        self.url = url
        # Per-instance list: a class-level ``links = []`` would be shared
        # (and mutated) by every Chacha object.
        self.links = []

        optionsdata = webdriver.FirefoxOptions()
        # Enable the next line to run without a visible browser window.
        # optionsdata.add_argument('-headless')
        self.driver = webdriver.Firefox(options=optionsdata)

        # Show the handle of the initial tab.
        print('tab1: ' + self.driver.current_window_handle)

    @staticmethod
    def _pick_new_handle(before, after, n):
        """Return the window handle that is in *after* but not in *before*.

        Args:
            before: Handles captured before a new tab was opened.
            after: Handles captured after the new tab was opened.
            n: Fallback index into *after*, used when no new handle is found.

        Returns:
            The first handle of *after* missing from *before*, else
            ``after[n]``.
        """
        known = set(before)
        for handle in after:
            if handle not in known:
                return handle
        return after[n]

    def get_text(self, url, n):
        """Open *url* in a new tab and print the company fields found there.

        Prints: link text of the name element, phone, company name, the
        ``zhuche`` cell (presumably the registered address - confirm), latest
        annual-report address (may be empty) and business scope (may be
        empty). Finally prints the title of the current window.

        If the name element does not appear within 10 seconds, the new tab is
        closed and focus goes back to the first remaining window. Other
        lookup failures (e.g. a missing table cell) still propagate.

        Args:
            url: Company detail page to load.
            n: Index into ``driver.window_handles`` of the tab to use. It is
                only a fallback now; the freshly opened tab is detected
                directly so closed tabs cannot shift the index.
        """
        # Function-scope import keeps this block self-contained.
        from selenium.common.exceptions import TimeoutException

        # Open a blank tab and switch to it.
        handles_before = self.driver.window_handles
        self.driver.execute_script('window.open("","_blank");')
        target = self._pick_new_handle(
            handles_before, self.driver.window_handles, n)
        self.driver.switch_to.window(target)
        self.driver.get(url)

        try:
            # Selector used when logged in. When not logged in the name link
            # is at ".fc > .text-primary > .text-primary > a" instead.
            element = WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, ".f.ca  > .val > span > span > a"))
            )

            # Switch to the "基本信息" (basic information) section.
            nav_head = self.driver.find_element(By.CSS_SELECTOR, ".nav-head")
            # find_element(By...) matches the rest of the class and still
            # exists in Selenium 4.3+, where find_element_by_* was removed.
            nav_head.find_element(By.PARTIAL_LINK_TEXT, '基本信息').click()
            name = element.text

            # Phone.
            phone = self.driver.find_element(
                By.CSS_SELECTOR, ".val > span:nth-child(2)").text
            self.driver.find_element(By.CSS_SELECTOR, ".active > h2").click()

            # Company name.
            company_name = self.driver.find_element(
                By.CSS_SELECTOR,
                ".ntable > tr:nth-child(1) > td:nth-child(4)").text

            zhuche = remove_tags(self.driver.find_element(
                By.CSS_SELECTOR, "tr:nth-child(9) > td:nth-child(2)").text)
            if not zhuche:
                # Empty cell text: read the nested link instead.
                zhuche = self.driver.find_element(
                    By.CSS_SELECTOR,
                    "tr:nth-child(9) > td:nth-child(2)> a.text-dk").text

            add = ""
            Business_Scope = ""
            # Row 10 is either "最新年报地址" (latest annual-report address),
            # which pushes the business scope down to row 11, or it is
            # already "经营范围" (business scope).
            temp = self.driver.find_element(
                By.CSS_SELECTOR, "tr:nth-child(10) > .tb").text
            if temp == "最新年报地址":
                add = self.driver.find_element(
                    By.CSS_SELECTOR,
                    "tr:nth-child(10) > td:nth-child(2)").text
                Business_Scope = self.driver.find_element(
                    By.CSS_SELECTOR,
                    "tr:nth-child(11) > td:nth-child(2)").text
            elif temp == "经营范围":
                Business_Scope = self.driver.find_element(
                    By.CSS_SELECTOR,
                    "tr:nth-child(10) > td:nth-child(2)").text

            print(name, phone, company_name, zhuche, add, Business_Scope)
        except TimeoutException:
            # WebDriverWait raises selenium's TimeoutException, not the
            # builtin TimeoutError. Drop the useless tab, then focus a live
            # window so the driver (and the ``finally`` below) keeps working.
            self.driver.close()
            remaining = self.driver.window_handles
            if remaining:
                self.driver.switch_to.window(remaining[0])
        finally:
            print(self.driver.title)

    def start_login(self):
        """Open the start page and click through the QQ login entry.

        Steps: load ``self.url``, print the window handles, wait up to 10
        seconds for the navigation button and click it, click the QQ login
        button, enter the first iframe and click its fourth ``span``, then
        return to the top-level document.

        Raises:
            selenium.common.exceptions.TimeoutException: the navigation
                button did not become clickable in time.
        """
        # Function-scope import keeps this block self-contained.
        from selenium.common.exceptions import TimeoutException

        self.driver.get(self.url)
        for handle in self.driver.window_handles:
            print(handle)

        try:
            element = WebDriverWait(self.driver, 10).until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, "li:nth-child(10) > .navi-btn"))
            )
            element.click()
        except TimeoutException:
            # The wait can only time out; the old handler caught unrelated
            # ValueError/ArithmeticError and printed a misleading message.
            print("等待导航按钮超时,无法继续登录")
            raise
        finally:
            print(self.driver.title)

        self.driver.find_element(
            By.CSS_SELECTOR, ".text-center > .btn-qq-d").click()
        # The QQ login widget is inside the first iframe of the page.
        self.driver.switch_to.frame(0)
        self.driver.find_element(By.XPATH, "//span[4]").click()
        print(self.driver.title)
        self.driver.switch_to.default_content()

    def get_links(self, keyword="软件"):
        """Search for *keyword* and append every result URL to ``self.links``.

        Args:
            keyword: Text typed into the search box. Defaults to "软件"
                (software), the value that used to be hard-coded.

        Raises:
            selenium.common.exceptions.TimeoutException: the search box or
                the result table did not appear within 10 seconds.
        """
        try:
            search_box = WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.ID, "searchkey"))
            )
            print("搜索准备完成!")
        finally:
            print("----------")

        search_box.send_keys(keyword)
        self.driver.find_element(
            By.CSS_SELECTOR, ".index-searchbtn").click()
        self.driver.set_window_size(1550, 838)

        try:
            # Only waiting for the result table; the element is not needed.
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, ".ntable.ntable-list"))
            )
        finally:
            print(222)

        anchors = self.driver.find_elements(
            By.CSS_SELECTOR, "tr td .maininfo > a.title")
        for anchor in anchors:
            # Read the attribute once; each call is a round trip to the
            # browser.
            href = anchor.get_attribute("href")
            print(href)
            self.links.append(href)

# Script entry: log in, collect the search-result links, then visit each one
# in its own tab.
cc = Chacha("https://www.qcc.com")

try:
    cc.start_login()
    print(cc)
    cc.get_links()

    # Tab 0 is the search page, so the first detail page goes to tab index 1.
    n = 1
    while cc.links:
        link = cc.links.pop(0)
        cc.get_text(link, n)
        n += 1
finally:
    # Always shut the browser and its driver process down, even when a step
    # above raised; otherwise they are left running after the script exits.
    cc.driver.quit()

    

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值