selenium 爬虫

1.事例:启信宝
2.浏览器:火狐,谷歌,phantomjs均可以使用
3.该事例中对selenium的方法进行了封装,读者可以通过以下命令安装该封装库:

pip install SpiderTool==19.1.1

该模块对selenium的方法进行了更细的封装,方便快速开发

4.代码样例:

#!/usr/bin/env python
# _*_ coding:utf-8 _*_

"""
File:   .py
Author: Lijiacai ()
Date: 2018-12-29
Description:
"""

import os
import re
import sys
import random
from SpiderTool import Request
from SpiderTool import Browser
from loggingtool import loggingtool
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from lxml import etree

# Logger instance built by the `loggingtool` package (source viewable after
# `pip install loggingtool`).  NOTE: this name shadows the stdlib `logging`
# module for the rest of this file — `logging.info(...)` below calls the
# loggingtool logger, not the standard library.
logging = loggingtool.init_log("qxb", "console", level="NOTSET")

# Make this script's own directory importable (for sibling modules).
cur_dir = os.path.split(os.path.realpath(__file__))[0]
sys.path.append("%s/" % cur_dir)


def proxy_deal(proxies):
    """
    Pick one proxy at random from the supplied pool.

    If there are other proxy sources (agents), change this function.

    :param proxies: list of proxy addresses (e.g. ["12.23.88.23:2345", ...]);
                    None, an empty list, or any non-list value means
                    "no proxy — use the local IP".
    :return: one randomly chosen proxy string, or None for the local IP.
    """
    one_proxy = None
    # Only a non-empty list is treated as a usable pool; everything else
    # (None, empty list, wrong type) falls back to the machine's own IP.
    if proxies and isinstance(proxies, list):
        one_proxy = random.choice(proxies)
    if one_proxy is None:
        logging.info("self ip")
    return one_proxy


class MyRequest(Request.Request):
    """Request subclass whose proxy() delegates to the shared proxy pool."""

    def proxy(self):
        """
        Pick a proxy via the module-level proxy_deal() helper.

        If there are other proxy sources, change the function here.
        :return: a proxy string such as "12.23.88.23:2345", or None
        """
        return proxy_deal(self.proxies)


class MyBrowser(Browser.Browser):
    """Browser subclass whose proxy() delegates to the shared proxy pool."""

    def proxy(self):
        """
        Pick a proxy via the module-level proxy_deal() helper.

        If there are other proxy sources, change the function here.
        :return: a proxy string such as "12.23.88.23:2345", or None
        """
        return proxy_deal(self.proxies)


class Qxb(object):
    """Scraper for the qixin.com (Qixinbao) mobile site."""

    def __init__(self, proxies=None):
        """
        :param proxies: optional list of proxy addresses, handed to MyBrowser
                        (see proxy_deal for the accepted shapes).
        """
        self.proxies = proxies

    def search_page(self, keyword):
        """
        Search-result page: collect company detail links for a keyword.

        :param keyword: any search term, e.g. u"百度"
        :return: list of dicts with keys url / company_id / company_name /
                 company_status / history_names; empty list on any failure.
        """
        result = []
        try:
            url = "https://m.qixin.com/"
            browser = MyBrowser(proxies=self.proxies, headless=False, timeout=20,
                                executable_path=None,
                                browser_type="Firefox")

            browser.get(url=url)
            # Type the keyword into the search box and submit with ENTER.
            input_k = browser.find_element(value=u"//input[@placeholder='请输入企业名,人名,品牌名等']",
                                           by=By.XPATH)
            browser.send_keys(input_k, keyword)
            browser.keys(input_k, keyboard=Keys.ENTER)
            try:
                # A "btn-primary" button shows up when the site interposes a
                # validation page; click through every such button if present.
                browser.wait_for_element_loaded("btn-primary", elem_type=By.CLASS_NAME,
                                                wait_time=3)
                button_k = browser.find_elements(value=u"btn-primary", by=By.CLASS_NAME)
                for button in button_k:
                    browser.click_elem(button)
            except Exception:
                # No validation page — the normal case; just log and move on.
                logging.exception(u"No validate")
            browser.implicitly_wait(3)
            html = browser.page_source()
            page = etree.HTML(html)
            # Company detail links look like "/company/<uuid>".
            for href in page.xpath("//a/@href"):
                if href.startswith("/company/"):
                    result.append({
                        "url": "https://m.qixin.com%s" % href,
                        "company_id": href,
                        "company_name": "",
                        "company_status": "",
                        "history_names": []
                    })
        except Exception as e:
            logging.exception(str(e))
        # NOTE(review): the MyBrowser instance is never explicitly closed —
        # confirm the wrapper quits the driver itself, otherwise this leaks
        # a browser process per call.
        return result

    def result_page(self, url, company_id=None, **kwargs):
        """
        Fetch the detailed business-registration page for one company.

        :param url: company page url as returned by search_page()
        :param company_id: unused here; kept for interface compatibility
        :return: raw page source of the company's /info/ page
        """
        url = url + "/info/"
        browser = MyBrowser(proxies=self.proxies, headless=False, timeout=20,
                            executable_path=None,
                            browser_type="Firefox")

        # "view-source:" fetches the raw HTML instead of the rendered SPA.
        browser.get(url="view-source:" + url)
        try:
            browser.wait_for_element_loaded("btn-primary", elem_type=By.CLASS_NAME,
                                            wait_time=3)
            button_k = browser.find_elements(value=u"btn-primary", by=By.CLASS_NAME)
            for button in button_k:
                browser.click_elem(button)
        except Exception:
            logging.exception(u"No validate")
        # Was a bare py2 `print` debug statement; route through the logger.
        logging.info("current url: %s" % browser.browser.current_url)
        browser.implicitly_wait(3)
        html = browser.page_source()
        # NOTE(review): browser is not released here either — see search_page.
        return html


if __name__ == '__main__':
    # Manual smoke test; needs a local Firefox/geckodriver to actually run.
    # print() function form works on both Python 2 and 3 (the bare py2
    # `print` statement would be a SyntaxError on py3).
    qxb = Qxb()
    # qxb.search_page("baidu")
    print(qxb.result_page(url="https://m.qixin.com/company/5e5641da-211e-40ed-9629-b421f4cf1416"))

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值