Python操作selenium浏览器

一. 环境的配置

1. 安装selenium
pip install selenium
pip install browsermob-proxy
2. 下载浏览器驱动文件chromedriver,要和自己的Google浏览器版本一致,下载地址:http://chromedriver.storage.googleapis.com/index.html
3. 可忽略>下载代理:browsermob-proxy-2.1.4,下载地址:https://github.com/lightbody/browsermob-proxy/releases/

二. 初始化代理服务器和浏览器,注意这里的路径问题

import os
from browsermobproxy import Server
from selenium import webdriver
import time

# 初始化代理
def init_proxy():
    """Start a local BrowserMob Proxy server and return a proxy client.

    Returns:
        A browsermobproxy client object. The spawned ``Server`` is attached
        to it as ``proxy.server`` so callers can stop the Java process with
        ``proxy.server.stop()`` — the original discarded the handle, leaking
        the proxy process after the script exited.
    """
    server = Server(os.path.join(os.getcwd(), r'browsermob-proxy-2.1.4\bin\browsermob-proxy.bat'))
    server.start()
    proxy = server.create_proxy()
    # Keep a reference to the Server; without it there is no way to stop
    # the background proxy process later.
    proxy.server = server
    return proxy


# 初始化浏览器,传递代理对象
def init_browser(proxy):
    """Create a headless Chrome WebDriver routing traffic through *proxy*.

    Args:
        proxy: a browsermobproxy client; its ``proxy`` attribute supplies
            the host:port used for ``--proxy-server``.
    Returns:
        A configured ``webdriver.Chrome`` instance.
    """
    opts = webdriver.ChromeOptions()
    for flag in (
        '--headless',                           # run without a visible window
        'ignore-certificate-errors',            # skip TLS certificate validation
        '--start-maximized',                    # maximize on startup
        r'--user-data-dir=D:\ChromeUserData',   # reuse profile so logins persist
        '--proxy-server={0}'.format(proxy.proxy),  # route requests via BMP
    ):
        opts.add_argument(flag)
    return webdriver.Chrome(options=opts)


if __name__ == '__main__':
    server_proxy = init_proxy()
    browser = init_browser(server_proxy)
    # Start recording a HAR named "test", capturing bodies and headers.
    server_proxy.new_har("test", options={
        'captureContent': True,
        'captureHeaders': True
    })
    browser.get("https://www.baidu.com")
    time.sleep(2)  # crude wait for async/AJAX requests to land in the HAR
    catch_result = server_proxy.har
    for entry in catch_result['log']['entries']:
        print(entry['response']['content'])

    # Clean up. quit() (not close()) terminates the whole chromedriver
    # process; close() only closes the current window and leaves the
    # driver process running.
    server_proxy.close()
    browser.quit()
    # Stop the BrowserMob server process when the proxy exposes it
    # (the original never stopped it, leaking a Java process).
    server = getattr(server_proxy, 'server', None)
    if server is not None:
        server.stop()
下面是封装使用
# _*_ coding:utf-8 _*_
import os
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from browsermobproxy import Server
import time
import json


class BaseFramework(object):
    """Template-method base class: drive a page through BrowserMob Proxy
    and dispatch every captured request/response pair to subclass hooks.
    """

    def __init__(self):
        # Path to the BrowserMob Proxy launcher — adjust to your install.
        # NOTE: the launcher shipped in the 2.1.4 release is named
        # ``browsermob-proxy.bat``; the original string said
        # ``browser-mob-proxy.bat``, which does not exist and made
        # Server() fail at startup.
        self.server = Server(r'F:\BrowserMobProxy\browsermob-proxy-2.1.4\bin\browsermob-proxy.bat')
        self.server.start()
        self.proxy = self.server.create_proxy()
        chrome_options = Options()
        chrome_options.add_argument('--ignore-certificate-errors')
        chrome_options.add_argument('--proxy-server={0}'.format(self.proxy.proxy))
        chrome_options.add_argument('--headless')  # headless mode
        self.browser = webdriver.Chrome(options=chrome_options)

    def process_request(self, request, response):
        """Hook: called once per captured HAR entry (request-first order)."""
        pass

    def process_response(self, response, request):
        """Hook: called once per captured HAR entry (response-first order)."""
        pass

    def run(self, func, *args):
        """Record a HAR while ``func(*args)`` drives the browser, then feed
        every captured entry to the ``process_*`` hooks.
        """
        self.proxy.new_har(options={
            'captureContent': True,   # include response bodies
            'captureHeaders': True    # include request/response headers
        })
        func(*args)
        result = self.proxy.har
        for entry in result['log']['entries']:
            request = entry['request']
            response = entry['response']
            self.process_request(request, response)
            self.process_response(response, request)

    def __del__(self):
        # Best-effort cleanup: __del__ may run during interpreter shutdown,
        # so never let it raise.
        try:
            self.proxy.close()
            # quit() ends the chromedriver process; close() only closes the
            # current window.
            self.browser.quit()
            # The original never stopped the Server, leaking the Java
            # proxy process after every run.
            self.server.stop()
        except Exception:
            pass


class Framework(BaseFramework):
    """Concrete crawler: load a page and persist ``timeMap`` payloads
    from the ``/item/timemap/cn/`` endpoint to per-person JSON files.
    """

    def load(self, url):
        """Navigate to *url* and wait for async requests to land in the HAR."""
        self.browser.get(url)
        time.sleep(3)

    def process_request(self, request, response):
        # Requests are not needed by this crawler.
        pass

    def process_response(self, response, request):
        """Save the ``timeMap`` of any timemap-endpoint response as JSON.

        NOTE(review): reads the module-level global ``name_id`` set by the
        ``__main__`` loop — confirm before reusing this class elsewhere.
        """
        # Only responses from the timemap endpoint carry the data we want.
        if '/item/timemap/cn/' not in request['url']:
            return
        try:
            text = response['content']['text']
            text_dict = json.loads(text)
            data_result = text_dict['data']
            # The original looked up 'name' outside the try, so a payload
            # without it crashed the whole run loop with KeyError.
            name = data_result['name']  # person's name
        except KeyError:
            print('----KeyError: text----')
            return
        id_name = name_id + '_' + name
        print(id_name)
        time_map_list = data_result.get('timeMap')
        if not time_map_list:
            return
        # Index each timeline entry by its position, using string keys.
        time_map_dict = {str(i): item for i, item in enumerate(time_map_list)}
        path = f'./****/{id_name}.json'
        if os.path.exists(path):
            print(f'------{id_name}--已存在------')
            return
        with open(path, 'w', encoding='utf-8') as f:
            f.write(json.dumps(time_map_dict, ensure_ascii=False, indent=4))


if __name__ == '__main__':
    # Use a distinct variable name: the original did ``Framework = Framework()``,
    # rebinding the class name to an instance and shadowing the class.
    framework = Framework()
    id_list = ['********']   # fill in the real target ids
    for name_id in id_list:
        # ``name_id`` is read as a global by Framework.process_response.
        url = "************************"   # fill in the real URL template
        framework.run(framework.load, url)
解释解释:
代码一共分了四步:

•第一步便是启动 BrowserMob Proxy,它会在本地启动一个代理服务,这里注意 Server 的第一个参数需要指定 BrowserMob Proxy 的可执行文件路径,这里我就指定了下载下来的 BrowserMob Proxy 的 bin 目录的 browsermob-proxy 的路径。
•第二步便是启动 Selenium 了,它可以设置 Proxy Server 为 BrowserMob Proxy 的地址。
•第三步便是访问页面同时监听结果,这里我们需要调用 new_har 方法,同时指定捕获 Response Body 和 Headers 信息,紧接着调用 Selenium 的 get 方法访问一个页面,这时候浏览器便会加载这个页面,同时所有的请求和响应信息都会被记录到 HAR 中。
•第四步便是读取 HAR 的内容了,我们调用 log 的 entries 字段,里面便包含了请求和响应的具体结果,这样所有的请求和响应信息我们便能获取到了,Ajax 的内容也不在话下。
举个栗子1
from selenium import webdriver
from time import sleep
from selenium.webdriver.common.keys import Keys
import datetime
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
from selenium.webdriver.chrome.options import Options
def qinjia(browser,url):
    """Log in to the campus portal and submit a daily leave/health form.

    The flow is strictly sequential: open the login page, authenticate,
    follow the second table row's link to the form, fill the "personal
    health" dropdowns, fill the "application" fields, and submit.

    Args:
        browser: a selenium WebDriver instance (e.g. headless Chrome).
        url: the login page URL.

    Raises:
        selenium TimeoutException / NoSuchElementException when the page
        structure changes or elements fail to load in time.
    """
    browser.get(url)
    sleep(1)
    browser.implicitly_wait(3)
    # Wait up to 5s for the login form container to appear.
    WebDriverWait(browser,5).until(EC.presence_of_all_elements_located((By.ID,"user_main")))
    user_main_div=browser.find_element_by_id("user_main")
    username_input=user_main_div.find_element_by_id("txtId")   # username field
    password_input=user_main_div.find_element_by_id("txtMM")    # password field
    login_btn=user_main_div.find_element_by_id("IbtnEnter")   # login button

    # Edit point 1: put the real account and password here.
    username_input.send_keys("==================账号===================")
    password_input.send_keys("==================密码===================")
    login_btn.click()

    sleep(1)
    browser.implicitly_wait(3)
    # Wait for the post-login table, then follow the link in its
    # third tbody / second row to reach the form page.
    WebDriverWait(browser, 5).until(EC.presence_of_all_elements_located((By.CLASS_NAME, "tabThinM")))
    table_tag=browser.find_element_by_class_name("tabThinM")
    href_body=table_tag.find_elements_by_tag_name("tbody")[2]
    href=href_body.find_elements_by_tag_name("tr")[1].find_element_by_tag_name("a").get_attribute("href")
    browser.get(href)

    table_wjTA=browser.find_element_by_id("wjTA")

    div_gerenjiankang=table_wjTA.find_elements_by_class_name("dvO")[0]  # "personal health" section
    div_shenqing=table_wjTA.find_elements_by_class_name("dvO")[1]       # "application to enter" section

    # Personal health: select value "1" in each status dropdown
    # (dropdowns 2-5 of the section; 0 and 1 are presumably unrelated —
    # TODO confirm against the live page).
    selects_tag=div_gerenjiankang.find_elements_by_tag_name("select")
    work_station_select=selects_tag[2]
    health_station_select=selects_tag[3]
    live_station_select=selects_tag[4]
    family_station_select=selects_tag[5]

    Select(work_station_select).select_by_value("1")
    Select(health_station_select).select_by_value("1")
    Select(live_station_select).select_by_value("1")
    Select(family_station_select).select_by_value("1")


    # Application section: destination/reason inputs plus time dropdowns.
    select_shenqin_time_tags=div_shenqing.find_elements_by_tag_name("select")

    input_shenqin_reaseons_tags=div_shenqing.find_elements_by_tag_name("input")
    target_place_input=input_shenqin_reaseons_tags[0]
    reason_input=input_shenqin_reaseons_tags[1]
    # Edit point 2: put the real destination and reason here.
    target_place_input.send_keys("=====================申请目的========================")
    reason_input.send_keys("==========================事由===============================")

    Select(select_shenqin_time_tags[0]).select_by_value("1")
    Select(select_shenqin_time_tags[1]).select_by_value("06")
    Select(select_shenqin_time_tags[2]).select_by_value("3")
    Select(select_shenqin_time_tags[3]).select_by_value("23")

    # NOTE(review): this grabs the FIRST <input> on the whole page, which
    # may not be the submit button — confirm against the live page.
    submit_input=browser.find_element_by_tag_name("input")
    submit_input.click()


def log(message):
    """Append a timestamped *message* to ``log.txt`` and echo the timestamp.

    Args:
        message: text to record after the timestamp.
    """
    current_time = datetime.datetime.now()
    print(current_time)
    # Context manager guarantees the handle is closed even if the
    # write raises (the original opened/closed manually).
    with open("log.txt", "a+", encoding="utf-8") as f:
        f.write(str(current_time) + ":  " + message + "\n")
def headLessChrome():
    """Build and return a headless Chrome WebDriver.

    Expects ``chromedriver.exe`` to sit next to the script; GPU is
    disabled since it serves no purpose in headless mode.
    """
    driver_path = r"chromedriver.exe"
    options = Options()
    for flag in ("--headless", "--disable-gpu"):
        options.add_argument(flag)
    return webdriver.Chrome(options=options, executable_path=driver_path)
if __name__ == '__main__':
    url="http://login.cuit.edu.cn/Login/xLogin/Login.asp"
    browser=headLessChrome()
    try:
        qinjia(browser,url)
        log("成功")
    except Exception:
        # Narrowed from a bare ``except:`` so KeyboardInterrupt/SystemExit
        # still propagate; any page/element failure is logged as a failure.
        log("失败")
    finally:
        # Always release the browser, even if logging itself fails.
        browser.quit()
举个栗子2
from bs4 import BeautifulSoup
import requests
import time
import pandas as pd
from selenium import webdriver
import csv
import re
import numpy as np
import os

# Browser-like User-Agent for any plain `requests` calls made after login.
afterLogin_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36'}
# driver = webdriver.PhantomJS(executable_path=r'D:\code\patent_info\phantomjs-2.1.1-windows\bin\phantomjs.exe', service_args=['--ignore-ssl-errors=true', '--ssl-protocol=TLSv1'])
chrome_driver = r'D:\code\patent_info\chromedriver.exe'
chrome_options = webdriver.ChromeOptions()
# Reuse a persistent Chrome profile so the qcc.com login survives restarts.
chrome_options.add_argument(r'--user-data-dir=D:\code\patent_info\ChromeUserqichacha0623')
# Module-level driver shared by every function below (created at import time).
driver = webdriver.Chrome(executable_path = chrome_driver, options=chrome_options)
# driver.maximize_window() 
# # driver.add_argument('--headless')  # headless mode (note: arguments belong on chrome_options, not the driver)

def get_company_message(company):
    """Search qcc.com for *company* and return its assets-page URL.

    Args:
        company: company name (Chinese string).
    Returns:
        URL of the company's patent/assets page — the first search hit's
        profile link with 'firm' replaced by 'cassets'.
    """
    search_url = 'https://www.qcc.com/search?key={}'.format(company)
    driver.get(search_url)
    time.sleep(5)
    print(search_url)
    soup = BeautifulSoup(driver.page_source, features="lxml")
    first_hit = soup.find_all('a', {'class': 'title'})[0].get('href')
    print('href is {}'.format(first_hit))
    assets_url = first_hit.replace('firm', 'cassets')
    print('href2 is {}'.format(assets_url))
    return assets_url

def iselementExist(element):
    """Return True if an element matching *element* (an XPath) exists.

    Probes the module-level ``driver``; absence is detected via the
    lookup raising. Example XPath: //*[@id="zhuanlilist"]/div[1]/h3

    Args:
        element: XPath string to look up.
    Returns:
        bool: True when the element is found, False otherwise.
    """
    try:
        driver.find_element_by_xpath(element)
        return True
    except Exception:
        # Narrowed from a bare ``except:`` so SystemExit/KeyboardInterrupt
        # are not swallowed while probing; flag variable dropped.
        return False
def get_patent_infomation(number_page):
    """Extract the patent-table cells from one page of HTML.

    Args:
        number_page: raw HTML of a qcc.com assets page (from
            ``driver.page_source``).
    Returns:
        List of ``td`` elements from the patent table
        (``#zhuanlilist .app-ntable``).
    """
    soup = BeautifulSoup(number_page, features="lxml")
    return soup.select('#zhuanlilist .app-ntable td')

def save_patient(data_infos, key_company):
    """Flatten scraped table cells into patent rows and append them to CSV.

    Args:
        data_infos: iterable of table-cell elements exposing ``.text``;
            every 10 consecutive cells form one patent record.
        key_company: company name used to tag every row.
    Returns:
        None. Appends to the output CSV, writing the header only when the
        file does not yet exist.
    """
    cells = [info.text for info in data_infos]
    # Group the flat cell list into rows of 10 columns.
    rows = np.array([cells[i:i + 10] for i in range(0, len(cells), 10)])
    dataframe = pd.DataFrame({
        'company': key_company,
        'Number': rows[:, 0],
        'Patent_name': rows[:, 1],
        'Patent_type': rows[:, 2],
        'Patent_statu': rows[:, 3],
        'Application number': rows[:, 4],
        'Apply_data': rows[:, 5],
        'Public_announcement_No': rows[:, 6],
        'Public_announcement_Data': rows[:, 7],
        'Inventor': rows[:, 8],
        'More': rows[:, 9],
    })
    # Raw string: the original non-raw literal only worked because Python
    # keeps unknown escapes (\c, \d, ...) verbatim — same value, no
    # DeprecationWarning, and the path is no longer repeated three times.
    out_path = r'D:\code\patent_info\data\output_data\company_patient1.csv'
    # Write the header only on first creation; always append.
    write_header = not os.path.exists(out_path)
    dataframe.to_csv(out_path, index=False, sep=',', mode='a',
                     encoding='gb18030', header=write_header)

def whether_turn_page(element1):
    """Return True if the pagination bar (XPath *element1*) exists.

    Same probe as ``iselementExist``; kept as a separate name for call
    sites that specifically check pagination.
    Example XPath: //*[@id="zhuanlilist"]/div[4]/nav/ul

    Args:
        element1: XPath string of the pagination <ul>.
    Returns:
        bool: True when the pagination bar is found, False otherwise.
    """
    try:
        driver.find_element_by_xpath(element1)
        return True
    except Exception:
        # Narrowed from a bare ``except:``; flag variable dropped.
        return False

def turn_next_page(key_company):  
    '''
    Page through the patent table, scraping and saving every page.

    The last pagination <li> tells us which layout we are in:
    - it is '>' for a small number of pages: the last <li> IS the
      next-page arrow and the second-to-last <li> holds the page count;
    - otherwise it contains digits (many pages): the second-to-last <li>
      is then the next-page arrow.

    Args:
        key_company: company name, forwarded to save_patient.
    '''
    print(driver.find_element_by_xpath('//*[@id="zhuanlilist"]/div[4]/nav/ul/li[last()]').text)
    list_max = driver.find_element_by_xpath('//*[@id="zhuanlilist"]/div[4]/nav/ul/li[last()]').text
    # num_max = int(re.sub("\D", "", list_max)) # just keep int number
    if list_max == '>': # patent count between (10,70]: '>' is the last <li>
        # Second-to-last <li> holds the total page count.
        list_second_max = driver.find_element_by_xpath('//*[@id="zhuanlilist"]/div[4]/nav/ul/li[last()-1]/a').text
        for turn_index in range(int(list_second_max) - 1):
            # Click the next-page arrow, wait for the page, scrape, save.
            driver.find_element_by_xpath('//*[@id="zhuanlilist"]/div[4]/nav/ul/li[last()]').click()
            time.sleep(1)
            number_page = driver.page_source
            data_infos = get_patent_infomation(number_page)
            save_patient(data_infos, key_company)
    # NOTE(review): if list_max contains no digits, re.sub strips it to ''
    # and int('') raises ValueError here, so the else branch may be
    # unreachable in practice — confirm against live pages.
    elif int(re.sub("\D", "", list_max)):
    # elif isinstance(num_max, int):# patent count above 70
        for turn_index in range(int(re.sub("\D", "", list_max)) - 1):
            # Here the next-page arrow is the second-to-last <li>.
            driver.find_element_by_xpath('//*[@id="zhuanlilist"]/div[4]/nav/ul/li[last()-1]/a').click()
            time.sleep(1)
            number_page = driver.page_source
            data_infos = get_patent_infomation(number_page)
            save_patient(data_infos, key_company)
    else:
        print('error company name is {}'.format(key_company))

    #next_page_button //*[@id="zhuanlilist"]/div[4]/nav/ul/li[8]/a 
if __name__ == '__main__':
    csv_file = r"D:\code\patent_info\patient1.csv"
    try:
        with open(csv_file, encoding='utf-8') as csvfile:
            reader = csv.reader(csvfile)
            for i, key_company in enumerate(reader):
                print('i is {}'.format(i))
                print('rows is {}'.format(key_company))
                # A CSV row may hold several cells; join them into one name.
                key_company = ' '.join(key_company)
                patent_url = get_company_message(key_company)  # patent url
                driver.get(patent_url)
                time.sleep(1)
                # Skip companies whose page has no patent section at all.
                if iselementExist('//*[@id="zhuanlilist"]/div[1]/h3'):
                    number_page = driver.page_source
                    data_infos = get_patent_infomation(number_page)
                    save_patient(data_infos, key_company)
                    if whether_turn_page('//*[@id="zhuanlilist"]/div[4]/nav/ul'):
                        turn_next_page(key_company)
    finally:
        # The original never shut the driver down, leaving a chromedriver
        # process behind after every run (even on error).
        driver.quit()


  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值