selenium与PhantomJSDriver整合 加速 网站爬取

在使用 PhantomJSDriver 的时候,由于每次都要重新 start client,每次爬取数据都非常慢,耗时在 30 s 左右。为此我研究了源码并自行做了改造,其中的关键代码已在下面贴出。


源码: http://git.oschina.net/wds/contact


package org.openqa.selenium.phantomjs;

import java.io.IOException;
import java.lang.reflect.Field;
import java.util.Map;

import org.openqa.selenium.Capabilities;
import org.openqa.selenium.Platform;
import org.openqa.selenium.remote.CapabilityType;
import org.openqa.selenium.remote.DesiredCapabilities;
import org.openqa.selenium.remote.DriverCommand;
import org.openqa.selenium.remote.MyHttpCommandExecutor;
import org.openqa.selenium.remote.RemoteWebDriver;
import org.openqa.selenium.remote.Response;

import com.google.common.collect.ImmutableMap;

/**
 * A {@link PhantomJSDriver} that can attach to an already existing WebDriver
 * session instead of always creating a new one.
 * <p>
 * When a non-empty session id is supplied, the driver adopts that id and only
 * asks the server for the session's capabilities. When the id is {@code null}
 * or empty, a regular new session is started through the superclass.
 */
public class MyPhantomJSDriver extends PhantomJSDriver {
	/** Id of the session to attach to; {@code null} or empty means "start a new session". */
	private final String mySessionId;

	// These two fields are filled in by startSession(Capabilities, Capabilities),
	// presumably while the superclass constructor is still running (stock
	// RemoteWebDriver calls it from its constructor - TODO confirm for the patched
	// PhantomJSDriver(int)). They must therefore NOT get a field initializer:
	// Java runs field initializers after super(...) returns, which would wipe
	// the captured values.
	private Capabilities desiredCapabilities;
	private Capabilities requiredCapabilities;

	/** Port passed to the superclass constructor; kept for reference only. */
	private final int port;

	/**
	 * Creates a driver on the given port and either attaches to
	 * {@code mySessionId} or starts a new session.
	 *
	 * @param mySessionId
	 *            id of an existing session to reuse, or {@code null}/empty to
	 *            create a new session
	 * @param port
	 *            port forwarded to the superclass constructor
	 * @throws RuntimeException
	 *             whatever {@link #startSession()} threw, after a best-effort
	 *             {@code quit()}
	 */
	public MyPhantomJSDriver(String mySessionId, int port) {
		super(port);
		this.mySessionId = mySessionId;
		this.port = port;
		try {
			startSession();
		} catch (RuntimeException e) {
			// NOTE(review): quit() is also issued when attaching to an existing
			// session failed - confirm that ending that session is intended.
			try {
				quit();
			} catch (Exception ignored) {
				// Best-effort cleanup only: the start-up failure rethrown below
				// is the error the caller needs to see.
			}
			throw e;
		}
	}

	/**
	 * Attaches to the configured session id, or starts a brand-new session when
	 * no id was configured.
	 * <p>
	 * Attaching means: adopt the id, issue {@code GET_CAPABILITIES}, merge the
	 * returned values into this driver's capabilities and store them.
	 *
	 * @throws IllegalStateException
	 *             if the server's reply does not carry a capabilities map
	 */
	protected void startSession() {
		if (mySessionId == null || mySessionId.isEmpty()) {
			super.startSession(desiredCapabilities, requiredCapabilities);
			return;
		}

		super.setSessionId(mySessionId);
		Map<String, Object> rawCapabilities = fetchRawCapabilities();

		DesiredCapabilities returnedCapabilities = (DesiredCapabilities) super.getCapabilities();
		if (returnedCapabilities == null) {
			returnedCapabilities = new DesiredCapabilities();
		}
		for (Map.Entry<String, Object> entry : rawCapabilities.entrySet()) {
			// The platform needs parsing and is applied separately below.
			if (CapabilityType.PLATFORM.equals(entry.getKey())) {
				continue;
			}
			returnedCapabilities.setCapability(entry.getKey(), entry.getValue());
		}
		String platformString = (String) rawCapabilities.get(CapabilityType.PLATFORM);
		returnedCapabilities.setPlatform(parsePlatform(platformString));

		storeCapabilities(returnedCapabilities);
	}

	/**
	 * Records the requested capabilities without contacting the server; the
	 * actual session handling is deferred to {@link #startSession()}.
	 *
	 * @param desiredCapabilities
	 *            capabilities to request when a new session is started
	 * @param requiredCapabilities
	 *            required capabilities, may be {@code null}
	 */
	@Override
	protected void startSession(Capabilities desiredCapabilities, Capabilities requiredCapabilities) {
		this.desiredCapabilities = desiredCapabilities;
		this.requiredCapabilities = requiredCapabilities;
	}

	/** Sends {@code GET_CAPABILITIES} for the adopted session and returns the reply as a map. */
	private Map<String, Object> fetchRawCapabilities() {
		ImmutableMap.Builder<String, Capabilities> paramBuilder = new ImmutableMap.Builder<String, Capabilities>();
		// ImmutableMap rejects null values, so absent capabilities are simply left out.
		if (desiredCapabilities != null) {
			paramBuilder.put("desiredCapabilities", desiredCapabilities);
		}
		if (requiredCapabilities != null) {
			paramBuilder.put("requiredCapabilities", requiredCapabilities);
		}
		Map<String, ?> parameters = paramBuilder.build();

		Response response = execute(DriverCommand.GET_CAPABILITIES, parameters);
		Object value = response.getValue();
		if (!(value instanceof Map)) {
			throw new IllegalStateException(
					"Session " + mySessionId + " did not return a capabilities map, got: " + value);
		}
		// Checked with instanceof above; the key/value types follow the original cast.
		@SuppressWarnings("unchecked")
		Map<String, Object> rawCapabilities = (Map<String, Object>) value;
		return rawCapabilities;
	}

	/** Maps the server's platform string to a {@link Platform}, defaulting to {@code ANY} when absent. */
	private static Platform parsePlatform(String platformString) {
		if (platformString == null || platformString.isEmpty()) {
			return Platform.ANY;
		}
		try {
			return Platform.valueOf(platformString);
		} catch (IllegalArgumentException e) {
			// Not an enum constant name: fall back to parsing it like an
			// os.name system property value.
			return Platform.extractFromSysProperty(platformString);
		}
	}

	/** Writes the capabilities into the {@code capabilities} field declared by {@link RemoteWebDriver}. */
	private void storeCapabilities(DesiredCapabilities capabilities) {
		// The field is set reflectively because this class has no direct
		// access to it.
		try {
			Field f = RemoteWebDriver.class.getDeclaredField("capabilities");
			f.setAccessible(true);
			f.set(this, capabilities);
		} catch (Exception e) {
			// Kept non-fatal as before: the driver stays usable with whatever
			// capabilities object the superclass already holds.
			e.printStackTrace();
		}
	}
}
/*
This file is part of the GhostDriver by Ivan De Marino <http://ivandemarino.me>.

Copyright (c) 2012-2014, Ivan De Marino <http://ivandemarino.me>
All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright notice,
      this list of conditions and the following disclaimer in the documentation
      and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

package org.openqa.selenium.phantomjs;

import java.io.IOException;
import java.net.ConnectException;
import java.net.InetSocketAddress;
import java.net.Socket;
import java.net.SocketAddress;

import org.openqa.selenium.WebDriverException;
import org.openqa.selenium.remote.Command;
import org.openqa.selenium.remote.DriverCommand;
import org.openqa.selenium.remote.MyHttpCommandExecutor;
import org.openqa.selenium.remote.Response;

import com.google.common.base.Throwables;

/**
 * A specialized {@link org.openqa.selenium.remote.MyHttpCommandExecutor} that
 * sends commands to a {@link PhantomJSDriverService} and reuses a server that
 * is already listening on the service port.
 * <p>
 * On a new-session request the service port is probed first; the service is
 * started only when nothing accepts the connection. The service is stopped
 * after each quit command.
 * <p>
 * NOTE: Yes, the design of this class is heavily inspired by
 * {@link org.openqa.selenium.chrome.ChromeCommandExecutor}.
 *
 * @author Ivan De Marino <http://ivandemarino.me>
 */
class PhantomJSCommandExecutor extends MyHttpCommandExecutor {

	/** Host used for the "is a server already running?" probe. */
	private static final String PROBE_HOST = "127.0.0.1";

	/** Connect timeout of the probe, in milliseconds. */
	private static final int PROBE_TIMEOUT_MS = 3000;

	private final PhantomJSDriverService service;

	/**
	 * Creates a new PhantomJSCommandExecutor. The PhantomJSCommandExecutor will
	 * communicate with the PhantomJS/GhostDriver through the given
	 * {@code service}.
	 *
	 * @param service
	 *            The PhantomJSDriverService to send commands to.
	 */
	PhantomJSCommandExecutor(PhantomJSDriverService service) {
		super(PhantomJSDriver.getCustomCommands(), service.getUrl());
		this.service = service;
	}

	/**
	 * Sends the {@code command} to the PhantomJS/GhostDriver server for
	 * execution. When a new session is requested and no server is listening on
	 * the service port yet, the service is started first. When a session is
	 * terminated, the service is stopped once the command has completed.
	 *
	 * @param command
	 *            The command to execute.
	 * @return The command response.
	 * @throws WebDriverException
	 *             If the service cannot be started, if the server died
	 *             unexpectedly, or if sending the command fails with a checked
	 *             exception.
	 */
	@Override
	public Response execute(Command command) {
		// Only a new-session request may need to launch the server, so the
		// (up to PROBE_TIMEOUT_MS long) probe is skipped for every other command.
		if (DriverCommand.NEW_SESSION.equals(command.getName()) && !isServerListening()) {
			try {
				service.start();
			} catch (IOException e) {
				throw new WebDriverException("Unable to start the PhantomJS/GhostDriver server", e);
			}
		}

		try {
			return super.execute(command);
		} catch (Throwable t) {
			Throwable rootCause = Throwables.getRootCause(t);
			if (isConnectionRefused(rootCause) && !service.isRunning()) {
				throw new WebDriverException("The PhantomJS/GhostDriver server has unexpectedly died!", t);
			}
			// Rethrow unchecked exceptions and errors unchanged; wrap the rest.
			Throwables.propagateIfPossible(t);
			throw new WebDriverException(t);
		} finally {
			if (DriverCommand.QUIT.equals(command.getName())) {
				service.stop();
			}
		}
	}

	/** Returns whether a TCP connection to the service port can be opened within the probe timeout. */
	private boolean isServerListening() {
		Socket socket = new Socket();
		try {
			SocketAddress remoteAddr = new InetSocketAddress(PROBE_HOST, service.getPort());
			socket.connect(remoteAddr, PROBE_TIMEOUT_MS);
			return true;
		} catch (IOException e) {
			// Expected when no server is running yet; not worth a stack trace.
			return false;
		} finally {
			try {
				socket.close();
			} catch (IOException ignored) {
				// Nothing useful can be done if closing the probe socket fails.
			}
		}
	}

	/** Returns whether {@code rootCause} is a refused TCP connection. */
	private static boolean isConnectionRefused(Throwable rootCause) {
		if (!(rootCause instanceof ConnectException)) {
			return false;
		}
		// Prefix match: some platforms append detail, e.g. "Connection refused: connect".
		String message = rootCause.getMessage();
		return message != null && message.startsWith("Connection refused");
	}
}



好的,以下是使用selenium模拟浏览器爬取京东商品评价的代码:

```python
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from time import sleep

# 创建一个浏览器对象
browser = webdriver.Chrome()

# 访问京东首页
browser.get('https://www.jd.com/')

# 查找搜索框并输入要搜索的商品名称
search_box = browser.find_element_by_id('key')
search_box.send_keys('手机')
search_box.send_keys(Keys.ENTER)

# 等待搜索结果页面加载完成
sleep(3)

# 找到商品列表中第一个商品的链接并点击进入商品详情页
product_link = browser.find_element_by_css_selector('.gl-item:nth-child(1) .p-name a')
product_link.click()

# 切换到新打开的标签页
browser.switch_to.window(browser.window_handles[-1])

# 进入商品评价页面
browser.find_element_by_css_selector('#detail .comment-count').click()

# 模拟滚动加载评价数据
while True:
    browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
    sleep(3)
    try:
        # 找到“查看更多”的按钮并点击
        more_button = browser.find_element_by_css_selector('.comment-item .comment-operate .btn-append')
        more_button.click()
        sleep(3)
    except:
        # 没有“查看更多”按钮,说明评价数据已全部加载完成
        break

# 找到所有评价的元素
comments = browser.find_elements_by_css_selector('.comment-item')

# 遍历所有评价并输出评价内容
for comment in comments:
    content = comment.find_element_by_css_selector('.comment-con').text.strip()
    print(content)

# 关闭浏览器
browser.quit()
```

上面的代码中,我们首先使用`selenium`创建一个`Chrome`浏览器对象,并访问京东首页。然后,我们在搜索框中输入要搜索的商品名称,按下`Enter`键进行搜索。搜索结果页面加载完成后,我们找到商品列表中第一个商品的链接,并点击进入商品详情页。切换到新打开的标签页后,我们进入商品评价页面,然后模拟滚动加载评价数据,并在评价数据全部加载完成后,找到所有评价的元素,遍历所有评价并输出评价内容。最后,关闭浏览器。
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值