本文根据工作中爬取数据的实际需要整理而来。最初我使用了HttpClient+Jsoup,然而这种最简单的方式只能采集普通的静态页面数据,以及在浏览器F12调试窗口中可以看到URL的接口数据;对于需要模仿浏览器行为(比如点击事件),或者页面采用了JS框架进行重新布局的情况,就无能为力了。因此,针对此类情况,经过一番摸索,最后得到了下面这个相对好一点的实践方式:用Selenium驱动浏览器(Chrome或PhantomJS)来加载页面并模拟操作。下面废话不多说,来一个具体实践:抓取 https://www.sosobtc.com/ 网页上的数据。
第一步:创建Maven工程:mycrawler
第二步:导入Maven依赖:
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>com.szzc.crawler</groupId>
  <artifactId>mycrawler</artifactId>
  <version>0.0.1-SNAPSHOT</version>

  <properties>
    <!-- The Java sources contain non-ASCII comments; pin the encoding so the
         build does not depend on the platform default charset. -->
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>

  <dependencies>
    <dependency>
      <groupId>org.apache.commons</groupId>
      <artifactId>commons-lang3</artifactId>
      <version>3.3.2</version>
    </dependency>
    <!-- HTML parsing for plain static pages. -->
    <dependency>
      <groupId>org.jsoup</groupId>
      <artifactId>jsoup</artifactId>
      <version>1.8.1</version>
    </dependency>
    <dependency>
      <groupId>org.apache.httpcomponents</groupId>
      <artifactId>httpclient</artifactId>
      <version>4.3.5</version>
    </dependency>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.2</version>
      <scope>test</scope>
    </dependency>
    <!-- Browser automation (WebDriver API, ChromeDriver). -->
    <dependency>
      <groupId>org.seleniumhq.selenium</groupId>
      <artifactId>selenium-java</artifactId>
      <version>2.53.0</version>
    </dependency>
    <!-- Version and exclusions come from dependencyManagement below. -->
    <dependency>
      <groupId>com.opera</groupId>
      <artifactId>operadriver</artifactId>
    </dependency>
    <dependency>
      <groupId>org.apache.commons</groupId>
      <artifactId>commons-exec</artifactId>
      <version>1.3</version>
    </dependency>
    <dependency>
      <groupId>log4j</groupId>
      <artifactId>log4j</artifactId>
      <version>1.2.17</version>
    </dependency>
    <!-- WebDriver binding for the headless PhantomJS browser. -->
    <dependency>
      <groupId>com.github.detro</groupId>
      <artifactId>phantomjsdriver</artifactId>
      <version>1.2.0</version>
    </dependency>
  </dependencies>

  <dependencyManagement>
    <dependencies>
      <dependency>
        <groupId>com.opera</groupId>
        <artifactId>operadriver</artifactId>
        <version>0.16</version>
        <exclusions>
          <!-- Exclude operadriver's transitive selenium-remote-driver. -->
          <exclusion>
            <groupId>org.seleniumhq.selenium</groupId>
            <artifactId>selenium-remote-driver</artifactId>
          </exclusion>
        </exclusions>
      </dependency>
    </dependencies>
  </dependencyManagement>

  <build>
    <plugins>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <version>3.3</version>
        <!-- Plugin 3.3 defaults to source/target 1.5, which rejects the
             diamond operator (new HashMap<>()) used by the crawler classes. -->
        <configuration>
          <source>1.7</source>
          <target>1.7</target>
          <encoding>UTF-8</encoding>
        </configuration>
      </plugin>
    </plugins>
  </build>
</project>
第三步:封装的实体类CoinData:
package com.szzc;
/**
 * Plain data holder for one row of a coin's market table.
 *
 * <p>All price and volume values are kept as the raw text read from the page;
 * no numeric parsing is performed here.
 */
public class CoinData {
    /** Sequence number of the row, assigned by whoever populates this object. */
    private Integer rowId;
    /** Name of the trading market. */
    private String marketName;
    /** Latest price. */
    private String currentPrice;
    /** Platform price. */
    private String platformPrice;
    /** Highest price. */
    private String highestPrice;
    /** Lowest price. */
    private String lowestPrice;
    /** Price change (rise or fall). */
    private String upsAndDowns;
    /** Rate of increase. */
    private String increment;
    /** Trading volume. */
    private String trading;

    /** @return the row sequence number, or {@code null} if not set */
    public Integer getRowId() {
        return rowId;
    }

    /** @param rowId the row sequence number */
    public void setRowId(Integer rowId) {
        this.rowId = rowId;
    }

    /** @return the trading market name, or {@code null} if not set */
    public String getMarketName() {
        return marketName;
    }

    /** @param marketName the trading market name */
    public void setMarketName(String marketName) {
        this.marketName = marketName;
    }

    /** @return the latest price text, or {@code null} if not set */
    public String getCurrentPrice() {
        return currentPrice;
    }

    /** @param currentPrice the latest price text */
    public void setCurrentPrice(String currentPrice) {
        this.currentPrice = currentPrice;
    }

    /** @return the platform price text, or {@code null} if not set */
    public String getPlatformPrice() {
        return platformPrice;
    }

    /** @param platformPrice the platform price text */
    public void setPlatformPrice(String platformPrice) {
        this.platformPrice = platformPrice;
    }

    /** @return the highest price text, or {@code null} if not set */
    public String getHighestPrice() {
        return highestPrice;
    }

    /** @param highestPrice the highest price text */
    public void setHighestPrice(String highestPrice) {
        this.highestPrice = highestPrice;
    }

    /** @return the lowest price text, or {@code null} if not set */
    public String getLowestPrice() {
        return lowestPrice;
    }

    /** @param lowestPrice the lowest price text */
    public void setLowestPrice(String lowestPrice) {
        this.lowestPrice = lowestPrice;
    }

    /** @return the price change text, or {@code null} if not set */
    public String getUpsAndDowns() {
        return upsAndDowns;
    }

    /** @param upsAndDowns the price change text */
    public void setUpsAndDowns(String upsAndDowns) {
        this.upsAndDowns = upsAndDowns;
    }

    /** @return the rate-of-increase text, or {@code null} if not set */
    public String getIncrement() {
        return increment;
    }

    /** @param increment the rate-of-increase text */
    public void setIncrement(String increment) {
        this.increment = increment;
    }

    /** @return the trading volume text, or {@code null} if not set */
    public String getTrading() {
        return trading;
    }

    /** @param trading the trading volume text */
    public void setTrading(String trading) {
        this.trading = trading;
    }

    /**
     * Renders every field on a single line.
     *
     * <p>The {@code CurrentPrice} label keeps its historical capitalisation so
     * that the printed output stays identical for existing consumers.
     */
    @Override
    public String toString() {
        return "CoinData [rowId=" + rowId
                + ", marketName=" + marketName
                + ", CurrentPrice=" + currentPrice
                + ", platformPrice=" + platformPrice
                + ", highestPrice=" + highestPrice
                + ", lowestPrice=" + lowestPrice
                + ", upsAndDowns=" + upsAndDowns
                + ", increment=" + increment
                + ", trading=" + trading + "]";
    }
}
第四步:使用Chrome浏览器抓取数据的Main方法所在的类:
package com.szzc;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
/**
 * Scrapes the per-coin market tables from https://www.sosobtc.com/ by driving
 * a Chrome browser through Selenium: each coin tab is clicked in turn and the
 * rows of the table that becomes visible are read into {@link CoinData}.
 */
public class FirstTest {
    public static final String TR = "tr";
    public static final String TD = "td";
    /**
     * Kept only so that existing references to this public field still
     * compile. Row numbering now uses a counter local to
     * {@link #getCoidData}, so this field is no longer read or modified.
     */
    public static Integer ROWID = 1;

    /** Number of cells a table row must have to be mapped onto a CoinData. */
    private static final int COLUMN_COUNT = 8;

    /** Ids of the divs holding each coin's data table (parallel to liIds). */
    private static final String[] tableDiv = {
        "default_market_tabs-pane-btc",
        "default_market_tabs-pane-ltc",
        "default_market_tabs-pane-eth",
        "default_market_tabs-pane-etc",
    };

    /** Ids of the li tabs that switch between coins (parallel to tableDiv). */
    private static final String[] liIds = {
        "default_market_tabs-tab-btc",
        "default_market_tabs-tab-ltc",
        "default_market_tabs-tab-eth",
        "default_market_tabs-tab-etc",
    };

    /** Map keys for the scraped coins (parallel to liIds / tableDiv). */
    private static final String[] COIN_NAMES = {"btc", "ltc", "eth", "etc"};

    /**
     * Opens the page in Chrome, scrapes all four coin tables and prints every
     * row to standard output, coin by coin in tab order.
     *
     * @param args unused
     * @throws Exception if the browser cannot be driven or a wait is interrupted
     */
    public static void main(String[] args) throws Exception {
        // Fall back to the default chromedriver location only when the caller
        // has not already supplied one (e.g. via -Dwebdriver.chrome.driver=...).
        if (System.getProperty("webdriver.chrome.driver") == null) {
            System.setProperty("webdriver.chrome.driver", "D:/Google/chromedriver.exe");
        }
        ChromeOptions options = new ChromeOptions();
        options.addArguments("--start-maximized", "allow-running-insecure-content", "--test-type");
        WebDriver driver = new ChromeDriver(options);
        try {
            // Open the sosobtc.com page.
            driver.get("https://www.sosobtc.com/");
            // Give the page time to finish its initial rendering.
            Thread.sleep(5000);

            // Scraped rows for each of the four coins, keyed by coin name.
            Map<String, List<CoinData>> data = new HashMap<>();
            // Click each li tab in turn and collect the table it reveals.
            for (int i = 0; i < liIds.length; i++) {
                data.put(COIN_NAMES[i], getCoidData(driver, liIds[i], tableDiv[i]));
            }

            // Iterate the name array rather than the map's key set so the
            // output order matches the tab order instead of hash order.
            for (String coinName : COIN_NAMES) {
                for (CoinData coinData : data.get(coinName)) {
                    System.out.println(coinData);
                }
            }
        } finally {
            // Always close the browser, even when scraping fails part-way.
            driver.quit();
        }
    }

    /**
     * Clicks one coin tab and reads the rows of the associated table.
     *
     * <p>Rows with fewer than eight {@code td} cells (for example header rows)
     * are skipped. Row ids start at 1 for every call.
     *
     * @param driver the browser session, already showing the page
     * @param liId id of the li tab that switches the displayed table
     * @param id id of the div that contains the data table
     * @return one CoinData per complete table row, in page order
     * @throws Exception if an element cannot be found or the wait is interrupted
     */
    public static List<CoinData> getCoidData(WebDriver driver, String liId, String id) throws Exception {
        // Click the tab so that this coin's data is displayed.
        driver.findElement(By.id(liId)).click();
        // Give the data time to load.
        Thread.sleep(500L);
        // Locate the div that wraps the data table, then all of its rows.
        WebElement div = driver.findElement(By.id(id));
        List<WebElement> trs = div.findElements(By.tagName(TR));

        List<CoinData> coinDataList = new ArrayList<>();
        int rowId = 1;
        for (WebElement tr : trs) {
            List<WebElement> tds = tr.findElements(By.tagName(TD));
            // Skip rows that cannot fill every field; indexing them would throw.
            if (tds == null || tds.size() < COLUMN_COUNT) {
                continue;
            }
            CoinData coinData = new CoinData();
            coinData.setRowId(rowId++);
            coinData.setMarketName(tds.get(0).getText());
            coinData.setCurrentPrice(tds.get(1).getText());
            coinData.setPlatformPrice(tds.get(2).getText());
            coinData.setHighestPrice(tds.get(3).getText());
            coinData.setLowestPrice(tds.get(4).getText());
            coinData.setUpsAndDowns(tds.get(5).getText());
            coinData.setIncrement(tds.get(6).getText());
            coinData.setTrading(tds.get(7).getText());
            coinDataList.add(coinData);
        }
        return coinDataList;
    }
}
第五步:使用PhantomJS无头浏览器(无需打开浏览器窗口)实现抓取的Main方法所在的类:
package com.szzc;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.phantomjs.PhantomJSDriver;
import org.openqa.selenium.remote.DesiredCapabilities;
/**
 * Scrapes the per-coin market tables from https://www.sosobtc.com/ using the
 * PhantomJS driver instead of Chrome: each coin tab is clicked in turn and
 * the rows of the associated table are read into {@link CoinData}.
 */
public class SecondTest {
    public static final String TR = "tr";
    public static final String TD = "td";
    /**
     * Kept only so that existing references to this public field still
     * compile. Row numbering now uses a counter local to
     * {@link #getCoidData}, so this field is no longer read or modified.
     */
    public static Integer ROWID = 1;

    /** Number of cells a table row must have to be mapped onto a CoinData. */
    private static final int COLUMN_COUNT = 8;

    /** Ids of the divs holding each coin's data table (parallel to liIds). */
    private static final String[] tableDiv = {
        "default_market_tabs-pane-btc",
        "default_market_tabs-pane-ltc",
        "default_market_tabs-pane-eth",
        "default_market_tabs-pane-etc",
    };

    /** Ids of the li tabs that switch between coins (parallel to tableDiv). */
    private static final String[] liIds = {
        "default_market_tabs-tab-btc",
        "default_market_tabs-tab-ltc",
        "default_market_tabs-tab-eth",
        "default_market_tabs-tab-etc",
    };

    /** Map keys for the scraped coins (parallel to liIds / tableDiv). */
    private static final String[] COIN_NAMES = {"btc", "ltc", "eth", "etc"};

    /**
     * Starts PhantomJS, scrapes all four coin tables and prints every row to
     * standard output, coin by coin in tab order.
     *
     * @param args unused
     * @throws Exception if the browser cannot be driven or a wait is interrupted
     */
    public static void main(String[] args) throws Exception {
        // Pick the PhantomJS binary for the current OS. Setting the property
        // twice in a row would let the second value always win, so choose one;
        // a value supplied by the caller (-Dphantomjs.binary.path=...) is kept.
        if (System.getProperty("phantomjs.binary.path") == null) {
            boolean onWindows = System.getProperty("os.name", "").startsWith("Windows");
            System.setProperty("phantomjs.binary.path",
                    onWindows ? "./phantomjs/win/phantomjs.exe" : "/usr/bin/phantomjs");
        }
        DesiredCapabilities desiredCapabilities = DesiredCapabilities.phantomjs();
        // Further capabilities (e.g. browser header information) can be set here.
        WebDriver driver = new PhantomJSDriver(desiredCapabilities);
        try {
            // Open the sosobtc.com page.
            driver.get("https://www.sosobtc.com/");
            // Give the page time to finish its initial rendering.
            Thread.sleep(5000);

            // Scraped rows for each of the four coins, keyed by coin name.
            Map<String, List<CoinData>> data = new HashMap<>();
            // Click each li tab in turn and collect the table it reveals.
            for (int i = 0; i < liIds.length; i++) {
                data.put(COIN_NAMES[i], getCoidData(driver, liIds[i], tableDiv[i]));
            }

            // Iterate the name array rather than the map's key set so the
            // output order matches the tab order instead of hash order.
            for (String coinName : COIN_NAMES) {
                for (CoinData coinData : data.get(coinName)) {
                    System.out.println(coinData);
                }
            }
        } finally {
            // Always shut the browser process down, even when scraping fails.
            driver.quit();
        }
    }

    /**
     * Clicks one coin tab and reads the rows of the associated table.
     *
     * <p>Rows with fewer than eight {@code td} cells (for example header rows)
     * are skipped. Row ids start at 1 for every call.
     *
     * @param driver the browser session, already showing the page
     * @param liId id of the li tab that switches the displayed table
     * @param id id of the div that contains the data table
     * @return one CoinData per complete table row, in page order
     * @throws Exception if an element cannot be found or the wait is interrupted
     */
    public static List<CoinData> getCoidData(WebDriver driver, String liId, String id) throws Exception {
        // Click the tab so that this coin's data is displayed.
        driver.findElement(By.id(liId)).click();
        // Give the data time to load.
        Thread.sleep(500L);
        // Locate the div that wraps the data table, then all of its rows.
        WebElement div = driver.findElement(By.id(id));
        List<WebElement> trs = div.findElements(By.tagName(TR));

        List<CoinData> coinDataList = new ArrayList<>();
        int rowId = 1;
        for (WebElement tr : trs) {
            List<WebElement> tds = tr.findElements(By.tagName(TD));
            // Skip rows that cannot fill every field; indexing them would throw.
            if (tds == null || tds.size() < COLUMN_COUNT) {
                continue;
            }
            CoinData coinData = new CoinData();
            coinData.setRowId(rowId++);
            coinData.setMarketName(tds.get(0).getText());
            coinData.setCurrentPrice(tds.get(1).getText());
            coinData.setPlatformPrice(tds.get(2).getText());
            coinData.setHighestPrice(tds.get(3).getText());
            coinData.setLowestPrice(tds.get(4).getText());
            coinData.setUpsAndDowns(tds.get(5).getText());
            coinData.setIncrement(tds.get(6).getText());
            coinData.setTrading(tds.get(7).getText());
            coinDataList.add(coinData);
        }
        return coinDataList;
    }
}
至此,我们已经可以比较完整地模仿浏览器的行为,来抓取一些简单的网页数据了。