第一部分 准备工作:需要在pom文件中导入依赖,其中包括自动控制浏览器、采集页面、表格操作、json格式化等工具
<dependencies>
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-java</artifactId>
<version>3.13.0</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.7.3</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.7.3</version>
</dependency>
<dependency>
<groupId>org.json</groupId>
<artifactId>json</artifactId>
<version>20160810</version>
</dependency>
<!--使用代理-->
<dependency>
<groupId>net.lightbody.bmp</groupId>
<artifactId>browsermob-core</artifactId>
<version>2.1.5</version>
</dependency>
<dependency>
<groupId>net.lightbody.bmp</groupId>
<artifactId>browsermob-legacy</artifactId>
<version>2.1.5</version>
</dependency>
<!--java发送请求-->
<!-- commons-httpclient http请求工具依赖包 -->
<!-- apache http请求工具依赖包 -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.0.2</version>
</dependency>
<!--自动生成 getset-->
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.16.18</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>5.0.0</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>5.0.0</version>
</dependency>
<dependency>
<groupId>com.opencsv</groupId>
<artifactId>opencsv</artifactId>
<version>4.4</version>
</dependency>
</dependencies>
第二部分 根据自己的需要准备好chromedriver(这种方式对应的是自动打开页面,并封装页面操作等)
第三部分 上代码
package main;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.remote.RemoteWebDriver;
import page.ApplePage;
import page.HonorPage;
import us.codecraft.webmagic.Page;
import util.Util;
import java.io.IOException;
import java.util.List;
/**
 * Entry point that drives a local Chrome browser through the Honor retail
 * store locator and hands every rendered result page to {@code HonorPage}.
 */
public class HonorMain {
    /**
     * Opens the store locator, dismisses the initial prompt, then searches once
     * for every keyword returned by {@code Util.getShi()} and passes the
     * resulting page source to {@code HonorPage.process}.
     *
     * @param args unused
     * @throws IOException          declared for callers; may be raised by the helpers in {@code Util}
     * @throws InterruptedException if the thread is interrupted during one of the fixed waits
     */
    public static void main(String[] args) throws IOException, InterruptedException {
        HonorPage honorPage = new HonorPage();

        // Tell Selenium where the chromedriver binary lives.
        System.setProperty("webdriver.chrome.driver", "C:\\Program Files\\Google\\Chrome\\Application\\chromedriver.exe");

        ChromeOptions chromeOptions = new ChromeOptions();
        // Headless mode is intentionally left off; add "--headless" here to enable it.
        chromeOptions.addArguments("--window-size=1920,1080");

        RemoteWebDriver driver = new ChromeDriver(chromeOptions);
        try {
            driver.get("https://www.hihonor.com/cn/retail/");
            // Dismiss the prompt shown on first load.
            driver.findElementByClassName("bnt-yes").click();

            // presumably a list of city names to search for; verify against Util.getShi()
            List<String> list = Util.getShi();
            for (String keyword : list) {
                Thread.sleep(1000);
                driver.findElementById("search-text").sendKeys(keyword);
                driver.findElementById("searchMapBtn").click();
                // Fixed wait so the result list can render before it is read.
                Thread.sleep(2000);
                driver.findElementById("search-text").clear();

                Page page = Util.createPage(driver.getPageSource(), driver.getCurrentUrl());
                honorPage.process(page);
            }
        } finally {
            // Always shut the browser and the chromedriver process down, even when a
            // lookup fails half-way; otherwise they keep running after the JVM exits.
            driver.quit();
        }
    }
}
第四部分 用封装好的页面 采集数据
package page;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
/**
 * WebMagic page processor that prints the store entries found in the
 * {@code map-list} element of a rendered Honor store-locator page.
 */
public class HonorPage implements PageProcessor {
    /**
     * Prints one line per {@code li} inside the first {@code map-list} element:
     * the {@code data-reg} attribute, the title text and the address text,
     * concatenated without separators.
     *
     * <p>Pages without a {@code map-list} element are skipped silently instead of
     * failing, so one empty search result does not abort the whole crawl.
     *
     * @param page the page whose HTML document is inspected
     */
    @Override
    public void process(Page page) {
        Document document = page.getHtml().getDocument();
        Elements containers = document.getElementsByClass("map-list");
        if (containers.isEmpty()) {
            // No result list on this page; get(0) would throw IndexOutOfBoundsException.
            return;
        }

        for (Element item : containers.get(0).getElementsByTag("li")) {
            String attr = item.attr("data-reg");
            String title = item.getElementsByClass("li-font-title").text();
            String address = item.getElementsByClass("map-list-address").text();
            System.out.println(attr + title + address);
        }
    }

    /**
     * Returns a default site configuration.
     *
     * @return {@code Site.me()}
     */
    @Override
    public Site getSite() {
        return Site.me();
    }
}
--------------------------------------------------------封装页面,采集数据就完事了---------------------------------
使用接口采集数据并解析json
上代码
package main.get;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import entity.JW;
import entity.apple.Apple;
import entity.apple.Badgesarray;
import entity.apple.Productmatrixlist;
import entity.apple.Stores;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.conn.ClientConnectionManager;
import org.apache.http.conn.scheme.Scheme;
import org.apache.http.conn.scheme.SchemeRegistry;
import org.apache.http.conn.ssl.SSLSocketFactory;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;
import out.execl.OutXlsx;
import reade.excel.ReadExcel;
import javax.net.ssl.SSLContext;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;
import java.io.IOException;
import java.security.cert.X509Certificate;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
;
/**
 * Queries the locate.apple.com sales-locator JSON endpoint for a list of
 * coordinates, merges the returned stores and hands the result to
 * {@code OutXlsx.OutApple} (presumably a spreadsheet export; verify against OutXlsx).
 */
public class GetApple {
    /**
     * Reads coordinates via {@code ReadExcel.readJW()}, requests the stores around
     * each one and collects them in a set so duplicates are merged.
     *
     * <p>The badge and product-matrix lists passed to the export are those of the
     * last response that contained stores.
     *
     * @param args unused
     * @throws IOException if reading a response body fails
     */
    public static void main(String[] args) throws IOException {
        Set<Stores> list = new HashSet<>();
        List<JW> jw = ReadExcel.readJW();
        List<Badgesarray> b = null;
        List<Productmatrixlist> p = null;

        for (JW point : jw) {
            System.out.println("zong:" + list.size());
            String jd = point.getJ();
            String wd = point.getW();
            Apple apple = getHuaWeiJson("https://locate.apple.com/api/v1/grlui/cn/zh/sales?pt=all&lat=" + wd + "&lon=" + jd + "&carrier=&maxrad=300&maxResult=99&repairType=");
            System.out.println();

            // A failed request or a response without a results section yields nothing to merge.
            if (apple == null || apple.getResults() == null) {
                continue;
            }
            Set<Stores> stores = apple.getResults().getStores();
            // Check for null BEFORE touching the set; size() on a null set would throw.
            if (stores == null) {
                continue;
            }
            System.out.println("本次抓取:" + stores.size());
            list.addAll(stores);
            b = apple.getResults().getBadgesarray();
            p = apple.getResults().getProductmatrixlist();
        }
        OutXlsx.OutApple(list, b, p);
    }

    /**
     * Performs a GET request and maps a JSON response body onto {@code Apple}.
     *
     * <p>No custom request headers are sent. In particular no {@code Accept-Encoding}
     * header is set, so the body is read as plain text.
     *
     * @param url the absolute URL to request
     * @return the parsed response, or {@code null} when the request fails, the status
     *         is not 200, or the body is empty
     * @throws IOException if reading the response body fails
     */
    public static Apple getHuaWeiJson(String url) throws IOException {
        HttpClient client = wrapClient(new DefaultHttpClient());
        HttpGet httpGet = new HttpGet(url);
        try {
            HttpResponse httpResponse;
            int statusCode;
            try {
                httpResponse = client.execute(httpGet);
                statusCode = httpResponse.getStatusLine().getStatusCode();
            } catch (Exception e) {
                // Best effort: one unreachable coordinate must not stop the whole run.
                System.err.println("request failed for " + url + ": " + e);
                return null;
            }

            // 200 OK is the only status whose body is parsed; everything else is skipped.
            if (statusCode != 200 || httpResponse.getEntity() == null) {
                return null;
            }

            // Default to UTF-8 when the server names no charset; the library default
            // (ISO-8859-1) would corrupt non-Latin text.
            String s = EntityUtils.toString(httpResponse.getEntity(), "UTF-8");
            JSONObject jsonObject = JSON.parseObject(s);
            if (jsonObject == null) {
                return null;
            }
            return jsonObject.toJavaObject(Apple.class);
        } finally {
            // A fresh client is created per call, so release its connections here;
            // otherwise every request (including non-200 ones) leaks a connection.
            client.getConnectionManager().shutdown();
        }
    }

    /**
     * Returns a client that shares {@code base}'s connection manager and parameters
     * but accepts any HTTPS certificate and any host name.
     *
     * <p>NOTE(review): this disables TLS certificate and host-name validation and so
     * allows man-in-the-middle attacks. Kept because the crawler relies on it;
     * confirm whether it is still needed.
     *
     * @param base the client whose connection manager gets the permissive https scheme
     * @return the wrapped client, or {@code base} itself when the SSL setup fails
     */
    public static HttpClient wrapClient(HttpClient base) {
        try {
            SSLContext ctx = SSLContext.getInstance("TLS");
            X509TrustManager tm = new X509TrustManager() {
                @Override
                public void checkClientTrusted(X509Certificate[] xcs, String string) {
                    // Accept every client certificate.
                }

                @Override
                public void checkServerTrusted(X509Certificate[] xcs, String string) {
                    // Accept every server certificate.
                }

                @Override
                public X509Certificate[] getAcceptedIssuers() {
                    // The interface contract requires a non-null (possibly empty) array.
                    return new X509Certificate[0];
                }
            };
            ctx.init(null, new TrustManager[] { tm }, null);
            SSLSocketFactory ssf = new SSLSocketFactory(ctx);
            ssf.setHostnameVerifier(SSLSocketFactory.ALLOW_ALL_HOSTNAME_VERIFIER);
            ClientConnectionManager ccm = base.getConnectionManager();
            SchemeRegistry sr = ccm.getSchemeRegistry();
            sr.register(new Scheme("https", ssf, 443));
            return new DefaultHttpClient(ccm, base.getParams());
        } catch (Exception ex) {
            ex.printStackTrace();
            // Fall back to the unmodified client rather than null, which would only
            // surface later as a NullPointerException in the caller.
            return base;
        }
    }
}