Java 使用 Selenium 和 WebMagic 的 PageProcessor 抓取数据,包含两种抓取方式:第一种是封装页面后解析抓取,第二种是直接调用接口抓取。

第一部分 准备工作:在 pom 文件中导入依赖,包括自动控制浏览器(Selenium)、采集页面(WebMagic)、表格操作(POI、opencsv)以及 JSON 格式化

 <dependencies>
        <dependency>
            <groupId>org.seleniumhq.selenium</groupId>
            <artifactId>selenium-java</artifactId>
            <version>3.13.0</version>
        </dependency>
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>0.7.3</version>
        </dependency>
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-extension</artifactId>
            <version>0.7.3</version>
        </dependency>
        <dependency>
            <groupId>org.json</groupId>
            <artifactId>json</artifactId>
            <version>20160810</version>
        </dependency>
        <!--使用代理-->
        <dependency>
            <groupId>net.lightbody.bmp</groupId>
            <artifactId>browsermob-core</artifactId>
            <version>2.1.5</version>
        </dependency>

        <dependency>
            <groupId>net.lightbody.bmp</groupId>
            <artifactId>browsermob-legacy</artifactId>
            <version>2.1.5</version>
        </dependency>
        <!--java发送请求-->
        <!-- commons-httpclient http请求工具依赖包 -->
        <!-- apache http请求工具依赖包 -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.0.2</version>
        </dependency>
        <!--自动生成 getset-->
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <version>1.16.18</version>
        </dependency>
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi</artifactId>
            <version>5.0.0</version>
        </dependency>

        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-ooxml</artifactId>
            <version>5.0.0</version>
        </dependency>
        <dependency>
            <groupId>com.opencsv</groupId>
            <artifactId>opencsv</artifactId>
            <version>4.4</version>
        </dependency>
</dependencies>

第二部分  根据自己的需要准备并导入 chromedriver(这种方式对应的是自动打开页面,并封装页面操作)

第三部分 上代码

package main;

import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.remote.RemoteWebDriver;
import page.ApplePage;
import page.HonorPage;
import us.codecraft.webmagic.Page;
import util.Util;

import java.io.IOException;
import java.util.List;

/**
 * Entry point that drives a Chrome browser through the Honor retail-store locator page and
 * hands the rendered HTML of every search to {@link HonorPage} for extraction.
 */
public class HonorMain {
    /**
     * Opens the store locator, runs one search per entry returned by {@code Util.getShi()} and
     * passes each resulting page to {@code HonorPage.process}.
     *
     * <p>The browser is always closed when the method finishes, including when a lookup or the
     * page processing throws, so no Chrome / chromedriver process is left running.
     *
     * @param args unused
     * @throws IOException declared for the project helpers called here (presumably {@code Util})
     * @throws InterruptedException if the thread is interrupted during one of the fixed waits
     */
    public static void main(String[] args) throws IOException, InterruptedException {
        HonorPage processor = new HonorPage();
        // Tell Selenium where the chromedriver executable lives.
        System.setProperty("webdriver.chrome.driver","C:\\Program Files\\Google\\Chrome\\Application\\chromedriver.exe");
        // Chrome start-up options.
        ChromeOptions chromeOptions = new ChromeOptions();
        // Headless mode is intentionally left disabled:
        //chromeOptions.addArguments("--headless");
        // Size of the browser window.
        chromeOptions.addArguments("--window-size=1920,1080");

        // Create the driver from the options above.
        RemoteWebDriver driver = new ChromeDriver(chromeOptions);
        try {
            // Load the Honor store locator.
            driver.get("https://www.hihonor.com/cn/retail/");
            // Presumably dismisses a confirmation dialog shown on first load -- verify against the page.
            driver.findElementByClassName("bnt-yes").click();
            List<String> keywords = Util.getShi();
            for (String keyword : keywords) {
                Thread.sleep(1000);
                driver.findElementById("search-text").sendKeys(keyword);
                driver.findElementById("searchMapBtn").click();
                // Fixed wait to give the result list time to render before the source is read.
                Thread.sleep(2000);
                // Empty the input so the next keyword is not appended to this one.
                driver.findElementById("search-text").clear();
                Page page = Util.createPage(driver.getPageSource(), driver.getCurrentUrl());
                processor.process(page);
            }
        } finally {
            // Ends the WebDriver session and closes the browser even if the loop failed.
            driver.quit();
        }
    }
}

第四部分 用封装好的页面 采集数据

package page;

import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;

/**
 * {@link PageProcessor} that reads the store list out of an already rendered Honor
 * store-locator page and prints one line per store to standard output.
 */
public class HonorPage implements PageProcessor {
    /**
     * Prints the {@code data-reg} attribute, title and address of every {@code <li>} inside the
     * first element with class {@code map-list}.
     *
     * <p>If the page has no {@code map-list} element, a note is written to standard error and
     * the method returns instead of throwing {@link IndexOutOfBoundsException}, so one empty
     * page does not abort the caller's scraping loop.
     *
     * @param page the page whose HTML should be scanned
     */
    @Override
    public void process(Page page) {
        Document document = page.getHtml().getDocument();
        Elements containers = document.getElementsByClass("map-list");
        if (containers.isEmpty()) {
            System.err.println("No map-list element found on " + page.getUrl());
            return;
        }
        for (Element item : containers.get(0).getElementsByTag("li")) {
            String attr = item.attr("data-reg");
            String title = item.getElementsByClass("li-font-title").text();
            String address = item.getElementsByClass("map-list-address").text();
            System.out.println(attr+title+address);
        }
    }

    /**
     * Returns a default WebMagic site configuration.
     *
     * @return a fresh {@code Site.me()} instance
     */
    @Override
    public Site getSite() {
        return Site.me();
    }
}

--------------------------------------------------------封装页面,采集数据就完事了---------------------------------

使用接口采集数据并解析json

上代码

package main.get;

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import entity.JW;
import entity.apple.Apple;
import entity.apple.Badgesarray;
import entity.apple.Productmatrixlist;
import entity.apple.Stores;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.conn.ClientConnectionManager;
import org.apache.http.conn.scheme.Scheme;
import org.apache.http.conn.scheme.SchemeRegistry;
import org.apache.http.conn.ssl.SSLSocketFactory;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;
import out.execl.OutXlsx;
import reade.excel.ReadExcel;

import javax.net.ssl.SSLContext;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;
import java.io.IOException;
import java.security.cert.X509Certificate;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

;


/**
 * Collects Apple store data by calling the locate.apple.com JSON endpoint once per coordinate
 * read from a spreadsheet, then writes the merged result through {@code OutXlsx.OutApple}.
 */
public class GetApple {
    /**
     * Queries the store API for every coordinate returned by {@code ReadExcel.readJW()},
     * de-duplicates the stores in a set and exports them.
     *
     * <p>The badge and product-matrix lists passed to the exporter are the ones from the last
     * response that contained stores; both stay {@code null} if no response did.
     *
     * @param args unused
     * @throws IOException if reading a response body fails, or from the project helpers
     */
    public static void main(String[] args) throws IOException {
        Set<Stores> list = new HashSet<>();
        List<JW> jw = ReadExcel.readJW();

        List<Badgesarray> b = null;
        List<Productmatrixlist> p = null;
        for (JW point : jw) {
            // Running total of distinct stores collected so far.
            System.out.println("zong:"+list.size());
            String jd = point.getJ(); // sent as the "lon" query parameter
            String wd = point.getW(); // sent as the "lat" query parameter
            Apple apple = getHuaWeiJson("https://locate.apple.com/api/v1/grlui/cn/zh/sales?pt=all&lat=" + wd + "&lon=" + jd + "&carrier=&maxrad=300&maxResult=99&repairType=");
            System.out.println();
            if (apple == null || apple.getResults() == null) {
                continue;
            }
            Set<Stores> stores = apple.getResults().getStores();
            if (stores == null) {
                // Check before touching the set: calling size() on a missing set used to throw.
                continue;
            }
            System.out.println("本次抓取:"+stores.size());
            list.addAll(stores);
            b = apple.getResults().getBadgesarray();
            p = apple.getResults().getProductmatrixlist();
        }
        OutXlsx.OutApple(list, b, p);
    }

    /**
     * Performs a GET on {@code url} and maps the JSON body onto an {@link Apple} object.
     *
     * <p>Despite its name, this method is used here for the Apple endpoint; the name is kept so
     * existing callers continue to work.
     *
     * @param url absolute URL to request
     * @return the parsed response, or {@code null} if the client could not be created, the
     *     request failed, the status was not 200, or the body did not contain a JSON object
     * @throws IOException if reading the response body fails
     */
    public static Apple getHuaWeiJson(String url) throws IOException {
        HttpClient base = new DefaultHttpClient();
        HttpClient client = wrapClient(base);
        if (client == null) {
            // wrapClient already reported the failure; release the unused base client.
            base.getConnectionManager().shutdown();
            return null;
        }
        try {
            HttpGet httpGet = new HttpGet(url);
            // No Accept-Encoding header is sent on purpose: the body is read as plain text and
            // is not decompressed here.
            HttpResponse httpResponse;
            int statusCode;
            try {
                httpResponse = client.execute(httpGet);
                statusCode = httpResponse.getStatusLine().getStatusCode();
            } catch (Exception e) {
                // Best effort: a failed request is reported and skipped, not fatal.
                System.out.println(e);
                return null;
            }

            // 200 OK; typical failures: 404 not found, 406 bad parameters, 500 server error,
            // 502 bad gateway, 504 timeout.
            if (statusCode != 200) {
                return null;
            }
            // UTF-8 is only the fallback when the response does not declare a charset; without
            // it the library falls back to ISO-8859-1, which garbles Chinese text.
            String body = EntityUtils.toString(httpResponse.getEntity(), "UTF-8");
            JSONObject jsonObject = JSON.parseObject(body);
            return jsonObject == null ? null : jsonObject.toJavaObject(Apple.class);
        } finally {
            // The wrapped client shares the base client's connection manager, so this releases
            // the connections of both.
            client.getConnectionManager().shutdown();
        }
    }

    /**
     * Returns a client that shares {@code base}'s connection manager and parameters but accepts
     * any HTTPS certificate and any host name.
     *
     * <p>SECURITY NOTE(review): certificate and host-name verification are disabled, so the
     * connection is open to man-in-the-middle attacks. Confirm this is really needed for the
     * target host before using it outside of a throw-away scraper.
     *
     * @param base client whose connection manager and parameters are reused
     * @return the HTTPS-enabled client, or {@code null} if the TLS context could not be set up
     */
    public static HttpClient wrapClient(HttpClient base) {
        try {
            SSLContext ctx = SSLContext.getInstance("TLS");
            X509TrustManager tm = new X509TrustManager() {
                @Override
                public void checkClientTrusted(X509Certificate[] xcs,
                                               String string) {
                    // Accept everything (see security note on wrapClient).
                }

                @Override
                public void checkServerTrusted(X509Certificate[] xcs,
                                               String string) {
                    // Accept everything (see security note on wrapClient).
                }

                @Override
                public X509Certificate[] getAcceptedIssuers() {
                    // The interface contract requires a non-null, possibly empty, array.
                    return new X509Certificate[0];
                }
            };
            ctx.init(null, new TrustManager[] { tm }, null);
            SSLSocketFactory ssf = new SSLSocketFactory(ctx);
            ssf.setHostnameVerifier(SSLSocketFactory.ALLOW_ALL_HOSTNAME_VERIFIER);
            ClientConnectionManager ccm = base.getConnectionManager();
            SchemeRegistry sr = ccm.getSchemeRegistry();
            sr.register(new Scheme("https", ssf, 443));
            return new DefaultHttpClient(ccm, base.getParams());
        } catch (Exception ex) {
            ex.printStackTrace();
            return null;
        }
    }
}



  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值