爬数据 phantomjs +selenium

1.引入Java依赖包(Maven)
<dependency>
   <groupId>org.apache.httpcomponents</groupId>
   <artifactId>httpmime</artifactId>
   <version>4.5.2</version>
</dependency>
<dependency>
   <groupId>org.seleniumhq.selenium</groupId>
   <artifactId>selenium-java</artifactId>
   <version>3.0.1</version>
</dependency>
<dependency>
   <groupId>org.seleniumhq.selenium</groupId>
   <artifactId>selenium-chrome-driver</artifactId>
   <version>3.0.1</version>
</dependency>

<dependency>
   <groupId>org.seleniumhq.selenium</groupId>
   <artifactId>selenium-remote-driver</artifactId>
   <version>3.0.1</version>
</dependency>

<dependency>
   <groupId>com.codeborne</groupId>
   <artifactId>phantomjsdriver</artifactId>
   <version>1.2.1</version>
</dependency>

<dependency>
   <groupId>org.apache.commons</groupId>
   <artifactId>commons-exec</artifactId>
   <version>1.3</version>
</dependency>

2.封装验证码识别API(由于登录页面带有验证码,使用斐斐打码平台识别验证码)

import java.util.Date;
import java.net.URL;

/**
 * Thin client for the captcha-recognition HTTP API hosted at {@code pred_url}.
 *
 * <p>{@link #Init} must be called before any other method: it stores the
 * credentials and the service base URL that every request relies on.
 * All network and parsing work is delegated to the external {@code Util} class.
 */
public class GetCodeApi {
    /** Optional application id; when non-empty, {@link #Predict} signs the request with it. */
    protected String app_id;
    /** Key paired with {@link #app_id}; used only to compute the application signature. */
    protected String app_key;
    /** Account id; sent to the service as the {@code user_id} parameter. */
    protected String pd_id;
    /** Key paired with {@link #pd_id}; used to compute the request signature. */
    protected String pd_key;
    /** Base URL of the recognition service, set by {@link #Init}. */
    protected String pred_url;

    /**
     * Stores the credentials and sets the service base URL.
     *
     * @param app_id  application id (may be empty when no application signature is wanted)
     * @param app_key application key
     * @param pd_id   account id, sent as {@code user_id}
     * @param pd_key  account key
     */
    public void Init(String app_id, String app_key, String pd_id, String pd_key){
        this.app_id     = app_id;
        this.app_key    = app_key;
        this.pd_id     = pd_id;
        this.pd_key    = pd_key;
        this.pred_url   = "http://pred.fateadm.com";
    }

    /** Returns the current Unix time in whole seconds, as the string the API expects. */
    private static String currentTimestamp() {
        // The API wants second precision, hence the division by 1000.
        return String.valueOf(System.currentTimeMillis() / 1000);
    }

    /**
     * Queries the account balance.
     *
     * @return the parsed response; {@code ret_code} is 0 on success and
     *         {@code err_msg} carries the failure detail otherwise
     * @throws Exception if the request or the response parsing fails
     */
    public Util.HttpResp QueryBalc() throws Exception{
        String stm      = currentTimestamp();
        String sign     = Util.CalcSign(pd_id, pd_key, stm);
        String url      = this.pred_url + "/api/custval";
        String params   = "user_id="+this.pd_id + "&timestamp=" + stm + "&sign=" + sign;
        return Util.ParseHttpResp(Util.HttpPost(url, params));
    }

    /**
     * Queries the account balance and returns only the balance value.
     *
     * @return the {@code cust_val} field of the balance response
     * @throws Exception if the request or the response parsing fails
     */
    public double QueryBalcExtend() throws Exception {
        return QueryBalc().cust_val;
    }

    /**
     * Tops up the account with a recharge card.
     *
     * @param cardid  recharge card number
     * @param cardkey recharge card secret, used only to compute the card signature
     * @return the parsed response; {@code ret_code} is 0 on success and
     *         {@code err_msg} carries the failure detail otherwise
     * @throws Exception if the request or the response parsing fails
     */
    public Util.HttpResp Charge(String cardid, String cardkey) throws Exception{
        String stm      = currentTimestamp();
        String sign     = Util.CalcSign(pd_id, pd_key, stm);
        String csign    = Util.CalcMd5(pd_key + stm + cardid + cardkey);
        String url      = this.pred_url + "/api/charge";
        String params   = "user_id=" + pd_id + "&timestamp=" + stm + "&sign=" + sign + "&cardid=" + cardid + "&csign=" + csign;
        return Util.ParseHttpResp(Util.HttpPost(url, params));
    }

    /**
     * Tops up the account and returns only the status code.
     *
     * <p>Public like the other {@code *Extend} convenience wrappers; it was
     * previously private and therefore unreachable.
     *
     * @param cardid  recharge card number
     * @param cardkey recharge card secret
     * @return 0 when the top-up succeeded
     * @throws Exception if the request or the response parsing fails
     */
    public int ChargeExtend(String cardid, String cardkey)  throws Exception {
        return Charge(cardid, cardkey).ret_code;
    }


    /**
     * Recognises a captcha stored in a file.
     *
     * @param pred_type recognition type code
     * @param file_name path of the image file
     * @return the parsed response ({@code ret_code} 0 on success, {@code err_msg}
     *         on failure, {@code req_Id} the order id, {@code pred_resl} the
     *         recognised text); when the file cannot be read, a locally built
     *         response with {@code ret_code == -1}
     * @throws Exception if the request or the response parsing fails
     */
    public Util.HttpResp PredictFromFile(String pred_type, String file_name)throws Exception{
        byte[] file_data    = Util.ReadBinaryFile(file_name);
        if( file_data == null){
            // Nothing was sent to the service; report the failure locally.
            Util.HttpResp resp  = new Util.HttpResp();
            resp.ret_code       = -1;
            resp.err_msg        = "ERROR: read file failed! file_name: " + file_name;
            return resp;
        }
        return Predict(pred_type, file_data);
    }

    /**
     * Recognises a captcha stored in a file and returns only the recognised text.
     *
     * @param pred_type recognition type code
     * @param file_name path of the image file
     * @return the {@code pred_resl} field of the response
     * @throws Exception if the request or the response parsing fails
     */
    public String PredictFromFileExtend(String pred_type, String file_name) throws Exception {
        return PredictFromFile(pred_type, file_name).pred_resl;
    }

    /**
     * Recognises a captcha from raw image bytes.
     *
     * @param pred_type recognition type code
     * @param img_data  image bytes
     * @return the parsed response ({@code ret_code} 0 on success, {@code err_msg}
     *         on failure, {@code req_Id} the order id, {@code pred_resl} the
     *         recognised text)
     * @throws Exception if the URL is malformed or the request/parsing fails
     */
    public Util.HttpResp Predict(String pred_type, byte[] img_data) throws Exception{
        String stm      = currentTimestamp();
        String sign     = Util.CalcSign(pd_id,pd_key,stm);
        String asign   = "";
        URL url = new URL(pred_url + "/api/capreg");
        // A null app_id is treated like an empty one (no application signature)
        // instead of failing with a NullPointerException.
        String appId = (app_id == null) ? "" : app_id;
        if(!appId.isEmpty()){
            asign  = Util.CalcSign(appId,app_key,stm);
        }
        String pres = Util.MFPost(url,img_data,stm,pd_id,sign,appId,asign,pred_type);
        return Util.ParseHttpResp(pres);
    }

    /**
     * Recognises a captcha from raw image bytes and returns only the recognised text.
     *
     * @param pred_type recognition type code
     * @param img_data  image bytes
     * @return the {@code pred_resl} field of the response
     * @throws Exception if the request or the response parsing fails
     */
    public String PredictExtend(String pred_type, byte[] img_data) throws Exception{
        return Predict(pred_type, img_data).pred_resl;
    }

    /**
     * Requests a refund for a recognition whose result was wrong.
     *
     * <p>Note: the recognition call is only charged when its {@code ret_code}
     * is 0, so a refund is only needed in that case.
     *
     * <p>Note 2: refunds are meant for results that were returned normally but
     * were rejected by the target site; abusing this call may get the account banned.
     *
     * @param req_id order id of the recognition to refund
     * @return the parsed response; {@code ret_code} is 0 on success and
     *         {@code err_msg} carries the failure detail otherwise
     * @throws Exception if the request or the response parsing fails
     */
    public Util.HttpResp Justice(String req_id) throws Exception{
        String stm      = currentTimestamp();
        String sign     = Util.CalcSign(pd_id, pd_key, stm);
        String url      = pred_url + "/api/capjust";
        String params   = "user_id=" + pd_id + "&timestamp="+stm + "&sign=" + sign + "&request_id=" + req_id;
        return Util.ParseHttpResp(Util.HttpPost(url, params));
    }

    /**
     * Requests a refund and returns only the status code.
     *
     * @param req_id order id of the recognition to refund
     * @return 0 when the refund succeeded
     * @throws Exception if the request or the response parsing fails
     */
    public int JusticeExtend(String req_id) throws Exception{
        return Justice(req_id).ret_code;
    }
}

3.工具类

  

public  class Utils {
    //识别获取验证码
    public static String getValidateCode(String filePath){
        Api api = new GetCodeApi ();
        String app_id = "xxxx";
        String app_key = "xxxx";
        String pd_id = "xxxx";
        String pd_key = "xxxx";
        // 对象生成之后,在任何操作之前,需要先调用初始化接口
        api.Init(app_id, app_key, pd_id, pd_key);
        String result=null;
        try {
            String pred_type = "30400";
             result = api.PredictFromFile(pred_type, filePath).pred_resl;  // 返回识别结果的详细信息
        } catch (Exception e) {
            e.printStackTrace();
        }
        return result;
    }
   //获取图片路径
    public static String getImagePath(WebDriver driver, WebElement ele){
        BufferedImage fullImg = null;
        Point point = ele.getLocation();
        int eleWidth = ele.getSize().getWidth();
        int eleHeight = ele.getSize().getHeight();
        File screenshot = ((TakesScreenshot) driver).getScreenshotAs(OutputType.FILE);
        try {
            fullImg = ImageIO.read(screenshot);
            BufferedImage eleScreenshot = fullImg.getSubimage(point.getX(), point.getY(), eleWidth, eleHeight);
            ImageIO.write(eleScreenshot, "jpg", screenshot);
        }catch (Exception e){
            e.printStackTrace();
        }
        return screenshot.getAbsolutePath();
    }
    //获取状态
    public static String Status(HttpMethodBase Method) {
        String result=null;
        try {
            HttpClient httpClient = new HttpClient();
            httpClient.getParams().setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY);
            System.out.println(Method);
            int resultCode = httpClient.executeMethod(Method);

            if (resultCode == 200) {
                result = Method.getResponseBodyAsString();
            }
            else{
                //todo 短信扣费取消
                System.out.println("验证码失败");
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return result;
    }
获取PhantomJSDriver
  public WebDriver defaultDriver() {
        DesiredCapabilities cap = DesiredCapabilities.phantomjs();
        // 优化命令行参数
        List<String> cmdList = new ArrayList<>();
        // 禁用图片
//        cmdList.add("--load-images=false");
        // 本地缓存
        cmdList.add("--disk-cache=true");
        cap.setCapability("phantomjs.cli.args", cmdList);
        //todo
        //禁用截图,使用图片接口访问。加快访问速度,解决报错
        cap.setCapability("takesScreenshot", true);
        return new PhantomJSDriver(cap);
    }
}

4.模拟登录获取cookie  

/**
 * Logs in through a headless browser, solving the captcha via {@code Utils},
 * and returns the selected session cookies as a {@code name=value;} string.
 *
 * @param accountEntity supplies the login URL, account name and password
 * @return the concatenated cookies that passed the name filter (possibly empty)
 */
public String login(AccountEntity accountEntity ) {
    WebDriver driver = WebDriverPool.getInstance().defaultDriver();
    try {
        driver.get(accountEntity.getLoginUrl());
        driver.manage().window().maximize();

        WebElement userId = driver.findElement(By.name("userId"));
        WebElement password = driver.findElement(By.name("password"));
        WebElement j_captcha = driver.findElement(By.name("j_captcha"));
        WebElement ele = driver.findElement(By.id("captchaImg"));
        WebElement btn = driver.findElement(By.id("loginBtn"));

        userId.sendKeys(accountEntity.getAccount());
        password.sendKeys(accountEntity.getPassword());

        // Screenshot the captcha element, then have it recognised.
        String img_file = Utils.getImagePath(driver, ele);
        String validaCode = Utils.getValidateCode(img_file);

        j_captcha.sendKeys(validaCode);
        btn.click();
        try {
            // Give the page time to load so the cookies are set.
            Thread.sleep(4000);
        } catch (InterruptedException e) {
            e.printStackTrace();
            // Keep the interrupt visible to callers instead of swallowing it.
            Thread.currentThread().interrupt();
        }
        Set<Cookie> cookies = driver.manage().getCookies();
        StringBuilder strBuff = new StringBuilder();
        for (Cookie str : cookies) {
            String temp = str.getName();
            // NOTE(review): this is a substring test, so any cookie whose name
            // occurs anywhere inside the literal is kept, not only exact names.
            // Confirm the intended cookie names before tightening this.
            if ("SESSIONcasLogin-Tokenlangname".contains(temp))
                strBuff.append(temp+"="+str.getValue()+";");
        }
        return strBuff.toString();
    } finally {
        // Always shut the browser down, even when a step above throws.
        driver.quit();
    }
}

5.利用获取的cookie拿取数据

/**
 * Posts a JSON page request to the account's count URL and returns the body.
 *
 * @param index         page index forwarded to {@code Utils.getPage}
 * @param row           row count forwarded to {@code Utils.getPage}
 * @param accountEntity supplies the target URL and the cookies for the request
 * @return whatever {@code Utils.Status} returns, or {@code null} when any step threw
 */
@Override
public String getData(int index, int row, AccountEntity accountEntity) {
    try {
        final PostMethod request = getMethod(accountEntity);
        request.setPath(accountEntity.getCountUrl());
        // The request body is the JSON built by Utils.getPage.
        request.setRequestEntity(
                new StringRequestEntity(Utils.getPage(index, row), "application/json", "UTF-8"));
        return Utils.Status(request);
    } catch (Exception e) {
        e.printStackTrace();
        return null;
    }
}


/**
 * Creates a POST method that carries the account's cookies and uses a
 * default retry handler.
 *
 * @param accountEntity supplies the value of the {@code Cookie} header
 * @return the prepared, path-less {@code PostMethod}
 */
public PostMethod getMethod( AccountEntity  accountEntity) {
    final PostMethod request = new PostMethod();
    final String cookieHeader = accountEntity.getCookies();
    request.setRequestHeader("Cookie", cookieHeader);
    final DefaultHttpMethodRetryHandler retryHandler = new DefaultHttpMethodRetryHandler();
    request.getParams().setParameter(HttpMethodParams.RETRY_HANDLER, retryHandler);
    return request;
}

    

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值