HttpClient 登录爬取信息

 

 

爬取图片

[1].[文件] SemeiziCrawler.java ~ 5KB    下载(576) 跳至 [1] [2] [3]

?

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

package kidbei.learn.crawler;

 

import java.io.File;

import java.io.FileOutputStream;

import java.io.IOException;

import java.io.InputStream;

import java.io.OutputStream;

import java.io.StringWriter;

import java.util.ArrayList;

import java.util.Iterator;

import java.util.List;

 

import org.apache.commons.io.IOUtils;

import org.apache.http.HttpEntity;

import org.apache.http.HttpResponse;

import org.apache.http.client.methods.HttpGet;

import org.apache.http.impl.client.DefaultHttpClient;

import org.apache.http.util.EntityUtils;

import org.jsoup.Jsoup;

import org.jsoup.nodes.Document;

import org.jsoup.nodes.Element;

import org.jsoup.select.Elements;

/**

 * http://sejie.wanxun.org/post/2012-09-25/40039413449

 * @author Administrator

 *

 */

public class SemeiziCrawler {

    private static final String BASEHOST = "http://sejie.wanxun.org/";

    private static DefaultHttpClient client = ConnectionManager.getHttpClient();

    static String url = "http://sejie.wanxun.org/post/2012-09-25/40039413449";

    private static String IMGPATH = "D:\\sexpicture\\色戒美眉图"+File.separator+StringUtil.getDate();

    static int STARTPAGE = 1;

    static int PAGECOUNT = 100;

 

    public static void main(String[] args) {

        File f = new File(IMGPATH);

        if(!f.exists()){

            f.mkdirs();

        }

        String host = BASEHOST ;

        for(int i=STARTPAGE;i<PAGECOUNT;i++){

            if(i != 1){

                host = BASEHOST+"page/"+i;

            }

            System.out.println("进入第"+i+"页");

            String pageContext = getResultByUrl(host);

//          System.out.println(pageContext);

            List<String>articleURLS = getArticleURL(pageContext);

            for(String articleURL:articleURLS){

                String articleContext = getResultByUrl(articleURL);

                List<String> ImgURLS = getImgURLS(articleContext);

                for(String ImgURL:ImgURLS){

                    savepic(ImgURL);

                }

            }

        }

//      String articleContext = getResultByUrl(url);

//      List<String> strs = getImgURLS(articleContext);

//      for(String str:strs){

//          System.out.println(str);

//      }

    }

    /**

     * 根据url获取页面

     * @param url

     * @return

     */

    public static String getResultByUrl(String url){

        System.out.println("打开网页"+url);

        HttpGet get = new HttpGet(url);

        HttpEntity entity = null;

        HttpResponse response = null;

        try {

            response = client.execute(get);

            entity = response.getEntity();

            if(entity != null){

                InputStream is = entity.getContent();

                StringWriter sw = new StringWriter();

                IOUtils.copy(is, sw, "UTF-8");

                is.close();

                sw.close();

                return sw.toString();

            }

        } catch (Exception e) {

            System.out.println("网页打开出错");

            return null;

        }finally{

            get.abort();

            try {

                EntityUtils.consume(entity);

            } catch (IOException e) {

                e.printStackTrace();

            }

        }

        return null;

    }

    /**

     * 找出当前页面中所有帖子的地址

     * @param pageStr  网页字符串

     * @return

     */

    public static List<String> getArticleURL(String pageContext){

        if(pageContext == null){

            return null;

        }

        List<String> articleURLS = new ArrayList<String>();

        System.out.println("寻找帖子...........");

        try {

            Document doc = Jsoup.parseBodyFragment(pageContext);

            Elements es = doc.select("div.post");

            es = es.select("div[class=post-item type-photo]");

            es = es.select("div.meta a:containsOwn(全文)");

            for(Element e:es){

                articleURLS.add(e.attr("href"));

            }

        } catch (Exception e) {

            e.printStackTrace();

            return null;

        }

        return articleURLS;

    }

    /**

     * 获取帖子的图片地址

     * @param articleURLS

     * @return

     */

    public static List<String> getImgURLS(String articleContext){

        List<String>ImgURLS = new ArrayList<String>();

        if(articleContext == null){

            return null;

        }

        System.out.println("获取图片地址-----------");

        Document doc = Jsoup.parse(articleContext);

        Elements es = doc.select("a[target=_blank] img[src]");

         for(Iterator<Element> i=es.iterator();i.hasNext();){

                Element e = i.next();

                ImgURLS.add(e.attr("src"));

             }

        return ImgURLS;

    }

    /**

     * 保存图片

     * @param ImgURL

     */

    public static void savepic(String ImgURL){

        if(ImgURL == null){

            return ;

        }

        HttpGet get = new HttpGet(ImgURL);

        String[] strs = ImgURL.split("/");

        String fileName = strs[strs.length-1];

        String savePath = IMGPATH+File.separator+fileName;

        HttpEntity entity = null;

        try {

            HttpResponse response = client.execute(get);

            entity = response.getEntity();

            System.out.println("保存图片>>>>.>>>>>>"+fileName);

            InputStream is = entity.getContent();

            OutputStream os = new FileOutputStream(savePath);

            IOUtils.copy(is, os);

            IOUtils.closeQuietly(os);

            IOUtils.closeQuietly(is);

        } catch (Exception e) {

            e.printStackTrace();

            System.out.println("图片保存失败");

            return ;

        }

    }

}

[2].[文件] StringUtil.java ~ 1KB    下载(406) 跳至 [1] [2] [3]

?

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

package kidbei.learn.crawler;

 

import java.io.File;

import java.text.SimpleDateFormat;

import java.util.Date;

import java.util.Random;

 

/**
 * String helpers used by the crawler to build file names and dated paths.
 */
public class StringUtil {

    /**
     * Generates a random string of six lower-case ASCII letters.
     *
     * @return six characters, each in the range {@code 'a'}..{@code 'z'}
     */
    public static String getRandomString(){
        final int length = 6;
        StringBuilder generated = new StringBuilder(length);
        Random rand = new Random();
        for (int i = 0; i < length; i++) {
            // nextInt(26) is always in [0, 26). The former
            // Math.abs(rand.nextInt()) % 26 went negative for
            // Integer.MIN_VALUE and produced a character outside a-z.
            generated.append((char) ('a' + rand.nextInt(26)));
        }
        return generated.toString();
    }

    /**
     * Builds the path {@code IMGPATH/<yyyyMMdd>/<random><fileName>}, appending
     * {@code ".jpg"} to the file name when it does not already end with it.
     *
     * @param IMGPATH  base directory
     * @param fileName original file name; must not be {@code null}
     * @return the assembled path, joined with {@link File#separator}
     */
    public static String getSavePath(String IMGPATH,String fileName){
        String date = getDate();
        if (!fileName.endsWith(".jpg")) {
            fileName = fileName + ".jpg";
        }
        String randStr = getRandomString();
        return IMGPATH + File.separator + date + File.separator + randStr + fileName;
    }

    /**
     * Returns today's date formatted as {@code yyyyMMdd}.
     *
     * @return the current date, e.g. {@code 20120925}
     */
    public static String getDate(){
        // A fresh formatter per call: SimpleDateFormat is not thread-safe.
        return new SimpleDateFormat("yyyyMMdd").format(new Date());
    }

}

[3].[文件] ConnectionManager.java ~ 2KB    下载(404) 跳至 [1] [2] [3]

?

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

package kidbei.learn.crawler;

 

import org.apache.http.conn.scheme.PlainSocketFactory;

import org.apache.http.conn.scheme.Scheme;

import org.apache.http.conn.scheme.SchemeRegistry;

import org.apache.http.conn.ssl.SSLSocketFactory;

import org.apache.http.impl.client.DefaultHttpClient;

import org.apache.http.impl.conn.PoolingClientConnectionManager;

import org.apache.http.params.BasicHttpParams;

import org.apache.http.params.CoreConnectionPNames;

import org.apache.http.params.CoreProtocolPNames;

import org.apache.http.params.HttpParams;

 

public class ConnectionManager {

    static final int TIMEOUT = 20000;//连接超时时间

    static final int SO_TIMEOUT = 20000;//数据传输超时

    static String UA = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1" +

            " (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1";

     

    public static DefaultHttpClient getHttpClient(){

        SchemeRegistry schemeRegistry = new SchemeRegistry();

        schemeRegistry.register(

                new Scheme("http",80,PlainSocketFactory.getSocketFactory()));

        schemeRegistry.register(

                new Scheme("https", 443, SSLSocketFactory.getSocketFactory()));

         

        PoolingClientConnectionManager  cm = new PoolingClientConnectionManager(schemeRegistry);

        cm.setMaxTotal(500);

        cm.setDefaultMaxPerRoute(200);

         

        HttpParams params = new BasicHttpParams();

        params.setParameter(CoreConnectionPNames.CONNECTION_TIMEOUT,TIMEOUT);

        params.setParameter(CoreConnectionPNames.SO_TIMEOUT, SO_TIMEOUT);

        params.setParameter(CoreProtocolPNames.USER_AGENT, UA);

         

        DefaultHttpClient client = new DefaultHttpClient(cm,params);

        return client;

    }

}

 

 

 

 

 

 

 

 

 

使用HttpClient 4.3.4 自动登录并抓取中国联通用户基本信息和账单数据,GET/POST/Cookie - Hi_Amos

时间 2014-06-23 00:43:00  博客园-原创精华区

原文  http://www.cnblogs.com/amosli/p/3803314.html

主题 HttpComponents

.什么是HttpClient?

HTTP 协议可能是现在 Internet 上使用得最多、最重要的协议了,越来越多的 Java 应用程序需要直接通过 HTTP 协议来访问网络资源。虽然在 JDK 的 java.net 包中已经提供了访问 HTTP 协议的基本功能,但是对于大部分应用程序来说,JDK 库本身提供的功能还不够丰富和灵活。HttpClient 是 Apache Jakarta Common 下的子项目,用来提供高效的、最新的、功能丰富的支持 HTTP 协议的客户端编程工具包,并且它支持 HTTP 协议最新的版本和建议。HttpClient 已经应用在很多的项目中,比如 Apache Jakarta 上很著名的另外两个开源项目 Cactus 和 HTMLUnit 都使用了 HttpClient。现在 HttpClient 最新版本为 HttpClient 4.3.4(2014-06-22).

-----引自百度百科

简单的说,HttpClient就是Apache提供的一个对HTTP协议进行封装的jar包.

下面将介绍使用GET/POST请求,登录中国联通网站并抓取用户的基本信息和账单数据.

.新建一个maven项目httpclient

我这里的环境是 jdk1.7+IntelliJ IDEA 13.0+ubuntu12.04+maven+HttpClient 4.3.4 .下面首先建一个maven项目:

如图所示,选择quickstart

然后next下去即可.

建好项目后,如下图所示:

双击pom.xml文件并添加所需要的jar:

<dependency>
          <groupId>org.apache.httpcomponents</groupId>
          <artifactId>httpclient</artifactId>
          <version>4.3.4</version>
      </dependency>

maven会自动将需要的其它jar包下载好,实际上所需要的jar包如下图所示:

.登录中国联通并抓取数据

1.使用Get模拟登录,抓取每月账单数据

中国联通有两种登录方式:

上面两图的区别是:一个带验证码,一个不带验证码.下面将先解决不带验证码的登录.

package com.amos;
 
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;
 
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
 
/**
 * @author amosli
 * 登录并抓取中国联通数据
 */
 
public class LoginChinaUnicom {
  /**
   * @param args
   * @throws Exception
   */
  public static void main(String[] args) throws Exception {
 
    String name = "中国联通手机号码";
    String pwd = "手机服务密码";
 
      String url = "https://uac.10010.com/portal/Service/MallLogin?callback=jQuery17202691898950318097_1403425938090&redirectURL=http%3A%2F%2Fwww.10010.com&userName=" + name + "&password=" + pwd + "&pwdType=01&productType=01&redirectType=01&rememberMe=1";
 
    HttpClient httpClient = new DefaultHttpClient();
    HttpGet httpGet = new HttpGet(url);
    HttpResponse loginResponse = httpClient.execute(httpGet);
 
    if (loginResponse.getStatusLine().getStatusCode() == 200) {
      for (Header head : loginResponse.getAllHeaders()) {
        System.out.println(head);
      }
      HttpEntity loginEntity = loginResponse.getEntity();
      String loginEntityContent = EntityUtils.toString(loginEntity);
      System.out.println("登录状态:" + loginEntityContent);
      //如果登录成功
      if (loginEntityContent.contains("resultCode:\"0000\"")) {
 
        //月份
        String months[] = new String[]{"201401", "201402", "201403", "201404", "201405"};
 
        for (String month : months) {
          String billurl = "http://iservice.10010.com/ehallService/static/historyBiil/execute/YH102010002/QUERY_YH102010002.processData/QueryYH102010002_Data/" + month + "/undefined";
 
          HttpPost httpPost = new HttpPost(billurl);
          HttpResponse billresponse = httpClient.execute(httpPost);
 
          if (billresponse.getStatusLine().getStatusCode() == 200) {
            saveToLocal(billresponse.getEntity(), "chinaunicom.bill." + month + ".2.html");
          }
        }
      }
    }
 
  }

找到要登录的url以及要传的参数,手机号码和服务密码这里就不提供了.

new一个DefaultHttpClient,然后使用Get方式发出请求,如果登录成功,其返回代码是0000.

再用HttpPost方式将返回值写到本地.

/**
   * 写文件到本地
   *
   * @param httpEntity
   * @param filename
   */
  public static void saveToLocal(HttpEntity httpEntity, String filename) {
 
    try {
 
      File dir = new File("/home/amosli/workspace/chinaunicom/");
      if (!dir.isDirectory()) {
        dir.mkdir();
      }
 
      File file = new File(dir.getAbsolutePath() + "/" + filename);
      FileOutputStream fileOutputStream = new FileOutputStream(file);
      InputStream inputStream = httpEntity.getContent();
 
      if (!file.exists()) {
        file.createNewFile();
      }
      byte[] bytes = new byte[1024];
      int length = 0;
      while ((length = inputStream.read(bytes)) > 0) {
        fileOutputStream.write(bytes, 0, length);
      }
      inputStream.close();
      fileOutputStream.close();
    } catch (Exception e) {
      e.printStackTrace();
    }
 
  }

这里如果只是想输出一下,可以使用 EntityUtils.toString(HttpEntity entity) 方法;它最终调用的是带默认编码参数的重载 toString(HttpEntity entity, Charset defaultCharset),其源码如下:

/**
 * Reads the whole content of an entity and returns it as a string.
 *
 * <p>The charset is taken from the entity's content type when it declares
 * one, otherwise from {@code defaultCharset}, otherwise from
 * {@code HTTP.DEF_CONTENT_CHARSET}.
 *
 * @param entity         the entity to read; checked with {@code Args.notNull}
 * @param defaultCharset charset used when the content type declares none; may
 *                       be {@code null}
 * @return the content as a string, or {@code null} when the entity has no
 *         content stream
 * @throws IOException    if reading fails; an unsupported charset in the
 *                        content type is reported as
 *                        {@code UnsupportedEncodingException}
 * @throws ParseException declared by the signature; presumably raised while
 *                        parsing the content type header
 */
public static String toString(
      final HttpEntity entity, final Charset defaultCharset) throws IOException, ParseException {
    Args.notNull(entity, "Entity");
    final InputStream instream = entity.getContent();
    if (instream == null) {
      return null;
    }
    try {
      // The content is buffered in memory, so its length must fit in an int.
      Args.check(entity.getContentLength() <= Integer.MAX_VALUE,
          "HTTP entity too large to be buffered in memory");
      // Initial buffer capacity: the declared length, or 4096 when it is negative.
      int i = (int)entity.getContentLength();
      if (i < 0) {
        i = 4096;
      }
      // Charset resolution order: content type, then caller default, then protocol default.
      Charset charset = null;
      try {
        final ContentType contentType = ContentType.get(entity);
        if (contentType != null) {
          charset = contentType.getCharset();
        }
      } catch (final UnsupportedCharsetException ex) {
        throw new UnsupportedEncodingException(ex.getMessage());
      }
      if (charset == null) {
        charset = defaultCharset;
      }
      if (charset == null) {
        charset = HTTP.DEF_CONTENT_CHARSET;
      }
      final Reader reader = new InputStreamReader(instream, charset);
      final CharArrayBuffer buffer = new CharArrayBuffer(i);
      // Decode in chunks of up to 1024 chars until end of stream (-1).
      final char[] tmp = new char[1024];
      int l;
      while((l = reader.read(tmp)) != -1) {
        buffer.append(tmp, 0, l);
      }
      return buffer.toString();
    } finally {
      // The content stream is closed whether or not reading succeeded.
      instream.close();
    }
  }

这里可以发现其实现方式还是比较容易看懂的,可以指定编码,也可以不指定.

2.带验证码的登录,抓取基本信息

package com.amos;
 
 
import org.apache.http.HttpResponse;
import org.apache.http.client.CookieStore;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.cookie.Cookie;
import org.apache.http.impl.client.*;
import org.apache.http.util.EntityUtils;
 
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
 
/**
 * Created by amosli on 14-6-22.
 */
public class LoginWithCaptcha {
 
    public static void main(String args[]) throws Exception {
 
  //生成验证码的链接
  String createCaptchaUrl = "http://uac.10010.com/portal/Service/CreateImage";
  HttpClient httpClient = new DefaultHttpClient();
 
  String name = "中国联通手机号码";
  String pwd = "手机服务密码";
 
  //这里可自定义所需要的cookie
  CookieStore cookieStore = new BasicCookieStore();
 
  CloseableHttpClient httpclient = HttpClients.custom()
    .setDefaultCookieStore(cookieStore)
    .build();
 
  //get captcha,获取验证码
  HttpGet captchaHttpGet = new HttpGet(createCaptchaUrl);
  HttpResponse capthcaResponse = httpClient.execute(captchaHttpGet);
 
  if (capthcaResponse.getStatusLine().getStatusCode() == 200) {
      //将验证码写入本地
      LoginChinaUnicom.saveToLocal(capthcaResponse.getEntity(), "chinaunicom.capthca." + System.currentTimeMillis());
  }
 
 
  //手工输入验证码并验证
  HttpResponse verifyResponse = null;
  String capthca = null;
  String uvc = null;
 
  do {
      //输入验证码,读入键盘输入
      //1)
      InputStream inputStream = System.in;
      BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream));
      System.out.println("请输入验证码:");
      capthca = bufferedReader.readLine();
 
      //2)
      //Scanner scanner = new Scanner(System.in);
      //capthca = scanner.next();
 
      String verifyCaptchaUrl = "http://uac.10010.com/portal/Service/CtaIdyChk?verifyCode=" + capthca + "&verifyType=1";
      HttpGet verifyCapthcaGet = new HttpGet(verifyCaptchaUrl);
      verifyResponse = httpClient.execute(verifyCapthcaGet);
      AbstractHttpClient abstractHttpClient = (AbstractHttpClient) httpClient;
      for (Cookie cookie : abstractHttpClient.getCookieStore().getCookies()) {
    System.out.println(cookie.getName() + ":" + cookie.getValue());
    if (cookie.getName().equals("uacverifykey")) {
        uvc = cookie.getValue();
    }
      }
  } while (!EntityUtils.toString(verifyResponse.getEntity()).contains("true"));
 
  //登录
  String loginurl = "https://uac.10010.com/portal/Service/MallLogin?userName=" + name + "&password=" + pwd + "&pwdType=01&productType=01&verifyCode=" + capthca + "&redirectType=03&uvc=" + uvc;
  HttpGet loginGet = new HttpGet(loginurl);
  CloseableHttpResponse loginResponse = httpclient.execute(loginGet);
  System.out.print("loginResponse:" + EntityUtils.toString(loginResponse.getEntity()));
 
  //抓取基本信息数据
        HttpPost basicHttpGet = new HttpPost("http://iservice.10010.com/ehallService/static/acctBalance/execute/YH102010005/QUERY_AcctBalance.processData/Result");
  LoginChinaUnicom.saveToLocal(httpclient.execute(basicHttpGet).getEntity(), "chinaunicom.basic.html");
 
    }
 
 
}

这里有两个难点:一是验证码,二是uvc;

验证码,这里将其写到本地,然后人工输入,这个还比较好解决.

uvc,很重要,这个是在cookie里的,httpclient操作cookie的方法网上找了很久都没有找到,后来看其源码才看到.

3.效果图

账单数据(这里是json格式的数据,可能不太方便查看):

 4.本文源码

https://github.com/amosli/crawl/tree/httpclient

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值