爬取图片
[1].[文件] SemeiziCrawler.java ~ 5KB 下载(576) 跳至 [1] [2] [3]
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 | package kidbei.learn.crawler;
import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.io.StringWriter; import java.util.ArrayList; import java.util.Iterator; import java.util.List;
import org.apache.commons.io.IOUtils; import org.apache.http.HttpEntity; import org.apache.http.HttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.DefaultHttpClient; import org.apache.http.util.EntityUtils; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; /** * http://sejie.wanxun.org/post/2012-09-25/40039413449 * @author Administrator * */ public class SemeiziCrawler { private static final String BASEHOST = "http://sejie.wanxun.org/"; private static DefaultHttpClient client = ConnectionManager.getHttpClient(); static String url = "http://sejie.wanxun.org/post/2012-09-25/40039413449"; private static String IMGPATH = "D:\\sexpicture\\色戒美眉图"+File.separator+StringUtil.getDate(); static int STARTPAGE = 1; static int PAGECOUNT = 100;
public static void main(String[] args) { File f = new File(IMGPATH); if(!f.exists()){ f.mkdirs(); } String host = BASEHOST ; for(int i=STARTPAGE;i<PAGECOUNT;i++){ if(i != 1){ host = BASEHOST+"page/"+i; } System.out.println("进入第"+i+"页"); String pageContext = getResultByUrl(host); // System.out.println(pageContext); List<String>articleURLS = getArticleURL(pageContext); for(String articleURL:articleURLS){ String articleContext = getResultByUrl(articleURL); List<String> ImgURLS = getImgURLS(articleContext); for(String ImgURL:ImgURLS){ savepic(ImgURL); } } } // String articleContext = getResultByUrl(url); // List<String> strs = getImgURLS(articleContext); // for(String str:strs){ // System.out.println(str); // } } /** * 根据url获取页面 * @param url * @return */ public static String getResultByUrl(String url){ System.out.println("打开网页"+url); HttpGet get = new HttpGet(url); HttpEntity entity = null; HttpResponse response = null; try { response = client.execute(get); entity = response.getEntity(); if(entity != null){ InputStream is = entity.getContent(); StringWriter sw = new StringWriter(); IOUtils.copy(is, sw, "UTF-8"); is.close(); sw.close(); return sw.toString(); } } catch (Exception e) { System.out.println("网页打开出错"); return null; }finally{ get.abort(); try { EntityUtils.consume(entity); } catch (IOException e) { e.printStackTrace(); } } return null; } /** * 找出当前页面中所有帖子的地址 * @param pageStr 网页字符串 * @return */ public static List<String> getArticleURL(String pageContext){ if(pageContext == null){ return null; } List<String> articleURLS = new ArrayList<String>(); System.out.println("寻找帖子..........."); try { Document doc = Jsoup.parseBodyFragment(pageContext); Elements es = doc.select("div.post"); es = es.select("div[class=post-item type-photo]"); es = es.select("div.meta a:containsOwn(全文)"); for(Element e:es){ articleURLS.add(e.attr("href")); } } catch (Exception e) { e.printStackTrace(); return null; } return articleURLS; } /** * 获取帖子的图片地址 * @param articleURLS * @return */ 
public static List<String> getImgURLS(String articleContext){ List<String>ImgURLS = new ArrayList<String>(); if(articleContext == null){ return null; } System.out.println("获取图片地址-----------"); Document doc = Jsoup.parse(articleContext); Elements es = doc.select("a[target=_blank] img[src]"); for(Iterator<Element> i=es.iterator();i.hasNext();){ Element e = i.next(); ImgURLS.add(e.attr("src")); } return ImgURLS; } /** * 保存图片 * @param ImgURL */ public static void savepic(String ImgURL){ if(ImgURL == null){ return ; } HttpGet get = new HttpGet(ImgURL); String[] strs = ImgURL.split("/"); String fileName = strs[strs.length-1]; String savePath = IMGPATH+File.separator+fileName; HttpEntity entity = null; try { HttpResponse response = client.execute(get); entity = response.getEntity(); System.out.println("保存图片>>>>.>>>>>>"+fileName); InputStream is = entity.getContent(); OutputStream os = new FileOutputStream(savePath); IOUtils.copy(is, os); IOUtils.closeQuietly(os); IOUtils.closeQuietly(is); } catch (Exception e) { e.printStackTrace(); System.out.println("图片保存失败"); return ; } } } |
[2].[文件] StringUtil.java ~ 1KB 下载(406) 跳至 [1] [2] [3]
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 | package kidbei.learn.crawler;
import java.io.File; import java.text.SimpleDateFormat; import java.util.Date; import java.util.Random;
/**
 * Small string helpers used by the crawler: random name fragments and
 * date-stamped save paths.
 */
public class StringUtil {
    /** Number of letters produced by {@link #getRandomString()}. */
    private static final int RANDOM_LENGTH = 6;

    /**
     * Builds a random string of six lowercase ASCII letters.
     *
     * @return six characters, each in the range {@code 'a'..'z'}
     */
    public static String getRandomString() {
        StringBuilder generateRandStr = new StringBuilder(RANDOM_LENGTH);
        Random rand = new Random();
        for (int i = 0; i < RANDOM_LENGTH; i++) {
            // nextInt(26) is always 0..25; Math.abs(nextInt()) % 26 is negative
            // for Integer.MIN_VALUE and would yield a character outside a-z.
            generateRandStr.append((char) ('a' + rand.nextInt(26)));
        }
        return generateRandStr.toString();
    }

    /**
     * Builds {@code IMGPATH/<yyyyMMdd>/<random><fileName>}, appending
     * {@code .jpg} when the file name does not already end with it.
     *
     * @param IMGPATH  base directory
     * @param fileName file name, with or without the {@code .jpg} suffix
     * @return the composed path, using the platform file separator
     */
    public static String getSavePath(String IMGPATH, String fileName) {
        if (!fileName.endsWith(".jpg")) {
            fileName = fileName + ".jpg";
        }
        return IMGPATH + File.separator + getDate()
                + File.separator + getRandomString() + fileName;
    }

    /**
     * Formats the current date.
     *
     * @return today's date as {@code yyyyMMdd}
     */
    public static String getDate() {
        // A new formatter per call: SimpleDateFormat is not safe to share.
        return new SimpleDateFormat("yyyyMMdd").format(new Date());
    }
}
[3].[文件] ConnectionManager.java ~ 2KB 下载(404) 跳至 [1] [2] [3]
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 | package kidbei.learn.crawler;
import org.apache.http.conn.scheme.PlainSocketFactory; import org.apache.http.conn.scheme.Scheme; import org.apache.http.conn.scheme.SchemeRegistry; import org.apache.http.conn.ssl.SSLSocketFactory; import org.apache.http.impl.client.DefaultHttpClient; import org.apache.http.impl.conn.PoolingClientConnectionManager; import org.apache.http.params.BasicHttpParams; import org.apache.http.params.CoreConnectionPNames; import org.apache.http.params.CoreProtocolPNames; import org.apache.http.params.HttpParams;
public class ConnectionManager { static final int TIMEOUT = 20000;//连接超时时间 static final int SO_TIMEOUT = 20000;//数据传输超时 static String UA = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1" + " (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1";
public static DefaultHttpClient getHttpClient(){ SchemeRegistry schemeRegistry = new SchemeRegistry(); schemeRegistry.register( new Scheme("http",80,PlainSocketFactory.getSocketFactory())); schemeRegistry.register( new Scheme("https", 443, SSLSocketFactory.getSocketFactory()));
PoolingClientConnectionManager cm = new PoolingClientConnectionManager(schemeRegistry); cm.setMaxTotal(500); cm.setDefaultMaxPerRoute(200);
HttpParams params = new BasicHttpParams(); params.setParameter(CoreConnectionPNames.CONNECTION_TIMEOUT,TIMEOUT); params.setParameter(CoreConnectionPNames.SO_TIMEOUT, SO_TIMEOUT); params.setParameter(CoreProtocolPNames.USER_AGENT, UA);
DefaultHttpClient client = new DefaultHttpClient(cm,params); return client; } } |
使用HttpClient 4.3.4 自动登录并抓取中国联通用户基本信息和账单数据,GET/POST/Cookie - Hi_Amos
时间 2014-06-23 00:43:00 博客园-原创精华区
原文 http://www.cnblogs.com/amosli/p/3803314.html
一.什么是HttpClient?
HTTP 协议可能是现在 Internet 上使用得最多、最重要的协议了,越来越多的 Java 应用程序需要直接通过 HTTP 协议来访问网络资源。虽然 JDK 的 java.net 包已经提供了访问 HTTP 协议的基本功能,但是对于大部分应用程序来说,JDK 库本身提供的功能还不够丰富和灵活。HttpClient 是 Apache Jakarta Commons 下的子项目,用来提供高效的、最新的、功能丰富的支持 HTTP 协议的客户端编程工具包,并且它支持 HTTP 协议最新的版本和建议。HttpClient 已经应用在很多的项目中,比如 Apache Jakarta 上很著名的另外两个开源项目 Cactus 和 HTMLUnit 都使用了 HttpClient。现在 HttpClient 最新版本为 HttpClient 4.3.4(2014-06-22).
-----引自百度百科
简单地说,HttpClient 就是 Apache 提供的一个封装了 HTTP 协议操作的 jar 包.
下面将介绍使用GET/POST请求,登录中国联通网站并抓取用户的基本信息和账单数据.
二.新建一个maven项目httpclient
我这里的环境是 jdk1.7 + IntelliJ IDEA 13.0 + ubuntu12.04 + maven + HttpClient 4.3.4. 下面首先建一个maven项目:
如图所示,选择quickstart
然后next下去即可.
建好项目后,如下图所示:
双击pom.xml文件并添加所需要的jar包:
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.3.4</version>
</dependency>
maven会自动将需要的其它jar包下载好,实际上所需要的jar包如下图所示:
三.登录中国联通并抓取数据
1.使用Get模拟登录,抓取每月账单数据
中国联通有两种登录方式:
上面两图的区别一个是带验证码,一个是不带验证码, 下面将先解决不带验证码的登录.
package com.amos;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
/**
* @author amosli
* 登录并抓取中国联通数据
*/
public class LoginChinaUnicom {
/**
 * Logs in to China Unicom through the captcha-free endpoint and, on success,
 * saves the bill page of each month in a fixed list to a local file.
 *
 * @param args unused
 * @throws Exception if a request fails or a response cannot be read
 */
public static void main(String[] args) throws Exception {
    String name = "中国联通手机号码";
    String pwd = "手机服务密码";
    // Encode the credentials: characters such as '&', '+' or '#' in a password
    // would otherwise corrupt the query string.
    // NOTE(review): the endpoint takes the password as a GET query parameter,
    // so it can end up in proxy and server logs.
    String url = "https://uac.10010.com/portal/Service/MallLogin?callback=jQuery17202691898950318097_1403425938090&redirectURL=http%3A%2F%2Fwww.10010.com&userName=" + java.net.URLEncoder.encode(name, "UTF-8") + "&password=" + java.net.URLEncoder.encode(pwd, "UTF-8") + "&pwdType=01&productType=01&redirectType=01&rememberMe=1";
    HttpClient httpClient = new DefaultHttpClient();
    try {
        HttpGet httpGet = new HttpGet(url);
        HttpResponse loginResponse = httpClient.execute(httpGet);
        if (loginResponse.getStatusLine().getStatusCode() != 200) {
            // Release the connection even though the body is not needed.
            EntityUtils.consume(loginResponse.getEntity());
            return;
        }
        for (Header head : loginResponse.getAllHeaders()) {
            System.out.println(head);
        }
        HttpEntity loginEntity = loginResponse.getEntity();
        String loginEntityContent = EntityUtils.toString(loginEntity);
        System.out.println("登录状态:" + loginEntityContent);
        // Login succeeded only when the body reports result code 0000.
        if (loginEntityContent == null || !loginEntityContent.contains("resultCode:\"0000\"")) {
            return;
        }
        // Months to download, as yyyyMM.
        String months[] = new String[]{"201401", "201402", "201403", "201404", "201405"};
        for (String month : months) {
            String billurl = "http://iservice.10010.com/ehallService/static/historyBiil/execute/YH102010002/QUERY_YH102010002.processData/QueryYH102010002_Data/" + month + "/undefined";
            HttpPost httpPost = new HttpPost(billurl);
            HttpResponse billresponse = httpClient.execute(httpPost);
            try {
                if (billresponse.getStatusLine().getStatusCode() == 200) {
                    saveToLocal(billresponse.getEntity(), "chinaunicom.bill." + month + ".2.html");
                }
            } finally {
                // The default client holds a single connection; an unconsumed
                // response would make the next month's request fail.
                EntityUtils.consume(billresponse.getEntity());
            }
        }
    } finally {
        httpClient.getConnectionManager().shutdown();
    }
}
找到要登录的url以及要传的参数,手机号码和服务密码这里就不提供了.
new一个DefaultHttpClient,然后使用Get方式发出请求,如果登录成功,其返回代码是0000.
登录成功后,再用HttpPost方式请求每个月的账单,并将返回的内容写到本地.
/**
 * Writes the body of an HTTP entity to a file under
 * {@code /home/amosli/workspace/chinaunicom/}, creating the directory
 * (including missing parents) when needed. Failures are printed and do not
 * propagate; both streams are closed in every case.
 *
 * @param httpEntity entity whose content is saved; {@code null} is reported
 *                   and ignored
 * @param filename   name of the file to create inside the target directory
 */
public static void saveToLocal(HttpEntity httpEntity, String filename) {
    if (httpEntity == null) {
        System.err.println("saveToLocal: no entity to save for " + filename);
        return;
    }
    InputStream inputStream = null;
    FileOutputStream fileOutputStream = null;
    try {
        File dir = new File("/home/amosli/workspace/chinaunicom/");
        if (!dir.isDirectory()) {
            // mkdirs, not mkdir: the parent directories may be missing too.
            dir.mkdirs();
        }
        // Obtain the content first so a broken entity does not leave an
        // empty file behind.
        inputStream = httpEntity.getContent();
        if (inputStream == null) {
            System.err.println("saveToLocal: entity has no content for " + filename);
            return;
        }
        // FileOutputStream creates the file itself; no createNewFile() needed.
        fileOutputStream = new FileOutputStream(new File(dir, filename));
        byte[] bytes = new byte[1024];
        int length;
        // read() signals end of stream with -1.
        while ((length = inputStream.read(bytes)) != -1) {
            fileOutputStream.write(bytes, 0, length);
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        closeQuietly(fileOutputStream);
        closeQuietly(inputStream);
    }
}

/** Closes a stream if it was opened, reporting but not propagating failures. */
private static void closeQuietly(java.io.Closeable stream) {
    if (stream == null) {
        return;
    }
    try {
        stream.close();
    } catch (java.io.IOException e) {
        e.printStackTrace();
    }
}
这里如果只是想输出一下可以使用 EntityUtils.toString(HttpEntity entity)方法,其源码如下:
/**
 * Reads the whole content of an entity into a String.
 * The charset is taken from the entity's content type when it declares one,
 * otherwise from {@code defaultCharset}, otherwise from
 * {@code HTTP.DEF_CONTENT_CHARSET}. The content stream is always closed.
 *
 * @param entity         the entity to read; checked with {@code Args.notNull}
 * @param defaultCharset charset used when the content type declares none; may be null
 * @return the decoded content, or null when the entity has no content stream
 * @throws IOException    if reading fails; an unsupported declared charset is
 *                        rethrown as UnsupportedEncodingException
 * @throws ParseException declared by the signature; presumably raised while
 *                        parsing the content type — confirm against ContentType.get
 */
public static String toString(
final HttpEntity entity, final Charset defaultCharset) throws IOException, ParseException {
Args.notNull(entity, "Entity");
final InputStream instream = entity.getContent();
if (instream == null) {
return null;
}
try {
// The content is buffered in memory, so its length must fit in an int.
Args.check(entity.getContentLength() <= Integer.MAX_VALUE,
"HTTP entity too large to be buffered in memory");
// Initial buffer capacity: the reported length, or 4096 when it is negative.
int i = (int)entity.getContentLength();
if (i < 0) {
i = 4096;
}
// Charset resolution, step 1: the charset declared by the content type.
Charset charset = null;
try {
final ContentType contentType = ContentType.get(entity);
if (contentType != null) {
charset = contentType.getCharset();
}
} catch (final UnsupportedCharsetException ex) {
// Converted to the checked IOException subtype; only the message is kept.
throw new UnsupportedEncodingException(ex.getMessage());
}
// Step 2: the caller-supplied default.
if (charset == null) {
charset = defaultCharset;
}
// Step 3: the library-wide default.
if (charset == null) {
charset = HTTP.DEF_CONTENT_CHARSET;
}
final Reader reader = new InputStreamReader(instream, charset);
final CharArrayBuffer buffer = new CharArrayBuffer(i);
final char[] tmp = new char[1024];
int l;
// Copy in 1024-char chunks until read() reports end of stream (-1).
while((l = reader.read(tmp)) != -1) {
buffer.append(tmp, 0, l);
}
return buffer.toString();
} finally {
// Runs on success and on failure alike.
instream.close();
}
}
这里可以发现其实现方式还是比较容易看懂的,可以指定编码,也可以不指定.
2.带验证码的登录,抓取基本信息
package com.amos;
import org.apache.http.HttpResponse;
import org.apache.http.client.CookieStore;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.cookie.Cookie;
import org.apache.http.impl.client.*;
import org.apache.http.util.EntityUtils;
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
/**
 * Created by amosli on 14-6-22.
 * <p>
 * Logs in to China Unicom through the captcha-protected flow: downloads the
 * captcha image, asks the user to type it, verifies it, logs in with the
 * {@code uacverifykey} cookie value (uvc) and finally saves the account's
 * basic information page.
 */
public class LoginWithCaptcha {
    /**
     * Runs the whole captcha login flow interactively.
     *
     * @param args unused
     * @throws Exception if a request fails, a response cannot be read, or
     *                   standard input ends before the captcha is verified
     */
    public static void main(String args[]) throws Exception {
        // Endpoint that generates the captcha image.
        String createCaptchaUrl = "http://uac.10010.com/portal/Service/CreateImage";
        String name = "中国联通手机号码";
        String pwd = "手机服务密码";
        // One cookie store and one client for the whole flow, so the login
        // request belongs to the same session as the captcha requests.
        // Custom cookies can be added to the store here if needed.
        CookieStore cookieStore = new BasicCookieStore();
        CloseableHttpClient httpclient = HttpClients.custom()
                .setDefaultCookieStore(cookieStore)
                .build();
        try {
            // Fetch the captcha and write the image to disk.
            CloseableHttpResponse capthcaResponse = httpclient.execute(new HttpGet(createCaptchaUrl));
            try {
                if (capthcaResponse.getStatusLine().getStatusCode() == 200) {
                    LoginChinaUnicom.saveToLocal(capthcaResponse.getEntity(), "chinaunicom.capthca." + System.currentTimeMillis());
                }
            } finally {
                capthcaResponse.close();
            }

            // A single reader for all attempts: a fresh BufferedReader per
            // attempt could swallow input already buffered by the previous one.
            BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(System.in));
            String capthca;
            String uvc = null;
            boolean verified;
            do {
                System.out.println("请输入验证码:");
                capthca = bufferedReader.readLine();
                if (capthca == null) {
                    // End of input: stop instead of retrying forever with "null".
                    throw new IllegalStateException("Standard input closed before the captcha was verified");
                }
                capthca = capthca.trim();
                String verifyCaptchaUrl = "http://uac.10010.com/portal/Service/CtaIdyChk?verifyCode=" + java.net.URLEncoder.encode(capthca, "UTF-8") + "&verifyType=1";
                String verifyBody;
                CloseableHttpResponse verifyResponse = httpclient.execute(new HttpGet(verifyCaptchaUrl));
                try {
                    verifyBody = EntityUtils.toString(verifyResponse.getEntity());
                } finally {
                    verifyResponse.close();
                }
                // The uvc value needed for login is delivered as a cookie.
                for (Cookie cookie : cookieStore.getCookies()) {
                    System.out.println(cookie.getName() + ":" + cookie.getValue());
                    if (cookie.getName().equals("uacverifykey")) {
                        uvc = cookie.getValue();
                    }
                }
                verified = verifyBody != null && verifyBody.contains("true");
            } while (!verified);

            // Log in. Credentials and captcha are URL-encoded so that special
            // characters cannot corrupt the query string.
            // NOTE(review): the endpoint takes the password as a GET query
            // parameter, so it can end up in proxy and server logs.
            String loginurl = "https://uac.10010.com/portal/Service/MallLogin?userName=" + java.net.URLEncoder.encode(name, "UTF-8") + "&password=" + java.net.URLEncoder.encode(pwd, "UTF-8") + "&pwdType=01&productType=01&verifyCode=" + java.net.URLEncoder.encode(capthca, "UTF-8") + "&redirectType=03";
            if (uvc != null) {
                loginurl = loginurl + "&uvc=" + java.net.URLEncoder.encode(uvc, "UTF-8");
            } else {
                // Sending the literal text "null" would never be a valid key.
                System.err.println("uacverifykey cookie not found; logging in without uvc");
            }
            CloseableHttpResponse loginResponse = httpclient.execute(new HttpGet(loginurl));
            try {
                System.out.print("loginResponse:" + EntityUtils.toString(loginResponse.getEntity()));
            } finally {
                loginResponse.close();
            }

            // Fetch the basic account information and save it locally.
            HttpPost basicInfoPost = new HttpPost("http://iservice.10010.com/ehallService/static/acctBalance/execute/YH102010005/QUERY_AcctBalance.processData/Result");
            CloseableHttpResponse basicResponse = httpclient.execute(basicInfoPost);
            try {
                LoginChinaUnicom.saveToLocal(basicResponse.getEntity(), "chinaunicom.basic.html");
            } finally {
                basicResponse.close();
            }
        } finally {
            httpclient.close();
        }
    }
}
这里有两个难点,一是验证码,二是uvc码;
验证码,这里将其写到本地,然后人工输入,这个还比较好解决.
uvc码,很重要,这个是在cookie里的,httpclient操作cookie的方法网上找了很久都没有找到,后来看其源码才看到.
3.效果图
账单数据(这里是json格式的数据,可能不太方便查看):
4.本文源码
https://github.com/amosli/crawl/tree/httpclient