以下是我常用的抓取类,直接调用其中的方法即可实现本机 IP 抓取、goagent 代理抓取和公司代理 IP 抓取,以及文件下载、将页面内容保存到本地等功能。
package crawlMethodManager;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.nio.charset.UnsupportedCharsetException;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.ParseException;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.entity.DeflateDecompressingEntity;
import org.apache.http.client.entity.GzipDecompressingEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.conn.params.ConnRouteParams;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.conn.tsccm.ThreadSafeClientConnManager;
import org.apache.http.params.CoreConnectionPNames;
import org.apache.http.params.CoreProtocolPNames;
import org.apache.http.protocol.HTTP;
import org.apache.http.util.CharArrayBuffer;
@SuppressWarnings("deprecation")
public class CrawlMethodManager {
// Host of the company proxy currently in use; the empty string means no
// proxy has been obtained yet (the proxy methods test ip.equals("")).
static String ip = "";
// Port of the company proxy currently in use; set together with ip.
static int port = 0;
// Endpoint the proxy methods poll (via doGet) for a proxy address; the
// response is expected to contain a ':' and is split on it.
static String ipUrl = "http://localhost:8080/ipFilter/getIp/getIp";
// Client shared by the company-proxy POST flow and its follow-up GETs; its
// parameters (timeouts, default proxy) are set in postByCompanyProxyBoolean.
static HttpClient httpPostClient = new DefaultHttpClient(
new ThreadSafeClientConnManager());
/**
 * Fetches a page with HTTP GET, trying the enabled channels in a fixed order
 * (GoAgent proxy, then company proxy, then direct connection) until one of
 * them returns non-empty content.
 *
 * @param url         the link to fetch
 * @param encode      charset name used to decode the response
 * @param goagentFlag whether the GoAgent channel is enabled
 * @param goagentNum  maximum number of GoAgent attempts
 * @param companyFlag whether the company-proxy channel is enabled
 * @param companyNum  maximum number of company-proxy attempts
 * @param localFlag   whether the direct (local machine) channel is enabled
 * @param localNum    maximum number of direct attempts
 * @return the page content, or {@code ""} when every enabled channel failed;
 *         never {@code null}
 */
public String crawlPageContentByGet(String url, String encode,
        boolean goagentFlag, int goagentNum, boolean companyFlag,
        int companyNum, boolean localFlag, int localNum)
        throws ClientProtocolException, IOException {
    String content = "";

    if (goagentFlag) {
        int goagentCount = 0;
        // A channel may hand back null; treat it exactly like "" instead of
        // dereferencing it.
        while ((content == null || content.equals(""))
                && goagentCount < goagentNum) {
            try {
                System.out.println("goagent正在请求");
                content = doGetByGoagent(url, encode);
            } catch (Exception e) {
                // Best effort: report the failure and move on to the next attempt.
                System.out.println("goagent请求失败: " + e);
            }
            goagentCount++;
        }
    }

    if (companyFlag) {
        int companyCount = 0;
        while ((content == null || content.equals(""))
                && companyCount < companyNum) {
            try {
                System.out.println("公司代理ip正在请求");
                content = getByCompanyProxy(url, encode);
            } catch (Exception e) {
                System.out.println("公司代理ip请求失败: " + e);
            }
            companyCount++;
        }
    }

    if (localFlag) {
        int localCount = 0;
        while ((content == null || content.equals(""))
                && localCount < localNum) {
            try {
                System.out.println("本机正在请求");
                content = doGet(url, encode);
            } catch (Exception e) {
                System.out.println("本机请求失败: " + e);
            }
            localCount++;
        }
    }

    return content == null ? "" : content;
}
/**
 * Fetches a page with HTTP GET using the default fallback chain.
 *
 * <p>GoAgent is tried first. If it returns nothing, the company proxy is
 * used. If anything in that first stage throws, the company proxy is tried
 * again, and once more if that attempt throws as well; the stack trace of the
 * final failure is printed and whatever content was last obtained is returned.
 *
 * @param url    the link to fetch
 * @param encode charset name used to decode the response
 * @return the content obtained, which may be empty or {@code null} when no
 *         channel produced a page
 * @author joe
 */
public String crawlPageContentByGet(String url, String encode)
        throws ClientProtocolException, IOException {
    String page = "";

    // Stage 1: GoAgent, falling back to the company proxy on an empty answer.
    try {
        page = doGetByGoagent(url, encode);
        boolean nothingFetched = (page == null) || page.equals("");
        if (nothingFetched) {
            System.out.println("启用公司代理");
            page = getByCompanyProxy(url, encode);
        }
        return page;
    } catch (Exception stageOneFailure) {
        // fall through to the company-proxy retry
    }

    // Stage 2: first company-proxy retry.
    try {
        System.out.println("goagent连接失败,启用公司代理");
        page = getByCompanyProxy(url, encode);
        return page;
    } catch (Exception stageTwoFailure) {
        // fall through to the last retry
    }

    // Stage 3: last company-proxy retry.
    try {
        page = getByCompanyProxy(url, encode);
    } catch (Exception finalFailure) {
        finalFailure.printStackTrace();
    }
    return page;
}
/**
 * Fetches {@code url} with HTTP GET through the company proxy, switching to a
 * fresh proxy obtained from {@code ipUrl} when the current one misbehaves.
 *
 * <p>Up to 11 attempts are made. A new proxy is requested when none is known
 * yet, after a protocol error, or once the failure counter of the current
 * proxy exceeds two. Pages containing the "function JumpSelf" challenge are
 * retried; when the challenge carries a WebShieldSessionVerify token, the token
 * is appended to the URL used for the next attempt.
 *
 * @param url    page to fetch
 * @param encode charset name used to decode the response body
 * @return the page body when it passes {@code resultTest}; {@code ""} when all
 *         attempts failed or no proxy address could be obtained
 */
private String getByCompanyProxy(String url, String encode) {
    final int count = 10; // loop bound is inclusive: 11 attempts in total
    final int maxProxyFetches = 10; // cap on ipUrl polls per proxy switch
    String urlString = url;
    boolean newProxy = false;
    int oldProxyUsecount = 0;
    for (int i = 0; i <= count; i++) {
        DefaultHttpClient httpClient = null;
        try {
            if (newProxy || oldProxyUsecount > 2 || ip.equals("")) {
                oldProxyUsecount = 0;
                newProxy = false;
                // Always start from an empty string so a *new* address is
                // really fetched, and bound the polling so an unavailable
                // proxy service cannot spin this thread forever.
                String proxy = "";
                for (int fetch = 0; fetch < maxProxyFetches
                        && !proxy.contains(":"); fetch++) {
                    System.out.println("ip为空,正在提取");
                    proxy = doGet(ipUrl, "gbk");
                    if (proxy == null) {
                        proxy = "";
                    }
                }
                // Strip quotes, slashes and any whitespace, then take the
                // last two ':'-separated fields as host and port.
                String[] proxys = proxy.replaceAll("[\"/\\s\u00A0]", "")
                        .split(":");
                boolean switched = false;
                if (proxys.length >= 2
                        && !proxys[proxys.length - 2].equals("")) {
                    try {
                        int newPort = Integer
                                .parseInt(proxys[proxys.length - 1]);
                        ip = proxys[proxys.length - 2];
                        port = newPort;
                        switched = true;
                    } catch (NumberFormatException e) {
                        // malformed port: handled below
                    }
                }
                if (!switched) {
                    System.out.println("无法获取可用的代理ip: " + proxy);
                    if (ip.equals("")) {
                        // No proxy at all: nothing to send the request through.
                        return "";
                    }
                    // Otherwise keep using the previous proxy.
                }
            }
            HttpHost proxyHost = new HttpHost(ip, port, null);
            System.out.println("正在使用代理" + ip + ":" + port);

            HttpGet httpRequst = new HttpGet(urlString);
            httpRequst.addHeader("Accept-Encoding", "gzip,deflate,sdch");
            httpRequst.getParams().setParameter(
                    CoreProtocolPNames.HTTP_CONTENT_CHARSET, encode);
            httpClient = new DefaultHttpClient();
            httpClient.getParams().setParameter(
                    CoreConnectionPNames.CONNECTION_TIMEOUT, 9000); // connect timeout: 9 s
            httpClient.getParams().setParameter(
                    CoreConnectionPNames.SO_TIMEOUT, 9000); // socket read timeout: 9 s
            httpClient.getParams().setParameter(
                    ConnRouteParams.DEFAULT_PROXY, proxyHost);

            HttpResponse httpResponse = httpClient.execute(httpRequst);
            int status = httpResponse.getStatusLine().getStatusCode();
            if (status == 200) {
                String result = "";
                HttpEntity httpEntity = httpResponse.getEntity();
                if (httpEntity != null) {
                    Header contentEncoding = httpEntity.getContentEncoding();
                    if (contentEncoding != null) {
                        String codec = contentEncoding.getValue();
                        if ("gzip".equalsIgnoreCase(codec)) {
                            httpEntity = new GzipDecompressingEntity(
                                    httpEntity);
                        } else if ("deflate".equalsIgnoreCase(codec)) {
                            httpEntity = new DeflateDecompressingEntity(
                                    httpEntity);
                        }
                    }
                    String body = enCodetoString(httpEntity, encode);
                    if (body != null) {
                        result = body;
                    }
                }
                if (resultTest(result)) {
                    System.out.println(ip + "公司代理成功抓取" + url);
                    return result;
                } else if (result.contains("function JumpSelf")) {
                    if (result.contains("WebShieldSessionVerify")) {
                        int indexs = result
                                .indexOf("&WebShieldSessionVerify");
                        int indexe = result.indexOf("\";}</script>");
                        // Only splice the token when both markers were found.
                        if (indexs >= 0 && indexe > indexs) {
                            urlString = urlString
                                    + result.substring(indexs, indexe);
                        }
                    } else {
                        urlString = url;
                    }
                    newProxy = false;
                } else {
                    System.out.println("网页含有错误特殊字符" + urlString);
                    oldProxyUsecount++;
                    System.out.println(result);
                }
            } else {
                System.out.println(status + " " + urlString + " 状态不为200");
            }
            // These two statements run after every non-returning response,
            // exactly as in the original unbraced if/else.
            oldProxyUsecount++;
            httpRequst.abort();
        } catch (ClientProtocolException e) {
            newProxy = true;
            System.out.println(ip + "代理ip拒绝了");
        } catch (IOException e) {
            oldProxyUsecount++;
            System.out.println(ip + "代理读取超时");
        } finally {
            // Each attempt creates its own client; release its connections.
            if (httpClient != null) {
                httpClient.getConnectionManager().shutdown();
            }
        }
    }
    return "";
}
/**
 * Fetches {@code url} with HTTP GET over a direct connection (no proxy).
 *
 * <p>I/O and protocol failures are reported on stdout and result in an empty
 * string rather than an exception.
 *
 * @param url    page to fetch
 * @param encode charset name used to decode the response body
 * @return the decoded body for a 200 response, otherwise {@code ""}; never
 *         {@code null}
 */
private String doGet(String url, String encode)
        throws ClientProtocolException, IOException {
    String result = "";
    DefaultHttpClient httpClient = new DefaultHttpClient();
    try {
        HttpGet httpRequst = new HttpGet(url);
        httpClient.getParams().setParameter(
                CoreConnectionPNames.CONNECTION_TIMEOUT, 8000); // connect timeout: 8 s
        httpClient.getParams().setParameter(
                CoreConnectionPNames.SO_TIMEOUT, 8000); // socket read timeout: 8 s
        HttpResponse httpResponse = httpClient.execute(httpRequst);
        if (httpResponse.getStatusLine().getStatusCode() == 200) {
            HttpEntity httpEntity = httpResponse.getEntity();
            if (httpEntity != null) {
                Header contentEncoding = httpEntity.getContentEncoding();
                if (contentEncoding != null) {
                    String codec = contentEncoding.getValue();
                    if ("gzip".equalsIgnoreCase(codec)) {
                        httpEntity = new GzipDecompressingEntity(httpEntity);
                    } else if ("deflate".equalsIgnoreCase(codec)) {
                        httpEntity = new DeflateDecompressingEntity(
                                httpEntity);
                    }
                }
                // enCodetoString may return null; callers compare the result
                // with equals(""), so normalise to the empty string.
                String body = enCodetoString(httpEntity, encode);
                if (body != null) {
                    result = body;
                }
            }
        } else {
            httpRequst.abort();
        }
    } catch (ClientProtocolException e) {
        System.out.println("doget代理读取超时");
    } catch (IOException e) {
        System.out.println("doget代理读取超时");
    } finally {
        // The client is created per call, so its connections must be released here.
        httpClient.getConnectionManager().shutdown();
    }
    return result;
}
/**
 * Fetches {@code url} with HTTP GET through the local GoAgent proxy listening
 * on 127.0.0.1:8087.
 *
 * @param url    page to fetch
 * @param encode charset name used to decode the response body
 * @return the decoded body for a 200 response, otherwise {@code ""}; never
 *         {@code null}
 * @throws ClientProtocolException on an HTTP protocol error
 * @throws IOException             on connection or read failure
 */
private String doGetByGoagent(String url, String encode)
        throws ClientProtocolException, IOException {
    String result = "";
    HttpGet httpRequst = new HttpGet(url);
    httpRequst.addHeader("Accept-Encoding", "gzip,deflate,sdch");
    httpRequst.getParams().setParameter(
            CoreProtocolPNames.HTTP_CONTENT_CHARSET, encode);
    HttpHost proxyHost = new HttpHost("127.0.0.1", 8087, null);
    DefaultHttpClient httpClient = new DefaultHttpClient();
    try {
        httpClient.getParams().setParameter(
                CoreConnectionPNames.CONNECTION_TIMEOUT, 8000); // connect timeout: 8 s
        httpClient.getParams().setParameter(
                CoreConnectionPNames.SO_TIMEOUT, 6000); // socket read timeout: 6 s
        httpClient.getParams().setParameter(ConnRouteParams.DEFAULT_PROXY,
                proxyHost);
        HttpResponse httpResponse = httpClient.execute(httpRequst);
        if (httpResponse.getStatusLine().getStatusCode() == 200) {
            HttpEntity httpEntity = httpResponse.getEntity();
            if (httpEntity != null) {
                Header contentEncoding = httpEntity.getContentEncoding();
                if (contentEncoding != null) {
                    String codec = contentEncoding.getValue();
                    if ("gzip".equalsIgnoreCase(codec)) {
                        httpEntity = new GzipDecompressingEntity(httpEntity);
                    } else if ("deflate".equalsIgnoreCase(codec)) {
                        httpEntity = new DeflateDecompressingEntity(
                                httpEntity);
                    }
                }
                String body = enCodetoString(httpEntity, encode);
                if (body != null) {
                    result = body;
                }
            }
        } else {
            httpRequst.abort();
        }
    } finally {
        // The client is created per call, so its connections must be released here.
        httpClient.getConnectionManager().shutdown();
    }
    return result;
}
/**
 * Sends a form POST through GoAgent, retrying on empty answers and failures.
 *
 * <p>The request is sent once and repeated once if the answer is empty. If
 * anything in that first stage throws, the request is sent again, and once
 * more if that attempt throws too; the stack trace of the final failure is
 * printed and whatever content was last obtained is returned.
 *
 * @param url    target of the POST
 * @param pram   request body (form parameters)
 * @param encode charset name used for the exchange
 * @return the content obtained, which may be empty when every attempt failed
 */
public String crawlPageContentByPost(String url, String pram, String encode)
        throws ClientProtocolException, IOException {
    String page = "";

    // Stage 1: first POST, repeated once when it yields nothing.
    try {
        page = doPostByGoagent(url, pram, encode);
        boolean nothingFetched = (page == null) || page.equals("");
        if (nothingFetched) {
            page = doPostByGoagent(url, pram, encode);
        }
        return page;
    } catch (Exception stageOneFailure) {
        // fall through to the first retry
    }

    // Stage 2: first retry after a failure.
    try {
        page = doPostByGoagent(url, pram, encode);
        return page;
    } catch (Exception stageTwoFailure) {
        // fall through to the last retry
    }

    // Stage 3: last retry.
    try {
        page = doPostByGoagent(url, pram, encode);
    } catch (Exception finalFailure) {
        finalFailure.printStackTrace();
    }
    return page;
}
/**
 * Sends a form POST through the local GoAgent proxy (127.0.0.1:8087).
 *
 * @param url    target of the POST
 * @param parm   request body, sent as application/x-www-form-urlencoded
 * @param encode charset name used to encode the request body and to decode
 *               the response body
 * @return the decoded body for a 200 response, otherwise {@code ""}; never
 *         {@code null}
 * @throws ClientProtocolException on an HTTP protocol error
 * @throws IOException             on connection or read failure
 */
private String doPostByGoagent(String url, String parm, String encode)
        throws ClientProtocolException, IOException {
    String result = "";
    HttpPost httpRequst = new HttpPost(url);
    HttpHost proxy = new HttpHost("127.0.0.1", 8087, null);
    // Encode the body with the requested charset. The charset must not be
    // sent as a Content-Encoding header: that header names a compression
    // codec (gzip, deflate), not a character set.
    StringEntity entity = new StringEntity(parm, encode);
    entity.setContentType("application/x-www-form-urlencoded");
    httpRequst.setEntity(entity);
    DefaultHttpClient httpClient = new DefaultHttpClient();
    try {
        httpClient.getParams().setParameter(
                CoreConnectionPNames.CONNECTION_TIMEOUT, 8000); // connect timeout: 8 s
        httpClient.getParams().setParameter(
                CoreConnectionPNames.SO_TIMEOUT, 8000); // socket read timeout: 8 s
        httpClient.getParams().setParameter(ConnRouteParams.DEFAULT_PROXY,
                proxy);
        HttpResponse httpResponse = httpClient.execute(httpRequst);
        if (httpResponse.getStatusLine().getStatusCode() == 200) {
            HttpEntity httpEntity = httpResponse.getEntity();
            if (httpEntity != null) {
                Header contentEncoding = httpEntity.getContentEncoding();
                if (contentEncoding != null) {
                    String codec = contentEncoding.getValue();
                    if ("gzip".equalsIgnoreCase(codec)) {
                        httpEntity = new GzipDecompressingEntity(httpEntity);
                    } else if ("deflate".equalsIgnoreCase(codec)) {
                        httpEntity = new DeflateDecompressingEntity(
                                httpEntity);
                    }
                }
                String body = enCodetoString(httpEntity, encode);
                if (body != null) {
                    result = body;
                }
            }
        } else {
            httpRequst.abort();
        }
    } finally {
        // The client is created per call, so its connections must be released here.
        httpClient.getConnectionManager().shutdown();
    }
    return result;
}
/**
 * Sends a form POST over a direct connection (no proxy).
 *
 * @param url    target of the POST
 * @param parm   request body, sent as application/x-www-form-urlencoded
 * @param encode charset name used to encode the request body and to decode
 *               the response body
 * @return the decoded body for a 200 response, otherwise {@code ""}; never
 *         {@code null}
 * @throws ClientProtocolException on an HTTP protocol error
 * @throws IOException             on connection or read failure
 */
public String doPost(String url, String parm, String encode)
        throws ClientProtocolException, IOException {
    String result = "";
    HttpPost httpRequst = new HttpPost(url);
    // Encode the body with the requested charset. The charset must not be
    // sent as a Content-Encoding header: that header names a compression
    // codec (gzip, deflate), not a character set.
    StringEntity entity = new StringEntity(parm, encode);
    entity.setContentType("application/x-www-form-urlencoded");
    httpRequst.setEntity(entity);
    DefaultHttpClient httpClient = new DefaultHttpClient();
    try {
        httpClient.getParams().setParameter(
                CoreConnectionPNames.CONNECTION_TIMEOUT, 8000); // connect timeout: 8 s
        httpClient.getParams().setParameter(
                CoreConnectionPNames.SO_TIMEOUT, 8000); // socket read timeout: 8 s
        HttpResponse httpResponse = httpClient.execute(httpRequst);
        if (httpResponse.getStatusLine().getStatusCode() == 200) {
            HttpEntity httpEntity = httpResponse.getEntity();
            if (httpEntity != null) {
                Header contentEncoding = httpEntity.getContentEncoding();
                if (contentEncoding != null) {
                    String codec = contentEncoding.getValue();
                    if ("gzip".equalsIgnoreCase(codec)) {
                        httpEntity = new GzipDecompressingEntity(httpEntity);
                    } else if ("deflate".equalsIgnoreCase(codec)) {
                        httpEntity = new DeflateDecompressingEntity(
                                httpEntity);
                    }
                }
                String body = enCodetoString(httpEntity, encode);
                if (body != null) {
                    result = body;
                }
            }
        } else {
            httpRequst.abort();
        }
    } finally {
        // The client is created per call, so its connections must be released here.
        httpClient.getConnectionManager().shutdown();
    }
    return result;
}
/**
 * Sends a form POST through the company proxy using the shared
 * {@code httpPostClient}.
 *
 * <p>Before posting, {@code postByCompanyProxyBoolean} is run to select a proxy
 * and configure the shared client; it is re-run whenever the proxy has not
 * been verified yet, after a protocol error, or once the failure counter
 * exceeds two. A 302 answer whose Location contains "tabid=26" is followed
 * with a GET on the same client.
 *
 * @param url    target of the POST
 * @param parm   request body, sent as application/x-www-form-urlencoded
 * @param encode charset name used to encode the request body and to decode
 *               the response body
 * @return the page body when it passes {@code resultTest}, otherwise {@code ""}
 */
@SuppressWarnings("unused")
private String postByCompanyProxy(String url, String parm, String encode)
        throws ClientProtocolException, IOException {
    final int count = 5; // loop bound is inclusive: 6 attempts in total
    String urlString = url;
    boolean okProxy = false;
    boolean newProxy = false;
    int oldProxyUsecount = 0;
    for (int i = 0; i <= count; i++) {
        try {
            // Verify whenever the proxy is not known to be good. Without the
            // !okProxy test, a cached ip made the first iterations no-ops
            // that only incremented the failure counter.
            if (!okProxy || newProxy || oldProxyUsecount > 2
                    || ip.equals("")) {
                okProxy = postByCompanyProxyBoolean(url, parm, encode);
                newProxy = false;
                oldProxyUsecount = 0;
            }
            if (!okProxy) {
                oldProxyUsecount++;
                continue;
            }
            System.out.println("正在使用代理" + ip + ":" + port);
            HttpPost httpRequst = new HttpPost(url);
            // Encode the body with the requested charset instead of the
            // library default.
            StringEntity entity = new StringEntity(parm, encode);
            entity.setContentType("application/x-www-form-urlencoded");
            httpRequst.setEntity(entity);
            httpRequst.getParams().setParameter(
                    CoreProtocolPNames.HTTP_CONTENT_CHARSET, encode);
            HttpResponse httpResponse = httpPostClient.execute(httpRequst);
            int status = httpResponse.getStatusLine().getStatusCode();
            if (status == 200) {
                String result = "";
                HttpEntity httpEntity = httpResponse.getEntity();
                if (httpEntity != null) {
                    Header contentEncoding = httpEntity.getContentEncoding();
                    if (contentEncoding != null) {
                        String codec = contentEncoding.getValue();
                        if ("gzip".equalsIgnoreCase(codec)) {
                            httpEntity = new GzipDecompressingEntity(
                                    httpEntity);
                        } else if ("deflate".equalsIgnoreCase(codec)) {
                            httpEntity = new DeflateDecompressingEntity(
                                    httpEntity);
                        }
                    }
                    String body = enCodetoString(httpEntity, encode);
                    if (body != null) {
                        result = body;
                    }
                }
                if (resultTest(result)) {
                    return result;
                } else if (result.contains("function JumpSelf")) {
                    if (result.contains("WebShieldSessionVerify")) {
                        int indexs = result
                                .indexOf("&WebShieldSessionVerify");
                        int indexe = result.indexOf("\";}</script>");
                        // Only splice the token when both markers were found.
                        if (indexs >= 0 && indexe > indexs) {
                            urlString = urlString
                                    + result.substring(indexs, indexe);
                        }
                    } else {
                        urlString = url;
                    }
                    newProxy = false;
                }
            } else if (status == 302) {
                System.out.println("重定向了");
                Header header = httpResponse.getFirstHeader("location");
                // Release the pooled connection before issuing the follow-up
                // GET on the same shared client.
                httpRequst.abort();
                if (header != null) {
                    urlString = header.getValue();
                    System.out.println(urlString);
                    if (urlString.contains("tabid=26")) {
                        // Prefix the host only for relative locations, as
                        // postByCompanyProxyBoolean does.
                        if (!urlString.contains("landchina")) {
                            urlString = "http://www.landchina.com"
                                    + urlString;
                        }
                        String redirected = getByHttpClient(urlString,
                                encode, httpPostClient);
                        if (resultTest(redirected)) {
                            System.out.println(i + "公司代理成功抓取" + url);
                            return redirected;
                        }
                    }
                    newProxy = false;
                }
            } else {
                httpRequst.abort();
            }
        } catch (ClientProtocolException e) {
            newProxy = true;
            System.out.println(ip + "代理ip拒绝了");
        } catch (IOException e) {
            oldProxyUsecount++;
            System.out.println(ip + "代理读取超时");
        }
    }
    return "";
}
/**
 * Fetches {@code url} with HTTP GET on a caller-supplied client, making up to
 * three attempts.
 *
 * <p>Pages containing the "function JumpSelf" challenge are retried; when the
 * challenge carries a WebShieldSessionVerify token, the token is appended to
 * the URL used for the next attempt.
 *
 * @param url        page to fetch
 * @param encode     charset name used to decode the response body
 * @param httpClient client to execute the request on
 * @return the page body when it passes {@code resultTest}, otherwise {@code ""}
 */
private String getByHttpClient(String url, String encode,
        HttpClient httpClient) {
    final int count = 2; // loop bound is inclusive: 3 attempts in total
    String urlString = url;
    for (int i = 0; i <= count; i++) {
        try {
            HttpGet httpRequst = new HttpGet(urlString);
            httpRequst.setHeader("Content-Type",
                    "application/x-www-form-urlencoded");
            HttpResponse httpResponse = httpClient.execute(httpRequst);
            if (httpResponse.getStatusLine().getStatusCode() == 200) {
                String result = "";
                HttpEntity httpEntity = httpResponse.getEntity();
                if (httpEntity != null) {
                    Header contentEncoding = httpEntity.getContentEncoding();
                    if (contentEncoding != null) {
                        String codec = contentEncoding.getValue();
                        if ("gzip".equalsIgnoreCase(codec)) {
                            httpEntity = new GzipDecompressingEntity(
                                    httpEntity);
                        } else if ("deflate".equalsIgnoreCase(codec)) {
                            httpEntity = new DeflateDecompressingEntity(
                                    httpEntity);
                        }
                    }
                    // enCodetoString may return null; treat it as empty.
                    String body = enCodetoString(httpEntity, encode);
                    if (body != null) {
                        result = body;
                    }
                }
                if (resultTest(result)) {
                    System.out.println(ip + "公司代理成功抓取" + url);
                    return result;
                } else if (result.contains("function JumpSelf")) {
                    if (result.contains("WebShieldSessionVerify")) {
                        int indexs = result
                                .indexOf("&WebShieldSessionVerify");
                        int indexe = result.indexOf("\";}</script>");
                        // Only splice the token when both markers were found;
                        // otherwise substring would throw.
                        if (indexs >= 0 && indexe > indexs) {
                            urlString = urlString
                                    + result.substring(indexs, indexe);
                        }
                    } else {
                        urlString = url;
                    }
                }
            } else {
                httpRequst.abort();
            }
        } catch (ClientProtocolException e) {
            System.out.println(ip + "代理ip拒绝了");
        } catch (IOException e) {
            System.out.println(ip + "代理读取超时");
        }
    }
    return "";
}
/**
 * Pre-flight check for the company-proxy POST flow: selects a proxy,
 * configures the shared {@code httpPostClient} with it and sends the POST
 * once to find out whether the target answers with a usable page.
 *
 * <p>Per the original note, a new proxy ip must pass the site's security
 * verification on first access, so this check runs before the real POST.
 * "function JumpSelf" challenge pages and 302 redirects whose URL contains
 * "tabid=26" are followed with a GET on the same client.
 *
 * @param url    target of the POST
 * @param parm   request body, sent as application/x-www-form-urlencoded
 * @param encode charset name used to encode the request body and to decode
 *               the response body
 * @return {@code true} when a response passed {@code resultTest};
 *         {@code false} when all attempts failed or no proxy address could be
 *         obtained
 */
private Boolean postByCompanyProxyBoolean(String url, String parm,
        String encode) throws ClientProtocolException, IOException {
    final int count = 10; // loop bound is inclusive: 11 attempts in total
    final int maxProxyFetches = 10; // cap on ipUrl polls per proxy switch
    String urlString = url;
    boolean newProxy = false;
    int oldProxyUsecount = 0;
    for (int i = 0; i <= count; i++) {
        try {
            if (newProxy || oldProxyUsecount > 2 || ip.equals("")) {
                oldProxyUsecount = 0;
                newProxy = false;
                // Always start from an empty string so a *new* address is
                // really fetched, and bound the polling so an unavailable
                // proxy service cannot spin this thread forever.
                String proxy = "";
                for (int fetch = 0; fetch < maxProxyFetches
                        && !proxy.contains(":"); fetch++) {
                    System.out.println("ip为空,正在提取");
                    proxy = doGet(ipUrl, "gbk");
                    if (proxy == null) {
                        proxy = "";
                    }
                }
                // Strip quotes, slashes and any whitespace, then take the
                // last two ':'-separated fields as host and port.
                String[] proxys = proxy.replaceAll("[\"/\\s\u00A0]", "")
                        .split(":");
                boolean switched = false;
                if (proxys.length >= 2
                        && !proxys[proxys.length - 2].equals("")) {
                    try {
                        int newPort = Integer
                                .parseInt(proxys[proxys.length - 1]);
                        ip = proxys[proxys.length - 2];
                        port = newPort;
                        switched = true;
                    } catch (NumberFormatException e) {
                        // malformed port: handled below
                    }
                }
                if (!switched) {
                    System.out.println("无法获取可用的代理ip: " + proxy);
                    if (ip.equals("")) {
                        // No proxy at all: nothing to verify.
                        return false;
                    }
                    // Otherwise keep using the previous proxy.
                }
            }
            // Build the proxy host from the current ip/port on every attempt.
            // Previously it stayed null when an ip was already cached, which
            // set the shared client's default proxy to null.
            HttpHost proxyHost = new HttpHost(ip, port, null);
            System.out.println("正在使用代理" + ip + ":" + port);

            HttpPost httpRequst = new HttpPost(url);
            // Encode the body with the requested charset instead of the
            // library default.
            StringEntity entity = new StringEntity(parm, encode);
            entity.setContentType("application/x-www-form-urlencoded");
            httpRequst.setEntity(entity);
            httpRequst.getParams().setParameter(
                    CoreProtocolPNames.HTTP_CONTENT_CHARSET, encode);
            httpPostClient.getParams().setParameter(
                    CoreConnectionPNames.CONNECTION_TIMEOUT, 10000); // connect timeout: 10 s
            httpPostClient.getParams().setParameter(
                    CoreConnectionPNames.SO_TIMEOUT, 8000); // socket read timeout: 8 s
            httpPostClient.getParams().setParameter(
                    ConnRouteParams.DEFAULT_PROXY, proxyHost);

            HttpResponse httpResponse = httpPostClient.execute(httpRequst);
            int status = httpResponse.getStatusLine().getStatusCode();
            if (status == 200) {
                String result = "";
                HttpEntity httpEntity = httpResponse.getEntity();
                if (httpEntity != null) {
                    Header contentEncoding = httpEntity.getContentEncoding();
                    if (contentEncoding != null) {
                        String codec = contentEncoding.getValue();
                        if ("gzip".equalsIgnoreCase(codec)) {
                            httpEntity = new GzipDecompressingEntity(
                                    httpEntity);
                        } else if ("deflate".equalsIgnoreCase(codec)) {
                            httpEntity = new DeflateDecompressingEntity(
                                    httpEntity);
                        }
                    }
                    String body = enCodetoString(httpEntity, encode);
                    if (body != null) {
                        result = body;
                    }
                }
                if (resultTest(result)) {
                    return true;
                } else if (result.contains("function JumpSelf")) {
                    if (result.contains("WebShieldSessionVerify")) {
                        int indexs = result
                                .indexOf("&WebShieldSessionVerify");
                        int indexe = result.indexOf("\";}</script>");
                        // Only splice the token when both markers were found.
                        if (indexs >= 0 && indexe > indexs) {
                            urlString = urlString
                                    + result.substring(indexs, indexe);
                        }
                        if (urlString.contains("tabid=26")) {
                            // Prefix the host only for relative URLs.
                            if (!urlString.contains("landchina")) {
                                urlString = "http://www.landchina.com"
                                        + urlString;
                            }
                            result = getByHttpClient(urlString, encode,
                                    httpPostClient);
                            if (resultTest(result)) {
                                System.out.println(ip + "公司代理成功抓取" + url);
                                return true;
                            }
                        }
                    } else {
                        urlString = url;
                    }
                    newProxy = false;
                }
            } else if (status == 302) {
                System.out.println("重定向了");
                Header header = httpResponse.getFirstHeader("location");
                // Release the pooled connection before issuing the follow-up
                // GET on the same shared client.
                httpRequst.abort();
                if (header != null) {
                    urlString = header.getValue();
                    System.out.println(urlString);
                    if (urlString.contains("tabid=26")) {
                        // Prefix the host only for relative locations.
                        if (!urlString.contains("landchina")) {
                            urlString = "http://www.landchina.com"
                                    + urlString;
                        }
                        String redirected = getByHttpClient(urlString,
                                encode, httpPostClient);
                        if (resultTest(redirected)) {
                            System.out.println(ip + "公司代理成功抓取" + url);
                            return true;
                        }
                    }
                    newProxy = false;
                }
            } else {
                httpRequst.abort();
            }
        } catch (ClientProtocolException e) {
            newProxy = true;
            System.out.println(ip + "代理ip拒绝了");
        } catch (IOException e) {
            oldProxyUsecount++;
            System.out.println(ip + "代理读取超时");
        }
    }
    return false;
}
/**
 * Decides whether a fetched body looks like a real page rather than an error,
 * block or challenge page.
 *
 * @param result the decoded response body; may be {@code null}
 * @return {@code false} when the body is {@code null}, empty, exactly "100",
 *         or contains any of the known error markers; {@code true} otherwise
 */
private Boolean resultTest(String result) {
    // A missing body is a failed fetch, not a reason to throw.
    if (result == null || result.equals("") || result.equals("100")) {
        return false;
    }
    // Substrings that identify error pages, firewalls and challenges.
    final String[] errorMarkers = {
            "<title>blank",
            "Error Page Messages",
            "<title>404",
            "您的访问出错了",
            "302 Found",
            "出错页面",
            "没有找到这篇文章!",
            "特定于实例的错误",
            "错误 404",
            "Error report",
            "function JumpSelf",
            "refused",
            "网站防火墙",
            "无法解析服务器",
            "STATUS OK",
            "refresh",
            "DownloadError",
            "Not Found",
            "Runtime Error",
            "Service Unavailable" };
    for (String marker : errorMarkers) {
        if (result.contains(marker)) {
            return false;
        }
    }
    return true;
}
/**
 * Reads an entity's content into a string, resolving the fallback charset
 * from its name.
 *
 * @param entity         the HTTP entity to read
 * @param defaultCharset charset name to decode with, or {@code null} to let
 *                       {@code enCodetoStringDo} pick its default
 * @return the decoded content, as returned by {@code enCodetoStringDo}
 */
public static String enCodetoString(final HttpEntity entity,
        final String defaultCharset) throws IOException, ParseException {
    Charset fallback = null;
    if (defaultCharset != null) {
        fallback = Charset.forName(defaultCharset);
    }
    return enCodetoStringDo(entity, fallback);
}
/**
 * Reads the whole content of an entity and decodes it into a string.
 *
 * <p>The charset declared by the entity is not consulted: decoding uses
 * {@code defaultCharset}, or {@code HTTP.DEF_CONTENT_CHARSET} when that is
 * {@code null}. The content stream is always closed.
 *
 * @param entity         the HTTP entity to read; must not be {@code null}
 * @param defaultCharset charset to decode with, may be {@code null}
 * @return the decoded content, or {@code null} when the entity has no
 *         content stream
 * @throws IllegalArgumentException when {@code entity} is {@code null} or
 *                                  declares a length above
 *                                  {@code Integer.MAX_VALUE}
 * @throws IOException              when reading the stream fails
 */
public static String enCodetoStringDo(final HttpEntity entity,
        Charset defaultCharset) throws IOException, ParseException {
    if (entity == null) {
        throw new IllegalArgumentException("HTTP entity may not be null");
    }
    final InputStream body = entity.getContent();
    if (body == null) {
        return null;
    }
    try {
        if (entity.getContentLength() > Integer.MAX_VALUE) {
            throw new IllegalArgumentException(
                    "HTTP entity too large to be buffered in memory");
        }
        // Size the buffer from the declared length; fall back to 4096 when
        // the length is unknown (negative).
        int capacity = (int) entity.getContentLength();
        if (capacity < 0) {
            capacity = 4096;
        }
        final Charset decodeWith = (defaultCharset != null) ? defaultCharset
                : HTTP.DEF_CONTENT_CHARSET;
        final Reader source = new InputStreamReader(body, decodeWith);
        final CharArrayBuffer collected = new CharArrayBuffer(capacity);
        final char[] chunk = new char[1024];
        for (int read = source.read(chunk); read != -1; read = source
                .read(chunk)) {
            collected.append(chunk, 0, read);
        }
        return collected.toString();
    } finally {
        body.close();
    }
}
/**
 * Saves text under {@code <topName>:\<tagName>}, creating the folder when it
 * does not exist, and delegates the actual writing to
 * {@link #write(String, String)}.
 *
 * @param topName  drive name (the part before the ':')
 * @param fileName file name without extension
 * @param tagName  folder name
 * @param type     file extension, without the dot
 * @param content  text to save
 * @author joe
 */
public static void writeToFile(String topName, String fileName,
        String tagName, String type, String content) {
    File targetDir = null;
    try {
        targetDir = new File(topName + ":\\" + tagName);
        boolean missing = !targetDir.exists() && !targetDir.isDirectory();
        if (missing) {
            String outcome = targetDir.mkdirs() ? " ok:创建文件夹成功! "
                    : " err:创建文件夹失败! ";
            System.out.println(outcome);
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    write(targetDir + "/" + fileName + "." + type, content);
}
/**
 * Appends {@code content} to a text file, creating the file when needed.
 *
 * <p>The existing file is read line by line, every line is re-terminated
 * with {@code "\n"}, {@code content} is added after it and the whole text is
 * written back. The previous content is echoed to stdout.
 *
 * @param path    path of the file to write
 * @param content text to append
 * @return {@code true} on success, {@code false} when any step failed (the
 *         stack trace is printed)
 */
public static boolean write(String path, String content) {
    BufferedReader input = null;
    BufferedWriter output = null;
    try {
        File f = new File(path);
        if (!f.exists()) {
            System.out.println("文件不存在,正在创建...");
            if (f.createNewFile()) {
                System.out.println("文件创建成功!");
            } else {
                System.out.println("文件创建失败!");
            }
        }
        // StringBuilder avoids re-copying the accumulated text on every line.
        StringBuilder text = new StringBuilder();
        input = new BufferedReader(new FileReader(f));
        String line;
        while ((line = input.readLine()) != null) {
            text.append(line).append("\n");
        }
        System.out.println("原文件内容:" + text);
        // Close the reader before the same file is reopened for writing.
        input.close();
        input = null;
        text.append(content);
        output = new BufferedWriter(new FileWriter(f));
        output.write(text.toString());
        output.flush();
        return true;
    } catch (Exception e) {
        e.printStackTrace();
        return false;
    } finally {
        // Release both handles even when reading or writing failed midway.
        if (input != null) {
            try {
                input.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        if (output != null) {
            try {
                output.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
/**
 * Downloads a remote file to {@code <topName>:/<tagName>/<fileName>.<type>},
 * creating the folder when it does not exist. The running byte count is
 * printed after every chunk; I/O failures are reported via a stack trace.
 *
 * @param fileUrl  URL of the file to download
 * @param topName  drive name (the part before the ':')
 * @param fileName file name without extension
 * @param tagName  folder name
 * @param type     file extension, without the dot
 * @author joe
 */
public void downLoadFile(String fileUrl, String topName, String fileName,
        String tagName, String type) {
    int bytesum = 0;
    int byteread = 0;
    InputStream inStream = null;
    FileOutputStream fs = null;
    try {
        URL url = new URL(fileUrl);
        URLConnection conn = url.openConnection();
        inStream = conn.getInputStream();
        File fileD = new File(topName + ":/" + tagName);
        // Create the target folder when it does not exist.
        if (!fileD.exists() && !fileD.isDirectory()) {
            System.out.println("正在新建目录");
            fileD.mkdirs();
        } else {
            System.out.println("目录存在");
        }
        // FileOutputStream creates the file itself when it is missing.
        File file = new File(topName + ":/" + tagName + "/" + fileName
                + "." + type);
        fs = new FileOutputStream(file);
        byte[] buffer = new byte[1024];
        while ((byteread = inStream.read(buffer)) != -1) {
            bytesum += byteread;
            System.out.println(bytesum);
            fs.write(buffer, 0, byteread);
        }
        System.out.println("downloaded ok");
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Close both streams on success and on failure; they were previously
        // left open.
        if (fs != null) {
            try {
                fs.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        if (inStream != null) {
            try {
                inStream.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
/**
 * Entry point placeholder: only constructs a {@code CrawlMethodManager} and
 * performs no crawling.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) throws ClientProtocolException,
        IOException {
    new CrawlMethodManager();
}
}