在公司或家里用网络爬虫抓取所需数据时,对方网站常常设有防御(反爬虫)机制:一旦触发,它会对你的HTTP请求返回500错误,并且只要还是同一个IP就一直请求不到数据。这时只能重启路由器,使IP地址发生改变,之后网络爬虫就能恢复正常工作。
下面是通过发送Socket请求来模拟路由器的重启指令:
/**
 * Sends a hand-built HTTP/1.0 GET request over a raw socket to port 8080 of
 * the host named in {@code baseURL}, requesting
 * {@code /userRpm/StatusRpm.htm} with the {@code Disconnect} and
 * {@code wan=1} query parameters.
 *
 * <p>The request carries a {@code Authorization: Basic} header whose value is
 * the {@code luyou} field, i.e. the router credential (for example
 * {@code YWRtaW46d2FuZzIwMDU=}).
 *
 * <p>The response is not read. Any failure is printed via
 * {@code printStackTrace()} and otherwise ignored. The socket is always
 * closed before the method returns.
 */
protected void rebotadsl() {
    Socket client = null;
    try {
        URL target = new URL(baseURL);
        InetAddress address = InetAddress.getByName(target.getHost());
        client = new Socket(address, 8080);
        BufferedOutputStream sender = new BufferedOutputStream(client.getOutputStream());

        String cmd = "GET "
                + "/userRpm/StatusRpm.htm?Disconnect=%B6%CF%20%CF%DF&wan=1"
                + " HTTP/1.0\r\n" + "User-Agent: myselfHttp/1.0\r\n"
                + "Accept: www/source; text/html; image/gif; */*\r\n"
                + "Authorization: Basic" + " " + luyou + "\r\n"
                + "\r\n";

        // Encode with a fixed single-byte charset and use the byte count (not
        // the char count) so the written length always matches the payload.
        byte[] payload = cmd.getBytes("ISO-8859-1");
        sender.write(payload, 0, payload.length);
        sender.flush();
        System.out.println("由于重定向路由器断线了");
    } catch (Exception ex) {
        ex.printStackTrace();
    } finally {
        // Release the socket on every path; previously it was never closed.
        if (client != null) {
            try {
                client.close();
            } catch (Exception closeEx) {
                closeEx.printStackTrace();
            }
        }
    }
}
当然,我们还需要写一个调度算法来调用这个函数,例如保证两次重启路由器之间的时间间隔不能太短。
Java下载图片:
/**
 * Fetches {@code url} with an HTTP GET and saves the response body as the
 * file {@code name} inside directory {@code story}.
 *
 * <p>The body is also stored in {@code _body}, decoded as ISO8859-1 so that
 * every byte maps to exactly one char and the binary content survives the
 * round trip through a string.
 *
 * <p>Behaviour worth knowing:
 * <ul>
 *   <li>Redirects are not followed; connection and socket timeouts are
 *       100000 ms.</li>
 *   <li>If the first request throws, the method sleeps 120 s and retries
 *       once; if the retry also throws, the JVM is terminated via
 *       {@code System.exit(0)}.</li>
 *   <li>At most 204800 bytes are kept; reading stops at the first chunk that
 *       would exceed that limit, so larger bodies are truncated.</li>
 *   <li>The file is written only when the status is 200 and the body was
 *       read successfully. In every other case {@code _body} and the file
 *       system are left untouched (except for a malformed URL, which clears
 *       {@code _body} and sets the ERROR status).</li>
 * </ul>
 *
 * @param url   address of the image to download
 * @param story directory the file is written into
 * @param name  file name to create inside {@code story}
 */
public void sendPic(String url, String story, String name) {
    setURL(url);
    HttpClient http = new HttpClient();
    http.getHttpConnectionManager().getParams().setConnectionTimeout(100000);
    GetMethod get = null;
    try {
        get = new GetMethod(url);
    } catch (IllegalArgumentException ex) {
        Log.logException("url带有不规则字符", ex);
        setStatus(baseURL, ERROR);
        _body.setLength(0);
        return;
    }
    get.getParams().setParameter(HttpMethodParams.SO_TIMEOUT, 100000);
    get.setFollowRedirects(false);

    try {
        int er = 0;
        try {
            get.addRequestHeader("user-agent", useragent);
            er = http.executeMethod(get);
            System.out.println("server return code" + er);
        } catch (Exception ex) {
            System.out.println("发送图片url到服务器访问失败");
            try {
                Thread.sleep(120000);
            } catch (InterruptedException e) {
                // Keep the interrupt visible to callers instead of dropping it.
                Thread.currentThread().interrupt();
            }
            try {
                er = http.executeMethod(get);
            } catch (Exception e) {
                System.out.println("连不上服务器,系统将推出");
                System.exit(0);
            }
        }

        if (er != 200) {
            // Nothing was downloaded; do not write stale _body content to disk.
            return;
        }

        // Obtain the response body stream sent by the server.
        InputStream is = null;
        try {
            is = get.getResponseBodyAsStream();
        } catch (Exception e) {
            System.out.println("读取服务器内容响应时发生错误");
        }
        if (is == null) {
            return;
        }

        byte[] buffer = new byte[20480];
        byte[] tbuf = new byte[204800];
        int tl = 0;
        try {
            while (true) {
                int l = is.read(buffer);
                // Stop at end of stream or when the next chunk would overflow tbuf.
                if (l < 0 || l + tl > tbuf.length) {
                    break;
                }
                System.arraycopy(buffer, 0, tbuf, tl, l);
                tl += l;
            }
            _body.setLength(0);
            _body.append(new String(tbuf, 0, tl, "ISO8859-1"));
        } catch (IOException ex) {
            System.out.println("将服务器的数据转换成String时发生错误");
            return;
        } finally {
            try {
                is.close();
            } catch (IOException closeEx) {
                System.out.println("读取服务器内容响应时发生错误");
            }
        }

        // Save the downloaded bytes to disk. Writing tbuf directly is
        // equivalent to re-encoding _body as ISO8859-1, which is lossless.
        File outputfile = new File(story, name);
        FileOutputStream fos = null;
        try {
            fos = new FileOutputStream(outputfile);
            fos.write(tbuf, 0, tl);
        } catch (IOException ex) {
            System.out.println("IO存本地发生错误");
        } finally {
            if (fos != null) {
                try {
                    fos.close();
                } catch (IOException closeEx) {
                    System.out.println("IO存本地发生错误");
                }
            }
        }
    } finally {
        // Always hand the connection back to the connection manager.
        get.releaseConnection();
    }
}
/**
 * Reports whether {@code url} already exists in the database.
 *
 * <p>Binds {@code url} as parameter 1 of {@code _prepGetCount}, runs the
 * query and reads the {@code qty} column of the first row. If the query
 * fails or yields no row, the count is treated as 0.
 *
 * @param url the URL to look up
 * @return {@code true} if the reported count is at least 1; {@code false}
 *         otherwise, including when the lookup fails
 */
public boolean URLisExist(String url) {
    int count = 0;
    ResultSet rs = null;
    try {
        _prepGetCount.setString(1, url);
        rs = _prepGetCount.executeQuery();
        if (rs.next()) {
            count = rs.getInt("qty");
        }
    } catch (Exception ex) {
        System.out.println("URLisExist发生错误");
    } finally {
        // Close the result set on success as well as on failure;
        // previously it was only closed when an exception occurred.
        if (rs != null) {
            try {
                rs.close();
            } catch (Exception e1) {
                System.out.println("rs关闭时发生错误");
            }
        }
    }
    return count >= 1;
}