正则表达式解析出页面所有链接,并得到链接的内容

Main1类的main方法得到所有链接,此方法是带链接状态的

package com.logistics;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.HttpVersion;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.conn.ClientConnectionManager;
import org.apache.http.conn.params.ConnManagerParams;
import org.apache.http.conn.scheme.PlainSocketFactory;
import org.apache.http.conn.scheme.Scheme;
import org.apache.http.conn.scheme.SchemeRegistry;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.conn.tsccm.ThreadSafeClientConnManager;
import org.apache.http.params.BasicHttpParams;
import org.apache.http.params.HttpParams;
import org.apache.http.params.HttpProtocolParams;

/**
 * Fetches an index page, extracts every {@code *.html} name found in it and
 * downloads each of those pages on its own {@link SpiderThread}.
 */
public class Main1 {

    /** URL of the index page; extracted page names are appended to it. */
    private static final String BASE_URL = "http://localhost:8080/docs/";

    /** Matches bare page names such as {@code index.html} in the fetched text. */
    private static final Pattern HTML_LINK = Pattern.compile("\\w+\\.html");

    /**
     * Downloads the index page, collects the links it contains and fetches
     * them concurrently, waiting for all workers before shutting down.
     *
     * @param args ignored
     * @throws Exception if the index page cannot be fetched or the wait for
     *                   the worker threads is interrupted
     */
    public static void main(String[] args) throws Exception {
        // Create and initialize HTTP parameters
        HttpParams params = new BasicHttpParams();
        ConnManagerParams.setMaxTotalConnections(params, 10);
        HttpProtocolParams.setVersion(params, HttpVersion.HTTP_1_1);

        // Create and initialize scheme registry
        SchemeRegistry schemeRegistry = new SchemeRegistry();
        schemeRegistry.register(
                new Scheme("http", PlainSocketFactory.getSocketFactory(), 80));

        ClientConnectionManager cm = new ThreadSafeClientConnManager(params, schemeRegistry);
        HttpClient client = new DefaultHttpClient(cm, params);
        try {
            ArrayList<String> links = extractLinks(fetch(client, BASE_URL));

            ArrayList<SpiderThread> workers = new ArrayList<SpiderThread>();
            for (int i = 0; i < links.size(); i++) {
                SpiderThread worker = new SpiderThread(client, new HttpGet(links.get(i)), i + 1);
                workers.add(worker);
                // start(), not run(): run() would execute the download on the
                // calling thread and make the whole crawl sequential.
                worker.start();
            }
            for (SpiderThread worker : workers) {
                worker.join();
            }
        } finally {
            // Release the pooled connections once every worker has finished.
            cm.shutdown();
        }
    }

    /**
     * Executes a GET request and returns the response body decoded as UTF-8.
     *
     * @param client the client used to execute the request
     * @param url    the URL to fetch
     * @return the response body, or an empty string when the response has no entity
     * @throws IOException if the request fails or the body cannot be read
     */
    private static String fetch(HttpClient client, String url) throws IOException {
        HttpResponse response = client.execute(new HttpGet(url));
        HttpEntity entity = response.getEntity();
        ByteArrayOutputStream body = new ByteArrayOutputStream();
        if (entity != null) {
            InputStream in = entity.getContent();
            try {
                byte[] buffer = new byte[1024];
                int read;
                while ((read = in.read(buffer)) != -1) {
                    // Write only the bytes actually read; the rest of the
                    // buffer still holds data from a previous iteration.
                    body.write(buffer, 0, read);
                }
            } finally {
                in.close();
            }
        }
        return body.toString("utf-8");
    }

    /**
     * Collects every {@code *.html} name found in the text, prefixed with
     * {@link #BASE_URL}, in order of appearance (duplicates are kept).
     *
     * @param page the text to scan
     * @return the absolute URLs built from the matched names
     */
    private static ArrayList<String> extractLinks(String page) {
        ArrayList<String> links = new ArrayList<String>();
        Matcher matcher = HTML_LINK.matcher(page);
        while (matcher.find()) {
            links.add(BASE_URL + matcher.group());
        }
        return links;
    }
}


然后使用线程得到链接内容
package com.logistics;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.protocol.BasicHttpContext;
import org.apache.http.protocol.HttpContext;
import org.apache.http.util.EntityUtils;

/**
 * Worker thread that executes a single GET request, reads the whole response
 * body and prints the request path and the elapsed time.
 */
public class SpiderThread extends Thread {
    private final HttpClient httpClient;
    private final HttpContext context;
    private final HttpGet httpGet;
    private final int id;

    /**
     * @param httpClient the client used to execute the request
     * @param httpGet    the request this thread executes
     * @param id         number printed in front of this thread's status lines
     */
    public SpiderThread(HttpClient httpClient, HttpGet httpGet, int id) {
        this.httpClient = httpClient;
        this.context = new BasicHttpContext();
        this.httpGet = httpGet;
        this.id = id;
    }

    /**
     * Executes the GET request and prints some status information. Any
     * failure aborts the request and is reported on standard output rather
     * than propagated.
     */
    @Override
    public void run() {
        long start = System.currentTimeMillis();
        try {
            // Execute with this thread's own context instead of leaving the
            // field unused.
            HttpResponse response = httpClient.execute(httpGet, context);
            HttpEntity entity = response.getEntity();
            if (entity != null) {
                // The bytes themselves are not needed; the body is read to
                // the end so the response is fully consumed.
                EntityUtils.toByteArray(entity);
                System.out.println(httpGet.getURI().getPath());
            }
        } catch (Exception e) {
            httpGet.abort();
            System.out.println(id + " - error: " + e);
        }
        long end = System.currentTimeMillis();
        System.out.println(id +" -- 用时:"+(end-start));
    }
}
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值