javaSpider
所需jar包
1. 我的第一个测试
public class SpiderTest {

    /**
     * Downloads the source of a fixed page over a direct connection and echoes it to
     * standard output one line at a time. Any exception is reported as a printed stack
     * trace rather than propagated.
     */
    @Test
    public void test1() {
        BufferedReader reader = null;
        try {
            // Resolve the target page and open its byte stream.
            URL target = new URL("http://1483104508.55555.io/From");
            InputStream raw = target.openStream();
            // Decode the bytes as UTF-8 text and buffer for line-based reads.
            InputStreamReader decoder = new InputStreamReader(raw, "utf-8");
            reader = new BufferedReader(decoder);
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                System.out.println(line);
            }
        } catch (Exception failure) {
            failure.printStackTrace();
        } finally {
            // Closing the buffered reader also closes the wrapped stream.
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException closeFailure) {
                    closeFailure.printStackTrace();
                }
            }
        }
    }

    /**
     * Same as {@link #test1()} but the connection is made through an HTTP proxy.
     * "XXX.XXX.XXX.XXX" is a placeholder for the proxy IP and {@code host} for the proxy
     * port. NOTE(review): {@code host} is not declared in this class as shown; it has to
     * be supplied before this compiles.
     */
    @Test
    public void test2() {
        BufferedReader reader = null;
        try {
            URL target = new URL("http://1483104508.55555.io/From");
            // Describe the proxy endpoint, then open the connection through it.
            InetSocketAddress proxyAddress = new InetSocketAddress("XXX.XXX.XXX.XXX", host);
            Proxy proxy = new Proxy(Type.HTTP, proxyAddress);
            URLConnection connection = target.openConnection(proxy);
            // From here on the flow matches test1: decode as UTF-8 and print every line.
            InputStream raw = connection.getInputStream();
            reader = new BufferedReader(new InputStreamReader(raw, "utf-8"));
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                System.out.println(line);
            }
        } catch (Exception failure) {
            failure.printStackTrace();
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException closeFailure) {
                    closeFailure.printStackTrace();
                }
            }
        }
    }
}
2.HttpClient
public class SpiderTest1 {

    /**
     * Fetches a page through an HTTP proxy with Apache HttpClient, prints the attributes the
     * client stored in the execution context, then prints the response body line by line.
     * Any exception is reported as a printed stack trace rather than propagated.
     */
    @Test
    public void test() {
        HttpClient send = new DefaultHttpClient();
        // Route requests through a proxy: "XXX.XXX.XXX.XXX" is a placeholder for the proxy IP
        // and `host` for the proxy port.
        // NOTE(review): `host` is not declared in this class as shown; supply it before use.
        send.getParams().setParameter(ConnRouteParams.DEFAULT_PROXY, new HttpHost("XXX.XXX.XXX.XXX", host));
        HttpContext context = new BasicHttpContext();
        HttpGet get = new HttpGet("http://1483104508.55555.io/From");
        InputStream content = null;
        BufferedReader bf = null;
        try {
            // Executing with a context (instead of send.execute(get)) lets us inspect the
            // connection, request and response details afterwards.
            HttpResponse response = send.execute(get, context);
            HttpEntity entity = response.getEntity(); // the response body

            // Dump what the client recorded about this exchange.
            System.out.println(context.getAttribute(ExecutionContext.HTTP_CONNECTION));  // HTTP connection
            System.out.println(context.getAttribute(ExecutionContext.HTTP_PROXY_HOST));  // proxy host
            System.out.println(context.getAttribute(ExecutionContext.HTTP_REQ_SENT));    // request-sent flag
            System.out.println(context.getAttribute(ExecutionContext.HTTP_REQUEST));     // request object
            System.out.println(context.getAttribute(ExecutionContext.HTTP_RESPONSE));    // response object
            HttpHost targetHost = (HttpHost) context.getAttribute(ExecutionContext.HTTP_TARGET_HOST);
            System.out.println(targetHost);                                              // target host
            String targetHostName = targetHost.getHostName();
            System.out.println(targetHostName);

            content = entity.getContent();
            // The server may not declare a charset; getContentCharSet then returns null and
            // InputStreamReader would throw NullPointerException. Fall back to UTF-8, the
            // encoding the other examples in this file use for the same site.
            String contentCharSet = EntityUtils.getContentCharSet(entity);
            if (contentCharSet == null) {
                contentCharSet = "utf-8";
            }
            bf = new BufferedReader(new InputStreamReader(content, contentCharSet));
            String str = null;
            while ((str = bf.readLine()) != null) {
                System.out.println(str);
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Closing the reader closes the wrapped stream too; if the reader was never
            // created (e.g. unsupported charset name) close the raw stream directly.
            try {
                if (bf != null) {
                    bf.close();
                } else if (content != null) {
                    content.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
3.爬取网页存储
public class SpiderTest2 {

    /**
     * Downloads a page with Apache HttpClient and writes the raw response bytes to
     * {@code F:/c.html}. Any exception is reported as a printed stack trace rather than
     * propagated. Both the response stream and the output file are closed when done.
     */
    @Test
    public void test3() {
        HttpClient sender = new DefaultHttpClient();
        HttpGet get = new HttpGet("http://www.btime.com/?from=ssk2");
        InputStream in = null;
        FileOutputStream out = null;
        try {
            HttpResponse response = sender.execute(get);
            HttpEntity entity = response.getEntity(); // the response body
            in = entity.getContent();
            // Copy the bytes verbatim into the HTML file; no charset decoding is involved.
            out = new FileOutputStream("F:/c.html");
            IOUtils.copy(in, out);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Close the file first so buffered bytes reach disk, then the response stream.
            // Each close is guarded separately so one failure cannot skip the other.
            if (out != null) {
                try {
                    out.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if (in != null) {
                try {
                    in.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }
}
4. 模拟登录
public class SpiderTest3 {

    /**
     * Simulates a form login: POSTs the {@code user} and {@code pwd} fields to the login URL
     * with Apache HttpClient and prints the response body line by line. Any exception is
     * reported as a printed stack trace rather than propagated.
     */
    @Test
    public void test() {
        HttpClient send = new DefaultHttpClient();
        HttpContext context = new BasicHttpContext();
        HttpPost post = new HttpPost("http://1483104508.55555.io/blog/AdminLogin");
        // Form fields submitted with the POST.
        List<NameValuePair> parms = new ArrayList<NameValuePair>();
        parms.add(new BasicNameValuePair("user", "root"));
        parms.add(new BasicNameValuePair("pwd", "root"));
        InputStream content = null;
        BufferedReader bf = null;
        try {
            // URL-encode the fields as the request body.
            post.setEntity(new UrlEncodedFormEntity(parms, "utf-8"));
            // Executing with a context keeps request/response details available afterwards.
            HttpResponse response = send.execute(post, context);
            HttpEntity entity = response.getEntity(); // the response body
            content = entity.getContent();
            // The server may not declare a charset; getContentCharSet then returns null and
            // InputStreamReader would throw NullPointerException. Fall back to UTF-8, the
            // encoding used for the request body above.
            String contentCharSet = EntityUtils.getContentCharSet(entity);
            if (contentCharSet == null) {
                contentCharSet = "utf-8";
            }
            bf = new BufferedReader(new InputStreamReader(content, contentCharSet));
            String str = null;
            while ((str = bf.readLine()) != null) {
                System.out.println(str);
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Closing the reader closes the wrapped stream too; if the reader was never
            // created (e.g. unsupported charset name) close the raw stream directly.
            try {
                if (bf != null) {
                    bf.close();
                } else if (content != null) {
                    content.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
模拟登录后获得的是登录后页面的信息。注意:目标 POST 路径应为登录表单所提交到的 Servlet 的路径。