获取站点内容有多种方式,我这里介绍两种:一种是通过HTTPSocket对象获取,另一种是通过HttpURLConnection对象获取。
1、采用HTTPSocket对象需要使用一个叫做heaton的抓取页面的包,可以在网上找到并下载。
参考代码如下:
- import com.heaton.bot.*;
- class HTTPGet {
- public static void main(String args[]) {
- String url;
- try {
- if (args.length != 1) {
- }
- // 要获取的页面
- url = "http://club.china.alibaba.com/forum/thread/view/96_24986738_.html";
- HTTPSocket http = new HTTPSocket();
- http.send(url, null);// 向url发送请求
- System.out.println("* * Headers from: " + url + "* *");
- for (int i = 0; i < http.getClientHeaders().length(); i++) {
- Attribute a = http.getServerHeaders().get(i);// 获取头信息
- System.out.println(a.getName() + "=" + a.getValue());
- }
- System.out.println("* * * Data from: " + url + "* * *");
- System.out.println(http.getBody());// 获取源代码
- System.exit(0);
- } catch (Exception e) {
- System.out.println("Exception thrown: " + e.getMessage());
- }
- }
- }
2、通过HttpURLConnection对象获取页面,通过HttpURLConnection对象获取页面的头信息和编码信息,通过URL对象获取页面的源代码。参考代码:
- import java.io.BufferedReader;
- import java.io.IOException;
- import java.io.InputStreamReader;
- import java.net.HttpURLConnection;
- import java.net.MalformedURLException;
- import java.net.URL;
public class WebSpider {
    /**
     * Fetches http://www.csdn.net over HTTP, prints the status line and the
     * content type from the response headers, then streams the page source
     * to stdout line by line.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            URL aurl = new URL("http://www.csdn.net");
            // Initialise the HttpURLConnection for this URL.
            HttpURLConnection urlcon = (HttpURLConnection) aurl.openConnection();
            System.out.println(urlcon.getHeaderField(0)); // status line of the response
            System.out.println(urlcon.getContentType());
            // Read the page as UTF-8; the charset must match the page or the
            // output will be garbled. Reuse the already-open connection's
            // stream instead of opening a SECOND connection via
            // aurl.openStream() (as the original did), and close the reader
            // deterministically with try-with-resources.
            try (BufferedReader bin = new BufferedReader(
                    new InputStreamReader(urlcon.getInputStream(), "utf-8"))) {
                String line;
                while ((line = bin.readLine()) != null) {
                    // BUG FIX: the original wrote System.out.print('/n') — a
                    // two-character char literal that does not compile; println
                    // emits the intended newline.
                    System.out.println(line);
                }
            }
        } catch (MalformedURLException e) {
            // BUG FIX: message used the broken escape "/n" and dropped the cause.
            System.out.println("URL error: " + e.getMessage());
        } catch (IOException e2) {
            // BUG FIX: message used the broken escape "/n" and dropped the cause.
            System.out.println("IO error: " + e2.getMessage());
        }
    }
}