package ttwork.net;

import java.io.*;
import java.net.*;
import java.nio.charset.Charset;
import java.util.regex.*;

/**
 * Fetches the content behind a URL and saves all of it, a range of its lines,
 * or the parts matching a regular expression to a local file.
 *
 * <p>I/O failures in the {@code get...} methods are reported on
 * {@code System.err} and are not propagated to the caller.
 */
public class WebPage {

    /**
     * Regular expression that finds an e-mail address anywhere inside a line,
     * e.g. {@code user@example.com} or {@code user@[192.168.0.1]}.
     */
    public static final String EMAIL_REGEX =
            "[\\w.-]+@(?:\\[[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.|(?:[\\w-]+\\.)+)"
            + "(?:[a-zA-Z]{2,4}|[0-9]{1,3})\\]?";

    /** Size in bytes of the buffer used when copying the page verbatim. */
    private static final int BUFFER_SIZE = 8192;

    /**
     * Demo entry point: extracts the e-mail addresses found in a sample page
     * and writes them to {@code src/f2.html}.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Alternative sample URL kept from the original source:
        // http://zhidao.baidu.com/question/192012139.html
        WebPage wp = new WebPage("http://tieba.baidu.com/f?kz=664340519");
        // To save a range of lines instead: wp.getSubHTML(1, 20, "src/f2.html");
        wp.getInnerHTML(EMAIL_REGEX, "src/f2.html");
    }

    /** URL of the page. */
    private final URL url;

    /** Addresses the URL's host resolved to; empty when it could not be resolved. */
    private final InetAddress[] ip;

    /**
     * Creates a page for the given URL and resolves the addresses of its host.
     *
     * @param _url absolute URL of the page, e.g. {@code http://example.com/index.html}
     * @throws IllegalArgumentException if {@code _url} is not a well-formed URL
     */
    public WebPage(String _url) {
        try {
            url = new URL(_url);
        } catch (MalformedURLException e) {
            throw new IllegalArgumentException("Malformed URL: " + _url, e);
        }
        // Resolve the host component only; the path and query are not part of a host name.
        ip = resolve(url.getHost());
    }

    /** Resolves a host name, returning an empty array when there is no host or the lookup fails. */
    private static InetAddress[] resolve(String host) {
        if (host == null || host.isEmpty()) {
            return new InetAddress[0];
        }
        try {
            return InetAddress.getAllByName(host);
        } catch (UnknownHostException e) {
            return new InetAddress[0];
        }
    }

    /**
     * Returns every address of the page's host; a host may have more than one.
     *
     * @return the addresses, each in {@link InetAddress#toString()} form and
     *         followed by a newline; an empty string when the host has no
     *         resolved address
     */
    public String getAllIp() {
        StringBuilder all = new StringBuilder();
        for (InetAddress address : ip) {
            all.append(address).append('\n');
        }
        return all.toString();
    }

    /**
     * Saves the whole content of the page to a file.
     *
     * <p>The bytes are copied unchanged, so the file keeps the page's own encoding.
     *
     * @param desFilePath path of the file to write; an existing file is overwritten
     */
    public void getHTML(String desFilePath) {
        try (InputStream in = url.openStream();
             OutputStream out = new FileOutputStream(desFilePath)) {
            byte[] buffer = new byte[BUFFER_SIZE];
            int count;
            while ((count = in.read(buffer)) != -1) {
                // Write only the bytes actually read, not the whole buffer.
                out.write(buffer, 0, count);
            }
        } catch (IOException e) {
            report("save " + url + " to " + desFilePath, e);
        }
    }

    /**
     * Saves every part of the page that matches a regular expression, one
     * match per output line. Matching is done line by line, so a match cannot
     * span several lines. See {@link #EMAIL_REGEX} for a pattern that extracts
     * e-mail addresses.
     *
     * @param regex       regular expression in {@link Pattern} syntax
     * @param desFilePath path of the file to write; an existing file is overwritten
     * @throws PatternSyntaxException if {@code regex} is not a valid pattern
     */
    public void getInnerHTML(String regex, String desFilePath) {
        // Compile first so that an invalid pattern fails before any stream is opened.
        Pattern pattern = Pattern.compile(regex);
        try (BufferedReader reader = openPageReader();
             BufferedWriter writer = openFileWriter(desFilePath)) {
            String line;
            while ((line = reader.readLine()) != null) {
                Matcher matcher = pattern.matcher(line);
                // A line may contain several matches; report all of them.
                while (matcher.find()) {
                    writer.write(matcher.group());
                    writer.newLine();
                }
            }
        } catch (IOException e) {
            report("extract matches from " + url + " to " + desFilePath, e);
        }
    }

    /**
     * Saves a range of the page's lines to a file. Lines are numbered from 1;
     * the range includes {@code startRow} and excludes {@code endRow}.
     *
     * @param startRow    number of the first line to save (inclusive)
     * @param endRow      number of the line to stop at (exclusive)
     * @param desFilePath path of the file to write; an existing file is overwritten
     */
    public void getSubHTML(int startRow, int endRow, String desFilePath) {
        try (BufferedReader reader = openPageReader();
             BufferedWriter writer = openFileWriter(desFilePath)) {
            int rowNum = 0;
            String line;
            while ((line = reader.readLine()) != null) {
                rowNum++;
                if (rowNum >= endRow) {
                    break;
                }
                if (rowNum >= startRow) {
                    writer.write(line);
                    writer.newLine();
                }
            }
        } catch (IOException e) {
            report("save lines of " + url + " to " + desFilePath, e);
        }
    }

    /** Opens the page as text, decoded with the platform default charset. */
    private BufferedReader openPageReader() throws IOException {
        return new BufferedReader(
                new InputStreamReader(url.openStream(), Charset.defaultCharset()));
    }

    /** Opens the destination file for text output in the platform default charset. */
    private static BufferedWriter openFileWriter(String path) throws IOException {
        return new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(path), Charset.defaultCharset()));
    }

    /** Reports a failed operation on standard error without interrupting the caller. */
    private static void report(String action, IOException e) {
        System.err.println("WebPage: failed to " + action);
        e.printStackTrace();
    }
}