Java 抓取网页内容，获取指定服务器IP

最新推荐文章于 2020-08-06 16:07:48 发布

涛涛_2009

最新推荐文章于 2020-08-06 16:07:48 发布

阅读量3.4k

点赞数

文章标签： java 服务器 正则表达式 null string url

本文链接：https://blog.csdn.net/tsyj810883979/article/details/6510459

版权

Code:

package ttwork.net;
import java.net.*;
import java.util.regex.*;
import java.io.*;
/**
 * Downloads the content behind a URL and writes all of it, a range of its
 * lines, or the parts matching a regular expression to a local file. It can
 * also report the IP addresses the URL's host resolves to.
 *
 * <p>Text is decoded and encoded with the platform default charset.
 */
public class WebPage {

    /**
     * Unanchored e-mail pattern used by {@link #main(String[])}. It is meant
     * for {@code Matcher.find()}, so it has no {@code ^}/{@code $} anchors and
     * can pick addresses out of the middle of a line.
     */
    private static final String EMAIL_REGEX =
            "[\\w.-]+@((\\[[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.)|(([\\w-]+\\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})\\]?";

    /** Size of the character buffer used when copying a whole page. */
    private static final int BUFFER_SIZE = 1024;

    /**
     * Demo entry point: extracts the e-mail addresses found on a sample page
     * into {@code src/f2.html}.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Another sample page: http://zhidao.baidu.com/question/192012139.html
        WebPage wp = new WebPage("http://tieba.baidu.com/f?kz=664340519");
        // Alternative usage: wp.getSubHTML(1, 20, "src/f2.html");
        wp.getInnerHTML(EMAIL_REGEX, "src/f2.html");
    }

    /** The page address. */
    private final URL url;

    /** Addresses the URL's host resolved to; empty when resolution failed. */
    private final InetAddress[] ip;

    /**
     * Creates a page handle and resolves the IP addresses of the URL's host.
     *
     * @param _url the absolute URL of the page, e.g. {@code http://example.com/index.html}
     * @throws IllegalArgumentException if {@code _url} is not a well-formed URL
     */
    public WebPage(String _url) {
        try {
            url = new URL(_url);
        } catch (MalformedURLException e) {
            // Fail fast: with no URL every other method would only fail later
            // with a NullPointerException.
            throw new IllegalArgumentException("Malformed URL: " + _url, e);
        }
        InetAddress[] resolved;
        try {
            // Resolve the host component only; path and query are not part of a host name.
            resolved = InetAddress.getAllByName(url.getHost());
        } catch (UnknownHostException e) {
            // Resolution is best-effort: the page may still be fetched later.
            resolved = new InetAddress[0];
        }
        ip = resolved;
    }

    /**
     * Returns the addresses of the page's host; a host may have more than one.
     *
     * @return one line per address, each in {@code InetAddress.toString()} form
     *         ({@code hostname/ip}) and terminated by {@code '\n'}; an empty
     *         string when the host could not be resolved
     */
    public String getAllIp() {
        StringBuilder all = new StringBuilder();
        for (InetAddress address : ip) {
            all.append(address).append('\n');
        }
        return all.toString();
    }

    /**
     * Copies the complete page content to a file.
     *
     * <p>I/O failures are reported on standard error and otherwise ignored.
     *
     * @param desFilePath path of the file to create or overwrite
     */
    public void getHTML(String desFilePath) {
        File desFile = new File(desFilePath);
        // The page is opened before the file so that a failed download does
        // not create or truncate the destination.
        try (Reader in = new InputStreamReader(url.openStream());
                Writer out = new OutputStreamWriter(new FileOutputStream(desFile))) {
            char[] buffer = new char[BUFFER_SIZE];
            int count;
            while ((count = in.read(buffer)) != -1) {
                // Write only the characters actually read in this round.
                out.write(buffer, 0, count);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Writes every part of the page that matches a regular expression to a
     * file, one match per line. This can be used, for example, to collect all
     * e-mail addresses on a page with a pattern such as
     * {@code [\w.-]+@([\w-]+\.)+[a-zA-Z]{2,4}}.
     *
     * <p>The page is searched line by line, so a match never spans lines.
     * I/O failures are reported on standard error and otherwise ignored.
     *
     * @param regex       a {@link Pattern} expression, searched with {@code find()}
     * @param desFilePath path of the file to create or overwrite
     * @throws PatternSyntaxException if {@code regex} is not a valid pattern
     */
    public void getInnerHTML(String regex, String desFilePath) {
        // Compile first so that an invalid pattern leaves the destination untouched.
        Pattern pattern = Pattern.compile(regex);
        File desFile = new File(desFilePath);
        try (BufferedReader in = new BufferedReader(new InputStreamReader(url.openStream()));
                BufferedWriter out = new BufferedWriter(
                        new OutputStreamWriter(new FileOutputStream(desFile)))) {
            String line;
            while ((line = in.readLine()) != null) {
                Matcher matcher = pattern.matcher(line);
                // A line may contain several matches; emit each of them.
                while (matcher.find()) {
                    out.write(matcher.group());
                    out.newLine();
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Writes a range of the page's lines to a file. Lines are numbered from 1;
     * following the usual Java convention {@code startRow} is included and
     * {@code endRow} is excluded.
     *
     * <p>I/O failures are reported on standard error and otherwise ignored.
     *
     * @param startRow    number of the first line to write (inclusive)
     * @param endRow      number of the line to stop at (exclusive)
     * @param desFilePath path of the file to create or overwrite
     */
    public void getSubHTML(int startRow, int endRow, String desFilePath) {
        File desFile = new File(desFilePath);
        try (BufferedReader in = new BufferedReader(new InputStreamReader(url.openStream()));
                BufferedWriter out = new BufferedWriter(
                        new OutputStreamWriter(new FileOutputStream(desFile)))) {
            int rowNum = 0;
            String line;
            while ((line = in.readLine()) != null) {
                rowNum++;
                if (rowNum < startRow) {
                    continue;
                }
                if (rowNum >= endRow) {
                    // Past the requested range: no need to read the rest.
                    break;
                }
                out.write(line);
                out.newLine();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}