1. 从网页上抓取我们感兴趣的内容.
2. 得到网页的源代码, 通过正则表达式找到我们需要的信息, 并保存起来.
3. 代码实现:
// Fetches a web page and prints every e-mail address found in its source to stdout.
// 网页地址 is a placeholder for the page URL string and must be supplied by the surrounding code.
URL url = new URL(网页地址);
HttpURLConnection connection = (HttpURLConnection) url.openConnection();
// Send a browser-like User-Agent header (not a proxy): some sites refuse requests from Java clients.
connection.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");
// No request body is written, so the doOutput flag is left at its default (false).
// Determine the page encoding from the Content-Type response header.
String charset = getCharset(connection.getContentType());
if (charset == null || charset.isEmpty()) {
    // NOTE(review): getCharset is defined elsewhere; if it can return null/empty
    // (e.g. no Content-Type header), InputStreamReader would throw, so fall back to UTF-8.
    charset = "UTF-8";
}
// E-mail regex, compiled once instead of once per line.
// The {2,4} bound means a longer top-level domain is only matched up to its first 4 letters.
Pattern pattern = Pattern
        .compile("[a-zA-Z0-9_.-]+@[a-zA-Z0-9-]+\\.[a-zA-Z]{2,4}");
// Open the response body as a character reader; try-with-resources closes the
// reader (and the underlying HTTP input stream) even if reading fails.
try (BufferedReader br = new BufferedReader(new InputStreamReader(
        connection.getInputStream(), charset))) {
    String line;
    while ((line = br.readLine()) != null) {
        Matcher matcher = pattern.matcher(line);
        // Print every match on this line, not only the first.
        while (matcher.find()) {
            System.out.println(matcher.group());
        }
    }
}