这是一个利用正则表达式获取本地储存的文件(txt、doc、html)或网页中的Email地址的程序。在原理上接近网络爬虫。
但是我在做测试的时候,不能获取openOffice的文件中的地址。应该是编码问题。可是openOffice号称自己
完全支持MS Office 的文件,那编码应该是兼容的啊。而MS的doc文件在试验中是成功的。这个问题我以后再看看吧。
其中的英语实在太差,但是我得坚持用英语写。
- /*
- * This program was written to catch email addresses
- * from a document that contains lots of them.
- */
- import java.io.BufferedReader;
- import java.io.FileNotFoundException;
- import java.io.FileReader;
- import java.io.IOException;
- import java.io.InputStream;
- import java.io.InputStreamReader;
- import java.net.HttpURLConnection;
- import java.net.MalformedURLException;
- import java.net.URL;
- import java.util.regex.Matcher;
- import java.util.regex.Pattern;
/**
 * Extracts e-mail addresses from a local text file and from a web page and
 * prints every address found to standard output, one per line.
 *
 * @author Sancho_lai
 */
public class ReadFileFromLocalAndWeb {

    /**
     * E-mail address pattern: a run of word characters, dots or hyphens,
     * an {@code @}, another such run, a literal dot and a final run of word
     * characters. Compiled once because it is applied to every input line.
     */
    private static final Pattern EMAIL_PATTERN =
            Pattern.compile("[\\w.-]+@[\\w.-]+\\.\\w+");

    /**
     * Runs both extraction passes: first over the hard-coded local file, then
     * over the hard-coded web page (decoded as GBK).
     *
     * <p>I/O failures are not propagated; their stack traces are printed and
     * the method carries on with the next pass.
     */
    public void test() {
        System.out.println("********************Get Email Address From Local doc********************");
        try {
            // Point this at any text document that contains the addresses you want.
            // FileReader decodes with the platform default charset.
            scan(new BufferedReader(new FileReader(
                    "E://workspace//Project_For_ToyTest//doc//EmailAddress.txt")));
        } catch (IOException e) {
            // Also covers FileNotFoundException, which was handled identically.
            e.printStackTrace();
        }

        System.out.println("********************Get Email Address From the Web********************");
        HttpURLConnection connection = null;
        try {
            URL url = new URL("http://www.8885.net/bbs/thread-189746-1-3.html");
            connection = (HttpURLConnection) url.openConnection();
            InputStream urlStream = connection.getInputStream();
            scan(new BufferedReader(new InputStreamReader(urlStream, "gbk")));
        } catch (IOException e) {
            // Also covers MalformedURLException, which was handled identically.
            e.printStackTrace();
        } finally {
            // Release the connection even if the reader was never created.
            if (connection != null) {
                connection.disconnect();
            }
        }
    }

    /**
     * Feeds every line of {@code reader} to {@link #parser(String)} and closes
     * the reader afterwards, whether or not reading succeeded.
     *
     * @param reader source of the text to scan; closed by this method
     * @throws IOException if reading or closing the reader fails
     */
    private void scan(BufferedReader reader) throws IOException {
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                parser(line);
            }
        } finally {
            reader.close();
        }
    }

    /**
     * Prints every e-mail address found in {@code line}, one per output line.
     *
     * @param line a single line of text to search
     */
    private void parser(String line) {
        Matcher m = EMAIL_PATTERN.matcher(line);
        while (m.find()) {
            System.out.println(m.group());
        }
    }

    /**
     * Program entry point: runs {@link #test()} on a fresh instance.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        ReadFileFromLocalAndWeb t = new ReadFileFromLocalAndWeb();
        t.test();
    }
}