这是一个利用正则表达式从本地存储的文件(txt、doc、html)或网页中获取 Email 地址的程序,在原理上接近网络爬虫。
但是我在做测试的时候,不能获取 OpenOffice 文件中的地址,应该是编码问题。可是 OpenOffice 号称自己
完全支持 MS Office 的文件,那编码应该是兼容的啊。而 MS 的 doc 文件在试验中是成功的。这个问题我以后再看看吧。
其中的英语实在太差,但是我得坚持用英语写。
/*
 * This program was written to extract email addresses
 * from a document that contains many of them.
 */
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Scans a local text file and a web page for email addresses and prints every
 * address found to standard output, one per line.
 *
 * @author Sancho_lai
 */
public class ReadFileFromLocalAndWeb {

    /**
     * Local document scanned by {@link #test()}.
     *
     * <p>NOTE: an OpenDocument file (.odt) is a ZIP package of XML files, so
     * reading it line by line as text will not expose the addresses it
     * contains; its content.xml would have to be unpacked first.
     */
    private static final String LOCAL_DOC =
            "E://workspace//Project_For_ToyTest//doc//EmailAddress.txt";

    /** Web page scanned by {@link #test()}. */
    private static final String WEB_PAGE = "http://www.8885.net/bbs/thread-189746-1-3.html";

    /** Charset used to decode {@link #WEB_PAGE}. */
    private static final String WEB_PAGE_CHARSET = "gbk";

    /**
     * Email address pattern: one or more word characters, dots or hyphens,
     * an {@code @}, then a domain that ends in a dot followed by word
     * characters. Compiled once because it is applied to every input line.
     */
    private static final Pattern EMAIL_PATTERN = Pattern.compile("[\\w.-]+@[\\w.-]+\\.\\w+");

    /**
     * Prints the email addresses found in the local document and then those
     * found in the web page. I/O failures are reported with a stack trace and
     * do not stop the remaining work.
     */
    public void test() {
        System.out.println("********************GetEmailAddressFromLocaldoc********************");
        printEmailsFromFile(LOCAL_DOC);

        System.out.println("********************GetEmailAddressFromtheWeb********************");
        printEmailsFromUrl(WEB_PAGE, WEB_PAGE_CHARSET);
    }

    /**
     * Returns every email address found in the given text, in order of
     * appearance.
     *
     * @param line text to scan; may be {@code null}
     * @return the matched addresses; empty when {@code line} is {@code null}
     *         or contains no address
     */
    public static List<String> extractEmails(String line) {
        List<String> emails = new ArrayList<String>();
        if (line == null) {
            return emails;
        }
        Matcher matcher = EMAIL_PATTERN.matcher(line);
        while (matcher.find()) {
            emails.add(matcher.group());
        }
        return emails;
    }

    /**
     * Prints the email addresses contained in a local text file.
     *
     * @param path location of the file; it is decoded with the JVM default charset
     */
    private void printEmailsFromFile(String path) {
        // try-with-resources closes the reader even when reading fails.
        try (BufferedReader reader = new BufferedReader(new FileReader(path))) {
            printEmails(reader);
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Prints the email addresses contained in the document served at a URL.
     *
     * @param address     URL of the document; cast to an HTTP connection, so it
     *                    is expected to use the http/https scheme
     * @param charsetName name of the charset used to decode the response body
     */
    private void printEmailsFromUrl(String address, String charsetName) {
        HttpURLConnection connection = null;
        try {
            URL url = new URL(address);
            connection = (HttpURLConnection) url.openConnection();
            // The stream is declared as its own resource so that it is closed
            // even if the charset name is rejected while building the reader.
            try (InputStream urlStream = connection.getInputStream();
                    BufferedReader reader =
                            new BufferedReader(new InputStreamReader(urlStream, charsetName))) {
                printEmails(reader);
            }
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (connection != null) {
                connection.disconnect();
            }
        }
    }

    /**
     * Reads the given reader line by line and prints the addresses of each line.
     *
     * @param reader source of the text; not closed by this method
     * @throws IOException if reading a line fails
     */
    private void printEmails(BufferedReader reader) throws IOException {
        String line;
        while ((line = reader.readLine()) != null) {
            parser(line);
        }
    }

    /**
     * Prints every email address found in one line of text.
     *
     * @param line text to scan
     */
    private void parser(String line) {
        for (String email : extractEmails(line)) {
            System.out.println(email);
        }
    }

    /**
     * Program entry point: runs {@link #test()} on a new instance.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        ReadFileFromLocalAndWeb t = new ReadFileFromLocalAndWeb();
        t.test();
    }
}