读取html页面文件解析邮箱地址

最新推荐文章于 2023-01-29 23:20:46 发布

原创最新推荐文章于 2023-01-29 23:20:46 发布 · 3.5k 阅读

1 ·

CC 4.0 BY-SA版权

JAVA 专栏收录该内容

115 篇文章

订阅专栏

本文介绍了一个简单的Java程序，用于从HTML文件中读取并解析电子邮件地址。程序通过查找@符号来定位可能的邮箱字符串，并进行进一步的格式清理；正则表达式仅用于检测字符串中是否包含汉字。

本文来自：http://blog.csdn.net/javaalpha/article/details/8332587  转载时请标明，谢谢。

 

读取html页面文件解析邮箱地址

 

package com.alpha.test;import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.Writer;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.regex.Matcher;
import java.util.regex.Pattern;/**
 * 读取html页面文件解析邮箱地址
 * 
 * @author JavaAlpha 2012-12-19 13:45:11
 */
public class ReadHtmlToTxt {

    /**
     * Half-width of the slice cut around the first '@': the slice covers up to
     * this many characters before the '@', and the '@' plus up to
     * (EMAIL_WINDOW - 1) characters after it.
     */
    private static final int EMAIL_WINDOW = 10;

    /** Compiled once; matches one character in the range U+4E00..U+9FA5. */
    private static final Pattern CHINESE_CHAR = Pattern.compile("[\\u4e00-\\u9fa5]");

    /**
     * Reads a file in 4096-character chunks and concatenates whatever
     * {@link #checkEmail(String)} extracts from each chunk.
     *
     * <p>At most one fragment is produced per chunk, and fragments are appended
     * without a separator. An address that straddles a chunk boundary is seen
     * only partially.
     *
     * @param path path of the file to scan
     * @return the concatenated fragments; empty if the path is not a readable
     *         regular file, if nothing was found, or if reading failed before
     *         anything was found
     */
    public static String readHtml(String path) {
        StringBuilder emailCont = new StringBuilder();
        File htmlFile = new File(path);
        if (!(htmlFile.exists() && htmlFile.isFile() && htmlFile.canRead())) {
            return emailCont.toString();
        }

        // NOTE(review): FileReader decodes with the platform default charset.
        // try-with-resources closes the reader on every path (it used to leak).
        try (Reader in = new FileReader(htmlFile)) {
            char[] buff = new char[4096];
            int nch;
            while ((nch = in.read(buff, 0, buff.length)) != -1) {
                emailCont.append(checkEmail(new String(buff, 0, nch)));
            }
        } catch (IOException e) {
            // Best effort: report the failure and return what was collected so far.
            e.printStackTrace();
        }
        return emailCont.toString();
    }

    /**
     * Cuts a candidate e-mail fragment out of the text around the first '@'.
     *
     * <p>The slice around the '@' is clamped to the bounds of {@code str}, so an
     * '@' close to the start or end of the input no longer throws. If the slice
     * contains '&lt;' or '&gt;', those characters and every '/' are removed.
     * ASCII commas, full-width commas and ideographic full stops are always
     * removed. When the fragment contains ".com" it is cut right after the first
     * ".com"; otherwise it is returned as cleaned. The fragment is also printed
     * to standard output.
     *
     * @param str text to inspect; {@code null} is treated as containing no '@'
     * @return the cleaned fragment, or an empty string when there is no '@'
     */
    public static String checkEmail(String str) {
        if (str == null) {
            return "";
        }
        int at = str.indexOf('@');
        if (at < 0) {
            return "";
        }

        // Clamp both ends: the unclamped indices threw StringIndexOutOfBoundsException
        // whenever '@' sat within EMAIL_WINDOW characters of either end of the input.
        int from = Math.max(0, at - EMAIL_WINDOW);
        int to = Math.min(str.length(), at + EMAIL_WINDOW);
        String postCont = str.substring(from, to);

        // '/' is stripped only together with tag brackets, as before.
        if (postCont.indexOf('>') > -1 || postCont.indexOf('<') > -1) {
            postCont = postCont.replace(">", "").replace("<", "").replace("/", "");
        }

        // \uff0c is the full-width comma, \u3002 the ideographic full stop; written
        // as escapes so compilation does not depend on the source file encoding.
        // NOTE(review): the original also tested for ';' but never removed it;
        // that is left unchanged here.
        postCont = postCont.replace(",", "").replace("\uff0c", "").replace("\u3002", "");

        // Truncate only when ".com" is really present; previously a missing ".com"
        // (indexOf == -1) cut the fragment down to its first three characters.
        int dotCom = postCont.indexOf(".com");
        if (dotCom > -1) {
            postCont = postCont.substring(0, dotCom + 4);
        }

        System.out.println(postCont);
        return postCont;
    }

    /**
     * Tells whether the string contains at least one character in the range
     * U+4E00..U+9FA5.
     *
     * @param str text to test; {@code null} yields {@code false}
     * @return {@code true} if such a character is present
     */
    public static boolean checkChinese(String str) {
        return str != null && CHINESE_CHAR.matcher(str).find();
    }

    /**
     * Writes the given text to a file, replacing any previous content.
     *
     * <p>Failures are reported on standard error and not rethrown.
     *
     * @param cont text to write; {@code null} writes nothing
     * @param path destination file path
     */
    public static void writerFile(String cont, String path) {
        File emailFile = new File(path);
        // FileWriter creates the file when it does not exist, so no explicit
        // createNewFile() is needed; try-with-resources flushes and closes the
        // writer even when write() fails.
        try (Writer out = new FileWriter(emailFile)) {
            if (cont != null) {
                out.write(cont);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Downloads the content at the given URL line by line, appends
     * {@code "</br>"} after each line, and prints the result to standard output.
     *
     * <p>Failures are reported on standard error; whatever was read before the
     * failure is still printed.
     *
     * @param strUrl URL to fetch
     */
    public static void readUrlCont(String strUrl) {
        StringBuilder cont = new StringBuilder();

        try {
            URLConnection conn = new URL(strUrl).openConnection();
            // NOTE(review): the response is decoded with the platform default
            // charset, not the one declared by the server.
            try (BufferedReader reader =
                    new BufferedReader(new InputStreamReader(conn.getInputStream()))) {
                String lineCont;
                while ((lineCont = reader.readLine()) != null) {
                    cont.append(lineCont).append("</br>");
                }
            }
        } catch (IOException e) {
            // MalformedURLException is an IOException, so it is handled here too.
            e.printStackTrace();
        }

        System.out.println(cont.toString());
    }

    /**
     * Demo entry point: fetches one page and prints it.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // Local-file usage example:
        //   String cont = readHtml("e://test.htm");
        //   writerFile(cont, "e://test.txt");
        readUrlCont("http://www.163.com");
    }
}