很简陋的一个抓取邮箱的小程序,抓取效率很低,纯当熟悉键盘。
1. 函数入口
public class Test01 {
    /**
     * Entry point: builds the URL of search-result pages 1 through 20 and
     * processes them one after another on the calling thread.
     *
     * <p>Each page is wrapped in an {@code HtmlPage} created with depth 1 and
     * then fetched explicitly through {@code pageCode()}. An earlier revision
     * (kept only as commented-out code) handed each page to its own thread
     * instead.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        // The page number is spliced between these two fixed URL parts.
        final String searchBase = "http://www.cjol.com/search/l2008/";
        final String searchQuery =
                "/?Keyword=%E5%A4%96%E8%B4%B8%E4%B8%9A%E5%8A%A1%E5%91%98&KeywordType=3&RecentSelected=43";

        int pageNo = 1;
        while (pageNo <= 20) {
            HtmlPage listing = new HtmlPage(searchBase + pageNo + searchQuery, 1);
            listing.pageCode();
            pageNo++;
        }
    }
}
2. 正则表达式
public class Regx {
    /** Anchor pointing at an http URL whose link text ends in "有限公司". */
    private static final Pattern COMPANY_LINK = Pattern
            .compile("href=\"http://[\\w-\\./]+\">[\u4e00-\u9fa5]*有限公司</a>");

    /** Anchor labelled "网址:" that points at an http URL. */
    private static final Pattern WEBSITE_LINK = Pattern
            .compile("网址:<a href=\"http://[\\w-\\./]+\"");

    /** An http URL up to and including the closing double quote. */
    private static final Pattern QUOTED_HTTP_URL = Pattern.compile("http:\\S+\"");

    /** Anchor whose link text starts with "contact" (any case). */
    private static final Pattern CONTACT_LINK = Pattern.compile(
            "href=\"[\\w-\\./\\?=]+\">contact", Pattern.CASE_INSENSITIVE);

    /** Everything from the first to the last double quote, quotes included. */
    private static final Pattern QUOTED_TEXT = Pattern.compile("\".+\"");

    /** E-mail address shape accepted by {@link #email(String)}. */
    private static final Pattern EMAIL_ADDRESS = Pattern
            .compile("[a-zA-Z0-9_.-]+@[a-zA-Z0-9-]+\\.[a-zA-Z]{2,4}");

    /**
     * Scans a piece of a search-result page for company links.
     *
     * <p>For every link found, a depth-2 {@code HtmlPage} is created for the
     * link target, and the target address is then printed and logged.
     *
     * @param buf text to scan
     */
    public synchronized static void findCompany(String buf) {
        Matcher matcher = COMPANY_LINK.matcher(buf);
        while (matcher.find()) {
            Matcher innerMatcher = QUOTED_HTTP_URL.matcher(matcher.group());
            if (innerMatcher.find()) {
                String target = innerMatcher.group().replaceAll("\"", "");
                new HtmlPage(target, 2);
                String message = "公司招聘页面地址:" + target;
                System.out.println(message);
                HtmlPage.writLog(message);
            }
        }
    }

    /**
     * Scans a piece of a company page for the link to the company's own web
     * site.
     *
     * <p>Every address found is printed and logged, then a depth-3
     * {@code HtmlPage} is created for it.
     *
     * @param buf text to scan
     */
    public synchronized static void findWebSite(String buf) {
        Matcher matcher = WEBSITE_LINK.matcher(buf);
        while (matcher.find()) {
            Matcher innerMatcher = QUOTED_HTTP_URL.matcher(matcher.group());
            if (innerMatcher.find()) {
                String target = innerMatcher.group().replaceAll("\"", "");
                String message = "公司主页地址:" + target;
                System.out.println(message);
                HtmlPage.writLog(message);
                new HtmlPage(target, 3);
            }
        }
    }

    /**
     * Scans a piece of a company home page for "contact" links such as
     * {@code href="contact.php">CONTACT US</a>}.
     *
     * <p>Every match is printed and logged, then a depth-4 {@code HtmlPage}
     * is created for the link target.
     *
     * @param url address of the page being scanned; the link target is
     *            appended to it
     * @param str text to scan
     */
    public synchronized static void findContanct(String url, String str) {
        Matcher matcher = CONTACT_LINK.matcher(str);
        while (matcher.find()) {
            String link = matcher.group();
            String message = "联系方式地址:" + link;
            System.out.println(message);
            HtmlPage.writLog(message);
            Matcher innerMatcher = QUOTED_TEXT.matcher(link);
            if (innerMatcher.find()) {
                String target = innerMatcher.group().replaceAll("\"", "");
                // NOTE(review): plain concatenation yields ".../" + "/" + target and
                // ignores any path already present in url; kept as-is so the
                // requested addresses do not change.
                new HtmlPage(url + "//" + target, 4);
            }
        }
    }

    /**
     * Extracts e-mail addresses from the given text and records the ones not
     * seen before.
     *
     * <p>Each address accepted by {@code HtmlPage.putEmail} is printed, logged
     * and appended to the file {@code 1.txt}. The file is opened only when
     * there is something to write and is always closed again. Every new
     * address in the text is recorded, not only the first one.
     *
     * @param str text to scan
     * @return the newly recorded addresses in order of appearance, or
     *         {@code null} when the text contains no new address
     */
    public synchronized static List<String> email(String str) {
        List<String> fresh = new ArrayList<String>();
        Matcher matcher = EMAIL_ADDRESS.matcher(str);
        while (matcher.find()) {
            String address = matcher.group();
            if (!HtmlPage.putEmail(address)) {
                continue; // already recorded earlier
            }
            System.out
                    .println("邮箱:------------------------------------------------------------------------- "
                            + address + "---------------");
            HtmlPage.writLog("邮箱: " + address);
            fresh.add(address);
        }
        if (fresh.isEmpty()) {
            // Kept as null (not an empty list) for callers that test for it.
            return null;
        }
        appendToEmailFile(fresh);
        return fresh;
    }

    /** Appends each address, followed by CRLF, to the end of 1.txt. */
    private static void appendToEmailFile(List<String> addresses) {
        RandomAccessFile out = null;
        try {
            out = new RandomAccessFile(new File("1.txt"), "rw");
            out.seek(out.length());
            for (String address : addresses) {
                out.write(address.getBytes());
                out.write("\r\n".getBytes());
            }
        } catch (IOException e) {
            // Covers both a failed open and a failed write.
            HtmlPage.writLog(addresses + " 写邮箱失败:" + e.getMessage());
            System.out.println("邮箱写入失败:" + e.getMessage());
            e.printStackTrace();
        } finally {
            if (out != null) {
                try {
                    out.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }
}
3. 页面内容抓取
/**
 * One page of the crawl: an address plus a depth that selects which
 * extraction step is applied to the page's lines.
 *
 * <p>Depth 1 looks for company links, depth 2 for a company's web site link,
 * depth 3 for contact links and e-mail addresses, depth 4 for e-mail
 * addresses only. Any other depth reads the page without extracting anything.
 */
public class HtmlPage {
    /** Charset used when the response does not name a usable one. */
    private static final String DEFAULT_CHARSET = "gbk";

    /** The charset parameter of a Content-Type header value. */
    private static final Pattern CHARSET_PARAM = Pattern.compile(
            "charset\\s*=\\s*[\"']?([^\\s;\"']+)", Pattern.CASE_INSENSITIVE);

    /** Addresses already reported; consulted by {@link #putEmail(String)}. */
    private static List<String> emailList = new ArrayList<String>();

    private String spec;
    private int depth;

    /**
     * Creates the page, prints and logs its address and depth, and, unless
     * the depth is 1, fetches and processes it immediately.
     *
     * @param urlString address of the page
     * @param depth     crawl depth, see the class description
     */
    public HtmlPage(String urlString, int depth) {
        this.spec = urlString;
        this.depth = depth;
        System.out.println("---------" + urlString + "----" + depth);
        HtmlPage.writLog("---------" + urlString + "----" + depth);
        if (depth != 1)
            pageCode();
    }

    /**
     * Downloads the page and feeds it line by line to the extraction step
     * selected by the depth.
     *
     * <p>Failures are logged and end processing of this page; nothing is
     * thrown for I/O problems. The response stream is always closed.
     */
    public void pageCode() {
        URL url;
        try {
            url = new URL(spec);
        } catch (MalformedURLException e) {
            HtmlPage.writLog(spec + " 初始化失败:" + e.getMessage());
            System.out.println("url初始化失败");
            e.printStackTrace();
            return;
        }

        HttpURLConnection connection;
        try {
            connection = (HttpURLConnection) url.openConnection();
            connection.setRequestProperty("User-Agent",
                    "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");
        } catch (IOException e) {
            HtmlPage.writLog(spec + " 打开网址失败:" + e.getMessage());
            System.out.println("打开网址失败");
            e.printStackTrace();
            return;
        }

        // No request body is ever written, so the output flag stays off.
        String charset = getCharset(connection.getContentType());

        java.io.InputStream in;
        try {
            in = connection.getInputStream();
        } catch (IOException e) {
            HtmlPage.writLog(spec + " 初始化输入流:" + e.getMessage());
            e.printStackTrace();
            return;
        }

        try {
            BufferedReader br = new BufferedReader(new InputStreamReader(in, charset));
            String line;
            while ((line = br.readLine()) != null) {
                process(line);
            }
        } catch (UnsupportedEncodingException e) {
            HtmlPage.writLog(spec + " 初始化输入流:" + e.getMessage());
            e.printStackTrace();
        } catch (IOException e) {
            HtmlPage.writLog(spec + " 读取输入流:" + e.getMessage());
            System.out.println(e.getMessage());
        } finally {
            // Release the response stream on every exit path.
            try {
                in.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /** Applies the extraction step that belongs to this page's depth. */
    private void process(String line) {
        switch (depth) {
        case 1:
            Regx.findCompany(line);
            break;
        case 2:
            Regx.findWebSite(line);
            break;
        case 3:
            Regx.findContanct(spec, line);
            Regx.email(line);
            break;
        case 4:
            Regx.email(line);
            break;
        default:
            break;
        }
    }

    /**
     * Picks the charset to decode the page with.
     *
     * <p>The charset parameter is matched case-insensitively; surrounding
     * quotes and any following parameters are ignored.
     *
     * @param contentType value of the Content-Type header, may be
     *                    {@code null}
     * @return the charset named in the header when this JVM supports it,
     *         otherwise {@code "gbk"}
     */
    private String getCharset(String contentType) {
        if (contentType == null)
            return DEFAULT_CHARSET;
        Matcher matcher = CHARSET_PARAM.matcher(contentType);
        if (!matcher.find())
            return DEFAULT_CHARSET;
        String name = matcher.group(1);
        try {
            return java.nio.charset.Charset.isSupported(name) ? name : DEFAULT_CHARSET;
        } catch (IllegalArgumentException e) {
            // Syntactically illegal charset name.
            return DEFAULT_CHARSET;
        }
    }

    /**
     * Remembers an address if it has not been seen before.
     *
     * @param str address to remember
     * @return {@code true} if the address was new and has been added,
     *         {@code false} if it was already present
     */
    public synchronized static boolean putEmail(String str) {
        if (!emailList.contains(str)) {
            emailList.add(str);
            return true;
        }
        return false;
    }

    /**
     * Appends one line, terminated by CRLF, to {@code log.txt}.
     *
     * <p>Failures are reported on the console and otherwise ignored.
     *
     * @param str text of the log line
     */
    public synchronized static void writLog(String str) {
        RandomAccessFile rd = null;
        try {
            rd = new RandomAccessFile(new File("log.txt"), "rw");
            // long offset: an int cast would overflow once the log passes 2 GiB
            rd.seek(rd.length());
            rd.write(str.getBytes());
            rd.write("\r\n".getBytes());
        } catch (IOException e) {
            // FileNotFoundException is an IOException and is handled the same way.
            System.out.println("日志写入失败!");
            e.printStackTrace();
        } finally {
            try {
                if (rd != null)
                    rd.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /** Prints every remembered address, one per line, in insertion order. */
    public void email() {
        int len = emailList.size();
        for (int i = 0; i < len; i++) {
            System.out.println(emailList.get(i));
        }
    }
}
4. 多线程
/**
 * Runnable wrapper that processes one {@code HtmlPage} and reports the start
 * and end of the work on the console and in the log.
 */
public class mRunable implements Runnable {
    private HtmlPage htmlPage;

    /** Creates a task with no page assigned. */
    public mRunable() {
    }

    /**
     * Creates a task for the given page.
     *
     * @param htmlPage page whose {@code pageCode()} is invoked by {@link #run()}
     */
    public mRunable(HtmlPage htmlPage) {
        this.htmlPage = htmlPage;
    }

    /**
     * Reports the start, processes the page, then reports completion. Both
     * reports carry the name of the executing thread.
     */
    @Override
    public void run() {
        report("滴滴开始了啦----------\n\n\n", "开始运行");
        htmlPage.pageCode();
        report("完成工作啦----------\n\n\n", "运行结束");
    }

    /** Prints a console banner and writes a log line for the current thread. */
    private static void report(String bannerTail, String logTail) {
        String banner = "\n\n线程---------------------------------------------------- ----"
                + Thread.currentThread().getName() + bannerTail;
        System.out.println(banner);
        String logLine = "线程" + Thread.currentThread().getName() + logTail;
        HtmlPage.writLog(logLine);
    }
}