吃夜宵的时候利用短暂的时间做了个邮箱爬虫小程序,代码还没来得及优化,但功能都已实现。Eclipse 的项目结构如下图:
代码如下:
package com.jiaxun.test;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.util.Scanner;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
//小询
/**
 * Small e-mail harvesting crawler.
 *
 * <p>{@link #main(String[])} downloads one web page into {@code pagecontent.txt} and then
 * writes every e-mail address found in that file to {@code mails.txt}. Both files are
 * created relative to the current working directory.
 *
 * <p>All methods are best-effort: I/O failures are reported on {@code System.err} and the
 * method returns normally instead of throwing.
 */
public class EmailSpider {

    /** Page downloaded by {@link #saveToLocalFile()}. */
    private static final String PAGE_URL = "https://www.douban.com/group/topic/23934101/?start=600";

    /** Local file that receives the downloaded page. */
    private static final String PAGE_FILE = "pagecontent.txt";

    /** Local file that receives the extracted addresses. */
    private static final String MAILS_FILE = "mails.txt";

    /** Separator appended after every collected line and after every extracted address. */
    private static final String LINE_BREAK = "\r\n";

    /** Compiled once instead of on every {@link #parseMails(String)} call. */
    private static final Pattern MAIL_PATTERN = Pattern.compile("[\\w[.-]]+@[\\w[.-]]+\\.[\\w]+");

    /**
     * Downloads the text behind a URL, for example
     * {@code "https://www.douban.com/group/topic/23934101/?start=600"}.
     *
     * <p>The response is decoded as UTF-8 (not the platform default) so the result does not
     * depend on the machine the program runs on.
     *
     * @param myurl address of the page to fetch
     * @return the page with every line terminated by {@code "\r\n"}; an empty string when the
     *         URL is malformed or nothing could be read, or the part received so far when the
     *         transfer fails midway
     */
    public static String getPageContent(String myurl) {
        StringBuilder page = new StringBuilder();
        try {
            URLConnection conn = new URL(myurl).openConnection();
            // try-with-resources: the scanner is only closed when it was actually opened,
            // so a failed connection no longer ends in a NullPointerException.
            try (Scanner scanner = new Scanner(conn.getInputStream(), StandardCharsets.UTF_8.name())) {
                appendLines(scanner, page);
            }
        } catch (MalformedURLException e) {
            System.err.println("Invalid URL '" + myurl + "': " + e);
        } catch (IOException e) {
            System.err.println("Could not download '" + myurl + "': " + e);
        }
        return page.toString();
    }

    /**
     * Downloads the configured page and stores it in {@code pagecontent.txt}.
     */
    public static void saveToLocalFile() {
        writeTextFile(PAGE_FILE, getPageContent(PAGE_URL));
    }

    /**
     * Extracts all e-mail addresses from a text file and writes them, one per line, to
     * {@code mails.txt}.
     *
     * <p>When the input file cannot be read, the problem is reported and {@code mails.txt}
     * is left untouched.
     *
     * @param fileName path of the file to scan, e.g. the relative path {@code pagecontent.txt}
     */
    public static void emailSpider(String fileName) {
        StringBuilder text = new StringBuilder();
        try (Scanner scanner = new Scanner(new FileInputStream(fileName), StandardCharsets.UTF_8.name())) {
            // Lines keep a separator between them; gluing them together would let an address
            // at the end of one line merge with the first word of the next line.
            appendLines(scanner, text);
        } catch (IOException e) {
            System.err.println("Could not read '" + fileName + "': " + e);
            return;
        }
        writeTextFile(MAILS_FILE, parseMails(text.toString()));
    }

    /**
     * Finds every e-mail address in the given text.
     *
     * @param line text to search; {@code null} is treated like empty text
     * @return the matches in order of appearance, each followed by {@code "\r\n"};
     *         an empty string when there is no match
     */
    public static String parseMails(String line) {
        if (line == null) {
            return "";
        }
        StringBuilder mails = new StringBuilder();
        Matcher matcher = MAIL_PATTERN.matcher(line);
        while (matcher.find()) {
            mails.append(matcher.group()).append(LINE_BREAK);
        }
        return mails.toString();
    }

    /**
     * Runs the whole pipeline: download the page, then extract the addresses from it.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        saveToLocalFile();
        emailSpider(PAGE_FILE);
    }

    /** Appends every remaining line of {@code scanner} to {@code sink}, each followed by {@code "\r\n"}. */
    private static void appendLines(Scanner scanner, StringBuilder sink) throws IOException {
        while (scanner.hasNextLine()) {
            sink.append(scanner.nextLine()).append(LINE_BREAK);
        }
        // Scanner hides read errors behind hasNextLine() == false; surface them to the caller.
        IOException readFailure = scanner.ioException();
        if (readFailure != null) {
            throw readFailure;
        }
    }

    /** Writes {@code content} plus a trailing line break to {@code path} as UTF-8, reporting failures on stderr. */
    private static void writeTextFile(String path, String content) {
        try (PrintWriter out = new PrintWriter(
                new OutputStreamWriter(new FileOutputStream(path), StandardCharsets.UTF_8))) {
            out.println(content);
            // PrintWriter never throws on write errors; checkError() flushes and reports them.
            if (out.checkError()) {
                System.err.println("Error while writing '" + path + "'");
            }
        } catch (FileNotFoundException e) {
            System.err.println("Could not open '" + path + "' for writing: " + e);
        }
    }
}