Java 学习:网络爬虫——中国人才热线邮箱抓取

一个很简陋的邮箱抓取程序,抓取效率很低,纯当练手、熟悉键盘。

1. 函数入口

/**
 * Entry point of the crawler: walks the paginated job-search result pages
 * and hands each one to {@link HtmlPage} for scanning.
 */
public class Test01 {
	/** Part of the search URL that precedes the page number. */
	private static final String SEARCH_URL_PREFIX = "http://www.cjol.com/search/l2008/";

	/** Query string that follows the page number (URL-encoded keyword plus filters). */
	private static final String SEARCH_URL_SUFFIX =
			"/?Keyword=%E5%A4%96%E8%B4%B8%E4%B8%9A%E5%8A%A1%E5%91%98&KeywordType=3&RecentSelected=43";

	/** First and last result page to crawl (inclusive). */
	private static final int FIRST_PAGE = 1;
	private static final int LAST_PAGE = 20;

	/**
	 * Crawls result pages {@code FIRST_PAGE..LAST_PAGE} one after another on
	 * the calling thread.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		int page = FIRST_PAGE;
		while (page <= LAST_PAGE) {
			String searchUrl = SEARCH_URL_PREFIX + page + SEARCH_URL_SUFFIX;
			// A depth-1 page is not fetched by its constructor, so the fetch is
			// triggered explicitly here. (Wrapping the page in an mRunable and
			// starting a thread per page is the multi-threaded alternative.)
			HtmlPage searchPage = new HtmlPage(searchUrl, 1);
			searchPage.pageCode();
			page++;
		}
	}
}

2. 正则表达式

/**
 * Regular-expression helpers used by {@link HtmlPage} to scan one line of
 * HTML at a time. Each {@code find*} method follows every link it recognises
 * by constructing a new {@link HtmlPage} one level deeper, which fetches and
 * scans that page immediately.
 *
 * <p>All public methods are {@code static synchronized}, so they share the
 * class monitor and at most one thread scans at a time.
 */
public class Regx {
	/** File that newly discovered e-mail addresses are appended to. */
	private static final String EMAIL_FILE = "1.txt";

	/** Company link on a search-result page; group 1 is the recruiting-page URL. */
	private static final Pattern COMPANY_LINK = Pattern
			.compile("href=\"(http://[\\w-\\./]+)\">[\u4e00-\u9fa5]*有限公司</a>");

	/** Company web-site link on a recruiting page; group 1 is the homepage URL. */
	private static final Pattern WEBSITE_LINK = Pattern
			.compile("网址:<a href=\"(http://[\\w-\\./]+)\"");

	/** Link whose text starts with "contact" (any case); group 1 is the href value. */
	private static final Pattern CONTACT_LINK = Pattern.compile(
			"href=\"([\\w-\\./\\?=]+)\">contact", Pattern.CASE_INSENSITIVE);

	/** Loose e-mail address matcher. */
	private static final Pattern EMAIL_ADDRESS = Pattern
			.compile("[a-zA-Z0-9_.-]+@[a-zA-Z0-9-]+\\.[a-zA-Z]{2,4}");

	/**
	 * Scans a line of a search-result page for company links and crawls each
	 * company's recruiting page (depth 2).
	 *
	 * @param buf one line of HTML from the search-result page
	 */
	public synchronized static void findCompany(String buf) {
		Matcher matcher = COMPANY_LINK.matcher(buf);
		while (matcher.find()) {
			String recruitUrl = matcher.group(1);
			// Constructing a depth-2 page fetches and scans it right away.
			new HtmlPage(recruitUrl, 2);
			String message = "公司招聘页面地址:" + recruitUrl;
			System.out.println(message);
			HtmlPage.writLog(message);
		}
	}

	/**
	 * Scans a line of a recruiting page for the company's own web site and
	 * crawls that homepage (depth 3).
	 *
	 * @param buf one line of HTML from the recruiting page
	 */
	public synchronized static void findWebSite(String buf) {
		Matcher matcher = WEBSITE_LINK.matcher(buf);
		while (matcher.find()) {
			String homepageUrl = matcher.group(1);
			String message = "公司主页地址:" + homepageUrl;
			System.out.println(message);
			HtmlPage.writLog(message);
			new HtmlPage(homepageUrl, 3);
		}
	}

	/**
	 * Scans a line of a company homepage for a "contact" link and crawls the
	 * linked page (depth 4). The href is resolved against {@code url}, so
	 * relative links work even when the homepage URL has a path.
	 *
	 * @param url address of the page the line came from
	 * @param str one line of HTML from that page
	 */
	public synchronized static void findContanct(String url, String str) {
		Matcher matcher = CONTACT_LINK.matcher(str);
		while (matcher.find()) {
			String message = "联系方式地址:" + matcher.group();
			System.out.println(message);
			HtmlPage.writLog(message);
			new HtmlPage(resolveLink(url, matcher.group(1)), 4);
		}
	}

	/**
	 * Resolves {@code href} against {@code pageUrl}. Falls back to joining the
	 * two with {@code "//"} when either part is not a valid URL.
	 */
	private static String resolveLink(String pageUrl, String href) {
		try {
			return new java.net.URL(new java.net.URL(pageUrl), href).toString();
		} catch (java.net.MalformedURLException e) {
			return pageUrl + "//" + href;
		}
	}

	/**
	 * Extracts every e-mail address in {@code str} that has not been seen
	 * before, logs it, and appends it to the e-mail file.
	 *
	 * @param str one line of text to scan
	 * @return the addresses newly recorded from this line, or {@code null}
	 *         when the line contains none
	 */
	public synchronized static List<String> email(String str) {
		List<String> fresh = new ArrayList<String>();
		Matcher matcher = EMAIL_ADDRESS.matcher(str);
		while (matcher.find()) {
			String address = matcher.group();
			// putEmail returns false for addresses that were already recorded.
			if (!HtmlPage.putEmail(address)) {
				continue;
			}
			System.out
					.println("邮箱:------------------------------------------------------------------------- "
							+ address + "---------------");
			HtmlPage.writLog("邮箱: " + address);
			fresh.add(address);
		}
		if (fresh.isEmpty()) {
			return null;
		}
		appendToEmailFile(fresh);
		return fresh;
	}

	/**
	 * Appends each address on its own CRLF-terminated line to the e-mail
	 * file. Failures are logged and reported on stdout, never thrown; the
	 * file is always closed.
	 */
	private static void appendToEmailFile(List<String> addresses) {
		RandomAccessFile out = null;
		try {
			out = new RandomAccessFile(new File(EMAIL_FILE), "rw");
			out.seek(out.length());
			for (String address : addresses) {
				out.write(address.getBytes());
				out.write("\r\n".getBytes());
			}
		} catch (IOException e) {
			HtmlPage.writLog(addresses + " 写邮箱失败:" + e.getMessage());
			System.out.println("邮箱写入失败:" + e.getMessage());
			e.printStackTrace();
		} finally {
			if (out != null) {
				try {
					out.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
	}
}

3. 页面内容抓取

/**
 * One page of the crawl. {@link #pageCode()} downloads the page and passes
 * every line to the {@link Regx} scanner that belongs to the page's depth:
 * <ul>
 * <li>1 - search-result page: look for company links</li>
 * <li>2 - recruiting page: look for the company web site</li>
 * <li>3 - company homepage: look for a contact link and e-mail addresses</li>
 * <li>4 - contact page: look for e-mail addresses</li>
 * </ul>
 * The class also owns the shared list of e-mail addresses found so far and
 * the append-only log file.
 */
public class HtmlPage {
	/** Give up connecting to a host after this many milliseconds. */
	private static final int CONNECT_TIMEOUT_MS = 15000;

	/** Give up waiting for data from a host after this many milliseconds. */
	private static final int READ_TIMEOUT_MS = 30000;

	/** Encoding assumed when the server does not declare a usable one. */
	private static final String DEFAULT_CHARSET = "gbk";

	/** Extracts the charset token from a Content-Type header value. */
	private static final Pattern CHARSET_PATTERN = Pattern.compile(
			"charset\\s*=\\s*[\"']?([^\\s;\"']+)", Pattern.CASE_INSENSITIVE);

	private final String spec;
	private final int depth;

	/** Every distinct address recorded so far; guarded by the class monitor. */
	private static List<String> emailList = new ArrayList<String>();

	/**
	 * Creates the page and logs its URL and depth. Any page other than a
	 * depth-1 page is fetched and scanned immediately, before the
	 * constructor returns; depth-1 pages wait for an explicit
	 * {@link #pageCode()} call.
	 *
	 * @param urlString address of the page
	 * @param depth     crawl level, see the class description
	 */
	public HtmlPage(String urlString, int depth)  {
		this.spec = urlString;
		this.depth = depth;
		System.out.println("---------"+urlString +"----"+ depth);
		HtmlPage.writLog("---------"+urlString +"----"+ depth);
		if (depth !=1)
			pageCode();
	}

	/**
	 * Downloads the page line by line and dispatches each line to the
	 * scanners for this page's depth. Every failure is written to the log
	 * and ends the method quietly; the reader and the connection are always
	 * released.
	 */
	public void pageCode()  {
		URL url;
		try {
			url = new URL(spec);
		} catch (MalformedURLException e) {
			HtmlPage.writLog(spec+" 初始化失败:"+e.getMessage());
			System.out.println("url初始化失败");
			e.printStackTrace();
			return;
		}

		HttpURLConnection connection;
		try {
			connection = (HttpURLConnection) url.openConnection();
		} catch (IOException e) {
			HtmlPage.writLog(spec+" 打开网址失败:"+e.getMessage());
			System.out.println("打开网址失败");
			e.printStackTrace();
			return;
		}
		connection.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");
		// Without timeouts a single unresponsive host would block the crawl forever.
		connection.setConnectTimeout(CONNECT_TIMEOUT_MS);
		connection.setReadTimeout(READ_TIMEOUT_MS);

		// Reading the content type performs the request; null falls back to the default.
		String charset = getCharset(connection.getContentType());
		BufferedReader br;
		try {
			br = new BufferedReader(new InputStreamReader(
					connection.getInputStream(), charset));
		} catch (IOException e) {
			// Also covers UnsupportedEncodingException, which is an IOException.
			HtmlPage.writLog(spec+" 初始化输入流:"+e.getMessage());
			e.printStackTrace();
			connection.disconnect();
			return;
		}

		try {
			String line;
			while ((line = br.readLine()) != null) {
				scanLine(line);
			}
		} catch (IOException e) {
			HtmlPage.writLog(spec+" 读取输入流:"+e.getMessage());
			System.out.println(e.getMessage());
		} finally {
			try {
				br.close();
			} catch (IOException e) {
				e.printStackTrace();
			}
			connection.disconnect();
		}
	}

	/** Runs the scanners that apply to this page's depth on one line of HTML. */
	private void scanLine(String line) {
		switch (depth) {
		case 1:
			Regx.findCompany(line);
			break;
		case 2:
			Regx.findWebSite(line);
			break;
		case 3:
			Regx.findContanct(spec, line);
			Regx.email(line);
			break;
		case 4:
			Regx.email(line);
			break;
		default:
			break;
		}
	}

	/**
	 * Picks the character encoding out of a Content-Type header value.
	 *
	 * @param contentType header value, may be {@code null}
	 * @return the declared charset when present and supported by this JVM,
	 *         otherwise {@code "gbk"}
	 */
	private String getCharset(String contentType) {
		if (contentType == null)
			return DEFAULT_CHARSET;
		Matcher matcher = CHARSET_PATTERN.matcher(contentType);
		if (!matcher.find())
			return DEFAULT_CHARSET;
		String declared = matcher.group(1);
		try {
			if (java.nio.charset.Charset.isSupported(declared))
				return declared;
		} catch (IllegalArgumentException e) {
			// Syntactically illegal charset name: use the default instead.
		}
		return DEFAULT_CHARSET;
	}

	/**
	 * Records an address if it has not been seen before.
	 *
	 * @param str the e-mail address
	 * @return {@code true} when the address was new and has been added,
	 *         {@code false} when it was already recorded
	 */
	public synchronized static boolean putEmail(String str) {
		if (!emailList.contains(str)) {
			emailList.add(str);
			return true;
		}
		return false;
	}

	/**
	 * Appends one CRLF-terminated line to {@code log.txt}. Failures are
	 * reported on stdout and never thrown.
	 *
	 * @param str text of the log line
	 */
	public synchronized static void writLog(String str) {
		RandomAccessFile rd = null;
		try {
			rd = new RandomAccessFile(new File("log.txt"), "rw");
			// Seek with the long length; an int cast would wrap on very large logs.
			rd.seek(rd.length());
			rd.write(str.getBytes());
			rd.write("\r\n".getBytes());
		} catch (IOException e) {
			// FileNotFoundException is an IOException and is reported the same way.
			System.out.println("日志写入失败!");
			e.printStackTrace();
		} finally {
			try {
				if (rd!= null)
					rd.close();
			} catch (IOException e) {
				e.printStackTrace();
			}
		}
	}

	/** Prints every recorded e-mail address to stdout, one per line. */
	public void email() {
		// Same monitor as putEmail, so the list cannot change while it is printed.
		synchronized (HtmlPage.class) {
			for (String address : emailList) {
				System.out.println(address);
			}
		}
	}
}


4. 多线程

/**
 * Runs the crawl of a single {@link HtmlPage} on its own thread, announcing
 * the start and the end of the work on stdout and in the log file.
 */
public class mRunable implements Runnable {

	/** Page to crawl; stays {@code null} when the no-arg constructor is used. */
	private HtmlPage htmlPage;

	/**
	 * Creates a task without a page.
	 * NOTE(review): {@link #run()} dereferences the page unconditionally, so
	 * a task built this way fails with a NullPointerException when run.
	 */
	public mRunable() {
	}

	/**
	 * @param htmlPage page whose {@code pageCode()} is invoked by {@link #run()}
	 */
	public mRunable(HtmlPage htmlPage) {
		this.htmlPage = htmlPage;
	}

	/** Announces the start, crawls the page, then announces completion. */
	@Override
	public void run() {
		final String threadName = Thread.currentThread().getName();

		announce("\n\n线程---------------------------------------------------- ----" + threadName
				+ "滴滴开始了啦----------\n\n\n", "线程" + threadName + "开始运行");

		htmlPage.pageCode();

		announce("\n\n线程----------------------------------------------------  ----" + threadName
				+ "完成工作啦----------\n\n\n", "线程" + threadName + "运行结束");
	}

	/** Prints the banner to stdout and writes the shorter message to the log. */
	private static void announce(String banner, String logLine) {
		System.out.println(banner);
		HtmlPage.writLog(logLine);
	}
}


  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值