java 网页抓取数据

声明:本文章属转载。


import java.io.BufferedReader;  
import java.io.IOException;   
import java.io.InputStreamReader;   
import java.net.MalformedURLException;   
import java.net.URL;   
import java.util.ArrayList;   
import java.util.HashMap;   
import java.util.Iterator;   
import java.util.List;   
import java.util.regex.Matcher;   
import java.util.regex.Pattern;          
/**
 * Small regex-based page scraper: downloads a page as text and extracts its
 * title, hyperlinks, script blocks and style blocks.
 *
 * <p>The extraction is purely textual (regular expressions over the raw
 * markup); it is not an HTML parser. Tag matching is case-sensitive.
 */
public class WebContent {

    /** Character set used to decode downloaded pages. */
    private static final String PAGE_CHARSET = "utf-8";

    // Patterns are immutable and thread-safe, so compile them once.
    private static final Pattern TITLE_PATTERN =
            Pattern.compile("<title>.*?</title>", Pattern.CANON_EQ);
    private static final Pattern LINK_PATTERN = Pattern.compile(
            "<a[^>]*href=(\"([^\"]*)\"|\'([^\']*)\'|([^\\s>]*))[^>]*>(.*?)</a>",
            Pattern.DOTALL);
    private static final Pattern SCRIPT_PATTERN =
            Pattern.compile("<script.*?</script>", Pattern.DOTALL);
    private static final Pattern STYLE_PATTERN =
            Pattern.compile("<style.*?</style>", Pattern.DOTALL);
    private static final Pattern TAG_PATTERN = Pattern.compile("<.*?>");

    /**
     * Reads the whole content behind a URL as UTF-8 text.
     *
     * <p>Lines are concatenated without their line terminators, so the
     * result is a single line of markup.
     *
     * @param htmlurl the URL to read
     * @return the page content with line terminators removed
     * @throws MalformedURLException if {@code htmlurl} is not a valid URL
     * @throws IOException if the page cannot be opened or read
     */
    public String getOneHtml(final String htmlurl) throws IOException {
        final URL url;
        try {
            url = new URL(htmlurl);
        } catch (final MalformedURLException me) {
            System.out.println("你输入的URL格式有问题!请仔细输入");
            throw me;
        }
        System.out.println(url.getProtocol()); // protocol of the requested URL

        final StringBuilder htmlContent = new StringBuilder();
        // try-with-resources closes the reader (and the underlying stream)
        // even when readLine() fails part-way through the page.
        try (BufferedReader in = new BufferedReader(
                new InputStreamReader(url.openStream(), PAGE_CHARSET))) {
            String line;
            while ((line = in.readLine()) != null) {
                htmlContent.append(line);
            }
        }
        return htmlContent.toString();
    }

    /**
     * Extracts the page title.
     *
     * @param s the page markup
     * @return the text of every {@code <title>...</title>} element,
     *         concatenated and with all tags stripped; empty if none is found
     */
    public String getTitle(final String s) {
        final StringBuilder title = new StringBuilder();
        for (final String match : findAll(TITLE_PATTERN, s)) {
            title.append(match);
        }
        return outTag(title.toString());
    }

    /**
     * Extracts hyperlinks.
     *
     * @param s the page markup
     * @return every complete {@code <a ... href=...>...</a>} element, in
     *         document order; empty if none is found
     */
    public List<String> getLink(final String s) {
        return findAll(LINK_PATTERN, s);
    }

    /**
     * Extracts script blocks.
     *
     * @param s the page markup
     * @return every {@code <script ...>...</script>} element, in document
     *         order; empty if none is found
     */
    public List<String> getScript(final String s) {
        return findAll(SCRIPT_PATTERN, s);
    }

    /**
     * Extracts embedded style sheets.
     *
     * @param s the page markup
     * @return every {@code <style ...>...</style>} element, in document
     *         order; empty if none is found
     */
    public List<String> getCSS(final String s) {
        return findAll(STYLE_PATTERN, s);
    }

    /**
     * Strips markup tags.
     *
     * @param s the text to clean
     * @return {@code s} with every {@code <...>} sequence removed
     */
    public String outTag(final String s) {
        return TAG_PATTERN.matcher(s).replaceAll("");
    }

    /**
     * Downloads a page and extracts its title, style sheets, scripts and links.
     *
     * @param url the URL of the page to analyse
     * @return a map with the keys {@code "title"}, {@code "css"},
     *         {@code "script"} and {@code "link"}, or {@code null} if the
     *         page could not be read (the failure is reported on stderr)
     */
    public HashMap<String, List<String>> getFromUrls(final String url) {
        System.out.println("\n------------------开始读取网页(" + url   + ")--------------------");
        final String content;
        try {
            content = getOneHtml(url);
        } catch (final IOException | RuntimeException e) {
            // Callers rely on null to signal failure; report the cause
            // instead of discarding it.
            System.err.println("读取网页失败(" + url + "): " + e);
            return null;
        }
        System.out.println("------------------读取网页(" + url  + ")结束--------------------\n");
        System.out.println("------------------分析网页(" + url   + ")结果如下--------------------\n");

        final List<String> title = new ArrayList<String>();
        title.add(getTitle(content));

        final HashMap<String, List<String>> result = new HashMap<String, List<String>>();
        result.put("title", title);
        result.put("css", getCSS(content));
        result.put("script", getScript(content));
        result.put("link", getLink(content));
        return result;
    }

    /**
     * Reads URLs from standard input (one per line, terminated by a line
     * containing {@code go} or by end of input), then fetches each URL once
     * and prints what was extracted from it.
     *
     * @param args unused
     */
    public static void main(final String args[]) {
        final List<String> urls = new ArrayList<String>();
        System.out.print("输入URL,一行一个,输入结束后输入 go 程序开始运行:   \n");
        final BufferedReader br = new BufferedReader(new InputStreamReader(System.in));
        try {
            String line;
            // readLine() returns null at end of input; treat that like "go".
            while ((line = br.readLine()) != null) {
                final String url = line.trim();
                if (url.equals("go")) {
                    break;
                }
                if (!url.isEmpty()) {
                    urls.add(url);
                }
            }
        } catch (final IOException e) {
            System.err.println("读取输入失败: " + e);
        }

        final WebContent wc = new WebContent();
        for (final String url : urls) {
            // Fetch each page exactly once and skip pages that failed, so a
            // failure never re-prints the previous page's results.
            final HashMap<String, List<String>> extracted = wc.getFromUrls(url);
            if (extracted == null) {
                continue;
            }
            for (final String key : extracted.keySet()) {
                System.out.println("--" + key + "内容如下ssssssssssss:");
                for (final String item : extracted.get(key)) {
                    System.out.println(item);
                }
            }
        }
    }

    /** Collects every match of {@code pattern} in {@code text}, in order of occurrence. */
    private static List<String> findAll(final Pattern pattern, final String text) {
        final List<String> matches = new ArrayList<String>();
        final Matcher matcher = pattern.matcher(text);
        while (matcher.find()) {
            matches.add(matcher.group());
        }
        return matches;
    }
}


  • 1
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值