自娱自乐--爬虫java实践

最新推荐文章于 2024-09-20 09:39:51 发布

nianqian

最新推荐文章于 2024-09-20 09:39:51 发布

阅读量795

点赞数

分类专栏：自娱自乐　文章标签：爬虫、源码、网络爬虫

本文链接：https://blog.csdn.net/nianqian/article/details/46640437

版权

自娱自乐专栏收录该内容

3 篇文章 0 订阅

订阅专栏

编码没几年，以后估计也不会干码农的活了，但个人对编程实用性还是蛮感兴趣的，最近在网上搜罗资料时，发现一网站资源很丰富，萌生了想把每个有用的链接资源搜罗起来的想法，索性现在毕业没啥事，我就动手琢磨了一两小时搞了个小爬虫把这些超链接资源扒下来了，当然我的功能还是蛮简单的，在这也不想写出专业性的东西了，就把自己写的原始源码贴出来跟大伙分享分享吧。

爬虫程序就是自动搜索获取内容的程序，我的需求就是弄个搜罗网页内容的小爬虫出来，也叫网络爬虫，把有用的超链接信息汇总起来。

网络爬虫原理简单点（我理解的）就是由给定的URL抓取对应网页中的内容，这个给定的URL可以是人为赋值的，也可以由爬虫本身自动解析网页内容获取的URL，你所需要抓取的网页内容则是通过正则表达式来判断，而如何抓取就是网页数据流的实现过程了。所以一个小爬虫涉及的技术很基础，我就分类贴出鄙人的原始代码。

一、通过URL获取网页内容

	/**
	 * Downloads the resource at the given URL and returns its text content.
	 * <p>
	 * The content is read line by line and every line is terminated with a single
	 * {@code '\n'} in the result. Bytes are decoded with the platform default charset.
	 * Failures are printed to stderr and whatever was read up to that point (possibly
	 * nothing) is returned, so the result is never {@code null}.
	 *
	 * @param URLPath absolute URL of the page to fetch
	 * @return the page content; empty if the URL is malformed or the page cannot be read
	 */
	public StringBuffer getContext(String URLPath)
	{
		StringBuffer buffer = new StringBuffer();
		try {
			URL url = new URL(URLPath);
			URLConnection conn = url.openConnection();
			// try-with-resources: closing the reader also closes the connection's input
			// stream, which was previously left open after every fetch.
			// NOTE(review): pages served in a charset other than the platform default
			// will be decoded incorrectly - confirm the target site's encoding.
			try (BufferedReader reader = new BufferedReader(new InputStreamReader(conn.getInputStream()))) {
				String line;
				while ((line = reader.readLine()) != null) {
					buffer.append(line).append('\n');
				}
			}
		} catch (MalformedURLException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		}
		return buffer;
	}

二、解析网页内容

	/**
	 * Matches one anchor element that carries an href attribute.
	 * Group 1 = href value when double-quoted, group 2 = href value when single-quoted,
	 * group 3 = everything between the opening tag and the closing {@code </a>}.
	 * The {@code \b} after {@code <a} keeps tags such as {@code <abbr>} from matching, and
	 * the {@code \s} before {@code href} keeps attributes such as {@code data-href} from matching.
	 */
	private static final Pattern ANCHOR_PATTERN = Pattern.compile(
			"<a\\b[^>]*?\\shref\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)')[^>]*>(.*?)</a\\s*>",
			Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

	/** Matches any markup tag; used to reduce an anchor's inner markup to its visible text. */
	private static final Pattern TAG_PATTERN = Pattern.compile("<[^>]*>");

	/**
	 * Parses page content and collects its hyperlinks.
	 * <p>
	 * Each anchor is matched on its own, so several links on one line are kept apart.
	 * The href may be the last attribute of the tag and may contain {@code '&'} or
	 * {@code '@'}; the link text may contain whitespace or nested tags. Anchors without
	 * visible text are skipped. If two anchors show the same text, the later one wins.
	 * HTML entities are not decoded.
	 *
	 * @param htmlDoc page content; {@code null} yields an empty map
	 * @return map in document order: key = visible link text (tags removed, trimmed),
	 *         value = href attribute value exactly as written in the page
	 */
	public LinkedHashMap<String, String> urlDetector(String htmlDoc)
	{
		// key holds the text shown for the hyperlink, value holds its (relative) address
		LinkedHashMap<String, String> map = new LinkedHashMap<String, String>();
		if (htmlDoc == null)
		{
			return map;
		}

		Matcher matcher = ANCHOR_PATTERN.matcher(htmlDoc);
		while (matcher.find())
		{
			// exactly one of group 1 / group 2 is set, depending on the quote style
			String href = matcher.group(1) != null ? matcher.group(1) : matcher.group(2);
			String text = TAG_PATTERN.matcher(matcher.group(3)).replaceAll("").trim();
			if (!text.isEmpty())
			{
				map.put(text, href);
			}
		}
		return map;
	}

<span style="white-space:pre">		</span>/**
		 * 以下部分是本需求的特殊处理，贴出来方便以后自我翻看 
		 */
		//解析首页的内容,并分别访问这些超链接的网页，存入相应的文件
		HashMap<String, String> urlMap = t.urlDetector(stBuf.toString());
		StringBuffer buf1 = new StringBuffer();
		for(String keyName : urlMap.keySet() )
		{
			String urlName = urlMap.get(keyName);
			String uPath = UrlPath + urlName;
			if( !keyName.trim().isEmpty() && uPath.trim().matches("^.*htm((\\s*)|l)$") ) //  .matches(".*(htm)|.*(html)$")判断以htm或html结尾
			{
				System.out.println(keyName.trim() + "<<<<<<"+ uPath.trim() +">>>>>>>>>>");
				buf1.append(keyName + "\n");
				String BtStr = t.collectBTorOther(uPath, filePath + keyName + ".tmp");
				buf1.append(BtStr);
				buf1.append("-----------------------------------------------\n");
			}
		}

<span style="white-space:pre">	</span>/**
	 * 接上面，特殊处理
	 * @param uPath
	 * @param fPath
	 */
	public String collectBTorOther(String uPath, String fPath)
	{
		StringBuffer buf = this.getContext(uPath);
		this.writeToFile(fPath, buf);
		// 解析网页内容,返回需要收集的信息  
		final String patternBT = "http:[^>\"'<]*.torrent";
		StringBuffer rtBuf = new StringBuffer();
		
		Matcher mat = Pattern.compile(patternBT, Pattern.CASE_INSENSITIVE).matcher(buf);
		while(mat.find())
		{
			String tmStr =mat.group();
			rtBuf.append(tmStr.trim() + "\n");
		}
		return rtBuf.toString();
	}

三、将需要的部分存入本地

	/**
	 * Writes the buffer's content to the given file, replacing any existing content.
	 * <p>
	 * Text is encoded with the platform default charset. An I/O failure is printed to
	 * stderr and not propagated to the caller.
	 *
	 * @param filePath path of the file to write
	 * @param buffer   content to write
	 */
	public void writeToFile(String filePath, StringBuffer buffer)
	{
		// try-with-resources closes (and thereby flushes) the writer even when append
		// fails; previously a failed write left the file handle open.
		try (FileWriter document = new FileWriter(new File(filePath))) {
			document.append(buffer);
		} catch (IOException e) {
			e.printStackTrace();
		}
	}