import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.*;

public class HttpConnTest {
    public static void main(String[] args) throws Exception {
        // URL url = new URL("http://javaeye.com");
        URL url = new URL("http://blog.sina.com.cn/buptaa");
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        conn.connect();

        // Print the response headers
        Map<String, List<String>> header = conn.getHeaderFields();
        for (String key : header.keySet()) {
            System.out.println(key + ":" + header.get(key));
        }

        // Print the response body
        BufferedReader br = new BufferedReader(new InputStreamReader(conn.getInputStream(), "UTF-8"));
        String str = null;
        while ((str = br.readLine()) != null) {
            System.out.println(str);
        }
        br.close();
        conn.disconnect();
    }
}

Based on a breadth-first algorithm, a simple crawler built on top of the code above looks like this:

import java.io.*;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.*;
import java.util.regex.*;

public class HttpConnTest {
    // URLs discovered so far; the head of the list is the next page to fetch
    private List<URL> urlList = new ArrayList<URL>();
    private int count = 0;

    private void doHttpConn() throws Exception {
        count++;
        URL url = new URL("http://blog.sina.com.cn/buptaa"); // seed page
        if (!urlList.isEmpty()) {
            url = urlList.remove(0); // dequeue the oldest discovered URL (breadth-first order)
        }

        String urlRegx = "(http|www|ftp)(://)?(\\w+(-\\w+)*)"
                + "(\\.(\\w+(-\\w+)*))*((:\\d+)?)(/(\\w+(-\\w+)*))"
                + "*(\\.?(\\w)*)(\\?)?(((\\w*%)*(\\w*\\?)*(\\w*:)"
                + "*(\\w*\\+)*(\\w*\\.)*(\\w*&)*(\\w*-)*(\\w*=)*"
                + "(\\w*%)*(\\w*\\?)*(\\w*:)*(\\w*\\+)*(\\w*\\.)*"
                + "(\\w*&)*(\\w*-)*(\\w*=)*)*(\\w*)*)";
        Pattern p = Pattern.compile(urlRegx, Pattern.CASE_INSENSITIVE);

        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        conn.connect();

        // Print the response headers
        Map<String, List<String>> header = conn.getHeaderFields();
        for (String key : header.keySet()) {
            System.out.println(key + ":" + header.get(key));
        }

        // Print the response body and collect every URL that appears in it
        BufferedReader br = new BufferedReader(new InputStreamReader(conn.getInputStream(), "UTF-8"));
        String str = null;
        while ((str = br.readLine()) != null) {
            System.out.println(str);
            Matcher m = p.matcher(str);
            while (m.find()) {
                try {
                    urlList.add(new URL(m.group(0)));
                } catch (MalformedURLException e) {
                    // the pattern also matches strings like "www.x.com" that
                    // lack a protocol; skip anything new URL() rejects
                }
            }
        }
        br.close();
        conn.disconnect();

        System.out.println("-----------------------");
        System.out.println(urlList.size());
        for (URL aurl : urlList) {
            System.out.println(aurl.toString());
        }
    }

    public static void main(String[] args) throws Exception {
        HttpConnTest hct = new HttpConnTest();
        while (hct.count <= 3) { // fetch the seed page plus the first few discovered links
            hct.doHttpConn();
        }
        System.out.println("---DONE---" + hct.count);
    }
}
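One weakness of the crawler above is that urlList keeps growing and nothing records which pages were already fetched, so the same URL can be downloaded more than once. Below is a minimal standalone sketch of the same breadth-first idea using an ArrayDeque as the FIFO frontier and a HashSet of visited URLs. The class name BfsCrawlerSketch, the maxPages limit, and the simplified regex (absolute http/https URLs only, so every match is safe to pass to new URL()) are illustrative assumptions, not part of the original code.

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayDeque;
import java.util.HashSet;
import java.util.Queue;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class BfsCrawlerSketch {
    // Only absolute http/https URLs, so every match can be fed to new URL() safely
    private static final Pattern URL_PATTERN = Pattern.compile(
            "https?://[\\w.-]+(?::\\d+)?(?:/[\\w./?%&=+-]*)?",
            Pattern.CASE_INSENSITIVE);

    public static void main(String[] args) throws Exception {
        Queue<URL> queue = new ArrayDeque<URL>();    // FIFO frontier: breadth-first order
        Set<String> visited = new HashSet<String>(); // pages already fetched
        int maxPages = 10;                           // arbitrary stop condition (assumed)

        queue.add(new URL("http://blog.sina.com.cn/buptaa")); // seed URL from the post above

        while (!queue.isEmpty() && visited.size() < maxPages) {
            URL url = queue.poll();                  // take the oldest discovered URL
            if (!visited.add(url.toString())) {
                continue;                            // already fetched, skip duplicates
            }
            System.out.println("Fetching: " + url);
            try {
                HttpURLConnection conn = (HttpURLConnection) url.openConnection();
                BufferedReader br = new BufferedReader(
                        new InputStreamReader(conn.getInputStream(), "UTF-8"));
                String line;
                while ((line = br.readLine()) != null) {
                    Matcher m = URL_PATTERN.matcher(line);
                    while (m.find()) {
                        try {
                            queue.add(new URL(m.group())); // enqueue newly found links
                        } catch (MalformedURLException e) {
                            // ignore anything new URL() still rejects
                        }
                    }
                }
                br.close();
                conn.disconnect();
            } catch (IOException e) {
                System.out.println("Failed: " + url + " (" + e.getMessage() + ")");
            }
        }
        System.out.println("Visited " + visited.size() + " pages");
    }
}

Polling from the head of the queue while appending newly found links to the tail is exactly what makes the traversal breadth-first: every page reachable in d clicks from the seed is fetched before any page that needs d + 1.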