Java 爬取 URL,并利用布隆过滤器(Bloom Filter)去重

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.BitSet;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Downloads one page, extracts the links that point at www.cncn.com and prints
 * each distinct link once, using a Bloom filter to detect repeats.
 */
public class UrlSpider {

    /** Address of the page that {@link #main(String[])} crawls. */
    private static final String START_URL = "https://www.cncn.com/";

    /** User-Agent header sent with the request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36";

    /**
     * Matches an anchor whose href is an http or https URL on www.cncn.com, that
     * carries at least one more character before the closing bracket of the tag,
     * and whose link text consists only of CJK ideographs. Group 1 is the URL.
     * Compiled once because the expression never changes.
     */
    private static final Pattern LINK_PATTERN = Pattern.compile(
            "<a href=\"(https?://www\\.cncn\\.com/[^\"]+)\"[^>]+>[\\u4e00-\\u9fa5]+</a>");

    /**
     * Crawls {@link #START_URL}, prints every distinct link found on it and then
     * the number of distinct links.
     *
     * @param args ignored
     * @throws Exception if the page cannot be downloaded
     */
    public static void main(String[] args) throws Exception {
        List<String> unique = distinctUrls(urlSpider(START_URL));
        for (String s : unique) {
            System.out.println(s);
        }
        System.out.println();
        System.out.println("用布隆算法去重后该页面url的数量为: " + unique.size());
    }

    /**
     * Returns the given URLs with repeats removed, keeping first-occurrence order.
     * Each URL is recorded in a fresh Bloom filter as soon as it is accepted, so a
     * later equal URL is recognised as already seen. Because a Bloom filter can
     * report false positives, a URL that was never seen may occasionally be dropped.
     *
     * @param urls URLs to filter; {@code null} is treated as empty and {@code null}
     *             elements are skipped
     * @return the URLs the filter had not seen before, in input order
     */
    static List<String> distinctUrls(String[] urls) {
        List<String> unique = new ArrayList<String>();
        if (urls == null) {
            return unique;
        }
        MyBloomFilter filter = new MyBloomFilter();
        for (String s : urls) {
            if (s != null && !filter.contain(s)) {
                filter.add(s);
                unique.add(s);
            }
        }
        return unique;
    }

    /**
     * Downloads the page at {@code urlAddress}, decoding it as GBK, and returns the
     * links extracted from it by {@link #getUrl(StringBuffer)}.
     *
     * @param urlAddress address of the page to download
     * @return the matched links in document order, possibly with repeats
     * @throws Exception if the address is malformed or the page cannot be read
     */
    public static String[] urlSpider(String urlAddress) throws Exception {
        URLConnection connection = new URL(urlAddress).openConnection();
        connection.addRequestProperty("User-Agent", USER_AGENT);

        // A local buffer, so repeated calls never see a previous page's content.
        StringBuffer page = new StringBuffer();
        BufferedReader reader = new BufferedReader(
                new InputStreamReader(connection.getInputStream(), "GBK"));
        try {
            String line;
            // readLine() returns null at end of stream, so test the reference
            // itself rather than calling a method on it. Lines are joined without
            // separators because readLine() strips the line terminator.
            while ((line = reader.readLine()) != null) {
                page.append(line);
            }
        } finally {
            reader.close();
        }
        return getUrl(page);
    }

    /**
     * Extracts every link matched by {@link #LINK_PATTERN} from the given markup.
     *
     * @param sb markup to scan; {@code null} yields an empty array
     * @return the captured URLs in the order they occur, never {@code null}
     */
    public static String[] getUrl(StringBuffer sb) {
        if (sb == null) {
            return new String[0];
        }
        List<String> found = new ArrayList<String>();
        Matcher urlMatcher = LINK_PATTERN.matcher(sb);
        while (urlMatcher.find()) {
            found.add(urlMatcher.group(1));
        }
        return found.toArray(new String[0]);
    }
}

/**
 * Bloom filter over strings, backed by a {@link BitSet} and one multiplicative
 * hash function per seed. Each instance owns its own bits.
 */
class MyBloomFilter {

    /** Number of bits; 2 << 25 is 2^26, which fits in an int and is a power of two. */
    private static final int DEFAULT_SIZE = 2 << 25;

    /** Prime multipliers; the number of seeds is the number of hash functions. */
    private static final int[] SEEDS = {5, 7, 11, 13, 31, 37, 61};

    /** Bit vector holding the marks of every added string. */
    private final BitSet bitSet = new BitSet(DEFAULT_SIZE);

    /** One hash function per entry of {@link #SEEDS}. */
    private final MyHash[] func = new MyHash[SEEDS.length];

    /** Creates an empty filter and builds its hash functions. */
    public MyBloomFilter() {
        for (int i = 0; i < SEEDS.length; i++) {
            func[i] = new MyHash(DEFAULT_SIZE, SEEDS[i]);
        }
    }

    /**
     * Marks the given string in the bit vector.
     *
     * @param url string to record; {@code null} is ignored
     */
    public void add(String url) {
        if (url == null) {
            return;
        }
        for (MyHash f : func) {
            bitSet.set(f.hash(url), true);
        }
    }

    /**
     * Tells whether every bit belonging to the given string is set.
     *
     * @param url string to look up
     * @return {@code false} if {@code url} is {@code null} or at least one of its
     *         bits is clear; {@code true} otherwise, which may be a false positive
     */
    public boolean contain(String url) {
        if (url == null) {
            return false;
        }
        for (MyHash f : func) {
            // One clear bit proves the string was never added.
            if (!bitSet.get(f.hash(url))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Multiplicative string hash reduced to the range {@code [0, cap)}.
     */
    public static class MyHash {

        /** Size of the target range; the mask in {@link #hash} assumes a power of two. */
        private final int cap;

        /** Multiplier applied before each character is added. */
        private final int seed;

        /**
         * @param cap  size of the target range
         * @param seed multiplier used by the hash
         */
        public MyHash(int cap, int seed) {
            this.cap = cap;
            this.seed = seed;
        }

        /**
         * Hashes the given string.
         *
         * @param url string to hash; must not be {@code null}
         * @return the accumulated value masked with {@code cap - 1}
         */
        public int hash(String url) {
            int no = 0;
            for (int i = 0; i < url.length(); i++) {
                // int overflow wraps silently; the mask below keeps the result in range.
                no = no * seed + url.charAt(i);
            }
            return (cap - 1) & no;
        }
    }
}

转载于:https://www.cnblogs.com/lfq-blog/p/10450662.html

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值