网页消重算法（java）

最新推荐文章于 2021-09-16 14:25:01 发布

haha0515

最新推荐文章于 2021-09-16 14:25:01 发布

阅读量289

点赞数

文章标签： Java QQ 算法 J# .net

在爬虫的过程中，我们常常会遇到主题内容相同的网页，例如转载网页等等。由于标题不一样，内容有细微的偏差，爬虫可能会误认为两个网页是不同的。这个时候，我们就必须对网页内容进行过滤消重。几乎所有的消重技术都基于这样一个基本思想：为每个文档计算出一组指纹（fingerprint），若两个文档拥有一定数量的相同指纹，则认为这两个文档的内容重叠性较高，也即二者互为转载。（详细内容可参见《搜索引擎——原理、技术与系统》一书。）

根据书中的算法描述，我简单写了一段网页消重的 Java 代码，在此做一下代码笔记。

以下是算法的主要部分；具体算法在《搜索引擎——原理、技术与系统》一书中有详细介绍。

/**
 * Near-duplicate ("mirror") detection for text documents.
 *
 * <p>The pipeline implemented here is: segment a file into terms and count
 * them ({@link #getTerms}), keep the {@code N} most frequent multi-character
 * terms ({@link #topTen}), sort them (by pinyin, in {@link #main}), join them
 * into one string ({@link #concatenate}) and compare the MD5 digests of the
 * joined strings ({@link #mirror}).
 */
public class FileSimCal {

	/** Number of feature terms that make up a document fingerprint. */
	private static final int N = 10;

	public FileSimCal() {

	}

	/**
	 * Segments the given file into terms and counts how often each term occurs.
	 *
	 * @param filepath path of the text file to segment
	 * @return a map from term text to occurrence count, or {@code null} if the
	 *         file could not be opened or read (the stack trace is printed)
	 */
	public Map<String, Integer> getTerms(String filepath) {
		FileReader fr = null;

		Map<String, Integer> map = new HashMap<String, Integer>();

		try {
			// NOTE(review): FileReader decodes with the platform default charset;
			// confirm that matches the encoding of the input files.
			fr = new FileReader(filepath);
			IKSegmentation iks = new IKSegmentation(fr, true);
			Lexeme lexeme = null;
			while ((lexeme = iks.next()) != null) {
				String term = lexeme.getLexemeText();
				Integer count = map.get(term);
				map.put(term, count == null ? 1 : count + 1);
			}

			return map;

		} catch (FileNotFoundException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			// fr is still null when the FileReader constructor itself failed;
			// closing it unconditionally would replace the failure with an NPE.
			if (fr != null) {
				try {
					fr.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}

		return null;
	}

	/**
	 * Selects the most frequent feature terms from a term-count map.
	 *
	 * <p>Terms shorter than two characters are skipped. Terms are taken in
	 * descending count order; equal counts are ordered by the term text so the
	 * selection does not depend on the iteration order of the map.
	 *
	 * @param words_T term-count map, may be {@code null}
	 * @return at most {@code N} terms, most frequent first; the array is
	 *         shorter than {@code N} (possibly empty) when fewer terms qualify
	 *         and never contains {@code null} elements
	 */
	public String[] topTen(Map<String, Integer> words_T) {
		if (words_T == null || words_T.isEmpty()) {
			return new String[0];
		}

		PriorityQueue<Entry<String, Integer>> queue = new PriorityQueue<Entry<String, Integer>>(
				words_T.size(), new Comparator<Entry<String, Integer>>() {
			public int compare(Entry<String, Integer> o1, Entry<String, Integer> o2) {
				// Higher count first.
				int byCount = o2.getValue().compareTo(o1.getValue());
				if (byCount != 0) {
					return byCount;
				}
				// Deterministic tie-break on the term itself.
				return o1.getKey().compareTo(o2.getKey());
			}
		});
		queue.addAll(words_T.entrySet());

		String[] words = new String[N];
		int count = 0;
		while (count < N && !queue.isEmpty()) {
			String term = queue.poll().getKey();

			// A feature term should be a word, i.e. at least two characters long.
			if (term.length() < 2) {
				continue;
			}

			words[count++] = term;
		}

		if (count == N) {
			return words;
		}

		// Fewer than N terms qualified: trim instead of returning null padding.
		String[] trimmed = new String[count];
		System.arraycopy(words, 0, trimmed, 0, count);
		return trimmed;
	}

	/**
	 * Joins the first {@code N} feature terms into one fingerprint string.
	 *
	 * <p>The terms are also echoed to standard output, separated by spaces and
	 * followed by a line break.
	 *
	 * @param tenFrontWords feature terms, expected to hold at least {@code N}
	 *                      non-null entries
	 * @return the concatenation of the first {@code N} terms, or {@code null}
	 *         if the array is {@code null}, has fewer than {@code N} entries,
	 *         contains a {@code null} among the first {@code N}, or the
	 *         concatenation is empty
	 */
	public String concatenate(String[] tenFrontWords) {
		if (tenFrontWords == null || tenFrontWords.length < N) {
			return null;
		}

		// Reject null entries up front so "null" is never baked into a fingerprint.
		for (int i = 0; i < N; i++) {
			if (tenFrontWords[i] == null) {
				return null;
			}
		}

		StringBuilder sb = new StringBuilder();
		for (int i = 0; i < N; i++) {
			System.out.print(tenFrontWords[i] + " ");
			sb.append(tenFrontWords[i]);
		}
		System.out.println();

		if (sb.length() > 0) {
			return sb.toString();
		}

		return null;
	}

	/**
	 * Computes the MD5 digest of a fingerprint string.
	 *
	 * <p>The whole string is hashed as UTF-8 bytes. (Passing the character
	 * count as the byte length, as an earlier version did, silently truncates
	 * multi-byte text.) Digests produced by that earlier version for
	 * non-ASCII input will therefore differ from the ones returned here.
	 *
	 * @param words the string to hash, must not be {@code null}
	 * @return the digest as 32 upper-case hexadecimal digits
	 * @throws NullPointerException  if {@code words} is {@code null}
	 * @throws IllegalStateException if the runtime provides no MD5 implementation
	 */
	public String md5(String words) {
		if (words == null) {
			throw new NullPointerException("words must not be null");
		}

		try {
			MessageDigest digest = MessageDigest.getInstance("MD5");
			byte[] bytes = words.getBytes(java.nio.charset.Charset.forName("UTF-8"));
			byte[] hash = digest.digest(bytes);

			// MD5 is 128 bits, i.e. 32 hex digits; pad so leading zeros are kept.
			return String.format("%032X", new BigInteger(1, hash));
		} catch (NoSuchAlgorithmException e) {
			// Returning an empty string here would make every pair compare equal.
			throw new IllegalStateException("MD5 digest is not available", e);
		}
	}

	/**
	 * Tells whether two fingerprint strings have the same MD5 digest.
	 *
	 * @param p1 first fingerprint string, may be {@code null}
	 * @param p2 second fingerprint string, may be {@code null}
	 * @return {@code true} if both are non-null and their digests are equal;
	 *         {@code false} otherwise
	 */
	public boolean mirror(String p1, String p2) {
		// concatenate() returns null when no fingerprint could be built;
		// such documents are never reported as duplicates.
		if (p1 == null || p2 == null) {
			return false;
		}

		return md5(p1).equals(md5(p2));
	}

	/**
	 * Sorts the given terms in place by the pinyin spelling of each term.
	 * Does nothing for an empty array.
	 */
	private static void sortByPinyin(PingYinCompare pingYinCompare, String[] words)
			throws BadHanyuPinyinOutputFormatCombination {
		if (words.length == 0) {
			return;
		}

		String[] pinyin = pingYinCompare.pinYinArr(words);
		if (pinyin == null) {
			return;
		}

		pingYinCompare.quickSort(pinyin, 0, pinyin.length - 1, words);
	}

	public static void main(String[] args) {

		FileSimCal cal = new FileSimCal();
		PingYinCompare pingYinCompare = new PingYinCompare();

		// Extract the term counts of both documents.
		Map<String, Integer> fwmap = cal.getTerms("./txt/fzfenghuang.txt");
		Map<String, Integer> qqmap = cal.getTerms("./txt/fzqq.txt");
		if (fwmap == null || qqmap == null) {
			System.err.println("Could not read one of the input files; comparison aborted.");
			return;
		}

		// Pick the most frequent terms.
		String[] topTenwords_fw = cal.topTen(fwmap);
		String[] topTenwords_qq = cal.topTen(qqmap);

		try {
			// Order the terms by pinyin so both fingerprints use the same term order.
			sortByPinyin(pingYinCompare, topTenwords_fw);
			sortByPinyin(pingYinCompare, topTenwords_qq);

			// Join the sorted terms.
			String con_fw = cal.concatenate(topTenwords_fw);
			String con_qq = cal.concatenate(topTenwords_qq);

			// Compare the two fingerprints.
			if(cal.mirror(con_fw, con_qq)) {
				System.out.println("fzfenghuang.txt和fzqq.txt为互为转载文章！");
			} else {
				System.out.println("fzfenghuang.txt和fzqq.txt不是互为转载文章！");
			}
		} catch (BadHanyuPinyinOutputFormatCombination e) {
			e.printStackTrace();
		}
	}
}

拼音识别及排序比较：

import java.text.Collator;
import java.util.Comparator;
import java.util.Locale;

import net.sourceforge.pinyin4j.PinyinHelper;
import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType;
import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat;
import net.sourceforge.pinyin4j.format.HanyuPinyinToneType;
import net.sourceforge.pinyin4j.format.HanyuPinyinVCharType;
import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;

/**
 * Converts Chinese text to pinyin and sorts strings by their pinyin spelling.
 */
public class PingYinCompare implements Comparator<String> {

	/** Pinyin output format: lower case, no tone marks, 'v' for 'ü'. */
	private final HanyuPinyinOutputFormat format;

	/** Collator used by {@link #compare}; created once instead of per comparison. */
	private final Collator collator = Collator.getInstance(Locale.ENGLISH);

	public PingYinCompare() {
		this.format = new HanyuPinyinOutputFormat();
		format.setCaseType(HanyuPinyinCaseType.LOWERCASE);
		format.setToneType(HanyuPinyinToneType.WITHOUT_TONE);
		format.setVCharType(HanyuPinyinVCharType.WITH_V);
	}

	/**
	 * Compares two strings using the collation rules of {@link Locale#ENGLISH}.
	 *
	 * @return a negative value, zero or a positive value as {@code o1} sorts
	 *         before, equal to or after {@code o2}
	 */
	@Override
	public int compare(String o1, String o2) {
		return collator.compare(o1, o2);
	}

	/**
	 * Converts a string to its pinyin spelling, character by character.
	 *
	 * <p>For each character the first reading returned by pinyin4j is used.
	 * Characters for which pinyin4j returns no reading (it documents a
	 * {@code null} result for non-Chinese characters such as Latin letters and
	 * digits) are copied to the output unchanged.
	 *
	 * @param src the text to convert, may be {@code null}
	 * @return the pinyin spelling; an empty string if {@code src} is {@code null}
	 * @throws BadHanyuPinyinOutputFormatCombination if pinyin4j rejects the
	 *         configured output format
	 */
	public String str2PingYin(String src) throws BadHanyuPinyinOutputFormatCombination {
		if (src == null) {
			return "";
		}

		StringBuilder sb = new StringBuilder();
		for (char c : src.toCharArray()) {
			String[] pinyin = PinyinHelper.toHanyuPinyinStringArray(c, this.format);
			if (pinyin == null || pinyin.length == 0) {
				// No reading available: keep the character itself.
				sb.append(c);
			} else {
				sb.append(pinyin[0]);
			}
		}
		return sb.toString();
	}

	/**
	 * Quick-sorts {@code arr[left..right]} in place using {@link #compare},
	 * applying every swap to {@code src} as well so both arrays stay aligned.
	 *
	 * @param arr   pinyin strings used as sort keys
	 * @param left  index of the first element of the range (inclusive)
	 * @param right index of the last element of the range (inclusive)
	 * @param src   source strings, reordered in step with {@code arr}
	 */
	public void quickSort(String[] arr, int left, int right, String[] src) {
		// Nothing to sort for missing arrays or ranges of fewer than two elements.
		if (arr == null || src == null || left >= right) {
			return;
		}

		String middle, tmp, tmp2;
		int i = left;
		int j = right;

		// Pivot value; written this way to avoid int overflow of left + right.
		middle = arr[left + (right - left) / 2];

		do {
			// Advance i to the first element that is not smaller than the pivot.
			while ((compare(arr[i], middle) < 0) && i < right) {
				i++;
			}

			// Move j back to the first element that is not larger than the pivot.
			while ((compare(arr[j], middle) > 0) && j > left) {
				j--;
			}

			if (i <= j) {
				// Swap the keys ...
				tmp = arr[i];
				arr[i] = arr[j];
				arr[j] = tmp;

				// ... and the matching source strings.
				tmp2 = src[i];
				src[i] = src[j];
				src[j] = tmp2;

				i++;
				j--;
			}

		} while (i < j);

		if (left < j) {
			quickSort(arr, left, j, src);
		}

		if (right > i) {
			quickSort(arr, i, right, src);
		}
	}

	/**
	 * Converts every string of an array to pinyin.
	 *
	 * @param src strings to convert, may be {@code null}
	 * @return a new array of the same length holding the pinyin of each
	 *         element; an empty array if {@code src} is {@code null} or empty
	 * @throws BadHanyuPinyinOutputFormatCombination if pinyin4j rejects the
	 *         configured output format
	 */
	public String[] pinYinArr(String[] src) throws BadHanyuPinyinOutputFormatCombination {
		if (src == null || src.length == 0) {
			return new String[0];
		}

		String[] temp = new String[src.length];
		for (int i = 0; i < temp.length; i++) {
			temp[i] = str2PingYin(src[i]);
		}

		return temp;
	}

	public static void main(String[] args) {
		PingYinCompare compare = new PingYinCompare();

		try {
			String p1 = compare.str2PingYin("我是一个兵");
			String p2 = compare.str2PingYin("来自老百姓");

			System.out.println(p1);
			System.out.println(p2);
			System.out.println(compare.compare(p2, p1));

		} catch (BadHanyuPinyinOutputFormatCombination e) {
			e.printStackTrace();
		}
	}
}

参考文献：

《一种适合 Java 环境的中文快速排序和模糊检索方法》——刘焕焕，陆锋，赵云山

注：附件是三篇转载的文章

haha0515

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
网页消重算法（java）

在爬虫的过程中，我们常常会遇到主题内容相同的网页，例如转载网页等等。由于标题不一样，内容有细微的偏差，也许我们的爬虫会误认为两个网页是不同的。这个时候，我们就必须对网页内容过滤消重。几乎所有的消重技术都基于这样一个基本思想：为每个文档计算出一组指纹（fingerprint），若两个文档拥有一定数量的相同指纹，则认为这两个文档的内容重叠性较高，也即二者是内容转载的。（具体详细内容在搜 ...
复制链接

扫一扫