生成文本聚类java实现 (3)

最新推荐文章于 2021-02-18 19:40:22 发布

老和与宇华霖

最新推荐文章于 2021-02-18 19:40:22 发布

阅读量302

点赞数

文章标签： java 人工智能

很多网友关注过我之前的聚类研究，而这些研究后来基本上都转向了 carrot2。但由于 carrot2 对中文的处理很不可靠，我参考了网络上的一些资料，自己实现了一套方案，现在把全部代码贡献出来。

　代码的思路就是找字或者词出现的频度，并进行打分，最后按照出现次数和重要性，找出重要的语汇。现在贴出来一些可用的代码。

　ClusterBuilder.java

/**
 * Cluster builder: groups documents into clusters based on their tag
 * (keyword) lists.
 *
 * @author
 * @version created 2011-3-8 14:02:36
 */
public class ClusterBuilder {
	private static final Log LOG;
	private List<DocCluster> clusters;
	private ICTHit[] docs;
	private int maxLevels;
	private ClusteringOptions[] options;
	private boolean useTagsAsTitle;
	private String wordsExcluded;
	private static short[] bit1Table;

	static {
		LOG = LogFactory.getLog(ClusterBuilder.class.getName());

		// Lookup table: bit1Table[v] holds the number of 1 bits of the
		// 16-bit value v, for every v in 0..65535.
		bit1Table = new short[65536];
		for (int value = 0; value < bit1Table.length; value++) {
			bit1Table[value] = (short) Integer.bitCount(value);
		}
	}

	/**
	 * Counts the bits that are set to 1 in {@code n}.
	 *
	 * Delegates to {@link Long#bitCount(long)} so that every 64-bit pattern
	 * is handled, including values with the sign bit set (splitting such a
	 * value with the remainder operator produces negative 16-bit chunks,
	 * which cannot be used as lookup-table indices).
	 *
	 * @param n the 64-bit word to inspect
	 * @return the number of 1 bits in {@code n}, from 0 to 64
	 */
	private static int getValidBitCount(long n) {
		return Long.bitCount(n);
	}

	/**
	 * Returns the total number of 1 bits across all words of a hit bitmap.
	 *
	 * @param hits the bitmap words; {@code null} trips the assertion when
	 *             assertions are enabled and otherwise counts as 0
	 * @return the sum of the per-word bit counts
	 */
	private static int getDocHitCount(long[] hits) {
		assert (hits != null);
		if (hits == null) {
			return 0;
		}
		int total = 0;
		for (long word : hits) {
			total += getValidBitCount(word);
		}
		return total;
	}

	/**
	 * Creates a builder with no documents and no clustering options; these
	 * must be supplied through the setters before {@link #cluster()} is
	 * called.
	 */
	public ClusterBuilder() {
		// The shared bit-count table is already filled by the static
		// initializer, so it is not rebuilt for every instance.
		// Start with an empty exclusion list so that clustering does not fail
		// on a null string when setWordsExcluded is never called.
		this.wordsExcluded = "";
	}
	/**
	 * 
	 * @param docsToCluster 要聚类的记录列表
	 * @param exWords 不使用的主题词列表，多个词用西文逗号分隔。这些词将不会作为主题词。
	 * @param maxLevels 最大聚类级数
	 * @param useTagsAsTitle 是否使用主题词作为类别主题词。如果不使用，则根据文档标题自动生成类别主题词。
	 */
	public ClusterBuilder(ICTHit[] docsToCluster, String exWords, int maxLevels, boolean useTagsAsTitle) {
		this.useTagsAsTitle = useTagsAsTitle;
		this.wordsExcluded = exWords;
		this.maxLevels = maxLevels;
		this.docs = docsToCluster;
		this.options = new ClusteringOptions[3];
		this.options[0] = new ClusteringOptions();
		this.options[0].setDocMaxTagCount(10);
		this.options[0].setMinTagRelevance(60);
		this.options[0].setMinSameDocPercent(80);

		this.options[1] = new ClusteringOptions();
		this.options[1].setDocMaxTagCount(8);
		this.options[1].setMinTagRelevance(85);
		this.options[1].setMinSameDocPercent(70);
		this.options[1].setTagMinDocCount(2);
		this.options[1].setMinSameDocs(2);

		this.options[2] = new ClusteringOptions();
		this.options[2].setDocMaxTagCount(8);
		this.options[2].setMinTagRelevance(50);
		this.options[2].setMinSameDocPercent(70);
		this.options[2].setTagMinDocCount(2);
		this.options[2].setMinSameDocs(2);
	}
	/**
	 * Clusters the configured documents and stores the result in
	 * {@code clusters}.
	 *
	 * The top level is always built with {@code options[0]}. When
	 * {@code maxLevels} is greater than 1 and a second option set exists,
	 * every top-level cluster that is large enough (and is not the catch-all
	 * "其他" cluster) is clustered once more with {@code options[1]}. No more
	 * than two levels are built, whatever the value of {@code maxLevels}.
	 */
	public void cluster() {
		this.clusters = createLevelClusters(this.docs, 0, this.options[0]);
		// Without a second option set there is nothing to sub-cluster with.
		if ((this.maxLevels <= 1) || (this.options.length < 2)) {
			return;
		}
		for (DocCluster dc : this.clusters) {
			boolean tooSmall = dc.getDocList().length < this.options[0].getMinDocsToCluster();
			// Compare by value: the tag string need not be the same instance
			// as the literal.
			if (tooSmall || "其他".equals(dc.getTags())) {
				continue;
			}
			List<DocCluster> subs = createLevelClusters(dc.getDocList(), 1, this.options[1]);
			// A single sub-cluster adds no information, so it is not attached.
			if (subs.size() > 1) {
				dc.setSubclusters(subs);
			}
		}
	}
	/**
	 * Builds the clusters of one level.
	 *
	 * Steps: collect the usable tags of every document into a tag/document
	 * bitmap, drop tags that hit fewer documents than
	 * {@code levelOpt.getTagMinDocCount()}, derive {@code minSameDocs} from
	 * the average number of documents per remaining tag (this writes to
	 * {@code levelOpt}), merge related tags and finally assign the documents
	 * to the merged tags.
	 *
	 * @param docs the documents of this level
	 * @param level the level number (0 for the top level)
	 * @param levelOpt the clustering options of this level; its
	 *                 {@code minSameDocs} value is overwritten
	 * @return the clusters of this level
	 */
	private List<DocCluster> createLevelClusters(ICTHit[] docs, int level, ClusteringOptions levelOpt) {
		TagHitMatrix matrix = new TagHitMatrix(docs.length, levelOpt.getDocMaxTagCount());
		String[] excludedWords = splitExcludedWords(this.wordsExcluded);

		// Scan the documents and record, per tag, which documents carry it.
		for (int i = 0; i < docs.length; i++) {
			String[] tagList = docs[i].getTagList();
			if (tagList == null) {
				continue;
			}
			int validTagCount = 0;
			for (int tagIdx = 0; (tagIdx < tagList.length)
					&& (validTagCount < levelOpt.getDocMaxTagCount()); tagIdx++) {
				if (tagList[tagIdx] == null) {
					continue;
				}
				String tag = tagList[tagIdx].trim();
				// Empty tags, tags longer than 20 characters and excluded
				// tags are discarded.
				if ((tag.length() <= 0) || (tag.length() > 20) || isExcludedTag(tag, excludedWords)) {
					continue;
				}
				matrix.AddDocHit(tag, i);
				validTagCount++;
			}
		}

		int docCount = 0;
		int maxKwDocCount = 0;
		String kwWithMaxDocCount = "";
		List<String> entryListToRemove = new ArrayList<String>();
		LOG.debug("有效关键词：");
		for (Map.Entry<String, long[]> entry : matrix.entrySet()) {
			// Tags that hit fewer documents than the configured minimum are
			// scheduled for removal.
			int n = getDocHitCount(entry.getValue());
			if (n < levelOpt.getTagMinDocCount()) {
				entryListToRemove.add(entry.getKey());
			} else {
				LOG.debug(entry.getKey() + "(" + n + "), ");
				docCount += n;
			}
			if (n > maxKwDocCount) {
				maxKwDocCount = n;
				kwWithMaxDocCount = entry.getKey();
			}
		}
		LOG.debug("");

		LOG.debug("被忽略的关键词：");
		for (String removed : entryListToRemove) {
			LOG.debug(removed + ", ");
			matrix.remove(removed);
		}
		LOG.debug("");

		LOG.debug(entryListToRemove.size() + "个关键词被忽略。剩余" + matrix.size() + "个关键词。");
		LOG.debug("最大文档数的关键词：" + kwWithMaxDocCount + "，文档数：" + maxKwDocCount + "。");

		// Divide as double so the average is not truncated.
		double docCountPerTag = matrix.size() > 0 ? (double) docCount / matrix.size() : 0.0D;
		LOG.debug("关键词平均文档数：" + docCountPerTag);

		// Deeper levels require fewer shared documents; never less than one.
		levelOpt.setMinSameDocs((int) (docCountPerTag / (2.0D + level)));
		if (levelOpt.getMinSameDocs() < 1) {
			levelOpt.setMinSameDocs(1);
		}

		while (mergeClusters(matrix, levelOpt) > 0) {
		}
		return createResult(matrix, docs, level, levelOpt);
	}

	/**
	 * Splits the comma-separated exclusion list into its non-empty, trimmed
	 * words. A {@code null} or empty list yields an empty array.
	 */
	private static String[] splitExcludedWords(String wordsExcluded) {
		if ((wordsExcluded == null) || (wordsExcluded.length() == 0)) {
			return new String[0];
		}
		List<String> words = new ArrayList<String>();
		for (String word : wordsExcluded.split(",")) {
			String trimmed = word.trim();
			if (trimmed.length() > 0) {
				words.add(trimmed);
			}
		}
		return words.toArray(new String[words.size()]);
	}

	/**
	 * Tells whether {@code tag} contains, or is contained in, one of the
	 * excluded words.
	 */
	private static boolean isExcludedTag(String tag, String[] excludedWords) {
		for (String word : excludedWords) {
			if (tag.contains(word) || word.contains(tag)) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Merges related tags of {@code matrix} into combined, comma-separated
	 * tags.
	 *
	 * All tag pairs are scored with {@code getWordsRelevance} and sorted into
	 * buckets by score (0..100). The best-scoring pair is merged repeatedly;
	 * after every merge the pairs that referred to one of the two merged tags
	 * are re-scored against the merged tag. Original tags are identified by
	 * their non-negative position in the matrix snapshot, merged tags by the
	 * negated index into the list of merged entries (index 0 is a
	 * placeholder so that merged ids start at -1). Finally the merged
	 * originals are removed from the matrix and the surviving merged entries
	 * are added.
	 *
	 * @param matrix the tag/document bitmap to merge in place
	 * @param opt the options supplying the minimum relevance
	 * @return always 0: all merging happens in this single call
	 */
	private int mergeClusters(TagHitMatrix matrix, ClusteringOptions opt) {
		if (matrix.size() == 0) {
			return 0;
		}
		final int minRelevance = opt.getMinTagRelevance();

		// One bucket of candidate pairs per relevance score, 0 to 100.
		List<List<IdPair>> rankMatrix = new ArrayList<List<IdPair>>();
		for (int r = 0; r < 101; r++) {
			rankMatrix.add(new ArrayList<IdPair>());
		}

		// Snapshot of the matrix entries; positions in this list are the ids
		// of the original tags.
		List<Map.Entry<String, long[]>> matrix2List = new ArrayList<Map.Entry<String, long[]>>(matrix.entrySet());

		// Score every pair of tags; pairs below the threshold are ignored.
		for (int i1 = 0; i1 < matrix2List.size() - 1; i1++) {
			TagHitEntry hits1 = mapEntry2TagHitEntry(matrix2List.get(i1));
			for (int i2 = i1 + 1; i2 < matrix2List.size(); i2++) {
				TagHitEntry hits2 = mapEntry2TagHitEntry(matrix2List.get(i2));
				Object[] re = getWordsRelevance(hits1, hits2, null, 0, opt, matrix.hitsItemCount);
				int nRank = ((Integer) re[0]).intValue();
				if ((nRank >= 0) && (nRank >= minRelevance)) {
					rankMatrix.get(nRank).add(new IdPair(i1, i2));
				}
			}
		}

		List<String> tagListToRemove = new ArrayList<String>();
		List<TagHitEntry> entryListMerged = new ArrayList<TagHitEntry>();
		entryListMerged.add(new TagHitEntry("", null)); // placeholder for index 0
		HashSet<IdPair> idPairTable = new HashSet<IdPair>();

		while (true) {
			// Find the non-empty bucket with the highest relevance.
			int best = 100;
			while ((best >= 0) && (best >= minRelevance) && rankMatrix.get(best).isEmpty()) {
				best--;
			}
			if ((best < 0) || (best < minRelevance)) {
				break;
			}
			IdPair ip = rankMatrix.get(best).remove(0);

			TagHitEntry entryToMerge1 = resolveEntry(ip.Id1, matrix2List, entryListMerged);
			TagHitEntry entryToMerge2 = resolveEntry(ip.Id2, matrix2List, entryListMerged);
			String word1ToMerge = entryToMerge1.key;
			String word2ToMerge = entryToMerge2.key;
			assert ((word1ToMerge.length() > 0) && (word2ToMerge.length() > 0));

			String wordsMerged = word1ToMerge + "," + word2ToMerge;
			long[] lDocs0 = entryToMerge1.value;
			long[] lDocs1 = entryToMerge2.value;
			// The merged tag hits the union of both document sets. The union
			// goes into a fresh array; the source bitmaps stay untouched.
			long[] maxDocHitsMerged = new long[matrix.hitsItemCount];
			for (int k = 0; k < lDocs0.length; k++) {
				maxDocHitsMerged[k] = lDocs0[k] | lDocs1[k];
			}

			// Original tags are removed from the matrix at the end; merged
			// entries that were merged again are blanked out.
			if (ip.Id1 >= 0) {
				tagListToRemove.add(word1ToMerge);
			} else {
				entryListMerged.set(-ip.Id1, new TagHitEntry("", null));
			}
			if (ip.Id2 >= 0) {
				tagListToRemove.add(word2ToMerge);
			} else {
				entryListMerged.set(-ip.Id2, new TagHitEntry("", null));
			}
			entryListMerged.add(new TagHitEntry(wordsMerged, maxDocHitsMerged));
			int idMerged = -(entryListMerged.size() - 1);

			// Replace every pair that involves one of the merged tags by a
			// re-scored pair that involves the merged tag instead.
			for (int r = 0; r <= 100; r++) {
				List<IdPair> bucket = rankMatrix.get(r);
				// Pairs appended to this bucket during the scan are not
				// visited in this pass.
				int listCount = bucket.size();
				for (int j = 0; j < listCount; j++) {
					IdPair p = bucket.get(j);
					int otherId;
					if ((ip.Id1 == p.Id1) || (ip.Id2 == p.Id1)) {
						otherId = p.Id2;
					} else if ((ip.Id1 == p.Id2) || (ip.Id2 == p.Id2)) {
						otherId = p.Id1;
					} else {
						continue;
					}
					if (idMerged == otherId) {
						continue;
					}

					bucket.remove(j);
					j--;
					listCount--;

					IdPair pairMerged = new IdPair(idMerged, otherId);
					if (idPairTable.contains(pairMerged)) {
						continue;
					}

					TagHitEntry e2 = resolveEntry(otherId, matrix2List, entryListMerged);
					assert ((e2.key.length() != 0) && (!e2.key.equals(wordsMerged)));

					Object[] re = getWordsRelevance(new TagHitEntry(wordsMerged, maxDocHitsMerged), e2, null, 0,
							opt, matrix.hitsItemCount);
					int rank = ((Integer) re[0]).intValue();
					if ((rank < 0) || (rank <= minRelevance)) {
						continue;
					}
					rankMatrix.get(rank).add(pairMerged);
					idPairTable.add(pairMerged);
				}
			}
		}

		// Remove the tags that were merged into combined tags.
		for (String merged : tagListToRemove) {
			matrix.remove(merged);
		}
		// Add the combined tags; blanked-out placeholders have an empty key
		// and are skipped.
		for (TagHitEntry e : entryListMerged) {
			if (e.getKey().length() > 0) {
				matrix.put(e.getKey(), e.getValue());
			}
		}
		return 0;
	}

	/**
	 * Resolves a tag id: non-negative ids index the matrix snapshot, negative
	 * ids index (negated) the list of merged entries.
	 */
	private TagHitEntry resolveEntry(int id, List<Map.Entry<String, long[]>> matrix2List,
			List<TagHitEntry> entryListMerged) {
		return id >= 0 ? mapEntry2TagHitEntry(matrix2List.get(id)) : entryListMerged.get(-id);
	}

	/**
	 * Single-step variant of the merge: finds the one pair of tags with the
	 * highest relevance above the configured minimum and merges it.
	 *
	 * @param matrix the tag/document bitmap to merge in place
	 * @param opt the options supplying the minimum relevance
	 * @return 1 when a pair was merged, 0 when no pair qualified
	 */
	private int mergeClusters1(TagHitMatrix matrix, ClusteringOptions opt) {
		if (matrix.size() == 0) {
			return 0;
		}
		long[] maxDocHitsMerged = null;
		int nMaxRank = 0;
		String word1ToMerge = "";
		String word2ToMerge = "";

		List<Map.Entry<String, long[]>> matrix2List = new ArrayList<Map.Entry<String, long[]>>(matrix.entrySet());

		for (int i1 = 0; i1 < matrix2List.size() - 1; i1++) {
			TagHitEntry hits1 = mapEntry2TagHitEntry(matrix2List.get(i1));
			for (int i2 = i1 + 1; i2 < matrix2List.size(); i2++) {
				TagHitEntry hits2 = mapEntry2TagHitEntry(matrix2List.get(i2));
				Object[] re = getWordsRelevance(hits1, hits2, null, 0, opt, matrix.hitsItemCount);
				int nRank = ((Integer) re[0]).intValue();

				// Keep only a pair that beats both the best pair so far and
				// the minimum relevance.
				if ((nRank <= nMaxRank) || (nRank <= opt.getMinTagRelevance())) {
					continue;
				}
				nMaxRank = nRank;
				maxDocHitsMerged = (long[]) re[1];
				word1ToMerge = hits1.getKey();
				word2ToMerge = hits2.getKey();
			}
		}

		if ((word1ToMerge.length() == 0) || (word2ToMerge.length() == 0)) {
			return 0;
		}
		if (nMaxRank <= opt.getMinTagRelevance()) {
			return 0;
		}

		// The merged tag is never empty because it always contains the comma.
		String wordsMerged = word1ToMerge + "," + word2ToMerge;
		matrix.remove(word1ToMerge);
		matrix.remove(word2ToMerge);
		matrix.put(wordsMerged, maxDocHitsMerged);
		LOG.debug("(" + word1ToMerge + ") - (" + word2ToMerge + ")");
		return 1;
	}

	/**
	 * Scores how closely two tags are related, based on the documents they
	 * hit.
	 *
	 * The incoming values of {@code docHitsMerged} and {@code sameDocCount}
	 * are ignored; both are recomputed and handed back in the result.
	 *
	 * @param entry1 the first tag with its document bitmap
	 * @param entry2 the second tag with its document bitmap
	 * @param docHitsMerged ignored on input
	 * @param sameDocCount ignored on input
	 * @param opt the options supplying the thresholds
	 * @param hitsItemCount the number of words of a document bitmap
	 * @return a three-element array: the relevance as {@code Integer} (at
	 *         most 100), the union bitmap as {@code long[]} and the number of
	 *         shared documents as {@code Integer}
	 */
	private Object[] getWordsRelevance(TagHitEntry entry1, TagHitEntry entry2, long[] docHitsMerged, int sameDocCount,
			ClusteringOptions opt, int hitsItemCount) {
		docHitsMerged = new long[hitsItemCount];
		sameDocCount = 0;

		String tag1 = entry1.getKey();
		String tag2 = entry2.getKey();
		assert (tag2 != tag1);

		long[] bits1 = entry1.getValue();
		long[] bits2 = entry2.getValue();
		int count1 = getDocHitCount(bits1);
		int count2 = getDocHitCount(bits2);
		int docCountMin = Math.min(count1, count2);

		int docCountMerged = 0;
		int diffDocCount = 0;
		for (int w = 0; w < bits1.length; w++) {
			long union = bits1[w] | bits2[w];
			docHitsMerged[w] = union;
			docCountMerged += getValidBitCount(union);
			// Documents hit by exactly one of the two tags.
			diffDocCount += getValidBitCount(bits1[w] ^ bits2[w]);
			// Documents hit by both tags.
			sameDocCount += getValidBitCount(bits1[w] & bits2[w]);
		}

		// A tag that is a substring of the other one gets a bonus.
		boolean isSubstring = tag2.contains(tag1) || tag1.contains(tag2);
		if (isSubstring) {
			docCountMin += opt.getTagMinDocCount();
		}

		boolean unrelated = (sameDocCount == 0) && (!isSubstring);
		if (unrelated || (docCountMin < opt.getTagMinDocCount())) {
			return new Object[] { Integer.valueOf(0), docHitsMerged, Integer.valueOf(sameDocCount) };
		}

		int samePercent = toPercent(sameDocCount, docCountMerged);
		int samePercentMin = toPercent(sameDocCount, docCountMin);
		int diffPercent = toPercent(diffDocCount, docCountMerged);
		LOG.debug("相关性：" + tag1 + "(" + count1 + ")-(" + count2 + ")" + tag2);
		LOG.debug(", SamePercent=" + samePercent);
		LOG.debug(", SamePercentMin=" + samePercentMin);
		LOG.debug(", DiffPercent=" + diffPercent);

		boolean enoughShared = sameDocCount >= opt.getMinSameDocs();
		boolean closeEnough = (docCountMin < 10) || (samePercentMin >= opt.getMinSameDocPercent());
		int nRank = (enoughShared && closeEnough)
				? (int) Math.round((samePercentMin + samePercent) * 0.85D - diffPercent * 0.2D)
				: 0;
		if (isSubstring) {
			nRank += 80;
		}
		LOG.debug(", Rank=" + nRank);

		return new Object[] { Integer.valueOf(Math.min(nRank, 100)), docHitsMerged, Integer.valueOf(sameDocCount) };
	}

	/** Returns {@code part} as a rounded percentage of {@code whole}. */
	private static int toPercent(int part, int whole) {
		return (int) Math.round(part * 100.0D / whole);
	}

	/**
	 * Copies the key and the value of a matrix entry into a new
	 * {@code TagHitEntry}. The bitmap array itself is shared, not cloned.
	 */
	private TagHitEntry mapEntry2TagHitEntry(Map.Entry<String, long[]> e) {
		String tag = e.getKey();
		long[] hits = e.getValue();
		return new TagHitEntry(tag, hits);
	}

	/**
	 * Turns the merged tag matrix into the list of clusters of one level.
	 *
	 * Every document is assigned to the (merged) tag that matches most of the
	 * document's own tags; the first three tags of a document count double.
	 * Tags with a single document, clusters beyond
	 * {@code opt.getMaxClusterCount()} and documents without any match end up
	 * in a final catch-all cluster named "其他". Clusters are ordered by
	 * descending size.
	 *
	 * @param matrix the merged tag/document matrix
	 * @param docs the documents of this level
	 * @param level the level number stored in every cluster
	 * @param opt the options supplying the maximum cluster count
	 * @return the clusters, the catch-all cluster (if any) last
	 */
	private List<DocCluster> createResult(TagHitMatrix matrix, ICTHit[] docs, int level, ClusteringOptions opt) {
		int i;
		int j;
		Map<String, DocValue> clsIdList = new HashMap<String, DocValue>();
		for (Map.Entry<String, long[]> de : matrix.entrySet()) {
			clsIdList.put(de.getKey(), new DocValue());
		}

		List<Integer> otherIdList = new ArrayList<Integer>();

		// Decide for every document which cluster it belongs to.
		for (i = 0; i < docs.length; i++) {
			TagHitMatrix.ClusterDocInfo di = matrix.docs[i];
			assert (docs[i] != null);
			int maxTagHit = 0;
			String bestKey = null;

			for (Map.Entry<String, long[]> hits : matrix.entrySet()) {
				int tagHitCount = 0;
				// Surround with commas so that only whole tags match.
				String clsWordListStr = "," + hits.getKey() + ",";
				for (j = 0; j < di.TagCount; j++) {
					String tag = di.TagList[j];
					assert (tag.length() > 0);
					if (clsWordListStr.contains("," + tag + ",")) {
						tagHitCount += (j < 3 ? 2 : 1);
					}
				}
				// Only a strictly better score replaces the current best.
				if (tagHitCount > maxTagHit) {
					maxTagHit = tagHitCount;
					bestKey = hits.getKey();
				}
			}

			if (maxTagHit > 0) {
				clsIdList.get(bestKey).idList.add(Integer.valueOf(i));
			} else {
				otherIdList.add(Integer.valueOf(i));
			}
		}

		// Build the cluster list, sorted by descending document count.
		List<DocCluster> clusterList = new ArrayList<DocCluster>();
		for (Map.Entry<String, DocValue> kv : clsIdList.entrySet()) {
			List<Integer> ids = kv.getValue().idList;
			if (ids.size() <= 0) {
				continue;
			}
			if (ids.size() == 1) {
				// A single document does not make a cluster.
				otherIdList.add(ids.get(0));
				continue;
			}
			String[] docIds = new String[ids.size()];
			ICTHit[] docHits = new ICTHit[ids.size()];
			for (i = 0; i < ids.size(); i++) {
				ICTHit hit = docs[ids.get(i).intValue()];
				docIds[i] = hit.getDocId();
				docHits[i] = hit;
			}
			DocCluster cluster = new DocCluster();
			cluster.setDocIdList(docIds);
			cluster.setDocList(docHits);
			cluster.setLevel(level);
			cluster.setTags(kv.getKey());

			// Insert behind all clusters that are at least as large.
			i = 0;
			while ((i < clusterList.size()) && (docIds.length <= clusterList.get(i).getDocIdList().length)) {
				i++;
			}
			clusterList.add(i, cluster);
		}

		// Clusters beyond the maximum count are dissolved into "other".
		while (clusterList.size() > opt.getMaxClusterCount()) {
			DocCluster surplus = clusterList.remove(opt.getMaxClusterCount());
			otherIdList.addAll(clsIdList.get(surplus.getTags()).idList);
		}

		for (i = 0; i < clusterList.size(); i++) {
			DocCluster dc1 = clusterList.get(i);
			String[] tagList = dc1.getTags().split(",");
			String newTags = "";

			// Keep only the tags whose text occurs exactly once in the whole
			// tag string.
			for (j = 0; j < tagList.length; j++) {
				int first = dc1.getTags().indexOf(tagList[j]);
				int last = dc1.getTags().lastIndexOf(tagList[j]);
				if (first == last) {
					newTags = newTags + tagList[j] + ",";
				}
			}
			if ((newTags.trim().length() > 0) && (newTags.endsWith(","))) {
				newTags = newTags.substring(0, newTags.length() - 1);
			}
			dc1.setTags(newTags);

			dc1.setTitle("");

			if (this.useTagsAsTitle) {
				tagList = dc1.getTags().split(",");
				for (j = 0; (tagList != null) && (j < tagList.length); j++) {
					// Titles are limited to 16 characters.
					if ((dc1.getTitle() + tagList[j]).length() > 16) {
						break;
					}
					// Skip tags that overlap with an existing cluster title.
					boolean isSubstr = false;
					for (DocCluster c : clusterList) {
						if ((c.getTitle().length() <= 0)
								|| ((!c.getTitle().contains(tagList[j])) && (!tagList[j].contains(c.getTitle())))) {
							continue;
						}
						isSubstr = true;
						break;
					}
					if (!isSubstr) {
						dc1.setTitle(dc1.getTitle() + tagList[j] + ",");
					}
				}
				if ((dc1.getTitle().trim().length() > 0) && (dc1.getTitle().endsWith(","))) {
					dc1.setTitle(dc1.getTitle().substring(0, dc1.getTitle().length() - 1));
				}
			}

			// Compare by content: an empty title is not necessarily the same
			// String instance as the literal.
			if (dc1.getTitle().length() > 0) {
				continue;
			}
			// Fall back to the tags, cut at the last comma within 16
			// characters.
			dc1.setTitle(dc1.getTags());
			if (dc1.getTitle().length() <= 16) {
				continue;
			}
			String s = dc1.getTitle().substring(0, 16);
			int li = s.lastIndexOf(',');
			if (li > 0) {
				dc1.setTitle(s.substring(0, li));
			}
		}

		if (otherIdList.size() > 0) {
			DocCluster clusterOther = new DocCluster();
			clusterOther.setDocIdList(new String[otherIdList.size()]);
			clusterOther.setDocList(new ICTHit[otherIdList.size()]);
			clusterOther.setLevel(level);
			clusterOther.setTitle("其他");
			clusterOther.setTags("其他");
			for (int k = 0; k < otherIdList.size(); k++) {
				int idx = otherIdList.get(k).intValue();
				clusterOther.getDocIdList()[k] = docs[idx].getDocId();
				clusterOther.getDocList()[k] = docs[idx];
			}
			clusterList.add(clusterOther);
		}

		return clusterList;
	}

	/** Returns the clusters; they are assigned by {@code cluster()} or by the setter. */
	public List<DocCluster> getClusters() {
		return clusters;
	}

	/** Replaces the stored clusters. */
	public void setClusters(List<DocCluster> clusters) {
		this.clusters = clusters;
	}

	/** Returns the documents to cluster. */
	public ICTHit[] getDocs() {
		return docs;
	}

	/** Sets the documents to cluster. */
	public void setDocs(ICTHit[] docs) {
		this.docs = docs;
	}

	/** Returns the maximum number of cluster levels. */
	public int getMaxLevels() {
		return maxLevels;
	}

	/** Sets the maximum number of cluster levels. */
	public void setMaxLevels(int maxLevels) {
		this.maxLevels = maxLevels;
	}

	/** Returns the per-level clustering options. */
	public ClusteringOptions[] getOptions() {
		return options;
	}

	/** Sets the per-level clustering options. */
	public void setOptions(ClusteringOptions[] options) {
		this.options = options;
	}

	/** Tells whether cluster titles are built from the topic words. */
	public boolean isUseTagsAsTitle() {
		return useTagsAsTitle;
	}

	/** Sets whether cluster titles are built from the topic words. */
	public void setUseTagsAsTitle(boolean useTagsAsTitle) {
		this.useTagsAsTitle = useTagsAsTitle;
	}

	/** Returns the excluded words string. */
	public String getWordsExcluded() {
		return wordsExcluded;
	}

	/** Sets the excluded words string. */
	public void setWordsExcluded(String wordsExcluded) {
		this.wordsExcluded = wordsExcluded;
	}

	private class DocValue {
		public List<Integer> idList = new ArrayList();
		public String titleListStr = "";

		private DocValue() {
		}
	}
	/**
	 * 主题词ID对，主题词ID为该主题词在主题词文档映射表中的主键位置。
	* @author 
	* @version 创建时间：2011-3-9 下午02:52:44
	 */
	private class IdPair {
		public int Id1;
		public int Id2;

		public IdPair(int id1, int id2) {
			assert (id1 != id2);
			if (id1 < id2) {
				this.Id1 = id1;
				this.Id2 = id2;
			} else {
				this.Id1 = id2;
				this.Id2 = id1;
			}
		}

		public int hashCode() {
			return -1;
		}

		public boolean equals(Object o) {
			return (((IdPair) o).Id1 == this.Id1) && (((IdPair) o).Id2 == this.Id2);
		}
	}

	/**
	 * A tag together with its document bitmap; a plain mutable holder.
	 */
	public static class TagHitEntry {
		/** The tag, or several merged tags separated by commas. */
		public String key;
		/** The document bitmap of the tag. */
		public long[] value;

		/** Creates an entry whose key and value are both {@code null}. */
		public TagHitEntry() {
			this(null, null);
		}

		/** Creates an entry for the given tag and bitmap. */
		public TagHitEntry(String k, long[] v) {
			key = k;
			value = v;
		}

		public String getKey() {
			return key;
		}

		public long[] getValue() {
			return value;
		}
	}
}

ClusteringOptions.java

/**
 * Tunable thresholds for one clustering level. Every new instance starts
 * with the current values of the static {@code Def*} fields.
 *
 * @author
 * @version created 2011-3-8 10:23:27
 */
public class ClusteringOptions {
	public static int DefMaxClusterCount = 20;
	public static int DefMaxKeywordCount = 6;
	public static int DefMinWordsRelevance = 10;
	public static int DefTagMinDocCount = 3;
	public static int DefIgnoreSameDocs = 2;
	public static int DefSameDocPercent = 50;
	public static int DefMinDocsToCluster = 8;

	// Instance values; the initializers read the static defaults at
	// construction time.
	private int docMaxTagCount = DefMaxKeywordCount;
	private int maxClusterCount = DefMaxClusterCount;
	private int minDocsToCluster = DefMinDocsToCluster;
	private int minSameDocPercent = DefSameDocPercent;
	private int minSameDocs = DefIgnoreSameDocs;
	private int minTagRelevance = DefMinWordsRelevance;
	private int tagMinDocCount = DefTagMinDocCount;

	/** Creates options holding the current default values. */
	public ClusteringOptions() {
	}

	/** Returns the maximum number of tags used per document. */
	public int getDocMaxTagCount() {
		return docMaxTagCount;
	}

	public void setDocMaxTagCount(int docMaxTagCount) {
		this.docMaxTagCount = docMaxTagCount;
	}

	/** Returns the maximum number of clusters per level. */
	public int getMaxClusterCount() {
		return maxClusterCount;
	}

	public void setMaxClusterCount(int maxClusterCount) {
		this.maxClusterCount = maxClusterCount;
	}

	/** Returns the minimum cluster size for building sub-clusters. */
	public int getMinDocsToCluster() {
		return minDocsToCluster;
	}

	public void setMinDocsToCluster(int minDocsToCluster) {
		this.minDocsToCluster = minDocsToCluster;
	}

	/** Returns the minimum percentage of shared documents. */
	public int getMinSameDocPercent() {
		return minSameDocPercent;
	}

	public void setMinSameDocPercent(int minSameDocPercent) {
		this.minSameDocPercent = minSameDocPercent;
	}

	/** Returns the minimum number of shared documents. */
	public int getMinSameDocs() {
		return minSameDocs;
	}

	public void setMinSameDocs(int minSameDocs) {
		this.minSameDocs = minSameDocs;
	}

	/** Returns the minimum relevance two tags need to be merged. */
	public int getMinTagRelevance() {
		return minTagRelevance;
	}

	public void setMinTagRelevance(int minTagRelevance) {
		this.minTagRelevance = minTagRelevance;
	}

	/** Returns the minimum number of documents a tag must hit. */
	public int getTagMinDocCount() {
		return tagMinDocCount;
	}

	public void setTagMinDocCount(int tagMinDocCount) {
		this.tagMinDocCount = tagMinDocCount;
	}
}

DocCluster.java

/**
 * One cluster of documents: its documents and their ids, its level, its
 * tags and title, and optionally its sub-clusters.
 *
 * @author
 * @version created 2011-3-8 10:23:35
 */
public class DocCluster {
	private String[] docIdList;
	private ICTHit[] docList;
	private int level;
	private List<DocCluster> subclusters;
	private String tags;
	private String title;

	/** Returns the ids of the documents in this cluster. */
	public String[] getDocIdList() {
		return docIdList;
	}

	public void setDocIdList(String[] docIdList) {
		this.docIdList = docIdList;
	}

	/** Returns the documents in this cluster. */
	public ICTHit[] getDocList() {
		return docList;
	}

	public void setDocList(ICTHit[] docList) {
		this.docList = docList;
	}

	/** Returns the level of this cluster. */
	public int getLevel() {
		return level;
	}

	public void setLevel(int level) {
		this.level = level;
	}

	/** Returns the sub-clusters, or {@code null} when none were set. */
	public List<DocCluster> getSubclusters() {
		return subclusters;
	}

	public void setSubclusters(List<DocCluster> subclusters) {
		this.subclusters = subclusters;
	}

	/** Returns the tags of this cluster. */
	public String getTags() {
		return tags;
	}

	public void setTags(String tags) {
		this.tags = tags;
	}

	/**
	 * Returns the title, never {@code null}: an unset title is replaced by
	 * the empty string on first access.
	 */
	public String getTitle() {
		if (title == null) {
			title = "";
		}
		return title;
	}

	public void setTitle(String title) {
		this.title = title;
	}
}

ICTHit.java

/**
 * A document to be clustered: its id, its title and its keyword list.
 */
public class ICTHit implements Serializable {
	/** The keywords (tags) of the document. */
	private String[] TagList;
	private String docId;
	private String title;

	/** Returns the keyword array, or {@code null} when none was set. */
	public String[] getTagList() {
		return this.TagList;
	}

	public void setTagList(String[] tagList) {
		this.TagList = tagList;
	}

	/** Returns the document id. */
	public String getDocId() {
		return this.docId;
	}

	public void setDocId(String docId) {
		this.docId = docId;
	}

	/** Returns the document title. */
	public String getTitle() {
		return this.title;
	}

	public void setTitle(String title) {
		this.title = title;
	}

}

TagHitMatrix.java

/**
 * Maps every tag to a bitmap of the documents that carry it and keeps, per
 * document, the list of its recorded tags. Document {@code p} is
 * represented by bit {@code p % 62} of bitmap word {@code p / 62}; only 62
 * bits of every {@code long} are used.
 */
public class TagHitMatrix extends LinkedHashMap<String, long[]> {
	private static final long serialVersionUID = -7511464445378974433L;
	/** Number of document bits stored in one bitmap word. */
	private static final int BITS_PER_WORD = 62;
	public static int ii = 0;
	/** Per-document tag lists, indexed by document position. */
	public ClusterDocInfo[] docs;
	/** Number of {@code long} words of every bitmap. */
	public int hitsItemCount;

	/**
	 * @param DocCount the number of documents
	 * @param MaxTagCount the capacity of every document's tag list
	 */
	public TagHitMatrix(int DocCount, int MaxTagCount) {
		// Integer ceiling of DocCount / 62.
		this.hitsItemCount = (DocCount + BITS_PER_WORD - 1) / BITS_PER_WORD;
		this.docs = new ClusterDocInfo[DocCount];

		for (int i = 0; i < this.docs.length; i++) {
			this.docs[i] = new ClusterDocInfo(MaxTagCount);
		}
	}

	/**
	 * Records that the document at {@code Position} carries the tag
	 * {@code TagStr} (trimmed). Recording the same tag for the same document
	 * again has no effect, so a document's tag list never holds duplicates.
	 *
	 * @param TagStr the tag
	 * @param Position the document position, from 0 to the document count
	 *                 minus one
	 */
	public void AddDocHit(String TagStr, int Position) {
		TagStr = TagStr.trim();

		int word = Position / BITS_PER_WORD;
		long mask = 1L << (Position % BITS_PER_WORD);
		long[] DocHits = get(TagStr);
		if (DocHits == null) {
			DocHits = new long[this.hitsItemCount];
			put(TagStr, DocHits);
		}
		if ((DocHits[word] & mask) != 0L) {
			// Already recorded: appending again would duplicate the tag and
			// could overflow the fixed-size tag list.
			return;
		}
		DocHits[word] |= mask;
		ClusterDocInfo di = this.docs[Position];
		di.TagList[(di.TagCount++)] = TagStr;
	}

	/** The tags recorded for one document. */
	class ClusterDocInfo {
		public String[] TagList;
		/** Number of used slots of {@code TagList}. */
		public int TagCount;

		public ClusterDocInfo(int MaxTagCount) {
			this.TagList = new String[MaxTagCount];
			this.TagCount = 0;
		}
	}
}

测试方法：

/**
 * Sample driver: clusters the given documents, prints the clusters to the
 * console and appends them to a text file.
 *
 * @param icthits the documents to cluster
 * @throws IOException if the result file cannot be written
 */
public void test(ICTHit[] icthits) throws IOException {
	ClusterBuilder clusterBuilder = new ClusterBuilder();
	// The data set to cluster.
	clusterBuilder.setDocs(icthits);
	// Maximum number of levels. Two option sets are supplied below; the
	// builder's cluster() method reads options[0] and options[1].
	clusterBuilder.setMaxLevels(10);
	clusterBuilder.setUseTagsAsTitle(true);
	// Usually the search terms are used as the excluded words.
	clusterBuilder.setWordsExcluded("万美元,日本,公司,视频,北京时间,图文,新华网,新浪,消息,通讯,互联网,美国,中国");
	clusterBuilder
			.setOptions(new ClusteringOptions[] { new ClusteringOptions(), new ClusteringOptions() });

	// Run the clustering.
	clusterBuilder.cluster();

	// The path must not end with a blank. Closing the buffered writer in
	// finally also closes the underlying file writer, even on failure.
	BufferedWriter bw1 = new BufferedWriter(new FileWriter("c:/today-20110509-cluster.txt", true));
	try {
		// Output the result.
		if (clusterBuilder.getClusters() != null) {
			for (DocCluster docCluster : clusterBuilder.getClusters()) {
				System.out.println("tag:" + docCluster.getTags() + "("
						+ docCluster.getDocIdList().length + ")");
				bw1.write(docCluster.getTags() + "(" + docCluster.getDocIdList().length + ")" + "\r\n ");

				if (docCluster.getDocList() != null
						&& docCluster.getDocList().length > 0) {
					for (ICTHit co : docCluster.getDocList()) {
						System.out.println("     DocID: " + co.getDocId());
						bw1.write("标题为: " + co.getTitle() + ",ID为" + co.getDocId() + "\r\n ");
						// Documents without a tag list are valid input.
						String[] tags = co.getTagList();
						if (tags != null) {
							for (String tag : tags) {
								// Write the keyword itself, not the title
								// line once more.
								bw1.write("关键词为: " + tag + "\r\n ");
								System.out.println("     Key Word: " + tag);
							}
						}
						System.out.println("");
					}
					System.out.println("");
				} else {
					bw1.write("      该分类下无数据！" + "\r\n ");
				}
				bw1.write("-------------------------------------------------------------------------------\r\n");
			}
		}
	} finally {
		bw1.close();
	}
}

　以上代码只是一个示例，没有用在生产环境中，但核心方法已经齐备，大家可以引用到自己的项目当中。实际效果比 carrot2 的标准方法要好很多。

老和与宇华霖

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
生成文本聚类java实现 (3)

很多网友看到我的聚类的研究，到后来基本上都是到carrot2的研究上去了。但由于carrot2对中文的理解很不靠谱，所以参考了网络上的一些资料，现在贡献出来所有代码。　代码的思路就是找字或者词出现的频度，并进行打分，最后按照出现次数和重要性，找出重要的语汇。现在贴出来一些可用的代码。　ClusterBuilder.java /** * * @author *...
复制链接

扫一扫