java实现文本分类中卡方特征选择

java在文本分类中卡方的特征选择,

 在文本分类的特征选择阶段,一般使用“词汇t与类别c不相关”来做原假设,计算出的开方值越大,说明对原假设的偏离越大,我们越倾向于认为原

假设的反面情况是正确的。选择的过程为每个词计算它与类别c的开方值,从大到小排个序(此时开方值越大越相关),取前k个就可以。所以卡方值越大,

词汇与分类越相关。

最后结果,一个特征对应一个最后的卡方值:

挺 81613.22937057534
特别 80715.8079021637
太 77021.62013902201
按摩 77012.332216897
喜欢 74483.97317313829
好吃 68723.25474753922
推荐 64809.29378047458
朋友 55988.55238512035
里 55371.21236337274
舒服 54673.57763700053
服务员 53918.5463140205
手法 53379.51974620014
体验 52445.83665612151




看代码实现:

package com.meituan.model.libsvm;

public class ChiSquare {
	private String word;
	private double A;
	private double B;
	private double C;
	private double D;
	public double chisq;

	public ChiSquare() {

	}

	public ChiSquare(String word) {
		this.word = word;
	}

	public ChiSquare(String word, int a, int b, int c, int d) {
		super();
		this.word = word;
		this.A = a;
		this.B = b;
		this.C = c;
		this.D = d;
		this.chisq = getChisq();
	}

	public String getWord() {
		return word;
	}

	public void setWord(String word) {
		this.word = word;
	}

	public double getA() {
		return A;
	}

	public void setA(int a) {
		this.A = a;
	}

	public double getB() {
		return B;
	}

	public void setB(int b) {
		this.B = b;
	}

	public double getC() {
		return C;
	}

	public void setC(int c) {
		this.C = c;
	}

	public double getD() {
		return D;
	}

	public void setD(int d) {
		this.D = d;
	}

	public void incrA() {
		A = A + 1;
	}

	public void incrB() {
		B++;
	}

	public void decrementC() {
		C--;
	}

	public void decrementD() {
		D--;
	}

	public void setChisq() {
		this.chisq = getChisq();
	}

	public double getChisq() {
		if (A == 0 && B == 0 && C == 0 && D == 0) {
			return 0;
		} else {
			return Math.pow((A * D - B * C) * 1.0, 2) * 155738.0
					/ ((A + C) * (A + B) * (B + D) * (C + D));
		}
	}

	@Override
	public String toString() {
		return String.format(
				"word=%s  ChiSquare=%f [A=%f , B=%f , C=%f , D=%f  ]",
				this.word, this.getChisq(), this.A, this.B, this.C, this.D);
	}

	public static void main(String[] args) {
		ChiSquare chiSquare = new ChiSquare("搀扶", 17, 1, 20936, 134784);
		System.out.println(chiSquare.toString());
	}
}





package com.meituan.model.libsvm;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.stream.Collectors;

import org.ansj.splitWord.analysis.ToAnalysis;

import com.meituan.model.util.Config;
import com.meituan.nlp.util.TextUtil;
import com.meituan.nlp.util.WordUtil;

public class ChisQuareSelect {
	private static String inputpath = Config.getString("data.path");
	private static Map<String, ChiSquare> mapchis = new HashMap<String, ChiSquare>();

	public static void main(String[] args) {
		System.setProperty("java.util.Arrays.useLegacyMergeSort", "true");
		int[] arr = getTotal(inputpath);
		System.out.println("arr is :"+Arrays.toString(arr));
		init(inputpath, arr);
		featureselect(inputpath);

		System.out.println("排序开始");

		List<Map.Entry<String, ChiSquare>> list = new ArrayList<Map.Entry<String, ChiSquare>>(
				mapchis.entrySet());

		Collections.sort(list, new Comparator<Map.Entry<String, ChiSquare>>() {

			@Override
			public int compare(Entry<String, ChiSquare> o1,
					Entry<String, ChiSquare> o2) {
				double result = o1.getValue().getChisq()
						- o2.getValue().getChisq();
				if (result > 0) {
					return -1;
				}else{
				
				return 1;
				}
			}
		});

		write(list, "file/chisq.txt", arr);
		System.out.println("success");

	}

	public static void write(List<Map.Entry<String, ChiSquare>> list, String out, int[] arr) {
		BufferedWriter bw = null;
		int sum = 0;
		for (int ar : arr) {
			sum += ar;
		}
		System.out.println("sum is :"+sum);
		try {
			bw = new BufferedWriter(new OutputStreamWriter(
					new FileOutputStream(out)));
			for (Map.Entry<String, ChiSquare> map : list) {
				bw.write(map.getKey() + " " + map.getValue().getChisq() 
						+ "\n");
				if(Math.random()<0.001){
					System.out.println(map.getValue().toString());
				}
			}

		} catch (Exception e) {

			e.printStackTrace();

		} finally {
			if (bw != null) {
				try {
					bw.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}

	}

	public static int[] getTotal(String in) {
		BufferedReader br = null;

		int[] arr = new int[2];
		int totalP = 0;
		int totalN = 0;

		try {
			br = new BufferedReader(new InputStreamReader(new FileInputStream(
					in)));
			String lines = br.readLine();

			while (lines != null) {
				String label = lines.split("\t")[1].equalsIgnoreCase("-1") ? "-1"
						: "1";
				if (label.equals("-1")) {
					totalN++;
				} else {
					totalP++;
				}

				lines = br.readLine();

			}

		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			if (br != null) {
				try {
					br.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
		arr[0] = totalP;
		arr[1] = totalN;
		return arr;
	}

	public static void init(String in, int[] arr) {
		BufferedReader br = null;

		try {
			br = new BufferedReader(new InputStreamReader(new FileInputStream(
					in)));
			String lines = br.readLine();

			while (lines != null) {
				String content = lines.split("\t")[0];
				Set<String> sets = ToAnalysis
						.parse(WordUtil.replaceAllSynonyms(TextUtil
								.fan2Jian(WordUtil.replaceAll(content
										.toLowerCase()))))
						.getTerms()
						.stream()
						.map(x -> x.getName())
						.filter(x ->WordUtil.isChineseChar(x) &&  !WordUtil.isStopword(x)
								&& !WordUtil.startWithNumeber(x))
						.collect(Collectors.toSet());

				for (String s : sets) {

					if (!mapchis.containsKey(s)) {
						mapchis.put(s, new ChiSquare(s, 0, 0, arr[0], arr[1]));
					}

				}

				lines = br.readLine();

			}

		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			if (br != null) {
				try {
					br.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
	}

	public static void featureselect(String in) {
		BufferedReader br = null;

		try {
			br = new BufferedReader(new InputStreamReader(new FileInputStream(
					in)));
			String lines = br.readLine();

			while (lines != null) {
				String content = lines.split("\t")[0];
				String label = lines.split("\t")[1].equalsIgnoreCase("-1") ? "-1"
						: "1";

				Set<String> sets = ToAnalysis
						.parse(WordUtil.replaceAllSynonyms(TextUtil
								.fan2Jian(WordUtil.replaceAll(content
										.toLowerCase()))))
						.getTerms()
						.stream()
						.map(x -> x.getName())
						.filter(x -> WordUtil.isChineseChar(x) && !WordUtil.isStopword(x)
								&& !WordUtil.startWithNumeber(x))
						.collect(Collectors.toSet());

				for (String s : sets) {
					if ("-1".equalsIgnoreCase(label)) {
						mapchis.get(s).incrA();
						mapchis.get(s).decrementC();
					} else if ("1".equalsIgnoreCase(label)) {
						mapchis.get(s).incrB();
						mapchis.get(s).decrementD();
					}

				}

				lines = br.readLine();

			}

		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			if (br != null) {
				try {
					br.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
	}

}


  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 3
    评论
评论 3
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值