Hadoop 读取文本内容

// Reads a UTF-8 text file from the Hadoop FileSystem line by line and adds
// every line to wordSet, then prints the resulting set size.
// NOTE(review): `context` and `wordSet` come from enclosing code that is not
// shown in this snippet.
Configuration conf = context.getConfiguration();
FileSystem fs = FileSystem.get(conf);

// Alternative: take the path from the job configuration instead of hard-coding it.
// FSDataInputStream fin = fs.open(new Path(conf.get("emotionPath")));
FSDataInputStream fin = fs.open(new Path("/user/lvxinjian/negative.txt"));
BufferedReader in = null;
String line;
try {
	in = new BufferedReader(new InputStreamReader(fin, "UTF-8"));

	while ((line = in.readLine()) != null) {
		wordSet.add(line);
	}
	System.out.println(wordSet.size());

} finally {
	if (in != null) {
		// Closing the reader also closes the wrapped input stream.
		in.close();
	} else {
		// The reader was never created, so the raw stream has to be closed
		// directly; otherwise it would leak.
		fin.close();
	}
}


/**
 * Joins a word dictionary stored in Hadoop SequenceFiles with a tab-separated
 * text file of {@code wordIndex<TAB>value} lines, and saves the resulting
 * {@code word<TAB>value} lines through {@code FileTool.SaveListToFile}.
 */
public class GetSentenceWithPos {

	/** Common prefix of the dictionary SequenceFile parts; the part number is appended. */
	private static final String DICTIONARY_PATH_PREFIX = "/user/lvxinjian/tfidf/mediafile/dictionary.file-";

	/** Number of dictionary parts to load (dictionary.file-0 .. dictionary.file-2). */
	private static final int DICTIONARY_PART_COUNT = 3;

	/** Text file whose lines are expected to look like {@code wordIndex<TAB>value}. */
	private static final String WORD_INFO_PATH = "/user/lvxinjian/showTfidf49AllData/part-r-00000";

	/**
	 * Loads the dictionary parts, reads the index/value file, replaces every
	 * known index by its word and saves the result to {@code ./2013052802.txt}.
	 *
	 * <p>Lines that do not split into exactly two tab-separated fields, and
	 * lines whose index is not in the dictionary, are skipped.
	 *
	 * @throws IOException if any file cannot be opened or read
	 * @throws NumberFormatException if the first field of a two-field line is
	 *         not a valid integer
	 */
	public void read () throws IOException
	{
		Configuration conf = new Configuration();
		FileSystem fs = FileSystem.get(conf);

		// Dictionary: word index -> word.
		HashMap<Integer, String> wordList = new HashMap<Integer, String>();
		for (int part = 0; part < DICTIONARY_PART_COUNT; part++) {
			System.out.println("load dictionary " + part + "...");
			loadDictionary(fs, conf, new Path(DICTIONARY_PATH_PREFIX + part), wordList);
		}

		System.out.println("load wordindex_count...");
		// All lines are buffered first because the total is reported before processing starts.
		ArrayList<String> wordInfo = readLines(fs, new Path(WORD_INFO_PATH));
		System.out.println("sizef:\t"+ wordInfo.size());

		System.out.println("get word ...");
		ArrayList<String> lstResult = new ArrayList<String>();
		int count = 0;
		for (String str : wordInfo) {
			// Progress output every 1000 input lines (skipped lines are counted too).
			if (count % 1000 == 0) {
				System.out.println(count);
			}
			count++;

			String[] arr = str.split("\t");
			if (arr.length != 2) {
				continue;
			}
			// Parse and look up once; the map never stores null values, so a
			// null result means the index is not in the dictionary.
			String word = wordList.get(Integer.parseInt(arr[0]));
			if (word != null) {
				lstResult.add(word + "\t" + arr[1]);
			}
		}

		System.out.println("saving....");
		FileTool.SaveListToFile(lstResult, "./2013052802.txt", false, Charset.forName("utf-8"));
	}

	/**
	 * Reads one dictionary SequenceFile and stores each record in {@code target}
	 * as {@code value -> key}. The reader is always closed, including on failure.
	 *
	 * @param fs     file system to read from
	 * @param conf   configuration passed to the reader
	 * @param path   location of the SequenceFile
	 * @param target map that receives the entries; existing indexes are overwritten
	 * @throws IOException if the file cannot be opened or read
	 */
	private static void loadDictionary(FileSystem fs, Configuration conf, Path path,
			HashMap<Integer, String> target) throws IOException
	{
		SequenceFile.Reader reader = null;
		try {
			reader = new SequenceFile.Reader(fs, path, conf);
			// The key and value types must match the key/value types of the file being read.
			Text key = new Text();
			IntWritable val = new IntWritable();
			while (reader.next(key, val)) {
				target.put(val.get(), key.toString());
			}
		} finally {
			IOUtils.closeStream(reader);
		}
	}

	/**
	 * Reads a UTF-8 text file completely and returns its lines in order.
	 * Both the reader and the underlying stream are closed before returning.
	 *
	 * @param fs   file system to read from
	 * @param path location of the text file
	 * @return all lines of the file, without line terminators
	 * @throws IOException if the file cannot be opened or read
	 */
	private static ArrayList<String> readLines(FileSystem fs, Path path) throws IOException
	{
		ArrayList<String> lines = new ArrayList<String>();
		FSDataInputStream fin = fs.open(path);
		BufferedReader in = null;
		try {
			in = new BufferedReader(new InputStreamReader(fin, "UTF-8"));
			String line;
			while ((line = in.readLine()) != null) {
				lines.add(line);
			}
		} finally {
			IOUtils.closeStream(in);
			// Covers the case where the reader was never constructed.
			IOUtils.closeStream(fin);
		}
		return lines;
	}

	/**
	 * Command-line entry point: runs {@link #read()} and prints the stack
	 * trace if it fails with an {@link IOException}.
	 *
	 * @param args ignored
	 */
	static public void main(String [] args)
	{
		try {
			GetSentenceWithPos getSentenceWithPos = new GetSentenceWithPos();
			getSentenceWithPos.read();
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

}


  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值