词频统计

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace ConsoleApplication1
{
    /*
     * 
     * Problem 2:   (Please write this one in C# if you know C#)
     *  Write a program that counts word occurrences in a directory of
     * UTF-8-encoded text files and outputs a file (UTF-8) containing a list
     * sorted (a) by frequency and (b) alphabetically within the same
     * frequency. Words should be normalized to lower case 
     * (i.e., "Hat", "hAt" and "HAT" should all be normalized to "hat").
     * 
     * You can assume words are space-separated tokens.
     * 
     * The directory may contain subdirectories, and all of them must be searched
     * recursively. The directory may also contain other types of files,
     * but only files with the extension ".txt" should be processed.
     * 
     * Assume the only punctuation marks are: , . !
     */

    /// <summary>
    /// Console program that counts word occurrences in every ".txt" file below a
    /// directory and writes the totals to a UTF-8 file, sorted by descending
    /// frequency and then alphabetically within the same frequency.
    /// </summary>
    class Program
    {
        /// <summary>Input directory used when no command-line argument is supplied.</summary>
        private const string DefaultInputDirectory = @"test";

        /// <summary>Output file used when fewer than two command-line arguments are supplied.</summary>
        private const string DefaultOutputFile = @"output.txt";

        /// <summary>Extension of the files that are processed.</summary>
        private const string TextFileExtension = ".txt";

        /// <summary>
        /// Characters that end a word: space and tab delimit tokens, while ',', '.'
        /// and '!' are the punctuation marks the task statement asks to strip.
        /// </summary>
        private static readonly char[] WordSeparators = { ' ', '\t', ',', '.', '!' };

        /// <summary>
        /// Entry point. Counts the words below the input directory, writes the sorted
        /// result, then waits for Enter so the console window stays open.
        /// </summary>
        /// <param name="args">
        /// Optional: <c>args[0]</c> is the input directory and <c>args[1]</c> the output
        /// file. Missing arguments fall back to "test" and "output.txt".
        /// </param>
        private static void Main(string[] args)
        {
            string inputDictPath = (args != null && args.Length > 0) ? args[0] : DefaultInputDirectory;
            string outputFilePath = (args != null && args.Length > 1) ? args[1] : DefaultOutputFile;

            if (Directory.Exists(inputDictPath))
            {
                Dictionary<string, int> dict = new Dictionary<string, int>();

                // Walk the tree and accumulate the counts in dict.
                WalkThroughDirectory(inputDictPath, dict);
                // Sort and write to the output file.
                Output(dict, outputFilePath);
            }
            else
            {
                // Report the problem instead of dying with an unhandled exception.
                Console.Error.WriteLine("Input directory not found: {0}", inputDictPath);
                Environment.ExitCode = 1;
            }

            // Wait for input
            System.Console.ReadLine();
        }

        /// <summary>
        /// Recursively walks a directory and counts the words of every file whose
        /// extension is ".txt" (compared ordinally, ignoring case).
        /// </summary>
        /// <param name="directory">Path of the directory to walk.</param>
        /// <param name="dict">Dictionary that accumulates the count of each word.</param>
        internal static void WalkThroughDirectory(string directory, Dictionary<string, int> dict)
        {
            DirectoryInfo di = new DirectoryInfo(directory);

            foreach (FileInfo file in di.GetFiles())
            {
                // Ordinal comparison: a file extension is an identifier, not linguistic text.
                if (string.Equals(file.Extension, TextFileExtension, StringComparison.OrdinalIgnoreCase))
                {
                    CountWords(file.FullName, dict);
                }
            }

            foreach (DirectoryInfo subDirectory in di.GetDirectories())
            {
                WalkThroughDirectory(subDirectory.FullName, dict);
            }
        }

        /// <summary>
        /// Reads one UTF-8 text file line by line and adds the occurrences of each
        /// lower-cased word to <paramref name="dict"/>.
        /// </summary>
        /// <param name="filepath">Path of the file to read.</param>
        /// <param name="dict">Dictionary that accumulates the count of each word.</param>
        internal static void CountWords(string filepath, Dictionary<string, int> dict)
        {
            // The using block releases the file handle even if reading throws.
            using (StreamReader sr = new StreamReader(filepath, Encoding.UTF8))
            {
                string line;
                while ((line = sr.ReadLine()) != null)
                {
                    // One split handles both the punctuation and the token separators;
                    // RemoveEmptyEntries drops the empty tokens between adjacent separators.
                    string[] tokens = line.Split(WordSeparators, StringSplitOptions.RemoveEmptyEntries);
                    foreach (string token in tokens)
                    {
                        // Invariant lower-casing keeps the keys identical on every machine
                        // regardless of the current culture.
                        string word = token.ToLowerInvariant();

                        int count;
                        dict.TryGetValue(word, out count); // count stays 0 for a new word
                        dict[word] = count + 1;
                    }
                }
            }
        }

        /// <summary>
        /// Writes one "word count" line per entry to a UTF-8 file, sorted (a) by
        /// descending frequency and (b) by ordinal word order within the same frequency.
        /// Each line is also echoed to the console.
        /// </summary>
        /// <param name="dict">Word counts to write.</param>
        /// <param name="outputFilePath">Path of the output file; an existing file is overwritten.</param>
        internal static void Output(Dictionary<string, int> dict, string outputFilePath)
        {
            // Enumerate the ordered sequence directly. Copying it back into a Dictionary
            // would lose the ordering guarantee, because Dictionary enumeration order is
            // undefined.
            IOrderedEnumerable<KeyValuePair<string, int>> sorted = dict
                .OrderByDescending(p => p.Value)
                .ThenBy(p => p.Key, StringComparer.Ordinal);

            // append: false overwrites the file if it already exists.
            using (StreamWriter sw = new StreamWriter(outputFilePath, false, Encoding.UTF8))
            {
                foreach (KeyValuePair<string, int> kvp in sorted)
                {
                    sw.WriteLine("{0} {1}", kvp.Key, kvp.Value);
                    // For debug
                    Console.WriteLine("{0} {1}", kvp.Key, kvp.Value);
                }
            }
        }
    }
}
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值