C# 读取大文本出现System.OutOfMemoryException,两种解决方案

最新推荐文章于 2024-02-20 11:26:54 发布

PLA12147111

最新推荐文章于 2024-02-20 11:26:54 发布

阅读量7.9k

点赞数 1

分类专栏： C# 文章标签：读取超大文本内存溢出 C#

本文链接：https://blog.csdn.net/PLA12147111/article/details/111226114

版权

C# 专栏收录该内容

24 篇文章 1 订阅

订阅专栏

问题:读取一个444M的文本文件,出现内存溢出

方案1:利用FileStream读取byte,利用换行符次数为间隔区间,依次读取(边读边处理):

优缺点:速度快,不适合读取指定行(因为是用遍历的,所以读取指定行会比较慢)

解决思路:获取区间数组中最后一个换行符(\n的byte字节),以此为分界点,该换行符以后的数据,留给下一个区间来组装;

代码:

/// <summary>
/// Detects the text encoding of a stream. A byte-order mark (BOM) at the current
/// position decides the result immediately; only BOM-less content is buffered and
/// handed to the <c>IsUTF8Bytes</c> heuristic.
/// </summary>
/// <remarks>
/// The stream is always closed before this method returns (the previous implementation
/// closed it through its BinaryReader), so callers must not reuse it afterwards.
/// NOTE(review): <c>IsUTF8Bytes</c> is defined elsewhere; it is assumed to report whether
/// the buffer is well-formed UTF-8 - confirm against its implementation.
/// </remarks>
/// <param name="fs">Readable stream that supports <c>Length</c> and <c>Position</c>.</param>
/// <returns>
/// <see cref="Encoding.UTF8"/>, <see cref="Encoding.BigEndianUnicode"/> or
/// <see cref="Encoding.Unicode"/> when recognised, otherwise <see cref="Encoding.Default"/>.
/// </returns>
/// <exception cref="System.ArgumentNullException"><paramref name="fs"/> is null.</exception>
public Encoding GetType(Stream fs) {
    if (fs == null) {
        throw new System.ArgumentNullException("fs");
    }

    // Size of the prefix inspected when the content is too large for a single array.
    const int OversizeSampleBytes = 16 * 1024 * 1024;
    const byte LineFeed = 0x0A;

    try {
        byte[] head = new byte[3];
        int headLength = ReadBlock(fs, head, 0, head.Length);

        // BOM checks come first: they are authoritative and avoid loading the file.
        // 0xFE and 0xFF never occur in well-formed UTF-8, so testing the UTF-16 marks
        // before the heuristic cannot change the verdict of a correct UTF-8 validator.
        if (headLength >= 3 && head[0] == 0xEF && head[1] == 0xBB && head[2] == 0xBF) {
            return Encoding.UTF8;
        }
        if (headLength >= 2 && head[0] == 0xFE && head[1] == 0xFF) {
            return Encoding.BigEndianUnicode;
        }
        if (headLength >= 2 && head[0] == 0xFF && head[1] == 0xFE) {
            return Encoding.Unicode;
        }

        // No BOM: buffer the content for the heuristic. Content that does not fit in
        // one array is sampled instead of being silently skipped.
        long remaining = fs.Length - fs.Position;
        bool sampled = remaining > int.MaxValue - headLength;
        int bodyLength = sampled ? OversizeSampleBytes : (int)remaining;

        byte[] content = new byte[headLength + bodyLength];
        System.Buffer.BlockCopy(head, 0, content, 0, headLength);
        int total = headLength + ReadBlock(fs, content, headLength, bodyLength);

        if (sampled) {
            // Cut the sample after its last line feed so that it cannot end in the
            // middle of a multi-byte UTF-8 sequence (0x0A is never part of one).
            int lastLineFeed = System.Array.LastIndexOf(content, LineFeed, total - 1);
            if (lastLineFeed >= 0) {
                total = lastLineFeed + 1;
            }
        }
        if (total != content.Length) {
            System.Array.Resize(ref content, total);
        }

        return IsUTF8Bytes(content) ? Encoding.UTF8 : Encoding.Default;
    } finally {
        fs.Close();
    }
}

/// <summary>
/// Reads from <paramref name="source"/> until <paramref name="count"/> bytes are stored
/// or the stream ends, and returns the number of bytes actually read.
/// </summary>
private static int ReadBlock(Stream source, byte[] buffer, int offset, int count) {
    int total = 0;
    while (total < count) {
        int read = source.Read(buffer, offset + total, count - total);
        if (read <= 0) {
            break;
        }
        total += read;
    }
    return total;
}

/// <summary>
/// Guesses the text encoding of a byte buffer, typically the first chunk read from a file.
/// A leading byte-order mark (BOM) decides the result; without one the buffer is passed
/// to the <c>IsUTF8Bytes</c> heuristic.
/// </summary>
/// <remarks>
/// NOTE(review): <c>IsUTF8Bytes</c> is defined elsewhere; it is assumed to report whether
/// the buffer is well-formed UTF-8 - confirm against its implementation.
/// </remarks>
/// <param name="ss">Bytes to inspect; may be shorter than a BOM or empty.</param>
/// <returns>
/// <see cref="Encoding.UTF8"/>, <see cref="Encoding.BigEndianUnicode"/> or
/// <see cref="Encoding.Unicode"/> when recognised, otherwise <see cref="Encoding.Default"/>.
/// </returns>
/// <exception cref="System.ArgumentNullException"><paramref name="ss"/> is null.</exception>
public Encoding GetType2(byte[] ss)
{
    if (ss == null)
    {
        throw new System.ArgumentNullException("ss");
    }

    // Length guards keep buffers shorter than the BOM from throwing IndexOutOfRangeException.
    if (ss.Length >= 3 && ss[0] == 0xEF && ss[1] == 0xBB && ss[2] == 0xBF)
    {
        return Encoding.UTF8;
    }

    // The UTF-16 BOM is exactly two bytes; the byte that follows is already text and
    // must not take part in the comparison.
    if (ss.Length >= 2 && ss[0] == 0xFE && ss[1] == 0xFF)
    {
        return Encoding.BigEndianUnicode;
    }
    if (ss.Length >= 2 && ss[0] == 0xFF && ss[1] == 0xFE)
    {
        return Encoding.Unicode;
    }

    // 0xFE and 0xFF never occur in well-formed UTF-8, so running the heuristic after the
    // BOM checks cannot change the verdict of a correct UTF-8 validator.
    return IsUTF8Bytes(ss) ? Encoding.UTF8 : Encoding.Default;
}


/// <summary>
/// Opens the file read-only and returns the encoding reported by <c>GetType</c> for its content.
/// </summary>
/// <param name="filePath">Path of the file to inspect.</param>
/// <returns>The encoding reported by <c>GetType</c> for the file's stream.</returns>
public Encoding GetEncoding(string filePath) {
    // The using block releases the file handle even when detection throws.
    using (FileStream fs = new FileStream(filePath, FileMode.Open, FileAccess.Read)) {
        return GetType(fs);
    }
}



/// <summary>
/// Reads a text file through a FileStream and decodes it chunk by chunk, so the whole
/// file never has to be held in memory as one string. A chunk ends after
/// <paramref name="txtCount"/> line-feed bytes (0x0A); whatever is left when the file
/// ends, or when <c>isRuning</c> becomes false, forms the last chunk.
/// </summary>
/// <remarks>
/// The encoding is detected once, from the first chunk, via <c>FileTool.GetType2</c>.
/// Decoding goes through a single stateful <see cref="Decoder"/>, so a chunk boundary
/// that falls inside a multi-byte character (possible for UTF-16, where 0x0A is only
/// half of a code unit) does not corrupt the text: the incomplete character is emitted
/// at the start of the next chunk instead.
/// </remarks>
/// <param name="filePath">Path of the file to read; nothing happens when it does not exist.</param>
/// <param name="txtCount">Number of line feeds per chunk; values below 1 are treated as 1.</param>
public void FSLimitReadFileText(string filePath, int txtCount)
{
    if (!File.Exists(filePath))
    {
        return;
    }

    const byte LineFeed = 0x0A;

    // A non-positive count used to flush after every single byte, which split
    // multi-byte characters; one line per chunk is the smallest sensible unit.
    int lineFeedsPerChunk = Math.Max(1, txtCount);
    FileTool ft = new FileTool();

    using (FileStream fs = new FileStream(filePath, FileMode.Open, FileAccess.Read))
    {
        Encoding encoding = null;
        Decoder decoder = null;
        List<byte> pending = new List<byte>();
        int lineFeeds = 0;
        bool endOfInput = false;

        while (!endOfInput)
        {
            // A cleared isRuning flag is handled like end of file: bytes that are
            // already buffered are still decoded as the final chunk.
            int next = isRuning ? fs.ReadByte() : -1;
            endOfInput = next == -1;

            if (!endOfInput)
            {
                byte current = (byte)next;
                pending.Add(current);
                if (current == LineFeed)
                {
                    lineFeeds++;
                }
            }

            bool chunkReady = lineFeeds >= lineFeedsPerChunk || (endOfInput && pending.Count > 0);
            if (!chunkReady)
            {
                continue;
            }

            byte[] chunk = pending.ToArray();
            // Reset the buffer, otherwise the next chunk would repeat this data.
            pending.Clear();
            lineFeeds = 0;

            bool firstChunk = decoder == null;
            if (firstChunk)
            {
                encoding = ft.GetType2(chunk);
                if (encoding is UTF8Encoding)
                {
                    encoding = new UTF8Encoding(false);
                }
                decoder = encoding.GetDecoder();
            }

            // flush is true only for the last chunk so the decoder keeps a trailing
            // partial character for the next call instead of replacing it.
            char[] chars = new char[decoder.GetCharCount(chunk, 0, chunk.Length, endOfInput)];
            int charCount = decoder.GetChars(chunk, 0, chunk.Length, chars, 0, endOfInput);
            string text = new string(chars, 0, charCount);

            // Decoding does not remove a byte-order mark; drop it from the first chunk.
            if (firstChunk && text.Length > 0 && text[0] == '\uFEFF')
            {
                text = text.Substring(1);
            }

            // Run your per-chunk task on `text` here.
            // ...
        }
    }
}

方案2:利用IEnumerable进行迭代获取

优缺点:几乎不占内存,但遍历全部内容时速度比较慢

如果是只需要取其中几行的话可以考虑用这个,如果是要全部读取,那么不推荐,

同一个444M的文件,利用IEnumerable迭代完毕需要4分多钟,方案一也就13秒钟

(这里我需要将数据转换成字符串并依次处理,所以会比较慢;如果只是复制文件之类的操作,建议使用 BufferedStream)

/// <summary>
/// Lazily enumerates the lines of a text file and passes one window of them to <c>SetLog</c>.
/// With the default arguments the window is the 5000 lines that follow line 994400.
/// </summary>
/// <remarks>
/// Only the requested window is enumerated; lines after it are never read. To process
/// the whole file, enumerate <c>File.ReadLines</c> directly in a single foreach rather
/// than paging with Skip, because every Skip restarts reading from the first line.
/// </remarks>
/// <param name="filePath">Path of the file to read.</param>
/// <param name="skipLines">Number of leading lines to skip before the window starts.</param>
/// <param name="limit">Maximum number of lines to pass to <c>SetLog</c>.</param>
/// <returns>
/// Always null: the lines are delivered through <c>SetLog</c>, not through the return value.
/// </returns>
public string ReadFileLineText(string filePath, int skipLines = 994400, int limit = 5000)
{
    if (!File.Exists(filePath))
    {
        return null;
    }

    Encoding encoding = GetEncoding(filePath);
    if (encoding is UTF8Encoding)
    {
        encoding = new UTF8Encoding(false);
    }

    // foreach disposes the enumerator, which closes the file even though Take stops
    // before the end. The earlier Count() call was removed because it read the entire
    // file only to produce a value that was never used.
    foreach (string dataStr in File.ReadLines(filePath, encoding).Skip(skipLines).Take(limit))
    {
        SetLog(dataStr);
    }

    return null;
}

PLA12147111

关注

1
点赞
踩
11

收藏

觉得还不错? 一键收藏
1
评论
C# 读取大文本出现System.OutOfMemoryException,两种解决方案

问题:读取一个444M的文本文件,出现内存溢出方案1:利用FileStream将数据读取到byte数组中,然后按区间遍历该byte数组;优缺点:速度快,占系统内存可能出现的问题:一个汉字是两个字节,按区间读取转换成字符串,就有可能把一个汉字拆成两半,导致乱码;解决思路:获取区间数组中最后一个换行符(\n的byte字节),以此为分界点,该换行符以后的数据,留给下一个区间来组装;代码:/// <summary>/// 通过给定的文件流，判断文件的编码类型/// &.
复制链接

扫一扫

专栏目录