当我们需要处理大量数据时,为了提高程序的处理速度,通常的做法是尽可能地优化算法。然而,当算法无法再优化时,就该考虑能否合理地将数据分割成若干个子集,然后进行并行处理。下面是一个比较Excel中两个Sheet页数据的例子,目的是找出A表中有而B表中没有的数据(两表各有10000行数据)。
/// <summary>
/// Console driver: loads two worksheets into DataTables, times one comparison
/// strategy from <c>TableCompare</c>, then prints the resulting rows and the
/// elapsed time in seconds.
/// </summary>
class Program
{
    static void Main(string[] args)
    {
        ExcelHelper excelHelper = new ExcelHelper(@"C:\Users\soul\Desktop\test.xlsx");
        DataTable tb1 = excelHelper.ExcelToDataTable("sheet1", true);
        DataTable tb2 = excelHelper.ExcelToDataTable("sheet2", true);
        TableCompare tc = new TableCompare();

        // Only the comparison itself is timed; loading and printing are excluded.
        Stopwatch stopwatch = Stopwatch.StartNew();
        DataTable retTb = tc.SingleThreadCompare(tb1, tb2);
        //DataTable retTb = tc.MultiThreadCompare(tb1, tb2);
        //DataTable retTb = tc.ParallelCompare(tb1, tb2);
        stopwatch.Stop();

        PrintDataTable(retTb);
        Console.WriteLine(stopwatch.Elapsed.TotalSeconds + "秒");
        Console.ReadKey();
    }

    /// <summary>
    /// Writes every row of <paramref name="table"/> to the console, one line per
    /// row, with each cell followed by a tab character.
    /// </summary>
    /// <param name="table">The table whose rows are printed.</param>
    static void PrintDataTable(DataTable table)
    {
        foreach (DataRow row in table.Rows)
        {
            foreach (DataColumn column in table.Columns)
            {
                Console.Write(row[column] + "\t");
            }
            Console.WriteLine();
        }
    }
}
/// <summary>
/// Computes the rows of one table that have no equal row in another table
/// (rows "in A but not in B"), using three interchangeable strategies:
/// a single thread, two manually created threads, and <c>Parallel.For</c>.
/// </summary>
/// <remarks>
/// Two rows are equal when, for every column index of the second table, the
/// cell values converted with <c>Convert.ToString</c> are identical strings.
/// Worker threads only read the input tables and write to their own slots of
/// a flag array; the result table is filled on the calling thread after all
/// workers have finished, so no DataTable is ever written concurrently and the
/// output keeps the row order of the first table.
/// </remarks>
class TableCompare
{
    /// <summary>
    /// Compares the tables on the calling thread.
    /// </summary>
    /// <param name="tb1">Table whose unmatched rows are returned.</param>
    /// <param name="tb2">Table that rows of <paramref name="tb1"/> are looked up in.</param>
    /// <returns>A new table with the schema of <paramref name="tb1"/> holding its rows that have no equal row in <paramref name="tb2"/>.</returns>
    public DataTable SingleThreadCompare(DataTable tb1, DataTable tb2)
    {
        bool[] missing = new bool[tb1.Rows.Count];
        MarkMissingRows(0, missing.Length, tb1, tb2, missing);
        return BuildResult(tb1, missing);
    }

    /// <summary>
    /// Compares the tables using two worker threads, each handling half of the
    /// rows of <paramref name="tb1"/>.
    /// </summary>
    /// <param name="tb1">Table whose unmatched rows are returned.</param>
    /// <param name="tb2">Table that rows of <paramref name="tb1"/> are looked up in.</param>
    /// <returns>A new table with the schema of <paramref name="tb1"/> holding its rows that have no equal row in <paramref name="tb2"/>.</returns>
    public DataTable MultiThreadCompare(DataTable tb1, DataTable tb2)
    {
        int rowCount = tb1.Rows.Count;
        int half = rowCount / 2;
        bool[] missing = new bool[rowCount];

        Thread t1 = new Thread(() => MarkMissingRows(0, half, tb1, tb2, missing));
        Thread t2 = new Thread(() => MarkMissingRows(half, rowCount, tb1, tb2, missing));
        t1.Start();
        t2.Start();

        // Block until BOTH workers are done. Join neither spins the CPU nor
        // depends on unsynchronized flag reads, and it guarantees that the
        // workers' writes to the flag array are visible here.
        t1.Join();
        t2.Join();

        return BuildResult(tb1, missing);
    }

    /// <summary>
    /// Compares the tables using <c>Parallel.For</c> over the rows of
    /// <paramref name="tb1"/>.
    /// </summary>
    /// <param name="tb1">Table whose unmatched rows are returned.</param>
    /// <param name="tb2">Table that rows of <paramref name="tb1"/> are looked up in.</param>
    /// <returns>A new table with the schema of <paramref name="tb1"/> holding its rows that have no equal row in <paramref name="tb2"/>.</returns>
    public DataTable ParallelCompare(DataTable tb1, DataTable tb2)
    {
        bool[] missing = new bool[tb1.Rows.Count];

        // Each iteration writes only its own array slot, so the parallel body
        // needs no lock; the DataTable write happens afterwards in BuildResult.
        Parallel.For(0, missing.Length, i =>
        {
            missing[i] = !ContainsRow(tb2, tb1.Rows[i]);
        });

        return BuildResult(tb1, missing);
    }

    /// <summary>
    /// Flags every row of <paramref name="tb1"/> with index in
    /// [<paramref name="begin"/>, <paramref name="end"/>) that has no equal row
    /// in <paramref name="tb2"/>.
    /// </summary>
    private static void MarkMissingRows(int begin, int end, DataTable tb1, DataTable tb2, bool[] missing)
    {
        for (int i = begin; i < end; i++)
        {
            missing[i] = !ContainsRow(tb2, tb1.Rows[i]);
        }
    }

    /// <summary>
    /// Returns true when <paramref name="table"/> contains a row equal to
    /// <paramref name="candidate"/>; stops at the first match.
    /// </summary>
    private static bool ContainsRow(DataTable table, DataRow candidate)
    {
        int columnCount = table.Columns.Count;
        for (int j = 0; j < table.Rows.Count; j++)
        {
            if (RowsEqual(candidate, table.Rows[j], columnCount))
            {
                return true;
            }
        }
        return false;
    }

    /// <summary>
    /// Compares the first <paramref name="columnCount"/> cells of two rows by
    /// their string representation; stops at the first differing cell.
    /// </summary>
    private static bool RowsEqual(DataRow left, DataRow right, int columnCount)
    {
        for (int x = 0; x < columnCount; x++)
        {
            if (Convert.ToString(left[x]) != Convert.ToString(right[x]))
            {
                return false;
            }
        }
        return true;
    }

    /// <summary>
    /// Creates an empty clone of <paramref name="source"/> and imports, in
    /// original order, every row whose flag in <paramref name="missing"/> is set.
    /// </summary>
    private static DataTable BuildResult(DataTable source, bool[] missing)
    {
        DataTable result = source.Clone();
        for (int i = 0; i < missing.Length; i++)
        {
            if (missing[i])
            {
                result.ImportRow(source.Rows[i]);
            }
        }
        return result;
    }
}
SingleThreadCompare只使用一个线程做处理,即在主线程中进行处理。
运行结果如下所示:
这种处理方式,运行速度较慢,很难让人满意。
为了提高运行速度,决定采用多线程的方式来处理,即MultiThreadCompare方法,这里我只使用了两个子线程,两个子线程各做一半的数据处理,得到如下的效果:
耗时缩短了将近一半,结果是喜人的。不过,.NET Framework 4.0 及以上版本提供了一个并行处理框架Parallel,使用此框架无需手动创建子线程,即可做并行处理。ParallelCompare方法的处理结果如下:
并行处理后,速度得到了大幅提升。