写了一个控制台应用程序，用了 2 个小时，一共从 sohu 网站上抓取了 30000 多条记录。截图如下。
![](https://p-blog.csdn.net/images/p_blog_csdn_net/bakers/200071/o_idea1.gif)
数据库截图
![](https://p-blog.csdn.net/images/p_blog_csdn_net/bakers/200071/o_idea2.gif)
离十几万的距离又近了一步.
using
System;
using
System.Data;
using
System.IO;
using
System.Configuration;
using
System.Web;
using
System.Web.Security;
using
System.Web.UI;
using
System.Web.UI.WebControls;
using
System.Web.UI.WebControls.WebParts;
using
System.Web.UI.HtmlControls;
using
System.Data.SqlClient;
using
System.Text;
using
System.Text.RegularExpressions;
using
System.Net;
using
System.Collections.Generic;
![](https://i-blog.csdnimg.cn/blog_migrate/6810355c2f78c12e91b7997a8e8c583a.gif)
![](https://i-blog.csdnimg.cn/blog_migrate/a41954a27d6ad96fa2c2cf816e677448.gif)
/// <summary>
/// Console scraper: downloads the paginated comment listing of one Sohu news
/// article, extracts each comment's header and body with regular expressions,
/// echoes them to the console and inserts them into the <c>sohu</c> table.
/// </summary>
public class test
{
    // NOTE(review): credentials are hard-coded in source; move them to configuration
    // before using this outside a throw-away experiment.
    private const string ConnectionString =
        @"Data Source=BAKERS;Initial Catalog=idea;User ID=sa;Password=123456";

    // Page number is appended to this prefix to build each request URL.
    private const string CommentUrlPrefix =
        "http://comment2.news.sohu.com/viewcomments.action?id=242136287&pageNumber=";

    // Pages FirstPage .. PageLimit - 1 are fetched.
    private const int FirstPage = 1;
    private const int PageLimit = 1599;

    // Built once instead of re-parsing the pattern strings for every page / match.
    private static readonly Regex HeaderRegex = new Regex("(?<=ti8 fc_3333'>)[^<]{10,}");
    private static readonly Regex TextRegex = new Regex("(?<=528 class='break'>)[^/]*</td></tr>");

    // Strips HTML tags and single quotes from a captured fragment.
    private static readonly Regex CleanupRegex = new Regex("(<[^>]*>)|'");

    // Shared connection, opened lazily by ResultInsert and disposed when Main finishes.
    private static SqlConnection connection;

    /// <summary>
    /// Entry point. Walks every page of the comment listing; a failure on one page
    /// is reported on the console and the loop continues with the next page.
    /// </summary>
    public static void Main()
    {
        connection = new SqlConnection(ConnectionString);
        try
        {
            for (int page = FirstPage; page < PageLimit; page++)
            {
                try
                {
                    ScrapePage(page);
                }
                catch (Exception ee)
                {
                    // Deliberate best-effort: one bad page must not abort the whole run.
                    Console.WriteLine(ee.Message);
                }
            }
        }
        finally
        {
            // The original never released the connection.
            if (connection != null)
            {
                connection.Dispose();
                connection = null;
            }
        }
    }

    /// <summary>
    /// Downloads one listing page, pairs header matches with body matches by index,
    /// prints each pair and stores it through <see cref="ResultInsert"/>.
    /// </summary>
    /// <param name="page">Page number appended to the listing URL.</param>
    private static void ScrapePage(int page)
    {
        string requestUrl = CommentUrlPrefix + page.ToString();
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(requestUrl);

        // TextContent takes ownership of the response and closes it.
        string html = TextContent((HttpWebResponse)request.GetResponse());

        MatchCollection headers = HeaderRegex.Matches(html);
        MatchCollection bodies = TextRegex.Matches(html);

        // Only indexes present in both collections form a complete record.
        int pairs = Math.Min(headers.Count, bodies.Count);
        for (int i = 0; i < pairs; i++)
        {
            // Clean each fragment once and reuse it for both console output and insert.
            string header = CleanupRegex.Replace(headers[i].Value, "");
            string body = CleanupRegex.Replace(bodies[i].Value, "");

            // The printed count is the 1-based position of the record within this page.
            Console.WriteLine("page:" + page.ToString() + " count:" + (i + 1).ToString() + " :" + header);
            Console.WriteLine(body);
            ResultInsert(header, body);
        }
    }

    /// <summary>
    /// Inserts one record into the <c>sohu</c> table, with <c>other</c> fixed to 2.
    /// </summary>
    /// <param name="header">Comment header text.</param>
    /// <param name="context">Comment body text.</param>
    /// <remarks>
    /// Values are passed as SQL parameters rather than concatenated into the
    /// statement, so scraped text can no longer alter the SQL.
    /// </remarks>
    public static void ResultInsert(string header, string context)
    {
        const string sql = "insert into sohu(header,context,other)values(@header,@context,2)";

        // Create the connection on demand so the method also works when it is
        // called without Main having run first.
        if (connection == null)
        {
            connection = new SqlConnection(ConnectionString);
        }

        using (SqlCommand command = new SqlCommand(sql, connection))
        {
            command.Parameters.AddWithValue("@header", (object)header ?? DBNull.Value);
            command.Parameters.AddWithValue("@context", (object)context ?? DBNull.Value);

            if (connection.State == ConnectionState.Closed)
            {
                connection.Open();
            }
            command.ExecuteNonQuery();
        }
    }

    /// <summary>
    /// Chooses the text encoding used to decode a response body.
    /// </summary>
    /// <param name="response">Response whose headers are inspected.</param>
    /// <returns>
    /// The encoding named by the Content-Encoding header when that is a valid
    /// encoding name; otherwise the encoding named by the <c>charset</c> parameter
    /// of Content-Type; otherwise <see cref="Encoding.Default"/>.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="response"/> is null.</exception>
    public static Encoding GetEncoding(HttpWebResponse response)
    {
        if (response == null)
        {
            throw new ArgumentNullException("response");
        }

        // Content-Encoding keeps the priority it had originally, but it usually
        // carries a compression name (e.g. gzip) rather than a charset, so an
        // unusable value now falls through to Content-Type instead of straight
        // to the default encoding.
        Encoding code = TryGetEncoding(response.ContentEncoding);
        if (code == null)
        {
            code = TryGetEncoding(ExtractCharset(response.ContentType));
        }
        return code ?? Encoding.Default;
    }

    /// <summary>
    /// Extracts the value of the <c>charset=</c> parameter from a Content-Type
    /// header value, without trailing parameters or surrounding quotes.
    /// </summary>
    /// <returns>The charset name, or null when none is present.</returns>
    private static string ExtractCharset(string contentType)
    {
        if (string.IsNullOrEmpty(contentType))
        {
            return null;
        }

        const string key = "charset=";
        int start = contentType.IndexOf(key, StringComparison.OrdinalIgnoreCase);
        if (start < 0)
        {
            // The original tested for "charset" but indexed "charset=", which
            // produced a bogus offset when the '=' was missing.
            return null;
        }
        start += key.Length;

        // Stop at the next parameter separator, if any.
        int end = contentType.IndexOf(';', start);
        string value = end < 0
            ? contentType.Substring(start)
            : contentType.Substring(start, end - start);

        value = value.Trim().Trim('"', '\'');
        return value.Length == 0 ? null : value;
    }

    /// <summary>
    /// Resolves an encoding by name; returns null for an empty or unknown name.
    /// </summary>
    private static Encoding TryGetEncoding(string name)
    {
        if (string.IsNullOrEmpty(name))
        {
            return null;
        }

        try
        {
            return Encoding.GetEncoding(name);
        }
        catch (ArgumentException)
        {
            // Unknown or unsupported encoding name: let the caller fall back.
            return null;
        }
    }

    /// <summary>
    /// Reads the whole response body as text and closes the response.
    /// </summary>
    /// <param name="response">Response to read; it is closed before returning, even on failure.</param>
    /// <returns>Every line of the body, each followed by a single space.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="response"/> is null.</exception>
    public static string TextContent(HttpWebResponse response)
    {
        if (response == null)
        {
            throw new ArgumentNullException("response");
        }

        try
        {
            // StringBuilder avoids the quadratic cost of repeated string concatenation.
            StringBuilder buffer = new StringBuilder();

            // Disposing the reader also closes the underlying response stream.
            using (Stream stream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(stream, GetEncoding(response)))
            {
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    buffer.Append(line).Append(' ');
                }
            }
            return buffer.ToString();
        }
        finally
        {
            response.Close();
        }
    }
}
![](https://i-blog.csdnimg.cn/blog_migrate/6810355c2f78c12e91b7997a8e8c583a.gif)
![](https://i-blog.csdnimg.cn/blog_migrate/6810355c2f78c12e91b7997a8e8c583a.gif)
![](https://i-blog.csdnimg.cn/blog_migrate/6810355c2f78c12e91b7997a8e8c583a.gif)