写了一个控制台应用程序，用了 2 个小时，一共从搜狐（sohu）网站上抓取了 30000 多条记录。截图如下。
数据库截图
离十几万的距离又近了一步.
using
System;
using System.Data;
using System.IO;
using System.Configuration;
using System.Web;
using System.Web.Security;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Web.UI.WebControls.WebParts;
using System.Web.UI.HtmlControls;
using System.Data.SqlClient;
using System.Text;
using System.Text.RegularExpressions;
using System.Net;
using System.Collections.Generic;
/// <summary>
/// Console scraper: walks the pages of one Sohu news comment thread, extracts each
/// comment's header and body with regular expressions, prints them to the console
/// and inserts them into the <c>sohu</c> table through a shared SQL connection.
/// </summary>
public class test
{
    // Number of comments handled on the page currently being processed.
    private static int count;

    // Shared connection used by ResultInsert; created in Main and opened lazily.
    private static SqlConnection connection;

    /// <summary>
    /// Entry point. Requests comment pages 1..1598, extracts header/body pairs and
    /// stores every pair. A failure on one page is reported on the console and the
    /// loop continues with the next page.
    /// </summary>
    public static void Main()
    {
        string headerRegex = "(?<=ti8 fc_3333'>)[^<]{10,}";
        string textRegex = "(?<=528 class='break'>)[^/]*</td></tr>";
        // Removes HTML tags and single quotes from the captured fragments.
        string cleanupRegex = "(<[^>]*>)|'";

        // NOTE(review): credentials are hard-coded here; consider moving the
        // connection string into configuration.
        connection = new SqlConnection(@"Data Source=BAKERS;Initial Catalog=idea;User ID=sa;Password=123456");
        try
        {
            for (int j = 1; j < 1599; j++)
            {
                try
                {
                    count = 0;
                    string requestUrl = "http://comment2.news.sohu.com/viewcomments.action?id=242136287&pageNumber=" + j.ToString();
                    HttpWebRequest httpwRequest = (HttpWebRequest)WebRequest.Create(requestUrl);
                    HttpWebResponse httpwResponse = (HttpWebResponse)httpwRequest.GetResponse();
                    string information = TextContent(httpwResponse);

                    MatchCollection mcHeader = Regex.Matches(information, headerRegex);
                    MatchCollection mcText = Regex.Matches(information, textRegex);

                    // Headers and bodies are paired by position; stop at the shorter list.
                    for (int i = 0; (i < mcText.Count) && (i < mcHeader.Count); i++)
                    {
                        count++;
                        // Clean each fragment once and reuse it for output and storage.
                        string header = Regex.Replace(mcHeader[i].Value, cleanupRegex, "");
                        string text = Regex.Replace(mcText[i].Value, cleanupRegex, "");

                        Console.WriteLine("page:" + j.ToString() + " count:" + count.ToString() + " :" + header);
                        Console.WriteLine(text);
                        ResultInsert(header, text);
                    }
                }
                catch (Exception ee)
                {
                    // Best effort: report the failure and move on to the next page.
                    Console.WriteLine(ee.Message);
                }
            }
        }
        finally
        {
            // Release the database connection once all pages have been processed.
            connection.Dispose();
        }
    }

    /// <summary>
    /// Inserts one comment into the <c>sohu</c> table (<c>other</c> is always 2),
    /// opening the shared connection first if it is closed.
    /// </summary>
    /// <param name="header">Comment header text; null is stored as an empty string.</param>
    /// <param name="context">Comment body text; null is stored as an empty string.</param>
    public static void ResultInsert(string header, string context)
    {
        // Values are sent as parameters so quotes in the text can neither break
        // the statement nor inject SQL.
        const string sql = "insert into sohu(header,context,other) values(@header,@context,2)";
        using (SqlCommand command = new SqlCommand(sql, connection))
        {
            command.Parameters.AddWithValue("@header", header ?? string.Empty);
            command.Parameters.AddWithValue("@context", context ?? string.Empty);
            if (connection.State == ConnectionState.Closed)
            {
                connection.Open();
            }
            command.ExecuteNonQuery();
        }
    }

    /// <summary>
    /// Chooses the text encoding for a response body.
    /// </summary>
    /// <param name="response">The HTTP response whose headers are inspected.</param>
    /// <returns>
    /// The encoding named by <c>ContentEncoding</c> if that resolves, otherwise the
    /// one named by the <c>charset</c> parameter of <c>ContentType</c>, otherwise
    /// <see cref="Encoding.Default"/>.
    /// </returns>
    public static Encoding GetEncoding(HttpWebResponse response)
    {
        // ContentEncoding is tried first for backward compatibility; when it holds
        // something that is not an encoding name (e.g. "gzip") fall through to the
        // charset instead of giving up.
        Encoding code = TryGetEncoding(response.ContentEncoding);
        if (code == null)
        {
            code = TryGetEncoding(ExtractCharset(response.ContentType));
        }
        return code ?? Encoding.Default;
    }

    /// <summary>
    /// Reads the whole response body as text, joining lines with a single space
    /// (each line is followed by one space), and closes the response.
    /// </summary>
    /// <param name="response">The HTTP response to read; closed on return, even on failure.</param>
    /// <returns>The body text with line breaks replaced by spaces.</returns>
    public static string TextContent(HttpWebResponse response)
    {
        StringBuilder buffer = new StringBuilder();
        try
        {
            using (Stream stream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(stream, GetEncoding(response)))
            {
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    buffer.Append(line).Append(' ');
                }
            }
        }
        finally
        {
            response.Close();
        }
        return buffer.ToString();
    }

    // Returns the value of the "charset=" parameter of a Content-Type header
    // (without trailing parameters, whitespace or quotes), or "" when absent.
    private static string ExtractCharset(string contentType)
    {
        const string key = "charset=";
        if (string.IsNullOrEmpty(contentType))
        {
            return "";
        }
        int start = contentType.IndexOf(key, StringComparison.OrdinalIgnoreCase);
        if (start < 0)
        {
            return "";
        }
        start += key.Length;
        int end = contentType.IndexOf(';', start);
        string value = end < 0
            ? contentType.Substring(start)
            : contentType.Substring(start, end - start);
        return value.Trim().Trim('"', '\'').Trim();
    }

    // Resolves an encoding by name; returns null for an empty or unknown name.
    private static Encoding TryGetEncoding(string name)
    {
        if (string.IsNullOrEmpty(name))
        {
            return null;
        }
        try
        {
            return Encoding.GetEncoding(name);
        }
        catch (ArgumentException)
        {
            return null;
        }
        catch (NotSupportedException)
        {
            return null;
        }
    }
}
using System.Data;
using System.IO;
using System.Configuration;
using System.Web;
using System.Web.Security;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Web.UI.WebControls.WebParts;
using System.Web.UI.HtmlControls;
using System.Data.SqlClient;
using System.Text;
using System.Text.RegularExpressions;
using System.Net;
using System.Collections.Generic;
/// <summary>
/// Console scraper: walks the pages of one Sohu news comment thread, extracts each
/// comment's header and body with regular expressions, prints them to the console
/// and inserts them into the <c>sohu</c> table through a shared SQL connection.
/// </summary>
public class test
{
    // Number of comments handled on the page currently being processed.
    private static int count;

    // Shared connection used by ResultInsert; created in Main and opened lazily.
    private static SqlConnection connection;

    /// <summary>
    /// Entry point. Requests comment pages 1..1598, extracts header/body pairs and
    /// stores every pair. A failure on one page is reported on the console and the
    /// loop continues with the next page.
    /// </summary>
    public static void Main()
    {
        string headerRegex = "(?<=ti8 fc_3333'>)[^<]{10,}";
        string textRegex = "(?<=528 class='break'>)[^/]*</td></tr>";
        // Removes HTML tags and single quotes from the captured fragments.
        string cleanupRegex = "(<[^>]*>)|'";

        // NOTE(review): credentials are hard-coded here; consider moving the
        // connection string into configuration.
        connection = new SqlConnection(@"Data Source=BAKERS;Initial Catalog=idea;User ID=sa;Password=123456");
        try
        {
            for (int j = 1; j < 1599; j++)
            {
                try
                {
                    count = 0;
                    string requestUrl = "http://comment2.news.sohu.com/viewcomments.action?id=242136287&pageNumber=" + j.ToString();
                    HttpWebRequest httpwRequest = (HttpWebRequest)WebRequest.Create(requestUrl);
                    HttpWebResponse httpwResponse = (HttpWebResponse)httpwRequest.GetResponse();
                    string information = TextContent(httpwResponse);

                    MatchCollection mcHeader = Regex.Matches(information, headerRegex);
                    MatchCollection mcText = Regex.Matches(information, textRegex);

                    // Headers and bodies are paired by position; stop at the shorter list.
                    for (int i = 0; (i < mcText.Count) && (i < mcHeader.Count); i++)
                    {
                        count++;
                        // Clean each fragment once and reuse it for output and storage.
                        string header = Regex.Replace(mcHeader[i].Value, cleanupRegex, "");
                        string text = Regex.Replace(mcText[i].Value, cleanupRegex, "");

                        Console.WriteLine("page:" + j.ToString() + " count:" + count.ToString() + " :" + header);
                        Console.WriteLine(text);
                        ResultInsert(header, text);
                    }
                }
                catch (Exception ee)
                {
                    // Best effort: report the failure and move on to the next page.
                    Console.WriteLine(ee.Message);
                }
            }
        }
        finally
        {
            // Release the database connection once all pages have been processed.
            connection.Dispose();
        }
    }

    /// <summary>
    /// Inserts one comment into the <c>sohu</c> table (<c>other</c> is always 2),
    /// opening the shared connection first if it is closed.
    /// </summary>
    /// <param name="header">Comment header text; null is stored as an empty string.</param>
    /// <param name="context">Comment body text; null is stored as an empty string.</param>
    public static void ResultInsert(string header, string context)
    {
        // Values are sent as parameters so quotes in the text can neither break
        // the statement nor inject SQL.
        const string sql = "insert into sohu(header,context,other) values(@header,@context,2)";
        using (SqlCommand command = new SqlCommand(sql, connection))
        {
            command.Parameters.AddWithValue("@header", header ?? string.Empty);
            command.Parameters.AddWithValue("@context", context ?? string.Empty);
            if (connection.State == ConnectionState.Closed)
            {
                connection.Open();
            }
            command.ExecuteNonQuery();
        }
    }

    /// <summary>
    /// Chooses the text encoding for a response body.
    /// </summary>
    /// <param name="response">The HTTP response whose headers are inspected.</param>
    /// <returns>
    /// The encoding named by <c>ContentEncoding</c> if that resolves, otherwise the
    /// one named by the <c>charset</c> parameter of <c>ContentType</c>, otherwise
    /// <see cref="Encoding.Default"/>.
    /// </returns>
    public static Encoding GetEncoding(HttpWebResponse response)
    {
        // ContentEncoding is tried first for backward compatibility; when it holds
        // something that is not an encoding name (e.g. "gzip") fall through to the
        // charset instead of giving up.
        Encoding code = TryGetEncoding(response.ContentEncoding);
        if (code == null)
        {
            code = TryGetEncoding(ExtractCharset(response.ContentType));
        }
        return code ?? Encoding.Default;
    }

    /// <summary>
    /// Reads the whole response body as text, joining lines with a single space
    /// (each line is followed by one space), and closes the response.
    /// </summary>
    /// <param name="response">The HTTP response to read; closed on return, even on failure.</param>
    /// <returns>The body text with line breaks replaced by spaces.</returns>
    public static string TextContent(HttpWebResponse response)
    {
        StringBuilder buffer = new StringBuilder();
        try
        {
            using (Stream stream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(stream, GetEncoding(response)))
            {
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    buffer.Append(line).Append(' ');
                }
            }
        }
        finally
        {
            response.Close();
        }
        return buffer.ToString();
    }

    // Returns the value of the "charset=" parameter of a Content-Type header
    // (without trailing parameters, whitespace or quotes), or "" when absent.
    private static string ExtractCharset(string contentType)
    {
        const string key = "charset=";
        if (string.IsNullOrEmpty(contentType))
        {
            return "";
        }
        int start = contentType.IndexOf(key, StringComparison.OrdinalIgnoreCase);
        if (start < 0)
        {
            return "";
        }
        start += key.Length;
        int end = contentType.IndexOf(';', start);
        string value = end < 0
            ? contentType.Substring(start)
            : contentType.Substring(start, end - start);
        return value.Trim().Trim('"', '\'').Trim();
    }

    // Resolves an encoding by name; returns null for an empty or unknown name.
    private static Encoding TryGetEncoding(string name)
    {
        if (string.IsNullOrEmpty(name))
        {
            return null;
        }
        try
        {
            return Encoding.GetEncoding(name);
        }
        catch (ArgumentException)
        {
            return null;
        }
        catch (NotSupportedException)
        {
            return null;
        }
    }
}