一直想着整理出网页抓取的具体实现功能代码,方便大家指正,也方便自己学习修正。当然这个并不是针对所有网页,自己写的功能有限,只能针对某一特定结构的网页进行数据采集,如果有更好的方法,请大家不吝指教,在此谢过!
一、抓取网页内容:
网上可以搜索到很多抓取网页的代码,以下这个方法是我搜到的一个供参考:
/// <summary>
/// Downloads a page and returns its complete HTML source as a string.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="charSets">
/// Optional encoding name of the target page, passed as a single value. When it is
/// omitted, null or empty, the encoding is read from the page's
/// <c>&lt;meta ... charset=...&gt;</c> declaration, falling back to UTF-8.
/// </param>
/// <returns>
/// The decoded page source, or an empty string when the download or the decoding fails.
/// </returns>
public static string getHtml(string url, params string[] charSets)
{
    try
    {
        // Only a single explicit value is honoured; "getHtml(url, null)" passes a null array.
        string charSet = null;
        if (charSets != null && charSets.Length == 1 && charSets[0] != null)
        {
            charSet = charSets[0].Trim();
        }

        byte[] myDataBuffer;
        using (WebClient myWebClient = new WebClient())
        {
            // Some pages cannot be fetched this way (they need cookies, other headers, ...).
            // Handle those case by case, e.g. myWebClient.Headers.Add("Cookie", cookie);
            // If the server asks for a user name / password, use instead:
            //   myWebClient.Credentials = new NetworkCredential(user, password);
            myWebClient.Credentials = CredentialCache.DefaultCredentials;

            // Download the raw bytes; they are decoded once the encoding is known.
            myDataBuffer = myWebClient.DownloadData(url);
        }

        Encoding encoding = null;
        if (!string.IsNullOrEmpty(charSet))
        {
            // Caller-supplied name: an unknown name throws and is reported as "" below.
            encoding = Encoding.GetEncoding(charSet);
        }
        else
        {
            // The charset declaration itself is ASCII, so an ASCII decode is enough to find it.
            string probe = Encoding.ASCII.GetString(myDataBuffer);

            // Capture only the encoding name: stop at quotes, spaces, '/' and '>'.
            Match charSetMatch = Regex.Match(
                probe,
                "<meta[^>]*?charset\\s*=\\s*[\"']?\\s*([A-Za-z0-9_.:\\-]+)",
                RegexOptions.IgnoreCase);
            if (charSetMatch.Success)
            {
                try
                {
                    encoding = Encoding.GetEncoding(charSetMatch.Groups[1].Value);
                }
                catch (ArgumentException)
                {
                    // Declared encoding is unknown on this platform: fall back to UTF-8.
                }
                catch (NotSupportedException)
                {
                    // Declared encoding is not supported on this platform: fall back to UTF-8.
                }
            }
        }

        if (encoding == null)
        {
            encoding = Encoding.UTF8;
        }

        return encoding.GetString(myDataBuffer);
    }
    catch (Exception ex)
    {
        // Best-effort contract kept for existing callers: report the failure, return "".
        System.Diagnostics.Trace.TraceWarning("getHtml failed for '{0}': {1}", url, ex.Message);
        return "";
    }
}
二、网页代码处理与分类导入数据库
网页处理是采用的正则表达式进行匹配查找,包括html标签的匹配、空格空行的匹配等。
/// <summary>
/// Extracts a three-level category tree from an HTML fragment and stores each category
/// through <c>BLL.product_category</c> (<c>Add</c>, then <c>Update</c> with its class_list).
/// </summary>
/// <remarks>
/// The patterns and numeric offsets (17, 34, 29/74, 45) are tuned to the markup of one
/// specific page layout. NOTE(review): the summary string is re-assigned (not appended)
/// for every first-level category, so only the last one processed is returned - confirm
/// whether that is intended.
/// </remarks>
/// <param name="str">HTML fragment to parse; null or empty yields an empty result.</param>
/// <returns>A textual summary of the last first-level category processed and its children.</returns>
public string GetClasses(string str)
{
    if (string.IsNullOrEmpty(str))
    {
        return "";
    }

    BLL.product_category bll = new BLL.product_category();
    string classname = "";
    const RegexOptions options = RegexOptions.Singleline | RegexOptions.IgnoreCase;
    string pattern1 = "<span><h2><a\\s>(.*?)</a></h2></span>";
    string pattern2 = "(<dt>[^<a\\shref=\"http://.*\">]*)(.*?)</a></dt>";
    string pattern21 = "<a\\shref=\"http://.*\">(.*?)</a>";
    string pattern3 = "(<li>[^<a\\shref=\"http://.*\">]*)(.*?)</a></li>";

    // Each loop is bounded by the text length at loop entry (the text shrinks while parsing).
    int firstLimit = str.Length;
    for (int i = 0; i < firstLimit; i++)
    {
        Match firstMatch = Regex.Match(str, pattern1, options);
        if (!firstMatch.Success)
        {
            // No further first-level heading: stop instead of storing an empty category.
            break;
        }

        string bs1 = firstMatch.Groups[1].Value; // first-level category name

        // Skip past the heading's closing tags; clamped so a short tail cannot throw.
        int afterHeading = str.IndexOf("</a></h2></span>", StringComparison.Ordinal) + 17;
        str = str.Substring(Math.Min(afterHeading, str.Length));

        classname = "★一级类:" + bs1;
        Model.product_category firstLevel = new Model.product_category();
        firstLevel.title = bs1;
        firstLevel.call_index = "";
        firstLevel.parent_id = 0;
        firstLevel.class_layer = 1;
        firstLevel.sort_id = 99;
        int cateId = bll.Add(firstLevel);
        if (cateId > 0)
        {
            // NOTE(review): presumably Add() writes the new id back onto the model so that
            // Update() hits the right row - confirm against BLL.product_category.
            firstLevel.class_list = "," + cateId + ",";
            bll.Update(firstLevel);

            int secondLimit = str.Length;
            for (int j = 0; j < secondLimit; j++)
            {
                Match m2 = Regex.Match(str, pattern2, options);
                if (m2.Length == 0)
                {
                    // Nothing left to consume; further passes would repeat the same no-op.
                    break;
                }

                string bs2 = Regex.Match(m2.Value, pattern21, options).Groups[1].Value; // second-level name
                str = RemoveCategoryMarkup(
                    str, m2, "(<dt>[^<a\\shref=\"http://.*\">]*)(.*?)" + Regex.Escape(bs2) + "</a></dt>");

                classname += "◆二级类:" + bs2 + "▲三级类:[";
                Model.product_category secondLevel = new Model.product_category();
                secondLevel.title = bs2;
                secondLevel.call_index = "";
                secondLevel.parent_id = cateId;
                secondLevel.class_layer = 2;
                secondLevel.sort_id = 99;
                int catexId = bll.Add(secondLevel);
                if (catexId > 0)
                {
                    secondLevel.class_list = "," + cateId + "," + catexId + ",";
                    bll.Update(secondLevel);

                    int thirdLimit = str.Length;
                    for (int k = 0; k < thirdLimit; k++)
                    {
                        int lengthBefore = str.Length;

                        Match m3 = Regex.Match(str, pattern3, options);
                        if (m3.Length > 0)
                        {
                            string bs3 = Regex.Match(m3.Value, pattern21, options).Groups[1].Value; // third-level name
                            Model.product_category thirdLevel = new Model.product_category();
                            thirdLevel.title = bs3;
                            thirdLevel.call_index = "";
                            thirdLevel.parent_id = catexId;
                            thirdLevel.class_list = "";
                            thirdLevel.class_layer = 3;
                            thirdLevel.sort_id = 99;
                            int catexxId = bll.Add(thirdLevel);
                            if (catexxId > 0)
                            {
                                thirdLevel.class_list = "," + cateId + "," + catexId + "," + catexxId + ",";
                                bll.Update(thirdLevel);
                            }

                            classname += "|" + bs3;
                            str = RemoveCategoryMarkup(
                                str, m3, "(<li>[^<a\\shref=\"http://.*\">]*)(.*?)" + Regex.Escape(bs3) + "</a></li>");
                        }

                        // NOTE(review): the character class below removes tabs but also literal
                        // '(' , ')' and '*' characters; the offsets that follow rely on it.
                        str = new Regex("(\r\n)*[(\\t)*]").Replace(str, "");

                        int index_0 = str.IndexOf("<div style=\"clear:both\"></div>", StringComparison.Ordinal);
                        if (index_0 == 74 || index_0 == 29)
                        {
                            // End of the current third-level list: skip the separator and leave.
                            str = str.Substring(Math.Min(index_0 + 34, str.Length));
                            break;
                        }

                        if (str.Length == lengthBefore)
                        {
                            // The text only ever shrinks; no change means every later pass
                            // would be identical, so stop here.
                            break;
                        }
                    }
                }

                classname += "]";
            }
        }

        int index_1 = str.IndexOf("<div class=\"sidelist\">", StringComparison.Ordinal);
        if (index_1 < 45)
        {
            break;
        }
    }

    return classname;
}

/// <summary>
/// Removes an already-processed entry from the text so that it is not matched again.
/// </summary>
/// <param name="text">Text the entry was matched in.</param>
/// <param name="matched">The match that was just processed; its position refers to <paramref name="text"/>.</param>
/// <param name="removalPattern">Pattern describing the entry to delete.</param>
/// <returns>The text without the entry.</returns>
private static string RemoveCategoryMarkup(string text, Match matched, string removalPattern)
{
    string result = new Regex(removalPattern).Replace(text, "");
    if (result.Length == text.Length)
    {
        // The removal pattern runs without the Singleline/IgnoreCase options used for
        // matching and may miss the entry; cut it out by position so the caller always
        // makes progress and never stores the same category twice.
        result = text.Remove(matched.Index, matched.Length);
    }

    return result;
}
三、触发采集
前台代码:
输入网址:<asp:TextBox ID="TextBox1" runat="server" Width="400">http://www.tansoole.com/</asp:TextBox>
<asp:Button ID="Button2" runat="server" Text="提交" onclick="Button2_Click" /><br /><br />
<asp:TextBox ID="TextBox2" runat="server" TextMode="MultiLine" Rows="30" Columns="100"></asp:TextBox>
后台事件:
/// <summary>
/// Click handler of the submit button: downloads the page whose address is in TextBox1,
/// parses the part following the "sidebar" marker and shows the result in TextBox2.
/// </summary>
/// <param name="sender">The control that raised the event.</param>
/// <param name="e">Event data (unused).</param>
protected void Button2_Click(object sender, EventArgs e)
{
    string url = this.TextBox1.Text.Trim();
    if (url.Length == 0)
    {
        return;
    }

    // getHtml returns "" when the download fails.
    string str = getHtml(url, "utf-8");

    int sidebarIndex = str.IndexOf("sidebar", StringComparison.Ordinal);
    if (sidebarIndex < 0)
    {
        // Without the marker the offset below is meaningless (and Substring could throw
        // on an empty page), so report the problem instead of parsing.
        this.TextBox2.Text = "未能下载页面,或页面中未找到 sidebar 标记。";
        return;
    }

    // Skip the marker and the characters right after it; clamped to the text length.
    str = str.Substring(Math.Min(sidebarIndex + 10, str.Length));
    this.TextBox2.Text = GetClasses(str);
}
本文只能为做网页采集提供一个思路,具体实现还需根据实际情况进行验证修改。
PS:涉及的正则内容
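\\s 匹配任意空白字符(空格、制表符、换行等;在 C# 普通字符串中反斜杠需要转义,所以写成 "\\s")
(\r\n)* 匹配零个或多个连续的回车换行(即空行及换行)
[(\\t)*] 是一个字符类,除制表符外还会匹配字面的 "("、")" 和 "*";如果只想匹配制表符,应写成 \t+ 或 (\t)*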