自己写网页爬虫——网页分类抓取/采集并导入数据库

一直想着整理出网页抓取的具体实现功能代码,方便大家指正,也方便自己学习修正。当然这个并不是针对所有网页,自己写的功能有限,只能针对某一特定结构的网页进行数据采集,如果有更好的方法,请大家不吝指教,在此谢过!

一、抓取网页内容:
网上可以搜索到很多抓取网页的代码,以下这个方法是我搜到的一个供参考:

/// <summary>
/// Downloads the full HTML source of a web page.
/// </summary>
/// <param name="url">Address of the page to fetch.</param>
/// <param name="charSets">Optional encoding name of the target page; when omitted, null or "", the encoding is auto-detected from the page's meta tag.</param>
/// <returns>The page source decoded as a string, or an empty string when the download fails.</returns>
public static string getHtml(string url, params  string[] charSets)
{
    try
    {
        // An explicit charset passed by the caller wins over auto-detection.
        string charSet = null;
        if (charSets.Length >= 1)
        {
            charSet = charSets[0];
        }
        // WebClient is IDisposable — release the underlying connection deterministically.
        using (WebClient myWebClient = new WebClient())
        {
            // Some pages need extra request state (cookies, custom headers, user/password
            // credentials); add them here when required, e.g.:
            //   myWebClient.Headers.Add("Cookie", cookie);
            //   myWebClient.Credentials = new NetworkCredential(user, password);
            myWebClient.Credentials = CredentialCache.DefaultCredentials;

            // Download the raw bytes once; decode below with the best-known encoding.
            byte[] myDataBuffer = myWebClient.DownloadData(url);

            if (string.IsNullOrEmpty(charSet))
            {
                // Provisional decode just to read the <meta ... charset=...> declaration.
                string probe = Encoding.Default.GetString(myDataBuffer);
                Match charSetMatch = Regex.Match(probe, "<meta([^<]*)charset=([^<]*)", RegexOptions.IgnoreCase | RegexOptions.Multiline);
                // The capture may carry quotes and trailing attribute text
                // (e.g. "utf-8" /> ...); keep only the encoding name itself.
                Match nameMatch = Regex.Match(charSetMatch.Groups[2].Value, "[A-Za-z0-9_\\-]+");
                charSet = nameMatch.Value;
            }

            Encoding encoding;
            try
            {
                // GetEncoding throws on unknown names; fall back to UTF-8
                // rather than failing the whole fetch.
                encoding = string.IsNullOrEmpty(charSet) ? Encoding.UTF8 : Encoding.GetEncoding(charSet);
            }
            catch (ArgumentException)
            {
                encoding = Encoding.UTF8;
            }
            return encoding.GetString(myDataBuffer);
        }
    }
    catch (Exception)
    {
        // Best-effort fetch: callers treat an empty string as "download failed".
        return "";
    }
}

二、网页代码处理与分类导入数据库
网页处理是采用的正则表达式进行匹配查找,包括html标签的匹配、空格空行的匹配等。

/// <summary>
/// Walks a category-sidebar HTML fragment, extracts a three-level category tree
/// with regular expressions and stores each category through BLL.product_category.
/// Returns a text log (★ level-1 / ◆ level-2 / ▲ level-3) of the imported names.
/// NOTE(review): all patterns and magic offsets below are tied to one specific
/// page layout — confirm against the target site before reuse.
/// </summary>
/// <param name="str">HTML starting just after the "sidebar" marker (see Button2_Click).</param>
/// <returns>Human-readable summary of the imported category names.</returns>
public string GetClasses(string str)
{
    Model.product_category model = new Model.product_category();
    BLL.product_category bll = new BLL.product_category();
    string classname = "";

    // Layout-specific patterns for the three category levels.
    // NOTE(review): the bracketed [^...] parts are character classes (any one of
    // the listed characters), not "anything but this substring" — this happens to
    // work for the target page but is fragile.
    string pattern1 = "<span><h2><a\\s>(.*?)</a></h2></span>";
    string pattern2 = "(<dt>[^<a\\shref=\"http://.*\">]*)(.*?)</a></dt>";
    string pattern21 = "<a\\shref=\"http://.*\">(.*?)</a>";
    string pattern3 = "(<li>[^<a\\shref=\"http://.*\">]*)(.*?)</a></li>";

    // The three foreach(char ...) loops never use their loop variable; they only
    // bound the number of iterations while str is shortened inside each pass.
    foreach(char b in str)
    {
        var m = Regex.Match(str, pattern1, RegexOptions.Singleline | RegexOptions.IgnoreCase);
        string bs1 = m.Groups[1].Value; // level-1 category name
        // Skip past the header marker: 17 = marker length (16) plus one extra
        // character — presumably a line break; TODO confirm on real input.
        str = str.Substring(str.IndexOf("</a></h2></span>") + 17);
        // NOTE(review): '=' (not '+=') discards the log of previous level-1
        // categories; only the last one survives in the return value.
        classname = "★一级类:" + bs1;

        // Insert the level-1 category, then write back its class_list path.
        model = new Model.product_category();
        model.title = bs1;
        model.call_index = "";
        model.parent_id = 0;
        model.class_layer = 1;
        model.sort_id = 99;
        int cateId = bll.Add(model);
        if(cateId>0)
        {
            model.class_list = "," + cateId + ",";
            bll.Update(model);
            foreach (char a in str)
            {
                var m2 = Regex.Match(str, pattern2, RegexOptions.Singleline | RegexOptions.IgnoreCase);
                if (m2.Length > 0)
                {
                    string aa = m2.ToString();

                    m = Regex.Match(aa, pattern21, RegexOptions.Singleline | RegexOptions.IgnoreCase);
                    string bs2 = m.Groups[1].Value; // level-2 category name

                    // Remove the consumed <dt> entry so the next iteration
                    // matches the following level-2 category.
                    Regex rgx2 = new Regex("(<dt>[^<a\\shref=\"http://.*\">]*)(.*?)" + bs2 + "</a></dt>");
                    str = rgx2.Replace(str, "");
                    classname += "◆二级类:" + bs2 + "▲三级类:[";

                    // Insert the level-2 category under the level-1 id.
                    model = new Model.product_category();
                    model.title = bs2;
                    model.call_index = "";
                    model.parent_id = cateId;
                    model.class_layer = 2;
                    model.sort_id = 99;
                    int catexId = bll.Add(model);
                    if(catexId>0)
                    {
                        model.class_list = "," + cateId + "," + catexId + ",";
                        bll.Update(model);
                        foreach (char c in str)
                        {
                            var m3 = Regex.Match(str, pattern3, RegexOptions.Singleline | RegexOptions.IgnoreCase);
                            if (m3.Length > 0)
                            {
                                string str3 = m3.ToString();
                                m = Regex.Match(str3, pattern21, RegexOptions.Singleline | RegexOptions.IgnoreCase);
                                string bs3 = m.Groups[1].Value; // level-3 category name

                                // Insert the level-3 category under the level-2 id.
                                model = new Model.product_category();
                                model.title = bs3;
                                model.call_index = "";
                                model.parent_id = catexId;
                                model.class_list = "";
                                model.class_layer = 3;
                                model.sort_id = 99;
                                int catexxId = bll.Add(model);
                                if(catexxId>0)
                                {
                                    model.class_list = "," + cateId + "," + catexId + "," + catexxId + ",";
                                    bll.Update(model);
                                }

                                classname += "|" + bs3;

                                // Remove the consumed <li> entry.
                                Regex rgx = new Regex("(<li>[^<a\\shref=\"http://.*\">]*)(.*?)" + bs3 + "</a></li>");
                                str = rgx.Replace(str, "");
                            }
                            // Strip line breaks/tabs so the offset checks below
                            // see a predictable prefix.
                            str = new Regex("(\r\n)*[(\\t)*]").Replace(str, "");
                            // A clearing div marks the end of this level-2 group;
                            // 74/29 and the +34 skip are layout-specific offsets —
                            // TODO confirm against the actual page.
                            int index_0 = str.IndexOf("<div style=\"clear:both\"></div>");
                            if (index_0 == 74 || index_0 == 29)
                            {
                                str = str.Substring(index_0 + 34);
                                break; // leave the innermost (level-3) loop
                            }
                        }
                    }
                    classname += "]";
                }
            }
        }
        // Stop once no further "sidelist" section follows close enough — i.e. the
        // last level-1 category has been processed (IndexOf also returns -1 here).
        int index_1 = str.IndexOf("<div class=\"sidelist\">");
        if (index_1 < 45)
        {
            break; // leave the outer (level-1) loop
        }
    }

    return classname;
}

三、触发采集
前台代码:

输入网址:<asp:TextBox ID="TextBox1" runat="server" Width="400">http://www.tansoole.com/</asp:TextBox>
<asp:Button ID="Button2" runat="server" Text="提交" onclick="Button2_Click" /><br /><br />
<asp:TextBox ID="TextBox2" runat="server" TextMode="MultiLine" Rows="30" Columns="100"></asp:TextBox>

后台事件:

/// <summary>
/// Submit-button handler: downloads the page whose URL is entered in TextBox1,
/// cuts the source down to the category sidebar and writes the parsed category
/// summary from GetClasses into TextBox2.
/// </summary>
/// <param name="sender">Event source (the button).</param>
/// <param name="e">Event data (unused).</param>
protected void Button2_Click(object sender, EventArgs e)
{
    string url = this.TextBox1.Text.Trim();
    if (url.Length == 0)
    {
        return; // nothing to fetch
    }

    string str = getHtml(url, "utf-8");
    // Guard against a failed download or a page without the expected marker:
    // IndexOf returns -1 there, and the unchecked Substring would throw
    // ArgumentOutOfRangeException.
    int index = str.IndexOf("sidebar");
    if (index < 0)
    {
        this.TextBox2.Text = "未找到分类区域(sidebar),请检查网址或页面结构。";
        return;
    }
    // +10 skips the marker plus the rest of its attribute text — layout-specific.
    str = str.Substring(index + 10);
    this.TextBox2.Text = GetClasses(str);
}

本文只能为做网页采集提供一个思路,具体实现还需根据实际情况进行验证修改。

PS:涉及的正则内容
\s 匹配空白字符(带转义字符的 s,匹配空格、制表符、换行等)
(\r\n)* 匹配空行及换行
(\t)* 匹配制表符

评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值