HTML解析利器HtmlAgilityPack

 

using System;
using System.Collections.Generic;

using System.Web;
using System.Web.UI;
using System.Web.UI.WebControls;
using HtmlAgilityPack;
using System.IO;
using System.Text.RegularExpressions;
using Maticsoft.Common;


namespace Maticsoft.Web.PubResources
{
    public partial class ImportGroupMessage : System.Web.UI.Page
    {
        HtmlDocument doc = null;
        StreamReader sr = null;
        private readonly Maticsoft.BLL.publish_Resources bLL = new Maticsoft.BLL.publish_Resources();
        protected void Page_Load(object sender, EventArgs e)
        {
            //没有登录或是没有审核通过
            if (CookiesManager.GetCookie("cn") == "0" || CookiesManager.GetCookie("loginName") == "")
            {
                Response.Redirect("/index.aspx");
            }
            //审核通过但不是管理员
            else if (CookiesManager.GetCookie("cn") == "1")
            {
                Response.Redirect("/add.aspx");
            }
        }

        protected void btn_Import_Click(object sender, EventArgs e)
        {
            try
            {
                if (!FileUpload1.HasFile)
                {
                    MessageBox.Show(this, "请选择要上传的文件!");
                    return;
                }
                //获取文件后缀名
                string fileType = System.IO.Path.GetExtension(FileUpload1.FileName);
                if (fileType != ".mht")
                {
                    MessageBox.Show(this, "文件类型格式不对请重新选择!");
                    return;
                }
                //将文件上传到服务器指定的文件夹下保存
                FileUpload1.SaveAs(Server.MapPath("\\upload") + "\\" + FileUpload1.FileName);
                //删除已有过期所有信息
                bool b = bLL.Del(DateTime.Now.ToString("yyyy-MM-dd"));
                if (b)
                {
                    //MessageBox.Show(this, "过期信息删除成功!");
                }
                //string filepath = this.FileUpload1.PostedFile.FileName;
                //获取文件在服务器的完整路径
                string filepath = Server.MapPath("\\upload") + "\\" + FileUpload1.FileName;
                if (filepath == "")
                {
                    MessageBox.Show(this, "没有找到服务器上,已上传的文件!");
                    return;
                }
                sr = File.OpenText(filepath);
                doc = new HtmlDocument();
                doc.Load(sr);
                getNode();
                sr.Close();
                //删除已导入的文件
                File.Delete(Server.MapPath("\\upload")+"\\"+ FileUpload1.FileName);
                MessageBox.Show(this,"导入成功!");
            }
            catch (Exception ex)
            {
                throw ex;
            }
            finally
            {
                //关闭数据流
                sr.Close();
            }
        }
        /// <summary>
        /// 解析HTML
        /// </summary>
        private void getNode()
        {
            HtmlNodeCollection repeatNodes = doc.DocumentNode.SelectNodes("//table/tr");
            List<Maticsoft.Model.publish_Resources> list = new List<Maticsoft.Model.publish_Resources>();
            //循环节点
            foreach (HtmlNode node in repeatNodes)
            {
                HtmlDocument d = new HtmlDocument();
                d.LoadHtml(node.InnerHtml);
                HtmlNode title = d.DocumentNode.SelectSingleNode("//td[1]//div[1]//div[1]");
                HtmlNode title2 = d.DocumentNode.SelectSingleNode("//td[1]//div[2]");
                HtmlNode title3 = d.DocumentNode.SelectSingleNode("//td[1]//div[1]");
                string s3 = null;

                String s = null;
                string s2 = null;
                string contentText = null;
                //获取QQ
                if (title != null)
                {
                    s = title.InnerText;
                    if (s.Length > 10)
                    {
                        int ef = s.LastIndexOf(")");
                        int b = s.LastIndexOf("(");
                        if(b!=-1 && ef!=-1)
                        {
                    s = s.Substring(b + 1, ef - b - 1);
                        }
                    }
                    //    s = s.Substring(s.Length - 11, 10);
                    //    if (s.Substring(0, 1) == "(")
                    //    {
                    //        s = s.Substring(1, 9);
                    //    }

                }
                //内容content
                if(title2!=null)
                {

                    s2 = title2.InnerHtml;
                    contentText = title2.InnerText;
                }
                //发消息时间
                if (title3 != null)
                {
                    s3 = title3.InnerText;
                    if (s3.Length > 7)
                    {
                        s3 = s3.Substring(s3.Length - 8);
                        if (s3.Substring(0, 1) == ";" || s3.Substring(0, 1) == ")")
                        {
                            s3 = s3.Substring(1, 7);
                        }
                    }
                }

                if (title != null && title2 != null && title3 != null)
                {
                    int t = s.Length;
                    int t2 = s2.Length;
                    int t3 = s3.Length;
                    if (s.Length > 7 && s2.Length > 300 && s3.Length > 7)
                    {
                        Maticsoft.Model.publish_Resources model = new Maticsoft.Model.publish_Resources();
                        model.Qq = s;
                        model.Content = s2;
                        model.Infotime = s3;
                        model.ContentText = contentText;
                        bool bcontent = getcontent(model.ContentText,list);
                        if(bcontent)
                        {
                            list.Add(model);
                        }
                    }
                }
               
                // HtmlNodeCollection cc = node.SelectNodes("//td/div");

                //foreach (HtmlNode c in cc)
                //{
                //    string s = c.InnerText;
                //    string f = c.InnerHtml;
                //    //String t = c.XPath + "/div[1]";
                //    //String x=c.SelectSingleNode(t).InnerText;
                //}

                //if(s==null||s2==null||s3==null)
                //{
                //    return;
                //}

            }
            int i = 0;
            //循环遍历插入数据库
            foreach (Maticsoft.Model.publish_Resources publishResources in list)
            {
                Maticsoft.Model.publish_Resources mod = new Maticsoft.Model.publish_Resources();
                if (!string.IsNullOrEmpty(publishResources.Infotime))
                {

                    bool isvalidate = CheckIstime(publishResources.Infotime);
                    if (isvalidate)
                    {
                        mod.Infotime = publishResources.Infotime;
                    }
                }
                if (!string.IsNullOrEmpty(publishResources.Qq))
                {
                    bool isQQ = CheckIsQQNumber(publishResources.Qq);
                    if (isQQ)
                    {
                        mod.Qq = publishResources.Qq;
                    }
                }
                mod.Content = publishResources.Content;
                mod.Type = "0";
                mod.Title = "";
                mod.GetDateTime = DateTime.Now.ToString();
                mod.ContentText = publishResources.ContentText;
                if(mod.Qq!=null)
                {
                    i = bLL.Add(mod);
                }
            }
        }

        //判定字符串是否为 时 分 秒
        private bool CheckIstime(String StrSource)
        {
            return Regex.IsMatch(StrSource, @"^([0-1]?[0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])$");
        }

        //判定字符串是否为qq号
        private bool CheckIsQQNumber(String StrSource)
        {
            return Regex.IsMatch(StrSource, @"^\d{5,12}$");
        }

        /// <summary>
        /// 获取文本内容  看这条记录是否已存在
        /// </summary>
        /// <param name="str">内容</param>
        /// <param name="list">在集合中的内容</param>
        /// <returns></returns>
        private bool getcontent(string str,List<Maticsoft.Model.publish_Resources> list)
        {
            bool bo = true;
            foreach (Maticsoft.Model.publish_Resources pr in list)
            {
                if (str == pr.ContentText)
                {
                    bo = false;
                }
            }
            return bo;
        }
    }
}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值