使用c#来进行html源码解析

使用c#来进行html源码解析

本文将介绍一下如何使用c#以及HtmlAgilityPack包对html源码进行解析与信息提取。

HtmlAgilityPack安装

在 Visual Studio 菜单栏中依次点击 Tools -> NuGet Package Manager -> Manage NuGet Packages for Solution,搜索 HtmlAgilityPack 并安装即可,此处不再赘述。

使用

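不要忘了在文件开头引用 HtmlAgilityPack(下文代码还用到了 System、System.Collections.Generic、System.IO、System.Net、System.Net.Http 和 System.Text 等命名空间,也需要一并引用):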

using HtmlAgilityPack;

直接通过字符串来解析

        /// <summary>
        /// Parses an HTML string with HtmlAgilityPack and prints two pieces of data:
        /// the cell at row 5 / column 2 of the "basicinfo" table, and the inner HTML of
        /// the link in the second column of every row of the "webgrid_table" table body.
        /// </summary>
        /// <param name="htmlString">Raw HTML markup to parse.</param>
        public void HtmlStringPattern(string htmlString)
        {
            // Load the HTML string into a DOM.
            HtmlDocument htmlDocument = new HtmlDocument();
            htmlDocument.LoadHtml(htmlString);

            // Everything below is page-specific extraction; adapt the XPath to your document.
            HtmlNode version = htmlDocument.DocumentNode.SelectSingleNode("html/body//table[@class='basicinfo']/tr[5]/td[2]");
            // SelectSingleNode returns null when nothing matches, so guard before dereferencing.
            if (version != null)
            {
                Console.WriteLine("version: " + version.InnerHtml.Trim());
            }

            string target_table = "html/body//table[@class='webgrid_table'][1]/tbody";
            List<string> result = new List<string>();

            // SelectNodes returns null (not an empty collection) when no row matches.
            HtmlNodeCollection rows = htmlDocument.DocumentNode.SelectNodes(target_table + "/tr");
            if (rows != null)
            {
                foreach (HtmlNode row in rows)
                {
                    // Query relative to the current row instead of re-running an absolute
                    // XPath over the whole document for every row index.
                    HtmlNode link = row.SelectSingleNode("./td[2]/a");
                    if (link != null)
                    {
                        result.Add(link.InnerHtml);
                    }
                }
            }

            Console.WriteLine("version");
            foreach (string s in result)
            {
                Console.WriteLine(s);
            }
        }

通过html文件

        /// <summary>
        /// Reads an HTML file from disk and loads its content into an HtmlAgilityPack document.
        /// </summary>
        /// <param name="filePath">Path of the HTML file to read.</param>
        public void HtmlFilePattern(string filePath)
        {
            string htmlDoc;
            // Read the whole file into memory; the using block disposes the reader.
            using (StreamReader sr = new StreamReader(filePath))
            {
                htmlDoc = sr.ReadToEnd();
            }

            // NOTE(review): @"\r\n" is a verbatim literal, so this removes the four-character
            // text backslash-r-backslash-n, not actual CR/LF line breaks. If stripping real
            // line breaks was intended, the literal should be "\r\n" - confirm the intent.
            htmlDoc = htmlDoc.Replace(@"\r\n", "");

            HtmlDocument htmlDocument = new HtmlDocument();
            // The '@' prefix on an identifier has no effect here; this passes the htmlDoc variable.
            htmlDocument.LoadHtml(@htmlDoc);

            // For the actual extraction steps, see HtmlStringPattern.
            //....
            //
            // NOTE(review): the snippet is truncated here; the method's closing brace is missing.

通过httpclient

        /// <summary>
        /// Downloads a page over HTTP and loads the response body into an HtmlAgilityPack document.
        /// </summary>
        /// <param name="webURL">Address of the page to download.</param>
        public void WebURLPattern(string webURL)
        {
            string resultset = new HttpHepler().HttpClientRequest(webURL);
            // HttpHepler is the helper class that issues the HTTP request and returns the
            // response body as a string.

            HtmlDocument htmlDocument = new HtmlDocument();
            // The '@' prefix on an identifier has no effect here; this passes the resultset variable.
            htmlDocument.LoadHtml(@resultset);

            // For the actual extraction steps, see HtmlStringPattern.
            //....
            //
            // NOTE(review): the snippet is truncated here; the method's closing brace is missing.

上面用到的 HttpHepler 是一个用来发送 http 请求的辅助类(而不是函数),其代码如下:

    /// <summary>
    /// Helper that downloads the body of a URL as a string, either through
    /// <see cref="HttpClient"/> or through the older <see cref="System.Net.HttpWebRequest"/> API.
    /// </summary>
    public class HttpHepler
    {
        // One shared client for all calls: creating and disposing an HttpClient per
        // request can exhaust sockets under load.
        private static readonly HttpClient SharedClient = new HttpClient();

        public HttpHepler()
        {
        }

        /// <summary>
        /// Sends a GET request through HttpClient and returns the response body.
        /// </summary>
        /// <param name="webURL">Address to request.</param>
        /// <returns>The response body as a string.</returns>
        public string HttpClientRequest(string webURL)
        {
            // .Result blocks the calling thread; it is kept so the method stays synchronous
            // and failures still surface as AggregateException, as before.
            using (HttpResponseMessage response = SharedClient.GetAsync(webURL).Result)
            using (HttpContent content = response.Content)
            {
                return content.ReadAsStringAsync().Result;
            }
        }

        /// <summary>
        /// Sends a GET request through HttpWebRequest and returns the response body.
        /// </summary>
        /// <param name="webURL">Address to request.</param>
        /// <returns>
        /// The response body decoded with the charset announced by the server,
        /// or an empty string when the status code is not 200 OK.
        /// </returns>
        public string HttpWebRequest(string webURL)
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(webURL);
            request.CookieContainer = new CookieContainer();

            // The using blocks release the response, stream and reader on every path,
            // including a non-OK status and exceptions thrown while reading.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            {
                if (response.StatusCode != HttpStatusCode.OK)
                {
                    return "";
                }

                using (Stream receiveStream = response.GetResponseStream())
                using (StreamReader readStream = new StreamReader(receiveStream, ResolveEncoding(response.CharacterSet)))
                {
                    return readStream.ReadToEnd();
                }
            }
        }

        // Maps the charset name from the response to an Encoding, falling back to UTF-8
        // (the StreamReader default) when the name is missing, empty or not recognised.
        private static Encoding ResolveEncoding(string charset)
        {
            if (string.IsNullOrWhiteSpace(charset))
            {
                return Encoding.UTF8;
            }

            try
            {
                return Encoding.GetEncoding(charset);
            }
            catch (ArgumentException)
            {
                // Encoding.GetEncoding throws for unknown or malformed charset names.
                return Encoding.UTF8;
            }
        }
    }
  • 1
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值