获取html地址中的内容

这里有两种写法。可能是因为两类页面本身的编码方式不一样,所以解析方式也不一样。这里只是做一个笔记,不一定用得着。

1、获取由 Word 文档解析生成的网页

        /// <summary>
        /// Downloads the page at <paramref name="url"/> and returns its body decoded as UTF-8.
        /// </summary>
        /// <param name="url">Address of the page to fetch.</param>
        /// <returns>
        /// The page text with every line terminated by "\r\n" and every '?' character removed,
        /// or an empty string if the request or the read fails for any reason.
        /// </returns>
        public string GetHtml(string url)
        {
            // Example addresses this was written for:
            //http://center.file.odxd.com/2016/12/19/10067/69f1ac86-d92e-448f-905e-2403642389a9.docx.html
            //http://center.file.odxd.com/2017/9/12/10155/eb279af3-03d6-4817-8804-fd563b573c44/00e681ab-0016-4883-a4d1-2b6d1f5d8c8a.txt.html
            try
            {
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
                request.Timeout = 19600; // milliseconds

                // using blocks guarantee the response, stream and reader are released
                // even when reading throws half-way through.
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                using (Stream responseStream = response.GetResponseStream())
                using (StreamReader reader = new StreamReader(responseStream, Encoding.UTF8))
                {
                    StringBuilder sb = new StringBuilder();
                    string line;

                    // ReadLine() returns null only at the real end of the stream. Peek() can
                    // report -1 early on a network stream, which would cut the page short.
                    while ((line = reader.ReadLine()) != null)
                    {
                        sb.Append(line).Append("\r\n");
                    }

                    // NOTE(review): presumably meant to strip characters that failed to decode;
                    // it also removes legitimate question marks. Kept for compatibility.
                    return sb.ToString().Replace("?", "");
                }
            }
            catch
            {
                // Best effort: any failure (bad URL, timeout, I/O error) yields an empty result.
                return "";
            }
        }

2、获取由 txt 文件解析生成的网页

/// <summary>
/// Downloads the resource at <paramref name="url"/>, decodes it as text and converts
/// CRLF line breaks to HTML line breaks.
/// </summary>
/// <param name="url">Address of the text resource to fetch.</param>
/// <returns>
/// The decoded text with every "\r\n" replaced by "&lt;br&gt;" (a lone "\n" is left as is),
/// or an empty string if the download or decoding fails for any reason.
/// </returns>
public string GetTxtHtml(string url)
        {
            try
            {
                byte[] buffer;
                using (WebClient client = new WebClient())
                {
                    buffer = client.DownloadData(url);
                }

                // First attempt: decode as UTF-8 (StreamReader also honours a byte-order mark).
                string text;
                using (StreamReader utf8Reader = new StreamReader(new MemoryStream(buffer), Encoding.UTF8))
                {
                    text = utf8Reader.ReadToEnd();
                }

                if (IsMessyCode(text))
                {
                    // The UTF-8 result was reported as garbled, so decode the same bytes again
                    // with the system default encoding.
                    // NOTE(review): Encoding.Default is platform-dependent — confirm it is the
                    // intended fallback on the target runtime.
                    using (StreamReader fallbackReader = new StreamReader(new MemoryStream(buffer), Encoding.Default))
                    {
                        text = fallbackReader.ReadToEnd();
                    }
                }

                return text.Replace("\r\n", "<br>");
            }
            catch
            {
                // Best effort: any failure yields an empty result.
                return "";
            }
        }

        /// <summary>
        /// Determines whether the text looks garbled, i.e. whether its UTF-8 encoding contains
        /// the byte sequence EF BF BD (the encoding of the U+FFFD replacement character).
        /// </summary>
        /// <param name="txt">Text to inspect.</param>
        /// <returns>
        /// <c>true</c> if the sequence occurs anywhere in the text, including at the very end;
        /// <c>false</c> otherwise, and for null or empty input.
        /// </returns>
        private static bool IsMessyCode(string txt)
        {
            if (string.IsNullOrEmpty(txt))
            {
                return false;
            }

            byte[] bytes = Encoding.UTF8.GetBytes(txt);

            // i + 2 must stay inside the array so that the final three bytes are checked too.
            for (int i = 0; i + 2 < bytes.Length; i++)
            {
                if (bytes[i] == 0xef && bytes[i + 1] == 0xbf && bytes[i + 2] == 0xbd)
                {
                    return true;
                }
            }
            return false;
        }

  

 

转载于:https://www.cnblogs.com/lldbj/p/8434599.html

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值