C# 解析URL

        // Host name of the search server: SearchButton_Click resolves it via DNS
        // and also sends it as the HTTP Host header.
        string server = "www.google.com.hk";
        private void SearchButton_Click(object sender, EventArgs e)
        {           
            var ipas = Dns.GetHostAddresses(server);
            var response = "";
            using (var sock = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp))
            {
               
                sock.Connect(ipas[0], 80);

                try
                {
                    var keyword = Uri.EscapeDataString(this.KeywordTextBox.Text);
                    sock.Send(Encoding.Default.GetBytes(string.Format("GET /search?q={0}&ie=utf-8&oe=utf-8 HTTP/1.1\r\nHost: {1}\r\nConnection: Close\r\n\r\n", keyword, server)));

                    var buf = new byte[1024];
                    var readsize = 0;
                    do
                    {
                        readsize = sock.Receive(buf);
                        response += Encoding.UTF8.GetString(buf);
                    } while (readsize > 0);


                        Array.ForEach<string[]>((new HtmlParser()).Parse(response).ToArray(), (_) => { table.Rows.Add(_); });

                }
                finally
                {
                    sock.Disconnect(false);
                }
            }           

        }

 

 

  /// <summary>
    /// HTMLパーサ
    /// </summary>
    public class HtmlParser
    {
        /// <summary>
        /// パース実行
        /// </summary>
        /// <param name="source"></param>
        /// <returns></returns>
        public IEnumerable<string[]> Parse(string source)
        {
            // htmlヘッダ部分を除去
            var lines = source.Split(new string[] { "\r\n" }, StringSplitOptions.None).SkipWhile(s => !s.StartsWith("<"));
            var html = lines.ToArray()[0];

            // XDocumentに加工
            using (var sgmlReader = new SgmlReader { DocType = "HTML", CaseFolding = CaseFolding.ToLower })
            {
                sgmlReader.InputStream = new StringReader(html);
                var xml = XDocument.Load(sgmlReader);

                // 項目リンク部分を取り出し
                var query =
                    from ele in xml.Elements().Descendants()
                    where ele.Attribute("class") != null && ele.Attribute("class").Value == "g"
                    select ele.Element("h3");

                // リンクurl整形用
                var regex = new Regex(@"/url\?q=(.*)\&sa");

                foreach (var item in query)
                {
                    if (item != null)
                    {
                        var mc = regex.Matches(item.Element("a").Attribute("href").Value);
                        if (mc.Count > 0)
                        {
                            var url = mc[0].Groups[1].Value;
                            yield return new string[] { item.Element("a").Value, url };
                        }
                    }
                }
            }
        }
    }

 

参考

WEB页面抓取

http://www.cnblogs.com/lumnm/archive/2009/12/23/1630435.html

HTML解析

http://developer.51cto.com/art/200909/149097.htm


  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值