// Host name of the search server. SearchButton_Click resolves it through DNS
// and also sends it as the value of the HTTP "Host" request header.
string server = "www.google.com.hk";
/// <summary>
/// Handles the search button click: sends an HTTP GET search request for the
/// text in <c>KeywordTextBox</c> to <c>server</c> over a raw TCP socket, reads
/// the response until the peer closes the connection, and adds every row
/// produced by <see cref="HtmlParser"/> to <c>table</c>.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void SearchButton_Click(object sender, EventArgs e)
{
    // The socket below is IPv4-only, so pick an IPv4 address explicitly instead of
    // taking whatever DNS happens to return first (which may be an IPv6 address).
    var ipas = Dns.GetHostAddresses(server);
    var address = Array.Find(ipas, a => a.AddressFamily == AddressFamily.InterNetwork);
    if (address == null)
    {
        throw new SocketException((int)SocketError.AddressFamilyNotSupported);
    }

    var response = new StringBuilder();
    using (var sock = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp))
    {
        sock.Connect(address, 80);
        try
        {
            var keyword = Uri.EscapeDataString(this.KeywordTextBox.Text);
            var request = string.Format("GET /search?q={0}&ie=utf-8&oe=utf-8 HTTP/1.1\r\nHost: {1}\r\nConnection: Close\r\n\r\n", keyword, server);
            // The keyword is percent-escaped, so the request line and headers are plain
            // ASCII; do not depend on the machine's default code page.
            sock.Send(Encoding.ASCII.GetBytes(request));

            var buf = new byte[1024];
            // A stateful decoder keeps multi-byte UTF-8 sequences intact even when they
            // are split across two Receive calls.
            var decoder = Encoding.UTF8.GetDecoder();
            var chars = new char[Encoding.UTF8.GetMaxCharCount(buf.Length)];
            int readsize;
            while ((readsize = sock.Receive(buf)) > 0)
            {
                // Decode only the bytes actually received; the rest of the buffer holds
                // stale data from earlier reads.
                var charCount = decoder.GetChars(buf, 0, readsize, chars, 0);
                response.Append(chars, 0, charCount);
            }
        }
        finally
        {
            // Guarded so that a failure here cannot mask an exception from the try block.
            if (sock.Connected)
            {
                sock.Disconnect(false);
            }
        }
    }

    // Parse completely before touching the table so that a parse failure leaves the
    // table unchanged; the socket is already released at this point.
    foreach (var row in new HtmlParser().Parse(response.ToString()).ToArray())
    {
        table.Rows.Add(row);
    }
}
/// <summary>
/// HTML parser that extracts link titles and target URLs from a raw
/// search response.
/// </summary>
public class HtmlParser
{
    // Used to reshape link URLs: captures the target from hrefs of the form
    // "/url?q=<target>&sa...". Built once instead of on every Parse call.
    private static readonly Regex LinkUrlRegex = new Regex(@"/url\?q=(.*)\&sa");

    /// <summary>
    /// Runs the parse.
    /// </summary>
    /// <param name="source">
    /// Raw response text. It is split on CRLF and only the first line that starts
    /// with '&lt;' is parsed; everything before it (the header part) is dropped.
    /// </param>
    /// <returns>
    /// A lazily evaluated sequence of two-element arrays: the link text followed by
    /// the URL captured from the link's href. The sequence is empty when
    /// <paramref name="source"/> is null or empty or contains no line starting
    /// with '&lt;'.
    /// </returns>
    public IEnumerable<string[]> Parse(string source)
    {
        if (string.IsNullOrEmpty(source))
        {
            yield break;
        }

        // Remove the header part: keep the first line that looks like markup.
        // NOTE(review): only this single CRLF-delimited line is parsed, exactly as
        // before; presumably the markup arrives on one such line - confirm.
        var html = source
            .Split(new string[] { "\r\n" }, StringSplitOptions.None)
            .FirstOrDefault(s => s.StartsWith("<", StringComparison.Ordinal));
        if (html == null)
        {
            // No markup at all (e.g. a header-only or error response).
            yield break;
        }

        // Convert to an XDocument.
        using (var sgmlReader = new SgmlReader { DocType = "HTML", CaseFolding = CaseFolding.ToLower })
        {
            sgmlReader.InputStream = new StringReader(html);
            var xml = XDocument.Load(sgmlReader);

            // Pick the <h3> child of every element whose class attribute is exactly "g".
            var query =
                from ele in xml.Elements().Descendants()
                let cls = ele.Attribute("class")
                where cls != null && cls.Value == "g"
                select ele.Element("h3");

            foreach (var item in query)
            {
                if (item == null)
                {
                    continue;
                }

                // Skip headings that have no link, or a link without an href,
                // instead of failing with a NullReferenceException.
                var anchor = item.Element("a");
                if (anchor == null)
                {
                    continue;
                }

                var href = anchor.Attribute("href");
                if (href == null)
                {
                    continue;
                }

                var match = LinkUrlRegex.Match(href.Value);
                if (match.Success)
                {
                    yield return new string[] { anchor.Value, match.Groups[1].Value };
                }
            }
        }
    }
}
参考
WEB页面抓取
http://www.cnblogs.com/lumnm/archive/2009/12/23/1630435.html
HTML解析
http://developer.51cto.com/art/200909/149097.htm