方法一:
// Method 1: download a page, decode the body as GB2312, strip the markup with
// StripHTML and print the remaining text.
// NOTE(review): on .NET Core / .NET 5+ the "gb2312" encoding is presumably only
// available after registering the code-pages encoding provider - confirm for the target runtime.
WebRequest request = WebRequest.Create("http://www.cftea.com/");

// The using blocks release the response and the reader even when reading or
// stripping throws; the original only closed them on the success path.
using (WebResponse response = request.GetResponse())
using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding("gb2312")))
{
    var contents = reader.ReadToEnd();
    Console.WriteLine(StripHTML(contents));
}

// Block until input is available so the console output can be read.
Console.Read();
方法二:(抓取html中table里面的数据)
// Method 2: run a regular expression over an HTML string and print every match.
// NOTE(review): both verbatim literals below contain nothing but a line break; the
// sample table markup and the tags inside the look-behind appear to have been lost
// when this snippet was extracted - restore them from the original article before use.
string html = @"
";
var strReg = @"(?is)(?<=
)";
// Holds the matched values; List<T> requires its type argument (there is no
// non-generic List), and Match.Value is a string.
List<string> result = new List<string>();
MatchCollection mc = Regex.Matches(html, strReg);
foreach (Match m in mc)
{
    //result.Add(m.Value);
    Console.WriteLine(m.Value);
}
方法三:
// Method 3: parse orgStr with HtmlAgilityPack and print the text of the second
// cell of every row of every table.
var htmlDocument = new HtmlDocument();
htmlDocument.LoadHtml(orgStr);

// SelectNodes returns null (not an empty collection) when nothing matches,
// so every result is checked before it is enumerated.
var tables = htmlDocument.DocumentNode.SelectNodes("//table");
if (tables != null)
{
    foreach (var table in tables)
    {
        // ".//tr" is evaluated relative to the current table. A leading "//"
        // restarts the search at the document root, which made the original
        // visit every row of the whole document once per table.
        var rows = table.SelectNodes(".//tr");
        if (rows == null)
        {
            continue;
        }

        foreach (var tr in rows)
        {
            // "./td" selects only this row's own cells; the original "//td"
            // always yielded the second cell of the entire document.
            var cells = tr.SelectNodes("./td");
            if (cells == null)
            {
                continue;
            }

            // Rows with fewer than two cells (e.g. header rows) are skipped
            // instead of dereferencing a null node.
            var secondCell = cells.Skip(1).FirstOrDefault();
            if (secondCell == null)
            {
                continue;
            }

            var collegeName = secondCell.InnerText;
            Console.WriteLine(collegeName);
        }
    }
}
相关的网址:
方法四:(对网页中table里面的数据提取)
#region http://www.gaokao.com/e/20120109/4f0a8e1773aa0.shtml
// Disabled sample (kept commented out). When enabled it fetches one of the pages
// listed below as gb2312 text, selects every row matched by "//table//tr", reads
// the first five cells of each row by position and adds one RankingDescription
// per row, saving after each insert.
// NOTE(review): seq is re-read from Count() at the top of every iteration, so the
// trailing seq++ has no effect.
// NOTE(review): the catch block only copies ex.Message into an unused local, so
// conversion or save failures are silently ignored.
// NOTE(review): "/r/n" in the commented Response.Write is presumably meant to be "\r\n" - confirm.
//var url = "http://www.gaokao.com/e/20120109/4f0a8e1773aa0.shtml";
//var url = "http://www.gaokao.com/e/20120109/4f0a8e1773aa0_2.shtml";
//var url = "http://www.gaokao.com/e/20120109/4f0a8e1773aa0_3.shtml";
//var url = "http://www.gaokao.com/e/20120109/4f0a8e1773aa0_4.shtml";
//var url = "http://www.gaokao.com/e/20120109/4f0a8e1773aa0_5.shtml";
//var url = "http://www.gaokao.com/e/20120109/4f0a8e1773aa0_6.shtml";
//var orgStr = ChinaEduSp.Crawl.HttpUtility.GetContentByUrl(url, "gb2312");
//var htmlDocument = new HtmlDocument();
//htmlDocument.LoadHtml(orgStr);
//var rows = htmlDocument.DocumentNode.SelectNodes("//table//tr");
//foreach (var item in rows)
//{
// var pos = item.SelectSingleNode("td[1]").InnerText;
// var school = item.SelectSingleNode("td[2]").InnerText;
// var province = item.SelectSingleNode("td[3]").InnerText;
// var type = item.SelectSingleNode("td[4]").InnerText;
// var totalScore = item.SelectSingleNode("td[5]").InnerText;
// var seq = db.RankingDescriptions.Count();
// //Response.Write("名次:" + pos + " 学校名称:" + school + " 所在省份:" + province + " 类型:" + type + " 总分:" + totalScore + "/r/n");
// //Response.Write("名次:" + pos + " 学校名称:" + school);
// try
// {
// db.RankingDescriptions.Add(new RankingDescription
// {
// POS = Convert.ToInt32(pos),
// SchoolName = school,
// Province = province,
// Area = province,
// Type = type,
// TotalScore = totalScore,
// IsShow = true,
// IsDelete = false,
// RankId = 0,
// Seq = seq
// });
// db.SaveChanges();
// seq++;
// }
// catch (Exception ex)
// {
// string msg = ex.Message;
// }
//}
#endregion
#region http://www.gaokao.com/e/20120109/4f0a914934baa_2.shtml
// Disabled sample (kept commented out). Line breaks restored: with everything on
// one line the #region directive consumed the rest of the line, including the
// closing #endregion, leaving the region unterminated.
// When enabled it fetches the page as gb2312 text, selects the rows of the nested
// table via "//table//tr//td//table//tr", reads the first four cells of each row
// by position and adds one RankingDescription per row.
//var url = "http://www.gaokao.com/e/20120109/4f0a914934baa_2.shtml";
//var orgStr = ChinaEduSp.Crawl.HttpUtility.GetContentByUrl(url, "gb2312");
//var htmlDocument = new HtmlDocument();
//htmlDocument.LoadHtml(orgStr);
//var rows = htmlDocument.DocumentNode.SelectNodes("//table//tr//td//table//tr");
//foreach (var item in rows)
//{
//    var pos = item.SelectSingleNode("td[1]").InnerText;
//    var school = item.SelectSingleNode("td[2]").InnerText;
//    var province = item.SelectSingleNode("td[3]").InnerText;
//    var totalScore = item.SelectSingleNode("td[4]").InnerText;
//    var seq = db.RankingDescriptions.Count();
//    //Response.Write("名次:" + pos + " 学校名称:" + school + " 所在省份:" + province + " 总分:" + totalScore + "/r/n");
//    //Response.Write("名次:" + pos + " 学校名称:" + school);
//    try
//    {
//        db.RankingDescriptions.Add(new RankingDescription
//        {
//            POS = Convert.ToInt32(pos),
//            SchoolName = school,
//            Province = province,
//            Area = province,
//            TotalScore = totalScore,
//            IsShow = true,
//            IsDelete = false,
//            RankId = 0,
//            Seq = seq
//        });
//        db.SaveChanges();
//        seq++;
//    }
//    catch (Exception ex)
//    {
//        string msg = ex.Message;
//    }
//}
#endregion