/// <summary>
/// Downloads a web page and collects selected cell values from the rows of its HTML table(s).
/// </summary>
/// <remarks>
/// The page is fetched synchronously and decoded as UTF-8. Two kinds of cell are collected
/// into a local list: cells whose text contains "本期" (with their first six characters
/// removed), and, within a row, the cell equal to "秦皇岛-广州(6-7万DWT)" together with every
/// cell that follows it in the same row. The list is not returned or persisted here; the
/// method only marks the place where follow-up processing belongs. Exceptions raised by the
/// request are not caught and propagate to the caller.
/// </remarks>
public void GetDataFromNet()
{
    // Address of the page to scrape.
    string url = "http://www.sse.net.cn/index/singleIndex?indexType=cbcfi";

    // Number of leading characters removed from a cell that contains "本期".
    // NOTE(review): presumably the length of a fixed label prefix - confirm against the live page.
    const int leadingCharsToSkip = 6;

    // Matches each <tr>...</tr> that is preceded by an opening <table> tag (case-insensitive, dot matches newline).
    Regex rowRegex = new Regex("(?is)(?<=<table[^>]*?[^>]*?>(?:(?!</?table).)*)(?is)<tr[^>]*?[^>]*?>(?:\\s*<td[^>]*>(.*?)</td>)*\\s*</tr>");

    // Captures the inner content of each <td> into the "value" group.
    // Built once here rather than once per row, since the pattern never changes.
    Regex cellRegex = new Regex(@"<td.*?>(?<value>.*?)</td>");

    WebRequest request = WebRequest.Create(url);
    // For a POST request, set request.Method = "POST" at this point.

    // Read the whole response body as text. The using statements guarantee that the
    // response, stream and reader are released even if reading throws.
    string strHTML;
    using (WebResponse response = request.GetResponse())
    using (Stream dataStream = response.GetResponseStream())
    using (StreamReader reader = new StreamReader(dataStream, Encoding.UTF8))
    {
        strHTML = reader.ReadToEnd();
    }

    // If the endpoint returned JSON instead of HTML, the text could be deserialized
    // directly (for example with JsonConvert.DeserializeObject) instead of being parsed below.

    var list = new List<string>();

    foreach (Match row in rowRegex.Matches(strHTML))
    {
        // Becomes true once the route label cell has been seen in this row;
        // from that cell onwards every cell of the row is collected.
        bool routeFound = false;

        // row.Value is the HTML of one table row; iterate over its cells.
        foreach (Match cell in cellRegex.Matches(row.Value))
        {
            string val = cell.Groups["value"].Value;

            if (val.IndexOf("本期", StringComparison.Ordinal) >= 0)
            {
                // Guard the Substring call: a cell shorter than the skipped prefix
                // would otherwise throw ArgumentOutOfRangeException. Such cells are skipped.
                if (val.Length >= leadingCharsToSkip)
                {
                    list.Add(val.Substring(leadingCharsToSkip));
                }
            }

            if (val == "秦皇岛-广州(6-7万DWT)")
            {
                routeFound = true;
            }

            if (routeFound)
            {
                list.Add(val);
            }

            // Follow-up processing (e.g. saving to a database) goes here.
        }
    }
}