以前格式化网页或提取信息时，一般都用正则表达式；昨天由于用正则处理网页信息太麻烦而且容易出错，所以改用 WebBrowser 控件，再利用 GetAttribute 来实现信息的格式化。
// Collect every <table> element of the loaded document. The two tables that
// are read further down are picked out by their presentation attributes.
// NOTE(review): assumes webBrowser.Document is non-null (page fully loaded) — confirm.
HtmlElementCollection tables = webBrowser.Document.GetElementsByTagName("table");
try
{
// Last table with className "text-f12", width "100%" and cellspacing "2".
HtmlElement table1 = null;
// Last table with width "100%", cellpadding "4" and cellspacing "1".
HtmlElement table2 = null;
for (int x = 0; x < tables.Count; x++)
{
    try
    {
        // Fetch the element once instead of re-indexing the collection for
        // every attribute test.
        HtmlElement candidate = tables[x];

        if (candidate.GetAttribute("className") == "text-f12"
            && candidate.GetAttribute("width") == "100%"
            && candidate.GetAttribute("cellspacing") == "2")
        {
            table1 = candidate;
        }

        // The two tests are independent: one table may satisfy both, and a
        // later match overwrites an earlier one.
        if (candidate.GetAttribute("width") == "100%"
            && candidate.GetAttribute("cellpadding") == "4"
            && candidate.GetAttribute("cellspacing") == "1")
        {
            table2 = candidate;
        }
    }
    catch (System.Exception ex)
    {
        // Best effort: a table whose attributes cannot be read is skipped,
        // but the failure is reported to the debug output instead of
        // vanishing without a trace.
        System.Diagnostics.Debug.WriteLine("Skipping table " + x + ": " + ex.Message);
    }
}
// NOTE(review): table1 / table2 are still null here when no table matched;
// the code that follows dereferences both without a check.
// Output record. Slots 0-13 line up index-for-index with the labels in
// sArray2; slot 14 is one extra slot beyond the labels.
string[] sArray = new string[15];

// Keywords searched for in the label cell of a table row. A row whose label
// contains sArray2[i] supplies the value stored in sArray[i].
// NOTE(review): "Products" is listed at both index 2 and index 6, so the two
// slots cannot be told apart by these keywords — confirm the intended labels.
string[] sArray2 =
{
    "Business",  // 0
    "Year",      // 1
    "Products",  // 2
    "Website",   // 3
    "Bankers",   // 4
    "Standard",  // 5
    "Products",  // 6
    "Services",  // 7
    "Company",   // 8
    "Address",   // 9
    "Phone",     // 10
    "Fax",       // 11
    "Contact",   // 12
    "Mobile"     // 13
};
// Fill sArray[0..7] from table1: in every three-cell row, the first cell is
// the label and the third cell is the value.
// NOTE(review): table1 is null when no table matched the lookup above, in
// which case the next line throws NullReferenceException.
// The row list is queried once; the original re-ran GetElementsByTagName
// several times per iteration.
HtmlElementCollection profileRows = table1.GetElementsByTagName("tr");
for (int i = 1; i < profileRows.Count; i++) // the first row (index 0) is never inspected
{
    HtmlElementCollection rowCells = profileRows[i].GetElementsByTagName("td");

    // Check the shape before indexing: a row without any <td> would make
    // rowCells[0] throw, and only three-cell rows carry data anyway.
    if (rowCells.Count != 3)
    {
        continue;
    }

    // InnerHtml may be null; treat that as an empty label.
    string rowLabel = rowCells[0].InnerHtml ?? string.Empty;
    string cellHtml = rowCells[2].InnerHtml;

    for (int j = 0; j < 8; j++)
    {
        // Ordinal substring match. There is no break, so one row can fill
        // several slots when its label contains more than one keyword.
        if (rowLabel.Contains(sArray2[j]))
        {
            sArray[j] = cellHtml;
        }
    }
}
// Fill sArray[8..13] from table2, then complete and submit the record.
// NOTE(review): table2 is null when no table matched the lookup above, in
// which case the next line throws NullReferenceException.
// The row list is queried once; the original re-ran GetElementsByTagName
// several times per iteration.
HtmlElementCollection contactRows = table2.GetElementsByTagName("tr");
for (int m = 0; m < contactRows.Count - 1; m++) // the last row is never inspected
{
    HtmlElementCollection rowCells = contactRows[m].GetElementsByTagName("td");

    if (m == 0)
    {
        // The third cell of the first row goes to slot 8 unconditionally.
        // Left unguarded on purpose: a first row with fewer than three cells
        // still aborts the extraction, as it did before.
        sArray[8] = rowCells[2].InnerHtml;
    }

    // Only two-cell rows are label/value pairs. Checking the count first also
    // avoids indexing into a row that has no <td> at all.
    if (rowCells.Count != 2)
    {
        continue;
    }

    // InnerHtml may be null; treat that as an empty label.
    string rowLabel = rowCells[0].InnerHtml ?? string.Empty;
    string cellHtml = rowCells[1].InnerHtml;

    // Run through the last label. The previous bound (j < 13) stopped one
    // short, so "Mobile" (index 13) was never matched and sArray[13] stayed
    // empty.
    for (int j = 8; j < sArray2.Length; j++)
    {
        // Ordinal substring match; a row whose label contains "Company"
        // overwrites the slot-8 value taken from the first row.
        if (rowLabel.Contains(sArray2[j]))
        {
            sArray[j] = cellHtml;
        }
    }
}

// The final slot carries id (declared outside this fragment); the finished
// record is then handed to AddComInfo.
sArray[14] = id;
AddComInfo(sArray);