<div class="m-repbox"><!--/html/body/div-->
<div class="m-repbody firstPage"><!--/html/body/div/div-->
<div class="t1">基本信息</div>
<div class="g-tt-h3 f-tleft f-mgtop">基本概况信息</div><!--/html/body/div/div[1]/div[2]-->
<table class="g-tab-bor f-tab-nomargin">
<tr>
<th class="g-w-4">经济类型</th>
<td class="g-w-4 ">股份有限(公司)</td>
<th class="g-w-4">组织机构类型</th>
<td class="g-w-4 ">企业</td>
</tr>
<tr>
<th>企业规模</th>
<td class="">微型企业</td>
<th>所属行业</th>
<td class="">建材批发</td>
</tr>
</table>
<div class="g-tt-h3 f-tleft f-mgtop">实际控制人</div><!--/html/body/div/div[1]/div[3]-->
<table class="g-tab-bor f-tab-nomargin">
<tr>
<th class="g-w-4">名称</th>
<th class="g-w-4">身份标识类型</th>
<th class="g-w-4">身份标识号码</th>
<th class="g-w-4">更新日期</th>
</tr>
<tbody class="">
<tr>
<td>控制人</td>
<td class="g-w-4">身份证</td>
<td class="g-w-4">*******************</td>
<td class="g-w-4">2017-03-01</td>
</tr>
</tbody>
<tbody class="">
<tr>
<td>控制人二二二二二</td>
<td class="g-w-4">组织机构代码</td>
<td class="g-w-4">***********</td>
<td class="g-w-4">2017-03-01</td>
</tr>
</tbody>
</table>
</div>
</div>
NuGet 引入 HtmlAgilityPack 包
// Parsed document that the lookup helpers query; replaced by each successful LoadHtml call.
HtmlDocument htmlDoc;

/// <summary>
/// Parses the given HTML page source and makes it the document queried by the lookup helpers.
/// </summary>
/// <param name="htmlSource">Raw HTML markup of the page to load.</param>
public void LoadHtml(string htmlSource)
{
    // Parse into a local first so that a failure while loading does not
    // replace a previously loaded document with an empty one.
    var document = new HtmlDocument();
    document.LoadHtml(htmlSource);
    htmlDoc = document;
}
/// <summary>
/// Finds the first node selected by <paramref name="xPath"/> whose inner text matches
/// <paramref name="keyword"/> and reports its 1-based position within the selection.
/// </summary>
/// <param name="xPath">XPath query evaluated against the loaded document.</param>
/// <param name="keyword">Pattern passed to <c>Regex.IsMatch</c>, so it is interpreted as a regular expression.</param>
/// <returns>
/// The 1-based position of the first matching node, or <c>int.MinValue</c> when the query
/// selects nothing or no selected node matches.
/// </returns>
public int GetNodeIndexByKeyword(string xPath, string keyword)
{
    var candidates = htmlDoc.DocumentNode.SelectNodes(xPath);
    if (candidates == null)
    {
        return int.MinValue;
    }

    // Positions are 1-based so the result can be used directly as an XPath predicate.
    var position = 0;
    foreach (var candidate in candidates)
    {
        position++;
        if (Regex.IsMatch(candidate.InnerText, keyword))
        {
            return position;
        }
    }

    return int.MinValue;
}
/// <summary>
/// Starting from the <paramref name="divIndex"/>-th node of <paramref name="divPath"/>, finds the
/// first following sibling table, then that table's nearest preceding sibling div, and returns
/// the numeric index found in the last segment of that div's XPath.
/// </summary>
/// <remarks>
/// Example of the query that gets built:
/// <c>/html/body/div/div[4]/div[2]/following-sibling::table[1]/preceding-sibling::div[1]</c>.
/// </remarks>
/// <param name="divPath">XPath of the div set, without a positional predicate.</param>
/// <param name="divIndex">Position inserted as the predicate after <paramref name="divPath"/>.</param>
/// <returns>
/// The index parsed from the selected div's XPath, or <c>int.MinValue</c> when the query selects
/// nothing or no index can be parsed.
/// </returns>
public int GetNodeIndex(string divPath, int divIndex)
{
    var index = int.MinValue;
    var tableXPath = string.Format("{0}[{1}]/following-sibling::table[1]/preceding-sibling::div[1]", divPath, divIndex);
    var nodes = htmlDoc.DocumentNode.SelectNodes(tableXPath);
    if (nodes == null)
    {
        return index;
    }

    foreach (var node in nodes)
    {
        // Only the last path segment matters; it is expected to look like "div[2]".
        var lastSegment = node.XPath.Substring(node.XPath.LastIndexOf('/') + 1);

        // The static Regex.Match reuses the framework's pattern cache instead of
        // constructing a new Regex instance on every iteration.
        var bracketed = Regex.Match(lastSegment, @"(?i)(?<=\[)(.*)(?=\])");

        // Keep the "not found" sentinel when the segment carries no numeric index,
        // rather than silently reporting 0.
        if (bracketed.Success && int.TryParse(bracketed.Value, out int parsed))
        {
            index = parsed;
        }
    }

    return index;
}
// Step 1: find which top-level section div matches the section title.
var xPath = "/html/body/div/div";
var keyword = "基本信息";
var divIndex = GetNodeIndexByKeyword(xPath, keyword);

// Step 2: inside that section, find which child div matches the sub-heading.
xPath = string.Format("/html/body/div/div[{0}]/div", divIndex);//"/html/body/div/div[4]/div"
keyword = "基本概况信息";
var divIndex2 = htmlDocument.GetNodeIndexByKeyword(xPath, keyword);//2

// Step 3: take the table that follows the sub-heading and get the index of the div
// immediately preceding it.
var precedingSiblingIndeox = GetNodeIndex(xPath, divIndex2);

// The table belongs to the sub-heading only when that preceding div is the sub-heading itself.
// Both values are positions among the section's child divs, so the comparison must use
// divIndex2, not the outer section index divIndex.
var eq = divIndex2 == precedingSiblingIndeox;