BeautifulSoup解析HTML

  • 需要解析HTML源码里面的内容,包含特定标签和属性
<div class="file-source">
  <table>
    <tr>
      <th align="right">Line</th>
      <th align="right">Branch</th>
      <th align="right">Exec</th>
      <th align="left">Source</th>
    </tr>
    <tr class="source-line">
      <td class="lineno"><a id="l1" href="#l1">1</a></td>
      <td class="linebranch">
      </td>
      <td class="linecount "></td>
      <td class="src "><span class="c1">// RunAdder.cpp</span></td>
    </tr>
    <tr class="source-line">
      <td class="lineno"><a id="l2" href="#l2">2</a></td>
      <td class="linebranch">
      </td>
      <td class="linecount "></td>
      <td class="src "></td>
    </tr>
    <tr class="source-line">
      <td class="lineno"><a id="l3" href="#l3">3</a></td>
      <td class="linebranch">
      </td>
      <td class="linecount "></td>
      <td class="src "><span class="cp">#include</span><span class="w"> </span><span class="cpf">&lt;iostream&gt;</span></td>
    </tr>
    <tr class="source-line">
      <td class="lineno"><a id="l4" href="#l4">4</a></td>
      <td class="linebranch">
      </td>
      <td class="linecount "></td>
      <td class="src "><span class="cp">#include</span><span class="w"> </span><span class="cpf">&lt;unistd.h&gt;</span></td>
    </tr>
    <tr class="source-line">
      <td class="lineno"><a id="l5" href="#l5">5</a></td>
      <td class="linebranch">
      </td>
      <td class="linecount "></td>
      <td class="src "></td>
    </tr>
    <tr class="source-line">
      <td class="lineno"><a id="l6" href="#l6">6</a></td>
      <td class="linebranch">
      </td>
      <td class="linecount "></td>
      <td class="src "><span class="cp">#ifdef GCOV</span></td>
    </tr>
    <tr class="source-line">
      <td class="lineno"><a id="l7" href="#l7">7</a></td>
      <td class="linebranch">
      </td>
      <td class="linecount "></td>
      <td class="src "><span class="k">extern</span><span class="w"> </span><span class="s">&quot;C&quot;</span><span class="w"> </span><span class="kt">void</span><span class="w"> </span><span class="n">__gcov_flush</span><span class="p">();</span></td>
    </tr>
    <tr class="source-line">
      <td class="lineno"><a id="l8" href="#l8">8</a></td>
      <td class="linebranch">
      </td>
      <td class="linecount "></td>
      <td class="src "><span class="k">extern</span><span class="w"> </span><span class="s">&quot;C&quot;</span><span class="w"> </span><span class="kt">void</span><span class="w"> </span><span class="n">__gcov_dump</span><span class="p">();</span></td>
    </tr>
    <tr class="source-line">
      <td class="lineno"><a id="l9" href="#l9">9</a></td>
      <td class="linebranch">
      </td>
      <td class="linecount "></td>
      <td class="src "><span class="k">extern</span><span class="w"> </span><span class="s">&quot;C&quot;</span><span class="w"> </span><span class="kt">void</span><span class="w"> </span><span class="n">__gcov_reset</span><span class="p">();</span></td>
    </tr>
    <tr class="source-line">
      <td class="lineno"><a id="l10" href="#l10">10</a></td>
      <td class="linebranch">
      </td>
      <td class="linecount "></td>
      <td class="src "><span class="cp">#endif </span><span class="c1">// TESTPLUS_GCOV</span></td>
    </tr>
    <tr class="source-line">
      <td class="lineno"><a id="l11" href="#l11">11</a></td>
      <td class="linebranch">
      </td>
      <td class="linecount "></td>
      <td class="src "></td>
    </tr>
    <tr class="source-line">
      <td class="lineno"><a id="l12" href="#l12">12</a></td>
      <td class="linebranch">
      </td>
      <td class="linecount "></td>
      <td class="src "><span class="cp">#include</span><span class="w"> </span><span class="cpf">&quot;./include/Adder.h&quot;</span></td>
    </tr>
    <tr class="source-line">
      <td class="lineno"><a id="l13" href="#l13">13</a></td>
      <td class="linebranch">
      </td>
      <td class="linecount "></td>
      <td class="src "></td>
    </tr>
    <tr class="source-line">
      <td class="lineno"><a id="l14" href="#l14">14</a></td>
      <td class="linebranch">
      </td>
      <td class="linecount coveredLine">1</td>
      <td class="src coveredLine"><span class="kt">int</span><span class="w"> </span><span class="nf">main</span><span class="p">()</span><span class="w"> </span><span class="p">{</span></td>
    </tr>
    <tr class="source-line">
      <td class="lineno"><a id="l15" href="#l15">15</a></td>
      <td class="linebranch">
      </td>
      <td class="linecount "></td>
      <td class="src "></td>
    </tr>
    <tr class="source-line">
      <td class="lineno"><a id="l16" href="#l16">16</a></td>
      <td class="linebranch">
      </td>
      <td class="linecount "></td>
      <td class="src "><span class="w">    </span><span class="k">while</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span></td>
    </tr>
    <tr class="source-line">
      <td class="lineno"><a id="l17" href="#l17">17</a></td>
      <td class="linebranch">
      </td>
      <td class="linecount "></td>
      <td class="src "><span class="w">    </span><span class="p">{</span></td>
    </tr>
    <tr class="source-line">
      <td class="lineno"><a id="l18" href="#l18">18</a></td>
      <td class="linebranch">
      </td>
      <td class="linecount "></td>
      <td class="src "></td>
    </tr>
    <tr class="source-line">
      <td class="lineno"><a id="l19" href="#l19">19</a></td>
      <td class="linebranch">
        <details class="linebranchDetails">
        <summary class="linebranchSummary">1/2</summary>
        <div class="linebranchContents">
          <div class="takenBranch">&check; Branch 1 taken 1 times.</div>
          <div class="notTakenBranch">&cross; Branch 2 not taken.</div>
        </div>
        </details>
      </td>
      <td class="linecount partialCoveredLine">1</td>
      <td class="src partialCoveredLine"><span class="w">       </span><span class="n">printf</span><span class="p">(</span><span class="s">&quot;dump</span><span class="se">\n</span><span class="s">&quot;</span><span class="p">);</span></td>
    </tr>
    <tr class="source-line">
      <td class="lineno"><a id="l20" href="#l20">20</a></td>
      <td class="linebranch">
      </td>
      <td class="linecount "></td>
      <td class="src "></td>
    </tr>
    <tr class="source-line">
      <td class="lineno"><a id="l21" href="#l21">21</a></td>
      <td class="linebranch">
        <details class="linebranchDetails">
        <summary class="linebranchSummary">1/2</summary>
        <div class="linebranchContents">
          <div class="takenBranch">&check; Branch 1 taken 1 times.</div>
          <div class="notTakenBranch">&cross; Branch 2 not taken.</div>
        </div>
        </details>
      </td>
      <td class="linecount partialCoveredLine">1</td>
      <td class="src partialCoveredLine"><span class="w">        </span><span class="n">Adder</span><span class="w"> </span><span class="n">adder</span><span class="p">;</span></td>
    </tr>
    <tr class="source-line">
      <td class="lineno"><a id="l22" href="#l22">22</a></td>
      <td class="linebranch">
        <details class="linebranchDetails">
        <summary class="linebranchSummary">1/2</summary>
        <div class="linebranchContents">
          <div class="takenBranch">&check; Branch 1 taken 1 times.</div>
          <div class="notTakenBranch">&cross; Branch 2 not taken.</div>
        </div>
        </details>
      </td>
      <td class="linecount partialCoveredLine">1</td>
      <td class="src partialCoveredLine"><span class="w">        </span><span class="n">adder</span><span class="p">.</span><span class="n">print_value</span><span class="p">(</span><span class="n">std</span><span class="o">::</span><span class="n">cout</span><span class="p">);</span></td>
    </tr>
    <tr class="source-line">
      <td class="lineno"><a id="l23" href="#l23">23</a></td>
      <td class="linebranch">
      </td>
      <td class="linecount "></td>
      <td class="src "></td>
    </tr>
    <tr class="source-line">
      <td class="lineno"><a id="l24" href="#l24">24</a></td>
      <td class="linebranch">
        <details class="linebranchDetails">
        <summary class="linebranchSummary">1/2</summary>
        <div class="linebranchContents">
          <div class="takenBranch">&check; Branch 1 taken 1 times.</div>
          <div class="notTakenBranch">&cross; Branch 2 not taken.</div>
        </div>
        </details>
      </td>
      <td class="linecount partialCoveredLine">1</td>
      <td class="src partialCoveredLine"><span class="w">        </span><span class="n">adder</span><span class="p">.</span><span class="n">add</span><span class="p">(</span><span class="mi">5</span><span class="p">);</span></td>
    </tr>
    <tr class="source-line">
      <td class="lineno"><a id="l25" href="#l25">25</a></td>
      <td class="linebranch">
        <details class="linebranchDetails">
        <summary class="linebranchSummary">1/2</summary>
        <div class="linebranchContents">
          <div class="takenBranch">&check; Branch 1 taken 1 times.</div>
          <div class="notTakenBranch">&cross; Branch 2 not taken.</div>
        </div>
        </details>
      </td>
      <td class="linecount partialCoveredLine">1</td>
      <td class="src partialCoveredLine"><span class="w">        </span><span class="n">adder</span><span class="p">.</span><span class="n">print_value</span><span class="p">(</span><span class="n">std</span><span class="o">::</span><span class="n">cout</span><span class="p">);</span></td>
    </tr>
    <tr class="source-line">
      <td class="lineno"><a id="l26" href="#l26">26</a></td>
      <td class="linebranch">
      </td>
      <td class="linecount "></td>
      <td class="src "></td>
    </tr>
    <tr class="source-line">
      <td class="lineno"><a id="l27" href="#l27">27</a></td>
      <td class="linebranch">
        <details class="linebranchDetails">
        <summary class="linebranchSummary">1/2</summary>
        <div class="linebranchContents">
          <div class="takenBranch">&check; Branch 1 taken 1 times.</div>
          <div class="notTakenBranch">&cross; Branch 2 not taken.</div>
        </div>
        </details>
      </td>
      <td class="linecount partialCoveredLine">1</td>
      <td class="src partialCoveredLine"><span class="w">        </span><span class="n">adder</span><span class="p">.</span><span class="n">add</span><span class="p">(</span><span class="mi">5</span><span class="p">);</span></td>
    </tr>
    <tr class="source-line">
      <td class="lineno"><a id="l28" href="#l28">28</a></td>
      <td class="linebranch">
        <details class="linebranchDetails">
        <summary class="linebranchSummary">1/2</summary>
        <div class="linebranchContents">
          <div class="takenBranch">&check; Branch 1 taken 1 times.</div>
          <div class="notTakenBranch">&cross; Branch 2 not taken.</div>
        </div>
        </details>
      </td>
      <td class="linecount partialCoveredLine">1</td>
      <td class="src partialCoveredLine"><span class="w">        </span><span class="n">adder</span><span class="p">.</span><span class="n">print_value</span><span class="p">(</span><span class="n">std</span><span class="o">::</span><span class="n">cout</span><span class="p">);</span></td>
    </tr>
    <tr class="source-line">
      <td class="lineno"><a id="l29" href="#l29">29</a></td>
      <td class="linebranch">
      </td>
      <td class="linecount "></td>
      <td class="src "><span class="w">    </span></td>
    </tr>
    <tr class="source-line">
      <td class="lineno"><a id="l30" href="#l30">30</a></td>
      <td class="linebranch">
      </td>
      <td class="linecount coveredLine">1</td>
      <td class="src coveredLine"><span class="w">        </span><span class="n">adder</span><span class="p">.</span><span class="n">clear</span><span class="p">();</span></td>
    </tr>
    <tr class="source-line">
      <td class="lineno"><a id="l31" href="#l31">31</a></td>
      <td class="linebranch">
      </td>
      <td class="linecount "></td>
      <td class="src "></td>
    </tr>
    <tr class="source-line">
      <td class="lineno"><a id="l32" href="#l32">32</a></td>
      <td class="linebranch">
      </td>
      <td class="linecount "></td>
      <td class="src "><span class="c1">// reference https://github.com/mongodb/mongo/blob/3adaac9be00bb0800184e0c27503c3d29c8e577e/src/mongo/util/quick_exit.cpp#L72</span></td>
    </tr>
    <tr class="source-line">
      <td class="lineno"><a id="l33" href="#l33">33</a></td>
      <td class="linebranch">
      </td>
      <td class="linecount "></td>
      <td class="src "><span class="c1">// exit </span></td>
    </tr>
    <tr class="source-line">
      <td class="lineno"><a id="l34" href="#l34">34</a></td>
      <td class="linebranch">
      </td>
      <td class="linecount "></td>
      <td class="src "><span class="cp">#ifdef TESTPLUS_GCOV</span></td>
    </tr>
    <tr class="source-line">
      <td class="lineno"><a id="l35" href="#l35">35</a></td>
      <td class="linebranch">
      </td>
      <td class="linecount "></td>
      <td class="src "><span class="cp">#if (defined(__clang__) &amp;&amp; __clang_major__ &gt;= 12) || __GNUC__ &gt;= 11</span></td>
    </tr>
    <tr class="source-line">
      <td class="lineno"><a id="l36" href="#l36">36</a></td>
      <td class="linebranch">
        <details class="linebranchDetails">
        <summary class="linebranchSummary">0/2</summary>
        <div class="linebranchContents">
          <div class="notTakenBranch">&cross; Branch 1 not taken.</div>
          <div class="notTakenBranch">&cross; Branch 2 not taken.</div>
        </div>
        </details>
      </td>
      <td class="linecount partialCoveredLine">1</td>
      <td class="src partialCoveredLine"><span class="w">    </span><span class="n">__gcov_dump</span><span class="p">();</span></td>
    </tr>
    <tr class="source-line">
      <td class="lineno"><a id="l37" href="#l37">37</a></td>
      <td class="linebranch">
      </td>
      <td class="linecount uncoveredLine">&cross;</td>
      <td class="src uncoveredLine"><span class="w">    </span><span class="n">__gcov_reset</span><span class="p">();</span></td>
    </tr>
    <tr class="source-line">
      <td class="lineno"><a id="l38" href="#l38">38</a></td>
      <td class="linebranch">
      </td>
      <td class="linecount "></td>
      <td class="src "><span class="cp">#else</span></td>
    </tr>
    <tr class="source-line">
      <td class="lineno"><a id="l39" href="#l39">39</a></td>
      <td class="linebranch">
      </td>
      <td class="linecount "></td>
      <td class="src "><span class="w">    </span><span class="n">__gcov_flush</span><span class="p">();</span></td>
    </tr>
    <tr class="source-line">
      <td class="lineno"><a id="l40" href="#l40">40</a></td>
      <td class="linebranch">
      </td>
      <td class="linecount "></td>
      <td class="src "><span class="cp">#endif</span></td>
    </tr>
    <tr class="source-line">
      <td class="lineno"><a id="l41" href="#l41">41</a></td>
      <td class="linebranch">
      </td>
      <td class="linecount "></td>
      <td class="src "><span class="cp">#endif </span><span class="c1">// TESTPLUS_GCOV</span></td>
    </tr>
    <tr class="source-line">
      <td class="lineno"><a id="l42" href="#l42">42</a></td>
      <td class="linebranch">
      </td>
      <td class="linecount "></td>
      <td class="src "></td>
    </tr>
    <tr class="source-line">
      <td class="lineno"><a id="l43" href="#l43">43</a></td>
      <td class="linebranch">
      </td>
      <td class="linecount uncoveredLine">&cross;</td>
      <td class="src uncoveredLine"><span class="w">        </span><span class="n">sleep</span><span class="p">(</span><span class="mi">61</span><span class="p">);</span></td>
    </tr>
    <tr class="source-line">
      <td class="lineno"><a id="l44" href="#l44">44</a></td>
      <td class="linebranch">
      </td>
      <td class="linecount uncoveredLine">&cross;</td>
      <td class="src uncoveredLine"><span class="w">    </span><span class="p">}</span></td>
    </tr>
    <tr class="source-line">
      <td class="lineno"><a id="l45" href="#l45">45</a></td>
      <td class="linebranch">
      </td>
      <td class="linecount "></td>
      <td class="src "></td>
    </tr>
    <tr class="source-line">
      <td class="lineno"><a id="l46" href="#l46">46</a></td>
      <td class="linebranch">
      </td>
      <td class="linecount "></td>
      <td class="src "><span class="w">    </span><span class="k">return</span><span class="w"> </span><span class="mi">0</span><span class="p">;</span></td>
    </tr>
    <tr class="source-line">
      <td class="lineno"><a id="l47" href="#l47">47</a></td>
      <td class="linebranch">
      </td>
      <td class="linecount "></td>
      <td class="src "><span class="p">}</span></td>
    </tr>
    <tr class="source-line">
      <td class="lineno"><a id="l48" href="#l48">48</a></td>
      <td class="linebranch">
      </td>
      <td class="linecount "></td>
      <td class="src "></td>
    </tr>
  </table>
</div>
  • BeautifulSoup

https://blog.csdn.net/naer_chongya/article/details/130633043
https://blog.51cto.com/u_13673090/2466801

安装

pip install BeautifulSoup4

demo.py 用于解析 Gcov 工具生成的 HTML 覆盖率报告。主要思路是:先观察 HTML 中待提取数据的特征(标签名、class 属性等),找到特征之后,再利用 BeautifulSoup 提供的各种方法来遍历和查询 HTML 文档。
具体的使用方法在上面两个链接中有较为详细的介绍。

from bs4 import BeautifulSoup
import json
import requests

# HTTP headers sent with the report request: JSON request body, JSON response.
headers = {
    'accept': 'application/json',
    'Content-Type': 'application/json',
}

# Request payload selecting which file of which coverage report to fetch.
json_data = {
    'project_id': 'mecha',
    'report_id': 314,
    'file_name': 'Game/AchievementMoudle/AchievementData.cpp',
}

# Endpoint that returns the per-file coverage HTML under the "data" key.
API_URL = 'http://10.11.00.00:0000/api/get/file'
# Local copy of a report, usable instead of the HTTP endpoint.
LOCAL_HTML_PATH = r'D:/test/html_test/out.RunAdder.cpp.a9f32ea8f9bca6ced5ffba7e0bd94313.html'
# Upper bound (seconds) for the HTTP call so the script cannot hang forever.
REQUEST_TIMEOUT_SECONDS = 30


def fetch_report_html(url=API_URL, timeout=REQUEST_TIMEOUT_SECONDS):
    """Fetch the coverage HTML of one source file from the report service.

    Args:
        url: Endpoint to POST ``json_data`` to.
        timeout: Seconds to wait before giving up on the request.

    Returns:
        The HTML document (str) stored under the ``data`` key of the response.

    Raises:
        requests.HTTPError: The server answered with an error status.
        ValueError: The response carried no ``data`` payload.
    """
    response = requests.post(url, headers=headers, json=json_data, timeout=timeout)
    response.raise_for_status()
    html = response.json().get('data')
    if not html:
        raise ValueError("coverage API response contains no 'data' field")
    return html


def read_report_html(path=LOCAL_HTML_PATH):
    """Read a coverage HTML report from a local file and return its text."""
    with open(path, encoding='utf-8') as f:
        return f.read()


def parse_source_lines(html):
    """Extract one record per ``<tr class="source-line">`` row of the report.

    Each record has the keys ``src`` (stripped source text), ``line_no`` (int)
    and three count fields: ``line_count_coveredLine``,
    ``line_count_partial_coveredLine`` and ``line_count_uncoveredLine``.
    A count field holds the stripped text of the ``linecount`` cell when that
    cell carries the matching CSS class, otherwise ``None``.

    Args:
        html: The report markup as a string.

    Returns:
        A list of dicts in document order. Rows lacking a ``src`` or
        ``lineno`` cell are skipped.
    """
    soup = BeautifulSoup(html, 'html.parser')
    records = []
    for row in soup.find_all("tr", class_="source-line"):
        src_cell = row.find("td", class_="src")
        line_no_cell = row.find("td", class_="lineno")
        if src_cell is None or line_no_cell is None:
            continue

        # Match on individual class names instead of the exact attribute
        # string, so class order and stray whitespace do not matter.
        count_cell = row.find("td", class_="linecount")
        count_classes = count_cell.get("class", []) if count_cell else []
        count_text = count_cell.text.strip() if count_cell else None

        records.append({
            "src": src_cell.text.strip(),
            "line_no": int(line_no_cell.text.strip()),
            "line_count_coveredLine":
                count_text if "coveredLine" in count_classes else None,
            "line_count_partial_coveredLine":
                count_text if "partialCoveredLine" in count_classes else None,
            "line_count_uncoveredLine":
                count_text if "uncoveredLine" in count_classes else None,
        })
    return records


def find_function_range(src_code, function_name):
    """Locate a function's first and last line by brace matching.

    The function starts at the first record whose ``src`` contains
    ``function_name`` (plain substring match). From there every ``{`` and
    ``}`` is counted; the function ends on the line where the brace depth
    returns to zero after at least one ``{`` was seen.

    Braces inside string literals or comments are counted as well, so this
    is a heuristic rather than a real parser.

    Args:
        src_code: Records as produced by ``parse_source_lines``, in line order.
        function_name: Text to look for on the function's first line.

    Returns:
        ``(start_line, end_line)``. ``start_line`` is 0 when the name is not
        found; ``end_line`` is 0 when no matching closing brace is found.
    """
    start_line = 0
    depth = 0
    body_opened = False
    for record in src_code:
        if not start_line:
            if function_name not in record["src"]:
                continue
            start_line = record["line_no"]
        for ch in record["src"]:
            if ch == "{":
                depth += 1
                body_opened = True
            elif ch == "}" and depth > 0:
                depth -= 1
                if body_opened and depth == 0:
                    return start_line, record["line_no"]
    return start_line, 0


def count_coverage(src_code, start_line, end_line):
    """Count covered and uncovered lines within ``start_line..end_line``.

    A line counts as covered when its covered or partially-covered count is
    non-empty, and as uncovered when its uncovered count is non-empty.

    Args:
        src_code: Records as produced by ``parse_source_lines``.
        start_line: First line of the range (inclusive).
        end_line: Last line of the range (inclusive); a falsy value means
            "no range" and yields ``(0, 0)``.

    Returns:
        ``(covered_count, uncovered_count)``.
    """
    covered = 0
    uncovered = 0
    if not end_line:
        return covered, uncovered
    for record in src_code:
        if not start_line <= record["line_no"] <= end_line:
            continue
        if record["line_count_coveredLine"] or record["line_count_partial_coveredLine"]:
            covered += 1
        if record["line_count_uncoveredLine"]:
            uncovered += 1
    return covered, uncovered


if __name__ == "__main__":
    # The HTML comes from the report service; to parse a report saved on
    # disk instead, call read_report_html() here.
    file_data = fetch_report_html()

    src_code = parse_source_lines(file_data)
    for s in src_code:
        print(s["src"])

    function_name = "main"
    function_line_no, function_end_line_no = find_function_range(src_code, function_name)

    print(function_line_no)
    print(function_end_line_no)

    cover_count, uncover_count = count_coverage(
        src_code, function_line_no, function_end_line_no)

    print(cover_count)
    print(uncover_count)

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

Loganer

感谢

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值