BeautifulSoup解析html表格

用BeautifulSoup解析html表格
http://www.voidcn.com/article/p-eooarkay-wa.html

# Parse an HTML table: skip the header row, then collect selected cells
# from each data row into a list of dicts.
data_list = []  # BUG FIX: the original never initialized data_list
# Slice off the header <tr> instead of testing the index on every iteration.
for tr in soup.find_all('tr')[1:]:
    tds = tr.find_all('td')
    data_list.append({
        '船名': tds[0].contents[0],      # vessel name
        '航次': tds[1].contents[0],      # voyage number
        '提单号': tds[2].contents[0],    # bill-of-lading number
        '作业码头': tds[5].contents[0],  # working terminal
    })
print(data_list)


https://www.136.la/html/show-10322.html
# find_all returns every <tr> in the page as a list.
for tr in soup.find_all('tr'):
    # Collect the <td> cells inside each row.
    td = tr.find_all('td')
    try:
        # NOTE(review): school / pro_code / pro_name / xuewei / pdf look like
        # column indices defined elsewhere — confirm against the caller.
        # BUG FIX: the original used U+2018 smart quotes (a syntax error)
        # and left the except clause empty (also a syntax error).
        print('%s_%s_%s_%s.pdf' % (
            td[school].text.strip(),
            td[pro_code].text.strip(),
            td[pro_name].text.strip(),
            td[xuewei].text.strip()),
            td[pdf].find('a')['href'])
    except IndexError:
        # Rows with fewer cells (header/spacer rows) are silently skipped.
        pass

如何抓取不规则表格的内容
https://segmentfault.com/q/1010000007087889
import pandas as pd

# Path of the HTML file that contains the table to reshape.
html = 'tab.html'

# pandas parses HTML with lxml by default; take the first table found.
df = pd.read_html(html, header=0, encoding='utf8')[0]
print(df)
# Data body: drop the first row and the last column.
df2 = df.iloc[1:, 0:-1]
# Re-label columns by dropping the first original column label.
df2.columns = df.columns.delete(0)
# BUG FIX: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# concatenate the first row (minus its first cell) instead.
df2 = pd.concat([df2, df.iloc[0, 1:].to_frame().T])
# The product name sits in the top-left cell; strip embedded spaces.
df2['产品'] = df.iat[0, 0].replace(' ', '')
# Move the product column to the front.
df2.insert(0, '产品', df2.pop('产品'))
df2 = df2.sort_index()
print(df2)


查找+遍历表格
https://www.pythonheidong.com/blog/article/785484/b7550c25fbde58debbf0/
# Locate the listings table by its element id.
prop_table = soup.find('table', id="mainT")
# Alternative selectors kept for reference:
# prop_table = soup.find('table', {"font-size": "13px"})
# prop_table = soup.select('.addr')  # Pluck out the listings

# Dump the text content of every row in the table.
for row in prop_table.findAll('tr'):
    print(row.text)


# Crawl the requested pages and scrape one record per listing table.
# BUG FIX: several expressions were split at column 0 by copy-paste wrapping
# (get_text / find_next_sibling / the ternary), which is a syntax error;
# each statement is rejoined onto one logical line here.
data = []
for page in range(0, 2):    # <-- increase to number of pages you want to crawl
    soup = BeautifulSoup(requests.get(url.format(page=page)).text, 'html.parser')
    for table in soup.select('table[id^="r"]'):
        name = table.select_one('span.addr').text
        # Price is the last whitespace-separated token of the first <b>
        # following the address span.
        price = table.select_one('span.addr').find_next('b').get_text(strip=True).split()[-1]
        sold = (table.select_one('span.addr').find_next('b')
                .find_next_sibling(text=True)
                .replace('in', '').replace('(Auction)', '').strip())

        # Each amenity count is the text just before its icon; '-' when absent.
        beds = table.select_one('img[alt="Bed rooms"]')
        beds = beds.find_previous_sibling(text=True).strip() if beds else '-'

        bath = table.select_one('img[alt="Bath rooms"]')
        bath = bath.find_previous_sibling(text=True).strip() if bath else '-'

        car = table.select_one('img[alt="Car spaces"]')
        car = car.find_previous_sibling(text=True).strip() if car else '-'

        land = table.select_one('b:contains("Land size:")')
        land = land.find_next_sibling(text=True).split()[0] if land else '-'

        building = table.select_one('b:contains("Building size:")')
        building = building.find_next_sibling(text=True).split()[0] if building else '-'

        data.append([name, price, sold, beds, bath, car, land, building])

# print the data
print('{:^25} {:^15} {:^15} {:^15} {:^15} {:^15} {:^15} {:^15}'.format('Name',
      'Price', 'Sold', 'Beds', 'Bath', 'Car', 'Land', 'Building'))
for row in data:
    print('{:<25} {:^15} {:^15} {:^15} {:^15} {:^15} {:^15} {:^15}'.format(*row))


获取特定表格的内容
http://cn.voidcc.com/question/p-bixzseew-rk.html
# Fetch the page and pull out the table with id="Table1", then its rows.
from urllib.request import urlopen  # FIX: urllib2 is Python 2 only

html = urlopen(url).read()
bs = BeautifulSoup(html, 'html.parser')  # name the parser explicitly
# Equivalent to the original lambda: a <table> tag carrying id="Table1".
table = bs.find('table', id='Table1')
rows = table.find_all('tr')


使用BeautifulSoup提取网页上表格中的文本
https://www.cnpython.com/qa/699634
soup = bs4.BeautifulSoup(data, 'html.parser')  # name the parser explicitly
# table = soup.find('tr', {'class':'tableheader'}).parent
table = soup.find('table', {'class': 'tableforms'})
# Skip the first child (i == 0) and print the third cell of every other row.
# BUG FIX: Python-2 print statement converted to the print() function.
for i, tr in enumerate(table.findChildren()):
    if i > 0:
        for idx, td in enumerate(tr.findChildren()):
            if idx == 2:
                print(td.get_text().replace('(Registered)', '').strip())

# Find the header cell by its text, then read the third <td> of the next row.
# BUG FIX: the original bound the result to header_text but used header_cell.
header_cell = soup.find(text=re.compile("Model Type "))
value = header_cell.find_next('tr').select('td:nth-of-type(3)')[0].get_text()


soup = BeautifulSoup(html, "html.parser")
soup = soup.find('table', {'class': 'tableforms'})

# Zip the second row (labels) with the third row (values) into a dict.
# BUG FIX: the original's .replace(...) call was split across two lines
# (a broken statement) and used the Python-2 print statement.
dico = {}
l1 = soup.findAll('tr')[1].findAll('td')
l2 = soup.findAll('tr')[2].findAll('td')
for label, cell in zip(l1, l2):
    dico[label.getText().strip()] = cell.getText().replace('(Registered)', '').strip()

print(dico['Model Type'])


python BeautifulSoup解析表

https://www.imooc.com/wenda/detail/600078

# Collect the non-empty cell texts from every row of the line-items table.
data = []

table = soup.find('table', attrs={'class': 'lineItemsTable'})
table_body = table.find('tbody')

for tr in table_body.find_all('tr'):
    cell_texts = [cell.text.strip() for cell in tr.find_all('td')]
    data.append([text for text in cell_texts if text])  # drop empty values

# Pull named fields out of each 9-cell row of the line-items table.
table = soup.find("table", {"class": "lineItemsTable"})

for row in table.findAll("tr"):
    cells = row.findAll("td")
    if len(cells) != 9:
        continue  # skip header/summary rows that don't have 9 cells
    summons = cells[1].find(text=True)
    plateType = cells[2].find(text=True)
    vDate = cells[3].find(text=True)


def tableDataText(table):
    """Convert a <table> Tag into a list of rows, each a list of cell strings.

    If the first <tr> contains <th> cells it is treated as a header row and
    its texts become the first entry of the result.

    BUG FIX: copy-paste wrapping had left bare ``row`` tokens at column 0
    inside the body (syntax errors); the comments are rejoined here.
    """
    rows = []
    trs = table.find_all('tr')
    if not trs:          # empty table: nothing to extract
        return rows
    # Header cells, if any, come from the first row's <th> elements.
    headerow = [td.get_text(strip=True) for td in trs[0].find_all('th')]
    if headerow:  # if there is a header row, include it first
        rows.append(headerow)
        trs = trs[1:]
    for tr in trs:  # for every remaining table row
        rows.append([td.get_text(strip=True) for td in tr.find_all('td')])
    return rows

使用它,我们得到(前两行)。


新手python3+Beautiful Soup 定向爬取表格实例
https://blog.csdn.net/weixin_41730416/article/details/86755582


def fillulist(ulist, html):
    """Parse *html* and append the first 7 cell strings of every data row of
    the table with class ``hq_table`` to *ulist* (mutated in place).

    BUG FIX: the original's ``for tr`` / ``in soup...`` statement was split
    across two lines by copy-paste wrapping — a syntax error.
    """
    soup = BeautifulSoup(html, "html.parser")
    for tr in soup.find('table', {'class': 'hq_table'}).children:
        # .children also yields whitespace NavigableStrings; keep Tags only.
        if isinstance(tr, bs4.element.Tag):
            tds = tr('td')
            # Same cells as the original's explicit tds[0]..tds[6] list.
            ulist.append([tds[i].string for i in range(7)])


使用beautifulsoup解析网页爬取的表格信息
https://blog.csdn.net/wyquin/article/details/79601918
import re
from bs4 import BeautifulSoup
 
 
def load_file(filepath):
    """Parse the saved HTML table in *filepath* and print each row as
    '-'-joined cell texts, carrying leading cells over from the previous
    row when the current row has fewer cells (rowspan-style defaults).
    """
    soup = BeautifulSoup(open(filepath), "lxml")
    a = soup.tbody.children
    reg = re.compile(("<[^>]*>"))   # strips HTML tags, keeping only the text
    row0 = []       # row0 holds the previous row's cells
    flag = True     # row0 not yet initialised
    for child in a:
        row = []    # cells extracted from the current row
        if child.find('th'):    # header row: extract the column names
            for value in child.children:
                st = reg.sub('', str(value))    # regex replace: drop tags
                row.append((st.strip('\n')))
            row = '-'.join(row)
            print(row)
            continue
        if child.find('td'):    # data row
            while child.find('sup'):    # first remove any superscript markers
                child.find('sup').extract()
            for value in child.children:
                st = reg.sub('', str(value))
                row.append(st.strip('\n'))
            if flag:
                flag = False
            if len(row) < len(row0):    # shorter than the previous row: fill the missing leading cells from row0

                row_temp = row0[0:len(row0)-len(row)]
                for i in range(len(row)):
                    row_temp.append(row[i])
                row0 = row_temp
                row_temp = '-'.join(row_temp)   # join the saved cells into one line
                print(row_temp)
                continue
            row0 = row
            row = '-'.join(row)
            print(row)
 
if __name__ == '__main__':
    # Demo entry point: parse the sample table saved from the target page.
    load_file('data/2333.txt')

 

https://github.com/icodeu/BeautifulSoup/blob/master/main.py

  • 1
    点赞
  • 6
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值