BeautifulSoup解析html表格

用BeautifulSoup解析html表格
http://www.voidcn.com/article/p-eooarkay-wa.html

# Parse an HTML table: skip the header row, then collect selected cells
# from each data row into a list of dicts.
data_list = []  # BUG FIX: the original never initialized data_list
# Slice off the header <tr> instead of testing the index on every iteration.
for tr in soup.find_all('tr')[1:]:
    tds = tr.find_all('td')
    data_list.append({
        '船名': tds[0].contents[0],      # vessel name
        '航次': tds[1].contents[0],      # voyage number
        '提单号': tds[2].contents[0],    # bill-of-lading number
        '作业码头': tds[5].contents[0],  # working terminal
    })
print(data_list)


https://www.136.la/html/show-10322.html
# find_all returns every <tr> in the page as a list.
for tr in soup.find_all('tr'):
    # Collect the <td> cells inside each row.
    td = tr.find_all('td')
    try:
        # NOTE(review): school / pro_code / pro_name / xuewei / pdf look like
        # column indices defined elsewhere — confirm against the caller.
        # BUG FIX: the original used U+2018 smart quotes (a syntax error)
        # and left the except clause empty (also a syntax error).
        print('%s_%s_%s_%s.pdf' % (
            td[school].text.strip(),
            td[pro_code].text.strip(),
            td[pro_name].text.strip(),
            td[xuewei].text.strip()),
            td[pdf].find('a')['href'])
    except IndexError:
        # Rows with fewer cells (header/spacer rows) are silently skipped.
        pass

如何抓取不规则表格的内容
https://segmentfault.com/q/1010000007087889
import pandas as pd

# Path of the HTML file that contains the table to reshape.
html = 'tab.html'

# pandas parses HTML with lxml by default; take the first table found.
df = pd.read_html(html, header=0, encoding='utf8')[0]
print(df)
# Data body: drop the first row and the last column.
df2 = df.iloc[1:, 0:-1]
# Re-label columns by dropping the first original column label.
df2.columns = df.columns.delete(0)
# BUG FIX: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# concatenate the first row (minus its first cell) instead.
df2 = pd.concat([df2, df.iloc[0, 1:].to_frame().T])
# The product name sits in the top-left cell; strip embedded spaces.
df2['产品'] = df.iat[0, 0].replace(' ', '')
# Move the product column to the front.
df2.insert(0, '产品', df2.pop('产品'))
df2 = df2.sort_index()
print(df2)


查找+遍历表格
https://www.pythonheidong.com/blog/article/785484/b7550c25fbde58debbf0/
# Locate the listings table by its element id.
prop_table = soup.find('table', id="mainT")
# Alternative selectors kept for reference:
# prop_table = soup.find('table', {"font-size": "13px"})
# prop_table = soup.select('.addr')  # Pluck out the listings

# Dump the text content of every row in the table.
for row in prop_table.findAll('tr'):
    print(row.text)


# Crawl the requested pages and scrape one record per listing table.
# BUG FIX: several expressions were split at column 0 by copy-paste wrapping
# (get_text / find_next_sibling / the ternary), which is a syntax error;
# each statement is rejoined onto one logical line here.
data = []
for page in range(0, 2):    # <-- increase to number of pages you want to crawl
    soup = BeautifulSoup(requests.get(url.format(page=page)).text, 'html.parser')
    for table in soup.select('table[id^="r"]'):
        name = table.select_one('span.addr').text
        # Price is the last whitespace-separated token of the first <b>
        # following the address span.
        price = table.select_one('span.addr').find_next('b').get_text(strip=True).split()[-1]
        sold = (table.select_one('span.addr').find_next('b')
                .find_next_sibling(text=True)
                .replace('in', '').replace('(Auction)', '').strip())

        # Each amenity count is the text just before its icon; '-' when absent.
        beds = table.select_one('img[alt="Bed rooms"]')
        beds = beds.find_previous_sibling(text=True).strip() if beds else '-'

        bath = table.select_one('img[alt="Bath rooms"]')
        bath = bath.find_previous_sibling(text=True).strip() if bath else '-'

        car = table.select_one('img[alt="Car spaces"]')
        car = car.find_previous_sibling(text=True).strip() if car else '-'

        land = table.select_one('b:contains("Land size:")')
        land = land.find_next_sibling(text=True).split()[0] if land else '-'

        building = table.select_one('b:contains("Building size:")')
        building = building.find_next_sibling(text=True).split()[0] if building else '-'

        data.append([name, price, sold, beds, bath, car, land, building])

# print the data
print('{:^25} {:^15} {:^15} {:^15} {:^15} {:^15} {:^15} {:^15}'.format('Name',
      'Price', 'Sold', 'Beds', 'Bath', 'Car', 'Land', 'Building'))
for row in data:
    print('{:<25} {:^15} {:^15} {:^15} {:^15} {:^15} {:^15} {:^15}'.format(*row))


获取特定表格的内容
http://cn.voidcc.com/question/p-bixzseew-rk.html
# Fetch the page and pull out the table with id="Table1", then its rows.
from urllib.request import urlopen  # FIX: urllib2 is Python 2 only

html = urlopen(url).read()
bs = BeautifulSoup(html, 'html.parser')  # name the parser explicitly
# Equivalent to the original lambda: a <table> tag carrying id="Table1".
table = bs.find('table', id='Table1')
rows = table.find_all('tr')


使用BeautifulSoup提取网页上表格中的文本
https://www.cnpython.com/qa/699634
soup = bs4.BeautifulSoup(data, 'html.parser')  # name the parser explicitly
# table = soup.find('tr', {'class':'tableheader'}).parent
table = soup.find('table', {'class': 'tableforms'})
# Skip the first child (i == 0) and print the third cell of every other row.
# BUG FIX: Python-2 print statement converted to the print() function.
for i, tr in enumerate(table.findChildren()):
    if i > 0:
        for idx, td in enumerate(tr.findChildren()):
            if idx == 2:
                print(td.get_text().replace('(Registered)', '').strip())

# Find the header cell by its text, then read the third <td> of the next row.
# BUG FIX: the original bound the result to header_text but used header_cell.
header_cell = soup.find(text=re.compile("Model Type "))
value = header_cell.find_next('tr').select('td:nth-of-type(3)')[0].get_text()


soup = BeautifulSoup(html, "html.parser")
soup = soup.find('table', {'class': 'tableforms'})

# Zip the second row (labels) with the third row (values) into a dict.
# BUG FIX: the original's .replace(...) call was split across two lines
# (a broken statement) and used the Python-2 print statement.
dico = {}
l1 = soup.findAll('tr')[1].findAll('td')
l2 = soup.findAll('tr')[2].findAll('td')
for label, cell in zip(l1, l2):
    dico[label.getText().strip()] = cell.getText().replace('(Registered)', '').strip()

print(dico['Model Type'])


python BeautifulSoup解析表

https://www.imooc.com/wenda/detail/600078

# Collect the non-empty cell texts from every row of the line-items table.
data = []

table = soup.find('table', attrs={'class': 'lineItemsTable'})
table_body = table.find('tbody')

for tr in table_body.find_all('tr'):
    cell_texts = [cell.text.strip() for cell in tr.find_all('td')]
    data.append([text for text in cell_texts if text])  # drop empty values

# Pull named fields out of each 9-cell row of the line-items table.
table = soup.find("table", {"class": "lineItemsTable"})

for row in table.findAll("tr"):
    cells = row.findAll("td")
    if len(cells) != 9:
        continue  # skip header/summary rows that don't have 9 cells
    summons = cells[1].find(text=True)
    plateType = cells[2].find(text=True)
    vDate = cells[3].find(text=True)


def tableDataText(table):
    """Convert a <table> Tag into a list of rows, each a list of cell strings.

    If the first <tr> contains <th> cells it is treated as a header row and
    its texts become the first entry of the result.

    BUG FIX: copy-paste wrapping had left bare ``row`` tokens at column 0
    inside the body (syntax errors); the comments are rejoined here.
    """
    rows = []
    trs = table.find_all('tr')
    if not trs:          # empty table: nothing to extract
        return rows
    # Header cells, if any, come from the first row's <th> elements.
    headerow = [td.get_text(strip=True) for td in trs[0].find_all('th')]
    if headerow:  # if there is a header row, include it first
        rows.append(headerow)
        trs = trs[1:]
    for tr in trs:  # for every remaining table row
        rows.append([td.get_text(strip=True) for td in tr.find_all('td')])
    return rows

使用它,我们得到(前两行)。


新手python3+Beautiful Soup 定向爬取表格实例
https://blog.csdn.net/weixin_41730416/article/details/86755582


def fillulist(ulist, html):
    """Parse *html* and append the first 7 cell strings of every data row of
    the table with class ``hq_table`` to *ulist* (mutated in place).

    BUG FIX: the original's ``for tr`` / ``in soup...`` statement was split
    across two lines by copy-paste wrapping — a syntax error.
    """
    soup = BeautifulSoup(html, "html.parser")
    for tr in soup.find('table', {'class': 'hq_table'}).children:
        # .children also yields whitespace NavigableStrings; keep Tags only.
        if isinstance(tr, bs4.element.Tag):
            tds = tr('td')
            # Same cells as the original's explicit tds[0]..tds[6] list.
            ulist.append([tds[i].string for i in range(7)])


使用beautifulsoup解析网页爬取的表格信息
https://blog.csdn.net/wyquin/article/details/79601918
import re
from bs4 import BeautifulSoup
 
 
def load_file(filepath):
    """Parse the saved HTML table in *filepath* and print each row as
    '-'-joined cell texts, carrying leading cells over from the previous
    row when the current row has fewer cells (rowspan-style defaults).
    """
    soup = BeautifulSoup(open(filepath), "lxml")
    a = soup.tbody.children
    reg = re.compile(("<[^>]*>"))   # strips HTML tags, keeping only the text
    row0 = []       # row0 holds the previous row's cells
    flag = True     # row0 not yet initialised
    for child in a:
        row = []    # cells extracted from the current row
        if child.find('th'):    # header row: extract the column names
            for value in child.children:
                st = reg.sub('', str(value))    # regex replace: drop tags
                row.append((st.strip('\n')))
            row = '-'.join(row)
            print(row)
            continue
        if child.find('td'):    # data row
            while child.find('sup'):    # first remove any superscript markers
                child.find('sup').extract()
            for value in child.children:
                st = reg.sub('', str(value))
                row.append(st.strip('\n'))
            if flag:
                flag = False
            if len(row) < len(row0):    # shorter than the previous row: fill the missing leading cells from row0

                row_temp = row0[0:len(row0)-len(row)]
                for i in range(len(row)):
                    row_temp.append(row[i])
                row0 = row_temp
                row_temp = '-'.join(row_temp)   # join the saved cells into one line
                print(row_temp)
                continue
            row0 = row
            row = '-'.join(row)
            print(row)
 
if __name__ == '__main__':
    # Demo entry point: parse the sample table saved from the target page.
    load_file('data/2333.txt')

 

https://github.com/icodeu/BeautifulSoup/blob/master/main.py

  • 1
    点赞
  • 6
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值