实现功能的关键代码
from urllib.request import urlopen
from bs4 import BeautifulSoup
# Open the target page; '网址' is a placeholder for the real URL.
url = urlopen('网址')
# Read the body once and keep it. The response is a one-shot stream, so
# discarding the result of read() and then handing the response object to
# BeautifulSoup would leave the parser with nothing left to parse.
url_html = url.read()
# Parse the downloaded HTML with the lxml parser.
url_lxml = BeautifulSoup(url_html, 'lxml')
# Select the second <table> in the document (index 1).
url_table = url_lxml.find_all('table')[1]
# Collect every <tr> row inside that table.
url_all_tr = url_table.find_all('tr')
# Drop the first row (presumably a header row -- confirm against the page).
url_all_tr.pop(0)
# `.text` yields the text content of a tag. `xxx` is a placeholder that is
# not defined in this snippet -- substitute a real tag object.
xxx.text
实例1:
示例步骤
打开网址
转成lxml树结构
筛选出要抓取的层级
删除一些不要的
循环遍历每一行并输出结果
# Example 1: print the text of the first and third cell of every data row in
# the second table of the page at the URL below.

# Fetch the page; the `with` block closes the HTTP response once it is read.
with urlopen('https://www.boc.cn/sourcedb/whpj/') as r:
    c = r.read()

# Parse the raw bytes with the lxml parser.
bs_obj = BeautifulSoup(c, 'lxml')

# The table of interest is the second <table> in the document (index 1).
t = bs_obj.find_all('table')[1]

# Skip the first row (presumably the header row -- confirm against the page).
all_tr = t.find_all('tr')[1:]

for row in all_tr:
    cells = row.find_all('td')
    # Rows with fewer than three <td> cells cannot supply both columns;
    # skip them instead of failing with IndexError.
    if len(cells) < 3:
        continue
    print(cells[0].text, cells[2].text)