from urllib.request import urlopen

from bs4 import BeautifulSoup

# Download the demo page. The context manager closes the HTTP response as
# soon as the body has been read. The raw bytes are decoded explicitly so
# that non-ASCII text (e.g. Chinese) is turned into a proper str.
with urlopen(
    "https://morvanzhou.github.io/static/scraping/basic-structure.html"
) as response:
    html = response.read().decode('utf-8')
print(html)

# Parse the markup with the lxml parser and print the first <h1> tag.
soup = BeautifulSoup(html, features='lxml')
print(soup.h1)

# '\n' is a newline character; passing it first separates this output from
# the previous one before the first <p> tag is printed.
print('\n', soup.p)

# Gather every <a> tag that actually carries an href attribute (anchors
# without one are skipped instead of raising KeyError), then keep only the
# link targets.
# NOTE(review): the original comment says only hyperlinks under <body> are
# captured; find_all() searches the whole parsed document, so this presumably
# just reflects where the anchors sit on this page - confirm.
all_href = soup.find_all('a', href=True)
all_href = [link['href'] for link in all_href]
print(' \n ', all_href)