# 网页抓取学习(3)BeautifulSoup — Web scraping study notes (3): BeautifulSoup

from bs4 import BeautifulSoup
from urllib.request import urlopen

# Download the demo page. The body arrives as bytes; decode it as UTF-8 so that
# non-ASCII text (e.g. Chinese) becomes a proper str. The `with` block closes
# the HTTP response as soon as the body has been read.
with urlopen("https://morvanzhou.github.io/static/scraping/basic-structure.html") as response:
    html = response.read().decode('utf-8')
print(html)

# Parse the markup using the lxml parser backend.
soup = BeautifulSoup(html, features='lxml')

# Attribute access on the soup returns the first tag with that name.
print(soup.h1)
# The leading '\n' argument is a newline character, printed before the
# paragraph to visually separate it from the previous output.
print('\n', soup.p)

# Collect the href value of every <a> tag. href=True restricts the search to
# anchors that actually carry an href attribute, so link['href'] cannot raise
# KeyError on a bare <a> tag.
all_href = [link['href'] for link in soup.find_all('a', href=True)]
# Author's note: only the hyperlinks under <body> were captured.
print('\n', all_href)
阅读更多

没有更多推荐了,返回首页