1. 安装 beautifulsoup4：在 cmd 中执行 pip install beautifulsoup4
Python 提供了一个用于处理网络链接的内置模块 urllib；BeautifulSoup 则用来解析 HTML。
验证安装是否成功：在 Python 中执行 from bs4 import BeautifulSoup，不报错即表示安装成功。
2. pycharm配置
3.代码如下
import urllib.request
from bs4 import BeautifulSoup
class Scraper:
    """Fetch a web page and print the links on it that contain "html".

    Attributes:
        site: URL of the page to fetch; handed to
            ``urllib.request.urlopen`` when ``scrape`` is called.
    """

    def __init__(self, site):
        """Remember the URL to scrape.

        Args:
            site: URL of the page to fetch.
        """
        self.site = site

    def scrape(self):
        """Download ``self.site`` and print every anchor href containing "html".

        Each matching href is printed with a leading newline. Anchors that
        have no ``href`` attribute are skipped.

        Raises:
            urllib.error.URLError: if ``urllib.request.urlopen`` cannot
                fetch the page.
        """
        # Read the body inside a context manager so the HTTP response (and
        # its underlying socket) is closed even if reading fails; the
        # response was previously left open.
        with urllib.request.urlopen(self.site) as response:
            html = response.read()

        soup = BeautifulSoup(html, "html.parser")
        for tag in soup.find_all("a"):
            url = tag.get("href")
            # <a> tags without an href yield None; nothing to print.
            if url is None:
                continue
            if "html" in url:
                print("\n" + url)
news="http://news.baidu.com/"
S