import requests
from bs4 import BeautifulSoup
def getHTMLText(url, timeout=10):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.  Without
            a timeout ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        The decoded page content as a ``str``.

    Raises:
        requests.HTTPError: If the response status is 4xx or 5xx.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    # Decode with the encoding detected from the body itself.
    r.encoding = r.apparent_encoding
    return r.text
def getUnivList(HTML):
    """Print the text of every ``<span>`` element found in ``HTML``.

    Args:
        HTML: Markup to parse with BeautifulSoup's ``html.parser`` backend.

    Each span's text is printed on its own line, extracted with
    ``get_text(strip=True)``.  Nothing is returned.
    """
    document = BeautifulSoup(HTML, 'html.parser')
    spans = document.find_all('span')
    for span in spans:
        text = span.get_text(strip=True)
        print(text)
def main(url='https://www.qiushibaike.com/hot/page/2/'):
    """Fetch a page and print the text of its ``<span>`` elements.

    Args:
        url: Page to download.  Defaults to the URL this script was
            originally written against, so ``main()`` behaves as before.
    """
    HTML = getHTMLText(url)
    getUnivList(HTML)
# Run the scraper only when this file is executed as a script, so that
# importing the module does not trigger a network request.
if __name__ == '__main__':
    main()
这个小爬虫用到了requests和bs4库
r.raise_for_status()用于检验是否成功地获得了页面内容:如果返回的状态码是4xx或5xx,就会抛出HTTPError异常
r.encoding 是requests库根据HTTP响应头(Content-Type中的charset)推断出的编码,r.apparent_encoding是根据页面内容本身分析得出的编码
get_text()得出<span>下的所有文本内容
strip=True 用于去掉每段文本开头和结尾的空白字符