import requests
import bs4
# 获取网页内容
def get_html(url, timeout=10):
    """Fetch a web page and return its body as decoded text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. By
            default requests applies no timeout, so an unresponsive host
            would otherwise block the caller indefinitely.

    Returns:
        The response body as a string, decoded with the encoding detected
        from the content itself.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with a 4xx/5xx status code.
    """
    response = requests.get(
        url=url,
        headers={
            # Send a desktop-Chrome User-Agent with the request.
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36'
        },
        timeout=timeout,
    )
    # Fail here on an HTTP error instead of returning an error page as HTML.
    response.raise_for_status()
    # Decode with the encoding guessed from the body rather than the one
    # derived from the response headers.
    response.encoding = response.apparent_encoding
    return response.text
# 提取有用数据到恰当的数据结构中
def parse_html(html_text, n):
soup = bs4.BeautifulSoup(html_text, "html.parser")
table = soup.select("table[class='rk-table']")[0]
tbody = table.select("tbody")[0]
rows = tbody.find_all("tr")
data = []
for row in rows:
cols = row.find_all("td")
rank = int(cols[0].get_text().strip())
if rank <= n: # 仅处理前 n 名
name = cols[1].select_one("div.univname div:nth-child(1) a").get_text().str
python——实例:中国大学排名定向爬虫
最新推荐文章于 2024-08-04 17:55:11 发布