今日热榜:https://tophub.today/
爬取数据及保存格式:
爬取后保存为.txt文件:
部分内容:
源码及注释:
import requests
from bs4 import BeautifulSoup
def download_page(url):
    """Download ``url`` and return the response body as text.

    The request is sent with a desktop Chrome ``User-Agent`` header and a
    30-second timeout.
    NOTE(review): the browser User-Agent is presumably there so the site
    does not reject the default client identifier -- confirm.

    Args:
        url: Address of the page to fetch.

    Returns:
        ``Response.text`` of the reply on success. If the request fails
        (connection error, timeout, malformed URL, ...), the literal string
        ``"please inspect your url or setup"`` is returned instead of an
        exception being raised, so callers receive that message in place
        of HTML.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"
    }
    try:
        r = requests.get(url, timeout=30, headers=headers)
    except requests.RequestException:
        # Catch only failures raised by requests itself. A bare ``except``
        # would also trap KeyboardInterrupt/SystemExit, making the script
        # impossible to interrupt while a request is pending.
        return "please inspect your url or setup"
    else:
        return r.text
def get_content(html,tag):
output = """