# NOTE: 用 Selenium 实现自动化比较好… (Selenium would be a better fit for this automation)
import requests
from bs4 import BeautifulSoup
import re
# Request headers: a desktop-browser User-Agent so CSDN serves the normal
# page instead of blocking the script as a bot.
header = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36"
}
# Global {article title: article URL} map, filled by get_article_url()
# and consumed by save_article().
article_info = {}
def get_html(url, timeout=10):
    """Fetch *url* and return it parsed as a BeautifulSoup document.

    Args:
        url: page address to download.
        timeout: seconds before the request is abandoned (new, defaulted
            parameter — without it requests can block forever).

    Returns:
        BeautifulSoup on HTTP 200, otherwise None (callers must check).
    """
    try:
        r = requests.get(url=url, headers=header, timeout=timeout)
    except requests.RequestException as e:
        # Network-level failure (DNS, refused, timeout): report and give up,
        # matching the best-effort style of the HTTP-status branch below.
        print("请求网页失败")
        print(e)
        return None
    if r.status_code == 200:
        # CSDN pages are UTF-8; set it explicitly before decoding.
        r.encoding = 'utf-8'
        return BeautifulSoup(r.text, 'html.parser')
    print("请求网页失败")
    return None  # explicit: original fell off the end and returned None implicitly
def get_article_url(soup):
    """Collect article titles and URLs from one listing page into article_info.

    Args:
        soup: parsed listing page from get_html(), or None if the fetch failed.

    Side effects:
        Mutates the module-level article_info dict ({title: url}).

    Fix: the original re-invoked get_article_url(soup) on the error path with
    the SAME soup, which on a persistent failure recurses until RecursionError.
    That retry is removed; a failed page is simply skipped.
    """
    if soup is None:
        # get_html() returns None on failure; nothing to parse.
        return
    try:
        article_list_div = soup.find('div', attrs={'class': 'article-list'})
        # find() returns None when the div is absent -> AttributeError below.
        items = article_list_div.find_all('a', attrs={'target': '_blank'})
    except AttributeError as e:
        print(e)
        return
    for item in items:
        article_url = item['href']
        # Strip CSDN's "原创" (original) / "转载" (repost) badges from the title.
        article_name = item.get_text().replace('原创', '').replace('转载', '').strip()
        article_info[article_name] = article_url
def save_article():
    """Download every article collected in article_info and save each one
    as a static HTML file under D:/python_save/web/.

    Fixes vs. original:
    - skip articles whose fetch returned None instead of crashing with
      TypeError on ``soup([...])``;
    - sanitize the title before using it as a filename (Windows forbids
      \\ / : * ? " < > | in names, and titles routinely contain them);
    - drop the redundant f.close() inside the ``with`` block.
    """
    for name, url in article_info.items():
        soup = get_html(url)
        if soup is None:
            # Fetch failed; report and move on to the next article.
            print(name + '.html' + "保存错误")
            continue
        # Remove scripts and inline styles so the saved page is static.
        for script in soup(["script", "style"]):
            script.extract()
        html = str(soup).replace("csdn", "")
        # Replace characters that are illegal in Windows filenames.
        safe_name = re.sub(r'[\\/:*?"<>|]', '_', name)
        try:
            with open('D:/python_save/web/' + safe_name + '.html', 'w', encoding='utf-8') as f:
                f.write(html)
        except Exception as e:
            print(name + '.html' + "保存错误")
            print(e)
if __name__ == '__main__':
    # Crawl listing pages 1 and 2 of the blog, collect every article link,
    # then download and save each article.
    base = 'https://blog.csdn.net/qq_43751489/article/list/'
    for page in (1, 2):
        get_article_url(get_html(base + str(page)))
    save_article()