'''
爬取博客文章,静态页面
'''
import urllib.request
import http.cookiejar
import requests
from bs4 import BeautifulSoup
def clean_file():
    """Reset ``article.txt`` to an empty file before a new crawl.

    Opening the file in ``'w'`` mode creates it when it is missing and
    discards any previous content; the explicit ``truncate()`` call is
    retained from the original implementation.
    """
    with open('article.txt', mode='w', encoding='utf-8') as article:
        article.truncate()
def write_to_file(content):
    """Append ``content`` to the end of ``article.txt``.

    The file is opened in append mode with UTF-8 encoding, so earlier
    content is preserved and the file is created if it does not exist.

    Args:
        content: Text to add to the file.
    """
    with open('article.txt', mode='a', encoding='utf-8') as article:
        article.write(content)
def get_page(url):
    """Download ``url`` and return the response body as text.

    The request uses a 20 second timeout. The HTTP status code is not
    checked, so the body of an error page (e.g. a 404) is returned as-is.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body as a string, or the literal string ``"error"``
        when the request itself fails (connection error, timeout,
        malformed URL, ...).
    """
    try:
        response = requests.get(url, timeout=20)
    except requests.exceptions.RequestException:
        # Catch only request failures: a bare ``except`` would also trap
        # KeyboardInterrupt/SystemExit and hide unrelated bugs.
        return "error"
    return response.text
def get_blog_info():
headers={'User-Agent': #chrome申请头信息
'Mozilla/5.0 (Windows NT 6.1; WOW64)'
'AppleWebKit/537.36 (KHTML, like Gecko)'
'Chrome/77.0.3865.90'
'Safari/537.36'}
html=get_page(blog_url)
soup=BeautifulSoup(html,'lxml&