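# Scenario: crawl a web page, filter out the irrelevant content, extract the
# needed text with a regular expression, and write it to a text file.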
from urllib.request import urlopen, Request
import re
# Scrape a web page, pull out the text of every <div class="name"> element
# with a regular expression, echo each match and append it to a text file.

# Simulate a browser by sending a desktop Chrome User-Agent header.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'}
# Site to crawl.
url = 'https://read.douban.com/provider/all'
# NOTE: despite its name, `response` holds the outgoing Request object.
response = Request(url, headers=headers)
# Use the HTTP response as a context manager so the connection is released
# even if reading or decoding the body raises.
with urlopen(response) as res:
    data = res.read().decode('utf-8')
# Regex that extracts the publisher name: the non-greedy group captures the
# text between the opening and closing tags of each "name" div.
pat = '<div class="name">(.*?)</div>'
# Find all publisher names in the page.
rst = re.compile(pat).findall(data)
# Output path; the file is opened in append mode, so repeated runs accumulate.
# The encoding is pinned to UTF-8 because the platform default may be unable
# to encode some of the scraped characters.
with open('D:\\selenium\\selenium-3.141.0\\selenium\\测试写入.txt', 'a', encoding='utf-8') as fh:
    for name in rst:
        print(name)
        fh.write(name + '\n')
# Both the HTTP response and the file are closed by their `with` blocks.