爬取网站:某某校园新闻网
代码:
##爬取网站:校园新闻网
import requests
import re
# Scrape the campus news front page and save every matched list entry to a
# tab-separated UTF-8 text file, one entry per line.

# Browser-like request headers. The original author notes these are not
# strictly required. Long values are built from adjacent string literals
# rather than a backslash continuation inside the literal, which had fused
# "AppleWebKit/537.36" and "(KHTML" together without the separating space.
heads = {
    'Connection': 'keep-alive',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Accept': ('text/html,application/xhtml+xml,application/xml;'
               'q=0.9,image/webp,image/apng,*/*;q=0.8'),
    'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'),
}

# Without a timeout, requests waits indefinitely on an unresponsive server.
response = requests.get(
    'https://news2014.ctbu.edu.cn/',
    headers=heads,
    timeout=10,
)
# Fail loudly on 4xx/5xx instead of parsing an error page and reporting
# success for an empty or bogus output file.
response.raise_for_status()
# Force UTF-8 decoding of the body before reading ``response.text``.
response.encoding = "utf-8"
content = response.text

# Each match yields two groups:
#   1. the text immediately preceding ``</span>`` inside an ``<li>`` item,
#   2. everything from just after ``title="`` up to the closing ``</a></li>``.
# NOTE(review): group 2 also captures whatever follows the title attribute
# value inside the tag (closing quote, other attributes, link text) --
# confirm against the live page markup whether that is intended.
# re.S lets ``.`` span newlines so entries broken across lines still match.
pattern = re.compile(r'<li><.*?>(.*?)</span><.*?title="(.*?)</a></li>', re.S)
results = pattern.findall(content)

# The context manager guarantees the file is closed even if a write fails.
with open(r'C:\Users\AdamCY\Desktop\wenjian\python爬虫\xinwenwang.txt',
          'w', encoding='utf-8') as f:
    for result in results:
        f.write(result[0] + '\t' + result[1] + '\n')

print("TXT文本数据保存成功")
效果: