Expected Outcome
As you can see, the firm has 1,625 news records. Opening each one by hand to judge its relevance and then copying out the needed fields (publication time, title, content, link) would be tedious; a crawler can do this much faster.
Code Implementation
import csv
import re

import requests
from bs4 import BeautifulSoup

# Title keywords to filter for (research visits, inspections, meetings, etc.);
# they must stay in Chinese because they are matched against Chinese titles.
KEYWORDS = ['调研', '莅临', '访', '研讨', '邀', '到', '视察', '接见']

n = 0
f = open('news6', 'w', encoding='utf-8-sig', newline='')
writer = csv.writer(f)
writer.writerow(['time', 'title', 'content', 'url', 'key'])

# News detail pages are numbered sequentially via the id query parameter.
for news_id in range(41, 4169):
    url = 'http://ltd.reanda.com/dongtai/news_list.asp?id=' + str(news_id)
    html = requests.get(url)
    html.encoding = 'GBK'  # the site serves GBK-encoded pages
    if html.status_code != 200:
        continue
    if '数据库出错' in html.text:  # skip ids that return a "database error" page
        continue
    soup = BeautifulSoup(html.text, 'lxml')
    title = soup.find_all(attrs={'class': 'big'})[0].text
    for key in KEYWORDS:
        if re.search(key, title):
            a = soup.find_all(attrs={'class': 'neirong'})
            # The second 'neirong' block holds metadata of the form
            # "发布时间:YYYY-MM-DD 阅读..."; pull the date from between the labels.
            date = re.findall('发布时间:(.*)阅读', a[1].text)[0]
            y, m, d = date.split('-')
            pub_date = '{}/{}/{}'.format(y, m, d)
            n += 1
            print('Matched news item #{}, published {}'.format(n, pub_date))
            print('Keyword: {}'.format(key))
            content = a[0].text  # the first 'neirong' block is the article body
            writer.writerow([pub_date, title, content, url, key])
            break  # stop at the first matching keyword to avoid duplicate rows
f.close()
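
The loop above fires several thousand sequential requests with no timeout or error handling, so a single dropped connection aborts the whole run. Below is a minimal hardening sketch; the fetch helper and its retry count, timeout, and back-off delay are illustrative assumptions rather than part of the original script.

import time
import requests

def fetch(url, retries=3, timeout=10, delay=1.0):
    # Hypothetical helper: retry a GET a few times with a timeout and a
    # growing back-off delay; all three parameter values are assumptions.
    for attempt in range(retries):
        try:
            resp = requests.get(url, timeout=timeout)
            resp.encoding = 'GBK'
            return resp
        except requests.RequestException:
            time.sleep(delay * (attempt + 1))
    return None  # caller should skip this id

In the main loop, html = requests.get(url) would then become html = fetch(url), with the id skipped whenever fetch returns None.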