import csv

import requests
from bs4 import BeautifulSoup  # original had a typo: "from bs4import"

# Scrape the school-news listing of web.fosu.edu.cn and append every
# article (title, publish time, link, body text) to data.csv.
BASE_URL = "http://web.fosu.edu.cn/news/category/school-news"
PAGE_COUNT = 100  # number of listing pages to scrape

# Open the output file ONCE. The original reopened (and leaked) a file
# handle per page and re-wrote the header row on every page.
with open('data.csv', 'a', encoding="utf-8", newline='') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(["新闻标题", "发布时间", "新闻链接", "新闻内容"])  # header row, written once

    for j in range(1, PAGE_COUNT + 1):  # pages are 1-based; page 1 has no /page/ suffix
        if j == 1:
            url = BASE_URL
        else:
            url = BASE_URL + "/page/" + str(j)
        # html is a requests Response; BeautifulSoup needs the raw bytes (.content).
        html = requests.get(url)
        soup = BeautifulSoup(html.content, 'lxml')
        # Container div holding the page's list of article links.
        all_news = soup.find('div', class_="contain2_right_bottom")
        print('正在爬第' + str(j) + '页新闻')

        # Hoisted out of the inner loop: the original re-ran find_all('a')
        # for every one of the 10 articles on the page.
        all_a_tag = all_news.find_all('a')
        for i in range(10):  # 10 articles per listing page — assumed fixed; TODO confirm
            news_name = all_a_tag[i].text       # article title
            news_link = all_a_tag[i]['href']    # article URL
            # Fetch and parse the article page itself.
            html_a = requests.get(news_link)
            soup_a = BeautifulSoup(html_a.content, 'lxml')
            # Article body text.
            news_contain = soup_a.find('div', class_='contain3_right_bottom')
            news_main = news_contain.text
            # Publication time.
            news_time = soup_a.find('div', class_='contain3_right_bottom_xx')
            news_time_a = news_time.text
            print('新闻标题:{},时间:{},链接:{},内容:{}'.format(news_name, news_time_a, news_link, news_main))
            writer.writerow([news_name, news_time_a, news_link, news_main])
            print('第' + str(j) + '页第' + str(i + 1) + '篇新闻爬取完毕!')
        print('第' + str(j) + '页新闻爬取完毕!')

print("写入完毕!")