(Scraping the first ten pages of important news from the Tsinghua University Sports Department)
Run screenshot:
Code:
import requests
from bs4 import BeautifulSoup
import csv
import time

# The news list is paginated like this:
# http://www.thsports.tsinghua.edu.cn/publish/sports/1899/index.html     (page 1)
# http://www.thsports.tsinghua.edu.cn/publish/sports/1899/index_2.html   (page 2)
# http://www.thsports.tsinghua.edu.cn/publish/sports/1899/index_3.html   (page 3)

# Page 1 is the bare index.html; pages 2-10 follow the index_{n}.html pattern,
# so index.html plus range(2, 11) covers the first ten pages.
urls = ["http://www.thsports.tsinghua.edu.cn/publish/sports/1899/index.html"]
urls += ["http://www.thsports.tsinghua.edu.cn/publish/sports/1899/index_{}.html".format(i) for i in range(2, 11)]

headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36 Edg/90.0.818.56'}

for url in urls:
    time.sleep(5)  # pause between list pages so we don't hammer the server
    r = requests.get(url, headers=headers)
    r.encoding = "utf-8"
    html = r.text
    # Keep a local copy of the most recent list page (overwritten each iteration).
    with open("qhsport.html", "w", encoding="utf-8") as f:
        f.write(html)
    soup = BeautifulSoup(html, "lxml")
    # Each news entry is an <li> inside the element with class "box_list".
    news = soup.find(class_="box_list").find_all("li")
    for item in news:
        times = item.find(name="p").get_text()    # publication date
        title = item.find(name="a").get("title")  # headline
        link = item.find(name="a").get("href")    # relative link to the article
        url2 = "http://www.thsports.tsinghua.edu.cn" + link
        # Fetch the article page and pull out the body text.
        con = requests.get(url2, headers=headers)
        con.encoding = "utf-8"
        html2 = con.text
        soup2 = BeautifulSoup(html2, "lxml")
        content = soup2.find(class_="box_detail").get_text().strip()
        print(times, title, url2, content)
        # Append one row per article; utf_8_sig writes a BOM so Excel opens the CSV correctly.
        with open("qh_news.csv", "a", newline="", encoding="utf_8_sig") as f:
            writer = csv.writer(f)
            writer.writerow((times, title, url2, content))
print("Done scraping!")