# Scrapes epidemic news from AP News (apnews.com). The site carries limited
# data and may require a proxy/VPN to reach from some regions.
# https://apnews.com/hub/epidemics
from bs4 import BeautifulSoup
import re
import requests
# Fetch an article page and return its body text.
def get_content(url):
    """Fetch *url* and return the concatenated text of all <p> tags.

    Returns an empty string when the page contains no <p> elements.
    """
    res = requests.get(url, timeout=30)  # timeout so a dead link can't hang the crawl
    res.encoding = 'utf-8'
    # Parse res.text, which honors the encoding set above; the original parsed
    # res.content, silently ignoring that encoding override.
    soup = BeautifulSoup(res.text, 'html.parser')
    # ''.join is linear; repeated str += in a loop is quadratic.
    return ''.join(p.text for p in soup.find_all('p'))
import csv

# Landing page that lists epidemic-related headlines.
newsurl = 'https://apnews.com/hub/epidemics'
res = requests.get(newsurl, timeout=30)
soup = BeautifulSoup(res.text, 'html.parser')

# Headline anchors and their timestamp spans. NOTE(review): these class names
# come from a CSS-in-JS build and change whenever AP redeploys — verify before use.
headlines = soup.find_all('a', class_="Component-headline-0-2-111")
stamps = soup.find_all('span', class_="Timestamp Component-root-0-2-116 Component-timestamp-0-2-115")

# Pull the machine-readable timestamp (data-source attribute) out of each span.
# Regexes are compiled once, outside the loops.
timestamp_pattern = re.compile(
    '(.*?) data-key="timestamp" data-source="(.*?)" title="(.*?)</span>'
)
date = []
for span in stamps:
    for _, source, _ in timestamp_pattern.findall(str(span)):
        date.append(source)

# Example anchor:
# <a class="Component-headline-0-2-111" data-key="card-headline" href="/article/..."><h1 class="Component-h1-0-2-112">...</h1></a>
headline_pattern = re.compile(
    '<a class="Component-headline-0-2-111" data-key="card-headline" '
    'href="(.*?)"><h1 class="Component-h1-0-2-112">(.*?)</h1></a>'
)
title1 = []   # absolute article URLs
title2 = []   # headline texts
lianjie = []  # full article bodies
for anchor in headlines:
    for href, headline in headline_pattern.findall(str(anchor)):
        # href already starts with '/', so join without an extra slash
        # (the original produced 'https://apnews.com//article/...').
        link = 'https://apnews.com' + href
        title1.append(link)
        title2.append(headline)
        lianjie.append(get_content(link))

# zip truncates to the shortest list, so a missing timestamp drops its row
# rather than misaligning later ones.
data = list(zip(date, title2, title1, lianjie))
print(data)

# csv.writer quotes commas, quotes, and newlines inside fields — article
# bodies routinely contain all three, which broke the original hand-rolled
# f.write(','.join(...)) output. utf_8_sig adds a BOM so Excel detects UTF-8.
with open('国外美国AP数据.csv', 'w', encoding='utf_8_sig', newline="") as f:
    writer = csv.writer(f)
    writer.writerow(['时间', '标题', '链接', '内容'])
    writer.writerows(data)
print("已保存文件")