import requests
from bs4 import BeautifulSoup

# Scrape headline links and article text from the China Daily homepage,
# collecting every article's URL, <h1> title, body paragraphs, and image
# captions into one list, then write it all to a single text file.
url = "http://www.chinadaily.com.cn"
html = requests.get(url).text
soup = BeautifulSoup(html, "lxml")

data = []  # accumulated output lines: URL, headline, paragraphs, captions
news = soup.find_all("div", {"class": "item"})
for item in news:
    # Homepage hrefs are protocol-relative ("//www.chinadaily.com.cn/...")
    # — presumably; TODO confirm — so the scheme is prepended here.
    content_url = "http:" + item.a["href"]
    data.append(content_url + "\n")
    print(content_url)

    content_html = requests.get(content_url).text
    content_soup = BeautifulSoup(content_html, "lxml")

    h = content_soup.find("h1")  # article headline
    data.append(h.text + "\n")

    contents = content_soup.find("div", {"id": "Content"})
    try:
        for paragraph in contents.find_all("p"):  # body paragraphs (str)
            data.append(paragraph.text + "\n")
        image_content = contents.find("figure", {"class": "image"})
        # BUG FIX: the original iterated `image_content.find("figcaption")`,
        # which is a single Tag — looping over it walks its *children*, not
        # the captions. find_all collects each caption's text exactly once.
        for caption in image_content.find_all("figcaption"):  # image captions (str)
            data.append(caption.text + "\n")
    except AttributeError:
        # Article page lacks a Content div or an image figure
        # (contents/image_content is None) — skip the extras for this article.
        pass

print(data)
with open("chinadialy.txt", "w", encoding="utf-8") as f:
    f.write("".join(data))
# chinadialy — partial text scraping from China Daily
# (CSDN page residue: "Latest recommended article published 2024-06-19 15:00:43")