import requests
from bs4 import BeautifulSoup
import os
import re
import traceback
from datetime import datetime
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'}

# Fetch one news article page; return a dict of its fields and a list of image URLs
def get_news(url):
    new_dict = {}
    news_pic_list = []
    try:
        # Fetch the page
        res = requests.get(url, headers=headers)
        res.encoding = res.apparent_encoding
        # 2. Use BeautifulSoup to extract the title, time, author, source, body and images
        soup = BeautifulSoup(res.text, 'html.parser')
        # Title
        new_title = soup.find('h1', class_='main-title').text
        # Time
        new_time = soup.select('span.date')[0].text
        # Author
        new_author = soup.find('p', class_='show_author').text
        # Source
        new_source = soup.select('.source')[0].text
        # Body: skip the last <p>, which on these pages is usually the editor's byline
        news_article = soup.select('div#article>p')
        tmp_str = ''
        for i in range(len(news_article) - 1):
            tmp_str += news_article[i].text + "\r\n"
        # Images: collect the src of every <img> inside an img_wrapper div
        for pic in soup.select('div.img_wrapper img'):
            news_pic_list.append(pic.get('src'))
        new_dict['title'] = new_title
        new_dict['time'] = new_time
        new_dict['source'] = new_source
        new_dict['author'] = new_author
        new_dict['article'] = tmp_str
        new_dict['picture'] = news_pic_list
    except Exception as e:
        print("Scrape failed, skipping this article:", e)
    return (new_dict, news_pic_list)
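# The selectors above are tied to one Sina article template; on pages with a
# different layout, find() returns None and the broad except quietly swallows
# the resulting AttributeError. A minimal sketch of a more tolerant lookup
# (the helper name and the bare-<h1> fallback are assumptions, not part of the
# original post):
def first_text(soup, *selectors):
    """Return the text of the first matching selector, or '' if none match."""
    for sel in selectors:
        tag = soup.select_one(sel)
        if tag:
            return tag.get_text(strip=True)
    return ''
# Usage inside get_news, e.g.: new_title = first_text(soup, 'h1.main-title', 'h1')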
# Collect all article links from one news list page
def get_url(new_list_url):
    # Fetch the page
    res = requests.get(new_list_url, headers=headers)
    res.encoding = res.apparent_encoding
    # Extract every hyperlink in the list
    soup = BeautifulSoup(res.text, 'html.parser')
    url_list = []
    news_url = soup.select("ul.list_009>li>a")
    for a in news_url:
        url_list.append(a.get('href'))
    print("This page has {0} news links: {1}".format(len(url_list), url_list))
    return url_list
# Collect article links from every list page
def get_all_url(new_list_url):
    news_url_list = []
    for i in range(1, 3):  # pages 1 and 2 only; widen the range to crawl more
        url = new_list_url.format(page=i)
        print("News list for page {0}: {1}".format(i, url))
        url_list = get_url(url)
        news_url_list = news_url_list + url_list  # merge each page's URLs into one list
    return news_url_list
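# If the same story shows up on several list pages, the merged list carries
# duplicates and get_news fetches it more than once. A small order-preserving
# dedup step (an addition, not in the original script):
def dedupe(urls):
    # dict keeps insertion order in Python 3.7+, so this preserves the first
    # occurrence of each URL and drops later repeats.
    return list(dict.fromkeys(urls))
# e.g.: url_list = dedupe(get_all_url(url))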
# Save one article
# (1) Create a folder named after the article title
def save_new(root_dir, new_dict, pic_list):
    try:
        title = new_dict['title']
        # Strip characters from the title that are illegal in folder names
        for ch in '/\\!?- <>|"《》':
            title = title.replace(ch, '')
        file_dir = root_dir + os.sep + title
        if not os.path.exists(file_dir):
            os.makedirs(file_dir)
        # Save the article text
        save_text(file_dir, new_dict)
        # Save the images
        save_pic(file_dir, pic_list)
        print("Save finished, article folder:", file_dir)
    except Exception as e:
        print("Save failed:", e)
# Save the article's text content
def save_text(file_dir, new_dict):
    res = ('Title: ' + new_dict['title'] + '\r\n' + 'Time: ' + new_dict['time'] + '\r\n' +
           'Author: ' + new_dict['author'] + '\r\n' + 'Source: ' + new_dict['source'] + '\r\n' +
           'Body: ' + new_dict['article'])
    title = new_dict['title']
    # Strip characters from the title that are illegal in file names
    for ch in '\\:;.?/-《》“”、':
        title = title.replace(ch, '')
    file_path = file_dir + os.sep + title + '.txt'
    with open(file_path, 'wb') as fp:
        fp.write(res.encode('utf-8'))
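# save_new and save_text both sanitize the title with a per-character loop;
# since re is already imported, one regex does the same job and also catches
# characters the hand-written lists miss. A sketch (the exact character set is
# an assumption: the names Windows forbids plus the quotes listed above):
def clean_title(title):
    # Drop characters that are illegal in Windows file names plus the Chinese
    # punctuation the original strips, then trim surrounding whitespace.
    return re.sub(r'[\\/:*?"<>|《》“”、]+', '', title).strip()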
# Save the article's images
def save_pic(file_dir, pic_list):
    for i in range(len(pic_list)):
        # Image file name: 1.jpg, 2.jpg, ...
        pic_path = file_dir + os.sep + str(i + 1) + '.jpg'
        try:
            res = requests.get(pic_list[i], headers=headers)
        except requests.exceptions.MissingSchema:
            # Protocol-relative URLs ('//host/...') have no scheme, which
            # requests rejects; retry with an explicit one.
            res = requests.get('http:' + pic_list[i], headers=headers)
        with open(pic_path, 'wb') as fp:
            fp.write(res.content)
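# Sina serves many images with protocol-relative URLs ('//n.sinaimg.cn/...'),
# which is what the MissingSchema fallback above works around. Normalizing each
# src against the article URL up front avoids the failed first request; a
# sketch, assuming the article's URL is passed along as the base:
from urllib.parse import urljoin

def absolutize(page_url, src):
    # urljoin handles protocol-relative ('//host/a.jpg'), root-relative
    # ('/a.jpg') and already-absolute URLs alike.
    return urljoin(page_url, src)
# absolutize('https://news.sina.com.cn/c/a.shtml', '//n.sinaimg.cn/1.jpg')
# -> 'https://n.sinaimg.cn/1.jpg'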
if __name__ == '__main__':
    url = 'http://roll.news.sina.com.cn/news/gnxw/gdxw1/index_{page}.shtml'
    url_list = get_all_url(url)
    print("-------- Link collection finished --------")
    print("About to scrape {0} articles".format(len(url_list)))
    for u in url_list:
        new, pic = get_news(u)
        if new:
            save_new("d:\\doc", new, pic)
    print("-------- Scraping and saving finished --------")