python日常代码

最新推荐文章于 2024-08-15 01:55:01 发布

Savi.

最新推荐文章于 2024-08-15 01:55:01 发布

阅读量50

点赞数

文章标签： python

本文链接：https://blog.csdn.net/weixin_71051136/article/details/130400378

版权

import requests

from bs4 import BeautifulSoup

from datetime import datetime

import traceback

import os

import re

def get_news(new_url, timeout=10):
    """Download one news page and extract its fields.

    Args:
        new_url: URL of the news article page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``(new_dict, news_pic_list)`` tuple. ``new_dict`` holds the keys
        ``title``, ``time``, ``source``, ``author``, ``article`` and
        ``picture``; ``news_pic_list`` is the list of image ``src`` values
        (the same list stored under ``picture``). If fetching or parsing
        fails for any reason, the error is printed and ``(None, None)`` is
        returned so the caller can skip this article.
    """
    new_dict = {}
    print("新闻地址:"+new_url)
    try:
        # Without a timeout a single unresponsive server would block the
        # whole crawl; a timeout error is handled by the except below.
        res = requests.get(new_url, timeout=timeout)
        res.encoding = 'utf-8'
        soup = BeautifulSoup(res.text, 'lxml')

        new_title = soup.select('h1.main-title')[0].text.strip()
        new_dict['title'] = new_title

        nt = datetime.strptime(
            soup.select('span.date')[0].text.strip(),
            '%Y年%m月%d日 %H:%M',
        )
        new_time = nt.strftime('%Y-%m-%d %H:%M')
        new_dict['time'] = new_time

        new_source = soup.select('.source')[0].text
        new_dict['source'] = new_source

        new_author = soup.select('p.show_author')[0].text
        new_dict['author'] = new_author

        # The last matched paragraph is intentionally left out, matching the
        # original loop bound (presumably a trailing editor line -- confirm
        # against the page markup).
        news_article = soup.select('div#article>p')
        new_dict['article'] = ''.join(
            paragraph.text + '\r\n' for paragraph in news_article[:-1]
        )

        news_pic_list = [
            pic.get("src") for pic in soup.select('div.img_wrapper>img')
        ]
        new_dict['picture'] = news_pic_list
    except Exception as e:
        # Best-effort crawl: report the problem and let the caller move on.
        print('抓取错误，此条新闻已略过')
        print(e)
        traceback.print_exc()
        return None, None

    print('时间:%s 标题:%s 作者:%s 来源:%s' % (new_time, new_title, new_author, new_source))
    print('共有%d张图片' % len(news_pic_list))
    return new_dict, news_pic_list

def get_url_list(new_list_url, max_pages=19):
    """Collect article links from consecutive index pages.

    Args:
        new_list_url: URL template containing a ``{page}`` placeholder.
        max_pages: Highest page number to request; pages are requested in
            order starting from 1.

    Returns:
        A list with the links of every visited page, in page order.
        Collection stops early at the first page that yields no links.
    """
    news_url_list = []
    for page in range(1, max_pages + 1):
        # Substitute the current page number so each iteration requests a
        # different index page.
        url = new_list_url.format(page=page)
        tmp_url_list = get_url(url)
        if not tmp_url_list:
            print('------目录爬取完毕-------')
            break
        news_url_list.extend(tmp_url_list)
    return news_url_list

def get_url(new_list_url, timeout=10):
    """Return the article links found on one index page.

    Args:
        new_list_url: URL of the index page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list with the ``href`` value of every anchor matched by the
        ``ul.list_009>li>a`` selector (empty if nothing matches).

    Raises:
        requests.exceptions.RequestException: If the page cannot be
            downloaded (including a timeout).
    """
    # A timeout prevents an unresponsive server from stalling the crawl.
    res = requests.get(new_list_url, timeout=timeout)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')

    url_list = [anchor.get('href') for anchor in soup.select('ul.list_009>li>a')]
    print('本页共%d条新闻链接：%s' % (len(url_list), new_list_url))
    return url_list

def save_new(root_dir, news_dict, pic_list):
    """Save one article (text and pictures) in its own directory.

    The directory is ``root_dir`` plus the article title, with characters
    matched by the sanitising pattern replaced by ``_``.

    Args:
        root_dir: Directory under which the article directory is created.
        news_dict: Article fields; must contain at least ``title`` plus the
            keys read by ``save_text``.
        pic_list: Image URLs handed to ``save_pic``.

    Returns:
        None. Any error is printed together with its traceback instead of
        being raised.
    """
    try:
        title = re.sub(r'[\\/:*?"<>|!:?!;]', '_', news_dict['title'])
        file_dir = root_dir + os.sep + title
        os.makedirs(file_dir, exist_ok=True)
        save_text(file_dir, news_dict)
        save_pic(file_dir, pic_list)
    except Exception as e:
        print('保存出错')
        print(e)
        traceback.print_exc()
    else:
        # Report success only when nothing failed; this also guarantees
        # file_dir is bound when it is formatted into the message.
        print("保存完毕，新闻文件途径:%s" % file_dir)

def save_text(file_dir, news_dict):
    """Write the article's text fields to ``<sanitised title>.txt``.

    Args:
        file_dir: Existing directory in which the file is created.
        news_dict: Mapping with the string values ``title``, ``time``,
            ``author``, ``source`` and ``article``.

    Raises:
        KeyError: If one of the required keys is missing.
        OSError: If the file cannot be written.
    """
    res = ('标题：' + news_dict['title'] + '\r\n' +
           '时间:' + news_dict['time'] + '\r\n' +
           '作者:' + news_dict['author'] + '\r\n' +
           '来源：' + news_dict['source'] + '\r\n' +
           '新闻正文:' + news_dict['article'] + '\r\n')

    title = re.sub(r'[\\/:*?"<>|!:?!;]', '_', news_dict['title'])
    file_path = os.path.join(file_dir, title + '.txt')

    # Binary mode keeps the explicit '\r\n' line endings untouched on every
    # platform; the context manager closes the file even if the write fails.
    with open(file_path, "wb") as f:
        f.write(res.encode("utf-8"))

def save_pic(file_dir, pic_list, timeout=10):
    """Download every picture and store it as ``<index>.jpg``.

    Args:
        file_dir: Existing directory in which the images are written.
        pic_list: Image URLs; the position in the list becomes the file
            name. ``None`` or empty entries are skipped.
        timeout: Seconds to wait for each download before giving up.

    Raises:
        requests.exceptions.RequestException: If a download fails for a
            reason other than a missing URL scheme.
        OSError: If an image file cannot be written.
    """
    for index, pic_url in enumerate(pic_list):
        if not pic_url:
            # An <img> without a usable src leaves nothing to download.
            continue
        pic_path = file_dir + os.sep + '%d.jpg' % index
        try:
            req = requests.get(pic_url, timeout=timeout)
        except requests.exceptions.MissingSchema as e:
            print('图片URL出错，尝试补全URL')
            print(e)
            req = requests.get('http:' + pic_url, timeout=timeout)
        # Reached only after a successful download, so a failed request can
        # never write stale or missing data; the file is always closed.
        with open(pic_path, "wb") as f:
            f.write(req.content)

def start_spider(root_url, root_dir):
    """Crawl the index pages, then fetch and save every article found.

    Args:
        root_url: Index URL template passed to ``get_url_list``.
        root_dir: Directory passed to ``save_new`` for each article.
    """
    links = get_url_list(root_url)
    print('------链接获取结束------')
    print('即将抓取 %d条新闻\r\n' % len(links))

    for index, link in enumerate(links):
        print('%d:' % index)
        news, pictures = get_news(link)
        if not news:
            # Nothing usable came back for this link; move on.
            continue
        save_new(root_dir, news, pictures)

    print('------抓取结束------')

if __name__ == '__main__':
    # Index page template; {page} is filled in with the page number.
    root_url = 'http://roll.news.sina.com.cn/news/gnxw/gdxw1/index_{page}.shtml'
    # os.path.join yields '.\news' on Windows and './news' elsewhere, so the
    # output directory is a real sub-directory on every platform.
    root_dir = os.path.join('.', 'news')
    start_spider(root_url, root_dir)

Savi.

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
1
评论
python日常代码

nt=datetime.strptime(soup.select('span.date')[0].text.strip(),'%Y年%m月%d日 %H:%M')'新闻正文:' + news_dict['article'] + '\r\n')res=('标题：' + news_dict['title'] + '\r\n' +print('------目录爬取完毕-------')print('------链接获取结束------')print('------抓取结束------')print('保存出错')
复制链接

扫一扫