Scraping Sina News with Python: A Brief Code Walkthrough

1. Fetching the content of a Sina domestic-news page

import requests  # import the requests library

res = requests.get('http://news.sina.com.cn/china/')
res.encoding = 'utf-8'  # set the response encoding
print(res.text)

2. Getting the title, publish time, and link of every item on the page

import requests
from bs4 import BeautifulSoup  # import bs4

res = requests.get('http://news.sina.com.cn/china/')
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')  # html.parser is the parser to use

for news in soup.select('.news-item'):  # prefix a class name with '.', an id with '#'
    # print(news)
    if len(news.select('h2')) > 0:
        h2 = news.select('h2')[0].text       # the title
        a = news.select('a')[0]['href']      # the link
        time = news.select('.time')[0].text  # the publish time
        print(time, h2, a)
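As a quick check of the selector rule in the comment above, here is a minimal, self-contained sketch (the HTML snippet is invented for illustration):

from bs4 import BeautifulSoup

html = '<div id="blk"><h2 class="news-item">hello</h2></div>'  # invented snippet
demo = BeautifulSoup(html, 'html.parser')
print(demo.select('.news-item')[0].text)  # '.' selects by class -> hello
print(demo.select('#blk h2')[0].text)     # '#' selects by id    -> hello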

3. Extracting the details of a single article

(1) Fetch one article and load it into soup

import requests
from bs4 import BeautifulSoup

res = requests.get('http://news.sina.com.cn/c/nd/2018-01-26/doc-ifyqzcxh0024159.shtml')
res.encoding = 'utf-8'
print(res.text)
soup = BeautifulSoup(res.text, 'html.parser')

(2) Extracting the title

soup.select('.main-title')[0].text

(3) Extracting the publish time into timesource

timesource = soup.select('.date-source')[0].text.split('\n')[1]
# or: timesource = soup.select('.date-source')[0].contents[1].text
print(timesource)

# parse the string into a datetime object
from datetime import datetime  # import datetime

dt = datetime.strptime(timesource, '%Y年%m月%d日 %H:%M')

# format the datetime back into a string
dt.strftime('%y-%m-%d')
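For reference, a round trip with a made-up timestamp in the page's format:

from datetime import datetime

ts = '2018年01月26日 11:05'  # invented sample in the page's format
dt = datetime.strptime(ts, '%Y年%m月%d日 %H:%M')
print(dt)                       # 2018-01-26 11:05:00
print(dt.strftime('%y-%m-%d'))  # 18-01-26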

(4) Extracting the news source

soup.select('.date-source a')[0].text  # the text of the <a> inside .date-source

(5) Extracting the article body into article

article = []
for p in soup.select('#article_content p')[:-1]:
    article.append(p.text.strip())
print(article)

' '.join(article)  # join the paragraphs with spaces

The same thing can be done in a single line:

' '.join([p.text.strip() for p in soup.select('#article_content p')[:-1]])

(6) Extracting the editor

soup.select('.show_author')[0].text.lstrip('责任编辑:')
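Note that lstrip('责任编辑:') strips a character set, not a literal prefix, so it would also eat leading characters of a name that happen to fall in that set. A safer sketch that removes only the exact label:

author = soup.select('.show_author')[0].text
label = '责任编辑:'
if author.startswith(label):
    author = author[len(label):]  # drop the literal prefix only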

(7) Extracting the comment count

import requests
import json

comments = requests.get('http://comment5.news.sina.com.cn/page/info?version=1'
                        '&format=json&channel=gn&newsid=comos-fyqzcxh0024159'
                        '&group=undefined&compress=0&ie=utf-8&oe=utf-8&page=1'
                        '&page_size=3&t_size=3&h_size=3&')

jd = json.loads(comments.text)  # parse the response JSON into jd
jd['result']['count']['total']
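This endpoint sometimes wraps its JSON in a JavaScript assignment such as var data={...}; (Part 4 below strips it with .strip('var data=')). A slightly more defensive sketch, assuming the payload contains exactly one JSON object:

import json

text = comments.text
start, end = text.find('{'), text.rfind('}')  # locate the JSON object
jd = json.loads(text[start:end + 1])
print(jd['result']['count']['total'])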

(8) Parsing the news ID out of the URL

newsurl = 'http://news.sina.com.cn/c/nd/2018-01-26/doc-ifyqzcxh0024159.shtml'
newsid = newsurl.split('/')[-1].rstrip('.shtml').lstrip('doc-i')

The same with a regular expression:

import re

m = re.search('doc-i(.+).shtml', newsurl)
# print(m.group(0))
newsid = m.group(1)  # the captured news id
print(newsid)

commentURL = 'http://comment5.news.sina.com.cn/page/info?version=1&format=json&channel=gn&newsid=comos-{}&group=undefined&compress=0&ie=utf-8&oe=utf-8&page=1&page_size=3&t_size=3&h_size=3&'
commentURL.format(newsid)  # insert the parsed newsid into commentURL

4. Wrapping everything up as functions

(1) A function that returns the comment count

import re
import json
import requests

def getCommentCounts(newsurl):
    m = re.search('doc-i(.+).shtml', newsurl)
    newsid = m.group(1)
    commentURL = 'http://comment5.news.sina.com.cn/page/info?version=1&format=json&channel=gn&newsid=comos-{}&group=undefined&compress=0&ie=utf-8&oe=utf-8&page=1&page_size=3&t_size=3&h_size=3&'
    comments = requests.get(commentURL.format(newsid))
    jd = json.loads(comments.text.strip('var data='))  # drop the 'var data=' wrapper if present
    counts = jd['result']['count']['total']
    return counts

Usage:

news = 'http://news.sina.com.cn/c/nd/2018-01-26/doc-ifyqzcxh0024159.shtml'
getCommentCounts(news)

(2) A function that extracts the article details

import requests
from bs4 import BeautifulSoup
from datetime import datetime

def getNewsDetail(newsurl):
    result = {}
    res = requests.get(newsurl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    result['title'] = soup.select('.main-title')[0].text
    result['newssource'] = soup.select('.date-source a')[0].text
    timesource = soup.select('.date-source')[0].text.split('\n')[1]
    result['dt'] = datetime.strptime(timesource, '%Y年%m月%d日 %H:%M')
    result['article'] = ' '.join([p.text.strip() for p in soup.select('#article_content p')[:-1]])
    result['editor'] = soup.select('.show_author')[0].text.lstrip('责任编辑:')
    result['comments'] = getCommentCounts(newsurl)
    return result

Usage:

newsurl = 'http://news.sina.com.cn/c/nd/2018-01-26/doc-ifyqzcxh0024159.shtml'
getNewsDetail(newsurl)

These two functions simply consolidate the step-by-step code from Part 3.
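As a closing sketch (not part of the original post), the pieces can be tied together: collect the article links from Part 2's index page and feed each one to getNewsDetail. The helper getNewsLinks below is a hypothetical name; error handling is omitted:

import requests
from bs4 import BeautifulSoup

def getNewsLinks(pageurl='http://news.sina.com.cn/china/'):
    # hypothetical helper: return the article links on the index page
    res = requests.get(pageurl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    return [news.select('a')[0]['href']
            for news in soup.select('.news-item')
            if len(news.select('h2')) > 0]

for link in getNewsLinks()[:3]:  # first three articles only
    print(getNewsDetail(link))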
