一、获取一个新浪国内新闻页面的内容
import requests  # HTTP client library

# Download the Sina domestic-news index page and print its HTML.
res = requests.get('http://news.sina.com.cn/china/')
res.encoding = 'utf-8'  # the page is UTF-8; set it so res.text decodes correctly
print(res.text)
二、获取新浪国内新闻页面所有新闻的标题,发布时间和链接
import requests
from bs4 import BeautifulSoup  # HTML parsing

# Fetch the index page and parse it.
res = requests.get('http://news.sina.com.cn/china/')
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')  # html.parser is the stdlib parser

# Each headline lives in an element with class "news-item"
# (CSS selectors: "." prefix for a class, "#" prefix for an id).
for news in soup.select('.news-item'):
    # Skip decorative items that carry no <h2> headline.
    if len(news.select('h2')) > 0:
        h2 = news.select('h2')[0].text          # headline text
        a = news.select('a')[0]['href']         # link URL
        time = news.select('.time')[0].text     # publish time
        print(time, h2, a)
三、获取一篇新闻的相关内容
(1)获取一篇新闻的内容,存入soup中
import requests
from bs4 import BeautifulSoup

# Download a single article page and parse it into `soup`.
res = requests.get('http://news.sina.com.cn/c/nd/2018-01-26/doc-ifyqzcxh0024159.shtml')
res.encoding = 'utf-8'
print(res.text)
soup = BeautifulSoup(res.text, 'html.parser')
(2)抓取标题
# Article title: text of the first element with class "main-title".
soup.select('.main-title')[0].text
(3)抓取时间并存入timesource
# Publish time: second line of the ".date-source" node's text.
# Equivalent alternative: soup.select('.date-source')[0].contents[1].text
timesource = soup.select('.date-source')[0].text.split('\n')[1]
print(timesource)

# Parse the Chinese-formatted timestamp into a datetime object.
# BUG FIX: the original discarded strptime's return value and then called
# dt.strftime on an undefined name `dt` — assign the result first.
from datetime import datetime
dt = datetime.strptime(timesource, '%Y年%m月%d日 %H:%M')
# Format the datetime back into a string (2-digit year, e.g. '18-01-26').
dt.strftime('%y-%m-%d')
(4)抓取新闻来源
# News source: text of the <a> element inside the ".date-source" node.
soup.select('.date-source a')[0].text
(5)抓取新闻正文并存入article
# Collect every paragraph of the article body; the final <p> is the editor
# line, so it is excluded with [:-1].
article = []
for p in soup.select('#article_content p')[:-1]:
    article.append(p.text.strip())
print(article)
' '.join(article)  # join the paragraphs into one space-separated string
# The same result in a single line:
#   ' '.join([p.text.strip() for p in soup.select('#article_content p')[:-1]])
(6)抓取编辑
# Editor: the ".show_author" node's text is "责任编辑:<name>".
# BUG FIX: lstrip('责任编辑:') strips a character SET, not the prefix, and
# would also eat leading characters of the name if they appear in that set —
# remove the exact prefix instead.
_author = soup.select('.show_author')[0].text
_author[len('责任编辑:'):] if _author.startswith('责任编辑:') else _author
(7)抓取文章评论数
import requests
import json

# Query Sina's comment API for this article (page 1, 3 comments per page).
# (Backslash-newline inside the string literal continues the URL without
# inserting any character.)
comments = requests.get('http://comment5.news.sina.com.cn/page/info?version=1\
&format=json&channel=gn&newsid=comos-fyqzcxh0024159&\
group=undefined&compress=0&ie=utf-8&oe=utf-8&page=1&\
page_size=3&t_size=3&h_size=3&')

# Decode the JSON payload and read the total comment count.
jd = json.loads(comments.text)
jd['result']['count']['total']
(8)剖析新闻标识
# The news id is embedded in the URL as .../doc-i<newsid>.shtml
newsurl = 'http://news.sina.com.cn/c/nd/2018-01-26/doc-ifyqzcxh0024159.shtml'

# Fragile way: rstrip/lstrip strip CHARACTER SETS, not substrings — this
# happens to work for this particular id but breaks if the id begins or
# ends with one of the stripped characters.
newsid = newsurl.split('/')[-1].rstrip('.shtml').lstrip('doc-i')

# Robust way: capture the id with a regex.
import re
m = re.search('doc-i(.+).shtml', newsurl)
# print(m.group(0))  # group(0) is the whole match
newsid = m.group(1)  # group(1) is just the id
print(newsid)

# Comment-API URL template; commentURL.format(newsid) plugs the id in.
# BUG FIX: the original single-line string contained a stray literal
# backslash ("version=1\&format"), left over from a collapsed line
# continuation, which corrupted the query string.
commentURL = 'http://comment5.news.sina.com.cn/page/info?version=1&format=json&channel=gn&newsid=comos-{}&group=undefined&compress=0&ie=utf-8&oe=utf-8&page=1&page_size=3&t_size=3&h_size=3&'
commentURL.format(newsid)
四、函式总结
(1)获取评论数的函式
import re
import json
import requests


def getCommentCounts(newsurl):
    """Return the total comment count for a Sina news article URL.

    Extracts the news id from the URL (.../doc-i<id>.shtml), queries the
    comment API, and reads result.count.total from the JSON payload.
    """
    m = re.search('doc-i(.+).shtml', newsurl)
    newsid = m.group(1)
    # BUG FIX: the original template held a stray literal backslash
    # ("version=1\&format"), left over from a collapsed line continuation,
    # which corrupted the query string.
    commentURL = ('http://comment5.news.sina.com.cn/page/info?version=1'
                  '&format=json&channel=gn&newsid=comos-{}&group=undefined'
                  '&compress=0&ie=utf-8&oe=utf-8&page=1&page_size=3'
                  '&t_size=3&h_size=3&')
    comments = requests.get(commentURL.format(newsid))
    # The endpoint returns JSONP-style "var data=<json>".
    # BUG FIX: strip('var data=') removes a character SET from both ends and
    # only works by luck (the JSON starts '{' and ends '}'); remove the exact
    # prefix instead.
    payload = comments.text
    if payload.startswith('var data='):
        payload = payload[len('var data='):]
    jd = json.loads(payload)
    counts = jd['result']['count']['total']
    return counts
调用:
# Example call: print-free invocation returning the comment total.
news = 'http://news.sina.com.cn/c/nd/2018-01-26/doc-ifyqzcxh0024159.shtml'
getCommentCounts(news)
(2)抓取新闻正文内容函式
import requests
from bs4 import BeautifulSoup
from datetime import datetime


def getNewsDetail(newsurl):
    """Scrape one Sina news article and return its details as a dict.

    Keys: 'title', 'newssource', 'dt' (datetime), 'article' (space-joined
    paragraphs), 'editor', 'comments' (total count via getCommentCounts).
    """
    result = {}
    res = requests.get(newsurl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    result['title'] = soup.select('.main-title')[0].text
    result['newssource'] = soup.select('.date-source a')[0].text
    # Publish time is the second line of the ".date-source" node's text.
    timesource = soup.select('.date-source')[0].text.split('\n')[1]
    result['dt'] = datetime.strptime(timesource, '%Y年%m月%d日 %H:%M')
    # Body paragraphs, minus the trailing editor line ([:-1]).
    result['article'] = ' '.join(
        [p.text.strip() for p in soup.select('#article_content p')[:-1]])
    # BUG FIX: lstrip('责任编辑:') strips a character SET, not the prefix, and
    # could eat leading characters of the name — remove the exact prefix.
    editor = soup.select('.show_author')[0].text
    if editor.startswith('责任编辑:'):
        editor = editor[len('责任编辑:'):]
    result['editor'] = editor
    result['comments'] = getCommentCounts(newsurl)
    return result
调用:
# Example call: fetch and assemble the full detail dict for one article.
newsurl = 'http://news.sina.com.cn/c/nd/2018-01-26/doc-ifyqzcxh0024159.shtml'
getNewsDetail(newsurl)
以上函式只是对第三部分每一小部分的总结