#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import re
# Fetch the article page.
# A timeout keeps the script from hanging forever on an unresponsive server,
# and raise_for_status() reports HTTP errors directly instead of letting an
# error page fail further down with an opaque IndexError.
res = requests.get(
    'http://news.sina.com.cn/o/2017-12-01/doc-ifyphtze3020095.shtml',
    timeout=10,
)
res.raise_for_status()
# Force res.text to be decoded as UTF-8.
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')

# Print the article title.
print(soup.select('#artibodyTitle')[0].text)

# Publication time: the first child node of #navtimeSource, whitespace-stripped.
timesource = soup.select('#navtimeSource')[0].contents[0].strip()

# The source name and the source link are both read from the same <a>
# element, so look it up once.
source_link = soup.select('#navtimeSource span a')[0]
wsource = source_link.text
wa = source_link.get('href')

# Parse the timestamp string into a datetime.
# (dt.strftime('%Y-%m-%d %H:%M:%S') would turn it back into a string.)
dt = datetime.strptime(timesource, '%Y年%m月%d日%H:%M')

# Print the time, source name and source link, in that order.
print(dt, wsource, wa)

# Article body: the stripped text of every <p> under #artibody except the
# last one, joined with single spaces.
# NOTE(review): the last <p> is deliberately skipped -- presumably it is not
# part of the article text; confirm against the page markup.
print(' '.join(p.text.strip() for p in soup.select('#artibody p')[:-1]))

# Print the text of the first .article-editor element (the news editor line).
print(soup.select('.article-editor')[0].text)
# Extract the news identifier from the article URL.
def extract_newsid(url):
    """Return the news id embedded in a Sina article URL.

    The id is the last path segment with the exact 'doc-i' prefix and the
    exact '.shtml' suffix removed. A prefix or suffix that is not present is
    left alone.

    str.lstrip()/str.rstrip() must not be used here: their argument is a set
    of characters, not a substring, so ids that begin with any of
    'd', 'o', 'c', '-', 'i' or end with any of '.', 's', 'h', 't', 'm', 'l'
    would lose characters.
    """
    prefix = 'doc-i'
    suffix = '.shtml'
    name = url.split('/')[-1]
    if name.endswith(suffix):
        name = name[:-len(suffix)]
    if name.startswith(prefix):
        name = name[len(prefix):]
    return name


newsurl = 'http://news.sina.com.cn/o/2017-12-01/doc-ifyphtze3020095.shtml'
newsid = extract_newsid(newsurl)
print('newsid =', newsid)
# Alternative using a regular expression (note the escaped dot):
#   m = re.search(r'doc-i(.+)\.shtml', newsurl)
#   m.group(0) is the whole match, m.group(1) is the id.
# Sample output: