#coding=utf-8
import requests
from bs4 import BeautifulSoup
from datetime import datetime
# Scrape a single Sina news article and print its title, publish time,
# source, and body paragraphs.
# NOTE: `requests` and `BeautifulSoup` come from the imports at the top of
# this file; `datetime` is imported there but is currently unused.

NEWS_URL = 'http://news.sina.com.cn/c/nd/2017-02-28/doc-ifyavvsk3799160.shtml'


def fetch_soup(url):
    """Download *url* and return the page parsed as a BeautifulSoup tree."""
    # timeout so a dead/slow server cannot hang the script forever
    res = requests.get(url, timeout=10)
    # Sina serves UTF-8; set it explicitly so res.text decodes correctly
    res.encoding = 'utf-8'
    return BeautifulSoup(res.text, 'html.parser')


def main():
    soup = fetch_soup(NEWS_URL)

    # Article title, e.g. u'新疆举行反恐维稳誓师大会 王宁陈全国讲话(图)'
    print(soup.select('#artibodyTitle')[0].text)

    # .time-source holds the timestamp as bare text (contents[0]) followed
    # by the source name inside a child tag (contents[1]),
    # e.g. '2017年02月28日01:09' then '天山网'.
    time_source = soup.select('.time-source')[0]
    print(time_source.contents[0].strip())   # publish time
    print(time_source.contents[1].text[1:])  # source name (skip leading char)

    # Body paragraphs; the last <p> is an editor/boilerplate line, so drop it.
    article = [p.text for p in soup.select('#artibody p')[:-1]]
    for paragraph in article:
        print(paragraph)


if __name__ == '__main__':
    main()
##读json
# import json
# jd = json.loads(requests.get('XXXX').text.strip('var data='))  # NOTE: str.strip removes a char set, not a literal prefix
# jd['result']['count']['total']
# #获得新闻id
# newsurl = 'http://news.sina.com.cn/c/nd/2016-08-20/doc-ifxvctcc8121090.shtml'
# newsid = newsurl.split('/')[-1].rstrip('.shtml').lstrip('doc-i')  # NOTE: rstrip/lstrip strip char sets, not literal affixes — the regex below is the robust way
#
# import re
# pattern = re.compile('doc-i(.*).shtml')
# m = re.search(pattern,newsurl)
# print m.group(1)