# -*- coding: utf-8 -*-
import urllib2, re
def _clean_article_html(fragment):
    """Strip markup, script blocks and known page residue from *fragment*.

    Returns the remaining text after four substitution passes.
    """
    # Pass 1: drop whole <script>...</script> blocks while their tags are
    # still intact. DOTALL lets the body span lines; IGNORECASE also catches
    # <SCRIPT>. (The old pattern carried a literal PHP-style "si" suffix and
    # sat behind the generic tag alternative, so it could never match.)
    fragment = re.compile(r'<script[^>]*?>.*?</script>',
                          re.S | re.I).sub('', fragment)

    # Pass 2: remaining tags, a known inline-JS leftover, and entities.
    # The "||" of the JS snippet must be escaped: unescaped it forms an empty
    # alternative that always matches first and makes every alternative to
    # its right (the entity removals) unreachable.
    fragment = re.sub(
        r'<[/!]*?[^<>]*?>'
        r'|window\.zlzp = window\.zlzp\|\|\{\};'
        r'|&(nbsp|#160);'
        r'|&(quot|#34);'
        r'|" target="_blank">(.*)</a>',
        '', fragment)

    # Pass 3: blank-line runs, whitespace and further page-specific residue.
    # NOTE(review): the three whitespace alternatives read as plain spaces
    # here; they may originally have been distinct characters (tab, NBSP,
    # full-width space) -- confirm against the original file.
    fragment = re.sub(
        r'\n\s*\r| |document.getElementById(.*);'
        r'|<span class="rz">(.*)</span>]</span>|<a href="| | ',
        '', fragment)

    # Pass 4: turn the remaining separators into a single space.
    return re.sub(r' |>>', ' ', fragment)


def getNews(url, regular):
    """Download *url* and return the cleaned text captured by *regular*.

    Args:
        url: Address of the page to fetch (opened with a 40 second timeout).
        regular: Regular expression applied to the page with ``re.S``; its
            capture group selects the HTML fragment to clean.

    Returns:
        The captured fragment with tags, script blocks, entities and other
        residue removed. When the pattern matches more than once the last
        match is used. ``None`` is returned when nothing matches (the old
        code failed with an unbound local variable in that case).

    Errors raised by ``urlopen`` or by an invalid *regular* propagate.
    """
    # Function-scope import so the function works with Python 2 (urllib2)
    # and Python 3 (urllib.request) without touching the file's imports.
    try:
        from urllib2 import urlopen
    except ImportError:
        from urllib.request import urlopen

    response = urlopen(url, timeout=40)
    try:
        html = response.read()
    finally:
        # Always release the connection, even if read() fails.
        response.close()

    # Python 3 hands back bytes; decode so the text patterns can be applied.
    # On Python 2 `str` is already the byte string and is left untouched.
    # NOTE(review): assumes the page is UTF-8 encoded -- confirm.
    if not isinstance(html, str):
        html = html.decode('utf-8', 'replace')

    matches = re.findall(regular, html, re.S)
    if not matches:
        return None
    # Only the returned match needs cleaning.
    return _clean_article_html(matches[-1])
# Demo run: fetch one news.ifeng.com article page and print the cleaned text.
# The capture group of `regular` spans from the "artical_real" <div> up to
# the "ifengLogo" <span>.
regular = '<div id="artical_real">(.*)<span class="ifengLogo">'
url = "http://news.ifeng.com/mainland/detail_2013_11/06/30987846_0.shtml"
# Parenthesised form prints the same single value as the bare statement.
print(getNews(url, regular))