#get news from www.sina.com
#and store them in different directories named by different categories
import urllib,os,sys
def getPage(url):
    """Download *url* and return the raw response body.

    The body is returned exactly as ``read()`` produced it; nothing is
    decoded here.
    NOTE(review): on Python 3 ``read()`` yields ``bytes`` while the callers
    in this file search the page with ``str`` literals -- a decode step is
    needed if the script is ever run there.

    Returns None (after printing a diagnostic) when the page cannot be
    fetched, so callers must be prepared for None.
    """
    # Resolve urlopen locally so the function works with both the Python 3
    # layout (urllib.request.urlopen) and the Python 2 one (urllib.urlopen).
    try:
        from urllib.request import urlopen
    except ImportError:
        from urllib import urlopen

    try:
        webfile = urlopen(url)
        try:
            return webfile.read()
        finally:
            # The original wrote `webfile.close` without parentheses, which
            # never closed the connection.
            webfile.close()
    except Exception:
        # Best-effort fetch: report the problem and signal failure with None.
        # Unlike a bare `except:`, KeyboardInterrupt/SystemExit still
        # propagate.
        print("Can't find the page requred!Or disconnected!")
        return None
def downtolocal(content, filename, target, root='C:' + os.sep + 'sina_news'):
    """Append *content* to the file ``root/target/filename``.

    content  -- text to write.
    filename -- name of the file inside the category directory.
    target   -- category directory name; created (with a message) when it
                does not exist yet.
    root     -- base directory holding all category directories; defaults
                to the location the script has always used.

    The file is created when missing and appended to when it already
    exists.  The process working directory is left untouched (the original
    implementation changed it with os.chdir on every call).

    NOTE(review): *filename* and *target* are used as given.  The caller in
    this file derives them from downloaded page text, so they may contain
    characters that are not valid in file names -- confirm whether they need
    sanitising.
    """
    directory = os.path.join(root, target)
    if not os.path.isdir(directory):
        os.makedirs(directory)
        print('Successful make directory: %s' % target)

    # Mode "a" creates a missing file and appends to an existing one, which
    # covers both the "w" and "a" branches of the original.  The `with`
    # block guarantees the close that the original's `f.close` (no call
    # parentheses) never performed.
    with open(os.path.join(directory, filename), 'a') as f:
        f.write(content)
def _iter_news_entries(section):
    """Yield (category, url, title) for each ``[category] <a href=URL target=_blank>title</a>`` entry in *section*."""
    href_prefix = '<a href='
    target_attr = 'target=_blank'
    pos = 0
    while True:
        open_br = section.find('[', pos)
        if open_br == -1:
            return
        close_br = section.find(']', open_br)
        href = section.find(href_prefix, open_br)
        if close_br == -1 or href == -1:
            return
        url_start = href + len(href_prefix)
        tag_end = section.find('>', url_start)
        if tag_end == -1:
            return
        title_end = section.find('</a>', tag_end)
        if title_end == -1:
            return
        # Resume after the closing </a>, so a '[' inside the title is not
        # mistaken for the start of the next entry.
        pos = title_end + len('</a>')

        attrs = section[url_start:tag_end]
        if not attrs.endswith(target_attr):
            # Not the link shape this parser understands; skip it instead of
            # slicing a bogus URL out of the tag.
            continue
        url = attrs[:-len(target_attr)].strip()
        yield section[open_br + 1:close_br], url, section[tag_end + 1:title_end]


def category(content):
    """Download every article listed on a news index page.

    *content* is the index page text.  The part between the
    ``<!--新闻开始-->`` and ``<!--新闻结束-->`` markers is scanned for
    entries of the form ``[category] <a href=URL target=_blank>title</a>``;
    each article is fetched with ``news()`` and stored through
    ``downtolocal()`` as ``<title>.txt`` in a directory named after its
    category.

    When *content* is empty/None or has no start marker, a message is
    printed and nothing is downloaded.  Returns None in every case.

    NOTE(review): the markers are non-ASCII literals; on Python 2 the file
    needs a source-encoding declaration that matches the page's bytes --
    confirm.
    """
    start_marker = '<!--新闻开始-->'
    end_marker = '<!--新闻结束-->'

    # `content` may be None when the fetch failed; treat it like a page
    # without markers rather than crashing on `.find`.
    start = content.find(start_marker) if content else -1
    if start == -1:
        print('This is not a regular news page.')
        return

    section = content[start + len(start_marker):]
    # Stop at the end marker.  The original loop condition compared a stale
    # offset with the marker position, so it never actually stopped there.
    end = section.find(end_marker)
    if end != -1:
        section = section[:end]

    for cat, realurl, title in _iter_news_entries(section):
        downtolocal(news(realurl), title + '.txt', cat)
def news(url):
    """Fetch the article at *url* and return the text of its paragraphs.

    The page is searched for ``<div id=article>``; from there on, the
    content of every ``<p>`` paragraph is collected with all tags removed,
    and the pieces are concatenated without a separator.

    When the page could not be fetched or has no article div, the message
    'This is not a text news page!' is printed and also returned, so the
    caller always receives a string.
    """
    import re  # local import: the file's top-level imports do not include re

    not_text = 'This is not a text news page!'
    page = getPage(url)
    # getPage returns None on failure; handle that like a non-article page.
    start = page.find('<div id=article>') if page else -1
    if start == -1:
        print(not_text)
        return not_text

    body = page[start:]
    pieces = []
    pos = body.find('<p>')
    while pos != -1:
        text_start = pos + len('<p>')
        next_open = body.find('<p>', text_start)
        close = body.find('</p>', text_start)
        # A paragraph ends at its </p>; if that is missing, at the next <p>;
        # failing both, at the end of the page.
        ends = [i for i in (close, next_open) if i != -1]
        text_end = min(ends) if ends else len(body)
        # Drop any tags nested inside the paragraph, keeping only the text.
        pieces.append(re.sub(r'<[^>]*>', '', body[text_start:text_end]))
        pos = next_open
    return ''.join(pieces)
# Script entry: make sure the download root exists before anything is saved.
if not os.path.exists('C:' + os.sep + 'sina_news'):
    os.mkdir('C:' + os.sep + 'sina_news')

# Fetch the Sina news index page and download every article it lists.
category(
    getPage('http://news.sina.com.cn/news1000/index.shtml')
)
# 这个程序是我接触 Python 的第二天写出来的东西,有很多幼稚的地方,但是它让我了解到了 Python 的易用和简洁。后来我一直在用 Python 写测试,我觉得 Python 真的很适合写测试。但是 Python 应该还有更大的用处:很多网站都是用 Python 搞出来的,比如现在很火的豆瓣。如果写小程序,让我在 C++ 和 Python 之间选择的话,我一定会选择 Python。