from BeautifulSoup import BeautifulSoup
import urllib2
import urllib
# Module-level output file for the scraped stories (GB18030-encoded text
# is written to it); closed in the __main__ block at the bottom of the file.
outfile = open("qiubai1.txt", "w")
def formalize(text):
    """Normalize story text for output.

    Splits *text* on u'\n', strips each line, drops blank lines, and
    emits every kept line followed by a blank-line separator, so the
    result ends with u'\n\n' (or is u'' when nothing remains).
    """
    stripped = (line.strip() for line in text.split(u'\n'))
    # join builds the result in one pass; the original += loop was quadratic.
    return u''.join(line + u'\n\n' for line in stripped if line)
def writeIO(text):
    """Append *text* (already encoded bytes) to the module-level outfile."""
    outfile.write(text)
def qiuBaiDemo(page):
    """Scrape one page of qiushibaike.com's "hot" listing.

    page -- page number as a string; appended to the listing URL.

    Extracts every <div class="content"> that carries a 'title'
    attribute (those are the real stories), normalizes the text with
    formalize(), then prints each story and appends it to the
    module-level outfile, GB18030-encoded, behind a numbered separator.
    """
    url = "http://www.qiushibaike.com/hot/page/" + page
    data = urllib2.urlopen(url).readlines()
    soup = BeautifulSoup("".join(data))
    # Story bodies live in <div class="content"> elements.
    contents = soup.findAll('div', "content")
    # Keep only divs with a 'title' attribute.  The original bare
    # `except:` swallowed everything (even KeyboardInterrupt) and
    # printed a stray blank line; narrow it to the KeyError that a
    # missing attribute actually raises.
    stories = []
    for div in contents:
        try:
            div['title']
        except KeyError:
            continue
        stories.append(str(div))
    for count, story in enumerate(stories, 1):
        minisoup = BeautifulSoup(story)
        # Concatenate every text node of the story markup.
        text = ''.join([e for e in minisoup.recursiveChildGenerator() if isinstance(e, unicode)])
        text = formalize(text).encode("gb18030")
        # Separator; the extra "\n" matches the newline `print >>` used to add.
        outfile.write('-' * 20 + " %05d " % count + '-' * 20 + "\n\n")
        print(text)
        writeIO(text)
if __name__ == '__main__':
    # Ask which listing page to scrape, run the scraper, then release
    # the module-level output file.
    requested_page = raw_input('Enter the page you want view : ')
    qiuBaiDemo(requested_page)
    outfile.close()
# 糗百爬虫 (Qiubai crawler)
# Originally published 2021-01-11 11:27:25 (blog-footer residue; commented out
# so the file remains valid Python)