#coding=utf-8
# Requires BeautifulSoup ("the beautiful soup"): http://crummy.com/software/BeautifulSoup
import urllib
import urllib2
from xml.sax.saxutils import unescape
from BeautifulSoup import BeautifulSoup # For processing HTML
def formalize(text):
    """Normalize scraped story text into paragraphs.

    Strips leading/trailing whitespace from every line, drops blank
    lines, and terminates each surviving line with a blank line so the
    output file reads as separated paragraphs.
    """
    stripped = [line.strip() for line in text.split(u'\n')]
    # join() builds the result in one pass instead of quadratic +=.
    return u''.join(line + u'\n\n' for line in stripped if line)
outfile = open("qiushi.txt", "w")
count = 0
for i in range(1, 101):
url = "http://qiushibaike.com/qiushi/best/all/page/%d" % i
data = urllib2.urlopen(url).readlines()
soup = BeautifulSoup("".join(data))
contents = soup.findAll('div', "content")
stories = [str(text) for text in contents]
for story in stories:
count += 1
print "processing page %d, %d items added" % (i, count)
minisoup = BeautifulSoup(story)
text = ''.join([e for e in minisoup.recursiveChildGenerator() if isinstance(e, unicode)])
text = urllib.unquote(unescape(text, {'"':'"'}))
text = formalize(text).encode("utf-8")
print >> outfile, '-' * 20 + " %05d " % count + '-' * 20 + "\n"
print >> outfile, text + "\r\n"
outfile.close()
The basic operations are as follows:
# -*- coding: cp936 -*-
import urllib
import urllib2
from xml.sax.saxutils import unescape
from BeautifulSoup import BeautifulSoup # For processing HTML
#url = "http://job.dajie.com/7262fae6-a1aa-4674-9efa-3baf697faa46.html"
url="http://www.qiushibaike.com/hot"
data = urllib2.urlopen(url).readlines()
soup = BeautifulSoup("".join(data))
contents = soup.findAll('div', "content")
stories = [str(text) for text in contents]
for story in stories:
minisoup = BeautifulSoup(story)
text = ''.join([e for e in minisoup.recursiveChildGenerator() if isinstance(e, unicode)])
print text
break
'''
for div in soup.findAll('div', "content") :
print 'find it'
print div
minisoup = BeautifulSoup(div)
#来遍历文档中所有元素, 并打印它们
text = ''.join([e for e in minisoup.recursiveChildGenerator() if isinstance(e, unicode)])
print text
break
s = div.contents
for x in s:
if (x.encode('GB2312')) != '<br/>' and (x.encode('GB2312')) != '\n': #注意此处GB2312编码不是utf8
print x.encode('GB2312')
break
'''