一、爬取知乎热门内容
# -*- coding: utf-8-*-
import urllib2
import re
from BeautifulSoup import BeautifulSoup
import sys
reload(sys)
sys.setdefaultencoding('utf8')
# Scrape pages 1-20 of Zhihu collection 27109279 and append the cleaned text of
# every matching element to 'howtoTucao2.txt'.
#
# Elements are selected by CSS class: 'zm-item-title' (handled via their <a>
# child when the element is an <h2>) and 'content hidden' (presumably the
# question titles and the answer bodies -- confirm against the page markup).

# Markup-stripping patterns, compiled once because they are applied to every
# element of every page.
_LEADING_MARKUP = re.compile(r'<s.+>\n<a.+>\n<.+>\n')
_LINE_BREAK = re.compile(r'<br>')
_OPEN_TAG = re.compile(r'<\w+>')
_CLOSE_TAG = re.compile(r'</\w+>')
_OTHER_TAG = re.compile(r'<.+>')


def _clean_markup(raw):
    """Return *raw* with its HTML markup reduced to plain text.

    The substitutions run in a fixed order, because the later, broader
    patterns would otherwise swallow what the earlier ones handle:

    1. drop a leading three-line run of tags,
    2. turn ``<br>`` into a newline,
    3. remove bare opening and closing tags,
    4. replace whatever tag-like text remains with a "图片" ("image")
       placeholder on its own line,
    5. unescape ``&quot;``.

    ``raw`` may be ``None`` (a tag's ``.string`` is ``None`` when it does not
    hold exactly one text child); an empty string is returned in that case so
    the caller's "No Answer" fallback is used instead of raising TypeError.
    """
    if raw is None:
        return ''
    text = _LEADING_MARKUP.sub('', raw)
    text = _LINE_BREAK.sub('\n', text)
    text = _OPEN_TAG.sub('', text)
    text = _CLOSE_TAG.sub('', text)
    text = _OTHER_TAG.sub('\n图片\n', text)
    # NOTE(review): the original replaced '"' with '"', which is a no-op;
    # presumably the '&quot;' entity was decoded when the code was published.
    # Confirm that unescaping '&quot;' is the intended behaviour.
    text = text.replace('&quot;', '"')
    return text


# 'with' guarantees the output file is flushed and closed even when a request
# or a parse step raises part-way through the run.
with open('howtoTucao2.txt', 'w') as f:
    for pagenum in range(1, 21):
        strpagenum = str(pagenum)
        # Progress message so the run can be followed in the shell.
        print("Getting data for Page " + strpagenum)
        url = "http://www.zhihu.com/collection/27109279?page=" + strpagenum
        page = urllib2.urlopen(url)  # fetch the web page
        try:
            soup = BeautifulSoup(page)  # parse the fetched page
        finally:
            page.close()  # release the HTTP connection once parsed
        ALL = soup.findAll(attrs={'class': ['zm-item-title', 'content hidden']})
        for each in ALL:
            if each.name == 'h2':
                # For <h2> elements the text lives in the <a> child, which
                # may be absent.
                anchor = each.a
                raw = anchor.string if anchor is not None else None
            else:
                raw = each.string
            nowstring = _clean_markup(raw)
            print(nowstring)
            if nowstring:
                f.write(nowstring)
            else:
                f.write("\n No Answer \n")
二、爬取简书内容(基于Scrapy框架)
(1)item.py
<