# Basic crawler tutorial this script follows (clear and beginner-friendly,
# recommended reading): http://www.cnblogs.com/xin-xin/p/4297852.html
#
# Usage: run directly. Scrapes the joke text from pages 1 through 20 of
# Qiushibaike (qiushibaike.com) and appends it to a local text file.
# Note: image-only posts on the site are not handled here.
# -*- coding:utf-8 -*-
# NOTE(review): for this coding declaration to take effect it must be on
# line 1 or 2 of the file — confirm placement after the header comments.
from scrapy import Selector
import urllib2
import sys
from time import sleep
# Python 2-only hack: re-expose sys.setdefaultencoding (deleted by site.py)
# and force UTF-8 so writing the scraped Chinese text to a file does not
# raise UnicodeEncodeError.
reload(sys)
sys.setdefaultencoding('utf-8')
page = 1
for x in range(1, 20):
url = 'http://www.qiushibaike.com/8hr/page/'+str(page)
headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
try:
request = urllib2.Request(url, headers = headers)
response = urllib2.urlopen(request)
except urllib2.URLError, e:
if hasattr(e, 'code'):
print e.code
if hasattr(e, 'reason'):
print e.reason
# selector的使用参考:http://scrapy-chs.readthedocs.io/zh_CN/latest/topics/selectors.html
sel = Selector(text=response.read(), type="html")
with open(r'C:\Users\Wang Zuo\Desktop\test.txt', 'a') as f:
# 通过 XPath来选择title标签内的文字:
for x in sel.xpath('//div[@class = "content"]/text()').extract():
f.write(x)
sleep(0.5)
page += 1