1. Create the spider

import scrapy
from scrapy.http import Request
from urllib import parse
from ArticleSpider.items import PythonDoc

# Define PythonDocSpider, inheriting from scrapy.Spider.
class PythonDocSpider(scrapy.Spider):
    name = 'pythonDoc'
    allowed_domains = ['www.kuqin.com']
    start_urls = ['http://www.kuqin.com/abyteofpython_cn/index.html']

    def parse(self, response):
        """
        Extract the post_urls from the listing page and hand each one to the
        Scrapy downloader as a Request.
        :param response:
        :return:
        """
        post_urls = response.css("a::attr(href)").extract()
        for post_url in post_urls:
            yield Request(url=parse.urljoin(response.url, post_url), callback=self.parse_detail)
        # Pull the next-page link out of the fetched page and recurse into this
        # method, looping the crawl through the whole listing. extract_first()
        # returns None instead of raising IndexError when nothing matches.
        next_url = response.css("[align=right]>a::attr(href)").extract_first()
        if next_url:
            yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)

    def parse_detail(self, response):
        """
        Parse the response and save the page content into the item.
        """
        doc_item = PythonDoc()
        content = response.text
        # The pages declare charset=gb2312; rewriting "2312" to "k" turns that
        # declaration into gbk, matching the encoding the pipeline writes with.
        doc_item['content'] = content.replace("2312", "k")
        doc_item['file_name'] = response.url
        yield doc_item
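
The two CSS selectors can be checked in isolation before running the crawl. A minimal sketch, assuming a made-up HTML fragment that mirrors the structure the selectors expect on the real listing page:

from scrapy.selector import Selector

# Stand-in for the real listing page: chapter links plus a right-aligned
# "next page" link, which is what the two selectors above target.
html = """
<ul>
  <li><a href="ch01.html">Chapter 1</a></li>
  <li><a href="ch02.html">Chapter 2</a></li>
</ul>
<p align="right"><a href="index2.html">next</a></p>
"""

sel = Selector(text=html)
print(sel.css("a::attr(href)").extract())                       # all links
print(sel.css("[align=right]>a::attr(href)").extract_first())   # next-page link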
2. Create the item used by the spider (it holds the scraped data)

import scrapy
# Classic import path; newer Scrapy exposes these from itemloaders.processors.
from scrapy.loader.processors import Compose, TakeFirst

# Replace is handed to Compose to post-process a field value; it performs the
# same charset rewrite as parse_detail above.
def Replace(content):
    new_content = content.replace('2312', 'k')
    return new_content

# Item for the scraped static Python tutorial pages.
class PythonDoc(scrapy.Item):
    content = scrapy.Field(
        # Field processors only run when the item is populated through an
        # ItemLoader; TakeFirst collapses the collected list of values to a
        # single string before Replace sees it.
        output_processor=Compose(TakeFirst(), Replace)
    )
    file_name = scrapy.Field()
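
As the comments note, an output_processor fires only when the item is built through an ItemLoader; parse_detail in step 1 assigns the fields directly, so Replace never runs there. A minimal sketch of a loader-based parse_detail, assuming the item definition above (with the loader in place, the manual replace call in the spider becomes unnecessary):

from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst

def parse_detail(self, response):
    loader = ItemLoader(item=PythonDoc(), response=response)
    # Fields without their own processor (file_name) are collapsed from a
    # one-element list to a plain string.
    loader.default_output_processor = TakeFirst()
    loader.add_value('content', response.text)  # Compose(TakeFirst(), Replace) runs on load_item()
    loader.add_value('file_name', response.url)
    yield loader.load_item()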
3. Create a pipeline to save the scraped page content:

class StorePythonDoc(object):
    def process_item(self, item, spider):
        content = item['content']
        file_name = item['file_name']
        # Use the last segment of the URL (e.g. ch01.html) as the file name.
        name = file_name.rsplit('/', 1)[-1]
        self.save_to_file(name, content)
        # Return the item so any later pipelines still receive it.
        return item

    def save_to_file(self, file_name, content):
        # The saved pages declare gbk (after the rewrite), so write them as GBK.
        with open(file_name, mode="w", encoding='GBK') as fn:
            fn.write(content)
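
Because the pipeline is a plain class, the save logic can be exercised without starting a crawl. A quick, hypothetical check using a hand-built dict in place of a real PythonDoc item:

# Writes ch01.html into the current working directory.
pipeline = StorePythonDoc()
fake_item = {
    'content': '<html><head><meta charset=gbk></head><body>test</body></html>',
    'file_name': 'http://www.kuqin.com/abyteofpython_cn/ch01.html',
}
pipeline.process_item(fake_item, spider=None)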
4. Configure the spider in the settings file

ITEM_PIPELINES = {
    # Register the pipeline; the number sets the run order, lower runs first.
    'ArticleSpider.pipelines.StorePythonDoc': 100,
}
# Whether to obey robots.txt; set it to False, otherwise some sites will refuse the crawl.
ROBOTSTXT_OBEY = False
# Throttle the crawl: seconds to wait between requests to the same site.
DOWNLOAD_DELAY = 5
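
A fixed DOWNLOAD_DELAY works, but Scrapy's built-in AutoThrottle extension can adapt the delay to the server's response times instead. A sketch of the relevant settings (the values are illustrative, not tuned):

AUTOTHROTTLE_ENABLED = True             # let Scrapy adjust the delay dynamically
AUTOTHROTTLE_START_DELAY = 5            # initial delay, in seconds
AUTOTHROTTLE_MAX_DELAY = 60             # upper bound when the server is slow
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0   # average parallel requests per remote server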
5. Launch the spider from a main script to start crawling

from scrapy.cmdline import execute
import os
import sys

# Put the project root on sys.path, then run the equivalent of
# "scrapy crawl pythonDoc" from the command line.
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
execute(["scrapy", "crawl", "pythonDoc"])
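
Equivalently, the spider can be started in-process without going through the scrapy command line. A minimal sketch using CrawlerProcess:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Load settings.py (pipelines, delays, etc.) and run the spider by name.
process = CrawlerProcess(get_project_settings())
process.crawl('pythonDoc')
process.start()   # blocks until the crawl finishes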