This spider was generated with the 'crawl' template, not the default 'basic' template.
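For reference, such a spider is generated with the crawl template as shown below (omitting -t crawl would give the default basic template); the project and spider names match the code in this post:

scrapy startproject DongGuan
cd DongGuan
scrapy genspider -t crawl dongguan wz.sun0769.com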
Below is the code from spider.py; Scrapy 1.5 is used here, and anything unclear is explained in the official documentation.
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from DongGuan.items import DongguanItem
class DongguanSpider(CrawlSpider):
    """If this spider inherited from the basic spider class instead, we would implement the
    parse() method ourselves (see the basic-template sketch after this class); CrawlSpider
    uses parse() internally, so it must not be overridden here."""
    name = 'dongguan'
    # Without allowed_domains, the link extractor may match URLs pointing at other sites
    # and the crawl would wander off to them
    allowed_domains = ['wz.sun0769.com']
    # The initial URL to crawl; it is requested only once
    start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4&page=0']
    rules = [
        # If a Rule needs no callback, the callback argument can simply be omitted
        Rule(LinkExtractor(allow='index.php/question/questionType'),
             process_links='deal_url', callback='parse_url', follow=True),
    ]
    def deal_url(self, links):
        """If the extracted URLs cannot be visited as-is, rewrite each link here before returning the list."""
        for link in links:
            print(link.url)
        return links
    def parse_url(self, response):
        # print(response.url)
        pass
    def parse_item(self, response):
        link_list = response.xpath("//a[@class='news14']/@href").extract()
        # for link in link_list:
        #     print(link)
        item = DongguanItem()
        yield item
    def parse_detail(self, response):
        """Parse the concrete content of each complaint detail page.
        Note: this method is not referenced by any Rule above, so it is never scheduled as written."""
        item = DongguanItem()
        total_info = response.xpath("//strong[@class='tgray14']/text()").extract()
        question = "".join(total_info)[4:-18]
        order = "".join(total_info)[-14:-8]
        # print(order)
        # This site's server is very slow (a page can take ten-plus seconds to open),
        # so the response may come back without the complete data
        detail = response.xpath("//div[6]/div/div[2]/div[1]/text() | //div[6]/div/div[2]/div[1]/div[2]/text()").extract()
        detail = "".join(detail)
        # strip() only trims leading/trailing whitespace; stray interior characters such as
        # \xa0 would need something like detail.replace("\xa0", "") as well
        detail = detail.strip()
        # print(detail)
        # print("------------------")
        item['question'] = question
        item['order'] = order
        item['question_detail'] = detail
        yield item
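The spider above fills a DongguanItem with three fields: question, order and question_detail. A minimal DongGuan/items.py matching that usage would look like this (a sketch assuming no fields beyond the ones used above):

import scrapy

class DongguanItem(scrapy.Item):
    question = scrapy.Field()         # complaint title
    order = scrapy.Field()            # complaint number
    question_detail = scrapy.Field()  # complaint body text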
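As the class docstring mentions, a spider built from the basic template has no rules and implements parse() itself. A minimal sketch for comparison (the class name and the parse() body here are illustrative, not part of the original project):

import scrapy

class DongguanBasicSpider(scrapy.Spider):
    name = 'dongguan_basic'
    allowed_domains = ['wz.sun0769.com']
    start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4&page=0']

    def parse(self, response):
        # Follow links manually instead of relying on Rule/LinkExtractor;
        # response.follow resolves relative URLs (available since Scrapy 1.4)
        for href in response.xpath("//a[@class='news14']/@href").extract():
            yield response.follow(href, callback=self.parse)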
"""-----------------------------------------------------------"""
"""管道中将文件下载"""
import json
import time

class DongguanPipeline(object):
    def __init__(self):
        self.file = open("question.json", "wb")

    def process_item(self, item, spider):
        time.sleep(0.1)  # slow down item processing slightly
        # With the default ensure_ascii=True, Chinese text would be escaped to \uXXXX,
        # so ensure_ascii=False keeps it readable in the output file
        json_text = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(json_text.encode("utf-8"))
        return item

    def close_spider(self, spider):
        self.file.close()
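For this pipeline to run, it has to be enabled in the project's settings.py; a sketch, assuming the default module layout that scrapy startproject DongGuan generates:

ITEM_PIPELINES = {
    'DongGuan.pipelines.DongguanPipeline': 300,
}

The number 300 is the pipeline's order; pipelines with lower numbers run first.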