使用scrapy爬取网站

最新推荐文章于 2024-04-17 17:00:53 发布

干啥都要好好干！

最新推荐文章于 2024-04-17 17:00:53 发布

阅读量691

点赞数

本文链接：https://blog.csdn.net/chasejava/article/details/79520422

版权

创建爬虫时使用的是 crawl 模板（using template 'crawl' in module），没有使用 basic 模板。

以下是spider.py 中的代码，如果有看不懂的可以直接查看官网中的讲解，使用的是1.5版本的scrapy

# -*- coding: utf-8 -*-

import json

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from DongGuan.items import DongguanItem

class DongguanSpider(CrawlSpider):
    """Crawl the question listings on wz.sun0769.com and scrape question pages.

    This spider is built on ``CrawlSpider``: link following is driven by
    ``rules``, and the callbacks must not be named ``parse`` because
    ``CrawlSpider`` uses that method itself. A spider created from the
    ``basic`` template (plain ``scrapy.Spider``) would implement ``parse``
    instead.
    """

    name = 'dongguan'
    # Restrict the crawl to this host; without it, links that point at other
    # sites could be followed as well.
    allowed_domains = ['wz.sun0769.com']
    # Seed URL; it is requested once when the crawl starts.
    start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4&page=0']

    rules = [
        # A Rule that needs no callback can simply omit ``callback``.
        # Listing pages: extracted links pass through ``deal_url`` before
        # being requested, and the crawl keeps following them.
        Rule(
            LinkExtractor(allow=r'index\.php/question/questionType'),
            process_links='deal_url',
            callback='parse_url',
            follow=True,
        ),
        # Detail pages: ``deny`` excludes URLs matching the given pattern
        # (see the LinkExtractor documentation).
        Rule(
            LinkExtractor(
                allow=r'/html/question/',
                deny=r'question/201803/362497\.shtml',
            ),
            callback='parse_detail',
            follow=True,
        ),
    ]

    def deal_url(self, links):
        """Hook for rewriting extracted links before they are requested.

        If the extracted URLs cannot be visited as-is, this is the place to
        fix them. Currently every link is only logged and the list is
        returned unchanged.

        Args:
            links: the link objects extracted by the first rule.

        Returns:
            The same ``links`` object, unmodified.
        """
        for link in links:
            self.logger.debug("Extracted link: %s", link.url)
        return links

    def parse_url(self, response):
        """Callback for listing pages; intentionally does nothing."""
        pass

    def parse_item(self, response):
        """Yield a single empty ``DongguanItem``.

        No rule in this class references this callback.
        """
        yield DongguanItem()

    def parse_detail(self, response):
        """Extract the question title, its number and its body from a page.

        Args:
            response: the downloaded question detail page.

        Yields:
            One ``DongguanItem`` with ``question``, ``order`` and
            ``question_detail`` filled in.
        """
        # The header text carries both the title and the number. The slices
        # below assume fixed-width text around them -- TODO confirm against
        # the live page layout.
        header = "".join(
            response.xpath("//strong[@class='tgray14']/text()").extract()
        )

        # The site's server is very slow (a page can take more than ten
        # seconds to open), so the body text may come back incomplete.
        body_parts = response.xpath(
            "//div[6]/div/div[2]/div[1]/text() | //div[6]/div/div[2]/div[1]/div[2]/text()"
        ).extract()

        item = DongguanItem()
        item['question'] = header[4:-18]
        item['order'] = header[-14:-8]
        # strip() drops the leading/trailing whitespace around the body.
        item['question_detail'] = "".join(body_parts).strip()

        yield item

"""-----------------------------------------------------------"""
"""管道中将文件下载"""
class DongguanPipeline(object):
    """Item pipeline that writes every scraped item to ``question.json``.

    The output is one UTF-8 encoded JSON object per line.
    """

    def __init__(self):
        # Binary mode: process_item encodes each line to UTF-8 itself.
        self.file = open("question.json", "wb")

    def process_item(self, item, spider):
        """Serialize ``item`` as one JSON line, write it and pass the item on.

        No artificial delay is added here: a blocking sleep in a pipeline
        stalls Scrapy's event loop and with it every in-flight request.
        Throttle the crawl with the ``DOWNLOAD_DELAY`` setting instead.

        Args:
            item: the scraped item; anything ``dict()`` accepts.
            spider: the spider that produced the item (unused).

        Returns:
            The same ``item``, so later pipelines can keep processing it.
        """
        # ensure_ascii=False keeps non-ASCII text (e.g. Chinese) readable in
        # the output instead of escaping it as \uXXXX sequences.
        json_text = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(json_text.encode("utf-8"))
        return item

    def close_spider(self, spider):
        """Close the output file once the spider has finished."""
        self.file.close()