scrapy 爬 zol 笑话大全

最新推荐文章于 2024-01-09 16:57:29 发布

xiaogeldx

最新推荐文章于 2024-01-09 16:57:29 发布

阅读量171

点赞数

本文链接：https://blog.csdn.net/xiaogeldx/article/details/104057284

版权

文章目录

xlb.py
settings.py
pipelines.py
items.py

xlb.py

import scrapy
import re
from test_spider.items import TestSpiderItem


class XlbSpider(scrapy.Spider):
    """Crawl the ZOL cold-joke listing pages and yield one item per joke.

    Starts at page 1 of the listing and follows the "next page" link until
    the page number would exceed ``max_page``.
    """

    # Spider name; must be unique within the project.
    name = 'xlb'
    # Bare host name only (no scheme, path or trailing slash); requests to
    # other hosts are dropped by Scrapy's offsite filtering.
    allowed_domains = ['xiaohua.zol.com.cn']
    # First listing page to fetch.
    start_urls = ['http://xiaohua.zol.com.cn/lengxiaohua/1.html']
    base_domain = 'http://xiaohua.zol.com.cn'
    # Last listing page to crawl (inclusive).
    max_page = 9

    def parse(self, response):
        """Extract the jokes on one listing page, then follow the next page.

        ``response`` supports ``xpath()``; the result is a selector list on
        which ``get()`` returns the first match as a string (or ``None``) and
        ``getall()`` returns every match as a list of strings.

        Yields:
            ``TestSpiderItem`` objects (handed to the item pipelines enabled
            through ``ITEM_PIPELINES`` in settings.py), followed by at most
            one ``scrapy.Request`` for the next listing page.
        """
        entries = response.xpath('//ul[@class="article-list"]/li[@class="article-summary"]')
        for entry in entries:
            author = entry.xpath('.//span[@class="article-title"]/a[@target="_blank"]/text()').get()
            conts = entry.xpath('.//div[@class="summary-text"]//text()').getall()
            # strip() only trims the two ends of a string; the tabs and line
            # breaks here also sit between the text fragments, so they are
            # removed explicitly after joining the fragments.
            cont = ''.join(conts).replace('\t', '').replace('\r', '').replace('\n', '')
            yield TestSpiderItem(author=author, content=cont)

        next_href = response.xpath('//div[@class="page"]/a[@class="page-next"]/@href').get()
        if not next_href:
            # No "next" link on this page: nothing more to follow.
            return
        next_url = response.urljoin(next_href)

        # Compare the real page number instead of searching the whole URL for
        # the substring '10', which could match unrelated parts of the URL.
        page_match = re.search(r'/(\d+)\.html', next_url)
        if page_match and int(page_match.group(1)) > self.max_page:
            return
        yield scrapy.Request(next_url, callback=self.parse)

settings.py

...
# Download delay, in seconds.
# NOTE: randint() is evaluated once, when this settings module is loaded, so
# the delay is one fixed value (1, 2 or 3) for the whole run rather than a
# new random value for every request.
# NOTE(review): assumes `random` is imported in the elided part of this file — confirm.
DOWNLOAD_DELAY = random.randint(1,3)

...

# Headers sent with every request unless a request overrides them.
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    # Adjacent string literals are concatenated at compile time. A backslash
    # continuation inside a single literal would instead pull the next line's
    # leading indentation into the header value.
    'User-Agent': (
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    ),
}

...

# ITEM_PIPELINES must be enabled (uncommented) for pipelines.py to run.
# The number is the priority: the smaller the value, the higher the priority.
ITEM_PIPELINES = {'test_spider.pipelines.TestSpiderPipeline': 300}
...

pipelines.py

import json
# 用 json

# class TestSpiderPipeline(object):
#     """
#     三个方法：
#         open_spider：打开爬虫
#         process_item：item 参数即引擎转交的内容，如 xlb.py 中的 duanzi
#         close_spider：完成爬虫后关闭
#     """
#     # 为了 pipelines.py 能够运行，需要将 settings.py 中的 ITEM_PIPELINES 取消注释
#     def __init__(self):
#         # 也可以在 open_spider(self, spider) 中打开
#         self.fp = open('duanzi.json','w',encoding='utf8')
#
#     def open_spider(self,spider):
#         # with open('duanzi.json','w',encoding='utf8')
#         print('spider is running!')
#
#     def process_item(self, item, spider):
#         # 解析数据那里返回的 item 不是字典格式，需要先转换成字典格式，再转换成 json
#         item_json = json.dumps(dict(item),ensure_ascii=False)
#         self.fp.write(item_json + '\n')
#         return item
#
#     def close_spider(self,spider):
#         self.fp.close()
#         print('spider was closed!')

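# Alternative: use the exporters from scrapy.exporters instead of json.dumps
# JsonItemExporter writes all items as one JSON array; start_exporting() and finish_exporting() must be called to emit the opening and closing brackets
# JsonLinesItemExporter writes one JSON object per line, so the file needs no surrounding brackets and can be read back line by line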
from scrapy.exporters import JsonItemExporter, JsonLinesItemExporter


class TestSpiderPipeline:
    """Item pipeline that writes every item to ``duanzi.json``, one JSON object per line."""

    def __init__(self):
        # The exporter writes bytes, so the file is opened in binary mode.
        self.fp = open('duanzi.json', 'wb')
        # self.exporter = JsonItemExporter(self.fp,ensure_ascii=False,encoding='utf8')
        self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False, encoding='utf8')
        # Follow the full exporter lifecycle (start -> export_item -> finish)
        # so the exporter class can be swapped without producing broken output.
        self.exporter.start_exporting()

    def open_spider(self, spider):
        """Announce that the spider has started."""
        print('spider is running!')

    def process_item(self, item, spider):
        """Export ``item`` to the output file and return it unchanged."""
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        """Finish the export and close the output file."""
        try:
            self.exporter.finish_exporting()
        finally:
            # Close the file even if finishing the export fails.
            self.fp.close()
        print('spider was closed!')

items.py

import scrapy


class TestSpiderItem(scrapy.Item):
    """Item model with the two fields the spider fills in for each joke."""

    # Declare every key of the exported record here as a Field, then import
    # this class in the spider that parses the data.
    author = scrapy.Field()
    content = scrapy.Field()