用scrapy框架写爬虫

最新推荐文章于 2024-11-06 07:26:29 发布

置顶 aら　淼

最新推荐文章于 2024-11-06 07:26:29 发布

阅读量471

点赞数 3

分类专栏：爬虫 文章标签：scrapy 爬虫

本文链接：https://blog.csdn.net/geniusxyt/article/details/101511376

版权

爬虫专栏收录该内容

2 篇文章 0 订阅

订阅专栏

结构图

爬虫可以发送给引擎的两种请求：

    # 1、url：
    # （爬虫）yield scrapy.Request -> 引擎 -> 调度器（发送给调度器入队） -> 引擎（调度器出队请求于引擎）
    # -> 下载器（引擎发送于下载器） -> 引擎（下载器成功（失败）返回引擎）：-> 爬虫（引擎接收成功将给爬虫response）or -> 调度器（失败给调度器重新下载）
    # -> 引擎（爬虫接收response并做处理，发送跟进url：yield scrapy.Request） -> 调度器（引擎发送给调度器入队） ->...
    # 2、weiboitem：
    # 一般在接收response后便有了数据，然后
    # （爬虫） yield weiboitem -> 引擎 -> pipelines(管道)进行存储，管道中自己写存储代码 -> mysql or redis

一、准备工作

python、pip、scrapy（pip install Scrapy）
测试：scrapy fetch http://www.baidu.com

二、构建scrapy框架

创建爬虫项目（cmd或terminal）：scrapy startproject mySpiderName
进入项目目录：cd mySpiderName
创建爬虫：scrapy genspider myspidername www.dytt8.net
（ www.dytt8.net 是要爬取网址的根域名，只有在此根域名才能爬取到内容）
修改settings协议： ROBOTSTXT_OBEY = False
切记在settings中的ITEM_PIPELINES字典里添加以下语句（或取消对应注释），否则管道不会被执行：
'mySpiderName.pipelines.WeiboSpiderPipeline': 300,

三、填写代码三部曲

在自动生成的spiders文件夹下的myspidername.py文件（文件名即genspider时指定的爬虫名）中编辑：

import scrapy
from hotnewsSpider.items import WeiboSpiderItem     # hotnewsSpider为项目名，WeiboSpiderItem为爬虫item类，在items.py中可找到
                                                    # 创建第二个爬虫时需要手动在items中添加此类

from bs4 import BeautifulSoup

class WeiboSpider(scrapy.Spider):
    """Example spider: search s.weibo.com for a keyword and scrape the result cards.

    The search URL is built from ``searchName``; ``start_requests`` sends it
    with the optional ``headers`` / ``cookies`` and ``parse`` turns every
    result card into a ``WeiboSpiderItem``.
    """

    name = 'weibo'                          # spider name: generated, must be unique, do not change
    allowed_domains = ['s.weibo.com']       # root domain the spider is allowed to visit
    start_urls = ['http://s.weibo.com/']    # default start address (start_requests below iterates `urls` instead)

    searchName = "张钧甯 感谢抬爱"
    # Request headers and cookies sent with the initial requests; fill in as needed.
    headers = {}
    cookies = {}
    urls = [
        # simulate a search for searchName
        "https://s.weibo.com/weibo?q=%s&Refer=SWeibo_box" % searchName
    ]
    # To also crawl the default start address: urls.extend(start_urls)

    def start_requests(self):
        """Yield the initial requests, one per entry in ``urls``.

        Overriding this hook makes it possible to attach extra information
        (here: headers and cookies) to the first requests.
        """
        for url in self.urls:
            yield scrapy.Request(url=url, headers=self.headers, cookies=self.cookies, callback=self.parse)

    @staticmethod
    def _count_after_label(text):
        """Return the second whitespace-separated token of *text*, or '' if there is none.

        The repost / comment links are read as "<label> <count>"; when no
        count is present the link holds only the label, which previously
        raised IndexError.
        """
        parts = text.split()
        return parts[1] if len(parts) > 1 else ''

    def parse(self, response):
        """Turn the cards of a search-result page into ``WeiboSpiderItem`` objects.

        This is the default callback for the first response. Follow-up
        responses can come back here or go to any other method, depending on
        the ``callback`` passed to ``scrapy.Request``.

        Yields:
            WeiboSpiderItem: with the keys ``id``, ``username``,
            ``user_headimg``, ``content_text``, ``content_imgs`` and
            ``other_items_dic``. Yielded items are handed to the engine,
            which passes them on to the item pipelines.
        """
        soup = BeautifulSoup(response.text, 'lxml')
        feed_list = soup.find(id='pl_feedlist_index')
        if feed_list is None:
            # The result container is missing, so there is nothing to parse;
            # report it instead of failing with AttributeError.
            self.logger.warning("no 'pl_feedlist_index' element found in %s", response.url)
            return

        for item_id, card in enumerate(feed_list.find_all(class_='card-wrap'), start=1):
            # user name
            username = card.find(class_='info').find(class_='name').text

            # user avatar
            user_headimg = card.find(class_='avator').find('img')['src']

            # text content -- some cards (e.g. a person result) have none
            text_node = card.find(class_='txt')
            content_text = ''
            if text_node is not None:
                content_text = text_node.get_text().replace(' ', '').replace('\n', '').replace('展开全文c', '')

            # pictures -- some posts have none
            media_node = card.find(class_='m3')
            content_imgs = []
            if media_node is not None:
                content_imgs = [img['src'] for img in media_node.find_all('img')]

            # (favourite), repost, comment and like counters, taken from the
            # action links by position
            actions_node = card.find(class_='card-act')
            other_items_dic = {}
            if actions_node is not None:
                for index, link in enumerate(actions_node.find_all('a')):
                    label = link.text.strip()
                    if index == 0:
                        other_items_dic['收藏'] = ""
                    elif index == 1:
                        other_items_dic['转发'] = self._count_after_label(label)
                    elif index == 2:
                        other_items_dic['评论'] = self._count_after_label(label)
                    else:
                        other_items_dic['点赞'] = label

            # Build a fresh item per card: reusing one instance would make
            # every yielded item share (and overwrite) the same data.
            weiboitem = WeiboSpiderItem()
            weiboitem['id'] = item_id
            weiboitem['username'] = username
            weiboitem['user_headimg'] = user_headimg
            weiboitem['content_text'] = content_text
            weiboitem['content_imgs'] = content_imgs
            weiboitem['other_items_dic'] = other_items_dic

            # Hand the data to the engine, which runs the pipeline code on it.
            yield weiboitem
            # To follow further pages, yield requests back to the engine, e.g.:
            #   yield scrapy.Request(url, callback=self.parse)
            #   yield scrapy.Request(url, callback=self.parse2)

            break       # for testing: only the first card is emitted

    def parse2(self, response):
        """Placeholder for a second callback; intentionally does nothing."""

在items.py中初始化item字典（第二个及之后新建的爬虫，需要自己在items.py中新增对应的item类）

import scrapy


# Item class for the first spider; generated automatically when the project is created.
class HotnewsspiderItem(scrapy.Item):
    """Empty item: no fields are declared yet.

    Declare fields here in the form ``name = scrapy.Field()``.
    """

# Item class added by hand for the additional (weibo) spider.
class WeiboSpiderItem(scrapy.Item):
    """Fields filled in by the weibo spider for each result card."""

    # Declare every field as ``name = scrapy.Field()``.
    id = scrapy.Field()                 # running number assigned by the spider
    username = scrapy.Field()           # user name shown on the card
    user_headimg = scrapy.Field()       # ``src`` of the avatar image
    content_text = scrapy.Field()       # text of the post
    content_imgs = scrapy.Field()       # list of picture ``src`` values
    other_items_dic = scrapy.Field()    # dict of favourite / repost / comment / like values

在pipelines.py中保存数据

# Pipeline class for the first spider; generated automatically when the project is created.
class HotnewsspiderPipeline(object):
    """Pass-through pipeline that does no processing of its own."""

    def process_item(self, item, spider):
        """Return *item* unchanged.

        A pipeline has to return the item (or raise ``DropItem``); returning
        nothing would hand ``None`` to every pipeline that runs after it.

        Args:
            item: the scraped item.
            spider: the spider that produced the item (unused).

        Returns:
            The same ``item`` object.
        """
        return item

# Pipeline class added by hand for the weibo spider.
# Remember to register it in ITEM_PIPELINES in settings, otherwise it is never executed:
# 'hotnewsSpider.pipelines.WeiboSpiderPipeline': 300,
class WeiboSpiderPipeline(object):
    """Pipeline for weibo items; currently only prints them."""

    def process_item(self, item, spider):
        """Print *item* and pass it on.

        This is the place to store the data in MySQL, Redis, etc.

        Args:
            item: the scraped item.
            spider: the spider that produced the item (unused).

        Returns:
            The same ``item`` object, so that pipelines running after this
            one still receive it instead of ``None``.
        """
        print(item)
        return item

运行：

scrapy crawl myspidername（别忘了先cd到项目目录）
也可以添加以下任意一种py运行文件，命名为run.py或main.py，并放在与scrapy.cfg文件相同的目录下；
把其中的爬虫名改成自己的爬虫名，即可右键运行。

一、
from scrapy.cmdline import execute
import sys
import os

# Command-line equivalents:
#   run a spider:        scrapy crawl <spider_name>
#   usual way to debug:  scrapy shell <url_name>

# Add the directory containing this file to the import path.
project_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.append(project_dir)

# Replace 'weibo' with the name of your own spider.
execute(['scrapy', 'crawl', 'weibo'])

二、
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Command-line equivalents:
#   run a spider:        scrapy crawl <spider_name>
#   usual way to debug:  scrapy shell <url_name>

if __name__ == '__main__':
    # Build a crawler process from the project settings, schedule the spider
    # by name and start it.
    crawler_process = CrawlerProcess(get_project_settings())
    crawler_process.crawl('weibo')    # replace 'weibo' with the name of your own spider
    crawler_process.start()