scrapy_redis

使用hashlib模块生成指纹

# Generate a SHA-1 fingerprint for a URL string with hashlib.
import hashlib
f = hashlib.sha1()
text = 'http://www.baidu.com'
f.update(text.encode()) # the string has to be encoded before it is hashed
f.hexdigest()
# '633a42441e296c9004a78abe0b2ee3b37559d32f'

RedisSpider 类

  • 设置一个redis中的键用来保存开始地址

  • 当某一台主机在redis中输入开始地址后, 分布式任务开始.

  • 配置settings, 使用scrapy-redis调度器和hash去重

  • settings.py:

# Redis configuration
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter" # duplicate filter
SCHEDULER = "scrapy_redis.scheduler.Scheduler" # scheduler
SCHEDULER_PERSIST = True # persistence
REDIS_URL = 'redis://192.168.145.129:6379' # Redis database

# Other settings
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
ROBOTSTXT_OBEY = False
# -*- coding: utf-8 -*-
import scrapy
# 导入分布式爬虫类
from scrapy_redis.spiders import RedisSpider
from copy import deepcopy
import urllib


# 继承分布式爬虫类
class DangdangSpider(RedisSpider):
    """Crawl the category tree and then every book list page under it.

    No start URLs are hard-coded on the class; the crawl starts once a URL
    is pushed to the Redis key named by ``redis_key``.
    """

    name = 'gdang'
    allowed_domains = ['gdang.com']
    # start_urls = ['http://gdang.com/']
    redis_key = "gdang:start_urls"  # push the start URL(s) to this Redis key to begin the job

    def parse(self, response):
        """Walk big -> middle -> small categories and request each small category page.

        Yields one ``scrapy.Request`` per small-category link that has an
        href. The category fields collected so far travel to
        ``parse_book_list`` as a deep copy in ``meta['item']``, so later
        loop iterations cannot change what an already-yielded request carries.
        """
        # Big-category groups
        for div in response.xpath("//div[@class='con flq_body']/div"):
            item = dict()
            b_cate = div.xpath("./dl/dt//text()").extract()
            item['b_cate'] = [i.strip() for i in b_cate if len(i.strip()) > 0]

            # Middle-category groups
            for dl in div.xpath(".//dl[@class='inner_dl']"):
                # The dt may contain whitespace-only text nodes; drop them.
                m_cate = [i.strip() for i in dl.xpath("./dt//text()").extract() if i.strip()]
                # Guard the [0]: a dt without visible text used to raise IndexError.
                item['m_cate'] = m_cate[0] if m_cate else None

                # Small-category links
                for a in dl.xpath("./dd/a"):
                    item['s_cate'] = a.xpath("./@title").extract_first()
                    item['s_href'] = a.xpath("./@href").extract_first()

                    if item['s_href'] is not None:
                        yield scrapy.Request(
                            # urljoin leaves absolute URLs untouched and keeps
                            # a relative href from making Request raise.
                            url=response.urljoin(item['s_href']),
                            callback=self.parse_book_list,
                            meta={'item': deepcopy(item)}
                        )

    def parse_book_list(self, response):
        """Yield one item per book on a list page, then follow the next page.

        The category fields arrive in ``response.meta['item']``. Each book
        gets its own copy of them; previously a single dict was overwritten
        for every ``<li>``, printed once after the loop and never yielded,
        so no book ever reached the item pipelines.
        """
        category = response.meta['item']
        for li in response.xpath("//ul[@class='bigimg']/li"):
            book = deepcopy(category)
            book['book_name'] = li.xpath("./a/@title").extract_first()
            book['book_detail'] = li.xpath("./p[@class='detail']/text()").extract_first()
            book['book_price'] = li.xpath("./p[@class='price']/span[1]/text()").extract_first()
            book['book_author'] = li.xpath("./p[@class='search_book_author']/span[1]/a/text()").extract()
            yield book

        # Pagination
        next_url = response.xpath("//li[@class='next']/a/@href").extract_first()
        if next_url is not None:
            yield scrapy.Request(
                # response.urljoin avoids relying on ``urllib.parse`` being
                # reachable through a bare ``import urllib``.
                url=response.urljoin(next_url),
                callback=self.parse_book_list,
                meta={'item': category}
            )

RedisCrawlSpider类实现

  • 配置信息同上
  • 定义url规则, 自动提取url并进行响应处理
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy_redis.spiders import RedisCrawlSpider
import re

class AmazonSpider(RedisCrawlSpider):
    """Rule-driven distributed spider: follows category and paging links and parses book pages.

    The start URL is read from the Redis key ``dqrm:start_urls``.
    """

    name = 'dqrm'
    allowed_domains = ['dqrm.cn']
    redis_key = 'dqrm:start_urls'

    rules = (
        # Big-category links
        Rule(
            LinkExtractor(restrict_xpaths=("//ul[@class='a-unordered-list a-nostyle a-vertical s-ref-indent-one']/div/li",)),
            follow=True,
        ),
        # Small-category links
        Rule(
            LinkExtractor(restrict_xpaths=("//ul[@class='a-unordered-list a-nostyle a-vertical s-ref-indent-two']/div/li",)),
            follow=True,
        ),
        # Book detail links, handled by parse_book_detail
        Rule(
            LinkExtractor(restrict_xpaths=("//div[@id='mainResults']/ul/li//h2/..",)),
            callback='parse_book_detail',
        ),
        # Book list pagination
        Rule(
            LinkExtractor(restrict_xpaths=("//div[@id='pagn']",)),
            follow=True,
        ),
    )

    def parse_book_detail(self, response):
        """Extract the book fields from a detail page and print them as a dict."""

        def squeeze(fragments):
            # Remove every newline / whitespace character from each text fragment.
            return [re.sub(r"\n|\s", '', fragment) for fragment in fragments]

        breadcrumbs = response.xpath("//div[@id='wayfinding-breadcrumbs_container']/ul/li/span/a/text()").extract()
        description = response.xpath("//div[@id='bookDescription_feature_div']/noscript/div/text()").extract()

        item = {
            'book_name': response.xpath("//div[@id='booksTitle']//span[@id='productTitle']/text()").extract_first(),
            'book_price': response.xpath("//div[@id='soldByThirdParty']/span[2]/text()").extract_first(),
            'book_breadcrumbs': squeeze(breadcrumbs),
            'book_press': response.xpath("//b[text()='出版社:']/../text()").extract_first(),
            'book_detail': squeeze(description),
        }
        print(item)

  • 运行爬虫

    scrapy runspider mycrawler_redis.py

  • 爬虫开始后会进入等待start_url状态, 在redis中根据redis_key保存一个起始url

  • 从redis中获取到start_urls后, 各个spider开始爬取工作

    redis-cli> lpush dqrm:start_urls https://www.dqrm.cn/

分布式DEMO

增量式爬虫 CrawlSpider

# -*- coding: utf-8 -*-

# Use the scrapy-redis scheduler
SCHEDULER = "scrapy_redis.scheduler.Scheduler"

# Use the scrapy-redis duplicate filter
DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'

# Queue class used to order the URLs waiting to be crawled.
# Default: ordered by priority (Scrapy's default); implemented with a sorted set, neither FIFO nor LIFO.
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.SpiderPriorityQueue'
# Optional: first-in-first-out ordering (FIFO)
# SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.SpiderQueue'
# Optional: last-in-first-out ordering (LIFO)
# SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.SpiderStack'

# Keep the queues used by scrapy-redis in Redis so a crawl can be paused and resumed, i.e. do not clear the Redis queues
SCHEDULER_PERSIST = True

# Only takes effect with SpiderQueue or SpiderStack; the maximum idle time before the spider closes
# SCHEDULER_IDLE_BEFORE_CLOSE = 10

# Configuring RedisPipeline writes items into a Redis list whose key is "spider.name : items", for later distributed item processing
# This is already implemented by scrapy-redis; no code needs to be written for it
ITEM_PIPELINES = {
    'example.pipelines.ExamplePipeline': 300,
    'scrapy_redis.pipelines.RedisPipeline': 400
}

# Redis connection parameters
# REDIS_PASS is a Redis connection password added by the author (not used by default)
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379
#REDIS_PASS = 'redisP@ssw0rd'

# Log level
LOG_LEVEL = 'DEBUG'

# By default RFPDupeFilter only logs the first duplicate request. Setting DUPEFILTER_DEBUG to True logs every duplicate request.
DUPEFILTER_DEBUG =True

# Override the default request headers; custom Downloader Middlewares can be written to set a proxy and User-Agent
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Connection': 'keep-alive',
    'Accept-Encoding': 'gzip, deflate, sdch'
}
# -*- coding: utf-8 -*-

from datetime import datetime

class ExamplePipeline(object):
    """Item pipeline that stamps every item with crawl metadata."""

    def process_item(self, item, spider):
        """Store the current UTC time and the spider's name on ``item``, then return it."""
        # utcnow() gives the current time expressed in UTC
        now_utc = datetime.utcnow()
        item["crawled"] = now_utc
        # Name of the spider that produced the item
        item["spider"] = spider.name
        return item
from scrapy.item import Item, Field

class yyItem(Item):
    """Fields collected for one personal profile page, plus crawl metadata."""
    # Avatar image URL
    header_url = Field()
    # Username
    username = Field()
    # Personal monologue text
    monologue = Field()
    # Album picture URLs
    pic_urls = Field()
    # Age
    age = Field()
    # Source site: yy
    source = Field()
    # URL of the profile page the data came from
    source_url = Field()
    # UTC time of the crawl
    crawled = Field()
    # Spider name
    spider = Field()
  • spiders/yy.py
# -*- coding:utf-8 -*-

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
# 使用redis去重
from scrapy.dupefilters import RFPDupeFilter

from example.items import yyItem
import re

#
class YySpider(CrawlSpider):
    """CrawlSpider that follows search/list pages and scrapes personal profile pages."""

    name = 'yy'
    allowed_domains = ['yy.com']
    # List page used as the entry point
    start_urls = ['http://www.yy.com/find/beijing/y']

    # Link extractor for search pages
    list_page_lx = LinkExtractor(allow=r'http://www.yy.com/find/.+')

    # Link extractor for the "Beijing, 18-25, female" search result pages
    page_lx = LinkExtractor(allow=r'http://www.yy.com/find/beijing/mm18-25/advance-0-0-0-0-0-0-0/p\d+/')

    # Link extractor for personal profile pages
    profile_page_lx = LinkExtractor(allow=r'http://www.yy.com/\d+-profile/')

    rules = (
        # Search pages: only followed, used as a springboard
        Rule(list_page_lx, follow=True),
        # List pages: only followed, used as a springboard
        Rule(page_lx, follow=True),
        # Profile pages: handled by parse_profile_page(), not followed any further
        Rule(profile_page_lx, callback='parse_profile_page', follow=False),
    )

    @staticmethod
    def _first_or(values, fallback):
        """Return the first element of ``values``, or ``fallback`` when it is empty."""
        return values[0] if len(values) > 0 else fallback

    def parse_profile_page(self, response):
        """Build a ``yyItem`` from a profile page response and yield it."""
        item = yyItem()
        item['header_url'] = self.get_header_url(response)
        item['username'] = self.get_username(response)
        item['monologue'] = self.get_monologue(response)
        item['pic_urls'] = self.get_pic_urls(response)
        item['age'] = self.get_age(response)
        item['source'] = 'yy'
        item['source_url'] = response.url

        yield item

    def get_header_url(self, response):
        """Return the stripped avatar URL, or an empty string when none is found."""
        found = response.xpath("//dl[@class='personal_cen']/dt/img/@src").extract()
        return self._first_or(found, "").strip()

    def get_username(self, response):
        """Return the stripped username, or "NULL" when none is found."""
        found = response.xpath("//dl[@class='personal_cen']/dd/div/strong/text()").extract()
        return self._first_or(found, "NULL").strip()

    def get_monologue(self, response):
        """Return the stripped monologue text, or "NULL" when none is found."""
        found = response.xpath("//ul[@class='requre']/li/p/text()").extract()
        return self._first_or(found, "NULL").strip()

    def get_pic_urls(self, response):
        """Return the album picture URLs joined with '|', or "NULL" when fewer than two are found."""
        photo_urls = response.xpath("//li[@class='smallPhoto']/@data_url_full").extract()
        if len(photo_urls) <= 1:
            return "NULL"
        # URLs are separated by '|'
        return '|'.join(list(photo_urls))

    def get_age(self, response):
        """Return the age as a string taken from the location line, or "0" when it cannot be read.

        The first text node is split on single spaces; the third word with
        its last character removed is returned if it starts with a digit.
        """
        found = response.xpath("//dl[@class='personal_cen']/dd/p[@class='local']/text()").extract()
        words = re.split(' ', self._first_or(found, "0"))
        if len(words) <= 2:
            return "0"
        candidate = words[2][:-1]
        # match() anchors at the start of the string and returns None on failure
        starts_with_digit = re.compile(r'[0-9]').match(candidate)
        return candidate if starts_with_digit else "0"

  • 运行

  • 可以开启多个客户端运行

    scrapy crawl yy

分布式爬虫 RedisCrawlSpider

  • 修改 spiders/yy.py
# -*- coding:utf-8 -*-

from scrapy.linkextractors import LinkExtractor
#from scrapy.spiders import CrawlSpider, Rule

# 1. 导入RedisCrawlSpider类,不使用CrawlSpider
from scrapy_redis.spiders import RedisCrawlSpider
from scrapy.spiders import Rule


from scrapy.dupefilters import RFPDupeFilter
from example.items import yyItem
import re

# 2. 修改父类 RedisCrawlSpider
# class yySpider(CrawlSpider):
class yySpider(RedisCrawlSpider):
    """Distributed version of the yy profile spider; start URLs are read from Redis.

    ``allowed_domains`` is not declared on the class. It is built in
    ``__init__`` from the comma-separated ``domain`` keyword argument.
    """

    name = 'yy'

    # 3. allowed_domains and start_urls are no longer declared here
    ##### allowed_domains = ['yy.com']
    ##### start_urls = ['http://www.yy.com/find/beijing/']

    # 4. Redis key the start URLs are read from
    redis_key = 'yy:start_urls'

    list_page_lx = LinkExtractor(allow=(r'http://www.yy.com/find/.+'))
    page_lx = LinkExtractor(allow =(r'http://www.yy.com/find/beijing/mm18-25/advance-0-0-0-0-0-0-0/p\d+/'))
    profile_page_lx = LinkExtractor(allow=(r'http://www.yy.com/\d+-profile/'))

    rules = (
        Rule(list_page_lx, follow=True),
        Rule(page_lx, follow=True),
        Rule(profile_page_lx, callback='parse_profile_page', follow=False),
    )

    # 5. __init__ builds allowed_domains dynamically
    def __init__(self, *args, **kwargs):
        """Read the optional ``domain`` kwarg ("a.com,b.com") into ``allowed_domains``."""
        domain = kwargs.pop('domain', '')
        # On Python 3 filter() returns a one-shot iterator, which would be
        # empty after its first traversal; store a real list instead.
        self.allowed_domains = list(filter(None, domain.split(',')))
        super(yySpider, self).__init__(*args, **kwargs)

    def parse_profile_page(self, response):
        """Build a ``yyItem`` from a profile page response and yield it."""
        item = yyItem()
        item['header_url'] = self.get_header_url(response)
        item['username'] = self.get_username(response)
        item['monologue'] = self.get_monologue(response)
        item['pic_urls'] = self.get_pic_urls(response)
        item['age'] = self.get_age(response)
        item['source'] = 'yy'
        item['source_url'] = response.url

        yield item

    def get_header_url(self, response):
        """Return the stripped avatar URL, or an empty string when none is found."""
        header = response.xpath('//dl[@class=\'personal_cen\']/dt/img/@src').extract()
        header_url = header[0] if header else ""
        return header_url.strip()

    def get_username(self, response):
        """Return the stripped username, or "NULL" when none is found."""
        usernames = response.xpath("//dl[@class=\'personal_cen\']/dd/div/strong/text()").extract()
        username = usernames[0] if usernames else "NULL"
        return username.strip()

    def get_monologue(self, response):
        """Return the stripped monologue text, or "NULL" when none is found."""
        monologues = response.xpath("//ul[@class=\'requre\']/li/p/text()").extract()
        monologue = monologues[0] if monologues else "NULL"
        return monologue.strip()

    def get_pic_urls(self, response):
        """Return the album picture URLs joined with '|', or "NULL" when fewer than two are found."""
        data_url_full = response.xpath('//li[@class=\'smallPhoto\']/@data_url_full').extract()
        # Same threshold as before: zero or one URL is reported as "NULL".
        if len(data_url_full) <= 1:
            return "NULL"
        return '|'.join(data_url_full)

    def get_age(self, response):
        """Return the age as a string taken from the location line, or "0" when it cannot be read.

        The first text node is split on single spaces; the third word with
        its last character removed is returned if it starts with a digit.
        """
        age_urls = response.xpath("//dl[@class=\'personal_cen\']/dd/p[@class=\'local\']/text()").extract()
        age = age_urls[0] if age_urls else "0"
        age_words = re.split(' ', age)
        if len(age_words) <= 2:
            return "0"
        age = age_words[2][:-1]
        # match() anchors at the start of the string and returns None on failure
        if re.compile(r'[0-9]').match(age):
            return age
        return "0"

  • 运行

客户端执行:

scrapy runspider yy.py

redis服务器端执行:

redis-cli> lpush yy:start_urls http://www.yy.com/find/beijinyy

DEMO

  • 说明:

    要求:采集所有公司信息.

  • items.py

# items.py

# -*- coding: utf-8 -*-
import scrapy
class CompanyItem(scrapy.Item):
    """Fields collected for one company detail page."""
    # Company id (the numeric part of the URL)
    info_id = scrapy.Field()
    # Company name
    company_name = scrapy.Field()
    # Company slogan
    slogan = scrapy.Field()
    # Category
    scope = scrapy.Field()
    # Sub-category
    sub_scope = scrapy.Field()

    # City
    city = scrapy.Field()
    # District / area
    area = scrapy.Field()
    # Company home page
    home_page = scrapy.Field()
    # Company tags
    tags = scrapy.Field()

    # Company introduction
    company_intro = scrapy.Field()
    # Full company name
    company_full_name = scrapy.Field()
    # Founding date
    found_time = scrapy.Field()
    # Company size
    company_size = scrapy.Field()
    # Operating status
    company_status = scrapy.Field()

    # Investment list: funding date, funding round, amount, investors
    tz_info = scrapy.Field()
    # Team list: member name, member title, member introduction
    tm_info = scrapy.Field()
    # Product list: product name, product type, product introduction
    pdt_info = scrapy.Field()

# -*- coding: utf-8 -*-

# Scrapy settings for the "it" project (scrapy-redis scheduler, dupefilter and item pipeline).
BOT_NAME = 'it'

SPIDER_MODULES = ['it.spiders']
NEWSPIDER_MODULE = 'it.spiders'

# Enables scheduling storing requests queue in redis.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"

# Ensure all spiders share same duplicates filter through redis.
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"

# REDIS_START_URLS_AS_SET = True

COOKIES_ENABLED = False

DOWNLOAD_DELAY = 1.5

# Enable randomized download delay
RANDOMIZE_DOWNLOAD_DELAY = True

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 300
}

DOWNLOADER_MIDDLEWARES = {
    # This middleware collects failed pages and reschedules them after the spider has finished. (Failures may come from temporary problems such as a connection timeout or an HTTP 500 error.)
   'scrapy.downloadermiddlewares.retry.RetryMiddleware': 80,

    # This middleware adds HTTP proxy support for requests. A proxy is enabled by setting the "proxy" meta value on a Request object.
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 100,

    'it.middlewares.RotateUserAgentMiddleware': 200,
}

REDIS_HOST = "192.168.199.108"
REDIS_PORT = 6379

# -*- coding: utf-8 -*-

from scrapy.contrib.downloadermiddleware.useragent import UserAgentMiddleware
import random

# User-Agent 下载中间件
class RotateUserAgentMiddleware(UserAgentMiddleware):
    """Downloader middleware that puts a randomly chosen User-Agent on each request."""

    # Pool of User-Agent strings that process_request picks from.
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/531.21.8 (KHTML, like Gecko) Version/4.0.4 Safari/531.21.10",
        "Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/533.17.8 (KHTML, like Gecko) Version/5.0.1 Safari/533.17.8",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-GB; rv:1.9.1.17) Gecko/20110123 (like Firefox/3.x) SeaMonkey/2.0.12",
        "Mozilla/5.0 (Windows NT 5.2; rv:10.0.1) Gecko/20100101 Firefox/10.0.1 SeaMonkey/2.7.1",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_8; en-US) AppleWebKit/532.8 (KHTML, like Gecko) Chrome/4.0.302.2 Safari/532.8",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_4; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.464.0 Safari/534.3",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_5; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.597.15 Safari/534.13",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.186 Safari/535.1",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.54 Safari/535.2",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7",
        "Mozilla/5.0 (Macintosh; U; Mac OS X Mach-O; en-US; rv:2.0a) Gecko/20040614 Firefox/3.0.0 ",
        "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.0.3) Gecko/2008092414 Firefox/3.0.3",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; en-US; rv:1.9.1) Gecko/20090624 Firefox/3.5",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.2.14) Gecko/20110218 AlexaToolbar/alxf-2.0 Firefox/3.6.14",
        "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.2.15) Gecko/20110303 Firefox/3.6.15",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1"
    ]

    def __init__(self, user_agent=''):
        """Store ``user_agent`` on the instance; the base-class initializer is not called."""
        self.user_agent = user_agent

    def process_request(self, request, spider):
        """Pick a random entry from ``user_agent_list`` and set it unless the header already exists."""
        chosen_agent = random.choice(self.user_agent_list)
        # setdefault leaves a User-Agent header that is already on the request untouched
        request.headers.setdefault('User-Agent', chosen_agent)

  • spiders/it.py
# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup # 使用bs4解析
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from scrapy_redis.spiders import RedisCrawlSpider
from it.items import CompanyItem


class itSpider(RedisCrawlSpider):
    """Distributed spider that collects the information of every company.

    Start URLs are read from the Redis key ``itspider:start_urls``.
    """

    name = 'it'
    allowed_domains = ['www.it.com']
    # start_urls = ['http://www.it.com/company']
    redis_key = 'itspider:start_urls'
    rules = [
        # Links to every listing page (raw strings: same pattern, no invalid-escape warning)
        Rule(link_extractor=LinkExtractor(allow=(r'/company\?page=\d+'))),
        # Links to every company detail page
        Rule(link_extractor=LinkExtractor(allow=(r'/company/\d+')), callback='parse_item')
    ]

    def parse_item(self, response):
        """Parse one company detail page into a ``CompanyItem``.

        Header and basic-info fields default to ``''`` and the three list
        fields to ``[]`` when their section is absent from the page, so a
        page missing a section still produces an item instead of raising
        ``NameError`` / ``AttributeError``.
        """
        soup = BeautifulSoup(response.body, 'lxml')

        # Defaults used when the header section is missing.
        company_name = slogan = scope = sub_scope = ''
        city = area = home_page = tags = ''
        # Defaults used when the basic-info section is missing.
        company_intro = company_full_name = found_time = ''
        company_size = company_status = ''

        # Header section: //div[@class="infoheadrow-v2 ugc-block-item"]
        cpy1 = soup.find('div', class_='infoheadrow-v2')
        if cpy1:
            # Company name: //span[@class="title"]/b/text()[1]
            company_name = cpy1.find(class_='title').b.contents[0].strip().replace('\t', '').replace('\n', '')

            # Slogan: //div[@class="info-line"]/p
            slogan = cpy1.find(class_='info-line').p.get_text()

            # Category and sub-category links: //span[@class="scope c-gray-aset"]/a
            scope_a = cpy1.find(class_='scope c-gray-aset').find_all('a')
            scope = scope_a[0].get_text().strip() if len(scope_a) > 0 else ''
            sub_scope = scope_a[1].get_text().strip() if len(scope_a) > 1 else ''

            # City and area links: //span[@class="loca c-gray-aset"]/a
            city_a = cpy1.find(class_='loca c-gray-aset').find_all('a')
            city = city_a[0].get_text().strip() if len(city_a) > 0 else ''
            area = city_a[1].get_text().strip() if len(city_a) > 1 else ''

            # Home page: //a[@class="weblink marl10"]/@href
            home_page = cpy1.find(class_='weblink marl10')['href']
            # Tags: //div[@class="tagset dbi c-gray-aset"]/a
            tags = cpy1.find(class_='tagset dbi c-gray-aset').get_text().strip().replace('\n', ',')

        # Basic info: //div[@class="block-inc-info on-edit-hide"]
        cpy2 = soup.find('div', class_='block-inc-info on-edit-hide')
        if cpy2:
            # Company introduction: .//div[@class="des"]
            company_intro = cpy2.find(class_='des').get_text().strip()

            # Full name / founding date / size / status all live under //div[@class="des-more"]
            cpy2_content = cpy2.find(class_='des-more').contents

            # Full company name: //div[@class="des-more"]/div[1]
            company_full_name = cpy2_content[1].get_text().strip()[len('公司全称:'):] if cpy2_content[1] else ''

            # Founding date: //div[@class="des-more"]/div[2]/span[1]
            found_time = cpy2_content[3].contents[1].get_text().strip()[len('成立时间:'):] if cpy2_content[3] else ''

            # Company size: //div[@class="des-more"]/div[2]/span[2]
            company_size = cpy2_content[3].contents[3].get_text().strip()[len('公司规模:'):] if cpy2_content[3] else ''

            # Operating status: //div[@class="des-more"]/div[3]
            company_status = cpy2_content[5].get_text().strip() if cpy2_content[5] else ''

        # Main body; every lookup below is skipped when it is missing.
        main = soup.find('div', class_='main')

        # Investment info: //table[@class="list-round-v2 need2login"]
        # Each row holds funding date, round, amount and investors.
        tz = main.find('table', 'list-round-v2') if main else None
        tz_list = []
        if tz:
            for tr in tz.find_all('tr'):
                all_td = tr.find_all('td')
                # Rows without the four data cells would raise IndexError below.
                if len(all_td) < 4:
                    continue
                tz_dict = {}
                tz_dict['tz_time'] = all_td[0].span.get_text().strip()
                tz_dict['tz_round'] = all_td[1].get_text().strip()
                tz_dict['tz_finades'] = all_td[2].get_text().strip()
                tz_dict['tz_capital'] = all_td[3].get_text().strip().replace('\n', ',')
                tz_list.append(tz_dict)

        # Team info: member name, member title, member introduction
        tm = main.find('ul', class_='list-prodcase limited-itemnum') if main else None
        tm_list = []
        if tm:
            for li in tm.find_all('li'):
                tm_dict = {}
                tm_dict['tm_m_name'] = li.find('span', class_='c').get_text().strip()
                tm_dict['tm_m_title'] = li.find('span', class_='c-gray').get_text().strip()
                tm_dict['tm_m_intro'] = li.find('p', class_='mart10 person-des').get_text().strip()
                tm_list.append(tm_dict)

        # Product info: product name, product type, product introduction
        pdt = main.find('ul', class_='list-prod limited-itemnum') if main else None
        pdt_list = []
        if pdt:
            for li in pdt.find_all('li'):
                pdt_dict = {}
                pdt_dict['pdt_name'] = li.find('h4').b.get_text().strip()
                pdt_dict['pdt_type'] = li.find('span', class_='tag yellow').get_text().strip()
                pdt_dict['pdt_intro'] = li.find(class_='on-edit-hide').p.get_text().strip()
                pdt_list.append(pdt_dict)

        item = CompanyItem()
        # Last path segment of the URL
        item['info_id'] = response.url.split('/')[-1]
        item['company_name'] = company_name
        item['slogan'] = slogan
        item['scope'] = scope
        item['sub_scope'] = sub_scope
        item['city'] = city
        item['area'] = area
        item['home_page'] = home_page
        item['tags'] = tags
        item['company_intro'] = company_intro
        item['company_full_name'] = company_full_name
        item['found_time'] = found_time
        item['company_size'] = company_size
        item['company_status'] = company_status
        item['tz_info'] = tz_list
        item['tm_info'] = tm_list
        item['pdt_info'] = pdt_list
        return item

  • 运行

Slave端:

scrapy runspider it.py

Master端:

redis-cli > lpush itspider:start_urls http://www.it.com/company

处理redis中保存的items

  • 说明:

    官方示例的目录下可以看到一个process_items.py文件,这个文件就是scrapy-redis的example提供的从redis读取item进行处理的模版。

保存入MongoDB

  • 编写 process_yy_mongodb.py 文件,然后保持后台运行就可以不停地将爬回来的数据入库了。
# process_yy_mongodb.py

# -*- coding: utf-8 -*-

import json
import redis
import pymongo

def main():
    """Move scraped items from the Redis list "yy:items" into MongoDB, forever.

    Each item is popped from Redis, decoded from JSON and inserted into the
    ``beijing_18_25`` collection of the ``yy`` database. The loop blocks on
    Redis while the list is empty and never returns.
    """
    # Redis connection
    rediscli = redis.StrictRedis(host='192.168.199.108', port=6379, db=0)
    # MongoDB connection
    mongocli = pymongo.MongoClient(host='localhost', port=27017)

    # Database
    db = mongocli['yy']
    # Collection
    sheet = db['beijing_18_25']

    while True:
        # blpop gives FIFO order, brpop gives LIFO order; returns (key, value)
        source, data = rediscli.blpop(["yy:items"])
        item = json.loads(data)  # JSON -> Python object
        # insert_one replaces Collection.insert, which was removed in PyMongo 4
        sheet.insert_one(item)

        # print(...) with a single argument runs on both Python 2 and 3.
        # The keys are the ones yy items carry; the previous "name"/"link"
        # keys are never present on them, so the except branch always ran.
        try:
            print(u"Processing: %(username)s <%(source_url)s>" % item)
        except KeyError:
            print(u"Error procesing: %r" % item)

if __name__ == '__main__':
    main()

保存入MySQL

  • 准备数据库和表
启动mysql:         mysql.server start(各平台不一样)
登录到root用户:    mysql -uroot -p
创建数据库yy:  create database yy;
切换到指定数据库:  use yy
创建表beijing_18_25以及所有字段的列名和数据类型。
  • 编写process_yy_mysql.py
#process_yy_mysql.py

# -*- coding: utf-8 -*-

import json
import redis
import MySQLdb

def main():
    """Move scraped items from the Redis list "yy:items" into MySQL, forever.

    Each item is popped from Redis, decoded from JSON and inserted into the
    ``beijing_18_25`` table with a parameterized INSERT. MySQL errors are
    printed and the loop continues with the next item; it never returns.
    """
    # Redis connection
    rediscli = redis.StrictRedis(host='192.168.199.108', port = 6379, db = 0)
    # MySQL connection
    mysqlcli = MySQLdb.connect(host='127.0.0.1', user='power', passwd='xxxxxxx', db = 'yy', port=3306, use_unicode=True)

    while True:
        # blpop gives FIFO order, brpop gives LIFO order; returns (key, value)
        source, data = rediscli.blpop(["yy:items"])
        item = json.loads(data)  # JSON -> Python object

        cur = None
        try:
            # Cursor used for this one insert
            cur = mysqlcli.cursor()
            cur.execute(
                "INSERT INTO beijing_18_25 (username, crawled, age, spider, header_url, source, pic_urls, monologue, source_url) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s )",
                [
                    item['username'],
                    item['crawled'],
                    item['age'],
                    item['spider'],
                    item['header_url'],
                    item['source'],
                    item['pic_urls'],
                    item['monologue'],
                    item['source_url']
                ])
            # Commit the transaction
            mysqlcli.commit()
            print("inserted %s" % item['source_url'])
        # "except X as e" is valid on Python 2.6+ and 3; "except X, e" is Python 2 only
        except MySQLdb.Error as e:
            print("Mysql Error %d: %s" % (e.args[0], e.args[1]))
        finally:
            # Close the cursor on the error path too, not only after a successful insert
            if cur is not None:
                cur.close()

if __name__ == '__main__':
    main()

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值