使用hashlib模块生成指纹
# Demo: build a SHA-1 fingerprint for a URL string.
import hashlib

digest = hashlib.sha1()
text = 'http://www.baidu.com'
# A str must be encoded to bytes before it can be hashed.
digest.update(text.encode())
digest.hexdigest()
# '633a42441e296c9004a78abe0b2ee3b37559d32f'
RedisSpider 类
-
设置一个redis中的键用来保存开始地址
-
当某一台主机在redis中输入开始地址后, 分布式任务开始.
-
配置settings, 使用scrapy-redis调度器和hash去重
# redis configuration
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter" # request-fingerprint dedup filter backed by redis
SCHEDULER = "scrapy_redis.scheduler.Scheduler" # scheduler that keeps the request queue in redis
SCHEDULER_PERSIST = True # keep the redis queues after the spider closes (allows resume)
REDIS_URL = 'redis://192.168.145.129:6379' # redis database connection
# other settings
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
ROBOTSTXT_OBEY = False
# -*- coding: utf-8 -*-
import scrapy
# 导入分布式爬虫类
from scrapy_redis.spiders import RedisSpider
from copy import deepcopy
import urllib
# Inherit the distributed spider class.
class DangdangSpider(RedisSpider):
    """Distributed spider: walks the category tree and scrapes book listings.

    Start URLs are not hard-coded; they are read from the redis list named
    by ``redis_key``, so any number of workers can share one crawl.
    """
    name = 'gdang'
    allowed_domains = ['gdang.com']
    # start_urls = ['http://gdang.com/']
    # Push start urls into this redis key to kick off the distributed crawl.
    redis_key = "gdang:start_urls"

    def parse(self, response):
        """Traverse big/middle/small categories and schedule one request per leaf."""
        # Top-level category groups.
        div_list = response.xpath("//div[@class='con flq_body']/div")
        for div in div_list:
            item = dict()
            item['b_cate'] = div.xpath("./dl/dt//text()").extract()
            item['b_cate'] = [i.strip() for i in item['b_cate'] if len(i.strip()) > 0]
            # Middle-level categories.
            dl_list = div.xpath(".//dl[@class='inner_dl']")
            for dl in dl_list:
                # dt may contain invisible whitespace-only text nodes; strip them out.
                item['m_cate'] = dl.xpath("./dt//text()").extract()
                item['m_cate'] = [i.strip() for i in item['m_cate'] if len(i.strip()) > 0][0]
                # Leaf categories.
                a_list = dl.xpath("./dd/a")
                for a in a_list:
                    item['s_cate'] = a.xpath("./@title").extract_first()
                    item['s_href'] = a.xpath("./@href").extract_first()
                    if item['s_href'] is not None:
                        # deepcopy: `item` is mutated on every iteration, so each
                        # request must carry its own snapshot.
                        yield scrapy.Request(
                            url=item['s_href'],
                            callback=self.parse_book_list,
                            meta={'item': deepcopy(item)}
                        )

    def parse_book_list(self, response):
        """Extract book fields from a listing page and follow pagination."""
        item = response.meta['item']
        li_list = response.xpath("//ul[@class='bigimg']/li")
        for li in li_list:
            item['book_name'] = li.xpath("./a/@title").extract_first()
            item['book_detail'] = li.xpath("./p[@class='detail']/text()").extract_first()
            item['book_price'] = li.xpath("./p[@class='price']/span[1]/text()").extract_first()
            item['book_author'] = li.xpath("./p[@class='search_book_author']/span[1]/a/text()").extract()
            print(item)
        # Pagination.
        next_url = response.xpath("//li[@class='next']/a/@href").extract_first()
        if next_url is not None:
            # BUG FIX: the file only does `import urllib`, which does not import the
            # `urllib.parse` submodule; Response.urljoin gives the same result safely.
            next_url = response.urljoin(next_url)
            yield scrapy.Request(
                url=next_url,
                callback=self.parse_book_list,
                meta={'item': item}
            )
RedisCrawlSpider类实现
- 配置信息同上
定义url规则, 自动提取url并对响应进行处理
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy_redis.spiders import RedisCrawlSpider
import re
class AmazonSpider(RedisCrawlSpider):
    """Distributed CrawlSpider: link rules drive the crawl, book pages are parsed."""
    name = 'dqrm'
    allowed_domains = ['dqrm.cn']
    redis_key = 'dqrm:start_urls'
    rules = (
        # Top-level category links: follow only.
        Rule(LinkExtractor(restrict_xpaths=("//ul[@class='a-unordered-list a-nostyle a-vertical s-ref-indent-one']/div/li",)), follow=True),
        # Sub-category links: follow only.
        Rule(LinkExtractor(restrict_xpaths=("//ul[@class='a-unordered-list a-nostyle a-vertical s-ref-indent-two']/div/li",)), follow=True),
        # Book detail pages: handled by parse_book_detail.
        Rule(LinkExtractor(restrict_xpaths=("//div[@id='mainResults']/ul/li//h2/..",)), callback='parse_book_detail'),
        # Listing pagination links: follow only.
        Rule(LinkExtractor(restrict_xpaths=("//div[@id='pagn']",)), follow=True),
    )

    def parse_book_detail(self, response):
        """Collect the book's fields from a detail page and print them."""
        def squeeze(values):
            # Remove every whitespace character from each extracted string.
            return [re.sub(r"\n|\s", '', v) for v in values]

        item = {}
        item['book_name'] = response.xpath("//div[@id='booksTitle']//span[@id='productTitle']/text()").extract_first()
        item['book_price'] = response.xpath("//div[@id='soldByThirdParty']/span[2]/text()").extract_first()
        item['book_breadcrumbs'] = squeeze(response.xpath("//div[@id='wayfinding-breadcrumbs_container']/ul/li/span/a/text()").extract())
        item['book_press'] = response.xpath("//b[text()='出版社:']/../text()").extract_first()
        item['book_detail'] = squeeze(response.xpath("//div[@id='bookDescription_feature_div']/noscript/div/text()").extract())
        print(item)
-
运行爬虫
scrapy runspider mycrawler_redis.py
-
爬虫开始后会进入等待start_url状态, 在redis中根据redis_key保存一个起始url
-
从redis中获取到start_urls后, 各个spider开始爬取工作
redis-cli> lpush yy:start_urls https://www.dqrm.cn/
分布式DEMO
增量式爬虫 CrawlSpider
# -*- coding: utf-8 -*-
# Use the scrapy-redis scheduler.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Use the scrapy-redis duplicate filter.
DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'
# Queue class used to order pending requests.
# Default: priority order (Scrapy's default), a non-FIFO/LIFO order backed by a redis sorted set.
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.SpiderPriorityQueue'
# Optional: first-in-first-out order (FIFO).
# SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.SpiderQueue'
# Optional: last-in-first-out order (LIFO).
# SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.SpiderStack'
# Keep the scrapy-redis queues in redis (do not clear them), enabling pause and resume.
SCHEDULER_PERSIST = True
# Only effective with SpiderQueue or SpiderStack: max idle interval before the spider closes.
# SCHEDULER_IDLE_BEFORE_CLOSE = 10
# RedisPipeline writes items into the redis list "<spider.name>:items" for later
# distributed processing. Already implemented by scrapy-redis; no extra code needed.
ITEM_PIPELINES = {
    'example.pipelines.ExamplePipeline': 300,
    'scrapy_redis.pipelines.RedisPipeline': 400
}
# redis connection parameters
# REDIS_PASS is a custom redis password setting (not used by default).
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379
#REDIS_PASS = 'redisP@ssw0rd'
# log level
LOG_LEVEL = 'DEBUG'
# By default RFPDupeFilter logs only the first duplicate request;
# setting DUPEFILTER_DEBUG to True logs every duplicate.
DUPEFILTER_DEBUG =True
# Override the default request headers; a custom Downloader Middleware can
# additionally set a proxy and rotate the User-Agent.
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Connection': 'keep-alive',
    'Accept-Encoding': 'gzip, deflate, sdch'
}
# -*- coding: utf-8 -*-
from datetime import datetime
class ExamplePipeline(object):
    """Stamp every item with the crawl time (UTC) and the producing spider's name."""

    def process_item(self, item, spider):
        # utcnow() gives the current UTC time (naive datetime).
        timestamp = datetime.utcnow()
        item["crawled"] = timestamp
        # Record which spider produced this item.
        item["spider"] = spider.name
        return item
from scrapy.item import Item, Field
class yyItem(Item):
    """Item holding the fields scraped from one personal profile page."""
    # avatar image url
    header_url = Field()
    # user name
    username = Field()
    # inner-monologue text
    monologue = Field()
    # album picture urls (joined with '|')
    pic_urls = Field()
    # age
    age = Field()
    # data source site: yy
    source = Field()
    # source url of the profile page
    source_url = Field()
    # crawl time (UTC), added by the pipeline
    crawled = Field()
    # spider name, added by the pipeline
    spider = Field()
- spiders/yy.py
# -*- coding:utf-8 -*-
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
# 使用redis去重
from scrapy.dupefilters import RFPDupeFilter
from example.items import yyItem
import re
#
class YySpider(CrawlSpider):
    """Incremental CrawlSpider: follows search/list pages and scrapes profile pages."""
    name = 'yy'
    allowed_domains = ['yy.com']
    # Listing entry page.
    start_urls = ['http://www.yy.com/find/beijing/y']
    # Search pages: link pattern extracted from each response.
    list_page_lx = LinkExtractor(allow=(r'http://www.yy.com/find/.+'))
    # Beijing / age 18-25 / female search result pages.
    page_lx = LinkExtractor(allow =(r'http://www.yy.com/find/beijing/mm18-25/advance-0-0-0-0-0-0-0/p\d+/'))
    # Personal profile pages.
    profile_page_lx = LinkExtractor(allow=(r'http://www.yy.com/\d+-profile/'))
    rules = (
        # Search pages are springboards: follow, no callback.
        Rule(list_page_lx, follow=True),
        # Listing pages: follow, no callback.
        Rule(page_lx, follow=True),
        # Profile pages: parse via parse_profile_page, do not follow further.
        Rule(profile_page_lx, callback='parse_profile_page', follow=False),
    )

    def parse_profile_page(self, response):
        """Assemble a yyItem from one profile page."""
        item = yyItem()
        item['header_url'] = self.get_header_url(response)
        item['username'] = self.get_username(response)
        item['monologue'] = self.get_monologue(response)
        item['pic_urls'] = self.get_pic_urls(response)
        item['age'] = self.get_age(response)
        item['source'] = 'yy'
        item['source_url'] = response.url
        yield item

    def get_header_url(self, response):
        """Return the avatar url, or "" when missing."""
        header = response.xpath("//dl[@class='personal_cen']/dt/img/@src").extract()
        return header[0].strip() if header else ""

    def get_username(self, response):
        """Return the user name, or "NULL" when missing."""
        usernames = response.xpath("//dl[@class='personal_cen']/dd/div/strong/text()").extract()
        return usernames[0].strip() if usernames else "NULL"

    def get_monologue(self, response):
        """Return the inner-monologue text, or "NULL" when missing."""
        monologues = response.xpath("//ul[@class='requre']/li/p/text()").extract()
        return monologues[0].strip() if monologues else "NULL"

    def get_pic_urls(self, response):
        """Return all album photo urls joined by '|', or "NULL" when there are none.

        BUG FIX: the old `<= 1` checks plus appending "" silently discarded the
        url whenever a profile had exactly one photo; now every found url is kept.
        """
        data_url_full = response.xpath("//li[@class='smallPhoto']/@data_url_full").extract()
        if not data_url_full:
            return "NULL"
        # Separate each url with '|'.
        return '|'.join(data_url_full)

    def get_age(self, response):
        """Return the age as a digit string, or "0" when it cannot be parsed."""
        age_urls = response.xpath("//dl[@class='personal_cen']/dd/p[@class='local']/text()").extract()
        age = age_urls[0] if age_urls else "0"
        age_words = age.split(' ')
        if len(age_words) <= 2:
            return "0"
        # Third word looks like e.g. "25岁"; drop the trailing character.
        age = age_words[2][:-1]
        # Accept only values that start with a digit.
        if re.match(r'[0-9]', age):
            return age
        return "0"
-
运行
-
可以开启多个客户端运行
scrapy crawl yy
分布式爬虫 RedisCrawlSpider
- 修改 spiders/yy.py
# -*- coding:utf-8 -*-
from scrapy.linkextractors import LinkExtractor
#from scrapy.spiders import CrawlSpider, Rule
# 1. 导入RedisCrawlSpider类,不使用CrawlSpider
from scrapy_redis.spiders import RedisCrawlSpider
from scrapy.spiders import Rule
from scrapy.dupefilters import RFPDupeFilter
from example.items import yyItem
import re
# 2. 修改父类 RedisCrawlSpider
# class yySpider(CrawlSpider):
class yySpider(RedisCrawlSpider):
    """Distributed variant of the yy spider: start urls come from redis."""
    name = 'yy'
    # 3. allowed_domains and start_urls removed (supplied at runtime).
    ##### allowed_domains = ['yy.com']
    ##### start_urls = ['http://www.yy.com/find/beijing/']
    # 4. redis key that feeds the start urls.
    redis_key = 'yy:start_urls'
    list_page_lx = LinkExtractor(allow=(r'http://www.yy.com/find/.+'))
    page_lx = LinkExtractor(allow =(r'http://www.yy.com/find/beijing/mm18-25/advance-0-0-0-0-0-0-0/p\d+/'))
    profile_page_lx = LinkExtractor(allow=(r'http://www.yy.com/\d+-profile/'))
    rules = (
        Rule(list_page_lx, follow=True),
        Rule(page_lx, follow=True),
        Rule(profile_page_lx, callback='parse_profile_page', follow=False),
    )

    # 5. __init__() derives allowed_domains dynamically from a -a domain=... argument.
    def __init__(self, *args, **kwargs):
        domain = kwargs.pop('domain', '')
        # BUG FIX: filter() returns a one-shot iterator on Python 3, but Scrapy's
        # offsite middleware iterates allowed_domains repeatedly — use a list.
        self.allowed_domains = [d for d in domain.split(',') if d]
        super(yySpider, self).__init__(*args, **kwargs)

    def parse_profile_page(self, response):
        """Assemble a yyItem from one profile page."""
        item = yyItem()
        item['header_url'] = self.get_header_url(response)
        item['username'] = self.get_username(response)
        item['monologue'] = self.get_monologue(response)
        item['pic_urls'] = self.get_pic_urls(response)
        item['age'] = self.get_age(response)
        item['source'] = 'yy'
        item['source_url'] = response.url
        yield item

    def get_header_url(self, response):
        """Return the avatar url, or "" when missing."""
        header = response.xpath("//dl[@class='personal_cen']/dt/img/@src").extract()
        return header[0].strip() if header else ""

    def get_username(self, response):
        """Return the user name, or "NULL" when missing."""
        usernames = response.xpath("//dl[@class='personal_cen']/dd/div/strong/text()").extract()
        return usernames[0].strip() if usernames else "NULL"

    def get_monologue(self, response):
        """Return the inner-monologue text, or "NULL" when missing."""
        monologues = response.xpath("//ul[@class='requre']/li/p/text()").extract()
        return monologues[0].strip() if monologues else "NULL"

    def get_pic_urls(self, response):
        """Return all album photo urls joined by '|', or "NULL" when there are none.

        BUG FIX: the old `<= 1` checks plus appending "" silently discarded the
        url whenever a profile had exactly one photo; now every found url is kept.
        """
        data_url_full = response.xpath("//li[@class='smallPhoto']/@data_url_full").extract()
        if not data_url_full:
            return "NULL"
        # Separate each url with '|'.
        return '|'.join(data_url_full)

    def get_age(self, response):
        """Return the age as a digit string, or "0" when it cannot be parsed."""
        age_urls = response.xpath("//dl[@class='personal_cen']/dd/p[@class='local']/text()").extract()
        age = age_urls[0] if age_urls else "0"
        age_words = age.split(' ')
        if len(age_words) <= 2:
            return "0"
        # Third word looks like e.g. "25岁"; drop the trailing character.
        age = age_words[2][:-1]
        # Accept only values that start with a digit.
        if re.match(r'[0-9]', age):
            return age
        return "0"
- 运行
客户端执行:
scrapy runspider yy.py
redis服务器端执行:
redis-cli> lpush yy:start_urls http://www.yy.com/find/beijing/
DEMO
-
说明:
要求:采集所有公司信息.
# items.py
# -*- coding: utf-8 -*-
import scrapy
class CompanyItem(scrapy.Item):
    """Item holding the fields scraped from one company detail page."""
    # company id (numeric part of the url)
    info_id = scrapy.Field()
    # company name
    company_name = scrapy.Field()
    # company slogan
    slogan = scrapy.Field()
    # category
    scope = scrapy.Field()
    # sub-category
    sub_scope = scrapy.Field()
    # city
    city = scrapy.Field()
    # district / area
    area = scrapy.Field()
    # company homepage url
    home_page = scrapy.Field()
    # company tags
    tags = scrapy.Field()
    # company introduction
    company_intro = scrapy.Field()
    # full registered company name
    company_full_name = scrapy.Field()
    # founding time
    found_time = scrapy.Field()
    # company size
    company_size = scrapy.Field()
    # operating status
    company_status = scrapy.Field()
    # investment records: funding time, round, amount, investors
    tz_info = scrapy.Field()
    # team members: name, title, introduction
    tm_info = scrapy.Field()
    # products: name, type, introduction
    pdt_info = scrapy.Field()
# -*- coding: utf-8 -*-
BOT_NAME = 'it'
SPIDER_MODULES = ['it.spiders']
NEWSPIDER_MODULE = 'it.spiders'
# Enables scheduling storing requests queue in redis.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Ensure all spiders share same duplicates filter through redis.
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# REDIS_START_URLS_AS_SET = True
COOKIES_ENABLED = False
DOWNLOAD_DELAY = 1.5
# Randomize the download delay.
RANDOMIZE_DOWNLOAD_DELAY = True
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 300
}
DOWNLOADER_MIDDLEWARES = {
    # Collects failed pages and reschedules them when the crawl finishes
    # (failures may be transient, e.g. connection timeouts or HTTP 500 errors).
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': 80,
    # Adds HTTP proxy support for requests; enable per-request via the
    # `proxy` key in Request meta.
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 100,
    'it.middlewares.RotateUserAgentMiddleware': 200,
}
REDIS_HOST = "192.168.199.108"
REDIS_PORT = 6379
# -*- coding: utf-8 -*-
from scrapy.contrib.downloadermiddleware.useragent import UserAgentMiddleware
import random
# User-Agent download middleware
class RotateUserAgentMiddleware(UserAgentMiddleware):
    """Downloader middleware that sets a random User-Agent on every request."""
    def __init__(self, user_agent=''):
        # NOTE(review): super().__init__() is not called, so the base-class setup
        # is skipped; self.user_agent itself is never read by process_request.
        self.user_agent = user_agent
    def process_request(self, request, spider):
        # Pick a random user-agent string for this request.
        ua = random.choice(self.user_agent_list)
        # setdefault: does not overwrite a User-Agent already set on the request.
        request.headers.setdefault('User-Agent', ua)
    # Pool of user-agent strings to rotate through.
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/531.21.8 (KHTML, like Gecko) Version/4.0.4 Safari/531.21.10",
        "Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/533.17.8 (KHTML, like Gecko) Version/5.0.1 Safari/533.17.8",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-GB; rv:1.9.1.17) Gecko/20110123 (like Firefox/3.x) SeaMonkey/2.0.12",
        "Mozilla/5.0 (Windows NT 5.2; rv:10.0.1) Gecko/20100101 Firefox/10.0.1 SeaMonkey/2.7.1",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_8; en-US) AppleWebKit/532.8 (KHTML, like Gecko) Chrome/4.0.302.2 Safari/532.8",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_4; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.464.0 Safari/534.3",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_5; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.597.15 Safari/534.13",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.186 Safari/535.1",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.54 Safari/535.2",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7",
        "Mozilla/5.0 (Macintosh; U; Mac OS X Mach-O; en-US; rv:2.0a) Gecko/20040614 Firefox/3.0.0 ",
        "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.0.3) Gecko/2008092414 Firefox/3.0.3",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; en-US; rv:1.9.1) Gecko/20090624 Firefox/3.5",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.2.14) Gecko/20110218 AlexaToolbar/alxf-2.0 Firefox/3.6.14",
        "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.2.15) Gecko/20110303 Firefox/3.6.15",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1"
    ]
- spiders/it.py
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup # 使用bs4解析
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy_redis.spiders import RedisCrawlSpider
from it.items import CompanyItem
class itSpider(RedisCrawlSpider):
    """Distributed crawler: follows company listings and parses detail pages."""
    name = 'it'
    allowed_domains = ['www.it.com']
    # start_urls = ['http://www.it.com/company']
    redis_key = 'itspider:start_urls'
    rules = [
        # Follow listing pagination links.
        Rule(link_extractor=LinkExtractor(allow=('/company\?page=\d+'))),
        # Parse each company detail page.
        Rule(link_extractor=LinkExtractor(allow=('/company/\d+')), callback='parse_item')
    ]

    def parse_item(self, response):
        """Parse one company detail page into a CompanyItem.

        BUG FIX: every field now has a default ('' or []) so a page with a
        missing section yields a partial item instead of raising NameError
        (cpy1/cpy2 absent) or AttributeError (main div absent).
        """
        soup = BeautifulSoup(response.body, 'lxml')
        company_name = slogan = scope = sub_scope = city = area = home_page = tags = ''
        company_intro = company_full_name = found_time = company_size = company_status = ''
        tz_list, tm_list, pdt_list = [], [], []

        # Header section: //div[@class="infoheadrow-v2 ugc-block-item"]
        cpy1 = soup.find('div', class_='infoheadrow-v2')
        if cpy1:
            # Company name: //span[@class="title"]/b/text()[1]
            company_name = cpy1.find(class_='title').b.contents[0].strip().replace('\t', '').replace('\n', '')
            # Slogan: //div[@class="info-line"]/p
            slogan = cpy1.find(class_='info-line').p.get_text()
            # Category and sub-category links: //span[@class="scope c-gray-aset"]/a
            scope_a = cpy1.find(class_='scope c-gray-aset').find_all('a')
            scope = scope_a[0].get_text().strip() if len(scope_a) > 0 else ''
            sub_scope = scope_a[1].get_text().strip() if len(scope_a) > 1 else ''
            # City and area links: //span[@class="loca c-gray-aset"]/a
            city_a = cpy1.find(class_='loca c-gray-aset').find_all('a')
            city = city_a[0].get_text().strip() if len(city_a) > 0 else ''
            area = city_a[1].get_text().strip() if len(city_a) > 1 else ''
            # Homepage: //a[@class="weblink marl10"]/@href
            home_page = cpy1.find(class_='weblink marl10')['href']
            # Tags: //div[@class="tagset dbi c-gray-aset"]/a
            tags = cpy1.find(class_='tagset dbi c-gray-aset').get_text().strip().replace('\n', ',')

        # Basic info section: //div[@class="block-inc-info on-edit-hide"]
        cpy2 = soup.find('div', class_='block-inc-info on-edit-hide')
        if cpy2:
            # Company introduction: .//div[@class="des"]
            company_intro = cpy2.find(class_='des').get_text().strip()
            # Full name / founding time / size / status all live in div.des-more.
            cpy2_content = cpy2.find(class_='des-more').contents
            company_full_name = cpy2_content[1].get_text().strip()[len('公司全称:'):] if cpy2_content[1] else ''
            found_time = cpy2_content[3].contents[1].get_text().strip()[len('成立时间:'):] if cpy2_content[3] else ''
            company_size = cpy2_content[3].contents[3].get_text().strip()[len('公司规模:'):] if cpy2_content[3] else ''
            company_status = cpy2_content[5].get_text().strip() if cpy2_content[5] else ''

        # Main body: investments / team / products.
        main = soup.find('div', class_='main')
        if main:
            # Investments: funding time, round, amount, investors.
            tz = main.find('table', 'list-round-v2')
            if tz:
                for tr in tz.find_all('tr'):
                    all_td = tr.find_all('td')
                    tz_list.append({
                        'tz_time': all_td[0].span.get_text().strip(),
                        'tz_round': all_td[1].get_text().strip(),
                        'tz_finades': all_td[2].get_text().strip(),
                        'tz_capital': all_td[3].get_text().strip().replace('\n', ','),
                    })
            # Team members: name, title, introduction.
            tm = main.find('ul', class_='list-prodcase limited-itemnum')
            if tm:
                for li in tm.find_all('li'):
                    tm_list.append({
                        'tm_m_name': li.find('span', class_='c').get_text().strip(),
                        'tm_m_title': li.find('span', class_='c-gray').get_text().strip(),
                        'tm_m_intro': li.find('p', class_='mart10 person-des').get_text().strip(),
                    })
            # Products: name, type, introduction.
            pdt = main.find('ul', class_='list-prod limited-itemnum')
            if pdt:
                for li in pdt.find_all('li'):
                    pdt_list.append({
                        'pdt_name': li.find('h4').b.get_text().strip(),
                        'pdt_type': li.find('span', class_='tag yellow').get_text().strip(),
                        'pdt_intro': li.find(class_='on-edit-hide').p.get_text().strip(),
                    })

        item = CompanyItem()
        # Company id is the numeric tail of the detail-page url.
        item['info_id'] = response.url.split('/')[-1]
        item['company_name'] = company_name
        item['slogan'] = slogan
        item['scope'] = scope
        item['sub_scope'] = sub_scope
        item['city'] = city
        item['area'] = area
        item['home_page'] = home_page
        item['tags'] = tags
        item['company_intro'] = company_intro
        item['company_full_name'] = company_full_name
        item['found_time'] = found_time
        item['company_size'] = company_size
        item['company_status'] = company_status
        item['tz_info'] = tz_list
        item['tm_info'] = tm_list
        item['pdt_info'] = pdt_list
        return item
- 运行
Slave端:
scrapy runspider it.py
Master端:
redis-cli > lpush itspider:start_urls http://www.it.com/company
处理redis中保存的items
-
说明:
官方示例的目录下可以看到一个process_items.py文件,这个文件就是scrapy-redis的example提供的从redis读取item进行处理的模版。
保存入MongoDB
- 编写 process_yy_profile.py 文件,然后保持后台运行就可以不停地将爬回来的数据入库了。
# process_yy_mongodb.py
# -*- coding: utf-8 -*-
import json
import redis
import pymongo
def main():
    """Pop scraped items from the redis list and store each one into MongoDB."""
    # Redis connection (the source of scraped items).
    rediscli = redis.StrictRedis(host='192.168.199.108', port=6379, db=0)
    # MongoDB connection (the destination).
    mongocli = pymongo.MongoClient(host='localhost', port=27017)
    # Database name.
    db = mongocli['yy']
    # Collection name.
    sheet = db['beijing_18_25']
    while True:
        # blpop = FIFO, brpop = LIFO; blocks until an item is available.
        source, data = rediscli.blpop(["yy:items"])
        # Decode the JSON payload into a Python dict.
        item = json.loads(data)
        # Store into MongoDB.
        sheet.insert(item)
        try:
            # BUG FIX: Python 3 print() calls instead of the Python 2
            # `print u"..."` statements (which are SyntaxErrors on Python 3);
            # also fixed the "procesing" typo in the error message.
            print("Processing: %(name)s <%(link)s>" % item)
        except KeyError:
            print("Error processing: %r" % item)


if __name__ == '__main__':
    main()
保存入MySQL
- 准备数据库和表
启动mysql: mysql.server start(各平台命令不一样)
登录到root用户: mysql -uroot -p
创建数据库yy: create database yy;
切换到指定数据库: use yy
创建表beijing_18_25以及所有字段的列名和数据类型。
- 编写process_yy_mysql.py
#process_yy_mysql.py
# -*- coding: utf-8 -*-
import json
import redis
import MySQLdb
def main():
    """Pop scraped items from redis and insert each one into MySQL table beijing_18_25."""
    # Redis connection (the source of scraped items).
    rediscli = redis.StrictRedis(host='192.168.199.108', port = 6379, db = 0)
    # MySQL connection (the destination).
    mysqlcli = MySQLdb.connect(host='127.0.0.1', user='power', passwd='xxxxxxx', db = 'yy', port=3306, use_unicode=True)
    while True:
        # blpop = FIFO, brpop = LIFO; blocks until an item is available.
        source, data = rediscli.blpop(["yy:items"])
        # Decode the JSON payload into a Python dict.
        item = json.loads(data)
        try:
            # Obtain a cursor for this insert.
            cur = mysqlcli.cursor()
            # Parameterized INSERT — values are bound, not string-formatted.
            cur.execute(
                "INSERT INTO beijing_18_25 (username, crawled, age, spider, header_url, source, pic_urls, monologue, source_url) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s )",
                [
                    item['username'],
                    item['crawled'],
                    item['age'],
                    item['spider'],
                    item['header_url'],
                    item['source'],
                    item['pic_urls'],
                    item['monologue'],
                    item['source_url']
                ])
            # Commit the transaction.
            mysqlcli.commit()
            # Close this cursor.
            cur.close()
            # BUG FIX: Python 3 print() and `except ... as e` syntax — the old
            # Python 2 forms (`print "..."`, `except MySQLdb.Error,e`) are
            # SyntaxErrors on Python 3.
            print("inserted %s" % item['source_url'])
        except MySQLdb.Error as e:
            print("Mysql Error %d: %s" % (e.args[0], e.args[1]))


if __name__ == '__main__':
    main()