利用scrapy、proxy_pool、cookie_pool抓取新浪微博:用户信息、关注列表、粉丝列表、微博内容,信息保存至MongoDB。以几个大V为起点,爬取个人信息、粉丝、关注、微博信息,然后继续获取这些粉丝和关注的个人信息、粉丝、关注、微博信息,以此类推,实现递归爬取。
2、 spider.py:请求网页。反爬难点:PC端抓取困难,数据通过ajax请求加载,且请求需要携带containerid参数。
解决:转到移动端(m.weibo.cn/u/uid),直接请求接口返回的json数据;不同数据对应不同的containerid前缀:100505+uid为用户信息,107603+uid为微博列表,231051为关注/粉丝列表(230283在本项目代码中未使用)。
2、 middlewares:调用proxy_pool,cookie_pool,随机获取代理或cookie
3、 Pipeline:item对象中的'crawled_at'字段代表当前爬取时间,由TimePipeline赋值为当前时间;item对象中的'created_at'字段代表微博发布时间(刚刚、几分钟前…),由WeiboPipeline转换为标准时间格式;MongoPipeline负责连接数据库并存储数据
4、 技术:scrapy,MongoDB,proxy_pool,cookie_pool,time
weibo.py
import json
from scrapy import Request, Spider
from weibo.items import *
class WeiboSpider(Spider):
    """Recursively crawl Weibo users through the m.weibo.cn JSON API.

    Starting from ``start_users``, every user profile response yields a
    ``UserItem`` plus requests for that user's follow list, fan list and
    weibo list. Every user found in a follow/fan list is scheduled for
    ``parse_user`` in turn, which makes the crawl recursive.
    """

    name = 'weibocn'
    allowed_domains = ['m.weibo.cn']

    # URL templates for the four JSON endpoints used by the spider.
    user_url = 'https://m.weibo.cn/api/container/getIndex?uid={uid}&type=uid&value={uid}&containerid=100505{uid}'
    follow_url = 'https://m.weibo.cn/api/container/getIndex?containerid=231051_-_followers_-_{uid}&page={page}'
    fan_url = 'https://m.weibo.cn/api/container/getIndex?containerid=231051_-_fans_-_{uid}&page={page}'
    weibo_url = 'https://m.weibo.cn/api/container/getIndex?uid={uid}&type=uid&page={page}&containerid=107603{uid}'

    # Seed user ids from which the recursive crawl starts.
    start_users = ['3217179555', '1742566624', '2282991915', '1288739185', '3952070245', '5878659096']

    # Item field -> key in the API's ``userInfo`` object.
    _USER_FIELD_MAP = {
        'id': 'id', 'name': 'screen_name', 'avatar': 'profile_image_url', 'cover': 'cover_image_phone',
        'gender': 'gender', 'description': 'description', 'fans_count': 'followers_count',
        'follows_count': 'follow_count', 'weibos_count': 'statuses_count', 'verified': 'verified',
        'verified_reason': 'verified_reason', 'verified_type': 'verified_type',
    }

    # Item field -> key in the API's ``mblog`` object.
    _WEIBO_FIELD_MAP = {
        'id': 'id', 'attitudes_count': 'attitudes_count', 'comments_count': 'comments_count',
        'reposts_count': 'reposts_count', 'picture': 'original_pic', 'pictures': 'pics',
        'created_at': 'created_at', 'source': 'source', 'text': 'text', 'raw_text': 'raw_text',
        'thumbnail': 'thumbnail_pic',
    }

    def start_requests(self):
        """Yield one profile request per seed user."""
        for uid in self.start_users:
            yield Request(self.user_url.format(uid=uid), callback=self.parse_user)

    def _load_json(self, response):
        """Decode the response body as a JSON object.

        Returns an empty dict (and logs a warning) when the body is not
        valid JSON or is not a JSON object, so callbacks can treat a bad
        response like an empty one instead of raising.
        """
        try:
            result = json.loads(response.text)
        except ValueError:
            self.logger.warning('Non-JSON response from %s', response.url)
            return {}
        return result if isinstance(result, dict) else {}

    @staticmethod
    def _data(result):
        """Return ``result['data']`` if it is a dict, else an empty dict."""
        data = result.get('data')
        return data if isinstance(data, dict) else {}

    def parse_user(self, response):
        """Parse a user profile response.

        Yields the ``UserItem`` followed by the first-page requests for the
        user's follows, fans and weibos. Yields nothing when the response
        carries no ``userInfo``.

        :param response: Response object
        """
        self.logger.debug(response)
        user_info = self._data(self._load_json(response)).get('userInfo')
        if not user_info:
            return

        user_item = UserItem()
        for field, attr in self._USER_FIELD_MAP.items():
            user_item[field] = user_info.get(attr)
        yield user_item

        uid = user_info.get('id')
        # Follows
        yield Request(self.follow_url.format(uid=uid, page=1), callback=self.parse_follows,
                      meta={'page': 1, 'uid': uid})
        # Fans
        yield Request(self.fan_url.format(uid=uid, page=1), callback=self.parse_fans,
                      meta={'page': 1, 'uid': uid})
        # Weibos
        yield Request(self.weibo_url.format(uid=uid, page=1), callback=self.parse_weibos,
                      meta={'page': 1, 'uid': uid})

    def _parse_relation(self, response, relation, url_template, callback):
        """Shared implementation of ``parse_follows`` and ``parse_fans``.

        :param response: Response object; ``meta`` must carry ``uid`` and ``page``
        :param relation: item field to fill, either ``'follows'`` or ``'fans'``
        :param url_template: URL template used to request the next page
        :param callback: callback for the next-page request
        """
        result = self._load_json(response)
        cards = self._data(result).get('cards')
        if not (result.get('ok') and cards):
            return
        card_group = cards[-1].get('card_group')
        if not card_group:
            return

        # Some cards in the group carry no 'user' entry; skip them everywhere.
        users = [card.get('user') for card in card_group if card.get('user')]

        # Schedule a profile crawl for every related user.
        for user in users:
            yield Request(self.user_url.format(uid=user.get('id')), callback=self.parse_user)

        uid = response.meta.get('uid')
        user_relation_item = UserRelationItem()
        user_relation_item['id'] = uid
        user_relation_item['follows'] = []
        user_relation_item['fans'] = []
        user_relation_item[relation] = [
            {'id': user.get('id'), 'name': user.get('screen_name')} for user in users
        ]
        yield user_relation_item

        # Next page; pagination stops once a page comes back without cards.
        page = response.meta.get('page') + 1
        yield Request(url_template.format(uid=uid, page=page),
                      callback=callback, meta={'page': page, 'uid': uid})

    def parse_follows(self, response):
        """Parse one page of the accounts a user follows.

        :param response: Response object
        """
        return self._parse_relation(response, 'follows', self.follow_url, self.parse_follows)

    def parse_fans(self, response):
        """Parse one page of a user's fans.

        :param response: Response object
        """
        return self._parse_relation(response, 'fans', self.fan_url, self.parse_fans)

    def parse_weibos(self, response):
        """Parse one page of a user's weibo list.

        Yields a ``WeiboItem`` for every card that contains an ``mblog``,
        then a request for the next page.

        :param response: Response object
        """
        result = self._load_json(response)
        cards = self._data(result).get('cards')
        if not (result.get('ok') and cards):
            return

        uid = response.meta.get('uid')
        for card in cards:
            mblog = card.get('mblog')
            if not mblog:
                continue
            weibo_item = WeiboItem()
            for field, attr in self._WEIBO_FIELD_MAP.items():
                weibo_item[field] = mblog.get(attr)
            weibo_item['user'] = uid
            yield weibo_item

        # Next page of weibos
        page = response.meta.get('page') + 1
        yield Request(self.weibo_url.format(uid=uid, page=page), callback=self.parse_weibos,
                      meta={'uid': uid, 'page': page})
middleware.py
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
import json
import logging
from scrapy import signals
import requests
class ProxyMiddleware():
    """Downloader middleware that routes retried requests through a proxy.

    The proxy is fetched from the proxy-pool service at ``proxy_url``
    (taken from the ``PROXY_URL`` setting).
    """

    # Seconds to wait for the proxy pool. Without a timeout a hung pool
    # service would block the request indefinitely.
    POOL_TIMEOUT = 5

    def __init__(self, proxy_url):
        """
        :param proxy_url: URL of the proxy-pool endpoint that returns one proxy
        """
        self.logger = logging.getLogger(__name__)
        self.proxy_url = proxy_url

    def get_random_proxy(self):
        """Fetch one proxy from the pool.

        :return: the response body with surrounding whitespace removed, or
            ``False`` when the pool is unreachable, times out, answers with
            a non-200 status or returns an empty body
        """
        try:
            response = requests.get(self.proxy_url, timeout=self.POOL_TIMEOUT)
        except requests.RequestException:
            return False
        if response.status_code != 200:
            return False
        return response.text.strip() or False

    def process_request(self, request, spider):
        """Attach a proxy to requests that are being retried.

        First attempts (no ``retry_times`` in ``request.meta``) are left
        untouched.
        """
        if request.meta.get('retry_times'):
            proxy = self.get_random_proxy()
            if proxy:
                # NOTE(review): the proxy is addressed with an https:// scheme;
                # confirm this matches what the pool's proxies support.
                uri = 'https://{proxy}'.format(proxy=proxy)
                self.logger.debug('使用代理 %s', proxy)
                request.meta['proxy'] = uri

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from the crawler's ``PROXY_URL`` setting."""
        settings = crawler.settings
        return cls(
            proxy_url=settings.get('PROXY_URL')
        )
class CookiesMiddleware():
    """Downloader middleware that sets cookies from a cookie pool on every request.

    The cookies are fetched from the service at ``cookies_url`` (taken
    from the ``COOKIES_URL`` setting).
    """

    # Seconds to wait for the cookie pool. Without a timeout a hung pool
    # service would block the request indefinitely.
    POOL_TIMEOUT = 5

    def __init__(self, cookies_url):
        """
        :param cookies_url: URL of the cookie-pool endpoint that returns cookies as JSON
        """
        self.logger = logging.getLogger(__name__)
        self.cookies_url = cookies_url

    def get_random_cookies(self):
        """Fetch one set of cookies from the pool.

        :return: the decoded JSON body, or ``False`` when the pool is
            unreachable, times out, answers with a non-200 status or
            returns a body that is not valid JSON
        """
        try:
            response = requests.get(self.cookies_url, timeout=self.POOL_TIMEOUT)
        except requests.RequestException:
            return False
        if response.status_code != 200:
            return False
        try:
            return json.loads(response.text)
        except ValueError:
            # A malformed pool response must not break the request; fall
            # back to sending the request without pool cookies.
            return False

    def process_request(self, request, spider):
        """Replace the request's cookies with a set from the pool, if one is available."""
        self.logger.debug('正在获取Cookies')
        cookies = self.get_random_cookies()
        if cookies:
            request.cookies = cookies
            self.logger.debug('使用Cookies %s', json.dumps(cookies))

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from the crawler's ``COOKIES_URL`` setting."""
        settings = crawler.settings
        return cls(
            cookies_url=settings.get('COOKIES_URL')
        )
pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import re, time
import logging
import pymongo
from weibo.items import *
class TimePipeline():
    """Stamp user and weibo items with the local time at which they were crawled."""

    def process_item(self, item, spider):
        """Set ``crawled_at`` (``YYYY-MM-DD HH:MM``, local time) on ``UserItem``
        and ``WeiboItem`` instances; every item is returned to the next pipeline.
        """
        if isinstance(item, (UserItem, WeiboItem)):
            item['crawled_at'] = time.strftime('%Y-%m-%d %H:%M', time.localtime())
        return item
class WeiboPipeline():
    """Normalise the ``created_at`` and ``pictures`` fields of weibo items."""

    def parse_time(self, date):
        """Convert a relative Weibo timestamp into an absolute local-time string.

        Recognised forms (matched at the start of ``date``):

        * ``刚刚`` (just now)            -> current time, ``YYYY-MM-DD HH:MM``
        * ``N分钟前`` (N minutes ago)    -> current time minus N minutes
        * ``N小时前`` (N hours ago)      -> current time minus N hours
        * ``昨天 HH:MM`` (yesterday)     -> yesterday's date plus the given time
        * ``MM-DD``                      -> current year prepended, `` 00:00`` appended

        :param date: timestamp string as delivered by the API
        :return: the converted string, or ``date`` unchanged when no form matches
        """
        minute_format = '%Y-%m-%d %H:%M'
        now = time.time()

        if re.match(r'刚刚', date):
            return time.strftime(minute_format, time.localtime(now))

        matched = re.match(r'(\d+)分钟前', date)
        if matched:
            seconds = float(matched.group(1)) * 60
            return time.strftime(minute_format, time.localtime(now - seconds))

        matched = re.match(r'(\d+)小时前', date)
        if matched:
            seconds = float(matched.group(1)) * 60 * 60
            return time.strftime(minute_format, time.localtime(now - seconds))

        matched = re.match(r'昨天(.*)', date)
        if matched:
            # Subtract one day from the epoch timestamp, not from the
            # struct_time (struct_time does not support arithmetic).
            yesterday = time.localtime(now - 24 * 60 * 60)
            return time.strftime('%Y-%m-%d', yesterday) + ' ' + matched.group(1).strip()

        if re.match(r'\d{2}-\d{2}', date):
            return time.strftime('%Y-', time.localtime(now)) + date + ' 00:00'

        return date

    def process_item(self, item, spider):
        """Clean up ``WeiboItem`` instances; other items pass through untouched.

        ``created_at`` is stripped and converted with ``parse_time``;
        ``pictures`` is reduced to the list of each picture's ``url``.
        """
        if isinstance(item, WeiboItem):
            if item.get('created_at'):
                item['created_at'] = item['created_at'].strip()
                item['created_at'] = self.parse_time(item.get('created_at'))
            if item.get('pictures'):
                item['pictures'] = [pic.get('url') for pic in item.get('pictures')]
        return item
class MongoPipeline(object):
    """Store crawled items in MongoDB.

    ``UserItem`` and ``WeiboItem`` instances are upserted by ``id``;
    ``UserRelationItem`` instances are merged into the existing document's
    ``follows`` and ``fans`` arrays.
    """

    def __init__(self, mongo_uri, mongo_db):
        """
        :param mongo_uri: MongoDB connection URI or host
        :param mongo_db: name of the database to write to
        """
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the ``MONGO_URI`` and ``MONGO_DATABASE`` settings."""
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE')
        )

    def open_spider(self, spider):
        """Connect to MongoDB and index the ``id`` field of the user and weibo collections."""
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]
        self.db[UserItem.collection].create_index([('id', pymongo.ASCENDING)])
        self.db[WeiboItem.collection].create_index([('id', pymongo.ASCENDING)])

    def close_spider(self, spider):
        """Close the MongoDB connection."""
        self.client.close()

    def process_item(self, item, spider):
        """Write ``item`` to its collection and return it to the next pipeline.

        ``Collection.update`` is deprecated in PyMongo 3 and removed in
        PyMongo 4, so ``update_one(..., upsert=True)`` is used instead. It
        performs the same single-document upsert.
        """
        if isinstance(item, (UserItem, WeiboItem)):
            # Insert the document if the id is new, otherwise overwrite its fields.
            self.db[item.collection].update_one(
                {'id': item.get('id')},
                {'$set': dict(item)},
                upsert=True,
            )
        if isinstance(item, UserRelationItem):
            # $addToSet with $each appends only entries not already stored,
            # so pages of the same list accumulate without duplicates.
            self.db[item.collection].update_one(
                {'id': item.get('id')},
                {'$addToSet': {
                    'follows': {'$each': item['follows']},
                    'fans': {'$each': item['fans']},
                }},
                upsert=True,
            )
        return item
settings.py
# -*- coding: utf-8 -*-
# Scrapy settings for weibo project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# ---------------------------------------------------------------------------
# Project identity
# ---------------------------------------------------------------------------
BOT_NAME = "weibo"
SPIDER_MODULES = ["weibo.spiders"]
NEWSPIDER_MODULE = "weibo.spiders"

# ---------------------------------------------------------------------------
# Request behaviour
# ---------------------------------------------------------------------------
# robots.txt rules are not obeyed.
ROBOTSTXT_OBEY = False

# Headers sent with every request.
DEFAULT_REQUEST_HEADERS = {
    "Accept": "application/json, text/plain, */*",
    "Accept-Encoding": "gzip, deflate, sdch",
    "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4,zh-TW;q=0.2,mt;q=0.2",
    "Connection": "keep-alive",
    "Host": "m.weibo.cn",
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/59.0.3071.115 Safari/537.36"
    ),
    "X-Requested-With": "XMLHttpRequest",
}

# Not set here, so Scrapy's defaults apply: USER_AGENT, CONCURRENT_REQUESTS,
# DOWNLOAD_DELAY, CONCURRENT_REQUESTS_PER_DOMAIN, CONCURRENT_REQUESTS_PER_IP,
# COOKIES_ENABLED, TELNETCONSOLE_ENABLED, SPIDER_MIDDLEWARES, EXTENSIONS,
# the AUTOTHROTTLE_* options and the HTTPCACHE_* options.
# See http://doc.scrapy.org/en/latest/topics/settings.html

# ---------------------------------------------------------------------------
# Downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# ---------------------------------------------------------------------------
DOWNLOADER_MIDDLEWARES = {
    "weibo.middlewares.CookiesMiddleware": 554,
    "weibo.middlewares.ProxyMiddleware": 555,
}

# ---------------------------------------------------------------------------
# Item pipelines
# See http://doc.scrapy.org/en/latest/topics/item-pipeline.html
# ---------------------------------------------------------------------------
ITEM_PIPELINES = {
    "weibo.pipelines.TimePipeline": 300,
    "weibo.pipelines.WeiboPipeline": 301,
    "weibo.pipelines.MongoPipeline": 302,
}

# ---------------------------------------------------------------------------
# External services
# ---------------------------------------------------------------------------
# MongoDB connection, read by MongoPipeline.
MONGO_URI = "localhost"
MONGO_DATABASE = "weibo"

# Pool endpoints, read by CookiesMiddleware and ProxyMiddleware.
COOKIES_URL = "http://localhost:5000/weibo/random"
PROXY_URL = "http://localhost:5555/random"
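说明:内置的CookiesMiddleware的优先级为700,内置的HttpProxyMiddleware的优先级为750。
自定义的CookiesMiddleware(554)和ProxyMiddleware(555)优先级数值更小,因此它们的process_request会先于对应的内置中间件执行,这样设置好的cookies和代理才能被内置中间件处理。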