Sharing a Distributed Sina Weibo Crawler

SinaSpider/Sina_spider2/scrapy.cfg (file)

# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = Sina_spider2.settings

[deploy]
#url = http://localhost:6800/
project = Sina_spider2

SinaSpider/Sina_spider2/Begin.py (file)

from scrapy import cmdline

cmdline.execute("scrapy crawlall".split())


SinaSpider/Sina_spider2/Sina_spider2/cookies.py (file)

# encoding=utf-8
import json
import base64
import requests

"""
输入你的微博账号和密码,可去淘宝买,一元七个。
建议买几十个,微博反扒的厉害,太频繁了会出现302转移。
或者你也可以把时间间隔调大点。
"""
myWeiBo = [
    {'no': 'jiadieyuso3319@163.com', 'psw': 'a123456'},
    {'no': 'shudieful3618@163.com', 'psw': 'a123456'},
]


def getCookies(weibo):
    """ 获取Cookies """
    cookies = []
    loginURL = r'https://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.15)'
    for elem in weibo:
        account = elem['no']
        password = elem['psw']
        username = base64.b64encode(account.encode('utf-8')).decode('utf-8')
        postData = {
            "entry": "sso",
            "gateway": "1",
            "from": "null",
            "savestate": "30",
            "useticket": "0",
            "pagerefer": "",
            "vsnf": "1",
            "su": username,
            "service": "sso",
            "sp": password,
            "sr": "1440*900",
            "encoding": "UTF-8",
            "cdult": "3",
            "domain": "sina.com.cn",
            "prelt": "0",
            "returntype": "TEXT",
        }
        session = requests.Session()
        r = session.post(loginURL, data=postData)
        jsonStr = r.content.decode('gbk')
        info = json.loads(jsonStr)
        if info["retcode"] == "0":
            print "Get Cookie Success!( Account:%s )" % account
            cookie = session.cookies.get_dict()
            cookies.append(cookie)
        else:
            print "Failed!( Reason:%s )" % info['reason']
    return cookies


cookies = getCookies(myWeiBo)
print "Get Cookies Finish!( Num:%d)" % len(cookies)

SinaSpider/Sina_spider2/Sina_spider2/items.py (file)

# -*- coding: utf-8 -*-

from scrapy import Item, Field


class InformationItem(Item):
    """ User profile information """
    _id = Field()  # user ID
    NickName = Field()  # nickname
    Gender = Field()  # gender
    Province = Field()  # province
    City = Field()  # city
    Signature = Field()  # personal signature
    Birthday = Field()  # birthday
    Num_Tweets = Field()  # number of tweets
    Num_Follows = Field()  # number of accounts followed
    Num_Fans = Field()  # number of followers
    Sex_Orientation = Field()  # sexual orientation
    Marriage = Field()  # marital status
    URL = Field()  # homepage URL


class TweetsItem(Item):
    """ Tweet (Weibo post) information """
    _id = Field()  # user ID - tweet ID
    ID = Field()  # user ID
    Content = Field()  # tweet content
    PubTime = Field()  # publication time
    Co_oridinates = Field()  # geo coordinates
    Tools = Field()  # client/platform used to post
    Like = Field()  # like count
    Comment = Field()  # comment count
    Transfer = Field()  # repost count

SinaSpider/Sina_spider2/Sina_spider2/middleware.py (file)

# encoding=utf-8
import random
from user_agents import agents
from cookies import cookies


class UserAgentMiddleware(object):
    """ 换User-Agent """

    def process_request(self, request, spider):
        agent = random.choice(agents)
        request.headers["User-Agent"] = agent


class CookiesMiddleware(object):
    """ 换Cookie """

    def process_request(self, request, spider):
        cookie = random.choice(cookies)
        request.cookies = cookie

SinaSpider/Sina_spider2/Sina_spider2/pipelines.py (file)

# -*- coding: utf-8 -*-
import pymongo
from items import InformationItem, TweetsItem


class MongoDBPipleline(object):
    def __init__(self):
        client = pymongo.MongoClient("localhost", 27017)
        db = client["Sina"]
        self.Information = db["Information"]
        self.Tweets = db["Tweets"]
        self.Follows = db["Follows"]
        self.Fans = db["Fans"]

    def process_item(self, item, spider):
        """ Check the item type, handle it accordingly, then write it to the database """
        if isinstance(item, InformationItem):
            try:
                self.Information.insert(dict(item))
            except Exception:
                # inserting a duplicate _id raises an error; ignoring it de-duplicates records
                pass
        elif isinstance(item, TweetsItem):
            try:
                self.Tweets.insert(dict(item))
            except Exception:
                pass
        return item
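
Once the pipeline has written some records, the data can be inspected directly in MongoDB. A small sanity-check sketch; the collection and field names come from the pipeline and items above, and localhost is assumed to be the Slaver the pipeline writes to:

import pymongo

# Connect to the same MongoDB instance the pipeline writes to.
client = pymongo.MongoClient("localhost", 27017)
db = client["Sina"]

# Example query: the ten most-liked tweets collected so far.
for doc in db["Tweets"].find().sort("Like", pymongo.DESCENDING).limit(10):
    print("%s  Like:%s  %s" % (doc.get("ID"), doc.get("Like"), doc.get("Content")))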

SinaSpider/Sina_spider2/Sina_spider2/settings.py (file)

# -*- coding: utf-8 -*-
BOT_NAME = 'Sina_spider2'

SPIDER_MODULES = ['Sina_spider2.spiders']
NEWSPIDER_MODULE = 'Sina_spider2.spiders'

DOWNLOADER_MIDDLEWARES = {
    "Sina_spider2.middleware.UserAgentMiddleware": 401,
    "Sina_spider2.middleware.CookiesMiddleware": 402,
}
ITEM_PIPELINES = {"Sina_spider2.pipelines.MongoDBPipleline": 300}

SCHEDULER = 'scrapy_redis.scheduler.Scheduler'
SCHEDULER_PERSIST = True
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.SpiderPriorityQueue'
REDIS_URL = None
REDIS_HOST = '192.168.1.199'
REDIS_PORT = 6379

DOWNLOAD_DELAY = 2  # delay between requests (seconds)
COMMANDS_MODULE = 'Sina_spider2.commands'
# LOG_LEVEL = 'INFO'  # log level
# CONCURRENT_REQUESTS = 1
# CONCURRENT_ITEMS = 1
# CONCURRENT_REQUESTS_PER_IP = 1

SinaSpider/Sina_spider2/Sina_spider2/user_agents.py (file)

# encoding=utf-8
""" User-Agents """
agents = [
    "Mozilla/5.0 (Linux; U; Android 2.3.6; en-us; Nexus S Build/GRK39F) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Avant Browser/1.2.789rel1 (http://www.avantbrowser.com)",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5",
    "Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.9 (KHTML, like Gecko) Chrome/5.0.310.0 Safari/532.9",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.514.0 Safari/534.7",
    "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/9.0.601.0 Safari/534.14",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/10.0.601.0 Safari/534.14",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.20 (KHTML, like Gecko) Chrome/11.0.672.2 Safari/534.20",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.27 (KHTML, like Gecko) Chrome/12.0.712.0 Safari/534.27",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.24 Safari/535.1",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.120 Safari/535.2",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7",
    "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.10) Gecko/2009042316 Firefox/3.0.10",
    "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-GB; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11 (.NET CLR 3.5.30729)",
    "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 GTB5",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; tr; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8 ( .NET CLR 3.5.30729; .NET4.0E)",
    "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0a2) Gecko/20110622 Firefox/6.0a2",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0.1) Gecko/20100101 Firefox/7.0.1",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0b4pre) Gecko/20100815 Minefield/4.0b4pre",
    "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT 5.0 )",
    "Mozilla/4.0 (compatible; MSIE 5.5; Windows 98; Win 9x 4.90)",
    "Mozilla/5.0 (Windows; U; Windows XP) Gecko MultiZilla/1.6.1.0a",
    "Mozilla/2.02E (Win95; U)",
    "Mozilla/3.01Gold (Win95; I)",
    "Mozilla/4.8 [en] (Windows NT 5.1; U)",
    "Mozilla/5.0 (Windows; U; Win98; en-US; rv:1.4) Gecko Netscape/7.1 (ax)",
    "HTC_Dream Mozilla/5.0 (Linux; U; Android 1.5; en-ca; Build/CUPCAKE) AppleWebKit/528.5  (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.2; U; de-DE) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/234.40.1 Safari/534.6 TouchPad/1.0",
    "Mozilla/5.0 (Linux; U; Android 1.5; en-us; sdk Build/CUPCAKE) AppleWebkit/528.5  (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 2.1; en-us; Nexus One Build/ERD62) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 1.5; en-us; htc_bahamas Build/CRB17) AppleWebKit/528.5  (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 2.1-update1; de-de; HTC Desire 1.19.161.5 Build/ERE27) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Sprint APA9292KT Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 1.5; de-ch; HTC Hero Build/CUPCAKE) AppleWebKit/528.5  (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; ADR6300 Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 2.1; en-us; HTC Legend Build/cupcake) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 1.5; de-de; HTC Magic Build/PLAT-RC33) AppleWebKit/528.5  (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1 FirePHP/0.3",
    "Mozilla/5.0 (Linux; U; Android 1.6; en-us; HTC_TATTOO_A3288 Build/DRC79) AppleWebKit/528.5  (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 1.0; en-us; dream) AppleWebKit/525.10  (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2",
    "Mozilla/5.0 (Linux; U; Android 1.5; en-us; T-Mobile G1 Build/CRB43) AppleWebKit/528.5  (KHTML, like Gecko) Version/3.1.2 Mobile Safari 525.20.1",
    "Mozilla/5.0 (Linux; U; Android 1.5; en-gb; T-Mobile_G2_Touch Build/CUPCAKE) AppleWebKit/528.5  (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 2.0; en-us; Droid Build/ESD20) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Droid Build/FRG22D) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 2.0; en-us; Milestone Build/ SHOLS_U2_01.03.1) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.0.1; de-de; Milestone Build/SHOLS_U2_01.14.0) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/525.10  (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2",
    "Mozilla/5.0 (Linux; U; Android 0.5; en-us) AppleWebKit/522  (KHTML, like Gecko) Safari/419.3",
    "Mozilla/5.0 (Linux; U; Android 1.1; en-gb; dream) AppleWebKit/525.10  (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2",
    "Mozilla/5.0 (Linux; U; Android 2.0; en-us; Droid Build/ESD20) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.1; en-us; Nexus One Build/ERD62) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Sprint APA9292KT Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; ADR6300 Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-ca; GT-P1000M Build/FROYO) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 3.0.1; fr-fr; A500 Build/HRI66) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
    "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/525.10  (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2",
    "Mozilla/5.0 (Linux; U; Android 1.6; es-es; SonyEricssonX10i Build/R1FA016) AppleWebKit/528.5  (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 1.6; en-us; SonyEricssonX10i Build/R1AA056) AppleWebKit/528.5  (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
]

SinaSpider/Sina_spider2/Sina_spider2/weiboID.py (file)

# encoding=utf-8
""" 初始的待爬队列 """
weiboID = [
    1797054534, 2509414473L, 2611478681L, 5861859392L, 2011086863, 5127716917L, 1259110474, 5850775634L, 1886437464,
    3187474530L, 2191982701L, 1940562032, 5874450550L, 1337925752, 2081079420, 5664530558L, 3493173952L, 1202806915,
    1864507535, 2032640064, 5585682587L, 3083673764L, 5342109866L, 5878685868L, 5728706733L, 2103050415, 5876752562L,
    3138085045L, 5775974583L, 1879400644, 2417139911L, 5836619975L, 5353816265L, 5219508427L, 1766613205, 2480158031L,
    5660754163L, 2456764664L, 3637354755L, 1940087047, 5508473104L, 1004454162, 2930327837L, 1874608417, 5379621155L,
    1720664360, 2714280233L, 3769073964L, 5624119596L, 2754904375L, 5710151998L, 5331042630L, 5748179271L, 2146132305,
    2313896275L, 3193618787L, 5743059299L, 1742930277, 5310538088L, 1794474362, 2798510462L, 3480076671L, 5678653833L,
    5743657357L, 5460191980L, 1734164880, 5876988653L, 5678031258L, 5860163996L, 1496924574, 5878970110L, 1679704482,
    1142210982, 3628925351L, 1196397981, 1747485107, 5675893172L, 5438521785L, 2192269762L, 1992614343, 5878686155L,
    2407186895L, 5559116241L, 2528477652L, 1295950295, 5038203354L, 3659276765L, 2126733792, 5878350307L, 2761179623L,
    5484511719L, 5825708520L, 1578230251, 5878686190L, 5810946551L, 3833070073L, 1795047931, 5855789570L, 3580125714L,
    5709578773L, 5236539926L, 2907633071L, 1709244961, 5405450788L, 3251257895L, 5054538290L, 2713199161L, 5698445883L,
    1784537661, 3195290182L, 1824506454, 5738766939L, 5565915740L, 5336031840L, 5098775138L, 5685568105L, 1774289524,
    2932662914L, 5433223957L, 2680044311L, 1111523983, 5067889432L, 5878686362L, 2844992161L, 3878314663L, 1766548141,
    5763269297L, 5878383287L, 5235499706L, 5876375670L, 5866447563L, 5129945819L, 1704116960, 1929380581, 1223762662,
    1193476843, 2899591923L, 5162099453L, 5072151301L, 5385741066L, 5411455765L, 2685535005L, 2297905950L, 1216766752,
    5838668577L, 5359133478L, 3077460103L, 5577802539L, 5862392623L, 1786700611, 1259258694, 1845191497, 1731838797,
    1740301135, 2816074584L, 1217733467, 5345035105L, 5050827618L, 5486257001L, 5767857005L, 2050605943, 5733778298L,
    1914725244, 5872583558L, 5604377483L, 1253491601, 5554922386L, 3170223002L, 5662737311L, 3217179555L, 1538163622,
    5304533928L, 5644198830L, 1896650227, 5298774966L, 2795873213L, 1834378177, 5769651141L, 2656256971L, 5876433869L,
    1826792401, 3002246100L, 3082519511L, 5780366296L, 5704696797L, 5204108258L, 2090615793, 1739746131, 1378010100,
    5741331445L, 2376442895L, 3638486041L, 5781365789L, 1827234850, 5703214121L, 1855398955, 1227908142, 5703820334L,
]

SinaSpider/Sina_spider2/Sina_spider2/commands/ (directory)

SinaSpider/Sina_spider2/Sina_spider2/commands/crawlall.py (file)

from scrapy.commands import ScrapyCommand
from scrapy.exceptions import UsageError
from scrapy.utils.conf import arglist_to_dict


class Command(ScrapyCommand):
    requires_project = True

    def syntax(self):
        return '[options]'

    def short_desc(self):
        return 'Runs all of the spiders'

    def add_options(self, parser):
        ScrapyCommand.add_options(self, parser)
        parser.add_option("-a", dest="spargs", action="append", default=[], metavar="NAME=VALUE",
                          help="set spider argument (may be repeated)")
        parser.add_option("-o", "--output", metavar="FILE",
                          help="dump scraped items into FILE (use - for stdout)")
        parser.add_option("-t", "--output-format", metavar="FORMAT",
                          help="format to use for dumping items with -o")

    def process_options(self, args, opts):
        ScrapyCommand.process_options(self, args, opts)
        try:
            opts.spargs = arglist_to_dict(opts.spargs)
        except ValueError:
            raise UsageError("Invalid -a value, use -a NAME=VALUE", print_help=False)

    def run(self, args, opts):
        # settings = get_project_settings()

        spider_loader = self.crawler_process.spider_loader
        for spidername in args or spider_loader.list():
            print "crawlall spidername:  " + spidername
            self.crawler_process.crawl(spidername, **opts.spargs)

        self.crawler_process.start()
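
For Scrapy to load this command, the commands/ directory must be an importable package (an empty __init__.py alongside crawlall.py), and settings.py points at it via COMMANDS_MODULE = 'Sina_spider2.commands'; Begin.py then simply invokes scrapy crawlall. The command itself is just a loop over the spider loader, so if you prefer not to register a custom command, roughly the same effect can be had from a plain script using Scrapy's public API. This is an alternative sketch, not part of the repository:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Run both spiders in a single process, equivalent in spirit to "scrapy crawlall".
process = CrawlerProcess(get_project_settings())
process.crawl("informationSpider")
process.crawl("tweetsSpider")
process.start()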

SinaSpider/Sina_spider2/Sina_spider2/spiders/ (directory)

SinaSpider/Sina_spider2/Sina_spider2/spiders/informationSpider.py (file)

# encoding=utf-8
import re
import datetime
import requests
from lxml import etree
from scrapy_redis.spiders import RedisSpider
from Sina_spider2.weiboID import weiboID
from scrapy.selector import Selector
from scrapy.http import Request
from Sina_spider2.items import InformationItem


class Spider(RedisSpider):
    name = "informationSpider"
    host = "http://weibo.cn"
    redis_key = "informationSpider:start_urls"
    start_urls = []
    for ID in weiboID:
        url = "http://weibo.cn/%s/info" % ID
        start_urls.append(url)

    def start_requests(self):
        for url in self.start_urls:
            yield Request(url=url, callback=self.parse)

    def parse(self, response):
        informationItems = InformationItem()
        selector = Selector(response)
        ID = re.findall('weibo\.cn/(\d+)', response.url)[0]
        text1 = ";".join(selector.xpath('body/div[@class="c"]/text()').extract())  # 获取标签里的所有text()
        nickname = re.findall(u'\u6635\u79f0[:|\uff1a](.*?);', text1)  # 昵称
        gender = re.findall(u'\u6027\u522b[:|\uff1a](.*?);', text1)  # 性别
        place = re.findall(u'\u5730\u533a[:|\uff1a](.*?);', text1)  # 地区(包括省份和城市)
        signature = re.findall(u'\u7b80\u4ecb[:|\uff1a](.*?);', text1)  # 个性签名
        birthday = re.findall(u'\u751f\u65e5[:|\uff1a](.*?);', text1)  # 生日
        sexorientation = re.findall(u'\u6027\u53d6\u5411[:|\uff1a](.*?);', text1)  # 性取向
        marriage = re.findall(u'\u611f\u60c5\u72b6\u51b5[:|\uff1a](.*?);', text1)  # 婚姻状况
        url = re.findall(u'\u4e92\u8054\u7f51[:|\uff1a](.*?);', text1)  # 首页链接

        informationItems["_id"] = ID
        if nickname:
            informationItems["NickName"] = nickname[0]
        if gender:
            informationItems["Gender"] = gender[0]
        if place:
            place = place[0].split(" ")
            informationItems["Province"] = place[0]
            if len(place) > 1:
                informationItems["City"] = place[1]
        if signature:
            informationItems["Signature"] = signature[0]
        if birthday:
            try:
                birthday = datetime.datetime.strptime(birthday[0], "%Y-%m-%d")
                informationItems["Birthday"] = birthday - datetime.timedelta(hours=8)
            except Exception:
                pass
        if sexorientation and gender:
            if sexorientation[0] == gender[0]:
                informationItems["Sex_Orientation"] = "gay"
            else:
                informationItems["Sex_Orientation"] = "Heterosexual"
        if marriage:
            informationItems["Marriage"] = marriage[0]
        if url:
            informationItems["URL"] = url[0]

        urlothers = "http://weibo.cn/attgroup/opening?uid=%s" % ID
        r = requests.get(urlothers, cookies=response.request.cookies)
        if r.status_code == 200:
            selector = etree.HTML(r.content)
            texts = ";".join(selector.xpath('//body//div[@class="tip2"]/a//text()'))
            if texts:
                num_tweets = re.findall(u'\u5fae\u535a\[(\d+)\]', texts)  # number of tweets
                num_follows = re.findall(u'\u5173\u6ce8\[(\d+)\]', texts)  # number of accounts followed
                num_fans = re.findall(u'\u7c89\u4e1d\[(\d+)\]', texts)  # number of followers
                if num_tweets:
                    informationItems["Num_Tweets"] = int(num_tweets[0])
                if num_follows:
                    informationItems["Num_Follows"] = int(num_follows[0])
                if num_fans:
                    informationItems["Num_Fans"] = int(num_fans[0])
        yield informationItems

        urlFollows = "http://weibo.cn/%s/follow" % ID  # 爬第一页的关注,加入待爬队列
        idFollows = self.getNextID(urlFollows, response.request.cookies)
        for ID in idFollows:
            url = "http://weibo.cn/%s/profile?filter=1&page=1" % ID
            yield Request(url=url, callback=self.parse)

    def getNextID(self, url, cookies):
        """ 打开url爬取里面的个人ID """
        IDs = []
        r = requests.get(url=url, cookies=cookies)
        if r.status_code == 200:
            selector = etree.HTML(r.content)
            texts = selector.xpath(
                u'body//table/tr/td/a[text()="\u5173\u6ce8\u4ed6" or text()="\u5173\u6ce8\u5979"]/@href')
            IDs = re.findall('uid=(\d+)', ";".join(texts), re.S)
        return IDs

SinaSpider/Sina_spider2/Sina_spider2/spiders/tweetsSpider.py (file)

# encoding=utf-8
import re
import requests
from lxml import etree
from scrapy_redis.spiders import RedisSpider
from Sina_spider2.weiboID import weiboID
from scrapy.selector import Selector
from scrapy.http import Request
from Sina_spider2.items import TweetsItem


class Spider(RedisSpider):
    name = "tweetsSpider"
    host = "http://weibo.cn"
    redis_key = "tweetsSpider:start_urls"
    start_urls = []
    for ID in weiboID:
        url = "http://weibo.cn/%s/profile?filter=1&page=1" % ID
        start_urls.append(url)

    def start_requests(self):
        for url in self.start_urls:
            yield Request(url=url, callback=self.parse)

    def parse(self, response):
        """ 抓取微博数据 """
        selector = Selector(response)
        ID = re.findall('weibo\.cn/(\d+)', response.url)[0]
        tweets = selector.xpath('body/div[@class="c" and @id]')
        for tweet in tweets:
            tweetsItems = TweetsItem()
            id = tweet.xpath('@id').extract_first()  # tweet ID
            content = tweet.xpath('div/span[@class="ctt"]/text()').extract_first()  # tweet content
            cooridinates = tweet.xpath('div/a/@href').extract_first()  # geo coordinates
            like = re.findall(u'\u8d5e\[(\d+)\]', tweet.extract())  # like count
            transfer = re.findall(u'\u8f6c\u53d1\[(\d+)\]', tweet.extract())  # repost count
            comment = re.findall(u'\u8bc4\u8bba\[(\d+)\]', tweet.extract())  # comment count
            others = tweet.xpath('div/span[@class="ct"]/text()').extract_first()  # publication time and client/platform

            tweetsItems["_id"] = ID + "-" + id
            tweetsItems["ID"] = ID
            if content:
                tweetsItems["Content"] = content.strip(u"[\u4f4d\u7f6e]")  # 去掉最后的"[位置]"
            if cooridinates:
                cooridinates = re.findall('center=([\d|.|,]+)', cooridinates)
                if cooridinates:
                    tweetsItems["Co_oridinates"] = cooridinates[0]
            if like:
                tweetsItems["Like"] = int(like[0])
            if transfer:
                tweetsItems["Transfer"] = int(transfer[0])
            if comment:
                tweetsItems["Comment"] = int(comment[0])
            if others:
                others = others.split(u"\u6765\u81ea")  # split on "来自" ("posted from") to separate time and client
                tweetsItems["PubTime"] = others[0]
                if len(others) == 2:
                    tweetsItems["Tools"] = others[1]
            yield tweetsItems
        url_next = selector.xpath(
            u'body/div[@class="pa" and @id="pagelist"]/form/div/a[text()="\u4e0b\u9875"]/@href').extract()
        if url_next:
            yield Request(url=self.host + url_next[0], callback=self.parse)
        else:  # no next page means this user's tweets are finished; next, crawl the first page of their follow list and enqueue those IDs
            urlFollows = "http://weibo.cn/%s/follow" % ID
            idFollows = self.getNextID(urlFollows, response.request.cookies)
            for ID in idFollows:
                url = "http://weibo.cn/%s/profile?filter=1&page=1" % ID
                yield Request(url=url, callback=self.parse)

    def getNextID(self, url, cookies):
        """ 打开url爬取里面的个人ID """
        IDs = []
        r = requests.get(url=url, cookies=cookies)
        if r.status_code == 200:
            selector = etree.HTML(r.content)
            texts = selector.xpath(
                u'body//table/tr/td/a[text()="\u5173\u6ce8\u4ed6" or text()="\u5173\u6ce8\u5979"]/@href')
            IDs = re.findall('uid=(\d+)', ";".join(texts), re.S)
        return IDs

Usage notes:

  • Install Scrapy, scrapy-redis, pymongo, requests, and lxml for Python (json and base64 are part of the standard library).
  • The Master machine only needs Redis installed (give it plenty of memory); each Slaver machine needs a Python environment plus MongoDB to store the data. If you want all data on one machine, just change the MongoDB host in the pipeline, or better yet set up a MongoDB cluster. Redis and MongoDB work as installed; no extra configuration is needed.
  • Add the Weibo accounts and passwords you will log in with to cookies.py; two accounts are already in there as a format reference.
  • You can adjust the Scrapy settings as needed, e.g. the download delay, log level, Redis host, and so on.
  • Once everything is configured, run Begin.py. To repeat: the Master machine does not run the crawler; its job is task scheduling via Redis. The Slaver machines run the spiders. To add a new Slaver, just set up the Python environment and MongoDB on it, copy the code over, and run it. The start-URL queues can also be seeded centrally through Redis, as in the sketch after this list.
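
Because both spiders inherit from scrapy-redis's RedisSpider, the seeds do not have to live only in the hard-coded weiboID list: URLs pushed onto the Redis keys the spiders listen on (informationSpider:start_urls and tweetsSpider:start_urls) are picked up as well. Below is a hedged sketch using redis-py, run from the project directory against the Master's Redis as configured in settings.py; the script itself is not part of the repository.

import redis
from Sina_spider2.weiboID import weiboID

# The Master's Redis, matching REDIS_HOST / REDIS_PORT in settings.py.
r = redis.StrictRedis(host="192.168.1.199", port=6379)

for uid in weiboID:
    # These keys are the redis_key values defined in the two spiders.
    r.lpush("informationSpider:start_urls", "http://weibo.cn/%s/info" % uid)
    r.lpush("tweetsSpider:start_urls", "http://weibo.cn/%s/profile?filter=1&page=1" % uid)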
