SinaSpider/Sina_spider2/scrapy.cfg file
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
[settings]
default = Sina_spider2.settings
[deploy]
#url = http://localhost:6800/
project = Sina_spider2
SinaSpider/Sina_spider2/Begin.py file
from scrapy import cmdline
cmdline.execute("scrapy crawlall".split())
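Begin.py just invokes the custom crawlall command (defined under commands/ and registered via COMMANDS_MODULE in settings.py), which starts both spiders in one process. To run a single spider instead, a one-line variant like the following should work:
from scrapy import cmdline
# Run only the profile spider; "scrapy crawl tweetsSpider" works the same way.
cmdline.execute("scrapy crawl informationSpider".split())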
SinaSpider/Sina_spider2/Sina_spider2/cookies.py file
# encoding=utf-8
import json
import base64
import requests
"""
输入你的微博账号和密码,可去淘宝买,一元七个。
建议买几十个,微博反扒的厉害,太频繁了会出现302转移。
或者你也可以把时间间隔调大点。
"""
myWeiBo = [
{'no': 'jiadieyuso3319@163.com', 'psw': 'a123456'},
{'no': 'shudieful3618@163.com', 'psw': 'a123456'},
]
def getCookies(weibo):
    """ Log in with each account and collect its cookies. """
cookies = []
loginURL = r'https://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.15)'
for elem in weibo:
account = elem['no']
password = elem['psw']
        username = base64.b64encode(account.encode('utf-8')).decode('utf-8')  # the SSO form expects the account base64-encoded in "su"
postData = {
"entry": "sso",
"gateway": "1",
"from": "null",
"savestate": "30",
"useticket": "0",
"pagerefer": "",
"vsnf": "1",
"su": username,
"service": "sso",
"sp": password,
"sr": "1440*900",
"encoding": "UTF-8",
"cdult": "3",
"domain": "sina.com.cn",
"prelt": "0",
"returntype": "TEXT",
}
session = requests.Session()
r = session.post(loginURL, data=postData)
        jsonStr = r.content.decode('gbk')  # the login endpoint responds in GBK
info = json.loads(jsonStr)
if info["retcode"] == "0":
print "Get Cookie Success!( Account:%s )" % account
cookie = session.cookies.get_dict()
cookies.append(cookie)
else:
print "Failed!( Reason:%s )" % info['reason']
return cookies
cookies = getCookies(myWeiBo)  # runs once at import time, when CookiesMiddleware imports this module
print "Get Cookies Finish!( Num:%d)" % len(cookies)
SinaSpider/Sina_spider2/Sina_spider2/items.py file
# -*- coding: utf-8 -*-
from scrapy import Item, Field
class InformationItem(Item):
    """ Profile information """
    _id = Field()  # user ID
    NickName = Field()  # nickname
    Gender = Field()  # gender
    Province = Field()  # province
    City = Field()  # city
    Signature = Field()  # bio / signature
    Birthday = Field()  # birthday
    Num_Tweets = Field()  # number of tweets
    Num_Follows = Field()  # number of follows
    Num_Fans = Field()  # number of followers
    Sex_Orientation = Field()  # sexual orientation
    Marriage = Field()  # marital status
    URL = Field()  # homepage link
class TweetsItem(Item):
    """ Tweet information """
    _id = Field()  # user ID-tweet ID
    ID = Field()  # user ID
    Content = Field()  # tweet text
    PubTime = Field()  # publish time
    Co_oridinates = Field()  # geolocation coordinates
    Tools = Field()  # posting client/platform
    Like = Field()  # number of likes
    Comment = Field()  # number of comments
    Transfer = Field()  # number of reposts
SinaSpider/Sina_spider2/Sina_spider2/middleware.py file
# encoding=utf-8
import random
from user_agents import agents
from cookies import cookies  # importing this module triggers the account logins in cookies.py
class UserAgentMiddleware(object):
    """ Attach a randomly chosen User-Agent to each request """
def process_request(self, request, spider):
agent = random.choice(agents)
request.headers["User-Agent"] = agent
class CookiesMiddleware(object):
    """ Attach a randomly chosen account's cookies to each request """
def process_request(self, request, spider):
cookie = random.choice(cookies)
request.cookies = cookie
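These middlewares take effect through the priorities assigned in settings.py below. A quick interactive check of the User-Agent rotation, assuming the project package is importable (note that importing middleware also triggers the account logins in cookies.py):
from scrapy.http import Request
from Sina_spider2.middleware import UserAgentMiddleware

mw = UserAgentMiddleware()
req = Request("http://weibo.cn")
mw.process_request(req, spider=None)  # the spider argument is unused here
print(req.headers["User-Agent"])  # one of the agents from user_agents.py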
SinaSpider/Sina_spider2/Sina_spider2/pipelines.py file
# -*- coding: utf-8 -*-
import pymongo
from items import InformationItem, TweetsItem
class MongoDBPipeline(object):
    def __init__(self):
        client = pymongo.MongoClient("localhost", 27017)
        db = client["Sina"]
        self.Information = db["Information"]
        self.Tweets = db["Tweets"]
        self.Follows = db["Follows"]
        self.Fans = db["Fans"]
    def process_item(self, item, spider):
        """ Check the item type and insert it into the matching collection. """
        if isinstance(item, InformationItem):
            try:
                self.Information.insert(dict(item))
            except Exception:
                pass  # most commonly a duplicate _id, i.e. the profile is already stored
        elif isinstance(item, TweetsItem):
            try:
                self.Tweets.insert(dict(item))
            except Exception:
                pass  # most commonly a duplicate _id, i.e. the tweet is already stored
        return item
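One caveat: Collection.insert() was deprecated in pymongo 3.0 and removed in 4.0. On a newer pymongo, the same dedup-by-_id behavior can be sketched with insert_one(), catching only the duplicate-key case (collapsing the two branches is safe here because only two item types exist):
from pymongo.errors import DuplicateKeyError  # add at the top of pipelines.py

def process_item(self, item, spider):  # drop-in replacement for MongoDBPipeline.process_item
    collection = self.Information if isinstance(item, InformationItem) else self.Tweets
    try:
        collection.insert_one(dict(item))
    except DuplicateKeyError:
        pass  # this _id is already stored; skip the duplicate
    return item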
SinaSpider/Sina_spider2/Sina_spider2/settings.py file
# -*- coding: utf-8 -*-
BOT_NAME = 'Sina_spider2'  # must be a single string; the spider names are set in their own classes
SPIDER_MODULES = ['Sina_spider2.spiders']
NEWSPIDER_MODULE = 'Sina_spider2.spiders'
DOWNLOADER_MIDDLEWARES = {
"Sina_spider2.middleware.UserAgentMiddleware": 401,
"Sina_spider2.middleware.CookiesMiddleware": 402,
}
ITEM_PIPELINES = {"Sina_spider2.pipelines.MongoDBPipeline": 300}
SCHEDULER = 'scrapy_redis.scheduler.Scheduler'
SCHEDULER_PERSIST = True
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.SpiderPriorityQueue'
REDIS_URL = None  # e.g. 'redis://192.168.1.199:6379'; overrides REDIS_HOST/REDIS_PORT when set
REDIS_HOST = '192.168.1.199'
REDIS_PORT = 6379
DOWNLOAD_DELAY = 2  # delay between requests, in seconds
COMMANDS_MODULE = 'Sina_spider2.commands'
# LOG_LEVEL = 'INFO'  # log level
# CONCURRENT_REQUESTS = 1
# CONCURRENT_ITEMS = 1
# CONCURRENT_REQUESTS_PER_IP = 1
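Because both spiders are RedisSpiders, the crawl can also be seeded (or re-seeded) from outside by pushing URLs onto the lists named by each spider's redis_key. A minimal sketch using the redis-py client, reusing the host and port from the settings above:
import redis

r = redis.StrictRedis(host="192.168.1.199", port=6379)
# The keys match redis_key in the two spider classes below.
r.lpush("informationSpider:start_urls", "http://weibo.cn/1797054534/info")
r.lpush("tweetsSpider:start_urls", "http://weibo.cn/1797054534/profile?filter=1&page=1")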
SinaSpider/Sina_spider2/Sina_spider2/user_agents.py file
# encoding=utf-8
""" User-Agents """
agents = [
"Mozilla/5.0 (Linux; U; Android 2.3.6; en-us; Nexus S Build/GRK39F) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
"Avant Browser/1.2.789rel1 (http://www.avantbrowser.com)",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5",
"Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.9 (KHTML, like Gecko) Chrome/5.0.310.0 Safari/532.9",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.514.0 Safari/534.7",
"Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/9.0.601.0 Safari/534.14",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/10.0.601.0 Safari/534.14",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.20 (KHTML, like Gecko) Chrome/11.0.672.2 Safari/534.20",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.27 (KHTML, like Gecko) Chrome/12.0.712.0 Safari/534.27",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.24 Safari/535.1",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.120 Safari/535.2",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7",
"Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.10) Gecko/2009042316 Firefox/3.0.10",
"Mozilla/5.0 (Windows; U; Windows NT 6.0; en-GB; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11 (.NET CLR 3.5.30729)",
"Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 GTB5",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; tr; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8 ( .NET CLR 3.5.30729; .NET4.0E)",
"Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
"Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
"Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0",
"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0a2) Gecko/20110622 Firefox/6.0a2",
"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0.1) Gecko/20100101 Firefox/7.0.1",
"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0b4pre) Gecko/20100815 Minefield/4.0b4pre",
"Mozilla/4.0 (compatible; MSIE 5.5; Windows NT 5.0 )",
"Mozilla/4.0 (compatible; MSIE 5.5; Windows 98; Win 9x 4.90)",
"Mozilla/5.0 (Windows; U; Windows XP) Gecko MultiZilla/1.6.1.0a",
"Mozilla/2.02E (Win95; U)",
"Mozilla/3.01Gold (Win95; I)",
"Mozilla/4.8 [en] (Windows NT 5.1; U)",
"Mozilla/5.0 (Windows; U; Win98; en-US; rv:1.4) Gecko Netscape/7.1 (ax)",
"HTC_Dream Mozilla/5.0 (Linux; U; Android 1.5; en-ca; Build/CUPCAKE) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
"Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.2; U; de-DE) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/234.40.1 Safari/534.6 TouchPad/1.0",
"Mozilla/5.0 (Linux; U; Android 1.5; en-us; sdk Build/CUPCAKE) AppleWebkit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
"Mozilla/5.0 (Linux; U; Android 2.1; en-us; Nexus One Build/ERD62) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
"Mozilla/5.0 (Linux; U; Android 2.2; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
"Mozilla/5.0 (Linux; U; Android 1.5; en-us; htc_bahamas Build/CRB17) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
"Mozilla/5.0 (Linux; U; Android 2.1-update1; de-de; HTC Desire 1.19.161.5 Build/ERE27) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
"Mozilla/5.0 (Linux; U; Android 2.2; en-us; Sprint APA9292KT Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
"Mozilla/5.0 (Linux; U; Android 1.5; de-ch; HTC Hero Build/CUPCAKE) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
"Mozilla/5.0 (Linux; U; Android 2.2; en-us; ADR6300 Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
"Mozilla/5.0 (Linux; U; Android 2.1; en-us; HTC Legend Build/cupcake) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
"Mozilla/5.0 (Linux; U; Android 1.5; de-de; HTC Magic Build/PLAT-RC33) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1 FirePHP/0.3",
"Mozilla/5.0 (Linux; U; Android 1.6; en-us; HTC_TATTOO_A3288 Build/DRC79) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
"Mozilla/5.0 (Linux; U; Android 1.0; en-us; dream) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2",
"Mozilla/5.0 (Linux; U; Android 1.5; en-us; T-Mobile G1 Build/CRB43) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari 525.20.1",
"Mozilla/5.0 (Linux; U; Android 1.5; en-gb; T-Mobile_G2_Touch Build/CUPCAKE) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
"Mozilla/5.0 (Linux; U; Android 2.0; en-us; Droid Build/ESD20) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
"Mozilla/5.0 (Linux; U; Android 2.2; en-us; Droid Build/FRG22D) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
"Mozilla/5.0 (Linux; U; Android 2.0; en-us; Milestone Build/ SHOLS_U2_01.03.1) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
"Mozilla/5.0 (Linux; U; Android 2.0.1; de-de; Milestone Build/SHOLS_U2_01.14.0) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
"Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2",
"Mozilla/5.0 (Linux; U; Android 0.5; en-us) AppleWebKit/522 (KHTML, like Gecko) Safari/419.3",
"Mozilla/5.0 (Linux; U; Android 1.1; en-gb; dream) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2",
"Mozilla/5.0 (Linux; U; Android 2.0; en-us; Droid Build/ESD20) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
"Mozilla/5.0 (Linux; U; Android 2.1; en-us; Nexus One Build/ERD62) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
"Mozilla/5.0 (Linux; U; Android 2.2; en-us; Sprint APA9292KT Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
"Mozilla/5.0 (Linux; U; Android 2.2; en-us; ADR6300 Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
"Mozilla/5.0 (Linux; U; Android 2.2; en-ca; GT-P1000M Build/FROYO) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
"Mozilla/5.0 (Linux; U; Android 3.0.1; fr-fr; A500 Build/HRI66) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
"Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2",
"Mozilla/5.0 (Linux; U; Android 1.6; es-es; SonyEricssonX10i Build/R1FA016) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
"Mozilla/5.0 (Linux; U; Android 1.6; en-us; SonyEricssonX10i Build/R1AA056) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
]
SinaSpider/Sina_spider2/Sina_spider2/weiboID.py file
# encoding=utf-8
""" Initial seed queue of Weibo user IDs (the trailing L marks Python 2 long literals). """
weiboID = [
1797054534, 2509414473L, 2611478681L, 5861859392L, 2011086863, 5127716917L, 1259110474, 5850775634L, 1886437464,
3187474530L, 2191982701L, 1940562032, 5874450550L, 1337925752, 2081079420, 5664530558L, 3493173952L, 1202806915,
1864507535, 2032640064, 5585682587L, 3083673764L, 5342109866L, 5878685868L, 5728706733L, 2103050415, 5876752562L,
3138085045L, 5775974583L, 1879400644, 2417139911L, 5836619975L, 5353816265L, 5219508427L, 1766613205, 2480158031L,
5660754163L, 2456764664L, 3637354755L, 1940087047, 5508473104L, 1004454162, 2930327837L, 1874608417, 5379621155L,
1720664360, 2714280233L, 3769073964L, 5624119596L, 2754904375L, 5710151998L, 5331042630L, 5748179271L, 2146132305,
2313896275L, 3193618787L, 5743059299L, 1742930277, 5310538088L, 1794474362, 2798510462L, 3480076671L, 5678653833L,
5743657357L, 5460191980L, 1734164880, 5876988653L, 5678031258L, 5860163996L, 1496924574, 5878970110L, 1679704482,
1142210982, 3628925351L, 1196397981, 1747485107, 5675893172L, 5438521785L, 2192269762L, 1992614343, 5878686155L,
2407186895L, 5559116241L, 2528477652L, 1295950295, 5038203354L, 3659276765L, 2126733792, 5878350307L, 2761179623L,
5484511719L, 5825708520L, 1578230251, 5878686190L, 5810946551L, 3833070073L, 1795047931, 5855789570L, 3580125714L,
5709578773L, 5236539926L, 2907633071L, 1709244961, 5405450788L, 3251257895L, 5054538290L, 2713199161L, 5698445883L,
1784537661, 3195290182L, 1824506454, 5738766939L, 5565915740L, 5336031840L, 5098775138L, 5685568105L, 1774289524,
2932662914L, 5433223957L, 2680044311L, 1111523983, 5067889432L, 5878686362L, 2844992161L, 3878314663L, 1766548141,
5763269297L, 5878383287L, 5235499706L, 5876375670L, 5866447563L, 5129945819L, 1704116960, 1929380581, 1223762662,
1193476843, 2899591923L, 5162099453L, 5072151301L, 5385741066L, 5411455765L, 2685535005L, 2297905950L, 1216766752,
5838668577L, 5359133478L, 3077460103L, 5577802539L, 5862392623L, 1786700611, 1259258694, 1845191497, 1731838797,
1740301135, 2816074584L, 1217733467, 5345035105L, 5050827618L, 5486257001L, 5767857005L, 2050605943, 5733778298L,
1914725244, 5872583558L, 5604377483L, 1253491601, 5554922386L, 3170223002L, 5662737311L, 3217179555L, 1538163622,
5304533928L, 5644198830L, 1896650227, 5298774966L, 2795873213L, 1834378177, 5769651141L, 2656256971L, 5876433869L,
1826792401, 3002246100L, 3082519511L, 5780366296L, 5704696797L, 5204108258L, 2090615793, 1739746131, 1378010100,
5741331445L, 2376442895L, 3638486041L, 5781365789L, 1827234850, 5703214121L, 1855398955, 1227908142, 5703820334L,
]
SinaSpider/Sina_spider2/Sina_spider2/commands/ folder
SinaSpider/Sina_spider2/Sina_spider2/commands/crawlall.py file
from scrapy.commands import ScrapyCommand
from scrapy.exceptions import UsageError  # needed by process_options below
from scrapy.utils.conf import arglist_to_dict
class Command(ScrapyCommand):
requires_project = True
def syntax(self):
return '[options]'
def short_desc(self):
return 'Runs all of the spiders'
def add_options(self, parser):
ScrapyCommand.add_options(self, parser)
parser.add_option("-a", dest="spargs", action="append", default=[], metavar="NAME=VALUE",
help="set spider argument (may be repeated)")
parser.add_option("-o", "--output", metavar="FILE",
help="dump scraped items into FILE (use - for stdout)")
parser.add_option("-t", "--output-format", metavar="FORMAT",
help="format to use for dumping items with -o")
def process_options(self, args, opts):
ScrapyCommand.process_options(self, args, opts)
try:
opts.spargs = arglist_to_dict(opts.spargs)
except ValueError:
raise UsageError("Invalid -a value, use -a NAME=VALUE", print_help=False)
def run(self, args, opts):
# settings = get_project_settings()
spider_loader = self.crawler_process.spider_loader
for spidername in args or spider_loader.list():
print "crawlall spidername: " + spidername
self.crawler_process.crawl(spidername, **opts.spargs)
self.crawler_process.start()
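For Scrapy to expose this class as scrapy crawlall, the commands/ directory must be an importable package named by COMMANDS_MODULE, and the command takes its name from the module's file name. The assumed layout:
# Sina_spider2/
#     settings.py        # sets COMMANDS_MODULE = 'Sina_spider2.commands'
#     commands/
#         __init__.py    # empty; makes commands/ a package
#         crawlall.py    # the Command class above; file name == command name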
SinaSpider/Sina_spider2/Sina_spider2/spiders/ folder
SinaSpider/Sina_spider2/Sina_spider2/spiders/informationSpider.py file
# encoding=utf-8
import re
import datetime
import requests
from lxml import etree
from scrapy_redis.spiders import RedisSpider
from Sina_spider2.weiboID import weiboID
from scrapy.selector import Selector
from scrapy.http import Request
from Sina_spider2.items import InformationItem
class Spider(RedisSpider):
name = "informationSpider"
host = "http://weibo.cn"
redis_key = "informationSpider:start_urls"
start_urls = []
for ID in weiboID:
        url = "http://weibo.cn/%s/info" % ID
start_urls.append(url)
def start_requests(self):
for url in self.start_urls:
yield Request(url=url, callback=self.parse)
def parse(self, response):
informationItems = InformationItem()
selector = Selector(response)
ID = re.findall('weibo\.cn/(\d+)', response.url)[0]
text1 = ";".join(selector.xpath('body/div[@class="c"]/text()').extract()) # 获取标签里的所有text()
nickname = re.findall(u'\u6635\u79f0[:|\uff1a](.*?);', text1) # 昵称
gender = re.findall(u'\u6027\u522b[:|\uff1a](.*?);', text1) # 性别
place = re.findall(u'\u5730\u533a[:|\uff1a](.*?);', text1) # 地区(包括省份和城市)
signature = re.findall(u'\u7b80\u4ecb[:|\uff1a](.*?);', text1) # 个性签名
birthday = re.findall(u'\u751f\u65e5[:|\uff1a](.*?);', text1) # 生日
sexorientation = re.findall(u'\u6027\u53d6\u5411[:|\uff1a](.*?);', text1) # 性取向
marriage = re.findall(u'\u611f\u60c5\u72b6\u51b5[:|\uff1a](.*?);', text1) # 婚姻状况
url = re.findall(u'\u4e92\u8054\u7f51[:|\uff1a](.*?);', text1) # 首页链接
informationItems["_id"] = ID
if nickname:
informationItems["NickName"] = nickname[0]
if gender:
informationItems["Gender"] = gender[0]
if place:
place = place[0].split(" ")
informationItems["Province"] = place[0]
if len(place) > 1:
informationItems["City"] = place[1]
if signature:
informationItems["Signature"] = signature[0]
        if birthday:
            try:
                birthday = datetime.datetime.strptime(birthday[0], "%Y-%m-%d")
                informationItems["Birthday"] = birthday - datetime.timedelta(hours=8)  # shift UTC+8 (Beijing time) to UTC
            except Exception:
                pass  # non-date values (e.g. just a zodiac sign) do not parse
        if sexorientation and gender:  # gender may be absent, so guard the comparison
            if sexorientation[0] == gender[0]:
                informationItems["Sex_Orientation"] = "gay"
            else:
                informationItems["Sex_Orientation"] = "Heterosexual"
if marriage:
informationItems["Marriage"] = marriage[0]
if url:
informationItems["URL"] = url[0]
        urlothers = "http://weibo.cn/attgroup/opening?uid=%s" % ID  # page that lists the tweet/follow/fan counts
        r = requests.get(urlothers, cookies=response.request.cookies)
        if r.status_code == 200:
            selector = etree.HTML(r.content)
            texts = ";".join(selector.xpath('//body//div[@class="tip2"]/a//text()'))
            if texts:
                num_tweets = re.findall(u'\u5fae\u535a\[(\d+)\]', texts)  # number of tweets
                num_follows = re.findall(u'\u5173\u6ce8\[(\d+)\]', texts)  # number of follows
                num_fans = re.findall(u'\u7c89\u4e1d\[(\d+)\]', texts)  # number of followers
if num_tweets:
informationItems["Num_Tweets"] = int(num_tweets[0])
if num_follows:
informationItems["Num_Follows"] = int(num_follows[0])
if num_fans:
informationItems["Num_Fans"] = int(num_fans[0])
yield informationItems
        urlFollows = "http://weibo.cn/%s/follow" % ID  # crawl the first page of this user's followees and queue them
        idFollows = self.getNextID(urlFollows, response.request.cookies)
        for ID in idFollows:
            url = "http://weibo.cn/%s/info" % ID  # queue the /info page, which is what parse() expects
            yield Request(url=url, callback=self.parse)
    def getNextID(self, url, cookies):
        """ Fetch the page at url and extract the user IDs linked on it. """
        IDs = []
        r = requests.get(url=url, cookies=cookies)
        if r.status_code == 200:
            selector = etree.HTML(r.content)
            texts = selector.xpath(
                u'body//table/tr/td/a[text()="\u5173\u6ce8\u4ed6" or text()="\u5173\u6ce8\u5979"]/@href')  # "follow him"/"follow her" links
            IDs = re.findall('uid=(\d+)', ";".join(texts), re.S)
        return IDs
SinaSpider/Sina_spider2/Sina_spider2/spiders/tweetsSpider.py file
# encoding=utf-8
import re
import requests
from lxml import etree
from scrapy_redis.spiders import RedisSpider
from Sina_spider2.weiboID import weiboID
from scrapy.selector import Selector
from scrapy.http import Request
from Sina_spider2.items import TweetsItem
class Spider(RedisSpider):
name = "tweetsSpider"
host = "http://weibo.cn"
redis_key = "tweetsSpider:start_urls"
start_urls = []
for ID in weiboID:
url = "http://weibo.cn/%s/profile?filter=1&page=1" % ID
start_urls.append(url)
def start_requests(self):
for url in self.start_urls:
yield Request(url=url, callback=self.parse)
    def parse(self, response):
        """ Parse the tweets on a profile page. """
selector = Selector(response)
ID = re.findall('weibo\.cn/(\d+)', response.url)[0]
tweets = selector.xpath('body/div[@class="c" and @id]')
for tweet in tweets:
tweetsItems = TweetsItem()
            id = tweet.xpath('@id').extract_first()  # tweet ID
            content = tweet.xpath('div/span[@class="ctt"]/text()').extract_first()  # tweet text
            cooridinates = tweet.xpath('div/a/@href').extract_first()  # geolocation link
            like = re.findall(u'\u8d5e\[(\d+)\]', tweet.extract())  # number of likes
            transfer = re.findall(u'\u8f6c\u53d1\[(\d+)\]', tweet.extract())  # number of reposts
            comment = re.findall(u'\u8bc4\u8bba\[(\d+)\]', tweet.extract())  # number of comments
            others = tweet.xpath('div/span[@class="ct"]/text()').extract_first()  # publish time and posting client
tweetsItems["_id"] = ID + "-" + id
tweetsItems["ID"] = ID
            if content:
                tweetsItems["Content"] = re.sub(u'\[\u4f4d\u7f6e\]$', '', content)  # drop the trailing "[位置]" (location) marker; strip() would also eat those characters anywhere at the ends
if cooridinates:
cooridinates = re.findall('center=([\d|.|,]+)', cooridinates)
if cooridinates:
tweetsItems["Co_oridinates"] = cooridinates[0]
if like:
tweetsItems["Like"] = int(like[0])
if transfer:
tweetsItems["Transfer"] = int(transfer[0])
if comment:
tweetsItems["Comment"] = int(comment[0])
            if others:
                others = others.split(u"\u6765\u81ea")  # the field reads "<time> 来自 <client>"
                tweetsItems["PubTime"] = others[0]
                if len(others) == 2:
                    tweetsItems["Tools"] = others[1]
yield tweetsItems
        url_next = selector.xpath(
            u'body/div[@class="pa" and @id="pagelist"]/form/div/a[text()="\u4e0b\u9875"]/@href').extract()  # the "下页" (next page) link
        if url_next:
            yield Request(url=self.host + url_next[0], callback=self.parse)
        else:  # no next page: this user's tweets are done, so crawl the first page of followees and queue them
            urlFollows = "http://weibo.cn/%s/follow" % ID
            idFollows = self.getNextID(urlFollows, response.request.cookies)
            for ID in idFollows:
                url = "http://weibo.cn/%s/profile?filter=1&page=1" % ID
                yield Request(url=url, callback=self.parse)
    def getNextID(self, url, cookies):
        """ Fetch the page at url and extract the user IDs linked on it. """
        IDs = []
        r = requests.get(url=url, cookies=cookies)
        if r.status_code == 200:
            selector = etree.HTML(r.content)
            texts = selector.xpath(
                u'body//table/tr/td/a[text()="\u5173\u6ce8\u4ed6" or text()="\u5173\u6ce8\u5979"]/@href')  # "follow him"/"follow her" links
            IDs = re.findall('uid=(\d+)', ";".join(texts), re.S)
        return IDs
Usage notes:
- Python needs Scrapy, scrapy-redis, pymongo, and requests installed (json and base64 ship with the standard library).
- The master machine only needs Redis installed (give it plenty of memory); each slave machine needs a Python environment and MongoDB to store the data. To store everything on one machine instead, just change the MongoDB address in pipelines.py, or better, set up a MongoDB cluster. Redis and MongoDB work out of the box; no extra configuration is needed.
- Add the Weibo accounts and passwords you log in with to cookies.py; the two accounts already there serve as a format reference.
- The Scrapy settings in settings.py can be adjusted as needed, e.g. the download delay, log level, Redis IP, and so on.
- Once everything is configured, run Begin.py. To restate: the master machine does not run the crawler; it only schedules tasks through Redis. The slaves run the spiders. To add a slave machine, set up the Python environment and MongoDB, copy the code over, and run it.
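For example, to point every slave at one shared MongoDB rather than a local one, only the connection line in pipelines.py needs to change (the address below is a placeholder for your database host):
import pymongo

# In MongoDBPipeline.__init__, replace "localhost" with the shared host:
client = pymongo.MongoClient("192.168.1.100", 27017)
db = client["Sina"]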