The previous post explained that the mobile site is easier to scrape, so this post shares a Sina Weibo mobile-site crawler. As for the desktop version: I'm still using it myself, so I won't share that one for now.
# coding: utf-8
'''
Collect Sina Weibo posts by keyword
'''
import urllib
import time
import datetime
import random
from lxml import etree
import requests
class CollectData():
    def __init__(self, keyword, startTime, page):
        # Fixed part of the search URL. weibo.cn is the mobile site; the
        # desktop equivalents would be "http://s.weibo.com/weibo/" or
        # "http://s.weibo.com/wb/".
        self.begin_url_per = "http://weibo.cn/search/mblog?hideSearchFrame=&"
        self.setKeyword(keyword)            # set the search keyword
        self.setStartTimescope(startTime)   # set the search start time
        self.setpage(page)                  # set the number of pages to crawl
    ## Set the keyword.
    ## raw_input on a Chinese Windows console yields GBK bytes, so the
    ## keyword is re-encoded to UTF-8 before URL-encoding.
    def setKeyword(self, keyword):
        self.keyword = keyword.decode('GBK').encode("utf-8")
        print 'twice encode:', self.getKeyWord()
    ## Format: yyyy-mm-dd-HH
    def setStartTimescope(self, startTime):
        self.startTime = startTime

    def getKeyWord(self):
        # weibo.cn expects the keyword percent-encoded twice; [3:] strips
        # the leading "kw=" that urlencode adds.
        once = urllib.urlencode({"kw": self.keyword})[3:]
        return urllib.urlencode({"kw": once})[3:]
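    # A quick sanity check of the double encoding in a Python 2 shell
    # (values computed for the keyword u'苹果'; illustrative only):
    #   >>> once = urllib.urlencode({'kw': u'苹果'.encode('utf-8')})[3:]
    #   >>> once
    #   '%E8%8B%B9%E6%9E%9C'
    #   >>> urllib.urlencode({'kw': once})[3:]
    #   '%25E8%258B%25B9%25E6%259E%259C'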
    def setpage(self, page):
        self.page = page

    def getURL(self):
        # starttime and endtime are both set to startTime, so each query
        # covers a single time slot; main() appends the page number.
        return (self.begin_url_per + "keyword=" + self.getKeyWord() +
                "&advancedfilter=1&starttime=" + self.startTime +
                "&endtime=" + self.startTime + "&sort=time&page=")
def main():
    # The headers disguise the request as an ordinary browser -- one of the
    # measures that helps keep the account from being blocked.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0',
               'Accept': 'application/json, text/javascript, */*; q=0.01',
               'Accept-Language': "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
               'Accept-Encoding': "gzip, deflate",
               'Connection': "keep-alive"}
    # Paste the cookies from a logged-in session here.
    cook = {"Cookie": "your cookies after logging in"}
    keyword = raw_input('Enter the keyword:')
    startTime = raw_input('Enter the start time(Format:YYYY-mm-dd-HH):')
    page = raw_input('Enter the page:')
    # Instantiate the collector for the given keyword and start time.
    cd = CollectData(keyword, startTime, page)
    url = cd.getURL()
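    # With the keyword u'苹果' and start time '2015-08-01-10', url would look
    # like this (illustrative; note the double-encoded keyword, line wrapped
    # here for readability):
    #   http://weibo.cn/search/mblog?hideSearchFrame=&keyword=
    #   %25E8%258B%25B9%25E6%259E%259C&advancedfilter=1&starttime=
    #   2015-08-01-10&endtime=2015-08-01-10&sort=time&page=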
    i = 1
    # Raw string so the backslashes in the Windows path are not treated
    # as escape sequences.
    save_path = r'C:\Users\Administrator\Desktop\data2.txt'
    fout = open(save_path, 'a+')   # renamed from io to avoid shadowing the stdlib module
    flag = 0
    # Crawl up to the requested number of pages per time slot.
    while i <= int(cd.page):
        url1 = url + str(i) + '&vt=4'
        # Randomized delay between requests -- another anti-crawler countermeasure.
        sleeptime_one = random.randint(1, 4)
        sleeptime_two = random.randint(3, 7)
        if i % 2 == 0:
            sleeptime = sleeptime_two
        else:
            sleeptime = sleeptime_one
        print 'sleeping ' + str(sleeptime) + ' seconds...'
        time.sleep(sleeptime)
        html = requests.get(url1, cookies=cook, headers=headers).content
        # print html
        selector = etree.HTML(html)
        # XPath pulls out the poster names (<a class="nk">) and the "ct"
        # spans (post time and source). I use XPath here, but BeautifulSoup
        # is also worth recommending (I use it myself these days): XPath is
        # handy on well-formed HTML, while BeautifulSoup's strength is coping
        # with malformed HTML. A sketch of the alternative follows below.
        content = selector.xpath('//a[@class="nk"]/text()|//span[@class="ct"]/text()')
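        # A roughly equivalent BeautifulSoup version -- a minimal sketch,
        # assuming bs4 is installed (pip install beautifulsoup4):
        #   from bs4 import BeautifulSoup
        #   soup = BeautifulSoup(html, 'html.parser')
        #   content = [tag.get_text() for tag in soup.find_all(['a', 'span'])
        #              if tag.get('class') in (['nk'], ['ct'])]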
        print i
        i += 1
        # An empty page means this time slot is exhausted: restart from page 1
        # one hour earlier. startTime has the form YYYY-mm-dd-HH, so plain
        # int() arithmetic would fail -- step back with datetime instead.
        if len(content) == 0:
            i = 1
            t = (datetime.datetime.strptime(cd.startTime, '%Y-%m-%d-%H')
                 - datetime.timedelta(hours=1))
            cd.startTime = t.strftime('%Y-%m-%d-%H')
            url = cd.getURL()
        # Results alternate between a poster name and a "ct" string, so the
        # flag pairs them up: one line per post, name then time/source.
        for each in content:
            print each
            if flag:
                fout.write(' ' + each.encode('utf8') + '\n')
                flag = 0
            else:
                fout.write(' ' + each.encode('utf8'))
                flag = 1
    fout.close()
if __name__ == '__main__':
    main()
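One practical note: requests wants its cookies= argument as a name-to-value
dict, while the browser's developer tools hand you one long Cookie header
string. A small helper to convert it -- a minimal sketch, and both the helper
name and the sample values below are made up for illustration:

# Hypothetical helper: split a raw Cookie header string into the dict
# that requests' cookies= parameter expects.
def cookie_str_to_dict(raw):
    return dict(pair.split('=', 1) for pair in raw.split('; ') if '=' in pair)

# Usage (made-up values):
#   cook = cookie_str_to_dict('SUB=_2A25...; SUHB=0bt...; _T_WM=1234')
#   requests.get(url1, cookies=cook, headers=headers)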