My lab recently handed me a task: write a Sina Weibo crawler that fetches people's posts for analysis.

I am about to graduate and am currently interning at a telecom equipment vendor (the pay is painfully low), so the only time I could carve out for this program was in the evenings.

I had planned to use an open-source crawler and tried Nutch and Heritrix first. Nutch was too unstable and kept breaking, while Heritrix has plenty of features but is complex and slow to run. After reading these two blog posts (http://kcclub.kingsoft.com/home.php?mod=space&uid=93&do=blog&id=890) (http://1.wklken.sinaapp.com/?p=177), I decided to simply write a crawler myself.

The program took me six evenings (most of which actually went into learning Python... and tidying up the formatting...), and I am sharing it here. If you hit any problems, please contact me by e-mail (zhengyi.bupt@qq.com) and I will fix them promptly (I do have to hand this in, after all).

How to run it: save all of the code below, open Main.py, set LoginName to your Sina Weibo account and PassWord to your password, then run Main.py. The program creates a CrawledPages folder in the current directory and saves every crawled page there.

Job-hunting season is almost upon me, so I am banking some karma with this post and hoping the offers turn out good and plentiful.
1. Entry script, file Main.py
#!/usr/bin/env python
#coding=utf8
'''Author: Zheng Yi
Email: zhengyi.bupt@qq.com'''

import WeiboCrawl

if __name__ == '__main__':
    weiboLogin = WeiboCrawl.WeiboLogin('LoginName', 'PassWord')
    if weiboLogin.Login() == True:
        print "The WeiboLogin module works well!"

    # start with my blog :)
    webCrawl = WeiboCrawl.WebCrawl('http://weibo.com/yaochen')
    webCrawl.Crawl()
    del webCrawl
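For reference, the WebCrawl constructor defined in WeiboCrawl.py below also accepts a few tuning parameters (maxThreadNum, maxDepth, thLifetime and saveDir). A minimal sketch of a customized Main.py might look like this; the values and the ./MyPages folder are only placeholders, not part of the original code.

#!/usr/bin/env python
#coding=utf8
# Hypothetical variant of Main.py: it only illustrates the optional
# WebCrawl parameters; all values below are examples.
import WeiboCrawl

if __name__ == '__main__':
    weiboLogin = WeiboCrawl.WeiboLogin('LoginName', 'PassWord')
    if weiboLogin.Login() == True:
        # crawl three levels deep with at most 5 download threads, give each
        # thread 20 seconds to join, and save the results into ./MyPages
        webCrawl = WeiboCrawl.WebCrawl('http://weibo.com/yaochen', maxThreadNum = 5,
                                       maxDepth = 3, thLifetime = 20, saveDir = './MyPages')
        webCrawl.Crawl()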
2. Main module, file WeiboCrawl.py
#!/usr/bin/env python
#coding=utf8
'''Author: Zheng Yi
Email: zhengyi.bupt@qq.com'''

import urllib2
import cookielib
import threading
import os
import WeiboEncode
import WeiboSearch
import TextAnalyze

pagesContent = []  # html content of downloaded pages
textContent = []   # main text content of downloaded pages
triedUrl = []      # all tried urls, both failed and successful
toTryUrl = []      # urls to be tried
failedUrl = []     # urls that failed to download

class WeiboLogin:
    "WeiboLogin class is for Weibo login, cookie, etc."

    def __init__(self, user, pwd, enableProxy = False):
        "Constructor of class WeiboLogin."
        print "Initializing WeiboLogin..."
        self.userName = user
        self.passWord = pwd
        self.enableProxy = enableProxy
        print "UserName:", user
        print "Password:", pwd

        self.serverUrl = "http://login.sina.com.cn/sso/prelogin.php?entry=weibo&callback=sinaSSOController.preloginCallBack&su=dW5kZWZpbmVk&client=ssologin.js(v1.3.18)&_=1329806375939"
        self.loginUrl = "http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.1)"
        self.postHeader = {'User-Agent': 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11'}
    def Login(self):
        "Run this function to launch the login process"
        self.EnableCookie(self.enableProxy)

        serverTime, nonce = self.GetServerTime()
        postData = WeiboEncode.PostEncode(self.userName, self.passWord, serverTime, nonce)
        print "Post data length:\n", len(postData)

        req = urllib2.Request(self.loginUrl, postData, self.postHeader)
        print "Posting request..."
        result = urllib2.urlopen(req)
        text = result.read()
        print "Post result page length: ", len(text)

        try:
            loginUrl = WeiboSearch.sRedirectData(text)
            urllib2.urlopen(loginUrl)
        except:
            print 'Login error!'
            return False

        print 'Login success!'
        return True
    def GetServerTime(self):
        "Get server time and nonce, which are used to encode the password"
        print "Getting server time and nonce..."
        serverData = urllib2.urlopen(self.serverUrl).read()
        print serverData

        try:
            serverTime, nonce = WeiboSearch.sServerData(serverData)
            return serverTime, nonce
        except:
            print 'Get server time & nonce error!'
            return None

    def EnableCookie(self, enableProxy):
        "Enable cookie & proxy (if needed)."
        cookiejar = cookielib.LWPCookieJar()
        cookie_support = urllib2.HTTPCookieProcessor(cookiejar)

        if enableProxy:
            proxy_support = urllib2.ProxyHandler({'http': 'http://xxxxx.pac'})
            opener = urllib2.build_opener(proxy_support, cookie_support, urllib2.HTTPHandler)
            print "Proxy enabled"
        else:
            opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)

        urllib2.install_opener(opener)

class WebCrawl:
    "WebCrawl class is for crawling the Weibo"

    def __init__(self, beginUrl, maxThreadNum = 10, maxDepth = 2, thLifetime = 10, saveDir = "." + os.sep + "CrawledPages"):
        "Initialize the class WebCrawl"
        toTryUrl.append(beginUrl)
        self.maxThreadNum = maxThreadNum
        self.saveDir = saveDir
        self.maxDepth = maxDepth
        self.thLifetime = thLifetime

        self.triedPagesNum = 0
        self.threadPool = []

        if not os.path.exists(self.saveDir):
            os.mkdir(self.saveDir)
        self.logFile = open(self.saveDir + os.sep + 'log.txt', 'w')

    def Crawl(self):
        "Run this function to start the crawl process"
        global toTryUrl

        for depth in range(self.maxDepth):
            print 'Searching depth ', depth, '...'
            self.DownloadAll()
            self.UpdateToTry()

    def DownloadAll(self):
        "Download all urls in the current depth"
        global toTryUrl
        iDownloaded = 0

        while iDownloaded < len(toTryUrl):
            iThread = 0
            while iThread < self.maxThreadNum and iDownloaded + iThread < len(toTryUrl):
                iCurrentUrl = iDownloaded + iThread
                pageNum = str(self.triedPagesNum)
                self.DownloadUrl(toTryUrl[iCurrentUrl], pageNum)
                self.triedPagesNum += 1
                iThread += 1
            iDownloaded += iThread

            for th in self.threadPool:
                th.join(self.thLifetime)
            self.threadPool = []

        toTryUrl = []

    def DownloadUrl(self, url, pageNum):
        "Download a single url in a new thread and save the result"
        cTh = CrawlThread(url, self.saveDir, pageNum, self.logFile)
        self.threadPool.append(cTh)
        cTh.start()
    def UpdateToTry(self):
        "Update toTryUrl based on textContent"
        global toTryUrl
        global triedUrl
        global pagesContent
        global textContent

        newUrlList = []
        for textData in textContent:
            newUrlList += WeiboSearch.sUrl(textData)

        toTryUrl = list(set(newUrlList) - set(triedUrl))
        pagesContent = []
        textContent = []

class CrawlThread(threading.Thread):
    "CrawlThread class is derived from threading.Thread, to create a thread."
    thLock = threading.Lock()

    def __init__(self, url, saveDir, pageNum, logFile):
        "Initialize the CrawlThread"
        threading.Thread.__init__(self)
        self.url = url
        self.pageNum = pageNum
        self.fileName = saveDir + os.sep + pageNum + '.htm'
        self.textName = saveDir + os.sep + pageNum + '.txt'
        self.logFile = logFile
        self.logLine = 'File: ' + pageNum + ' Url: ' + url

    def run(self):
        "Override the run() function of threading.Thread"
        global failedUrl
        global triedUrl
        global pagesContent
        global textContent

        try:
            htmlContent = urllib2.urlopen(self.url).read()
            transText = TextAnalyze.textTransfer(htmlContent)

            fOut = open(self.fileName, 'w')
            fOut.write(htmlContent)
            fOut.close()
            tOut = open(self.textName, 'w')
            tOut.write(transText)
            tOut.close()
        except:
            self.thLock.acquire()
            triedUrl.append(self.url)
            failedUrl.append(self.url)
            sFailed = 'Failed!  ' + self.logLine
            print sFailed
            self.logFile.write(sFailed + '\n')
            self.thLock.release()
            return None

        self.thLock.acquire()
        pagesContent.append(htmlContent)
        textContent.append(transText)
        triedUrl.append(self.url)
        sSuccess = 'Success! ' + self.logLine
        print sSuccess
        self.logFile.write(sSuccess + '\n')
        self.thLock.release()
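A side note on EnableCookie(): because it registers the cookie-aware opener with urllib2.install_opener(), every later urllib2.urlopen() call in the same process reuses the login cookies. That means WeiboLogin can also be used on its own, without starting a full crawl. A minimal sketch (the account and URL are placeholders, not part of the original files):

import urllib2
import WeiboCrawl

weiboLogin = WeiboCrawl.WeiboLogin('LoginName', 'PassWord')
if weiboLogin.Login():
    # the installed opener carries the login cookies, so this plain
    # urlopen() call already fetches the page as a logged-in user
    page = urllib2.urlopen('http://weibo.com/yaochen').read()
    print 'Downloaded', len(page), 'bytes'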
3. Encoding functions, file WeiboEncode.py
#!/usr/bin/env python
#coding=utf8
'''Author: Zheng Yi
Email: zhengyi.bupt@qq.com'''

import urllib
import base64
import hashlib


def PostEncode(userName, passWord, serverTime, nonce):
    "Used to generate the POST data"
    encodedUserName = GetUserName(userName)
    encodedPassWord = GetPassword(passWord, serverTime, nonce)

    postPara = {
        'entry': 'weibo',
        'gateway': '1',
        'from': '',
        'savestate': '7',
        'userticket': '1',
        'ssosimplelogin': '1',
        'vsnf': '1',
        'vsnval': '',
        'su': encodedUserName,
        'service': 'miniblog',
        'servertime': serverTime,
        'nonce': nonce,
        'pwencode': 'wsse',
        'sp': encodedPassWord,
        'encoding': 'UTF-8',
        'url': 'http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack',
        'returntype': 'META'
    }
    postData = urllib.urlencode(postPara)
    return postData


def GetUserName(userName):
    "Used to encode the user name"
    userNameTemp = urllib.quote(userName)
    userNameEncoded = base64.encodestring(userNameTemp)[:-1]
    return userNameEncoded


def GetPassword(passWord, serverTime, nonce):
    "Used to encode the user password"
    pwdTemp1 = hashlib.sha1(passWord).hexdigest()
    pwdTemp2 = hashlib.sha1(pwdTemp1).hexdigest()
    pwdTemp3 = pwdTemp2 + serverTime + nonce
    pwdEncoded = hashlib.sha1(pwdTemp3).hexdigest()
    return pwdEncoded
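In short, the user name is URL-quoted and then Base64-encoded, while the password is hashed as sha1(sha1(sha1(password)) + servertime + nonce), which matches the 'pwencode': 'wsse' field in the POST data. A small stand-alone sketch of the call pattern; the servertime and nonce below are made up purely for illustration (real values come from the prelogin request):

import WeiboEncode

# servertime and nonce are invented; they only show how the helpers are called
su = WeiboEncode.GetUserName('someone@example.com')
sp = WeiboEncode.GetPassword('MySecret', '1329806375', 'ABCDEF')
print 'su:', su    # url-quoted, then base64-encoded user name
print 'sp:', sp    # 40-character sha1 hex digest
print 'post data length:', len(WeiboEncode.PostEncode('someone@example.com', 'MySecret', '1329806375', 'ABCDEF'))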
4. Search functions, file WeiboSearch.py
#!/usr/bin/env python
#coding=utf8
'''Author: Zheng Yi
Email: zhengyi.bupt@qq.com'''

import re
import json


def sServerData(serverData):
    "Search the server time & nonce in the server data"
    p = re.compile('\((.*)\)')
    jsonData = p.search(serverData).group(1)
    data = json.loads(jsonData)
    serverTime = str(data['servertime'])
    nonce = data['nonce']
    print "Server time is:", serverTime
    print "Nonce is:", nonce
    return serverTime, nonce


def sRedirectData(text):
    "Search the redirect url in the login result page"
    p = re.compile('location\.replace\(\'(.*?)\'\)')
    loginUrl = p.search(text).group(1)
    return loginUrl


def sUrl(htmlData):
    "Search new weibo.com urls in the main part of a downloaded page"
    iMainBegin = htmlData.find('<div class="feed_lists" node-type="feed_list">')
    iMainEnd = htmlData.find('<div node-type="lazyload" class="W_loading">')
    mainData = htmlData[iMainBegin:iMainEnd]

    p = re.compile('href=\"(\/[a-zA-Z0-9\/\%]*?)\"')
    #p = re.compile('href=\"(http:\/\/weibo.com\/[a-zA-Z]*?)\"')
    semiUrlList = p.findall(mainData)

    urlList = []
    for url in semiUrlList:
        urlList.append('http://weibo.com' + url)
    return urlList
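To see what sServerData() expects, here is a fabricated prelogin-style response used purely for illustration (the real response is returned by serverUrl and may differ in its exact fields); the regex simply pulls the JSON object out of the callback wrapper:

import WeiboSearch

sample = 'sinaSSOController.preloginCallBack({"retcode":0,"servertime":1329806375,"nonce":"ABCDEF"})'
serverTime, nonce = WeiboSearch.sServerData(sample)
# serverTime is now the string '1329806375' and nonce is 'ABCDEF'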
5. Simple HTML content analysis and format conversion, file TextAnalyze.py
#!/usr/bin/env python
#coding=utf8
'''Author: Zheng Yi
Email: zhengyi.bupt@qq.com'''


def textTransfer(htmlContent):
    "Extract and decode the main part of the html"
    line = textExtract(htmlContent)
    print 'line:', line
    if line != None:
        transText = textDecode(line)
        return transText
    else:
        return None


def textExtract(htmlContent):
    "Extract the main part (the feed script line) from the html"
    prefix = '<script>STK && STK.pageletM && STK.pageletM.view({"pid":"pl_content_'
    lines = htmlContent.splitlines()
    for line in lines:
        # the feed data sits in the pl_content_homeFeed or pl_content_hisFeed pagelet
        if line.startswith(prefix + 'homeFeed"') or line.startswith(prefix + 'hisFeed"'):
            return line
    return None


def textDecode(line):
    "Decode the escaped html fragment contained in the matched line"
    iText = line.find('html":"')
    if iText > 0:
        transText = line[iText + 7: -12].encode("utf-8").decode('unicode_escape').encode("utf-8").replace("\\", "")
        return transText
    else:
        return None
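The core of textDecode() is the 'unicode_escape' step: in the raw script line the microblog text is embedded as \uXXXX escape sequences, and decoding them (then re-encoding as UTF-8) turns the fragment back into readable Chinese. A tiny Python 2 illustration:

# '\u5fae\u535a' is how the two characters of the word "Weibo" appear
# inside the escaped html fragment
escaped = '\\u5fae\\u535a'
readable = escaped.decode('unicode_escape').encode('utf-8')
print readable    # prints the Chinese word for "Weibo"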