写了一个简单的贴吧爬虫,送上代码:
Python 版本:2.7(以下代码仅适用于 Python 2)
#coding:utf-8
__author__ = 'zhengjinwei'
import sys
# Python 2 only: make UTF-8 the process-wide default codec used for implicit
# str <-> unicode conversions. reload(sys) must run first because
# sys.setdefaultencoding is not reachable on the module as initially imported
# (see the Python 2 `sys` module documentation). These two statements must
# stay in this order and will not run on Python 3.
reload(sys)
sys.setdefaultencoding('utf-8')
from lib import spider
from lib import fileWriter
import time
import os
class TieBaSpider(object):
    """Crawl the member list of a Baidu Tieba forum and record each member.

    For every member found, the member's sex and the names of the forums
    listed on the member's profile page are appended as one tab-separated
    line to ``zjw.txt`` in the directory returned by ``cur_file_dir``.
    """

    def __init__(self, userAgent):
        """Create the crawler.

        userAgent -- User-Agent string handed to ``spider.Spider``.
        """
        self.spider = spider.Spider(userAgent)
        # userName -> sex for every member already written; used to skip
        # duplicates in storeUserInfo.
        self.users = {}
        self.curFilePath = self.cur_file_dir()

    def cur_file_dir(self):
        """Return the directory the output file is written to.

        ``sys.path[0]`` is the script directory for a plain script; for a
        py2exe-built executable it is a file, in which case its containing
        directory is returned. When it is neither (for example the empty
        string), the current working directory is used instead of
        silently returning None.
        """
        path = sys.path[0]
        if os.path.isdir(path):
            return path
        if os.path.isfile(path):
            return os.path.dirname(path)
        return os.getcwd()

    def getUserSex(self, userUrl):
        """Return the sex marker scraped from a profile page, or -1.

        The marker is the suffix of the ``userinfo_sex_`` CSS class in the
        first match on the page at ``userUrl``; -1 means no match.
        """
        pattern = '''<span class="userinfo_sex userinfo_sex_(.*?)"></span><span>(.*?)</span>'''
        contents = self.spider.getContents(userUrl, pattern, 2)
        if contents:
            return contents[0][0]
        return -1

    def getUserGuanZhu(self, url):
        """Return the forums listed on a profile page, or -1 if none match.

        Each entry is ``[forumName, levelClass]`` taken from the
        ``u-f-item unsign`` links on the page at ``url``.
        """
        pattern = '''<a data-fid=".*?" target="_blank" locate=".*?" href=".*?".*?class="u-f-item unsign"><span>(.*?)</span><span class="forum_level (.*?)"></span></a>'''
        contents = self.spider.getContents(url, pattern, 2)
        if contents:
            return contents
        return -1

    def enterUsersPage(self, url):
        """Process one page of the member list.

        Every member on the page whose sex and forum list can both be
        scraped is passed to ``storeUserInfo``.

        Returns 1 if the page contained at least one member entry and -1
        otherwise (callers use -1 as the "no more pages" signal).
        """
        pattern = '''<span class="member.*?"><a href="(.*?)" class="avatar"><img src="(.*?)" alt="(.*?)"></a><div class="name_wrap"><a href="(.*?)" class="user_name" title="(.*?)">(.*?)</a>'''
        contents = self.spider.getContents(url, pattern, 5)
        if not contents:
            return -1
        for member in contents:
            # member[0] is the avatar link href, member[4] the title
            # attribute of the user_name link.
            userUrl = "http://tieba.baidu.com" + member[0]
            userName = member[4]
            sex = self.getUserSex(userUrl)
            if sex == -1:
                # Member is skipped anyway; avoid a second page fetch.
                continue
            guanZhu = self.getUserGuanZhu(userUrl)
            if guanZhu == -1:
                continue
            self.storeUserInfo(userName, sex, [forum[0] for forum in guanZhu])
        return 1

    def storeUserInfo(self, userName, sex, gz):
        """Append one member record to ``zjw.txt`` unless already stored.

        userName -- member name, also the de-duplication key.
        sex      -- sex marker as returned by ``getUserSex``.
        gz       -- sequence of forum names; written as ``[a][b]...``.
        """
        if userName in self.users:
            print("repeat user: %s %s" % (userName, sex))
            return
        strGz = ''.join('[' + str(name) + ']' for name in gz)
        self.users[userName] = sex
        str1 = userName + "\t" + sex + '\t' + strGz + "\r\n"
        f = fileWriter.FileWriter(self.curFilePath, 'zjw.txt', 'a')
        try:
            print(str1)
            f.write(str1)
        finally:
            # Always release the file, even if the write fails.
            f.close()
        # One-second pause per stored member, now taken after the file
        # has been closed rather than while it is held open.
        time.sleep(1)

    def go(self, pageUrl):
        """Crawl every page of the member list linked from ``pageUrl``.

        The member-list URL is taken from the ``member_name_link`` anchor
        on the forum page. The first page is processed, then pages 2, 3,
        ... (via the ``pn`` query parameter) until ``enterUsersPage``
        reports a page without members. Nothing happens when the anchor
        is not found.
        """
        pattern = '''<a.*?id="member_name_link".*?href="(.*?)".*?target="_blank">(.*?)</a>.*?</p>.*?<p class="forum_info_desc">'''
        contents = self.spider.getContents(pageUrl, pattern, 1)
        if not contents:
            return
        usersUrl = "http://tieba.baidu.com" + contents[0][0]
        # First page of the member list.
        self.enterUsersPage(usersUrl)
        page = 2
        while True:
            pageUserUrl = usersUrl + "&pn=" + str(page)
            print("prcess next page: %s" % pageUserUrl)
            ret = self.enterUsersPage(pageUserUrl)
            if ret != 1:
                print("all page processed, %s %s" % (page, ret))
                break
            page += 1
            print(page)
# Script entry: crawl the member list of the forum selected by the `kw`
# query parameter of siteUrl, identifying as the browser in USER_AGENT.
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'
siteUrl = u"http://tieba.baidu.com/f?kw=%E6%A2%A6%E6%83%B3%E5%B0%8F%E9%95%87&ie=utf-8&pn="
demo = TieBaSpider(USER_AGENT)
demo.go(siteUrl)
上面代码通过 from lib import ... 引用的两个模块如下(它们放在 lib 目录下,该目录还需要一个空的 __init__.py):
lib/spider.py
#coding:utf-8
__author__ = 'zhengjinwei'
#coding=utf-8
import urllib
import urllib2
import re
import os
import math
import requests
class Spider(object):
    """Minimal HTTP fetcher with regex-based content extraction."""

    def __init__(self, userAgent):
        """Remember the User-Agent string sent with every request."""
        self.user_agent = userAgent
        self.headers = {'User-Agent': self.user_agent}

    def getPage(self, url, decode=False):
        """GET ``url`` and return the response body.

        decode -- when true, the body is decoded as UTF-8 before being
                  returned; otherwise the raw bytes are returned.
        """
        request = urllib2.Request(url, headers=self.headers)
        response = urllib2.urlopen(request)
        try:
            body = response.read()
        finally:
            # Release the connection even if reading fails.
            response.close()
        if decode:
            return body.decode('utf-8')
        return body

    def postHttp(self, url, postData):
        """POST ``postData`` to ``url`` and return the response text.

        The configured User-Agent header is sent, as it is for GET
        requests issued through ``getPage``.
        """
        r = requests.post(url, data=postData, headers=self.headers)
        return r.text

    def getRawContent(self, url):
        """Return the undecoded body of ``url`` (same as ``getPage(url)``)."""
        return self.getPage(url)

    def getContents(self, url, strPattern, count):
        """Fetch ``url`` and extract regex groups from its body.

        See ``parseContents`` for the meaning of ``strPattern`` and
        ``count`` and for the shape of the result.
        """
        return self.parseContents(self.getPage(url), strPattern, count)

    def parseContents(self, page, strPattern, count):
        """Extract regex groups from already-fetched page text.

        page       -- text to search.
        strPattern -- regular expression, compiled with ``re.S`` so that
                      ``.`` also matches newlines.
        count      -- number of leading groups to keep per match.

        Returns a list with one list per match, holding the first
        ``count`` captured groups (fewer if the pattern has fewer groups).
        """
        pattern = re.compile(strPattern, re.S)
        contents = []
        for item in pattern.findall(page):
            # With zero or one group, findall yields plain strings rather
            # than tuples; indexing such a string would return single
            # characters instead of the captured text.
            if not isinstance(item, tuple):
                item = (item,)
            contents.append(list(item[:count]))
        return contents
lib/fileWriter.py
__author__ = 'zhengjinwei'
#coding=utf-8
import os
import codecs
class FileWriter(object):
    """Write to a file through a plain handle and a UTF-8 codecs handle.

    Two handles are opened on the same path: ``self.f`` (used by
    ``write``) and ``self.codecs`` (used by ``writeCodecs``, encodes to
    UTF-8). Every write is flushed immediately so that, in append mode,
    output from the two handles reaches the file in call order.

    NOTE: with a truncating mode such as 'w' the two handles keep
    separate file offsets, so mixing ``write`` and ``writeCodecs`` can
    overwrite data; use one of the two methods only in that case.

    Instances can be used as context managers; the file is closed on
    exit.
    """

    def __init__(self, fileDir, fileName, format):
        """Open ``fileName`` inside ``fileDir``, creating the directory.

        fileDir  -- target directory; created if it does not exist.
        fileName -- name of the file inside ``fileDir``.
        format   -- open mode such as 'a' or 'w' (the name shadows the
                    builtin but is kept for existing callers).
        """
        self.mkDir(fileDir)
        path = os.path.join(fileDir, fileName)
        self.codecs = codecs.open(path, format, 'utf-8')
        try:
            self.f = open(path, format)
        except Exception:
            # Do not leak the first handle when the second open fails.
            self.codecs.close()
            raise

    def __enter__(self):
        return self

    def __exit__(self, excType, excValue, traceback):
        self.close()
        return False

    def mkDir(self, path):
        """Create ``path`` (including parents) if it is not a directory."""
        if os.path.isdir(path):
            return
        try:
            os.makedirs(path)
        except OSError:
            # Another process may have created it in the meantime.
            if not os.path.isdir(path):
                raise

    def writeCodecs(self, contents):
        """Write text through the UTF-8 encoding handle and flush it."""
        result = self.codecs.write(contents)
        self.codecs.flush()
        return result

    def write(self, contents):
        """Write through the plain handle and flush it."""
        result = self.f.write(contents)
        self.f.flush()
        return result

    def close(self):
        """Close both handles, the second even if the first close fails."""
        try:
            self.f.close()
        finally:
            self.codecs.close()
------------结束 呵呵----------------