接上文(爬虫实战:学者网(一)),本篇主要讲述学者网爬虫代码实现部分。
程序流程:
1首先我们要建立数据库来实现存储。
2我们要初始化第一个用户,由他开始遍历网络
3遍历网络找到所有人的ID、中文名、单位
4通过ID爬取邮箱
下面是实验运行主要程序,连接数据库后,按照上方的描述进行实验。
# -*- coding: utf-8 -*-
#!/usr/bin/env python
# NOTE(review): the shebang is ineffective here -- it only works on the very
# first line of a file.
# Driver script for the scholat.com crawler: all helpers live in function.py.
from function import *
# Open the MySQL connection and cursor used by every step below.
db, cur = connectDB()
#1 Create the persondetails table that stores every crawled scholar.
creatdetailstable(db, cur)
#2 Seed the crawl: insert the first user so the BFS has a start node.
initfirstuser(db, cur)
#3 Breadth-first crawl of the friend graph starting from DB row id 1;
#  collects username / Chinese name / picture URL for every reachable user.
getbaseinfo(db, cur, 1)
#4 Fetch e-mail addresses from profile pages, starting at id 7000
#  (presumably resuming an earlier partial run -- TODO confirm).
getallemail(db,cur,7000)
cur.close()
db.close()
程序细节:
建立数据表
def creatdetailstable(db, cur):
    """Create the persondetails table that holds every crawled scholar.

    (id, username) is the composite primary key; the auto-incrementing id
    is what the crawl and e-mail passes iterate over.  IF NOT EXISTS makes
    the driver script safe to re-run against an existing database instead
    of crashing with "table already exists".
    """
    sql = """create table if not exists persondetails(
        id int not null AUTO_INCREMENT,
        age int,
        QQ varchar(20),
        chineseName varchar(255),
        email varchar(255),
        englishName varchar(255),
        gender varchar(10),
        phoneNumber varchar(15),
        pictureUrl varchar(255),
        username varchar(255),
        workUnit varchar(255),
        workUnitEn varchar(255),
        primary key (id,username))
        ENGINE=InnoDB DEFAULT CHARSET=utf8"""
    cur.execute(sql)
    db.commit()
连接数据库
def connectDB():
    """Open the crawler's MySQL connection and hand back (connection, cursor)."""
    connection = MySQLdb.connect(host='localhost', user='root', passwd='-----', db='-----')
    return connection, connection.cursor()
初始化第一个用户
def initfirstuser(db, cur):
    """Seed the crawl: insert the starting user 'ytang' into persondetails."""
    seed_sql = "insert into persondetails(username) values('ytang')"
    cur.execute(seed_sql)
    db.commit()
获取所有人的ID与姓名,采取广度遍历的方式。从数据库第一个用户开始,获取其所有朋友的ID、中文名等信息插入数据库。
def getbaseinfo(db, cur, i):
j = 55555
while i<=j:
sql = """select username from persondetails where id='%d'"""% i
cur.execute(sql)
cds=cur.fetchone()
username = cds[0]
frienddata = getFriendsList(username)
if len(frienddata) > 0:
for user in frienddata:
processFrienddata(db, cur, user)
i = i+1
sql = 'select count(*) from persondetails'
cur.execute(sql)
tmp=cur.fetchone()
j=tmp[0]
print "当前ID:%d"%(i-1), "所有记录数:%d"%j, "朋友数:", len(frienddata)
获取某用户朋友的具体代码,经过去重后插入表单
def processFrienddata(db, cur, frienddata):
    """Parse one raw friend record and insert it if the username is new.

    `frienddata` is a quoted JSON-ish fragment for a single user; quotes
    are stripped and the three values (chineseName, pictureUrl, username)
    are pulled out positionally from the text after each colon.
    """
    frienddata = re.sub('["]+', '', frienddata)
    details = re.findall(':([^:,]*)', frienddata)
    chineseName = details[0]
    pictureUrl = details[1]
    username = details[2]
    # Parameterized query: username is scraped from a web page -- never
    # interpolate it into SQL (a quote in a name would break or inject).
    cur.execute('select count(*) from persondetails where username=%s limit 1',
                (username,))
    count = cur.fetchone()
    # 0L (Python 2 long literal) replaced by 0: long == int compares equal
    # in Python 2, and 0L is a syntax error in Python 3.
    if count[0] == 0:
        print('新用户: %s' % chineseName)
        sql = 'insert into persondetails(chineseName,pictureUrl,username) \
values(%s,%s,%s)'
        cur.execute(sql, (chineseName, pictureUrl, username))
        db.commit()
用于变更http请求header中的ip,学者网防流量攻击不够严格,此方法足以绕开规则
def genip():
    """Return a random dotted-quad IPv4 string, e.g. '12.34.56.78'.

    Used to spoof the X-Forwarded-For header on each request.
    """
    octets = [str(random.randint(0, 255)) for _ in range(4)]
    return ".".join(octets)
通过ID获取所有人邮箱代码
def getallemail(db,cur,i):
    """Fill in the email column for every user from row id `i` upward.

    Rows that already have an email are skipped; for the rest, getmail()
    scrapes the user's profile page and the result is written back.
    """
    cur.execute('select count(*) from persondetails')
    j = cur.fetchone()[0]
    print(j)
    while i <= j:
        cur.execute("""select username, chineseName, email from persondetails where id='%d'""" % i)
        cds = cur.fetchone()
        username = cds[0]
        email = cds[2]
        if email is not None:
            i = i + 1
            continue
        email = getmail(username, None)
        print("%d %s 的邮箱是: %s" % (i, cds[0], email))
        # BUGFIX: getmail historically returned [] on total failure, and
        # `[] is not None` is true, so the literal string "[]" got written
        # into the email column.  Guard against that sentinel explicitly.
        if email is not None and not isinstance(email, list):
            # Parameterized update: email is scraped text, never build the
            # SQL by string interpolation.
            cur.execute("update persondetails set email=%s where id=%s",
                        (email, i))
            db.commit()
        i = i + 1
获取具体某人的邮箱的代码,主要是使用正则,xpath此处不适用;其次失败时最多尝试6次以避免死循环
def getmail(username, proxies):
    """Fetch http://www.scholat.com/<username> and extract the e-mail.

    The page hides the address in two markers (KxWfbQ="local-part" and
    VOsWkBL="@domain"); the two halves are concatenated.  Network failures
    are retried up to 6 times with a 2-second pause.

    Returns the e-mail string ("" when the page has no e-mail markers), or
    None when every attempt failed.  `proxies` is accepted for interface
    compatibility but unused.
    """
    url = 'http://www.scholat.com/' + username
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    # A spoofed X-Forwarded-For is enough to dodge the site's rate limiting.
    headers = {'User-Agent': user_agent, 'X-Forwarded-For': genip()}
    # Patterns are loop-invariant; hoisted out of the retry loop.
    patt1 = '(KxWfbQ=\"[\w]+)\"'
    patt2 = '(VOsWkBL=\"@[\.\w]+)\"'
    trynum = 6
    while trynum > 0:
        trynum = trynum - 1
        try:
            request = urllib2.Request(url, headers=headers)
            response = urllib2.urlopen(request, timeout=20)
            data = response.read()
            out = ""
            frienddata1 = re.findall(patt1, data)
            frienddata2 = re.findall(patt2, data)
            if frienddata1 and frienddata2:
                a = re.sub('KxWfbQ=\"', '', frienddata1[0])
                b = re.sub('VOsWkBL=\"', '', frienddata2[0])
                out = re.sub('\"', '', a + b)
            return out
        except urllib2.URLError as e:  # "as" syntax: valid on Py2.6+ and Py3
            time.sleep(2)
            if hasattr(e, "code"):
                print(e.code)
            if hasattr(e, "reason"):
                print(e.reason)
    # BUGFIX: previously returned [] here; [] is not None, so the caller's
    # `if email is not None` check passed and "[]" was stored as the e-mail.
    return None
到此为止全部代码奉上,由于网站会不断更迭,代码很可能会失效。