爬人人好友

昨天脑子一抽,想要爬人人网的好友,结果发现只能爬两层:我的好友,以及好友的好友。本来还想搞一下“最近访问”的,但是模板太多了,不同好友的 HTML 可能不一样,而且抓到的 id 有很多重复,之后再想办法解决。不过快要期末考试了,所以先搁置一段时间吧!
from BeautifulSoup import BeautifulSoup as bp 
import urllib 
import urllib2 
import cookielib 
import re 
fp=open('rr.txt','w') 
 
 
def login(username, password):
    """Log in to renren.com and return the current user's id.

    A cookie-aware opener is installed globally with
    ``urllib2.install_opener``, so every later ``urllib2.urlopen`` call in
    the process reuses the session cookies obtained here.

    Args:
        username: Value sent as the ``email`` form field.
        password: Value sent as the ``password`` form field.

    Returns:
        The digits found in the ``'ruid':'<digits>'`` marker of the home
        page, as a string.

    Raises:
        RuntimeError: If the home page does not contain the marker
            (presumably because the login was rejected -- verify against
            the site's actual response).
    """
    logpage = "http://www.renren.com/ajaxLogin/login"
    login_data = urllib.urlencode({'email': username, 'password': password})

    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    urllib2.install_opener(opener)
    # The response body is not needed; read it so the request completes and
    # the cookies land in the jar.
    opener.open(logpage, login_data).read()

    # Get uid
    print("Getting user id of you now")
    html = urllib2.urlopen("http://www.renren.com/home").read()
    match = re.search(r"'ruid':'(\d+)'", html)
    if match is None:
        raise RuntimeError("could not find 'ruid' in the home page; login probably failed")
    uid = match.group(1)
    print(uid)
    print("Login and got uid successfully")
    return uid
# Script 1: write the logged-in user's own friend list to rr.txt, one friend
# per line as: name <TAB> user id <TAB> address.
# NOTE(review): `username` and `password` are not defined anywhere in this
# file; they must be set before this line runs.
uid = login(username, password)

# Load page 0 once, only to find out how many pages the list has.
url = 'http://friend.renren.com/GetFriendList.do?curpage=0&id=%s' % uid
html = bp(urllib2.urlopen(url).read())
href = html.findAll('div', {'class': 'page'})
try:
    # The last link inside the second pager <div> is taken as the link to
    # the last page; the first number in its href is the last page index.
    href = str(href[1].findChildren()[-1]['href'])
    page = int(re.search(r"\d+", href).group(0))
except (IndexError, KeyError, AttributeError):
    # No usable pager on the page. Presumably the whole list fits on
    # page 0 -- TODO confirm against the site's markup.
    page = 0

try:
    for i in range(page + 1):
        url = 'http://friend.renren.com/GetFriendList.do?curpage=%s&id=%s' % (i, uid)
        html = bp(urllib2.urlopen(url).read())
        for word in html.findAll('dd'):
            if 'href' in str(word):
                # A <dd> that contains a link holds the friend's name and profile URL.
                name = word.a.string or u''
                link = word.a['href']
                # Take the id from the query string instead of a fixed
                # character range, so ids of any length survive; fall back
                # to the old fixed slice when the href has no id parameter.
                id_match = re.search(r'[?&]id=(\d+)', link)
                userid = id_match.group(1) if id_match else link[36:45]
                print("%s %s" % (name, userid))
                fp.write(name.encode('utf-8') + '\t' + userid.encode('utf-8') + '\t')
            else:
                # A <dd> without a link is treated as the address that ends the record.
                adress = word.string
                if adress is None:
                    print('this one have no adress')
                    fp.write('\n')
                else:
                    print(adress)
                    fp.write(adress.encode('utf-8') + '\n')
        print("%s is ok....." % i)
finally:
    # Close rr.txt even if a request or a parse step fails half-way.
    fp.close()

 
from BeautifulSoup import BeautifulSoup as bp
import urllib
import urllib2
import cookielib
import re
 
def login(username, password):
    """Log in to renren.com and return the current user's id.

    A cookie-aware opener is installed globally with
    ``urllib2.install_opener``, so every later ``urllib2.urlopen`` call in
    the process reuses the session cookies obtained here.

    Args:
        username: Value sent as the ``email`` form field.
        password: Value sent as the ``password`` form field.

    Returns:
        The digits found in the ``'ruid':'<digits>'`` marker of the home
        page, as a string.

    Raises:
        RuntimeError: If the home page does not contain the marker
            (presumably because the login was rejected -- verify against
            the site's actual response).
    """
    logpage = "http://www.renren.com/ajaxLogin/login"
    login_data = urllib.urlencode({'email': username, 'password': password})

    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    urllib2.install_opener(opener)
    # The response body is not needed; read it so the request completes and
    # the cookies land in the jar.
    opener.open(logpage, login_data).read()

    # Get uid
    print("Getting user id of you now")
    html = urllib2.urlopen("http://www.renren.com/home").read()
    match = re.search(r"'ruid':'(\d+)'", html)
    if match is None:
        raise RuntimeError("could not find 'ruid' in the home page; login probably failed")
    uid = match.group(1)
    print(uid)
    print("Login and got uid successfully")
    return uid
 
 
# Script 2: for every friend listed in rr.txt, fetch that friend's friend
# list and append it to ftf.txt as:
#   friend name <TAB> name <TAB> profile href <TAB> address
# NOTE(review): `username` and `password` are not defined anywhere in this
# file; they must be set before this line runs.
print(login(username, password))

# `with` closes both files even when a request or parse step raises.
with open('rr.txt', 'r') as fp, open('ftf.txt', 'w+') as dic:
    for line in fp:
        fields = line.rstrip('\n').split('\t')
        if len(fields) < 2:
            # Not a "name<TAB>id..." record (e.g. a blank line); nothing to crawl.
            continue
        Mname = fields[0]
        # The second column may be a full profile href or an already
        # extracted id; accept both instead of slicing a fixed range, which
        # yields an empty string for a bare id.
        id_match = re.search(r'id=(\d+)', fields[1])
        Muserid = id_match.group(1) if id_match else fields[1].strip()
        if not Muserid:
            continue

        url = 'http://friend.renren.com/GetFriendList.do?curpage=0&id=%s' % Muserid
        try:
            # Page 0 is loaded first only to read the pager and learn the
            # index of the last page.
            html = bp(urllib2.urlopen(url).read())
            href = html.findAll('div', {'class': 'page'})
            href = str(href[1].findChildren()[-1]['href'])
            page = re.search(r"\d+", href).group(0)

            for i in range(int(page) + 1):
                urls = r'http://friend.renren.com/GetFriendList.do?curpage=' + str(i) + r'&id=' + str(Muserid)
                html = bp(urllib2.urlopen(urls).read())
                for word in html.findAll('dd'):
                    if 'href' in str(word):
                        # A <dd> that contains a link holds the name and profile URL.
                        name = word.a.string or u''
                        userid = word.a['href']
                        dic.write(Mname + '\t' + name.encode('utf-8') + '\t' + userid.encode('utf-8') + '\t')
                    else:
                        # A <dd> without a link is treated as the address that ends the record.
                        adress = word.string
                        if adress is None:
                            dic.write('\n')
                        else:
                            dic.write(adress.encode('utf-8') + '\n')
                print('%s is ok ...' % i)
        except Exception as exc:
            # Best effort: one friend whose page cannot be fetched or parsed
            # must not stop the whole crawl. Ctrl-C still propagates because
            # KeyboardInterrupt is not an Exception subclass.
            print('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>worry')
            print('%s failed: %r' % (Mname, exc))
        else:
            print('%s is ok >>>>>>>>>>>>>' % Mname)



问题:

1、能不能不要每次都登入?

2、最近好友有100人的访问限制,需要验证码,求破。看样子是要学习一下 urllib 包了,里面应该有模拟浏览器的办法。



登入的代码是借鉴其他人的,自己现在还不会,还是菜鸟~





最后,我发了好几次,为什么用插入代码的方式会出现 html 标签?求解!

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值