requests 爬取人人网信息,难点:验证码

目标:爬取人人网信息,此处只爬一个name
在这里插入图片描述
思路:
1.登陆
2.访问大鹏首页,获得到右侧推荐好友的所有url,并保存到数据库,设置状态码为 0,同时获取到大鹏的信息,并保存数据库。
3.从数据库中取出一个url(未访问的),同时将该url的状态码设为1
4.向其他页面发送请求,并获取个人信息和其他推荐好友

1.比如访问大鹏的首页,获取其信息(首先要注册登陆进首页)

import requests
from lxml import etree
import MySQLdb

# Open one MySQL connection to the local `spider` database and create the
# cursor that the database helper functions in this script execute against.
# NOTE(review): host and credentials are hard-coded; consider reading them from configuration.
conn = MySQLdb.connect(host = 'localhost',port = 3306,user = 'root',password = '123456',db = 'spider',charset = 'utf8')
cursor = conn.cursor()

# Request headers sent with the profile-page request: a desktop Chrome
# User-Agent plus the raw Cookie header.
# NOTE(review): the Cookie value looks like a captured login session - presumably it
# expires and has to be replaced with a fresh one; confirm before running.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
    'Cookie': 'anonymid=k2jp0i5rnl37jv; depovince=GW; _r01_=1; JSESSIONID=abcyR51ZFGJjAaI9L1Z4w; ick_login=c29a1ecd-3469-480a-9dd0-3ff0bfdfdc0a; ick=1870e757-b68b-4c44-9538-9d924c241107; XNESSESSIONID=c6697a88940b; jebe_key=0d2c341c-c119-475d-ac9d-1ce985256f12%7Ca944117966fe3b4572c1a7c8cb29c78c%7C1572829229771%7C1%7C1572829232160; jebe_key=0d2c341c-c119-475d-ac9d-1ce985256f12%7Ca944117966fe3b4572c1a7c8cb29c78c%7C1572829229771%7C1%7C1572829232171; wp=1; wp_fold=1; jebecookies=83813c83-122d-462f-a377-48ad0f71b258|||||; _de=6C64ADFC30B6DD05DACEAE940732293E; p=326b188303e865c3565a41a8252b74608; first_login_flag=1; ln_uact=15137171529; ln_hurl=http://head.xiaonei.com/photos/0/0/men_main.gif; t=8588b84627888708b98404a077443e428; societyguester=8588b84627888708b98404a077443e428; id=972774868; xnsid=cc0dfece; ver=7.0; loginfrom=null; l4pager=0'
}

def get_info(url):
    """Crawl one profile page: store its title as the name and queue its visitors.

    The page is fetched with the module-level ``headers``, the raw HTML is
    printed, the url is marked as crawled, the page title is saved, and every
    link found in the "footprint-box" list is stored as a new url.

    Args:
        url: Profile page address to download.

    Raises:
        IndexError: If the page has no ``<title>`` text.
    """
    page = requests.get(url=url, headers=headers)
    html = page.text
    tree = etree.HTML(html)
    print(html)

    # The <title> text serves as the user's name; flag the url as crawled
    # (status 1) before storing the name.
    title = tree.xpath('//title/text()')[0]
    change_status(url)
    save_data(title)

    # Links to the users who visited this profile become new urls to crawl.
    save_url(tree.xpath('//div[@id="footprint-box"]/ul/li/a/@href'))

#将爬取过的url状态设为1
def change_status(url):
    """Mark ``url`` as crawled by setting its ``status`` to 1 in ``renren_url``.

    Args:
        url: The url whose row should be updated.
    """
    cursor.execute(
        'update renren_url set status=1 where url=%s',
        (url,),
    )
    conn.commit()

#将 name存到的数据库
def save_data(data):
    """Insert one value (the crawled user name) as a row of ``renren_info``.

    Args:
        data: The value to store.
    """
    cursor.execute(
        'insert into renren_info values (%s)',
        (data,),
    )
    conn.commit()

#将爬下来的url存入数据库,并设状态为0
def save_url(urls):
    """Store every url in ``urls`` in ``renren_url`` with status '0' (not crawled).

    Each insert is committed on its own, so rows written before a failing
    insert stay in the table.

    Args:
        urls: Iterable of url strings to queue.
    """
    insert_sql = 'insert into renren_url values (%s,%s)'
    for link in urls:
        cursor.execute(insert_sql, (link, '0'))
        conn.commit()

#从数据库中获取第一个status为0的url
def get_url():
    """Take one not-yet-crawled url from ``renren_url`` and crawl it.

    Selects a url whose status is '0' and passes it to ``get_info``.

    Raises:
        LookupError: If no row with status '0' is left. Previously this case
            surfaced as an opaque ``TypeError`` from indexing ``None``.
    """
    # Only one row is consumed per call, so ask the server for a single row
    # instead of the whole backlog of pending urls.
    sql = 'select url from renren_url where status=%s limit 1'
    cursor.execute(sql, ('0',))
    row = cursor.fetchone()
    if row is None:
        # fetchone() returns None when the result set is empty.
        raise LookupError('no url with status 0 left in renren_url')
    get_info(row[0])

if __name__ == '__main__':
    # Manual seed step: first insert Da Peng's profile url into the database with status 0.
    # url = 'http://www.renren.com/880151247/profile'
    # Crawl pending urls one after another; the loop has no exit condition
    # and only stops when an exception propagates out of get_url().
    while True:
        get_url()

2.爬到第100个报错。
错误原因:
1.当while 1一直执行get_url()的时候,一直从数据库获取url状态为0的(即未爬取过name的用户)的url并获取信息。
2.当爬取到第100个用户的时候,会出现验证码界面。此时xpath解析 `name = ele.xpath('//title/text()')[0]` 会报错,因为此时页面没有该xpath的相关信息。
解决思路:
解决验证码的页面问题,验证成功后继续爬取信息。
提供两种解决办法:
1.图像识别,识别验证码
2.掏钱处理识别验证码。
我们选择第二种
百度超级鹰,验证码识别,注册登陆,下载chaojiying.py文件,保存到当前目录下。

#!/usr/bin/env python
# coding:utf-8

import requests
from hashlib import md5

class Chaojiying_Client(object):
    """Small HTTP client for the Chaojiying captcha-recognition service."""

    def __init__(self, username, password, soft_id):
        """Remember the account settings that accompany every request.

        Args:
            username: Chaojiying account name.
            password: Plain-text password; only the hex MD5 digest of its
                UTF-8 bytes is kept and sent.
            soft_id: Software ID, sent as the ``softid`` form field.
        """
        self.username = username
        self.password = md5(password.encode('utf8')).hexdigest()
        self.soft_id = soft_id
        # Form fields shared by every API call.
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def PostPic(self, im, codetype):
        """Upload a captcha image and return the service's decoded JSON reply.

        Args:
            im: Image bytes.
            codetype: Captcha type code, see http://www.chaojiying.com/price.html
        """
        # Account fields come after 'codetype', same as dict.update() on the original.
        form = {'codetype': codetype, **self.base_params}
        upload = {'userfile': ('ccc.jpg', im)}
        reply = requests.post(
            'http://upload.chaojiying.net/Upload/Processing.php',
            data=form,
            files=upload,
            headers=self.headers,
        )
        return reply.json()

    def ReportError(self, im_id):
        """Report a wrongly recognised captcha and return the decoded JSON reply.

        Args:
            im_id: Picture ID of the captcha being reported.
        """
        form = {'id': im_id, **self.base_params}
        reply = requests.post(
            'http://upload.chaojiying.net/Upload/ReportError.php',
            data=form,
            headers=self.headers,
        )
        return reply.json()


def chaojiying_check(im):
    """Send captcha image bytes to Chaojiying and return its decoded JSON reply.

    Args:
        im: Raw bytes of the captcha image.
    """
    # Account name, password and software ID; generate a software ID in the
    # Chaojiying user center and put your own values here.
    client = Chaojiying_Client('账号', '密码', '902161')
    # 1902 is the captcha type code; the types are listed on the official price page.
    captcha_type = 1902
    return client.PostPic(im, captcha_type)


3.打印出验证码
在获取name的地方打印出“人人网,中国领先的实名制SNS社交网络。加入人人网,找到老同学,结识新朋友。”即出现了验证码页面。
对 get_info(url) 函数进行修改:

def get_info(url):
    """Crawl one profile page, or print the recognised captcha when one is shown.

    For a normal profile page the title is saved as the user's name, the url
    is marked as crawled and the visitor links are queued. When the captcha
    page is returned instead, the captcha image is downloaded and the
    recognition result from ``chaojiying_check`` is printed.

    Args:
        url: Profile page address to download.

    Raises:
        IndexError: If the page has no ``<title>`` text, or the captcha page
            has no captcha image.
    """
    res = requests.get(url=url,headers=headers)
    ele = etree.HTML(res.text)
    print(res.text)
    name = ele.xpath('//title/text()')[0]
    if name != '人人网,中国领先的实名制SNS社交网络。加入人人网,找到老同学,结识新朋友。':
        # Save the name only for a real profile. Saving it before this check
        # stored the captcha page's title in renren_info as if it were a user.
        save_data(name)
        change_status(url)
        # Links to the users who visited this profile become new urls to crawl.
        urls = ele.xpath('//div[@id="footprint-box"]/ul/li/a/@href')
        save_url(urls)
    else:
        # Captcha page: fetch the captcha image and let Chaojiying recognise it.
        img_url = ele.xpath("//img[@id='verifyPic_login']/@src")[0]
        img = requests.get(url=img_url)
        # Print the recognition result.
        print(chaojiying_check(img.content))

4.处理验证码,发送验证码请求
此时再写一个check_code()的函数用来发送验证码请求。
此时的验证码界面:
在这里插入图片描述
随便输入一个验证码,doc中多了一个请求,点开查看,form data里携带了信息。
此时在get_info中处理验证码处,应传入一个url,用于data参数的id。

def get_info(url):
    """Crawl one profile page, solving the captcha page when it appears.

    For a normal profile page the title is saved as the user's name, the url
    is marked as crawled and the visitor links are queued. When the captcha
    page is returned, the captcha image is downloaded, written to
    ``code.jpg``, recognised via ``chaojiying_check`` and submitted with
    ``check_code``; the url keeps its status so it is fetched again later.

    Args:
        url: Profile page address to download.
    """
    response = requests.get(url=url,cookies=cookies)
    ele = etree.HTML(response.text)
    print(url)
    # Some accounts may be banned and then expose no name; in that case just
    # mark the url as crawled (status 1) and move on.
    try:
        name = ele.xpath("//title/text()")[0].strip()
    except (IndexError, AttributeError):
        # IndexError: no <title> text. AttributeError: the parser returned no
        # tree. A bare except would also swallow KeyboardInterrupt.
        change_status(url)
        return
    print(name)
    if name != '人人网 - 验证码':
        # Save the name only for a real profile. Saving it before this check
        # stored the captcha page's title in renren_info as if it were a user.
        save_info(name)
        change_status(url)
        urls = ele.xpath('//*[@id="footprint-box"]/ul/li/a/@href')
        save_urls(urls)
    else:
        # Captcha page: download the image, keep a copy on disk, then submit
        # the recognised text.
        img_url = ele.xpath('//div[@class="optional"]/img/@src')[0]
        img = requests.get(url=img_url,cookies=cookies)
        with open('code.jpg','wb') as f:
            f.write(img.content)
        check_code(chaojiying_check(img.content),url)
#处理验证码,发送验证码请求
def check_code(code,url):
    """Submit a recognised captcha to renren's validation endpoint.

    Args:
        code: Mapping returned by the recognition service; its ``'pic_str'``
            entry is used as the captcha text.
        url: Profile url that triggered the captcha; its fourth '/'-separated
            part is sent as the ``id`` form field.
    """
    answer = code.get('pic_str')
    print(answer)
    # For a url like 'http://www.renren.com/<id>/profile', index 3 of the
    # '/'-split is the <id> part.
    user_id = url.split('/')[3]
    print(user_id)
    # NOTE(review): requestToken and _rtk are hard-coded - presumably copied
    # from one browser session; confirm they are still accepted.
    form = {
        'id': user_id,
        'icode': answer,
        'submit': '继续浏览',
        'requestToken': '380016961',
        '_rtk': 'e267979b'
    }
    requests.post(url='http://www.renren.com/validateuser.do', data=form, cookies=cookies)

此时就可以模拟输入验证码,并跳过验证码继续爬取信息。

还应注意的其他易错点:
1.有些用户可能被封禁,此时就获取不到name,如果获取不到,将url的状态设为1

    # Excerpt from get_info: a banned account may expose no name, so the
    # title lookup can fail; the url is then marked as crawled (status 1)
    # and the function returns.
    # NOTE(review): the bare except also catches unrelated errors.
    try:
        name = ele.xpath("//title/text()")[0].strip()
    except:
        change_status(url)
        return

2.将爬取下来的url保存到数据库时,如果url重复会报错,此时添加一个try/except

#保存爬下来的url
def save_urls(urls):
    """Queue every url in ``urls`` in the ``renren`` table with status '0'.

    Saving is best-effort per url: a url that is already stored is skipped
    silently, any other database error is printed and the loop continues.
    Only database errors are handled here, so KeyboardInterrupt and
    programming errors are no longer swallowed.

    Args:
        urls: Iterable of url strings to queue.
    """
    sql = 'insert into renren values (%s,%s)'
    for url in urls:
        try:
            cursor.execute(sql, (url, '0'))
            conn.commit()
        except MySQLdb.IntegrityError:
            # Duplicate url: it is already queued, nothing to do.
            continue
        except MySQLdb.Error as exc:
            # Keep crawling, but make the failure visible instead of hiding it.
            print('failed to save url %s: %s' % (url, exc))

3.出现验证码界面时,打印出的name为“人人网 - 验证码”。此时就应将判断条件修改为:

    # Excerpt from get_info: the captcha page is recognised by its title.
    if name != '人人网 - 验证码':
        # Normal profile page: mark the url as crawled and queue the visitor links.
        change_status(url)
        urls = ele.xpath('//*[@id="footprint-box"]/ul/li/a/@href')
        save_urls(urls)
    else:
        # Captcha page: download the image, keep a copy in code.jpg, then
        # submit the recognised text through check_code.
        img_url = ele.xpath('//div[@class="optional"]/img/@src')[0]
        img = requests.get(url=img_url,cookies=cookies)
        with open('code.jpg','wb') as f:
            f.write(img.content)
        check_code(chaojiying_check(img.content),url)

完整代码:

import requests
from lxml import etree
import MySQLdb
from chaojiying import chaojiying_check

# Open one MySQL connection to the local `spider` database and create the
# cursor that the database helper functions in this script execute against.
# NOTE(review): host and credentials are hard-coded; consider reading them from configuration.
conn = MySQLdb.connect(host = 'localhost',port = 3306,user = 'root',password = '123456',db = 'spider',charset = 'utf8')
cursor = conn.cursor()

# Raw "Cookie" request-header value captured from a logged-in browser session.
# NOTE(review): this is a hard-coded session secret - presumably it expires and
# must be replaced with a fresh one; confirm before running.
_RAW_COOKIE = 'anonymid=k2jp0i5rnl37jv; depovince=GW; _r01_=1; JSESSIONID=abcyR51ZFGJjAaI9L1Z4w; ick_login=c29a1ecd-3469-480a-9dd0-3ff0bfdfdc0a; ick=1870e757-b68b-4c44-9538-9d924c241107; XNESSESSIONID=c6697a88940b; jebe_key=0d2c341c-c119-475d-ac9d-1ce985256f12%7Ca944117966fe3b4572c1a7c8cb29c78c%7C1572829229771%7C1%7C1572829232160; jebe_key=0d2c341c-c119-475d-ac9d-1ce985256f12%7Ca944117966fe3b4572c1a7c8cb29c78c%7C1572829229771%7C1%7C1572829232171; wp=1; wp_fold=1; jebecookies=83813c83-122d-462f-a377-48ad0f71b258|||||; _de=6C64ADFC30B6DD05DACEAE940732293E; p=326b188303e865c3565a41a8252b74608; first_login_flag=1; ln_uact=15137171529; ln_hurl=http://head.xiaonei.com/photos/0/0/men_main.gif; t=8588b84627888708b98404a077443e428; societyguester=8588b84627888708b98404a077443e428; id=972774868; xnsid=cc0dfece; ver=7.0; loginfrom=null'

# The ``cookies=`` argument of requests maps each cookie NAME to its value.
# Storing the whole header under a single 'Cookie' key sent one cookie called
# "Cookie" and mangled the first real cookie, so split the header into
# name/value pairs instead. A name that appears twice keeps its last value.
cookies = dict(
    item.strip().split('=', 1)
    for item in _RAW_COOKIE.split(';')
    if '=' in item
)

def get_info(url):
    """Crawl one profile page, solving the captcha page when it appears.

    For a normal profile page the title is saved as the user's name, the url
    is marked as crawled and the visitor links are queued. When the captcha
    page is returned, the captcha image is downloaded, written to
    ``code.jpg``, recognised via ``chaojiying_check`` and submitted with
    ``check_code``; the url keeps its status so it is fetched again later.

    Args:
        url: Profile page address to download.
    """
    response = requests.get(url=url,cookies=cookies)
    ele = etree.HTML(response.text)
    print(url)
    # Some accounts may be banned and then expose no name; in that case just
    # mark the url as crawled (status 1) and move on.
    try:
        name = ele.xpath("//title/text()")[0].strip()
    except (IndexError, AttributeError):
        # IndexError: no <title> text. AttributeError: the parser returned no
        # tree. A bare except would also swallow KeyboardInterrupt.
        change_status(url)
        return
    print(name)
    if name != '人人网 - 验证码':
        # Save the name only for a real profile. Saving it before this check
        # stored the captcha page's title in renren_info as if it were a user.
        save_info(name)
        change_status(url)
        urls = ele.xpath('//*[@id="footprint-box"]/ul/li/a/@href')
        save_urls(urls)
    else:
        # Captcha page: download the image, keep a copy on disk, then submit
        # the recognised text.
        img_url = ele.xpath('//div[@class="optional"]/img/@src')[0]
        img = requests.get(url=img_url,cookies=cookies)
        with open('code.jpg','wb') as f:
            f.write(img.content)
        check_code(chaojiying_check(img.content),url)

#处理验证码,发送验证码请求
def check_code(code,url):
    """Submit a recognised captcha to renren's validation endpoint.

    Args:
        code: Mapping returned by the recognition service; its ``'pic_str'``
            entry is used as the captcha text.
        url: Profile url that triggered the captcha; its fourth '/'-separated
            part is sent as the ``id`` form field.
    """
    answer = code.get('pic_str')
    print(answer)
    # For a url like 'http://www.renren.com/<id>/profile', index 3 of the
    # '/'-split is the <id> part.
    user_id = url.split('/')[3]
    print(user_id)
    # NOTE(review): requestToken and _rtk are hard-coded - presumably copied
    # from one browser session; confirm they are still accepted.
    form = {
        'id': user_id,
        'icode': answer,
        'submit': '继续浏览',
        'requestToken': '380016961',
        '_rtk': 'e267979b'
    }
    requests.post(url='http://www.renren.com/validateuser.do', data=form, cookies=cookies)

#存用户数据
def save_info(data):
    """Insert one value (the crawled user name) as a row of ``renren_info``.

    Args:
        data: The value to store.
    """
    cursor.execute(
        'insert into renren_info values (%s)',
        (data,),
    )
    conn.commit()


#修改url的状态为1
def change_status(url):
    """Mark ``url`` as crawled by setting its ``status`` to 1 in ``renren``.

    Args:
        url: The url whose row should be updated.
    """
    cursor.execute(
        'update renren set status=1 where url=%s',
        (url,),
    )
    conn.commit()


#保存爬下来的url
def save_urls(urls):
    """Queue every url in ``urls`` in the ``renren`` table with status '0'.

    Saving is best-effort per url: a url that is already stored is skipped
    silently, any other database error is printed and the loop continues.
    Only database errors are handled here, so KeyboardInterrupt and
    programming errors are no longer swallowed.

    Args:
        urls: Iterable of url strings to queue.
    """
    sql = 'insert into renren values (%s,%s)'
    for url in urls:
        try:
            cursor.execute(sql, (url, '0'))
            conn.commit()
        except MySQLdb.IntegrityError:
            # Duplicate url: it is already queued, nothing to do.
            continue
        except MySQLdb.Error as exc:
            # Keep crawling, but make the failure visible instead of hiding it.
            print('failed to save url %s: %s' % (url, exc))


#从数据库得到第一个status为0的url
def get_url():
    """Take one not-yet-crawled url from the ``renren`` table and crawl it.

    Selects a url whose status is '0' and passes it to ``get_info``.

    Raises:
        LookupError: If no row with status '0' is left. Previously this case
            surfaced as an opaque ``TypeError`` from indexing ``None``.
    """
    # Only one row is consumed per call, so ask the server for a single row
    # instead of the whole backlog of pending urls.
    sql = 'select url from renren where status=%s limit 1'
    cursor.execute(sql, ('0',))
    row = cursor.fetchone()
    if row is None:
        # fetchone() returns None when the result set is empty.
        raise LookupError('no url with status 0 left in renren')
    get_info(row[0])

if __name__ == '__main__':
    # Example seed url; the crawl starts from the rows with status 0 already in the database.
    # url = 'http://www.renren.com/880151247/profile'
    # Crawl pending urls one after another; the loop has no exit condition
    # and only stops when an exception propagates out of get_url().
    while True:
        get_url()

讲的不太详细,请多见谅

  • 1
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值