目标:爬取人人网信息,此处只爬一个name
思路:
1.登陆
2.访问大鹏首页,获得到右侧推荐好友的所有url,并保存到数据库,设置状态码为 0,同时获取到大鹏的信息,并保存数据库。
3.从数据库中取出一个url(未访问的),同时将该url的状态码设为1
4.向其他页面发送请求,并获取个人信息和其他推荐好友
1.比如访问大鹏的首页,获取其信息(首先要注册登陆进首页)
import requests
from lxml import etree
import MySQLdb
# Module-level MySQL connection and cursor shared by every database helper below.
# NOTE(review): the credentials are hard-coded here; consider loading them from configuration.
conn = MySQLdb.connect(host = 'localhost',port = 3306,user = 'root',password = '123456',db = 'spider',charset = 'utf8')
cursor = conn.cursor()
# Headers sent with every page request in get_info().
# NOTE(review): the Cookie value looks like a captured logged-in browser session and
# presumably expires - confirm and refresh it before running.
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
'Cookie': 'anonymid=k2jp0i5rnl37jv; depovince=GW; _r01_=1; JSESSIONID=abcyR51ZFGJjAaI9L1Z4w; ick_login=c29a1ecd-3469-480a-9dd0-3ff0bfdfdc0a; ick=1870e757-b68b-4c44-9538-9d924c241107; XNESSESSIONID=c6697a88940b; jebe_key=0d2c341c-c119-475d-ac9d-1ce985256f12%7Ca944117966fe3b4572c1a7c8cb29c78c%7C1572829229771%7C1%7C1572829232160; jebe_key=0d2c341c-c119-475d-ac9d-1ce985256f12%7Ca944117966fe3b4572c1a7c8cb29c78c%7C1572829229771%7C1%7C1572829232171; wp=1; wp_fold=1; jebecookies=83813c83-122d-462f-a377-48ad0f71b258|||||; _de=6C64ADFC30B6DD05DACEAE940732293E; p=326b188303e865c3565a41a8252b74608; first_login_flag=1; ln_uact=15137171529; ln_hurl=http://head.xiaonei.com/photos/0/0/men_main.gif; t=8588b84627888708b98404a077443e428; societyguester=8588b84627888708b98404a077443e428; id=972774868; xnsid=cc0dfece; ver=7.0; loginfrom=null; l4pager=0'
}
def get_info(url):
    """Fetch one profile page, store the user's name and queue the visitor links.

    The text of the first <title> element is stored as the name, ``url`` is
    marked as crawled, and every link found in the "footprint-box" list is
    stored for later crawling.
    """
    page = requests.get(url=url, headers=headers)
    tree = etree.HTML(page.text)
    print(page.text)
    # The first <title> text is treated as the user's name.
    titles = tree.xpath('//title/text()')
    name = titles[0]
    change_status(url)
    save_data(name)
    # Links of the users listed in the footprint box of this profile.
    save_url(tree.xpath('//div[@id="footprint-box"]/ul/li/a/@href'))
def change_status(url):
    """Mark ``url`` as crawled by setting its status to 1 in ``renren_url``."""
    cursor.execute('update renren_url set status=1 where url=%s', (url,))
    conn.commit()
def save_data(data):
    """Insert ``data`` (the crawled user name) as a new row of ``renren_info``."""
    row = (data,)
    cursor.execute('insert into renren_info values (%s)', row)
    conn.commit()
def save_url(urls):
    """Queue every URL in ``urls`` in ``renren_url`` with status '0' (not crawled yet)."""
    insert_sql = 'insert into renren_url values (%s,%s)'
    for link in urls:
        cursor.execute(insert_sql, (link, '0'))
        # NOTE(review): the original snippet lost its indentation; the commit is
        # assumed to run once per inserted row - confirm.
        conn.commit()
def get_url():
    """Crawl the first URL in ``renren_url`` whose status is '0'.

    Raises:
        SystemExit: when no uncrawled URL is left, so the caller's endless
            loop stops with a clear message instead of failing with a
            ``TypeError`` on ``None[0]``.
    """
    # Only one row is consumed per call, so do not fetch the whole queue.
    cursor.execute('select url from renren_url where status=%s limit 1', ('0',))
    row = cursor.fetchone()
    if row is None:
        raise SystemExit('no uncrawled url left in renren_url')
    get_info(row[0])
if __name__ == '__main__':
    # Before the first run, insert the starting profile URL into the database
    # by hand with status 0, for example:
    # url = 'http://www.renren.com/880151247/profile'
    # Crawl one queued URL per iteration until get_url() raises.
    while True:
        get_url()
2.爬到第100个报错。
错误原因:
1.当while 1一直执行get_url()的时候,一直从数据库获取url状态为0的(即未爬取过name的用户)的url并获取信息。
2.当爬取到第100个用户的时候,会出现验证码界面。此时xpath解析 name = ele.xpath('//title/text()')[0] 会报错,因为此时页面没有该xpath的相关信息。
解决思路:
解决验证码的页面问题,验证成功后继续爬取信息。
提供两种解决办法:
1.图像识别,识别验证码
2.掏钱处理识别验证码。
我们选择第二种
百度超级鹰,验证码识别,注册登陆,下载chaojiying.py文件,保存到当前目录下。
#!/usr/bin/env python
# coding:utf-8
import requests
from hashlib import md5
class Chaojiying_Client(object):
    """Minimal HTTP client for the Chaojiying captcha-recognition service."""

    def __init__(self, username, password, soft_id, timeout=30):
        """Store the credentials that are sent with every request.

        Args:
            username: Chaojiying account name.
            password: Plain-text password; only its MD5 hex digest is kept.
            soft_id: Software ID sent as the ``softid`` form field.
            timeout: Seconds to wait for the service before ``requests``
                raises; without it a stalled request would block forever.
        """
        self.username = username
        password = password.encode('utf8')
        self.password = md5(password).hexdigest()
        self.soft_id = soft_id
        self.timeout = timeout
        # Form fields shared by every API call.
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def PostPic(self, im, codetype):
        """Upload a captcha image for recognition and return the JSON reply.

        Args:
            im: Image bytes.
            codetype: Captcha type code, see http://www.chaojiying.com/price.html
        """
        params = {
            'codetype': codetype,
        }
        params.update(self.base_params)
        files = {'userfile': ('ccc.jpg', im)}
        r = requests.post(
            'http://upload.chaojiying.net/Upload/Processing.php',
            data=params,
            files=files,
            headers=self.headers,
            timeout=self.timeout,
        )
        return r.json()

    def ReportError(self, im_id):
        """Report a wrongly recognised captcha and return the JSON reply.

        Args:
            im_id: Image ID of the captcha being reported.
        """
        params = {
            'id': im_id,
        }
        params.update(self.base_params)
        r = requests.post(
            'http://upload.chaojiying.net/Upload/ReportError.php',
            data=params,
            headers=self.headers,
            timeout=self.timeout,
        )
        return r.json()
def chaojiying_check(im):
    """Send the captcha image bytes ``im`` to Chaojiying and return its JSON reply.

    The account name, password and software ID below are placeholders to be
    replaced with real values (the software ID is generated in the Chaojiying
    user centre).
    """
    # 1902 is the captcha type code; see the price list on the official site.
    captcha_type = 1902
    client = Chaojiying_Client('账号', '密码', '902161')
    return client.PostPic(im, captcha_type)
3.打印出验证码
在获取name的地方打印出“人人网,中国领先的实名制SNS社交网络。加入人人网,找到老同学,结识新朋友。”即出现了验证码页面。
将def get_info(url):函数进行修改
def get_info(url):
    """Fetch a profile page, store its title and print the captcha text if challenged.

    The page title is always stored via ``save_data``. When the title is the
    generic site title (the page that carries the captcha), the captcha image
    is downloaded and the recognition result is printed; otherwise ``url`` is
    marked as crawled and its visitor links are queued.
    """
    landing_title = '人人网,中国领先的实名制SNS社交网络。加入人人网,找到老同学,结识新朋友。'
    page = requests.get(url=url, headers=headers)
    tree = etree.HTML(page.text)
    print(page.text)
    name = tree.xpath('//title/text()')[0]
    save_data(name)
    if name == landing_title:
        # Captcha page: download the image and print what Chaojiying recognised.
        captcha_src = tree.xpath("//img[@id='verifyPic_login']/@src")[0]
        captcha = requests.get(url=captcha_src)
        print(chaojiying_check(captcha.content))
        return
    change_status(url)
    # Links of the users listed in the footprint box of this profile.
    save_url(tree.xpath('//div[@id="footprint-box"]/ul/li/a/@href'))
4.处理验证码,发送验证码请求
此时再写一个check_code()的函数用来发送验证码请求。
此时的验证码界面:
随便输入一个验证码,doc中多了一个请求,点开查看,form data里携带了信息。
此时在get_info中处理验证码处,应传入一个url,用于data参数的id。
def get_info(url):
    """Crawl one profile page: store the user's name and queue its visitor links.

    Args:
        url: Profile URL taken from the database queue.

    A page without a <title> (for example a banned user) is only marked as
    crawled. When the captcha page is served instead of the profile, the
    captcha is recognised and submitted, nothing is stored, and ``url`` keeps
    its status so that it is fetched again later.
    """
    # A timeout keeps one stalled connection from freezing the whole crawl.
    response = requests.get(url=url, cookies=cookies, timeout=30)
    ele = etree.HTML(response.text)
    print(url)
    try:
        name = ele.xpath("//title/text()")[0].strip()
    except (IndexError, AttributeError):
        # No parsable title: mark the URL as done so it is not retried forever.
        change_status(url)
        return
    print(name)
    if name == '人人网 - 验证码':
        # Captcha page: do not store its title as if it were a user name.
        img_srcs = ele.xpath('//div[@class="optional"]/img/@src')
        if not img_srcs:
            # No captcha image found; leave the URL queued for a later retry.
            return
        img = requests.get(url=img_srcs[0], cookies=cookies, timeout=30)
        # A copy of the captcha is written to disk, presumably for manual inspection.
        with open('code.jpg', 'wb') as f:
            f.write(img.content)
        check_code(chaojiying_check(img.content), url)
        return
    save_info(name)
    change_status(url)
    save_urls(ele.xpath('//*[@id="footprint-box"]/ul/li/a/@href'))
def check_code(code, url, request_token='380016961', rtk='e267979b'):
    """Submit the recognised captcha so that crawling can continue.

    Args:
        code: JSON reply of ``chaojiying_check``; its ``pic_str`` field is
            used as the captcha text.
        url: Profile URL that triggered the captcha; the fourth
            '/'-separated part of it is sent as the ``id`` form field.
        request_token: Value of the ``requestToken`` form field.
        rtk: Value of the ``_rtk`` form field.

    NOTE(review): the default tokens were copied from one browser request and
    are presumably session-specific - confirm before relying on them.
    """
    check_url = 'http://www.renren.com/validateuser.do'
    captcha_text = code.get('pic_str')
    user_id = url.split('/')[3]
    print(captcha_text)
    print(user_id)
    if not captcha_text:
        # Recognition failed; submitting an empty captcha cannot succeed.
        return
    data = {
        'id': user_id,
        'icode': captcha_text,
        'submit': '继续浏览',
        'requestToken': request_token,
        '_rtk': rtk,
    }
    # A timeout keeps a stalled validation request from blocking the crawl.
    requests.post(url=check_url, data=data, cookies=cookies, timeout=30)
此时就可以模拟输入验证码,并跳过验证码继续爬取信息。
还应注意的其他易错点:
1.有些用户可能被封禁,此时就获取不到name,如果获取不到,将url的状态设为1
try:
name = ele.xpath("//title/text()")[0].strip()
except:
change_status(url)
return
2.保存爬取下来的url保存到数据库时,如果url重复报错,此时添加一个try:except
def save_urls(urls):
    """Queue newly discovered profile URLs in ``renren`` with status '0'.

    A URL that cannot be inserted (typically because it is already queued)
    is skipped so that the remaining URLs are still stored.
    """
    sql = 'insert into renren values (%s,%s)'
    for url in urls:
        try:
            cursor.execute(sql, (url, '0'))
            conn.commit()
        except MySQLdb.IntegrityError:
            # Duplicate URL: it is already queued, nothing to do.
            conn.rollback()
        except MySQLdb.Error as exc:
            # Keep the crawl going, but do not hide unexpected database errors.
            conn.rollback()
            print('failed to save url %s: %s' % (url, exc))
3.出现验证码界面时,打印出的name为“人人网 - 验证码”。此时就应将判断条件修改为:
if name != '人人网 - 验证码':
change_status(url)
urls = ele.xpath('//*[@id="footprint-box"]/ul/li/a/@href')
save_urls(urls)
else:
#处理验证码
img_url = ele.xpath('//div[@class="optional"]/img/@src')[0]
img = requests.get(url=img_url,cookies=cookies)
with open('code.jpg','wb') as f:
f.write(img.content)
check_code(chaojiying_check(img.content),url)
完整代码:
import requests
from lxml import etree
import MySQLdb
from chaojiying import chaojiying_check
# Shared MySQL connection and cursor used by all database helpers in this script.
conn = MySQLdb.connect(
    host='localhost',
    port=3306,
    user='root',
    password='123456',
    db='spider',
    charset='utf8',
)
cursor = conn.cursor()
# Session cookies captured from a logged-in browser, as a name -> value mapping.
# requests expects one dict entry per cookie; storing the whole header string
# under a single 'Cookie' key makes it one cookie named "Cookie".
_RAW_COOKIE = 'anonymid=k2jp0i5rnl37jv; depovince=GW; _r01_=1; JSESSIONID=abcyR51ZFGJjAaI9L1Z4w; ick_login=c29a1ecd-3469-480a-9dd0-3ff0bfdfdc0a; ick=1870e757-b68b-4c44-9538-9d924c241107; XNESSESSIONID=c6697a88940b; jebe_key=0d2c341c-c119-475d-ac9d-1ce985256f12%7Ca944117966fe3b4572c1a7c8cb29c78c%7C1572829229771%7C1%7C1572829232160; jebe_key=0d2c341c-c119-475d-ac9d-1ce985256f12%7Ca944117966fe3b4572c1a7c8cb29c78c%7C1572829229771%7C1%7C1572829232171; wp=1; wp_fold=1; jebecookies=83813c83-122d-462f-a377-48ad0f71b258|||||; _de=6C64ADFC30B6DD05DACEAE940732293E; p=326b188303e865c3565a41a8252b74608; first_login_flag=1; ln_uact=15137171529; ln_hurl=http://head.xiaonei.com/photos/0/0/men_main.gif; t=8588b84627888708b98404a077443e428; societyguester=8588b84627888708b98404a077443e428; id=972774868; xnsid=cc0dfece; ver=7.0; loginfrom=null'
# Split "name=value; name=value" into pairs; a repeated name keeps its last value.
cookies = dict(pair.split('=', 1) for pair in _RAW_COOKIE.split('; '))
def get_info(url):
    """Crawl one profile page: store the user's name and queue its visitor links.

    Args:
        url: Profile URL taken from the database queue.

    A page without a <title> (for example a banned user) is only marked as
    crawled. When the captcha page is served instead of the profile, the
    captcha is recognised and submitted, nothing is stored, and ``url`` keeps
    its status so that it is fetched again later.
    """
    # A timeout keeps one stalled connection from freezing the whole crawl.
    response = requests.get(url=url, cookies=cookies, timeout=30)
    ele = etree.HTML(response.text)
    print(url)
    try:
        name = ele.xpath("//title/text()")[0].strip()
    except (IndexError, AttributeError):
        # No parsable title: mark the URL as done so it is not retried forever.
        change_status(url)
        return
    print(name)
    if name == '人人网 - 验证码':
        # Captcha page: do not store its title as if it were a user name.
        img_srcs = ele.xpath('//div[@class="optional"]/img/@src')
        if not img_srcs:
            # No captcha image found; leave the URL queued for a later retry.
            return
        img = requests.get(url=img_srcs[0], cookies=cookies, timeout=30)
        # A copy of the captcha is written to disk, presumably for manual inspection.
        with open('code.jpg', 'wb') as f:
            f.write(img.content)
        check_code(chaojiying_check(img.content), url)
        return
    save_info(name)
    change_status(url)
    save_urls(ele.xpath('//*[@id="footprint-box"]/ul/li/a/@href'))
def check_code(code, url, request_token='380016961', rtk='e267979b'):
    """Submit the recognised captcha so that crawling can continue.

    Args:
        code: JSON reply of ``chaojiying_check``; its ``pic_str`` field is
            used as the captcha text.
        url: Profile URL that triggered the captcha; the fourth
            '/'-separated part of it is sent as the ``id`` form field.
        request_token: Value of the ``requestToken`` form field.
        rtk: Value of the ``_rtk`` form field.

    NOTE(review): the default tokens were copied from one browser request and
    are presumably session-specific - confirm before relying on them.
    """
    check_url = 'http://www.renren.com/validateuser.do'
    captcha_text = code.get('pic_str')
    user_id = url.split('/')[3]
    print(captcha_text)
    print(user_id)
    if not captcha_text:
        # Recognition failed; submitting an empty captcha cannot succeed.
        return
    data = {
        'id': user_id,
        'icode': captcha_text,
        'submit': '继续浏览',
        'requestToken': request_token,
        '_rtk': rtk,
    }
    # A timeout keeps a stalled validation request from blocking the crawl.
    requests.post(url=check_url, data=data, cookies=cookies, timeout=30)
def save_info(data):
    """Insert ``data`` (the crawled user name) as a new row of ``renren_info``."""
    row = (data,)
    cursor.execute('insert into renren_info values (%s)', row)
    conn.commit()
def change_status(url):
    """Mark ``url`` as crawled by setting its status to 1 in ``renren``."""
    cursor.execute('update renren set status=1 where url=%s', (url,))
    conn.commit()
def save_urls(urls):
    """Queue newly discovered profile URLs in ``renren`` with status '0'.

    A URL that cannot be inserted (typically because it is already queued)
    is skipped so that the remaining URLs are still stored.
    """
    sql = 'insert into renren values (%s,%s)'
    for url in urls:
        try:
            cursor.execute(sql, (url, '0'))
            conn.commit()
        except MySQLdb.IntegrityError:
            # Duplicate URL: it is already queued, nothing to do.
            conn.rollback()
        except MySQLdb.Error as exc:
            # Keep the crawl going, but do not hide unexpected database errors.
            conn.rollback()
            print('failed to save url %s: %s' % (url, exc))
def get_url():
    """Crawl the first URL in ``renren`` whose status is '0'.

    Raises:
        SystemExit: when no uncrawled URL is left, so the caller's endless
            loop stops with a clear message instead of failing with a
            ``TypeError`` on ``None[0]``.
    """
    # Only one row is consumed per call, so do not fetch the whole queue.
    cursor.execute('select url from renren where status=%s limit 1', ('0',))
    row = cursor.fetchone()
    if row is None:
        raise SystemExit('no uncrawled url left in renren')
    get_info(row[0])
if __name__ == '__main__':
    # Example of a starting profile URL:
    # url = 'http://www.renren.com/880151247/profile'
    # Crawl one queued URL per iteration until get_url() raises.
    while True:
        get_url()
讲的不太详细,请多见谅