目录
引入包
import json
import re

import requests
from lxml import etree
表单登录,爬取网页
表单模拟登录人人网,访问某个人的主页(如包贝尔主页)
def login(login_url):
    """POST the profile-logger form to ``login_url`` and return the response body.

    The request goes through the module-level session ``s`` and carries a
    fixed User-Agent plus a hard-coded Cookie header.

    Args:
        login_url: URL the form data is posted to.

    Returns:
        The response body decoded as UTF-8, or ``""`` when the request fails
        (connection problem, timeout, or HTTP error status). The error is
        printed in that case.
    """
    # NOTE(review): the Cookie is a captured browser session embedded in
    # source; presumably it expires - consider loading it from configuration.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36',
        'Cookie': 'anonymid=k9l0ubc1-jzfyew; depovince=GW; _r01_=1; taihe_bi_sdk_uid=f093e8f64592e2ed9d3e19664245d525; _de=22E211AC7A0BA7EC784B16E8BE562FAC; ln_hurl=http://hdn.xnimg.cn/photos/hdn321/20160702/1930/main_X0zS_b00300000dd8195a.jpg; jebe_key=47666e08-5ec9-4010-baa2-9a79a0d69af3%7C2769fe7ead2272a870f6f8bf452111f5%7C1588145587644%7C1%7C1588145587084; jebe_key=47666e08-5ec9-4010-baa2-9a79a0d69af3%7C2769fe7ead2272a870f6f8bf452111f5%7C1588145587644%7C1%7C1588145587089; wp=0; jebecookies=dbcb88d3-f860-48f5-a9a3-6ea420c1793d|||||; JSESSIONID=abcpt1WIoE_jx6a0CLjhx; ick_login=0f9e7723-f398-43a6-ac46-fc5f8c44c99b; taihe_bi_sdk_session=d0d0e4e1e06cd0c2f4155cd19738b8c4; p=887fe31bb0da5156873e6709c7fe15939; first_login_flag=1; t=b5d97f97bc39b05d5a2566e50f8ddf5b9; societyguester=b5d97f97bc39b05d5a2566e50f8ddf5b9; id=546335979; loginfrom=syshome; wp_fold=0; ln_uact=18332813055; xnsid=365b4f73'
    }
    log_entries = [{
        "sourceTag": "default",
        "actionTag": "load",
        "targetTag": "timeline_feed_retrieve",
        "needRecordRelation": True,
        "sendUserId": "546335979",
        "getUserId": "880792860",
    }]
    fromdata = {
        # Send the log as one JSON string. A raw list of dicts is not
        # form-encodable as-is: the encoder would iterate the dict and emit
        # only its keys, dropping every value.
        'log': json.dumps(log_entries, separators=(',', ':')),
        'requestToken': '1141648272',
        '_rtk': 'fcd29ea'
    }
    try:
        r = s.post(login_url, headers=headers, data=fromdata, timeout=10)
        r.raise_for_status()
    except requests.RequestException as e:
        # Only request-level failures are treated as "no result"; programming
        # errors are allowed to surface.
        print(e)
        return ""
    else:
        r.encoding = 'utf-8'
        return r.text
def get_page(url):
    """GET ``url`` through the module-level session ``s`` and return its body.

    The request carries a fixed User-Agent and a hard-coded Cookie header.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body decoded as UTF-8, or ``""`` when the request fails
        (connection problem, timeout, or HTTP error status). The error is
        printed in that case.
    """
    # NOTE(review): the Cookie is a captured browser session embedded in
    # source; presumably it expires - consider loading it from configuration.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36',
        'Cookie': 'anonymid=k9l0ubc1-jzfyew; depovince=GW; _r01_=1; taihe_bi_sdk_uid=f093e8f64592e2ed9d3e19664245d525; _de=22E211AC7A0BA7EC784B16E8BE562FAC; ln_hurl=http://hdn.xnimg.cn/photos/hdn321/20160702/1930/main_X0zS_b00300000dd8195a.jpg; jebe_key=47666e08-5ec9-4010-baa2-9a79a0d69af3%7C2769fe7ead2272a870f6f8bf452111f5%7C1588145587644%7C1%7C1588145587084; jebe_key=47666e08-5ec9-4010-baa2-9a79a0d69af3%7C2769fe7ead2272a870f6f8bf452111f5%7C1588145587644%7C1%7C1588145587089; wp=0; jebecookies=dbcb88d3-f860-48f5-a9a3-6ea420c1793d|||||; JSESSIONID=abcpt1WIoE_jx6a0CLjhx; ick_login=0f9e7723-f398-43a6-ac46-fc5f8c44c99b; taihe_bi_sdk_session=d0d0e4e1e06cd0c2f4155cd19738b8c4; p=887fe31bb0da5156873e6709c7fe15939; first_login_flag=1; t=b5d97f97bc39b05d5a2566e50f8ddf5b9; societyguester=b5d97f97bc39b05d5a2566e50f8ddf5b9; id=546335979; loginfrom=syshome; wp_fold=0; ln_uact=18332813055; xnsid=36312eaa'
    }
    try:
        r = s.get(url, headers=headers, timeout=10)
        r.raise_for_status()
    except requests.RequestException as e:
        # Only request-level failures are treated as "no result"; programming
        # errors are allowed to surface.
        print(e)
        return ""
    else:
        r.encoding = 'utf-8'
        return r.text
xpath数据解析
解析其个人信息(姓名,代表作)
def parse_page(html):
    """Extract the profile name and authentication text from profile HTML.

    The name is the first text node of ``<h1 class="avatar_title">`` and the
    info is the first text node of ``<p class="authentication">``; all
    whitespace is removed from both.

    Args:
        html: HTML source of the profile page.

    Returns:
        A ``(user, info)`` tuple of strings. A field is ``""`` when the page
        is empty or the corresponding element is not present.
    """
    # get_page() hands back "" on failure; an empty document has nothing to
    # parse, so answer with empty fields instead of failing further down.
    if not html or not html.strip():
        return "", ""

    dom = etree.HTML(html)
    if dom is None:
        return "", ""

    def first_text(xpath_expr):
        # Only the first matching text node is used; fall back to "" when the
        # element is missing rather than indexing into an empty list.
        nodes = dom.xpath(xpath_expr)
        return re.sub(r'\s', '', nodes[0]) if nodes else ""

    user = first_text('//h1[@class="avatar_title"]/text()')
    info = first_text('//p[@class="authentication"]/text()')
    return user, info
总函数
将结果直接打印输出即可
人人网登录网址:http://www.renren.com
包贝尔主页网址:http://www.renren.com/880792860/profile
if __name__ == '__main__':
    # Endpoint the form data is posted to.
    login_url = "http://www.renren.com/profileLogger/send"
    # Profile page requested after the POST.
    url = "http://www.renren.com/880792860/profile"

    # `s` is the module-level session that login() and get_page() use; the
    # context manager closes it once both requests are done.
    with requests.Session() as s:
        html_login = login(login_url)
        print(html_login)
        # Fetch the profile page through the same session.
        html = get_page(url)

    data = parse_page(html)
    print('演员:{}\n个人信息:{}'.format(data[0],data[1]))