import requests
from lxml import etree
class Renrenlogin(object):
    """Log in to renren.com and print one field scraped from a profile page.

    The login form is POSTed first so that the cookies returned by the site
    are kept on a shared ``requests`` session; the profile page is then
    fetched with that same session.
    """

    # XPath of the text node(s) printed by ``parse_html``.
    # NOTE(review): the original code stores the match in a variable called
    # ``school`` -- presumably the school shown on the profile; confirm
    # against the live page markup.
    SCHOOL_XPATH = '//*[@id="operate_area"]/div[1]/ul/li[1]/span/text()'

    def __init__(self,
                 post_url='http://www.renren.com/PLogin.do',
                 get_url='http://www.renren.com/974686948/profile',
                 timeout=10):
        """Create the spider and its session.

        Args:
            post_url: URL found in the ``action`` attribute of the login form.
            get_url: URL of the page to scrape (a renren personal profile).
            timeout: Seconds passed as ``timeout`` to every HTTP request, so
                a stalled server cannot block the script forever.
        """
        self.post_url = post_url
        self.get_url = get_url
        self.timeout = timeout
        # One session for every request, so login cookies are reused.
        self.session = requests.session()

    def _login(self, user, password):
        """POST the credentials; the session keeps any cookies it receives.

        Raises:
            requests.HTTPError: if the final response has a 4xx/5xx status.
        """
        payload = {
            "email": user,
            "password": password,
        }
        response = self.session.post(
            url=self.post_url, data=payload, timeout=self.timeout
        )
        response.raise_for_status()

    def _fetch_school(self):
        """GET the profile page and return the strings matching SCHOOL_XPATH.

        Returns:
            The list produced by the XPath query; an empty list when the
            response body is empty or cannot be turned into a document.

        Raises:
            requests.HTTPError: if the response has a 4xx/5xx status.
        """
        response = self.session.get(url=self.get_url, timeout=self.timeout)
        response.raise_for_status()
        html = response.text
        # An empty body has nothing to query; depending on the lxml version
        # parsing it may raise or yield None, so it is short-circuited here.
        if not html.strip():
            return []
        tree = etree.HTML(html)
        if tree is None:
            return []
        return tree.xpath(self.SCHOOL_XPATH)

    def parse_html(self):
        """Prompt for credentials, log in, then scrape and print the result.

        POST first (login), GET second (scrape). Returns ``None``.
        """
        # Imported here so this class stays self-contained; getpass keeps
        # the password from being echoed to the terminal.
        from getpass import getpass

        user = input("请输入人人网用户名:")
        password = getpass("请输入人人网密码:")
        self._login(user, password)
        school = self._fetch_school()
        print(school)

    def run(self):
        """Entry point: run the login-and-scrape sequence once."""
        self.parse_html()
if __name__ == '__main__':
    # Script entry point: build the spider and start it immediately.
    Renrenlogin().run()
爬取人人网个人主页
最新推荐文章于 2021-02-22 21:02:01 发布