Python3利用Cookie爬取数据

Python3利用Cookie爬取数据

# Scrape a Renren profile page using a logged-in session cookie,
# store the extracted profile name into an Excel sheet, and save the raw HTML.
import re
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
# Load the spreadsheet that will receive the scraped data.
ex = pd.read_excel("renren.xlsx")

# Profile page to scrape.
url = 'http://www.renren.com/974704130'
# Cookie header copied from a logged-in browser session.
cookie = 'anonymid=kc6aif1n-6vdfd; _r01_=1; taihe_bi_sdk_uid=34bcce7c916e863e3d6cabb3ee7ac3fa; jebe_key=c136167c-b611-4413-b252-b1403a24d9fd%7C7f5f8fa9961b748d50f484778bf79161%7C1593827624050%7C1%7C1593827624660; _de=9E1AC321F7FA9FEDCA9229A0637FD32C; jebe_key=c136167c-b611-4413-b252-b1403a24d9fd%7C7f5f8fa9961b748d50f484778bf79161%7C1593827624050%7C1%7C1595158554099; depovince=GW; ln_uact=18323168513; ln_hurl=http://head.xiaonei.com/photos/0/0/men_main.gif; jebecookies=e4786d3b-9562-45ef-8d7a-800153ae3a7a|||||; JSESSIONID=abc5E8i6DGiE9c61u7Tqx; ick_login=50792f89-0782-44d7-89b3-374a924fbfb7; taihe_bi_sdk_session=5eb865a86dd69e61744cb003a3c28c30; p=eb1fff3f27e61458636896116670343a0; first_login_flag=1; t=309f67438391733bb9242201bf2f04c20; societyguester=309f67438391733bb9242201bf2f04c20; id=974704130; xnsid=d2129709; ver=7.0; loginfrom=null; wp_fold=0'
# Masquerade as a regular browser so the server does not reject the request.
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18362',
}
# Parse the cookie header into a dict.  Split on the FIRST '=' only:
# cookie values may themselves contain '=' (e.g. base64 padding), and
# split("=")[-1] would silently discard everything before the last '='.
cookie_dict = {k: v for k, v in (item.split("=", 1) for item in cookie.split("; "))}
# Fetch the page; a timeout prevents the script from hanging forever on a
# dead connection.  Decode the raw bytes explicitly as UTF-8.
res = requests.get(url=url, headers=headers, cookies=cookie_dict, timeout=10).content.decode("utf-8")
#print(res)
soup = bs(res, 'lxml')
# Grab the <a class="hd-name"> tag(s) that hold the profile name.
b = soup.find_all('a', class_='hd-name')
# Pull the link text out of the stringified tag list with a regex.
c = re.findall(r'<a class=".*?">(.*?)</a>', str(b))
# Store the result in row 0, column 0.  Join the matches into one string:
# assigning a raw list to a single cell raises in modern pandas.
ex.loc[0, 0] = ", ".join(c)
# index=False is the documented bool spelling (index=None only worked
# because None happens to be falsy).
ex.to_excel("renren.xlsx", index=False)
print(c)
# Save the fetched HTML for offline inspection.
# NOTE(review): filename "reren.html" looks like a typo for "renren.html";
# kept as-is to preserve the original output path — confirm before renaming.
with open("reren.html", "w", encoding="utf-8") as f:
    f.write(res)

1 获取网页

#需要爬取的网页
url = 'http://www.renren.com/974704130'
#获取的cookie
cookie = 'anonymid=kc6aif1n-6vdfd; _r01_=1; taihe_bi_sdk_uid=34bcce7c916e863e3d6cabb3ee7ac3fa; jebe_key=c136167c-b611-4413-b252-b1403a24d9fd%7C7f5f8fa9961b748d50f484778bf79161%7C1593827624050%7C1%7C1593827624660; _de=9E1AC321F7FA9FEDCA9229A0637FD32C; jebe_key=c136167c-b611-4413-b252-b1403a24d9fd%7C7f5f8fa9961b748d50f484778bf79161%7C1593827624050%7C1%7C1595158554099; depovince=GW; ln_uact=18323168513; ln_hurl=http://head.xiaonei.com/photos/0/0/men_main.gif; jebecookies=e4786d3b-9562-45ef-8d7a-800153ae3a7a|||||; JSESSIONID=abc5E8i6DGiE9c61u7Tqx; ick_login=50792f89-0782-44d7-89b3-374a924fbfb7; taihe_bi_sdk_session=5eb865a86dd69e61744cb003a3c28c30; p=eb1fff3f27e61458636896116670343a0; first_login_flag=1; t=309f67438391733bb9242201bf2f04c20; societyguester=309f67438391733bb9242201bf2f04c20; id=974704130; xnsid=d2129709; ver=7.0; loginfrom=null; wp_fold=0'
#伪装成浏览器
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18362',
}
#cookie切片
cookie_dict = {i.split("=")[0]: i.split("=")[-1] for i in cookie.split("; ")}
#获取网页
res = requests.get(url=url, headers=headers, cookies=cookie_dict).content.decode("utf-8")

通过登录人人网后获取的cookie来抓取网页信息,url是我们需要爬取的网页地址。
2 获取数据

b = soup.find_all('a', class_='hd-name')

利用soup找到网页中标签为a,class属性为hd-name的内容,获取的结果如下。
[<a class="hd-name" href="http://www.renren.com/974704130/profile" title="舒服">舒服</a>]
通过re正则表达式匹配得到我们需要的内容:

c = re.findall(r'<a class=".*?">(.*?)</a>', str(b))

获取得到内容:舒服
3 获取数据保存到表格

ex = pd.read_excel("renren.xlsx")

导入表格。

ex.loc[0, 0] = c

将获取的数据保存到第0行0列。

ex.to_excel("renren.xlsx", index=None)

保存表格。
4 将获取的网页保存为HTML

with open("reren.html", "w", encoding="utf-8") as f:
    f.write(res)
  • 0
    点赞
  • 6
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值