Python3利用Cookie爬取数据
# Scrape a Renren profile page using a logged-in session cookie:
# fetch the page, pull the account display name out of the
# <a class="hd-name"> tag, write it into renren.xlsx, and keep a
# copy of the raw HTML for inspection.
import re
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd

# Spreadsheet the scraped name will be written into.
ex = pd.read_excel("renren.xlsx")
# Profile page to scrape.
url = 'http://www.renren.com/974704130'
# Raw Cookie header copied from a logged-in browser session.
cookie = 'anonymid=kc6aif1n-6vdfd; _r01_=1; taihe_bi_sdk_uid=34bcce7c916e863e3d6cabb3ee7ac3fa; jebe_key=c136167c-b611-4413-b252-b1403a24d9fd%7C7f5f8fa9961b748d50f484778bf79161%7C1593827624050%7C1%7C1593827624660; _de=9E1AC321F7FA9FEDCA9229A0637FD32C; jebe_key=c136167c-b611-4413-b252-b1403a24d9fd%7C7f5f8fa9961b748d50f484778bf79161%7C1593827624050%7C1%7C1595158554099; depovince=GW; ln_uact=18323168513; ln_hurl=http://head.xiaonei.com/photos/0/0/men_main.gif; jebecookies=e4786d3b-9562-45ef-8d7a-800153ae3a7a|||||; JSESSIONID=abc5E8i6DGiE9c61u7Tqx; ick_login=50792f89-0782-44d7-89b3-374a924fbfb7; taihe_bi_sdk_session=5eb865a86dd69e61744cb003a3c28c30; p=eb1fff3f27e61458636896116670343a0; first_login_flag=1; t=309f67438391733bb9242201bf2f04c20; societyguester=309f67438391733bb9242201bf2f04c20; id=974704130; xnsid=d2129709; ver=7.0; loginfrom=null; wp_fold=0'
# Pretend to be a regular desktop browser so the server serves the page.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18362',
}
# Parse the "k=v; k=v" cookie string into a dict. Split each pair on the
# FIRST '=' only: cookie values may themselves contain '=', which the
# previous split("=")[-1] would silently truncate to the last segment.
cookie_dict = {k: v for k, v in
               (item.split("=", 1) for item in cookie.split("; "))}
# Download the page. Fail loudly on an HTTP error status instead of
# parsing an error page, and don't hang forever waiting on the server.
response = requests.get(url=url, headers=headers, cookies=cookie_dict,
                        timeout=10)
response.raise_for_status()
res = response.content.decode("utf-8")
soup = bs(res, 'lxml')
# Every <a class="hd-name"> tag holds the account's display name.
b = soup.find_all('a', class_='hd-name')
# Extract the tag text with BeautifulSoup's own API — more robust than
# running a regex over the serialized tags.
c = [a.get_text() for a in b]
# Store a plain string in the cell: assigning a list to a single cell
# raises ValueError in recent pandas versions.
ex.loc[0, 0] = ", ".join(c)
ex.to_excel("renren.xlsx", index=False)
print(c)
# Keep the raw HTML for debugging.
# NOTE(review): "reren.html" looks like a typo for "renren.html" —
# kept as-is so existing tooling that reads this path still works.
with open("reren.html", "w", encoding="utf-8") as f:
    f.write(res)
1 获取网页
#需要爬取的网页
url = 'http://www.renren.com/974704130'
#获取的cookie
cookie = 'anonymid=kc6aif1n-6vdfd; _r01_=1; taihe_bi_sdk_uid=34bcce7c916e863e3d6cabb3ee7ac3fa; jebe_key=c136167c-b611-4413-b252-b1403a24d9fd%7C7f5f8fa9961b748d50f484778bf79161%7C1593827624050%7C1%7C1593827624660; _de=9E1AC321F7FA9FEDCA9229A0637FD32C; jebe_key=c136167c-b611-4413-b252-b1403a24d9fd%7C7f5f8fa9961b748d50f484778bf79161%7C1593827624050%7C1%7C1595158554099; depovince=GW; ln_uact=18323168513; ln_hurl=http://head.xiaonei.com/photos/0/0/men_main.gif; jebecookies=e4786d3b-9562-45ef-8d7a-800153ae3a7a|||||; JSESSIONID=abc5E8i6DGiE9c61u7Tqx; ick_login=50792f89-0782-44d7-89b3-374a924fbfb7; taihe_bi_sdk_session=5eb865a86dd69e61744cb003a3c28c30; p=eb1fff3f27e61458636896116670343a0; first_login_flag=1; t=309f67438391733bb9242201bf2f04c20; societyguester=309f67438391733bb9242201bf2f04c20; id=974704130; xnsid=d2129709; ver=7.0; loginfrom=null; wp_fold=0'
#伪装成浏览器
headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18362',
}
#cookie切片
cookie_dict = {i.split("=")[0]: i.split("=")[-1] for i in cookie.split("; ")}
#获取网页
res = requests.get(url=url, headers=headers, cookies=cookie_dict).content.decode("utf-8")
通过登录人人网获取得到的cookie获取网页信息,url是我们需要爬取的网页地址。
2 获取数据
b = soup.find_all('a', class_='hd-name')
利用soup找到网页中标签为a,class属性为hd-name的内容,获取的结果如下。
[<a class="hd-name" href="http://www.renren.com/974704130/profile" title="舒服">舒服</a>]
通过re正则表达式匹配得到我们需要的内容:
c = re.findall(r'<a class=".*?">(.*?)</a>', str(b))
获取得到内容:舒服
3 获取数据保存到表格
ex = pd.read_excel("renren.xlsx")
导入表格。
ex.loc[0, 0] = c
将获取的数据保存到第0行0列。
ex.to_excel("renren.xlsx", index=None)
保存表格。
4 将获取的网页保存为HTML
with open("reren.html", "w", encoding="utf-8") as f:
f.write(res)