Python数据分析___爬虫—3
1 爬虫
1.1 requests
安装requests模块:pip install requests
使用requests:
import requests

# Fetch a Baidu search result page for a user-supplied query and save it.
url = "http://www.baidu.com/s?"
wd = input("请输入你想查询的内容:")

# First form: build the query string by hand.
# response = requests.get(url + "wd=" + wd,
#     headers={"User-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36"})

# Second form: pass `params` and let requests URL-encode the query for us.
params = {"wd": wd}
response = requests.get(
    url=url,
    headers={"User-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36"},
    params=params,
)
content = response.content  # raw response body as bytes
# Write the bytes out so the page can be opened in a browser for inspection.
with open("自定义请求头.html", "wb") as f:
    f.write(content)
1.2 爬取豆瓣电影
import requests
from pprint import pprint
import json

# Page through Douban's "hot movies" JSON API and print title + rating.
url = "https://movie.douban.com/j/search_subjects"
headers = {"User-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36"}

# NOTE(review): page_limit is "100" but page_start advances by 20, so
# consecutive requests overlap heavily — confirm the intended step size.
for page_start in range(0, 1000, 20):
    params = {
        "type": "movie",
        "tag": "热门",
        "sort": "recommend",
        "page_limit": "100",
        "page_start": page_start,
    }
    response = requests.get(
        url=url,
        headers=headers,
        params=params,
    )
    # response.json() decodes the UTF-8 body and parses the JSON in one
    # step, replacing the manual content.decode() + json.loads() dance.
    results = response.json()
    for movie in results["subjects"]:
        print(movie['title'], movie['rate'])
1.3 爬百度贴吧
# https://tieba.baidu.com/f?ie=utf-8&kw=%E6%9D%8E%E6%AF%85%E5%90%A7&fr=search
#ie: utf-8
#kw: 李毅吧
#fr: search
#pn: 0 以50累加
import requests
from pprint import pprint
# User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36
class TiebaSpider():
    """Crawl result pages of one Baidu Tieba forum and save each as HTML.

    kw: forum keyword to search for.
    max_page: exclusive upper bound on the `pn` offset (Tieba steps by 50).
    """

    def __init__(self, kw, max_page):
        self.kw = kw
        self.max_page = max_page
        self.base_url = "https://tieba.baidu.com/f?kw={}&ie=utf-8&pn={}"
        self.headers = {"User-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36"}

    def get_url_list(self):
        """Return the page URLs to crawl (pn = 0, 50, 100, ...)."""
        return [self.base_url.format(self.kw, page)
                for page in range(0, self.max_page, 50)]

    def get_content(self, url):
        """Fetch one page and return the raw response body as bytes."""
        response = requests.get(url=url, headers=self.headers)
        return response.content

    def write_items(self, content, index):
        """Persist one page's bytes to '百度贴吧-<index>.html'."""
        with open("百度贴吧-{}.html".format(index), "wb") as f:
            f.write(content)

    def save_items(self, items):
        # Placeholder for a future storage backend (e.g. a database).
        pass

    def run(self):
        # 1: build the URL list
        # 2: request each page
        # 3: write the raw HTML to disk
        # 4: hand items to save_items (not implemented yet)
        url_list = self.get_url_list()
        # enumerate replaces url_list.index(url), which was O(n) per
        # iteration and wrong if the same URL ever appeared twice.
        for index, url in enumerate(url_list, start=1):
            content = self.get_content(url)
            items = self.write_items(content, index)
            self.save_items(items)


if __name__ == "__main__":
    tiebaSpider = TiebaSpider("DOTA", 150)
    tiebaSpider.run()
1.4 requests的使用——获取源码字符串
import requests

# Demonstrates reading a response body as a decoded string via .text.
url = "http://www.baidu.com/s?"
headers = {"User-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36"}
response = requests.get(url=url, headers=headers)
# Set the encoding explicitly so .text decodes the bytes as UTF-8.
response.encoding = "utf-8"
content = response.text  # page source as a str, not bytes
print(content)
1.5 requests的使用——查看基本信息
import requests

# Inspect the metadata requests exposes on a Response object.
url = "http://www.baidu.com/s?"
headers = {"User-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36"}
response = requests.get(url=url, headers=headers)
# Decode the body as UTF-8 before any .text access.
response.encoding = "utf-8"
content = response.text
# print(content)
print(response.request.url)      # URL actually sent (after param encoding)
print(response.request.headers)  # headers actually sent
print(response.status_code)      # HTTP status code
print(response.url)              # final URL (after any redirects)
print(response.headers)          # response headers from the server
1.6 代理服务器
第二种方案,使用代理服务器
import requests

# Route the request through an HTTP proxy server.
url = "http://www.baidu.com/"
proxies = {"http": "http://183.129.244.16:11492/"}
headers = {"User-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36"}
response = requests.get(url=url, headers=headers, proxies=proxies)
print(response.text)
1.7 访问需要登录的页面
第一种方案,先在网页上登录,拷贝cookie
import requests

# Access a login-protected page by replaying a cookie copied from the browser.
# SECURITY NOTE: a real GitHub session cookie is hard-coded below — treat it
# as a secret; anyone with it can act as the logged-in user.
url = "https://github.com/settings/profile"
headers = {
    "User-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36",
    "Cookie": "_octo=GH1.1.846992610.1565078944; _ga=GA1.2.1649463393.1565078957; _device_id=54ad067f0bed35076fb0dff0920224dc; has_recent_activity=1; _gat=1; tz=Asia%2FShanghai; user_session=FKrjlv-r2XR7tePUeoJ7wfPsUAV2K-3LOrl0R-bCja3mFymn; __Host-user_session_same_site=FKrjlv-r2XR7tePUeoJ7wfPsUAV2K-3LOrl0R-bCja3mFymn; logged_in=yes; dotcom_user=huolingyi; _gh_sess=akUxczkxN0lSVFl4TDJZWWJUT1hyT2xSbk1rWjFLRmdwZmMvUUVYeGhickhrcEpzWDNPTCt0R25ZVE5uSElWUFE2S0RaNmsvMHYra2hrSnVTYW9iRi92dVh3TFlSTmlYeGU4c3BDTU5oYmdCeVZybEhHSEVYNytjRGo5elB6VXExblA4eUw5S2xDSDloaHNtajJvV3I0T2hGc3hBMU5FTHBOZHp0WGRSaEN2TTI4YXlRN2c5NzJjc3FhcGYwOTk2R1ZENm82eVhvK1E2dHZzRHlqWllMVlR1U3luaWRjSXd2QWdEdXlQVjZnd2kxanB6NE5wNU1GV0dFbTVvWmNDUy0taHdCc2tEZWNiV3AxSjd6cjhWRHRhZz09--46f2a85d04a3cd2d4d353b514ae6e2f559c61adf"
}
response = requests.get(
    url=url,
    headers=headers,
)
# Save the page so the logged-in state can be verified in a browser.
with open("解决cookie的方案.html", "wb") as f:
    f.write(response.content)
第二种方案,直接在程序中模拟登录
import requests

# Step 1: log in by POSTing the sign-in form to /session.
# SECURITY NOTE: a real username, password and CSRF token are hard-coded
# below — treat them as secrets and rotate them; never commit this file.
url = "https://github.com/session"
headers = {
    "User-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36",
    "Cookie": "_octo=GH1.1.846992610.1565078944; _ga=GA1.2.1649463393.1565078957; _device_id=54ad067f0bed35076fb0dff0920224dc; has_recent_activity=1; tz=Asia%2FShanghai; _gat=1; logged_in=no; _gh_sess=bmdYc2orMjRyNzlFSGFubGZDRk1GY1dESVJOeU1QdldlYnhFYWo2S3JsajY5RnpibVlwVWJDeUhkZ0VPRnh5SlZrUFVNbWFuZXJiMmhPMXlUVEZVc1NtemE4dUgxRFpTeVhZeDRQZ0g5cnVPMk1GeUhCdEpvUTZiVnZEV00vcFdZUzB5dy9QOXFkYk5hU3pFbTZFcHVRL25jSWg3dG1XTWpVTEoxOWR5aWN0U3pzUDJhSW44L3dBTXVZQUY1MllTcmlab2tkS245aVdyZ1FMOEdjdXl0ZVM2OXA3azJpY1hUWGZiRjJMRnI1b2EzaFVTZjU5cDEzR2Y3YlRvR28yakNsU1RBcjRXNHRjN0lrU1JzY3BCVUlNNC9yTHNaRThUbm1FYXpzSktuRVg3OXloZ1ZqQW53R2szbENOd2dhU0UyRVpjZ25SczQ0REE1VytLSUFFUG5nM281OEx0dFhna3Z2Wjh6TkZKUURFdjk2Z3ZqNzFNc0hTVVFaRC9iTEdHbHpURlpiTmcwQXdRemNKVThwbGkzOC9MSSt0SERja0lVcXZRK0hWR3ZZTWt4enhwOVpUMjcwZ0haOThDa3VFMDQ4MGJkZndBa01TMEgyVk5UbzJCMG92bzAwZUxiNCtBWFZDb0NMWk1rQnkveW9wWG1rL0p3Z1FScERjR0FSbzctLWpBbmVyckl3RUpGNDZveUZjT1g1bUE9PQ%3D%3D--54a4e8cc7c03f16cbdc5cf29a6dfe8cc6b7c9d61"
}
data = {
    "commit": "Sign in",
    "utf8": "✓",
    "authenticity_token": "HZxdsq9TIOUPN8APM/6OG9vDCyb5JdkAgWQ0PPg3G9wksmZA/S+L5o6p3FC83IUkkdn4/QrpWWWllEakhLKwOA==",
    "login": "huolingyi",
    "password": "jiayou110407",
    "webauthn-support": "supported",
    "required_field_e712": "",
    "timestamp": "1568104579694",
    "timestamp_secret": "20ba5ba60967073b9d41dc70356cda6112f2fa6b621e7ef342c15a9b35e67d1b",
}
response = requests.post(
    url=url,
    headers=headers,
    data=data,
)
# Grab the cookies issued by the login response.
cookies = response.cookies
# Step 2: reuse those cookies to reach a page that requires authentication.
url = "https://github.com/settings/profile"
headers = {
    "User-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36"
}
response = requests.get(
    url=url,
    headers=headers,
    cookies=cookies,
)
with open("解决cookie的方案.html", "wb") as f:
    f.write(response.content)
第三种方案,使用requests.session(这个比较好)
import requests

# A Session object stores cookies internally and sends them automatically
# on every subsequent request made through the same session.
session = requests.session()

# Step 1: log in; the session keeps whatever cookies the server sets.
# SECURITY NOTE: real credentials and a CSRF token are hard-coded below —
# treat them as secrets and rotate them; never commit this file.
url = "https://github.com/session"
headers = {
    "User-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36",
    "Cookie": "_octo=GH1.1.846992610.1565078944; _ga=GA1.2.1649463393.1565078957; _device_id=54ad067f0bed35076fb0dff0920224dc; has_recent_activity=1; tz=Asia%2FShanghai; _gat=1; logged_in=no; _gh_sess=bmdYc2orMjRyNzlFSGFubGZDRk1GY1dESVJOeU1QdldlYnhFYWo2S3JsajY5RnpibVlwVWJDeUhkZ0VPRnh5SlZrUFVNbWFuZXJiMmhPMXlUVEZVc1NtemE4dUgxRFpTeVhZeDRQZ0g5cnVPMk1GeUhCdEpvUTZiVnZEV00vcFdZUzB5dy9QOXFkYk5hU3pFbTZFcHVRL25jSWg3dG1XTWpVTEoxOWR5aWN0U3pzUDJhSW44L3dBTXVZQUY1MllTcmlab2tkS245aVdyZ1FMOEdjdXl0ZVM2OXA3azJpY1hUWGZiRjJMRnI1b2EzaFVTZjU5cDEzR2Y3YlRvR28yakNsU1RBcjRXNHRjN0lrU1JzY3BCVUlNNC9yTHNaRThUbm1FYXpzSktuRVg3OXloZ1ZqQW53R2szbENOd2dhU0UyRVpjZ25SczQ0REE1VytLSUFFUG5nM281OEx0dFhna3Z2Wjh6TkZKUURFdjk2Z3ZqNzFNc0hTVVFaRC9iTEdHbHpURlpiTmcwQXdRemNKVThwbGkzOC9MSSt0SERja0lVcXZRK0hWR3ZZTWt4enhwOVpUMjcwZ0haOThDa3VFMDQ4MGJkZndBa01TMEgyVk5UbzJCMG92bzAwZUxiNCtBWFZDb0NMWk1rQnkveW9wWG1rL0p3Z1FScERjR0FSbzctLWpBbmVyckl3RUpGNDZveUZjT1g1bUE9PQ%3D%3D--54a4e8cc7c03f16cbdc5cf29a6dfe8cc6b7c9d61"
}
data = {
    "commit": "Sign in",
    "utf8": "✓",
    "authenticity_token": "HZxdsq9TIOUPN8APM/6OG9vDCyb5JdkAgWQ0PPg3G9wksmZA/S+L5o6p3FC83IUkkdn4/QrpWWWllEakhLKwOA==",
    "login": "huolingyi",
    "password": "jiayou110407",
    "webauthn-support": "supported",
    "required_field_e712": "",
    "timestamp": "1568104579694",
    "timestamp_secret": "20ba5ba60967073b9d41dc70356cda6112f2fa6b621e7ef342c15a9b35e67d1b",
}
session.post(
    url=url,
    headers=headers,
    data=data,
)

# Step 2: the session resends the login cookies by itself — no explicit
# cookies= argument is needed here.
url = "https://github.com/settings/profile"
headers = {
    "User-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36"
}
response = session.get(
    url=url,
    headers=headers,
)
with open("session解决登陆方案.html", "wb") as f:
    f.write(response.content)
1.8 js2py工具——逆向js——高级爬虫
import js2py

# An EvalJs context bridges Python and JavaScript: it can run JS source
# and exposes JS variables as Python attributes (and vice versa).
context = js2py.EvalJs()

# Run JS from Python:
# context.execute('console.log("abc")')

# Read JS values from Python:
# context.execute("a=5")
# context.execute('b="aaa"')
# context.execute("c=['a','b','c']")
# context.execute("d={'text':'demo'}")
# print(context.a)
# print(context.b)
# print(context.c)
# print(context.d)

# Push Python values into JS:
# context.a=5
# context.b="bbb"
# context.c={"a","b","c"}
# context.d={"text":"demo"}
#
# context.execute('console.log(a)')
# context.execute('console.log(b)')
# context.execute('console.log(c)')
# context.execute('console.log(d)')

# Define a JS function, then call it directly from Python.
context.execute('function add(x,y){return x + y}')
print(context.add(5, 6))
模拟登录到设置页面,逆向解析js
import js2py
import requests

# Log in to renren.com by reproducing the site's client-side JS crypto,
# then reuse the cookies to open an authenticated page.
# SECURITY NOTE: a real phone number and password are hard-coded below —
# move them to environment variables/config and rotate them.
context = js2py.EvalJs()
phoneNum = "15361598720"
password = "nihao123"

# Login form fields the site's JS expects; the password is replaced by its
# RSA-encrypted form before the POST.
context.t = {
    "phoneNum": phoneNum,
    "password": password,
    "c1": "-100",
}

# Fetch the per-session RSA key material (JSON with e, n and rkey fields).
response = requests.get(
    url="http://activity.renren.com/livecell/rKey",
    headers={"User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1"}
)
context.n = response.json()["data"]

# Load the site's own crypto helpers into the JS context.
for script_name in ("BigInt.js", "RSA.js", "Barrett.js"):
    with open(script_name, "r", encoding="utf-8") as f:
        context.execute(f.read())

# Mirror the site's login JS: reverse the password, RSA-encrypt it with the
# fetched key, and attach the rKey token.
js_string = '''
t.password = t.password.split("").reverse().join(""),
console.log(t)
setMaxDigits(130);
var o = new RSAKeyPair(n.e,"",n.n)
, r = encryptedString(o, t.password);
t.password = r,
t.rKey = n.rkey
'''
context.execute(js_string)

# POST the encrypted credentials to the login endpoint.
url = "http://activity.renren.com/livecell/ajax/clog"
headers = {"User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1"}
data = {
    'phoneNum': context.t['phoneNum'],
    'c1': context.t['c1'],
    'password': context.t['password'],
    'rKey': context.t['rKey'],
}
response = requests.post(
    url=url,
    headers=headers,
    data=data)
cookies = response.cookies
print(response.json())
print(requests.utils.dict_from_cookiejar(cookies))

# Reuse the login cookies to open a page that requires authentication.
url = "http://safe.renren.com/security/account"
response = requests.get(
    url=url,
    headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36"},
    cookies=cookies)
with open("settings.html", "wb") as f:
    f.write(response.content)