Web Crawler Review
1. Simulating a browser sending a request to the server
import urllib.request

url = 'http://www.baidu.com'
response = urllib.request.urlopen(url=url)
content = response.read()        # read() consumes the whole body and can only be called once
print(content)                   # raw bytes
print(content[:5])               # first 5 bytes (what read(5) would return on a fresh response)
print(content.decode('utf-8'))   # bytes decoded to a UTF-8 string
2. Downloading web pages and images
import urllib.request

url = 'http://www.baidu.com'
urllib.request.urlretrieve(url=url, filename='baidu.html')

url_bank = ''  # image URL omitted in the original notes
urllib.request.urlretrieve(url=url_bank, filename='bank.jpg')
3. Customizing the request object
import urllib.request

url = 'http://www.baidu.com'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'
}
request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(request)
print(response.read().decode('utf-8'))
4. GET encoding with quote
import urllib.request
import urllib.parse

s = '薛之谦'
s = urllib.parse.quote(s)   # percent-encode the Chinese keyword
url = 'https://www.baidu.com/s?ie=UTF-8&wd='
url = url + s
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
    'Cookie': 'BAIDUID=16BDFD68B6C8108833B5C5FF29C6C385:FG=1; BIDUPSID=16BDFD68B6C8108833B5C5FF29C6C385; PSTM=1578133901; _ga=GA1.2.1052536360.1581136015; _gid=GA1.2.1501837378.1581136015; __xsptplus861=861.1.1581138259.1581138259.1%234%7C%7C%7C%7C%7C%23%23gXZDihtYhaFUD36opSYW2pZmsZB2e2rj%23; BDRCVFR[ktez10wUwwD]=IdAnGome-nsnWnYPi4WUvY; delPer=0; PSINO=7; H_PS_PSSID=; BDORZ=FFFB88E999055A3F8A630C64834BD6D0'
}
request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(request)
print(response.read().decode('utf-8'))
5. GET encoding with urlencode
import urllib.request
import urllib.parse

url = 'https://www.baidu.com/s?ie=UTF-8&'
data = {
    'wd': '薛之谦',
    'sex': '男'
}
data = urllib.parse.urlencode(data)
url = url + data
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
    'Cookie': 'BAIDUID=16BDFD68B6C8108833B5C5FF29C6C385:FG=1; BIDUPSID=16BDFD68B6C8108833B5C5FF29C6C385; PSTM=1578133901; _ga=GA1.2.1052536360.1581136015; _gid=GA1.2.1501837378.1581136015; __xsptplus861=861.1.1581138259.1581138259.1%234%7C%7C%7C%7C%7C%23%23gXZDihtYhaFUD36opSYW2pZmsZB2e2rj%23; BDRCVFR[ktez10wUwwD]=IdAnGome-nsnWnYPi4WUvY; delPer=0; PSINO=7; H_PS_PSSID=; BDORZ=FFFB88E999055A3F8A630C64834BD6D0'
}
request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(request)
print(response.read().decode('utf-8'))
Summary:
(1) url, data, headers: the three known ingredients
(2) request: poses the question
(3) response: works it out
(4) content: gives the answer
(The four-step pattern is rolled into one reusable helper in the sketch below.)
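A minimal sketch, assuming nothing beyond the urllib calls already used above, that wraps the url/data/headers, request, response, content pattern into one helper (the function name fetch and its defaults are my own, not from the original notes):

import urllib.request
import urllib.parse

def fetch(url, headers=None, data=None):
    # passing a dict in data turns the request into a POST; without it this is a GET
    if data is not None:
        data = urllib.parse.urlencode(data).encode('utf-8')
    request = urllib.request.Request(url=url, headers=headers or {}, data=data)
    response = urllib.request.urlopen(request)
    return response.read().decode('utf-8')

# usage: same result as the GET examples above
content = fetch('http://www.baidu.com', headers={'User-Agent': 'Mozilla/5.0'})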
6. POST: Baidu Translate
import urllib.request
import urllib.parse

url = 'https://fanyi.baidu.com/?aldtype=16047#auto/zh'
data = {
    'kw': 'car'
}
data = urllib.parse.urlencode(data).encode('utf-8')
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'
}
request = urllib.request.Request(url=url, headers=headers, data=data)
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')
print(content)
7. POST: Baidu Translate in detail
import urllib.request
import urllib.parse
import json

url = 'https://fanyi.baidu.com/?aldtype=16047#auto/zh'
data = {
    'from': 'en',
    'to': 'zh',
    'query': 'repeat',
    'transtype': 'realtime',
    'simple_means_flag': '3',
    'sign': '790121.585048',
    'token': '2a8709b12a68eea34ab1fcc81c3384ab',
}
data = urllib.parse.urlencode(data).encode('utf-8')
headers = {
    'cookie': 'BAIDUID=16BDFD68B6C8108833B5C5FF29C6C385:FG=1; BIDUPSID=16BDFD68B6C8108833B5C5FF29C6C385; PSTM=1578133901; _ga=GA1.2.1052536360.1581136015; _gid=GA1.2.1501837378.1581136015; __xsptplus861=861.1.1581138259.1581138259.1%234%7C%7C%7C%7C%7C%23%23gXZDihtYhaFUD36opSYW2pZmsZB2e2rj%23; BDRCVFR[ktez10wUwwD]=IdAnGome-nsnWnYPi4WUvY; delPer=0; PSINO=7; H_PS_PSSID=1446_21116; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; Hm_lvt_64ecd82404c51e03dc91cb9e8c025574=1581219297; Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574=1581219297; to_lang_often=%5B%7B%22value%22%3A%22en%22%2C%22text%22%3A%22%u82F1%u8BED%22%7D%2C%7B%22value%22%3A%22zh%22%2C%22text%22%3A%22%u4E2D%u6587%22%7D%5D; REALTIME_TRANS_SWITCH=1; FANYI_WORD_SWITCH=1; HISTORY_SWITCH=1; SOUND_SPD_SWITCH=1; SOUND_PREFER_SWITCH=1; APPGUIDE_8_2_2=1; __yjsv5_shitong=1.0_7_7a7806493e4e83673bc98654639d939fa6a1_300_1581219298344_120.242.182.5_12c8de7d; yjs_js_security_passport=608d972bb2365a501f326bbf66af40bc0d9acbbb_1581219299_js; from_lang_often=%5B%7B%22value%22%3A%22zh%22%2C%22text%22%3A%22%u4E2D%u6587%22%7D%2C%7B%22value%22%3A%22en%22%2C%22text%22%3A%22%u82F1%u8BED%22%7D%5D'
}
request = urllib.request.Request(url=url, headers=headers, data=data)
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')

obj = json.loads(content)
s = json.dumps(obj, ensure_ascii=False)   # keep Chinese characters readable in the output
print(s)
8. Douban Movies
import urllib.request

url = 'https://m.douban.com/rexxar/api/v2/movie/hot_channels?for_mobile=1'
headers = {
    'Host': 'm.douban.com',
    'Pragma': 'no-cache',
    'Referer': 'https://m.douban.com/movie/',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-origin',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
    'Cookie': 'bid=QYXCJBzI4Po; douban-fav-remind=1; ll="118194"; ap_v=0,6.0; __utma=30149280.1594892337.1578657150.1578657150.1581228430.2; __utmb=30149280.0.10.1581228430; __utmc=30149280; __utmz=30149280.1581228430.2.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; _vwo_uuid_v2=D2E4D3EC20BB509058737D4D71D9B594A|8b14170f02b91961bb9d2178e24a40d4; Hm_lvt_6d4a8cfea88fa457c3127e14fb5fabc2=1581228442; _ga=GA1.2.1594892337.1578657150; _gid=GA1.2.1168477597.1581228443; Hm_lpvt_6d4a8cfea88fa457c3127e14fb5fabc2=1581228595'
}
request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')
with open('movie.json', 'w', encoding='utf-8') as fp:
    fp.write(content)
9. Douban Movies: multi-page download
(1) request: build the customized request object
(2) content: get the page source from the response
(3) download
import urllib.request
import urllib.parse

def create_request(page):
    url = 'https://movie.douban.com/explore#?type=movie&tag=%E7%83%AD%E9%97%A8&sort=recommend&'
    data = {
        'page_limit': 20,
        'page_start': (page - 1) * 20
    }
    data = urllib.parse.urlencode(data)
    url = url + data
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'
    }
    request = urllib.request.Request(url=url, headers=headers)
    return request

def get_content(request):
    response = urllib.request.urlopen(request)
    content = response.read().decode('utf-8')
    return content

def down_load(page, content):
    with open('movie_' + str(page) + '.json', 'w', encoding='utf-8') as fp:
        fp.write(content)

if __name__ == '__main__':
    start_page = int(input('Enter the start page: '))
    end_page = int(input('Enter the end page: '))
    for page in range(start_page, end_page + 1):
        request = create_request(page)
        content = get_content(request)
        down_load(page, content)
10. Cookie login: Renren
import urllib.request

url = 'http://www.renren.com/305523888/profile'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
    'Cookie': 'anonymid=jix3nuu4-498h3n; _de=BF83005E46A2ACDF72FFEFECAA50653A696BF75400CE19CC; ln_uact=595165358@qq.com; ln_hurl=http://hdn.xnimg.cn/photos/hdn521/20170509/0940/main_5crY_aee9000088781986.jpg; _r01_=1; wp_fold=0; depovince=SH; jebecookies=49738d3c-ff08-4fc0-8af3-a374bd69b02d|||||; ick_login=8bbe43e3-acbc-4212-b278-15708c7659a5; p=fad75c8a2de3da2fc8b0a058ebcab3d48; first_login_flag=1; t=7088bcd867882dbbb3d3cc07931e8f788; societyguester=7088bcd867882dbbb3d3cc07931e8f788; id=305523888; xnsid=1288ac5a; loginfrom=syshome; jebe_key=b8a3f973-563c-4e6a-ac8f-99deef080f20%7Cdca572dcc866b00768c874af75fd79ec%7C1575942469779%7C1%7C1575942466913'
}
request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')
with open('renren.html', 'w', encoding='utf-8') as fp:
    fp.write(content)
11. Cookie login: Weibo
import urllib.request

url = 'https://weibo.cn/6451491586/info'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
    'cookie': 'SCF=Ahi2Sm3XHpcYIJvIsbJd8AnqkyO8t5RFmHXn8yHeTOMYgumvEqFGsgNbZbD6BmzlV7GA-B8sNWcbTcHeVmF3eNc.; _T_WM=72574359132; SUB=_2A25w6oxbDeRhGeBK7lMV-S_JwzqIHXVQFBQTrDV6PUJbkdAKLRP4kW1NR6e0UKKZYPpypRCEG3NBMNQYK-ENojRL; SUHB=0LciE0Eb7flXvC; SSOLoginState=1575943179'
}
request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')
with open('weibo.html', 'w', encoding='utf-8') as fp:
    fp.write(content)
12. handler: advanced requests
The basic request trio: url, headers, data
The advanced request trio: handler, opener, open
import urllib.request

url = 'http://www.baidu.com'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
}
request = urllib.request.Request(url=url, headers=headers)
handler = urllib.request.HTTPHandler()
opener = urllib.request.build_opener(handler)
response = opener.open(request)
content = response.read().decode('utf-8')
print(content)
13. Proxy IP
import urllib.request

url = 'https://www.baidu.com/s?wd=ip'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
    'Cookie': 'BIDUPSID=CEFB83FC447B9672DFB780A2C32E9BA0; PSTM=1566611808; BD_UPN=12314753; BAIDUID=5FEE5B37DD5B2736815F8A2DD1160F18:FG=1; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; yjs_js_security_passport=4565615ac7c399c9c910d318e5ad2661c315000c_1575874059_js; delPer=0; BD_HOME=0; H_PS_PSSID=1456_21110_30210_26350; BD_CK_SAM=1; PSINO=3; H_PS_645EC=f238ic8G1MiqqXUkFy55Eb59JNRNWS9lln2aZeLXjZtnbfPxP7AOwh9vInA; COOKIE_SESSION=25_0_2_0_0_5_0_0_1_2_2_4_0_0_0_0_0_0_1575945999%7C2%230_0_1575945999%7C1'
}
request = urllib.request.Request(url=url, headers=headers)
proxies = {
    'https': '58.17.125.215:53281'
}
handler = urllib.request.ProxyHandler(proxies=proxies)
opener = urllib.request.build_opener(handler)
response = opener.open(request)
content = response.read().decode('utf-8')
with open('daili.html', 'w', encoding='utf-8') as fp:
    fp.write(content)
14. Kuaidaili
Kuaidaili is a paid service: after purchase it generates an API link for you, and the IPs returned by that link rotate so the crawler can keep running.
import urllib.request

# fetch a fresh proxy IP from the purchased Kuaidaili API link
url_ip = 'http://kps.kdlapi.com/api/getkps/?orderid=967594691554973&num=1&pt=1&sep=1'
response_ip = urllib.request.urlopen(url_ip)
content_ip = response_ip.read().decode('utf-8')

url = 'https://www.baidu.com/s?wd=ip'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
    'Cookie': 'BIDUPSID=CEFB83FC447B9672DFB780A2C32E9BA0; PSTM=1566611808; BD_UPN=12314753; BAIDUID=5FEE5B37DD5B2736815F8A2DD1160F18:FG=1; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; yjs_js_security_passport=4565615ac7c399c9c910d318e5ad2661c315000c_1575874059_js; delPer=0; BD_HOME=0; H_PS_PSSID=1456_21110_30210_26350; BD_CK_SAM=1; PSINO=3; H_PS_645EC=f238ic8G1MiqqXUkFy55Eb59JNRNWS9lln2aZeLXjZtnbfPxP7AOwh9vInA; COOKIE_SESSION=25_0_2_0_0_5_0_0_1_2_2_4_0_0_0_0_0_0_1575945999%7C2%230_0_1575945999%7C1'
}
request = urllib.request.Request(url=url, headers=headers)
proxies = {
    'https': content_ip   # use the fetched IP, not the literal string 'content_ip'
}
handler = urllib.request.ProxyHandler(proxies=proxies)
opener = urllib.request.build_opener(handler)
response = opener.open(request)
content = response.read().decode('utf-8')
with open('daili.html', 'w', encoding='utf-8') as fp:
    fp.write(content)
15. Proxy pool
import random
import urllib.request

# a pool of proxies; one is picked at random for each request
ip_list = [
    {'http': '122.114.112.242:16819'},
    {'http': '122.114.112.242:16819'},
]
ip = random.choice(ip_list)

url = 'https://www.baidu.com/s?wd=ip'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
    'Cookie': 'BIDUPSID=CEFB83FC447B9672DFB780A2C32E9BA0; PSTM=1566611808; BD_UPN=12314753; BAIDUID=5FEE5B37DD5B2736815F8A2DD1160F18:FG=1; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; yjs_js_security_passport=4565615ac7c399c9c910d318e5ad2661c315000c_1575874059_js; delPer=0; BD_HOME=0; H_PS_PSSID=1456_21110_30210_26350; BD_CK_SAM=1; PSINO=3; H_PS_645EC=f238ic8G1MiqqXUkFy55Eb59JNRNWS9lln2aZeLXjZtnbfPxP7AOwh9vInA; COOKIE_SESSION=25_0_2_0_0_5_0_0_1_2_2_4_0_0_0_0_0_0_1575945999%7C2%230_0_1575945999%7C1'
}
request = urllib.request.Request(url=url, headers=headers)
handler = urllib.request.ProxyHandler(proxies=ip)
opener = urllib.request.build_opener(handler)
response = opener.open(request)
content = response.read().decode('utf-8')
with open('dailichi.html', 'w', encoding='utf-8') as fp:
    fp.write(content)
16. Dynamic cookies: Quanshuwang
1. What the cookiejar library is for: logging in through a handler automatically saves the post-login cookies.
2. Configuring it:
Create a CookieJar object
Use the CookieJar object to create a handler object
Use the handler to build an opener
Log in through the opener
The handler automatically saves the cookies from the login
Example: Quanshuwang
import urllib.request
import urllib.parse
import http.cookiejar

url_login = 'http://www.quanshuwang.com/login.php?do=submit'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
}
data = {
    'username': 'action',
    'password': 'action',
    'action': 'login',
}
data = urllib.parse.urlencode(data).encode('utf-8')
request = urllib.request.Request(url=url_login, headers=headers, data=data)

# the cookiejar-backed opener remembers the cookies set during login
cookiejar = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookiejar=cookiejar)
opener = urllib.request.build_opener(handler)
opener.open(request)

# the same opener can now reach pages that require the login cookies
url_bookcase = 'http://www.quanshuwang.com/modules/article/bookcase.php'
request_bookcase = urllib.request.Request(url=url_bookcase, headers=headers)
response_bookcase = opener.open(request_bookcase)
content = response_bookcase.read().decode('gbk')
with open('bookcase.html', 'w', encoding='gbk') as fp:
    fp.write(content)
17. bs4: ChinaHR
import urllib.request
from bs4 import BeautifulSoup

url = 'http://www.chinahr.com/sou/?city=36%2C400&keyword=%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
}
request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')

soup = BeautifulSoup(content, 'lxml')
jobname_list = soup.select('.resultList > div li > .e1 > a')
salary_list = soup.select('.resultList > div .l2 > .e2')
company_list = soup.select('.resultList > div .l1')
for i in range(len(company_list)):
    company_name = company_list[i].find_all('span')[2].get_text().strip()
    salary = salary_list[i].get_text()
    jobname = jobname_list[i].get_text()
    print(company_name, salary, jobname)
Dynamically loaded data
The company information on the home page is fetched via AJAX requests, so it is the AJAX endpoint, not the HTML, that should be requested; a sketch follows below.
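A minimal sketch of requesting such an AJAX endpoint directly and parsing the JSON it returns (the endpoint URL and the 'list' key below are hypothetical placeholders, not taken from these notes):

import urllib.request
import json

url = 'http://example.com/api/companies?page=1'   # hypothetical AJAX endpoint
headers = {'User-Agent': 'Mozilla/5.0'}
request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(request)
obj = json.loads(response.read().decode('utf-8'))
for item in obj.get('list', []):   # 'list' is an assumed key in the JSON
    print(item)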
Single thread + async coroutines
event_loop: the event loop, essentially an infinite loop
coroutine: a coroutine object
task: a task
future: represents a task that will run or has not run yet
import asyncio

async def request(url):
    print(url)

# a function defined with async returns a coroutine object when called
c = request('url')

# create an event loop object
# (the dashed lines below separate alternative ways to drive the coroutine;
# each alternative assumes a fresh coroutine object)
loop = asyncio.get_event_loop()
-------------------------------------
# register the coroutine object with the loop, then start the loop
loop.run_until_complete(c)
--------------------------------------
# using a task: built on the event loop
task = loop.create_task(c)
print(task)
loop.run_until_complete(task)
-------------------------------------
# using a future: not tied to a particular event loop
task = asyncio.ensure_future(c)
loop.run_until_complete(task)
-------------------------------
# binding a callback
def callback_func(task):
    print(task.result())

loop = asyncio.get_event_loop()
task = asyncio.ensure_future(c)
# bind the callback function to the task object
task.add_done_callback(callback_func)
loop.run_until_complete(task)
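For comparison, a minimal runnable sketch of the same task-plus-callback idea using the newer asyncio.run API (my own addition, not from the original notes):

import asyncio

async def request(url):
    print(url)
    return url

def callback_func(task):
    print(task.result())

async def main():
    task = asyncio.ensure_future(request('url'))
    # bind the callback to the task, then wait for it to finish
    task.add_done_callback(callback_func)
    await task

asyncio.run(main())   # creates and closes the event loop automatically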
import time
import asyncio

async def request(url):
    print('downloading', url)
    # if synchronous, blocking code appears inside an async coroutine, the async benefit is lost;
    # blocking operations inside asyncio must be suspended manually with await
    await asyncio.sleep(2)
    print('download finished', url)

start = time.time()
urls = [
    'www.baidu.com',
    'www.sogou.com',
    'www.goubanjia.com'
]

# task list: holds multiple task objects
stasks = []
for url in urls:
    c = request(url)   # pass the single url, not the whole urls list
    # create a task object
    task = asyncio.ensure_future(c)
    stasks.append(task)

loop = asyncio.get_event_loop()
# the task list must be wrapped in asyncio.wait
loop.run_until_complete(asyncio.wait(stasks))
print(time.time() - start)
# the blocking case: fetching real pages asynchronously with aiohttp
import asyncio
import aiohttp

async def get_page(url):
    async with aiohttp.ClientSession() as session:
        # get()/post() accept headers, params/data, and proxy="http://ip:port"
        async with await session.get(url) as response:
            # text() returns the response body as a string
            # read() returns the response body as bytes
            # json() returns a JSON object
            # note: always suspend manually with await before reading the response body
            page_text = await response.text()
            print(page_text)

asyncio.run(get_page('https://www.baidu.com'))   # example driver so the snippet runs on its own
# scraping dynamically loaded data with selenium
from selenium import webdriver
from lxml import etree
from time import sleep

# instantiate a browser object
bro = webdriver.Chrome(executable_path='./chromedriver')
bro.get('http://125.353565555')   # placeholder URL from the original notes

# page_source returns the source of the page currently shown in the browser
page_text = bro.page_source

# parse out the company names
tree = etree.HTML(page_text)
li_list = tree.xpath('')          # XPath expression omitted in the original notes
for li in li_list:
    name = li.xpath('')[0]        # XPath expression omitted in the original notes
    print(name)
bro.quit()
from selenium import webdriver
from time import sleep

bro = webdriver.Chrome(executable_path='./chromedriver')
bro.get('https://www.taobao.com/')

# locate an element
search_input = bro.find_element_by_id('q')
# interact with the element
search_input.send_keys('Iphone')

# run a snippet of JS: scroll to the bottom of the page
bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
sleep(2)

# click the search button
btn = bro.find_element_by_css_selector('.btn-search')
btn.click()

bro.get('https://www.baidu.com')
sleep(2)
# go back
bro.back()
# go forward
bro.forward()
# selenium: handling iframes
If the element to locate lives inside an iframe tag, you must call switch_to.frame(id) first.
Action chains (dragging): from selenium.webdriver import ActionChains
Instantiate an action chain object: action = ActionChains(bro)
click_and_hold(div): click and hold the element
move_by_offset(x, y): move by an offset
perform(): execute the action chain immediately
action.release(): release the action chain
from selenium import webdriver
from time import sleep
from selenium.webdriver import ActionChains

bro = webdriver.Chrome(executable_path='./chromedriver')
bro.get('url')   # demo page URL omitted in the original notes

# if the target element lives inside an iframe, locate it as follows
bro.switch_to.frame('iframeResult')   # switch the browser's element-locating scope
div = bro.find_element_by_id('draggable')

# action chain
action = ActionChains(bro)
# click and hold the target element
action.click_and_hold(div)

for i in range(5):
    # perform() executes the action chain immediately
    # move_by_offset(x, y): x is horizontal, y is vertical
    action.move_by_offset(17, 0).perform()
    sleep(0.5)

# release the action chain
action.release()
bro.quit()
Cracking token and sign
import requests
import re
import js2py
import json

session = requests.Session()

# get the token
headers1 = {
    'Cookie': '',       # cookie string omitted in the original notes
    'User-Agent': '',   # UA string omitted in the original notes
}
session.headers = headers1
response = session.get('url')   # target page URL omitted in the original notes
token = re.findall("token:('.*'),", response.content.decode())[0]

# get the gtk
gtk = re.findall(";window.gtk=('.*')", response.content.decode())[0]

# compute the sign
js_ctx = js2py.EvalJs()
js_str = r"""
// JS code copied from the page by observing it in the browser; omitted in the original notes
"""
# substitute the gtk value into the JS (the placeholder to replace was omitted in the original notes)
js_str = js_str.replace('', gtk)
js_ctx.execute(js_str)
sign = js_ctx.e('any text')   # e() is the sign function defined inside the copied JS
print(sign)
# build the final URL and request
Fill the token and sign into the request for the page you want to crawl (see the sketch below).
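A minimal sketch of that final step, assuming a Baidu-Translate-style endpoint where token and sign travel in the POST form data rather than the headers (the URL, form fields, and values are illustrative placeholders, not confirmed by these notes):

# hypothetical final request: the token and sign computed above go into the form data
data = {
    'query': 'repeat',   # text to translate (placeholder)
    'from': 'en',
    'to': 'zh',
    'token': token,      # extracted from the page earlier
    'sign': sign,        # computed by the copied JS earlier
}
response = session.post('url', data=data)   # API URL left as a placeholder, as in the notes above
print(response.json())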