day6 常见反爬

常见反爬

01 . requests使用代理IP

  • 使用代理:给参数proxies赋值 - {'协议': 'ip:端口'},其中协议(http 或 https)要与目标网址的协议一致,例如 {'https': 'ip:端口'}

    import requests
    
    
    def get_html(url, proxies=None, timeout=10):
        """Fetch ``url`` through a proxy, print the body and return it.

        Args:
            url: Address of the page to download.
            proxies: Mapping of URL scheme to proxy URL, passed straight to
                ``requests.get``. When omitted, the fixed proxy below is used.
            timeout: Seconds to wait for the proxy/server before giving up.

        Returns:
            The response body decoded as text.

        Raises:
            requests.exceptions.RequestException: if the proxy or the server
                cannot be reached within ``timeout`` seconds.
        """
        headers = {
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'
        }
        if proxies is None:
            # Fixed proxy IP. The key is the scheme of the *target* URL; the
            # value is the proxy URL and, per the Requests documentation,
            # must include its own scheme.
            proxies = {'https': 'http://119.7.145.225:4558'}
        # Without a timeout a dead proxy would block this call forever.
        response = requests.get(url, headers=headers, proxies=proxies, timeout=timeout)
        print(response.text)
        return response.text
    
    if __name__ == '__main__':
        # Download the same page twice through the proxy.
        url = 'https://movie.douban.com/top250'
        for _attempt in range(2):
            get_html(url)
    

02 . selenium使用代理ip

  • 使用代理 - 给浏览器添加启动参数 --proxy-server=http://IP:端口号(访问https网站时,IP和端口要用支持https的代理)

    from selenium.webdriver import Chrome,ChromeOptions
    
    # Chrome launch flag that makes the browser connect through the given proxy.
    proxy_flag = '--proxy-server=http://119.5.36.108:4531'

    options = ChromeOptions()
    options.add_argument(proxy_flag)

    # Start Chrome with the proxy option applied, then open the target page.
    b = Chrome(options=options)
    b.get('https://movie.douban.com/top250')
    

03 . requests自动登录

  • 在谷歌浏览器中打开网页完成登录操作,然后刷新页面

  • 打开当前页面的检查,在network的All选项中找到当前页面的请求,获取请求头request header中的cookie值

  • 在用requests发送请求的时候给headers赋值,在headers中添加cookie对应的键值对

    import requests
    
    import os

    # The cookie request header IS the logged-in session: anyone holding it
    # can act as that account, and it expires. Keep it out of the source file
    # and supply it through the ZHIHU_COOKIE environment variable instead.
    # Copy the value from the browser: DevTools -> Network -> the page request
    # -> Request Headers -> cookie.
    cookie = os.environ.get('ZHIHU_COOKIE', '')
    if not cookie:
        raise SystemExit(
            'Set the ZHIHU_COOKIE environment variable to the cookie request '
            'header copied from a logged-in browser session.'
        )

    headers = {
        'cookie': cookie,
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'
    }

    # Sending the browser's cookie makes the server treat this request as
    # coming from the already logged-in session. The timeout keeps an
    # unresponsive server from blocking the script forever.
    response = requests.get('https://www.zhihu.com/', headers=headers, timeout=10)

    print(response.text)
    

04 . selenium自动登录

  • 创建浏览器对象,打开需要自动登录的网站

  • 手动完成登录操作

  • 获取cookie数据并保存到本地文件中

    from selenium.webdriver import Chrome,ChromeOptions
    
    import os

    b = Chrome()
    b.get('https://www.taobao.com/')
    # Pause here until the user has finished logging in by hand in the
    # browser window, then press Enter in the console to continue.
    input('是否完成:')
    cookies = b.get_cookies()

    # Make sure the target directory exists so open() cannot fail on it.
    os.makedirs('files', exist_ok=True)
    # Save the cookie list as its Python literal text (str of a list of
    # dicts); the with-block guarantees the file is flushed and closed.
    with open('files/taobao.txt', 'w', encoding='utf-8') as cookie_file:
        cookie_file.write(str(cookies))
    

05 . selenium自动登录使用cookie

  • 创建浏览器对象打开网页

  • 添加本地保存的cookie信息

  • 重新打开网页

  • 进行后续其他操作

    from selenium.webdriver import Chrome,ChromeOptions
    from selenium.webdriver.common.keys import Keys
    
    import ast

    from selenium.webdriver.common.by import By

    b = Chrome()
    # Open the site first: the cookies are added to the page that is
    # currently loaded in the browser.
    b.get('https://www.taobao.com/')

    # The file holds the Python literal text of a list of cookie dicts.
    # ast.literal_eval parses literals only, so unlike eval() it cannot run
    # code that might have been placed in the file; the with-block closes
    # the file handle.
    with open('files/taobao.txt', encoding='utf-8') as cookie_file:
        cookie_list = ast.literal_eval(cookie_file.read())
    for cookie in cookie_list:
        b.add_cookie(cookie)

    # Reload so the page is requested again with the saved cookies attached.
    b.get('https://www.taobao.com/')

    # find_element(By.ID, ...) replaces find_element_by_id, which was removed
    # in Selenium 4.3.
    search = b.find_element(By.ID, 'q')
    search.send_keys('鞋子')
    search.send_keys(Keys.ENTER)
    

06 . 字体反爬

import requests
from bs4 import BeautifulSoup
import re

headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'
}
url = 'https://mapi.guazi.com/car-source/carList/pcList?page=1&pageSize=12&city_filter=12&city=12&guazi_city=12&tag_types=18&versionId=0.0.0.0&osv=IOS&platfromSource=wap'

# Obfuscated code point (written the way hex() formats it) -> the digit it
# stands for. hex(n) converts a decimal integer to its hexadecimal string.
table = {
    '0xe1d0': '7', '0xe325': '4', '0xe41d': '1', '0xe52e': '9', '0xe630': '2', '0xe76e': '8',
    '0xe891': '5', '0xe9ce': '0', '0xeaf2': '3', '0xec4c': '6', '0xf88a': '7'
}


def decode_price(price: str, mapping: dict = table) -> str:
    """Replace the numeric character references in ``price`` with real digits.

    Each ``&#NNNNN;`` reference (decimal code point, trailing ``;`` optional)
    is looked up in ``mapping`` by ``hex(NNNNN)``. All other text, such as the
    decimal point or a unit suffix, is kept as it is. A reference whose code
    point is not in ``mapping`` is left unchanged so it stays visible in the
    output instead of aborting the run with ``KeyError``.

    Args:
        price: Raw price string from the API, e.g. ``'&#58928;.&#60492;万'``.
        mapping: Dict from ``hex(code_point)`` to the digit character.

    Returns:
        The price with every known reference replaced by its digit.
    """
    def _digit(match):
        # Unknown code points fall back to the original reference text.
        return mapping.get(hex(int(match.group(1))), match.group(0))

    # A regex finds references anywhere in the string, including ones that
    # directly follow other characters.
    return re.sub(r'&#(\d+);?', _digit, price)


def main():
    """Download the first page of the car list and print each decoded price."""
    # The timeout keeps an unresponsive server from blocking the script.
    response = requests.get(url, headers=headers, timeout=10)
    all_car = response.json()['data']['postList']
    for car in all_car:
        print(decode_price(car['price']))


if __name__ == '__main__':
    main()
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值