day04-selenium滚动和常见反爬

页面滚动

# -*- coding: UTF-8 -*-
"""
@Author  :张立强
@Project :day4-selenium滚动和常见反爬 
@File    :03页面滚动.py
@IDE     :PyCharm 
@from  :立子的烂笔头
今天不努力,明天当废物
@Date    :2022/8/16 10:08 
"""
import csv
from time import sleep
from selenium.webdriver import Chrome
from bs4 import BeautifulSoup


def get_page(str1='电脑'):
    """Open JD.com in Chrome, submit a search and start scraping.

    The keyword is stored in the module-level ``a`` (later used for the CSV
    file name) and the Chrome driver in the module-level ``b``; control is
    then handed to ``get_data()``.

    :param str1: keyword typed into the JD search box.
    """
    # Imported locally so this block does not rely on a file-level import.
    from selenium.webdriver.common.by import By

    global a
    a = str1
    global b
    b = Chrome()
    b.get('https://www.jd.com')
    # find_element_by_id() was removed in Selenium 4.3; find_element(By.ID, ...)
    # is the supported locator form. The trailing '\n' submits the search.
    b.find_element(By.ID, 'key').send_keys(f'{str1}\n')
    sleep(2)
    # Block until the operator presses Enter (presumably to allow a manual
    # login or captcha before scraping starts - confirm).
    input('开始:')
    get_data()


def get_data():
    """Scrape product name and price from two JD result pages.

    Uses the module-level driver ``b``. For each page the window is scrolled
    down in ten 800px steps, the rendered HTML is parsed, and one
    ``[name, price]`` row per product is appended to the module-level
    ``all_data``. The next-page button is clicked after every page, including
    the last one. Finally the browser is shut down and ``down_data()`` is
    called to persist the rows.
    """
    # Imported locally so this block does not rely on a file-level import.
    from selenium.webdriver.common.by import By

    global all_data
    all_data = []
    for _page in range(2):
        # Scroll step by step via JavaScript window.scrollBy(x, y)
        # (presumably so lazily loaded products get rendered - confirm).
        for _step in range(10):
            b.execute_script('window.scrollBy(0, 800)')
            sleep(1)
        soup = BeautifulSoup(b.page_source, 'lxml')
        all_goods_div = soup.select('#J_goodsList>ul>li>div.gl-i-wrap')
        print(len(all_goods_div))
        for x in all_goods_div:
            name_tag = x.select_one('.p-name em')
            price_tag = x.select_one('.p-price i')
            # select_one() returns None when nothing matches; skip such
            # entries instead of crashing on `.text`.
            if name_tag is None or price_tag is None:
                continue
            name = name_tag.text.replace('"', '').replace('\t\n', '')
            price = price_tag.text
            all_data.append([name, price])
        # find_element_by_class_name() was removed in Selenium 4.3.
        next_btn = b.find_element(By.CLASS_NAME, 'pn-next')
        next_btn.click()
        sleep(1)
    input('保存:')
    # quit() closes every window and ends the driver session; close() only
    # closes the current window.
    b.quit()
    down_data()


def down_data():
    """Write the rows in the module-level ``all_data`` to ``files/<a>.csv``.

    ``a`` is the module-level search keyword. The ``files`` directory is
    created when missing, because ``open(..., 'w')`` does not create parent
    directories and would otherwise raise ``FileNotFoundError`` after all the
    scraping work has been done.
    """
    import os

    os.makedirs('files', exist_ok=True)
    # newline='' lets the csv module control line endings itself.
    with open(f'files/{a}.csv', 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerows(all_data)
    print('保存完成')


if __name__ == '__main__':
    # Prompt for the search keyword and pass it straight to the scraper.
    get_page(input('请输入你要搜索的物品名称:'))

requests自动登录

# 自动登录原理:人工在浏览器上完成登录操作,获取登录后的cookie信息(登录信息),再通过代码发送请求的时候携带登录后的cookie

import requests

# Request the zhihu home page while sending the cookie of an already
# logged-in browser session, then print the returned HTML.
# NOTE(review): the session cookie is hard-coded in source; consider loading
# it from a local file or environment variable instead.
res = requests.get(
    url='https://www.zhihu.com/',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.0.0 Safari/537.36',
        'cookie': '_zap=aa850a38-2904-4976-bc38-24cd90ce1067; d_c0="AGAfNugA_RSPTjCW87gXy7Zes-xD79Lxrm0=|1653356164"; _9755xjdesxxd_=32; YD00517437729195%3AWM_TID=ivX1zOOxk3lBVRRUUVfRFwdq5raBjQ4R; _xsrf=U4nQoiXnfpzi5aV1QWmXRndDblAh9vor; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1658822105,1660619270; captcha_session_v2=2|1:0|10:1660619270|18:captcha_session_v2|88:NGFMV250OW5Fb0NwYkVwUXExUktmdDZJZmFqei9OQXhlSzYrMXdvS0pJT3VDeVBhQ25XeVBUR1FIWkRZWXVSVQ==|6686adf76c589deb2032ef0b6ede78c4eb5c0dc30b348db1e7b846e179f51ac4; SESSIONID=BCibCV4P3tR0fhq8uwpqAH3qmPZNtfaRPo1Q4ry4mRU; JOID=UlkWAkoqNQG3UG64AislHIBfZL4UaXRq1RI5jlAYQFGFNgfIOC8_-NRQbbgA79BtAqGNPOniEkBy-T5tZgtisEk=; osd=VV0dAkgtMQq3Umm8CSsnG4RUZLwTbX9q1xU9hVAaR1WONgXPPCQ_-tNUZrgC6NRmAqOKOOLiEEd28j5vYQ9psEs=; __snaker__id=PiWXHHi25gr7QHLV; gdxidpyhxdE=XqQyTy8%2FaievZs61Xgv8DMSCDxCI0jx54oSPYODSd7BnGDTY13JKqOLPkQSa4Enm53YCsyYeZijodOaKcN%5CE7x9sUhZ1A4qLMlnky4JxAE%5C%2F0TvIszPdy38CaZxV9HC%2FRL7yWKTATf%2F3CLAc8f5HRmzf%5CyX7Hp53gvqVoxKMAk24hpEb%3A1660620171324; YD00517437729195%3AWM_NI=mF8nZirBRICmNLUoDAe2wnh29L9UAwTbhPyC9r4tzhzhriMwbNEDAOeMAudgEZlwTxZJ4EE4mdeVOjjeUPQk9Fdb9Nn0IJEwVzquVeA0KJtDxqyLgmiEmCjqqbsRPzRxU2I%3D; YD00517437729195%3AWM_NIKE=9ca17ae2e6ffcda170e2e6eed3f94be98e9bdabb73ed9e8aa6d45f828f9b82c8449ab4a48db679a8adbc92cd2af0fea7c3b92aaf9af88fd963b29e9eaac2678e949b90c74ba7eeb98feb3afb94f7b6c47d98bea8b1fc49fbaefc95f964bab8be8ad07da2928ba5c445a7938f96e76d8786a498d7749a96fca6c840afbcabb7ce62aabb9bb1d461fb9aac83bc46aef087abc87d96a6bf96d56f83f085abf750bcafb7ade667ab9ba3a4e97bb3bebdb3d37292a89bd3cc37e2a3; z_c0=2|1:0|10:1660619564|4:z_c0|92:Mi4xU3NHcUVRQUFBQUFBWUI4MjZBRDlGQ1lBQUFCZ0FsVk5MRm5vWXdDYkx5anVVV1N3bE04Sld1UzZPdDVNZFc3TzBn|0074ec76c4afd598c03c7a4ea52c550663e21a6c590b0d23b85e6ccf2df5c874; q_c1=16d0fa41c078456c8018e596f61263c9|1660619564000|1660619564000; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1660619566; tst=r; NOT_UNREGISTER_WAITING=1; KLBRSID=53650870f91603bc3193342a80cf198c|1660619573|1660619269'
    },
    # Without a timeout requests waits indefinitely for an unresponsive server.
    timeout=10,
)
print(res.text)

selenium获取cookie

from json import dumps

from selenium.webdriver import Chrome

import os

b = Chrome()
# 1. Open the site whose login cookies should be captured.
b.get('https://www.taobao.com/')
# 2. Block until the operator has logged in manually and has refreshed this
#    first page so that it shows the logged-in state.
input('已经完成登录:')
# 3. Read the cookies of the logged-in session and save them locally as JSON.
cookies = b.get_cookies()
print(cookies)
# quit() ends the whole driver session; close() only closes the current window.
b.quit()
# open(..., 'w') does not create parent directories; without this a missing
# 'files' folder would crash the script after the manual login was done.
os.makedirs('files', exist_ok=True)
with open('files/taobao.txt', 'w', encoding='utf-8') as f:
    f.write(dumps(cookies))

selenium使用cookie

from json import loads

from selenium.webdriver import Chrome

b = Chrome()

# 1. Open the page that should end up logged in; cookies are added to the
#    driver only after this first load.
b.get('https://www.taobao.com/')

# 2. Load the previously saved cookies (JSON text) and add each one to the
#    browser session.
with open('files/taobao.txt', encoding='utf-8') as f:
    cookies = loads(f.read())

for cookie in cookies:
    b.add_cookie(cookie)

# Page source before the reload.
print(b.page_source)
# 3. Open the site again, now that the cookies have been added.
b.get('https://www.taobao.com/')
print(b.page_source)
# Wait for Enter before the script ends.
input(":")

requests 使用代理

import requests

# Fetch the douban Top 250 page through a fixed HTTP proxy and print the HTML.
res = requests.get(
    url='https://movie.douban.com/top250',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.0.0 Safari/537.36'
    },
    # The requests documentation requires proxy URLs to include the scheme.
    # The keys name the scheme of the target URL the proxy is used for.
    proxies={
        'http': 'http://119.7.145.65:4531',
        'https': 'http://119.7.145.65:4531',
    },
    # Without a timeout a dead proxy would make the call hang indefinitely.
    timeout=10,
)
print(res.text)

requests使用代理实际用法

from time import sleep

import requests


def get_ip():
    """Fetch one proxy address from the provider API, retrying until it works.

    A response body that is empty or starts with ``'{'`` is treated as a
    failed extraction (presumably a JSON error payload - confirm against the
    provider's documentation); the request is then repeated after one second.

    :return: the response body with surrounding whitespace removed.
    :raises requests.RequestException: if the HTTP request itself fails or
        times out.
    """
    url = 'http://d.jghttp.alicloudecs.com/getip?num=1&type=1&pro=510000&city=510600&yys=0&port=11&time=2&ts=0&ys=0&cs=0&lb=4&sb=0&pb=4&mr=1&regions='
    while True:
        # A timeout keeps an unresponsive provider from blocking forever.
        response = requests.get(url, timeout=10)
        # Strip so that a trailing newline cannot end up inside the proxy
        # address; it also makes the emptiness check below reliable.
        body = response.text.strip()
        # An empty body would make `body[0]` raise IndexError, so test it first.
        if not body or body[0] == '{':
            print('提取失败,重试!')
            sleep(1)
            continue
        return body


def get_douban_film():
    """Request the douban Top 250 page through a proxy from ``get_ip()``.

    Prints the response object followed by the response text.
    """
    proxy_address = get_ip()
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.0.0 Safari/537.36'
    }
    # The same proxy address serves both plain-HTTP and HTTPS targets.
    proxy_map = dict.fromkeys(('http', 'https'), str(proxy_address))
    res = requests.get(
        'https://movie.douban.com/top250',
        headers=request_headers,
        proxies=proxy_map,
    )
    for item in (res, res.text):
        print(item)


# Script entry point: fetch the douban Top 250 page through a fresh proxy.
if __name__ == '__main__':
    get_douban_film()
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值