day4 Data APIs and Selenium Basics

2 Data APIs and Selenium Basics

2.1 Data APIs

import requests

response = requests.get('https://game.gtimg.cn/images/lol/act/img/js/heroList/hero_list.js?ts=2767546')
# This URL is a data API endpoint that returns the hero list
result = response.json()
# Parse the JSON returned by the data API
for x in result['hero']:
    print(x['name'])
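
Before writing parsing code it helps to see which fields each hero entry actually carries. A small inspection sketch, assuming only the result variable parsed above:

import json
# Pretty-print the first hero entry so its keys (name, heroId, ...) are easy to read
print(json.dumps(result['hero'][0], ensure_ascii=False, indent=2))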

2.1.1 Skins for a Single Hero

import requests


def download(img_url: str, name: str):
    # Download one image and save it under files/ using the skin name as the file name
    res = requests.get(img_url)
    with open(f'files/{name}.jpg', 'wb') as f:
        f.write(res.content)


if __name__ == '__main__':
    # Request the data API for hero 1 and download every skin it lists
    response = requests.get('https://game.gtimg.cn/images/lol/act/img/js/hero/1.js')
    result = response.json()
    for x in result['skins']:
        name = x['name']
        img_url = x['mainImg']
        if not img_url:
            # Some skins have no mainImg; fall back to chromaImg
            img_url = x['chromaImg']
        download(img_url, name)

# Creating a folder from code:
import os
if not os.path.exists('所有英雄的皮肤/abc'):
    os.mkdir('所有英雄的皮肤/abc')
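
If the parent folder may not exist yet, os.makedirs can build the whole path in one call; a small alternative sketch using the same path:

import os
# makedirs creates intermediate directories as needed; exist_ok=True avoids an error if the folder already exists
os.makedirs('所有英雄的皮肤/abc', exist_ok=True)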

2.1.2 Skins for All Heroes

import requests
import os

# 1. Get the ids of all heroes
def get_all_hero_id():
    url = 'https://game.gtimg.cn/images/lol/act/img/js/heroList/hero_list.js'
    res = requests.get(url).json()
    return [x['heroId'] for x in res['hero']]

def get_one_hero_skins(hero_id: str):
    # 1. Request the data for the given hero
    url = f'https://game.gtimg.cn/images/lol/act/img/js/hero/{hero_id}.js'
    res = requests.get(url)
    result = res.json()

    # 2. Create the folder for this hero
    hero_name = result['hero']['name']
    if not os.path.exists(f'所有英雄的皮肤/{hero_name}'):
        os.mkdir(f'所有英雄的皮肤/{hero_name}')

    # 3. Download all of this hero's skins
    # 1) Iterate to get each skin's name and image URL
    for skin in result['skins']:
        skin_name = skin['name'].replace('/', '')     # guard against '/' in skin names
        skin_img = skin['mainImg']
        if not skin_img:
            skin_img = skin['chromaImg']
        # 2) Download one image
        res = requests.get(skin_img)
        with open(f'所有英雄的皮肤/{hero_name}/{skin_name}.jpg', 'wb') as f:
            f.write(res.content)
        print('下载完成!')

if __name__ == '__main__':
    ids = get_all_hero_id()
    for x in ids[:5]:
        get_one_hero_skins(x)
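
The main block above only downloads the first five heroes. A minimal sketch that covers every hero, assuming the two functions defined above, simply drops the slice and pauses briefly between heroes:

import time

if __name__ == '__main__':
    ids = get_all_hero_id()
    for x in ids:
        get_one_hero_skins(x)
        time.sleep(1)    # short pause between heroes so the requests are not fired too quickly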

2.2 Basic Usage of Selenium

from selenium.webdriver import Chrome   # module for controlling the browser

Check the browser version: chrome://version/

1) Create the browser object (if the browser object is a global variable, the browser will not close automatically)

b = Chrome()

2) Open a page (open whichever page contains the data you want to scrape)

b.get('https://movie.douban.com/top250')

3) Get the page source (what you get is always the content as actually rendered in the page)

print(b.page_source)

4) Close the browser

b.close()

from selenium.webdriver import Chrome
from time import sleep

b = Chrome()
b.get('https://www.jd.com')

  1. Typing into the input box

1) Find the input box

input_tag = b.find_element_by_id('key')

2) Type content into the input box

input_tag.send_keys('电脑\n')
sleep(2)
print(b.page_source)

  2. Clicking a button

1) Find the tag that needs to be clicked

btn = b.find_element_by_css_selector('#navitems-group2 .b')

2) Click the tag

btn.click()
input('是否结束:')
b.close()
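
Note: the find_element_by_* helpers used in these notes come from Selenium 3 and were removed in Selenium 4. If they raise AttributeError in your environment, the equivalent By-based form looks like this (a sketch of the same input-box lookup):

from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By

b = Chrome()
b.get('https://www.jd.com')
# Same lookup as find_element_by_id('key'), written with the Selenium 4 API
input_tag = b.find_element(By.ID, 'key')
input_tag.send_keys('电脑\n')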

2.2.1 Searching JD.com for '毛线' (yarn)

from selenium.webdriver import Chrome
from time import sleep
from bs4 import BeautifulSoup
import csv

b = Chrome()
b.get('https://www.jd.com')

input_tag = b.find_element_by_id('key')
input_tag.send_keys('毛线\n')

sleep(1)

all_data = []
# Parse the first page of results
soup = BeautifulSoup(b.page_source, 'lxml')
all_goods_div = soup.select('#J_goodsList>ul>li>div.gl-i-wrap')
for x in all_goods_div:
    name = x.select_one('.p-name em').text
    price = x.select_one('.p-price i').text
    all_data.append([name, price])

# Click the next-page button
next_btn = b.find_element_by_class_name('pn-next')
next_btn.click()
sleep(1)


# Parse the second page of results
soup = BeautifulSoup(b.page_source, 'lxml')
all_goods_div = soup.select('#J_goodsList>ul>li>div.gl-i-wrap')
for x in all_goods_div:
    name = x.select_one('.p-name em').text
    price = x.select_one('.p-price i').text
    all_data.append([name, price])

with open('files/毛线.csv', 'w', encoding='utf-8', newline='') as f:
    csv.writer(f).writerows(all_data)


input('结束:')
b.close()
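
The first-page and second-page blocks above are identical except for the next-page click in between; a compact sketch that folds them into one loop (same selectors, same b object, the page count here is arbitrary):

page_count = 3      # hypothetical number of pages to scrape
all_data = []
for page in range(page_count):
    sleep(1)
    soup = BeautifulSoup(b.page_source, 'lxml')
    for x in soup.select('#J_goodsList>ul>li>div.gl-i-wrap'):
        # same name/price extraction as above
        all_data.append([x.select_one('.p-name em').text, x.select_one('.p-price i').text])
    if page < page_count - 1:
        # move to the next page before parsing again
        b.find_element_by_class_name('pn-next').click()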

2.3 Switching Tabs

from selenium.webdriver import Chrome
from bs4 import BeautifulSoup
from time import sleep

# 1. Basic steps
b = Chrome()                        # create the browser
b.get('https://www.cnki.net/')      # open CNKI (China National Knowledge Infrastructure)
search_tag = b.find_element_by_id('txt_SearchText')     # get the search box
search_tag.send_keys('数据分析\n')      # type '数据分析' into the box, then press Enter
sleep(1)        # always wait a moment after the page switches

# Get all the tags that need to be clicked: if a tag will be clicked or typed into, it must be obtained through the browser object (not through BeautifulSoup)
all_result = b.find_elements_by_css_selector('.result-table-list .name>a')
# Click the first result (this opens a new tab)
all_result[0].click()
sleep(1)

# 2. Switch tabs
# Note: in selenium the browser object (b) points at the first tab that was opened by default; unless you switch in code, the tab it points at never changes
# 1) Get all windows (tabs) currently open in the browser: browser.window_handles
# 2) Switch to a tab
b.switch_to.window(b.window_handles[-1])

# 3) Parse the content
soup = BeautifulSoup(b.page_source, 'lxml')
result = soup.select_one('#ChDivSummary').text
print(result)

b.close()           # close the window currently pointed at (the last one); closing it does not change which window the browser object points at


# Go back to the first window and click the next search result
b.switch_to.window(b.window_handles[0])
all_result[1].click()
sleep(1)

b.switch_to.window(b.window_handles[-1])

soup = BeautifulSoup(b.page_source, 'lxml')
result = soup.select_one('#ChDivSummary').text
print(result)

b.close()

input('结束:')
b.close()

2.3.1.1 Today's Assignment (Scrape the First 20 CNKI Results)

from selenium.webdriver import Chrome
from time import sleep
from bs4 import BeautifulSoup
import csv

def ziw():
    # Open CNKI and search for '数据分析'
    b = Chrome()
    b.get('https://www.cnki.net')
    input_list = b.find_element_by_id('txt_SearchText')
    input_list.send_keys('数据分析\n')
    sleep(1)

    list_all = []
    for i in range(0, 20):
        # Re-fetch the result links each time, then click the i-th one (opens a new tab)
        all_result = b.find_elements_by_css_selector('.result-table-list .name>a')
        all_result[i].click()
        sleep(1)

        # Switch to the newly opened detail tab
        b.switch_to.window(b.window_handles[-1])

        first = BeautifulSoup(b.page_source, 'lxml')
        try:
            result1 = first.select_one('#ChDivSummary').text
        except AttributeError:
            # Some pages have no abstract block; fall back to the title block
            result1 = first.select_one('.wx-tit').text

        result2 = first.select_one('.wx-tit h1').text

        list_all.append([result2, result1])
        # Close the detail tab and go back to the search-results tab
        b.close()
        b.switch_to.window(b.window_handles[0])
    b.close()
    return list_all

def file(list_all):
    with open('files/中国知网摘要前20.csv','w',encoding='utf-8',newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['题名','摘要'])
        writer.writerows(list_all)

if __name__ == '__main__':
    list_all = ziw()
    file(list_all)

2.4 Page Scrolling

  1. Performing a scroll

    Execute the JavaScript scroll code: window.scrollBy(x offset, y offset)

    window.scrollBy(x, y)

b.execute_script('window.scrollBy(0, 1800)')
from selenium.webdriver import Chrome
from time import sleep
from bs4 import BeautifulSoup

b = Chrome()
b.get('https://www.jd.com')
b.find_element_by_id('key').send_keys('电脑\n')
sleep(1)


for i in range(10):
    # 1. Perform the scroll  -  execute the JavaScript scroll code: window.scrollBy(x offset, y offset)
    # b.execute_script('window.scrollBy(0, 1800)')
    # Scroll down in small steps so the lazily-loaded goods have time to render
    for x in range(10):
        b.execute_script('window.scrollBy(0, 700)')
        sleep(1)

    # Parse the goods on the current page, then click the next-page button
    soup = BeautifulSoup(b.page_source, 'lxml')
    goods_li = soup.select('#J_goodsList>ul>li')     # goods_li now holds every item on this page
    pnnext = b.find_element_by_class_name('pn-next')
    pnnext.click()
input('关闭:')
b.close()
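
If you do not know in advance how many 700-pixel steps a page needs, you can ask the browser after each step whether the viewport has reached the bottom; a sketch with the same b object, using only standard JavaScript properties:

while True:
    b.execute_script('window.scrollBy(0, 700)')
    sleep(1)
    # true once the bottom of the viewport has reached the bottom of the document
    at_bottom = b.execute_script(
        'return window.pageYOffset + window.innerHeight >= document.documentElement.scrollHeight - 1')
    if at_bottom:
        break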

2.5 Automatic Login with requests

How automatic login works: a person logs in manually in the browser, the post-login cookie (the login information) is captured, and the code then carries that cookie when it sends its requests.

headers = {
    'cookie': '...',
    'user-agent': '...'
}

import requests
headers = {
    'cookie': '...',
    'user-agent': '...'
}
response = requests.get('https://www.zhihu.com/signin?next=%2F', headers=headers)
print(response.text)

2.6 Getting and Using Cookies with Selenium

from selenium.webdriver import Chrome
from json import dumps, loads

b = Chrome()

1. Open the site that needs automatic login (the site whose cookies you want to capture)

b.get('https://www.taobao.com/')

2. Give yourself enough time to log in manually and to refresh the page so the logged-in view is showing

Important: be sure the first tab is refreshed into its logged-in state
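
There is no special API for this waiting step; one simple way (a sketch, not part of the original notes) is to block on input() until the manual login is done:

# Pause here until you have logged in by hand and refreshed the first tab into its logged-in state
input('Press Enter after logging in and refreshing the page: ')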

3. Get the post-login cookies and save them to a local file

Syntax: b.get_cookies()

cookies = b.get_cookies()
with open('files/taobao.txt','w',encoding='utf-8') as f:
    f.write(dumps(cookies))

Using the saved cookies to log in automatically (typically a separate run):

1. Open the site that needs automatic login

b.get('https://www.taobao.com/')

2. Add the cookies

b.add_cookie(x)

with open('files/taobao.txt',encoding='utf-8') as f:
    content = f.read()
    cookies = loads(content)

for x in cookies:
    b.add_cookie(x)

3. Re-open the page that requires login

b.get('https://www.taobao.com/')

2.7 Basic Usage of Proxies with requests

proxies = {
    'http': '...',
    'https': '...'
}

import requests

headers = {
    'user-agent': '...'
}
# Method 1: setting the proxy IP
proxies = {
    'http':'...',
    'https':'...'
}
# Method 2: setting the proxy IP (note: did not work on my machine)
# proxies = {
#     'http':'http://...',
#     'https':'https://...'
# }
response = requests.get('https://movie.douban.com/top250?start=25&filter=',headers=headers,proxies=proxies)

print(response.text)

A practical example

import requests

def get_ip():
    url = '...'   # URL of the service that hands out proxy IPs
    while True:
        response = requests.get(url)
        # If the response looks like JSON (starts with '{'), it is an error message, so try again
        if response.text[0] == '{':
            continue
        return response.text

def get_douban_film():
    headers = {
        'user-agent': '...'
    }
    ip = get_ip()
    proxies = {
        'http': ip,
        'https': ip
    }
    response = requests.get('https://movie.douban.com/top250?start=25&filter=', headers=headers, proxies=proxies)
    print(response.text)

if __name__ == '__main__':
    get_douban_film()
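
Free or short-lived proxy IPs fail often. A hedged sketch that retries with a fresh IP whenever the proxied request errors out, reusing get_ip() from above (ProxyError and Timeout are real requests exceptions; the function name and retry count are made up for illustration):

def get_with_proxy(url: str, headers: dict, max_retries: int = 3):
    for _ in range(max_retries):
        ip = get_ip()
        proxies = {'http': ip, 'https': ip}
        try:
            # timeout so a dead proxy does not hang the request forever
            return requests.get(url, headers=headers, proxies=proxies, timeout=10)
        except (requests.exceptions.ProxyError, requests.exceptions.Timeout):
            # this IP is unusable; loop around and fetch a new one
            continue
    raise RuntimeError('all proxy attempts failed')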