Python Web Scraping: API and Selenium Basics

Scraping Douban Movie Top 250

import requests
from bs4 import BeautifulSoup
from re import search
import csv
import time


def get_one_page(page):
    headers = {
        'cookie': 'bid=g16urOELfcQ; douban-fav-remind=1; __gads=ID=3f57e16f48f82cf2-2222dd78d6d0003d:T=1646561716:RT=1646561716:S=ALNI_MZc-Jdw3ejofw4l88N8mdU7MdYYzg; ll="118318"; __utma=30149280.808328653.1646561718.1646561718.1646561718.1; __utmz=30149280.1646561718.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); ct=y; dbcl2="222590700:fEe+R6RmPAA"; ck=4rf3; push_noty_num=0; push_doumail_num=0',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'
    }
    url = f'https://movie.douban.com/top250?start={page * 25}&filter='
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'lxml')
    all_li = soup.select('.grid_view>li')
    all_data = []
    for li in all_li:
        # Movie title
        name = li.select_one('.title').text
        result = li.select_one('.bd>p').text.strip().replace(' ', '')
        # Director (the 导演/主演 labels are part of the scraped page text)
        director = search(r'导演:(.+?)\s', result).group(1)
        # Lead actors (missing for some entries, so guard the match)
        actor = search(r'主演:(.+?)\s', result)
        if actor:
            actor = actor.group(1)

        # Release year (named release_year to avoid shadowing the time module)
        release_year = search(r'\n(\d+).*?/', result).group(1)
        # Country/region
        country = search(r'\n.+?/\s*(.+?)\s*/', result).group(1)
        # Genre (named genre to avoid shadowing the built-in type)
        genre = search(r'/.+?/\s*(.+?)$', result).group(1)
        # Rating
        score = li.select_one('.rating_num').text
        # Number of raters
        comment_num = li.select('.star>span')[-1].text
        comment_num = search(r'\d+', comment_num).group()
        # Poster image URL
        img_url = li.select_one('.pic img').attrs['src']

        # Collect the row
        all_data.append([name, director, actor, release_year, country, genre, score, comment_num, img_url])

    # Append to the CSV (assumes the files/ directory exists); write the header only on the first page
    with open('files/电影.csv', 'a', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        if page == 0:
            writer.writerow(['电影名称', '导演', '主演', '上映时间', '国家', '类型', '评分', '评论人数', '封面地址'])
        writer.writerows(all_data)
    print(f'========== Page {page + 1} fetched ==========')


if __name__ == '__main__':
    for page in range(10):
        get_one_page(page)
        time.sleep(1)
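
A quick way to sanity-check the output is to read the CSV back. A minimal sketch, assuming the script above has already run and produced files/电影.csv:

import csv

with open('files/电影.csv', encoding='utf-8') as f:
    rows = list(csv.reader(f))

header, data = rows[0], rows[1:]
print(header)                       # column names written on page 0
print(f'{len(data)} movies saved')  # should be 250 after all 10 pages
for row in data[:3]:
    print(row)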

Scraping Data Analyst Job Listings

import requests, json
from re import search

url = 'https://search.51job.com/list/000000,000000,0000,00,9,99,数据分析,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'
}
res = requests.get(url, headers=headers)

# Extract the JSON payload of job listings that 51job embeds in the page source
json_data = search(r'window\.__SEARCH_RESULT__ = (.+?)</script>', res.text).group(1)

all_job = json.loads(json_data)['engine_jds']

for job in all_job:
    name = job['job_name']
    providesalary = job.get('providesalary_text', '面议')  # default '面议' = salary negotiable
    print(name, providesalary)
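
The snippet above only fetches the first page of results. Judging by the URL pattern, the number just before ".html" looks like the page index; that is an assumption, not something 51job documents. A hedged sketch that loops over several pages under that assumption:

import requests, json, time
from re import search

headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'
}

for page in range(1, 6):
    # Assumption: the number before ".html" is the 1-based page index
    url = f'https://search.51job.com/list/000000,000000,0000,00,9,99,数据分析,2,{page}.html'
    res = requests.get(url, headers=headers)
    match = search(r'window\.__SEARCH_RESULT__ = (.+?)</script>', res.text)
    if not match:
        break  # no embedded JSON on this page; stop paging
    for job in json.loads(match.group(1))['engine_jds']:
        print(job['job_name'], job.get('providesalary_text', '面议'))
    time.sleep(1)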

API Data

How to find a site's API:

Open any website -> right-click and choose Inspect -> select the Network tab and the Fetch/XHR filter -> refresh the page -> if the Name list is empty, the page exposes no API; if it has entries, click through them one by one -> check Preview: if it shows the data you want, switch to Headers; if no entry does, there is no usable API -> copy the value of Request URL and use it as the request address

Note: check Request Method to see whether the request is GET or POST
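
Once you have the Request URL, calling the endpoint directly is usually simpler than parsing HTML, because the response is already structured JSON. A minimal sketch with a hypothetical endpoint; substitute the address, parameters, and headers you actually found in DevTools:

import requests

# Hypothetical endpoint copied from the Request URL field in DevTools
api_url = 'https://example.com/api/list?page=1&size=20'
headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'
}

# Request Method was GET here; if DevTools shows POST, use
# requests.post(api_url, headers=headers, data=...) with the form
# data or JSON body listed under Payload instead
response = requests.get(api_url, headers=headers)
data = response.json()  # API responses are usually JSON already
print(data)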

Basic Usage of Selenium

from selenium.webdriver import Chrome
from bs4 import BeautifulSoup
from re import search

# 1. Create a browser object
b = Chrome()

# 2. Open the page
b.get('https://cd.zu.anjuke.com/?from=navigation')

# 3. Get the page source (only content the browser has actually rendered can be captured)
result = b.page_source

# 4. Parse with BeautifulSoup
soup = BeautifulSoup(result, 'lxml')
all_house_div = soup.select('#list-content>.zu-itemmod')
for house in all_house_div:
    title = house.select_one('.strongbox').text
    message = house.select_one('.details-item').text
    # The details line looks like "... | area | ..."; pull out the middle field
    area = search(r'\|(.+?)\|', message).group(1)
    price = house.select_one('.zu-side>p').text
    print(title, area, price)
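
Since page_source only reflects what the browser has rendered so far, reading it immediately after get() can miss late-loading content. A common remedy is an explicit wait; a sketch assuming the #list-content selector from above:

from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

b = Chrome()
b.get('https://cd.zu.anjuke.com/?from=navigation')

# Block for up to 10 seconds until the listing container appears,
# so page_source is read only after the data has rendered
WebDriverWait(b, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, '#list-content'))
)
result = b.page_source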

Controlling Web Pages with Selenium

from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time

b = Chrome()
b.get('https://www.jd.com/')

# 1. Type into the input box automatically
# 1) Locate the input box (find_element_by_id was removed in Selenium 4; use find_element with By)
search_box = b.find_element(By.ID, 'key')
# search_box = b.find_element(By.CSS_SELECTOR, '#key')

# 2) Send text to the input box ('酸奶' = "yogurt")
search_box.send_keys('酸奶')

# 3) Press Enter (an alternative to clicking the search button)
# search_box.send_keys(Keys.ENTER)

# 4) Click the search button
# a. Find the search button
search_btn = b.find_element(By.CSS_SELECTOR, '.button')

# b. Click it
search_btn.click()

time.sleep(1)

print(b.page_source)
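
Continuing the session above: JD's result list appears to lazy-load items as you scroll (an observation, not documented behavior), so page_source captured right after the click may be incomplete. One workaround is to scroll before reading the source:

# Scroll down a few times so lazily loaded results get rendered
for _ in range(5):
    b.execute_script('window.scrollBy(0, document.body.scrollHeight)')
    time.sleep(0.5)

print(b.page_source)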