day2 - requests and bs4

This article introduces the requests library in Python, covering GET and POST requests, setting request headers and proxies, and reading response information. It then explains how to handle response headers and response bodies, in particular parsing JSON data. It also shows how to download images, with a 千图网 (58pic.com) download example. Finally, it uses BeautifulSoup to parse HTML pages, demonstrating how to select and extract page content and how to read tag attributes, with Douban movies as a practical scraping example.


1. Using requests

# requests: a third-party Python library for making network requests over HTTP
import requests
(1) Sending a request
  • requests.get(url, *, headers, params, proxies) - send a GET request
  • requests.post(url, *, headers) - send a POST request
Parameters
  • url - the request address (a website URL, an API/interface address, an image address, etc.)
  • headers - set the request headers (used when you need to set a cookie or the User-Agent)
  • params - set the query parameters
  • proxies - set a proxy (a combined sketch is shown at the end of this section)
# Send a GET request; here the parameters are concatenated directly into the URL
# requests.get('http://api.tianapi.com/auto/index?key=c9d408fefd8ed4081a9079d0d6165d43&num=10')

# Send a POST request; here the parameters are passed via params (form data can also be passed via the data argument)
"""
params = {
    'key': 'c9d408fefd8ed4081a',
    'num': 10
}
requests.post('http://api.tianapi.com/auto/index', params=params)
"""

response = requests.get("")    # placeholder: fill in the URL to request

"""2. 获取响应信息"""
# 设置编码方式(乱码的时候才需要设置)
response.encoding = 'GBK'

# 获取响应头信息
print(response.headers)

# 获取响应体
# a. 获取txet值(用于请求网页,直接拿到网页源代码)
# print(response.text)

# b. 获取json解析结果(用于返回json数据的数据接口)
# print(response.json())

# c. 获取content值(获取二进制类型的原数据,用于图片、视频、音频的下载)
# print(response.content)
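The parameters listed above (params, headers, proxies) and the response attributes can be combined into a single request. The following is a minimal sketch, assuming placeholder values for the API key, User-Agent and proxy address that you would replace with real ones:

import requests

# Placeholder values: replace the key, the User-Agent and the proxy address with real ones
params = {'key': 'your-api-key', 'num': 10}
headers = {'User-Agent': 'Mozilla/5.0'}
proxies = {'http': 'http://127.0.0.1:8888', 'https': 'http://127.0.0.1:8888'}

response = requests.get('http://api.tianapi.com/auto/index',
                        params=params, headers=headers, proxies=proxies)

# Check the status code before using the body
if response.status_code == 200:
    print(response.json())
else:
    print('Request failed:', response.status_code)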

2. Adding request headers

import requests

# ----------------------1. Add a User-Agent-------------------
# headers = {
#     'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'
# }
#
# response = requests.get('https://www.51job.com/', headers=headers)
#
# response.encoding = 'gbk'
#
# print(response.text)

# ----------------------2. Add a cookie--------------------------
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36',
    'cookie': '_zap=4d58cb38-ec48-47b2-9e47-8ff8ef963486; _xsrf=veOhJnW2hAC2BDcgK8KTU4NqUrLUYuTe; d_c0="AHAQrl0PjROPTn2Bv2wpyQXt8QUwjW6yjTU=|1628663892"; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1628663893; __snaker__id=EYMWPXdZknPXfAye; gdxidpyhxdE=QQQ9DNLdBqx13etuowzGeLbMfcPXBfHckpwZ%2BxZp06A8zi9JHMPDxcbRi4o%5Ca053y5oVnjBBBb99XeqPZicZtcN2%5CR7snyRY8LQP%2Ff1Lu%5CEaPuZo9DldazSjxxzCmy0GXU7zlEHvH5jbqRxsq3d4HX5PN3j%5Cw7yrH2Ls29BYDaDCm0%2Fb%3A1628664795621; _9755xjdesxxd_=32; YD00517437729195%3AWM_NI=xDnvQnHhpYF6yUCebu826Rf%2FtJfpY7qOemzjWKJqvTeiC%2FN7ac2Cye8KddfyGIjjNxMaj1gnnUNWT6pGUEzV16y8CNLWmizD0SakKVmh9ELwcWrCleatFrWHNaWfd%2F1ZdWM%3D; YD00517437729195%3AWM_NIKE=9ca17ae2e6ffcda170e2e6eed4b479ad9e898bbc2591868bb3d85e968a9aaab566acb08196e862bbb4b7ade52af0fea7c3b92abc9ca297d4668cb8c0bac53ebb8b8d86e17ffceefab3e525b896aa91dc3db391ab8ef96295958692f560a7b78dd0cd3da3bdfea4fc6ffc95ac85e5738597a68bcd748fbfa6d2e666ae8b82b8d73eb4999ba6f95ef3eab7d9c2469089a38af950f48daf8cca5eafb8f7a6cf7da189bea4ef6fa3ac8a93d6448ebf9987e725f386acb8d037e2a3; YD00517437729195%3AWM_TID=Lftr4M6kyApFUEUBFFcv0DqgQ5uBSC%2FF; captcha_session_v2="2|1:0|10:1628663907|18:captcha_session_v2|88:OCtMcVVod1VSRDZ4Q2tTbGNyNUVIUXdJREc5Y0lSbjJyMklwSWh5MTA0NVhpL3JLak1CZXBPMEQ1ZlcycGludQ==|7f6c9d93866de2c49808fd0c3fa7ec6f7ef407e0fa6678072b00b577b351fb5f"; captcha_ticket_v2="2|1:0|10:1628663918|17:captcha_ticket_v2|704:eyJ2YWxpZGF0ZSI6IkNOMzFfWVZ5NHQwWm1wZlJWN1pSQzd6czQ0dUF6cG51Q2xFbHk2d0h0RjdYSWt4RDQ3ODJuOXMta2ROclliYkt6SFNGWUNvc2NCLklvdll0ejVZSmM5T0lOR1lwa2gxTTQwRWlVOWtmdmZqN3U3Q2g2Y2ZQU1c2VjJ3UDJvV0ZWa2hpLTJWUlF6ZDdmTWItMnRDV1dfOHM2ZkNpcFRsYlhOdUZaOXpVVDlCMXhGRy0xTkdoUnJrWlpkUERmelNiVzZMMk83WVVkSkVUSjJzZ1F1WEtnODBIaGV0NlNjcVpUdUt4ZUhSUFNyS1lOUGRfeTl5dEI5TUduS2xFUVpRYzB6REs3d0dzTWpKbW1FUzBiSlBDdUo1WURxd1F0cVdFLTFOX01TQUJOSjdraEYxbDZzSUxRcVVaZmE1NDR5OXRKVXBwa014TkQ2N3lDR0xxNG4yWENUaGhlLUlsMEEyTHFuV3RPa1ppSy1STENCWVVRdkZKaDVYMWR4YVhaeWl5QnpRZ2FrUE5UelNRVmg3RzJVeUJmU1VGVGRyMHpFODktWTcuRENMNzA5cVEuRnZTN0NfWk9XN0swOW9vaUs1anJMcC1SbHotWGRPdE9wTnZpbGJXY3U5dU0uSjFhNTFrODYxREJpZjhJQXJ4X21XMnotTmZMd0RkTzZHSEFpdkJhMyJ9|66c432f2881af153cce75b5940defd6b832a569f89e6b9eaebfd514b1d4ea329"; z_c0="2|1:0|10:1628663938|4:z_c0|92:Mi4xaW5CWUdRQUFBQUFBY0JDdVhRLU5FeVlBQUFCZ0FsVk5ncjRBWWdBZkNmZWVoMkphV0tZWDdSOUl6MVo1VFdPOXJB|58971e1efcbfec5e768e019f0c12ec85652bb22b2917e2cff02d68947b812353"; unlock_ticket="ADAc3rNA2xAmAAAAYAJVTYp3E2GJixRcAVFMYYkPJW256QCAFDClgw=="; tst=r; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1628664181; SESSIONID=ALeO6SNDSqNPVjPc3Ao25v7TXs18vru2Tvmqpwqdoal; KLBRSID=dc02df4a8178e8c4dfd0a3c8cbd8c726|1628664185|1628663890; JOID=UVARB04wYKVlY693JDctP1npsqUxeiDAK1P3NWBlGuc1NMMsS04dtwNuqHslc7UnVsGLPcQ2PuJWOI7F7kuQiRE=; osd=W1gVC086aKFpYqV_IDssNVHtvqQ7ciTMKln_MWxkEO8xOMImQ0oRtglmrHckeb0jWsCBNcA6P-hePILE5EOUhRA='
}
response = requests.get('https://www.zhihu.com/', headers=headers)
print(response.text)
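Besides pasting the whole cookie string into the headers, requests can also take cookies as a separate dict through the cookies argument. A minimal sketch, where the cookie names and values are made-up placeholders copied from the browser's developer tools:

import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'
}
# Hypothetical cookie values; replace them with the real ones from your browser
cookies = {'_zap': 'xxx', 'z_c0': 'xxx'}

response = requests.get('https://www.zhihu.com/', headers=headers, cookies=cookies)
print(response.status_code)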

3. JSON parsing

import requests

# Find the JSON data endpoint of Toutiao's hot board, then send the request to it
response = requests.get('https://www.toutiao.com/hot-event/hot-board/?origin=toutiao_pc&_signature=_02B4Z6wo00d01X.g2AgAAIDDl0iJmFbkIVl.xNyAAD7ve5rc90eYpUagYiMEKQrfIz8iJPKuacCxb32tQcqbwZpt0i3u2X-hae-fgV3NqtDiEbEJK7EPc235gzTPL4EhVZ7cxFeHkLUI27pv29')

all_news = response.json()['data']
for news in all_news:
    print(news['Title'])
    print(news['Image']['url'])
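A minimal sketch for saving the parsed headlines to a CSV file, assuming the endpoint keeps returning 'Title' and 'Image' fields as above (reuse the full hot-board URL from the example; its _signature parameter may be required in practice):

import csv
import requests

# Shortened endpoint; in practice reuse the full URL from the example above
url = 'https://www.toutiao.com/hot-event/hot-board/?origin=toutiao_pc'
response = requests.get(url)
all_news = response.json().get('data', [])

with open('hot_news.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['title', 'image_url'])
    for news in all_news:
        # .get avoids a KeyError if a field is missing in some items
        writer.writerow([news.get('Title'), news.get('Image', {}).get('url')])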

4. Image download

import requests

def download_image(img_url: str):
    # Request the image data over the network
    response = requests.get(img_url)

    # Get the binary data
    data = response.content

    # Save the data to a local file (the with-statement closes the file automatically)
    with open(f'files/{img_url.split("/")[-1]}', 'wb') as f:
        f.write(data)
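A usage sketch: the image URL below is a made-up placeholder, and the files/ directory must exist before writing (os.makedirs can create it):

import os

if __name__ == '__main__':
    os.makedirs('files', exist_ok=True)
    # Hypothetical image address; replace it with a real one
    download_image('https://example.com/images/sample.jpg')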

5. 千图网 (58pic.com)

import requests
from re import findall


def download_image(img_url: str):
    # Request the image data over the network
    response = requests.get(img_url)

    # Get the binary data
    data = response.content

    # Save the data to a local file; strip the "!..." suffix from the file name
    with open(f'files/{img_url.split("/")[-1].split("!")[0]}', 'wb') as f:
        f.write(data)
    print('Download complete!')


if __name__ == '__main__':
    response = requests.get('https://www.58pic.com/tupian/qixi-0-0.html')

    result = findall(r'(?s)<img src="(\S+?)">', response.text)

    for x in result:
        download_image(f'https:{x}')
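The same extraction can also be done with bs4 (introduced in the next section) instead of a regular expression. A sketch assuming, as in the regex version, that the page's <img> src values are protocol-relative:

import requests
from bs4 import BeautifulSoup

if __name__ == '__main__':
    response = requests.get('https://www.58pic.com/tupian/qixi-0-0.html')
    soup = BeautifulSoup(response.text, 'lxml')

    for img in soup.select('img'):
        src = img.attrs.get('src')
        if src:
            # Prepend the scheme, as in the regex-based version above
            download_image(f'https:{src}')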

6. Using bs4

from bs4 import BeautifulSoup

# 1. Prepare the web page data to parse (in practice you would get it with requests or selenium)
data = open('test2.html', encoding='utf-8').read()

# 2. Build a BeautifulSoup object (it can automatically repair broken HTML structure in the data)
# BeautifulSoup(data, parser)
soup = BeautifulSoup(data, 'lxml')

# 3. Use the BeautifulSoup object to get tags and tag content
# 1) Getting tags
# BeautifulSoup_object.select(css_selector)      -  get all tags matched by the CSS selector; returns a list whose elements are the matched tag objects
# BeautifulSoup_object.select_one(css_selector)  -  get the first tag matched by the CSS selector; returns a tag object
result = soup.select('p')
print(result)           # [<p>我是段落1</p>, <p>我是段落2</p>, <p>我是超链接3</p>]

result = soup.select_one('p')
print(result)           # <p>我是段落1</p>

result = soup.select('#p1')
print(result)           # [<p id="p1">我是超链接3</p>]

result = soup.select_one('#p1')
print(result)       # <p id="p1">我是超链接3</p>

result = soup.select('div p')
print(result)       # [<p>我是段落2</p>, <p id="p1">我是超链接3</p>]

result = soup.select('div>p')
print(result)       # [<p>我是段落2</p>]

# 2) Getting tag content
# a. tag_object.string   -   get the tag's text content (only works when the tag's content is pure text; otherwise the result is None)
p2 = soup.select_one('div>p')
print(p2)         # <p>我是段落2</p>
print(p2.string)  # '我是段落2'

s1 = soup.select_one('#s1')
print(s1)       # <span id="s1">我是<b>span1</b></span>
print(s1.string)    # None

# b. tag_object.get_text()   -   get all of the text inside the tag's content
print(p2.get_text())        # '我是段落2'
print(s1.get_text())        # '我是span1'

# c. tag_object.contents   -   list of the tag's direct children (strings and tags)
print(p2.contents)      # ['我是段落2']
result = s1.contents
print(result)      # ['我是', <b>span1</b>]
print(result[-1].get_text())    # 'span1'

# 3) Getting tag attributes
a1 = soup.select_one('div>a')
print(a1)       # <a href="https://www.baidu.com">我是超链接2</a>
print(a1.attrs['href'])     # 'https://www.baidu.com'

img1 = soup.select_one('img')
print(img1)     # <img alt="" src="http://www.gaoimg.com/uploads/allimg/210801/1-210P1151401S1.jpg"/>
print(img1.attrs['src'])        # 'http://www.gaoimg.com/uploads/allimg/210801/1-210P1151401S1.jpg'


# Note:
# BeautifulSoup_object.select/select_one(css_selector)   -  search the whole page for tags matched by the CSS selector
# tag_object.select/select_one(css_selector)             -  search only inside the given tag for tags matched by the CSS selector
ps = soup.select('p')
print(ps)       # [<p>我是段落1</p>, <p>我是段落2</p>, <p id="p1">我是超链接3</p>]

div1 = soup.select_one('div')
ps = div1.select('p')
print(ps)       # [<p>我是段落2</p>, <p id="p1">我是超链接3</p>]
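For readers who do not have test2.html, the snippet below is a self-contained sketch; the inline HTML is a simplified stand-in for the file (not the original), so only selectors that match this stand-in are demonstrated:

from bs4 import BeautifulSoup

# Simplified stand-in for test2.html, only for demonstrating the selectors
html = """
<p>我是段落1</p>
<div>
    <p>我是段落2</p>
    <a href="https://www.baidu.com">我是超链接2</a>
</div>
<span id="s1">我是<b>span1</b></span>
"""

soup = BeautifulSoup(html, 'lxml')
print(soup.select('p'))                        # [<p>我是段落1</p>, <p>我是段落2</p>]
print(soup.select_one('div>p').string)         # 我是段落2
print(soup.select_one('#s1').get_text())       # 我是span1
print(soup.select_one('div>a').attrs['href'])  # https://www.baidu.com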

7. Douban movies

import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36'
}

response = requests.get('https://movie.douban.com/top250', headers=headers)

soup = BeautifulSoup(response.text, 'lxml')
all_movie_li = soup.select('#content > div > div.article > ol > li')

for li in all_movie_li:
    img_url = li.select_one('.pic>a>img').attrs['src']
    print(img_url)
    
    name = li.select_one('.title').get_text()
    print(name)
    
    des = li.select_one('.inq').get_text()
    print(des)
    
    score = li.select_one('.rating_num').get_text()
    print(score)
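Not every entry on the Top 250 page necessarily carries a one-line quote (.inq); when it is missing, select_one returns None and calling get_text on it raises an AttributeError. A more defensive sketch of the loop body:

for li in all_movie_li:
    img_url = li.select_one('.pic>a>img').attrs['src']
    name = li.select_one('.title').get_text()
    score = li.select_one('.rating_num').get_text()

    # Guard against entries that have no one-line quote
    inq_tag = li.select_one('.inq')
    des = inq_tag.get_text() if inq_tag else ''

    print(img_url, name, score, des)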