Day 2: requests and bs4
How to use requests
import requests
"""1. 发送请求
requests.get(url, *, headers, params, proxies) - 发送get请求
requests.post(url, *, headers, params, proxies) - 发送post请求
参数:
url - 请求地址(一个网站的网址、接口的地址、图片地址等)
headers - 设置请求头(设置cookie和User-Agent的时候使用)
params - 设置参数
proxies - 设置代理
"""
"""
requests.get('http://api.tianapi.com/auto/index?key=c9d408fefd8ed4081a9079d0d6165d43&num=10')
"""
"""
params = {
'key': 'c9d408fefd8ed4081a9079d0d6165d43',
'num': 10
}
requests.post('http://api.tianapi.com/auto/index', params=params)
"""
response = requests.get('http://www.yingjiesheng.com/')

"""2. Getting response information"""
# the page uses a GBK charset, so set the encoding before reading the text
response.encoding = 'GBK'
print(response.headers)
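headers is only one of the response attributes; the ones used throughout these notes, all part of the standard requests API:

print(response.status_code)  # HTTP status code, e.g. 200
print(response.text)         # body decoded as a str (controlled by response.encoding)
print(response.content)      # raw body as bytes, used below for image downloads
# response.json()            # body parsed as JSON, used below for the hot-board API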
Adding request headers
import requests
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36',
    'cookie': '_zap=4d58cb38-ec48-47b2-9e47-8ff8ef963486; _xsrf=veOhJnW2hAC2BDcgK8KTU4NqUrLUYuTe; d_c0="AHAQrl0PjROPTn2Bv2wpyQXt8QUwjW6yjTU=|1628663892"; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1628663893; __snaker__id=EYMWPXdZknPXfAye; gdxidpyhxdE=QQQ9DNLdBqx13etuowzGeLbMfcPXBfHckpwZ%2BxZp06A8zi9JHMPDxcbRi4o%5Ca053y5oVnjBBBb99XeqPZicZtcN2%5CR7snyRY8LQP%2Ff1Lu%5CEaPuZo9DldazSjxxzCmy0GXU7zlEHvH5jbqRxsq3d4HX5PN3j%5Cw7yrH2Ls29BYDaDCm0%2Fb%3A1628664795621; _9755xjdesxxd_=32; YD00517437729195%3AWM_NI=xDnvQnHhpYF6yUCebu826Rf%2FtJfpY7qOemzjWKJqvTeiC%2FN7ac2Cye8KddfyGIjjNxMaj1gnnUNWT6pGUEzV16y8CNLWmizD0SakKVmh9ELwcWrCleatFrWHNaWfd%2F1ZdWM%3D; YD00517437729195%3AWM_NIKE=9ca17ae2e6ffcda170e2e6eed4b479ad9e898bbc2591868bb3d85e968a9aaab566acb08196e862bbb4b7ade52af0fea7c3b92abc9ca297d4668cb8c0bac53ebb8b8d86e17ffceefab3e525b896aa91dc3db391ab8ef96295958692f560a7b78dd0cd3da3bdfea4fc6ffc95ac85e5738597a68bcd748fbfa6d2e666ae8b82b8d73eb4999ba6f95ef3eab7d9c2469089a38af950f48daf8cca5eafb8f7a6cf7da189bea4ef6fa3ac8a93d6448ebf9987e725f386acb8d037e2a3; YD00517437729195%3AWM_TID=Lftr4M6kyApFUEUBFFcv0DqgQ5uBSC%2FF; captcha_session_v2="2|1:0|10:1628663907|18:captcha_session_v2|88:OCtMcVVod1VSRDZ4Q2tTbGNyNUVIUXdJREc5Y0lSbjJyMklwSWh5MTA0NVhpL3JLak1CZXBPMEQ1ZlcycGludQ==|7f6c9d93866de2c49808fd0c3fa7ec6f7ef407e0fa6678072b00b577b351fb5f"; captcha_ticket_v2="2|1:0|10:1628663918|17:captcha_ticket_v2|704:eyJ2YWxpZGF0ZSI6IkNOMzFfWVZ5NHQwWm1wZlJWN1pSQzd6czQ0dUF6cG51Q2xFbHk2d0h0RjdYSWt4RDQ3ODJuOXMta2ROclliYkt6SFNGWUNvc2NCLklvdll0ejVZSmM5T0lOR1lwa2gxTTQwRWlVOWtmdmZqN3U3Q2g2Y2ZQU1c2VjJ3UDJvV0ZWa2hpLTJWUlF6ZDdmTWItMnRDV1dfOHM2ZkNpcFRsYlhOdUZaOXpVVDlCMXhGRy0xTkdoUnJrWlpkUERmelNiVzZMMk83WVVkSkVUSjJzZ1F1WEtnODBIaGV0NlNjcVpUdUt4ZUhSUFNyS1lOUGRfeTl5dEI5TUduS2xFUVpRYzB6REs3d0dzTWpKbW1FUzBiSlBDdUo1WURxd1F0cVdFLTFOX01TQUJOSjdraEYxbDZzSUxRcVVaZmE1NDR5OXRKVXBwa014TkQ2N3lDR0xxNG4yWENUaGhlLUlsMEEyTHFuV3RPa1ppSy1STENCWVVRdkZKaDVYMWR4YVhaeWl5QnpRZ2FrUE5UelNRVmg3RzJVeUJmU1VGVGRyMHpFODktWTcuRENMNzA5cVEuRnZTN0NfWk9XN0swOW9vaUs1anJMcC1SbHotWGRPdE9wTnZpbGJXY3U5dU0uSjFhNTFrODYxREJpZjhJQXJ4X21XMnotTmZMd0RkTzZHSEFpdkJhMyJ9|66c432f2881af153cce75b5940defd6b832a569f89e6b9eaebfd514b1d4ea329"; z_c0="2|1:0|10:1628663938|4:z_c0|92:Mi4xaW5CWUdRQUFBQUFBY0JDdVhRLU5FeVlBQUFCZ0FsVk5ncjRBWWdBZkNmZWVoMkphV0tZWDdSOUl6MVo1VFdPOXJB|58971e1efcbfec5e768e019f0c12ec85652bb22b2917e2cff02d68947b812353"; unlock_ticket="ADAc3rNA2xAmAAAAYAJVTYp3E2GJixRcAVFMYYkPJW256QCAFDClgw=="; tst=r; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1628664181; SESSIONID=ALeO6SNDSqNPVjPc3Ao25v7TXs18vru2Tvmqpwqdoal; KLBRSID=dc02df4a8178e8c4dfd0a3c8cbd8c726|1628664185|1628663890; JOID=UVARB04wYKVlY693JDctP1npsqUxeiDAK1P3NWBlGuc1NMMsS04dtwNuqHslc7UnVsGLPcQ2PuJWOI7F7kuQiRE=; osd=W1gVC086aKFpYqV_IDssNVHtvqQ7ciTMKln_MWxkEO8xOMImQ0oRtglmrHckeb0jWsCBNcA6P-hePILE5EOUhRA='
}
response = requests.get('https://www.zhihu.com/', headers=headers)
print(response.text)
Parsing JSON
import requests
response = requests.get('https://www.toutiao.com/hot-event/hot-board/?origin=toutiao_pc&_signature=_02B4Z6wo00d01X.g2AgAAIDDl0iJmFbkIVl.xNyAAD7ve5rc90eYpUagYiMEKQrfIz8iJPKuacCxb32tQcqbwZpt0i3u2X-hae-fgV3NqtDiEbEJK7EPc235gzTPL4EhVZ7cxFeHkLUI27pv29')
# response.json() parses the JSON body into Python data (here a dict with a 'data' list)
all_news = response.json()['data']
for news in all_news:
    print(news['Title'])
    print(news['Image']['url'])
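The _signature token in that URL expires, so the request can come back non-200 or without the expected JSON; a minimal defensive version of the same parse (guard logic added here as an illustration):

if response.status_code == 200:
    for news in response.json().get('data') or []:  # tolerate a missing 'data' key
        print(news['Title'])
else:
    print('request failed:', response.status_code)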
Downloading images
import requests
def download_image(img_url: str):
    response = requests.get(img_url)
    data = response.content  # the raw bytes of the image
    # name the file after the last segment of the URL; with closes the file automatically
    with open(f'files/{img_url.split("/")[-1]}', 'wb') as f:
        f.write(data)


if __name__ == '__main__':
    download_image('https://p5.toutiaoimg.com/img/pgc-image/9f5d102756354b6db8fa9408c57d01c8~cs_noop.png')
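open() raises FileNotFoundError if the files/ directory does not exist yet; one line at startup guards against that (assuming the same relative files/ layout):

import os

os.makedirs('files', exist_ok=True)  # create the directory once; no error if it already exists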
千图网 (58pic.com)
import requests
from re import findall
def download_image(img_url: str):
    response = requests.get(img_url)
    data = response.content
    # drop the '!' style suffix from the URL before using it as a file name
    with open(f'files/{img_url.split("/")[-1].split("!")[0]}', 'wb') as f:
        f.write(data)
    print('Download finished!')


if __name__ == '__main__':
    response = requests.get('https://www.58pic.com/tupian/qixi-0-0.html')
    # pull the src of every img tag out of the page with a regex
    result = findall(r'(?s)<img src="(\S+?)">', response.text)
    for x in result:
        # the src values are protocol-relative (start with //), so prepend https:
        download_image(f'https:{x}')
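The same img extraction can be done with bs4 (introduced next) instead of a regex; a sketch, assuming the page keeps the URL in a plain src attribute:

from bs4 import BeautifulSoup

soup = BeautifulSoup(response.text, 'lxml')
for img in soup.select('img'):
    src = img.attrs.get('src')  # lazily-loaded images may keep the URL elsewhere
    if src:
        download_image(f'https:{src}')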
Using bs4
from bs4 import BeautifulSoup
data = open('test2.html', encoding='utf-8').read()
soup = BeautifulSoup(data, 'lxml')

# select - list of all matches; select_one - the first match only
result = soup.select('p')
print(result)
result = soup.select_one('p')
print(result)

# id selector
result = soup.select('#p1')
print(result)
result = soup.select_one('#p1')
print(result)

# descendant selector: every p anywhere inside a div
result = soup.select('div p')
print(result)
# child selector: every p that is a direct child of a div
result = soup.select('div>p')
print(result)

p2 = soup.select_one('div>p')
print(p2)
# .string - the tag's own text, only if it has a single text child
print(p2.string)

s1 = soup.select_one('#s1')
print(s1)
print(s1.string)

# .get_text() - all text in the tag, including text in nested tags
print(p2.get_text())
print(s1.get_text())

# .contents - list of the tag's direct children
print(p2.contents)
result = s1.contents
print(result)
print(result[-1].get_text())

# .attrs - dict of the tag's attributes
a1 = soup.select_one('div>a')
print(a1)
print(a1.attrs['href'])

img1 = soup.select_one('img')
print(img1)
print(img1.attrs['src'])

# select/select_one can also be called on a tag to search only inside it
ps = soup.select('p')
print(ps)
div1 = soup.select_one('div')
ps = div1.select('p')
print(ps)
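test2.html is not included with these notes; the snippet below is a self-contained stand-in with a made-up HTML string, so the selectors above can be tried directly:

from bs4 import BeautifulSoup

html = '<div><p id="p1">hello <span id="s1">world</span></p><a href="https://example.com">link</a></div>'
soup = BeautifulSoup(html, 'lxml')
print(soup.select_one('#p1').get_text())       # hello world
print(soup.select_one('div>a').attrs['href'])  # https://example.com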
Douban movies
import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'
}
response = requests.get('https://movie.douban.com/top250', headers=headers)
soup = BeautifulSoup(response.text, 'lxml')
# one li per movie inside the ranked ol
all_movie_li = soup.select('#content > div > div.article > ol > li')
for li in all_movie_li:
    img_url = li.select_one('.pic>a>img').attrs['src']  # poster image
    print(img_url)
    name = li.select_one('.title').get_text()           # movie title
    print(name)
    des = li.select_one('.inq').get_text()              # one-line blurb
    print(des)
    score = li.select_one('.rating_num').get_text()     # rating
    print(score)
    print('----------------------------------------------')
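The Top 250 is split across ten pages, offset through the start query parameter (start=0, 25, 50, ...); a sketch of the full crawl, with a guard added because some entries have no .inq blurb:

for start in range(0, 250, 25):
    response = requests.get('https://movie.douban.com/top250', headers=headers, params={'start': start})
    soup = BeautifulSoup(response.text, 'lxml')
    for li in soup.select('#content > div > div.article > ol > li'):
        name = li.select_one('.title').get_text()
        inq = li.select_one('.inq')              # missing on a few movies
        des = inq.get_text() if inq else ''
        print(name, des)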