(1)获取百度网页并打印
#encoding:utf-8
# Fetch the Baidu homepage and print its HTML.
import requests
url='http://www.baidu.com'
# timeout prevents the script from hanging forever on a stalled connection
r=requests.get(url,timeout=10)
r.raise_for_status()  # fail loudly on HTTP errors instead of printing an error page
# apparent_encoding is sniffed from the response body bytes, which is more
# reliable than the (often missing/wrong) charset in the response headers
r.encoding=r.apparent_encoding
print(r.text)
运行结果:
(2)获取图片并下载到本地
比如说我现在想把这张图片下载下来:
# Download a single image file to the current directory.
import requests
s='https://imgs.qunarzz.com/p/p67/1512/a2/0ebfcd965b9391f7.jpg_255x175_06112aad.jpg'
r=requests.get(s,timeout=10)
r.raise_for_status()  # otherwise an HTTP error page would be saved as a .jpg
# 'wb': image bytes must be written in binary mode, never text mode
with open('0ebfcd965b9391f7.jpg','wb') as f:
    f.write(r.content)
print('下载完成')
运行结果:
(3)获取视频并下载到本地
# Download a video to the local disk.
import requests
# NOTE(review): this URL is the Weibo video *page* (HTML), not a raw media
# stream — r.content here is page markup saved under a .mp4 name.
# TODO: extract the actual media URL from the page before downloading.
t='https://weibo.com/tv/show/1034:4704088311660563?from=old_pc_videoshow'
r=requests.get(t,timeout=30)
r.raise_for_status()  # surface HTTP errors instead of saving an error page
# 'wb': write the raw response bytes in binary mode
with open('football.mp4','wb') as f:
    f.write(r.content)
print('下载完成')
运行结果:
(4)搜狗关键词搜索爬取
# Crawl the Sogou search-results page for a user-supplied keyword and save it.
import requests
url='https://www.sogou.com/web'
# target URL (search endpoint)
a=input('enter a word:')
# Masquerade as a regular browser so the site does not reject the request.
header={
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36'
}
param={
'query':a  # the search keyword goes into the query string
}
# Issue the request with the browser disguise; timeout avoids hanging forever.
r=requests.get(url=url,params=param,headers=header,timeout=10)
r.raise_for_status()  # don't silently save an HTTP error page
# grab the response body
c=r.text
f=a+'.html'
# persist the result page locally, named after the search word
with open(f,'w',encoding='utf-8') as fp:
    fp.write(c)
print(f,'爬取完成!!!')
运行结果如下:
(5)爬取百度翻译
# Query Baidu Translate's suggestion API (POST) and save the JSON response.
import json
import requests
url='https://fanyi.baidu.com/sug'
word=input('请输入想翻译的词语或句子:')
data={
'kw':word  # form field the /sug endpoint expects
}
header={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36'}
r=requests.post(url=url,data=data,headers=header,timeout=10)
r.raise_for_status()
dic_obj=r.json()
f=word+'.json'
# ensure_ascii=False keeps the Chinese text human-readable in the file
with open(f,'w',encoding='utf-8') as fp:
    json.dump(dic_obj,fp=fp,ensure_ascii=False)
# The API may return fewer than two suggestions; indexing [1] blindly
# raised IndexError in that case — guard and fall back to the first entry.
suggestions=dic_obj['data']
if len(suggestions)>1:
    print(suggestions[1]['v'])
elif suggestions:
    print(suggestions[0]['v'])
运行结果如下:
(6)爬取豆瓣电影榜单
# Fetch Douban's movie ranking JSON endpoint and save the result locally.
import requests
import json
url='https://movie.douban.com/j/chart/top_list?'
p={ 'type': '11',          # category id (11 = drama per this endpoint)
    'interval_id': '100:90',  # rating percentile window
    'action': '',
    'start': '0',          # pagination offset
    'limit': '20',         # page size
}
header={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36'}
r=requests.get(url=url,params=p,headers=header,timeout=10)
r.raise_for_status()  # fail fast if Douban blocks or errors
d=r.json()
print(d)
# ensure_ascii=False keeps Chinese titles readable in the saved file
with open('db.json','w',encoding='utf-8') as fp:
    json.dump(d,fp=fp,ensure_ascii=False)
运行结果如下:
(7)比赛图片爬取
# Scrape image URLs from a Sohu article page and download them into photo/.
import requests
import re
import urllib.request
import time
import os
header={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36'}
url='https://www.sohu.com/a/501590917_100117394'
r=requests.get(url=url,headers=header,timeout=10)
r.raise_for_status()  # don't scrape an error page
a=r.text
# re.S lets .*? cross newlines so the pattern can span the page layout.
# NOTE(review): the leading anchors (wrapper-box ... left main) occur once per
# page, so findall likely yields at most one src per page — TODO confirm.
p=re.compile(r'<div class="wrapper-box">.*?<div class="area clearfix" id="article-container">.*?<div class="left main">.*?src="(.*?)".*?</div>',re.S)
item=re.findall(p,a)
os.makedirs('photo',exist_ok=True)
for t in item:
    print('下载图片:'+t)
    # Name each file after the URL's own basename (the original computed this
    # but never used it, falling back to timestamp names instead).
    b=t.split('/')[-1]
    # NOTE(review): urlretrieve sends no custom User-Agent; some hosts may
    # reject it — switch to requests.get with `header` if downloads 403.
    urllib.request.urlretrieve(t,'photo/'+b)
    print(t+'.jpg')
    time.sleep(3)  # be polite: throttle consecutive downloads
运行结果如下: