一、Requests 库
import requests

# Build a Request for the resource and send it; the returned Response carries
# both the server's payload and the details of the request that was sent.
r = requests.get("https://www.baidu.com/")
r.encoding = "utf-8"  # decode r.text as UTF-8 from here on

# Dump, in order: the HTTP status code, the decoded body, the raw body bytes,
# the encoding currently used for decoding, and the encoding inferred from
# the body content itself.
for attr in ("status_code", "text", "content", "encoding", "apparent_encoding"):
    print(getattr(r, attr))

# Switch decoding over to the content-inferred encoding.
r.encoding = r.apparent_encoding
HTTP 协议通过 URL 定位资源,并通过 GET、HEAD、POST、PUT、PATCH、DELETE 这六个方法对资源进行管理。
- 对于 robots 协议拒绝 Python 爬虫访问的情况,可修改请求头部的 User-Agent 字段,模拟浏览器进行访问
# When a site refuses requests that identify themselves as a Python crawler,
# override the User-Agent request header so the request presents a different
# client identity.
import requests

url = "https://item.taobao.com/item.htm?spm=a219r.lm895.14.9.3e0a515fVirBP4&id=607682256258&ns=1&abbucket=5"
# Passed through the `headers` keyword to replace the default user-agent.
kv = {'user-agent': 'Googlebot'}
r = None  # stays None if the request fails before any response is received
try:
    # A timeout keeps the script from hanging forever on an unresponsive host.
    r = requests.get(url, headers=kv, timeout=10)
    r.raise_for_status()  # turn 4xx/5xx responses into an exception
    print(r.status_code)
    r.encoding = r.apparent_encoding
    print(r.text[:100])
    # Show the headers that were actually sent, to confirm the override.
    print(r.request.headers)
except requests.RequestException:
    print("爬取失败")
    # Only an HTTP-level failure leaves a response body worth inspecting;
    # connection errors and timeouts never produce one.
    if r is not None:
        print(r.text)
- 百度、360 搜索关键词提交
# Submit a search keyword to Baidu (the keyword goes in the `wd` parameter).
import requests

url = "https://www.baidu.com/s"
keyword = "Python"
kv = {'wd': keyword}
r = None  # stays None if the request fails before any response is received
try:
    # `params` is encoded into the URL's query string by requests; the
    # timeout keeps the script from hanging on an unresponsive host.
    r = requests.get(url, params=kv, timeout=10)
    print(r.request.url)  # the final URL, including the encoded query
    r.raise_for_status()  # turn 4xx/5xx responses into an exception
    print(r.status_code)
    r.encoding = r.apparent_encoding
    print(r.text)
except requests.RequestException:
    print("爬取失败")
    # Only an HTTP-level failure leaves a response body worth inspecting;
    # connection errors and timeouts never produce one.
    if r is not None:
        print(r.text)
# Submit a search keyword to 360 Search (the keyword goes in the `q` parameter).
import requests

url = "http://www.so.com/s"
keyword = "Python"
kv = {'q': keyword}
r = None  # stays None if the request fails before any response is received
try:
    # `params` is encoded into the URL's query string by requests; the
    # timeout keeps the script from hanging on an unresponsive host.
    r = requests.get(url, params=kv, timeout=10)
    print(r.request.url)  # the final URL, including the encoded query
    r.raise_for_status()  # turn 4xx/5xx responses into an exception
    print(r.status_code)
    r.encoding = r.apparent_encoding
    print(r.text)
except requests.RequestException:
    print("爬取失败")
    # Only an HTTP-level failure leaves a response body worth inspecting;
    # connection errors and timeouts never produce one.
    if r is not None:
        print(r.text)
- 爬取图片并保存到本地
# Download an image and save it to a local directory, named after the last
# segment of its URL. Nothing is downloaded if the file already exists.
import requests
import os

url = "https://goss1.cfp.cn/creative/vcg/800/version23/VCG41141105577.jpg"
# Alternative sample image:
#url = "http://img0.dili360.com/pic/2019/10/21/5dad590742d1e2899022610_t.jpg"
# Raw string so the backslashes of the Windows path are never read as escapes.
root = r"D:\ArticleForProgram\PythonProgram\Spider\image"
# Join with a path separator so the file lands inside `root`; plain string
# concatenation would produce "...\Spider\imageVCG41141105577.jpg" instead.
path = os.path.join(root, url.split('/')[-1])
try:
    if not os.path.exists(root):
        # makedirs also creates any missing parent directories.
        os.makedirs(root)
    else:
        print("文件夹已存在")
    if not os.path.exists(path):
        # A timeout keeps the script from hanging on an unresponsive host.
        r = requests.get(url, timeout=10)
        # Do not write an HTTP error page to disk as if it were the image.
        r.raise_for_status()
        # The with-statement closes the file; no explicit close() is needed.
        with open(path, 'wb') as f:
            f.write(r.content)
        print("文件保存成功")
    else:
        print("文件已存在")
except (requests.RequestException, OSError):
    # Network/HTTP failures and filesystem failures are both reported here.
    print("爬取失败")
- ip地址归属地查询
# Look up the geographic attribution of an IP address by appending the
# address to the lookup page's query string and printing the returned page.
import requests

url = "http://m.ip138.com/ip.asp?ip="
try:
    # A timeout keeps the script from hanging on an unresponsive host.
    r = requests.get(url + "202.102.144.56", timeout=10)
    r.raise_for_status()  # turn 4xx/5xx responses into an exception
    r.encoding = r.apparent_encoding
    print(r.text)
except requests.RequestException:
    # Catch only request failures so that programming errors and Ctrl-C are
    # not silently reported as a failed crawl.
    print("爬取失败")