# 完全没有反爬措施的网站,甚至不需要设置 User-Agent
import requests
# Fetch a page with a plain GET request (no custom headers) and print the
# response object, or a failure message if the request fails.
url = "详情页网址"
try:
    # A timeout keeps the script from hanging forever on an unresponsive host.
    r = requests.get(url, timeout=30)
    # Turn 4xx/5xx responses into an exception so they reach the handler below.
    r.raise_for_status()
    # Use the encoding detected from the body rather than the header default.
    r.encoding = r.apparent_encoding
    print(r)
except requests.RequestException:
    # Only request-related failures are reported; Ctrl-C and programming
    # errors are no longer swallowed by a bare except.
    print("爬取失败")
# 有一点点反爬措施的网站,使用通用的 User-Agent 即可解决
# Fetch a page while sending a generic browser-like User-Agent header, then
# print the response object, or a failure message if the request fails.
url = "网址"
try:
    kv = {'user-agent': 'Mozilla/5.0'}
    # A timeout keeps the script from hanging forever on an unresponsive host.
    r = requests.get(url, headers=kv, timeout=30)
    # Turn 4xx/5xx responses into an exception so they reach the handler below.
    r.raise_for_status()
    # Use the encoding detected from the body rather than the header default.
    r.encoding = r.apparent_encoding
    print(r)
except requests.RequestException:
    # Only request-related failures are reported; Ctrl-C and programming
    # errors are no longer swallowed by a bare except.
    print("爬取失败")
# 搜索引擎关键词提交代码
# Submit a keyword to a search endpoint as the `wd` query parameter, print
# the URL that was actually requested, then print the response object.
keyword = "python"
try:
    kv = {'wd': keyword}
    # `params` lets requests build and URL-encode the query string.
    # A timeout keeps the script from hanging forever on an unresponsive host.
    r = requests.get("网址/s", params=kv, timeout=30)
    # Show the final URL, including the encoded query string.
    print(r.request.url)
    # Turn 4xx/5xx responses into an exception so they reach the handler below.
    r.raise_for_status()
    print(r)
except requests.RequestException:
    # Only request-related failures are reported; Ctrl-C and programming
    # errors are no longer swallowed by a bare except.
    print("爬取失败")
# 图片爬取与保存
import os

# Download a single image and store it under `root`, named after the last
# path segment of the URL. An already existing file is left untouched.
url = "图片网址(一般为jpg格式)"
root = "保存地址"
# os.path.join places the file inside `root` whether or not `root` ends with
# a path separator; plain string concatenation would write next to it.
path = os.path.join(root, url.split('/')[-1])
try:
    # Create the target directory (and any missing parents) if needed.
    os.makedirs(root, exist_ok=True)
    if not os.path.exists(path):
        # A timeout keeps the script from hanging on an unresponsive host.
        r = requests.get(url, timeout=30)
        # Do not save an HTTP error page as if it were the image.
        r.raise_for_status()
        # Binary mode: r.content is the raw bytes of the response body.
        # The with-statement closes the file; no explicit close() is needed.
        with open(path, 'wb') as f:
            f.write(r.content)
        print("文件保存成功")
    else:
        # Nothing failed here: the file was already downloaded earlier.
        print("文件已存在")
except (requests.RequestException, OSError):
    # Covers network/HTTP failures as well as directory or file errors.
    print("爬取失败")
# IP 地址归属地查询爬取
# Query an IP-lookup page by appending the address to the query string and
# print the last 500 characters of the returned page text.
url = "网址/ip.asp?ip="
try:
    # A timeout keeps the script from hanging forever on an unresponsive host.
    r = requests.get(url + '202.204.80.112', timeout=30)
    # Turn 4xx/5xx responses into an exception so they reach the handler below.
    r.raise_for_status()
    # Use the encoding detected from the body rather than the header default.
    r.encoding = r.apparent_encoding
    # Only the tail of the page is printed.
    print(r.text[-500:])
except requests.RequestException:
    # Only request-related failures are reported; Ctrl-C and programming
    # errors are no longer swallowed by a bare except.
    print("爬取失败")