一.京东商品页面爬取
网址:https://item.jd.com/6463262.html
import requests
# Fetch a JD.com product page and print its HTML.
url = "https://item.jd.com/6463262.html"
try:
    # A timeout keeps the script from hanging on an unresponsive server.
    r = requests.get(url, timeout=10)
    # Turn HTTP 4xx/5xx responses into an exception.
    r.raise_for_status()
except requests.RequestException:
    # Only network/HTTP failures count as an access failure; a bare
    # ``except:`` would also hide KeyboardInterrupt and programming errors.
    print("访问失败")
else:
    # Use the encoding guessed from the body so the text decodes correctly.
    r.encoding = r.apparent_encoding
    print(r.text)
二.亚马逊商品页面爬取
import requests
# Fetch an Amazon.cn product page while sending a browser-like User-Agent,
# then print the first 1000 characters of the HTML.
url = "https://www.amazon.cn/dp/B07M6RTWTZ/ref=cngwdyfloorv2_recs_0/461-4194424-2220805?pf_rd_m=A1AJ19PSB66TGU&pf_rd_s=desktop-2&pf_rd_r=XBBAGP95RJ69AM0HDA0E&pf_rd_r=XBBAGP95RJ69AM0HDA0E&pf_rd_t=36701&pf_rd_p=d2aa3428-dc2b-4cfe-bca6-5e3a33f2342e&pf_rd_p=d2aa3428-dc2b-4cfe-bca6-5e3a33f2342e&pf_rd_i=desktop"
# Replace the default User-Agent sent by requests with a browser-style one.
headers = {"User-Agent": "Mozilla/5.0"}
try:
    # A timeout keeps the script from hanging on an unresponsive server.
    r = requests.get(url, headers=headers, timeout=10)
    # Turn HTTP 4xx/5xx responses into an exception.
    r.raise_for_status()
except requests.RequestException:
    # Only network/HTTP failures count as an access failure; a bare
    # ``except:`` would also hide KeyboardInterrupt and programming errors.
    print("访问失败")
else:
    # Use the encoding guessed from the body so the text decodes correctly.
    r.encoding = r.apparent_encoding
    print(r.text[:1000])
有些网站为了防止爬虫随意爬取,会对访问进行来源审查。这种审查主要是检查请求头中的 User-Agent 字段,所以访问这类页面时需要伪装成浏览器:在 get 方法中通过 headers 参数传入自定义的请求头(如上面代码中的 {"User-Agent":"Mozilla/5.0"})。
三.百度搜索关键词提交
import requests
# Submit a search keyword to Baidu and print the final request URL
# followed by the result page HTML.
url = "https://www.baidu.com/s"
# ``params`` is encoded by requests into the query string (?wd=python).
query = {"wd": "python"}
try:
    # A timeout keeps the script from hanging on an unresponsive server.
    r = requests.get(url, params=query, timeout=10)
    # Turn HTTP 4xx/5xx responses into an exception.
    r.raise_for_status()
except requests.RequestException:
    # Only network/HTTP failures count as an access failure; a bare
    # ``except:`` would also hide KeyboardInterrupt and programming errors.
    print("访问失败")
else:
    # Use the encoding guessed from the body so the text decodes correctly.
    r.encoding = r.apparent_encoding
    # Show the URL that was actually requested, including the query string.
    print(r.request.url)
    print(r.text)
四.图片爬取
import requests
import os
# Download the resource at ``url`` and store its raw bytes at ``path``,
# skipping the download when the target file already exists.
#
# NOTE(review): this URL looks like a Baidu image *detail page* rather than a
# direct image file (the ``objurl`` query parameter appears to hold the real
# image address), and ``path`` has no file extension -- confirm intent.
# The original literal contained "©right", a mangled "&copyright"; it is
# restored here so the query string is well-formed.
url = "https://image.baidu.com/search/detail?ct=503316480&z=0&ipn=d&word=%E9%A9%AC%E5%88%BA&step_word=&hs=0&pn=28&spn=0&di=111760&pi=0&rn=1&tn=baiduimagedetail&is=0%2C0&istype=0&ie=utf-8&oe=utf-8&in=&cl=2&lm=-1&st=undefined&cs=218806883%2C1891356194&os=2213001099%2C835067167&simid=0%2C0&adpicid=0&lpn=0&ln=1703&fr=&fmq=1556614222707_R&fm=&ic=undefined&s=undefined&hd=undefined&latest=undefined&copyright=undefined&se=&sme=&tab=0&width=undefined&height=undefined&face=undefined&ist=&jit=&cg=&bdtype=0&oriquery=&objurl=http%3A%2F%2F04imgmini.eastday.com%2Fmobile%2F20180829%2F3b78ae9c86f7604bbb98d4e34d457d64_wmk.jpeg&fromurl=ippr_z2C%24qAzdH3FAzdH3Ffr56pf_z%26e3Bjwfp1wy_z%26e3Bv54AzdH3FwAzdH3F8babdld8nlclm8aaaaaaa_z%26e3Bip4s&gsm=0&rpstart=0&rpnum=0&islist=&querylist=&force=undefined"
root = "D://pics"
path = "D://pics//maci"
try:
    # Create the target directory if needed; no separate existence check.
    os.makedirs(root, exist_ok=True)
    if os.path.exists(path):
        print("图片已存在")
    else:
        # A timeout keeps the script from hanging on an unresponsive server.
        r = requests.get(url, timeout=10)
        # Turn HTTP 4xx/5xx responses into an exception.
        r.raise_for_status()
        # ``with`` closes the file on exit, so no explicit close() is needed.
        with open(path, "wb") as f:
            f.write(r.content)
        print("图片保存成功")
except (requests.RequestException, OSError):
    # Network/HTTP failures and filesystem errors are both reported as a
    # failure, as before; other exceptions (e.g. KeyboardInterrupt) propagate.
    print("访问失败")
五.IP地址查询
import requests
# Query ip138 for information about an IP address and print the result page.
url = "http://m.ip138.com/ip.asp"
# ``params`` is encoded by requests into the query string (?ip=...).
query = {"ip": "202.204.80.112"}
try:
    # A timeout keeps the script from hanging on an unresponsive server.
    r = requests.get(url, params=query, timeout=10)
    # Turn HTTP 4xx/5xx responses into an exception.
    r.raise_for_status()
except requests.RequestException:
    # Only network/HTTP failures count as an access failure; a bare
    # ``except:`` would also hide KeyboardInterrupt and programming errors.
    print("访问失败")
else:
    # Use the encoding guessed from the body so the text decodes correctly.
    r.encoding = r.apparent_encoding
    print(r.text)