1、通用代码框架
def getHTMLText(url):
    """Fetch *url* and return the decoded page text, or the string "异常" on failure.

    Standard safe-crawl skeleton: bounded timeout, HTTP-status check,
    and encoding sniffed from the response body.
    """
    try:
        r = requests.get(url, timeout=30)  # timeout so a dead server can't hang us
        r.raise_for_status()  # turn 4xx/5xx responses into exceptions
        r.encoding = r.apparent_encoding  # use encoding detected from content, not headers
        return r.text
    except requests.RequestException:  # only network/HTTP errors — never a bare except
        return "异常"
2、伪造user-agent(headers)
有些网站禁止非允许的user-agent爬取,所以需要利用requests的headers参数伪造这一信息;当然在此之前需要查看robots.txt,确认站点允许的user-agent。
以爬取亚马逊的信息为例:
import requests
def getHTMLText(url):
    """Fetch *url* while spoofing a browser user-agent.

    Returns characters 1000-2000 of the decoded page text, or "异常" on failure.
    Prints the response headers for inspection.
    """
    try:
        kv = {'user-agent': 'Mozilla/5.0'}  # pretend to be a regular browser
        r = requests.get(url, headers=kv, timeout=30)  # timeout added for consistency with the framework above
        print(r.headers)
        r.raise_for_status()  # raise on 4xx/5xx
        r.encoding = r.apparent_encoding
        return r.text[1000:2000]  # only a slice — the full page is too long to print
    except requests.RequestException:  # narrow catch instead of bare except
        return "异常"
# Amazon product page used to demonstrate the user-agent spoofing above.
product_url = "https://www.amazon.cn/dp/B07FQKB4TM?_encoding=UTF8&ref_=sa_menu_kindle_l3_ki"
print(getHTMLText(product_url))
3、搜索引擎关键字提交(params)
百度的关键词接口:
http://www.baidu.com/s?wd=keyword
360的关键词接口:
http://www.so.com/s?q=keyword
下面以百度搜索python为例:(由于信息过多,只返回信息长度)
import requests
def getHTMLText(url):
    """Submit the keyword 'python' to a search engine via query parameters.

    Returns the length of the decoded response text (the page itself is too
    long to display), or the string "异常" on failure.  Prints the response
    headers for inspection.
    """
    try:
        kv = {'wd': 'python'}  # Baidu keyword parameter: /s?wd=keyword
        r = requests.get(url, params=kv, timeout=30)  # timeout added for consistency with the framework above
        print(r.headers)
        r.raise_for_status()  # raise on 4xx/5xx
        r.encoding = r.apparent_encoding
        return len(r.text)  # length only — caller just wants a size check
    except requests.RequestException:  # narrow catch instead of bare except
        return "异常"
# Run the demo search against Baidu's keyword endpoint.
search_url = "http://www.baidu.com/s"
print(getHTMLText(search_url))
4、图片爬取
爬取图片,存到本地
import requests
import os
def getHTMLText(url):
    """Download the image at *url* and save it under D://pics//.

    The file name is taken from the last path segment of the URL.  Skips the
    download if the file already exists.  Returns "异常" on failure, otherwise
    None (status is reported via print).
    """
    root = "D://pics//"
    path = root + url.split('/')[-1]  # file name = last URL segment
    try:
        if not os.path.exists(root):
            os.mkdir(root)
        if not os.path.exists(path):
            r = requests.get(url, timeout=30)  # timeout so a dead server can't hang us
            r.raise_for_status()  # don't save an HTML error page as an image
            with open(path, 'wb') as f:  # 'with' closes the file; explicit close() was redundant
                f.write(r.content)
            print("file save succeed!")
        else:
            print("文件已存在")
    except (requests.RequestException, OSError):  # network/HTTP or filesystem errors
        return "异常"
# Sample image to fetch; it will be written into D://pics//.
img_url = "http://image.nationalgeographic.com.cn/2017/0211/20170211061910157.jpg"
getHTMLText(img_url)