本文参考资料为BIT(北京理工大学)的MOOC课程
寒假前几天都木有学习
实例1:爬取京东商品等简单网页模版
import requests


def getHTML(url):
    """Fetch *url* and return the response body as text.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded page text, or the string "产生异常" when the request
        fails (connection error, timeout, or a non-2xx status).
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()  # raises HTTPError for 4xx/5xx responses
        # Use the encoding guessed from the body rather than the one
        # declared in the response headers.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only request failures are turned into the sentinel string;
        # programming errors (e.g. a NameError) are no longer hidden.
        return "产生异常"


if __name__ == "__main__":
    url = "http://www.taobao.com"
    print(getHTML(url))
实例2:爬取亚马逊商品
import requests
url="https://www.amazon.cn/dp/B00OTEQF9I/ref=br_bsl_pdt-4/461-9310616-4693465?pf_rd_m=A1AJ19PSB66TGU&pf_rd_s=desktop-bestsellers-1&pf_rd_r=Z2VQP1NSXB1RB82D49HZ&pf_rd_r=Z2VQP1NSXB1RB82D49HZ&pf_rd_t=36701&pf_rd_p=2239dbbd-1fbb-4795-9d68-c227cd3c9261&pf_rd_p=2239dbbd-1fbb-4795-9d68-c227cd3c9261&pf_rd_i=desktop"
# Send a browser-like User-Agent instead of the requests default
# ("python-requests/<version>").
kv = {'user-agent': 'Mozilla/5.0'}
try:
    # The keyword argument is ``headers``; the misspelling ``header`` raises
    # TypeError, which the old bare ``except`` silently reported as a
    # failed fetch.
    r = requests.get(url, headers=kv, timeout=30)
    r.raise_for_status()  # raises HTTPError for 4xx/5xx responses
    r.encoding = r.apparent_encoding
    print(r.text)
except requests.RequestException:
    # Only network/HTTP failures are reported; coding errors now surface.
    print("爬取失败")
需要改变身份标识,将headers字段的user-agent作修改,模拟浏览器向亚马逊发起HTTP请求
#如果不改变user-agent,requests默认发送的请求头如下:
r.request.headers
{'User-Agent': 'python-requests/2.19.1', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive'}
实例3:百度/360搜索关键词提交
搜索引擎关键字提交接口
百度关键词接口:http://www.baidu.com/s?wd=keyword
360关键词接口:http://www.so.com/s?q=keyword
import requests
keyword = 'Python'
# Baidu takes the search term in the ``wd`` query parameter.
kv = {'wd': keyword}
try:
    # ``params`` lets requests build and URL-encode the query string.
    r = requests.get("http://www.baidu.com/s", params=kv, timeout=30)
    print(r.request.url)  # the final URL that was actually requested
    r.raise_for_status()  # raises HTTPError for 4xx/5xx responses
    print(len(r.text))
except requests.RequestException:
    # Only network/HTTP failures are reported; coding errors now surface.
    print("爬取失败")
实例4:IP地址归属地查询
import requests
url = "http://m.ip138.com/ip.asp?ip="
try:
    # The address to look up is appended to the query string.
    r = requests.get(url + '210.27.7.153', timeout=30)
    r.raise_for_status()  # raises HTTPError for 4xx/5xx responses
    r.encoding = r.apparent_encoding
    # Print only the last 500 characters of the page.
    print(r.text[-500:])
except requests.RequestException:
    # Only network/HTTP failures are reported; coding errors now surface.
    print("爬取失败")
实例5:大学排名
import requests
from bs4 import BeautifulSoup
import bs4
def getHTMLText(url):
    """Download *url* and return its body as text.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded page text, or an empty string when the request fails
        (connection error, timeout, or a non-2xx status).
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()  # raises HTTPError for 4xx/5xx responses
        # Use the encoding guessed from the body rather than the one
        # declared in the response headers.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Catch request failures only, so unrelated bugs are not masked
        # as an empty page.
        return ""
def fillUnivList(ulist, html):
    """Extract table rows from *html* and append them to *ulist*.

    For every ``<tr>`` that is a direct child of the first ``<tbody>``,
    the text of its 1st, 2nd and 4th ``<td>`` cells is appended to
    *ulist* as a three-item list.

    Args:
        ulist: List that receives one ``[cell0, cell1, cell3]`` entry per
            row; it is modified in place.
        html: Page markup to parse. An empty string or a page without a
            ``<tbody>`` leaves *ulist* unchanged.
    """
    soup = BeautifulSoup(html, "html.parser")
    tbody = soup.find('tbody')
    if tbody is None:
        # No table found (e.g. the download failed and html is empty).
        return
    for tr in tbody.children:
        # .children also yields whitespace text nodes; keep real tags only.
        if not isinstance(tr, bs4.element.Tag):
            continue
        tds = tr('td')  # calling a tag is shorthand for find_all
        if len(tds) < 4:
            # Row too short to contain the columns read below; skip it.
            continue
        ulist.append([tds[0].string, tds[1].string, tds[3].string])
def printUnivList(ulist, num):
    """Print a header line followed by at most *num* rows of *ulist*.

    Args:
        ulist: Sequence of rows; the first three items of each row are
            printed under the headers "排名", "学校名称" and "总分".
        num: Maximum number of rows to print. If *ulist* holds fewer rows,
            all of them are printed; zero or negative prints the header only.
    """
    row_format = "{:^10}\t{:^6}\t{:^10}"
    print(row_format.format("排名", "学校名称", "总分"))
    # Slice instead of indexing range(num) so a short (or empty) list does
    # not raise IndexError; max() keeps negative counts printing nothing.
    for u in ulist[:max(num, 0)]:
        print(row_format.format(u[0], u[1], u[2]))
def main():
    """Download the 2018 ranking page, parse it, and print the first 20 rows."""
    ranking_url = 'http://www.zuihaodaxue.com/zuihaodaxuepaiming2018.html'
    rows = []
    # fillUnivList appends the parsed rows to ``rows`` in place.
    fillUnivList(rows, getHTMLText(ranking_url))
    printUnivList(rows, 20)  # show 20 universities


main()