# 1. Scrape a JD.com product page
import requests
def getHTMLText(url):
    """Fetch *url* and return the first 1000 characters of the decoded body.

    Args:
        url: Address of the page to download.

    Returns:
        The first 1000 characters of the response text, or the string
        "爬取失败" when the request fails or the server answers with an
        HTTP error status.
    """
    try:
        r = requests.get(url, timeout=30)
        # Raise requests.HTTPError when the server reports an error status.
        r.raise_for_status()
        # Decode using the encoding guessed from the body content.
        r.encoding = r.apparent_encoding
        return r.text[:1000]
    except requests.RequestException:
        # Only request-level failures (connection, timeout, HTTP error, bad
        # URL) are turned into the failure message; a bare ``except`` would
        # also hide KeyboardInterrupt and genuine programming errors.
        return "爬取失败"
# Demo: download one JD.com product page and print the preview text
# (or the failure message) returned by getHTMLText.
url = "https://item.jd.com/100005853638.html"
print(getHTMLText(url=url))
# 2. Amazon product page (the example URL below actually points at book.zongheng.com)
import requests
url = "http://book.zongheng.com/showchapter/839344.html"
# r = requests.get(url)
# Pretend to be a browser; per the original note this changes r.status_code.
kv = {'user-agent': 'Mozilla/5.0'}
# A timeout keeps the script from waiting forever on an unresponsive server.
r = requests.get(url, headers=kv, timeout=30)
print(r.status_code)
# print(r.text)
import requests
url = "http://book.zongheng.com/showchapter/839344.html"
# Pretend to be a browser; per the original note this changes r.status_code.
kv = {'user-agent': 'Mozilla/5.0'}
try:
    # A timeout keeps the script from waiting forever on an unresponsive server.
    r = requests.get(url, headers=kv, timeout=30)
    # Raise requests.HTTPError when the server reports an error status.
    r.raise_for_status()
    # Decode using the encoding guessed from the body content.
    r.encoding = r.apparent_encoding
    # Show characters 1000-1999 of the page as a sample.
    print(r.text[1000:2000])
except requests.RequestException:
    # Catch only request-level failures so that Ctrl-C and programming
    # errors are not silently reported as a scraping failure.
    print("爬取异常")
# 3. Submitting a search keyword to Baidu, using the `params` argument
# Know the search endpoints first: Baidu: http://www.baidu.com/s?wd=keyword   360: https://www.so.com/s?q=keyword&src=360portal&_re=0
import requests
# kv = {'wd':'Python'}
# The keyword search lives under the "/s" path; the bare host name would
# only return the home page and ignore the intent of the `wd` parameter.
url = 'http://www.baidu.com/s'
keyword = "Python"
# Query-string parameters; requests URL-encodes them and appends "?wd=...".
kv = {'wd': keyword}
try:
    # A timeout keeps the script from waiting forever on an unresponsive server.
    r = requests.get(url, params=kv, timeout=30)
    # Raise requests.HTTPError when the server reports an error status.
    r.raise_for_status()
    # Decode using the encoding guessed from the body content.
    r.encoding = r.apparent_encoding
    # The URL that was actually requested, including the encoded query string.
    print(r.request.url)
    print(len(r.text))
    print(r.text[1000:2000])
except requests.RequestException:
    # Catch only request-level failures so that Ctrl-C and programming
    # errors are not silently reported as a scraping failure.
    print("爬取失败")
# 4. Download an image and save it to disk
import requests
import os
url = "https://cn.bing.com/th?id=OHR.Skywalk_ZH-CN3725661090_1920x1080.jpg&rf=LaDigue_1920x1080.jpg&pid=hp"
root = "D://PaChong_pics//"
# Alternative: derive the file name from the last URL path segment.
# path = root + url.split('/')[-1]
path = root + "pic1.jpg"
try:
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(root, exist_ok=True)
    if not os.path.exists(path):
        # A timeout keeps the script from waiting forever on the server.
        r = requests.get(url, timeout=30)
        # Without this check an HTTP error page would be saved as the image.
        r.raise_for_status()
        # The with-statement closes the file; no explicit close() is needed.
        with open(path, 'wb') as f:
            f.write(r.content)
        print("文件保存成功")
    else:
        print("文件已存在")
except (requests.RequestException, OSError):
    # Request failures and file-system errors are the expected failure
    # modes; anything else (Ctrl-C, programming errors) should propagate.
    print("爬取失败")
# 5. IP address lookup
# Takeaway: many web pages expose a URL-based API; try it by hand to learn its format, then write the code.
import requests
# Lookup endpoint; the host or IP to query is appended after "ip=".
url = 'http://m.ip138.com/ip.asp?ip='
try:
    # A timeout keeps the script from waiting forever on an unresponsive server.
    r = requests.get(url + 'www.njtech.edu.cn', timeout=30)
    # Raise requests.HTTPError when the server reports an error status.
    r.raise_for_status()
    # Decode using the encoding guessed from the body content.
    r.encoding = r.apparent_encoding
    print(r.text[-500:])  # last 500 characters of the decoded page
except requests.RequestException:
    # Catch only request-level failures so that Ctrl-C and programming
    # errors are not silently reported as a scraping failure.
    print("爬取失败")
# Instead of `+` concatenation, the URL can also be built with string formatting:
# add = 'www.njtech.edu.cn'
# url = 'http://m.ip138.com/ip.asp?ip=%s' % add
# r = requests.get(url)