Python crawler, part 3: five small examples

# 1. Scraping a JD product page

import requests
def getHTMLText(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()               # raise HTTPError if the status code is not 200
        r.encoding = r.apparent_encoding   # use the encoding detected from the page content
        return r.text[:1000]               # only the first 1000 characters
    except:
        return "Fetch failed"

url = "https://item.jd.com/100005853638.html"
print(getHTMLText(url))
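# Why the r.encoding = r.apparent_encoding line matters: requests takes r.encoding from the
# HTTP headers (falling back to ISO-8859-1 when no charset is declared), while
# r.apparent_encoding is guessed from the page body itself. A quick check against the same
# JD page (the exact output depends on the live site):
import requests
r = requests.get("https://item.jd.com/100005853638.html", timeout=30)
print(r.encoding)           # encoding declared in the headers
print(r.apparent_encoding)  # encoding inferred from the content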
# 2. Amazon-style product page: sites that block the default User-Agent (the demo below uses book.zongheng.com)
import requests
url = "http://book.zongheng.com/showchapter/839344.html"
# r = requests.get(url)
kv = {'user-agent':'Mozilla/5.0'} # pretend to be a browser; this changes r.status_code
r = requests.get(url,headers = kv)
print(r.status_code)
# print(r.text)
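# To see the effect described above, compare the status codes with and without the browser
# User-Agent; whether the plain request is rejected depends on the site's current policy:
import requests
url = "http://book.zongheng.com/showchapter/839344.html"
r1 = requests.get(url)                                         # default python-requests User-Agent
r2 = requests.get(url, headers={'user-agent': 'Mozilla/5.0'})  # browser-like User-Agent
print(r1.status_code, r2.status_code)   # a blocked default UA typically shows e.g. 403 vs 200
# The same fetch, wrapped in the standard try/except framework: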
import requests
url = "http://book.zongheng.com/showchapter/839344.html"
try:
    kv = {'user-agent': 'Mozilla/5.0'}  # pretend to be a browser; this changes r.status_code
    r = requests.get(url, headers=kv)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    print(r.text[1000:2000])
except:
    print("爬取异常")
# 3. Submitting a keyword search to Baidu, using the params argument
# First, the interfaces: Baidu: http://www.baidu.com/s?wd=keyword   360: https://www.so.com/s?q=keyword&src=360portal&_re=0
import requests
# kv = {'wd':'Python'}
url = 'http://www.baidu.com/s'   # the search interface is /s, per the note above
keyword = "Python"
try:
    kv = {'wd': keyword}
    r = requests.get(url, params=kv)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    print(r.request.url)      # the URL actually requested: http://www.baidu.com/s?wd=Python
    print(len(r.text))
    print(r.text[1000:2000])
except:
    print("Fetch failed")
# 4. Downloading an image and saving it to disk
import requests
import os
url = "https://cn.bing.com/th?id=OHR.Skywalk_ZH-CN3725661090_1920x1080.jpg&rf=LaDigue_1920x1080.jpg&pid=hp"
root = "D://PaChong_pics//"
# path = root + url.split('/')[-1]
path = root + "pic1.jpg"
try:
    if not os.path.exists(root):
        os.mkdir(root)
    if not os.path.exists(path):
        r = requests.get(url)
        with open(path, 'wb') as f:   # r.content is the binary body of the response
            f.write(r.content)
        print("File saved")           # the with block closes the file automatically
    else:
        print("File already exists")
except:
    print("Fetch failed")
# 5. IP address lookup
# Takeaway: for many web APIs you can probe the URL format by hand in a browser first, then write the code.
import requests
url = 'http://m.ip138.com/ip.asp?ip='
try:
    r = requests.get(url + 'www.njtech.edu.cn')   # append the queried host directly to the URL
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    print(r.text[-500:])   # the last 500 characters of the page
except:
    print("Fetch failed")
# Instead of the + concatenation, the URL can also be built with string formatting:
# add = 'www.njtech.edu.cn'
# url = 'http://m.ip138.com/ip.asp?ip=%s' % add
# r = requests.get(url)
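# The query can also be passed with params, exactly as in example 3 (same m.ip138.com
# endpoint; whether it still responds in this form depends on the site):
import requests
try:
    r = requests.get('http://m.ip138.com/ip.asp', params={'ip': 'www.njtech.edu.cn'})
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    print(r.request.url)    # http://m.ip138.com/ip.asp?ip=www.njtech.edu.cn
    print(r.text[-500:])
except:
    print("Fetch failed")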

 
