- 入门小练习
- 附注:MOOC
Python网络爬虫与信息提取
- 5个小练习
- 京东商品页面
- 亚马逊商品页面
- 百度,360搜索接口
- 图片下载
- ip自动查询
# -*- coding=utf-8 -*-
import requests
from bs4 import BeautifulSoup
def getHtmlText(url, timeout=30):
    """Fetch *url* and print the first 1000 characters of the response body.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the script forever.

    Prints "爬取失败" instead when the request fails (connection error,
    timeout, or an HTTP error status). Nothing is returned in either case.
    """
    try:
        req = requests.get(url, timeout=timeout)
        # Turn 4xx/5xx responses into an exception so they are reported as
        # failures rather than printed as if they were the page.
        req.raise_for_status()
    except requests.RequestException:
        # Only request-related failures are handled here; programming errors
        # and KeyboardInterrupt are allowed to propagate.
        print("爬取失败")
        return
    # When the server declares no charset for text content, requests falls
    # back to ISO-8859-1, which garbles Chinese pages. Use the encoding
    # detected from the body in that case.
    if req.encoding and req.encoding.lower() == 'iso-8859-1':
        req.encoding = req.apparent_encoding
    print(req.text[:1000])
if __name__ == '__main__':
    # --- Exercise 1: fetch a JD.com product page -------------------------
    url = 'https://item.jd.com/8460509.html'
    # getHtmlText(url)

    # --- Exercise 2: fetch an Amazon product page ------------------------
    url1 = 'https://www.amazon.cn/gp/product/B01M8L5z3Y'
    # req = requests.get(url1)
    # print(req.status_code)
    # print(req.encoding)
    # print(req.apparent_encoding)
    # print(len(req.text))
    # print(req.text[1000:2000])
    # print(req.request.headers)  # {'User-Agent': 'python-requests/2.22.0', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive'}
    # # If the request is rejected because the web server only serves
    # # browsers, send a browser-like User-Agent header:
    # kv = {'user-agent': 'Mozilla/5.0'}
    # r = requests.get(url1, headers=kv)

    # --- Exercise 3: submit a keyword to Baidu / 360 search --------------
    # Baidu keyword interface: http://www.baidu.com/s?wd=keyword
    # 360 keyword interface:   http://www.so.com/s?q=keyword
    #
    url2 = 'http://www.baidu.com/s'
    # kv = {'wd': 'python'}
    # r = requests.get(url2, params=kv)
    # print(r.status_code)
    # # print(r.headers)
    # print(r.request.headers)
    # print(r.encoding)  # NOTE(review): the original note "ISO-8859-1" was attached to status_code; it presumably belongs here - confirm
    # # print(r.apparent_encoding)
    # print(r.request.url)  # http://www.baidu.com/s?wd=python
    # print(len(r.text))  # 459202
    # print(r.text[1000:2000])
    #
    # 360
    #
    url3 = 'http://www.so.com/s'
    # kv = {'q': 'Pythonic'}
    # r3 = requests.get(url3, params=kv)
    # print(r3.request.url)  # NOTE(review): recorded value was garbled; expected something like http://www.so.com/s?q=Pythonic - confirm
    # print(r3.status_code)  # 200
    # print(r3.encoding)  # utf-8
    # print(r3.apparent_encoding)  # utf-8
    # print(len(r3.text))  # 213248
    # print(r3.text[1000:2000])

    # --- Exercise 4: image scraping --------------------------------------
    # Practice site
    # url4 = 'http://www.nationalgeographic.com.cn/photography/photo_of_the_day/3921.html'
    # r4 = requests.get(url4)
    # print(r4.status_code)
    # print(r4.encoding)  # ISO-8859-1
    # # print(r4.apparent_encoding)
    # r4.encoding = r4.apparent_encoding
    # print(r4.encoding)
    # print(r4.text[:500])
    #
    # Image download.
    # Images, audio and video can all be saved locally by writing the
    # response's .content in binary mode.
    url5 = 'http://image.nationalgeographic.com.cn/2017/0211/20170211061910157.jpg'
    path = r"D:/abc.jpg"  # Windows drive path; adjust on other systems
    # A timeout keeps a stalled connection from hanging the script.
    r5 = requests.get(url5, timeout=30)
    print(r5.status_code)  # 200
    # Stop on an HTTP error so an error page is never saved as the image.
    r5.raise_for_status()
    with open(path, 'wb') as f:
        f.write(r5.content)
- 秀下载的图片
# Look up the geographic location of an IP address through the ip138 query
# page and print a slice of the returned HTML that contains the result.
kv = {'ip': '202.204.80.112'}
url6 = "http://m.ip138.com/ip.asp"
# A timeout keeps a stalled connection from hanging the script.
r6 = requests.get(url6, params=kv, timeout=30)
print(r6.request.url)
print(r6.status_code)
print(r6.encoding)  # None was observed; requests then guesses the encoding when decoding .text
print(r6.text[1700:2000])
>>>Output
http://m.ip138.com/ip.asp?ip=202.204.80.112
200
None
0" class="form-text" value="202.204.80.112" />
<input type="submit" value="查询" class="form-btn" />
</form>
</div>
<div class="query-hd">ip138.com IP查询(搜索IP地址的地理位置)</div>
<h1 class="query">您查询的IP:202.204.80.112</h1><p class="result">本站主数据:北京市海淀区 北京理工大学 教育网</p><p class="res
[Finished in 0.9s]