Web Crawler Basics (3)

  • Contents
  1. Introductory exercises
  2. Note: MOOC "Python Web Crawling and Information Extraction"
  • 5 small exercises
    1. JD product page
    2. Amazon product page
    3. Baidu and 360 search APIs
    4. Image download
    5. Automatic IP lookup
# -*- coding=utf-8 -*-

import requests
from bs4 import BeautifulSoup

def getHtmlText(url):
	try:
		req = requests.get(url, timeout=10)
		req.raise_for_status()  # raise HTTPError for 4xx/5xx status codes
		print(req.text[:1000])
	except requests.RequestException:
		print("Crawl failed")


if __name__ == '__main__':
	# Exercise 1: crawl a JD product page
	url = 'https://item.jd.com/8460509.html'
	# getHtmlText(url)

	# Exercise 2: Amazon product page
	url1 = 'https://www.amazon.cn/gp/product/B01M8L5z3Y'
	# req = requests.get(url1)
	# print(req.status_code)
	# print(req.encoding)
	# print(req.apparent_encoding)
	# print(len(req.text))
	# print(req.text[1000:2000])
	# print(req.request.headers)  # {'User-Agent': 'python-requests/2.22.0', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive'}
	# # If the request is rejected, i.e. the web server only serves browsers, set a browser-like User-Agent in headers:
	# kv = {'user-agent': 'Mozilla/5.0'}
	# r = requests.get(url1, headers=kv)
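
	# A reusable helper distilled from the header trick above; a minimal
	# sketch of my own (the name fetch_as_browser and the timeout value are
	# assumptions, not part of the original notes):
	def fetch_as_browser(url, timeout=10):
		headers = {'user-agent': 'Mozilla/5.0'}  # pretend to be a browser
		r = requests.get(url, headers=headers, timeout=timeout)
		r.raise_for_status()
		r.encoding = r.apparent_encoding  # decode with the sniffed charset
		return r.text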

	# Exercise 3: keyword submission to Baidu and 360
	# Baidu keyword API: http://www.baidu.com/s?wd=keyword
	# 360 keyword API:   http://www.so.com/s?q=keyword
	# 
	url2 = 'http://www.baidu.com/s'
	# kv = {'wd': 'python'}
	# r = requests.get(url2, params=kv)
	# print(r.status_code)  # 200
	# # print(r.headers)
	# print(r.request.headers)
	# print(r.encoding)  # ISO-8859-1
	# # print(r.apparent_encoding)
	# print(r.request.url)  # http://www.baidu.com/s?wd=python
	# print(len(r.text))  # 459202
	# print(r.text[1000:2000])
	# 
	# 360
	# 
	url3 = 'http://www.so.com/s'
	# kv = {'q': 'Pythonic'}
	# r3 = requests.get(url3, params=kv)
	# print(r3.request.url)  # https://registry.co.com?q=Pythonic
	# print(r3.status_code)  # 200
	# print(r3.encoding)  # utf-8
	# print(r3.apparent_encoding)  # utf-8
	# print(len(r3.text))  # 213248
	# print(r3.text[1000:2000])
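
	# Both engines follow the same pattern: hand the keyword to requests via
	# params and let it build the query string. A generic sketch (the helper
	# name search_keyword is my own, not from the course):
	def search_keyword(base_url, param_name, keyword):
		r = requests.get(base_url, params={param_name: keyword}, timeout=10)
		r.raise_for_status()
		return r
	# e.g. search_keyword('http://www.baidu.com/s', 'wd', 'python')
	#      search_keyword('http://www.so.com/s', 'q', 'Pythonic')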

	# Exercise 4: image crawling
	# Exploring the page first
	# url4 = 'http://www.nationalgeographic.com.cn/photography/photo_of_the_day/3921.html'
	# r4 = requests.get(url4)
	# print(r4.status_code)
	# print(r4.encoding)  # ISO-8859-1
	# # print(r4.apparent_encoding)
	# r4.encoding = r4.apparent_encoding  # switch to the sniffed encoding so the Chinese text decodes correctly
	# print(r4.encoding)
	# print(r4.text[:500])
	# 
	# Image download
	# Images, audio, and video can all be saved locally by writing .content in binary mode
	url5 = 'http://image.nationalgeographic.com.cn/2017/0211/20170211061910157.jpg'
	path = r"D:/abc.jpg"
	r5 = requests.get(url5)
	print(r5.status_code)  # 200
	with open(path, 'wb') as f:
		f.write(r5.content)
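
	# For large media files, a streamed variant avoids holding the whole body
	# in memory; a hedged sketch (the 8192-byte chunk size is an arbitrary
	# choice, not from the original notes):
	# with requests.get(url5, stream=True) as resp:
	# 	resp.raise_for_status()
	# 	with open(path, 'wb') as f:
	# 		for chunk in resp.iter_content(chunk_size=8192):
	# 			f.write(chunk)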
  • The downloaded image (screenshots omitted)
	# Exercise 5: automatic IP geolocation lookup
	kv = {'ip': '202.204.80.112'}
	url6 = "http://m.ip138.com/ip.asp"
	r6 = requests.get(url6, params=kv)
	print(r6.request.url)
	print(r6.status_code)
	print(r6.encoding)  # None
	print(r6.text[1700:2000])
	
>>>Output
http://m.ip138.com/ip.asp?ip=202.204.80.112
200
None
0" class="form-text" value="202.204.80.112" />

						<input type="submit" value="查询" class="form-btn" />

					</form>

				</div>

				<div class="query-hd">ip138.com IP查询(搜索IP地址的地理位置)</div>

				<h1 class="query">您查询的IP:202.204.80.112</h1><p class="result">本站主数据:北京市海淀区 北京理工大学 教育网</p><p class="res
[Finished in 0.9s]
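
  • The slice r6.text[1700:2000] is brittle. Since BeautifulSoup is already imported, a hedged alternative is to pull the answer out by the class names visible in the HTML above ("query" and "result"); a sketch:

	soup = BeautifulSoup(r6.text, 'html.parser')
	print(soup.find('h1', class_='query').get_text())   # 您查询的IP:202.204.80.112
	print(soup.find('p', class_='result').get_text())   # 本站主数据:北京市海淀区 北京理工大学 教育网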