Python requests 请求(爬虫)

最新推荐文章于 2024-07-18 15:53:23 发布

麦合学长

最新推荐文章于 2024-07-18 15:53:23 发布

阅读量1.3k

点赞数 9

文章标签： python 爬虫 开发语言

本文链接：https://blog.csdn.net/rassamqi/article/details/135126034

版权

""""
反爬机制:门户网站，可以通过制定相应的策略或者技术手段，防止爬虫程序进行网站数据的爬取。
反反爬策略:爬虫程序可以通过制定相关的策略或者技术手段，破解门户网站中具备的反爬机制，从而可以获取门户网站中相关的数据。
robots.txt协议:君子协议。规定了网站中哪些数据可以被爬虫爬取哪些数据不可以被爬取。
"""

from urllib.request import urlopen  # stdlib helper that opens a URL and returns a response object

url = "http://www.baidu.com"

# The response body is a stream: it can be consumed only once, and every
# later read() returns b"".  Read it a single time and reuse the bytes.
# The `with` block also closes the connection once the body has been read.
with urlopen(url) as resp:
    print(resp)  # e.g. <http.client.HTTPResponse object at 0x000002814508BFD0>
    raw_html = resp.read()

print(raw_html)  # page source as raw bytes (b'...')

# To choose the codec, search the page source for "charset" (utf-8, GBK, ...).
# For this page the result is charset=utf-8.
html = raw_html.decode("utf-8")
print(html)  # page source as text

# The page source is a mix of HTML, CSS and JS.

# Save the page to disk.  Opening the saved file in a browser shows the
# rendered page, whereas print() only ever shows the source text.  The local
# copy can be edited freely.
with open("mybaid.html", mode="w", encoding="utf-8") as f:
    f.write(html)


# ***** requests 测试

import requests

# Fetch the page source of the Baidu home page.
url = "http://www.baidu.com"
# requests applies no timeout by default, so bound the wait explicitly;
# otherwise a stalled server would hang the script forever.
resp = requests.get(url, timeout=10)
resp.encoding = "utf-8"  # decode the body as UTF-8 when resp.text is read
print(resp)  # e.g. <Response [200]>
print(resp.text)  # page source as text



# ****************   get 请求

import requests

content = input('请输入你要检索的内容:')
url = "https://www.sogou.com/web"
# Hand the search term to requests via `params` so it is URL-encoded.
# Interpolating raw input into the URL breaks on characters such as
# '&', '#' or '+'.
query = {"query": content}

headers = {
    # Send a browser User-Agent as a minimal anti-crawl workaround.
    # Headers are passed as a dict.  The value must not start with
    # whitespace: requests rejects such header values with InvalidHeader.
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
}
# requests applies no timeout by default, so bound the wait explicitly.
resp = requests.get(url, params=query, headers=headers, timeout=10)
print(resp.text)

print(resp.request.headers)  # the headers that were actually sent
# e.g. {'User-Agent': 'Mozilla/5.0 xxxx', 'Accept-Encoding': 'gzip, deflate',
#       'Accept': '*/*', 'Connection': 'keep-alive'}
# The User-Agent is exactly the one supplied above.

# ******
# What does requests send when no `headers` argument is given?
resp = requests.get(url, params=query, timeout=10)
print(resp.request.headers)
# The default User-Agent is 'python-requests/<version>', so the server can
# easily tell that the client is a script.  Sample output from the original
# run:
# {'User-Agent': 'python-requests/2.31.0', 'Accept-Encoding': 'gzip, deflate',
#  'Accept': '*/*', 'Connection': 'keep-alive', 'Cookie': 'ABTEST=...; SNUID=...; ...'}



# *********************   post 请求

import requests
import json
url = "https://fanyi.baidu.com/sug"

# Form fields sent in the POST body.
hehe = {
    "kw": input("请输入一个单词")
}

# `data=` sends the dict as a form-encoded body.  requests applies no timeout
# by default, so bound the wait explicitly.
resp = requests.post(url, data=hehe, timeout=10)

print(resp.text)  # raw response text; non-ASCII appears as \uXXXX escapes
# e.g. {"errno":0,"data":[{"k":"Apple","v":"n. \u82f9\u679c\u516c\u53f8..."}, ...]}

# Parse the JSON body once and reuse the result instead of decoding it again
# for every print.
result = resp.json()
print(result)  # parsed JSON (a dict); the escapes are now decoded
# e.g. {'errno': 0, 'data': [{'k': 'Apple', 'v': 'n. 苹果公司，原称苹果电脑公司'}, ...]}
print(result['data'])  # just the value stored under 'data'

# ***********************

import requests

url = "https://movie.douban.com/j/chart/top_list"

# Query string, already URL-encoded by hand.
hehe = "type=13&interval_id=100%3A90&action=&start=0&limit=20"

# Deliberately sent WITHOUT a browser User-Agent to show what happens when
# no anti-crawl workaround is in place.  `params` is passed by keyword for
# clarity.  requests applies no timeout by default, so bound the wait.
resp = requests.get(url, params=hehe, timeout=10)
# The original author observed an empty body here and attributed it to the
# site checking the client, i.e. an anti-crawl measure.
print(resp.text)


"""""
	说明一下 params
hehe = {
	"type": "13",
	"interval_id": "100:90",
	"action": "",
	"start": "0",
	"limit": "20"
}
params=hehe 等价于在 URL 后面拼接查询字符串 "type=13&interval_id=100%3A90&action=&start=0&limit=20"
# 你可以理解成： params 相当于把字典的键值对 (key: value) 拼接成 key=value&key=value 的形式
  所以，我们就写成字典方式
"""""

import requests

url = "https://movie.douban.com/j/chart/top_list"

# Query parameters as a dict.  requests URL-encodes them and appends them
# to the URL.
hehe = {
    "type": "13",
    "interval_id": "100:90",
    "action": "",
    "start": "0",
    "limit": "20"
}

# Browser User-Agent as a minimal anti-crawl workaround.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36"
}


# requests applies no timeout by default, so bound the wait explicitly.
resp = requests.get(url, params=hehe, headers=headers, timeout=10)

# Raw JSON text.  The "\/" sequences in URLs are the JSON escape for "/",
# not a change made by the site.
print(resp.text)   # [{"rating":["9.6","50"],"rank":1,"cover_url":"https://img1.doubanio.com\/view\/photo\/s_ratio_poster\/public\/p2561716440.jpg" .....

# Parsed JSON.  Decoding removes the escapes, so URLs print normally.
# Prefer json() whenever the response is JSON.
print(resp.json())   # [{'rating': ['9.6', '50'], 'rank': 1, 'cover_url': 'https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2561716440.jpg', ...

print(resp.request.url)   # https://movie.douban.com/j/chart/top_list?type=13&interval_id=100%3A90&action=&start=0&limit=20
print(resp.request)     # <PreparedRequest [GET]>
# requests inserts the '?' and joins the parameters itself, so the URL never
# has to be concatenated by hand.