【爬虫基础】day02 获取/添加请求头信息、代理IP

版权声明:本文为博主原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。
本文链接:https://blog.csdn.net/xiaoyaosheng19/article/details/89300405

用多个User-Agent

import urllib.request
import random


def load_baidu():
    """Fetch http://www.baidu.com using a randomly chosen User-Agent.

    Demonstrates rotating User-Agent strings so that each request
    presents a different browser identity.  Prints the User-Agent
    header actually attached to the request.  Returns None.
    """
    url = "http://www.baidu.com"
    user_agent_list = [
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
        "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
    ]
    # Pick a different browser identity for every call.
    random_user_agent = random.choice(user_agent_list)
    # Build the request object, then attach the header dynamically.
    request = urllib.request.Request(url)
    request.add_header("User-Agent", random_user_agent)
    # Fetch the page.  The context manager closes the connection; the
    # original leaked the response object by never closing it.  The body
    # itself is not used in this demo.
    with urllib.request.urlopen(request):
        pass
    # NOTE: get_header normalizes the key — first letter capitalized,
    # the rest lowercase ("User-agent").
    print(request.get_header("User-agent"))


load_baidu()

import urllib.request
"""
get传参:
(1)汉字报错:解释器ascii没有汉字。需要url汉字转码
urllib.parse.quote(url, safe=string.printable)
(2) 字典传参:
urllib.parse.urlencode(dictionary)
"""
"""
post传参:
urllib.request.urlopen(url, data="服务器接收的数据")

"""

def load_baidu():
    """Fetch http://www.baidu.com with explicit headers and save the page.

    Shows two ways of attaching request headers — a headers dict passed
    at construction time and add_header() afterwards — how to read a
    header back, and writes the response body to day02.html.
    Returns None.
    """
    url = "http://www.baidu.com"
    # Request headers: a realistic browser User-Agent plus an arbitrary
    # custom header to show that any key/value pair can be sent.
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134",
        "Hah": "heh"
    }

    # Way 1: pass every header when creating the request object.
    request = urllib.request.Request(url, headers=header)

    # Reading a single header back.  NOTE the key normalization: first
    # letter upper-case, the rest lower-case ("User-agent").
    request_headers = request.get_header("User-agent")
    print(request_headers)

    # Way 2: attach one more header dynamically with add_header(key, value).
    request.add_header("Accept", "*/*")

    # Fetch the page.  The context manager guarantees the connection is
    # closed (the original leaked the response object), even if decoding
    # raises.
    with urllib.request.urlopen(request) as response:
        data = response.read().decode("utf-8")
    print("=="*50)
    # The fully resolved URL of the request.
    print("<>"*20)
    final_url = request.get_full_url()
    print(final_url)

    # All headers currently set on the request object.
    request_headers = request.headers
    print(request_headers)
    with open("day02.html", "w", encoding="utf-8") as f:
        f.write(data)


load_baidu()
展开阅读全文

没有更多推荐了,返回首页