爬虫数据获取-urllib

学习记录册

已于 2023-03-02 14:43:18 修改

阅读量834

点赞数

分类专栏： # 爬虫知识点-基础文章标签：爬虫 python

于 2023-02-26 15:01:10 首次发布

本文链接：https://blog.csdn.net/j1451284189/article/details/129227036

版权

爬虫知识点-基础专栏收录该内容

12 篇文章 0 订阅

订阅专栏

数据获取-urllib

urlopen通用

代码说明

"""get请求"""
# Send the request and obtain the response object.
response = urllib.request.urlopen(url)
# Read the response body; read() returns bytes.
data = response.read()
# Converting between bytes and str.
str_data = data.decode('utf-8')   # bytes -> str
data = str_data.encode('utf-8')  # str -> bytes
# Percent-encode a URL that contains Chinese characters so urlopen does not fail.
encode_new_url = urllib.parse.quote(final_url,safe=string.printable)
# Turn a dict of parameters into URL query-string form.
params = urllib.parse.urlencode(params)

"""post请求"""
# Passing data makes the request a POST. The body must be bytes (a str raises
# TypeError), so the urlencoded parameters are encoded before being sent.
urllib.request.urlopen(url,data=params.encode('utf-8'))

简单实例

import random
import string
import urllib.parse
import urllib.request
from http import cookiejar

"爬取百度网站内容"
def load_data():
    """Download the Baidu home page and save it to ``baidu.html``.

    The response object is printed first, then its body is read as bytes,
    decoded as UTF-8 and written to the file in the current directory.
    """
    # Plain http is used on purpose: the original author notes that the
    # https and http addresses return different content.
    reply = urllib.request.urlopen('http://www.baidu.com/')
    print(reply)

    # read() yields bytes; decode() converts bytes -> str.
    # (The opposite direction, str -> bytes, is str.encode('utf-8').)
    page_text = reply.read().decode('utf-8')

    # Persist the decoded page.
    with open('baidu.html', mode='w', encoding='utf-8') as html_file:
        html_file.write(page_text)
        
"爬取百度网站搜索内容，参数在url中"        
def get_method():
    """Fetch a Baidu search result page and save it to ``baidu.html``.

    The search keyword is sent as the ``wd`` parameter inside the URL.
    The un-encoded URL is printed for reference before the request is made.
    """
    url = 'http://www.baidu.com/s?wd='
    name = '美女'
    final_url = url+name
    print(final_url)
    # urlopen raises UnicodeEncodeError ('ascii' codec can't encode ...) when
    # the URL contains Chinese characters, so the keyword is percent-encoded.
    # Only the keyword is quoted, with safe='' so that reserved characters in
    # it (space, '&', '#', '/') are escaped as well instead of altering the URL.
    encode_new_url = url + urllib.parse.quote(name, safe='')
    # The context manager closes the connection once the body has been read.
    with urllib.request.urlopen(encode_new_url) as response:
        str_data = response.read().decode('utf-8')
    with open('baidu.html', 'w', encoding='utf-8') as f:
        f.write(str_data)
        
'爬取百度网站搜索内容，get请求通过字典传参数'
def day2_get_params():
    """Request a Baidu search page with GET parameters supplied as a dict.

    The dict is converted to a query string, appended to the base URL, and
    the response body is read and decoded as UTF-8.
    """
    url = 'http://www.baidu.com/s?'
    params  = {
        'wd': '中文',
        'key': 'zhang',
        'value': 'san'
    }
    # urlencode() builds 'k=v&k=v' and percent-encodes non-ASCII values
    # itself, so the result is already a valid ASCII URL and needs no
    # further quote() pass.
    final_url = url + urllib.parse.urlencode(params)
    # The context manager closes the connection once the body has been read.
    with urllib.request.urlopen(final_url) as response:
        data = response.read().decode('utf-8')

urllib与请求头

代码说明

# Create the request object.
request = urllib.request.Request(url)
# Add a request header dynamically.
request.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0')

# Alternatively, create the request object and pass the headers as a dict.
request = urllib.request.Request(url,headers=headers)

# Send the request and get the response.
response = urllib.request.urlopen(request)

# All request headers.
request_headers = request.headers
# The value of a single request header. get_header is a method of the request
# object. Note the key: first letter upper case, the rest lower case,
# otherwise None is returned.
user_agent = request.get_header('User-agent')
# The full URL of the request.
full_url = request.get_full_url()

# The response headers.
response_headers = response.headers

简单实例

' 模拟真实浏览器发生请求 (百度批量搜索)'
def day2_request_header2():
    """Request the Baidu home page with a browser User-Agent header.

    After the request is sent, the request headers, the User-Agent value and
    the full request URL are printed.
    """
    url = 'https://www.baidu.com/'
    headers= {
        'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0'
    }
    # Headers are supplied when the request object is created. They can also
    # be added afterwards with request.add_header(name, value).
    request = urllib.request.Request(url,headers=headers)
    # The context manager closes the connection when the block is left.
    with urllib.request.urlopen(request) as response:
        # response.headers holds the response headers if they are needed.
        # Request headers.
        print(request.headers)
        # Note the key: first letter upper case, the rest lower case,
        # otherwise None is returned.
        print(request.get_header('User-agent'))
        # Full URL of the request.
        final_url = request.get_full_url()
        print(final_url)
    

'请求头浏览器信息随机'
def day2_random_agent():
    """Request Baidu with a randomly chosen User-Agent and print the one used."""
    url = 'http://www.baidu.com/'
    # Pool of browser identification strings to choose from.
    user_agent_list = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50',
    ]
    random_user_agent = random.choice(user_agent_list)
    request = urllib.request.Request(url)
    request.add_header('User-Agent', random_user_agent)
    # The body is not needed here; the context manager just makes sure the
    # connection is closed.
    with urllib.request.urlopen(request):
        print(request.get_header('User-agent'))

urllib与IP代理

IP代理简介

1、免费的IP：时效短，出错率高
2、付费IP：需要花钱，也可能失效
3、**IP分类**：
透明：对方知道我们真实的IP
匿名：对方不知道我们真实IP，知道你使用了代理
高匿：对方不知道我们真实IP，也不知道使用了代理

urlopen不支持代理的添加，需自己创建handler，流程如下
创建代理处理器ProxyHandler
使用ProxyHandler创建opener 
使用opener调用open方法请求数据

代码说明

# Summary
# Create your own handler; there are different kinds, e.g. HTTPHandler, ProxyHandler
handler = urllib.request.HTTPHandler() 
# Create your own opener from the handler
opener = urllib.request.build_opener(handler)  
# Request the URL through the opener and get the response
response = opener.open(url)

# Create a proxy handler; the proxy is written differently for free and paid proxies (see the examples)
proxy_handler = urllib.request.ProxyHandler(proxy)# create the proxy handler

简单实例

 """自定义urlopen类似功能"""
def day2_handler_opener():
    """Fetch a page with a hand-built opener instead of ``urlopen``.

    Background from the original notes: the stock urlopen has no way to add
    a proxy, so a custom opener is needed for that; urlopen itself works by
    having an opener, built from handlers, request the data. http uses
    port 80 and https uses port 443; https relies on SSL with certificates
    issued by a third-party CA.
    """
    target = 'https://blog.csdn.net/jjxp2011/article/details/124546976'
    # Build the opener directly from a freshly created handler.
    custom_opener = urllib.request.build_opener(urllib.request.HTTPHandler())
    # Use the custom opener, rather than urlopen, to request the data.
    body = custom_opener.open(target).read()

"""创建使用代理IP的opener"""
def day2_proxy_handler():
    """Fetch a page through a proxy IP using a ProxyHandler-based opener."""
    url = 'https://blog.csdn.net/jjxp2011/article/details/124546976'
    # Free proxy: 'host:port' (or 'http://host:port').
    # Paid proxy: 'username:pwd@host:port'.
    proxy_address = '120.77.249.46:8080'
    # ProxyHandler selects the proxy by the scheme of the requested URL.
    # The target is https, so an 'https' entry is required; with only an
    # 'http' entry the request would bypass the proxy entirely.
    proxy = {
        'http': proxy_address,
        'https': proxy_address,
    }
    # Handler that routes requests through the proxy.
    proxy_handler = urllib.request.ProxyHandler(proxy)
    # Opener built on that handler.
    opener = urllib.request.build_opener(proxy_handler)
    # Request via the proxy; the context manager closes the connection.
    with opener.open(url) as response:
        data = response.read()
 
"""爬虫多代理IP"""
def day2_random_proxy():
    """Try to open a page through each proxy in a list, printing any failure.

    Each proxy mapping is printed before it is tried. A failing proxy does
    not stop the loop; its error message is printed and the next one is tried.
    """
    url = 'https://blog.csdn.net/jjxp2011/article/details/124546976'
    proxy_addresses = [
        '120.77.249.46:8080',
        '18.191.216.4:80',
        '103.123.234.106:8080',
        '41.79.37.74:8585',
        '117.4.115.169:8080',
        '8.242.207.202:8080',
        '103.48.183.113:4145',
    ]
    for address in proxy_addresses:
        # The target URL is https, so the proxy must be registered for the
        # 'https' scheme too; otherwise it would never be used.
        proxy = {'http': address, 'https': address}
        print(proxy)
        # Build a handler and an opener for this proxy.
        proxy_handler = urllib.request.ProxyHandler(proxy)
        opener = urllib.request.build_opener(proxy_handler)
        try:
            # Only reachability matters; close the connection right away.
            with opener.open(url, timeout=1):
                pass
        except Exception as err:
            # Deliberately broad: any failure just means this proxy is
            # unusable, and the loop should move on to the next one.
            print(str(err))

urllib与密码管理器

代码说明

# Summary
# The proxy dict is written, and the handler and opener are created, the same
# way as for a free proxy IP.
# Create a password manager and register the user name and password for proxy_ip.
# The realm is None, so the default-realm manager is required: the plain
# HTTPPasswordMgr only matches the exact realm sent by the server, and an
# entry stored under None would never be found.
pwd_manager = urllib.request.HTTPPasswordMgrWithDefaultRealm()
pwd_manager.add_password(None,proxy_ip,user_name,pwd)

handler = urllib.request.ProxyBasicAuthHandler(pwd_manager)# handler that authenticates against the proxy

简单实例

 """密码管理器简单使用"""
def day3_auth_login():
    """Open a password-protected intranet address using HTTP basic auth.

    The credentials are registered in a password manager for ``nei_url``;
    the opener sends them when that address asks for authentication. The
    raw response body is printed.
    """
    user_name = '1dddddd'
    pwd = '1344!jfoem'
    nei_url = 'http://123.23.45.3:8080'
    # Create the password manager and register the user name and password.
    # Realm None acts as the catch-all realm for this manager class.
    pwd_manager = urllib.request.HTTPPasswordMgrWithDefaultRealm()
    pwd_manager.add_password(None,nei_url,user_name,pwd)
    auth_handler = urllib.request.HTTPBasicAuthHandler(pwd_manager)
    opener = urllib.request.build_opener(auth_handler)
    # Open the intranet address the credentials were registered for; they
    # are only applied to URLs under that address.
    with opener.open(nei_url) as response:
        print(response.read())
    
 """付费代理与密码管理器"""
def day3_money_proxy():
    """Fetch a page through a paid (authenticated) proxy and print the body.

    A ProxyHandler routes the request through the proxy, and a
    ProxyBasicAuthHandler backed by a password manager supplies the
    user name and password when the proxy asks for them.
    """
    url = 'https://blog.csdn.net/jjxp2011/article/details/124546976'
    user_name = '1dddddd'
    pwd = '1344!jfoem'
    proxy_money = '123.23.45.3:8080'
    # Without a ProxyHandler the request never goes through the proxy, so the
    # credentials would never be used. Both schemes are registered because
    # the target URL is https.
    proxy_handler = urllib.request.ProxyHandler({
        'http': proxy_money,
        'https': proxy_money,
    })
    # Create the password manager and register the user name and password.
    # Realm is None, so the default-realm manager is needed; the plain
    # HTTPPasswordMgr would never match an entry stored under None.
    pwd_manager = urllib.request.HTTPPasswordMgrWithDefaultRealm()
    pwd_manager.add_password(None,proxy_money,user_name,pwd)
    # Handler that authenticates against the proxy.
    handler = urllib.request.ProxyBasicAuthHandler(pwd_manager)
    # NOTE(review): for https targets the proxy is reached through a tunnel;
    # confirm the auth challenge is answered there. If not, put the
    # credentials in the proxy address instead ('username:pwd@host:port').
    opener = urllib.request.build_opener(proxy_handler, handler)
    with opener.open(url) as response:
        print(response.read())

urllib与Cookie

代码说明

cookie用于获取需要权限的内容
# A POST request requires the body as bytes: urlencode the form dict, then encode it
login_byte = urllib.parse.urlencode(login_form_data).encode('utf-8') 
# Define a handler that adds cookie support, backed by a CookieJar
cook_jar = cookiejar.CookieJar()
cook_handler = urllib.request.HTTPCookieProcessor(cook_jar)

简单实例

# Method 1: put the cookie into the request headers, just like User-Agent.
# Drawback: you have to log in with a browser yourself to obtain the cookie.
def day4_cookies():
    """Request a login-protected Baidu page with a hand-copied cookie.

    The cookie is sent as a request header and the decoded response body is
    written to ``baidu.html``.
    """
    url = 'https://www.baidu.com/my/index'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0',
        # Cookie string copied from a logged-in browser session; any further
        # 'name=value' pairs are appended separated by '; '.
        'Cookie': 'BAIDUID=183FF5BBC1048B6A7AFE9AD123052C0D:SL=0:NR=10:FG=1',
    }
    request = urllib.request.Request(url,headers=headers)
    # The context manager closes the connection once the body has been read.
    with urllib.request.urlopen(request) as response:
        data = response.read().decode('utf-8')
    # Write the page to a file.
    with open('baidu.html', 'w', encoding='utf-8') as f:
        f.write(data)
        
# Method 2: log in with a cookiejar-backed opener; the cookie is captured
# automatically at login and then used for later requests.
# Drawback: you must find the login URL and every other parameter the login
# needs (SMS verification code, etc.).
def day4_auto_login():
    """Log in with a cookie-aware opener, then fetch a page that needs login.

    The login form is POSTed through an opener that has an
    HTTPCookieProcessor, so cookies set by the login response are stored in
    the jar. The same opener then requests the protected page, and the
    decoded body is written to ``baidu.html``.
    """
    # URL of the login endpoint.
    login_url = 'http://bpoyg.zdzhiheng.com.cn:8000/prod-api/login'
    login_header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0'
    }
    login_form_data = {
        "username":"10***28",
        "password":"W!j***13#",
        "code":"d74y",
    }

    # A POST body must be bytes: urlencode the form, then encode it.
    login_byte = urllib.parse.urlencode(login_form_data).encode('utf-8')

    # Handler that stores received cookies and sends them on later requests.
    cook_jar = cookiejar.CookieJar()
    cook_handler = urllib.request.HTTPCookieProcessor(cook_jar)
    # Opener built on the cookie handler.
    opener = urllib.request.build_opener(cook_handler)
    # The login request has to be sent through this opener; merely building
    # the Request object performs no network call and leaves the jar empty.
    login_request = urllib.request.Request(login_url, headers=login_header, data=login_byte)
    with opener.open(login_request):
        pass

    # Use the same opener to request a page that requires login, e.g. the
    # personal centre.
    # NOTE(review): this is a placeholder address on a different host than
    # login_url; presumably it should be a page on the site that was logged
    # into, since cookies are sent back only to the site that set them.
    center_url = 'http://123.com/user/profile'
    center_request = urllib.request.Request(center_url, headers=login_header)
    with opener.open(center_request) as response:
        data = response.read().decode('utf-8')
    # Write the page to a file.
    with open('baidu.html', 'w', encoding='utf-8') as f:
        f.write(data)

urllib与URLError

#URLerror 用于应对出错时抛出对应解决方法
import urllib.error

def day4_urlerr():
    """Demonstrate handling of urllib request errors.

    Requests a page with a very short timeout and prints the HTTP status
    code for an HTTPError, or the error itself for any other failure.
    """
    url = 'http://www.python.org/'
    try:
        # Only the outcome of the request matters here; the context manager
        # closes the connection on success.
        with urllib.request.urlopen(url,timeout=0.1):
            pass
    except urllib.error.HTTPError as err:
        # HTTPError is a subclass of URLError, so it must be caught first.
        print(err.code)
    except urllib.error.URLError as err:
        print(err)
    except OSError as err:
        # A timeout that happens while waiting for the response (after the
        # request has been sent) is raised as a plain socket timeout and is
        # not wrapped in URLError.
        print(err)