urllib.request 模块
处理:基本和摘要式身份验证,重定向,cookies等等。
1.导入:
import urllib.request
2.urllib.request.urlopen(url, data=None, [timeout, ]*, cafile=None, capath=None, cadefault=False, context=None)
打开网址URL,url参数可以是一个字符串或一个 Request 对象;返回一个类文件的响应对象(response)。
import urllib.request
response = urllib.request.urlopen("http://www.baidu.com") # returns a response object
- data(为字节类型) 有值时,HTTP请求将会执行POST请求而不是GET请求。
- context 如被指定,它必须是一个 ssl.SSLContext实例描述各种SSL选项。点击HTTPSConnection查看更多细节。
- 可选cafile和capath参数指定一组被HTTPS请求信任的CA证书。cafile应该指向一个文件包含CA证书的包,而capath应该指向一个散列的证书文件的目录。点击ssl.SSLContext.load_verify_locations()查看更多的信息。
- cadefault参数被忽略。
# response.geturl(): the URL this response was actually fetched from
print(response.geturl()) # http://www.baidu.com
# response.info(): the page's meta-information (response headers)
print(response.info()) # Bdpagetype: 1 Bdqid: 0xfb1ff00500002b3b Cache-Control: private Content-Type: text/html etc.
# response.getcode(): the numeric HTTP status code of the response
print(response.getcode()) # 200
# response.read(): read the HTML body; returns bytes
print(response.read()) # body content as bytes
# response.getheaders(): the response headers
print(response.getheaders()) # returns a list whose elements are tuples
# response.readlines(): read line by line; returns a list of bytes lines
print(response.readlines())
3.urllib.request.urlretrieve(url, filename=None) 下载url指向的资源并保存为本地文件;参数为文件链接和保存的文件名(可选,省略时保存到临时文件)。
response = urllib.request.urlretrieve("http://www.baidu.com") # download the URL to a local (temp) file
print(response)
# returns a tuple of (local file path, HTTP message object), e.g. ('C:\\Users\\XIN\\AppData\\Local\\Temp\\tmphnkdp400', <http.client.HTTPMessage object at 0x0000021C735C4FD0>)
4.urllib.request.Request(url, data=None, headers={}, origin_req_host=None,unverifiable=False, method=None) 返回一个request对象
response = urllib.request.Request("http://www.baidu.com") # builds a Request object (not sent yet)
print(response) # <urllib.request.Request object at 0x00000245ED1ECA58>
例子:
get请求:
构建请求头信息(反爬第一步)
构建请求对象:urllib.request.Request(url,headers = headers)
import urllib.request
import urllib.parse

# Target page plus a browser-like User-Agent header (anti-scraping step 1).
target_url = "https://www.tianyancha.com/company/2343981465"
ua_headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36",
}

# Build a Request carrying the headers, send it, and print the decoded body.
req = urllib.request.Request(target_url, headers=ua_headers)
resp = urllib.request.urlopen(req)
print(resp.read().decode())
post请求:
注意:表单数据需要处理 form_data = urllib.parse.urlencode(form_data).encode()
import urllib.request as request
import urllib.parse
import json

# Ask the user for a word and POST it to Baidu's suggestion endpoint.
word = input("请输入你要翻译的单词:")
api_url = "https://fanyi.baidu.com/sug"

ua_headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36",
}

# Build the request object with the anti-scraping headers.
req = request.Request(api_url, headers=ua_headers)
# POST form data must be urlencoded and then encoded to bytes.
payload = urllib.parse.urlencode({"kw": word}).encode()
# Passing data= makes urlopen issue a POST instead of a GET.
resp = request.urlopen(req, data=payload)
# The endpoint answers JSON; print the first suggestion's translation.
result = json.loads(resp.read().decode())
print(result["data"][0]["v"])
附录:
# POST 请求
# POST request example: KFC store-list endpoint.
import urllib.request
import urllib.parse

api_url = "http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=cname"
form_fields = {
    'cname': '北京',
    'pid': '',
    'pageIndex': '2',
    'pageSize': '10',
}
ua_headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36",
}

# Build the request first; the urlencoded body is attached at send time,
# which is what makes urlopen issue a POST.
req = urllib.request.Request(api_url, headers=ua_headers)
body = urllib.parse.urlencode(form_fields).encode()
resp = urllib.request.urlopen(req, data=body)
print(resp.read().decode())
urllib.request模块_复杂的get(拼接查询参数)
# -*- coding:utf-8 -*-
# GET request with user-supplied query parameters (Baidu Tieba search).
import urllib.request
import urllib.parse

url = "http://tieba.baidu.com/f?ie=utf-8"
data = {
    "kw": input("请输入吧名"),
    # Tieba pages by 50 posts: page N starts at pn = (N - 1) * 50.
    "pn": str((int(input("请输入页码")) - 1) * 50),
}
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36",
}
# BUG FIX: the original did `url += fromdata`, gluing "kw=..." directly onto
# "...?ie=utf-8" with no separator; a "&" is required between parameters.
url += "&" + urllib.parse.urlencode(data)
request = urllib.request.Request(url, headers=headers)
# BUG FIX: the original also passed data= to urlopen, which silently turned
# this "complex GET" example into a POST; a GET sends no request body.
response = urllib.request.urlopen(request)
print(response.read().decode())
urllib.request模块_cookie模拟登陆
# -*- coding:utf-8 -*-
# Simulated login that keeps the session cookie for a follow-up request.
import urllib.request
import urllib.parse
import http.cookiejar  # used to store cookies between requests

# Create a CookieJar object that will hold the session cookies.
cj = http.cookiejar.CookieJar()
# Create a handler that reads/writes cookies from/to cj on every request.
handler = urllib.request.HTTPCookieProcessor(cj)
# Build an opener from the handler; use opener.open() instead of urlopen().
opener = urllib.request.build_opener(handler)

login_url = "http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=2019122253442"
from_data = {
    'email': '账号',
    'icode': '',
    'origURL': 'http://www.renren.com/home',
    'domain': 'renren.com',
    'key_id': '1',
    'captcha_type': 'web_login',
    'password': '密码',
    'rkey': 'd45f5d17e437f1e50eace68adaa5ed89',
    'f': 'http%3A%2F%2Fwww.renren.com%2F969798592',
}
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36",
}
# POST form data must be urlencoded and then encoded to bytes.
from_data = urllib.parse.urlencode(from_data).encode()
# FIX: the original built the login Request without headers, so the login
# POST went out with the default Python-urllib User-Agent while only the
# follow-up GET used the browser UA; send the same headers on both.
request = urllib.request.Request(url=login_url, headers=headers)
response = opener.open(request, data=from_data)  # the cookies are saved into cj
print(response.read().decode())

# Second request: cookies stored in cj are attached automatically by the opener.
get_url = "http://www.renren.com/969798592/profile"
request = urllib.request.Request(url=get_url, headers=headers)
response = opener.open(request)
print(response.read().decode())
urllib.request模块_使用handler/opener发送请求
# -*- coding:utf-8 -*-
# Sending a request through an explicit HTTPHandler + opener.
import urllib.request
import urllib.parse

target = "http://www.baidu.com/"
ua_headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36",
}

# An HTTPHandler wrapped by build_opener gives an opener object; from then on
# every request (GET and POST alike) goes through opener.open(), not urlopen().
http_handler = urllib.request.HTTPHandler()
custom_opener = urllib.request.build_opener(http_handler)

req = urllib.request.Request(target, headers=ua_headers)
resp = custom_opener.open(req)
print(resp.read().decode())
handler_代码配置代理,验证
# Routing a request through an HTTP proxy via ProxyHandler.
import urllib.request
import urllib.parse

url = "http://www.baidu.com/s?wd=ip"
# ProxyHandler takes a dict of {"scheme": "ip:port"}.
# BUG FIX: the original used the key "http:" (with a trailing colon); the
# scheme key must be exactly "http", otherwise the proxy is never applied
# and the request goes out directly.
handler = urllib.request.ProxyHandler({"http": "113.79.75.104:9797"})
# Build an opener from the proxy handler.
opener = urllib.request.build_opener(handler)
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36",
}
request = urllib.request.Request(url, headers=headers)
response = opener.open(request)
print(response.read().decode())
urllib.parse 模块 (可以用于url 中文转换)
# urllib.parse utilities: percent-encode/decode URLs and build query strings.
# FIX: import moved above first use (the original called urllib.parse on the
# line before its own `import urllib.parse`).
import urllib.parse

# urllib.parse.quote(url): percent-encode the non-ASCII (e.g. Chinese) parts of a URL.
response = urllib.parse.quote("http://www.mmthat.com/index.html?name=测试者")
print(response)  # http%3A//www.mmthat.com/index.html%3Fname%3D%E6%B5%8B%E8%AF%95%E8%80%85
# urllib.parse.unquote(url): decode a percent-encoded URL.
# BUG FIX: the original called urllib.parse.upquote, which does not exist
# (AttributeError); the correct name is unquote.
response = urllib.parse.unquote("http%3A//www.mmthat.com/index.html%3Fname%3D%E6%B5%8B%E8%AF%95%E8%80%85")
print(response)  # http://www.mmthat.com/index.html?name=测试者
# urllib.parse.urlencode(dict): turn a dict into "k1=v1&k2=v2" form,
# ready to append to a URL as a query string.
response = urllib.parse.urlencode({"key1": "value1", "key2": "value2"})
print(response)  # key1=value1&key2=value2