爬虫简单笔记——urllib.request有关用法整理

一、urllib.request

#1 The simplest form

import urllib.request

# urlopen() returns an http.client.HTTPResponse; read() yields raw bytes.
response = urllib.request.urlopen('http://python.org/')
html = response.read()
#2 Using a Request object

import urllib.request

# Wrap the URL in a Request first, then hand that to urlopen().
req = urllib.request.Request('http://python.org/')
response = urllib.request.urlopen(req)
the_page = response.read()
#3 Sending POST data
import urllib.parse
import urllib.request

url = 'http://localhost/login.php'
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
values = {
'act' : 'login',
'login[email]' : 'yzhang@i9i8.com',
'login[password]' : '123456'
}
# urlencode() returns str, but Request's data argument must be bytes on
# Python 3 -- without .encode() urlopen() raises TypeError.
data = urllib.parse.urlencode(values).encode('ascii')
req = urllib.request.Request(url, data)
req.add_header('Referer', 'http://www.python.org/')
response = urllib.request.urlopen(req)
the_page = response.read()
print(the_page.decode("utf8"))
#4 Sending POST data together with headers

import urllib.parse
import urllib.request

url = 'http://localhost/login.php'
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
values = {
'act' : 'login',
'login[email]' : 'yzhang@i9i8.com',
'login[password]' : '123456'
}
headers = { 'User-Agent' : user_agent }

# urlencode() returns str; Request's data must be bytes on Python 3,
# so encode it (the original passed str, which fails with TypeError).
data = urllib.parse.urlencode(values).encode('ascii')
req = urllib.request.Request(url, data, headers)
response = urllib.request.urlopen(req)
the_page = response.read()

print(the_page.decode("utf8"))

 

#6 Exception handling, version 1
# (Restores the block indentation lost in the original paste, which made
# this a SyntaxError. HTTPError is a subclass of URLError, so it must be
# caught first.)

from urllib.request import Request, urlopen
from urllib.error import URLError, HTTPError
req = Request("http://twitter.com/")
try:
    response = urlopen(req)
except HTTPError as e:
    # The server answered, but with an HTTP error status.
    print('The server couldn\'t fulfill the request.')
    print('Error code: ', e.code)
except URLError as e:
    # The server could not be reached at all (DNS failure, refused, ...).
    print('We failed to reach a server.')
    print('Reason: ', e.reason)
else:
    print("good!")
    print(response.read().decode("utf8"))
#7 Exception handling, version 2: one except clause, inspect attributes
# (Restores the block indentation lost in the original paste, which made
# this a SyntaxError.)

from urllib.request import Request, urlopen
from urllib.error import URLError
req = Request("http://twitter.com/")
try:
    response = urlopen(req)
except URLError as e:
    # NOTE: HTTPError has BOTH .reason and .code, so for HTTP errors the
    # first branch fires -- this mirrors the stdlib HOWTO's "approach 2".
    if hasattr(e, 'reason'):
        print('We failed to reach a server.')
        print('Reason: ', e.reason)
    elif hasattr(e, 'code'):
        print('The server couldn\'t fulfill the request.')
        print('Error code: ', e.code)
else:
    print("good!")
    print(response.read().decode("utf8"))
#8 HTTP basic authentication

import urllib.request

# A password manager that falls back to its credentials for any realm.
pw_manager = urllib.request.HTTPPasswordMgrWithDefaultRealm()

# realm=None means: use this username/password whatever realm the server
# reports for URLs under top_level_url.
top_level_url = "https://cms.tetx.com/"
pw_manager.add_password(None, top_level_url, 'yzhang', 'cccddd')

auth_handler = urllib.request.HTTPBasicAuthHandler(pw_manager)

# Build an OpenerDirector that can answer basic-auth challenges.
opener = urllib.request.build_opener(auth_handler)

# Fetch a URL through the opener directly.
a_url = "https://cms.tetx.com/"
x = opener.open(a_url)
print(x.read())

# Install the opener: from here on, plain urllib.request.urlopen()
# calls go through it as well.
urllib.request.install_opener(opener)

a = urllib.request.urlopen(a_url).read().decode('utf8')
print(a)
#9 Using a proxy

import urllib.request

# ProxyHandler maps URL *scheme* -> proxy URL. The original used the key
# 'sock5', which matches no scheme, so the proxy was silently never used.
# Note also that urllib.request has no SOCKS support at all -- a real
# SOCKS5 proxy needs a third-party package such as PySocks.
proxy_support = urllib.request.ProxyHandler({'http': 'http://localhost:1080'})
opener = urllib.request.build_opener(proxy_support)
urllib.request.install_opener(opener)

a = urllib.request.urlopen("http://g.cn").read().decode("utf8")
print(a)
#10 Timeouts

import socket
import urllib.request

# Seconds before any blocking socket operation gives up.
timeout = 2
socket.setdefaulttimeout(timeout)

# Every urlopen() call from here on inherits the socket-level default
# timeout configured above.
req = urllib.request.Request('http://twitter.com/')
a = urllib.request.urlopen(req).read()
print(a)
#11 Using cookies
# http.cookiejar provides CookieJar, FileCookieJar, MozillaCookieJar and
# LWPCookieJar.
#
# It is normally combined with urllib.request: the jar is passed to
# urllib.request.HTTPCookieProcessor(), which in turn goes into
# urllib.request.build_opener().

import urllib.request

import http.cookiejar

# A CookieJar that accumulates cookies received during this session.
cookie = http.cookiejar.CookieJar()

# Build an opener whose requests automatically carry the jar's cookies.
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)

# First request: the server's Set-Cookie headers populate the jar.
r = opener.open('https://www.hao123.com/')

# Second request to the same site: the captured cookies are sent along.
result = opener.open('https://www.hao123.com/')

result.status

print (result.read())

#------------------------------------------------------------------------------

import requests

# requests ships three cookie-format conversion helpers:
#
# requests.utils.dict_from_cookiejar(cj)
# requests.utils.cookiejar_from_dict(cookie_dict, cookiejar=None, overwrite=True)
# requests.utils.add_dict_to_cookiejar(cj, cookie_dict)

# Fetch once with requests so that r carries a RequestsCookieJar.
# (The urllib response object from the previous snippet has no .cookies
# attribute, so r must come from requests here.)
r = requests.get('https://www.hao123.com/')

# RequestsCookieJar -> plain dict.
load_cookies = requests.utils.dict_from_cookiejar(r.cookies)

# dict -> RequestsCookieJar, attached to a Session *instance*.
# The original assigned to requests.session.cookies, i.e. set an attribute
# on the session factory function -- that affects no request at all.
session = requests.session()
session.cookies = requests.utils.cookiejar_from_dict(load_cookies)

# Re-request, passing the saved cookies explicitly.
r = requests.get('https://www.hao123.com/', cookies=load_cookies)

# r.cookies is not a dict (it supports keys()/values()); print it as one:
print ({c.name: c.value for c in r.cookies})
#12 Cookies, take 2: log in and persist the cookies to a file

import urllib.request, urllib.parse, urllib.error
import http.cookiejar

LOGIN_URL = 'http://acm.hit.edu.cn/hoj/system/login'
values = {'user': '******', 'password': '******'} # , 'submit' : 'Login'
postdata = urllib.parse.urlencode(values).encode()
user_agent = r'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36'
headers = {'User-Agent': user_agent, 'Connection': 'keep-alive'}

# MozillaCookieJar can save()/load() cookies in Netscape cookies.txt format.
cookie_filename = 'cookie.txt'
cookie = http.cookiejar.MozillaCookieJar(cookie_filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)

request = urllib.request.Request(LOGIN_URL, postdata, headers)
try:
    response = opener.open(request)
    page = response.read().decode()
    # print(page)
except urllib.error.URLError as e:
    # Only HTTPError has .code; a plain URLError (e.g. connection refused)
    # would raise AttributeError on e.code and mask the real error.
    print(getattr(e, 'code', '?'), ':', e.reason)

cookie.save(ignore_discard=True, ignore_expires=True)  # save the cookies to cookie.txt
print(cookie)
for item in cookie:
    print('Name = ' + item.name)
    print('Value = ' + item.value)

get_url = 'http://acm.hit.edu.cn/hoj/problem/solution/?problem=1'  # another URL, fetched with the saved cookie
get_request = urllib.request.Request(get_url, headers=headers)
get_response = opener.open(get_request)
print(get_response.read().decode())
# print('You have not solved this problem' in page)

'''
Note:
1. 直接open http://acm.hit.edu.cn/hoj/problem/solution/?problem=1页面不知道去哪了,根本不是直接用浏览器登录后的界面!用cookie登录就可以正常访问,html代码中会有一句话you have not solved this problem,因为我没做这道题。
2. 原理:创建一个带有cookie的opener,在访问登录的URL时,将登录后的cookie保存下来,然后利用这个cookie来访问其他网址。查看登录之后才能看到的信息。[python 3.3.2 爬虫记录]
重复使用cookie登录
上面代码中我们保存cookie到文件中了,下面我们可以直接从文件导入cookie进行登录,不用再构建包含用户名和密码的postdata了
'''

import urllib.request, urllib.parse, urllib.error
import http.cookiejar

# Reuse the cookies saved earlier: load cookie.txt instead of logging in
# again with a username/password POST.
cookie_filename = 'cookie.txt'
cookie = http.cookiejar.MozillaCookieJar(cookie_filename)
cookie.load(cookie_filename, ignore_discard=True, ignore_expires=True)
print(cookie)

# Build an opener that sends the loaded cookies automatically.
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)

get_url = 'http://acm.hit.edu.cn/hoj/problem/solution/?problem=1'  # another URL, fetched with the loaded cookie
get_request = urllib.request.Request(get_url)
get_response = opener.open(get_request)
print(get_response.read().decode())



'''
用cookie登陆伯乐在线
'''

import urllib.request, urllib.parse, urllib.error
import http.cookiejar

# Log in to jobbole.com via its ajax endpoint, save the session cookies,
# then check the home page for the logged-in marker string.
LOGIN_URL = 'http://www.jobbole.com/wp-admin/admin-ajax.php'
get_url = 'http://www.jobbole.com/'  # page requested with the login cookie

values = {'action': 'user_login', 'user_login': '*****', 'user_pass': '******'}
postdata = urllib.parse.urlencode(values).encode()
user_agent = r'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36'
headers = {'User-Agent': user_agent}

cookie_filename = 'cookie_jar.txt'
cookie_jar = http.cookiejar.MozillaCookieJar(cookie_filename)
handler = urllib.request.HTTPCookieProcessor(cookie_jar)
opener = urllib.request.build_opener(handler)

request = urllib.request.Request(LOGIN_URL, postdata, headers)
try:
    response = opener.open(request)
    # print(response.read().decode())
except urllib.error.URLError as e:
    # Only HTTPError carries .code; getattr avoids AttributeError when the
    # failure is a plain URLError (unreachable host, refused connection).
    print(getattr(e, 'code', '?'), ':', e.reason)

cookie_jar.save(ignore_discard=True, ignore_expires=True)  # save the cookies to cookie_jar.txt
for item in cookie_jar:
    print('Name = ' + item.name)
    print('Value = ' + item.value)

# '个人主页' ("my home page") only appears on the page when logged in.
get_request = urllib.request.Request(get_url, headers=headers)
get_response = opener.open(get_request)
print('个人主页' in get_response.read().decode())


'''登录知乎'''

# [python] view plain copy print?  (leftover CSDN code-box toolbar text)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
__author__ = 'pi'
__email__ = 'pipisorry@126.com'
"""
import urllib.request
import urllib.parse
import urllib.error
import http.cookiejar

# Log in to the UESTC student portal, persist the session cookies, then
# fetch a grade page that requires authentication.
LOGIN_URL = r'https://uis.uestc.edu.cn/amserver/UI/Login'  # login URL of the educational administration system
get_url = 'http://eams.uestc.edu.cn/eams/teach/grade/course/person.action'  # page fetched with the login cookie

values = {'IDToken1': '201106******', 'IDToken2': '***********'}
postdata = urllib.parse.urlencode(values).encode()
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36'}

cookie_filename = 'cookie_jar.txt'
cookie_jar = http.cookiejar.MozillaCookieJar(cookie_filename)
handler = urllib.request.HTTPCookieProcessor(cookie_jar)
opener = urllib.request.build_opener(handler)

request = urllib.request.Request(LOGIN_URL, postdata, headers)
try:
    response = opener.open(request)
    # print(response.read().decode())
except urllib.error.URLError as e:
    # Only HTTPError has .code; use getattr so plain URLErrors
    # (DNS failure, refused connection) don't raise AttributeError here.
    print(getattr(e, 'code', '?'), ':', e.reason)

cookie_jar.save(ignore_discard=True, ignore_expires=True)  # save the cookies to cookie_jar.txt
for item in cookie_jar:
    print('Name = ' + item.name)
    print('Value = ' + item.value)

get_request = urllib.request.Request(get_url, headers=headers)
get_response = opener.open(get_request)
print(get_response.read().decode())


# 代码来源:http://blog.csdn.net/pipisorry/article/details/47948065

 

转载于:https://my.oschina.net/u/3406045/blog/893170

  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值