一、urllib.request
#1 最简单
import urllib.request
response = urllib.request.urlopen('http://python.org/')
html = response.read()
#2 使用 Request
import urllib.request
req = urllib.request.Request('http://python.org/')
response = urllib.request.urlopen(req)
the_page = response.read()
#3 发送数据
import urllib.parse
import urllib.request
url = 'http://localhost/login.php'
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
values = {
'act' : 'login',
'login[email]' : 'yzhang@i9i8.com',
'login[password]' : '123456'
}
data = urllib.parse.urlencode(values)
req = urllib.request.Request(url, data)
req.add_header('Referer', 'http://www.python.org/')
response = urllib.request.urlopen(req)
the_page = response.read()
print(the_page.decode("utf8"))
#4 发送数据和header
import urllib.parse
import urllib.request
url = 'http://localhost/login.php'
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
values = {
'act' : 'login',
'login[email]' : 'yzhang@i9i8.com',
'login[password]' : '123456'
}
headers = { 'User-Agent' : user_agent }
data = urllib.parse.urlencode(values)
req = urllib.request.Request(url, data, headers)
response = urllib.request.urlopen(req)
the_page = response.read()
print(the_page.decode("utf8"))
#6 异常处理1
from urllib.request import Request, urlopen
from urllib.error import URLError, HTTPError
req = Request("http://twitter.com/")
try:
response = urlopen(req)
except HTTPError as e:
print('The server couldn\'t fulfill the request.')
print('Error code: ', e.code)
except URLError as e:
print('We failed to reach a server.')
print('Reason: ', e.reason)
else:
print("good!")
print(response.read().decode("utf8"))
#7 异常处理2
from urllib.request import Request, urlopen
from urllib.error import URLError
req = Request("http://twitter.com/")
try:
response = urlopen(req)
except URLError as e:
if hasattr(e, 'reason'):
print('We failed to reach a server.')
print('Reason: ', e.reason)
elif hasattr(e, 'code'):
print('The server couldn\'t fulfill the request.')
print('Error code: ', e.code)
else:
print("good!")
print(response.read().decode("utf8"))
#8 HTTP 认证
import urllib.request
# create a password manager
password_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()
# Add the username and password.
# If we knew the realm, we could use it instead of None.
top_level_url = "https://cms.tetx.com/"
password_mgr.add_password(None, top_level_url, 'yzhang', 'cccddd')
handler = urllib.request.HTTPBasicAuthHandler(password_mgr)
# create "opener" (OpenerDirector instance)
opener = urllib.request.build_opener(handler)
# use the opener to fetch a URL
a_url = "https://cms.tetx.com/"
x = opener.open(a_url)
print(x.read())
# Install the opener.
# Now all calls to urllib.request.urlopen use our opener.
urllib.request.install_opener(opener)
a = urllib.request.urlopen(a_url).read().decode('utf8')
print(a)
#9 使用代理
import urllib.request
proxy_support = urllib.request.ProxyHandler({'sock5': 'localhost:1080'})
opener = urllib.request.build_opener(proxy_support)
urllib.request.install_opener(opener)
a = urllib.request.urlopen("http://g.cn").read().decode("utf8")
print(a)
#10 超时
import socket
import urllib.request
# timeout in seconds
timeout = 2
socket.setdefaulttimeout(timeout)
# this call to urllib.request.urlopen now uses the default timeout
# we have set in the socket module
req = urllib.request.Request('http://twitter.com/')
a = urllib.request.urlopen(req).read()
print(a)
#11 使用cookie
# 这个模块主要提供了这几个对象,CookieJar,FileCookieJar,MozillaCookieJar,LWPCookieJar。
#urllib与http.cookiejar结合使用
#cookielib模块一般与urllib模块配合使用,
#主要用在urllib.requests.build_oper()函数中作为urllib.requests.HTTPCookieProcessor()的参数。
import urllib.request
import http.cookiejar
# 初始化一个CookieJar来处理Cookie
cookie=http.cookiejar.CookieJar()
#实例化一个全局opener
handler=urllib.request.HTTPCookieProcessor(cookie)
opener=urllib.request.build_opener(handler)
# 获取cookie
r=opener.open('https://www.hao123.com/')
# 访问主页 自动带着cookie信息
result = opener.open('https://www.hao123.com/')
result.status
print (result.read())
#------------------------------------------------------------------------------
import requests
#对于cookies格式的转化,提供了三个工具方法:
#requests.utils.dict_from_cookiejar(cj)
#requests.utils.cookiejar_from_dict(cookie_dict, cookiejar=None, overwrite=True)
#requests.utils.add_dict_to_cookiejar(cj, cookie_dict)
#工具方法:requests.cookies.RequestsCookieJar转换成字典
load_cookies = requests.utils.dict_from_cookiejar(r.cookies)
#工具方法将字典转换成RequestsCookieJar,赋值给session的cookies.
#requests.utils.cookiejar_from_dict({c.name: c.value for c in r.cookies})
requests.session.cookies=requests.utils.cookiejar_from_dict(load_cookies)
#利用requests 库
r=requests.get('https://www.hao123.com/',cookies=load_cookies)
#获取的cookies,可以用keys()和values()看内容,但本身不是字典格式,以下可以打印出字典方式查看
r.cookies
print ({c.name: c.value for c in r.cookies})
#12 使用cookie2
import urllib.request, urllib.parse, urllib.error
import http.cookiejar
LOGIN_URL = 'http://acm.hit.edu.cn/hoj/system/login'
values = {'user': '******', 'password': '******'} # , 'submit' : 'Login'
postdata = urllib.parse.urlencode(values).encode()
user_agent = r'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36'
headers = {'User-Agent': user_agent, 'Connection': 'keep-alive'}
cookie_filename = 'cookie.txt'
cookie = http.cookiejar.MozillaCookieJar(cookie_filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
request = urllib.request.Request(LOGIN_URL, postdata, headers)
try:
response = opener.open(request)
page = response.read().decode()
# print(page)
except urllib.error.URLError as e:
print(e.code, ':', e.reason)
cookie.save(ignore_discard=True, ignore_expires=True) # 保存cookie到cookie.txt中
print(cookie)
for item in cookie:
print('Name = ' + item.name)
print('Value = ' + item.value)
get_url = 'http://acm.hit.edu.cn/hoj/problem/solution/?problem=1' # 利用cookie请求访问另一个网址
get_request = urllib.request.Request(get_url, headers=headers)
get_response = opener.open(get_request)
print(get_response.read().decode())
# print('You have not solved this p
'''
Note:
1. 直接open http://acm.hit.edu.cn/hoj/problem/solution/?problem=1页面不知道去哪了,根本不是直接用浏览器登录后的界面!用cookie登录就可以正常访问,html代码中会有一句话you have not solved this problem,因为我没做这道题。
2. 原理:创建一个带有cookie的opener,在访问登录的URL时,将登录后的cookie保存下来,然后利用这个cookie来访问其他网址。查看登录之后才能看到的信息。[python 3.3.2 爬虫记录]
重复使用cookie登录
上面代码中我们保存cookie到文件中了,下面我们可以直接从文件导入cookie进行登录,不用再构建包含用户名和密码的postdata了
'''
import urllib.request, urllib.parse, urllib.error
import http.cookiejar
cookie_filename = 'cookie.txt'
cookie = http.cookiejar.MozillaCookieJar(cookie_filename)
cookie.load(cookie_filename, ignore_discard=True, ignore_expires=True)
print(cookie)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
get_url = 'http://acm.hit.edu.cn/hoj/problem/solution/?problem=1' # 利用cookie请求访问另一个网址
get_request = urllib.request.Request(get_url)
get_response = opener.open(get_request)
print(get_response.read().decode())
'''
用cookie登陆伯乐在线
'''
import urllib.request, urllib.parse, urllib.error
import http.cookiejar
LOGIN_URL = 'http://www.jobbole.com/wp-admin/admin-ajax.php'
get_url = 'http://www.jobbole.com/' # 利用cookie请求访问另一个网址
values = {'action': 'user_login', 'user_login': '*****', 'user_pass': '******'}
postdata = urllib.parse.urlencode(values).encode()
user_agent = r'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36'
headers = {'User-Agent': user_agent}
cookie_filename = 'cookie_jar.txt'
cookie_jar = http.cookiejar.MozillaCookieJar(cookie_filename)
handler = urllib.request.HTTPCookieProcessor(cookie_jar)
opener = urllib.request.build_opener(handler)
request = urllib.request.Request(LOGIN_URL, postdata, headers)
try:
response = opener.open(request)
# print(response.read().decode())
except urllib.error.URLError as e:
print(e.code, ':', e.reason)
cookie_jar.save(ignore_discard=True, ignore_expires=True) # 保存cookie到cookie.txt中
for item in cookie_jar:
print('Name = ' + item.name)
print('Value = ' + item.value)
get_request = urllib.request.Request(get_url, headers=headers)
get_response = opener.open(get_request)
print('个人主页' in get_response.read().decode())
'''登录知乎'''
[python] view plain copy print?
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
__author__ = 'pi'
__email__ = 'pipisorry@126.com'
"""
import urllib.request
import urllib.parse
import urllib.error
import http.cookiejar
LOGIN_URL = r'https://uis.uestc.edu.cn/amserver/UI/Login' # 登录教务系统的URL
get_url = 'http://eams.uestc.edu.cn/eams/teach/grade/course/person.action' # 利用cookie请求访问另一个网址
values = {'IDToken1': '201106******', 'IDToken2': '***********'}
postdata = urllib.parse.urlencode(values).encode()
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36'}
cookie_filename = 'cookie_jar.txt'
cookie_jar = http.cookiejar.MozillaCookieJar(cookie_filename)
handler = urllib.request.HTTPCookieProcessor(cookie_jar)
opener = urllib.request.build_opener(handler)
request = urllib.request.Request(LOGIN_URL, postdata, headers)
try:
response = opener.open(request)
# print(response.read().decode())
except urllib.error.URLError as e:
print(e.code, ':', e.reason)
cookie_jar.save(ignore_discard=True, ignore_expires=True) # 保存cookie到cookie.txt中
for item in cookie_jar:
print('Name = ' + item.name)
print('Value = ' + item.value)
get_request = urllib.request.Request(get_url, headers=headers)
get_response = opener.open(get_request)
print(get_response.read().decode())
# 代码来源:http://blog.csdn.net/pipisorry/article/details/47948065