Python 3 抓取网页资源的 N 种方法

1、最简单
import urllib.request

# Simplest form: open the URL directly and read the whole body as bytes.
# The context manager closes the underlying connection after the read.
with urllib.request.urlopen('http://python.org/') as response:
    html = response.read()
2、使用 Request
import urllib.request

# Build an explicit Request object first; it can later carry data/headers.
req = urllib.request.Request('http://python.org/')
# Close the connection as soon as the body has been read.
with urllib.request.urlopen(req) as response:
    the_page = response.read()
 
3、发送数据
复制代码
#!/usr/bin/env python3
# POST form data to a login page and print the decoded response body.

import urllib.parse
import urllib.request

url = 'http://localhost/login.php'
# Defined here but not attached to the request in this example.
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
values = {
    'act': 'login',
    'login[email]': 'yzhang@i9i8.com',
    'login[password]': '123456',
}

# urlencode() returns a str, but in Python 3 the POST body handed to
# Request must be bytes, so encode it before building the request.
data = urllib.parse.urlencode(values).encode('utf-8')
req = urllib.request.Request(url, data)
req.add_header('Referer', 'http://www.python.org/')
with urllib.request.urlopen(req) as response:
    the_page = response.read()

print(the_page.decode("utf8"))
复制代码

 
4、发送数据和header
复制代码
#!/usr/bin/env python3
# POST form data together with a custom User-Agent header.

import urllib.parse
import urllib.request

url = 'http://localhost/login.php'
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
values = {
    'act': 'login',
    'login[email]': 'yzhang@i9i8.com',
    'login[password]': '123456',
}
# Header names must not carry surrounding whitespace.
headers = {'User-Agent': user_agent}

# urlencode() returns a str, but in Python 3 the POST body handed to
# Request must be bytes, so encode it before building the request.
data = urllib.parse.urlencode(values).encode('utf-8')
req = urllib.request.Request(url, data, headers)
with urllib.request.urlopen(req) as response:
    the_page = response.read()

print(the_page.decode("utf8"))
复制代码

 
5、http 错误
复制代码
#!/usr/bin/env python3
# Request a page that is expected to fail and inspect the HTTP error.

import urllib.error  # imported explicitly because HTTPError is referenced below
import urllib.request

req = urllib.request.Request('http://www.python.org/fish.html')
try:
    # On success there is nothing to report; just make sure the
    # connection is closed.
    with urllib.request.urlopen(req):
        pass
except urllib.error.HTTPError as e:
    # HTTPError also acts as a response object: it exposes the status
    # code and the error page body.
    print(e.code)
    print(e.read().decode("utf8"))
复制代码
 
6、异常处理1
复制代码
#!/usr/bin/env python3
# Error handling, variant 1: catch HTTPError and URLError separately.

from urllib.request import Request, urlopen
from urllib.error import URLError, HTTPError

req = Request("http://twitter.com/")
try:
    response = urlopen(req)
except HTTPError as e:
    # HTTPError is a subclass of URLError, so it must be caught first.
    print('The server couldn\'t fulfill the request.')
    print('Error code: ', e.code)
except URLError as e:
    print('We failed to reach a server.')
    print('Reason: ', e.reason)
else:
    # Only reached when urlopen() succeeded, so `response` is defined.
    print("good!")
    with response:
        print(response.read().decode("utf8"))
复制代码

 
7、异常处理2
复制代码
#!/usr/bin/env python3
# Error handling, variant 2: one except clause, told apart by attributes.

from urllib.request import Request, urlopen
from urllib.error import URLError

req = Request("http://twitter.com/")
try:
    response = urlopen(req)
except URLError as e:
    # Check `code` first: HTTPError (a URLError subclass) carries both
    # `code` and `reason`, so testing `reason` first would misreport an
    # HTTP error status as a connection failure.
    if hasattr(e, 'code'):
        print('The server couldn\'t fulfill the request.')
        print('Error code: ', e.code)
    elif hasattr(e, 'reason'):
        print('We failed to reach a server.')
        print('Reason: ', e.reason)
else:
    # Only reached when urlopen() succeeded, so `response` is defined.
    print("good!")
    with response:
        print(response.read().decode("utf8"))
复制代码

 
8、HTTP 认证
复制代码
#!/usr/bin/env python3
# Fetch a URL protected by HTTP Basic authentication.

import urllib.request

# Create a password manager.
password_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()

# Add the username and password.
# If we knew the realm, we could use it instead of None.
top_level_url = "https://cms.tetx.com/"
password_mgr.add_password(None, top_level_url, 'yzhang', 'cccddd')

handler = urllib.request.HTTPBasicAuthHandler(password_mgr)

# Create "opener" (OpenerDirector instance).
opener = urllib.request.build_opener(handler)

# Use the opener to fetch a URL.
a_url = "https://cms.tetx.com/"
with opener.open(a_url) as x:
    print(x.read())

# Install the opener.
# Now all calls to urllib.request.urlopen use our opener.
urllib.request.install_opener(opener)

with urllib.request.urlopen(a_url) as response:
    a = response.read().decode('utf8')
print(a)
复制代码

 
9、使用代理
复制代码
#!/usr/bin/env python3
# Install a global opener that carries a proxy configuration.

import urllib.request

# ProxyHandler expects a dict mapping a scheme name to a proxy address.
# NOTE(review): the key is a URL scheme; a 'sock5' entry presumably does
# not apply to the http:// request below -- confirm the intended proxy
# type and use an 'http'/'https' key if an HTTP proxy is meant.
proxy_support = urllib.request.ProxyHandler({'sock5': 'localhost:1080'})
opener = urllib.request.build_opener(proxy_support)
urllib.request.install_opener(opener)

with urllib.request.urlopen("http://g.cn") as response:
    a = response.read().decode("utf8")
print(a)
复制代码

 
10、超时
复制代码
#!/usr/bin/env python3
# Apply a process-wide default socket timeout to urlopen() calls.

import socket
import urllib.request

# Timeout in seconds.
timeout = 2
socket.setdefaulttimeout(timeout)

# This call to urllib.request.urlopen now uses the default timeout
# we have set in the socket module.
req = urllib.request.Request('http://twitter.com/')
with urllib.request.urlopen(req) as response:
    a = response.read()
print(a)
复制代码
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值