Python 3.X 要使用urllib.request 来抓取网络资源。
最简单的方式:
1
2
3
4
5
6
7
8
|
#coding=utf-8
# Simplest case: fetch a page with urllib.request and print it as text.
import urllib.request

# The context manager closes the connection even if read() raises.
with urllib.request.urlopen('http://python.org/') as response:
    buff = response.read()

# read() returns raw bytes; decode them before displaying.
html = buff.decode("utf8")
print(html)
|
使用Request的方式:
1
2
3
4
5
6
7
8
9
|
#coding=utf-8
# Fetch a page through an explicit Request object and print it as text.
import urllib.request

req = urllib.request.Request('http://www.voidspace.org.uk')

# The context manager closes the connection even if read() raises.
with urllib.request.urlopen(req) as response:
    buff = response.read()

# read() returns raw bytes; decode them before displaying.
the_page = buff.decode("utf8")
print(the_page)
|
这种方式同样可以用来处理其他URL,例如FTP:
1
2
3
4
5
6
7
8
9
|
#coding=utf-8
# The same Request/urlopen pattern works for other URL schemes, here FTP.
import urllib.request

req = urllib.request.Request('ftp://ftp.pku.edu.cn/')

# The context manager closes the connection even if read() raises.
with urllib.request.urlopen(req) as response:
    buff = response.read()

# read() returns raw bytes; decode them before displaying.
the_page = buff.decode("utf8")
print(the_page)
|
使用POST请求:
1
2
3
4
5
6
7
8
9
10
|
# Send form data to a server with an HTTP POST request.
import urllib.parse
import urllib.request

url = 'http://www.someserver.com/cgi-bin/register.cgi'
values = {
    'name': 'Michael Foord',
    'location': 'Northampton',
    'language': 'Python',
}

# urlencode() returns a str, but Python 3 requires the POST body to be
# bytes; the encoded form data is pure ASCII, so 'ascii' is sufficient.
data = urllib.parse.urlencode(values).encode('ascii')

# Supplying data makes urllib issue a POST instead of a GET.
req = urllib.request.Request(url, data)
with urllib.request.urlopen(req) as response:
    the_page = response.read()
|
使用GET请求:
1
2
3
4
5
6
7
8
9
10
11
12
|
# Pass form data in the URL's query string with an HTTP GET request.
import urllib.request
import urllib.parse

data = {}
data['name'] = 'Somebody Here'
data['location'] = 'Northampton'
data['language'] = 'Python'

url_values = urllib.parse.urlencode(data)
print(url_values)
# Example output (pair order follows the dict's iteration order):
#   name=Somebody+Here&location=Northampton&language=Python

url = 'http://www.example.com/example.cgi'
full_url = url + '?' + url_values

# urllib.request has no open() function; urlopen() is the entry point.
# Note that `data` is rebound from the parameter dict to the response.
data = urllib.request.urlopen(full_url)
|
添加header:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
|
# POST form data while sending a custom User-Agent header.
import urllib.parse
import urllib.request

url = 'http://www.someserver.com/cgi-bin/register.cgi'
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
values = {
    'name': 'Michael Foord',
    'location': 'Northampton',
    'language': 'Python',
}
headers = {'User-Agent': user_agent}

# urlencode() returns a str, but Python 3 requires the POST body to be
# bytes; the encoded form data is pure ASCII, so 'ascii' is sufficient.
data = urllib.parse.urlencode(values).encode('ascii')

# The third positional argument of Request is the extra-headers dict.
req = urllib.request.Request(url, data, headers)
with urllib.request.urlopen(req) as response:
    the_page = response.read()
|
错误处理:
1
2
3
4
|
# Handle a failed request: URLError is raised when the server cannot be
# reached (for example an unknown host or no network connection).
import urllib.error
import urllib.request

req = urllib.request.Request('http://www.pretend_server.org')
try:
    urllib.request.urlopen(req)
except urllib.error.URLError as e:
    # e.reason describes why the request failed.
    print(e.reason)
|
返回的错误代码:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
|
# Table mapping response codes to messages; entries have the
# form {code: (shortmessage, longmessage)}.
responses = {
    # 1xx: informational
    100: ('Continue', 'Request received, please continue'),
    101: ('Switching Protocols',
          'Switching to new protocol; obey Upgrade header'),

    # 2xx: success
    200: ('OK', 'Request fulfilled, document follows'),
    201: ('Created', 'Document created, URL follows'),
    202: ('Accepted',
          'Request accepted, processing continues off-line'),
    203: ('Non-Authoritative Information', 'Request fulfilled from cache'),
    204: ('No Content', 'Request fulfilled, nothing follows'),
    205: ('Reset Content', 'Clear input form for further input.'),
    206: ('Partial Content', 'Partial content follows.'),

    # 3xx: redirection
    300: ('Multiple Choices',
          'Object has several resources -- see URI list'),
    301: ('Moved Permanently', 'Object moved permanently -- see URI list'),
    302: ('Found', 'Object moved temporarily -- see URI list'),
    303: ('See Other', 'Object moved -- see Method and URL list'),
    304: ('Not Modified',
          'Document has not changed since given time'),
    305: ('Use Proxy',
          'You must use proxy specified in Location to access this '
          'resource.'),
    307: ('Temporary Redirect',
          'Object moved temporarily -- see URI list'),

    # 4xx: client error
    400: ('Bad Request',
          'Bad request syntax or unsupported method'),
    401: ('Unauthorized',
          'No permission -- see authorization schemes'),
    402: ('Payment Required',
          'No payment -- see charging schemes'),
    403: ('Forbidden',
          'Request forbidden -- authorization will not help'),
    404: ('Not Found', 'Nothing matches the given URI'),
    405: ('Method Not Allowed',
          'Specified method is invalid for this server.'),
    406: ('Not Acceptable', 'URI not available in preferred format.'),
    407: ('Proxy Authentication Required',
          'You must authenticate with '
          'this proxy before proceeding.'),
    408: ('Request Timeout', 'Request timed out; try again later.'),
    409: ('Conflict', 'Request conflict.'),
    410: ('Gone',
          'URI no longer exists and has been permanently removed.'),
    411: ('Length Required', 'Client must specify Content-Length.'),
    412: ('Precondition Failed', 'Precondition in headers is false.'),
    413: ('Request Entity Too Large', 'Entity is too large.'),
    414: ('Request-URI Too Long', 'URI is too long.'),
    415: ('Unsupported Media Type', 'Entity body in unsupported format.'),
    416: ('Requested Range Not Satisfiable',
          'Cannot satisfy request range.'),
    417: ('Expectation Failed',
          'Expect condition could not be satisfied.'),

    # 5xx: server error
    500: ('Internal Server Error', 'Server got itself in trouble'),
    501: ('Not Implemented',
          'Server does not support this operation'),
    502: ('Bad Gateway', 'Invalid responses from another server/proxy.'),
    503: ('Service Unavailable',
          'The server cannot process the request due to a high load'),
    504: ('Gateway Timeout',
          'The gateway server did not receive a timely response'),
    505: ('HTTP Version Not Supported', 'Cannot fulfill request.'),
}
|