参考http://www.pythonclub.org/python-network-application/observer-spider
urllib2提供了两个接口
1.urlopen较为简单,后面输入url即可返回response
若希望稍微复杂一些,则需使用request,request中可以包含header,postdata等
2.而想要处理一些复杂的请求,如cookie和代理等,则需要自己创建一个opener,使用build_opener和install_opener
然后在去调用urlopen一个url
3.Request
HTTP是基于请求和应答机制的,即客户端提出请求,服务端提供应答。urllib2用一个Request对象来映射所发出的HTTP请求
Request可以处理post一些表单数据和加入headers等信息
3.response的函数
urlopen返回的应答对象response有几个很有用的方法info() ,geturl(),getcode()
geturl() 用于返回获取的真实url,因为urlopen或许存在重定向,获取的url或许跟请求url不同
info() 返回对象的字典对象,描述当前获取的页面信息,通常是服务器发送的特定头headers
getcode() 返回HTTP response中的状态码
4.异常处理try: urllib2.urlopen(req)
except URLError, e:
print e.reason
try: urllib2.urlopen(req)
except HTTPError, e:
print e.code
print e.read()
需注意的是,except URLError将同样接收到HTTPError异常
import urllib2
import urllib
import cookielib
import bs4
#简单的调用urlopen
def raw(url):
    """Fetch *url* with a plain urllib2.urlopen and return the response body.

    Args:
        url: the URL to fetch (str).

    Returns:
        The raw response body as a byte string.

    Raises:
        urllib2.URLError / urllib2.HTTPError on network or HTTP failure.
    """
    response = urllib2.urlopen(url)
    try:
        return response.read()
    finally:
        # The original never closed the response, leaking the socket.
        response.close()
#设置代理
def proxy(url, proxyurl):
    """Fetch *url* through the HTTP proxy at *proxyurl* and return the body.

    Args:
        url: the URL to fetch (str).
        proxyurl: proxy address, e.g. "127.0.0.1:8080" (str).

    Returns:
        The raw response body as a byte string.
    """
    proxy_support = urllib2.ProxyHandler({'http': proxyurl})
    opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler)
    # Use the opener directly instead of install_opener(): installing it
    # globally would silently route every subsequent urllib2.urlopen()
    # call in the process (e.g. raw()) through this proxy.
    response = opener.open(url)
    try:
        return response.read()
    finally:
        response.close()
#设置cookie,这里指获取当前website中的cookie
def cookie(url):
    """Fetch *url* with cookie handling enabled and return the body.

    Cookies sent by the server are stored in a fresh CookieJar for the
    duration of this request only (the jar is not returned, so callers
    cannot inspect the cookies afterwards — matches the original contract).

    Args:
        url: the URL to fetch (str).

    Returns:
        The raw response body as a byte string.
    """
    jar = cookielib.CookieJar()
    cookie_support = urllib2.HTTPCookieProcessor(jar)
    opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
    # Avoid install_opener(): mutating urllib2's global opener would make
    # every later urlopen() call in the process cookie-aware as a side effect.
    response = opener.open(url)
    try:
        return response.read()
    finally:
        response.close()
#post一些data
def post(url, postdata):
    """POST *postdata* to *url* and return the response body.

    Args:
        url: target URL (str).
        postdata: already-urlencoded form data (str); supplying a data
            argument is what makes urllib2 issue a POST.

    Returns:
        The raw response body as a byte string.
    """
    req = urllib2.Request(url=url, data=postdata)
    response = urllib2.urlopen(req)
    try:
        return response.read()
    finally:
        # Close the response explicitly; the original leaked it.
        response.close()
#伪造浏览器请求
def explorer(url, headers, postdata):
    """POST *postdata* to *url* with browser-like *headers* and return the body.

    Used to impersonate a real browser (User-Agent, Referer, Cookie, ...).

    Args:
        url: target URL (str).
        headers: dict of HTTP request headers.
        postdata: already-urlencoded form data (str).

    Returns:
        The raw response body as a byte string.
    """
    req = urllib2.Request(url=url, data=postdata, headers=headers)
    response = urllib2.urlopen(req)
    try:
        return response.read()
    finally:
        # Close the response explicitly; the original leaked it.
        response.close()
if __name__=="__main__":
url="http://wan.sogou.com/p/index.do"
raw(url)
#cookie(url)
postdata=urllib.urlencode({
'username':'liupeng002@sogou.com',
'password':'asdfasdf',
'captcha':'',
'autoLogin':1,
'client_id':'1100',
'xd':'http://wan.sogou.com/static/jump.html',
'token':''
})
headers={
# 'Host':'account.sogou.com',
'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6',
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language':"en-US,en;q=0.5",
'Accept-Encoding':"gzip, deflate",
'Referer':'http://wan.sogou.com/p/index.do',
'Cookie':'SUV=1312141409210430; SUID=16FDC06F87C50B0A52A7B69900032D39; ad=XE93Syllll2vIEzBlllllV3LICtlllllbDRdVkllllGlllllVklll5@@@@@@@@@@; CXID=3B688E6EF4C25C23F71AB6C3B927D934; IPLOC=CN1100; sgsa_id=sogou.com|1388235645989794; sgsa_vt_14805_15358=1388235645988',
'Connection':'keep-alive'
}
#content=post(url,postdata)
content=explorer(url,headers,postdata)
content = bs4.BeautifulSoup(content)
print content