From: http://www.cnblogs.com/bboy/archive/2010/10/29/1864537.html
用 Python 抓取网页是非常简单的事,简单的几行代码就可以解决……这里稍微记录一下。
需要引用的包主要是 urllib2,urllib 也可以引入,具体看代码。
# -------------------------------------------------------------------------------
# Name: 模拟登录web
# Purpose:
#
# Author: huwei
#
# Created: 26/10/2010
# Copyright: (c) huwei 2010
# Licence: <your licence>
# -------------------------------------------------------------------------------
# !/usr/bin/env python
import time,urllib2,urllib
def main():
    """Script entry point: run the cnblogs.com login demo."""
    # Log in to cnblogs (博客园).
    loginCNblogs()
# 登录博客园
def loginCNblogs():
    """Log in to cnblogs.com, then request the avatar settings page.

    Installs a process-wide cookie-aware ``urllib2`` opener, POSTs the login
    form to passport.cnblogs.com and then opens a page under
    home.cnblogs.com through the same opener.

    Returns nothing.  Any exception raised along the way is printed instead
    of being propagated, so a failed login never aborts the caller.
    """
    try:
        # Install a cookie-handling opener globally so that every later
        # urllib2.urlopen() call carries the cookies set by earlier responses.
        cookies = urllib2.HTTPCookieProcessor()
        opener = urllib2.build_opener(cookies)
        urllib2.install_opener(opener)

        # Login form fields.  "用户名" and "密码" are placeholders for the
        # real user name and password.
        # NOTE(review): __VIEWSTATE / __EVENTVALIDATION look like hidden form
        # fields copied from the login page -- confirm they are still current.
        parms = {
            "tbUserName": "用户名",
            "tbPassword": "密码",
            "__EVENTTARGET": "btnLogin",
            "__EVENTARGUMENT": "",
            "__VIEWSTATE": "/wEPDwULLTExMDE0MzIzNDRkGAEFHl9fQ29udHJvbHNSZXF1aXJlUG9zdEJhY2tLZXlfXxYBBQtjaGtSZW1lbWJlcmcJekJlt5rFwfnjeMMnX9V58Xhg",
            "__EVENTVALIDATION": "/wEWBQKit6iCDALyj/OQAgK3jsrkBALR55GJDgKC3IeGDK6TQlRlirS2Zja1Lmeh02u4XMwV",
            "txtReturnUrl": "http://bboy.cnblogs.com",
        }
        loginUrl = "http://passport.cnblogs.com/login.aspx"

        # Supplying a data argument makes urlopen() send a POST request.
        login = urllib2.urlopen(loginUrl, urllib.urlencode(parms))
        # To inspect the response, read it before closing:
        #     print(login.read().decode("utf8"))
        login.close()

        # Open the avatar settings page with the cookies from the login.
        avatar = urllib2.urlopen("http://home.cnblogs.com/set/avatar/")
        # To inspect the page, read it before closing:
        #     print(avatar.read().decode("utf8"))
        avatar.close()
    except Exception as e:
        # Deliberately best-effort: report the failure and carry on.
        print(e)
# Run the login demo only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
# Name: 模拟登录web
# Purpose:
#
# Author: huwei
#
# Created: 26/10/2010
# Copyright: (c) huwei 2010
# Licence: <your licence>
# -------------------------------------------------------------------------------
# !/usr/bin/env python
import time,urllib2,urllib
def main():
    """Script entry point: run the cnblogs.com login demo."""
    # Log in to cnblogs (博客园).
    loginCNblogs()
# 登录博客园
def loginCNblogs():
    """Log in to cnblogs.com, then request the avatar settings page.

    Installs a process-wide cookie-aware ``urllib2`` opener, POSTs the login
    form to passport.cnblogs.com and then opens a page under
    home.cnblogs.com through the same opener.

    Returns nothing.  Any exception raised along the way is printed instead
    of being propagated, so a failed login never aborts the caller.
    """
    try:
        # Install a cookie-handling opener globally so that every later
        # urllib2.urlopen() call carries the cookies set by earlier responses.
        cookies = urllib2.HTTPCookieProcessor()
        opener = urllib2.build_opener(cookies)
        urllib2.install_opener(opener)

        # Login form fields.  "用户名" and "密码" are placeholders for the
        # real user name and password.
        # NOTE(review): __VIEWSTATE / __EVENTVALIDATION look like hidden form
        # fields copied from the login page -- confirm they are still current.
        parms = {
            "tbUserName": "用户名",
            "tbPassword": "密码",
            "__EVENTTARGET": "btnLogin",
            "__EVENTARGUMENT": "",
            "__VIEWSTATE": "/wEPDwULLTExMDE0MzIzNDRkGAEFHl9fQ29udHJvbHNSZXF1aXJlUG9zdEJhY2tLZXlfXxYBBQtjaGtSZW1lbWJlcmcJekJlt5rFwfnjeMMnX9V58Xhg",
            "__EVENTVALIDATION": "/wEWBQKit6iCDALyj/OQAgK3jsrkBALR55GJDgKC3IeGDK6TQlRlirS2Zja1Lmeh02u4XMwV",
            "txtReturnUrl": "http://bboy.cnblogs.com",
        }
        loginUrl = "http://passport.cnblogs.com/login.aspx"

        # Supplying a data argument makes urlopen() send a POST request.
        login = urllib2.urlopen(loginUrl, urllib.urlencode(parms))
        # To inspect the response, read it before closing:
        #     print(login.read().decode("utf8"))
        login.close()

        # Open the avatar settings page with the cookies from the login.
        avatar = urllib2.urlopen("http://home.cnblogs.com/set/avatar/")
        # To inspect the page, read it before closing:
        #     print(avatar.read().decode("utf8"))
        avatar.close()
    except Exception as e:
        # Deliberately best-effort: report the failure and carry on.
        print(e)
# Run the login demo only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
获取网页很简单,直接 urllib2.urlopen(url).read() 就可以得到网页源码。
这里是抓取登录后的页面,所以开头需要设置 cookie:
cookies = urllib2.HTTPCookieProcessor()
opener = urllib2.build_opener(cookies)
urllib2.install_opener(opener)
opener = urllib2.build_opener(cookies)
urllib2.install_opener(opener)
设置完 cookie 以后,再使用 urllib2.urlopen() 方法就可以带上你登录成功的 cookie 了。