coursera python web_【python爬虫】coursera抓取

# -*- coding: utf-8 -*-
#!/usr/bin/env python

"""Scrape download links from the Coursera website."""

7

import cookielib
import getpass
import io
import random
import re
import string
import sys
import urllib
import urllib2

15

class Coursera(object):
    """Log in to Coursera and collect lecture download links.

    Simulates the sign-in request, fetches the lecture page, extracts the
    MP4 and PDF links with regular expressions and saves them to files.

    Attributes:
        login_url: URL of the real login endpoint.
        url: URL of the page that is scraped for download links.
        user_name: The user's login e-mail.
        password: The user's login password.
    """

    # NOTE(review): the original link patterns were lost when this file was
    # extracted (the source is cut off in the middle of the first regex
    # literal). These are reconstructed from the documented intent ("get the
    # MP4 and PDF download links") -- confirm against the real page markup.
    _MP4_LINK_RE = re.compile(r'<a\s[^>]*?href="([^"]*?\.mp4[^"]*)"')
    _PDF_LINK_RE = re.compile(r'<a\s[^>]*?href="([^"]*?\.pdf)"')

    def __init__(self, url, user_name, password):
        """Store the target URL and the login credentials.

        Args:
            url: Page to scrape for download links.
            user_name: Login e-mail; must not be empty.
            password: Login password; must not be empty.

        Raises:
            UserOrPwdNone: If ``user_name`` or ``password`` is empty.
        """
        if not user_name or not password:
            raise UserOrPwdNone("the username or password can't empty string")
        self.login_url = "https://accounts.coursera.org/api/v1/login"
        self.url = url
        self.user_name = user_name
        self.password = password

    @staticmethod
    def _open_or_exit(target, http_error_message):
        """Open ``target`` with ``urllib2.urlopen`` or report and exit.

        Args:
            target: A URL string or a ``urllib2.Request``.
            http_error_message: Line printed when the error carries a
                ``code`` attribute.

        Returns:
            The response object returned by ``urllib2.urlopen``.

        Raises:
            SystemExit: With status 2 when ``urllib2.URLError`` is raised.
        """
        try:
            return urllib2.urlopen(target)
        except urllib2.URLError as err:
            # ``code`` is checked before ``reason`` so that errors exposing
            # both attributes are reported by their status code.
            if hasattr(err, "code"):
                print(http_error_message)
                print("Error code: %s" % err.code)
            elif hasattr(err, "reason"):
                print("We failed to reach a server. Please check your url and read the Reason")
                print("Reason: %s" % err.reason)
            sys.exit(2)

    def simulation_login(self):
        """Simulate the browser login request.

        Raises:
            SystemExit: With status 2 when the login request fails.
        """
        cookie_jar = cookielib.CookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie_jar))
        # Installed as the global opener: get_links() calls urllib2.urlopen()
        # directly and therefore goes through this cookie-aware opener.
        urllib2.install_opener(opener)
        form_data, request_header = self.structure_headers()
        request = urllib2.Request(
            self.login_url, data=form_data, headers=request_header)
        result = self._open_or_exit(
            request,
            "The server couldn't fulfill the request.Please check your url and read the Reason")
        if result.getcode() == 200:
            print("登录成功...")

    def structure_headers(self):
        """Build the login form body and the request headers.

        Returns:
            A ``(form_data, request_header)`` tuple: the url-encoded form
            string and a dict of HTTP headers.
        """
        # Simulated form data; urlencode() turns the dict into a string.
        form_data = urllib.urlencode({
            "email": self.user_name,
            "password": self.password,
            "webrequest": "true",
        })
        # The trailing spaces matter: adjacent literals are concatenated
        # verbatim, and without them the product tokens run together.
        user_agent = ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/38.0.2125.111 Safari/537.36")
        csrf2_cookie = 'csrf2_token_%s' % self.random_string(8)
        csrf2_token = self.random_string(24)
        csrf_token = self.random_string(24)
        cookie = "csrftoken=%s; %s=%s" % (csrf_token, csrf2_cookie, csrf2_token)

        request_header = {
            # Source page of the request, sent to get past hotlink checks.
            "Referer": "https://accounts.coursera.org/signin",
            # Pretend to be a browser.
            "User-Agent": user_agent,
            "X-Requested-With": "XMLHttpRequest",
            "X-CSRF2-Cookie": csrf2_cookie,
            "X-CSRF2-Token": csrf2_token,
            "X-CSRFToken": csrf_token,
            "Cookie": cookie,
        }
        return form_data, request_header

    def random_string(self, length):
        """Return ``length`` random ASCII letters and digits as a string."""
        # ascii_letters instead of letters: the latter is locale-dependent.
        alphabet = string.ascii_letters + string.digits
        return ''.join(random.choice(alphabet) for _ in range(length))

    @classmethod
    def _extract_links(cls, content):
        """Return ``(mp4_links, pdf_links)`` found in the HTML ``content``."""
        return cls._MP4_LINK_RE.findall(content), cls._PDF_LINK_RE.findall(content)

    def get_links(self):
        """Fetch the page at ``self.url`` and extract MP4 and PDF links.

        Returns:
            A ``(down_links, down_pdfs)`` tuple of lists of link strings.

        Raises:
            SystemExit: With status 2 when the page cannot be fetched.
        """
        result = self._open_or_exit(
            self.url, "The server couldn't fulfill the request.")
        content = result.read().decode("utf-8")
        print("读取网页成功...")
        return self._extract_links(content)

    @staticmethod
    def _save_links(path, links, echo=False):
        """Write ``links`` to ``path`` one per line, encoded as UTF-8.

        Args:
            path: Output file path; the file is overwritten.
            links: Iterable of text strings.
            echo: When true, also print every link.
        """
        # An explicit UTF-8 text file avoids the implicit ASCII encoding that
        # previously made non-ASCII links abort the run.
        with io.open(path, "w", encoding="utf-8") as out:
            for link in links:
                if echo:
                    print(link)
                out.write(link + u"\n")

    def start_spider(self):
        """Run the crawler and write the two kinds of links to separate files."""
        self.simulation_login()
        down_links, down_pdfs = self.get_links()
        print("下载链接的长度 %d" % len(down_links))
        self._save_links("coursera.html", down_links, echo=True)
        print("下载pdf的长度 %d" % len(down_pdfs))
        self._save_links("coursera.pdf", down_pdfs)
        print("抓取Coursera课程下载链接和pdf链接成功")

class UserOrPwdNone(Exception):
    """Raised if the username or password is an empty string."""

def main():
    """Prompt for credentials and scrape one course's lecture page.

    The course short name is read from the first command-line argument and
    defaults to "python" when none is given. The e-mail and password are
    asked for interactively so that no credentials live in the source.

    Raises:
        SystemExit: With status 2 when the e-mail or password is empty.
    """
    course = sys.argv[1] if len(sys.argv) > 1 else "python"

    try:
        read_line = raw_input  # Python 2 name
    except NameError:
        read_line = input

    user_name = read_line("Input your Email > ")
    password = getpass.getpass("Input your Password > ")

    url = "https://class.coursera.org/{course}/lecture"
    try:
        spider = Coursera(url.format(course=course), user_name, password)
    except UserOrPwdNone as err:
        print(err)
        sys.exit(2)
    spider.start_spider()

# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值