#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Scrape download links from the Coursera website.
"""

import cookielib
import getpass
import io
import random
import re
import string
import sys
import urllib
import urllib2

class Coursera(object):
    """Log in to Coursera and collect lecture download links.

    Simulates the sign-in request, fetches the lecture page, extracts links
    with regular expressions and saves them to files.

    Attributes:
        login_url: URL the login form is posted to.
        url: URL of the page that is scraped for download links.
        user_name: E-mail address used to sign in.
        password: Password used to sign in.
    """

    # NOTE(review): the two original link patterns were lost when this file
    # was extracted (only "re.findall(r'" survived). These are reconstructed
    # from the method's stated purpose (MP4 and PDF links) -- confirm them
    # against the real lecture page markup before relying on the output.
    _MP4_LINK_RE = re.compile(r'<a[^>]*?href="([^"]*mp4[^"]*)"')
    _PDF_LINK_RE = re.compile(r'<a[^>]*?href="([^"]*pdf)"')

    def __init__(self, url, user_name, password):
        """Store the page URL and the credentials.

        Args:
            url: Page to scrape for download links.
            user_name: Login e-mail; must not be empty.
            password: Login password; must not be empty.

        Raises:
            UserOrPwdNone: If user_name or password is empty (or None).
        """
        if not user_name or not password:
            raise UserOrPwdNone("the username or password can't empty string")
        self.login_url = "https://accounts.coursera.org/api/v1/login"
        self.url = url
        self.user_name = user_name
        self.password = password

    def simulation_login(self):
        """Sign in by posting the credentials to login_url.

        Installs a cookie-aware opener as urllib2's default opener, so later
        urllib2.urlopen calls send the cookies received here. Prints a
        diagnostic and exits with status 2 if the request fails.
        """
        cookie_jar = cookielib.CookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie_jar))
        urllib2.install_opener(opener)

        form_data, request_header = self.structure_headers()
        request = urllib2.Request(
            self.login_url, data=form_data, headers=request_header)
        response = self._open_or_exit(
            request,
            "The server couldn't fulfill the request."
            "Please check your url and read the Reason")
        try:
            status = response.getcode()
        finally:
            response.close()
        if status == 200:
            print("登录成功...")

    def structure_headers(self):
        """Build the login form body and the request headers.

        Returns:
            A (form_data, request_header) tuple: the URL-encoded form string
            and a dict of HTTP headers.
        """
        # The request body must be the encoded string, not the dict itself.
        form_data = urllib.urlencode({
            "email": self.user_name,
            "password": self.password,
            "webrequest": "true",
        })
        # Adjacent literals are concatenated verbatim, so each piece carries
        # its own trailing space to keep the product tokens separated.
        user_agent = ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/38.0.2125.111 Safari/537.36")
        csrf2_cookie_name = "csrf2_token_%s" % self.random_string(8)
        csrf2_token = self.random_string(24)
        csrf_token = self.random_string(24)
        # The same random values are sent both as cookies and as headers.
        cookie = "csrftoken=%s; %s=%s" % (
            csrf_token, csrf2_cookie_name, csrf2_token)

        request_header = {
            # Referring page URL, sent to get past hot-link protection.
            "Referer": "https://accounts.coursera.org/signin",
            # Present the request as coming from a browser.
            "User-Agent": user_agent,
            "X-Requested-With": "XMLHttpRequest",
            "X-CSRF2-Cookie": csrf2_cookie_name,
            "X-CSRF2-Token": csrf2_token,
            "X-CSRFToken": csrf_token,
            "Cookie": cookie,
        }
        return form_data, request_header

    def random_string(self, length):
        """Return `length` random characters drawn from ASCII letters and digits."""
        # ascii_letters is locale-independent, unlike string.letters.
        alphabet = string.ascii_letters + string.digits
        return ''.join(random.choice(alphabet) for _ in range(length))

    def get_links(self):
        """Fetch self.url and extract the MP4 and PDF download links.

        Prints a diagnostic and exits with status 2 if the page cannot be
        fetched.

        Returns:
            A (down_links, down_pdfs) tuple of lists of matched link strings.
        """
        response = self._open_or_exit(
            self.url, "The server couldn't fulfill the request.")
        try:
            content = response.read().decode("utf-8")
        finally:
            response.close()
        print("读取网页成功...")
        down_links = self._MP4_LINK_RE.findall(content)
        down_pdfs = self._PDF_LINK_RE.findall(content)
        return down_links, down_pdfs

    def start_spider(self):
        """Run the crawl: log in, collect the links and save them.

        Video links are written to "coursera.html" and PDF links to
        "coursera.pdf", one link per line.
        """
        self.simulation_login()
        down_links, down_pdfs = self.get_links()

        print("下载链接的长度 %d" % len(down_links))
        for link in down_links:
            print(link)
        self._write_lines("coursera.html", down_links)

        print("下载pdf的长度 %d" % len(down_pdfs))
        self._write_lines("coursera.pdf", down_pdfs)
        print("抓取Coursera课程下载链接和pdf链接成功")

    def _open_or_exit(self, target, http_error_hint):
        """Open `target` with urllib2; on URLError report it and exit(2).

        Args:
            target: URL string or urllib2.Request to open.
            http_error_hint: Message printed when the error carries a status
                code.

        Returns:
            The response object returned by urllib2.urlopen.
        """
        try:
            return urllib2.urlopen(target)
        except urllib2.URLError as err:
            if hasattr(err, "code"):
                print(http_error_hint)
                print("Error code: %s" % err.code)
            elif hasattr(err, "reason"):
                print("We failed to reach a server. "
                      "Please check your url and read the Reason")
                print("Reason: %s" % err.reason)
            # Always stop here: there is no response to continue with.
            sys.exit(2)

    def _write_lines(self, path, lines):
        """Write each item of `lines` to `path` as UTF-8, one per line."""
        # The links are unicode (decoded page text); encoding explicitly
        # avoids UnicodeEncodeError on non-ASCII links.
        with io.open(path, "w", encoding="utf-8") as out_file:
            for line in lines:
                out_file.write(line + u"\n")


class UserOrPwdNone(Exception):
    """Raised if the username or password is an empty string."""


def main():
    """Prompt for credentials and scrape one course's lecture page.

    The course short name is read from the first command-line argument and
    defaults to "python" when none is given. Exits with status 2 when too
    many arguments are passed or when the credentials are rejected as empty.
    """
    if len(sys.argv) > 2:
        print("Please Input what course you want to download..")
        sys.exit(2)
    course = sys.argv[1] if len(sys.argv) == 2 else "python"

    # Ask at run time so no account e-mail or password lives in the source.
    user_name = raw_input("Input your Email > ")
    password = getpass.getpass("Input your Password > ")

    url = "https://class.coursera.org/{course}/lecture"
    try:
        spider = Coursera(url.format(course=course), user_name, password)
    except UserOrPwdNone as err:
        print(err)
        sys.exit(2)
    spider.start_spider()

# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    main()