# encoding=utf-8
"""Course-listing crawler for http://www.jikexueyuan.com/course/.

Scrapes, for every course: title, description, duration, level, and
learner count, then appends the records to ``info.txt``.

Techniques used: ``requests`` to fetch pages, ``re.sub`` to step through
pagination, and regular expressions to pull fields out of the HTML.

NOTE(review): this is a port of a Python-2 script. The original forced the
interpreter default encoding via ``reload(sys); sys.setdefaultencoding(...)``
to survive GBK consoles on Windows; that hack does not exist in Python 3,
so the output file encoding is made explicit in ``saveinfo`` instead.
"""
import re

import requests


class spider(object):
    """Scrapes jikexueyuan.com course pages and saves the results."""

    def __init__(self):
        # Announce start (kept verbatim for parity with the original output).
        print("开始爬取内容。。。")

    def getsource(self, url):
        """Fetch *url* and return its decoded HTML body as text."""
        response = requests.get(url)
        return response.text

    def changepage(self, url, total_page):
        """Return listing URLs from *url*'s current page up to *total_page*.

        The current page number is read from the ``pageNum=`` query
        parameter of *url*; one URL per page is generated, inclusive.
        """
        now_page = int(re.search(r'pageNum=(\d+)', url).group(1))
        # BUG FIX: the original passed ``re.S`` as re.sub's 4th positional
        # argument, which is ``count`` (max substitutions), not ``flags``.
        # It happened to be harmless here (re.S == 16 > 1 occurrence), but
        # it was an API misuse; flags are simply not needed for this pattern.
        return [re.sub(r'pageNum=\d+', 'pageNum=%d' % page, url)
                for page in range(now_page, total_page + 1)]

    def geteveryclass(self, source):
        """Split listing-page HTML into one ``<li ...>`` fragment per course."""
        # The pattern deliberately captures from inside the opening tag up to
        # the closing </li>, yielding one raw fragment per course entry.
        return re.findall('<li id="(.*?)</li>', source, re.S)

    def getinfo(self, eachclass):
        """Extract one course's fields from its ``<li>`` fragment.

        Returns a dict with keys ``title``, ``content``, ``classtime``,
        ``classlevel`` and ``learnnum`` (all newline-flattened strings).
        Raises AttributeError/IndexError if the page markup changes and a
        pattern no longer matches.
        """
        info = {}
        info['title'] = re.search(
            'class="lessonimg" title="(.*?)" alt="',
            eachclass, re.S).group(1).replace('\n', ' ')
        info['content'] = re.search(
            '<p style="height: 0px; opacity: 0; display: none;">(.*?)</p>',
            eachclass, re.S).group(1).replace('\n', ' ').strip()
        # The first two <em> elements hold duration and difficulty level.
        timeandlevel = re.findall('<em>(.*?)</em>', eachclass, re.S)
        info['classtime'] = timeandlevel[0].replace('\n', ' ')
        info['classlevel'] = timeandlevel[1].replace('\n', ' ')
        info['learnnum'] = re.search(
            'class="learn-number">(.*?)</em>',
            eachclass, re.S).group(1).replace('\n', ' ')
        return info

    def saveinfo(self, classinfo):
        """Append every record in *classinfo* to ``info.txt``.

        Mode 'a' appends: existing content is preserved and the file is
        created if missing. Encoding is pinned to utf-8 so the output does
        not depend on the platform default (the original relied on a
        Python-2 ``setdefaultencoding`` hack for this). ``with`` guarantees
        the file is closed even if a record is malformed.
        """
        with open('info.txt', 'a', encoding='utf-8') as f:
            for each in classinfo:
                f.write('title:' + each['title'] + '\n')
                f.write('content:' + each['content'] + '\n')
                f.write('classtime:' + each['classtime'] + '\n')
                f.write('classlevel:' + each['classlevel'] + '\n')
                f.write('learnnum:' + each['learnnum'] + '\n\n')


if __name__ == '__main__':
    classinfo = []
    url = 'http://www.jikexueyuan.com/course/?pageNum=1'
    jikespider = spider()
    # Crawl pages 1..20, collecting every course's record.
    for link in jikespider.changepage(url, 20):
        print("正在处理页面:" + link)
        html = jikespider.getsource(link)
        for each in jikespider.geteveryclass(html):
            classinfo.append(jikespider.getinfo(each))
    jikespider.saveinfo(classinfo)
# 极客学院课程爬虫 — Jikexueyuan course crawler (blog post title, scraped-page residue)
# 最新推荐文章于 2024-04-20 09:55:04 发布 — blog metadata: latest recommended article published 2024-04-20 09:55:04