# -*- coding: utf-8 -*-
"""Scrape course listings page by page and append them to a text file.

Each listing page is downloaded with ``requests``, the course entries are
extracted with regular expressions, and a formatted summary of every page is
appended to ``course_data.txt``.

The module runs on both Python 2.7 and Python 3.
"""
import io
import re
import sys

if sys.version_info[0] == 2:
    # Python 2 only: the script has always relied on a UTF-8 default encoding
    # to mix its UTF-8 byte-string literals with the unicode text returned by
    # ``requests``.  Python 3 strings are already unicode, and ``reload`` /
    # ``setdefaultencoding`` do not exist there, so this must not run.
    reload(sys)  # noqa: F821 - builtin on Python 2
    sys.setdefaultencoding('utf8')


class crawler(object):
    """Fetch course listing pages and turn them into course dictionaries."""

    # Regular expressions applied to the downloaded HTML.  ITEM_PATTERN cuts
    # the page into one fragment per course; the field patterns below are
    # then matched against a single fragment.
    ITEM_PATTERN = '<div class="lessonimg-box">(.*?)</li>'
    TITLE_PATTERN = 'class="lesson-info-h2">.*?>(.*?)</a>'
    COURSE_TIME_SPAN_PATTERN = 'class="time-icon"></i><em>(.*?)</em>'
    COURSE_DIFFICULTY_PATTERN = 'class="xinhao-icon."></i><em>(.*?)</em>'
    COURSE_INTRODUCE_PATTERN = '<h2 class="lesson-info-h2"><a.*?<p.*?>(.*?)</p>'
    LEARNING_NUM = '<em class="learn-number">(.*?)</em>'
    # The two patterns below are not referenced by the code in this file;
    # they are kept so that any external user of the class still finds them.
    IMG_URL_PATTERN = '<img src="(.*?)" class="lessonimg".*?>'
    TOTAL_PAGE_NUM_PATTERN = '<li class="thpoint pagetotal" style="margin-top:3px;">(.*?)</li>'

    # Number of listing pages generated when the caller does not say otherwise.
    DEFAULT_TOTAL_PAGES = 90

    def __init__(self, url='http://www.jikexueyuan.com/course/?pageNum=1'):
        """Remember the start URL and the HTTP headers sent with each request.

        Args:
            url: Listing URL containing a ``pageNum=<n>`` query parameter.
        """
        self.url = url
        # Must be a mapping: ``requests`` only accepts headers as a dict.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36',
        }

    @staticmethod
    def _firstmatch(pattern, text, strip=False):
        """Return the first capture of ``pattern`` in ``text``, or ``''``."""
        matches = re.findall(pattern, text, re.S)
        if not matches:
            return ''
        return matches[0].strip() if strip else matches[0]

    @classmethod
    def _parsecourses(cls, html):
        """Extract one dictionary per course fragment found in ``html``.

        Returns:
            A list of dicts with the keys ``title``, ``timespan``,
            ``difficulty``, ``introduce`` and ``learningnum``.  A field whose
            pattern does not match is an empty string.
        """
        first = cls._firstmatch
        return [
            {
                'title': first(cls.TITLE_PATTERN, item, strip=True),
                'timespan': first(cls.COURSE_TIME_SPAN_PATTERN, item),
                'difficulty': first(cls.COURSE_DIFFICULTY_PATTERN, item, strip=True),
                'introduce': first(cls.COURSE_INTRODUCE_PATTERN, item, strip=True),
                'learningnum': first(cls.LEARNING_NUM, item),
            }
            for item in re.findall(cls.ITEM_PATTERN, html, re.S)
        ]

    def getcontent(self, currentpagenum, url):
        """Download ``url`` and return the courses listed on that page.

        Args:
            currentpagenum: Page label used only in the progress message.
            url: Address of the listing page to download.

        Returns:
            A list of course dictionaries (see ``_parsecourses``).
        """
        # Imported here rather than at module level so the parsing and
        # formatting helpers stay importable where ``requests`` is missing.
        import requests

        print('正在抓取第%s页数据...%s' % (currentpagenum, url))
        # ``headers`` has to be passed by keyword: the second positional
        # parameter of ``requests.get`` is ``params`` (the query string).
        html = requests.get(url, headers=self.headers).text
        return self._parsecourses(html)

    def setcurpagelink(self, url):
        """Replace the URL the crawler currently points at."""
        self.url = url

    def geteachpagelink(self, totalpages=DEFAULT_TOTAL_PAGES):
        """Build the URL of every listing page from the current URL.

        Also stores the page number found in ``self.url`` as
        ``self.currentpage``.

        Args:
            totalpages: Number of pages to generate links for.  The page
                count is not read from the site; it defaults to 90.

        Returns:
            A list of URLs for pages ``1..totalpages``, obtained by rewriting
            the ``pageNum`` parameter of ``self.url``.

        Raises:
            AttributeError: If ``self.url`` contains no ``pageNum=<digits>``.
        """
        self.currentpage = int(re.search(r'pageNum=(\d+)', self.url).group(1))
        # No flags argument here: the fourth positional parameter of
        # ``re.sub`` is ``count``, not ``flags``.
        return [
            re.sub(r'pageNum=\d+', 'pageNum=%s' % page, self.url)
            for page in range(1, totalpages + 1)
        ]

    def getcourseinfo(self, course):
        """Format one course dictionary as a block of text for the output file."""
        return '课程名称:%s \n难度:%s 课时:%s 学习人数:%s \n简介:%s\n\n' % (
            course['title'],
            course['difficulty'],
            course['timespan'],
            course['learningnum'],
            course['introduce'],
        )

    def save(self, content, filename='course_data.txt'):
        """Append ``content`` to ``filename`` encoded as UTF-8.

        Args:
            content: Text to append; UTF-8 encoded bytes are accepted as well.
            filename: Destination file, created if it does not exist.
        """
        if isinstance(content, bytes):
            # ``io.open`` in text mode only accepts unicode text.
            content = content.decode('utf-8')
        with io.open(filename, 'a', encoding='utf-8') as output:
            output.write(content)


def main():
    """Crawl every listing page and append its formatted courses to the file."""
    spider = crawler()
    for pagenum, pagelink in enumerate(spider.geteachpagelink(), 1):
        spider.setcurpagelink(pagelink)
        courses = spider.getcontent(str(pagenum), pagelink)
        # One header line per page, followed by the formatted courses.
        header = '\n-----第%s页---------------------------------------------------------------' \
                 '-------------------------------\n' % str(pagenum)
        body = ''.join(spider.getcourseinfo(course) for course in courses)
        spider.save(header + body)


if __name__ == '__main__':
    main()
# [python]爬极客网课程
# 最新推荐文章于 2020-12-11 02:47:03 发布