# 通过 Python 爬虫从网页上抓取多个页面的标题、内容、时间、等级、学习人数等信息。
# -*-coding:utf-8 -*-
'''
Created on 2017年3月13日
@author: July_whj
'''
import requests
import sys
import re
# The scraped pages are UTF-8 encoded. On Python 2, make UTF-8 the
# interpreter-wide default codec so that implicit str/unicode conversions of
# Chinese text do not produce garbled output or encoding errors.
# reload(sys) is needed first because sys.setdefaultencoding is removed from
# the sys module during interpreter start-up. Python 3 provides neither name
# and already defaults to UTF-8, so the workaround is skipped there.
if sys.version_info[0] < 3:
    reload(sys)
    sys.setdefaultencoding('utf-8')
#创建类
class spjikexuey(object):
    """Regex-based scraper for paginated course-listing pages.

    Typical flow: build the page URLs with chanegpage(), download each page
    with getsourct(), cut the page into per-course fragments with
    geteveryclass(), turn every fragment into a dict with getinfo(), and
    finally append all dicts to info.txt with saveinfo().
    """

    def __init__(self):
        # Parenthesised single-argument form prints the same text on
        # Python 2 and is also valid syntax on Python 3.
        print("开始爬取网页内容。。。")

    def getsourct(self, url, timeout=30):
        """Download *url* and return the page source as text.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds to wait for the server before giving up, so
                that a stalled connection cannot hang the crawl forever.

        Returns:
            The body of the response as decoded text.
        """
        html = requests.get(url, timeout=timeout)
        return html.text

    def chanegpage(self, url, page):
        """Build the page URLs from the page named in *url* up to *page*.

        Args:
            url: A listing URL containing a ``pageNum=<number>`` parameter;
                that number is the first page to include.
            page: Number of the last page to include (inclusive).

        Returns:
            A list with one URL per page in ascending order. The list is
            empty when *page* is smaller than the page number in *url*.

        Raises:
            AttributeError: If *url* contains no ``pageNum=<number>``.
        """
        now_page = int(re.search(r'pageNum=(\d+)', url).group(1))
        # range() excludes its end value, so use page + 1 to include the last
        # requested page. No flags are passed to re.sub: its fourth positional
        # argument is the replacement count, not the flags.
        return [
            re.sub(r'pageNum=\d+', 'pageNum=%s' % i, url)
            for i in range(now_page, page + 1)
        ]

    def geteveryclass(self, source):
        """Split a page's HTML into one fragment per course entry.

        Args:
            source: Full page source as returned by getsourct().

        Returns:
            A list of strings, each the text between ``li id="`` and the
            next ``</li>``. This narrows the data getinfo() has to search.
        """
        return re.findall('li id="(.*?)</li>', source, re.S)

    def getinfo(self, eachclass):
        """Extract the fields of one course from its HTML fragment.

        Args:
            eachclass: One fragment produced by geteveryclass().

        Returns:
            A dict with the keys ``title``, ``content``, ``classTime``,
            ``classlevel`` and ``learnnum``, all holding the raw matched text.

        Raises:
            AttributeError: If the title, content or learner-count pattern
                is not found in the fragment.
            IndexError: If the fragment has fewer than two plain
                ``<em>...</em>`` elements (duration and level).
        """
        info = {}
        info['title'] = re.search(
            'class="lessonimg" title="(.*?)" alt="', eachclass, re.S).group(1)
        info['content'] = re.search(
            'display: none;">(.*?)</p>', eachclass, re.S).group(1)
        # The first two attribute-less <em> elements hold duration and level.
        timeandlevel = re.findall('<em>(.*?)</em>', eachclass, re.S)
        info['classTime'] = timeandlevel[0]
        info['classlevel'] = timeandlevel[1]
        info['learnnum'] = re.search(
            '"learn-number">(.*?)</em>', eachclass, re.S).group(1)
        return info

    def saveinfo(self, classinfo):
        """Append every scraped course to ``info.txt`` in the working directory.

        Each course is written as five ``label:value`` lines followed by a
        blank line. No explicit encoding is passed to open(), so the text is
        encoded with the interpreter's default.

        Args:
            classinfo: Iterable of dicts as returned by getinfo().
        """
        fields = ('title', 'content', 'classTime', 'classlevel', 'learnnum')
        # The with-block closes the file even if a record is missing a key.
        with open('info.txt', 'a') as f:
            for each in classinfo:
                for key in fields:
                    f.write(key + ':' + each[key] + '\n')
                f.write('\n')
if __name__ == "__main__":
    # Script entry point: build the listing-page URLs starting at pageNum=1
    # with 20 as the page limit, scrape every course entry found on them,
    # then write all collected entries to disk in a single call.
    classinfo = []
    url = 'http://www.jikexueyuan.com/course/?pageNum=1'
    jikespider = spjikexuey()
    all_links = jikespider.chanegpage(url, 20)
    for link in all_links:
        print('正在处理页面:' + link)
        html = jikespider.getsourct(link)
        everyclass = jikespider.geteveryclass(html)
        for each in everyclass:
            try:
                info = jikespider.getinfo(each)
            except (AttributeError, IndexError):
                # getinfo raises AttributeError when one of its patterns does
                # not match and IndexError when the duration/level fields are
                # missing. Skip that fragment instead of losing the whole run.
                print('skipped an entry that could not be parsed on: ' + link)
                continue
            classinfo.append(info)
    # Saved once, after all pages: the file is opened in append mode, so
    # saving the growing list inside the loop would duplicate entries.
    jikespider.saveinfo(classinfo)