需求:
- 爬取的链接: http://www.imooc.com/course/list
- 爬取的内容: 课程链接, 课程的图片url, 课程的名称, 学习人数, 课程描述;
- 爬取的内容如何存储:
- 文件(.csv, .json);
- mysql数据库;
- 分析爬取的信息;
- 词云
1.获取页面内容
import re
import requests
import lxml.etree as etree
import csv
def get_content(url, timeout=10):
    """Download a page and return its raw body.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as bytes, or None when the request fails
        (connection problem, timeout, invalid URL, or a 4xx/5xx status).
    """
    user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36"
    try:
        # Without a timeout, requests waits forever on a stalled server.
        response = requests.get(
            url,
            headers={'User-Agent': user_agent},
            timeout=timeout,
        )
        # Raises requests.HTTPError for 4xx/5xx status codes.
        response.raise_for_status()
    except requests.RequestException as e:
        # Report the cause instead of hiding it, then signal failure explicitly.
        print("爬取错误", e)
        return None

    # Sets the codec used by response.text; the bytes returned below are
    # not affected by this assignment.
    response.encoding = response.apparent_encoding
    print(response.url)
    print("爬取成功!")
    return response.content
2.解析页面
发现我们需要的课程信息都在 class="course-card-container" 的 div 块中。
课程名称存储在 class="course-card-name" 的 h3 标签里, 所以我们只需要获取该标签的文本内容, 其他字段同理。
def parser_content(html):
    """Extract course records from a course-list page.

    Args:
        html: Page markup to parse. None or empty input yields an empty list
            instead of an error.

    Returns:
        A list of (name, studentNum, courseInfo, courseUrl, courseImgUrl)
        tuples, one per ``<div class="course-card-container">`` element.
        A field that is absent from a card is returned as an empty string.
    """
    from urllib.parse import urljoin

    if not html:
        return []

    # 1) Build an element tree that XPath queries can run against.
    selector = etree.HTML(html)
    if selector is None:
        return []

    base_url = "http://www.imooc.com"

    def pick(nodes, index=0):
        """Return nodes[index], or '' when the XPath result is too short."""
        return nodes[index] if len(nodes) > index else ""

    courseInfos = []
    # 2) Each course lives in its own <div class="course-card-container">.
    for card in selector.xpath('//div[@class="course-card-container"]'):
        # Course name: <h3 class="course-card-name">初识HTML+CSS</h3>
        name = pick(card.xpath('.//h3[@class="course-card-name"]/text()'))

        # Learner count is the second span text node inside the card:
        #   <div class="course-card-info">
        #     <span>入门</span><span><i class="icon-set_sns"></i>1000167</span>
        #   </div>
        studentNum = pick(card.xpath('.//span/text()'), 1)

        # Description: <p class="course-card-desc">...</p>
        courseInfo = pick(card.xpath(".//p[@class='course-card-desc']/text()"))

        # Course link: <a target="_blank" href="/learn/9" class="course-card">
        # urljoin turns "/learn/9" into "http://www.imooc.com/learn/9" and
        # leaves an already-absolute href untouched.
        href = pick(card.xpath('.//a/@href'))
        courseUrl = urljoin(base_url, href) if href else ""

        # Image: <img class="course-banner lazy" src="//img1.mukewang.com/...jpg">
        # urljoin supplies the "http:" scheme for protocol-relative sources
        # without corrupting sources that already carry a scheme.
        src = pick(card.xpath('.//img/@src'))
        courseImgUrl = urljoin(base_url, src) if src else ""

        courseInfos.append((name, studentNum, courseInfo, courseUrl, courseImgUrl))
    return courseInfos
3.保存为csv格式
def save_csv(courseInfo):
    """Write the course records to doc/mooc.csv.

    Args:
        courseInfo: Iterable of row sequences; each row becomes one CSV line.
    """
    import os

    # Create the output directory so open() does not fail when it is missing.
    os.makedirs('doc', exist_ok=True)
    # newline='' lets the csv module control line endings (otherwise blank
    # rows appear on Windows); utf-8 is set explicitly so the Chinese text
    # does not depend on the platform's default encoding.
    with open('doc/mooc.csv', 'w', newline='', encoding='utf-8') as f:
        csv.writer(f).writerows(courseInfo)
    print("csv文件保存成功........")
4.保存为json格式
def save_json(courseInfo):
"""将获取的信息保存为json格式"""
import json
with open('doc/mooc.json', 'w', encoding='utf-8') as f:
for item in courseInfo:
item = {
'name': item[0],
'studentNum': item[1],
'courseInfo': item[2],
'courseUrl': item[3],
'courseImgUrl': item[4]