导入库
import requests
from bs4 import BeautifulSoup
import bs4
获取网页信息
def getHTMLText(url):
    """Download a page and return its text, or "" when the request fails.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body decoded to text. An empty string is returned when
        ``requests`` reports a failure (connection error, timeout, or an
        error status raised by ``raise_for_status``).
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        # Decode with the encoding detected from the body rather than the
        # one declared in the response headers.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only request failures mean "no page"; KeyboardInterrupt and
        # SystemExit are no longer swallowed here.
        return ""
获得课程名称并输出
def getCourseName(key, url):
    """Append the course at *url* to result.md when its title contains *key*.

    The page is fetched with getHTMLText and its <title> is read. Matching
    courses are printed to the console and appended to result.md as one
    two-cell Markdown table row (title, link).

    Args:
        key: Keyword that must occur in the trimmed page title.
        url: Course page address.

    Returns:
        None. Pages without a usable <title> (for example when the download
        failed and the HTML is empty) are skipped silently.
    """
    html = getHTMLText(url)
    soup = BeautifulSoup(html, "html.parser")

    # soup.title is None for empty/untitled pages, and .string is None when
    # the tag has no single text child; both used to raise AttributeError.
    if soup.title is None or soup.title.string is None:
        return

    # Drop the last 12 characters of the page title.
    # NOTE(review): presumably a fixed site-name suffix — confirm its length.
    title = soup.title.string[0:-12]
    if key not in title:
        return

    print('\n'+title+' '+url+'\n')

    # Escape pipes so a title cannot split the table row into extra cells.
    cell = title.replace('|', '\\|')
    # Exactly two cells, matching the two-column header written by
    # printUnivList; the previous extra empty cell pushed the link into a
    # third column, which Markdown table renderers discard.
    row = '|' + cell + '|' + '[点击进入](' + url + ')|\n'
    with open('result.md', 'a', encoding='UTF-8') as f:
        f.write(row)
循环搜索
def printUnivList(start, end, key):
    """Scan course pages numbered start..end-1 and record those matching *key*.

    A Markdown table header is appended to result.md first; then
    getCourseName is called for every ``https://ke.qq.com/course/<n>``.

    Args:
        start: First course number to check (inclusive).
        end: Course number at which to stop (exclusive).
        key: Keyword passed on to getCourseName.
    """
    with open('result.md', 'a', encoding='UTF-8') as f:
        f.write('|课程名称|课程网址|\n|:----:|:----:|\n')

    total = end - start
    for n in range(start, end):
        try:
            # "\r" rewrites the same console line so progress updates in place.
            print("\r进度:{:7}/{:7}".format(n-start, total), end='')
            url = 'https://ke.qq.com/course/{}'.format(n)
            getCourseName(key, url)
        except Exception:
            # Best-effort crawl: a page that fails is skipped. Catching
            # Exception (not everything) lets Ctrl-C stop the run.
            continue
执行文件
# Ask for the keyword to look for in course titles.
key = input('请输入关键字:')
print('开始爬取。。。')
# Scan course numbers 1000000 up to, but not including, 1010000.
printUnivList(start=1000000, end=1010000, key=key)
# Keep the console open until the user presses Enter.
input('爬取结束,按任意键退出。。。')
附录:源文件
点击此处下载
附录:部分结果演示
![部分结果演示](https://img-blog.csdnimg.cn/20200823194516358.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80Mzk2NDk5Mw==,size_16,color_FFFFFF,t_70)