import requests
from lxml import etree
def text_create(name, msg, directory="D:\\test\\"):
    """Write *msg* to ``<directory>/<name>.txt``, replacing any existing file.

    Args:
        name: Base name of the file. It is converted with ``str()`` and the
            ``.txt`` suffix is appended.
        msg: Text content to write.
        directory: Folder that receives the file. Defaults to the location
            this script has always used.

    Returns:
        None. Failures to build the path or write the file are printed
        instead of raised, so one bad file does not abort the caller's loop.
    """
    import os  # local import: the file's top-level imports do not include os

    try:
        full_path = os.path.join(directory, str(name) + '.txt')
        # The context manager guarantees the handle is flushed and closed.
        # UTF-8 is set explicitly so non-ASCII text does not depend on the
        # machine's locale code page.
        with open(full_path, 'w', encoding='utf-8') as handle:
            handle.write(msg)
    except (OSError, TypeError, ValueError) as E_results:
        print(E_results)
def title(headers, url, x):
    """Fetch one course detail page and save its summary as a text file.

    Args:
        headers: HTTP headers passed to ``requests.get``.
        url: Absolute URL of the course detail page.
        x: Value appended (via ``str()``) to the course title to form the
            output file name.

    Returns:
        None. A page that lacks any of the expected fields is reported on
        stdout and skipped instead of raising ``IndexError``.
    """
    # Bound the wait so a single stalled connection cannot hang the crawl.
    response = requests.get(url, headers=headers, timeout=10)
    html = etree.HTML(response.text)
    if html is None:
        # Defensive: nothing parseable came back for this URL.
        print("skipped (no parseable HTML): " + url)
        return

    # Course title
    title_nodes = html.xpath("//div[@class='course-details-title-cont-text']//ul//li//h2/text()")
    # The code below reads the first four span texts as, in order:
    # lecturer, chapter count, lesson count, learner count.
    pepoe = html.xpath("//div[@class='course-details-title-cont-text']//ul//li//p//span[1]//text()")
    # Course introduction
    intro_nodes = html.xpath("//div[@class='course-details-view-list-introduce-cont']//p[@class='middleColor']/text()")

    if not title_nodes or not intro_nodes or len(pepoe) < 4:
        print("skipped (missing course fields): " + url)
        return

    titles = title_nodes[0].replace('\t', '') + str(x)
    class_test = intro_nodes[0]
    test_txt = "标题\n"+str(titles)+"主讲人\n"+str(pepoe[0])+"章节数\n"+str(pepoe[1])+"课时\n"+str(pepoe[2])+"学习人数\n"+str(pepoe[3])+"课程简介\n"+str(class_test)
    print(pepoe)
    # NOTE: an earlier, disabled variant skipped courses whose chapter count
    # (pepoe[1]) was "0" and returned 1/0 so the caller could count them.
    text_create(titles, test_txt)
if __name__ == '__main__':
    # Script entry point: walk the course listing pages and hand every course
    # link found on each page to title(), which saves one text file per course.

    # The headers never change, so build them once instead of once per page.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36'
    }
    # NOTE(review): requests page_index 0..4; confirm whether the site's
    # paging is 0- or 1-based if exactly "the first five pages" matters.
    for page_index in range(0, 5):
        url = "https://www.51moot.net/main/course?search_id=0&is_free=-1&page_index="+str(page_index)
        # Bound the wait so a stalled listing request cannot hang the crawl.
        response = requests.get(url, headers=headers, timeout=10)
        html = etree.HTML(response.text)
        if html is None:
            # Defensive: nothing parseable came back for this listing page.
            print("skipped listing page (no parseable HTML): " + url)
            continue
        new_url = html.xpath("//div[@class='course-details-cont-view']//ul//li//a//@href")
        # A distinct loop variable avoids shadowing the page counter; the
        # position within the page is what title() appends to the file name.
        for position, href in enumerate(new_url):
            title(headers, "https://www.51moot.net" + href, position)
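# Python爬取前五页内容并保存访问每个视频内容数据-保存文件夹
# (Crawl the first five listing pages with Python, visit each video page, and save its data to a folder.)
# 于 2022-08-02 16:18:27 首次发布 (first published 2022-08-02 16:18:27)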