Web crawler

from urllib import request
from urllib import error
import socket

url = 'http://www.xuetangx.com/courses?cid=117'
try:
    resp = request.urlopen(url)
    html_data = resp.read().decode('utf-8')
except error.URLError as ex:
    html_data = None
except socket.timeout as ex:
    html_data = None
    
if html_data:
    #print(html_data)
    # write the page out to xuetang.html
    file = open('xuetang.html', 'w', encoding='utf-8')
    file.write(html_data)
    file.close()
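Note that the socket.timeout branch above can only fire if a timeout is actually passed to urlopen; without one the call may block indefinitely. A minimal sketch of the same fetch with an explicit timeout (the 10-second value is an arbitrary assumption, not from the original):

from urllib import request, error
import socket

url = 'http://www.xuetangx.com/courses?cid=117'
html_data = None
try:
    # timeout is in seconds; socket.timeout (or a URLError wrapping it) is raised if it expires
    with request.urlopen(url, timeout=10) as resp:
        html_data = resp.read().decode('utf-8')
except (error.URLError, socket.timeout):
    html_data = None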


from urllib.request import urlopen  # from urllib's request module, import only the urlopen function
from bs4 import BeautifulSoup       # import the BeautifulSoup class

data = []
# re-read the saved page; note that the loop below fetches each page again from the site
f = open('xuetang.html', 'r', encoding="utf-8")
html = f.read()
f.close()

file = open('xuetang.txt', 'w', encoding='utf-8')  # create/truncate the output file

for num in range(1, 16):  # pages 1 to 15
    html = urlopen('http://www.xuetangx.com/courses?credential=0&page_type=0&cid=117&process=0&org=0&course_mode=0&page='+str(num))
    bs_obj = BeautifulSoup(html.read(), 'html.parser')   # feed the HTML to a BeautifulSoup object
    course_list = bs_obj.find_all("h2", "coursetitle")   # all <h2> tags with class="coursetitle"
    subject_list = bs_obj.find_all("span", "subject")    # all <span> tags with class="subject"
    teacher_list = bs_obj.find_all("div", "fl name")     # all <div> tags with class="fl name"
    introduction_list = bs_obj.find_all("p", "txt")      # all <p> tags with class="txt"
    for course, subject, teacher, introduction in zip(course_list, subject_list, teacher_list, introduction_list):
        # zip pairs the corresponding elements of each list into tuples (see the small zip illustration after this script)
        info = {
            'course': course.get_text().strip(),          # strip() removes surrounding whitespace
            'subject': subject.get_text().strip(),
            'teacher': teacher.li.get_text().replace("\n", " "),  # replace newlines with spaces
            'introduction': introduction.get_text()
        }
        data.append(info)  # collect the record in the data list
    
    file = open('xuetang.txt', 'a', encoding='utf-8')
    for i in data:  # write out and print each record
        file.write("?")  # record marker kept from the original output format
        file.write(i['course'])
        file.write("\t" + i['subject'])
        file.write("\t" + i['teacher'])
        file.write("\t" + i['introduction'] + "\n")
        print(i['course'], "\t", i['subject'], "\t", i['teacher'], "\n", i['introduction'], "\n")
    data.clear()
    file.close()  # blank lines in xuetang.txt can be removed afterwards with clearBlankLine() below
html.close()  # close the last response object
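As the comment in the loop notes, zip() pairs up the corresponding elements of the result lists into tuples. A tiny standalone illustration (the sample values are made up):

course_list = ['Data Structures', 'Operating Systems']
teacher_list = ['Teacher A', 'Teacher B']
for course, teacher in zip(course_list, teacher_list):
    print(course, teacher)
# Data Structures Teacher A
# Operating Systems Teacher B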


def clearBlankLine():
    file1 = open('text1.txt', 'r', encoding='utf-8')  # source file that still contains blank lines
    file2 = open('text2.txt', 'w', encoding='utf-8')  # output file with blank lines removed
    try:
        for line in file1.readlines():
            if line == '\n':
                line = line.strip("\n")
            if line == '\t':
                line = line.strip("\t")
            file2.write(line)
    finally:
        file1.close()
        file2.close()


if __name__ == '__main__':
    clearBlankLine()
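clearBlankLine() above is hard-wired to text1.txt and text2.txt. A sketch of the same idea with the file names passed in as parameters (the function and parameter names here are assumptions, not part of the original code):

def clear_blank_lines(in_path, out_path):
    # copy in_path to out_path, dropping lines that contain only whitespace
    with open(in_path, 'r', encoding='utf-8') as src, \
         open(out_path, 'w', encoding='utf-8') as dst:
        for line in src:
            if line.strip():
                dst.write(line)

# e.g. clear_blank_lines('xuetang.txt', 'xuetang_clean.txt')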

Pagination: recording the number of pages

from lxml import etree
f = open('xuetang.html', 'r',encoding="utf-8")
html = f.read()
f.close()

selector = etree.HTML(html)
#link = selector.xpath('//*[@id="list_pager"]/ul/li[12]/a[last()]/@href')
#link = selector.xpath('//*[@id="list_style"]/li[10]/div/div[2]/div/a/h2/text()')  # this variant works
link = selector.xpath('//*[@id="list_pager"]/ul/li[12]/a/text()')  # XPath copied via right-click > Copy XPath in the browser's F12 developer tools
for each in link:
    print(each)
# I think this part already works; the remaining problem is with the page itself
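Building on the XPath above, a sketch that treats the last extracted text node as the total page count and builds one course-list URL per page (this assumes the final pager link text really is the highest page number, which is not guaranteed):

if link and link[-1].strip().isdigit():
    last_page = int(link[-1].strip())
    base = ('http://www.xuetangx.com/courses?credential=0&page_type=0'
            '&cid=117&process=0&org=0&course_mode=0&page=')
    page_urls = [base + str(n) for n in range(1, last_page + 1)]
    print(len(page_urls), 'pages to crawl')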

 
