Web crawler

from urllib import request
from urllib import error
import socket

url = 'http://www.xuetangx.com/courses?cid=117'
try:
    resp = request.urlopen(url)
    html_data = resp.read().decode('utf-8')
except error.URLError as ex:
    html_data = None
except socket.timeout as ex:
    html_data = None
    
if html_data:
    #print(html_data)
    # write the page out to xuetang.html
    file = open('xuetang.html', 'w', encoding='utf-8')
    file.write(html_data)
    file.close()
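Note that the socket.timeout branch above can only fire if a timeout is actually passed to urlopen; without one the call may block indefinitely. A minimal sketch of the same fetch with an explicit timeout (the 10-second value is an arbitrary assumption, not from the original):

from urllib import request, error
import socket

url = 'http://www.xuetangx.com/courses?cid=117'
html_data = None
try:
    # timeout is in seconds; socket.timeout (or a URLError wrapping it) is raised if it expires
    with request.urlopen(url, timeout=10) as resp:
        html_data = resp.read().decode('utf-8')
except (error.URLError, socket.timeout):
    html_data = None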


from urllib.request import urlopen  # from urllib's request module, import only the urlopen function
from bs4 import BeautifulSoup       # import the BeautifulSoup class

data = []
# re-read the saved page; note that the loop below fetches each page again from the site
f = open('xuetang.html', 'r', encoding="utf-8")
html = f.read()
f.close()

file = open('xuetang.txt', 'w', encoding='utf-8')  # create/truncate the output file

for num in range(1, 16):  # pages 1 to 15
    html = urlopen('http://www.xuetangx.com/courses?credential=0&page_type=0&cid=117&process=0&org=0&course_mode=0&page='+str(num))
    bs_obj = BeautifulSoup(html.read(), 'html.parser')   # feed the HTML to a BeautifulSoup object
    course_list = bs_obj.find_all("h2", "coursetitle")   # all <h2> tags with class="coursetitle"
    subject_list = bs_obj.find_all("span", "subject")    # all <span> tags with class="subject"
    teacher_list = bs_obj.find_all("div", "fl name")     # all <div> tags with class="fl name"
    introduction_list = bs_obj.find_all("p", "txt")      # all <p> tags with class="txt"
    for course, subject, teacher, introduction in zip(course_list, subject_list, teacher_list, introduction_list):
        # zip pairs the corresponding elements of each list into tuples (see the small zip illustration after this script)
        info = {
            'course': course.get_text().strip(),          # strip() removes surrounding whitespace
            'subject': subject.get_text().strip(),
            'teacher': teacher.li.get_text().replace("\n", " "),  # replace newlines with spaces
            'introduction': introduction.get_text()
        }
        data.append(info)  # collect the record in the data list
    
    file = open('xuetang.txt', 'a', encoding='utf-8')
    for i in data:  # write out and print each record
        file.write("?")  # record marker kept from the original output format
        file.write(i['course'])
        file.write("\t" + i['subject'])
        file.write("\t" + i['teacher'])
        file.write("\t" + i['introduction'] + "\n")
        print(i['course'], "\t", i['subject'], "\t", i['teacher'], "\n", i['introduction'], "\n")
    data.clear()
    file.close()  # blank lines in xuetang.txt can be removed afterwards with clearBlankLine() below
html.close()  # close the last response object
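As the comment in the loop notes, zip() pairs up the corresponding elements of the result lists into tuples. A tiny standalone illustration (the sample values are made up):

course_list = ['Data Structures', 'Operating Systems']
teacher_list = ['Teacher A', 'Teacher B']
for course, teacher in zip(course_list, teacher_list):
    print(course, teacher)
# Data Structures Teacher A
# Operating Systems Teacher B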


def clearBlankLine():
    file1 = open('text1.txt', 'r', encoding='utf-8')  # source file that still contains blank lines
    file2 = open('text2.txt', 'w', encoding='utf-8')  # output file with blank lines removed
    try:
        for line in file1.readlines():
            if line == '\n':
                line = line.strip("\n")
            if line == '\t':
                line = line.strip("\t")
            file2.write(line)
    finally:
        file1.close()
        file2.close()


if __name__ == '__main__':
    clearBlankLine()
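clearBlankLine() above is hard-wired to text1.txt and text2.txt. A sketch of the same idea with the file names passed in as parameters (the function and parameter names here are assumptions, not part of the original code):

def clear_blank_lines(in_path, out_path):
    # copy in_path to out_path, dropping lines that contain only whitespace
    with open(in_path, 'r', encoding='utf-8') as src, \
         open(out_path, 'w', encoding='utf-8') as dst:
        for line in src:
            if line.strip():
                dst.write(line)

# e.g. clear_blank_lines('xuetang.txt', 'xuetang_clean.txt')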

Pagination: recording the number of pages

from lxml import etree
f = open('xuetang.html', 'r',encoding="utf-8")
html = f.read()
f.close()

selector = etree.HTML(html)
#link = selector.xpath('//*[@id="list_pager"]/ul/li[12]/a[last()]/@href')
#link = selector.xpath('//*[@id="list_style"]/li[10]/div/div[2]/div/a/h2/text()')  # this variant works
link = selector.xpath('//*[@id="list_pager"]/ul/li[12]/a/text()')  # XPath copied via right-click > Copy XPath in the browser's F12 developer tools
for each in link:
    print(each)
# I think this part already works; the remaining problem is with the page itself
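Building on the XPath above, a sketch that treats the last extracted text node as the total page count and builds one course-list URL per page (this assumes the final pager link text really is the highest page number, which is not guaranteed):

if link and link[-1].strip().isdigit():
    last_page = int(link[-1].strip())
    base = ('http://www.xuetangx.com/courses?credential=0&page_type=0'
            '&cid=117&process=0&org=0&course_mode=0&page=')
    page_urls = [base + str(n) for n in range(1, last_page + 1)]
    print(len(page_urls), 'pages to crawl')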

 
