from urllib import request
from urllib import error
import socket

url = 'http://www.xuetangx.com/courses?cid=117'
try:
    resp = request.urlopen(url)
    html_data = resp.read().decode('utf-8')
except error.URLError as ex:
    html_data = None
except socket.timeout as ex:
    html_data = None
if html_data:
    # print(html_data)
    # Save the page as xuetang.html
    file = open('xuetang.html', 'w', encoding='utf-8')
    file.write(html_data)
    file.close()
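Note that urlopen rarely raises socket.timeout unless a timeout is actually set (per call, or globally via socket.setdefaulttimeout), so the second except branch above seldom fires as written. A minimal sketch of the fetch with an explicit timeout; the 10-second value is an assumption, not from the original:

# Hypothetical variant: pass a timeout so the socket.timeout branch can fire
try:
    resp = request.urlopen(url, timeout=10)  # assumed 10 s limit
    html_data = resp.read().decode('utf-8')
except socket.timeout:
    html_data = None  # the request took longer than the timeout
except error.URLError:
    html_data = None  # DNS failure, refused connection, etc.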
from urllib.request import urlopen  # from urllib's request module, import only the urlopen function
from bs4 import BeautifulSoup  # import the BeautifulSoup class

data = []
f = open('xuetang.html', 'r', encoding='utf-8')
html = f.read()
f.close()

file = open('xuetang.txt', 'w', encoding='utf-8')  # output file for the scraped course records
for num in range(1, 16):  # pages 1 through 15
    html = urlopen('http://www.xuetangx.com/courses?credential=0&page_type=0&cid=117&process=0&org=0&course_mode=0&page=' + str(num))
    bs_obj = BeautifulSoup(html.read(), 'html.parser')  # feed the page content into a BeautifulSoup object
    course_list = bs_obj.find_all("h2", "coursetitle")  # all <h2> tags with class="coursetitle"
    subject_list = bs_obj.find_all("span", "subject")  # all <span> tags with class="subject"
    teacher_list = bs_obj.find_all("div", "fl name")  # all <div> tags with class="fl name"
    introduction_list = bs_obj.find_all("p", "txt")
    for course, subject, teacher, introduction in zip(course_list, subject_list, teacher_list, introduction_list):
        # zip pairs up the matching elements from each list and yields them as tuples
        info = {
            'course': course.get_text().strip(),  # strip() removes leading/trailing whitespace
            'subject': subject.get_text().strip(),
            'teacher': teacher.li.get_text().replace("\n", " "),  # replace newlines with spaces
            'introduction': introduction.get_text()
        }
        data.append(info)  # collect the record in the data list
    for i in data:  # write and print each record
        file.write("?")  # record separator
        file.write(i['course'])
        file.write(" " + i['subject'])
        file.write(" " + i['teacher'])
        file.write(i['introduction'])
        print(i['course'], "\t", i['subject'], "\t", i['teacher'], "\n", i['introduction'], "\n")
    data.clear()  # reset for the next page
file.close()  # blank lines can be removed afterwards with clearBlankLine() below
html.close()  # close the last response
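The second positional argument to find_all is shorthand for a CSS class filter; the class_ keyword form is equivalent and more explicit. A small sketch on a made-up HTML snippet:

# Equivalent, explicit form of the class filter used above (sample HTML is made up):
sample = BeautifulSoup('<h2 class="coursetitle">Data Mining</h2>', 'html.parser')
print(sample.find_all("h2", class_="coursetitle")[0].get_text())  # prints: Data Mining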
def clearBlankLine():
    file1 = open('text1.txt', 'r', encoding='utf-8')  # file whose blank lines should be removed
    file2 = open('text2.txt', 'w', encoding='utf-8')  # resulting file without blank lines
    try:
        for line in file1.readlines():
            if line == '\n':
                line = line.strip("\n")
            if line == '\t':
                line = line.strip("\t")
            file2.write(line)
    finally:
        file1.close()
        file2.close()

if __name__ == '__main__':
    clearBlankLine()
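As written, the input and output names (text1.txt, text2.txt) are hard-coded, so the function cannot clean xuetang.txt directly. A parameterized variant is sketched below; the function and file names are illustrative, not from the original:

def clear_blank_lines(src, dst):  # hypothetical generalized version
    with open(src, 'r', encoding='utf-8') as fin, open(dst, 'w', encoding='utf-8') as fout:
        for line in fin:
            if line.strip():  # keep only lines with visible content
                fout.write(line)

# e.g. clear_blank_lines('xuetang.txt', 'xuetang_clean.txt')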
Pagination: recording the page count
from lxml import etree

f = open('xuetang.html', 'r', encoding='utf-8')
html = f.read()
f.close()
selector = etree.HTML(html)
# link = selector.xpath('//*[@id="list_pager"]/ul/li[12]/a[last()]/@href')
# link = selector.xpath('//*[@id="list_style"]/li[10]/div/div[2]/div/a/h2/text()')  # this one works
link = selector.xpath('//*[@id="list_pager"]/ul/li[12]/a/text()')  # XPath copied directly via right-click "Copy XPath" in the F12 developer tools
for each in link:
    print(each)
# I think this part is actually fine; the problem is with the page itself!!!!
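If the pager text parses as a number, it could replace the hard-coded page bound used in the scraping loop above. A hedged sketch, assuming link holds the last page label as its first text node:

# Hypothetical: derive the loop bound from the pager instead of hard-coding it
if link and link[0].strip().isdigit():
    last_page = int(link[0].strip())
    for num in range(1, last_page + 1):
        print('would fetch page', num)  # fetch and parse each page as in the loop above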