# Scrape university ranking data from zuihaodaxue.com and save it to a CSV file.
# The raw HTML was fetched once and cached locally in best_college.html, so
# repeated runs parse the cached copy instead of re-downloading (faster, and
# kinder to the site).
#
# NOTE(review): the original one-shot fetch code (requests + browser headers)
# was kept as dead code in a triple-quoted string; it has been removed. If
# best_college.html is missing, re-fetch
# http://www.zuihaodaxue.com/zuihaodaxuepaiming2016.html manually.

import csv

from lxml import html


def _parse_ranking(page_source):
    """Parse the cached ranking page.

    Returns (header, rows):
      header: list of column titles taken from the table <thead>.
      rows:   list of 5-element lists, one per university
              (rank, name, province, and two score columns — presumably;
              column meaning comes from the page, confirm against header).
    """
    tree = html.etree.HTML(page_source)

    # Header cells: strip whitespace and drop empty strings.
    # BUG FIX: the original filter was `if i2.strip != ''` — a missing call,
    # comparing the bound method itself to '' (always True), so empty cells
    # were never removed. Now .strip() is actually called.
    raw_header = tree.xpath("//thead/tr/th/text()|//thead/th/text()")
    header = [cell.strip() for cell in raw_header if cell.strip() != '']

    # Body cells: td[1] is the rank, the university name sits inside a <div>,
    # td[3..5] are the remaining columns. XPath returns them flattened.
    cells = tree.xpath(
        "//tbody/tr/td[1]/text()|//tbody/tr/td/div/text()"
        "|//tbody/tr/td[3]/text()|//tbody/tr/td[4]/text()"
        "|//tbody/tr/td[5]/text()"
    )
    # Regroup the flat cell list into rows of five.
    rows = [cells[i:i + 5] for i in range(0, len(cells), 5)]
    return header, rows


def main():
    """Read the cached HTML and append the header and all rows to school_sort.csv."""
    with open('best_college.html', 'r', encoding='utf-8') as f:
        page_source = f.read()

    header, rows = _parse_ranking(page_source)
    print(header)

    # csv.writer handles quoting/escaping correctly; the original built lines
    # with str(list).replace("'",'').replace('[','')... which would corrupt any
    # value containing a comma, quote, or bracket.
    # 'a' (append) mode is kept from the original — note repeated runs will
    # append duplicate data.
    with open('school_sort.csv', 'a', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        for row in rows:
            print(row)
            writer.writerow(row)


if __name__ == '__main__':
    main()