仅作为操作记录,大佬请跳过。
博主用python爬虫,找院校信息。
参考大佬博主文章
————————————————————
三个代码都可以直接运行。
代码
网站:https://yz.chsi.com.cn/sch/search.do?start=0
直接上代码
from urllib.request import urlopen
# import pymysql
from urllib.error import HTTPError,URLError
from bs4 import BeautifulSoup
import re
#爬取院校信息方法
def findSchoolInfo(url):
    """Fetch *url* and return every <table class="ch-table"> element on the page.

    Returns None when the request fails or the document cannot be parsed.
    """
    try:
        html = urlopen(url)
    except (HTTPError, URLError):
        # The original caught only HTTPError even though URLError was imported;
        # a DNS failure or refused connection would have crashed the script.
        return None
    try:
        soup = BeautifulSoup(html.read(), 'lxml')
        tables = soup.findAll("table", {"class": "ch-table"})
    except AttributeError:
        return None
    return tables
#处理信息为需要的信息
def handleSchoolInfo(info):
    """Extract name / province / affiliation from each table row in *info*.

    info: list of bs4 <table> Tags as returned by findSchoolInfo, or None.
    The extracted strings are only bound to locals (per the original design).
    """
    if not info:
        # Covers both a failed fetch (None) and a page with no matching tables;
        # the original '== None' test silently skipped the empty-list case.
        print("没有院校信息")
    else:
        school_list = []
        for table in info:
            # 'rows' instead of 'list' — the original shadowed the builtin.
            rows = table.findAll("tr")
            for row in rows:
                cells = row.findAll("td")
                if cells:
                    school_list.append(cells[0:3])
        for item in school_list:
            school_name = item[0].get_text().strip()
            school_shengfen = item[1].get_text()
            shcool_belong = item[2].get_text()
# Script entry: crawl the first results page (start=0) and process it.
shcoolInfo = findSchoolInfo("https://yz.chsi.com.cn/sch/search.do?start=0")
handleSchoolInfo(shcoolInfo)
print("爬取完成")
可直接运行,数据储存在 school_name、school_shengfen、shcool_belong 这三个变量里。
————————————————————————————
代码v2.0
博主想把数据写入 txt 文件,加入以下代码即可。
源代码
# from urllib.request import urlopen
# from bs4 import BeautifulSoup
#
# html = urlopen("https://yz.chsi.com.cn/sch/?start=0")#括号内的是需要爬取的网址地址
# bsObj = BeautifulSoup(html.read())
#
# print(bsObj.title)
# from urllib.request import urlopen
#
# def getHtml(url):
# page = urlopen(url)
# html = page.read()
# return html
#
# html = getHtml("https://yz.chsi.com.cn/sch/?start=0")
#
# print(html.title)
#爬取院校信息方法
from urllib.request import urlopen
# import pymysql
from urllib.error import HTTPError,URLError
from bs4 import BeautifulSoup
import re
#爬取院校信息方法
def findSchoolInfo(url):
    """Fetch *url* and return every <table class="ch-table"> element on the page.

    Returns None when the request fails or the document cannot be parsed.
    """
    try:
        html = urlopen(url)
    except (HTTPError, URLError):
        # The original caught only HTTPError even though URLError was imported;
        # a DNS failure or refused connection would have crashed the script.
        return None
    try:
        soup = BeautifulSoup(html.read(), 'lxml')
        tables = soup.findAll("table", {"class": "ch-table"})
    except AttributeError:
        return None
    return tables
#处理信息为需要的信息
def handleSchoolInfo(info):
    """Extract name / province / affiliation from each table row and write them out.

    info: list of bs4 <table> Tags as returned by findSchoolInfo, or None.
    Each school becomes one line in the output file via writefile/writefileandenter.
    """
    if not info:
        # Covers both a failed fetch (None) and a page with no matching tables;
        # the original '== None' test silently skipped the empty-list case.
        print("没有院校信息")
    else:
        school_list = []
        for table in info:
            # 'rows' instead of 'list' — the original shadowed the builtin.
            rows = table.findAll("tr")
            for row in rows:
                cells = row.findAll("td")
                if cells:
                    school_list.append(cells[0:3])
        for item in school_list:
            # One statement per line instead of semicolon-joined pairs.
            school_name = item[0].get_text().strip()
            writefile(school_name)
            school_shengfen = item[1].get_text()
            writefile(school_shengfen)
            shcool_belong = item[2].get_text()
            writefileandenter(shcool_belong)
def writefile(content):
    """Append *content* plus a trailing space to the output file."""
    # 'with' guarantees the handle is closed even if write() raises;
    # the original open()/close() pair could leak the handle on error.
    with open(r'E:\b1reptile20201114pro院校信息.txt', 'a', encoding='utf8') as f:
        f.write(content + ' ')
def writefileandenter(content):
    """Append *content* plus a newline to the output file (ends one record)."""
    # 'with' guarantees the handle is closed even if write() raises;
    # the original open()/close() pair could leak the handle on error.
    with open(r'E:\b1reptile20201114pro院校信息.txt', 'a', encoding='utf8') as f:
        f.write(content + '\n')
# Script entry: crawl the first results page (start=0) and write results to txt.
shcoolInfo = findSchoolInfo("https://yz.chsi.com.cn/sch/search.do?start=0")
handleSchoolInfo(shcoolInfo)
print("爬取完成")
——————————————————————————————
代码v3.0
改进之处,将原txt清空,再写。
增加代码
源代码
# from urllib.request import urlopen
# from bs4 import BeautifulSoup
#
# html = urlopen("https://yz.chsi.com.cn/sch/?start=0")#括号内的是需要爬取的网址地址
# bsObj = BeautifulSoup(html.read())
#
# print(bsObj.title)
# from urllib.request import urlopen
#
# def getHtml(url):
# page = urlopen(url)
# html = page.read()
# return html
#
# html = getHtml("https://yz.chsi.com.cn/sch/?start=0")
#
# print(html.title)
#爬取院校信息方法
from urllib.request import urlopen
# import pymysql
from urllib.error import HTTPError,URLError
from bs4 import BeautifulSoup
import re
#爬取院校信息方法
def findSchoolInfo(url):
    """Fetch *url* and return every <table class="ch-table"> element on the page.

    Returns None when the request fails or the document cannot be parsed.
    """
    try:
        html = urlopen(url)
    except (HTTPError, URLError):
        # The original caught only HTTPError even though URLError was imported;
        # a DNS failure or refused connection would have crashed the script.
        return None
    try:
        soup = BeautifulSoup(html.read(), 'lxml')
        tables = soup.findAll("table", {"class": "ch-table"})
    except AttributeError:
        return None
    return tables
#处理信息为需要的信息
def handleSchoolInfo(info):
    """Extract name / province / affiliation from each table row and write them out.

    info: list of bs4 <table> Tags as returned by findSchoolInfo, or None.
    Each school becomes one line in the output file via writefile/writefileandenter.
    """
    if not info:
        # Covers both a failed fetch (None) and a page with no matching tables;
        # the original '== None' test silently skipped the empty-list case.
        print("没有院校信息")
    else:
        school_list = []
        for table in info:
            # 'rows' instead of 'list' — the original shadowed the builtin.
            rows = table.findAll("tr")
            for row in rows:
                cells = row.findAll("td")
                if cells:
                    school_list.append(cells[0:3])
        for item in school_list:
            # One statement per line instead of semicolon-joined pairs.
            school_name = item[0].get_text().strip()
            writefile(school_name)
            school_shengfen = item[1].get_text()
            writefile(school_shengfen)
            shcool_belong = item[2].get_text()
            writefileandenter(shcool_belong)
def writefileinit():
    """Truncate (or create) the output file so each run starts from empty."""
    # Opening in 'w' mode truncates; 'with' closes the handle deterministically.
    with open(r'E:\b1reptile20201114pro院校信息proandgo.txt', 'w', encoding='utf8'):
        pass
def writefile(content):
    """Append *content* plus a trailing space to the output file."""
    # 'with' guarantees the handle is closed even if write() raises;
    # the original open()/close() pair could leak the handle on error.
    with open(r'E:\b1reptile20201114pro院校信息proandgo.txt', 'a', encoding='utf8') as f:
        f.write(content + ' ')
def writefileandenter(content):
    """Append *content* plus a newline to the output file (ends one record)."""
    # 'with' guarantees the handle is closed even if write() raises;
    # the original open()/close() pair could leak the handle on error.
    with open(r'E:\b1reptile20201114pro院校信息proandgo.txt', 'a', encoding='utf8') as f:
        f.write(content + '\n')
# Script entry: fetch first, then truncate the output file, then write results.
shcoolInfo = findSchoolInfo("https://yz.chsi.com.cn/sch/search.do?start=0")
writefileinit()
handleSchoolInfo(shcoolInfo)
print("爬取完成")