参考链接: 从人人网获取全国中学信息(省市县)
主要代码参考并改进了上面博客的原创代码,用于从人人网的 select 弹框表单(form)中抓取全国高校名单。
主要代码块如下
def getProvinceData():
    """Parse ``./cityArray.js`` into a list of provinces with their cities.

    Every line of the file is scanned for ``<digits>:<name>`` pairs. Pairs
    whose id is exactly four characters long are collected as cities. The
    province id of a line is the first id found on it, cut to at most four
    characters. A line is kept only when the integer value of that province
    id is a key of the module-level ``provinceMap``.

    Lines that contain no ``<digits>:<name>`` pair are skipped.

    Returns:
        list[dict]: one ``{"id": ..., "name": ..., "citys": [...]}`` entry
        per accepted line. ``name`` is looked up in ``provinceMap`` and each
        city is ``{"id": ..., "name": unescape(<matched name>)}``.
    """
    # Separates each entry into its numeric id and its (escaped) name.
    pattern = re.compile(r"(\d+):([\w\d\\]+)")
    provinceList = []
    # The context manager closes the file even if parsing fails midway.
    with open("./cityArray.js", encoding='utf-8') as content:
        for line in content:
            data = pattern.findall(line)
            if not data:
                # Nothing matched, so there is no id to derive a province from.
                continue
            # Ids of up to four characters are kept whole; longer ids are
            # reduced to their first four characters.
            province_id = data[0][0][:4]
            # Only handle the provinces listed in provinceMap.
            if int(province_id) not in provinceMap:
                continue
            citys = [
                {"id": code, "name": unescape(raw_name)}
                for code, raw_name in data
                if len(code) == 4  # four-character ids are cities
            ]
            provinceList.append({
                'id': province_id,
                'name': provinceMap[int(province_id)],
                'citys': citys,
            })
    return provinceList
def getTownHtml(town_id, scoolType, timeout=10):
    """Download one page from support.renren.com and return its text.

    Args:
        town_id: value used as the page name (``<town_id>.html``) in the URL.
        scoolType: value used as the URL path segment before the page name.
        timeout: seconds passed to ``requests.get`` so that an unresponsive
            server cannot block the call forever.

    Returns:
        The response body as text, or ``None`` when the request fails (in
        which case an error message is printed).
    """
    url = "http://support.renren.com/{0}/{1}.html".format(scoolType, town_id)
    try:
        # ``headers`` is read from module scope.
        a = requests.get(url, headers=headers, timeout=timeout).text
    except requests.RequestException:
        # Best effort: report the network failure and let the caller see None.
        print("网络错误!")
        return None
    print(a)
    return a
def _serializeAnchor(node):
    """Serialize an lxml element to the HTML text that the regexes run on."""
    return etree.tostring(
        node, encoding='utf-8', pretty_print=True, method="html"
    ).decode(encoding="utf-8")


def getCitySchool(content):
    """Extract the districts of a city page and the schools of each district.

    Every ``<ul>`` of the document is visited in document order:

    * the one whose ``id`` is ``schoolCityQuList`` provides the districts;
      each of its anchors yields ``{"name": ..., "id": ...}``, where the id
      is the text captured after ``'city_qu_`` inside the anchor markup;
    * any other ``<ul>`` with an ``id`` is treated as a school list; each of
      its anchors yields ``{"name": ..., "id": <href value>}``. The list is
      stored under ``"schoolList"`` on every district whose id equals the
      ``<ul>`` id with ``city_qu_`` removed.

    ``<ul>`` elements without an ``id`` are ignored, and school lists that
    appear before the district list has been seen are dropped.

    Args:
        content: HTML text of the page.

    Returns:
        The list of district dicts, or ``None`` when ``content`` is empty or
        the page has no ``schoolCityQuList`` element.
    """
    if not content:
        # Nothing to parse, e.g. the download failed upstream.
        return None
    selector = etree.HTML(content)
    d = {}
    for town1 in selector.xpath('//ul'):
        ids = town1.xpath('./@id')
        if not ids:
            # Without an id this list cannot be matched to any district.
            continue
        name1 = ids[0].strip()

        if name1 == "schoolCityQuList":
            # List of counties/districts.
            d["city"] = []
            for y in town1.xpath('.//a'):
                y1 = _serializeAnchor(y)
                d["city"].append({
                    "name": re.findall('>(.*?)</a>', y1)[0],
                    "id": re.findall("'city_qu_(.*?)'", y1)[0]
                })
            continue

        # List of schools for one district.
        citySchoolData = []
        for town in town1.xpath('.//a'):
            town = _serializeAnchor(town)
            print(town)
            try:
                schoolName = re.findall('>(.*?)</a>', town)[0]
            except IndexError:
                # The closing tag is not on the same line as the text;
                # fall back to everything up to the end of that line.
                schoolName = re.findall('>(.*?)\n', town)[0]
            citySchoolData.append({
                'name': schoolName,
                'id': re.findall('href="(.*?)"', town)[0],
            })

        districtId = name1.replace('city_qu_', '')
        # ``or ()`` covers a school list that shows up before the districts.
        for x in d.get('city') or ():
            if districtId == x.get('id'):
                x['schoolList'] = citySchoolData
    return d.get('city')
def getUnicodeStr(s):
    """Decode ``;``-separated decimal character codes into a string.

    ``s`` is split on ``;``. For each piece the first two characters are
    dropped (the ``&#`` of a reference such as ``&#20013``) and the rest is
    parsed with ``int`` and converted with ``chr``. Pieces that are not a
    number, or whose number is not a valid code point, are skipped.

    Args:
        s: text such as ``"&#20013;&#22269;"``.

    Returns:
        The decoded characters joined in input order; ``""`` if none decode.
    """
    name = []
    for word in s.split(";"):
        try:
            name.append(chr(int(word[2:])))
        except (ValueError, OverflowError):
            # ValueError: not a number, or outside the Unicode range.
            # OverflowError: number too large for chr() to accept.
            continue
    return "".join(name)