在做一些项目时需要用到全国地名的中文和拼音,因此我选取天气网站进行爬取。
结果以两个列表的形式分别存放拼音和中文地名,有需要的可以自行修改、扩充后使用。
# -*- coding: gbk -*-
import requests
from bs4 import BeautifulSoup
# Root page of the weather-history site whose links are scraped below.
url = "https://lishi.tianqi.com"
def crwal(url, start=77, stop=3431):
    """Fetch *url* and collect the place names and pinyin slugs of its links.

    Every ``<a>`` element of the page is taken in document order and only
    the anchors at positions ``start`` (inclusive) to ``stop`` (exclusive)
    are examined.  The defaults reproduce the window 77..3430 that was
    originally hard-coded -- presumably the span of place links on
    lishi.tianqi.com when the script was written; verify against the live
    page before relying on it.

    An anchor contributes a pair only when both its text and its ``href``
    are longer than one character.  The pinyin slug is the ``href`` with
    ``"/index.html"`` and every ``"/"`` removed.

    Both result lists are printed (pinyin first, then names) and returned.

    Args:
        url: Address of the page to download.
        start: Index of the first anchor to examine.
        stop: Index one past the last anchor to examine.

    Returns:
        A ``(pinyin_list, name_list)`` tuple of index-aligned lists.

    Raises:
        Nothing is caught here: whatever ``requests.get`` raises (for
        example on a timeout) propagates to the caller unchanged.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url=url, headers=headers, timeout=8)
    # Decode the body as UTF-8 regardless of the charset the server declares.
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, "html.parser")

    # Slicing instead of indexing keeps a page with fewer anchors than
    # `stop` from raising IndexError; positions are still counted over
    # *all* anchors so the window means the same as before.
    anchors = soup.find_all('a')[start:stop]

    pinyin_list = []
    name_list = []
    for anchor in anchors:
        name = anchor.string        # None when the anchor has nested markup
        href = anchor.get('href')   # None when the anchor has no href
        if name is None or href is None:
            continue
        if len(name) > 1 and len(href) > 1:
            name_list.append(name)
            pinyin_list.append(
                href.replace("/index.html", "").replace("/", "")
            )

    print(pinyin_list)
    print(name_list)
    return pinyin_list, name_list
# Run the scrape against the site root; the two resulting lists are printed.
crwal(url=url)