# Just for practice (纯属练手).
import requests
import re
from requests.exceptions import RequestException
def get_page(url, timeout=10):
    """Download ``url`` and return its body decoded as UTF-8.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response text when the server answers with HTTP 200, or ``None``
        for any other status code or when the request raises
        ``RequestException`` (timeouts included).
    """
    try:
        # Without an explicit timeout a stalled server would block forever.
        res = requests.get(url, timeout=timeout)
    except RequestException:
        return None
    if res.status_code != 200:
        return None
    # Decode as UTF-8 regardless of the charset the response advertises.
    res.encoding = 'utf-8'
    return res.text
def parse_page(html):
    """Extract the entry title and its code value from a page's HTML.

    Args:
        html: Page source as a string. ``None`` or an empty string (for
            example after a failed download) is treated as "no match".

    Returns:
        A dict with two keys: ``'Rname'`` holds the text of the ``<h1 >``
        heading and ``'Code'`` holds the text captured on the line after the
        ``basicInfo-item value">`` marker that follows the code label. Both
        values are empty strings when nothing matches.
    """
    pattern = re.compile('<h1 >(.*?)</h1>.*?(区码代码|行政代码).*?basicInfo-item value">\n(.*?)(<sup>|\n).*?</dd>', re.S)
    # get_page() yields None on failure; findall(None) would raise TypeError.
    matches = pattern.findall(html) if html else []
    # The raw matches are echoed as part of the script's console output.
    print(matches)
    if not matches:
        return {'Code': '', 'Rname': ''}
    # Groups: heading text, matched label, code value, terminator after code.
    title, _label, code, _terminator = matches[0]
    return {'Code': code, 'Rname': title}
def main():
    """Fetch and parse the page for each hard-coded name, then print the results.

    Each name is appended to the item URL, the page is downloaded with
    ``get_page`` and parsed with ``parse_page``. The queried name is stored
    in the resulting dict under ``'Qname'``, and the list of all dicts is
    printed once at the end.
    """
    base_url = 'http://baike.baidu.com/item/'
    queries = ('龙门县', '遂溪县', '广州市', '从化市', '万山海洋开发试验区', 'aaa')
    records = []
    for query in queries:
        record = parse_page(get_page(base_url + str(query)))
        record.update({'Qname': query})
        records.append(record)
    print(records)
# Run the lookup only when executed as a script, not when imported.
if "__main__" == __name__:
    main()