#!/usr/bin/env python
#coding:utf-8
import sys
reload(sys)
sys.setdefaultencoding("utf-8")
import requests
import re
import traceback
def print_fun(entity):
    """Print every key/value pair of *entity*, then a separator line.

    Each pair is written as ``key<TAB>value`` on its own line, in the
    mapping's own iteration order, followed by a line of 70 dashes.

    NOTE(review): the original indentation was lost; the separator is
    assumed to be printed once after the loop, not once per key.
    """
    for key in entity:
        # A single parenthesised argument behaves the same whether ``print``
        # is the Python 2 statement or the Python 3 function.
        print("%s\t%s" % (key, entity[key]))
    print('-' * 70)
def filter_sum_fun(summary, min_len=10):
    """Drop the short lines of *summary* and return what is left.

    The text is split on ``'\\n'``; only lines strictly longer than
    *min_len* characters are kept, and the survivors are re-joined with
    ``'\\n'``.

    Args:
        summary: The text to filter.
        min_len: Lines whose length is not greater than this are removed.
            Defaults to 10, the threshold that used to be hard-coded.

    Returns:
        The filtered text (an empty string when no line is long enough).
    """
    kept_lines = [line for line in summary.split('\n') if len(line) > min_len]
    return '\n'.join(kept_lines)
# Pre-compiled patterns used by the scraper. Every pattern is compiled with
# re.DOTALL so that '.' also matches newlines inside the captured markup.

# Search-result page: one captured fragment per '<div class="ct-text ">'.
p_each = re.compile('<div class="ct-text ">(.*?)</div>', re.DOTALL)
# First href attribute value inside a fragment.
p_link = re.compile('<a href="(.*?)"', re.DOTALL)
# Contents of the <h3> heading inside a fragment.
p_name = re.compile('<h3>(.*?)</h3>', re.DOTALL)
# Contents of the '<ul class="seg-info-list clearfix">' list.
p_region = re.compile(
    '<ul class="seg-info-list clearfix">(.*?)</ul>', re.DOTALL
)
# Text between the first '>' and the next '<'.
p_final = re.compile('>(.*?)<', re.DOTALL)

# Detail page: summary text, in two alternative page layouts.
p_summary1 = re.compile('summary">(.*?)</div>', re.DOTALL)
p_summary2 = re.compile(u'简介</h3>.*?<p>(.*?)</p>', re.DOTALL)
# Detail page: container holding the picture, in two alternative layouts.
p_pic_outer1 = re.compile('class="pic-big">(.*?)</div>', re.DOTALL)
p_pic_outer2 = re.compile('class="pic-r">(.*?)</div>', re.DOTALL)
# First <img> src attribute value inside a picture container.
p_pic_in = re.compile('<img src="(.*?)"', re.DOTALL)
def _fetch_utf8(url, timeout, params=None):
    """GET *url* and return the response body decoded as UTF-8.

    The raw bytes are decoded directly instead of round-tripping through
    ``response.text`` and ``response.encoding``: ``encoding`` can be ``None``,
    which would make ``text.encode(encoding)`` fail. Undecodable bytes are
    replaced rather than raising.
    """
    response = requests.get(url, params=params, timeout=timeout)
    return response.content.decode('utf8', 'replace')


def _parse_search_hit(fragment):
    """Parse one search-result fragment into a partial entity dict.

    Returns a dict with ``name``, ``alias``, ``type``, ``region`` and
    ``link``, or ``None`` when the fragment is not a keyword hit or lacks
    the markup the patterns expect.
    """
    link_match = p_link.search(fragment)
    heading_match = p_name.search(fragment)
    region_match = p_region.search(fragment)
    if not (link_match and heading_match and region_match):
        return None

    heading = heading_match.group(1)
    # Only headings in which the search keyword is highlighted are kept.
    if 'sr-keyword' not in heading:
        return None

    name_area = heading.replace('<span class="sr-keyword">', '').replace('</span>', '')
    region_area = region_match.group(1).replace('<li>', '')
    title_match = p_final.search(name_area)
    region_text_match = p_final.search(region_area)
    if not (title_match and region_text_match):
        return None

    # The title reads "<type> - <name>"; split only once so a hyphen inside
    # the name does not break the unpacking.
    type_and_name = title_match.group(1).replace(' - ', '-')
    if '-' not in type_and_name:
        return None
    typer, name = type_and_name.split('-', 1)

    # Anything after the first space in the name is treated as an alias.
    if ' ' in name:
        name, alias = name.split(' ', 1)
    else:
        alias = ""

    return {
        'name': name,
        'alias': alias,
        'type': typer,
        'region': region_text_match.group(1).replace(' ', ''),
        'link': link_match.group(1),
    }


def _detail_patterns(typer):
    """Return the (summary, picture-container) patterns for *typer*, or None."""
    # Unicode literals: the page text is unicode, so the comparison must not
    # depend on the interpreter's default encoding.
    if typer == u'景点':
        return p_summary1, p_pic_outer1
    if typer == u'娱乐':
        return p_summary2, p_pic_outer2
    return None


def _build_entity(fragment, timeout):
    """Turn one search-result fragment into a complete entity dict.

    Returns ``None`` for fragments that are skipped (not a keyword hit,
    unexpected markup, or a type other than the two supported ones).
    """
    entity = _parse_search_hit(fragment)
    if entity is None:
        return None

    patterns = _detail_patterns(entity['type'])
    if patterns is None:
        # Check the type before fetching so unsupported entries cost no request.
        return None
    summary_pattern, pic_outer_pattern = patterns

    detail_text = _fetch_utf8(entity['link'], timeout)

    summary_match = summary_pattern.search(detail_text)
    if summary_match:
        summary = summary_match.group(1).replace(' ', '').replace('<br/>', '').strip()
    else:
        summary = ''

    pic = ''
    pic_outer_match = pic_outer_pattern.search(detail_text)
    if pic_outer_match:
        # The container may exist without an <img> inside it.
        img_match = p_pic_in.search(pic_outer_match.group(1))
        if img_match:
            pic = img_match.group(1)

    entity['summary'] = filter_sum_fun(summary)
    entity['pic'] = pic
    return entity


def get_entity_by_name(tourist_name, timeout=10):
    """Search mafengwo.cn for *tourist_name* and scrape the matching POIs.

    For each search hit whose heading highlights the keyword and whose type
    is one of the two supported ones, the linked detail page is fetched to
    extract a summary and a picture URL.

    Args:
        tourist_name: The search keyword.
        timeout: Timeout in seconds applied to every HTTP request.

    Returns:
        A list of dicts with the keys ``name``, ``alias``, ``type``,
        ``region``, ``link``, ``summary`` and ``pic``. Errors are reported
        with a traceback on stderr and never propagate: a failed search
        yields an empty list, a failed entry is skipped.
    """
    search_url = 'http://www.mafengwo.cn/search/s.php'
    # Passing the query through ``params`` lets requests URL-encode it.
    query = {
        'q': tourist_name,
        't': 'poi',
        'seid': 'A271F36A-F900-49D3-86E1-168806400065',
    }

    try:
        search_text = _fetch_utf8(search_url, timeout, params=query)
    except Exception:
        traceback.print_exc()
        return []

    entity_list = []
    for fragment in p_each.findall(search_text):
        # Best-effort per entry: one bad result must not discard the rest.
        try:
            entity = _build_entity(fragment, timeout)
        except Exception:
            traceback.print_exc()
            continue
        if entity is not None:
            entity_list.append(entity)
    return entity_list
# Script entry point: prompt for a keyword, scrape, and print every result.
if __name__ == "__main__":
    # Strip stray whitespace so it does not end up in the search query.
    name = raw_input('please enter the tourist_name:').strip()
    # An empty keyword cannot match anything useful; skip the network round trip.
    if name:
        for found_entity in get_entity_by_name(name):
            print_fun(found_entity)
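# A small Python program I wrote to scrape scenic-spot (POI) data from Mafengwo (蚂蜂窝); noted here for reference.
# (Blog page residue: "latest recommended article published on 2024-05-03 11:07:56".)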