自己写的一个爬取蚂蜂窝景点数据的 Python 程序，Mark 一下

最新推荐文章于 2023-05-25 16:14:04 发布

小豪666

最新推荐文章于 2023-05-25 16:14:04 发布

阅读量431

点赞数

分类专栏：爬虫-python 文章标签： python 爬虫蚂蜂窝景点数据

本文链接：https://blog.csdn.net/wofuhao/article/details/78112030

版权

爬虫-python 专栏收录该内容

2 篇文章 0 订阅

订阅专栏

#!/usr/bin/env python
#coding:utf-8

import sys
reload(sys)
sys.setdefaultencoding("utf-8")
import requests
import re
import traceback


def print_fun(entity):
    """Print every key/value pair of *entity*, then a separator line.

    Each pair is written as ``key<TAB>value`` on its own line; a line of
    70 dashes follows so consecutive entities are visually separated.

    Args:
        entity: mapping whose keys and values are formatted with ``%s``.
    """
    for key in entity:
        # A single parenthesised argument parses as a print statement on
        # Python 2 and as a function call on Python 3, with the same output.
        print("%s\t%s" % (key, entity[key]))
    print('-' * 70)
def filter_sum_fun(summary):
    """Return *summary* with every line of 10 characters or fewer removed.

    The text is split on newlines, lines whose length exceeds 10 are kept
    in their original order, and the survivors are re-joined with newlines.
    """
    kept_lines = [line for line in summary.split('\n') if len(line) > 10]
    return '\n'.join(kept_lines)


# --- Patterns applied to the search-result page -------------------------
# All patterns use DOTALL so that "." also matches newlines inside a tag.
# One result fragment: the body of a <div class="ct-text "> element.
p_each = re.compile('<div class="ct-text ">(.*?)</div>', re.DOTALL)
# The first href value inside a fragment.
p_link = re.compile('<a href="(.*?)"', re.DOTALL)
# The contents of the <h3> heading.
p_name = re.compile('<h3>(.*?)</h3>', re.DOTALL)
# The contents of the <ul class="seg-info-list clearfix"> list.
p_region = re.compile('<ul class="seg-info-list clearfix">(.*?)</ul>', re.DOTALL)
# The first run of text sitting between a ">" and the next "<".
p_final = re.compile('>(.*?)<', re.DOTALL)

# --- Patterns applied to a detail page ----------------------------------
# Two summary layouts: a block whose markup ends in summary">, or the
# first <p> following a heading that ends with the text 简介.
p_summary1 = re.compile('summary">(.*?)</div>', re.DOTALL)
p_summary2 = re.compile(u'简介</h3>.*?<p>(.*?)</p>', re.DOTALL)
# Two picture containers ("pic-big" and "pic-r") and the <img> src inside.
p_pic_outer1 = re.compile('class="pic-big">(.*?)</div>', re.DOTALL)
p_pic_outer2 = re.compile('class="pic-r">(.*?)</div>', re.DOTALL)
p_pic_in = re.compile('<img src="(.*?)"', re.DOTALL)



def _fetch_utf8(url, timeout, params=None):
    """Download *url* and return the response body decoded as UTF-8 text.

    The raw bytes are decoded directly instead of round-tripping through
    ``response.text`` and ``response.encoding``: that encoding is only
    requests' guess and may be ``None``, which would make ``.encode()``
    raise. Undecodable bytes are replaced rather than raising.
    """
    response = requests.get(url, params=params, timeout=timeout)
    return response.content.decode('utf8', 'replace')


def get_entity_by_name(tourist_name, timeout=10):
    """Search mafengwo.cn for *tourist_name* and scrape the matching entries.

    Every result fragment on the search page is parsed; results whose type
    is '景点' or '娱乐' have their detail page fetched for a summary and a
    picture URL. Fragments missing the expected markup, and results of any
    other type, are skipped.

    Args:
        tourist_name: search keyword; sent as a URL-encoded query parameter.
        timeout: seconds allowed for each HTTP request.

    Returns:
        A list of dicts with the keys ``name``, ``alias``, ``type``,
        ``region``, ``link``, ``summary`` and ``pic``. If an exception is
        raised part-way (e.g. a network error) its traceback is printed and
        the entities collected so far are returned.
    """
    search_url = u'http://www.mafengwo.cn/search/s.php'
    # Passing the keyword via ``params`` lets requests escape characters
    # such as '&', '#' or spaces that would otherwise corrupt the query.
    query = [
        ('q', tourist_name),
        ('t', 'poi'),
        ('seid', 'A271F36A-F900-49D3-86E1-168806400065'),
    ]
    entityList = []
    try:
        htmlText = _fetch_utf8(search_url, timeout, params=query)
        for fragment in p_each.findall(htmlText):
            link_match = p_link.search(fragment)
            name_match = p_name.search(fragment)
            if link_match is None or name_match is None:
                continue
            heading = name_match.group(1)
            # Only headings that highlight the search keyword are kept.
            if 'sr-keyword' not in heading:
                continue
            link = link_match.group(1)

            name_area = heading.replace('<span class="sr-keyword">', '').replace('</span>', '')
            title_match = p_final.search(name_area)
            if title_match is None:
                continue
            naType = title_match.group(1).replace(' - ', '-')
            if '-' not in naType:
                continue
            # Split on the first hyphen only, so a hyphen inside the name
            # itself no longer raises "too many values to unpack".
            typer, name = naType.split('-', 1)
            # Text after the first space, if any, is treated as the alias.
            name, _, alias = name.partition(' ')

            # Choose the detail-page patterns before downloading, so that
            # unsupported types cost no extra request.
            # NOTE: on Python 2 these comparisons of decoded text against
            # byte-string literals rely on the file's
            # sys.setdefaultencoding("utf-8") call.
            if typer == '景点':
                summary_pattern, pic_pattern = p_summary1, p_pic_outer1
            elif typer == '娱乐':
                summary_pattern, pic_pattern = p_summary2, p_pic_outer2
            else:
                continue

            region = ''
            region_match = p_region.search(fragment)
            if region_match is not None:
                region_text = p_final.search(region_match.group(1).replace('<li>', ''))
                if region_text is not None:
                    region = region_text.group(1).replace(' ', '')

            sub_htmlText = _fetch_utf8(link, timeout)

            summary_area = summary_pattern.search(sub_htmlText)
            if summary_area:
                summary = summary_area.group(1).replace(' ', '').replace('<br/>', '').strip()
            else:
                summary = ''

            pic = ''
            pic_area = pic_pattern.search(sub_htmlText)
            if pic_area is not None:
                img_match = p_pic_in.search(pic_area.group(1))
                if img_match is not None:
                    pic = img_match.group(1)

            entityList.append({
                'name': name,
                'alias': alias,
                'type': typer,
                'region': region,
                'link': link,
                'summary': filter_sum_fun(summary),
                'pic': pic,
            })
    except Exception:
        # Best-effort scraper: report the failure and return what was
        # collected so far rather than crashing the caller.
        traceback.print_exc()
    return entityList

if __name__ == "__main__":
    # Ask for a place name, run the search, and dump every entity found.
    keyword = raw_input('please enter the tourist_name:')
    for found in get_entity_by_name(keyword):
        print_fun(found)

小豪666

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
自己写的一个爬取蚂蜂窝景点数据的 Python 程序，Mark 一下

#!/usr/bin python#coding:utf-8import sysreload(sys)sys.setdefaultencoding("utf-8")import requestsimport reimport tracebackdef print_fun(entity): for k in entity: print "%s\t%s"%(k,en
复制链接

扫一扫