Web Crawler - Baidu Map - Nationwide Address Data - MySQL Database

This post is for learning and reference only.

1. The data is first written to a text file and then loaded into a MySQL database.

2. Do not crawl too fast; requests tend to fail at high rates (see the throttling sketch right after these notes).
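
Both scripts below keep the rate down by sleeping for a random fraction of a second between requests. The helper here is a minimal, standalone sketch of that same idea with a bounded retry; the function name polite_get and its parameters are illustrative and are not part of the original scripts.

import random
import time

import requests


def polite_get(url, params=None, max_retries=3, base_delay=1.0):
    """Fetch a URL politely: random pause before each request, back off on failure (illustrative helper)."""
    for attempt in range(max_retries):
        time.sleep(random.random())  # random pause so the crawl stays slow
        try:
            r = requests.get(url, params=params, timeout=10)
            r.raise_for_status()
            return r.json()
        except Exception as e:
            # Wait a little longer after each failed attempt before retrying.
            print('request failed:', e)
            time.sleep(base_delay * (attempt + 1))
    return None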

The code is as follows.

First, crawl the city-level data and write all of it to a file.

import json, time
import random
import requests

six_cities_list = ['北京市', '上海市', '重庆市', '天津市', '香港特别行政区', '澳门特别行政区']

province_list = ['河北省', '山西省', '辽宁省', '吉林省', '黑龙江省', '江苏省', '浙江省', '安徽省', '福建省', '江西省',
                 '山东省', '河南省','湖北省', '湖南省', '广东省', '海南省', '四川省', '贵州省', '云南省', '陕西省', '甘肃省',
                 '青海省', '台湾省', '内蒙古自治区', '广西壮族自治区', '西藏自治区', '宁夏回族自治区', '新疆维吾尔自治区']

def getjson(loc, page_num=0):
    headers = {
        'user-agent': ('Mozilla/5.0 (Windows; U; Windows NT 10.0; en-US; rv:1.9.1.6) '
                       'Gecko Chrome/63.0.3239.132')
    }
    pa = {
        'query': '公园',
        # 'tag': '',
        'region': loc,
        'scope': '2',
        'page_size': 20,
        'page_num': page_num,
        'output': 'json',
        'ak': '自己的API'  # fill in your own Baidu Map AK here
    }
    try:
        # Let requests build the query string from pa instead of formatting the URL by hand.
        r = requests.get('http://api.map.baidu.com/place/v2/search',
                         params=pa, headers=headers, timeout=10)
        decodejson = json.loads(r.text)
        return decodejson
    except Exception as e:
        print('over-requests! Error:', e)
        time.sleep(1)
        return getjson(loc, page_num)  # retry the same page; the original forgot to return here


def six_city():
    # The municipalities and SARs come back directly from a nationwide search.
    decodejson = getjson('全国')
    for eachprovince in decodejson['results']:
        try:
            city = eachprovince['name']
            num = eachprovince['num']
            if city in six_cities_list:
                output = '\t'.join([city, str(num)]) + '\n'
                with open('cities.txt', 'a+', encoding='UTF-8') as f:
                    f.write(output)  # the with-block closes the file automatically
        except Exception as e:
            print('over_cities! Error:', e)

def else_city():
    # For every remaining province, record each city and its result count.
    for eachprovince in province_list:
        decodejson = getjson(eachprovince)
        try:
            for eachcity in decodejson['results']:
                try:
                    city = eachcity['name']
                    num = eachcity['num']
                    output = '\t'.join([city, str(num)]) + '\n'
                    with open('cities.txt', 'a+', encoding='UTF-8') as f:
                        f.write(output)
                except Exception:
                    continue
        except Exception as e:
            print('over-eachprovince! Error:', e)
        finally:
            time.sleep(random.random())  # random pause between provinces to stay slow

if __name__ == '__main__':
    print('正在爬取全国各地"公园"分布数目并存入cities.txt.')
    six_city()
    else_city()

Next, use the city list collected above to fetch the records we actually want in each city (in this script the query is the 'CoCo' chain under the '美食' tag).
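
The script writes into a baidu_map.city table that the post itself never defines. The sketch below creates a schema whose column names match the INSERT statement used later; the column types and lengths are my assumptions and may need adjusting.

import pymysql

# Assumed schema: column names follow the INSERT in Insert_mysql();
# types and lengths are guesses, adjust as needed.
conn = pymysql.connect(host='localhost', port=3306, user='root',
                       password='113754', charset='utf8mb4')
cur = conn.cursor()
cur.execute('CREATE DATABASE IF NOT EXISTS baidu_map DEFAULT CHARACTER SET utf8mb4')
cur.execute('''
    CREATE TABLE IF NOT EXISTS baidu_map.city (
        id INT AUTO_INCREMENT PRIMARY KEY,
        city VARCHAR(64),
        park VARCHAR(255),
        location_lat DOUBLE,
        location_lng DOUBLE,
        address VARCHAR(255),
        street_id VARCHAR(64),
        uid VARCHAR(64)
    ) DEFAULT CHARACTER SET utf8mb4
''')
conn.commit()
cur.close()
conn.close()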

import json, time
import random
import pymysql
import requests
city_list = list()

config = {
          'host':'localhost',
          'port':3306,
          'user':'root',
          'password':'113754',
          'db':'baidu_map',
          'charset':'utf8mb4',
          'cursorclass':pymysql.cursors.DictCursor,
}

conn = pymysql.connect(**config)
cur = conn.cursor()

def Word():
    # Load the city names written by the first script (one "city<TAB>count" per line).
    with open('cities.txt', 'r', encoding='UTF-8') as txt_file:
        for eachLine in txt_file:
            if eachLine != '' and eachLine != '\n':
                fields = eachLine.split('\t')
                city = fields[0]
                city_list.append(city)

def getjson(loc, page_num=0):
    headers = {
        'user-agent': ('Mozilla/5.0 (Windows; U; Windows NT 10.0; en-US; rv:1.9.1.6) '
                       'Gecko Chrome/63.0.3239.132')
    }
    pa = {
        'query': 'CoCo',
        'tag': '美食',
        'region': loc,
        'scope': '2',
        'page_size': 20,
        'page_num': page_num,
        'output': 'json',
        'ak': 'yX91zbGwxNxaGWMwo3LPx3MWovVCScHj'  # replace with your own AK
    }
    try:
        r = requests.get('http://api.map.baidu.com/place/v2/search',
                         params=pa, headers=headers, timeout=10)
        decodejson = json.loads(r.text)
        return decodejson
    except Exception as e:
        print('over-requests! Error:', e)
        time.sleep(1)
        return getjson(loc, page_num)  # retry; the original forgot to return the result
def Insert_mysql():
    for eachcity in city_list:
        not_last_page = True
        page_num = 0
        while not_last_page:
            decodejson = getjson(eachcity, page_num)
            time.sleep(random.random())
            print(eachcity, page_num)
            try:
                if decodejson['results']:
                    for eachone in decodejson['results']:
                        # Missing fields simply become NULL in the database.
                        park = eachone.get('name')
                        location_lat = eachone.get('location', {}).get('lat')
                        location_lng = eachone.get('location', {}).get('lng')
                        address = eachone.get('address')
                        street_id = eachone.get('street_id')
                        uid = eachone.get('uid')
                        sql = '''INSERT INTO baidu_map.city
                        (city, park, location_lat, location_lng, address, street_id, uid)
                        VALUES (%s, %s, %s, %s, %s, %s, %s)'''
                        cur.execute(sql, (eachcity, park, location_lat, location_lng,
                                          address, street_id, uid))
                    conn.commit()
                    page_num += 1
                else:
                    # An empty results list means the last page has been passed;
                    # without this branch the original loop never terminated.
                    not_last_page = False
            except Exception as e:
                print('Error:', e)
                not_last_page = False


if __name__ == '__main__':
    Word()
    Insert_mysql()
    cur.close()
    conn.close()
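
After the run, one quick sanity check is to count how many rows were inserted per city. This is a minimal sketch reusing the same connection settings; the query itself is not part of the original post.

import pymysql

conn = pymysql.connect(host='localhost', port=3306, user='root',
                       password='113754', db='baidu_map', charset='utf8mb4')
cur = conn.cursor()
cur.execute('SELECT city, COUNT(*) FROM city GROUP BY city ORDER BY COUNT(*) DESC')
for city, cnt in cur.fetchall():
    print(city, cnt)
cur.close()
conn.close()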

Screenshots of the run (not reproduced here).
