python爬虫基础 使用urllib库抓取高德接口边界信息

使用的是python3

需要先安装依赖:pip install pymysql(代码中导入的是 pymysql)

#爬取逻辑

1.根据name查询匹配建筑物

2.通过接口返回的建筑物信息获取建筑物ID

3.用建筑物ID查询边界信息

4.接口返回的是json字符串,用json.loads解析成字典后再处理

5.从返回结果中取出边界信息(data → spec → mining_shape → shape)

6.将边界信息写入数据表new_shopping的border字段

#coding=utf-8
import urllib.request
from urllib.parse import quote
import time
import json
import pymysql
import random
import string

# Shared MySQL connection used by get_data() and update_data().
# NOTE(review): host and credentials are hard-coded; adjust them for your
# environment before running.
COON = pymysql.connect(
    host='127.0.0.1',
    port=3306,
    user='root',
    passwd='root',
    db='navi_scrapy',
    charset='utf8')

# Not referenced anywhere in the visible part of this file.
pagestart = 1

# The bare string below holds the author's setup notes (kept verbatim):
#   1. add a `border` column to the data table
#   2. change the database connection settings
#   3. adjust the corresponding name and city fields
#   4. start run_gaode_border.py
'''
1.数据表增加border字段
2.修改数据库链接
3.修改对应中的name和city字段
4.启动run_gaode_border.py

'''


# Base URL of the keyword search request; hello() appends the `keywords` and
# `city` query parameters to it.
# NOTE(review): the API key is embedded in this URL.
start_url = 'https://restapi.amap.com/v3/place/text?key=4b86820a7590de60e4f81f53e59ae17f&citylimit=true&output=json&'    # start URL

# Prefix of the detail request; parse() appends the place id to it.
url = "https://ditu.amap.com/detail/get/detail?id="



def hello():
    """Run the crawl: search every pending row and hand the result to parse().

    Rows come from get_data(). For each row a keyword search request is built
    from ``start_url`` and sent after a 0.8 s pause; the open response is
    passed to parse() together with the row id and name.

    A failed request is reported and skipped so that one bad row does not
    abort the remaining ones.
    """
    citys = get_data()  # rows that still lack a border
    print(citys)
    if not citys:
        # get_data() can yield None (query failed) or no rows; nothing to do.
        return

    for city in citys:
        # NOTE(review): assumes column 0 is the row id, column 2 the place
        # name and column 3 the city (SELECT * order) - confirm against the
        # table schema.
        raw_url = f"{start_url}keywords={city[2]}&city={city[3]}"
        # Percent-encode only the non-ASCII characters of the assembled URL.
        tempurl = quote(raw_url, safe=string.printable)
        request = urllib.request.Request(
            url=tempurl, headers=get_header(), method='GET')
        time.sleep(0.8)  # throttle between requests
        try:
            # The context manager closes the HTTP response once parsed.
            with urllib.request.urlopen(request) as response:
                parse(response, {"id": city[0], "name": city[2]})
        except OSError as e:
            # urllib's URLError/HTTPError are OSError subclasses.
            print("请求失败~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~", tempurl, repr(e))


def parse(response, meta):
    """Read a keyword-search response and request the detail of its first POI.

    Args:
        response: file-like object whose ``read()`` returns the UTF-8 encoded
            JSON body of the search request.
        meta: dict carrying at least ``"id"`` (row id forwarded to info()).

    The first entry of ``pois`` is taken as the match. When that entry has a
    non-empty ``parent``, the parent id is used for the detail request,
    otherwise the entry's own ``id``. The detail response is passed to info().

    Any error is reported with print() and swallowed (best-effort crawl);
    the function always returns None.
    """
    try:
        data = json.loads(response.read().decode("utf8"))
        print(data)
        if data["status"] != "1":
            print("接口返回异常~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~·")
            return

        pois = data.get("pois")
        if not pois:
            # Nothing matched the keywords; report instead of indexing [0].
            print("查询失败~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~·" + "pois为空")
            return

        poi = pois[0]  # the first hit is usually the one searched for
        print(poi)
        # Truthiness covers the empty list the original compared against.
        # NOTE(review): presumably the API may also send "" for "no parent";
        # that case is now treated as empty too - confirm against the API docs.
        parent = poi.get("parent")
        if parent:
            print("查询parent ID~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
            poi_id = parent
        else:
            poi_id = poi["id"]

        gaode_url = url + str(poi_id)
        print(gaode_url)
        request = urllib.request.Request(
            url=gaode_url, headers=get_header(), method='GET')
        time.sleep(0.8)  # throttle between requests
        # Close the detail response as soon as info() has consumed it.
        with urllib.request.urlopen(request) as detail_response:
            info(detail_response, {"id": meta["id"], "url": gaode_url})
    except Exception as e:
        # str + Exception raises TypeError, so format the exception explicitly.
        print("查询失败~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~·" + repr(e))


def info(response, meta):
    """Extract the border shape from a detail response and store it.

    Args:
        response: file-like object whose ``read()`` returns the UTF-8 encoded
            JSON body of the detail request.
        meta: dict with ``"id"`` (row to update) and ``"url"`` (used only in
            the error message).

    When ``status`` is ``"1"`` the value at
    ``data -> spec -> mining_shape -> shape`` is passed to update_data().
    Any error (bad JSON, missing key, ...) is reported with print() and
    swallowed; the function always returns None.
    """
    try:
        data = json.loads(response.read().decode("utf8"))
        print(data)
        if data["status"] == "1":
            spec = data["data"]["spec"]
            border = spec["mining_shape"]["shape"]
            print("border :~~~~~~~~~~~~~~~~~~~", border)
            update_data(meta["id"], border)
    except Exception as e:
        # str + Exception raises TypeError, so format the exception explicitly;
        # .get() keeps the handler itself from failing on a missing key.
        print("查询错误~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + str(meta.get("url"))
              + " error: " + repr(e))

def get_data():
    """Fetch every row of ``new_shopping`` whose ``border`` is still NULL.

    Returns:
        The rows as returned by ``cursor.fetchall()``. If the query fails the
        error is printed and an empty tuple is returned, so callers can always
        iterate over the result.
    """
    sql = "SELECT * FROM new_shopping WHERE border is null"
    try:
        # The cursor is closed automatically when the with-block exits.
        with COON.cursor() as cursor:
            cursor.execute(sql)
            return cursor.fetchall()
    except Exception as e:
        print(e)
        return ()

def update_data(id, border):
    """Store ``border`` in the ``border`` column of the row with the given id.

    Args:
        id: primary-key value of the ``new_shopping`` row to update.
        border: border value to store; it is converted with ``str()``.

    The statement is parameterized, so quotes inside ``border`` can neither
    break nor inject SQL, and the value is stored without the stray trailing
    space the old string-built statement appended. The change is committed
    explicitly because the connection is opened without autocommit.
    Errors are printed and swallowed.
    """
    sql = "UPDATE new_shopping SET border=%s WHERE id=%s"
    try:
        # The cursor is closed automatically when the with-block exits.
        with COON.cursor() as cursor:
            cursor.execute(sql, (str(border), id))
        COON.commit()
    except Exception as e:
        print(e)

def get_header():
    """Build request headers with a randomly chosen User-Agent.

    Returns:
        dict with fixed ``Connection``, ``Accept`` and ``Accept-Language``
        values and a ``User-Agent`` drawn at random from the list below.
    """
    connection_options = ('Keep-Alive', 'close')
    accept_options = (
        'text/html, application/xhtml+xml, */*',
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    )
    language_options = (
        'zh-CN,fr-FR;q=0.5',
        'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
    )
    user_agents = (
        'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
        'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E)',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E; QQBrowser/7.3.9825.400)',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12',
        'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0 ',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 Chrome/26.0.1410.43 Safari/537.1 ',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.92 Safari/537.1 LBBROWSER',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11',
        'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
        'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
    )

    # Only the User-Agent varies between calls; the other headers are fixed
    # picks from their option tuples.
    headers = {}
    headers['Connection'] = connection_options[0]
    headers['Accept'] = accept_options[0]
    headers['Accept-Language'] = language_options[1]
    headers['User-Agent'] = random.choice(user_agents)
    return headers

# Entry point: the crawl starts as soon as this file is executed.
hello()

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值