import requests
import json
import time
import pandas as pd
import hashlib
import random
sl = set() # MD5 digests of request URLs already fetched, for de-duplication in getjson()
count = 0 # toggle (0/1) selecting which API key getKeys() hands out next
keys = ['227b360433411dd173a81ab636a89543','227b360433411dd173a81ab636a89543'] # NOTE(review): both entries are identical — the second slot was presumably meant to hold a different API key; confirm
def getTypes():
    """Load Amap POI type codes from the local spreadsheet.

    Reads the second column of the 'POI分类与编码(中英文)' sheet and returns
    each code as a 6-character, zero-left-padded string.
    """
    sheet = pd.read_excel('./amap_poicode.xlsx', sheet_name='POI分类与编码(中英文)')
    return [str(code).rjust(6, '0') for code in sheet.iloc[:, 1]]
def getHeaders():
    """Build request headers with a User-Agent picked at random.

    Rotating the UA string makes successive requests look like they come
    from different browsers.
    """
    ua_pool = (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
        "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
        "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
    )
    return {"User-Agent": random.choice(ua_pool)}
def getKeys():
    """Return an API key, alternating between the two entries in `keys`.

    Uses the module-level `count` flag as the toggle state.
    """
    global count
    if count != 0:
        count = 0
        return keys[1]
    count = 1
    return keys[0]
def getjson(page, types):
    """Fetch one page of Amap place-search results for a POI type code.

    Args:
        page: 1-based page number to request.
        types: POI type-code string (see getTypes()).

    Returns:
        The decoded JSON response as a dict, or False when this exact
        request URL has already been fetched (de-duplicated via an MD5
        digest of the URL stored in the module-level set `sl`).
    """
    pa = {
        'key': getKeys(),
        'keywords': '',
        'types': types,
        'city': 610113,        # adcode of the target district
        'citylimit': 'true',
        'offset': 20,          # results per page
        'extensions': 'all',
        'children': 1,
        'page': page,
    }
    # timeout added: without it requests.get can block forever on a stalled
    # connection and hang the whole crawl.
    r = requests.get('https://restapi.amap.com/v3/place/text?',
                     params=pa, headers=getHeaders(), timeout=10)
    # De-duplicate on the full request URL (key + params, as built by requests).
    encryp_url = hashlib.md5(r.url.encode('utf-8')).hexdigest()
    if encryp_url in sl:
        return False
    sl.add(encryp_url)
    return r.json()
# Crawl every POI type: page through the place-search results until the API
# reports no more hits (count == '0') or the URL de-dup returns False.
for types in getTypes():
    not_last_page = True
    page = 1
    while not_last_page:
        decodejson = getjson(page, types)
        if decodejson:
            # Amap returns `count` as a string; '0' means no results.
            if decodejson['count'] == '0':
                not_last_page = False
                break
            for poi in decodejson['pois']:
                # dict.get() replaces the original bare try/except blocks:
                # missing fields default to None instead of swallowing every
                # possible exception.
                # NOTE(review): the original mapped 'parentId' -> id and
                # 'parent' -> parentId; preserved as-is, but this looks
                # swapped relative to Amap's response schema — confirm.
                data = {
                    'id': poi.get('parentId'),
                    'parentId': poi.get('parent'),
                    'name': poi.get('name'),
                    'address': poi.get('address'),
                    'tel': poi.get('tel'),
                }
                print(data)
            time.sleep(0.2)  # throttle to stay under the API rate limit
            page += 1
        else:
            # Duplicate URL (or empty response) — stop paging this type.
            not_last_page = False
# spider-gaode-2
# (Blog-page residue from the original paste: "Latest recommended article
# published 2020-10-01 23:50:12" — kept as a comment; not executable code.)