本文仅供学习参考
1.采用先存入文件、再转存 MySQL 数据库的形式存储内容
2.爬取速度不宜过快,太快容易失败
代码如下:
先爬取并将其全部存储至文件中
import json, time
import random
import requests
# Municipalities and special administrative regions: the national-level
# query reports these directly, so they are matched by name.
six_cities_list = [
    '北京市', '上海市', '重庆市',
    '天津市', '香港特别行政区', '澳门特别行政区',
]

# Remaining provinces / autonomous regions, each queried individually.
province_list = [
    '河北省', '山西省', '辽宁省', '吉林省', '黑龙江省',
    '江苏省', '浙江省', '安徽省', '福建省', '江西省',
    '山东省', '河南省', '湖北省', '湖南省', '广东省',
    '海南省', '四川省', '贵州省', '云南省', '陕西省',
    '甘肃省', '青海省', '台湾省', '内蒙古自治区', '广西壮族自治区',
    '西藏自治区', '宁夏回族自治区', '新疆维吾尔自治区',
]
def getjson(loc, page_num=0):
    """Query the Baidu Place API for '公园' (parks) within *loc*.

    Args:
        loc: Region name, e.g. a province or '全国'.
        page_num: Zero-based results page to fetch.

    Returns:
        The decoded JSON response as a dict, or None when the request
        keeps failing after a few retries.
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows; U; Windows NT 10.0; en-US; rv:1.9.1.6)\
Gecko Chrome/63.0.3239.132'
    }
    pa = {
        'query': '公园',
        # 'tag': '',
        'region': loc,
        'scope': '2',
        'output': 'json',
        'page_size': 20,
        'page_num': page_num,
        'ak': '自己的API',  # replace with your own Baidu Map API key
    }
    # BUG FIX: the old code embedded a mis-encoded query string
    # ('...®ion=...') in the URL *and* passed the same values via
    # params=, sending them twice; let `requests` build the URL.
    # BUG FIX: on error it recursed without returning the result
    # (always yielding None, with no recursion bound); retry instead.
    for _ in range(3):
        try:
            r = requests.get('http://api.map.baidu.com/place/v2/search',
                             params=pa, headers=headers, timeout=10)
            return r.json()
        except Exception as e:
            print('over-requests! Error:', e)
            time.sleep(random.random())  # back off before retrying
    return None
def six_city():
    """Fetch the nationwide result and append the park counts of the six
    directly administered / special regions to cities.txt.

    Each matching region is written as one '<name>\\t<num>' line.
    """
    decodejson = getjson('全国')
    # Robustness: getjson returns None when every retry failed.
    if not decodejson:
        return
    # Open the file once instead of once per written line; the with-block
    # also closes it (the old explicit f.close() was redundant).
    with open('cities.txt', 'a+', encoding='UTF-8') as f:
        for eachprovince in decodejson.get('results', []):
            try:
                city = eachprovince['name']
                num = eachprovince['num']
            except KeyError as e:  # narrow: only missing fields expected
                print('over_cities! Error:', e)
                continue
            if city in six_cities_list:
                f.write('\t'.join([city, str(num)]) + '\n')
def else_city():
    """Fetch each province's result and append every city's park count
    to cities.txt as one '<name>\\t<num>个' line, pausing briefly between
    provinces to avoid being throttled.
    """
    for eachprovince in province_list:
        decodejson = getjson(eachprovince)
        try:
            # Robustness: getjson may return None on repeated failure.
            results = decodejson.get('results', []) if decodejson else []
            # Open once per province, not once per line; the with-block
            # closes the file (the old explicit f.close() was redundant).
            with open('cities.txt', 'a+', encoding='UTF-8') as f:
                for eachcity in results:
                    try:
                        city = eachcity['name']
                        num = eachcity['num']
                    except KeyError:  # entry without name/num: skip it
                        continue
                    f.write('\t'.join([city, str(num)]) + '个\n')
        except Exception as e:
            print('over-eachprovince! Error:', e)
        finally:
            time.sleep(random.random())  # throttle between provinces
if __name__ == '__main__':
    # Crawl the nationwide park counts, then the per-province ones.
    print('正在爬取全国各地"公园"分布数目并存入cities.txt.')
    for crawl in (six_city, else_city):
        crawl()
再利用上一步存入文件的各大城市名称,逐城市爬取目标结果并存入 MySQL 数据库
import json, time
import random
import pymysql
import requests
# City names parsed from cities.txt; filled by Word() and consumed by
# Insert_mysql().
city_list = list()
# MySQL connection settings for the target database.
# NOTE(review): credentials are hard-coded in source — move them to
# environment variables or a config file before publishing.
config = {
    'host':'localhost',
    'port':3306,
    'user':'root',
    'password':'113754',
    'db':'baidu_map',
    'charset':'utf8mb4',
    # DictCursor returns rows as dicts keyed by column name.
    'cursorclass':pymysql.cursors.DictCursor,
}
# Module-level connection and cursor, shared by all functions below and
# closed in the __main__ block.
conn = pymysql.connect(**config)
cur = conn.cursor()
def Word():
    """Load city names from cities.txt into the module-level city_list.

    Each line has the form '<city>\\t<count...>'; only the city (the
    first tab-separated field) is kept.
    """
    with open('cities.txt', 'r', encoding='UTF-8') as txt_file:
        for eachLine in txt_file:
            # File iteration never yields '' (the old check was dead);
            # strip() also skips whitespace-only lines and drops the
            # trailing newline from tab-less lines.
            line = eachLine.strip()
            if not line:
                continue
            city_list.append(line.split('\t')[0])
    # No explicit close needed: the with-block closes the file.
def getjson(loc, page_num=0):
    """Query the Baidu Place API for 'CoCo' (tag 美食) within *loc*.

    Args:
        loc: City name to search in.
        page_num: Zero-based results page to fetch.

    Returns:
        The decoded JSON response as a dict, or None when the request
        keeps failing after a few retries.
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows; U; Windows NT 10.0; en-US; rv:1.9.1.6)\
Gecko Chrome/63.0.3239.132'
    }
    # SECURITY NOTE(review): this API key is committed in plain text —
    # rotate it and load it from an environment variable instead.
    pa = {
        'query': 'CoCo',
        'tag': '美食',
        'region': loc,
        'scope': '2',
        'output': 'json',
        'page_size': 20,
        'page_num': page_num,
        'ak': 'yX91zbGwxNxaGWMwo3LPx3MWovVCScHj'
    }
    # BUG FIX: the old code embedded a mis-encoded query string
    # ('...®ion=...') in the URL while also passing params=, duplicating
    # every value; and on error it recursed without returning the result
    # (always yielding None). Retry a bounded number of times instead.
    for _ in range(3):
        try:
            r = requests.get('http://api.map.baidu.com/place/v2/search',
                             params=pa, headers=headers, timeout=10)
            return r.json()
        except Exception as e:
            print('over-requests! Error:', e)
            time.sleep(random.random())  # back off before retrying
    return None
def Insert_mysql():
    """Page through the Place API for every city in city_list and insert
    each POI into baidu_map.city.

    Pagination for a city stops when a page comes back empty, the
    response lacks 'results' (request failed), or a DB error occurs.
    """
    # Hoisted out of the loops: the statement text never changes.
    sql = '''INSERT INTO baidu_map.city
    (city, park, location_lat, location_lng, address, street_id, uid)
    VALUES (%s, %s, %s, %s, %s, %s, %s)'''
    for eachcity in city_list:
        page_num = 0
        while True:
            decodejson = getjson(eachcity, page_num)
            time.sleep(random.random())  # throttle: fast crawling fails
            print(eachcity, page_num)
            try:
                results = decodejson['results']
            except Exception as e:  # None response or missing key
                print('Error:', e)
                break
            # BUG FIX: the old code looped forever when a page returned
            # an empty 'results' list (nothing raised, the flag stayed
            # True and page_num kept incrementing). Stop at first empty
            # page — that is how the API signals "past the last page".
            if not results:
                break
            try:
                for eachone in results:
                    # .get() replaces six copy-pasted try/except blocks;
                    # missing fields become NULL just as before.
                    location = eachone.get('location') or {}
                    cur.execute(sql, (
                        eachcity,
                        eachone.get('name'),
                        location.get('lat'),
                        location.get('lng'),
                        eachone.get('address'),
                        eachone.get('street_id'),
                        eachone.get('uid'),
                    ))
                conn.commit()  # one commit per page instead of per row
            except Exception as e:
                print('Error:', e)
                break
            page_num += 1
if __name__ == '__main__':
    # Load the city list, crawl and insert, then always release the DB
    # resources — the old code leaked the connection if Insert_mysql
    # raised before reaching the close calls.
    Word()
    try:
        Insert_mysql()
    finally:
        cur.close()
        conn.close()
运行截图如下: