# 爬取某度地图某商超地址信息 (crawl supermarket address info from Baidu Maps)
import requests
import pymysql
import random
import time
def get_content(pn, nn, supermarket):
    """Fetch one page of Baidu Maps search results for *supermarket*.

    Args:
        pn: 1-based page number.
        nn: result offset, (pn - 1) * 10.
        supermarket: search keyword (supermarket name).

    Returns:
        Decoded JSON response as a dict, or None when the request or the
        JSON decoding fails.
    """
    try:
        url = 'https://map.baidu.com/'
        # The original dict listed 'from' twice with the same value; a dict
        # keeps only one key, so the duplicate is dropped here.
        params = {
            'newmap': 1,
            'reqflag': 'pcmap',
            'biz': 1,
            'from': 'webmap',
            'da_par': 'baidu',
            'pcevaname': 'pc4.1',
            'qt': 'con',
            'c': 257,
            'wd': supermarket,
            'pn': pn,
            'nn': nn,
            'db': 0,
            'sug': 0,
            'addr': 0,
            'da_src': 'shareurl',
            'on_gel': 1,
            'src': 7,
            'gr': 3,
            'l': 13,
            'device_ratio': 1,
            'tn': 'B_NORMAL_MAP',
            'ie': 'utf-8',
            'newfrom': 'zhuzhan_webmap'
        }
        # Rotate User-Agent strings to look less like a bot.  The original
        # list contained a bare `....` placeholder, which is a syntax error.
        user_agents = [
            "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36",
        ]
        headers = {'User-Agent': random.choice(user_agents)}
        # timeout so a hung connection cannot stall the whole crawl
        resp = requests.get(url=url, headers=headers, params=params, timeout=10)
        return resp.json()
    except Exception as e:
        # Original had a pointless `pass` before the print; report and return
        # None so callers can detect the failure.
        print("url请求响应问题:", e)
        return None
def get_parse(content, supermarket):
    """Extract store address tuples from a Baidu Maps JSON response.

    Args:
        content: decoded JSON dict from get_content() (may be None).
        supermarket: search keyword, stored verbatim in every row.

    Returns:
        List of (province, city, area, town, supermarket, store_name, addr)
        tuples; an empty list when the page has no data or parsing fails
        (the original returned None on exception, which crashed the caller's
        `.extend(datas)`).
    """
    datas = []
    try:
        if content and 'content' in content:
            # Only the first 10 entries belong to this page.
            for store in content['content'][:10]:
                admin = store['admin_info']
                datas.append((
                    admin['province_name'],
                    admin['city_name'],
                    admin['area_name'],
                    admin['town_name'],
                    supermarket,
                    store['name'],
                    store['addr'],
                ))
        else:
            print("该页没数据!!")
    except Exception as e:
        print("解析错误信息:", e)
    return datas
def get_mysqlconnect():
    """Open a MySQL connection to the demo database.

    Returns:
        (db, cur): the pymysql connection object and a cursor on it.
        (The original line fused this note onto the return statement as bare
        Chinese text, which was a syntax error.)
    """
    # NOTE(review): host/credentials are hard-coded placeholders — move them
    # to configuration before real use.
    db = pymysql.connect(host='192.168.xx.xx',
                         user='root',
                         password='root',
                         port=3306,
                         database='demo')
    cur = db.cursor()
    return db, cur
def close_mysqlconnect(db, cur):
    """Release a MySQL cursor and its connection.

    Args:
        db: pymysql connection object.
        cur: cursor created from *db*.
    """
    # Close the cursor before the connection — the original closed the
    # connection first, leaving the cursor to be closed on a dead connection.
    cur.close()
    db.close()
def main(pn):
    """Crawl pages 1..pn-1 of Baidu Maps results and insert them into MySQL.

    Args:
        pn: exclusive upper page bound (e.g. 11 crawls pages 1-10).
    """
    try:
        total_datas = []
        supermarket = 'xxx'  # placeholder keyword
        # Loop variable renamed: the original reused `pn`, shadowing the
        # parameter inside its own loop bound.
        for page in range(1, pn):
            nn = (page - 1) * 10
            content = get_content(page, nn, supermarket)
            datas = get_parse(content, supermarket)
            total_datas.extend(datas)
            # Random pause between pages to avoid hammering the site.
            t = random.randint(1, 4)
            time.sleep(t)
            print(str(page) + '页 ' + '休眠' + str(t) + '秒')
        if total_datas:
            db, cur = get_mysqlconnect()
            # Parameterized executemany instead of splicing str(list) into the
            # SQL text — the old concatenation broke on quotes in addresses
            # and was injectable.
            sql = ("insert into supermarket_info "
                   "(省,市,区县,乡镇,超市,超市分店,详细地址) "
                   "values (%s,%s,%s,%s,%s,%s,%s)")
            cur.executemany(sql, total_datas)
            db.commit()
            close_mysqlconnect(db, cur)
    except Exception as e:
        print("插入错误信息:", e)
    finally:
        print("*" * 30 + "爬取完毕" + "*" * 30)
if __name__ == '__main__':
    # Crawl pages 1..10 (main() treats its argument as an exclusive bound).
    main(11)
# 爬取某东商品信息 (crawl JD product info)
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
def get_headers():
    """Return a randomly chosen desktop browser User-Agent string.

    Returns:
        One User-Agent string from the pool below.
    """
    # The original list ended with a bare `...` (an Ellipsis literal), which
    # random.choice() could actually return as the "User-Agent" — removed.
    ua = [
        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36",
    ]
    return random.choice(ua)
def get_content(page, s):
    """Fetch one JD search result page.

    Args:
        page: 1-based page number.
        s: JD's result offset parameter, 1 + (page - 1) * 30.

    Returns:
        Raw HTML text of the search result page.
    """
    url = 'https://search.jd.com/Search'
    headers = {
        'Cookie': 'xxx',              # placeholder — fill in a real session cookie
        'User-Agent': get_headers(),  # rotated per request
    }
    params = {
        'keyword': 'xxx',
        'qrst': 1,
        'suggest': '1.his.0.0',
        'wq': 'xxx',
        'stock': 1,
        'ev': 'exbrand_xxx(HAOYIKANG) ^',
        'pvid': '5059b011112a4068831c4dd4555e7791',
        'page': page,
        's': s,
        'click': 0
    }
    # timeout added so a hung connection cannot stall the whole crawl
    response = requests.get(url=url, headers=headers, params=params, timeout=10)
    return response.text
def get_analysis(last_page=86):
    """Scrape JD search result pages 1..last_page-1.

    Args:
        last_page: exclusive upper page bound. Defaults to 86 (the value the
            original hard-coded), i.e. 85 pages.

    Returns:
        List of (storename, prod_info, price) tuples.
    """
    datas = []
    for page in range(1, last_page):
        s = 1 + (page - 1) * 30  # JD's result offset parameter
        content = get_content(page, s)
        soup = BeautifulSoup(content, 'lxml')
        for product in soup.find_all('div', class_='gl-i-wrap'):
            try:
                price = product.find('div', class_='p-price').find('i').get_text()
                prod_info = product.find('div', class_='p-name p-name-type-2').find('em').get_text()
                storename = product.find('span', class_='J_im_icon').find('a').get_text()
            except AttributeError:
                # Skip cards missing an expected sub-element instead of
                # crashing the whole crawl (find() returns None on a miss).
                continue
            datas.append((storename, prod_info, price))
        # Random pause between pages to avoid hammering the site.
        t = random.randint(1, 4)
        time.sleep(t)
        print('正在爬取' + str(page) + '页!!' + '休眠' + str(t) + '秒')
    return datas
def save_datas(datas):
    """Write scraped rows to ./data/xxx_京东.xlsx.

    Args:
        datas: list of (storename, prod_info, price) tuples.
    """
    import os
    os.makedirs('./data', exist_ok=True)  # original crashed when ./data was missing
    # Third column label fixed: the original repeated '商品信息' although the
    # tuple's third field is the price.
    columns = ['店铺名称', '商品信息', '商品价格']
    pd.DataFrame(datas, columns=columns).to_excel('./data/xxx_京东.xlsx', index=False)
def main():
    """Scrape JD product listings and persist them to an Excel file."""
    save_datas(get_analysis())
if __name__ == '__main__':
    # Script entry point: scrape and save in one go.
    main()
# 爬取某宝商品信息 (crawl Taobao product info)
import requests
import re
import pandas as pd
import random
import time
def get_content(page):
    """Fetch one Taobao search result page.

    Args:
        page: result offset passed as the 's' query parameter (the caller
            steps it by 10).

    Returns:
        Raw HTML text of the search result page.
    """
    url = r'https://s.taobao.com/search'
    params = {
        'q': 'xxx',
        'suggest': 'history_4',
        'commend': 'all',
        'ssid': 's5-e',
        'search_type': 'item',
        'sourceId': 'tb.index',
        'spm': 'a21bo.jianhua.201856-taobao-item.2',
        'ie': 'utf8',
        'initiative_id': 'tbindexz_20170306',
        '_input_charset': 'utf-8',
        'wq': '',
        'suggest_query': '',
        'source': 'suggest',
        'bcoffset': 2,
        'ntoffset': 2,
        'p4ppushleft': '2,47',
        's': page
    }
    headers = {
        'Cookie': 'xxx',  # placeholder — Taobao search requires a logged-in cookie
        'Host': 's.taobao.com',
        'Referer': 'https://www.taobao.com/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
    }
    # timeout added so a hung connection cannot stall the whole crawl
    response = requests.get(url=url, headers=headers, params=params, timeout=10)
    return response.text
def parser_content(content):
    """Pull product fields out of Taobao's embedded page JSON with regexes.

    Args:
        content: raw HTML text containing the embedded JSON blob.

    Returns:
        List of (title, shopName, price, itemLoc) tuples. Truncated to the
        shortest field list — the original indexed by range(len(title)) and
        raised IndexError whenever one field had fewer matches.
    """
    # Raw strings so the \s escapes are passed to re verbatim.
    titles = re.findall(r'"raw_title":"([\s\S]*?)"', content)
    shops = re.findall(r'"shopName":"([\s\S]*?)"', content)
    prices = re.findall(r'"view_price":"([\s\S]*?)"', content)
    locs = re.findall(r'"item_loc":"([\s\S]*?)"', content)
    return list(zip(titles, shops, prices, locs))
if __name__ == '__main__':
    # Crawl 10 result pages (Taobao pages by offsets 0, 10, ..., 90) and
    # dump everything into one Excel sheet.
    tag = ['商品信息', '店铺名称', '商品价格', '商品地址']
    datas = []
    for page in range(0, 100, 10):
        page_items = parser_content(get_content(page))
        datas.extend(page_items)
        pause = random.randint(1, 3)
        time.sleep(pause)
        print(str(page) + '位置' + str(pause) + '秒')
    pd.DataFrame(datas, columns=tag).to_excel('./data/xxx.xlsx', index=False)
# 持续更新中… (continuously updated)